def parse_next(self, response):
    """Follow-up parser for a user's paginated job / work-history listing.

    Reads 'user_id' and 'type' from response.meta ('jp' means job posts;
    any other value is treated as work history) and accumulates first/last
    bid info onto the stored user record, then follows pagination.
    """
    user_id = response.meta.get('user_id')
    user_type = response.meta.get('type')
    wm_users = WitMartUsers()
    # Fetch the existing user record, or insert a blank one and get it back.
    user = wm_users.find_or_insert(user_id)
    if 'name' in user:
        # Walk each completed logo-design task entry on this page and record
        # bid info for job posts and work history respectively.
        for entry in response.css('ul.jinjobs li'):
            href = entry.css('a.ajobs::attr(href)').extract_first()
            if href is None:
                continue
            if 'user/sign/in' in href or '/logo-design/' not in href:
                continue
            detail = entry.css('p')[0].extract()
            if "Status: Completed" not in detail:
                continue
            attrs = strip_tags_spaces(detail).split('|')
            stamp = attrs[0].strip()
            if user_type == 'jp':
                user['job_post_completed_bids'].append(int(attrs[3].replace("Bids", "").strip()))
                # First entry seen fixes the "last" bid; the final entry seen
                # becomes the "first" bid — presumably the listing is
                # newest-first; confirm against the site's ordering.
                if user['job_post_last_bid'] == "":
                    user['job_post_last_bid'] = stamp
                user['job_post_first_bid'] = stamp
            else:
                # NOTE(review): work history uses the opposite first/last
                # convention from the 'jp' branch — verify this is intended.
                if user['work_done_first_bid'] == "":
                    user['work_done_first_bid'] = stamp
                user['work_done_last_bid'] = stamp
        # Persist the accumulated bid data.
        wm_users.update_user(user)
    wm_users.close()
    # Keep paginating while a "next" link exists.
    next_page = response.css('div.oturning span.st4 a.reviews::attr(href)').extract_first()
    if next_page is not None:
        param = {'user_id': user_id, 'type': user_type}
        yield response.follow(next_page, callback=self.parse_next, meta=param)
def parse_work_history(self, response):
    """Parse a user's work-history page: ratings, counts, earnings, bids."""
    user_id = response.meta.get('user_id')
    wm_users = WitMartUsers()
    # Fetch the stored user record, inserting a blank one if absent.
    user = wm_users.find_or_insert(user_id)
    # Rating data only exists when the section header mentions "Rating".
    review = response.css('div.job-list h3').extract_first()
    if review is not None and "Rating" in review:
        work_done = response.css('div.job-list h3::text').extract_first()
        # Slice [26:-1] pulls the numeric count out of the header text.
        user['work_done'] = 0 if work_done is None else int(work_done[26 : -1])
        for rating in response.css('div.job-list_re1 ul')[0].css('li'):
            # First character of the link text is the star level (5..1).
            star = rating.css('a::text').extract_first()[0]
            user['work_done_rating_' + star] = int(rating.css('span::text').extract_first())
        counters = response.css('div.job-list_re1 ul')[1].css('li')
        user['work_done_awarded'] = int(counters[0].css('a::text').extract_first().split(' ')[0])
        user['work_done_completed'] = int(counters[1].css('a::text').extract_first().split(' ')[0])
        if user['work_done'] == 0:
            user['earning'] = '$0'
        else:
            user['earning'] = response.css('div.job-list_re1 p::text').extract_first()
    else:
        # No rating section on the page: zero everything out.
        user.update({
            'work_done': 0,
            'work_done_rating_5': 0,
            'work_done_rating_4': 0,
            'work_done_rating_3': 0,
            'work_done_rating_2': 0,
            'work_done_rating_1': 0,
            'work_done_awarded': 0,
            'work_done_completed': 0,
            'earning': '$0',
        })
    # Collect first & last work-history bids from completed logo-design tasks.
    user['work_done_first_bid'] = ""
    user['work_done_last_bid'] = ""
    for entry in response.css('ul.jinjobs li'):
        href = entry.css('a.ajobs::attr(href)').extract_first()
        if href is None or 'user/sign/in' in href or '/logo-design/' not in href:
            continue
        detail = entry.css('p')[0].extract()
        if "Status: Completed" not in detail:
            continue
        stamp = strip_tags_spaces(detail).split('|')[0].strip()
        if user['work_done_first_bid'] == "":
            user['work_done_first_bid'] = stamp
        user['work_done_last_bid'] = stamp
    # Persist and continue through remaining pages via parse_next.
    wm_users.update_user(user)
    wm_users.close()
    next_page = response.css('div.oturning span.st4 a.reviews::attr(href)').extract_first()
    if next_page is not None:
        param = {'user_id': user_id, 'type': 'wh'}
        yield response.follow(next_page, callback=self.parse_next, meta=param)
def parse_job_post(self, response):
    """Parse a job poster's profile page.

    Stores identity, verification flags, follower counts, professions,
    job-post rating data, and first/last bids of completed logo-design
    job posts, then follows pagination via parse_next with type 'jp'.
    """
    user_id = response.meta.get('user_id')
    wm_users = WitMartUsers()
    # Fetch the stored user record, inserting a blank one if absent.
    user = wm_users.find_or_insert(user_id)
    # Basic profile fields.
    user['name'] = response.css('div.user-show_r h1::text').extract_first()
    user['location'] = response.css('div.user-show_r span span::text').extract_first()
    user['membership'] = response.css('div.user-show_r a.goldshow span::text').extract_first()
    # Verification icons: presence of the marker element means verified.
    name_verified = (response.css('i.Verification4_1').extract_first() is not None
                     or response.css('i.Verification5_1').extract_first() is not None)
    user['verified_name'] = 1 if name_verified else 0
    user['verified_email'] = 1 if response.css('i.Verification2_1').extract_first() is not None else 0
    user['verified_phone'] = 1 if response.css('i.Verification3_1').extract_first() is not None else 0
    # Follower / following counters come from the two links' hrefs.
    followers = 0
    following = 0
    for anchor in response.css('div.follow_n a'):
        href = anchor.css('a::attr(href)').extract_first()
        count = int(anchor.css('a::text').extract_first())
        if 'followers' in href:
            followers = count
        elif 'following' in href:
            following = count
    user['followers'] = followers
    user['following'] = following
    # Professions list, when present.
    user['professions'] = []
    occ = response.css('div.user-show_r dd ul').extract_first()
    if occ is not None:
        occ_list = strip_tags_spaces(occ).strip()
        if occ_list != "":
            user['professions'] = occ_list.replace('Professions: ', '').split(' , ')
    # Rating data only exists when the section header mentions "Rating".
    review = response.css('div.job-list h3').extract_first()
    if review is not None and "Rating" in review:
        job_posts = response.css('div.job-list h3::text').extract_first()
        # Slice [24:-1] pulls the numeric count out of the header text.
        user['job_posts'] = 0 if job_posts is None else int(job_posts[24 : -1])
        for rating in response.css('div.job-list_re ul')[0].css('li'):
            star = rating.css('a::text').extract_first()[0]
            user['job_post_rating_' + star] = int(rating.css('span::text').extract_first())
        counters = response.css('div.job-list_re ul')[1].css('li')
        user['job_post_completed'] = int(counters[1].css('a::text').extract_first().split(' ')[0])
        user['job_post_cancelled'] = int(counters[2].css('a::text').extract_first().split(' ')[0])
        if user['job_posts'] == 0:
            user['spending'] = '$0'
        else:
            user['spending'] = response.css('div.job-list_re p::text').extract_first()
    else:
        # No rating section on the page: zero everything out.
        user.update({
            'job_posts': 0,
            'job_post_rating_5': 0,
            'job_post_rating_4': 0,
            'job_post_rating_3': 0,
            'job_post_rating_2': 0,
            'job_post_rating_1': 0,
            'job_post_completed': 0,
            'job_post_cancelled': 0,
            'spending': '$0',
        })
    # Parse list of jobs posted, to collect first & last job posts and bids made.
    user['job_post_first_bid'] = ""
    user['job_post_last_bid'] = ""
    user['job_post_completed_bids'] = []
    for entry in response.css('ul.jinjobs li'):
        href = entry.css('a.ajobs::attr(href)').extract_first()
        if href is None or 'user/sign/in' in href or '/logo-design/' not in href:
            continue
        detail = entry.css('p')[0].extract()
        if "Status: Completed" not in detail:
            continue
        attrs = strip_tags_spaces(detail).split('|')
        user['job_post_completed_bids'].append(int(attrs[3].replace("Bids", "").strip()))
        if user['job_post_last_bid'] == "":
            user['job_post_last_bid'] = attrs[0].strip()
        user['job_post_first_bid'] = attrs[0].strip()
    # Persist and continue through remaining pages via parse_next.
    wm_users.update_user(user)
    wm_users.close()
    next_page = response.css('div.oturning span.st4 a.reviews::attr(href)').extract_first()
    if next_page is not None:
        param = {'user_id': user_id, 'type': 'jp'}
        yield response.follow(next_page, callback=self.parse_next, meta=param)
def parse_job(self, response):
    """Parse a job-details page and store it in the "jobs" collection.

    NOTE(review): a second `parse_job` defined later in this file shadows
    this one, making this definition dead code — confirm which version is
    intended and remove the other.
    """
    job_id = response.css('input#jobid::attr(value)').extract_first()
    jobs = WitMartJobs()
    # Only proceed when the page exposes a parsable job id.
    if job_id is not None:
        job_data = jobs.find_job_by_id(job_id)
        if job_data is None:
            data = {'job_id': job_id}
            data['title'] = response.css('div.gj_title h2::text').extract_first().strip()
            crumbs = response.css('div.gj_title p.dq_nav a')
            # [3:] strips the leading path prefix from the employer href.
            data['employer'] = crumbs[0].css('a::attr(href)').extract_first()[3:]
            data['category'] = crumbs[1].css('a::text').extract_first().strip()
            data['type'] = crumbs[2].css('a::text').extract_first().strip()
            rows = response.css('table.t_details tr')
            # [12:] removes the 'Job Status: ' prefix.
            data['status'] = strip_tags_spaces(rows[0].css('td')[0].css('td').extract_first())[12:]
            # Reward cell either names the reward directly ("reward"/"Reward") ...
            if 'eward' in rows[1].css('td')[0].css('b.g-f14::text').extract_first():
                data['reward'] = strip_tags_spaces(rows[1].css('td')[0].css('td strong::text').extract_first())
            else:
                # ... or the pay rate sits in a third row; default 'Negotiable'.
                reward = 'Negotiable'
                if len(rows) >= 3:
                    reward = strip_tags_spaces(rows[2].css('td').extract_first())
                idx = reward.find(':')
                if idx >= 0:
                    reward = reward[idx + 2:].strip()
                data['reward'] = reward
            # [17:] removes 'Bidding Started: '; [15:] removes 'Bidding Ended: '.
            data['bid_start'] = strip_tags_spaces(rows[0].css('td')[1].css('td').extract_first())[17:]
            data['bid_end'] = strip_tags_spaces(rows[1].css('td')[1].css('td').extract_first())[15:]
            # Description — some pages use a different element layout.
            if response.css('div#j-langdes').extract_first() is not None:
                desc = strip_tags_spaces(response.css('div#j-langdes').extract_first()).strip()
                if desc == "" and response.css('div#j-hidefortrans').extract_first() is not None:
                    labels = response.css('div#j-hidefortrans h5::text').extract()
                    bodies = response.css('div#j-hidefortrans div.JOBDESC').extract()
                    for label, body in zip(labels, bodies):
                        desc += label + ": " + strip_tags_spaces(body) + "; "
                data['description'] = desc
            # Bidders, and who won the bid / got hired.
            data['bid_list'] = []
            data['winner_list'] = []
            for bidder in response.css('div#all_bids dl.list'):
                user_id = bidder.css('dd.col1::attr(value)').extract_first()
                data['bid_list'].append(user_id)
                winner = False
                if bidder.css('dd.zb').extract_first() is not None:
                    data['winner_list'].append(user_id)
                    winner = True
                # Insert / update the employer<->bidder connection.
                self.create_connection(data['job_id'], data['employer'], user_id, winner)
            # Skills required to do the task.
            data['required_skills'] = [
                skill.css('a::text').extract_first()
                for skill in response.css('div.t_des div.mt20 a')
            ]
            jobs.insert_job(data)
            # If total bidders > 10, follow the next page to crawl the rest.
            next_bid_page = response.css('i.next a::attr(href)').extract_first()
            if next_bid_page is not None:
                yield response.follow(next_bid_page, callback=self.parse_bid)
        else:
            # Job already stored: just (re)create the logo-design connections.
            for bidder in job_data['bid_list']:
                winner = bidder in job_data['winner_list']
                self.create_connection(job_data['job_id'], job_data['employer'], bidder, winner)
    jobs.close()
def parse_job(self, response):
    """Parse a job-details page and upsert it into the "jobs" collection.

    Reads 'level' from response.meta and propagates it to connection
    creation and to bid-page pagination. Only jobs whose status is
    'Completed' and whose category is 'Graphic & Logo Design' are fully
    parsed and stored.

    NOTE(review): this definition shadows an earlier `parse_job` in this
    file; the earlier one is dead code — confirm and remove one of them.
    (Removed the stale commented-out blocks that duplicated the earlier
    version's logic.)
    """
    level = response.meta.get('level')
    job_id = response.css('input#jobid::attr(value)').extract_first()
    wm_jobs = WitMartJobs()
    # Only proceed when the page exposes a parsable job id.
    if job_id is not None:
        data = {}
        data['job_id'] = job_id
        data['title'] = response.css('div.gj_title h2::text').extract_first().strip()
        temp = response.css('div.gj_title p.dq_nav a')
        # [3:] strips the leading path prefix from the employer href.
        data['employer'] = temp[0].css('a::attr(href)').extract_first()[3:]
        data['category'] = temp[1].css('a::text').extract_first().strip()
        data['type'] = temp[2].css('a::text').extract_first().strip()
        temp = response.css('table.t_details tr')
        data['status'] = strip_tags_spaces(temp[0].css('td')[0].css('td').extract_first())[12:]  # remove 'Job Status: '
        if data['status'] == 'Completed' and data['category'] == 'Graphic & Logo Design':
            # Reward cell either names the reward directly ("reward"/"Reward"),
            # or the pay rate sits in a third row; default to 'Negotiable'.
            if temp[1].css('td')[0].css('b.g-f14::text').extract_first().find('eward') >= 0:
                data['reward'] = strip_tags_spaces(temp[1].css('td')[0].css('td strong::text').extract_first())
            else:
                reward = 'Negotiable'
                if len(temp) >= 3:
                    reward = strip_tags_spaces(temp[2].css('td').extract_first())
                idx = reward.find(':')
                if idx >= 0:
                    reward = reward[idx + 2:].strip()
                data['reward'] = reward
            data['bid_start'] = strip_tags_spaces(temp[0].css('td')[1].css('td').extract_first())[17:]  # remove 'Bidding Started: '
            data['bid_end'] = strip_tags_spaces(temp[1].css('td')[1].css('td').extract_first())[15:]  # remove 'Bidding Ended: '
            # Description — some pages use a different element layout.
            if response.css('div#j-langdes').extract_first() is not None:
                desc = strip_tags_spaces(response.css('div#j-langdes').extract_first()).strip()
                if desc == "" and response.css('div#j-hidefortrans').extract_first() is not None:
                    attrs = response.css('div#j-hidefortrans h5::text').extract()
                    values = response.css('div#j-hidefortrans div.JOBDESC').extract()
                    for kvp in zip(attrs, values):
                        desc += kvp[0] + ": " + strip_tags_spaces(kvp[1]) + "; "
                data['description'] = desc
            # Bidders and hired winners. create_connection() is a generator of
            # follow-up requests here, so each request is re-yielded.
            data['bid_list'] = []
            data['winner_list'] = []
            for bidder in response.css('div#all_bids dl.list'):
                user_id = bidder.css('dd.col1::attr(value)').extract_first()
                data['bid_list'].append(user_id)
                winner = False
                if bidder.css('dd.zb').extract_first() is not None:
                    data['winner_list'].append(user_id)
                    winner = True
                for req in self.create_connection(level, job_id, data['employer'], user_id, winner):
                    yield req
            # Skills required to do the task.
            data['required_skills'] = [
                skill.css('a::text').extract_first()
                for skill in response.css('div.t_des div.mt20 a')
            ]
            # Upsert: update the record when it already exists.
            job_data = wm_jobs.find_job_by_id(job_id)
            if job_data is None:
                wm_jobs.insert_job(data)
            else:
                wm_jobs.update_job(data)
            # More than one page of bidders: keep crawling, preserving 'level'.
            next_bid_page = response.css('i.next a::attr(href)').extract_first()
            if next_bid_page is not None:
                yield response.follow(next_bid_page, callback=self.parse_bid, meta={'level': level})
    wm_jobs.close()
def parse_poster(self, response):
    """Parse a job poster's profile and delegate bid history to parse_task.

    Stores identity, verification flags, follower counts, professions and
    job-post rating data, initialises the bid-history fields, then yields
    the requests produced by parse_task.
    """
    user_id = response.meta.get('user_id')
    wm_users = WitMartUsers()
    # Fetch the stored user record, inserting a blank one if absent.
    user = wm_users.find_or_insert(user_id)
    # Basic profile fields.
    user['name'] = response.css('div.user-show_r h1::text').extract_first()
    user['location'] = response.css('div.user-show_r span span::text').extract_first()
    user['membership'] = response.css('div.user-show_r a.goldshow span::text').extract_first()
    # Verification icons: presence of the marker element means verified.
    name_verified = (response.css('i.Verification4_1').extract_first() is not None
                     or response.css('i.Verification5_1').extract_first() is not None)
    user['verified_name'] = 1 if name_verified else 0
    user['verified_email'] = 1 if response.css('i.Verification2_1').extract_first() is not None else 0
    user['verified_phone'] = 1 if response.css('i.Verification3_1').extract_first() is not None else 0
    # Follower / following counters come from the two links' hrefs.
    followers = 0
    following = 0
    for anchor in response.css('div.follow_n a'):
        href = anchor.css('a::attr(href)').extract_first()
        count = int(anchor.css('a::text').extract_first())
        if 'followers' in href:
            followers = count
        elif 'following' in href:
            following = count
    user['followers'] = followers
    user['following'] = following
    # Professions list, when present.
    user['professions'] = []
    occ = response.css('div.user-show_r dd ul').extract_first()
    if occ is not None:
        occ_list = strip_tags_spaces(occ).strip()
        if occ_list != "":
            user['professions'] = occ_list.replace('Professions: ', '').split(' , ')
    # Rating data only exists when the section header mentions "Rating".
    review = response.css('div.job-list h3').extract_first()
    if review is not None and "Rating" in review:
        job_posts = response.css('div.job-list h3::text').extract_first()
        # Slice [24:-1] pulls the numeric count out of the header text.
        user['job_posts'] = 0 if job_posts is None else int(job_posts[24 : -1])
        for rating in response.css('div.job-list_re ul')[0].css('li'):
            star = rating.css('a::text').extract_first()[0]
            user['job_post_rating_' + star] = int(rating.css('span::text').extract_first())
        counters = response.css('div.job-list_re ul')[1].css('li')
        user['job_post_completed'] = int(counters[1].css('a::text').extract_first().split(' ')[0])
        user['job_post_cancelled'] = int(counters[2].css('a::text').extract_first().split(' ')[0])
        if user['job_posts'] == 0:
            user['spending'] = '$0'
        else:
            user['spending'] = response.css('div.job-list_re p::text').extract_first()
    else:
        # No rating section on the page: zero everything out.
        user.update({
            'job_posts': 0,
            'job_post_rating_5': 0,
            'job_post_rating_4': 0,
            'job_post_rating_3': 0,
            'job_post_rating_2': 0,
            'job_post_rating_1': 0,
            'job_post_completed': 0,
            'job_post_cancelled': 0,
            'spending': '$0',
        })
    # Initialise bid-history fields; parse_task does the actual collection.
    user['job_post_first_bid'] = ""
    user['job_post_last_bid'] = ""
    user['job_post_completed_bids'] = []
    wm_users.update_user(user)
    wm_users.close()
    # NOTE(review): sibling callbacks carry this under the meta key 'type';
    # here the key is 'user_type' — confirm parse_task expects this key.
    params = {'user_id': user_id, 'user_type': 'jp'}
    for req in self.parse_task(response, params):
        yield req