def get_max_page(self, html=None):
    if not html:
        response = urllib2.urlopen(get_allhit_url())
        html = response.read()
    pattern = re.compile(r"<a href=\".*pageNumber=([0-9]+).{150,200}Last</a>")
    max_page = re.search(pattern, html)
    if max_page:
        return int(max_page.group(1))
    else:
        return 1
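# A quick, self-contained sanity check for the pagination regex used in
# get_max_page() above. The HTML fragment below is invented for illustration
# (real mturk.com markup may differ); it only shows that a link carrying a
# pageNumber parameter, followed roughly 150-200 characters later by the
# "Last</a>" anchor text, yields that page number.
if __name__ == '__main__':
    import re
    sample = ('<a href="/mturk/searchbar?selectedSearchType=hitgroups'
              '&pageNumber=345&x=1">' + 'x' * 160 + 'Last</a>')
    match = re.search(r"<a href=\".*pageNumber=([0-9]+).{150,200}Last</a>", sample)
    print match.group(1) if match else 1   # prints 345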
def run(self):
    pid = Pid('mturk_crawler', True)

    logging.info('Crawler started')
    start_time = datetime.datetime.now()

    # Fetching statistical information about groups and HITs count
    logging.debug("Fetching stats")
    main_response = urllib2.urlopen(get_allhit_url())
    main_html = main_response.read()
    main_soup = BeautifulSoup(main_html,
                              parseOnlyThese=SoupStrainer(text=re.compile("(^[0-9,]+ HITs|of [0-9]+ Results)")))
    main_stats = [tag for tag in main_soup]
    hits_available = -1
    groups_available = -1
    if len(main_stats) > 1:
        hits_available_tmp = main_stats[0]
        hits_available_tmp = hits_available_tmp[:hits_available_tmp.find(' ')].replace(',', '')
        hits_available = int(hits_available_tmp)
        groups_available_tmp = main_stats[1]
        groups_available_tmp = groups_available_tmp[groups_available_tmp.find('of') + 3:groups_available_tmp.find('Results') - 1]
        groups_available = int(groups_available_tmp)
    main_soup = None

    # Fetching data from every mturk.com HITs list page
    logging.debug("Allhit processing")
    result_allhit = self.process_values(range(1, self.get_max_page(main_html) + 1),
                                        callback_allhit,
                                        self.processes_count)
    self.data = result_allhit['data']
    self.append_errors(result_allhit['errors'])

    # Fetching html details for every HIT group
    logging.debug("Details processing")
    result_details = self.process_values(self.data, callback_details,
                                         self.processes_count)
    self.data = result_details['data']
    self.append_errors(result_details['errors'])

    hits_downloaded = sum([hgs['HitGroupStatus']['hits_available'] for hgs in self.data])
    groups_downloaded = len(self.data)

    # Logging crawl information into the database
    success = False
    if (groups_downloaded > 0 and hits_downloaded > 0
            and groups_available / groups_downloaded <= 1.5
            and hits_available / hits_downloaded <= 1.5):
        success = True

    logging.debug("Crawl finished with success=%s. Saving main_crawl entry" % success)
    crawl = Crawl(**{
        'start_time': start_time,
        'end_time': datetime.datetime.now(),
        'success': success,
        'hits_available': hits_available,
        'hits_downloaded': hits_downloaded,
        'groups_available': groups_available,
        'groups_downloaded': groups_downloaded,
        #'errors': str(self.errors) # !
        'errors': ''
    })
    crawl.save()

    # Adding crawl FK
    logging.debug("Adding FKs")
    result_add_crawlfk = self.process_values(self.data, callback_add_crawlfk,
                                             crawl=crawl)
    self.data = result_add_crawlfk['data']
    self.append_errors(result_add_crawlfk['errors'])

    # Saving results in the database
    logging.debug("Saving results")
    result_save_database = self.process_values(self.data, callback_database)
    self.append_errors(result_save_database['errors'])

    print self.errors

    logging.info("Crawler finished %ssuccessfully in %s with %d results, %d HITs (of %d and %d) and %d errors" % (
        "" if success else "un",
        (datetime.datetime.now() - start_time),
        groups_downloaded,
        hits_downloaded,
        groups_available,
        hits_available,
        len(self.errors)))

    pid.remove_pid()
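# A standalone sketch of the success heuristic applied at the end of run()
# above: a crawl counts as successful when the available/downloaded ratio for
# both groups and HITs stays at or below 1.5, i.e. at least roughly two thirds
# of what mturk.com advertised was actually fetched. The helper name and the
# float() cast are part of this illustration only; in the original Python 2
# code the plain "/" on two ints is integer division.
def _coverage_ok(available, downloaded, max_ratio=1.5):
    if downloaded <= 0:
        return False
    return float(available) / downloaded <= max_ratio

if __name__ == '__main__':
    print _coverage_ok(1500, 1100)   # True  -- ratio ~1.36
    print _coverage_ok(1500, 900)    # False -- ratio ~1.67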
def callback_allhit(pages, **kwargs):
    if type(pages) != type([]):
        raise Exception, '::callback_allhit() must be called with one list argument'

    def remove_newline_fields(list):
        while True:
            try:
                list.remove("\n")
            except:
                break
        return list

#    def is_soup(object):
#        soup = BeautifulSoup()
#        if type(object) == type(soup) or type(object) == type(ResultSet('')) or type(object) == type(Tag(soup, "div", [])):
#            return True
#        return False

    data = []
    errors = []

    # Processing every page
    for page_number in pages:
        try:
            # Downloading page
            logging.info("Downloading page: %s" % page_number)
            page_url = get_allhit_url(page_number)
            logging.debug("Downloading %s" % page_url)
            response = urllib2.urlopen(page_url)
            html = response.read()
            soup = BeautifulSoup(html)

            # Parsing HIT groups' list
            table = soup.find('table', cellpadding='0', cellspacing='5',
                              border='0', width='100%')
            if type(table) == type(None):
                i = 0
                while i < 3:
                    logging.warn("Soup returned an empty table for page %s. Trying once more" % page_number)
                    response = urllib2.urlopen(page_url)
                    html = response.read()
                    soup = BeautifulSoup(html)
                    table = soup.find('table', cellpadding='0', cellspacing='5',
                                      border='0', width='100%')
                    if type(table) != type(None):
                        break
                    else:
                        table = None
                        soup = None
                        html = None
                    i = i + 1
                if type(table) == type(None):
                    logging.warn("Soup returned an empty table. This should not happen. Skipping page")
                    continue

            table.contents = remove_newline_fields(table.contents)

            # Parsing and fetching information about each group
            for i_group in range(0, len(table.contents)):
                logging.debug("Processing group %s on page %s" % (i_group, page_number))
                try:
                    group_html = table.contents[i_group]

                    # Title
                    title = group_html.find('a', {'class': 'capsulelink'})
                    if type(title) != type(None):
                        try:
                            title = str(title.contents[0])
                        except:
                            title = unicode(title.contents[0])
                        try:
                            title = unicode(remove_whitespaces(title))
                        except:
                            title = ''

                    fields = group_html.findAll('td', {'align': 'left', 'valign': 'top',
                                                       'class': 'capsule_field_text'})
                    if len(fields) == 7:

                        # Requester's name and ID
                        requester_html = remove_newline_fields(fields[0].contents)[0]
                        requester_name = unicode(requester_html.contents[0])
                        requester_id = requester_html['href']
                        start = requester_id.index('requesterId=') + 12
                        stop = requester_id.index('&state')
                        requester_id = requester_id[start:stop]

                        # HIT group expiration date
                        hit_expiration_date = remove_newline_fields(fields[1].contents)[0]
                        hit_expiration_date = remove_whitespaces(strip_html(hit_expiration_date))
                        hit_expiration_date = hit_expiration_date[:hit_expiration_date.index('(') - 2]
                        hit_expiration_date = datetime.datetime.strptime(hit_expiration_date, '%b %d, %Y')

                        # Time alloted
                        time_alloted = remove_newline_fields(fields[2].contents)[0]
                        time_alloted = remove_whitespaces(strip_html(time_alloted))
                        time_alloted = int(time_alloted[:time_alloted.index(' ')])

                        # Reward
                        reward = float(remove_newline_fields(fields[3].contents)[0][1:])

                        # HITs available
                        hits_available = int(remove_newline_fields(fields[4].contents)[0])

                        # Description
                        description = unicode(remove_newline_fields(fields[5].contents)[0])

                        # Keywords
                        keywords_raw = remove_newline_fields(fields[6].contents)
                        keywords = []
                        for i in range(0, len(keywords_raw)):
                            try:
                                keyword = keywords_raw[i].contents[0]
                                keywords.append(keyword)
                            except:
                                continue
                        keywords = unicode(fuse(keywords, ','))

                        # Qualification
                        qualifications = ''
                        qfields = group_html.findAll('td', {'style': 'padding-right: 2em; white-space: nowrap;'})
                        if len(qfields) > 0:
                            qfields = [remove_whitespaces(unicode(remove_newline_fields(qfield.contents)[0]))
                                       for qfield in qfields]
                            qualifications = fuse(qfields, ', ')
                        qfields = None

                        # Occurrence date
                        occurrence_date = datetime.datetime.now()

                        # Group ID
                        group_id = group_html.find('span', {'class': 'capsulelink'})
                        group_id_hashed = False
                        if type(group_id) != type(None):
                            group_id = remove_newline_fields(group_id.contents)[0]
                            if 'href' in group_id._getAttrMap():
                                start = group_id['href'].index('groupId=') + 8
                                stop = group_id['href'].index('&')
                                group_id = group_id['href'][start:stop]
                            else:
                                group_id_hashed = True
                                composition = "%s;%s;%s;%s;%s;%s;%s;" % (title, requester_id,
                                                                         time_alloted, reward,
                                                                         description, keywords,
                                                                         qualifications)
                                composition = smart_str(composition)
                                group_id = hashlib.md5(composition).hexdigest()

                        # Checking whether processed content is already stored in the database
                        hit_group_content = None
                        try:
                            logging.debug("group_id=%s; requester=%s; title=%s; desc=%s; ta=%s; reward=%s" %
                                          (group_id, requester_id, title, description, time_alloted, reward))
                            hit_group_content = HitGroupContent.objects.get(group_id=group_id,
                                                                            requester_id=requester_id,
                                                                            title=title,
                                                                            description=description,
                                                                            time_alloted=time_alloted,
                                                                            reward=reward)
                        except HitGroupContent.DoesNotExist:
                            hit_group_content = HitGroupContent(**{
                                'title': title,
                                'requester_id': requester_id,
                                'requester_name': requester_name,
                                'time_alloted': time_alloted,
                                'reward': reward,
                                'html': '',
                                'description': description,
                                'keywords': keywords,
                                'qualifications': qualifications,
                                'occurrence_date': occurrence_date,
                                'group_id': group_id,
                                'group_id_hashed': group_id_hashed
                            })

                        data.append({
                            'HitGroupStatus': {
                                'group_id': group_id,
                                'hits_available': hits_available,
                                'page_number': page_number,
                                'inpage_position': i_group + 1,
                                'hit_expiration_date': hit_expiration_date,
                                'hit_group_content': hit_group_content
                            }
                        })

                    fields = None
                    group_html = None

                except:
                    logging.error("Failed to process group %s on %s page (%s)" %
                                  (i_group, page_number, sys.exc_info()[0].__name__))
                    errors.append(grab_error(sys.exc_info()))
                    print grab_error(sys.exc_info())

            table = None
            soup = None
            html = None

        except:
            logging.error("Failed to process page %d (%s)" %
                          (page_number, sys.exc_info()[0].__name__))
            errors.append(grab_error(sys.exc_info()))
            print grab_error(sys.exc_info())

    return {'data': data, 'errors': errors}
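# A rough sketch of the dictionary shape callback_allhit() returns and how
# run() aggregates it. All field values below are invented placeholders, not
# real crawl data, and 'hit_group_content' would normally hold a
# HitGroupContent instance rather than None.
if __name__ == '__main__':
    import datetime
    example_result = {
        'data': [{
            'HitGroupStatus': {
                'group_id': '0000000000placeholder0000000000',
                'hits_available': 12,
                'page_number': 1,
                'inpage_position': 3,
                'hit_expiration_date': datetime.datetime(2011, 1, 31),
                'hit_group_content': None,
            }
        }],
        'errors': [],
    }
    hits_downloaded = sum(hgs['HitGroupStatus']['hits_available']
                          for hgs in example_result['data'])
    print hits_downloaded   # 12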