def handle(self, **options):
    self.start_time = time.time()
    pid = Pid('remove_bad_crawl_related', True)

    self.having_hits_mv = not options.get('all')
    self.chunk_size = options.get('chunk-size')
    self.chunked = not options.get('simple')
    self.limit = options.get('limit')
    self.fix_interrupted = options.get('fix-interrupted')

    self.crawl_count = self.get_crawls_count()
    if options.get('count-only') or self.crawl_count == 0:
        self.handle_count_only()

    if self.fix_interrupted:
        self.update_interrupted_crawl_stats()

    # if limit is specified, show X/Y instead of just Y
    log.info('Starting bad crawl related data removal, {0}{1} records will '
             'be processed.'.format(
                 '{0}/'.format(self.limit) if self.limit else '',
                 self.crawl_count))

    ids = self.get_crawl_ids()
    deleted = self.do_deletes(ids)

    log.info('Command took: {0}, {1} crawls processed.'.format(
        self.time_elapsed(), deleted))
    pid.remove_pid()
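
# --- Illustrative helper (assumption, not part of the original source) ---
# Every command in this section guards itself with a `Pid` pidfile helper
# whose implementation is not shown here. The minimal sketch below only
# mirrors how it is used above and below (a name, an optional "quit if
# already running" flag, an `actual_pid` attribute and `remove_pid()`); the
# project's real class and pidfile location may well differ.
import os
import sys
import tempfile


class Pid(object):
    """Minimal pidfile guard sketch (assumed API)."""

    def __init__(self, name, quit_if_running=False):
        self.path = os.path.join(tempfile.gettempdir(), '%s.pid' % name)
        self.actual_pid = os.getpid()
        if quit_if_running and os.path.exists(self.path):
            # another instance appears to be running, refuse to start
            sys.exit('%s is already running (pidfile %s exists).' % (
                name, self.path))
        with open(self.path, 'w') as pidfile:
            pidfile.write(str(self.actual_pid))

    def remove_pid(self):
        # release the lock so the next run can start
        if os.path.exists(self.path):
            os.remove(self.path)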
def handle(self, **options):
    self.process_options(options)
    if options.get('verbosity') == 0:
        self.logger.setLevel(logging.DEBUG)

    pid = Pid(self.pidfile, True)
    start_time = time.time()
    try:
        cur = connection.cursor()
        self.logger.info('Calling {0}({1}, {2}), start time: {3}.'.format(
            self.proc_name, self.start, self.end, now()))
        cur.callproc(self.proc_name, self.get_proc_args())
        transaction.commit_unless_managed()
        self.logger.info('{0} for crawls from {1} to {2} took: {3}.'.format(
            self.proc_name, self.start, self.end, time.time() - start_time))
    except Exception as e:
        self.logger.exception(e)
    finally:
        pid.remove_pid()
def handle(self, **options):
    pid = Pid('mturk_agregates', True)
    start_time = time.time()
    log.info('Updating crawl aggregates')
    update_crawl_agregates(only_new=True)
    log.info('db_update_agregates took: %s' % (time.time() - start_time))
    pid.remove_pid()
def handle(self, **options):
    pid = Pid("mturk_aggregates", True)
    self.process_options(options)
    start_time = time.time()
    log.info("Updating crawl aggregates")
    update_crawl_agregates(start=self.start, end=self.end,
                           clear_existing=self.clear_existing)
    log.info("db_update_agregates took: %s" % (time.time() - start_time))
    pid.remove_pid()
def handle(self, **options):
    """Main command entry point."""
    self.options = options
    # with --list we do nothing and go straight to print_status
    if not self.options['list']:
        pid = Pid(self.options.get('pidfile'), True)
        self.prepare_options()  # sets self.reports and prints errors if any
        if self.reports:
            # handle_cache runs only when handle_purge returns a falsy value
            self.handle_purge() or self.handle_cache()
        pid.remove_pid()
    self.print_status()
def handle(self, **options):
    pid = Pid('mtur_aggregates')
    self.process_options(options)
    start_time = time.time()
    log.info('Refreshing hits_mv')
    update_mviews(clear_existing=self.clear_existing, force=self.force,
                  start=self.start, end=self.end)
    log.info('Done refreshing hits_mv, db_refresh_mviews took: {0}s.'.format(
        time.time() - start_time))
    pid.remove_pid()
def handle(self, **options):
    pid = Pid('mturk_crawler', True)
    start_time = time.time()
    log.info('Removing duplicate hitgroupcontent and hitgroupstatuses.')
    clean_duplicates()
    log.info('Refreshing hits_mv')
    update_mviews()
    log.info('Done refreshing hits_mv')
    log.info('db_refresh_mviews took: %s' % (time.time() - start_time))
    pid.remove_pid()
def handle(self, **options):
    pid = Pid('mturk_cache_topreq', True)

    report_type = options.get('report-type')
    if report_type not in ToprequestersReport.values:
        log.info('Unknown report type: "{0}".'.format(report_type))
        return

    key = ToprequestersReport.get_cache_key(report_type)
    display_name = ToprequestersReport.display_names[report_type]

    if cache.get(key) is None:
        log.info('"{0}" toprequesters report missing, recalculating.'.format(
            display_name))
    else:
        if options['force']:
            log.info('Recalculating "{0}" toprequesters report.'.format(
                display_name))
        else:
            log.info('"{0}" toprequesters still in cache, use --force flag'
                     ' to rebuild anyway.'.format(display_name))
            return

    days = options['days']
    # not in the cache, run the report query
    start_time = time.time()
    data = ToprequestersReport.REPORT_FUNCTION[report_type](days)
    log.info('Toprequesters report "{0}" generated in: {1}s.'.format(
        display_name, time.time() - start_time))

    # too often we get no information on the success of caching
    if not data:
        log.warning('Data returned by report function is {0}!'.format(data))
    else:
        cache.set(key, data, HOURS4)
        in_cache = cache.get(key)
        if in_cache is None:
            log.warning('Cache error - data could not be fetched!')

    pid.remove_pid()
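
# --- Illustrative helper (assumption, not part of the original source) ---
# The top-requesters command above expects a `ToprequestersReport` registry
# exposing `values`, `display_names`, `REPORT_FUNCTION` and `get_cache_key`.
# The sketch below only reflects those usages; the concrete report type ids,
# display names, report functions and cache key format are made up here.
class ToprequestersReport(object):

    AVAILABLE, RECENT = range(2)  # hypothetical report type ids
    values = [AVAILABLE, RECENT]

    display_names = {
        AVAILABLE: 'available hit groups',
        RECENT: 'recently posted hit groups',
    }

    # maps report type -> callable taking the number of days to cover;
    # placeholders stand in for the real report queries
    REPORT_FUNCTION = {
        AVAILABLE: lambda days: [],
        RECENT: lambda days: [],
    }

    @classmethod
    def get_cache_key(cls, report_type):
        return 'toprequesters_report_{0}'.format(report_type)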
def handle(self, **options):
    pid = Pid('mturk_diffs', True)

    transaction.enter_transaction_management()
    transaction.managed(True)

    start_time = time.time()
    try:
        items = Crawl.objects.filter(
            is_spam_computed=False).order_by('-id')[:options['limit']]
        lenitems = len(items)
        log.info('Starting db_update_diffs, {0} crawls will be '
                 'updated.'.format(lenitems))
        for c in items:
            updated = update_cid(c.id)
            if updated > 0:
                c.has_diffs = True
                c.save()
            # commit after each crawl so finished work is kept on interrupt
            transaction.commit()
    except (KeyError, KeyboardInterrupt) as e:
        log.info('Exception, rolling back the transaction and exiting: '
                 '{0}'.format(e))
        transaction.rollback()
        pid.remove_pid()
        exit()

    log.info('Success! Updating {0} crawls took: {1} s'.format(
        lenitems, time.time() - start_time))
    pid.remove_pid()
def handle(self, *args, **options):
    self.mturk_email = getattr(settings, 'MTURK_AUTH_EMAIL', None)
    self.mturk_password = getattr(settings, 'MTURK_AUTH_PASSWORD', None)

    _start_time = time.time()
    pid = Pid('mturk_crawler', True)

    log.info('crawler started: %s;;%s', args, options)

    if options.get('mturk_email'):
        self.mturk_email = options['mturk_email']
    if options.get('mturk_password'):
        self.mturk_password = options['mturk_password']

    if options.get('logconf', None):
        self.setup_logging(options['logconf'])

    if options.get('debug', False):
        self.setup_debug()
        print 'Current process pid: %s' % pid.actual_pid
        print ('To debug, type: python -c "import os,signal; '
               'os.kill(%s, signal.SIGUSR1)"\n') % pid.actual_pid

    self.maxworkers = options['workers']
    if self.maxworkers > 9:
        # If you want to remove this limit, don't forget to change the dbpool
        # object's maximum number of connections. Each worker should fetch
        # 10 hitgroups and spawn a single task for every one of them, and
        # each task gets a private connection instance. So for 9 workers
        # it's already 9x10 = 90 connections required.
        #
        # Also, with too many workers amazon isn't returning valid data
        # and retrying takes much longer than using a smaller number of
        # workers.
        sys.exit('Too many workers (more than 9). Quit.')
    start_time = datetime.datetime.now()

    hits_available = tasks.hits_mainpage_total()
    groups_available = tasks.hits_groups_total()

    # create crawl object that will be filled with data later
    crawl = Crawl.objects.create(
        start_time=start_time,
        end_time=start_time,
        success=True,
        hits_available=hits_available,
        hits_downloaded=0,
        groups_available=groups_available,
        groups_downloaded=groups_available)
    log.debug('fresh crawl object created: %s', crawl.id)

    # fetch the requester profiles so we can decide if their hitgroups
    # are public or not
    requesters = RequesterProfile.objects.all_as_dict()

    dbpool = ThreadedConnectionPool(
        10, 90, 'dbname=%s user=%s password=%s' % (
            settings.DATABASES['default']['NAME'],
            settings.DATABASES['default']['USER'],
            settings.DATABASES['default']['PASSWORD']))

    # collection of group_ids that were already processed - this should
    # protect us from duplicating data
    processed_groups = set()
    total_reward = 0

    hitgroups_iter = self.hits_iter()
    for hg_pack in hitgroups_iter:
        jobs = []
        for hg in hg_pack:
            if hg['group_id'] in processed_groups:
                log.debug('Group already in processed_groups, skipping.')
                continue
            processed_groups.add(hg['group_id'])
            j = gevent.spawn(tasks.process_group,
                             hg, crawl.id, requesters, processed_groups,
                             dbpool)
            jobs.append(j)
            total_reward += hg['reward'] * hg['hits_available']

        log.debug('processing pack of hitgroups objects')
        gevent.joinall(jobs,
                       timeout=settings.CRAWLER_GROUP_PROCESSING_TIMEOUT)

        # check if all jobs ended successfully
        for job in jobs:
            if not job.ready():
                log.error('Killing job: %s', job)
                job.kill()

        if len(processed_groups) >= groups_available:
            # there's no need to iterate over empty groups.. break
            log.info('Skipping empty groups.')
            break

        # amazon does not like too many requests at once, so give them a
        # quick rest...
        gevent.sleep(1)

    dbpool.closeall()

    # update the crawl object
    crawl.groups_downloaded = len(processed_groups)
    crawl.end_time = datetime.datetime.now()
    crawl.save()

    work_time = time.time() - _start_time
    log.info("""Crawl finished:
        created crawl id: {crawl_id}
        total reward value: {total_reward}
        hits groups downloaded: {processed_groups}
        hits groups available: {groups_available}
        work time: {work_time:.2f} seconds
        """.format(crawl_id=crawl.id, total_reward=total_reward,
                   processed_groups=len(processed_groups),
                   groups_available=groups_available,
                   work_time=work_time))

    crawl_downloaded_pc = settings.INCOMPLETE_CRAWL_THRESHOLD
    crawl_warning_pc = settings.INCOMPLETE_CRAWL_WARNING_THRESHOLD
    crawl_time_warning = settings.CRAWLER_TIME_WARNING
    downloaded_pc = float(crawl.groups_downloaded) / groups_available

    if work_time > crawl_time_warning:
        log.warning('Crawl took {0}s which seems a bit too long (more than '
                    '{1}s). You might consider checking if the correct mturk '
                    'account is used; ignore this if a high number of groups '
                    'is experienced.'.format(work_time, crawl_time_warning))
    if downloaded_pc < crawl_warning_pc:
        log.warning('Only {0}% of hit groups were downloaded (below the {1}% '
                    'warning threshold). Please check the mturk account '
                    'configuration and/or whether there are any '
                    'network-related problems.'.format(
                        downloaded_pc, crawl_warning_pc))
    if downloaded_pc < crawl_downloaded_pc:
        log.warning('This crawl contains far too few groups downloaded to '
                    'available: {0}% < {1}% downloaded threshold and will be '
                    'considered erroneous ({2}/{3} groups).'.format(
                        downloaded_pc, crawl_downloaded_pc,
                        crawl.groups_downloaded, groups_available))

    pid.remove_pid()
def run(self):
    pid = Pid('mturk_crawler', True)

    log.info('Crawler started')

    start_time = datetime.datetime.now()

    # Fetching statistical information about groups and HITs count
    log.debug("Fetching stats")
    main_response = urllib2.urlopen(get_allhit_url())
    main_html = main_response.read()
    main_soup = BeautifulSoup(
        main_html,
        parseOnlyThese=SoupStrainer(
            text=re.compile("(^[0-9,]+ HITs|of [0-9]+ Results)")))
    main_stats = [tag for tag in main_soup]
    hits_available = -1
    groups_available = -1
    if len(main_stats) > 1:
        hits_available_tmp = main_stats[0]
        hits_available_tmp = hits_available_tmp[
            :hits_available_tmp.find(' ')].replace(',', '')
        hits_available = int(hits_available_tmp)
        groups_available_tmp = main_stats[1]
        groups_available_tmp = groups_available_tmp[
            groups_available_tmp.find('of') + 3:
            groups_available_tmp.find('Results') - 1]
        groups_available = int(groups_available_tmp)
    main_soup = None

    # Fetching data from every mturk.com HITs list page
    log.debug("Allhit processing")
    result_allhit = self.process_values(
        range(1, self.get_max_page(main_html) + 1),
        callback_allhit,
        self.processes_count)
    self.data = result_allhit['data']
    self.append_errors(result_allhit['errors'])

    # Fetching html details for every HIT group
    log.debug("Details processing")
    result_details = self.process_values(self.data, callback_details,
                                         self.processes_count)
    self.data = result_details['data']
    self.append_errors(result_details['errors'])

    hits_downloaded = sum(
        [hgs['HitGroupStatus']['hits_available'] for hgs in self.data])
    groups_downloaded = len(self.data)

    # Logging crawl information into the database
    success = False
    if (groups_downloaded > 0 and hits_downloaded > 0
            and groups_available / groups_downloaded <= 1.5
            and hits_available / hits_downloaded <= 1.5):
        success = True
    log.debug("Crawl finished with success=%s. Saving main_crawl entry"
              % success)
    crawl = Crawl(**{
        'start_time': start_time,
        'end_time': datetime.datetime.now(),
        'success': success,
        'hits_available': hits_available,
        'hits_downloaded': hits_downloaded,
        'groups_available': groups_available,
        'groups_downloaded': groups_downloaded,
        #'errors': str(self.errors)  # !
        'errors': ''
    })
    crawl.save()

    # Adding the crawl FK
    log.debug("Adding FKs")
    result_add_crawlfk = self.process_values(self.data, callback_add_crawlfk,
                                             crawl=crawl)
    self.data = result_add_crawlfk['data']
    self.append_errors(result_add_crawlfk['errors'])

    # Saving results in the database
    log.debug("Saving results")
    result_save_database = self.process_values(self.data, callback_database)
    self.append_errors(result_save_database['errors'])

    print self.errors

    log.info(
        "Crawler finished %ssuccessfully in %s with %d results, %d HITs "
        "(of %d and %d) and %d errors" % (
            "" if success else "un",
            datetime.datetime.now() - start_time,
            groups_downloaded,
            hits_downloaded,
            groups_available,
            hits_available,
            len(self.errors)))

    pid.remove_pid()
def handle(self, **options):
    pid = Pid(self.pid_file) if self.pid_file else None
    self.start_time = time.time()
    self.process_options(options)
    try:
        self.prepare_data()
        # query crawls in the period we want to process
        crawls = self.get_crawls()
        self.total_count = len(crawls)
        if self.total_count < self.min_crawls:
            self.log.info("Not enough crawls to process.")
            return

        done = 0
        self.log.info("""
            Starting {6}.
            {0} crawls will be processed in chunks of {3} (overlap: {7}).
            -- {1} to
            -- {2}, id from {4} to {5}.
            """.format(self.total_count,
                       self.start.strftime('%y-%m-%d %H:%M:%S'),
                       self.end.strftime('%y-%m-%d %H:%M:%S'),
                       self.chunk_size, crawls[0].id,
                       crawls[self.total_count - 1].id,
                       self.display_name, self.overlap))

        # iterate over overlapping chunks of the crawls list
        for chunk in self.chunks(crawls, self.chunk_size,
                                 overlap=self.overlap):
            start, end = (chunk[-1].start_time, chunk[0].start_time)
            self.log.info('Chunk of {0} crawls: {1}\nstart_time {2} to '
                          '{3}.'.format(len(chunk), [c.id for c in chunk],
                                        start.strftime('%y-%m-%d %H:%M:%S'),
                                        end.strftime('%y-%m-%d %H:%M:%S')))
            chunk_time = time.time()
            if not self.process_chunk(start, end, chunk):
                break
            chunk_time = time.time() - chunk_time
            self.store_chunk_time(chunk_time)
            done += len(chunk) - self.overlap
            self.log.info('\n chunk {0} \n total {1} '
                          '\n ETA {4}, {2}/{3} done, '.format(
                              humanized_time(chunk_time),
                              humanized_time(self.get_elapsed()),
                              done, self.total_count - self.overlap,
                              humanized_time(self.get_eta())))
    except Exception as e:
        self.log.exception(e)
    else:
        self.log.info('{0} crawls processed in {1}s, exiting.'.format(
            self.total_count, self.get_elapsed()))
    finally:
        if pid:
            pid.remove_pid()
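
# --- Illustrative helper (assumption, not part of the original source) ---
# The chunked command above iterates `self.chunks(crawls, self.chunk_size,
# overlap=self.overlap)`. The generator below is a sketch matching that call
# signature: it yields consecutive slices of `size` items, each sharing
# `overlap` items with the previous slice. The project's own helper may be
# implemented differently.
def chunks(sequence, size, overlap=0):
    """Yield overlapping, fixed-size slices of `sequence`."""
    if size <= overlap:
        raise ValueError('size must be greater than overlap')
    step = size - overlap
    for i in range(0, len(sequence), step):
        yield sequence[i:i + size]
        if i + size >= len(sequence):
            # the last slice already reached the end of the sequence
            break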
def handle(self, **options):
    """Take the ${limit} most recent crawls without spam classification,
    classify all their hit groups, update hits_mv to carry the proper
    classification, rebuild crawl aggregates for each crawl and refresh
    memcache.
    """
    service = get_prediction_service()

    pid = Pid('classify_spam', True)

    transaction.enter_transaction_management()
    transaction.managed(True)

    start_time = time.time()

    try:
        number_of_predictions = 0
        for c in list(Crawl.objects.filter(is_spam_computed=False)
                .order_by('-id')[:options['limit']]):
            log.info("processing %s", c)
            spam = set([])
            not_spam = set([])
            updated = 0

            for row in query_to_dicts(
                    """select content_id, group_id, is_spam from hits_mv
                       where crawl_id = %s""", c.id):
                log.info("classifying crawl_id: %s, %s", c.id, row)
                if row['is_spam'] is None:
                    is_spam = None
                    content = HitGroupContent.objects.get(
                        id=row['content_id'])
                    if content.is_spam is None:
                        data = content.prepare_for_prediction()
                        body = {'input': {'csvInstance': data}}
                        prediction = service.predict(
                            body=body, data=options['file']).execute()
                        number_of_predictions += 1
                        updated += 1
                        content.is_spam = prediction['outputLabel'] != 'No'
                        content.save()

                    execute_sql(
                        "update hits_mv set is_spam = %s "
                        "where crawl_id = %s and group_id = '%s'" % (
                            'true' if content.is_spam else 'false',
                            c.id, row['group_id']))
                    transaction.commit()

                    if content.is_spam:
                        log.info("detected spam for %s", row)
                        spam.add(str(row['content_id']))
                    else:
                        not_spam.add(str(row['content_id']))
                else:
                    log.info("is_spam already computed for %s" % row)

            if updated > 0:
                c.is_spam_computed = True
                c.save()

            log.info("done classifying crawl")

            execute_sql("""UPDATE main_crawlagregates
                set spam_projects =
                    (select count(*) from hits_mv
                     where crawl_id = %s and is_spam = true)
                where crawl_id = %s""" % (c.id, c.id))

            transaction.commit()
            log.info("done processing %s", c)
    except (KeyError, KeyboardInterrupt, HttpError), e:
        log.error(e)
        transaction.rollback()
        pid.remove_pid()
        exit()

    pid.remove_pid()
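
# --- Illustrative helpers (assumption, not part of the original source) ---
# classify_spam above relies on `query_to_dicts` and `execute_sql` utilities
# that are not shown in this section. The sketches below are thin wrappers
# around Django's raw cursor written to match how they are called above;
# the project's own utilities may add logging or transaction handling.
from django.db import connection


def execute_sql(sql, *args):
    """Execute a raw SQL statement and return the open cursor."""
    cursor = connection.cursor()
    cursor.execute(sql, args or None)
    return cursor


def query_to_dicts(sql, *args):
    """Run a raw query and yield every row as a column-name -> value dict."""
    cursor = execute_sql(sql, *args)
    columns = [col[0] for col in cursor.description]
    for row in cursor.fetchall():
        yield dict(zip(columns, row))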