def handle(self, *args, **options):
    self.mturk_email = getattr(settings, 'MTURK_AUTH_EMAIL', None)
    self.mturk_password = getattr(settings, 'MTURK_AUTH_PASSWORD', None)

    _start_time = time.time()
    pid = Pid('mturk_crawler', True)
    log.info('crawler started: %s;;%s', args, options)

    if options.get('mturk_email'):
        self.mturk_email = options['mturk_email']
    if options.get('mturk_password'):
        self.mturk_password = options['mturk_password']

    if options.get('logconf', None):
        self.setup_logging(options['logconf'])

    if options.get('debug', False):
        self.setup_debug()
        print 'Current process pid: %s' % pid.actual_pid
        print ('To debug, type: python -c "import os,signal; '
               'os.kill(%s, signal.SIGUSR1)"\n') % pid.actual_pid

    self.maxworkers = options['workers']
    if self.maxworkers > 9:
        # If you want to remove this limit, don't forget to raise the
        # maximum number of connections on the dbpool object. Each worker
        # fetches 10 hitgroups and spawns a single task for each of them,
        # and every task gets its own connection instance, so 9 workers
        # already require 9x10 = 90 connections.
        #
        # Also, with too many workers amazon stops returning valid data,
        # and retrying takes much longer than using a smaller number of
        # workers.
        sys.exit('Too many workers (more than 9). Quit.')
    start_time = datetime.datetime.now()

    hits_available = tasks.hits_mainpage_total()
    groups_available = tasks.hits_groups_total()

    # create crawl object that will be filled with data later
    crawl = Crawl.objects.create(
        start_time=start_time,
        end_time=start_time,
        success=True,
        hits_available=hits_available,
        hits_downloaded=0,
        groups_available=groups_available,
        groups_downloaded=groups_available)
    log.debug('fresh crawl object created: %s', crawl.id)

    # fetch requester profiles so we can decide whether their hitgroups
    # are public or not
    reqesters = RequesterProfile.objects.all_as_dict()

    dbpool = ThreadedConnectionPool(10, 90,
        'dbname=%s user=%s password=%s' % (
            settings.DATABASES['default']['NAME'],
            settings.DATABASES['default']['USER'],
            settings.DATABASES['default']['PASSWORD']))

    # collection of group_ids that were already processed - this should
    # protect us from duplicating data
    processed_groups = set()
    total_reward = 0

    hitgroups_iter = self.hits_iter()
    for hg_pack in hitgroups_iter:
        jobs = []
        for hg in hg_pack:
            if hg['group_id'] in processed_groups:
                log.debug('Group already in processed_groups, skipping.')
                continue
            processed_groups.add(hg['group_id'])
            j = gevent.spawn(tasks.process_group,
                             hg, crawl.id, reqesters, processed_groups,
                             dbpool)
            jobs.append(j)
            total_reward += hg['reward'] * hg['hits_available']
        log.debug('processing pack of hitgroups objects')
        gevent.joinall(
            jobs, timeout=settings.CRAWLER_GROUP_PROCESSING_TIMEOUT)

        # check if all jobs ended successfully
        for job in jobs:
            if not job.ready():
                log.error('Killing job: %s', job)
                job.kill()

        if len(processed_groups) >= groups_available:
            log.info('Skipping empty groups.')
            # there's no need to iterate over empty groups
            break

        # amazon does not like too many requests at once, so give them a
        # quick rest...
        gevent.sleep(1)

    dbpool.closeall()

    # update crawler object
    crawl.groups_downloaded = len(processed_groups)
    crawl.end_time = datetime.datetime.now()
    crawl.save()

    work_time = time.time() - _start_time
    log.info("""Crawl finished:
        created crawl id: {crawl_id}
        total reward value: {total_reward}
        hits groups downloaded: {processed_groups}
        hits groups available: {groups_available}
        work time: {work_time:.2f} seconds
        """.format(crawl_id=crawl.id, total_reward=total_reward,
                   processed_groups=len(processed_groups),
                   groups_available=groups_available, work_time=work_time))

    crawl_downloaded_pc = settings.INCOMPLETE_CRAWL_THRESHOLD
    crawl_warning_pc = settings.INCOMPLETE_CRAWL_WARNING_THRESHOLD
    crawl_time_warning = settings.CRAWLER_TIME_WARNING
    downloaded_pc = float(crawl.groups_downloaded) / groups_available

    if work_time > crawl_time_warning:
        log.warning(("Crawl took {0}s, which seems a bit too long (more "
            "than {1}s); you might want to check whether the correct "
            "mturk account is used. Ignore this if an unusually high "
            "number of groups was crawled.").format(
                work_time, crawl_time_warning))
    if downloaded_pc < crawl_warning_pc:
        log.warning(("Only {0}% of hit groups were downloaded (below the "
            "{1}% warning threshold); please check the mturk account "
            "configuration and/or whether there are any network-related "
            "problems.").format(downloaded_pc, crawl_warning_pc))
    if downloaded_pc < crawl_downloaded_pc:
        log.warning(("This crawl downloaded far too few of the available "
            "groups: {0}% < {1}% threshold; it will be considered "
            "erroneous ({2}/{3} groups).").format(
                downloaded_pc, crawl_downloaded_pc,
                crawl.groups_downloaded, groups_available))

    pid.remove_pid()
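
# The timeout and threshold names used above are plain Django settings. Their
# actual values are not defined in this excerpt; the sketch below is an
# assumption, reusing the hard-coded numbers that appear elsewhere in this
# command (timeout=20, 300 s, 0.9 and 0.6) as illustrative defaults.
#
# settings.py (sketch):
# CRAWLER_GROUP_PROCESSING_TIMEOUT = 20     # seconds passed to gevent.joinall
# CRAWLER_TIME_WARNING = 300                # warn when a crawl runs longer than this
# INCOMPLETE_CRAWL_WARNING_THRESHOLD = 0.9  # warn below this downloaded/available ratio
# INCOMPLETE_CRAWL_THRESHOLD = 0.6          # mark the crawl erroneous below this ratio
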
def handle(self, *args, **options):
    _start_time = time.time()
    pid = Pid('mturk_crawler', True)
    log.info('crawler started: %s;;%s', args, options)

    if options.get('mturk_email'):
        self.mturk_email = options['mturk_email']
    if options.get('mturk_password'):
        self.mturk_password = options['mturk_password']

    if options.get('logconf', None):
        self.setup_logging(options['logconf'])

    if options.get('debug', False):
        self.setup_debug()
        print 'Current process pid: %s' % pid.actual_pid
        print 'To debug, type: python -c "import os,signal; os.kill(%s, signal.SIGUSR1)"\n' % \
            pid.actual_pid

    self.maxworkers = options['workers']
    if self.maxworkers > 9:
        # If you want to remove this limit, don't forget to raise the
        # maximum number of connections on the dbpool object. Each worker
        # fetches 10 hitgroups and spawns a single task for each of them,
        # and every task gets its own connection instance, so 9 workers
        # already require 9x10 = 90 connections.
        #
        # Also, with too many workers amazon stops returning valid data,
        # and retrying takes much longer than using a smaller number of
        # workers.
        sys.exit('Too many workers (more than 9). Quit.')
    start_time = datetime.datetime.now()

    hits_available = tasks.hits_mainpage_total()
    groups_available = tasks.hits_groups_total()

    # create crawl object that will be filled with data later
    crawl = Crawl.objects.create(start_time=start_time,
        end_time=datetime.datetime.now(),
        success=True,
        hits_available=hits_available,
        hits_downloaded=0,
        groups_available=groups_available,
        groups_downloaded=groups_available)
    log.debug('fresh crawl object created: %s', crawl.id)

    # fetch requester profiles so we can decide whether their hitgroups
    # are public or not
    reqesters = RequesterProfile.objects.all_as_dict()

    # collection of group_ids that were already processed - this should
    # protect us from duplicating data
    processed_groups = set()
    total_reward = 0

    hitgroups_iter = self.hits_iter()
    for hg_pack in hitgroups_iter:
        jobs = []
        for hg in hg_pack:
            j = gevent.spawn(tasks.process_group,
                             hg, crawl.id, reqesters, processed_groups)
            jobs.append(j)
            total_reward += hg['reward'] * hg['hits_available']
        log.debug('processing pack of hitgroups objects')
        gevent.joinall(jobs, timeout=20)

        # check if all jobs ended successfully
        for job in jobs:
            if not job.ready():
                log.error('Killing job: %s', job)
                job.kill()

        if len(processed_groups) >= groups_available:
            # there's no need to iterate over empty groups
            break

        # amazon does not like too many requests at once, so give them a
        # quick rest...
        gevent.sleep(1)

    # update crawler object
    crawl.groups_downloaded = len(processed_groups)
    crawl.end_time = datetime.datetime.now()
    crawl.save()

    work_time = time.time() - _start_time
    log.info('created crawl id: %s', crawl.id)
    log.info('total reward value: %s', total_reward)
    log.info('processed hits groups downloaded: %s', len(processed_groups))
    log.info('processed hits groups available: %s', groups_available)
    log.info('work time: %.2fsec', work_time)
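
# setup_debug() is not shown in this excerpt. Judging from the hint printed
# above ("os.kill(<pid>, signal.SIGUSR1)"), it presumably installs a SIGUSR1
# handler so a stuck crawl can be inspected without killing it. A minimal
# sketch of such a hook, written as a hypothetical free function (an
# assumption, not the project's actual implementation):
def _install_sigusr1_stack_dump():
    import signal
    import traceback

    def debug_handler(signum, frame):
        # dump the stack of the interrupted frame so we can see where
        # the crawler is currently stuck
        log.info('SIGUSR1 received:\n%s',
                 ''.join(traceback.format_stack(frame)))

    signal.signal(signal.SIGUSR1, debug_handler)
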
def handle(self, *args, **options):
    self.mturk_email = getattr(settings, 'MTURK_AUTH_EMAIL', None)
    self.mturk_password = getattr(settings, 'MTURK_AUTH_PASSWORD', None)

    _start_time = time.time()
    pid = Pid('mturk_crawler', True)
    log.info('crawler started: %s;;%s', args, options)

    if options.get('mturk_email'):
        self.mturk_email = options['mturk_email']
    if options.get('mturk_password'):
        self.mturk_password = options['mturk_password']

    if options.get('logconf', None):
        self.setup_logging(options['logconf'])

    if options.get('debug', False):
        self.setup_debug()
        print 'Current process pid: %s' % pid.actual_pid
        print ('To debug, type: python -c "import os,signal; '
               'os.kill(%s, signal.SIGUSR1)"\n') % pid.actual_pid

    self.maxworkers = options['workers']
    if self.maxworkers > 9:
        # If you want to remove this limit, don't forget to raise the
        # maximum number of connections on the dbpool object. Each worker
        # fetches 10 hitgroups and spawns a single task for each of them,
        # and every task gets its own connection instance, so 9 workers
        # already require 9x10 = 90 connections.
        #
        # Also, with too many workers amazon stops returning valid data,
        # and retrying takes much longer than using a smaller number of
        # workers.
        sys.exit('Too many workers (more than 9). Quit.')
    start_time = datetime.datetime.now()

    hits_available = tasks.hits_mainpage_total()
    groups_available = tasks.hits_groups_total()

    # create crawl object that will be filled with data later
    crawl = Crawl.objects.create(
        start_time=start_time,
        end_time=datetime.datetime.now(),
        success=True,
        hits_available=hits_available,
        hits_downloaded=0,
        groups_available=groups_available,
        groups_downloaded=groups_available)
    log.debug('fresh crawl object created: %s', crawl.id)

    # fetch requester profiles so we can decide whether their hitgroups
    # are public or not
    reqesters = RequesterProfile.objects.all_as_dict()

    dbpool = ThreadedConnectionPool(10, 90,
        'dbname=%s user=%s password=%s' % (settings.DATABASE_NAME,
                                           settings.DATABASE_USER,
                                           settings.DATABASE_PASSWORD))

    # collection of group_ids that were already processed - this should
    # protect us from duplicating data
    processed_groups = set()
    total_reward = 0

    hitgroups_iter = self.hits_iter()
    for hg_pack in hitgroups_iter:
        jobs = []
        for hg in hg_pack:
            j = gevent.spawn(tasks.process_group,
                             hg, crawl.id, reqesters, processed_groups,
                             dbpool)
            jobs.append(j)
            total_reward += hg['reward'] * hg['hits_available']
        log.debug('processing pack of hitgroups objects')
        gevent.joinall(jobs, timeout=20)

        # check if all jobs ended successfully
        for job in jobs:
            if not job.ready():
                log.error('Killing job: %s', job)
                job.kill()

        if len(processed_groups) >= groups_available:
            log.info('Skipping empty groups.')
            # there's no need to iterate over empty groups
            break

        # amazon does not like too many requests at once, so give them a
        # quick rest...
        gevent.sleep(1)

    dbpool.closeall()

    # update crawler object
    crawl.groups_downloaded = len(processed_groups)
    crawl.end_time = datetime.datetime.now()
    crawl.save()

    work_time = time.time() - _start_time
    log.info('created crawl id: %s', crawl.id)
    log.info('total reward value: %s', total_reward)
    log.info('processed hits groups downloaded: %s', len(processed_groups))
    log.info('processed hits groups available: %s', groups_available)
    log.info('work time: %.2fsec', work_time)

    crawl_time_warning = 300
    if work_time > crawl_time_warning:
        log.warning("Crawl took {0}s, which seems a bit too long (more "
            "than {1}s); you might want to check whether the correct "
            "mturk account is used.".format(work_time, crawl_time_warning))
    if crawl.groups_downloaded < groups_available * 0.9:
        log.warning('More than 10% of hit groups were not downloaded; '
            'please check the mturk account configuration and/or whether '
            'there are any network-related problems.')

    crawl_downloaded_pc = 0.6
    if crawl.groups_downloaded < groups_available * crawl_downloaded_pc:
        log.warning("This crawl contains far too few groups downloaded "
            "relative to available ({0} < {1} * {2}) and will be "
            "considered erroneous.".format(crawl.groups_downloaded,
                groups_available, crawl_downloaded_pc))