def test_get_max_dispatch(self):
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    ct1 = CrawlerTask(job=job1, status=CrawlerTask.STATUS_FAIL, task_generator=ctg1,
                      uri='http://www.fishc.com', args='i', from_host='1')
    ct1.save()
    # codestr2 = open('/Users/princetechs3/my_code/code2.sh','r').read()
    cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
    cd1.save()
    cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()
    max_retry_times = 1
    dispatch_num = 1
    down_tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_FAIL,
                                     retry_times__lte=max_retry_times)[:dispatch_num]
    self.assertTrue(down_tasks)

def save_task(self):
    uris = []
    val = URLValidator(self.schemes)
    out_f = open(self.out_path, "r")
    for line in out_f:
        self.content_bytes += len(line)
        try:
            js = json.loads(line)
        except ValueError:
            js = ast.literal_eval(line)
        if not isinstance(js, dict):
            # logging.error("The line %s is not dict or json"%(line))
            CrawlerGeneratorErrorLog(name="ERROR_URI",
                                     content="The line %s is not dict or json" % (line),
                                     hostname=socket.gethostname()).save()
            continue
        uri_data = []
        try:
            # validate uri
            if 'uri' in js:
                uri_de = js['uri'].encode("utf-8")
                val(uri_de)
                uri_data.append(uri_de)
                if 'args' in js:
                    uri_args = js['args'].encode("utf-8")
                    uri_data.append(uri_args)
                uri_str = str(uri_data)
                uris.append(uri_str)
            else:
                CrawlerGeneratorErrorLog(name="ERROR_JSON",
                                         content="JSON ValidationError without key 'uri' : %s" % (js),
                                         hostname=socket.gethostname()).save()
        except ValidationError:
            CrawlerGeneratorErrorLog(name="ERROR_URI",
                                     content="URI ValidationError: %s " % (js['uri']),
                                     hostname=socket.gethostname()).save()
    out_f.close()
    os.remove(self.out_path)
    dereplicated_uris = dereplicate_uris(uris)
    for uri_str in dereplicated_uris:
        try:
            eval_uri = eval(uri_str)
            uri = eval_uri[0]
            try:
                args = eval_uri[1]
            except IndexError:
                args = "No more args"
            crawler_task = CrawlerTask(job=self.job, task_generator=self.task_generator,
                                       uri=uri, args=args, from_host=socket.gethostname())
            # crawler_task.args = ""
            crawler_task.save()
        except:
            # logging.error("add %s failed: %s", line, traceback.format_exc(10))
            content = traceback.format_exc(10)
            CrawlerGeneratorErrorLog(name="ERROR_URI", content=content,
                                     hostname=socket.gethostname()).save()
    self.save_generate_log(CrawlerGeneratorLog.STATUS_SUCCESS, "After generating, save task succeed!")

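# A minimal sketch of the line format save_task() above appears to expect in
# self.out_path, inferred from the parsing code. The field names "uri" and
# "args" come from that code; the concrete values below are hypothetical.
import json

sample_lines = [
    json.dumps({"uri": "http://www.example.com/page/1", "args": "depth=1"}),
    # a line that is not valid JSON falls back to ast.literal_eval, so a
    # Python dict literal is also accepted
    "{'uri': 'http://www.example.com/page/2'}",
]
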
def insert_extracter_test_data(self):
    config = open('structure/extracters/conf_csciwlpc_local.json').read()
    analyzeddata = open('structure/extracters/analyzed_data_csci.json')
    count = 0
    for line in analyzeddata:
        count += 1
        test_job = JobMongoDB("creator", "job_%d" % count, "info", "customer",
                              random.choice(range(1, 3)), random.choice(range(-1, 6)),
                              datetime.datetime.now())
        test_job.save()
        # test_extracter = ExtracterGenerator.extracter
        test_extracter = Extracter(count, config)
        test_extracter.save()
        test_crawlertask = CrawlerTask(test_job, uri="test_uri_%d" % count, status=7)
        test_crawlertask.save()
        ExtracterStructureConfig(job=test_job, extracter=test_extracter).save()
        processed_line = json.loads(line)['analyzed_data']
        CrawlerAnalyzedData(crawler_task=test_crawlertask, analyzed_data=processed_line).save()
    print "Inserted %s Extracter Test Data" % count

def run():
    print 'Downloader dispatch start'
    if settings.DISPATCH_BY_PRIORITY:
        total = 0
        jobs = Job.objects(status=Job.STATUS_ON).order_by('+priority')
        print "All jobs Number:", jobs.count()
        for job in jobs:
            total = CrawlerTask.objects(job=job).count()
            print 'This job\'s tasks total number:', total
            dispatch_tasks_num = settings.MAX_TOTAL_DISPATCH_COUNT_ONCE  # number of tasks dispatched per pass (for testing)
            tasks = CrawlerTask.objects(job=job, status=1)[:dispatch_tasks_num]
            print "Tasks Count:", len(tasks)
            if len(tasks) > dispatch_tasks_num:
                print "Downloader dispatch Error: Tasks number over MAX_TOTAL_DISPATCH_COUNT_ONCE:", dispatch_tasks_num
                break
            count = 0
            for task in tasks:
                print "Downloader task dispatch :", count
                count += 1
                dispatch_use_pool(task)
            # pool.map(dispatch_use_pool, tasks)
            # pool.close()
            # pool.join()
        # tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_LIVE).order_by('job.priority')[:settings.MAX_TOTAL_DISPATCH_COUNT_ONCE]
    elif settings.DISPATCH_BY_HOSTNAME:
        # TODO: dispatch tasks grouped by hostname
        pass

def empty_test_data(self):
    JobMongoDB.drop_collection()
    Extracter.drop_collection()
    CrawlerTask.drop_collection()
    ExtracterStructureConfig.drop_collection()
    CrawlerAnalyzedData.drop_collection()
    CrawlerExtracterInfo.drop_collection()
    print "Extracter Test Data Cleaned!"

def insert_test_data():
    right_script = """class RawParser(object):
    def parser(self, crawlerdownloaddata):
        data = "JSON Format Data After Parsing"
        return data"""
    wrong_script = """class RawParser(object):
    def parser(self, crawlerdownloaddata):
        print crawlerdownloaddata.wrong"""
    try:
        user = User.objects.get(username='******')
    except:
        user = User.objects.create_user('user_name', 'user_email', 'password')
    for count in range(0, 100):
        status = random.choice(range(1, 3))
        priority = random.choice(range(-1, 6))
        test_job_mysql = JobMySQL(creator=user, name="job_%d" % count, info="info",
                                  customer="customer", status=status, priority=priority,
                                  add_datetime=django.utils.timezone.now())
        test_job_mysql.save()
        test_job_mongodb = JobMongoDB(creator="creator_%d" % count, name="job_%d" % count,
                                      info="info", customer="customer", status=status,
                                      priority=priority, add_datetime=datetime.datetime.now())
        test_job_mongodb.save()
        test_parser = Parser(parser_id=str(count),
                             python_script=random.choice([right_script, wrong_script]),
                             update_date=django.utils.timezone.now())
        test_parser.save()
        test_crawlertask = CrawlerTask(test_job_mongodb, uri="test_uri_%d" % count, status=5)
        test_crawlertask.save()
        test_downloader = CrawlerDownload(test_job_mongodb)
        test_downloader.save()
        StructureConfig(job=test_job_mysql, parser=test_parser,
                        job_copy_id=test_job_mongodb.id,
                        update_date=django.utils.timezone.now()).save()
        CrawlerDownloadData(test_job_mongodb, test_downloader, test_crawlertask).save()
    print "Data Inserted!"

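# Hedged sketch (not the project's actual loader): one way a stored
# python_script such as right_script above could be executed. It assumes the
# framework exec's the script text and then calls RawParser().parser() on a
# downloaded-data object; load_raw_parser and the exec mechanics here are
# illustrative only.
def load_raw_parser(python_script):
    namespace = {}
    exec python_script in namespace  # Python 2 exec statement
    return namespace['RawParser']()

# hypothetical usage:
# raw_parser = load_raw_parser(right_script)
# data = raw_parser.parser(crawlerdownloaddata)
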
def test_insert_enterprise_job(self):
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://重庆/重庆理必易投资管理有限公司/500905004651063/', args='i', from_host='1')
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/云南/昆明道岚投资中心(有限合伙)/500905004651063/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/云南/大理富民兴业股权投资基金管理有限公司/532910100007315/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/北京/北京众润投资基金管理有限公司/110105018837481/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/广东/深圳市郞润承泽资产管理有限公司/440301113021601/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/云南/美好置业集团股份有限公司/530000000006503/', args='i', from_host='1')
    # ct1.save()
    # # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/江苏/江苏银凤金革资产管理有限公司/320106000236597/', args='i', from_host='1')
    # # ct1.save()
    # # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/北京/北京汇泽融盛投资有限公司/110106013355060/', args='i', from_host='1')
    # # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/广西/柳州化工股份有限公司/***/', args='i', from_host='1')
    # ct1.save()
    ct1 = CrawlerTask(job=job1, task_generator=ctg1,
                      uri='enterprise://localhost/安徽/安徽省徽商集团化轻股份有限公司/***/',
                      args='i', from_host='1')
    ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/总局/瀚丰资本管理有限公司/100000000018983/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/江苏/江苏康耀资产管理有限公司/320125000170935/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/北京/北京匀丰资产管理有限公司/110105019391209/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/上海/中安富海投资管理有限公司/310108000565783/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/广东/深圳润阁投资管理有限公司/440301111930453/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/广东/深圳市金汇隆投资管理有限公司/440301109991545/', args='i', from_host='1')
    # ct1.save()
    # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/山东/山东正融资产管理有限公司/371300200058462/', args='i', from_host='1')
    # ct1.save()
    # codestr2 = open('/Users/princetechs3/my_code/code2.sh','r').read()
    cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
    cd1.save()
    cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()

def empty_test_data():
    JobMongoDB.drop_collection()
    JobMySQL.objects.all().delete()
    Parser.objects.all().delete()
    CrawlerTask.drop_collection()
    CrawlerDownload.drop_collection()
    StructureConfig.objects.all().delete()
    CrawlerDownloadData.drop_collection()
    CrawlerAnalyzedData.drop_collection()
    print "Data Cleaned!"

def test_insert_20000_uri_job(self):
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    for i in range(1, 11):
        job = Job(name='1%s' % (str(i)), info='2%s' % (str(i)),
                  customer='ddd%s' % (str(i)), priority=random.randint(-1, 5))
        job.save()
        ctg1 = CrawlerTaskGenerator(job=job, code='echo hello1', cron='* * * * *')
        ctg1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://重庆/重庆理必易投资管理有限公司/500905004651063/', args='i', from_host='1')
        for j in range(1000):
            ct1 = CrawlerTask(job=job, task_generator=ctg1, uri='http://www.baidu.com',
                              args='i', from_host='1')
            ct1.save()
            ct1 = CrawlerTask(job=job, task_generator=ctg1, uri='http://www.fishc.com',
                              args='i', from_host='1')
            ct1.save()
        cd1 = CrawlerDownload(job=job, code='codestr2', types=onetype)
        cd1.save()
        cds1 = CrawlerDownloadSetting(job=job, proxy='122', cookie='22', dispatch_num=50)
        cds1.save()

def setUp(self):
    TestCase.setUp(self)
    self.test_structuregenerator = StructureGenerator()
    self.test_count = CrawlerTask.objects.count()
    if self.test_count == 0:
        empty_test_data()
        insert_test_data()
    self.test_crawlertasks = CrawlerTask.objects()

def test_download(self):
    sys.path.append('/Users/princetechs3/my_code')
    onetype = CrawlerDownloadType(language='python')
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='http://www.baidu.com',
                      args='i', from_host='1')
    ct1.save()
    codestr1 = open('/Users/princetechs3/my_code/code1.py', 'r').read()
    cd1 = CrawlerDownload(job=job1, code=codestr1, types=onetype)
    cd1.save()
    cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()
    job = Job.objects(status=Job.STATUS_ON)[0]
    self.assertTrue(job)
    task = CrawlerTask.objects(job=job)[0]
    self.assertTrue(task)
    cd = CrawlerDownload.objects(job=task.job)[0]
    self.assertTrue(cd)
    self.assertTrue(cd.code)
    with open('/Users/princetechs3/my_code/jobcode1.py', 'w') as f:
        f.write(cd.code)
    self.exec_command('import jobcode1;jobcode1.run(%s)' % "'http://www.baidu.com'")
    # print cd.code
    self.assertEqual(cd.types.language, 'python')
    print cd.types.language

def refresh_failed_jobs():
    count = 0
    failed_tasks = CrawlerTask.objects(status=6)
    print "%d crawlertasks failed parsing now" % len(failed_tasks)
    for failed_task in failed_tasks:
        failed_task.update(status=5)
        count += 1
    print "Refresh %d failed crawlertasks (which failed for over max retry times)" % count
    logging.info(
        "Refresh %d failed crawlertasks (which failed for over max retry times)" % count)

def requeue_failed_jobs():
    # pending revision: disabled for now, bail out before doing any work
    return
    extracter_generator = ExtracterGenerator()
    failed_tasks = CrawlerTask.objects(status=8)
    if failed_tasks == None:
        logging.info("No failed extract jobs at this time")
        print "No failed extract jobs at this time"
    else:
        print "Current number of failed extract jobs: %d" % len(failed_tasks)
        count = 0
        for failed_task in failed_tasks:
            failed_data = CrawlerExtracterInfo.objects(crawler_task=failed_task).first()
            if failed_data.retry_times >= 3:
                pass
            else:
                failed_job_data = extracter_generator.get_task_analyzed_data(failed_task)
                extracterstructureconfig = ExtracterStructureConfig.objects(
                    job=failed_task.job).first()
                failed_job_conf = extracterstructureconfig.extracter.extracter_config
                failed_job_priority = extracter_generator.get_task_priority(failed_task)
                q = None
                if failed_job_priority == ExtracterConsts.QUEUE_PRIORITY_TOO_HIGH:
                    q = too_high_queue
                elif failed_job_priority == ExtracterConsts.QUEUE_PRIORITY_HIGH:
                    q = high_queue
                elif failed_job_priority == ExtracterConsts.QUEUE_PRIORITY_NORMAL:
                    q = normal_queue
                elif failed_job_priority == ExtracterConsts.QUEUE_PRIORITY_LOW:
                    q = low_queue
                else:
                    q = low_queue
                if (q.count + 1) > ExtracterConsts.QUEUE_MAX_LENGTH:
                    logging.error("Cannot requeue extract job because the queue: %s is full" % q.name)
                    print "Cannot requeue extract job because the queue: %s is full" % q.name
                    return None
                else:
                    q.enqueue_call(func=ExtracterGenerator.extracter,
                                   args=[failed_job_conf, failed_job_data])
                    failed_data_retry_times = failed_data.retry_times + 1
                    failed_data.update(retry_times=failed_data_retry_times)
                    count += 1
                    failed_task.update(status=7)
        print "%d failed extract jobs requeued successfully!" % count

def test_extract_function(self):
    test_crawlertask = CrawlerTask.objects().first()
    test_crawleranalyzeddata = CrawlerAnalyzedData.objects(crawler_task=test_crawlertask).first()
    test_extracter = self.test_extractergenerator.extracter(test_crawlertask)
    # presumably the standalone extracter() takes the extracter config and the analyzed data
    test_extracted_result = extracter(test_extracter, test_crawleranalyzeddata)
    test_extractedinfo = CrawlerExtracterInfo.objects(job=test_crawlertask.job)
    self.assertTrue(test_extracted_result)
    self.assertEqual(test_crawlertask.status, 9)
    # Clean data
    test_crawlertask.update(status=random.choice(range(1, 10)))
    test_extractedinfo.delete()

def test_parser_function(self):
    test_crawlertask = CrawlerTask.objects().first()
    test_crawlerdownloaddata = CrawlerDownloadData.objects(crawlertask=test_crawlertask).first()
    test_rawparser = self.test_parsergenerator.get_parser(test_crawlertask)
    test_analyzed_data = parser_func(test_rawparser, test_crawlerdownloaddata)
    test_crawleranalyzeddata = CrawlerAnalyzedData.objects(
        Q(uri=test_crawlertask.uri) & Q(job=test_crawlertask.job)).first()
    self.assertIsNotNone(test_crawleranalyzeddata.analyzed_data)
    self.assertEqual(test_crawlertask.status, 7)
    # Clean data
    test_crawlertask.update(status=random.choice(range(1, 8)))
    test_crawleranalyzeddata.delete()

def test_insert_other_job(self):
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='http://www.sougou.com',
                      args='i', from_host='1')
    ct1.save()
    # codestr2 = open('/Users/princetechs3/my_code/code2.sh','r').read()
    cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
    cd1.save()
    cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()

def requeue_failed_jobs():
    structure_generator = StructureGenerator()
    failed_tasks = CrawlerTask.objects(status=6)
    if failed_tasks == None:
        logging.info("No failed parse jobs at this time")
        print "No failed parse jobs at this time"
    else:
        print "Current number of failed parse jobs: %d" % len(failed_tasks)
        count = 0
        deleted_data_count = 0
        for failed_task in failed_tasks:
            failed_data = CrawlerAnalyzedData.objects(crawler_task=failed_task).first()
            if failed_data.retry_times >= 3:
                failed_data.delete()
                deleted_data_count += 1
            else:
                failed_job_source_data = structure_generator.get_task_source_data(failed_task)
                failed_job_priority = structure_generator.get_task_priority(failed_task)
                q = None
                if failed_job_priority == Consts.QUEUE_PRIORITY_TOO_HIGH:
                    q = too_high_queue
                elif failed_job_priority == Consts.QUEUE_PRIORITY_HIGH:
                    q = high_queue
                elif failed_job_priority == Consts.QUEUE_PRIORITY_NORMAL:
                    q = normal_queue
                elif failed_job_priority == Consts.QUEUE_PRIORITY_LOW:
                    q = low_queue
                else:
                    q = low_queue
                if (q.count + 1) > Consts.QUEUE_MAX_LENGTH:
                    logging.error("Cannot requeue parse job because the queue: %s is full" % q.name)
                    print "Cannot requeue parse job because the queue: %s is full" % q.name
                    return None
                else:
                    q.enqueue_call(func=parser_func, args=[failed_job_source_data])
                    failed_data_retry_times = failed_data.retry_times + 1
                    failed_data.update(retry_times=failed_data_retry_times)
                    count += 1
                    failed_task.update(status=5)
        print "%d failed parse jobs requeued successfully!" % count
        if deleted_data_count > 0:
            print "Delete %d data from crawlertasks which failed for over max retry times" % deleted_data_count

def test_get_task(self):
    self.delete_jobs()
    self.insert_jobs()
    self.assertEqual(CrawlerDownloadType.objects.count(), 1)
    self.assertEqual(Job.objects.count(), 4)
    self.assertEqual(CrawlerTaskGenerator.objects.count(), 4)
    self.assertEqual(CrawlerTask.objects.count(), 4)
    self.assertEqual(CrawlerDownload.objects.count(), 4)
    self.assertEqual(CrawlerDownloadSetting.objects.count(), 4)
    jobs = Job.objects(status=Job.STATUS_ON).order_by('+priority')
    self.assertTrue(jobs)
    for job in jobs:
        tasks = CrawlerTask.objects(job=job)
        self.assertTrue(tasks)
    self.delete_jobs()
    count = Job.objects.count()
    self.assertEqual(count, 0)

def save_text(self, text, schemes=None):
    """Parse URIs out of `text` and save them as CrawlerTasks in one bulk insert."""
    uris = self.read_from_strings(text, schemes)
    # for uri in uris:
    #     try:
    #         CrawlerTask(job=self.job, uri=uri, from_host=socket.gethostname()).save()
    #     except Exception as e:
    #         content = "%s : Error occurred when saving uris %s." % (type(e), uri)
    #         # logging.error(content)
    #         CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content, hostname=socket.gethostname()).save()
    bulk = []
    for uri in uris:
        bulk.append(CrawlerTask(job=self.job, uri=uri, from_host=socket.gethostname()))
    try:
        CrawlerTask.objects.insert(bulk)
    except Exception, e:
        CrawlerGeneratorErrorLog(name="ERROR_SAVE",
                                 content="%s : Error occurred when saving uris." % (type(e)),
                                 hostname=socket.gethostname()).save()

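# Note on the bulk insert above: the commented-out per-URI loop logged one
# CrawlerGeneratorErrorLog for each failing URI, while
# CrawlerTask.objects.insert(bulk) issues a single write for the whole batch
# and therefore records only one error log if anything in the batch fails.
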
def insert_jobs(self):
    onetype = CrawlerDownloadType(language='python', is_support=True)
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    job2 = Job(name='2', priority=0)
    job2.save()
    job3 = Job(name='3', priority=2)
    job3.save()
    job4 = Job(name='4', priority=3)
    job4.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    ctg2 = CrawlerTaskGenerator(job=job2, code='echo hello2', cron='* * * * *')
    ctg2.save()
    ctg3 = CrawlerTaskGenerator(job=job3, code='echo hello3', cron='* * * * *')
    ctg3.save()
    ctg4 = CrawlerTaskGenerator(job=job4, code='echo hello4', cron='* * * * *')
    ctg4.save()
    CrawlerTask(job=job1, task_generator=ctg1, uri='http://www.baidu.com', args='i', from_host='1').save()
    CrawlerTask(job=job3, task_generator=ctg1, uri='http://www.fishc.com', args='l', from_host='1').save()
    CrawlerTask(job=job4, task_generator=ctg1, uri='https://xueqiu.com/', args='o', from_host='2').save()
    CrawlerTask(job=job2, task_generator=ctg1, uri='http://www.jb51.net/article/47957.htm', args='v', from_host='3').save()
    codestr1 = open('/Users/princetechs3/my_code/xuqiu.py', 'r').read()
    CrawlerDownload(job=job1, code=codestr1, types=onetype).save()
    CrawlerDownload(job=job2, code=codestr1, types=onetype).save()
    CrawlerDownload(job=job3, code=codestr1, types=onetype).save()
    CrawlerDownload(job=job4, code=codestr1, types=onetype).save()
    cdc1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cdc1.save()
    cdc2 = CrawlerDownloadSetting(job=job2, proxy='2', cookie='3', dispatch_num=60)
    cdc2.save()
    cdc3 = CrawlerDownloadSetting(job=job3, proxy='32', cookie='21', dispatch_num=70)
    cdc3.save()
    cdc4 = CrawlerDownloadSetting(job=job4, proxy='312', cookie='221', dispatch_num=100)
    cdc4.save()

def test_get_parser(self):
    test_crawlertask = CrawlerTask.objects().first()
    test_rawparser = self.test_parsergenerator.get_parser(test_crawlertask)
    self.assertIsNotNone(test_rawparser)

def filter_parsed_tasks(self):
    parsed_tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_ANALYSIS_SUCCESS)
    if parsed_tasks is None:
        logging.info("No parsed (status = 7) tasks")
    return parsed_tasks

def filter_downloaded_tasks(self):
    downloaded_tasks = CrawlerTask.objects(status=5)
    if downloaded_tasks is None:
        logging.info("No downloaded (status = 5) tasks")
    return downloaded_tasks

def filter_parsed_tasks(self):
    parsed_tasks = CrawlerTask.objects(status=7)
    if parsed_tasks is None:
        logging.info("No parsed (status = 7) tasks")
    return parsed_tasks

def dispatch_use_pool(task):
    try:
        # dispatch_num = CrawlerDownloadSetting.objects(job=task.job)[0].dispatch_num
        dispatch_num = 1
        if dispatch_num == 0:
            write_dispatch_alter_log(job=task.job, reason='dispatch_num is 0')
            return
        # print type(dispatch_num), dispatch_num
        max_retry_times = CrawlerDownloadSetting.objects(job=task.job)[0].max_retry_times
        if settings.OPEN_CRAWLER_FAILED_ONLY:
            down_tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_FAIL).order_by('?')[:dispatch_num]
        else:
            if datetime.datetime.now().minute >= 56:
                # retry failed tasks whose retry_times <= max_retry_times
                down_tasks = CrawlerTask.objects(
                    status=CrawlerTask.STATUS_FAIL,
                    retry_times__lte=max_retry_times).order_by('?')[:dispatch_num]
            else:
                down_tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_LIVE).order_by('?')[:dispatch_num]
        if len(down_tasks) == 0:
            # write_dispatch_alter_log(job=task.job, reason='get down_tasks len is 0')
            return
    except Exception as e:
        write_dispatch_error_log(job=task.job, reason=str(e))
        return None
    for task in down_tasks:
        priority = task.job.priority
        try:
            task.status = CrawlerTask.STATUS_DISPATCH
            if priority == -1:
                if len(q_down_super) >= settings.Q_DOWN_SUPER_LEN:
                    write_dispatch_alter_log(job=task.job, reason='q_down_super lens get maxlen')
                    continue
                q_down_super.enqueue(download_clawer_task, args=[task],
                                     timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 0:
                if len(q_down_high) >= settings.Q_DOWN_HIGH_LEN:
                    write_dispatch_alter_log(job=task.job, reason='q_down_high lens get maxlen')
                    continue
                q_down_high.enqueue(download_clawer_task, args=[task], at_front=True,
                                    timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 1:
                if len(q_down_high) >= settings.Q_DOWN_HIGH_LEN:
                    write_dispatch_alter_log(job=task.job, reason='q_down_high lens get maxlen')
                    continue
                q_down_high.enqueue(download_clawer_task, args=[task], at_front=False,
                                    timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 2:
                if len(q_down_mid) >= settings.Q_DOWN_MID_LEN:
                    write_dispatch_alter_log(job=task.job, reason='q_down_mid lens get maxlen')
                    continue
                q_down_mid.enqueue(download_clawer_task, args=[task], at_front=True,
                                   timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 3:
                if len(q_down_mid) >= settings.Q_DOWN_MID_LEN:
                    write_dispatch_alter_log(job=task.job, reason='q_down_mid lens get maxlen')
                    continue
                q_down_mid.enqueue(download_clawer_task, args=[task], at_front=False,
                                   timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 4:
                if len(q_down_low) >= settings.Q_DOWN_LOW_LEN:
                    write_dispatch_alter_log(job=task.job, reason='q_down_low lens get maxlen')
                    continue
                q_down_low.enqueue(download_clawer_task, args=[task], at_front=True,
                                   timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 5:
                if len(q_down_low) >= settings.Q_DOWN_LOW_LEN:
                    write_dispatch_alter_log(job=task.job, reason='q_down_low lens get maxlen')
                    continue
                q_down_low.enqueue(download_clawer_task, args=[task], at_front=False,
                                   timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            task.save()
            write_dispatch_success_log(job=task.job, reason='success')
        except Exception as e:
            task.status = CrawlerTask.STATUS_FAIL
            write_dispatch_failed_log(job=task.job, reason=str(e))

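# Hedged sketch (not part of the original module): the priority branches in
# dispatch_use_pool() above could be expressed as a single lookup table. The
# queue objects, settings names and download_clawer_task are taken from the
# code above; PRIORITY_ROUTES and enqueue_by_priority are hypothetical names,
# and the table assumes priorities stay within -1..5 as in the original chain.
PRIORITY_ROUTES = {
    # priority: (queue, queue max length, at_front)
    -1: (q_down_super, settings.Q_DOWN_SUPER_LEN, None),
    0: (q_down_high, settings.Q_DOWN_HIGH_LEN, True),
    1: (q_down_high, settings.Q_DOWN_HIGH_LEN, False),
    2: (q_down_mid, settings.Q_DOWN_MID_LEN, True),
    3: (q_down_mid, settings.Q_DOWN_MID_LEN, False),
    4: (q_down_low, settings.Q_DOWN_LOW_LEN, True),
    5: (q_down_low, settings.Q_DOWN_LOW_LEN, False),
}


def enqueue_by_priority(task):
    """Return True if the task was enqueued, False if its queue is full."""
    queue, max_len, at_front = PRIORITY_ROUTES[task.job.priority]
    if len(queue) >= max_len:
        write_dispatch_alter_log(job=task.job, reason='%s lens get maxlen' % queue.name)
        return False
    kwargs = {'args': [task], 'timeout': settings.RQ_DOWNLOAD_TASK_TIMEOUT}
    if at_front is not None:
        kwargs['at_front'] = at_front
    queue.enqueue(download_clawer_task, **kwargs)
    return True
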