def insert_job(name, text, parser_text, settings):
    # name = "enterprise"
    prior = random.randint(-1, 5)
    # Downloader type
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    # Job records in both MongoDB and MySQL
    job_mongodb = JobMongoDB(name=name, info="", priority=prior)
    job_mongodb.save()
    job_mysql = JobMySQL(name=name, info="", priority=prior)
    job_mysql.save()
    # Generator
    script = """import json\nprint json.dumps({'uri':"http://www.baidu.com"})"""
    cron = "* * * * *"
    code_type = CrawlerTaskGenerator.TYPE_PYTHON
    schemes = ['http', 'https']
    generator = CrawlerTaskGenerator(job=job_mongodb, code=script, code_type=code_type,
                                     schemes=schemes, cron=cron)
    generator.save()
    # Downloader
    cds1 = CrawlerDownloadSetting(job=job_mongodb, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()
    cd1 = CrawlerDownload(job=job_mongodb, code='codestr2', types=onetype)
    cd1.save()
    # Data preprocess
    dp = DataPreprocess(job_mongodb.id)
    dp.save(text=text, settings=settings)
    # Structure
    parser = Parser(parser_id=name, python_script=parser_text,
                    update_date=django.utils.timezone.now())
    parser.save()
    structureconfig = StructureConfig(job_copy_id=job_mongodb.id, job=job_mysql, parser=parser,
                                      update_date=django.utils.timezone.now())
    structureconfig.save()
def test_insert_20000_uri_job(self):
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    for i in range(1, 11):
        job = Job(name='1%s' % (str(i)), info='2%s' % (str(i)),
                  customer='ddd%s' % (str(i)), priority=random.randint(-1, 5))
        job.save()
        ctg1 = CrawlerTaskGenerator(job=job, code='echo hello1', cron='* * * * *')
        ctg1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://重庆/重庆理必易投资管理有限公司/500905004651063/', args='i', from_host='1')
        for j in range(1000):
            ct1 = CrawlerTask(job=job, task_generator=ctg1, uri='http://www.baidu.com', args='i', from_host='1')
            ct1.save()
            ct1 = CrawlerTask(job=job, task_generator=ctg1, uri='http://www.fishc.com', args='i', from_host='1')
            ct1.save()
        cd1 = CrawlerDownload(job=job, code='codestr2', types=onetype)
        cd1.save()
        cds1 = CrawlerDownloadSetting(job=job, proxy='122', cookie='22', dispatch_num=50)
        cds1.save()
def test_get_max_dispatch(self):
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    ct1 = CrawlerTask(job=job1, status=CrawlerTask.STATUS_FAIL, task_generator=ctg1,
                      uri='http://www.fishc.com', args='i', from_host='1')
    ct1.save()
    # codestr2 = open('/Users/princetechs3/my_code/code2.sh', 'r').read()
    cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
    cd1.save()
    cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()
    max_retry_times = 1
    dispatch_num = 1
    down_tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_FAIL,
                                     retry_times__lte=max_retry_times)[:dispatch_num]
    self.assertTrue(down_tasks)
def test_insert_enterprise_job(self):
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    # Alternative enterprise URIs used in earlier runs (kept here, commented out):
    #   enterprise://重庆/重庆理必易投资管理有限公司/500905004651063/
    #   enterprise://localhost/云南/昆明道岚投资中心(有限合伙)/500905004651063/
    #   enterprise://localhost/云南/大理富民兴业股权投资基金管理有限公司/532910100007315/
    #   enterprise://localhost/北京/北京众润投资基金管理有限公司/110105018837481/
    #   enterprise://localhost/广东/深圳市郞润承泽资产管理有限公司/440301113021601/
    #   enterprise://localhost/云南/美好置业集团股份有限公司/530000000006503/
    #   enterprise://localhost/江苏/江苏银凤金革资产管理有限公司/320106000236597/
    #   enterprise://localhost/北京/北京汇泽融盛投资有限公司/110106013355060/
    #   enterprise://localhost/广西/柳州化工股份有限公司/***/
    #   enterprise://localhost/总局/瀚丰资本管理有限公司/100000000018983/
    #   enterprise://localhost/江苏/江苏康耀资产管理有限公司/320125000170935/
    #   enterprise://localhost/北京/北京匀丰资产管理有限公司/110105019391209/
    #   enterprise://localhost/上海/中安富海投资管理有限公司/310108000565783/
    #   enterprise://localhost/广东/深圳润阁投资管理有限公司/440301111930453/
    #   enterprise://localhost/广东/深圳市金汇隆投资管理有限公司/440301109991545/
    #   enterprise://localhost/山东/山东正融资产管理有限公司/371300200058462/
    ct1 = CrawlerTask(job=job1, task_generator=ctg1,
                      uri='enterprise://localhost/安徽/安徽省徽商集团化轻股份有限公司/***/',
                      args='i', from_host='1')
    ct1.save()
    # codestr2 = open('/Users/princetechs3/my_code/code2.sh', 'r').read()
    cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
    cd1.save()
    cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()
def get_generator_object(self):
    # Return the active generator for this job, or None if the query fails
    # or no generator with STATUS_ON exists.
    generator_object = None
    try:
        generator_object = CrawlerTaskGenerator.objects(job=self.job_id,
                                                        status=CrawlerTaskGenerator.STATUS_ON).first()
    except Exception:
        logging.error("Something went wrong when getting the generator object")
    return generator_object
def remove_offline_jobs_from_crontab(self):
    jobs = Job.objects(status=Job.STATUS_OFF)
    for job in jobs:
        generator = CrawlerTaskGenerator.objects(job=job,
                                                 status=CrawlerTaskGenerator.STATUS_ON).first()
        if not generator:
            continue
        comment = self._task_generator_cron_comment(job)
        self.crontab.remove_all(comment=comment)
def test_download(self):
    sys.path.append('/Users/princetechs3/my_code')
    onetype = CrawlerDownloadType(language='python')
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='http://www.baidu.com', args='i', from_host='1')
    ct1.save()
    codestr1 = open('/Users/princetechs3/my_code/code1.py', 'r').read()
    cd1 = CrawlerDownload(job=job1, code=codestr1, types=onetype)
    cd1.save()
    cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()
    job = Job.objects(status=Job.STATUS_ON)[0]
    self.assertTrue(job)
    task = CrawlerTask.objects(job=job)[0]
    self.assertTrue(task)
    cd = CrawlerDownload.objects(job=task.job)[0]
    self.assertTrue(cd)
    self.assertTrue(cd.code)
    with open('/Users/princetechs3/my_code/jobcode1.py', 'w') as f:
        f.write(cd.code)
    self.exec_command('import jobcode1;jobcode1.run(%s)' % "'http://www.baidu.com'")
    # print cd.code
    self.assertEqual(cd.types.language, 'python')
    print cd.types.language
def update_online_jobs(self):
    jobs = Job.objects(status=Job.STATUS_ON).order_by("+priority")
    print "The number of jobs is %d" % (len(jobs))
    for job in jobs:
        # Pick the most recently added generator for this job and only switch it
        # on after its code, crontab entry, and crontab installation all pass.
        generator = CrawlerTaskGenerator.objects(job=job).order_by('-add_datetime').first()
        if not generator:
            continue
        if not self._test_save_code(generator):
            continue
        if not self._test_crontab(generator):
            continue
        if not self._test_install_crontab(generator):
            continue
        if generator.status == CrawlerTaskGenerator.STATUS_ON:
            continue
        generator.status = CrawlerTaskGenerator.STATUS_ON
        generator.save()
        # Any other generator of this job still marked ON is switched OFF.
        CrawlerTaskGenerator.objects(job=job, status=CrawlerTaskGenerator.STATUS_ON,
                                     id__ne=generator.id).update(status=CrawlerTaskGenerator.STATUS_OFF)
def test_insert_other_job(self):
    onetype = CrawlerDownloadType(language='other', is_support=True)
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='http://www.sougou.com', args='i', from_host='1')
    ct1.save()
    # codestr2 = open('/Users/princetechs3/my_code/code2.sh', 'r').read()
    cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
    cd1.save()
    cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cds1.save()
def save_script(self, script, cron, code_type=1, schemes=[]):
    """Save a generator script and its cron settings to MongoDB.

    Returns False if the script is missing, the cron expression is invalid,
    or saving raises an exception; otherwise returns True.
    """
    if script is None:
        content = "ScriptError : Error occurred when saving script with job!"
        CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content,
                                 hostname=socket.gethostname()).save()
        return False
    if not CronSlices.is_valid(cron):
        content = "CronError : Error occurred when saving cron with job!"
        CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content,
                                 hostname=socket.gethostname()).save()
        return False
    self.extend_schemes(schemes)
    try:
        CrawlerTaskGenerator(job=self.job, code=script, cron=cron,
                             code_type=code_type, schemes=self.schemes).save()
    except Exception as e:
        content = "%s : Error occurred when saving script with job!" % (e)
        CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content,
                                 hostname=socket.gethostname()).save()
        return False
    return True
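# A minimal usage sketch for save_script, assuming a generator service object
# (here called `generator_service`) that exposes the method above; the wrapper
# name and the example script are illustrative, not part of this module.
def example_save_script_usage(generator_service):
    script = """import json\nprint json.dumps({'uri': 'http://www.baidu.com'})"""
    # An invalid cron expression is rejected and logged before anything is written.
    assert generator_service.save_script(script, "not a cron") is False
    # A valid five-field cron expression lets the generator be saved.
    assert generator_service.save_script(script, "*/5 * * * *") is True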
def insert_jobs(self):
    onetype = CrawlerDownloadType(language='python', is_support=True)
    onetype.save()
    job1 = Job(name='1', info='2', customer='ddd', priority=-1)
    job1.save()
    job2 = Job(name='2', priority=0)
    job2.save()
    job3 = Job(name='3', priority=2)
    job3.save()
    job4 = Job(name='4', priority=3)
    job4.save()
    ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *')
    ctg1.save()
    ctg2 = CrawlerTaskGenerator(job=job2, code='echo hello2', cron='* * * * *')
    ctg2.save()
    ctg3 = CrawlerTaskGenerator(job=job3, code='echo hello3', cron='* * * * *')
    ctg3.save()
    ctg4 = CrawlerTaskGenerator(job=job4, code='echo hello4', cron='* * * * *')
    ctg4.save()
    CrawlerTask(job=job1, task_generator=ctg1, uri='http://www.baidu.com', args='i', from_host='1').save()
    CrawlerTask(job=job3, task_generator=ctg1, uri='http://www.fishc.com', args='l', from_host='1').save()
    CrawlerTask(job=job4, task_generator=ctg1, uri='https://xueqiu.com/', args='o', from_host='2').save()
    CrawlerTask(job=job2, task_generator=ctg1, uri='http://www.jb51.net/article/47957.htm', args='v', from_host='3').save()
    codestr1 = open('/Users/princetechs3/my_code/xuqiu.py', 'r').read()
    CrawlerDownload(job=job1, code=codestr1, types=onetype).save()
    CrawlerDownload(job=job2, code=codestr1, types=onetype).save()
    CrawlerDownload(job=job3, code=codestr1, types=onetype).save()
    CrawlerDownload(job=job4, code=codestr1, types=onetype).save()
    cdc1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50)
    cdc1.save()
    cdc2 = CrawlerDownloadSetting(job=job2, proxy='2', cookie='3', dispatch_num=60)
    cdc2.save()
    cdc3 = CrawlerDownloadSetting(job=job3, proxy='32', cookie='21', dispatch_num=70)
    cdc3.save()
    cdc4 = CrawlerDownloadSetting(job=job4, proxy='312', cookie='221', dispatch_num=100)
    cdc4.save()