def exec_command(command, comment, cron):
    """Execute `command` via exec; parse the job name/id out of `comment` to
    look up the Job, and record the outcome in CrawlerGeneratorCronLog.

    :param command: Python source text to compile and exec.
    :param comment: crontab comment string of the form "name:<name>...id:<id>".
    :param cron: cron expression, stored on the log entry.
    """
    def get_name_id_with_comment(comment):
        # Pull (name, id) out of the crontab comment; fall back to empty
        # strings when the comment does not match the expected pattern.
        p = re.compile("name:(\w+).*?id:(\w+)")
        r = p.search(comment)
        if r:
            return r.groups()
        else:
            return ("", "")
    job_name, job_id = get_name_id_with_comment(comment)
    job = Job.objects(id=job_id).first()
    if not job:
        # Job lookup failed: log a FAIL entry and bail out.
        # NOTE(review): `job` is None here, so the log row is saved with
        # job=None — confirm the schema allows a null job reference.
        failed_reason = "Job %s with id %s is not found in mongodb!" % (job_name, job_id)
        logging.error(failed_reason)
        CrawlerGeneratorCronLog(job=job, status=CrawlerGeneratorCronLog.STATUS_FAIL, cron=cron, failed_reason=failed_reason, spend_msecs=0).save()
        return
    pid = os.getpid()
    start_time = time.time()
    status = CrawlerGeneratorCronLog.STATUS_SUCCESS
    failed_reason = ""
    try:
        # SECURITY: exec of arbitrary command text — callers must ensure
        # `command` comes from a trusted source only.
        c = compile(command, "", 'exec')
        exec c in globals(), locals()
    except Exception, e:
        status = CrawlerGeneratorCronLog.STATUS_FAIL
        failed_reason = traceback.format_exc(10)
    # NOTE(review): pid/start_time/status are computed but never persisted in
    # the code visible here — the success-path CrawlerGeneratorCronLog save
    # (with spend_msecs) appears to be missing or truncated; verify upstream.
def run():
    """Dispatch pending CrawlerTasks to downloaders.

    When DISPATCH_BY_PRIORITY is set, iterates jobs (status ON) ordered by
    ascending priority and dispatches up to MAX_TOTAL_DISPATCH_COUNT_ONCE
    live tasks per job via dispatch_use_pool.
    """
    print 'Downloader dispatch start'
    if settings.DISPATCH_BY_PRIORITY:
        # NOTE(review): this initial value is dead — `total` is reassigned
        # per-job inside the loop and never accumulated.
        total = 0
        jobs = Job.objects(status=Job.STATUS_ON).order_by('+priority')
        print "All jobs Number:", jobs.count()
        for job in jobs:
            total = CrawlerTask.objects(job=job).count()
            print 'This job\'s tasks total number:', total
            dispatch_tasks_num = settings.MAX_TOTAL_DISPATCH_COUNT_ONCE  # number dispatched per batch (for testing)
            # status=1 presumably means "live/pending" — TODO confirm against
            # the CrawlerTask status constants.
            tasks = CrawlerTask.objects(job=job, status=1)[:dispatch_tasks_num]
            print "Tasks Count:", len(tasks)
            # NOTE(review): this guard is unreachable — the slice above
            # already caps len(tasks) at dispatch_tasks_num.
            if len(tasks) > dispatch_tasks_num:
                print "Downloader dispatch Error: Tasks number over MAX_TOTAL_DISPATCH_COUNT_ONCE:", dispatch_tasks_num
                break
            count = 0
            for task in tasks:
                print "Downloader task dispatch :", count
                count += 1
                dispatch_use_pool(task)
            # pool.map(dispatch_use_pool, tasks)
            # pool.close()
            # pool.join()
            # tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_LIVE).order_by('job.priority')[:settings.MAX_TOTAL_DISPATCH_COUNT_ONCE]
    elif settings.DISPATCH_BY_HOSTNAME:
        # TODO: dispatch by hostname
        pass
def remove_offline_jobs_from_crontab(self):
    """Purge crontab entries belonging to jobs that have been switched off.

    For every OFF job that still has an active task generator, remove all
    crontab lines carrying that job's generator comment.
    """
    for offline_job in Job.objects(status=Job.STATUS_OFF):
        active_generator = CrawlerTaskGenerator.objects(
            job=offline_job, status=CrawlerTaskGenerator.STATUS_ON).first()
        if active_generator:
            self.crontab.remove_all(
                comment=self._task_generator_cron_comment(offline_job))
def test_get_task(self):
    """Round-trip the fixtures: insert, verify per-collection counts and
    that every ON job has tasks, then delete and verify cleanup."""
    self.delete_jobs()
    self.insert_jobs()
    # Expected document counts per collection after insert_jobs().
    expectations = (
        (CrawlerDownloadType, 1),
        (Job, 4),
        (CrawlerTaskGenerator, 4),
        (CrawlerTask, 4),
        (CrawlerDownload, 4),
        (CrawlerDownloadSetting, 4),
    )
    for model, expected in expectations:
        self.assertEqual(model.objects.count(), expected)
    active_jobs = Job.objects(status=Job.STATUS_ON).order_by('+priority')
    self.assertTrue(active_jobs)
    # Every active job must have at least one task attached.
    for active_job in active_jobs:
        self.assertTrue(CrawlerTask.objects(job=active_job))
    self.delete_jobs()
    self.assertEqual(Job.objects.count(), 0)
def test_download(self): sys.path.append('/Users/princetechs3/my_code') onetype = CrawlerDownloadType(language='python') onetype.save() job1 = Job(name='1', info='2', customer='ddd', priority=-1) job1.save() ctg1 = CrawlerTaskGenerator(job=job1, code='echo hello1', cron='* * * * *') ctg1.save() ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='http://www.baidu.com', args='i', from_host='1') ct1.save() codestr1 = open('/Users/princetechs3/my_code/code1.py', 'r').read() cd1 = CrawlerDownload(job=job1, code=codestr1, types=onetype) cd1.save() cds1 = CrawlerDownloadSetting(job=job1, proxy='122', cookie='22', dispatch_num=50) cds1.save() job = Job.objects(status=Job.STATUS_ON)[0] self.assertTrue(job) task = CrawlerTask.objects(job=job)[0] self.assertTrue(task) cd = CrawlerDownload.objects(job=task.job)[0] self.assertTrue(cd) self.assertTrue(cd.code) with open('/Users/princetechs3/my_code/jobcode1.py', 'w') as f: f.write(cd.code) self.exec_command('import jobcode1;jobcode1.run(%s)' % "'http://www.baidu.com'") # print cd.code self.assertEqual(cd.types.language, 'python') print cd.types.language
def update_online_jobs(self): jobs = Job.objects(status=Job.STATUS_ON).order_by("+priority") print "The number of job is %d" % (len(jobs)) for job in jobs: generator = CrawlerTaskGenerator.objects(job=job).order_by('-add_datetime').first() if not generator: continue if not self._test_save_code(generator): continue if not self._test_crontab(generator): continue if not self._test_install_crontab(generator): continue if generator.status == CrawlerTaskGenerator.STATUS_ON: continue generator.status = CrawlerTaskGenerator.STATUS_ON generator.save() CrawlerTaskGenerator.objects(job=job, status=CrawlerTaskGenerator.STATUS_ON, id__ne=generator.id).update(status=CrawlerTaskGenerator.STATUS_OFF)