Code example #1
File: utils_generator.py Project: xiaohui2856/crawl
def exec_command(command, comment, cron):
    """
        Execute command via exec; parse the job id out of comment to look up the job's info, and save the result to CrawlerGeneratorCronLog.
    """

    def get_name_id_with_comment(comment):
        p = re.compile(r"name:(\w+).*?id:(\w+)")
        r = p.search(comment)
        if r:
            return r.groups()
        else:
            return ("", "")

    job_name, job_id = get_name_id_with_comment(comment)
    job = Job.objects(id=job_id).first()
    if not job:
        failed_reason = "Job %s with id %s is not found in mongodb!" % (job_name, job_id)
        logging.error(failed_reason)
        CrawlerGeneratorCronLog(job=job,
                                status=CrawlerGeneratorCronLog.STATUS_FAIL,
                                cron=cron,
                                failed_reason=failed_reason,
                                spend_msecs=0).save()
        return
    pid = os.getpid()
    start_time = time.time()
    status = CrawlerGeneratorCronLog.STATUS_SUCCESS
    failed_reason = ""
    try:
        c = compile(command, "", 'exec')
        exec c in globals(), locals()
    except Exception, e:
        status = CrawlerGeneratorCronLog.STATUS_FAIL
        failed_reason = traceback.format_exc(10)
    # Assumed completion (the excerpt is truncated here): record the outcome,
    # mirroring the failure branch earlier in this function.
    spend_msecs = int((time.time() - start_time) * 1000)
    CrawlerGeneratorCronLog(job=job,
                            status=status,
                            cron=cron,
                            failed_reason=failed_reason,
                            spend_msecs=spend_msecs).save()
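
For reference, a minimal sketch of how the comment parsing above behaves. The comment strings are hypothetical, but their shape follows the name:(\w+).*?id:(\w+) pattern that get_name_id_with_comment expects:

import re

def get_name_id_with_comment(comment):
    # Same pattern as above: pull the job name and id out of a crontab comment.
    r = re.search(r"name:(\w+).*?id:(\w+)", comment)
    return r.groups() if r else ("", "")

# Hypothetical comments in the "name:... id:..." shape the regex expects.
print get_name_id_with_comment("name:news_job id:5a1b2c3d")  # -> ('news_job', '5a1b2c3d')
print get_name_id_with_comment("no job marker here")         # -> ('', '')
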
Code example #2
def run():
    print 'Downloader dispatch start'
    if settings.DISPATCH_BY_PRIORITY:
        total = 0
        jobs = Job.objects(status=Job.STATUS_ON).order_by('+priority')
        print "All jobs Number:", jobs.count()
        for job in jobs:
            total = CrawlerTask.objects(job=job).count()
            print "This job's total task count:", total

            dispatch_tasks_num = settings.MAX_TOTAL_DISPATCH_COUNT_ONCE  # how many tasks to dispatch per batch (test value)
            tasks = CrawlerTask.objects(job=job, status=1)[:dispatch_tasks_num]
            print "Tasks Count:", len(tasks)
            if len(tasks) > dispatch_tasks_num:
                # Defensive guard only: the slice above already caps the result.
                print "Downloader dispatch Error: Tasks number over MAX_TOTAL_DISPATCH_COUNT_ONCE:", dispatch_tasks_num
                break

            count = 0
            for task in tasks:
                print "Downloader task dispatch :", count
                count += 1
                dispatch_use_pool(task)
            # pool.map(dispatch_use_pool, tasks)
            # pool.close()
            # pool.join()
        # tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_LIVE).order_by('job.priority')[:settings.MAX_TOTAL_DISPATCH_COUNT_ONCE]
    elif settings.DISPATCH_BY_HOSTNAME:
        # TODO: dispatch tasks by hostname
        pass
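
run() relies on a handful of settings flags that are not part of this listing. A minimal sketch of what they might look like, assuming a Django-style settings module; the names mirror the references above, and the values are illustrative only:

# settings.py (illustrative values; the real project defines these elsewhere)
DISPATCH_BY_PRIORITY = True         # dispatch job by job, ordered by ascending priority
DISPATCH_BY_HOSTNAME = False        # alternative strategy, still a TODO above
MAX_TOTAL_DISPATCH_COUNT_ONCE = 50  # cap on tasks handed out per batch
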
Code example #3
File: utils_generator.py Project: xiaohui2856/crawl
    def remove_offline_jobs_from_crontab(self):
        jobs = Job.objects(status=Job.STATUS_OFF)
        for job in jobs:
            generator = CrawlerTaskGenerator.objects(job=job, status=CrawlerTaskGenerator.STATUS_ON).first()
            if not generator:
                continue
            comment = self._task_generator_cron_comment(job)
            self.crontab.remove_all(comment=comment)
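
_task_generator_cron_comment is not included in this listing. Judging by the regex in code example #1, it presumably renders a name:... id:... comment that tags a job's crontab entries so remove_all(comment=...) can find them later. A hypothetical sketch:

    def _task_generator_cron_comment(self, job):
        # Hypothetical implementation: emit the comment format that
        # exec_command's regex (code example #1) parses back out.
        return "name:%s id:%s" % (job.name, job.id)
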
Code example #4
File: test_downloader.py Project: xiaohui2856/crawl
    def test_get_task(self):
        self.delete_jobs()
        self.insert_jobs()
        self.assertEqual(CrawlerDownloadType.objects.count(), 1)
        self.assertEqual(Job.objects.count(), 4)
        self.assertEqual(CrawlerTaskGenerator.objects.count(), 4)
        self.assertEqual(CrawlerTask.objects.count(), 4)
        self.assertEqual(CrawlerDownload.objects.count(), 4)
        self.assertEqual(CrawlerDownloadSetting.objects.count(), 4)

        jobs = Job.objects(status=Job.STATUS_ON).order_by('+priority')
        self.assertTrue(jobs)
        for job in jobs:
            tasks = CrawlerTask.objects(job=job)
            self.assertTrue(tasks)

        self.delete_jobs()
        count = Job.objects.count()
        self.assertEqual(count, 0)
Code example #5
File: test_downloader.py Project: xiaohui2856/crawl
    def test_download(self):
        sys.path.append('/Users/princetechs3/my_code')

        onetype = CrawlerDownloadType(language='python')
        onetype.save()
        job1 = Job(name='1', info='2', customer='ddd', priority=-1)
        job1.save()
        ctg1 = CrawlerTaskGenerator(job=job1,
                                    code='echo hello1',
                                    cron='* * * * *')
        ctg1.save()
        ct1 = CrawlerTask(job=job1,
                          task_generator=ctg1,
                          uri='http://www.baidu.com',
                          args='i',
                          from_host='1')
        ct1.save()
        with open('/Users/princetechs3/my_code/code1.py', 'r') as f:
            codestr1 = f.read()
        cd1 = CrawlerDownload(job=job1, code=codestr1, types=onetype)
        cd1.save()
        cds1 = CrawlerDownloadSetting(job=job1,
                                      proxy='122',
                                      cookie='22',
                                      dispatch_num=50)
        cds1.save()

        job = Job.objects(status=Job.STATUS_ON)[0]
        self.assertTrue(job)
        task = CrawlerTask.objects(job=job)[0]
        self.assertTrue(task)

        cd = CrawlerDownload.objects(job=task.job)[0]
        self.assertTrue(cd)

        self.assertTrue(cd.code)
        with open('/Users/princetechs3/my_code/jobcode1.py', 'w') as f:
            f.write(cd.code)
        self.exec_command('import jobcode1;jobcode1.run(%s)' %
                          "'http://www.baidu.com'")
        # print cd.code
        self.assertEqual(cd.types.language, 'python')
        print cd.types.language
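
The test reads /Users/princetechs3/my_code/code1.py from disk, so its contents are not part of this listing. For the final exec_command('import jobcode1;jobcode1.run(...)') call to succeed, the stored code must expose a module-level run(uri) entry point. A minimal hypothetical stub:

# code1.py (hypothetical downloader stub; the real file is read from disk above)
import urllib2

def run(uri):
    # Fetch the page and report its size; a real downloader would persist the body.
    body = urllib2.urlopen(uri).read()
    print 'downloaded %d bytes from %s' % (len(body), uri)
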
Code example #6
File: utils_generator.py Project: xiaohui2856/crawl
    def update_online_jobs(self):
        jobs = Job.objects(status=Job.STATUS_ON).order_by("+priority")
        print "The number of job is %d" % (len(jobs))
        for job in jobs:
            generator = CrawlerTaskGenerator.objects(job=job).order_by('-add_datetime').first()
            if not generator:
                continue
            if not self._test_save_code(generator):
                continue
            if not self._test_crontab(generator):
                continue
            if not self._test_install_crontab(generator):
                continue

            if generator.status == CrawlerTaskGenerator.STATUS_ON:
                continue

            generator.status = CrawlerTaskGenerator.STATUS_ON
            generator.save()
            CrawlerTaskGenerator.objects(job=job, status=CrawlerTaskGenerator.STATUS_ON,
                                         id__ne=generator.id).update(status=CrawlerTaskGenerator.STATUS_OFF)