Code Example #1
File: test_downloader.py Project: xiaohui2856/crawl
    def test_get_max_dispatch(self):
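        # Set up a job with one failed crawler task, then check that failed tasks
        # within the retry limit are returned for dispatch.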
        onetype = CrawlerDownloadType(language='other', is_support=True)
        onetype.save()
        job1 = Job(name='1', info='2', customer='ddd', priority=-1)
        job1.save()
        ctg1 = CrawlerTaskGenerator(job=job1,
                                    code='echo hello1',
                                    cron='* * * * *')
        ctg1.save()
        ct1 = CrawlerTask(job=job1,
                          status=CrawlerTask.STATUS_FAIL,
                          task_generator=ctg1,
                          uri='http://www.fishc.com',
                          args='i',
                          from_host='1')
        ct1.save()
        # codestr2 = open('/Users/princetechs3/my_code/code2.sh','r').read()
        cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
        cd1.save()
        cds1 = CrawlerDownloadSetting(job=job1,
                                      proxy='122',
                                      cookie='22',
                                      dispatch_num=50)
        cds1.save()

        max_retry_times = 1
        dispatch_num = 1
        down_tasks = CrawlerTask.objects(
            status=CrawlerTask.STATUS_FAIL,
            retry_times__lte=max_retry_times)[:dispatch_num]
        self.assertTrue(down_tasks)
Code Example #2
File: utils_generator.py Project: xiaohui2856/crawl
    def save_task(self):
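        # Read the generator output file line by line, parse each line as JSON (falling
        # back to ast.literal_eval), validate and deduplicate the URIs, and save them as
        # CrawlerTask records; malformed lines and invalid URIs are logged to
        # CrawlerGeneratorErrorLog.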
        uris = []
        val = URLValidator(self.schemes)
        out_f = open(self.out_path, "r")
        for line in out_f:
            self.content_bytes += len(line)
            try:
                js = json.loads(line)
            except ValueError:
                js = ast.literal_eval(line)
                if not isinstance(js, dict):
                    # logging.error("The line %s is not dict or json"%(line))
                    CrawlerGeneratorErrorLog(name="ERROR_URI",
                                             content="The line %s is not dict or json" % (line),
                                             hostname=socket.gethostname()).save()
                    continue
            uri_data = []
            try:
                # validate uri
                if 'uri' in js:
                    uri_de = js['uri'].encode("utf-8")
                    val(uri_de)
                    uri_data.append(uri_de)
                    if 'args' in js:
                        uri_args = js['args'].encode("utf-8")
                        uri_data.append(uri_args)
                    uri_str = str(uri_data)
                    uris.append(uri_str)
                else:
                    CrawlerGeneratorErrorLog(name="ERROR_JSON",
                                             content="JSON ValidationError without key 'uri' : %s" % (js),
                                             hostname=socket.gethostname()).save()
            except ValidationError:
                CrawlerGeneratorErrorLog(name="ERROR_URI",
                                         content="URI ValidationError: %s " % (js['uri']),
                                         hostname=socket.gethostname()).save()
        out_f.close()
        os.remove(self.out_path)
        dereplicated_uris = dereplicate_uris(uris)

        for uri_str in dereplicated_uris:
            try:
                eval_uri = eval(uri_str)
                uri = eval_uri[0]
                try:
                    args = eval_uri[1]
                except IndexError:
                    args = "No more args"
                crawler_task = CrawlerTask(job=self.job,
                                           task_generator=self.task_generator,
                                           uri=uri,
                                           args=args,
                                           from_host=socket.gethostname())
                # crawler_task.args = ""
                crawler_task.save()
            except:
                # logging.error("add %s failed: %s", line, traceback.format_exc(10))
                content = traceback.format_exc(10)
                CrawlerGeneratorErrorLog(name="ERROR_URI", content=content, hostname=socket.gethostname()).save()
        self.save_generate_log(CrawlerGeneratorLog.STATUS_SUCCESS, "After generating, save task succeed!")
Code Example #3
File: structure.py Project: xiaohui2856/crawl
    def insert_extracter_test_data(self):
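        # For every line of the analyzed-data fixture, insert a matching job, extracter,
        # crawler task, structure config and analyzed-data record.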

        config = open('structure/extracters/conf_csciwlpc_local.json').read()
        analyzeddata = open('structure/extracters/analyzed_data_csci.json')

        count = 0
        for line in analyzeddata:
            count += 1
            test_job = JobMongoDB("creator", "job_%d" % count, "info", "customer", random.choice(range(1, 3)),
                                  random.choice(range(-1, 6)), datetime.datetime.now())
            test_job.save()

            # test_extracter = ExtracterGenerator.extracter
            test_extracter = Extracter(count, config)
            test_extracter.save()

            test_crawlertask = CrawlerTask(test_job, uri="test_uri_%d" % count, status=7)
            test_crawlertask.save()

            ExtracterStructureConfig(job=test_job, extracter=test_extracter).save()

            processed_line = json.loads(line)['analyzed_data']

            CrawlerAnalyzedData(crawler_task=test_crawlertask, analyzed_data=processed_line).save()
        print "Inserted %s Extracter Test Data " % count
Code Example #4
def run():
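    # Dispatch download tasks job by job in ascending priority order, taking at most
    # MAX_TOTAL_DISPATCH_COUNT_ONCE tasks per job.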
    print 'Downloader dispatch start'
    if settings.DISPATCH_BY_PRIORITY:
        total = 0
        jobs = Job.objects(status=Job.STATUS_ON).order_by('+priority')
        print "All jobs Number:", jobs.count()
        for job in jobs:
            total = CrawlerTask.objects(job=job).count()
            print 'This job\'s tasks total number:', total

            dispatch_tasks_num = settings.MAX_TOTAL_DISPATCH_COUNT_ONCE  # number of tasks dispatched per batch (test value)
            tasks = CrawlerTask.objects(job=job, status=1)[:dispatch_tasks_num]
            print "Tasks Count:", len(tasks)
            if len(tasks) > dispatch_tasks_num:
                print "Downloader dispatch Error: Tasks number over MAX_TOTAL_DISPATCH_COUNT_ONCE:", dispatch_tasks_num
                break

            count = 0
            for task in tasks:
                print "Downloader task dispatch :", count
                count += 1
                dispatch_use_pool(task)
            # pool.map(dispatch_use_pool, tasks)
            # pool.close()
            # pool.join()
        # tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_LIVE).order_by('job.priority')[:settings.MAX_TOTAL_DISPATCH_COUNT_ONCE]
    elif settings.DISPATCH_BY_HOSTNAME:
        # TODO: dispatch by hostname
        pass
Code Example #5
File: structure.py Project: xiaohui2856/crawl
    def empty_test_data(self):

        JobMongoDB.drop_collection()
        Extracter.drop_collection()
        CrawlerTask.drop_collection()
        ExtracterStructureConfig.drop_collection()
        CrawlerAnalyzedData.drop_collection()
        CrawlerExtracterInfo.drop_collection()
        print "Extracter Test Data Cleaned!"
Code Example #6
File: structure.py Project: xiaohui2856/crawl
def insert_test_data():
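    # Populate test data: 100 jobs (in both MySQL and MongoDB), parsers that randomly get
    # a working or a deliberately broken script, plus matching crawler tasks, downloads,
    # structure configs and download data.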

    right_script = """class RawParser(object):
    def parser(self, crawlerdownloaddata):
        data = "JSON Format Data After Parsing"
        return data"""

    wrong_script = """class RawParser(object):
    def parser(self, crawlerdownloaddata):
        print crawlerdownloaddata.wrong"""

    try:
        user = User.objects.get(username='******')
    except:
        user = User.objects.create_user('user_name', 'user_email', 'password')

    for count in range(0, 100):
        status = random.choice(range(1, 3))
        priority = random.choice(range(-1, 6))

        test_job_mysql = JobMySQL(creator=user,
                                  name="job_%d" % count,
                                  info="info",
                                  customer="customer",
                                  status=status,
                                  priority=priority,
                                  add_datetime=django.utils.timezone.now())
        test_job_mysql.save()

        test_job_mongodb = JobMongoDB(creator="creator_%d" % count,
                                      name="job_%d" % count,
                                      info="info",
                                      customer="customer",
                                      status=status,
                                      priority=priority,
                                      add_datetime=datetime.datetime.now())
        test_job_mongodb.save()
        test_parser = Parser(parser_id=str(count),
                             python_script=random.choice([right_script, wrong_script]),
                             update_date=django.utils.timezone.now())
        test_parser.save()

        test_crawlertask = CrawlerTask(test_job_mongodb, uri="test_uri_%d" % count, status=5)
        test_crawlertask.save()

        test_downloader = CrawlerDownload(test_job_mongodb)
        test_downloader.save()

        StructureConfig(job=test_job_mysql,
                        parser=test_parser,
                        job_copy_id=test_job_mongodb.id,
                        update_date=django.utils.timezone.now()).save()

        CrawlerDownloadData(test_job_mongodb, test_downloader, test_crawlertask).save()

    print "Data Inserted!"
Code Example #7
File: test_downloader.py Project: xiaohui2856/crawl
    def test_insert_enterprise_job(self):
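        # Insert a single enterprise-registry crawl task together with its job, generator,
        # download and settings; the commented-out lines are alternative test URIs.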
        onetype = CrawlerDownloadType(language='other', is_support=True)
        onetype.save()
        job1 = Job(name='1', info='2', customer='ddd', priority=-1)
        job1.save()
        ctg1 = CrawlerTaskGenerator(job=job1,
                                    code='echo hello1',
                                    cron='* * * * *')
        ctg1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://重庆/重庆理必易投资管理有限公司/500905004651063/', args='i', from_host='1')
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/云南/昆明道岚投资中心(有限合伙)/500905004651063/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/云南/大理富民兴业股权投资基金管理有限公司/532910100007315/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/北京/北京众润投资基金管理有限公司/110105018837481/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/广东/深圳市郞润承泽资产管理有限公司/440301113021601/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/云南/美好置业集团股份有限公司/530000000006503/', args='i', from_host='1')
        # ct1.save()
        # # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/江苏/江苏银凤金革资产管理有限公司/320106000236597/', args='i', from_host='1')
        # # ct1.save()
        # # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/北京/北京汇泽融盛投资有限公司/110106013355060/', args='i', from_host='1')
        # # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/广西/柳州化工股份有限公司/***/', args='i', from_host='1')
        # ct1.save()
        ct1 = CrawlerTask(job=job1,
                          task_generator=ctg1,
                          uri='enterprise://localhost/安徽/安徽省徽商集团化轻股份有限公司/***/',
                          args='i',
                          from_host='1')
        ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/总局/瀚丰资本管理有限公司/100000000018983/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/江苏/江苏康耀资产管理有限公司/320125000170935/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/北京/北京匀丰资产管理有限公司/110105019391209/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/上海/中安富海投资管理有限公司/310108000565783/', args='i', from_host='1')
        # ct1.save()

        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/广东/深圳润阁投资管理有限公司/440301111930453/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/广东/深圳市金汇隆投资管理有限公司/440301109991545/', args='i', from_host='1')
        # ct1.save()
        # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://localhost/山东/山东正融资产管理有限公司/371300200058462/', args='i', from_host='1')
        # ct1.save()
        # codestr2 = open('/Users/princetechs3/my_code/code2.sh','r').read()
        cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
        cd1.save()
        cds1 = CrawlerDownloadSetting(job=job1,
                                      proxy='122',
                                      cookie='22',
                                      dispatch_num=50)
        cds1.save()
        pass
Code Example #8
File: structure.py Project: xiaohui2856/crawl
def empty_test_data():
    JobMongoDB.drop_collection()
    JobMySQL.objects.all().delete()
    Parser.objects.all().delete()
    CrawlerTask.drop_collection()
    CrawlerDownload.drop_collection()
    StructureConfig.objects.all().delete()
    CrawlerDownloadData.drop_collection()
    CrawlerAnalyzedData.drop_collection()

    print "Data Cleaned!"
Code Example #9
File: test_downloader.py Project: xiaohui2856/crawl
 def test_insert_20000_uri_job(self):
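     # Insert 10 jobs with 2000 crawler tasks each (20000 in total) to exercise
     # dispatching at scale.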
     onetype = CrawlerDownloadType(language='other', is_support=True)
     onetype.save()
     for i in range(1, 11):
         job = Job(name='1%s' % (str(i)),
                   info='2%s' % (str(i)),
                   customer='ddd%s' % (str(i)),
                   priority=random.randint(-1, 5))
         job.save()
         ctg1 = CrawlerTaskGenerator(job=job,
                                     code='echo hello1',
                                     cron='* * * * *')
         ctg1.save()
         # ct1 = CrawlerTask(job=job1, task_generator=ctg1, uri='enterprise://重庆/重庆理必易投资管理有限公司/500905004651063/', args='i', from_host='1')
         for j in range(1000):
             ct1 = CrawlerTask(job=job,
                               task_generator=ctg1,
                               uri='http://www.baidu.com',
                               args='i',
                               from_host='1')
             ct1.save()
             ct1 = CrawlerTask(job=job,
                               task_generator=ctg1,
                               uri='http://www.fishc.com',
                               args='i',
                               from_host='1')
             ct1.save()
         cd1 = CrawlerDownload(job=job, code='codestr2', types=onetype)
         cd1.save()
         cds1 = CrawlerDownloadSetting(job=job,
                                       proxy='122',
                                       cookie='22',
                                       dispatch_num=50)
         cds1.save()
     pass
Code Example #10
File: test_structure.py Project: xiaohui2856/crawl
 def setUp(self):
     TestCase.setUp(self)
     self.test_structuregenerator = StructureGenerator()
     self.test_count = CrawlerTask.objects.count()
     if self.test_count == 0:
         empty_test_data()
         insert_test_data()
     else:
         pass
     self.test_crawlertasks = CrawlerTask.objects()
Code Example #11
File: test_downloader.py Project: xiaohui2856/crawl
    def test_download(self):
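        # End-to-end download check: store a Python download job, write its code to a
        # local file, run it against a URI, and verify the stored type is 'python'.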
        sys.path.append('/Users/princetechs3/my_code')

        onetype = CrawlerDownloadType(language='python')
        onetype.save()
        job1 = Job(name='1', info='2', customer='ddd', priority=-1)
        job1.save()
        ctg1 = CrawlerTaskGenerator(job=job1,
                                    code='echo hello1',
                                    cron='* * * * *')
        ctg1.save()
        ct1 = CrawlerTask(job=job1,
                          task_generator=ctg1,
                          uri='http://www.baidu.com',
                          args='i',
                          from_host='1')
        ct1.save()
        codestr1 = open('/Users/princetechs3/my_code/code1.py', 'r').read()
        cd1 = CrawlerDownload(job=job1, code=codestr1, types=onetype)
        cd1.save()
        cds1 = CrawlerDownloadSetting(job=job1,
                                      proxy='122',
                                      cookie='22',
                                      dispatch_num=50)
        cds1.save()

        job = Job.objects(status=Job.STATUS_ON)[0]
        self.assertTrue(job)
        task = CrawlerTask.objects(job=job)[0]
        self.assertTrue(task)

        cd = CrawlerDownload.objects(job=task.job)[0]
        self.assertTrue(cd)

        self.assertTrue(cd.code)
        with open('/Users/princetechs3/my_code/jobcode1.py', 'w') as f:
            f.write(cd.code)
        self.exec_command('import jobcode1;jobcode1.run(%s)' %
                          "'http://www.baidu.com'")
        # print cd.code
        self.assertEqual(cd.types.language, 'python')
        print cd.types.language
Code Example #12
def refresh_failed_jobs():
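    # Reset every parse-failed crawler task (status 6) back to status 5 so it can be
    # parsed again.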
    count = 0
    failed_tasks = CrawlerTask.objects(status=6)
    print "% d crawlertasks failed parsing now" % len(failed_tasks)
    for failed_task in failed_tasks:
        failed_task.update(status=5)
        count += 1
    print "Refresh % d failed crawlertasks (Which failed for over max retry times)" % count
    logging.info(
        "Refresh % d failed crawlertasks (Which failed for over max retry times)"
        % count)
Code Example #13
def requeue_failed_jobs():
    # TODO: pending revision; the rest of this function is currently disabled
    return
    extracter_generator = ExtracterGenerator()
    failed_tasks = CrawlerTask.objects(status=8)

    if failed_tasks is None:
        logging.info("No failed extract jobs at this time")
        print "No failed extract jobs at this time"
    else:
        print "Current number of failed extract jobs: %d" % len(failed_tasks)
    count = 0

    for failed_task in failed_tasks:
        failed_data = CrawlerExtracterInfo.objects(
            crawler_task=failed_task).first()
        if failed_data.retry_times >= 3:
            pass
        else:
            failed_job_data = extracter_generator.get_task_analyzed_data(
                failed_task)
            extracterstructureconfig = ExtracterStructureConfig.objects(
                job=failed_task.job).first()
            failed_job_conf = extracterstructureconfig.extracter.extracter_config
            failed_job_priority = extracter_generator.get_task_priority(
                failed_task)
            q = None
            if failed_job_priority == ExtracterConsts.QUEUE_PRIORITY_TOO_HIGH:
                q = too_high_queue
            elif failed_job_priority == ExtracterConsts.QUEUE_PRIORITY_HIGH:
                q = high_queue
            elif failed_job_priority == ExtracterConsts.QUEUE_PRIORITY_NORMAL:
                q = normal_queue
            elif failed_job_priority == ExtracterConsts.QUEUE_PRIORITY_LOW:
                q = low_queue
            else:
                q = low_queue
            if (q.count + 1) > ExtracterConsts.QUEUE_MAX_LENGTH:
                logging.error(
                    "Cannot requeue extract job because the queue: %s is full"
                    % q.name)
                print "Cannot requeue extract job because the queue: %s is full" % q.name
                return None
            else:
                q.enqueue_call(func=ExtracterGenerator.extracter,
                               args=[failed_job_conf, failed_job_data])
                failed_data_retry_times = failed_data.retry_times + 1
                failed_data.update(retry_times=failed_data_retry_times)
            count += 1
            failed_task.update(status=7)
    print "%d failed extract jobs requeued successfully!" % count
Code Example #14
File: test_structure.py Project: xiaohui2856/crawl
    def test_extract_function(self):
        test_crawlertask = CrawlerTask.objects().first()
        test_crawleranalyzeddata = CrawlerAnalyzedData.objects(crawler_task=test_crawlertask).first()
        test_extracted_result = self.test_extractergenerator.extracter(test_crawlertask)
        test_extractedinfo = CrawlerExtracterInfo.objects(job=test_crawlertask.job)
        self.assertTrue(test_extracted_result)

        if test_crawlertask.status == 9:
            # Clean data
            test_crawlertask.update(status=random.choice(range(1, 10)))
        test_crawleranalyzeddata.delete()
Code Example #15
File: test_structure.py Project: xiaohui2856/crawl
 def test_parser_function(self):
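     # Parse the first task's downloaded data, check that analyzed data was stored, then
     # restore the task status and remove the analyzed data.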
     test_crawlertask = CrawlerTask.objects().first()
     test_crawlerdownloaddata = CrawlerDownloadData.objects(crawlertask=test_crawlertask).first()
     test_rawparser = self.test_parsergenerator.get_parser(test_crawlertask)
     test_analyzed_data = parser_func(test_rawparser, test_crawlerdownloaddata)
     test_crawleranalyzeddata = CrawlerAnalyzedData.objects(Q(uri=test_crawlertask.uri) & Q(
         job=test_crawlertask.job)).first()
     self.assertIsNotNone(test_crawleranalyzeddata.analyzed_data)
     if test_crawlertask.status == 7:
         # Clean data
         test_crawlertask.update(status=random.choice(range(1, 8)))
     test_crawleranalyzeddata.delete()
Code Example #16
File: test_downloader.py Project: xiaohui2856/crawl
 def test_insert_other_job(self):
     onetype = CrawlerDownloadType(language='other', is_support=True)
     onetype.save()
     job1 = Job(name='1', info='2', customer='ddd', priority=-1)
     job1.save()
     ctg1 = CrawlerTaskGenerator(job=job1,
                                 code='echo hello1',
                                 cron='* * * * *')
     ctg1.save()
     ct1 = CrawlerTask(job=job1,
                       task_generator=ctg1,
                       uri='http://www.sougou.com',
                       args='i',
                       from_host='1')
     ct1.save()
     # codestr2 = open('/Users/princetechs3/my_code/code2.sh','r').read()
     cd1 = CrawlerDownload(job=job1, code='codestr2', types=onetype)
     cd1.save()
     cds1 = CrawlerDownloadSetting(job=job1,
                                   proxy='122',
                                   cookie='22',
                                   dispatch_num=50)
     cds1.save()
Code Example #17
def requeue_failed_jobs():
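    # Requeue parse-failed crawler tasks (status 6) onto the priority queues; analyzed-data
    # records that already exceeded the retry limit are deleted instead.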
    structure_generator = StructureGenerator()
    failed_tasks = CrawlerTask.objects(status=6)

    if failed_tasks is None:
        logging.info("No failed parse jobs at this time")
        print "No failed parse jobs at this time"
    else:
        print "Current number of failed parse jobs: %d" % len(failed_tasks)
    count = 0
    deleted_data_count = 0
    for failed_task in failed_tasks:
        failed_data = CrawlerAnalyzedData.objects(
            crawler_task=failed_task).first()
        if failed_data.retry_times >= 3:
            failed_data.delete()
            deleted_data_count += 1
        else:
            failed_job_source_data = structure_generator.get_task_source_data(
                failed_task)
            failed_job_priority = structure_generator.get_task_priority(
                failed_task)
            q = None
            if failed_job_priority == Consts.QUEUE_PRIORITY_TOO_HIGH:
                q = too_high_queue
            elif failed_job_priority == Consts.QUEUE_PRIORITY_HIGH:
                q = high_queue
            elif failed_job_priority == Consts.QUEUE_PRIORITY_NORMAL:
                q = normal_queue
            elif failed_job_priority == Consts.QUEUE_PRIORITY_LOW:
                q = low_queue
            else:
                q = low_queue
            if (q.count + 1) > Consts.QUEUE_MAX_LENGTH:
                logging.error(
                    "Cannot requeue parse job because the queue: %s is full" %
                    q.name)
                print "Cannot requeue parse job because the queue: %s is full" % q.name
                return None
            else:
                q.enqueue_call(func=parser_func, args=[failed_job_source_data])
                failed_data_retry_times = failed_data.retry_times + 1
                failed_data.update(retry_times=failed_data_retry_times)
            count += 1
            failed_task.update(status=5)
    print "%d failed parse jobs requeued successfully!" % count
    if deleted_data_count > 0:
        print "Delete % d data from crawlertasks which failed for over max retry times" % deleted_data_count
Code Example #18
File: test_downloader.py Project: xiaohui2856/crawl
    def test_get_task(self):
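        # After inserting the fixture jobs, verify the expected object counts and that
        # every active job has tasks, then clean everything up again.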
        self.delete_jobs()
        self.insert_jobs()
        self.assertEqual(CrawlerDownloadType.objects.count(), 1)
        self.assertEqual(Job.objects.count(), 4)
        self.assertEqual(CrawlerTaskGenerator.objects.count(), 4)
        self.assertEqual(CrawlerTask.objects.count(), 4)
        self.assertEqual(CrawlerDownload.objects.count(), 4)
        self.assertEqual(CrawlerDownloadSetting.objects.count(), 4)

        jobs = Job.objects(status=Job.STATUS_ON).order_by('+priority')
        self.assertTrue(jobs)
        for job in jobs:
            tasks = CrawlerTask.objects(job=job)
            self.assertTrue(tasks)

        self.delete_jobs()
        count = Job.objects.count()
        self.assertEqual(count, 0)
Code Example #19
File: utils_generator.py Project: xiaohui2856/crawl
 def save_text(self, text, schemes=None):
     """
     """
     uris = self.read_from_strings(text, schemes)
     # for uri in uris:
     #     try:
     #         CrawlerTask(job= self.job, uri= uri, from_host= socket.gethostname()).save()
     #     except Exception as e:
     #         content = "%s : Error occured when saving uris %s."%(type(e), uri)
     #         # logging.error(content)
     #         CrawlerGeneratorErrorLog(name= "ERROR_SAVE", content= content, hostname= socket.gethostname()).save()
     bulk = []
     for uri in uris:
         bulk.append(CrawlerTask(job=self.job, uri=uri, from_host=socket.gethostname()))
     try:
         CrawlerTask.objects.insert(bulk)
     except Exception as e:
         CrawlerGeneratorErrorLog(name="ERROR_SAVE",
                                  content="%s : Error occured when saving uris." % (type(e)),
                                  hostname=socket.gethostname()).save()
Code Example #20
File: test_downloader.py Project: xiaohui2856/crawl
    def insert_jobs(self):
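        # Fixture set for the dispatch tests: one download type, four jobs with task
        # generators, one crawler task per job, one download per job and per-job
        # download settings.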

        onetype = CrawlerDownloadType(language='python', is_support=True)
        onetype.save()

        job1 = Job(name='1', info='2', customer='ddd', priority=-1)
        job1.save()
        job2 = Job(name='2', priority=0)
        job2.save()
        job3 = Job(name='3', priority=2)
        job3.save()
        job4 = Job(name='4', priority=3)
        job4.save()

        ctg1 = CrawlerTaskGenerator(job=job1,
                                    code='echo hello1',
                                    cron='* * * * *')
        ctg1.save()
        ctg2 = CrawlerTaskGenerator(job=job2,
                                    code='echo hello2',
                                    cron='* * * * *')
        ctg2.save()
        ctg3 = CrawlerTaskGenerator(job=job3,
                                    code='echo hello3',
                                    cron='* * * * *')
        ctg3.save()
        ctg4 = CrawlerTaskGenerator(job=job4,
                                    code='echo hello4',
                                    cron='* * * * *')
        ctg4.save()

        CrawlerTask(job=job1,
                    task_generator=ctg1,
                    uri='http://www.baidu.com',
                    args='i',
                    from_host='1').save()
        CrawlerTask(job=job3,
                    task_generator=ctg1,
                    uri='http://www.fishc.com',
                    args='l',
                    from_host='1').save()
        CrawlerTask(job=job4,
                    task_generator=ctg1,
                    uri='https://xueqiu.com/',
                    args='o',
                    from_host='2').save()
        CrawlerTask(job=job2,
                    task_generator=ctg1,
                    uri='http://www.jb51.net/article/47957.htm',
                    args='v',
                    from_host='3').save()

        codestr1 = open('/Users/princetechs3/my_code/xuqiu.py', 'r').read()
        CrawlerDownload(job=job1, code=codestr1, types=onetype).save()
        CrawlerDownload(job=job2, code=codestr1, types=onetype).save()
        CrawlerDownload(job=job3, code=codestr1, types=onetype).save()
        CrawlerDownload(job=job4, code=codestr1, types=onetype).save()

        cdc1 = CrawlerDownloadSetting(job=job1,
                                      proxy='122',
                                      cookie='22',
                                      dispatch_num=50)
        cdc1.save()
        cdc2 = CrawlerDownloadSetting(job=job2,
                                      proxy='2',
                                      cookie='3',
                                      dispatch_num=60)
        cdc2.save()
        cdc3 = CrawlerDownloadSetting(job=job3,
                                      proxy='32',
                                      cookie='21',
                                      dispatch_num=70)
        cdc3.save()
        cdc4 = CrawlerDownloadSetting(job=job4,
                                      proxy='312',
                                      cookie='221',
                                      dispatch_num=100)
        cdc4.save()
Code Example #21
File: test_structure.py Project: xiaohui2856/crawl
 def test_get_parser(self):
     test_crawlertask = CrawlerTask.objects().first()
     test_rawparser = self.test_parsergenerator.get_parser(test_crawlertask)
     self.assertIsNotNone(test_rawparser)
Code Example #22
File: structure.py Project: xiaohui2856/crawl
 def filter_parsed_tasks(self):
     parsed_tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_ANALYSIS_SUCCESS)
     if parsed_tasks is None:
         logging.info("No parsed (status = 7) tasks")
     return parsed_tasks
Code Example #23
File: structure.py Project: xiaohui2856/crawl
 def filter_downloaded_tasks(self):
     downloaded_tasks = CrawlerTask.objects(status=5)
     if downloaded_tasks is None:
         logging.info("No downloaded (status = 5) tasks")
     return downloaded_tasks
Code Example #24
File: structure.py Project: xiaohui2856/crawl
 def filter_parsed_tasks(self):
     parsed_tasks = CrawlerTask.objects(status=7)
     if parsed_tasks is None:
         logging.info("No parsed (status = 7) tasks")
     return parsed_tasks
Code Example #25
def dispatch_use_pool(task):
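    # Pick up to dispatch_num failed or live tasks for this task's job (depending on
    # settings and the current minute) and enqueue them on the super/high/mid/low
    # download queues according to job priority.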
    try:
        # dispatch_num = CrawlerDownloadSetting.objects(job=task.job)[0].dispatch_num
        dispatch_num = 1
        if dispatch_num == 0:
            write_dispatch_alter_log(job=task.job, reason='dispatch_num is 0')
            return
        # print type(dispatch_num), dispatch_num
        max_retry_times = CrawlerDownloadSetting.objects(
            job=task.job)[0].max_retry_times
        if settings.OPEN_CRAWLER_FAILED_ONLY:
            down_tasks = CrawlerTask.objects(
                status=CrawlerTask.STATUS_FAIL).order_by('?')[:dispatch_num]
        else:
            if datetime.datetime.now().minute >= 56:
                # retry_times <= max_retry_times
                down_tasks = CrawlerTask.objects(
                    status=CrawlerTask.STATUS_FAIL,
                    retry_times__lte=max_retry_times).order_by(
                        '?')[:dispatch_num]
            else:
                down_tasks = CrawlerTask.objects(status=CrawlerTask.STATUS_LIVE
                                                 ).order_by('?')[:dispatch_num]
            if len(down_tasks) == 0:
                # write_dispatch_alter_log(job=task.job, reason='get down_tasks len is 0')
                return
    except Exception as e:
        write_dispatch_error_log(job=task.job, reason=str(e))
        return None

    for task in down_tasks:
        priority = task.job.priority
        try:
            task.status = CrawlerTask.STATUS_DISPATCH
            if priority == -1:
                if len(q_down_super) >= settings.Q_DOWN_SUPER_LEN:
                    write_dispatch_alter_log(
                        job=task.job, reason='q_down_super lens get maxlen')
                    continue
                q_down_super.enqueue(download_clawer_task,
                                     args=[task],
                                     timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 0:
                if len(q_down_high) >= settings.Q_DOWN_HIGH_LEN:
                    write_dispatch_alter_log(
                        job=task.job, reason='q_down_high lens get maxlen')
                    continue
                q_down_high.enqueue(download_clawer_task,
                                    args=[task],
                                    at_front=True,
                                    timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 1:
                if len(q_down_high) >= settings.Q_DOWN_HIGH_LEN:
                    write_dispatch_alter_log(
                        job=task.job, reason='q_down_high lens get maxlen')
                    continue
                q_down_high.enqueue(download_clawer_task,
                                    args=[task],
                                    at_front=False,
                                    timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 2:
                if len(q_down_mid) >= settings.Q_DOWN_MID_LEN:
                    write_dispatch_alter_log(
                        job=task.job, reason='q_down_mid lens get maxlen')
                    continue
                q_down_mid.enqueue(download_clawer_task,
                                   args=[task],
                                   at_front=True,
                                   timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 3:
                if len(q_down_mid) >= settings.Q_DOWN_MID_LEN:
                    write_dispatch_alter_log(
                        job=task.job, reason='q_down_mid lens get maxlen')
                    continue
                q_down_mid.enqueue(download_clawer_task,
                                   args=[task],
                                   at_front=False,
                                   timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 4:
                if len(q_down_low) >= settings.Q_DOWN_LOW_LEN:
                    write_dispatch_alter_log(
                        job=task.job, reason='q_down_low lens get maxlen')
                    continue
                q_down_low.enqueue(download_clawer_task,
                                   args=[task],
                                   at_front=True,
                                   timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)
            elif priority == 5:
                if len(q_down_low) >= settings.Q_DOWN_LOW_LEN:
                    write_dispatch_alter_log(
                        job=task.job, reason='q_down_low lens get maxlen')
                    continue
                q_down_low.enqueue(download_clawer_task,
                                   args=[task],
                                   at_front=False,
                                   timeout=settings.RQ_DOWNLOAD_TASK_TIMEOUT)

            task.save()
            write_dispatch_success_log(job=task.job, reason='success')
        except Exception as e:
            task.status = CrawlerTask.STATUS_FAIL
            write_dispatch_failed_log(job=task.job, reason=str(e))