Code Example #1
File: test_arxiv.py | Project: michamos/hepcrawl
def test_arxiv_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='arXiv',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='arXiv',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]

    assert gotten_results == []
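The snippets in these examples are excerpts and omit their imports. As a rough sketch only, assuming they follow hepcrawl's test-helper layout (the exact module paths are not shown in the excerpts and should be verified against each project), the shared names typically come from something like:

# Assumed import layout for these excerpts (hypothetical paths; verify against the repo).
from hepcrawl.testlib.celery_monitor import CeleryMonitor
from hepcrawl.testlib.tasks import app as celery_app
from hepcrawl.testlib.utils import deep_sort, get_crawler_instance
# Helpers such as override_generated_fields and the pytest fixtures
# (settings, cleanup, expected_results, ...) are defined in each test module.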
Code Example #2
def test_wsp_ftp_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(app=celery_app,
                                     monitor_timeout=5,
                                     monitor_iter_limit=20,
                                     events_limit=2,
                                     crawler_instance=crawler,
                                     project=settings.get('CRAWLER_PROJECT'),
                                     spider='WSP',
                                     settings={},
                                     **settings.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results

    results = CeleryMonitor.do_crawl(app=celery_app,
                                     monitor_timeout=5,
                                     monitor_iter_limit=20,
                                     events_limit=2,
                                     crawler_instance=crawler,
                                     project=settings.get('CRAWLER_PROJECT'),
                                     spider='WSP',
                                     settings={},
                                     **settings.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]

    assert gotten_results == []
Code Example #3
File: test_desy.py | Project: drjova/hepcrawl
def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = deep_sort(
        sorted(
            gotten_records,
            key=lambda record: record['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    assert gotten_records == expected_results
    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
Code Example #4
File: test_cds.py | Project: turtle321/hepcrawl
def test_cds_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    results_records = deep_sort(
        sorted(
            crawl_result['results_data'],
            key=lambda result: result['record']['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    gotten_results = [
        override_generated_fields(result['record'])
        for result in results_records
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
Code Example #5
def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = sorted(
        gotten_records,
        key=lambda record: record['titles'][0]['title'],
    )
    expected_results = sorted(
        expected_results,
        key=lambda result: result['titles'][0]['title'],
    )

    assert DeepDiff(gotten_records, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
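The DESY example above replaces plain equality with DeepDiff. A minimal, standalone illustration of that comparison idiom on toy data (not taken from the tests):

# Toy illustration of the DeepDiff(ignore_order=True) idiom used above.
from deepdiff import DeepDiff

got = [{'titles': [{'title': 'B'}]}, {'titles': [{'title': 'A'}]}]
expected = [{'titles': [{'title': 'A'}]}, {'titles': [{'title': 'B'}]}]

# An empty diff means the structures match; ignore_order=True makes list order irrelevant.
assert DeepDiff(got, expected, ignore_order=True) == {}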
Code Example #6
File: test_desy.py | Project: zanachka/hepcrawl
def test_desy_crawl_twice(expected_results, settings, cleanup):
    setup_correct_files(buckets=settings['buckets'],
                        **settings['CRAWLER_ARGUMENTS'])
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = sort_list_of_records_by_record_title(
        [result['record'] for result in crawl_result['results_data']])
    expected_results = sort_list_of_records_by_record_title(expected_results)

    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    # preprocess S3 URLs
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

    for record, expected_record in zip(gotten_records, expected_results):
        assert DeepDiff(record, expected_record, ignore_order=True) == {}

    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == 0
Code Example #7
File: test_wsp.py | Project: vbalbp/hepcrawl
def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL'),
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']
Code Example #8
File: test_desy.py | Project: drjova/hepcrawl
def test_desy_broken_xml(get_local_settings_for_broken, cleanup):
    settings = get_local_settings_for_broken
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )
    crawl_result = crawl_results[0]
    result_records = crawl_result['results_data']

    assert not crawl_result['errors']
    assert len(result_records) == 1
    res = result_records[0]
    assert res['record']
    assert len(res['errors']) == 1
    assert 'ValueError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
Code Example #9
File: test_pos.py | Project: katrinleinweber/hepcrawl
def test_pos_conference_paper_record_and_proceedings_record(
    expected_results,
    config,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider='pos',
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    gotten_results = [
        override_generated_fields(result['record']) for result in crawl_results
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = sorted(
        gotten_results,
        key=lambda x: x['document_type']
    )
    expected_results = sorted(
        expected_results,
        key=lambda x: x['document_type']
    )

    assert gotten_results == expected_results
Code Example #10
File: test_arxiv.py | Project: ammirate/hepcrawl
def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    results = CeleryMonitor.do_crawl(app=celery_app,
                                     monitor_timeout=5,
                                     monitor_iter_limit=100,
                                     events_limit=1,
                                     crawler_instance=crawler,
                                     project=config['CRAWLER_PROJECT'],
                                     spider=spider,
                                     settings={},
                                     **config['CRAWLER_ARGUMENTS'])

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
Code Example #11
    def test_elsevier_spider_doesnt_parse_articles_with_missing_metadata_or_wrong_doctype(
            self, teardown):
        CRAWLER_ARGS["elsevier_consyn_url"] = "http://elsevier-http-server.local/elsevier_batch_feed_response_with_wrong_articles.txt"

        crawl_results = CeleryMonitor.do_crawl(
            app=celery_app,
            monitor_timeout=5,
            monitor_iter_limit=20,
            events_limit=1,
            crawler_instance=self.crawler,
            project=CONFIG["CRAWLER_PROJECT"],
            spider="elsevier",
            settings=crawler_settings,
            **CRAWLER_ARGS)

        nb_of_packages_in_s3 = len(
            [package for package in self.packages_bucket.objects.all()])

        articles_in_s3 = len(
            [article for article in self.articles_bucket.objects.all()])

        assert nb_of_packages_in_s3 == 2
        assert articles_in_s3 == 8
        assert not crawl_results
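The Elsevier tests count S3 objects by materializing a list. An equivalent, slightly leaner form, shown here as a hypothetical helper (not part of the original tests) that assumes a boto3 Bucket resource:

# Hypothetical helper, not in the original tests: count objects in a boto3 bucket
# without building an intermediate list.
def count_objects(bucket):
    return sum(1 for _ in bucket.objects.all())

# Usage in a test would then read: nb_of_packages_in_s3 = count_objects(self.packages_bucket)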
Code Example #12
File: test_desy.py | Project: zanachka/hepcrawl
def test_desy_broken_xml(settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))
    setup_broken_files(buckets=settings['buckets'],
                       **settings['CRAWLER_ARGUMENTS'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))
    crawl_result = crawl_results[0]
    result_records = crawl_result['results_data']

    assert not crawl_result['errors']
    assert len(result_records) == 1
    res = result_records[0]
    assert res['record']
    assert len(res['errors']) == 1
    assert 'DoJsonError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
Code Example #13
File: test_desy.py | Project: katrinleinweber/hepcrawl
def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    records = [result['record'] for result in crawl_results]

    gotten_results = override_dynamic_fields_on_records(records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_results = deep_sort(
        sorted(
            gotten_results,
            key=lambda result: result['titles'][0]['title'],
        ))
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        ))

    assert gotten_results == expected_results
Code Example #14
File: test_desy.py | Project: michamos/hepcrawl
def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(app=celery_app,
                                     monitor_timeout=5,
                                     monitor_iter_limit=100,
                                     events_limit=2,
                                     crawler_instance=crawler,
                                     project=settings.get('CRAWLER_PROJECT'),
                                     spider='desy',
                                     settings={},
                                     **settings.get('CRAWLER_ARGUMENTS'))

    results = sorted(results, key=lambda x: x['control_number'])

    gotten_results = override_dynamic_fields_on_records(results)
    expected_results = override_dynamic_fields_on_records(expected_results)

    assert gotten_results == expected_results

    for record in gotten_results:
        assert_ffts_content_matches_expected(record)
Code Example #15
File: test_cds.py | Project: miguelgrc/hepcrawl
def test_cds(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(app=celery_app,
                                           monitor_timeout=5,
                                           monitor_iter_limit=100,
                                           events_limit=1,
                                           crawler_instance=crawler,
                                           project=config['CRAWLER_PROJECT'],
                                           spider=spider,
                                           settings={},
                                           **config['CRAWLER_ARGUMENTS'])

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']
Code Example #16
File: test_wsp.py | Project: inspirehep/hepcrawl
def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == len(expected_results)

    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ])
    expected_results = sort_list_of_records_by_record_title(
        [override_generated_fields(expected) for expected in expected_results])

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert gotten_results == expected_results

    for crawl_result in crawl_results:
        assert not crawl_result['errors']
Code Example #17
    def test_elsevier_spider(self, setup_s3):
        CRAWLER_ARGS["elsevier_consyn_url"] = "http://elsevier-http-server.local/elsevier_batch_feed_response_mock.txt"
        expected_number_of_zip_files = 1
        expected_article_names = set([
            "10.1016/j.geomphys.2020.103892.xml",
            "10.1016/j.geomphys.2020.103898.xml",
            "10.1016/j.geomphys.2020.103925.xml",
            "10.1016/j.geomphys.2020.103921.xml",
        ])
        expected_records = get_expected_parser_responses_for_new_articles_in_s3()
        crawl_results = CeleryMonitor.do_crawl(
            app=celery_app,
            monitor_timeout=5,
            monitor_iter_limit=20,
            events_limit=1,
            crawler_instance=self.crawler,
            project=CONFIG["CRAWLER_PROJECT"],
            spider="elsevier",
            settings=crawler_settings,
            **CRAWLER_ARGS)

        gotten_records = [
            result['record'] for crawl_result in crawl_results
            for result in crawl_result['results_data']
        ]

        for record in gotten_records:
            record.pop("acquisition_source")
            for document in record['documents']:
                assert CRAWLER_ARGS['s3_host'] in document['url']
                assert "Expires" in document['url']
                assert document['key'].endswith(".xml")
            record.pop('documents')
        extracted_articles_names = set(
            [article.key for article in self.articles_bucket.objects.all()])
        nb_of_packages_in_s3 = len(
            [package for package in self.packages_bucket.objects.all()])

        correctly_parsed_records = [
            record for record in gotten_records if record in expected_records
        ]

        assert nb_of_packages_in_s3 == expected_number_of_zip_files
        assert extracted_articles_names == expected_article_names
        assert len(correctly_parsed_records) == 2
Code Example #18
    def test_elsevier_spider_doesnt_add_already_existing_packages(self):
        crawl_results = CeleryMonitor.do_crawl(
            app=celery_app,
            monitor_timeout=5,
            monitor_iter_limit=20,
            events_limit=1,
            crawler_instance=self.crawler,
            project=CONFIG["CRAWLER_PROJECT"],
            spider="elsevier",
            settings=crawler_settings,
            **CRAWLER_ARGS)

        nb_of_packages_in_s3 = len(
            [package for package in self.packages_bucket.objects.all()])

        assert nb_of_packages_in_s3 == 1
        assert not crawl_results
Code Example #19
def test_wsp_ftp(ftp_environment, expected_results):
    crawler = get_crawler_instance(ftp_environment.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=ftp_environment.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **ftp_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
Code Example #20
    def test_elsevier_spider_doesnt_add_already_existing_articles(
            self, teardown):
        CRAWLER_ARGS["elsevier_consyn_url"] = "http://elsevier-http-server.local/elsevier_batch_feed_response_mock_replicated.txt"

        crawl_results = CeleryMonitor.do_crawl(
            app=celery_app,
            monitor_timeout=5,
            monitor_iter_limit=20,
            events_limit=1,
            crawler_instance=self.crawler,
            project=CONFIG["CRAWLER_PROJECT"],
            spider="elsevier",
            settings=crawler_settings,
            **CRAWLER_ARGS)

        articles_in_s3 = len(
            [article for article in self.articles_bucket.objects.all()])

        assert articles_in_s3 == 4
        assert not crawl_results
Code Example #21
File: test_desy.py | Project: katrinleinweber/hepcrawl
def test_desy_broken_xml(get_local_settings_for_broken, cleanup):
    settings = get_local_settings_for_broken
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))
    res = crawl_results[0]

    assert res['record']
    assert len(res['errors']) == 1
    assert 'ValueError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
Code Example #22
def test_aps_have_document_link_to_s3(cleanup):
    expected_records_count = 1
    expected_documents_count = 1
    expected_s3_url = "http://localstack:4566/downloaded/full/b99616c5061a542667fb4fa1d5a8ab750a15c731.xml"
    expected_parameters_in_s3_url = ["AWSAccessKeyId", "Expires", "Signature"]
    expected_original_url = "http://aps-http-server.local/PhysRevD.96.095036.xml"
    settings = get_settings()
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='APS',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS'))

    gotten_records = [
        result['record'] for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ]
    assert len(gotten_records) == expected_records_count
    documents = gotten_records[0]['documents']
    assert len(documents) == expected_documents_count
    assert documents[0]['original_url'] == expected_original_url
    document_url = documents[0]['url']
    assert document_url.split("?")[0] == expected_s3_url
    for parameter in expected_parameters_in_s3_url:
        assert parameter in document_url

    s3_document_response = requests.get(document_url)
    original_document_response = requests.get(documents[0]['original_url'])
    assert s3_document_response.status_code == 200
    assert s3_document_response.text == original_document_response.text
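The presigned-URL assertions above rely on substring checks. A minimal alternative sketch using only the standard library (not part of the original test) that parses the query string explicitly:

# Alternative sketch: check presigned-URL parameters via urllib.parse instead of substrings.
from urllib.parse import parse_qs, urlsplit

url = 'http://localstack:4566/downloaded/full/example.xml?AWSAccessKeyId=key&Expires=123&Signature=sig'
parts = urlsplit(url)
assert '{0}://{1}{2}'.format(parts.scheme, parts.netloc, parts.path) == 'http://localstack:4566/downloaded/full/example.xml'
assert {'AWSAccessKeyId', 'Expires', 'Signature'} <= set(parse_qs(parts.query))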
Code Example #23
File: test_arxiv.py | Project: drjova/hepcrawl
def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
    assert not crawl_result['errors']