Example #1
def test_cds(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

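    # Schedule the crawl and wait for it to finish, collecting the results
    # reported back through Celery.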
    crawl_results = CeleryMonitor.do_crawl(app=celery_app,
                                           monitor_timeout=5,
                                           monitor_iter_limit=100,
                                           events_limit=1,
                                           crawler_instance=crawler,
                                           project=config['CRAWLER_PROJECT'],
                                           spider=spider,
                                           settings={},
                                           **config['CRAWLER_ARGUMENTS'])

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

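    # Override automatically generated fields on both sides so that only
    # stable content is compared.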
    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']
Example #2
def test_desy_broken_xml(settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))
    setup_broken_files(buckets=settings['buckets'],
                       **settings['CRAWLER_ARGUMENTS'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))
    crawl_result = crawl_results[0]
    result_records = crawl_result['results_data']

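    # The crawl itself finishes without crawl-level errors, but the single
    # harvested record carries a DoJsonError caused by the broken XML file.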
    assert not crawl_result['errors']
    assert len(result_records) == 1
    res = result_records[0]
    assert res['record']
    assert len(res['errors']) == 1
    assert 'DoJsonError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
Example #3
def test_wsp_ftp_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(app=celery_app,
                                     monitor_timeout=5,
                                     monitor_iter_limit=20,
                                     events_limit=2,
                                     crawler_instance=crawler,
                                     project=settings.get('CRAWLER_PROJECT'),
                                     spider='WSP',
                                     settings={},
                                     **settings.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results

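    # A second crawl of the same FTP source should yield no new records.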
    results = CeleryMonitor.do_crawl(app=celery_app,
                                     monitor_timeout=5,
                                     monitor_iter_limit=20,
                                     events_limit=2,
                                     crawler_instance=crawler,
                                     project=settings.get('CRAWLER_PROJECT'),
                                     spider='WSP',
                                     settings={},
                                     **settings.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]

    assert gotten_results == []
Example #4
def setup(self):
    self.crawler = get_crawler_instance(CONFIG["CRAWLER_HOST_URL"])
    self.s3 = establish_s3_connection()
    self.articles_bucket = get_bucket(self.s3,
                                      CRAWLER_ARGS["files_bucket_name"])
    self.packages_bucket = get_bucket(self.s3,
                                      CRAWLER_ARGS["packages_bucket_name"])
Example #5
def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == len(expected_results)

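    # Sort both result sets by record title so they can be compared
    # element by element.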
    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ])
    expected_results = sort_list_of_records_by_record_title(
        [override_generated_fields(expected) for expected in expected_results])

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert gotten_results == expected_results

    for crawl_result in crawl_results:
        assert not crawl_result['errors']
Example #6
def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL'),
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']
Example #7
def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    records = [result['record'] for result in crawl_results]

    gotten_results = override_dynamic_fields_on_records(records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_results = deep_sort(
        sorted(
            gotten_results,
            key=lambda result: result['titles'][0]['title'],
        ))
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        ))

    assert gotten_results == expected_results
Example #8
def test_desy_broken_xml(get_local_settings_for_broken, cleanup):
    settings = get_local_settings_for_broken
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )
    crawl_result = crawl_results[0]
    result_records = crawl_result['results_data']

    assert not crawl_result['errors']
    assert len(result_records) == 1
    res = result_records[0]
    assert res['record']
    assert len(res['errors']) == 1
    assert 'ValueError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
Example #9
def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    results = CeleryMonitor.do_crawl(app=celery_app,
                                     monitor_timeout=5,
                                     monitor_iter_limit=100,
                                     events_limit=1,
                                     crawler_instance=crawler,
                                     project=config['CRAWLER_PROJECT'],
                                     spider=spider,
                                     settings={},
                                     **config['CRAWLER_ARGUMENTS'])

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

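    # deep_sort normalizes the ordering of nested lists so the comparison
    # below does not depend on element order.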
    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
Example #10
def test_arxiv_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='arXiv',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='arXiv',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]

    assert gotten_results == []
Example #11
def test_pos_conference_paper_record_and_proceedings_record(
    expected_results,
    config,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider='pos',
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    gotten_results = [
        override_generated_fields(result['record']) for result in crawl_results
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = sorted(
        gotten_results,
        key=lambda x: x['document_type']
    )
    expected_results = sorted(
        expected_results,
        key=lambda x: x['document_type']
    )

    assert gotten_results == expected_results
Example #12
def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(app=celery_app,
                                     monitor_timeout=5,
                                     monitor_iter_limit=100,
                                     events_limit=2,
                                     crawler_instance=crawler,
                                     project=settings.get('CRAWLER_PROJECT'),
                                     spider='desy',
                                     settings={},
                                     **settings.get('CRAWLER_ARGUMENTS'))

    results = sorted(results, key=lambda x: x['control_number'])

    gotten_results = override_dynamic_fields_on_records(results)
    expected_results = override_dynamic_fields_on_records(expected_results)

    assert gotten_results == expected_results

    for record in gotten_results:
        assert_ffts_content_matches_expected(record)
Example #13
def test_cds(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS'))

    crawl_results = deep_sort(
        sorted(
            crawl_results,
            key=lambda result: result['record']['titles'][0]['title'],
        ))
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        ))

    gotten_results = [
        override_generated_fields(result['record']) for result in crawl_results
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
Example #14
def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = deep_sort(
        sorted(
            gotten_records,
            key=lambda record: record['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    assert gotten_records == expected_results
    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
Example #15
def test_cds_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    results_records = deep_sort(
        sorted(
            crawl_result['results_data'],
            key=lambda result: result['record']['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    gotten_results = [
        override_generated_fields(result['record'])
        for result in results_records
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']

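    # Crawl again: everything was already harvested, so no results are
    # expected this time.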
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
Example #16
def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = sorted(
        gotten_records,
        key=lambda record: record['titles'][0]['title'],
    )
    expected_results = sorted(
        expected_results,
        key=lambda result: result['titles'][0]['title'],
    )

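    # An empty DeepDiff result means the two record sets match
    # (ordering ignored).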
    assert DeepDiff(gotten_records, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
Example #17
def test_desy_crawl_twice(expected_results, settings, cleanup):
    setup_correct_files(buckets=settings['buckets'],
                        **settings['CRAWLER_ARGUMENTS'])
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = sort_list_of_records_by_record_title(
        [result['record'] for result in crawl_result['results_data']])
    expected_results = sort_list_of_records_by_record_title(expected_results)

    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    # preprocess S3 URLs: strip the volatile presigned-URL parameters
    # before comparing
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

    for record, expected_record in zip(gotten_records, expected_results):
        assert DeepDiff(record, expected_record, ignore_order=True) == {}

    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == 0
Example #18
def test_wsp_ftp(ftp_environment, expected_results):
    crawler = get_crawler_instance(ftp_environment.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=ftp_environment.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **ftp_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
Example #19
def test_desy_broken_xml(get_local_settings_for_broken, cleanup):
    settings = get_local_settings_for_broken
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))
    res = crawl_results[0]

    assert res['record']
    assert len(res['errors']) == 1
    assert 'ValueError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
Example #20
def test_aps_have_document_link_to_s3(cleanup):
    expected_records_count = 1
    expected_documents_count = 1
    expected_s3_url = "http://localstack:4566/downloaded/full/b99616c5061a542667fb4fa1d5a8ab750a15c731.xml"
    expected_parameters_in_s3_url = ["AWSAccessKeyId", "Expires", "Signature"]
    expected_original_url = "http://aps-http-server.local/PhysRevD.96.095036.xml"
    settings = get_settings()
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='APS',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS'))

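    # The harvested record should link its document to the S3 copy on
    # localstack (a presigned URL), while keeping the publisher address
    # in original_url.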
    gotten_records = [
        result['record'] for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ]
    assert len(gotten_records) == expected_records_count
    documents = gotten_records[0]['documents']
    assert len(documents) == expected_documents_count
    assert documents[0]['original_url'] == expected_original_url
    document_url = documents[0]['url']
    assert document_url.split("?")[0] == expected_s3_url
    for parameter in expected_parameters_in_s3_url:
        assert parameter in document_url

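    # The document served from S3 must have the same content as the
    # original publisher document.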
    s3_document_response = requests.get(document_url)
    original_document_response = requests.get(documents[0]['original_url'])
    assert s3_document_response.status_code == 200
    assert s3_document_response.text == original_document_response.text
Example #21
def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
    assert not crawl_result['errors']