Example #1
def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL'),
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ])
    expected_results = sort_list_of_records_by_record_title([
        override_generated_fields(expected) for expected in expected_results
    ])

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert gotten_results == expected_results
    assert not crawl_result['errors']
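
The snippet relies on DeepDiff (from the deepdiff package, imported as `from deepdiff import DeepDiff`) and on two helpers that are not shown here. A minimal sketch of what those helpers might look like, assuming the record title lives under `titles[0].title` and that `acquisition_source.datetime` is the run-dependent field being masked (all names and values below are assumptions, not confirmed by the snippet):

import copy

def sort_list_of_records_by_record_title(records):
    # Hypothetical helper: order records deterministically so crawled and
    # expected lists can be compared pairwise.
    return sorted(records, key=lambda record: record['titles'][0]['title'])

def override_generated_fields(record):
    # Hypothetical helper: replace run-dependent values (e.g. the harvest
    # timestamp) with a fixed placeholder before diffing.
    record = copy.deepcopy(record)
    if 'acquisition_source' in record:
        record['acquisition_source']['datetime'] = '2017-04-03T00:00:00'
    return record
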
Example #2
def test_desy_crawl_twice(expected_results, settings, cleanup):
    setup_correct_files(buckets=settings['buckets'],
                        **settings['CRAWLER_ARGUMENTS'])
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = sort_list_of_records_by_record_title(
        [result['record'] for result in crawl_result['results_data']])
    expected_results = sort_list_of_records_by_record_title(expected_results)

    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    # preprocess S3 URLs
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

    for record, expected_record in zip(gotten_records, expected_results):
        assert DeepDiff(record, expected_record, ignore_order=True) == {}

    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == 0
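
The inline loop that trims the signed query string from presigned S3 document URLs could be factored into a small helper; a sketch under the same assumption that presigned URLs carry an `&Expires=` parameter (the helper name is made up for illustration):

def strip_s3_signatures(records, s3_server):
    # Hypothetical helper: drop the time-limited signature from presigned
    # S3 URLs so they can be compared against static expected fixtures.
    for record in records:
        for document in record.get('documents', []):
            if s3_server in document['url']:
                assert '&Expires=' in document['url']
                document['url'] = document['url'].split('&Expires=')[0]
    return records

With such a helper, the loop above would reduce to `gotten_records = strip_s3_signatures(gotten_records, settings['CRAWLER_ARGUMENTS']['s3_server'])`.
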
Example #3
def test_cds(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = sort_list_of_records_by_record_title(
        [
            override_generated_fields(result['record'])
            for result in crawl_result['results_data']
        ]
    )
    expected_results = sort_list_of_records_by_record_title(
        [
            override_generated_fields(expected) for expected in expected_results
        ]
    )

    for record, expected_record in zip(gotten_results, expected_results):
        assert DeepDiff(record, expected_record, ignore_order=True) == {}
    assert not crawl_result['errors']
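
This test receives `config` and `spider` as pytest fixtures, so the same body can serve several CDS configurations. A minimal sketch of how such fixtures might be wired up (the fixture values below are placeholders for illustration, not taken from the snippet):

import pytest

@pytest.fixture
def spider():
    # Hypothetical fixture: the spider name passed to CeleryMonitor.do_crawl.
    return 'CDS'

@pytest.fixture
def config():
    # Hypothetical fixture: crawler service location and spider arguments.
    return {
        'CRAWLER_HOST_URL': 'http://localhost:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://path/to/cds_fixture.xml',
        },
    }
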