def test_cds(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']


def test_desy_broken_xml(settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))
    setup_broken_files(
        buckets=settings['buckets'],
        **settings['CRAWLER_ARGUMENTS']
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    crawl_result = crawl_results[0]
    result_records = crawl_result['results_data']

    assert not crawl_result['errors']
    assert len(result_records) == 1

    res = result_records[0]
    assert res['record']
    assert len(res['errors']) == 1
    assert 'DoJsonError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']


def test_wsp_ftp_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results

    # Second crawl over the same source should yield no new records.
    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    gotten_results = [override_generated_fields(result) for result in results]

    assert gotten_results == []


def setup(self):
    self.crawler = get_crawler_instance(CONFIG["CRAWLER_HOST_URL"])
    self.s3 = establish_s3_connection()
    self.articles_bucket = get_bucket(self.s3, CRAWLER_ARGS["files_bucket_name"])
    self.packages_bucket = get_bucket(self.s3, CRAWLER_ARGS["packages_bucket_name"])


def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == len(expected_results)

    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ])
    expected_results = sort_list_of_records_by_record_title(
        [override_generated_fields(expected) for expected in expected_results]
    )

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert gotten_results == expected_results

    for crawl_result in crawl_results:
        assert not crawl_result['errors']


def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']


def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    records = [result['record'] for result in crawl_results]

    gotten_results = override_dynamic_fields_on_records(records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_results = deep_sort(
        sorted(
            gotten_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    assert gotten_results == expected_results


def test_desy_broken_xml(get_local_settings_for_broken, cleanup):
    settings = get_local_settings_for_broken
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    crawl_result = crawl_results[0]
    result_records = crawl_result['results_data']

    assert not crawl_result['errors']
    assert len(result_records) == 1

    res = result_records[0]
    assert res['record']
    assert len(res['errors']) == 1
    assert 'ValueError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']


def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results


def test_arxiv_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL')
    )

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='arXiv',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results

    # Second crawl: no new records are expected.
    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='arXiv',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    gotten_results = [override_generated_fields(result) for result in results]

    assert gotten_results == []


def test_pos_conference_paper_record_and_proceedings_record(
    expected_results,
    config,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider='pos',
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    gotten_results = [
        override_generated_fields(result['record']) for result in crawl_results
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = sorted(
        gotten_results,
        key=lambda x: x['document_type']
    )
    expected_results = sorted(
        expected_results,
        key=lambda x: x['document_type']
    )

    assert gotten_results == expected_results


def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    results = sorted(results, key=lambda x: x['control_number'])

    gotten_results = override_dynamic_fields_on_records(results)
    expected_results = override_dynamic_fields_on_records(expected_results)

    assert gotten_results == expected_results

    for record in gotten_results:
        assert_ffts_content_matches_expected(record)


def test_cds(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    crawl_results = deep_sort(
        sorted(
            crawl_results,
            key=lambda result: result['record']['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    gotten_results = [
        override_generated_fields(result['record']) for result in crawl_results
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results


def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]
    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = deep_sort(
        sorted(
            gotten_records,
            key=lambda record: record['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    assert gotten_records == expected_results
    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0


def test_cds_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]
    results_records = deep_sort(
        sorted(
            crawl_result['results_data'],
            key=lambda result: result['record']['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    gotten_results = [
        override_generated_fields(result['record'])
        for result in results_records
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']

    # Second crawl: no new records are expected.
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0


def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]
    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = sorted(
        gotten_records,
        key=lambda record: record['titles'][0]['title'],
    )
    expected_results = sorted(
        expected_results,
        key=lambda result: result['titles'][0]['title'],
    )

    assert DeepDiff(gotten_records, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0


def test_desy_crawl_twice(expected_results, settings, cleanup):
    setup_correct_files(
        buckets=settings['buckets'],
        **settings['CRAWLER_ARGUMENTS']
    )
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]
    gotten_records = sort_list_of_records_by_record_title(
        [result['record'] for result in crawl_result['results_data']]
    )
    expected_results = sort_list_of_records_by_record_title(expected_results)

    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    # Preprocess S3 URLs: strip the signature expiry so they are comparable.
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

    for record, expected_record in zip(gotten_records, expected_results):
        assert DeepDiff(record, expected_record, ignore_order=True) == {}

    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0


def test_wsp_ftp(ftp_environment, expected_results):
    crawler = get_crawler_instance(ftp_environment.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=ftp_environment.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **ftp_environment.get('CRAWLER_ARGUMENTS')
    )

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results


def test_desy_broken_xml(get_local_settings_for_broken, cleanup):
    settings = get_local_settings_for_broken
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    res = crawl_results[0]

    assert res['record']
    assert len(res['errors']) == 1
    assert 'ValueError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']


def test_aps_have_document_link_to_s3(cleanup):
    expected_records_count = 1
    expected_documents_count = 1
    expected_s3_url = "http://localstack:4566/downloaded/full/b99616c5061a542667fb4fa1d5a8ab750a15c731.xml"
    expected_parameters_in_s3_url = ["AWSAccessKeyId", "Expires", "Signature"]
    expected_original_url = "http://aps-http-server.local/PhysRevD.96.095036.xml"

    settings = get_settings()
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='APS',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS')
    )

    gotten_records = [
        result['record']
        for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ]

    assert len(gotten_records) == expected_records_count

    documents = gotten_records[0]['documents']
    assert len(documents) == expected_documents_count
    assert documents[0]['original_url'] == expected_original_url

    document_url = documents[0]['url']
    assert document_url.split("?")[0] == expected_s3_url
    for parameter in expected_parameters_in_s3_url:
        assert parameter in document_url

    # The document served from S3 should be identical to the original one.
    s3_document_response = requests.get(document_url)
    original_document_response = requests.get(documents[0]['original_url'])
    assert s3_document_response.status_code == 200
    assert s3_document_response.text == original_document_response.text


def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
    assert not crawl_result['errors']