def test_arxiv_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='arXiv',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='arXiv',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]

    assert gotten_results == []
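# The equality checks in these tests only make sense because
# override_generated_fields() pins the run-dependent parts of a crawled record
# (timestamps, submission identifiers) before comparison. The helper below is
# a minimal sketch of that idea, assuming the volatile fields live under
# 'acquisition_source'; it is an illustration, not necessarily the project's
# actual implementation.
def override_generated_fields(record):
    # Replace values that differ on every crawl with fixed placeholders so
    # that two runs over the same source compare equal.
    record['acquisition_source']['datetime'] = '2017-04-03T10:26:40.365216'
    record['acquisition_source']['submission_number'] = (
        '5652c7f6190f11e79e8000224dabeaad'
    )
    return record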
def test_wsp_ftp_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]

    assert gotten_results == []
def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = deep_sort(
        sorted(
            gotten_records,
            key=lambda record: record['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    assert gotten_records == expected_results
    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
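# deep_sort() is what makes the plain equality assertions insensitive to the
# ordering of nested lists inside a record (authors, DOIs, documents, ...).
# The recursive sketch below illustrates that behaviour under that assumption;
# it is not necessarily the project's own implementation.
def deep_sort(value):
    # Recursively sort every list in a nested dict/list structure so that two
    # structurally equal records compare equal regardless of list ordering.
    if isinstance(value, dict):
        return {key: deep_sort(item) for key, item in value.items()}
    if isinstance(value, list):
        return sorted((deep_sort(item) for item in value), key=repr)
    return value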
def test_cds_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    results_records = deep_sort(
        sorted(
            crawl_result['results_data'],
            key=lambda result: result['record']['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    gotten_results = [
        override_generated_fields(result['record'])
        for result in results_records
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = sorted(
        gotten_records,
        key=lambda record: record['titles'][0]['title'],
    )
    expected_results = sorted(
        expected_results,
        key=lambda result: result['titles'][0]['title'],
    )

    assert DeepDiff(gotten_records, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
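# DeepDiff comes from the deepdiff package and returns an empty mapping when
# the two structures match, which is why the assertion above compares against
# {}. With ignore_order=True the ordering of list elements is ignored, so
# nested lists do not need to be pre-sorted for the comparison itself. A tiny
# standalone illustration (the 'dois' key is just an example value):
from deepdiff import DeepDiff

assert DeepDiff({'dois': [1, 2]}, {'dois': [2, 1]}, ignore_order=True) == {}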
def test_desy_crawl_twice(expected_results, settings, cleanup):
    setup_correct_files(buckets=settings['buckets'],
                        **settings['CRAWLER_ARGUMENTS'])
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = sort_list_of_records_by_record_title(
        [result['record'] for result in crawl_result['results_data']])
    expected_results = sort_list_of_records_by_record_title(expected_results)

    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    # Preprocess S3 URLs: strip the expiring query part before comparison.
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

    for record, expected_record in zip(gotten_records, expected_results):
        assert DeepDiff(record, expected_record, ignore_order=True) == {}

    assert not crawl_result['errors']

    # Second crawl
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == 0
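# sort_list_of_records_by_record_title() just puts both sides of the
# comparison into a deterministic order before they are zipped together. The
# one-liner below is a plausible sketch of such a helper; the key path
# record['titles'][0]['title'] mirrors the sort keys used explicitly in the
# other tests here, but the real helper may differ.
def sort_list_of_records_by_record_title(records):
    # Order records by their first title so gotten and expected lists align.
    return sorted(records, key=lambda record: record['titles'][0]['title'])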
def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL'),
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']
def test_desy_broken_xml(get_local_settings_for_broken, cleanup):
    settings = get_local_settings_for_broken
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    crawl_result = crawl_results[0]
    result_records = crawl_result['results_data']

    assert not crawl_result['errors']
    assert len(result_records) == 1

    res = result_records[0]

    assert res['record']
    assert len(res['errors']) == 1
    assert 'ValueError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
def test_pos_conference_paper_record_and_proceedings_record(
    expected_results,
    config,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider='pos',
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_results
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = sorted(
        gotten_results,
        key=lambda x: x['document_type']
    )
    expected_results = sorted(
        expected_results,
        key=lambda x: x['document_type']
    )

    assert gotten_results == expected_results
def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS'])

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
def test_elsevier_spider_doesnt_parse_articles_with_missing_metadata_or_wrong_doctype(
    self, teardown
):
    CRAWLER_ARGS["elsevier_consyn_url"] = (
        "http://elsevier-http-server.local/elsevier_batch_feed_response_with_wrong_articles.txt"
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=self.crawler,
        project=CONFIG["CRAWLER_PROJECT"],
        spider="elsevier",
        settings=crawler_settings,
        **CRAWLER_ARGS)

    nb_of_packages_in_s3 = len(
        [package for package in self.packages_bucket.objects.all()])
    articles_in_s3 = len(
        [article for article in self.articles_bucket.objects.all()])

    assert nb_of_packages_in_s3 == 2
    assert articles_in_s3 == 8
    assert not crawl_results
def test_desy_broken_xml(settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))
    setup_broken_files(buckets=settings['buckets'],
                       **settings['CRAWLER_ARGUMENTS'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    crawl_result = crawl_results[0]
    result_records = crawl_result['results_data']

    assert not crawl_result['errors']
    assert len(result_records) == 1

    res = result_records[0]

    assert res['record']
    assert len(res['errors']) == 1
    assert 'DoJsonError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    records = [result['record'] for result in crawl_results]

    gotten_results = override_dynamic_fields_on_records(records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_results = deep_sort(
        sorted(
            gotten_results,
            key=lambda result: result['titles'][0]['title'],
        ))
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        ))

    assert gotten_results == expected_results
def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    results = sorted(results, key=lambda x: x['control_number'])
    gotten_results = override_dynamic_fields_on_records(results)
    expected_results = override_dynamic_fields_on_records(expected_results)

    assert gotten_results == expected_results

    for record in gotten_results:
        assert_ffts_content_matches_expected(record)
def test_cds(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS'])

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']
def test_wsp(expected_results, settings, cleanup):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    assert len(crawl_results) == len(expected_results)

    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ])
    expected_results = sort_list_of_records_by_record_title(
        [override_generated_fields(expected) for expected in expected_results])

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert gotten_results == expected_results

    for crawl_result in crawl_results:
        assert not crawl_result['errors']
def test_elsevier_spider(self, setup_s3):
    CRAWLER_ARGS["elsevier_consyn_url"] = (
        "http://elsevier-http-server.local/elsevier_batch_feed_response_mock.txt"
    )
    expected_number_of_zip_files = 1
    expected_article_names = set([
        "10.1016/j.geomphys.2020.103892.xml",
        "10.1016/j.geomphys.2020.103898.xml",
        "10.1016/j.geomphys.2020.103925.xml",
        "10.1016/j.geomphys.2020.103921.xml",
    ])
    expected_records = get_expected_parser_responses_for_new_articles_in_s3()

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=self.crawler,
        project=CONFIG["CRAWLER_PROJECT"],
        spider="elsevier",
        settings=crawler_settings,
        **CRAWLER_ARGS)

    gotten_records = [
        result['record']
        for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ]

    for record in gotten_records:
        record.pop("acquisition_source")
        for document in record['documents']:
            assert CRAWLER_ARGS['s3_host'] in document['url']
            assert "Expires" in document['url']
            assert document['key'].endswith(".xml")
        record.pop('documents')

    extracted_articles_names = set(
        [article.key for article in self.articles_bucket.objects.all()])
    nb_of_packages_in_s3 = len(
        [package for package in self.packages_bucket.objects.all()])
    correctly_parsed_records = [
        record for record in gotten_records if record in expected_records
    ]

    assert nb_of_packages_in_s3 == expected_number_of_zip_files
    assert extracted_articles_names == expected_article_names
    assert len(correctly_parsed_records) == 2
def test_elsevier_spider_doesnt_add_already_existing_packages(self):
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=self.crawler,
        project=CONFIG["CRAWLER_PROJECT"],
        spider="elsevier",
        settings=crawler_settings,
        **CRAWLER_ARGS)

    nb_of_packages_in_s3 = len(
        [package for package in self.packages_bucket.objects.all()])

    assert nb_of_packages_in_s3 == 1
    assert not crawl_results
def test_wsp_ftp(ftp_environment, expected_results):
    crawler = get_crawler_instance(ftp_environment.get('CRAWLER_HOST_URL'))

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=ftp_environment.get('CRAWLER_PROJECT'),
        spider='WSP',
        settings={},
        **ftp_environment.get('CRAWLER_ARGUMENTS'))

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
def test_elsevier_spider_doesnt_add_already_existing_articles(
    self, teardown
):
    CRAWLER_ARGS["elsevier_consyn_url"] = (
        "http://elsevier-http-server.local/elsevier_batch_feed_response_mock_replicated.txt"
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=self.crawler,
        project=CONFIG["CRAWLER_PROJECT"],
        spider="elsevier",
        settings=crawler_settings,
        **CRAWLER_ARGS)

    articles_in_s3 = len(
        [article for article in self.articles_bucket.objects.all()])

    assert articles_in_s3 == 4
    assert not crawl_results
def test_desy_broken_xml(get_local_settings_for_broken, cleanup):
    settings = get_local_settings_for_broken
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    res = crawl_results[0]

    assert res['record']
    assert len(res['errors']) == 1
    assert 'ValueError' in res['errors'][0]['exception']
    assert res['errors'][0]['traceback']
    assert res['file_name'] == 'broken_record.xml'
    assert res['source_data']
def test_aps_have_document_link_to_s3(cleanup):
    expected_records_count = 1
    expected_documents_count = 1
    expected_s3_url = "http://localstack:4566/downloaded/full/b99616c5061a542667fb4fa1d5a8ab750a15c731.xml"
    expected_parameters_in_s3_url = ["AWSAccessKeyId", "Expires", "Signature"]
    expected_original_url = "http://aps-http-server.local/PhysRevD.96.095036.xml"

    settings = get_settings()
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='APS',
        settings=settings.get('CRAWLER_SETTINGS'),
        **settings.get('CRAWLER_ARGUMENTS'))

    gotten_records = [
        result['record']
        for crawl_result in crawl_results
        for result in crawl_result['results_data']
    ]

    assert len(gotten_records) == expected_records_count

    documents = gotten_records[0]['documents']
    assert len(documents) == expected_documents_count
    assert documents[0]['original_url'] == expected_original_url

    document_url = documents[0]['url']
    assert document_url.split("?")[0] == expected_s3_url
    for parameter in expected_parameters_in_s3_url:
        assert parameter in document_url

    s3_document_response = requests.get(document_url)
    original_document_response = requests.get(documents[0]['original_url'])

    assert s3_document_response.status_code == 200
    assert s3_document_response.text == original_document_response.text
def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
    assert not crawl_result['errors']