# NOTE: the project-internal import paths, the test-helper imports, and the
# KEY value below are assumed for illustration; the real test module defines
# its own (adjust to the actual inspirehep tree).
import json
import os
import time
from unittest.mock import patch

import pkg_resources
import pytest
from botocore.exceptions import ClientError
from flask import current_app
from flask_sqlalchemy import models_committed
from helpers.providers.faker import faker
from helpers.utils import create_record, es_search
from invenio_db import db
from invenio_search import current_search

from inspirehep.files.api import current_s3_instance
from inspirehep.indexer.tasks import batch_index_literature_fulltext
from inspirehep.migrator.tasks import migrate_from_file
from inspirehep.records.api import LiteratureRecord
from inspirehep.records.receivers import index_after_commit
from inspirehep.search.api import LiteratureSearch

# Example md5-style file key (illustrative value).
KEY = "e5011f684b9eac0b94e4b6e85e768dbf"


def test_upload_file(inspire_app, s3):
    create_s3_bucket(KEY)
    filename = "file.txt"
    mimetype = "text/*"
    acl = "public-read"
    expected_content = "This is a demo file\n"
    record_fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join("fixtures", "file.txt")
    )
    with open(record_fixture_path, "rb") as data:
        current_s3_instance.upload_file(data, KEY, filename, mimetype, acl)
    result = current_s3_instance.client.head_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY
    )
    assert result["ContentDisposition"] == f'inline; filename="{filename}"'
    assert result["ContentType"] == mimetype
    content = (
        current_s3_instance.resource.Object(
            current_s3_instance.get_bucket_for_file_key(KEY), KEY
        )
        .get()["Body"]
        .read()
        .decode("utf-8")
    )
    assert content == expected_content

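# `create_s3_bucket` and `create_s3_file` are shared helpers in the real test
# suite; a minimal sketch of their assumed behaviour against the moto-backed
# `s3` fixture, using only standard boto3 client calls:
def create_s3_bucket(key):
    # Create the bucket that current_s3_instance maps this file key to.
    bucket = current_s3_instance.get_bucket_for_file_key(key)
    current_s3_instance.client.create_bucket(Bucket=bucket)


def create_s3_file(bucket, key, data, metadata=None, **kwargs):
    # Store `data` under `key`, optionally attaching user-defined metadata
    # and extra put_object arguments such as ContentType.
    current_s3_instance.client.put_object(
        Bucket=bucket, Key=key, Body=data, Metadata=metadata or {}, **kwargs
    )
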
def test_file_exists_when_file_is_there(inspire_app, s3):
    expected_result = True
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY), KEY, "this is my data"
    )
    result = current_s3_instance.file_exists(KEY)
    assert result == expected_result

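# `file_exists` is a method of the application's S3 service; a plausible
# implementation sketch (not necessarily the project's exact code) is a HEAD
# request that maps a 404 to False:
#
#     def file_exists(self, key):
#         try:
#             self.client.head_object(
#                 Bucket=self.get_bucket_for_file_key(key), Key=key
#             )
#             return True
#         except ClientError as exc:
#             if exc.response["Error"]["Code"] == "404":
#                 return False
#             raise
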
def test_migrate_record_from_mirror_invalidates_local_file_cache_if_no_local_file(
    inspire_app, s3, redis, datadir
):
    expected_key = "f43f40833edfd8227c4deb9ad05b321e"
    create_s3_bucket(expected_key)
    with patch.dict(
        current_app.config,
        {"LABS_AFS_HTTP_SERVICE": "http://inspire-afs-web.cern.ch/"},
    ):
        redis.delete("afs_file_locations")
        # populate cache with invalid file path
        redis.hset(
            "afs_file_locations",
            "http://inspire-afs-web.cern.ch/var/data/files/g97/1940001/content.pdf%3B2",
            "/api/files/ddb1a354-1d2a-40b6-9cc4-2e823b6bef81/0000000000000000000000000000000000000000",
        )
        raw_record_path = (datadir / "1313624.xml").as_posix()
        migrate_from_file(raw_record_path)
        record = LiteratureRecord.get_record_by_pid_value("1313624")
        assert redis.hlen("afs_file_locations") > 0
        assert (
            record["documents"][0]["original_url"]
            == "http://inspire-afs-web.cern.ch/var/data/files/g97/1940001/content.pdf%3B2"
        )

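# Context for the two migrator tests in this file: the migrator keeps an
# `afs_file_locations` Redis hash mapping AFS HTTP URLs to local file-API
# paths so repeated migrations skip re-downloading. A sketch of the assumed
# lookup-or-invalidate step (`resolve_afs_file` and `local_file_exists` are
# hypothetical names; the real logic lives in the migrator):
#
#     def resolve_afs_file(redis, url):
#         cached = redis.hget("afs_file_locations", url)
#         if cached and local_file_exists(cached):
#             return cached  # cache hit: reuse the local file, no original_url kept
#         redis.hdel("afs_file_locations", url)  # stale entry: invalidate
#         return None  # caller falls back to downloading from `url`
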
def test_delete_file(inspire_app, s3):
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY), KEY, "this is my data"
    )
    current_s3_instance.delete_file(KEY)
    with pytest.raises(ClientError):
        current_s3_instance.client.head_object(
            Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY
        )

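# A plausible sketch of `delete_file` (again, not necessarily the project's
# exact code): a single delete_object call, after which the head_object above
# raises ClientError:
#
#     def delete_file(self, key):
#         self.client.delete_object(
#             Bucket=self.get_bucket_for_file_key(key), Key=key
#         )
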
def test_indexer_creates_proper_fulltext_links_in_ui_display_files_enabled(
    inspire_app, s3
):
    create_s3_bucket("1")
    create_s3_bucket("f")
    expected_fulltext_links = ["arXiv", "KEK scanned document", "fulltext"]
    data = {
        "external_system_identifiers": [
            {"schema": "OSTI", "value": "7224300"},
            {"schema": "ADS", "value": "1994PhRvD..50.4491S"},
            {"schema": "KEKSCAN", "value": "94-07-219"},
            {"schema": "SPIRES", "value": "SPIRES-2926342"},
        ],
        "arxiv_eprints": [{"categories": ["hep-ph"], "value": "hep-ph/9404247"}],
        "documents": [
            {
                "source": "arxiv",
                "fulltext": True,
                "hidden": True,
                "key": "arXiv:nucl-th_9310030.pdf",
                "url": "https://arxiv.org/pdf/1910.11662.pdf",
            },
            {
                "source": "arxiv",
                "key": "arXiv:nucl-th_9310031.pdf",
                "url": "http://inspirehep.net/record/863300/files/fermilab-pub-10-255-e.pdf",
            },
        ],
    }
    create_record("lit", data=data)

    response = es_search("records-hep")
    result = response["hits"]["hits"][0]["_source"]
    result_ui_display = json.loads(result.pop("_ui_display"))
    for link in result_ui_display["fulltext_links"]:
        assert link["value"]
        assert link["description"] in expected_fulltext_links

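# The `_ui_display` assertions above rely on the indexer deriving links both
# from non-hidden fulltext documents and from arXiv eprints / external
# identifiers. Roughly the shape being asserted (placeholder values,
# illustrative only):
#
#     "fulltext_links": [
#         {"description": "arXiv", "value": "<url built from arxiv_eprints>"},
#         {"description": "KEK scanned document", "value": "<url built from the KEKSCAN id>"},
#         {"description": "fulltext", "value": "<url of a non-hidden document>"},
#     ]
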
def test_get_file_metadata(inspire_app, s3):
    expected_metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        expected_metadata,
    )
    metadata = current_s3_instance.get_file_metadata(KEY)["Metadata"]
    assert metadata == expected_metadata

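# `get_file_metadata` is assumed to be a thin head_object wrapper; the
# "Metadata" entry of the HEAD response carries exactly the user-defined
# metadata stored by put_object:
#
#     def get_file_metadata(self, key):
#         return self.client.head_object(
#             Bucket=self.get_bucket_for_file_key(key), Key=key
#         )
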
def test_index_record_fulltext_manually(
    inspire_app, clean_celery_session, override_config, s3, datadir
):
    metadata = {"foo": "bar"}
    pdf_path = os.path.join(datadir, "2206.04407.pdf")
    create_s3_bucket(KEY)
    # Upload the PDF content itself (not just the path string) so the
    # fulltext pipeline has something to extract.
    with open(pdf_path, "rb") as pdf_file:
        create_s3_file(
            current_s3_instance.get_bucket_for_file_key(KEY),
            KEY,
            pdf_file,
            metadata,
            **{"ContentType": "application/pdf"},
        )

    with override_config(
        FEATURE_FLAG_ENABLE_FULLTEXT=True, FEATURE_FLAG_ENABLE_FILES=False
    ):
        data = faker.record("lit")
        data.update(
            {
                "documents": [
                    {
                        "source": "arxiv",
                        "fulltext": True,
                        "filename": "new_doc.pdf",
                        "key": KEY,
                        "url": "http://www.africau.edu/images/default/sample.pdf",
                    }
                ]
            }
        )
        rec = LiteratureRecord.create(data)
        models_committed.disconnect(index_after_commit)
        db.session.commit()
        models_committed.connect(index_after_commit)
        assert_record_not_in_es(rec["control_number"])

        rec.index_fulltext()

        def assert_record_in_es():
            current_search.flush_and_refresh("*")
            record_lit_es = (
                LiteratureSearch().get_record(str(rec.id)).execute().hits.hits[0]
            )
            document = record_lit_es._source["documents"][0]
            assert "attachment" in document
            assert "text" not in document  # pipeline should remove it

        retry_until_pass(assert_record_in_es, timeout=90, retry_interval=5)

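# `retry_until_pass` is a shared polling helper in the real suite; a minimal
# sketch, assuming it re-runs the assertion callback until it stops raising
# AssertionError or the timeout elapses (default values are guesses):
def retry_until_pass(assert_func, timeout=60, retry_interval=2):
    deadline = time.time() + timeout
    while True:
        try:
            return assert_func()
        except AssertionError:
            if time.time() >= deadline:
                raise
            time.sleep(retry_interval)
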
def test_migrate_record_from_mirror_uses_local_cache_for_afs_files(
    inspire_app, s3, redis, datadir
):
    expected_key = "f43f40833edfd8227c4deb9ad05b321e"
    create_s3_bucket(expected_key)
    with patch.dict(
        current_app.config,
        {"LABS_AFS_HTTP_SERVICE": "http://inspire-afs-web.cern.ch/"},
    ):
        redis.delete("afs_file_locations")
        raw_record_path = (datadir / "1313624.xml").as_posix()
        migrate_from_file(raw_record_path)
        assert redis.hlen("afs_file_locations") > 0
        migrate_from_file(raw_record_path)
        record = LiteratureRecord.get_record_by_pid_value("1313624")
        # No original_url as source is local file
        assert "original_url" not in record["documents"][0]

def test_replace_file_metadata(inspire_app, s3):
    metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    filename = "file.txt"
    mimetype = "text/*"
    acl = "public-read"
    current_s3_instance.replace_file_metadata(KEY, filename, mimetype, acl)
    result = current_s3_instance.client.head_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY
    )
    assert result["ContentDisposition"] == f'inline; filename="{filename}"'
    assert result["ContentType"] == mimetype
    assert result["Metadata"] == {}

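# Replacing metadata in S3 without re-uploading the body is done with a
# self-copy; a plausible sketch of `replace_file_metadata` using boto3's
# copy_object with MetadataDirective="REPLACE", which is also why the test
# expects `Metadata` to end up empty:
#
#     def replace_file_metadata(self, key, filename, mimetype, acl):
#         bucket = self.get_bucket_for_file_key(key)
#         self.client.copy_object(
#             Bucket=bucket,
#             Key=key,
#             CopySource={"Bucket": bucket, "Key": key},
#             ACL=acl,
#             ContentDisposition=f'inline; filename="{filename}"',
#             ContentType=mimetype,
#             MetadataDirective="REPLACE",  # drops the old user metadata
#         )
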
def test_fulltext_indexer_removes_deleted_from_es(
    inspire_app, override_config, clean_celery_session, s3
):
    metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [
                        {
                            "fulltext": True,
                            "hidden": False,
                            "key": KEY,
                            "filename": "2105.15193.pdf",
                            "url": "https://arxiv.org/pdf/2105.15193.pdf",
                        }
                    ]
                },
            )
        )
        db.session.commit()

        def assert_records_in_es():
            lit_record_from_es = LiteratureSearch.get_record_data_from_es(lit_record)
            assert lit_record_from_es

        retry_until_pass(assert_records_in_es, retry_interval=5)

        lit_record.delete()
        db.session.commit()

        assert_record_not_in_es(lit_record["control_number"])

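# `assert_record_not_in_es` is another shared helper; a minimal sketch,
# assuming it polls Elasticsearch until no hit matches the control number:
def assert_record_not_in_es(control_number):
    def assert_hits_are_empty():
        current_search.flush_and_refresh("*")
        hits = (
            LiteratureSearch()
            .query("match", control_number=control_number)
            .execute()
            .hits
        )
        assert not hits

    retry_until_pass(assert_hits_are_empty, retry_interval=2)
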
def test_file_exists_when_file_is_missing(inspire_app, s3):
    expected_result = False
    create_s3_bucket(KEY)
    result = current_s3_instance.file_exists(KEY)
    assert result == expected_result

def test_index_records_batch_fulltext_manually(
    inspire_app, clean_celery_session, override_config, s3
):
    metadata = {"foo": "bar"}
    key_2 = "9bfe422f251eeaa7ec2a4dd5aebebc8a"
    key_3 = "e5892c4e59898346d307332354c6c7b8"
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    create_s3_bucket(key_2)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(key_2),
        key_2,
        "this is my data",
        metadata,
    )
    create_s3_bucket(key_3)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(key_3),
        key_3,
        "this is my data",
        metadata,
    )

    with override_config(
        FEATURE_FLAG_ENABLE_FULLTEXT=True, FEATURE_FLAG_ENABLE_FILES=False
    ):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [
                        {
                            "fulltext": True,
                            "hidden": False,
                            "key": KEY,
                            "filename": "2105.15193.pdf",
                            "url": "https://arxiv.org/pdf/2105.15193.pdf",
                        }
                    ]
                },
            )
        )
        lit_record_2 = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [
                        {
                            "fulltext": True,
                            "hidden": False,
                            "filename": "new_doc.pdf",
                            "key": key_2,
                            "url": "http://www.africau.edu/images/default/sample.pdf",
                        }
                    ]
                },
            )
        )
        db.session.commit()

        def assert_records_in_es():
            lit_record_from_es = LiteratureSearch.get_record_data_from_es(lit_record)
            lit_record_from_es_2 = LiteratureSearch.get_record_data_from_es(
                lit_record_2
            )
            assert lit_record_from_es and lit_record_from_es_2

        retry_until_pass(assert_records_in_es, retry_interval=5)

        models_committed.disconnect(index_after_commit)
        lit_record["documents"].append(
            {
                "source": "arxiv",
                "fulltext": True,
                "filename": "another_doc.pdf",
                "key": key_3,
                "url": "http://www.africau.edu/images/default/sample.pdf",
            }
        )
        lit_record.update(dict(lit_record))
        db.session.commit()
        # reconnect the signal before triggering the batch indexing task
        models_committed.connect(index_after_commit)

        task = batch_index_literature_fulltext.delay([lit_record.id, lit_record_2.id])
        task.get(timeout=5)

        assert task.result == {
            "uuids": [str(lit_record.id), str(lit_record_2.id)],
            "success_count": 2,
            "failures_count": 0,
            "failures": [],
        }

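# A skeleton consistent with the result dict asserted above (hypothetical
# body; the real `batch_index_literature_fulltext` task lives in the indexer
# module and may differ): it walks the given UUIDs, indexes each record's
# fulltext, and reports successes and failures:
#
#     @shared_task
#     def batch_index_literature_fulltext(uuids):
#         failures = []
#         for uuid in uuids:
#             try:
#                 LiteratureRecord.get_record(uuid).index_fulltext()
#             except Exception as exc:
#                 failures.append({"id": str(uuid), "error": repr(exc)})
#         return {
#             "uuids": [str(uuid) for uuid in uuids],
#             "success_count": len(uuids) - len(failures),
#             "failures_count": len(failures),
#             "failures": failures,
#         }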