Example 1
def test_upload_file(inspire_app, s3):
    """Uploading a fixture file stores it with the expected disposition, type and content.

    Uploads ``fixtures/file.txt`` via ``current_s3_instance.upload_file`` and then
    verifies, through a ``head_object`` call and a direct object read, that the
    Content-Disposition, Content-Type and body all match what was uploaded.
    """
    create_s3_bucket(KEY)
    filename = "file.txt"
    mimetype = "text/*"
    acl = "public-read"
    expected_content = "This is a demo file\n"
    record_fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join("fixtures", "file.txt")
    )
    with open(record_fixture_path, "rb") as data:
        current_s3_instance.upload_file(data, KEY, filename, mimetype, acl)
    result = current_s3_instance.client.head_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY
    )
    # BUG FIX: the assertion previously compared against the literal
    # '(unknown)' (an f-string with no placeholder), so it never checked the
    # actual filename sent in the Content-Disposition header.
    assert result["ContentDisposition"] == f'inline; filename="{filename}"'
    assert result["ContentType"] == mimetype
    content = (
        current_s3_instance.resource.Object(
            current_s3_instance.get_bucket_for_file_key(KEY), KEY
        )
        .get()["Body"]
        .read()
        .decode("utf-8")
    )
    assert content == expected_content
Example 2
def test_file_exists_when_file_is_there(inspire_app, s3):
    """``file_exists`` reports True once an object has been stored under the key."""
    expected = True
    create_s3_bucket(KEY)
    bucket = current_s3_instance.get_bucket_for_file_key(KEY)
    create_s3_file(bucket, KEY, "this is my data")
    assert current_s3_instance.file_exists(KEY) == expected
Example 3
def test_migrate_record_from_mirror_invalidates_local_file_cache_if_no_local_file(
    inspire_app, s3, redis, datadir
):
    """A stale cache entry pointing at a missing local file does not break migration.

    Seeds the ``afs_file_locations`` redis hash with an invalid local path, then
    migrates the record and checks the document keeps its AFS ``original_url``.
    """
    expected_key = "f43f40833edfd8227c4deb9ad05b321e"
    create_s3_bucket(expected_key)
    afs_config = {"LABS_AFS_HTTP_SERVICE": "http://inspire-afs-web.cern.ch/"}
    with patch.dict(current_app.config, afs_config):
        redis.delete("afs_file_locations")
        # populate cache with invalid file path
        redis.hset(
            "afs_file_locations",
            "http://inspire-afs-web.cern.ch/var/data/files/g97/1940001/content.pdf%3B2",
            "/api/files/ddb1a354-1d2a-40b6-9cc4-2e823b6bef81/0000000000000000000000000000000000000000",
        )
        raw_record_path = (datadir / "1313624.xml").as_posix()

        migrate_from_file(raw_record_path)
        record = LiteratureRecord.get_record_by_pid_value("1313624")

        assert redis.hlen("afs_file_locations") > 0
        original_url = record["documents"][0]["original_url"]
        assert (
            original_url
            == "http://inspire-afs-web.cern.ch/var/data/files/g97/1940001/content.pdf%3B2"
        )
Example 4
def test_delete_file(inspire_app, s3):
    """After ``delete_file`` the object is gone and ``head_object`` raises ClientError."""
    create_s3_bucket(KEY)
    bucket = current_s3_instance.get_bucket_for_file_key(KEY)
    create_s3_file(bucket, KEY, "this is my data")
    current_s3_instance.delete_file(KEY)
    with pytest.raises(ClientError):
        current_s3_instance.client.head_object(Bucket=bucket, Key=KEY)
Example 5
def test_indexer_creates_proper_fulltext_links_in_ui_display_files_enabled(
    inspire_app, s3
):
    """Every fulltext link in the indexed ``_ui_display`` comes from the expected set."""
    create_s3_bucket("1")
    create_s3_bucket("f")
    expected_fulltext_links = ["arXiv", "KEK scanned document", "fulltext"]

    data = {
        "external_system_identifiers": [
            {"schema": "OSTI", "value": "7224300"},
            {"schema": "ADS", "value": "1994PhRvD..50.4491S"},
            {"schema": "KEKSCAN", "value": "94-07-219"},
            {"schema": "SPIRES", "value": "SPIRES-2926342"},
        ],
        "arxiv_eprints": [{"categories": ["hep-ph"], "value": "hep-ph/9404247"}],
        "documents": [
            {
                "source": "arxiv",
                "fulltext": True,
                "hidden": True,
                "key": "arXiv:nucl-th_9310030.pdf",
                "url": "https://arxiv.org/pdf/1910.11662.pdf",
            },
            {
                "source": "arxiv",
                "key": "arXiv:nucl-th_9310031.pdf",
                "url": "http://inspirehep.net/record/863300/files/fermilab-pub-10-255-e.pdf",
            },
        ],
    }
    create_record("lit", data=data)
    response = es_search("records-hep")

    hit_source = response["hits"]["hits"][0]["_source"]
    ui_display = json.loads(hit_source.pop("_ui_display"))
    for link in ui_display["fulltext_links"]:
        assert link["value"]
        assert link["description"] in expected_fulltext_links
Example 6
def test_get_file_metadata(inspire_app, s3):
    """Custom metadata attached at upload time is returned by ``get_file_metadata``."""
    expected_metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    bucket = current_s3_instance.get_bucket_for_file_key(KEY)
    create_s3_file(bucket, KEY, "this is my data", expected_metadata)
    result = current_s3_instance.get_file_metadata(KEY)
    assert result["Metadata"] == expected_metadata
Example 7
def test_index_record_fulltext_manually(
    inspire_app, clean_celery_session, override_config, s3, datadir
):
    """``index_fulltext`` pushes the record into ES with an extracted attachment.

    Creates a record while the commit-time indexing signal is disconnected (so it
    is absent from ES), calls ``index_fulltext`` directly, and waits until the
    indexed document carries an ``attachment`` and has had ``text`` stripped.
    """
    metadata = {"foo": "bar"}
    pdf_path = os.path.join(datadir, "2206.04407.pdf")
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        pdf_path,
        metadata,
        **{"ContentType": "application/pdf"},
    )

    with override_config(
        FEATURE_FLAG_ENABLE_FULLTEXT=True, FEATURE_FLAG_ENABLE_FILES=False
    ):
        data = faker.record("lit")
        data["documents"] = [
            {
                "source": "arxiv",
                "fulltext": True,
                "filename": "new_doc.pdf",
                "key": KEY,
                "url": "http://www.africau.edu/images/default/sample.pdf",
            }
        ]
        rec = LiteratureRecord.create(data)
        # Commit without triggering the automatic commit-time indexer.
        models_committed.disconnect(index_after_commit)
        db.session.commit()
        models_committed.connect(index_after_commit)

        assert_record_not_in_es(rec["control_number"])

        rec.index_fulltext()

        def assert_fulltext_indexed():
            current_search.flush_and_refresh("*")
            es_hit = LiteratureSearch().get_record(str(rec.id)).execute().hits.hits[0]
            document = es_hit._source["documents"][0]
            assert "attachment" in document
            assert "text" not in document  # pipeline should remove it

        retry_until_pass(assert_fulltext_indexed, timeout=90, retry_interval=5)
def test_migrate_record_from_mirror_uses_local_cache_for_afs_files(
    inspire_app, s3, redis, datadir
):
    """Re-migrating the same record resolves its files from the local AFS cache."""
    expected_key = "f43f40833edfd8227c4deb9ad05b321e"
    create_s3_bucket(expected_key)
    afs_config = {"LABS_AFS_HTTP_SERVICE": "http://inspire-afs-web.cern.ch/"}
    with patch.dict(current_app.config, afs_config):
        redis.delete("afs_file_locations")
        raw_record_path = (datadir / "1313624.xml").as_posix()

        # First migration populates the AFS file-location cache.
        migrate_from_file(raw_record_path)
        assert redis.hlen("afs_file_locations") > 0

        # Second migration should serve the file from the local cache.
        migrate_from_file(raw_record_path)
        record = LiteratureRecord.get_record_by_pid_value("1313624")
        # No original_url as source is local file
        assert "original_url" not in record["documents"][0]
Example 9
def test_replace_file_metadata(inspire_app, s3):
    """``replace_file_metadata`` rewrites disposition/type and clears custom metadata.

    Stores an object with custom metadata, replaces its metadata with a new
    filename/mimetype/acl, and verifies via ``head_object`` that the headers were
    updated and the custom metadata dict is now empty.
    """
    metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    filename = "file.txt"
    mimetype = "text/*"
    acl = "public-read"
    current_s3_instance.replace_file_metadata(KEY, filename, mimetype, acl)
    result = current_s3_instance.client.head_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY
    )
    # BUG FIX: the assertion previously compared against the literal
    # '(unknown)' (an f-string with no placeholder), so it never checked that
    # the new filename was applied to the Content-Disposition header.
    assert result["ContentDisposition"] == f'inline; filename="{filename}"'
    assert result["ContentType"] == mimetype
    assert result["Metadata"] == {}
Example 10
def test_fulltext_indexer_removes_deleted_from_es(
    inspire_app, override_config, clean_celery_session, s3
):
    """Deleting a literature record removes it from the fulltext-enabled ES index."""
    metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True):
        document = {
            "fulltext": True,
            "hidden": False,
            "key": KEY,
            "filename": "2105.15193.pdf",
            "url": "https://arxiv.org/pdf/2105.15193.pdf",
        }
        lit_record = LiteratureRecord.create(
            faker.record("lit", data={"documents": [document]})
        )
        db.session.commit()

        def assert_records_in_es():
            assert LiteratureSearch.get_record_data_from_es(lit_record)

        # Wait for the record to appear in ES before deleting it.
        retry_until_pass(assert_records_in_es, retry_interval=5)

        lit_record.delete()
        db.session.commit()

        assert_record_not_in_es(lit_record["control_number"])
Example 11
def test_file_exists_when_file_is_missing(inspire_app, s3):
    """``file_exists`` reports False when nothing was uploaded under the key."""
    expected = False
    create_s3_bucket(KEY)
    assert current_s3_instance.file_exists(KEY) == expected
Example 12
def test_index_records_batch_fulltext_manually(
    inspire_app, clean_celery_session, override_config, s3
):
    """``batch_index_literature_fulltext`` indexes all given records and reports success.

    Stores three S3 objects, creates two literature records (one gets a second
    document appended while the commit-time indexer is disconnected), then runs
    the batch task over both records and checks its summary result.
    """
    metadata = {"foo": "bar"}
    key_2 = "9bfe422f251eeaa7ec2a4dd5aebebc8a"
    key_3 = "e5892c4e59898346d307332354c6c7b8"
    # Identical bucket/object setup for each of the three keys.
    for key in (KEY, key_2, key_3):
        create_s3_bucket(key)
        create_s3_file(
            current_s3_instance.get_bucket_for_file_key(key),
            key,
            "this is my data",
            metadata,
        )

    with override_config(
        FEATURE_FLAG_ENABLE_FULLTEXT=True, FEATURE_FLAG_ENABLE_FILES=False
    ):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [
                        {
                            "fulltext": True,
                            "hidden": False,
                            "key": KEY,
                            "filename": "2105.15193.pdf",
                            "url": "https://arxiv.org/pdf/2105.15193.pdf",
                        }
                    ]
                },
            )
        )
        lit_record_2 = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [
                        {
                            "fulltext": True,
                            "hidden": False,
                            "filename": "new_doc.pdf",
                            "key": key_2,
                            "url": "http://www.africau.edu/images/default/sample.pdf",
                        }
                    ]
                },
            )
        )
        db.session.commit()

        def assert_records_in_es():
            assert LiteratureSearch.get_record_data_from_es(lit_record)
            assert LiteratureSearch.get_record_data_from_es(lit_record_2)

        retry_until_pass(assert_records_in_es, retry_interval=5)

        # Append a third document without re-triggering the commit-time indexer.
        models_committed.disconnect(index_after_commit)
        lit_record["documents"].append(
            {
                "source": "arxiv",
                "fulltext": True,
                "filename": "another_doc.pdf",
                "key": key_3,
                "url": "http://www.africau.edu/images/default/sample.pdf",
            }
        )
        lit_record.update(dict(lit_record))
        db.session.commit()
        # reconnect signal before we call process_references_in_records
        models_committed.connect(index_after_commit)

        task = batch_index_literature_fulltext.delay([lit_record.id, lit_record_2.id])
        task.get(timeout=5)

        assert task.result == {
            "uuids": [str(lit_record.id), str(lit_record_2.id)],
            "success_count": 2,
            "failures_count": 0,
            "failures": [],
        }