Ejemplo n.º 1
0
def objects_manifest_publish(
    ctx,
    file,
    thread_num,
    append_urls,
    manifest_file_delimiter,
    out_manifest_file,
):
    """
    Index the objects listed in a manifest file against the endpoint of the
    auth object obtained from the click context.

    Args:
        ctx: context object; ``ctx.obj["auth_factory"].get()`` provides the auth
        file: manifest path; when falsy, the user is prompted for one
        thread_num: forwarded unchanged to ``index_object_manifest``
        append_urls: inverted and forwarded as ``replace_urls``
        manifest_file_delimiter: forwarded unchanged
        out_manifest_file: forwarded as ``output_filename``
    """
    auth = ctx.obj["auth_factory"].get()
    # The loop object itself is not used in this function; the call is kept
    # for whatever per-thread setup the helper performs.
    get_or_create_event_loop_for_thread()

    # NOTE(review): the prompt mentions "Discovery metadata" although this
    # command publishes an object manifest - confirm the wording is intended.
    file = file or click.prompt("Enter Discovery metadata file path to publish")

    click.echo(
        f"Publishing/writing object data from {file}...\n    to: {auth.endpoint}"
    )

    replace_existing_urls = not append_urls
    index_object_manifest(
        commons_url=auth.endpoint,
        manifest_file=file,
        thread_num=thread_num,
        auth=auth,
        replace_urls=replace_existing_urls,
        manifest_file_delimiter=manifest_file_delimiter,
        output_filename=out_manifest_file,
        submit_additional_metadata_columns=True,
    )
Ejemplo n.º 2
0
def test_index_manifest_additional_metadata(gen3_index, gen3_auth):
    """
    With `submit_additional_metadata_columns` enabled, the data of any
    manifest column that does not belong in indexd should be submitted to
    the metadata service.
    """
    manifest_path = CURRENT_DIR + "/test_data/manifest_additional_metadata.tsv"
    mds_create_target = "gen3.tools.indexing.index_manifest.Gen3Metadata.create"

    with patch(mds_create_target, MagicMock()) as mock_mds_create:
        index_object_manifest(
            manifest_file=manifest_path,
            auth=gen3_auth,
            commons_url=gen3_index.client.url,
            thread_num=1,
            replace_urls=False,
            submit_additional_metadata_columns=True,
        )

        # collect what was sent to the (mocked) metadata service, keyed by GUID
        mds_records = {}
        for _, call_kwargs in mock_mds_create.call_args_list:
            mds_records[call_kwargs["guid"]] = call_kwargs["metadata"]
        assert len(mds_records) == 1

    # collect what ended up in indexd, keyed by GUID
    indexd_records = {}
    for record in gen3_index.get_all_records():
        indexd_records[record["did"]] = record
    assert len(indexd_records) == 1

    # exactly one record exists, so its GUID is the only key
    guid = next(iter(indexd_records))
    indexed = indexd_records[guid]
    assert indexed["file_name"] == "file.txt"
    assert indexed["size"] == 363455714
    assert indexed["hashes"] == {"md5": "473d83400bc1bc9dc635e334faddf33c"}
    assert indexed["authz"] == ["/open"]
    assert indexed["urls"] == ["s3://my-data-bucket/dg.1234/path/file.txt"]

    # the extra column goes to the metadata service under the same GUID
    assert guid in mds_records
    assert mds_records[guid] == {"fancy_column": "fancy_data"}
Ejemplo n.º 3
0
def test_index_manifest_packages_failure(data, gen3_index, gen3_auth, logfile):
    """
    Test that the expected errors are logged when the manifest contains
    invalid package rows, and that nothing is created in indexd or in the
    metadata service.

    Args:
        data: test case description; ``data["manifest"]`` is the manifest file
            name under ``test_data`` and ``data["expected_error_msgs"]`` lists
            the messages that must appear in the log
        gen3_index: index client fixture used to query indexd
        gen3_auth: auth fixture passed to ``index_object_manifest``
        logfile: readable object exposing the log output via ``read()``
    """
    with patch(
        "gen3.tools.indexing.index_manifest.Gen3Metadata.create", MagicMock()
    ) as mock_mds_create:
        index_object_manifest(
            manifest_file=f"{CURRENT_DIR}/test_data/{data['manifest']}",
            auth=gen3_auth,
            commons_url=gen3_index.client.url,
            thread_num=1,
            replace_urls=False,
            submit_additional_metadata_columns=True,
        )
        mds_records = {
            kwargs["guid"]: kwargs["metadata"]
            for (_, kwargs) in mock_mds_create.call_args_list
        }
        # nothing should have been submitted to the metadata service
        assert len(mds_records) == 0

    # nothing should have been created in indexd either
    indexd_records = {r["did"]: r for r in gen3_index.get_all_records()}
    assert len(indexd_records) == 0

    # Read the log a single time: on a regular file object a second `read()`
    # continues from the end of the file and returns "", which would make
    # every expected message after the first one fail to match.
    log_contents = logfile.read()
    for error in data["expected_error_msgs"]:
        assert error in log_contents
Ejemplo n.º 4
0
def test_index_manifest(gen3_index, indexd_server):
    """
    Index `test_data/test.tsv` with `replace_urls=False` and check the
    resulting indexd records, including one that existed before indexing.
    """
    # Seed indexd with a record that is fetched again after indexing; the
    # assertions below expect its original URLs to still be present.
    gen3_index.create_record(
        did="255e396f-f1f8-11e9-9a07-0a80fada099c",
        hashes={"md5": "473d83400bc1bc9dc635e334faddf33c"},
        acl=["DEV", "test"],
        size=363_455_714,
        urls=[
            "s3://testaws/aws/test.txt",
            "gs://test/test.txt",
            "gs://test/test,with,comma.txt",
        ],
    )

    index_object_manifest(
        indexd_server.baseurl,
        CURRENT_DIR + "/test_data/test.tsv",
        1,
        ("admin", "admin"),
        replace_urls=False,
    )

    # fetch every record the assertions look at, in a fixed order
    rec1, rec2, rec3, rec4, rec5, rec6 = [
        gen3_index.get(guid)
        for guid in (
            "255e396f-f1f8-11e9-9a07-0a80fada099c",
            "255e396f-f1f8-11e9-9a07-0a80fada010c",
            "255e396f-f1f8-11e9-9a07-0a80fada098c",
            "255e396f-f1f8-11e9-9a07-0a80fada097c",
            "255e396f-f1f8-11e9-9a07-0a80fada096c",
            "255e396f-f1f8-11e9-9a07-0a80fada012c",
        )
    ]

    # pre-existing record: URL order is not asserted, only the set of URLs
    expected_rec1_urls = {
        "s3://testaws/aws/test.txt",
        "gs://test/test.txt",
        "s3://pdcdatastore/test1.raw",
        # commas *are* allowed in values of arrays
        "gs://test/test,with,comma.txt",
    }
    assert set(rec1["urls"]) == expected_rec1_urls
    assert rec1["authz"] == []

    assert rec2["hashes"]["md5"] == "473d83400bc1bc9dc635e334fadde33c"
    assert rec2["size"] == 363_455_714
    assert rec2["authz"] == ["/program/DEV/project/test"]
    assert rec2["urls"] == ["s3://pdcdatastore/test5.raw"]

    assert rec3["urls"] == ["s3://pdcdatastore/test2.raw"]
    assert rec3["authz"] == ["/program/DEV/project/test"]

    assert rec4["urls"] == ["s3://pdcdatastore/test3.raw"]
    assert rec4["acl"] == ["phs0001", "phs0002"]

    assert rec5["urls"] == ["s3://pdcdatastore/test4.raw"]
    assert rec5["file_name"] == "test4_file.raw"
    # commas *are* allowed in values of arrays
    assert rec5["acl"] == ["phs0001,", "phs0002"]
    assert rec5["authz"] == ["/program/DEV/project/test"]

    # spaces are kept as-is in URLs and authz values
    assert rec6["urls"] == ["s3://pdcdatastore/test6 space.raw"]
    assert rec6["authz"] == ["/prog ram/DEV/project/test"]

    # ensure prev_guid worked to create a new version with same baseid
    assert rec6["baseid"] == rec2["baseid"]
Ejemplo n.º 5
0
def test_index_manifest_with_replace_urls(gen3_index, indexd_server):
    """
    With `replace_urls=True`, the URLs of a record that already exists in
    indexd are expected to be replaced by the one from the manifest.
    """
    existing_guid = "255e396f-f1f8-11e9-9a07-0a80fada099c"

    # seed indexd with a record that already has two URLs
    gen3_index.create_record(
        did=existing_guid,
        hashes={"md5": "473d83400bc1bc9dc635e334faddf33c"},
        acl=["DEV", "test"],
        size=363_455_714,
        urls=["s3://testaws/aws/test.txt", "gs://test/test.txt"],
    )

    index_object_manifest(
        indexd_server.baseurl,
        "./test.tsv",
        1,
        ("admin", "admin"),
        replace_urls=True,
    )

    updated = gen3_index.get(existing_guid)
    assert updated["urls"] == ["s3://pdcdatastore/test1.raw"]
Ejemplo n.º 6
0
def test_index_non_guid_manifest(gen3_index, indexd_server):
    """
    Index `./test2.tsv` and check that the first returned file entry has a
    GUID containing "testprefix" and that its indexd record has the expected URL.
    """
    # the second element of the returned pair is not needed here
    files, _ = index_object_manifest(
        indexd_server.baseurl,
        "./test2.tsv",
        1,
        ("admin", "admin"),
        replace_urls=True,
    )

    first_guid = files[0]["guid"]
    assert "testprefix" in first_guid

    record = gen3_index.get(first_guid)
    assert record["urls"] == ["s3://pdcdatastore/test1.raw"]
Ejemplo n.º 7
0
def test_index_manifest(gen3_index, indexd_server):
    """
    Index `./test.tsv` with `replace_urls=False` and check the resulting
    indexd records, including one that existed before indexing.
    """
    # Seed indexd with a record that is fetched again after indexing; the
    # assertions below expect its original URLs to still be present.
    gen3_index.create_record(
        did="255e396f-f1f8-11e9-9a07-0a80fada099c",
        hashes={"md5": "473d83400bc1bc9dc635e334faddf33c"},
        acl=["DEV", "test"],
        size=363_455_714,
        urls=["s3://testaws/aws/test.txt", "gs://test/test.txt"],
    )

    index_object_manifest(
        indexd_server.baseurl,
        "./test.tsv",
        1,
        ("admin", "admin"),
        replace_urls=False,
    )

    # fetch every record the assertions look at, in a fixed order
    rec1, rec2, rec3, rec4, rec5, rec6 = [
        gen3_index.get(guid)
        for guid in (
            "255e396f-f1f8-11e9-9a07-0a80fada099c",
            "255e396f-f1f8-11e9-9a07-0a80fada010c",
            "255e396f-f1f8-11e9-9a07-0a80fada098c",
            "255e396f-f1f8-11e9-9a07-0a80fada097c",
            "255e396f-f1f8-11e9-9a07-0a80fada096c",
            "255e396f-f1f8-11e9-9a07-0a80fada012c",
        )
    ]

    # pre-existing record: URL order is not asserted, only the set of URLs
    expected_rec1_urls = {
        "s3://testaws/aws/test.txt",
        "gs://test/test.txt",
        "s3://pdcdatastore/test1.raw",
    }
    assert set(rec1["urls"]) == expected_rec1_urls
    assert rec1["authz"] == []

    assert rec2["hashes"]["md5"] == "473d83400bc1bc9dc635e334fadde33c"
    assert rec2["size"] == 363_455_714
    assert rec2["authz"] == ["/program/DEV/project/test"]
    assert rec2["urls"] == ["s3://pdcdatastore/test5.raw"]

    assert rec3["urls"] == ["s3://pdcdatastore/test2.raw"]
    assert rec3["authz"] == ["/program/DEV/project/test"]

    assert rec4["urls"] == ["s3://pdcdatastore/test3.raw"]
    assert rec4["acl"] == ["phs0001", "phs0002"]

    assert rec5["urls"] == ["s3://pdcdatastore/test4.raw"]
    assert rec5["file_name"] == "test4_file.raw"
    assert rec5["acl"] == ["phs0001", "phs0002"]
    assert rec5["authz"] == ["/program/DEV/project/test"]

    # spaces are kept as-is in URLs and authz values
    assert rec6["urls"] == ["s3://pdcdatastore/test6 space.raw"]
    assert rec6["authz"] == ["/prog ram/DEV/project/test"]
Ejemplo n.º 8
0
def test_index_manifest_packages(gen3_index, gen3_auth):
    """
    When `record_type == package`, packages should be created in the metadata
    service and any `package_contents` values should be parsed and submitted.

    The metadata service `create` call is mocked; the records it would have
    received are rebuilt from the mock's recorded keyword arguments.
    """
    with patch(
        "gen3.tools.indexing.index_manifest.Gen3Metadata.create", MagicMock()
    ) as mock_mds_create:
        index_object_manifest(
            manifest_file=CURRENT_DIR + "/test_data/packages_manifest_ok.tsv",
            auth=gen3_auth,
            commons_url=gen3_index.client.url,
            thread_num=1,
            replace_urls=False,
            submit_additional_metadata_columns=True,
        )

        # printed so the recorded calls are visible in the test output
        print("MDS create calls:", mock_mds_create.call_args_list)
        mds_records = {
            kwargs["guid"]: kwargs["metadata"]
            for (_, kwargs) in mock_mds_create.call_args_list
        }
        assert len(mds_records) == 4

    indexd_records = {r["did"]: r for r in gen3_index.get_all_records()}
    assert len(indexd_records) == 5

    # object (not a package) with all fields provided
    guid = "255e396f-f1f8-11e9-9a07-0a80fada0900"
    assert guid in indexd_records
    assert guid not in mds_records

    # package with all fields provided
    # S3 URL
    guid = "255e396f-f1f8-11e9-9a07-0a80fada0901"
    assert guid in indexd_records
    assert indexd_records[guid]["file_name"] == "package.zip"
    assert indexd_records[guid]["size"] == 363455714
    assert indexd_records[guid]["hashes"] == {"md5": "473d83400bc1bc9dc635e334faddf33c"}
    assert indexd_records[guid]["authz"] == ["/open/packages"]
    assert indexd_records[guid]["urls"] == [
        "s3://my-data-bucket/dg.1234/path/package.zip"
    ]

    assert guid in mds_records
    assert mds_records[guid]["type"] == "package"
    assert mds_records[guid]["package"]["version"] == "0.1"
    assert mds_records[guid]["package"]["file_name"] == "package.zip"
    assert mds_records[guid]["package"]["size"] == 363455714
    assert mds_records[guid]["package"]["hashes"] == {
        "md5": "473d83400bc1bc9dc635e334faddf33c"
    }
    assert mds_records[guid]["package"]["contents"] == [
        {
            "hashes": {"md5sum": "2cd6ee2c70b0bde53fbe6cac3c8b8bb1"},
            "file_name": "yes.txt",
            "size": 35,
        },
        {
            "hashes": {"md5sum": "30cf3d7d133b08543cb6c8933c29dfd7"},
            "file_name": "hi.txt",
            "size": 35,
        },
    ]
    assert mds_records[guid]["_buckets"] == ["s3://my-data-bucket"]
    assert mds_records[guid]["_filename"] == "package.zip"
    assert mds_records[guid]["_file_extension"] == ".zip"
    assert mds_records[guid]["_upload_status"] == "uploaded"
    assert mds_records[guid]["_resource_paths"] == ["/open/packages"]

    # package with no "package_contents" provided
    # GS URL
    guid = "255e396f-f1f8-11e9-9a07-0a80fada0902"
    assert guid in indexd_records
    assert indexd_records[guid]["urls"] == [
        "gs://my-google-data-bucket/dg.1234/path/package.zip"
    ]
    assert guid in mds_records
    assert mds_records[guid]["type"] == "package"
    # identity check: the value must be the `None` singleton itself
    assert mds_records[guid]["package"]["contents"] is None
    assert mds_records[guid]["_buckets"] == ["gs://my-google-data-bucket"]

    # package with no "file_name" provided
    # and 2 URLs with different file names.
    # the file name from the first URL is used as the package file name -
    # depending on the order of the URLs in the indexd record, it could be
    # either one
    guid = "255e396f-f1f8-11e9-9a07-0a80fada0903"
    assert guid in indexd_records
    assert indexd_records[guid]["file_name"] == ""
    assert sorted(indexd_records[guid]["urls"]) == sorted(
        [
            "s3://my-data-bucket/dg.1234/path/package.zip",
            "gs://my-google-data-bucket/dg.1234/path/other_file_name.zip",
        ]
    )
    assert guid in mds_records
    assert sorted(mds_records[guid]["_buckets"]) == sorted(
        ["s3://my-data-bucket", "gs://my-google-data-bucket"]
    )
    assert mds_records[guid]["package"]["file_name"] in [
        "package.zip",
        "other_file_name.zip",
    ]
    assert mds_records[guid]["_filename"] in ["package.zip", "other_file_name.zip"]

    # package with no "guid" provided: it is the only indexd record whose GUID
    # does not share the prefix of the GUIDs checked above
    new_guids = [
        guid
        for guid in indexd_records
        if not guid.startswith("255e396f-f1f8-11e9-9a07-0a80fada09")
    ]
    assert len(new_guids) == 1
    guid = new_guids[0]
    assert guid in mds_records