import logging
import os
import urllib.parse
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from unittest.mock import patch

import pytest
import semver
import yaml

from data_pipeline_api.file_api import FileAPI

# The registry helpers referenced below (Downloader, RunMetadata, MetadataKey,
# DataRegistryTarget, DataRegistryField, YamlDict, ReadConfigs, get_data,
# get_reference, sort_by_semver, get_remote_options,
# get_remote_filesystem_and_path, upload_from_config, etc.) are assumed to
# come from the project's data_pipeline_api.registry modules.

logger = logging.getLogger(__name__)


@pytest.fixture
def configuration_file(tmp_path: Path) -> Path:
    with open(tmp_path / "version1.txt", "w") as file:
        file.write("contents1")

    with open(tmp_path / "version2.txt", "w") as file:
        file.write("contents2")

    hash1 = FileAPI.calculate_hash(tmp_path / "version1.txt")
    hash2 = FileAPI.calculate_hash(tmp_path / "version2.txt")
    metadata_file = tmp_path / "metadata.yaml"
    with open(metadata_file, "w") as file:
        file.write("""
-
  data_product: test
  version: 1.0.0
  filename: version1.txt
  verified_hash: {hash1}
-
  data_product: test
  version: 2.0.0
  filename: version2.txt
  verified_hash: {hash2}
""".format(hash1=hash1, hash2=hash2))

    configuration_file = tmp_path / "config.yaml"
    with open(configuration_file, "w") as file:
        file.write("""
data_directory: .
run_id: test_run
access_log: access.yaml
fail_on_hash_mismatch: True
""")
    return configuration_file
def test_multiple_writes_to_same_file_record_same_hash(configuration_file):
    file_api = FileAPI(configuration_file)
    with file_api.open_for_write(data_product="test", extension="txt") as file:
        file.write("foo".encode())
    with file_api.open_for_write(data_product="test", extension="txt") as file:
        file.write("bar".encode())
    access_log = file_api._generate_access_log()
    assert (access_log["io"][0]["access_metadata"]["calculated_hash"] ==
            access_log["io"][1]["access_metadata"]["calculated_hash"])
def test_access_file_contains_run_metadata(tmp_path):
    configuration_file = tmp_path / "config.yaml"
    with open(configuration_file, "w") as file:
        file.write("access_log: access.yaml")
    with FileAPI(configuration_file) as api:
        api.set_run_metadata("description", "test run")
    with open(tmp_path / "access.yaml") as access_file:
        access_log = yaml.safe_load(access_file)
    assert access_log["run_metadata"]["description"] == "test run"
def test_read_hash_mismatch(configuration_file: Path):
    file_api = FileAPI(configuration_file)

    with pytest.raises(ValueError):
        with patch("data_pipeline_api.file_api.FileAPI.calculate_hash"
                   ) as mock_hash:
            mock_hash.return_value = "some_random_hash"
            with file_api.open_for_read(data_product="test",
                                        version="1.0.0") as file:
                pass
def test_access_log_written_if_set_to_path(tmp_path):
    configuration_file = tmp_path / "config.yaml"
    with open(configuration_file, "w") as file:
        file.write("""
data_directory: .
access_log: access.yaml
fail_on_hash_mismatch: False
""")
    with FileAPI(configuration_file):
        pass
    assert (tmp_path / "access.yaml").exists()
def _verify_hash(filename: Path, access_calculated_hash: str) -> None:
    """
    Verifies that the hash of the file matches the hash recorded in the access log.

    :param filename: file to verify the hash of
    :param access_calculated_hash: hash read from the access log for this filename
    """
    calculated_hash = FileAPI.calculate_hash(filename)
    if access_calculated_hash != calculated_hash:
        raise ValueError(
            f"access log contains hash {access_calculated_hash} but calculated hash of {filename} is {calculated_hash}"
        )
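
# A minimal sketch of how _verify_hash could be used to audit a whole access
# log after a run. The "io"/"access_metadata"/"calculated_hash" keys mirror
# the structure asserted in the tests above; the "filename" key inside
# access_metadata is an assumption for illustration.
def _audit_access_log(access_log: Dict[str, Any], data_directory: Path) -> None:
    for entry in access_log.get("io", []):
        metadata = entry["access_metadata"]
        _verify_hash(data_directory / metadata["filename"],
                     metadata["calculated_hash"])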
def test_access_log_written_if_set_to_absolute_path(tmp_path):
    configuration_file = tmp_path / "config.yaml"
    access_file_path = tmp_path / "access.yaml"
    with open(configuration_file, "w") as file:
        file.write(f"""
data_directory: .
access_log: {str(access_file_path.resolve())}
fail_on_hash_mismatch: False
""")
    with FileAPI(configuration_file):
        pass
    assert access_file_path.exists()
def download_from_configs(
    run_metadata: Dict[str, Any],
    read_configs: ReadConfigs,
    token: str,
    root_dir: Optional[Union[Path, str]] = None,
) -> None:
    """
    Iterates through the config read blocks and downloads the relevant data for each block

    :param run_metadata: dictionary of run metadata
    :param read_configs: list of read blocks
    :param token: personal access token
    :param root_dir: root directory to instantiate the data in, defaults to current working directory
    """
    unnormalised_data_directory = Path(
        run_metadata[RunMetadata.data_directory])
    root_dir = Path(root_dir) if root_dir is not None else Path.cwd()
    data_directory = FileAPI.normalise_path(root_dir,
                                            unnormalised_data_directory)
    downloader = Downloader(
        data_directory=data_directory,
        data_registry_url=run_metadata.get(RunMetadata.data_registry_url),
        token=token,
    )

    for read_config in read_configs:
        if "doi_or_unique_name" in read_config.get("where", {}):
            config = read_config["where"].copy()
            config.update(read_config.get("use", {}))
            downloader.add_external_object(
                config[MetadataKey.doi_or_unique_name],
                config.get(MetadataKey.title),
                config.get(MetadataKey.component),
                config.get(MetadataKey.version),
            )
        else:
            parsed_config = _parse_read_config(
                read_config,
                run_metadata.get(RunMetadata.default_input_namespace))
            downloader.add_data_product(
                parsed_config.namespace,
                parsed_config.data_product,
                parsed_config.component,
                parsed_config.version,
            )

    downloader.download()
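
# A hypothetical driver for download_from_configs. The run_metadata keys match
# the RunMetadata attributes the function reads above; the read-block contents
# (data product name, DOI, registry URL, namespace) are illustrative only.
def _example_download(token: str) -> None:
    run_metadata = {
        RunMetadata.data_directory: "data",
        RunMetadata.data_registry_url: "https://data.scrc.uk/api/",
        RunMetadata.default_input_namespace: "SCRC",
    }
    read_configs = [
        # resolved to a data product via _parse_read_config
        {"where": {"data_product": "human/population"}},
        # downloaded as an external object via add_external_object
        {"where": {"doi_or_unique_name": "10.1000/example"},
         "use": {"version": "1.0.0"}},
    ]
    download_from_configs(run_metadata, read_configs, token)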
def upload_to_storage(
    remote_uri: str,
    storage_options: Dict[str, Any],
    data_directory: Path,
    filename: Path,
    upload_path: Optional[Union[str, Path]] = None,
    path_prefix: Optional[str] = None,
) -> str:
    """
    Uploads a file to the remote uri

    :param remote_uri: URI to the root of the storage
    :param storage_options: (key, value) pairs that are passed to the remote storage, e.g. credentials
    :param data_directory: root of the data directory read from the access log
    :param filename: file to upload
    :param upload_path: optional override to the upload path of the file
    :param path_prefix: Optional prefix onto the remote path, e.g. namespace
    :return: path of the file on the remote storage
    """
    split_result = urllib.parse.urlsplit(remote_uri)
    protocol = split_result.scheme
    path_prefix = Path(path_prefix) if path_prefix else Path()
    upload_path = (path_prefix /
                   (upload_path or filename.absolute().relative_to(
                       data_directory.absolute()))).as_posix()
    fs, path = get_remote_filesystem_and_path(protocol, remote_uri,
                                              upload_path, **storage_options)
    if protocol in {"file", "ssh", "sftp"}:
        fs.makedirs(Path(path).parent.as_posix(), exist_ok=True)
    sha1 = FileAPI.calculate_hash(filename)
    path_root, path_ext = os.path.splitext(path)
    path = f"{path_root}_{sha1}{path_ext}"
    logger.info(f"Uploading {filename.as_posix()} to {path} on {remote_uri}")
    fs.put(filename.as_posix(), path)
    if path.startswith(remote_uri):
        # some remote filesystems include the root uri in the path; the
        # registry expects a path relative to the storage root, so strip it
        return path[len(remote_uri):]
    elif path.startswith(split_result.path):
        # some remote uris include part of what the fs considers the path, so strip that off too
        return path[len(split_result.path):]
    return path
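
# Hypothetical call to upload_to_storage against a local "file" remote; the
# URI and paths are illustrative only. The returned registry path has the
# file's sha1 embedded in its name, as constructed above.
def _example_upload() -> str:
    return upload_to_storage(
        remote_uri="file:///tmp/storage/",
        storage_options={},  # e.g. credentials for an s3/ssh remote
        data_directory=Path("data"),
        filename=Path("data/human/population.csv"),
        path_prefix="SCRC",  # e.g. the namespace
    )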
def _upload_file_to_storage(
    posts: List[YamlDict],
    filename: Union[str, Path],
    remote_uri: str,
    storage_options: Dict[str, str],
    storage_root: Union[str, YamlDict],
    namespace: Optional[str] = None,
) -> YamlDict:
    """
    For a given filename, uploads it to the remote URI and returns a reference to the object that will be posted

    :param posts: List of posts to the data registry, will be modified
    :param filename: path to the file to upload
    :param remote_uri: URI to the root of the storage for uploading
    :param storage_options: (key, value) pairs that are passed to the remote storage, e.g. credentials
    :param storage_root: existing reference to the storage_root that this was uploaded to
    :param namespace: namespace of the file being uploaded, if provided will be prefixed onto the upload path
    :return: object reference to the uploaded file
    """
    filename = Path(filename)
    path = upload_to_storage(remote_uri, storage_options, filename.parent, filename, path_prefix=namespace)
    file_hash = FileAPI.calculate_hash(filename)
    location = _create_target_data_dict(
        DataRegistryTarget.storage_location,
        {
            DataRegistryField.path: path,
            DataRegistryField.hash: file_hash,
            DataRegistryField.storage_root: storage_root,
        },
    )
    posts.append(location)
    obj = _create_target_data_dict(
        DataRegistryTarget.object, {DataRegistryField.storage_location: location}
    )
    posts.append(obj)
    return obj
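
# Sketch of how _upload_file_to_storage accumulates registry posts: it appends
# a storage_location post and an object post, then returns the object
# reference so callers can link further targets to it. Names here are
# illustrative.
def _example_post_file(remote_uri: str, storage_root: YamlDict) -> YamlDict:
    posts: List[YamlDict] = []
    obj = _upload_file_to_storage(
        posts, "outputs/results.csv", remote_uri, {}, storage_root, namespace="SCRC"
    )
    assert posts[-1] is obj  # the object post is appended after its storage_location
    return obj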
def test_set_get_run_metadata():
    with FileAPI() as api:
        api.set_run_metadata("key", "value")
        assert api.get_run_metadata("key") == "value"
def test_read_latest_version(configuration_file: Path):
    file_api = FileAPI(configuration_file)
    with file_api.open_for_read(data_product="test") as file:
        assert file.read().decode() == "contents2"
def test_cannot_set_reserved_run_metadata_keys(key):
    with pytest.raises(ValueError):
        FileAPI().set_run_metadata(key, "value")
def upload_data_product_cli(
    data_product_path,
    namespace,
    storage_root_name,
    storage_location_path,
    accessibility,
    data_product_name,
    data_product_description,
    data_product_version,
    component,
    data_registry,
    token,
    remote_uri,
    remote_option,
    remote_uri_override,
):
    configure_cli_logging()

    template_file = Path(__file__).parent / Path("templates/data_product.yaml")
    with open(template_file, "r") as f:
        template = f.read()

    data_registry = data_registry or DEFAULT_DATA_REGISTRY_URL
    remote_uri_override = remote_uri_override or remote_uri
    remote_uri = remote_uri.strip()
    remote_uri_override = remote_uri_override.strip()
    storage_root_name = storage_root_name or urllib.parse.urlparse(
        remote_uri_override).netloc
    storage_root = remote_uri_override
    remote_options = get_remote_options()
    arg_remote_options = dict(remote_option) if remote_option else {}
    remote_options.update(arg_remote_options)
    data_product_path = Path(data_product_path)

    storage_location_hash = FileAPI.calculate_hash(data_product_path)

    path = upload_to_storage(remote_uri,
                             remote_options,
                             data_product_path.parent,
                             data_product_path,
                             upload_path=storage_location_path,
                             path_prefix=namespace)
    namespace_ref = get_reference({DataRegistryField.name: namespace},
                                  DataRegistryTarget.namespace, data_registry,
                                  token)
    if namespace_ref:
        query = {
            DataRegistryField.name: data_product_name,
            DataRegistryField.namespace: namespace_ref
        }
        if data_product_version:
            query[DataRegistryField.version] = data_product_version
        data_products = get_data(query, DataRegistryTarget.data_product,
                                 data_registry, token, False)
        if data_products:
            latest = next(iter(sort_by_semver(data_products)))
            data_product_version = str(
                semver.VersionInfo.parse(
                    latest[DataRegistryField.version]).bump_minor())
        elif not data_product_version:
            data_product_version = "0.1.0"

    populated_yaml = template.format(
        namespace=namespace,
        storage_root_name=storage_root_name,
        storage_root=storage_root,
        accessibility=accessibility,
        storage_location_path=path,
        storage_location_hash=storage_location_hash,
        data_product_name=data_product_name,
        data_product_description=data_product_description,
        data_product_version=data_product_version,
        component_name="COMPONENT_NAME",
        component_description="COMPONENT_DESCRIPTION",
    )
    config = yaml.safe_load(populated_yaml)
    component_template = config["post"].pop(-1)
    if component:
        for component_name, component_description in component:
            c = component_template["data"].copy()
            c["name"] = component_name
            c["description"] = component_description
            config["post"].append({
                "data": c,
                "target": DataRegistryTarget.object_component
            })
    else:
        c = component_template["data"].copy()
        c["name"] = data_product_name
        c["description"] = data_product_description
        config["post"].append({
            "data": c,
            "target": DataRegistryTarget.object_component
        })
    upload_from_config(config, data_registry, token)
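
# A hypothetical invocation of the CLI handler above (the click decorators
# are assumed stripped); every argument value here is illustrative.
def _example_cli_call(token: str) -> None:
    upload_data_product_cli(
        data_product_path="outputs/results.csv",
        namespace="SCRC",
        storage_root_name=None,       # derived from the remote uri if omitted
        storage_location_path=None,   # derived from the filename if omitted
        accessibility=0,              # illustrative accessibility flag
        data_product_name="human/results",
        data_product_description="example results",
        data_product_version=None,    # auto-bumped from the latest registry version
        component=[("deaths", "daily deaths")],  # (name, description) pairs
        data_registry=None,           # falls back to DEFAULT_DATA_REGISTRY_URL
        token=token,
        remote_uri="file:///tmp/storage/",
        remote_option=None,
        remote_uri_override=None,
    )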
def test_generate_access_log(configuration_file):
    file_api = FileAPI(configuration_file)
    file_api.open_for_read(data_product="test", version="1.0.0").close()
    file_api.open_for_write(data_product="test2", extension="txt").close()
    assert len(file_api._generate_access_log()["io"]) == 2
def test_read_specific_version(configuration_file: Path):
    file_api = FileAPI(configuration_file)
    with file_api.open_for_read(data_product="test", version="1.0.0") as file:
        assert file.read().decode() == "contents1"
def test_write(tmp_path: Path, configuration_file: Path):
    with FileAPI(configuration_file) as api:
        with api.open_for_write(data_product="test", extension="txt") as file:
            file.write("contents3".encode())
        with open(tmp_path / "test" / "test_run.txt") as file:
            assert file.read() == "contents3"