Esempio n. 1
0
def test_gcs_storage_submit() -> None:
    """Submit a small range dataset and verify the GCS cache blob is (re)created."""
    range_size = 10
    dataset_id = "range-dataset"
    dataset_version = "0"
    dataset = tf.data.Dataset.range(range_size)
    configurations = create_gcs_configuration(access_server_port=15032)

    client = google_storage.Client()
    bucket = client.bucket(configurations.bucket)
    gcs_cache_filepath = get_gcs_filepath(
        configurations=configurations, dataset_id=dataset_id, dataset_version=dataset_version,
    )
    blob = bucket.blob(str(gcs_cache_filepath))

    # Record the creation time of any pre-existing cache blob so we can
    # verify below that submit() replaced it.
    previous_creation_time = None
    if blob.exists():
        blob.reload()
        previous_creation_time = blob.time_created

    gcs_storage = storage.GCSStorage(configurations=configurations)
    gcs_storage.submit(
        data=dataset, dataset_id=dataset_id, dataset_version=dataset_version,
    )

    blob = bucket.blob(str(gcs_cache_filepath))
    # Check existence BEFORE reload(): reload() raises NotFound on a missing
    # blob, which would mask the clean `assert blob.exists()` failure.
    assert blob.exists()
    blob.reload()
    assert blob.time_created is not None
    assert previous_creation_time != blob.time_created

    if previous_creation_time is not None:
        assert previous_creation_time < blob.time_created
Esempio n. 2
0
def worker_using_cacheable(config: storage.GCSConfigurations, dataset_id: str,
                           dataset_version: str) -> None:
    """Exercise the `cacheable` decorator end-to-end against the MNIST test set.

    Caches the dataset through GCS storage, streams it back, and checks that
    every sample round-trips unchanged.
    """
    gcs_storage = storage.GCSStorage(configurations=config)

    @gcs_storage.cacheable(dataset_id=dataset_id,
                           dataset_version=dataset_version)
    def make_dataset() -> dataref.LMDBDataRef:
        return util.make_mnist_test_dataset()  # type: ignore

    cached_stream = make_dataset().stream()
    rebuilt_dataset = tensorflow.make_tf_dataset(cached_stream)
    reference_dataset = util.make_mnist_test_dataset()

    matching_samples = util.compare_datasets(reference_dataset, rebuilt_dataset)
    # The MNIST test split has exactly 10000 examples.
    assert matching_samples == 10000
    assert cached_stream.length == matching_samples
Esempio n. 3
0
def worker(configurations: storage.GCSConfigurations, dataset_id: str,
           dataset_version: str) -> None:
    """Build a cached range dataref and verify its stream yields 0..range_size-1."""
    range_size = 120
    gcs_storage = storage.GCSStorage(configurations=configurations)

    @gcs_storage.cacheable(dataset_id, dataset_version)
    def make_dataref(input_range_size: int) -> dataref.LMDBDataRef:
        return tf.data.Dataset.range(input_range_size)  # type: ignore

    stream = make_dataref(input_range_size=range_size).stream()
    assert stream.length == range_size

    # A range dataset yields its own indices, so position and value must agree.
    seen = 0
    for position, value in enumerate(stream.iterator_fn()):
        assert position == value
        seen += 1
    assert seen == range_size
Esempio n. 4
0
def test_gcs_storage_local_metadata() -> None:
    """Verify that both submit() and fetch() write a local metadata file whose
    `time_created` matches the GCS blob's creation timestamp."""
    range_size = 10
    dataset_id = "range-dataset"
    dataset_version = "0"
    dataset = tf.data.Dataset.range(range_size)
    configurations = create_gcs_configuration(access_server_port=15032)

    client = google_storage.Client()
    bucket = client.bucket(configurations.bucket)
    gcs_cache_filepath = get_gcs_filepath(
        configurations=configurations,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )

    gcs_storage = storage.GCSStorage(configurations=configurations)
    gcs_storage.submit(
        data=dataset,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )

    local_metadata_filepath = get_local_metadata_filepath(
        configurations=configurations,
        dataset_id=dataset_id,
        dataset_version=dataset_version)

    def read_metadata() -> dict:
        # Helper: load the local metadata JSON written alongside the cache.
        with open(str(local_metadata_filepath), "r") as metadata_file:
            return json.load(metadata_file)

    metadata = read_metadata()

    blob = bucket.blob(str(gcs_cache_filepath))
    blob.reload()

    assert metadata.get("time_created")
    assert blob.time_created.timestamp() == metadata["time_created"]

    # Delete the local copy and confirm fetch() regenerates it with the
    # same timestamp.
    local_metadata_filepath.unlink()
    _ = gcs_storage.fetch(dataset_id=dataset_id,
                          dataset_version=dataset_version)
    metadata = read_metadata()

    assert metadata.get("time_created")
    assert blob.time_created.timestamp() == metadata["time_created"]
Esempio n. 5
0
def test_gcs_storage_cacheable_single_threaded() -> None:
    """Verify that a second call to a cacheable dataref hits the cache.

    The second `make_dataref` call uses a different range size but the same
    dataset id/version, so the cached (original-size) dataset must be served.
    """
    original_range_size = 120
    updated_range_size = 55
    dataset_id = "range-dataset"
    dataset_version = "0"
    configurations = create_gcs_configuration(access_server_port=15032)

    access_server_handler = test_util.AccessServerHandler(hostname="localhost",
                                                          port=15032)
    access_server_handler.run_server_in_thread()

    # try/finally ensures the access server thread is stopped even when an
    # assertion below fails; previously a failure leaked the running server.
    try:
        gcs_cache_filepath = get_gcs_filepath(
            configurations=configurations,
            dataset_id=dataset_id,
            dataset_version=dataset_version,
        )
        client = google_storage.Client()
        bucket = client.bucket(configurations.bucket)
        blob = bucket.blob(str(gcs_cache_filepath))
        # Start from a clean slate so the first call is a guaranteed cache miss.
        if blob.exists():
            blob.delete()

        gcs_storage = storage.GCSStorage(configurations=configurations)

        @gcs_storage.cacheable(dataset_id, dataset_version)
        def make_dataref(range_size: int) -> dataref.LMDBDataRef:
            return tf.data.Dataset.range(range_size)  # type: ignore

        original_data_stream = make_dataref(
            range_size=original_range_size).stream()
        assert original_data_stream.length == original_range_size
        data_generator = original_data_stream.iterator_fn()
        generator_length = 0
        for idx, data in enumerate(data_generator):
            assert idx == data
            generator_length += 1
        assert generator_length == original_range_size

        # Cache hit: same id/version, so the ORIGINAL length must come back.
        updated_data_stream = make_dataref(range_size=updated_range_size).stream()
        assert updated_data_stream.length == original_range_size
    finally:
        access_server_handler.stop_server()
Esempio n. 6
0
def test_gcs_storage_submit_and_fetch() -> None:
    """Round-trip a range dataset through submit() and fetch() and verify
    the streamed contents match the original."""
    range_size = 20
    dataset_id = "range-dataset"
    dataset_version = "0"
    dataset = tf.data.Dataset.range(range_size)
    configurations = create_gcs_configuration(access_server_port=15032)

    gcs_storage = storage.GCSStorage(configurations=configurations)
    gcs_storage.submit(
        data=dataset, dataset_id=dataset_id, dataset_version=dataset_version,
    )
    # Renamed from `dataref`: the old name shadowed the module-level
    # `dataref` import (used elsewhere as `dataref.LMDBDataRef`).
    fetched_dataref = gcs_storage.fetch(dataset_id=dataset_id, dataset_version=dataset_version)
    stream = fetched_dataref.stream()

    assert stream.length == range_size
    data_generator = stream.iterator_fn()
    generator_length = 0
    for idx, data in enumerate(data_generator):
        # A range dataset yields its own indices.
        assert idx == data
        generator_length += 1
    assert generator_length == range_size
Esempio n. 7
0
    def _configure_storage(self) -> None:
        """Instantiate ``self._storage`` from the experiment's ``data_layer`` config.

        Selects a storage backend (shared FS, S3, or GCS) based on the
        configured data-layer type, passing along a TF session config and —
        for the cloud backends — the read/write coordinator URL.

        Raises:
            AssertionError: if the configured data_layer type is unsupported.
        """
        session_config = None  # type: Optional[tf.compat.v1.ConfigProto]
        if self._hvd_config.use:
            # For multi-GPU training, we map processes to individual GPUs. TF requires
            # that for each instantiation of `tf.Session`, the process is mapped
            # to the same GPU.
            session_config = tf.compat.v1.ConfigProto()
            session_config.gpu_options.visible_device_list = str(
                hvd.local_rank())

        # Read/write coordinator websocket endpoint on the master;
        # use wss when the cluster has TLS enabled.
        scheme = "wss" if self._env.use_tls else "ws"
        rw_coordinator_url = (
            f"{scheme}://{self._env.master_addr}:{self._env.master_port}/ws/data-layer/"
        )
        data_layer_type = self._env.experiment_config.get_data_layer_type()

        if data_layer_type == StorageTypes.SHARED_FS.value:
            # Shared filesystem: cache lives directly on the container path;
            # no coordinator URL or credentials needed.
            local_cache_dir_path = self._env.experiment_config[
                "data_layer"].get("container_storage_path")
            local_cache_path = init_container_storage_path(
                configured_storage_path=local_cache_dir_path)

            storage_config = storage.LFSConfigurations(
                storage_dir_path=str(local_cache_path))
            self._storage = storage.LFSStorage(
                storage_config, tensorflow_config=session_config)

        elif data_layer_type == StorageTypes.S3.value:
            # S3: bucket + optional credentials/endpoint from the experiment
            # config; .get() leaves unset values as None.
            local_cache_dir_path = self._env.experiment_config[
                "data_layer"].get("local_cache_container_path")
            local_cache_path = init_container_storage_path(
                configured_storage_path=local_cache_dir_path)

            storage_config = storage.S3Configurations(
                bucket=self._env.experiment_config["data_layer"]["bucket"],
                bucket_directory_path=self._env.experiment_config["data_layer"]
                ["bucket_directory_path"],
                url=rw_coordinator_url,
                local_cache_dir=str(local_cache_path),
                access_key=self._env.experiment_config["data_layer"].get(
                    "access_key"),
                secret_key=self._env.experiment_config["data_layer"].get(
                    "secret_key"),
                endpoint_url=self._env.experiment_config["data_layer"].get(
                    "endpoint_url"),
                coordinator_cert_file=self._env.master_cert_file,
                coordinator_cert_name=self._env.master_cert_name,
            )
            self._storage = storage.S3Storage(storage_config,
                                              tensorflow_config=session_config)

        elif data_layer_type == StorageTypes.GCS.value:
            # GCS: like S3 but credentials come from the environment rather
            # than the experiment config.
            local_cache_dir_path = self._env.experiment_config[
                "data_layer"].get("local_cache_container_path")
            local_cache_path = init_container_storage_path(
                configured_storage_path=local_cache_dir_path)
            storage_config = storage.GCSConfigurations(
                bucket=self._env.experiment_config["data_layer"]["bucket"],
                bucket_directory_path=self._env.experiment_config["data_layer"]
                ["bucket_directory_path"],
                url=rw_coordinator_url,
                local_cache_dir=str(local_cache_path),
                coordinator_cert_file=self._env.master_cert_file,
                coordinator_cert_name=self._env.master_cert_name,
            )
            self._storage = storage.GCSStorage(
                storage_config, tensorflow_config=session_config)

        else:
            raise AssertionError(
                "Please select a supported data_layer type. Supported types include: "
                f"{[i.value for i in StorageTypes]}")