def test_torch_index_with_compressor(self):
    """Save a zlib-compressed dataset and read samples back by index (PyTorch)."""
    from matorage.torch import Dataset

    data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_torch_index_with_compressor",
        additional={"framework": "pytorch"},
        compressor={"complevel": 4, "complib": "zlib"},
        attributes=[
            DataAttribute("image", "uint8", (2, 2), itemsize=32),
            DataAttribute("target", "uint8", (1), itemsize=32),
        ],
    )
    self.test_torch_saver(data_config=data_config)

    # FIX: use the local config directly instead of relying on
    # test_torch_saver's side effect of assigning self.data_config
    # (same object, but the dependency is now explicit — matches the
    # loader tests).
    dataset = Dataset(config=data_config, index=True,
                      cache_folder_path=self.cache_folder_path)

    assert torch.equal(dataset[0][0],
                       torch.tensor([[1, 2], [3, 4]], dtype=torch.uint8))
    assert torch.equal(dataset[0][1],
                       torch.tensor([0], dtype=torch.uint8))
def test_tf_index_with_compressor(self):
    """Save a zlib-compressed dataset and read samples back by index (TensorFlow)."""
    from matorage.tensorflow import Dataset

    data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_tf_index_with_compressor",
        additional={"framework": "tensorflow"},
        compressor={"complevel": 4, "complib": "zlib"},
        attributes=[
            DataAttribute("image", "uint8", (2, 2), itemsize=32),
            DataAttribute("target", "uint8", (1), itemsize=32),
        ],
    )
    self.test_tf_saver(data_config=data_config)

    # FIX: use the local config directly instead of relying on
    # test_tf_saver's side effect of assigning self.data_config
    # (test_tf_saver sets self.data_config = data_config, so the object
    # is identical — this just makes the dependency explicit).
    dataset = Dataset(config=data_config, index=True)

    assert tf.reduce_all(
        tf.equal(dataset[0][0],
                 tf.constant([[1, 2], [3, 4]], dtype=tf.uint8)))
    assert tf.reduce_all(
        tf.equal(dataset[0][1], tf.constant([0], dtype=tf.uint8)))
def test_tf_saver(self, data_config=None, save_to_json_file=False):
    """Persist a tiny two-sample image/target dataset for the TensorFlow tests.

    When called without ``data_config`` a default config is built; other
    tests pass their own config to reuse this as a save helper.
    """
    if data_config is not None:
        self.data_config = data_config
    else:
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_tf_saver",
            additional={"framework": "tensorflow"},
            attributes=[
                DataAttribute("image", "uint8", (2, 2), itemsize=32),
                DataAttribute("target", "uint8", (1), itemsize=32),
            ],
        )

    if save_to_json_file:
        self.data_config_file = "data_config_file.json"
        self.data_config.to_json_file(self.data_config_file)

    self.data_saver = DataSaver(config=self.data_config)
    batch = {
        "image": np.asarray([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]),
        "target": np.asarray([0, 1]),
    }
    self.data_saver(batch)
    self.data_saver.disconnect()
def test_torch_loader_with_compressor(self):
    """Iterate a zlib-compressed dataset through a PyTorch DataLoader."""
    from matorage.torch import Dataset

    data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_torch_loader_with_compressor",
        additional={"framework": "pytorch"},
        compressor={"complevel": 4, "complib": "zlib"},
        attributes=[
            DataAttribute("image", "uint8", (2, 2), itemsize=32),
            DataAttribute("target", "uint8", (1), itemsize=32),
        ],
    )
    self.test_torch_saver(data_config=data_config)

    self.dataset = Dataset(config=data_config,
                           cache_folder_path=self.cache_folder_path)
    loader = DataLoader(self.dataset, batch_size=64,
                        num_workers=8, shuffle=True)
    # Draining the loader is the test: it must not raise.
    for _idx, (_image, _target) in enumerate(tqdm(loader)):
        pass
def test_tf_loader_with_compressor(self):
    """Iterate a zlib-compressed dataset through the TensorFlow dataloader."""
    from matorage.tensorflow import Dataset

    data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_tf_loader_with_compressor",
        additional={"framework": "tensorflow"},
        compressor={"complevel": 4, "complib": "zlib"},
        attributes=[
            DataAttribute("image", "uint8", (2, 2), itemsize=32),
            DataAttribute("target", "uint8", (1), itemsize=32),
        ],
    )
    self.test_tf_saver(data_config=data_config)

    self.dataset = Dataset(config=data_config)
    # Draining the dataloader is the test: it must not raise.
    for _idx, (_image, _target) in enumerate(
            tqdm(self.dataset.dataloader, total=2)):
        pass
def test_dataconfig_two_attributes(self):
    """A config may declare several attributes with distinct names."""
    attributes = [
        DataAttribute("x", "uint8", (1)),
        DataAttribute("y", "uint8", (1)),
    ]
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_dataconfig_two_attributes",
        attributes=attributes,
    )
def test_dataconfig_attributes_already_exist(self):
    """Declaring two attributes with the same name must raise KeyError."""
    duplicated = [
        DataAttribute("x", "uint8", (1)),
        DataAttribute("x", "uint8", (1)),
    ]
    # The regex below matches the exact message raised by DataConfig.
    with self.assertRaisesRegex(KeyError, "is already exist in"):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_dataconfig_attributes_already_exist",
            attributes=duplicated,
        )
def test_datasaver_s3_filetype(self):
    """Round-trip a raw file through S3-backed storage alongside array data."""
    from matorage.torch import Dataset

    self.storage_config = {
        'endpoint': 's3.us-east-1.amazonaws.com',
        'access_key': os.environ['access_key'],
        'secret_key': os.environ['secret_key'],
        'region': 'us-east-1',
        'secure': False,
    }
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_datasaver_s3_filetype",
        attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
    )
    self.data_saver = DataSaver(config=self.data_config)

    x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    self.assertEqual(x.shape, (3, 2))
    self.data_saver({"x": x})

    # BUG FIX: the file must be flushed and closed BEFORE the saver reads
    # it — the original passed the path while the handle was still open,
    # so buffered content might not yet be on disk and an empty/partial
    # file could be uploaded.
    with open("test.txt", "w") as _file:
        _file.write('this is test')
    self.data_saver({"file": "test.txt"}, filetype=True)
    self.data_saver.disconnect()

    self.dataset = Dataset(config=self.data_config,
                           cache_folder_path=self.cache_folder_path)
    self.assertEqual(self.dataset.get_filetype_list, ["file"])
    _local_filepath = self.dataset.get_filetype_from_key("file")
    with open(_local_filepath, 'r') as f:
        self.assertEqual(f.read(), 'this is test')
def test_datasaver_filetype(self):
    """Round-trip a raw file through storage alongside array data."""
    from matorage.torch import Dataset

    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_datasaver_filetype",
        attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
    )
    self.data_saver = DataSaver(config=self.data_config)

    x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    self.assertEqual(x.shape, (3, 2))
    self.data_saver({"x": x})

    # BUG FIX: the file must be flushed and closed BEFORE the saver reads
    # it — the original passed the path while the handle was still open,
    # so buffered content might not yet be on disk and an empty/partial
    # file could be uploaded.
    with open("test.txt", "w") as _file:
        _file.write('this is test')
    self.data_saver({"file": "test.txt"}, filetype=True)
    self.data_saver.disconnect()

    self.dataset = Dataset(config=self.data_config,
                           cache_folder_path=self.cache_folder_path)
    self.assertEqual(self.dataset.get_filetype_list, ["file"])
    _local_filepath = self.dataset.get_filetype_from_key("file")
    with open(_local_filepath, 'r') as f:
        self.assertEqual(f.read(), 'this is test')
def _check_all(self):
    """
    Check all class variable is fine.

    Normalizes ``self.attributes`` (a bare tuple or single DataAttribute is
    wrapped into a list; tuples inside the list become DataAttribute
    instances), verifies attribute names are unique, builds the flattened
    attribute copies, and validates the compressor settings.

    Raises:
        ValueError: if attributes are missing or compressor settings invalid.
        KeyError: if two attributes share a name.

    Returns:
        :obj: `None`:
    """
    self._check_bucket()

    if self.attributes is None:
        raise ValueError("attributes is empty")
    # Accept a bare (name, type, shape) tuple.
    if isinstance(self.attributes, tuple):
        self.attributes = DataAttribute(
            name=self.attributes[0],
            type=self.attributes[1],
            shape=self.attributes[2],
        )
    # Accept a single DataAttribute.
    if isinstance(self.attributes, DataAttribute):
        self.attributes = [self.attributes]
    # Accept tuples inside the list.
    for i, attr in enumerate(self.attributes):
        if isinstance(attr, tuple):
            self.attributes[i] = DataAttribute(attr[0], attr[1], attr[2])

    attribute_names = set()
    for attribute in self.attributes:
        assert isinstance(attribute.type, tables.atom.Atom)
        if attribute.name in attribute_names:
            # NOTE: tests regex-match this exact message — do not reword.
            raise KeyError("{} is already exist in {}".format(
                attribute.name, attribute_names))
        else:
            attribute_names.add(attribute.name)

    # To convert `self.attributes`'s shape to be flatten
    self.flatten_attributes = copy.deepcopy(self.attributes)
    self._convert_type_flatten()

    if self.compressor["complevel"] < 0 or 9 < self.compressor["complevel"]:
        # BUG FIX: the message formatted self.compressor["level"], a key
        # that does not exist, so an out-of-range level raised KeyError
        # instead of this ValueError. Use "complevel".
        raise ValueError(
            "Compressor level is {} must be 0-9 integer".format(
                self.compressor["complevel"]))
    if self.compressor["complib"] not in ("zlib", "lzo", "bzip2", "blosc"):
        # BUG FIX: same defect — self.compressor["lib"] does not exist;
        # use "complib".
        raise ValueError("compressor mode {} is not valid. select in "
                         "zlib, lzo, bzip2, blosc".format(
                             self.compressor["complib"]))
def test_datasaver_string_attribute(self):
    """Saving string-typed data is accepted by DataSaver."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_datasaver_string_attribute",
        attributes=[DataAttribute("x", "string", (2), itemsize=32)],
    )
    self.data_saver = DataSaver(config=self.data_config)

    sample = np.asarray([["a", "b"], ["c", "d"], ["e", "f"]])
    self.assertEqual(sample.shape, (3, 2))
    self.data_saver({"x": sample})
def test_datasaver_float64_attribute(self):
    """Saving float64-typed data is accepted by DataSaver."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_datasaver_float64_attribute",
        attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
    )
    self.data_saver = DataSaver(config=self.data_config)

    sample = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    self.assertEqual(sample.shape, (3, 2))
    self.data_saver({"x": sample})
def test_datasaver_uint32_attribute(self):
    """Saving uint32-typed data is accepted by DataSaver."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_datasaver_uint32_attribute",
        attributes=[DataAttribute("x", "uint32", (2), itemsize=32)],
    )
    self.data_saver = DataSaver(config=self.data_config)

    sample = np.asarray([[1, 2], [3, 4], [5, 6]])
    self.assertEqual(sample.shape, (3, 2))
    self.data_saver({"x": sample})
def test_datasaver_bool_attribute(self):
    """Saving bool-typed data is accepted by DataSaver."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_datasaver_bool_attribute",
        attributes=[DataAttribute("x", "bool", (2), itemsize=32)],
    )
    self.data_saver = DataSaver(config=self.data_config)

    sample = np.asarray([[True, False], [False, True], [True, True]])
    self.assertEqual(sample.shape, (3, 2))
    self.data_saver({"x": sample})
def test_reload_dataconfig(self):
    """A config survives a round trip through its JSON file representation."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_reload_dataconfig",
        attributes=DataAttribute("x", "uint8", (1)),
    )
    self.data_config_file = "data_config_file.json"
    self.data_config.to_json_file(self.data_config_file)

    # Drop the in-memory config, then rebuild it purely from the file.
    self.data_config = None
    self.data_config = DataConfig.from_json_file(self.data_config_file)
def test_datasaver_nas(self):
    """Saving works against a NAS (local filesystem path) endpoint."""
    self.data_config = DataConfig(
        endpoint="/tmp",
        dataset_name="test_datasaver_nas",
        attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
    )
    self.data_saver = DataSaver(config=self.data_config)

    sample = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    self.assertEqual(sample.shape, (3, 2))
    self.data_saver({"x": sample})
def test_datasaver_refresh(self):
    """Saving works both without and with the refresh flag."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_datasaver_refresh",
        attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
    )
    for refresh in (False, True):
        self.data_saver = DataSaver(config=self.data_config,
                                    refresh=refresh)
        sample = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        self.assertEqual(sample.shape, (3, 2))
        self.data_saver({"x": sample})
        self.data_saver.disconnect()
def test_torch_saver_nas(self):
    """Persist a tiny two-sample PyTorch dataset to a NAS endpoint."""
    self.data_config = DataConfig(
        **self.nas_config,
        dataset_name="test_torch_saver_nas",
        additional={"framework": "pytorch"},
        attributes=[
            DataAttribute("image", "uint8", (2, 2), itemsize=32),
            DataAttribute("target", "uint8", (1), itemsize=32),
        ],
    )
    self.data_saver = DataSaver(config=self.data_config)
    batch = {
        "image": np.asarray([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]),
        "target": np.asarray([0, 1]),
    }
    self.data_saver(batch)
    self.data_saver.disconnect()
def test_datasaver_blosc(self):
    """Every blosc compression level (0 through 9) accepts a write."""
    for level in range(10):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_blosc",
            attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
            compressor={"complevel": level, "complib": "blosc"},
        )
        self.data_saver = DataSaver(config=self.data_config)

        sample = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        self.assertEqual(sample.shape, (3, 2))
        self.data_saver({"x": sample})
        self.data_saver.disconnect()
def test_datasaver_s3(self):
    """Saving works against a real S3 endpoint (credentials from env)."""
    self.storage_config = {
        'endpoint': 's3.us-east-1.amazonaws.com',
        'access_key': os.environ['access_key'],
        'secret_key': os.environ['secret_key'],
        'region': 'us-east-1',
        'secure': False,
    }
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_datasaver_s3",
        attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
    )
    self.data_saver = DataSaver(config=self.data_config)

    sample = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    self.assertEqual(sample.shape, (3, 2))
    self.data_saver({"x": sample})
    self.data_saver.disconnect()
def from_json_file(cls, json_file):
    """
    Constructs a `Config` from the path to a json file of parameters.

    Args:
        json_file (:obj:`string`):
            Path to the JSON file containing the parameters.

    Returns:
        :obj:`DataConfig, ModelConfig`: An instance of a configuration object
    """
    params = cls._dict_from_json_file(json_file)
    # JSON stores attributes as plain dicts; rebuild DataAttribute objects.
    params["attributes"] = [
        DataAttribute(**raw) for raw in params["attributes"]
    ]
    return cls(**params)
def _check_bucket(self):
    """
    Check bucket name is exist. If not exist, create new bucket
    If bucket and metadata sub folder exist,
    get metadata(attributes, compressor) from there.

    Returns:
        :obj: `None`:
    """
    # NAS endpoints are plain paths; everything else goes through Minio.
    if check_nas(self.endpoint):
        _client = NAS(self.endpoint)
    else:
        _client = Minio(
            self.endpoint,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=self.secure,
        )

    if not _client.bucket_exists(self.bucket_name):
        # BUG FIX: logger.warn() is deprecated since Python 3.3 —
        # use logger.warning() instead.
        logger.warning("{} {} is not exist!".format(self.dataset_name,
                                                    str(self.additional)))
        return

    # Fetch the first object under metadata/ (if any exists).
    _metadata = None
    for obj in _client.list_objects(self.bucket_name, prefix="metadata/"):
        _metadata = _client.get_object(self.bucket_name, obj.object_name)
        break
    if not _metadata:
        return

    metadata_dict = json.loads(_metadata.read().decode("utf-8"))
    if self.endpoint != metadata_dict["endpoint"]:
        raise ValueError(
            "Already created endpoint({}) doesn't current endpoint str({})"
            " It may occurs permission denied error".format(
                metadata_dict["endpoint"], self.endpoint))

    # Adopt the stored compressor and attribute definitions so this
    # config matches what was originally written to the bucket.
    self.compressor = metadata_dict["compressor"]
    self.attributes = [
        DataAttribute(**item) for item in metadata_dict["attributes"]
    ]
def test_dataconfig_one_attribute(self):
    """A single DataAttribute (not wrapped in a list) is accepted."""
    attribute = DataAttribute("x", "uint8", (1))
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_dataconfig_one_attribute",
        attributes=attribute,
    )
def test_dataconfig_float64_attribute(self):
    """A float64 attribute is accepted by DataConfig."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_dataconfig_float64_attribute",
        attributes=[
            DataAttribute("x", "float64", (1)),
        ],
    )
def test_dataconfig_string_attribute(self):
    """A string attribute (with itemsize) is accepted by DataConfig."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_dataconfig_string_attribute",
        attributes=[
            DataAttribute("x", "string", (1), itemsize=32),
        ],
    )
def test_dataconfig_int32_attribute(self):
    """An int32 attribute is accepted by DataConfig."""
    self.data_config = DataConfig(
        **self.storage_config,
        dataset_name="test_dataconfig_int32_attribute",
        attributes=[
            DataAttribute("x", "int32", (1)),
        ],
    )