Example #1
    def test_torch_index_with_compressor(self):
        from matorage.torch import Dataset

        data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_torch_index_with_compressor",
            additional={"framework": "pytorch"},
            compressor={
                "complevel": 4,
                "complib": "zlib"
            },
            attributes=[
                DataAttribute("image", "uint8", (2, 2), itemsize=32),
                DataAttribute("target", "uint8", (1), itemsize=32),
            ])

        self.test_torch_saver(data_config=data_config)

        dataset = Dataset(config=self.data_config,
                          index=True,
                          cache_folder_path=self.cache_folder_path)

        assert torch.equal(dataset[0][0],
                           torch.tensor([[1, 2], [3, 4]], dtype=torch.uint8))
        assert torch.equal(dataset[0][1], torch.tensor([0], dtype=torch.uint8))
Example #2
    def test_tf_index_with_compressor(self):
        from matorage.tensorflow import Dataset

        data_config = DataConfig(**self.storage_config,
                                 dataset_name="test_tf_index_with_compressor",
                                 additional={"framework": "tensorflow"},
                                 compressor={
                                     "complevel": 4,
                                     "complib": "zlib"
                                 },
                                 attributes=[
                                     DataAttribute("image",
                                                   "uint8", (2, 2),
                                                   itemsize=32),
                                     DataAttribute("target",
                                                   "uint8", (1),
                                                   itemsize=32),
                                 ])

        self.test_tf_saver(data_config=data_config)

        dataset = Dataset(config=self.data_config, index=True)

        assert tf.reduce_all(
            tf.equal(dataset[0][0],
                     tf.constant([[1, 2], [3, 4]], dtype=tf.uint8)))
        assert tf.reduce_all(
            tf.equal(dataset[0][1], tf.constant([0], dtype=tf.uint8)))
Example #3
    def test_tf_saver(self, data_config=None, save_to_json_file=False):
        if data_config is None:
            self.data_config = DataConfig(**self.storage_config,
                                          dataset_name="test_tf_saver",
                                          additional={
                                              "framework": "tensorflow"
                                          },
                                          attributes=[
                                              DataAttribute("image",
                                                            "uint8", (2, 2),
                                                            itemsize=32),
                                              DataAttribute("target",
                                                            "uint8", (1),
                                                            itemsize=32),
                                          ])
        else:
            self.data_config = data_config

        if save_to_json_file:
            self.data_config_file = "data_config_file.json"
            self.data_config.to_json_file(self.data_config_file)

        self.data_saver = DataSaver(config=self.data_config)

        self.data_saver({
            "image": np.asarray([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]),
            "target": np.asarray([0, 1]),
        })
        self.data_saver.disconnect()
Example #4
    def test_torch_loader_with_compressor(self):
        from matorage.torch import Dataset

        data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_torch_loader_with_compressor",
            additional={"framework": "pytorch"},
            compressor={
                "complevel": 4,
                "complib": "zlib"
            },
            attributes=[
                DataAttribute("image", "uint8", (2, 2), itemsize=32),
                DataAttribute("target", "uint8", (1), itemsize=32),
            ])

        self.test_torch_saver(data_config=data_config)

        self.dataset = Dataset(config=data_config,
                               cache_folder_path=self.cache_folder_path)
        loader = DataLoader(self.dataset,
                            batch_size=64,
                            num_workers=8,
                            shuffle=True)

        for batch_idx, (image, target) in enumerate(tqdm(loader)):
            pass
Example #5
    def test_tf_loader_with_compressor(self):
        from matorage.tensorflow import Dataset

        data_config = DataConfig(**self.storage_config,
                                 dataset_name="test_tf_loader_with_compressor",
                                 additional={"framework": "tensorflow"},
                                 compressor={
                                     "complevel": 4,
                                     "complib": "zlib"
                                 },
                                 attributes=[
                                     DataAttribute("image",
                                                   "uint8", (2, 2),
                                                   itemsize=32),
                                     DataAttribute("target",
                                                   "uint8", (1),
                                                   itemsize=32),
                                 ])

        self.test_tf_saver(data_config=data_config)

        self.dataset = Dataset(config=data_config)

        for batch_idx, (image, target) in enumerate(
                tqdm(self.dataset.dataloader, total=2)):
            pass
Example #6
    def test_dataconfig_two_attributes(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_dataconfig_two_attributes",
            attributes=[
                DataAttribute("x", "uint8", (1)),
                DataAttribute("y", "uint8", (1)),
            ])
Example #7
    def test_dataconfig_attributes_already_exist(self):
        with self.assertRaisesRegex(KeyError, "already exists in"):
            self.data_config = DataConfig(
                **self.storage_config,
                dataset_name="test_dataconfig_attributes_already_exist",
                attributes=[
                    DataAttribute("x", "uint8", (1)),
                    DataAttribute("x", "uint8", (1)),
                ])
Example #8
    def test_datasaver_s3_filetype(self):
        from matorage.torch import Dataset

        self.storage_config = {
            'endpoint': 's3.us-east-1.amazonaws.com',
            'access_key': os.environ['access_key'],
            'secret_key': os.environ['secret_key'],
            'region': 'us-east-1',
            'secure': False,
        }

        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_s3_filetype",
            attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
        )
        self.data_saver = DataSaver(config=self.data_config)
        x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        self.assertEqual(x.shape, (3, 2))
        self.data_saver({"x": x})

        # Close the file before registering it so the buffered write is
        # flushed to disk before the saver reads it.
        with open("test.txt", "w") as _file:
            _file.write('this is test')
        self.data_saver({"file": "test.txt"}, filetype=True)

        self.data_saver.disconnect()

        self.dataset = Dataset(config=self.data_config,
                               cache_folder_path=self.cache_folder_path)
        self.assertEqual(self.dataset.get_filetype_list, ["file"])
        _local_filepath = self.dataset.get_filetype_from_key("file")
        with open(_local_filepath, 'r') as f:
            self.assertEqual(f.read(), 'this is test')
Example #9
    def test_datasaver_filetype(self):
        from matorage.torch import Dataset

        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_filetype",
            attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
        )
        self.data_saver = DataSaver(config=self.data_config)
        x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        self.assertEqual(x.shape, (3, 2))
        self.data_saver({"x": x})

        # Close the file before registering it so the buffered write is
        # flushed to disk before the saver reads it.
        with open("test.txt", "w") as _file:
            _file.write('this is test')
        self.data_saver({"file": "test.txt"}, filetype=True)

        self.data_saver.disconnect()

        self.dataset = Dataset(config=self.data_config,
                               cache_folder_path=self.cache_folder_path)
        self.assertEqual(self.dataset.get_filetype_list, ["file"])
        _local_filepath = self.dataset.get_filetype_from_key("file")
        with open(_local_filepath, 'r') as f:
            self.assertEqual(f.read(), 'this is test')
Example #10
    def _check_all(self):
        """
        Check all class variable is fine.

        Returns:
            :obj: `None`:
        """
        self._check_bucket()

        if self.attributes is None:
            raise ValueError("attributes is empty")
        if isinstance(self.attributes, tuple):
            self.attributes = DataAttribute(
                name=self.attributes[0],
                type=self.attributes[1],
                shape=self.attributes[2],
            )
        if isinstance(self.attributes, DataAttribute):
            self.attributes = [self.attributes]

        for i, attr in enumerate(self.attributes):
            if isinstance(attr, tuple):
                self.attributes[i] = DataAttribute(attr[0], attr[1], attr[2])

        attribute_names = set()
        for attribute in self.attributes:
            assert isinstance(attribute.type, tables.atom.Atom)
            if attribute.name in attribute_names:
                raise KeyError("{} is already exist in {}".format(
                    attribute.name, attribute_names))
            else:
                attribute_names.add(attribute.name)

        # Flatten the shapes of `self.attributes`
        self.flatten_attributes = copy.deepcopy(self.attributes)
        self._convert_type_flatten()

        if self.compressor["complevel"] < 0 or 9 < self.compressor["complevel"]:
            raise ValueError(
                "Compressor level is {} must be 0-9 interger".format(
                    self.compressor["level"]))
        if self.compressor["complib"] not in ("zlib", "lzo", "bzip2", "blosc"):
            raise ValueError("compressor mode {} is not valid. select in "
                             "zlib, lzo, bzip2, blosc".format(
                                 self.compressor["lib"]))
Example #11
    def test_datasaver_string_attribute(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_string_attribute",
            attributes=[DataAttribute("x", "string", (2), itemsize=32)])
        self.data_saver = DataSaver(config=self.data_config)
        x = np.asarray([["a", "b"], ["c", "d"], ["e", "f"]])
        self.assertEqual(x.shape, (3, 2))
        self.data_saver({"x": x})
Example #12
    def test_datasaver_float64_attribute(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_float64_attribute",
            attributes=[DataAttribute("x", "float64", (2), itemsize=32)])
        self.data_saver = DataSaver(config=self.data_config)
        x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        self.assertEqual(x.shape, (3, 2))
        self.data_saver({"x": x})
Example #13
    def test_datasaver_uint32_attribute(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_uint32_attribute",
            attributes=[DataAttribute("x", "uint32", (2), itemsize=32)])
        self.data_saver = DataSaver(config=self.data_config)
        x = np.asarray([[1, 2], [3, 4], [5, 6]])
        self.assertEqual(x.shape, (3, 2))
        self.data_saver({"x": x})
Example #14
    def test_datasaver_bool_attribute(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_bool_attribute",
            attributes=[DataAttribute("x", "bool", (2), itemsize=32)])
        self.data_saver = DataSaver(config=self.data_config)
        x = np.asarray([[True, False], [False, True], [True, True]])
        self.assertEqual(x.shape, (3, 2))
        self.data_saver({"x": x})
Example #15
    def test_reload_dataconfig(self):
        self.data_config = DataConfig(**self.storage_config,
                                      dataset_name="test_reload_dataconfig",
                                      attributes=DataAttribute(
                                          "x", "uint8", (1)))
        self.data_config_file = "data_config_file.json"
        self.data_config.to_json_file(self.data_config_file)

        self.data_config = None

        self.data_config = DataConfig.from_json_file(self.data_config_file)
Example #16
    def test_datasaver_nas(self):
        self.data_config = DataConfig(
            endpoint="/tmp",
            dataset_name="test_datasaver_nas",
            attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
        )
        self.data_saver = DataSaver(config=self.data_config)
        x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        self.assertEqual(x.shape, (3, 2))
        self.data_saver({"x": x})
Example #17
    def test_datasaver_refresh(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_refresh",
            attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
        )
        for refresh in [False, True]:
            self.data_saver = DataSaver(config=self.data_config, refresh=refresh)
            x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            self.assertEqual(x.shape, (3, 2))
            self.data_saver({"x": x})
            self.data_saver.disconnect()
Example #18
    def test_torch_saver_nas(self):
        self.data_config = DataConfig(**self.nas_config,
                                      dataset_name="test_torch_saver_nas",
                                      additional={"framework": "pytorch"},
                                      attributes=[
                                          DataAttribute("image",
                                                        "uint8", (2, 2),
                                                        itemsize=32),
                                          DataAttribute("target",
                                                        "uint8", (1),
                                                        itemsize=32),
                                      ])

        self.data_saver = DataSaver(config=self.data_config)

        self.data_saver({
            "image": np.asarray([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]),
            "target": np.asarray([0, 1]),
        })
        self.data_saver.disconnect()
Example #19
    def test_datasaver_blosc(self):
        for level in range(10):
            self.data_config = DataConfig(
                **self.storage_config,
                dataset_name="test_datasaver_blosc",
                attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
                compressor={"complevel": level, "complib": "blosc"},
            )
            self.data_saver = DataSaver(config=self.data_config)
            x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            self.assertEqual(x.shape, (3, 2))
            self.data_saver({"x": x})
            self.data_saver.disconnect()
Example #20
    def test_datasaver_s3(self):
        self.storage_config = {
            'endpoint': 's3.us-east-1.amazonaws.com',
            'access_key': os.environ['access_key'],
            'secret_key': os.environ['secret_key'],
            'region': 'us-east-1',
            'secure': False,
        }

        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_datasaver_s3",
            attributes=[DataAttribute("x", "float64", (2), itemsize=32)],
        )
        self.data_saver = DataSaver(config=self.data_config)
        x = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        self.assertEqual(x.shape, (3, 2))
        self.data_saver({"x": x})
        self.data_saver.disconnect()
Example #21
    @classmethod
    def from_json_file(cls, json_file):
        """
        Constructs a `Config` from the path to a json file of parameters.

        Args:
            json_file (:obj:`string`):
                Path to the JSON file containing the parameters.

        Returns:
            :obj:`DataConfig, ModelConfig`: An instance of a configuration object

        """
        config_dict = cls._dict_from_json_file(json_file)

        config_dict["attributes"] = [
            DataAttribute(**item) for item in config_dict["attributes"]
        ]

        return cls(**config_dict)
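
A JSON round trip, as exercised by `test_reload_dataconfig` above, might look like the following sketch (the file path, `storage_config` dict, and config values are illustrative):

    config = DataConfig(**storage_config,
                        dataset_name="roundtrip_demo",
                        attributes=DataAttribute("x", "uint8", (1)))
    config.to_json_file("data_config_file.json")
    # from_json_file rebuilds each serialized attribute dict as a
    # DataAttribute before instantiating the config class.
    restored = DataConfig.from_json_file("data_config_file.json")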
Example #22
    def _check_bucket(self):
        """
        Check bucket name is exist. If not exist, create new bucket
        If bucket and metadata sub folder exist, get metadata(attributes, compressor) from there.

        Returns:
            :obj: `None`:
        """
        _client = (Minio(
            self.endpoint,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=self.secure,
        ) if not check_nas(self.endpoint) else NAS(self.endpoint))
        if _client.bucket_exists(self.bucket_name):
            objects = _client.list_objects(self.bucket_name,
                                           prefix="metadata/")
            _metadata = None
            for obj in objects:
                _metadata = _client.get_object(self.bucket_name,
                                               obj.object_name)
                break
            if not _metadata:
                return

            metadata_dict = json.loads(_metadata.read().decode("utf-8"))
            if self.endpoint != metadata_dict["endpoint"]:
                raise ValueError(
                    "Already created endpoint({}) doesn't current endpoint str({})"
                    " It may occurs permission denied error".format(
                        metadata_dict["endpoint"], self.endpoint))

            self.compressor = metadata_dict["compressor"]
            self.attributes = [
                DataAttribute(**item) for item in metadata_dict["attributes"]
            ]
        else:
            logger.warning("{} {} does not exist!".format(
                self.dataset_name, str(self.additional)))
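
The client selection at the top of `_check_bucket` matches how the tests address storage: a filesystem path such as `/tmp` (see `test_datasaver_nas`) is routed to the `NAS` backend, while a host endpoint such as `s3.us-east-1.amazonaws.com` (see `test_datasaver_s3`) goes through the `Minio` client. A minimal sketch of that dispatch as a hypothetical standalone helper, assuming `check_nas` detects path-like endpoints:

    def _storage_client(endpoint, access_key=None, secret_key=None, secure=False):
        # Assumption: check_nas() returns True for filesystem paths like "/tmp".
        if check_nas(endpoint):
            return NAS(endpoint)
        return Minio(endpoint,
                     access_key=access_key,
                     secret_key=secret_key,
                     secure=secure)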
Example #23
    def test_dataconfig_one_attribute(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_dataconfig_one_attribute",
            attributes=DataAttribute("x", "uint8", (1)))
Example #24
    def test_dataconfig_float64_attribute(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_dataconfig_float64_attribute",
            attributes=[DataAttribute("x", "float64", (1))],
        )
Example #25
    def test_dataconfig_string_attribute(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_dataconfig_string_attribute",
            attributes=[DataAttribute("x", "string", (1), itemsize=32)])
Example #26
    def test_dataconfig_int32_attribute(self):
        self.data_config = DataConfig(
            **self.storage_config,
            dataset_name="test_dataconfig_int32_attribute",
            attributes=[DataAttribute("x", "int32", (1))])