class Manager(object): type = "optimizer" def __init__(self, config, num_worker_threads=4, multipart_upload_size=5 * _MB): self.config = config self.num_worker_threads = num_worker_threads self.multipart_upload_size = multipart_upload_size self._client = ( Minio( endpoint=self.config.endpoint, access_key=self.config.access_key, secret_key=self.config.secret_key, secure=self.config.secure, region=self.config.region, ) if not check_nas(self.config.endpoint) else NAS(self.config.endpoint) ) self._uploader = Uploader( client=self._client, bucket=self.config.bucket_name, num_worker_threads=self.num_worker_threads, multipart_upload_size=self.multipart_upload_size, inmemory=True, ) def _uploader_closing(self): self._uploader.join_queue() _metadata_file = tempfile.mktemp("metadata.json") with open(_metadata_file, "w", encoding="utf-8") as writer: writer.write(json.dumps(self.config.metadata, indent=4) + "\n") self._client.fput_object( bucket_name=self.config.bucket_name, object_name="metadata.json", file_path=_metadata_file, ) os.remove(_metadata_file) def _save_with_clear(self, step, optimizer, overwrite=False): if overwrite: objects = self._client.list_objects( bucket_name=self.config.bucket_name, prefix=f"{step}/" ) for obj in objects: self._client.remove_object( bucket_name=self.config.bucket_name, object_name=obj.object_name ) # saving optimizer self._save_optimizer(step, optimizer) self._uploader_closing() def _save_param(self, step, group, name, weight): _local_file = tempfile.mktemp(f"{name}.h5") _file = tables.open_file( _local_file, "w", driver="H5FD_CORE", driver_core_backing_store=False ) _file.create_carray( "/", self.type, obj=weight, filters=tables.Filters(**self.config.compressor) ) if group is not None: self._uploader.set_queue( local_file=_file.get_file_image(), remote_file=f"{step}/{group}/{name}" ) else: self._uploader.set_queue( local_file=_file.get_file_image(), remote_file=f"{step}/{name}" ) _file.close() def save(self, optimizer, scheduler=None): if not self._client.bucket_exists(self.config.bucket_name): self._client.make_bucket( self.config.bucket_name, location=self.config.region ) step = self._get_step(optimizer) if not step: logger.error( "{} {} step({})is not exist".format( self.config.optimizer_name, self.config.additional, str(step) ) ) return if step in self.config.metadata["optimizer"]: logger.info( "{} {} is already exist, so optimizer will be overwrited.".format( self.config.optimizer_name, str(self.config.additional) ) ) self._save_with_clear(step, optimizer, overwrite=True) else: self._set_metadata( metadata=self.config.metadata, optimizer=optimizer, step=step ) self._save_with_clear(step, optimizer) if scheduler: self._set_scheduler( metadata=self.config.metadata, scheduler=scheduler, step=step ) logger.info("optimizer with {} is saved".format(str(step))) def load(self, optimizer, step): layers = self._client.list_objects( bucket_name=self.config.bucket_name, prefix=f"{step}/", recursive=True ) logger.info("optimizer with {} is loaded".format(str(step))) self._load_optimizer(step, layers, optimizer) @property def get_metadata(self): """ Get all optimizers according to metadata by step. 
        Returns:
            :obj:`dict`: optimizer metadata

        Examples::

            >>> optimizer_manager = OptimizerManager(config=optimizer_config)
            >>> optimizer_manager.save(optimizer)
            >>> optimizer_manager.get_metadata
            {'938': {
                'framework': 'pytorch',
                'param_groups': [
                    {
                        'lr': 0.01,
                        'betas': [0.9, 0.999],
                        'eps': 1e-08,
                        'weight_decay': 0,
                        'amsgrad': False,
                        'params': [
                            140516594711520, 140516594711760, 140517867028384,
                            140516594711680, 140516594693376, 140516594612336
                        ]
                    }
                ]
            }}
        """
        return self.config.metadata["optimizer"]
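
# --- Usage sketch (illustrative only) -----------------------------------------
# A minimal, hypothetical example of the save/load flow of the optimizer Manager
# above. It assumes the class is exposed as `OptimizerManager` together with an
# `OptimizerConfig` (consistent with the `get_metadata` docstring example); the
# import paths, config fields, and helper name are assumptions, not verified API.


def _example_optimizer_roundtrip():
    import torch

    from matorage import OptimizerConfig  # assumed import path
    from matorage.torch import OptimizerManager  # assumed import path

    optimizer_config = OptimizerConfig(
        endpoint="127.0.0.1:9000",
        access_key="minio",
        secret_key="miniosecretkey",
        optimizer_name="testoptimizer",
        additional={"version": "1.0.1"},
    )

    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Run one dummy step so the optimizer holds per-parameter state worth saving
    # (save() returns early when no step can be derived from the optimizer).
    model(torch.randn(4, 10)).sum().backward()
    optimizer.step()

    optimizer_manager = OptimizerManager(config=optimizer_config)
    optimizer_manager.save(optimizer)

    # `get_metadata` maps step -> optimizer metadata; reload the saved step.
    step = next(iter(optimizer_manager.get_metadata))
    optimizer_manager.load(optimizer, step=step)
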
class Manager(object): type = "model" def __init__(self, config, num_worker_threads=4, multipart_upload_size=5 * _MB): self.config = config self.num_worker_threads = num_worker_threads self.multipart_upload_size = multipart_upload_size self._client = ( Minio( endpoint=self.config.endpoint, access_key=self.config.access_key, secret_key=self.config.secret_key, secure=self.config.secure, ) if not check_nas(self.config.endpoint) else NAS(self.config.endpoint) ) self._uploader = Uploader( client=self._client, bucket=self.config.bucket_name, num_worker_threads=self.num_worker_threads, multipart_upload_size=self.multipart_upload_size, inmemory=True, ) def _uploader_closing(self): self._uploader.join_queue() _metadata_file = tempfile.mktemp("metadata.json") with open(_metadata_file, "w", encoding="utf-8") as writer: writer.write( json.dumps(self.config.metadata, indent=4, sort_keys=True) + "\n" ) self._client.fput_object( bucket_name=self.config.bucket_name, object_name="metadata.json", file_path=_metadata_file, ) os.remove(_metadata_file) def _save_with_clear(self, model_folder, model, overwrite=False): if overwrite: objects = self._client.list_objects( bucket_name=self.config.bucket_name, prefix=f"{model_folder}/" ) for obj in objects: self._client.remove_object( bucket_name=self.config.bucket_name, object_name=obj.object_name ) # saving model self._save_model(model_folder, model) self._uploader_closing() def _save_layer(self, model_folder, name, weight): _local_file = tempfile.mktemp(f"{name}.h5") _file = tables.open_file( _local_file, "w", driver="H5FD_CORE", driver_core_backing_store=False ) _file.create_carray( "/", self.type, obj=weight, filters=tables.Filters(**self.config.compressor) ) self._uploader.set_queue( local_file=_file.get_file_image(), remote_file=f"{model_folder}/{name}" ) _file.close() def save(self, model, **kwargs): if not self._client.bucket_exists(self.config.bucket_name): self._client.make_bucket(self.config.bucket_name) if not isinstance(kwargs, dict): metadata = 0 else: metadata = kwargs model_folder = self._hashmap_transfer(metadata) if model_folder in self.config.metadata["model"]: logger.warn( "{} {} is already exist, so model will be overwrited.".format( self.config.model_name, str(self.config.additional) ) ) self._save_with_clear(model_folder, model, overwrite=True) else: self.config.metadata["model"].update({model_folder: metadata}) self._save_with_clear(model_folder, model) logger.info("model with {} is saved".format(str(metadata))) def load(self, model, **kwargs): if not isinstance(kwargs, dict): metadata = 0 else: metadata = kwargs model_folder = self._hashmap_transfer(metadata) layers = self._client.list_objects( bucket_name=self.config.bucket_name, prefix=f"{model_folder}/", recursive=True, ) logger.info("model with {} is loaded".format(str(metadata))) return self._load_model(model_folder, layers, model) def _hashmap_transfer(self, metadata): """ Get unikey object folder with `metadata` of model. Returns: :obj: `str`: """ if isinstance(metadata, int): metadata = str(metadata) if not isinstance(metadata, str) and not isinstance(metadata, dict): raise ValueError( "metadata {} is empty or not str and dict type".format(metadata) ) key = json.dumps(metadata, indent=4, sort_keys=True) return hashlib.md5(key.encode("utf-8")).hexdigest() @property def get_metadata(self): """ Get all models according to metadata(ex. step, epoch) .. 
        .. code-block:: python

            >>> model_manager.save(model, step=100)
            >>> model_manager.save(model, step=200)
            >>> model_manager.get_metadata
            {
                'additional': {'version': '1.0.1'},
                'compressor': {'complevel': 0, 'complib': 'zlib'},
                'endpoint': '127.0.0.1:9000',
                'model': {
                    'ad44168f1343bc77b4d9ad6f1fef50b6': {'step': 100},
                    'af0677ecf0d15d17d10204be9ff2f9f5': {'step': 200}
                },
                'model_name': 'testmodel'
            }

        Returns:
            :obj:`dict`: model metadata
        """
        return self.config.metadata
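
# --- Usage sketch (illustrative only) -----------------------------------------
# A minimal, hypothetical example of saving and reloading a model with the
# Manager above. It assumes the class is exposed as `ModelManager` alongside a
# `ModelConfig` (consistent with the `model_manager` docstring example); the
# import paths, config fields, and helper name are assumptions, not verified API.


def _example_model_roundtrip():
    import torch

    from matorage import ModelConfig  # assumed import path
    from matorage.torch import ModelManager  # assumed import path

    model_config = ModelConfig(
        endpoint="127.0.0.1:9000",
        access_key="minio",
        secret_key="miniosecretkey",
        model_name="testmodel",
        additional={"version": "1.0.1"},
    )
    model_manager = ModelManager(config=model_config)

    model = torch.nn.Linear(10, 10)

    # Each distinct set of keyword metadata (here `step`) is hashed into its own
    # object folder, so different steps never overwrite each other.
    model_manager.save(model, step=100)
    model_manager.save(model, step=200)

    # Load back the weights stored for step 200.
    model_manager.load(model, step=200)
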
class DataSaver(object): """ This class must be created independently for the process. The independent process uses multiple threads to upload to storage and generates unique metadata information when upload is complete. Update the file, push the upload queue if it exceeds a certain size, close the file, and create a new file. After saving, you should disconnect the data saver. To make This procedure easier to understand, the following is written in the pseudo-code. .. code-block:: per_one_batch_data_size = array_size // num_batch per_one_file_batch_size = max_object_size // per_one_batch_data_size for batch_idx in range(num_batch): if get_current_stored_batch_size() < per_one_file_batch_size: file.append(data[batch_idx]) else: file_closing() new_file is opened new_file.append(data[batch_idx]) All files are closed. Note: - Deep Learning Framework Type : All(pure python is also possible) - **All processes should call the constructors of this class independently.** - After data save is over, you must disconnect through the disconnect function. Args: config (:obj:`matorage.DataConfig`, **require**): A DataConfig instance object multipart_upload_size (:obj:`integer`, optional, defaults to `5 * 1024 * 1024`): size of the incompletely uploaded object. You can sync files faster with `multipart upload in MinIO. <https://github.com/minio/minio-py/blob/master/minio/api.py#L1795>`_ This is because MinIO clients use multi-threading, which improves IO speed more efficiently regardless of Python's Global Interpreter Lock(GIL). num_worker_threads (:obj:`integer`, optional, defaults to 4): number of backend storage worker to upload or download. inmemory (:obj:`boolean`, optional, defaults to `False`): If you use this value as `True`, then you can use `HDF5_CORE driver <https://support.hdfgroup.org/HDF5/doc/TechNotes/VFL.html#TOC1>`_ so the temporary file for uploading or downloading to backend storage, such as MinIO, is not stored on disk but is in the memory. Keep in mind that using memory is fast because it doesn't use disk IO, but it's not always good. If default option(False), then `HDF5_SEC2` driver will be used on posix OS(or `HDF5_WINDOWS` in Windows). refresh (:obj:`boolean`, optional, defaults to `False`): All existing data is erased and overwritten. 
    Single process example

    Examples::

        import numpy as np
        from tqdm import tqdm
        from matorage import DataConfig, DataSaver

        data_config = DataConfig(
            endpoint='127.0.0.1:9000',
            access_key='minio',
            secret_key='miniosecretkey',
            dataset_name='array_test',
            attributes=[
                ('array', 'uint8', (3, 224, 224)),
            ]
        )

        data_saver = DataSaver(config=data_config)
        row = 100
        data = np.random.rand(64, 3, 224, 224)
        for _ in tqdm(range(row)):
            data_saver({
                'array': data
            })
        data_saver.disconnect()
    """

    def __init__(
        self,
        config,
        multipart_upload_size=5 * _MB,
        num_worker_threads=4,
        inmemory=False,
        refresh=False,
    ):
        self.config = config

        # Storage configuration
        self.multipart_upload_size = multipart_upload_size
        self.num_worker_threads = num_worker_threads

        # HDF5 configuration
        self.inmemory = inmemory
        self.filter = tb.Filters(**config.compressor)

        self._filelist = []
        self._file, self._earray = self._get_newfile()
        self._disconnected = False

        self._client = (
            Minio(
                endpoint=self.config.endpoint,
                access_key=self.config.access_key,
                secret_key=self.config.secret_key,
                secure=self.config.secure,
                region=self.config.region,
            )
            if not check_nas(self.config.endpoint)
            else NAS(self.config.endpoint)
        )
        self._check_and_create_bucket(refresh=refresh)

        self._uploader = Uploader(
            client=self._client,
            bucket=self.config.bucket_name,
            num_worker_threads=self.num_worker_threads,
            multipart_upload_size=self.multipart_upload_size,
            inmemory=self.inmemory,
        )

        atexit.register(self._exit)

    def _append_file(self):
        """
        Upload files to keys called ``<bucket_name>/key``.
        The appended data is of ``Dict[str, str]`` type,
        **where each `value` is a file path of `str` type**.

        Example::

            {
                'key': 'value.txt',
            }
        """
        for key, filepath in self._datas.items():
            self._uploader.set_queue(
                local_file=filepath, remote_file=key,
            )
            self.config.set_files(key)

    def _append_numpy(self):
        """
        Append numpy arrays to the `name` nodes.
        The appended data is of ``Dict[str, numpy.ndarray]`` type,
        **where each `value` is a `numpy.ndarray` with (B, *) shape, B being the batch size**.

        Example::

            {
                'image': np.random.rand(16, 28, 28),
                'target': np.random.rand(16)
            }
        """
        array_size = self._get_array_size()
        bzs = list(self._datas.values())[0].shape[0]

        per_one_batch_data_size = array_size // bzs
        per_one_file_batch_size = max(
            1, self.config.max_object_size // per_one_batch_data_size
        )

        for batch_idx in range(bzs):
            if self._get_current_stored_batch_size() < per_one_file_batch_size:
                for name, array in self._datas.items():
                    self._earray[name].append(array[batch_idx, None])
            else:
                # The current file is full: close and upload it, then keep
                # appending into a fresh file.
                self._file_closing()
                self._file, self._earray = self._get_newfile()

                for name, array in self._datas.items():
                    self._earray[name].append(array[batch_idx, None])

    def _check_and_create_bucket(self, refresh):
        if not self._client.bucket_exists(self.config.bucket_name):
            self._client.make_bucket(
                self.config.bucket_name, location=self.config.region
            )
        elif refresh:
            objects = self._client.list_objects(
                self.config.bucket_name, recursive=True
            )
            for obj in objects:
                self._client.remove_object(self.config.bucket_name, obj.object_name)

    def _check_attr_name(self, name):
        """
        Check that the attribute name exists.
        """
        if name not in self._earray.keys():
            raise KeyError("attribute name {} does not exist!".format(name))

    def _check_data_filetype(self):
        """
        Check data of file type.
        """
        if not isinstance(self._datas, dict):
            raise TypeError("datas should be a dict type.", self.__call__.__doc__)

        for key, filepath in self._datas.items():
            if not os.path.exists(filepath):
                raise FileNotFoundError("{} is not found".format(filepath))

    def _check_data_numpytype(self):
        """
        Check data of numpy array type.
        """
        if not isinstance(self._datas, dict):
            raise TypeError("datas should be a dict type.", self.__call__.__doc__)

        bzs = 0
        for name, array in self._datas.items():
            self._check_attr_name(name=name)

            # Convert TensorFlow/PyTorch tensors to numpy arrays if needed.
            if is_tf_available() and not isinstance(array, np.ndarray):
                array = array.numpy()
            if is_torch_available() and not isinstance(array, np.ndarray):
                array = array.numpy()
            assert isinstance(array, np.ndarray), "array type is not `numpy.ndarray`"

            if bzs:
                if bzs != array.shape[0]:
                    raise ValueError(
                        "batch sizes of the arrays in datas are not the same."
                    )
            else:
                bzs = array.shape[0]

            # This reshape converts the array into a (B, *) shape.
            # The shape is lowered to two contiguous dimensions, enabling IO operations to run very quickly.
            # https://www.slideshare.net/HDFEOS/caching-and-buffering-in-hdf5#25
            if len(array.shape) == 1:
                # this array is the ground truth
                array = array.reshape(-1, 1)
            self._datas[name] = array.reshape(
                -1, reduce(lambda x, y: x * y, array.shape[1:])
            )

    def __call__(self, datas, filetype=False):
        """
        Args:
            datas (:obj:`Dict[str, numpy.ndarray]` or :obj:`Dict[str, str]`, **require**):
                If `filetype` is false, `datas` is of `Dict[str, numpy.ndarray]` type,
                **where each `value` is a `numpy.ndarray` with (B, *) shape, B being the batch size**.
                If true, `datas` is of `Dict[str, str]` type,
                **where each `value` is a file path of `str` type**.
            filetype (:obj:`boolean`, optional):
                Indicates whether the data to be added to this bucket is a simple file type.

        Examples::

            data_saver = DataSaver(config=data_config)
            data_saver({
                'image': np.random.rand(16, 28, 28),
                'target': np.random.rand(16)
            })

        When used as shown below, filetype data is saved with a key called ``<bucket_name>/raw_image``.
        Examples::

            data_saver = DataSaver(config=data_config)
            data_saver({
                'raw_image': 'test.jpg'
            }, filetype=True)
            print(data_config.get_filetype_list)
        """
        self._disconnected = False

        self._datas = datas
        if not filetype:
            self._check_data_numpytype()
            self._append_numpy()
        else:
            self._check_data_filetype()
            self._append_file()

    def _file_closing(self):
        _length = len(list(self._earray.values())[0])
        _last_index = self.config.get_length

        if not self.inmemory:
            self._file.close()
            self._uploader.set_queue(
                local_file=self._file.filename,
                remote_file=os.path.basename(self._filename),
            )
        else:
            self._uploader.set_queue(
                local_file=self._file.get_file_image(),
                remote_file=os.path.basename(self._filename),
            )
            self._file.close()

        # Set filename indexer
        _current_index = _last_index + _length
        self.config.set_indexer(
            {
                _current_index: {
                    "name": os.path.basename(self._filename),
                    "length": _length,
                }
            }
        )

    def _create_name(self, length=16):
        return tempfile.mktemp("{}.h5".format(uuid.uuid4().hex[:length]))

    def _exit(self):
        self._file.close()
        self._disconnected = True

    def _get_array_size(self):
        """
        Get the total size of all arrays.

        Returns:
            :obj:`integer`: datas size (bytes)
        """
        size = 0
        for name, array in self._datas.items():
            size += array.nbytes
        return size

    def _get_current_stored_batch_size(self):
        """
        Get the batch size currently stored in the file.

        Returns:
            :obj:`integer`: current stored batch size in an opened file.
        """
        return len(list(self._earray.values())[0])

    def _get_newfile(self):
        """
        Get a new file handle and its attributes.

        Returns:
            :obj:`tuple(tables.File, dict)`: the second item maps attribute names to
            pytables nodes::

                {
                    'name1': tables.EArray,
                    'name2': tables.EArray
                }
        """
        _driver, _driver_core_backing_store = self._set_driver()

        self._filename = self._create_name()
        self._filelist.append(self._filename)
        file = tb.open_file(
            self._filename,
            "a",
            driver=_driver,
            driver_core_backing_store=_driver_core_backing_store,
        )

        # create expandable arrays
        earray = {}
        for _earray in self.config.flatten_attributes:
            earray[_earray.name] = file.create_earray(
                file.root,
                _earray.name,
                _earray.type,
                shape=tuple([0]) + _earray.shape,
                filters=self.filter,
            )

        return (file, earray)

    def _get_size(self):
        if self.inmemory:
            return sys.getsizeof(self._file.get_file_image())
        else:
            return self._file.get_filesize()

    def _set_driver(self):
        """
        Set the HDF5 driver type.

        Returns:
            :obj:`tuple(str, bool)`: HDF5 driver name and whether the core driver's
            backing store is enabled.
        """
        if self.inmemory:
            return "H5FD_CORE", False
        else:
            if os.name == "posix":
                return "H5FD_SEC2", True
            elif os.name == "nt":
                return "H5FD_WINDOWS", True
            else:
                raise ValueError("{} OS not supported!".format(os.name))

    @property
    def get_downloaded_dataset(self):
        """
        Get local paths of the dataset files in local storage.

        Returns:
            :obj:`list`: local paths of the dataset files
        """
        return self._filelist

    def disconnect(self):
        """
        Disconnect the data saver: close all opened files and upload them to backend
        storage. Must be called after the ``data_saver`` call to store data safely.

        Examples::

            data_saver = DataSaver(config=data_config)
            data_saver({
                'image': np.random.rand(16, 28, 28),
                'target': np.random.rand(16)
            })
            data_saver.disconnect()
        """
        self._file_closing()
        self._uploader.join_queue()

        # metadata set
        key = uuid.uuid4().hex[:16]
        _metadata_file = tempfile.mktemp(f"{key}.json")
        self.config.metadata.to_json_file(_metadata_file)
        self._client.fput_object(
            self.config.bucket_name, f"metadata/{key}.json", _metadata_file
        )
        os.remove(_metadata_file)

    @property
    def get_disconnected(self):
        return self._disconnected
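
# --- Usage sketch (illustrative only) -----------------------------------------
# A minimal, hypothetical example of the two `__call__` paths described above:
# numpy batches go through the HDF5 chunking path, raw files (`filetype=True`)
# are queued for upload under their dict keys, and `disconnect()` flushes
# everything and writes the metadata object. The config values and the local
# file name are illustrative assumptions; only `DataConfig`/`DataSaver` come
# from the docstring example.


def _example_datasaver():
    import numpy as np

    from matorage import DataConfig, DataSaver

    data_config = DataConfig(
        endpoint="127.0.0.1:9000",
        access_key="minio",
        secret_key="miniosecretkey",
        dataset_name="array_test",
        attributes=[
            ("array", "uint8", (3, 224, 224)),
        ],
    )
    data_saver = DataSaver(config=data_config)

    # Numpy batches are appended into HDF5 files and split automatically once a
    # file exceeds the configured max object size.
    data = np.random.randint(0, 255, (64, 3, 224, 224), dtype="uint8")
    data_saver({"array": data})

    # A raw file (assumed to exist locally) is uploaded as-is under the key
    # `<bucket_name>/raw_image`.
    data_saver({"raw_image": "test.jpg"}, filetype=True)

    # Close and upload all pending files, then store the metadata object.
    data_saver.disconnect()
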