def setUp(self):
    """Prepare the fixture: create the bucket and upload one small object.

    Calls ``self.make_bucket()`` and then stores the bytes ``b"mock"``
    under the key ``artifacts/mock.txt`` in ``BUCKET_NAME``.
    """
    self.make_bucket()

    payload = io.BytesIO(b"mock")
    # put_object needs the exact payload size in bytes
    payload_size = payload.getbuffer().nbytes

    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="artifacts/mock.txt",
        data=payload,
        length=payload_size,
    )
def create_mock_figure(self):
    """Upload a tiny SVG document to the bucket as a mock figure.

    The object is stored under
    ``experiments/test/operators/test/figure-123456.svg``.
    """
    svg_markup = (
        b'<svg viewBox=\'0 0 125 80\' xmlns=\'http://www.w3.org/2000/svg\'>\n'
        b'<text y="75" font-size="100" font-family="serif"><![CDATA[10]]></text>\n'
        b'</svg>\n'
    )
    svg_stream = BytesIO(svg_markup)
    # put_object needs the exact payload size in bytes
    svg_size = svg_stream.getbuffer().nbytes

    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="experiments/test/operators/test/figure-123456.svg",
        data=svg_stream,
        length=svg_size,
    )
def create_mock_model(self):
    """Serialize a ``MockModel`` and upload it as ``model.joblib``.

    The uploaded payload is the dict ``{"model": MockModel()}`` written
    with ``dump`` into an in-memory buffer; the object key is built from
    ``EXPERIMENT_ID`` and ``OPERATOR_ID``.
    """
    serialized = BytesIO()
    dump({"model": MockModel()}, serialized)
    # rewind so the upload starts reading at the first byte
    serialized.seek(0, SEEK_SET)

    destination = f"experiments/{EXPERIMENT_ID}/operators/{OPERATOR_ID}/model.joblib"
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=destination,
        data=serialized,
        length=serialized.getbuffer().nbytes,
    )
def create_mock_dataset3(self):
    """Create a mock image dataset (``mock.jpg``) plus its metadata object.

    Writes ``MOCK_IMAGE`` to a local file named ``mock.jpg``, uploads that
    file, and then uploads a JSON metadata object containing only the
    filename.
    """
    # the image is written to the working directory and uploaded from there
    with open("mock.jpg", 'wb') as image_file:
        image_file.write(MOCK_IMAGE)

    MINIO_CLIENT.fput_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.jpg/mock.jpg",
        file_path="mock.jpg",
    )

    encoded_metadata = dumps({"filename": "mock.jpg"}).encode()
    metadata_stream = BytesIO(encoded_metadata)
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.jpg/mock.jpg.metadata",
        data=metadata_stream,
        length=metadata_stream.getbuffer().nbytes,
    )
def create_mock_dataset2(self):
    """Create a mock zip dataset (``mock.zip``) plus its metadata object.

    Builds a local archive ``mock.zip`` holding a single member
    ``mock.gif`` (the bytes of ``MOCK_IMAGE``), uploads the archive, and
    then uploads a JSON metadata object containing only the filename.
    """
    # the archive is written to the working directory and uploaded from there
    with ZipFile("mock.zip", "w") as archive:
        archive.writestr("mock.gif", MOCK_IMAGE)

    MINIO_CLIENT.fput_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.zip/mock.zip",
        file_path="mock.zip",
    )

    encoded_metadata = dumps({"filename": "mock.zip"}).encode()
    metadata_stream = BytesIO(encoded_metadata)
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.zip/mock.zip.metadata",
        data=metadata_stream,
        length=metadata_stream.getbuffer().nbytes,
    )
def update_dataset_metadata(name: str,
                            metadata: Dict[str, str],
                            run_id: Optional[str] = None,
                            operator_id: Optional[str] = None):
    """Overwrite the stored metadata object of a dataset.

    The metadata dict is serialized to JSON and uploaded to the key
    returned by ``_metadata_filepath(name, run_id, operator_id)``.

    Args:
        name (str): the dataset name.
        metadata (dict): metadata about the dataset.
        run_id (str, optional): the run id of the training pipeline.
            Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
    """
    target_key = _metadata_filepath(name, run_id, operator_id)

    # JSON-encode the metadata into an in-memory stream for the upload
    encoded = dumps(metadata).encode()
    stream = BytesIO(encoded)

    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=target_key,
        data=stream,
        length=stream.getbuffer().nbytes,
    )
def create_mock_dataset1(self, size=1e2):
    """Create a mock CSV dataset, its metadata, and per-run copies of both.

    The CSV has one header line built from ``self.mock_columns()`` and
    ``int(size)`` data rows, each built from a fresh call to
    ``self.mock_values()``. After uploading the CSV and its metadata, both
    objects are copied under the ``runs/{RUN_ID}/operators/{OPERATOR_ID}``
    prefix.

    Args:
        size (number, optional): number of data rows; converted with
            ``int``. Defaults to 1e2.
    """
    header_line = ",".join(self.mock_columns()) + "\n"
    # mock_values() is called once per row
    row_lines = [
        ",".join(map(str, self.mock_values()))
        for _ in range(int(size))
    ]
    csv_stream = BytesIO((header_line + "\n".join(row_lines)).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.csv/mock.csv",
        data=csv_stream,
        length=csv_stream.getbuffer().nbytes,
    )

    metadata = {
        "columns": self.mock_columns(),
        "featuretypes": self.mock_featuretypes(),
        "filename": "mock.csv",
        "run_id": RUN_ID,
    }
    metadata_stream = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.csv/mock.csv.metadata",
        data=metadata_stream,
        length=metadata_stream.getbuffer().nbytes,
    )

    # (destination key, source path) pairs: data first, then metadata
    copies = (
        (
            f"datasets/mock.csv/runs/{RUN_ID}/operators/{OPERATOR_ID}/mock.csv/mock.csv",
            f"/{BUCKET_NAME}/datasets/mock.csv/mock.csv",
        ),
        (
            f"datasets/mock.csv/runs/{RUN_ID}/operators/{OPERATOR_ID}/mock.csv/mock.csv.metadata",
            f"/{BUCKET_NAME}/datasets/mock.csv/mock.csv.metadata",
        ),
    )
    for destination, source in copies:
        MINIO_CLIENT.copy_object(
            bucket_name=BUCKET_NAME,
            object_name=destination,
            object_source=source,
        )
def save_model(**kwargs):
    """Serialize the given keyword arguments and upload them as a model file.

    The whole ``kwargs`` dict is written with ``dump``, so ``experiment_id``
    and ``operator_id`` are part of the serialized payload when they are
    passed in.

    Args:
        **kwargs: the models as keyword arguments. ``experiment_id`` and
            ``operator_id`` are also read from here; when either is missing
            or None it is obtained from ``get_experiment_id()`` /
            ``get_operator_id()``.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
            (NOTE(review): both would be raised by the id getters, which are
            not visible here — confirm.)
    """
    experiment_id = kwargs.get("experiment_id")
    operator_id = kwargs.get("operator_id")

    # fall back to the getters only for ids that were not supplied
    if experiment_id is None:
        experiment_id = get_experiment_id()
    if operator_id is None:
        operator_id = get_operator_id()

    destination = f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/{MODEL_FILE}"

    serialized = BytesIO()
    dump(kwargs, serialized)
    # rewind so the upload starts reading at the first byte
    serialized.seek(0, SEEK_SET)

    # the bucket must exist before the upload
    make_bucket(BUCKET_NAME)

    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=destination,
        data=serialized,
        length=serialized.getbuffer().nbytes,
    )
def save_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None,
                 **kwargs):
    """Saves metrics of an experiment to the object storage.

    New metrics are appended to the metrics already stored for the same
    experiment/operator/run (if any) and the combined list is uploaded
    again as JSON.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None
        operator_id (str, optional): the operator uuid. Defaults to None
        run_id (str, optional): the run id. Defaults to None.
        **kwargs: the metrics dict.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id:
        metadata = {}
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            if run_id == "latest":
                # resolve "latest" to the run id recorded in the metadata
                run_id = metadata.get("run_id")
        except FileNotFoundError:
            # no metadata stored yet: start from an empty dict
            pass
        metadata["run_id"] = run_id

        # encodes metadata to JSON format and uploads to MinIO
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=f'experiments/{experiment_id}/operators/{operator_id}/.metadata',
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)

    encoded_metrics = []

    # retrieves the metrics saved previously
    try:
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
        try:
            encoded_metrics = loads(data.read())
        finally:
            # always hand the HTTP connection back to the pool, even when
            # reading or decoding fails
            data.close()
            data.release_conn()
    except NoSuchKey:
        # nothing saved yet for this target: keep the empty list
        pass

    # appends new metrics
    encoded_metrics.extend(_encode_metrics(kwargs))

    # puts metrics into buffer
    buffer = BytesIO(dumps(encoded_metrics).encode())

    # uploads metrics to MinIO
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
def save_dataset(name: str,
                 data: Union[pd.DataFrame, BinaryIO] = None,
                 df: pd.DataFrame = None,
                 metadata: Optional[Dict[str, str]] = None,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None):
    """Saves a dataset and its metadata to the object storage.

    Args:
        name (str): the dataset name.
        data (pandas.DataFrame, BinaryIO, optional): the dataset contents as a
            pandas.DataFrame or an `BinaryIO` buffer. Defaults to None.
        df (pandas.DataFrame, optional): the dataset contents as an `pandas.DataFrame`.
            df exists only for compatibility with existing components.
            Use "data" for all types of datasets. Defaults to None.
        metadata (dict, optional): metadata about the dataset. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Raises:
        PermissionError: If dataset was read only.
            (NOTE(review): not raised directly in this function — presumably
            comes from a helper; confirm.)
    """
    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    if operator_id is None:
        # gets operator_id from env variables
        # Attention: returns None if env is unset
        operator_id = get_operator_id(raise_for_none=False)

    # df exists only for compatibility with existing components
    # from now on one must use "data" for all types of datasets
    if df is not None:
        data = df

    try:
        # gets metadata (if dataset exists)
        stored_metadata = stat_dataset(name, run_id)
        metadata_should_be_updated = False

        # update stored metadata values
        if metadata:
            stored_metadata.update(metadata)
        elif isinstance(data, pd.DataFrame):
            metadata_should_be_updated = True

        metadata = stored_metadata
    except FileNotFoundError:
        metadata_should_be_updated = False

    # builds metadata dict:
    # sets filename and run_id
    if metadata is None:
        metadata = {}

    metadata["filename"] = name

    if isinstance(data, pd.DataFrame):
        # sets metadata specific for pandas.DataFrame:
        # columns, featuretypes
        metadata["columns"] = data.columns.tolist()
        metadata["total"] = len(data.index)

        if "featuretypes" not in metadata:
            metadata["featuretypes"] = infer_featuretypes(data)

        # if the metadata was given (set manually), ignore updates, otherwise
        # search for changes and then update current featuretypes to be even with columns
        if metadata_should_be_updated:
            # re-read the stored metadata: the dict fetched above has already
            # been mutated with the new columns
            previous_metadata = stat_dataset(name, run_id)
            previous_columns = previous_metadata["columns"]
            previous_featuretypes = previous_metadata["featuretypes"]
            column_to_type = dict(zip(previous_columns, previous_featuretypes))

            new_featuretypes = []
            for new_column in metadata["columns"]:
                if new_column in column_to_type:
                    new_featuretypes.append(column_to_type[new_column])
                else:
                    new_featuretypes.append(
                        infer_featuretypes(pd.DataFrame(data[new_column]))[0])

            metadata["featuretypes"] = new_featuretypes

    if run_id:
        metadata["run_id"] = run_id

        # When saving a dataset of a run, also
        # set the run_id in datasets/<name>.metadata
        # This enables load_dataset by run="latest"
        try:
            root_metadata = stat_dataset(name, "root")
        except FileNotFoundError:
            root_metadata = {}

        root_metadata["run_id"] = run_id
        object_name = _metadata_filepath(name)
        # encodes metadata to JSON format
        buffer = BytesIO(dumps(root_metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

        # create a run metadata to save the last operator id
        # to dataset get loaded on next step of the pipeline flow
        metadata["operator_id"] = operator_id
        object_name = _metadata_filepath(name, run_id=run_id)
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    path = _data_filepath(name, run_id, operator_id)

    # Drop the leading "<bucket>/" from the path to get the object key.
    # str.lstrip() must not be used for this: it strips a *set of characters*,
    # so it would also eat the beginning of the key whenever its first
    # characters happen to occur in the bucket name.
    bucket_prefix = f"{BUCKET_NAME}/"
    if path.startswith(bucket_prefix):
        data_object_name = path[len(bucket_prefix):]
    else:
        data_object_name = path

    if isinstance(data, pd.DataFrame):
        # uploads dataframe to MinIO as a .csv file
        temp_file = tempfile.NamedTemporaryFile(dir='.', delete=False)
        try:
            data.to_csv(temp_file.name, header=True, index=False)
            MINIO_CLIENT.fput_object(bucket_name=BUCKET_NAME,
                                     object_name=data_object_name,
                                     file_path=temp_file.name)
        finally:
            # delete=False means the file is never removed automatically:
            # clean it up even when writing or uploading fails
            temp_file.close()
            os.remove(temp_file.name)
    else:
        # uploads raw data to MinIO
        buffer = BytesIO(data.read())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=data_object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = _metadata_filepath(name, run_id, operator_id)
    # encodes metadata to JSON format
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )