def create_mock_dataset2(self):
    with ZipFile("mock.zip", "w") as zipf:
        zipf.writestr("mock.gif", MOCK_IMAGE)
    MINIO_CLIENT.fput_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.zip/mock.zip",
        file_path="mock.zip",
    )
    metadata = {
        "filename": "mock.zip",
    }
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.zip/mock.zip.metadata",
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
def create_mock_dataset3(self):
    with open("mock.jpg", 'wb') as imagef:
        imagef.write(MOCK_IMAGE)
    MINIO_CLIENT.fput_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.jpg/mock.jpg",
        file_path="mock.jpg",
    )
    metadata = {
        "filename": "mock.jpg",
    }
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.jpg/mock.jpg.metadata",
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
def save_dataset(name: str,
                 data: Union[pd.DataFrame, BinaryIO] = None,
                 df: pd.DataFrame = None,
                 metadata: Optional[Dict[str, str]] = None,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None):
    """Saves a dataset and its metadata to the object storage.

    Args:
        name (str): the dataset name.
        data (pandas.DataFrame, BinaryIO, optional): the dataset contents as a
            `pandas.DataFrame` or a `BinaryIO` buffer. Defaults to None.
        df (pandas.DataFrame, optional): the dataset contents as a
            `pandas.DataFrame`. `df` exists only for compatibility with existing
            components; use `data` for all types of datasets. Defaults to None.
        metadata (dict, optional): metadata about the dataset. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Raises:
        PermissionError: if the dataset is read-only.
    """
    # ensures the MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    if operator_id is None:
        # gets operator_id from env variables
        # Attention: returns None if env is unset
        operator_id = get_operator_id(raise_for_none=False)

    # df exists only for compatibility with existing components
    # from now on, use "data" for all types of datasets
    if df is not None:
        data = df

    try:
        # gets metadata (if the dataset exists)
        stored_metadata = stat_dataset(name, run_id)
        metadata_should_be_updated = False

        # updates stored metadata values
        if metadata:
            stored_metadata.update(metadata)
        elif isinstance(data, pd.DataFrame):
            metadata_should_be_updated = True

        metadata = stored_metadata
    except FileNotFoundError:
        metadata_should_be_updated = False

    # builds metadata dict:
    # sets filename and run_id
    if metadata is None:
        metadata = {}

    metadata["filename"] = name

    if isinstance(data, pd.DataFrame):
        # sets metadata specific to pandas.DataFrame:
        # columns, featuretypes
        metadata["columns"] = data.columns.tolist()
        metadata["total"] = len(data.index)

        if "featuretypes" not in metadata:
            metadata["featuretypes"] = infer_featuretypes(data)

        # if the metadata was given (set manually), ignore updates; otherwise
        # search for changes and update featuretypes to stay consistent with columns
        if metadata_should_be_updated:
            previous_metadata = stat_dataset(name, run_id)
            previous_columns = previous_metadata["columns"]
            previous_featuretypes = previous_metadata["featuretypes"]
            column_to_type = dict(zip(previous_columns, previous_featuretypes))

            new_featuretypes = []
            for new_column in metadata["columns"]:
                if new_column in column_to_type:
                    new_featuretypes.append(column_to_type[new_column])
                else:
                    new_featuretypes.append(
                        infer_featuretypes(pd.DataFrame(data[new_column]))[0])

            metadata["featuretypes"] = new_featuretypes

    if run_id:
        metadata["run_id"] = run_id

        # When saving a dataset of a run, also
        # set the run_id in datasets/<name>.metadata
        # This enables load_dataset by run="latest"
        try:
            root_metadata = stat_dataset(name, "root")
        except FileNotFoundError:
            root_metadata = {}

        root_metadata["run_id"] = run_id
        object_name = _metadata_filepath(name)
        # encodes metadata to JSON format
        buffer = BytesIO(dumps(root_metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

        # creates a run metadata that stores the last operator id,
        # so the dataset can be loaded on the next step of the pipeline flow
        metadata["operator_id"] = operator_id
        object_name = _metadata_filepath(name, run_id=run_id)
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    path = _data_filepath(name, run_id, operator_id)

    if isinstance(data, pd.DataFrame):
        # uploads dataframe to MinIO as a .csv file
        temp_file = tempfile.NamedTemporaryFile(dir='.', delete=False)
        data.to_csv(temp_file.name, header=True, index=False)
        MINIO_CLIENT.fput_object(
            bucket_name=BUCKET_NAME,
            object_name=path.lstrip(f"{BUCKET_NAME}/"),
            file_path=temp_file.name,
        )
        temp_file.close()
        os.remove(temp_file.name)
    else:
        # uploads raw data to MinIO
        buffer = BytesIO(data.read())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=path.lstrip(f"{BUCKET_NAME}/"),
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = _metadata_filepath(name, run_id, operator_id)
    # encodes metadata to JSON format
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
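# Usage sketch (illustrative only, not part of the original module): shows how
# `save_dataset` above can be called with either a pandas.DataFrame or a raw
# binary buffer. The dataset names and values here are made-up examples, and the
# call relies on the module-level MINIO_CLIENT/BUCKET_NAME configuration being
# in place; actual object paths and inferred featuretypes may differ per setup.
def _example_save_dataset_usage():
    example_df = pd.DataFrame({"sepal_length": [5.1, 4.9],
                               "species": ["setosa", "setosa"]})

    # DataFrame input: columns, total and featuretypes are filled in automatically
    save_dataset(name="iris_example", data=example_df)

    # Binary input: any readable buffer is stored as-is under the bucket's
    # datasets/ prefix, alongside its JSON metadata
    save_dataset(name="mock_example.bin", data=BytesIO(b"example-bytes"))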