Example #1
    def create_mock_dataset2(self):
        # writes a single mock image into a local zip archive
        with ZipFile("mock.zip", "w") as zipf:
            zipf.writestr("mock.gif", MOCK_IMAGE)

        # uploads the archive to MinIO under datasets/<name>/<name>
        MINIO_CLIENT.fput_object(
            bucket_name=BUCKET_NAME,
            object_name="datasets/mock.zip/mock.zip",
            file_path="mock.zip",
        )
        # stores the dataset metadata as a JSON object next to the file
        metadata = {
            "filename": "mock.zip",
        }
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name="datasets/mock.zip/mock.zip.metadata",
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )
Example #2
    def create_mock_dataset3(self):
        # writes the raw mock image bytes to a local file
        with open("mock.jpg", 'wb') as imagef:
            imagef.write(MOCK_IMAGE)

        # uploads the image to MinIO under datasets/<name>/<name>
        MINIO_CLIENT.fput_object(
            bucket_name=BUCKET_NAME,
            object_name="datasets/mock.jpg/mock.jpg",
            file_path="mock.jpg",
        )
        # stores the dataset metadata as a JSON object next to the file
        metadata = {
            "filename": "mock.jpg",
        }
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name="datasets/mock.jpg/mock.jpg.metadata",
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )
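Both mock helpers above rely on module-level names (`MOCK_IMAGE`, `BUCKET_NAME`, `MINIO_CLIENT`) and imports that the snippets do not show. The following is a minimal sketch of that setup, assuming a local MinIO server and placeholder values; the actual test module may define these differently:

from io import BytesIO
from json import dumps
from zipfile import ZipFile

from minio import Minio

# Placeholder values (assumptions for illustration only)
BUCKET_NAME = "anonymous"       # target MinIO bucket used by the helpers
MOCK_IMAGE = b"GIF89a"          # a few bytes standing in for real image content

MINIO_CLIENT = Minio(
    "localhost:9000",
    access_key="minioadmin",
    secret_key="minioadmin",
    secure=False,
)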
Example #3
def save_dataset(name: str,
                 data: Union[pd.DataFrame, BinaryIO] = None,
                 df: pd.DataFrame = None,
                 metadata: Optional[Dict[str, str]] = None,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None):
    """Saves a dataset and its metadata to the object storage.

    Args:
        name (str): the dataset name.
        data (pandas.DataFrame, BinaryIO, optional): the dataset contents as a
            pandas.DataFrame or a `BinaryIO` buffer. Defaults to None.
        df (pandas.DataFrame, optional): the dataset contents as a `pandas.DataFrame`.
            df exists only for compatibility with existing components.
            Use "data" for all types of datasets. Defaults to None.
        metadata (dict, optional): metadata about the dataset. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Raises:
        PermissionError: If the dataset is read-only.
    """
    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    if operator_id is None:
        # gets operator_id from env variables
        # Attention: returns None if env is unset
        operator_id = get_operator_id(raise_for_none=False)

    # df exists only for compatibility with existing components
    # from now on one must use "data" for all types of datasets
    if df is not None:
        data = df

    try:
        # gets metadata (if dataset exists)
        stored_metadata = stat_dataset(name, run_id)
        metadata_should_be_updated = False

        # update stored metadata values
        if metadata:
            stored_metadata.update(metadata)
        elif isinstance(data, pd.DataFrame):
            metadata_should_be_updated = True

        metadata = stored_metadata
    except FileNotFoundError:
        metadata_should_be_updated = False

    # builds metadata dict:
    # sets filename and run_id
    if metadata is None:
        metadata = {}

    metadata["filename"] = name

    if isinstance(data, pd.DataFrame):
        # sets metadata specific for pandas.DataFrame:
        # columns, featuretypes
        metadata["columns"] = data.columns.tolist()
        metadata["total"] = len(data.index)

        if "featuretypes" not in metadata:
            metadata["featuretypes"] = infer_featuretypes(data)

    # if the metadata was given explicitly, keep it as-is; otherwise
    # detect column changes and update featuretypes to match the current columns
    if metadata_should_be_updated:
        previous_metadata = stat_dataset(name, run_id)
        previous_columns = previous_metadata["columns"]
        previous_featuretypes = previous_metadata["featuretypes"]
        column_to_type = dict(zip(previous_columns, previous_featuretypes))

        new_featuretypes = []
        for new_column in metadata["columns"]:
            if new_column in column_to_type:
                new_featuretypes.append(column_to_type[new_column])
            else:
                new_featuretypes.append(
                    infer_featuretypes(pd.DataFrame(data[new_column]))[0])

        metadata["featuretypes"] = new_featuretypes

    if run_id:
        metadata["run_id"] = run_id

        # When saving a dataset within a run, also
        # set the run_id in datasets/<name>.metadata
        # This enables load_dataset with run="latest"
        try:
            root_metadata = stat_dataset(name, "root")
        except FileNotFoundError:
            root_metadata = {}

        root_metadata["run_id"] = run_id
        object_name = _metadata_filepath(name)
        # encodes metadata to JSON format
        buffer = BytesIO(dumps(root_metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

        # create run metadata recording the last operator id
        # so the dataset can be loaded in the next step of the pipeline flow
        metadata["operator_id"] = operator_id
        object_name = _metadata_filepath(name, run_id=run_id)
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    path = _data_filepath(name, run_id, operator_id)

    if isinstance(data, pd.DataFrame):
        # uploads dataframe to MinIO as a .csv file
        temp_file = tempfile.NamedTemporaryFile(dir='.', delete=False)
        data.to_csv(temp_file.name, header=True, index=False)
        # strips the "<bucket>/" prefix from the path to get the object name
        MINIO_CLIENT.fput_object(bucket_name=BUCKET_NAME,
                                 object_name=path[len(f"{BUCKET_NAME}/"):],
                                 file_path=temp_file.name)
        temp_file.close()
        os.remove(temp_file.name)
    else:
        # uploads raw data to MinIO
        buffer = BytesIO(data.read())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=path[len(f"{BUCKET_NAME}/"):],
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = _metadata_filepath(name, run_id, operator_id)
    # encodes metadata to JSON format
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
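For context, here is a minimal usage sketch of `save_dataset` as documented above, assuming it is importable from the surrounding module (the import path and the example values are assumptions, not part of the original code):

import pandas as pd

# Hypothetical import; adjust to wherever save_dataset lives in this codebase.
# from mypackage.datasets import save_dataset

# tabular datasets: a pandas.DataFrame is stored as a .csv file plus JSON metadata
df = pd.DataFrame({"sepal_length": [5.1, 4.9], "species": ["setosa", "setosa"]})
save_dataset(name="iris", data=df, metadata={"source": "example"})

# binary datasets: any BinaryIO buffer is stored as-is
with open("mock.zip", "rb") as f:
    save_dataset(name="mock.zip", data=f)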