Example #1
    def preview_pipeline(self,
                         pipeline: Pipeline,
                         limit: int = 50,
                         offset: int = 0) -> str:
        """
        Execute a pipeline but returns only a slice of the results, determined by `limit` and `offset` parameters, as JSON.

        Return format follows the 'table' JSON table schema used by pandas (see
        https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#orient-options), with a few addition related to
        pagination.

        Note: it's required to use pandas `to_json` methods, as it convert NaN and dates to an appropriate format.
        """
        df = self.execute_pipeline(pipeline)
        return json.dumps({
            'schema': build_table_schema(df, index=False),
            'offset': offset,
            'limit': limit,
            'total': df.shape[0],
            'data': json.loads(df[offset:offset + limit].to_json(orient='records')),
        })
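A minimal, self-contained sketch of the same pagination pattern (the toy DataFrame and the offset/limit values below are assumptions for illustration):

import json

import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({"city": ["Paris", "Lyon", "Nice"], "pop": [2.1, 0.5, 0.3]})
offset, limit = 1, 2
payload = json.dumps({
    "schema": build_table_schema(df, index=False),
    "offset": offset,
    "limit": limit,
    "total": df.shape[0],
    # to_json handles NaN and date serialization; the string is parsed back
    # so the records nest inside the outer JSON document as objects
    "data": json.loads(df[offset:offset + limit].to_json(orient="records")),
})
print(payload)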
Example #2
    def update_preview_fields_from_df(artifact,
                                      df,
                                      stats=None,
                                      preview_rows_length=None,
                                      ignore_preview_limits=False):
        preview_rows_length = preview_rows_length or default_preview_rows_length
        if hasattr(df, "dask"):
            # Dask frames are lazy: materialize the row count and a sampled
            # subset for the preview instead of pulling the whole frame
            artifact.length = df.shape[0].compute()
            preview_df = df.sample(frac=ddf_sample_pct).compute()
        else:
            artifact.length = df.shape[0]
            preview_df = df

        if artifact.length > preview_rows_length and not ignore_preview_limits:
            preview_df = df.head(preview_rows_length)
        preview_df = preview_df.reset_index()
        if (len(preview_df.columns) > max_preview_columns
                and not ignore_preview_limits):
            preview_df = preview_df.iloc[:, :max_preview_columns]
        artifact.header = preview_df.columns.values.tolist()
        artifact.preview = preview_df.values.tolist()
        artifact.schema = build_table_schema(preview_df)
        if (stats or (artifact.length < max_csv
                      and len(df.columns) < max_preview_columns)
                or ignore_preview_limits):
            artifact.stats = get_df_stats(df)
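The reset_index() call keeps the header, preview rows, and schema describing the same columns, because build_table_schema folds the index in by default. A small illustration (the frame below is an assumption):

import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({"a": [1, 2]}, index=pd.Index([10, 20], name="id"))
# By default the index becomes a schema field and the primary key, so a
# preview built from df.values alone would be missing the 'id' column.
print(build_table_schema(df)["primaryKey"])  # ['id']
print(df.reset_index().columns.tolist())     # ['id', 'a']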
Example #3
    def __init__(self,
                 key=None,
                 df=None,
                 preview=None,
                 format='',
                 stats=None,
                 target_path=None,
                 **kwargs):

        format = format.lower()
        super().__init__(key, None, format=format, target_path=target_path)
        if format and format not in supported_formats:
            raise ValueError('unsupported format {}, use one of {}'.format(
                format, '|'.join(supported_formats)))

        if format == 'pq':
            format = 'parquet'
        self.format = format
        self.stats = None

        if df is not None:
            self.header = df.columns.values.tolist()
            self.length = df.shape[0]
            preview = preview or preview_lines
            shortdf = df
            if self.length > preview:
                shortdf = df.head(preview)
            self.preview = shortdf.values.tolist()
            self.schema = build_table_schema(df)
            if stats or self.length < max_csv:
                self.stats = get_stats(df)

        self._df = df
        self._kw = kwargs
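preview_lines and max_csv are module-level limits not shown in this example; with assumed values, the trimming logic can be exercised standalone:

import pandas as pd
from pandas.io.json import build_table_schema

preview_lines, max_csv = 20, 10000  # assumed values for the unshown limits

df = pd.DataFrame({"v": range(100)})
shortdf = df.head(preview_lines) if df.shape[0] > preview_lines else df
preview = shortdf.values.tolist()  # only the first 20 rows are kept
schema = build_table_schema(df)    # the schema still describes the full frame
print(len(preview), [f["name"] for f in schema["fields"]])  # 20 ['index', 'v']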
Example #4
    def get_schema(cls, dataframe):
        schema = build_table_schema(dataframe)

        c = {}
        for x in schema['fields']:
            c[x['name']] = cls._translate_datatypes(x['type'])

        return c
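cls._translate_datatypes is not shown in this example; a hypothetical stand-in (the mapping below is an assumption, not the original helper) could translate Table Schema type names into NumPy dtype strings:

import pandas as pd
from pandas.io.json import build_table_schema

# Hypothetical replacement for the unshown cls._translate_datatypes:
# map Table Schema type names to NumPy dtype strings (assumed mapping).
_TYPE_MAP = {
    "integer": "int64",
    "number": "float64",
    "boolean": "bool",
    "datetime": "datetime64[ns]",
    "string": "object",
}

def get_schema(dataframe):
    schema = build_table_schema(dataframe, index=False)
    return {f["name"]: _TYPE_MAP.get(f["type"], "object") for f in schema["fields"]}

print(get_schema(pd.DataFrame({"x": [1], "y": [1.5]})))  # {'x': 'int64', 'y': 'float64'}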
Example #5
        def get_schema(csv):
            """
            Get schema for collected csv
            Args:
                csv: csv string file

            Returns:
                StructType with schemas
            """
            schemas = StructType()
            for field in build_table_schema(csv, index=False)["fields"]:
                # `field_type` avoids shadowing the builtin `type`
                field_type = (StringType() if field["type"] == "string"
                              else FloatType() if field["type"] == "number"
                              else IntegerType())
                schemas.add(StructField(field["name"], field_type, True))
            return schemas
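Despite its name, the `csv` argument must already be something build_table_schema accepts (a DataFrame or Series), so a caller would parse the CSV text first. A usage sketch under that assumption:

import io

import pandas as pd
from pandas.io.json import build_table_schema
from pyspark.sql.types import (FloatType, IntegerType, StringType, StructField,
                               StructType)

csv_text = "name,score\nalice,1.5\nbob,2.0"
df = pd.read_csv(io.StringIO(csv_text))  # build_table_schema rejects raw text

schemas = StructType()
for field in build_table_schema(df, index=False)["fields"]:
    spark_type = (StringType() if field["type"] == "string"
                  else FloatType() if field["type"] == "number"
                  else IntegerType())
    schemas.add(StructField(field["name"], spark_type, True))
print(schemas)  # StructType with a string 'name' field and a float 'score' field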
Example #6
def get_pandas_df_schema(df: pd.DataFrame) -> Dict[Text, Text]:
    """
    Get dataframe schema using pandas.io.json.build_table_schema.
    Args:
        df {pandas.DataFrame}: dataframe
    Returns:
        Dict[Text, Text]: dictionary with structure:
            {
                <column_name>: <column_type>
            }
    """

    return {
        f['name']: f['type']
        for f in build_table_schema(df, index=False)['fields']
    }
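Applied to a toy frame (an assumption for illustration), the comprehension above yields Table Schema type names keyed by column:

import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({"id": [1, 2], "price": [9.99, 4.5], "sku": ["a", "b"]})
schema = {f["name"]: f["type"] for f in build_table_schema(df, index=False)["fields"]}
print(schema)  # {'id': 'integer', 'price': 'number', 'sku': 'string'}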
Example #7
    def __init__(
        self,
        key=None,
        df=None,
        preview=None,
        format="",
        stats=None,
        target_path=None,
        extra_data=None,
        column_metadata=None,
        **kwargs,
    ):

        format = format.lower()
        super().__init__(key, None, format=format, target_path=target_path)
        if format and format not in supported_formats:
            raise ValueError("unsupported format {} use one of {}".format(
                format, "|".join(supported_formats)))

        if format == "pq":
            format = "parquet"
        self.format = format
        self.stats = None
        self.extra_data = extra_data or {}
        self.column_metadata = column_metadata or {}

        if df is not None:
            self.length = df.shape[0]
            preview = preview or preview_lines
            shortdf = df
            if self.length > preview:
                shortdf = df.head(preview)
            shortdf = shortdf.reset_index()
            self.header = shortdf.columns.values.tolist()
            self.preview = shortdf.values.tolist()
            self.schema = build_table_schema(df)
            if stats or self.length < max_csv:
                self.stats = get_df_stats(df)

        self._df = df
        self._kw = kwargs
Example #8
def update_dataset_meta(
    artifact,
    from_df=None,
    schema: dict = None,
    header: list = None,
    preview: list = None,
    stats: dict = None,
    extra_data: dict = None,
    column_metadata: dict = None,
    labels: dict = None,
):
    """Update dataset object attributes/metadata

    this method will edit or add metadata to a dataset object

    example:
        update_dataset_meta(dataset, from_df=df,
                            extra_data={'histogram': 's3://mybucket/..'})

    :param from_df:         read metadata (schema, preview, ..) from provided df
    :param artifact:        dataset artifact object or path (store://..) or DataItem
    :param schema:          dataset schema, see pandas build_table_schema
    :param header:          column headers
    :param preview:         list of rows and row values (from df.values.tolist())
    :param stats:           dict of column names and their stats (cleaned df.describe(include='all'))
    :param extra_data:      extra data items (key: path string | artifact)
    :param column_metadata: dict of metadata per column
    :param labels:          metadata labels
    """

    if hasattr(artifact, "artifact_url"):
        artifact = artifact.artifact_url

    stores = store_manager
    if isinstance(artifact, DatasetArtifact):
        artifact_spec = artifact
    elif artifact.startswith(DB_SCHEMA + "://"):
        artifact_spec, _ = stores.get_store_artifact(artifact)
    else:
        raise ValueError(
            "dataset path must be a store object/URL/DataItem")

    if not artifact_spec or artifact_spec.kind != "dataset":
        raise ValueError(
            "store artifact ({}) is not dataset kind".format(artifact))

    if from_df is not None:
        shortdf = from_df
        length = from_df.shape[0]
        if length > preview_lines:
            shortdf = from_df.head(preview_lines)
        artifact_spec.header = shortdf.reset_index().columns.values.tolist()
        artifact_spec.preview = shortdf.reset_index().values.tolist()
        artifact_spec.schema = build_table_schema(from_df)
        if stats is None and length < max_csv:
            artifact_spec.stats = get_df_stats(from_df)

    if header:
        artifact_spec.header = header
    if stats:
        artifact_spec.stats = stats
    if schema:
        artifact_spec.schema = schema
    if preview:
        artifact_spec.preview = preview
    if column_metadata:
        artifact_spec.column_metadata = column_metadata
    if labels:
        for key, val in labels.items():
            artifact_spec.labels[key] = val

    if extra_data:
        artifact_spec.extra_data = artifact_spec.extra_data or {}
        for key, item in extra_data.items():
            if hasattr(item, "target_path"):
                item = item.target_path
            artifact_spec.extra_data[key] = item

    stores._get_db().store_artifact(
        artifact_spec.db_key,
        artifact_spec.to_dict(),
        artifact_spec.tree,
        iter=artifact_spec.iter,
        project=artifact_spec.project,
    )
Example #9
    def check_schema(df, schema):
        df_schema = build_table_schema(df, index=False, primary_key=None, version=False)
        df_schema = fields_list_to_frozenset(df_schema["fields"])
        return df_schema == schema
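fields_list_to_frozenset is not shown here; a plausible implementation (an assumption, not the original helper) freezes each field dict into an order-insensitive, comparable form:

import pandas as pd
from pandas.io.json import build_table_schema

# Assumed helper, not the original: freeze each field dict into sorted
# (key, value) tuples so two schemas compare independently of field order.
def fields_list_to_frozenset(fields):
    return frozenset(tuple(sorted(f.items())) for f in fields)

df = pd.DataFrame({"a": [1], "b": ["x"]})
schema = fields_list_to_frozenset(
    build_table_schema(df, index=False, primary_key=None, version=False)["fields"])
# Reordering the columns does not change the frozen schema:
reordered = fields_list_to_frozenset(
    build_table_schema(df[["b", "a"]], index=False, primary_key=None,
                       version=False)["fields"])
print(schema == reordered)  # True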