Beispiel #1
0
def _dataframe_from_json(path_or_str,
                         schema: Schema = None,
                         pandas_orient: str = "split") -> pd.DataFrame:
    """
    Parse json into pandas.DataFrame. User can pass schema to ensure correct type parsing and to
    make any necessary conversions (e.g. string -> binary for binary columns).

    When a schema is given, the result contains exactly the schema's columns, in schema order,
    and every column whose schema type is ``DataType.binary`` is base64-decoded.

    :param path_or_str: Path to a json file or a json string.
    :param schema: Mlflow schema used when parsing the data. If None, the json is parsed
                   with ``dtype=False`` and returned without any conversion.
    :param pandas_orient: pandas data frame convention used to store the data.
    :return: pandas.DataFrame.
    """
    if schema is None:
        return pd.read_json(path_or_str, orient=pandas_orient, dtype=False)

    column_names = schema.column_names()
    column_types = schema.column_types()
    dtypes = dict(zip(column_names, column_types))
    # Indexing with the schema's names both restricts and reorders the parsed columns.
    df = pd.read_json(path_or_str, orient=pandas_orient,
                      dtype=dtypes)[column_names]

    for name, type_ in zip(column_names, column_types):
        if type_ == DataType.binary:
            # NOTE(review): a fixed-width np.bytes_ array drops trailing NUL bytes from each
            # value -- confirm this is acceptable for the binary payloads stored here.
            df[name] = np.array(df[name].map(_base64decode), dtype=np.bytes_)

    # The return must sit outside the loop: returning from inside it decoded only the first
    # binary column and returned None whenever the schema had no binary column at all.
    return df
Beispiel #2
0
def _dataframe_from_json(path_or_str,
                         schema: Schema = None,
                         pandas_orient: str = "split",
                         precise_float=False) -> pd.DataFrame:
    """
    Load json content as a pandas.DataFrame, optionally guided by a schema.

    Without a schema the json is read with ``dtype=False``. With a schema, the schema's
    pandas types are passed to the reader as per-column dtypes, and every column that the
    schema marks as ``DataType.binary`` and that is present in the parsed frame is
    base64-decoded from string to bytes.

    :param path_or_str: Path to a json file or a json string.
    :param schema: Mlflow schema used when parsing the data.
    :param pandas_orient: pandas data frame convention used to store the data.
    :param precise_float: Forwarded unchanged to ``pandas.read_json``.
    :return: pandas.DataFrame.
    """
    if schema is None:
        return pd.read_json(path_or_str,
                            orient=pandas_orient,
                            dtype=False,
                            precise_float=precise_float)

    def _decode_base64(text):
        # Values arrive as base64 text; encode to utf8 bytes first, then decode.
        return base64.decodebytes(bytes(text, "utf8"))

    column_dtypes = dict(zip(schema.column_names(), schema.pandas_types()))
    frame = pd.read_json(path_or_str,
                         orient=pandas_orient,
                         dtype=column_dtypes,
                         precise_float=precise_float)

    # Schema columns that are missing from the parsed frame are skipped, not treated as errors.
    present = set(frame.columns)
    for col_type, col_name in zip(schema.column_types(), schema.column_names()):
        if not (col_type == DataType.binary and col_name in present):
            continue
        frame[col_name] = frame[col_name].map(_decode_base64)
    return frame