Exemple #1
0
def discover_dataframe(df):
    """Return the datashape of ``df`` as ``len(df) * Record(...)``.

    Each entry of ``df.dtypes`` is converted with
    ``datashape.CType.from_numpy_dtype``; any converted type that compares
    equal to ``datashape.coretypes.object_`` is reported as
    ``datashape.string`` instead. The record fields are paired with
    ``df.columns`` in order.
    """
    object_type = datashape.coretypes.object_
    fields = []
    for column, np_dtype in zip(df.columns, df.dtypes):
        measure = datashape.CType.from_numpy_dtype(np_dtype)
        if measure == object_type:
            # Object columns are described as strings.
            measure = datashape.string
        fields.append((column, measure))
    return len(df) * datashape.Record(fields)
Exemple #2
0
def discover_dataframe(df):
    """Return the datashape of ``df`` as ``len(df) * Record(...)``.

    Each entry of ``df.dtypes`` is converted with
    ``datashape.CType.from_numpy_dtype``. A converted type equal to
    ``object_`` becomes ``string``, and any resulting type found in
    ``possibly_missing`` is wrapped in ``Option``.
    """
    fields = []
    for column, np_dtype in zip(df.columns, df.dtypes):
        measure = datashape.CType.from_numpy_dtype(np_dtype)
        if measure == object_:
            measure = string
        # The membership test runs on the type *after* the object -> string swap.
        if measure in possibly_missing:
            measure = Option(measure)
        fields.append((column, measure))
    return len(df) * datashape.Record(fields)
Exemple #3
0
def discover_sqlcontext(ctx):
    """Return a DataShape whose record maps each table name to its datashape.

    Table names come from ``ctx.tableNames()`` (converted with ``str``). If
    that raises ``AttributeError``, the names are read instead from
    ``ctx._ssql_ctx.catalog().tables().keySet()`` via ``scala_set_to_set``.
    Names are sorted before each table is passed to ``discover``.
    """
    try:
        names = [str(name) for name in ctx.tableNames()]
    except AttributeError:
        # Fallback path: pull the key set from the underlying catalog object.
        jvm_names = ctx._ssql_ctx.catalog().tables().keySet()
        names = list(scala_set_to_set(ctx, jvm_names))

    names = sorted(names)

    fields = [(name, discover(ctx.table(name))) for name in names]
    return datashape.DataShape(datashape.Record(fields))
Exemple #4
0
def dshape_from_dask(df):
    """Return a datashape.DataShape object given a dask dataframe.

    Categorical columns whose ``.cat.known`` is falsy are first passed to
    ``df.categorize(..., index=False)``; each column of the result is then
    described by ``dshape_from_pandas_helper``. The outer dimension is
    ``datashape.var``.
    """
    unknown_categoricals = []
    for name in df.columns:
        dtype = df[name].dtype
        is_categorical = (
            isinstance(dtype, type(pd.Categorical.dtype))
            or isinstance(dtype, pd.api.types.CategoricalDtype)
        )
        if is_categorical and not df[name].cat.known:
            unknown_categoricals.append(name)

    df = df.categorize(unknown_categoricals, index=False)
    fields = [(name, dshape_from_pandas_helper(df[name])) for name in df.columns]
    return datashape.var * datashape.Record(fields)
Exemple #5
0
def dshape_from_dask(df):
    """Return a datashape.DataShape object given a dask dataframe.

    Categorical columns whose ``.cat.known`` attribute is falsy (a missing
    attribute is treated as known) are passed to
    ``df.categorize(..., index=False)``. Each column's shape is then taken
    from its first partition via ``dshape_from_pandas_helper``. The outer
    dimension is ``datashape.var``.
    """
    unknown_categoricals = []
    for name in df.columns:
        dtype = df[name].dtype
        is_categorical = (
            isinstance(dtype, type(pd.Categorical.dtype))
            or isinstance(dtype, pd.api.types.CategoricalDtype)
        )
        if is_categorical and not getattr(df[name].cat, 'known', True):
            unknown_categoricals.append(name)

    df = df.categorize(unknown_categoricals, index=False)

    # get_partition(0) is used because categories are sometimes repeated
    # for dask-cudf DataFrames with multiple partitions.
    fields = [
        (name, dshape_from_pandas_helper(df[name].get_partition(0)))
        for name in df.columns
    ]
    return datashape.var * datashape.Record(fields)
Exemple #6
0
def discover_h5py_dataset(d):
    """Return the datashape of ``d``, reporting object types as strings.

    The base shape comes from ``datashape.from_numpy(d.shape, d.dtype)``.

    * Record measure: the fields yielded by
      ``record_dshape_replace(measure, object_, string)`` form a new Record
      that is combined with the original dimensions.
    * Non-record measure: if the whole datashape compares equal to
      ``datashape.object_``, the dimensions are combined with
      ``datashape.string``; otherwise the datashape is returned unchanged.
    """
    dshape = datashape.from_numpy(d.shape, d.dtype)
    dims = dshape.shape
    measure = dshape.measure

    if isrecord(measure):
        fields = list(
            record_dshape_replace(measure, datashape.object_,
                                  datashape.string))
        return DataShape(*(dims + (datashape.Record(fields), )))

    if dshape == datashape.object_:
        return DataShape(*(dims + (datashape.string, )))

    return dshape
Exemple #7
0
def get_datashape(odo_resource):
    """Discover ``odo_resource`` and return a ``var * Record`` of Option fields.

    The datashape returned by ``odo.discover(odo_resource, **odo_args)`` is
    rendered to text, the part after the first ``*`` is taken, and that
    ``{name: type, ...}`` text is rewritten into a dict literal of strings.
    For every field, a leading/embedded ``?`` is stripped from the type name,
    ``bool`` is replaced by ``int32``, and the result is wrapped in
    ``ds.Option``.

    Parameters
    ----------
    odo_resource
        The resource handed to ``odo.discover``.

    Returns
    -------
    ``ds.var * ds.Record([...])`` with one ``[name, ds.Option(type)]`` entry
    per discovered field, in the order the fields appear in the text.

    Raises
    ------
    SyntaxError, ValueError
        If the rewritten text is not a plain dict literal of strings.
    """
    import ast

    discovered = odo.discover(odo_resource, **odo_args)

    # Keep only the record part after the first "*", with all whitespace
    # removed, then quote every key and value so the text becomes a dict
    # literal such as {"a":"int32","b":"?string"}.
    record_text = ''.join(str(discovered).split("*")[1].split())
    for old, new in ((":", "\":\""), (",", "\",\""), ("{", "{\""),
                     ("}", "\"}")):
        record_text = record_text.replace(old, new)

    # The text is derived from the data source's own field names and types, so
    # it is parsed with ast.literal_eval (literals only) rather than eval,
    # which would execute arbitrary expressions embedded in that text.
    field_types = ast.literal_eval(record_text)

    # Re-derive the key order from the text itself rather than from the dict.
    field_names = [
        item.split(":")[0].replace("\"", "")
        for item in record_text.replace("{", "").replace("}", "").split(",")
    ]

    fields = []
    for name in field_names:
        type_name = field_types[name].strip().replace("?", "")
        if type_name == "bool":
            type_name = "int32"
        fields.append([name, ds.Option(type_name)])
    return ds.var * ds.Record(fields)
Exemple #8
0
def dshape_from_xarray_dataset(xr_ds):
    """Return a datashape.DataShape object given a xarray Dataset.

    The record lists the dataset's ``data_vars`` first, followed by its
    ``coords``; each entry is described by ``dshape_from_pandas_helper``.
    The outer dimension is ``datashape.var``.
    """
    names = list(xr_ds.data_vars) + list(xr_ds.coords)
    fields = [(name, dshape_from_pandas_helper(xr_ds[name])) for name in names]
    return datashape.var * datashape.Record(fields)
Exemple #9
0
def dshape_from_pandas(df):
    """Return a datashape.DataShape object given a pandas dataframe.

    The result is ``len(df)`` rows of a record with one field per column,
    each described by ``dshape_from_pandas_helper``.
    """
    row_count = len(df)
    fields = [
        (column, dshape_from_pandas_helper(df[column]))
        for column in df.columns
    ]
    return row_count * datashape.Record(fields)
Exemple #10
0
 def out_dshape(self, input_dshape):
     """Return a record datashape with one field per category.

     The categories are the integers ``0 .. self.modulo - 1``; every field
     has the datashape that ``self.reduction.out_dshape(input_dshape)``
     returns.
     """
     categories = range(self.modulo)
     reduced_shape = self.reduction.out_dshape(input_dshape)
     fields = [(category, reduced_shape) for category in categories]
     return datashape.util.dshape(datashape.Record(fields))
Exemple #11
0
def discover_sqlcontext(ctx):
    """Return a DataShape whose record maps each table name to its datashape.

    Names come from ``ctx.tableNames()``, are converted with ``str`` and
    sorted; each table is fetched with ``ctx.table`` and passed to
    ``discover``.
    """
    names = sorted(str(name) for name in ctx.tableNames())
    fields = [(name, discover(ctx.table(name))) for name in names]
    return datashape.DataShape(datashape.Record(fields))
Exemple #12
0
def dshape_from_pandas(df):
    """Return ``len(df) * Record(...)`` describing the columns of ``df``.

    Each column ``df[k]`` is described by ``dshape_from_pandas_helper``.
    """
    row_count = len(df)
    fields = []
    for column in df.columns:
        fields.append((column, dshape_from_pandas_helper(df[column])))
    return row_count * datashape.Record(fields)
Exemple #13
0
def discover_dataframe(df):
    """Return ``len(df) * Record(...)`` describing the columns of ``df``.

    Each column ``df[k]`` is described by ``dshape_from_pandas``.
    """
    row_count = len(df)
    fields = [(column, dshape_from_pandas(df[column])) for column in df.columns]
    return row_count * datashape.Record(fields)
Exemple #14
0
def discover_dataframe(df):
    """Return ``len(df) * Record(...)`` built from the dtypes of ``df``.

    Column names from ``df.columns`` are paired in order with
    ``dshape_from_pandas`` applied to each entry of ``df.dtypes``.
    """
    row_count = len(df)
    fields = [
        (column, dshape_from_pandas(dtype))
        for column, dtype in zip(df.columns, df.dtypes)
    ]
    return row_count * datashape.Record(fields)
Exemple #15
0
def discover(t):
    """Return ``t.shape[0] * Record(...)`` describing the columns of ``t``.

    For every name in ``t.colnames`` the matching attribute of ``t.cols`` is
    looked up and passed back to ``discover`` to obtain the field's shape.
    """
    row_count = t.shape[0]
    fields = []
    for name in t.colnames:
        column = getattr(t.cols, name)
        fields.append([name, discover(column)])
    return row_count * ds.Record(fields)