Exemple #1
0
def concrete_head(expr, n=10):
    """Return the head of a computed expression as a concrete object.

    Parameters
    ----------
    expr : Expr
        A blaze expression backed by at least one data resource.
    n : int, optional
        Number of rows requested; one extra row is computed so callers
        can detect whether the result was truncated.

    Returns
    -------
    The computed scalar for non-collection expressions, otherwise a
    pandas DataFrame (single-column results are relabeled with
    ``expr._name``).

    Raises
    ------
    ValueError
        If the expression carries no data resources.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    # Scalar expressions have no head; compute them directly.
    if not iscollection(expr.dshape):
        return compute(expr)

    # Ask for one extra row so callers can tell if there is more data.
    head = expr.head(n + 1)

    # NOTE: the original body re-checked ``not iscollection(expr.dshape)``
    # here (always False after the early return above) and carried a second,
    # unreachable compute/convert tail after this exhaustive if/else; both
    # dead paths have been removed.
    if isrecord(expr.dshape.measure):
        return odo(head, DataFrame)
    else:
        # Single-column result: relabel the lone column with the
        # expression's name.
        df = odo(head, DataFrame)
        df.columns = [expr._name]
        return df
Exemple #2
0
def concrete_head(expr, n=10):
    """Return the head of a computed expression as a concrete object.

    Parameters
    ----------
    expr : Expr
        A blaze expression backed by at least one data resource.
    n : int, optional
        Number of rows requested; one extra row is computed so callers
        can detect whether the result was truncated.

    Returns
    -------
    The computed scalar for non-collection expressions, otherwise a
    pandas DataFrame (single-column results are relabeled with
    ``expr._name``).

    Raises
    ------
    ValueError
        If the expression carries no data resources.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    # Scalar expressions have no head; compute them directly.
    if not iscollection(expr.dshape):
        return compute(expr)

    # Ask for one extra row so callers can tell if there is more data.
    head = expr.head(n + 1)

    # NOTE: the original body re-checked ``not iscollection(expr.dshape)``
    # here (always False after the early return above) and carried a second,
    # unreachable compute/convert tail after this exhaustive if/else; both
    # dead paths have been removed.
    if isrecord(expr.dshape.measure):
        return odo(head, DataFrame)
    else:
        # Single-column result: relabel the lone column with the
        # expression's name.
        df = odo(head, DataFrame)
        df.columns = [expr._name]
        return df
Exemple #3
0
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    """Construct an interactive blaze expression from ``data``.

    Parameters
    ----------
    data : URI string, iterable, or any object ``resource`` understands.
        A ``'uri::sub/path'`` string is split and the sub-path is applied
        as successive field lookups on the result.
    dshape, schema : str or DataShape, optional
        Mutually exclusive; ``schema`` is promoted to ``var * schema``.
    name : str, optional
        Name for the resulting symbol.
    fields, columns : list of str, optional
        Field names to overlay on the discovered datashape; ``columns``
        is deprecated in favor of ``fields``.

    Returns
    -------
    InteractiveSymbol, possibly indexed by the ``::`` sub-path fields.
    """
    sub_uri = ''
    if isinstance(data, _strtypes):
        # 'uri::path/to/field' syntax: resolve the uri now, apply the
        # field path at the very end.
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        # Materialize one-shot iterators so the data can be scanned twice
        # (once for discovery, once for computation).
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated.  Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        # No explicit datashape: discover one, then overlay user-supplied
        # field names onto the discovered measure.
        dshape = discover(data)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))
        elif isscalar(dshape.measure) and fields:
            # Scalar rows: replicate the measure once per supplied field.
            types = (dshape.measure,) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
        elif isrecord(dshape.measure) and fields:
            # Record rows: field names must match the data's own names;
            # renaming is done via .relabel(), not here.
            ds = discover(data)
            assert isrecord(ds.measure)
            names = ds.measure.names
            if names != fields:
                raise ValueError('data column names %s\n'
                                 '\tnot equal to fields parameter %s,\n'
                                 '\tuse Data(data).relabel(%s) to rename fields'
                                 % (names,
                                    fields,
                                    ', '.join('%s=%r' % (k, v)
                                              for k, v in zip(names, fields))))
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        # Apply the '::'-separated sub-path as successive field lookups.
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
Exemple #4
0
def Data(data,
         dshape=None,
         name=None,
         fields=None,
         columns=None,
         schema=None,
         **kwargs):
    """Construct an InteractiveSymbol wrapping ``data``.

    ``schema`` and ``dshape`` are mutually exclusive; ``columns`` is no
    longer accepted (use ``fields``).  URI strings are resolved through
    ``resource``; one-shot iterators are materialized into tuples.  When
    no datashape is given, one is discovered and user-supplied ``fields``
    are overlaid on the discovered measure.
    """
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    # Unwrap an already-wrapped symbol and rebuild from its raw data.
    if isinstance(data, InteractiveSymbol):
        return Data(data.data, dshape, name, fields, columns, schema, **kwargs)

    if isinstance(data, _strtypes):
        data = resource(data,
                        schema=schema,
                        dshape=dshape,
                        columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator)
            and not isinstance(data, tuple(not_an_iterator))):
        # Materialize one-shot iterators so the data can be scanned twice.
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        # Discover a datashape, then overlay user-supplied field names.
        dshape = discover(data)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
        elif isscalar(dshape.measure) and fields:
            # Scalar rows: replicate the measure once per supplied field.
            types = (dshape.measure, ) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
        elif isrecord(dshape.measure) and fields:
            # Record rows: names must match; renaming goes via .relabel().
            ds = discover(data)
            assert isrecord(ds.measure)
            names = ds.measure.names
            if names != fields:
                raise ValueError(
                    'data column names %s\n'
                    '\tnot equal to fields parameter %s,\n'
                    '\tuse Data(data).relabel(%s) to rename '
                    'fields' %
                    (names, fields, ', '.join('%s=%r' % (k, v)
                                              for k, v in zip(names, fields))))
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))

    ds = datashape.dshape(dshape)
    return InteractiveSymbol(data, ds, name)
Exemple #5
0
def sort(child, key=None, ascending=True):
    """ Sort a collection

    Parameters
    ----------
    key : str, list of str, or Expr
        What to sort by:

          * a single column name: ``t.sort('amount')``
          * several column names: ``t.sort(['name', 'amount'])``
          * an expression: ``t.sort(-t.amount)``

        For a columnar (non-record) dataset any string ``key`` is
        ignored, so ``t.amount.sort()``, ``t.amount.sort('amount')``
        and ``t.amount.sort('foobar')`` are all equivalent.

    ascending : bool, optional
        Sort direction.
    """
    if ascending not in (True, False):
        # Guards against ``x.sort('a', 'b')`` where the user meant
        # ``x.sort(['a', 'b'])`` -- the second column lands here.
        raise ValueError(
            "ascending must be True or False, given {}".format(ascending))
    measure = child.dshape.measure
    if not isrecord(measure):
        # Columnar data: a missing or string key is simply ignored.
        if key is None or isinstance(key, _strtypes):
            return Sort(child, None, ascending)
        raise ValueError(
            "sort key {!r} not valid for schema {!r}".format(key, measure))
    if key is None:
        # Record data with no key: sort by every field.
        key = measure.names
    if isinstance(key, (list, tuple)):
        key = tuple(key)
        candidates = key
    else:
        candidates = (key,)
    for candidate in candidates:
        if candidate is None:
            raise ValueError(
                "sort key {!r} not valid for schema {!r}".format(candidate,
                                                                 measure))
        if isinstance(candidate, _strtypes):
            if candidate not in measure.names:
                raise ValueError(
                    "sort key {} is not a column of schema {}".format(
                        candidate, measure))
        elif not isinstance(candidate, Expr):
            raise ValueError(
                "sort key {} is not a string column name or an "
                "expression.".format(candidate))
    return Sort(child, key, ascending)
Exemple #6
0
def create_from_datashape(group, ds, name=None, **kwargs):
    """Create h5py groups/datasets under ``group`` mirroring record ``ds``.

    ``group`` may be the ``h5py.File`` class itself, in which case a file
    is opened from ``kwargs['path']``.  Record-typed fields become
    sub-groups (recursively); leaf fields become datasets.
    """
    if isinstance(group, type):
        group = h5py.File(kwargs['path'])
    assert isrecord(ds)
    for name, sub_ds in ds[0].dict.items():
        if isrecord(sub_ds):
            # Nested record -> nested group, recurse.
            g = group.require_group(name)
            create_from_datashape(g, sub_ds, **kwargs)
        else:
            dataset_from_dshape(file=group.file,
                                datapath='/'.join([group.name, name]),
                                ds=sub_ds, **kwargs)
Exemple #7
0
def create_from_datashape(group, ds, name=None, **kwargs):
    """Create h5py groups/datasets under ``group`` mirroring record ``ds``.

    Record-typed fields become sub-groups (recursively); leaf fields
    become datasets.

    NOTE(review): the assert fires before the unit-length DataShape is
    unwrapped, so it relies on ``isrecord`` accepting both the wrapped
    and unwrapped forms -- confirm against datashape's predicates.
    """
    assert isrecord(ds)
    # Unwrap a unit-length DataShape to its record measure.
    if isinstance(ds, DataShape) and len(ds) == 1:
        ds = ds[0]
    for name, sub_ds in ds.dict.items():
        if isrecord(sub_ds):
            # Nested record -> nested group, recurse.
            g = group.require_group(name)
            create_from_datashape(g, sub_ds, **kwargs)
        else:
            dataset_from_dshape(file=group.file,
                                datapath='/'.join([group.name, name]),
                                ds=sub_ds, **kwargs)
Exemple #8
0
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    """Construct an InteractiveSymbol wrapping ``data``.

    ``schema`` and ``dshape`` are mutually exclusive; ``columns`` is no
    longer accepted (use ``fields``).  URI strings are resolved through
    ``resource``; one-shot iterators are materialized into tuples.  When
    no datashape is given, one is discovered and user-supplied ``fields``
    are overlaid on the discovered measure.
    """
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    # Unwrap an already-wrapped symbol and rebuild from its raw data.
    if isinstance(data, InteractiveSymbol):
        return Data(data.data, dshape, name, fields, columns, schema, **kwargs)

    if isinstance(data, _strtypes):
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        # Materialize one-shot iterators so the data can be scanned twice.
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        # Discover a datashape, then overlay user-supplied field names.
        dshape = discover(data)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))
        elif isscalar(dshape.measure) and fields:
            # Scalar rows: replicate the measure once per supplied field.
            types = (dshape.measure,) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
        elif isrecord(dshape.measure) and fields:
            # Record rows: names must match; renaming goes via .relabel().
            ds = discover(data)
            assert isrecord(ds.measure)
            names = ds.measure.names
            if names != fields:
                raise ValueError('data column names %s\n'
                                 '\tnot equal to fields parameter %s,\n'
                                 '\tuse Data(data).relabel(%s) to rename '
                                 'fields' % (names,
                                             fields,
                                             ', '.join('%s=%r' % (k, v)
                                                       for k, v in
                                                       zip(names, fields))))
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)
    return InteractiveSymbol(data, ds, name)
def test_base():
    # Compare each expression's result across every data source against a
    # pandas/numpy "model" computed from the base data.
    for expr, exclusions in expressions.items():
        if iscollection(expr.dshape):
            model = into(DataFrame, into(np.ndarray, expr._subs({t: Data(base, t.dshape)})))
        else:
            model = compute(expr._subs({t: Data(base, t.dshape)}))
        print('\nexpr: %s\n' % expr)
        for source in sources:
            # Skip source/expression pairs known not to apply.
            if id(source) in map(id, exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            T = Data(source)
            if iscollection(expr.dshape):
                result = into(type(model), expr._subs({t: T}))
                if isscalar(expr.dshape.measure):
                    # Column results: compare as sets since ordering may
                    # differ between backends.
                    assert set(into(list, result)) == set(into(list, model))
                else:
                    assert df_eq(result, model)
            elif isrecord(expr.dshape):
                result = compute(expr._subs({t: T}))
                assert into(tuple, result) == into(tuple, model)
            else:
                # Scalar result; unwrap zero-dimensional containers if any.
                result = compute(expr._subs({t: T}))
                try:
                    result = result.scalar()
                except AttributeError:
                    pass
                assert result == model
Exemple #10
0
 def __init__(self, data, dshape, name=None):
     """Store the backing data, its datashape, and a display name.

     When no name is given, record-typed data receives the next
     auto-generated name from ``names``; scalar data stays unnamed.
     """
     self.data = data
     self.dshape = dshape
     if not name:
         name = next(names) if isrecord(dshape.measure) else None
     self._name = name
     # Hash is computed lazily elsewhere.
     self._hash = None
Exemple #11
0
def test_base():
    # Compare each expression's result across every data source against a
    # pandas/numpy "model" computed from the base data.
    for expr, exclusions in expressions.items():
        if iscollection(expr.dshape):
            model = into(
                DataFrame,
                into(np.ndarray, expr._subs({t: Data(base, t.dshape)})))
        else:
            model = compute(expr._subs({t: Data(base, t.dshape)}))
        print('\nexpr: %s\n' % expr)
        for source in sources:
            # Skip source/expression pairs known not to apply.
            if id(source) in map(id, exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            T = Data(source)
            if iscollection(expr.dshape):
                result = into(type(model), expr._subs({t: T}))
                if isscalar(expr.dshape.measure):
                    # Column results: compare as sets since ordering may
                    # differ between backends.
                    assert set(into(list, result)) == set(into(list, model))
                else:
                    assert df_eq(result, model)
            elif isrecord(expr.dshape):
                result = compute(expr._subs({t: T}))
                assert into(tuple, result) == into(tuple, model)
            else:
                # Scalar result; unwrap zero-dimensional containers if any.
                result = compute(expr._subs({t: T}))
                try:
                    result = result.scalar()
                except AttributeError:
                    pass
                assert result == model
Exemple #12
0
 def __new__(cls, data, dshape, name=None):
     """Create the symbol, auto-naming record-typed data when unnamed."""
     if not name:
         name = next(names) if isrecord(dshape.measure) else None
     return super(Symbol, cls).__new__(cls, data, dshape, name)
Exemple #13
0
def create_from_datashape(engine, ds, **kwargs):
    """Create one SQLAlchemy table per field of the record datashape
    ``ds`` on ``engine`` and return the engine."""
    assert isrecord(ds)
    metadata = sa.MetaData(engine)
    for table_name, field_ds in ds[0].dict.items():
        dshape_to_table(table_name, field_ds, metadata=metadata).create()
    return engine
Exemple #14
0
def create_from_datashape(engine, ds, schema=None, **kwargs):
    """Create one table per field of the record datashape ``ds`` on
    ``engine`` (within database ``schema``) and return the engine."""
    assert isrecord(ds), 'datashape must be Record type, got %s' % ds
    metadata = metadata_of_engine(engine, schema=schema)
    for name, sub_ds in ds[0].dict.items():
        t = dshape_to_table(name, sub_ds, metadata=metadata)
        t.create()
    return engine
Exemple #15
0
def create_from_datashape(engine, ds, schema=None, **kwargs):
    """Create one table per field of the record datashape ``ds`` on
    ``engine`` (within database ``schema``) and return the engine."""
    assert isrecord(ds), 'datashape must be Record type, got %s' % ds
    metadata = metadata_of_engine(engine, schema=schema)
    for name, sub_ds in ds[0].dict.items():
        t = dshape_to_table(name, sub_ds, metadata=metadata)
        t.create()
    return engine
Exemple #16
0
def create_from_datashape(engine, ds, **kwargs):
    """Create one table per field of the record datashape ``ds`` on
    ``engine`` and return the engine."""
    assert isrecord(ds)
    metadata = metadata_of_engine(engine)
    for name, sub_ds in ds[0].dict.items():
        t = dshape_to_table(name, sub_ds, metadata=metadata)
        t.create()
    return engine
Exemple #17
0
def create_from_datashape(engine, ds, schema=None, foreign_keys=None, primary_key=None, **kwargs):
    """Create one table per field of the record datashape ``ds`` on
    ``engine``, forwarding ``foreign_keys`` and ``primary_key`` to each
    table definition; return the engine."""
    assert isrecord(ds), "datashape must be Record type, got %s" % ds
    metadata = sa.MetaData(engine, schema=schema)
    for name, sub_ds in ds[0].dict.items():
        t = dshape_to_table(name, sub_ds, metadata=metadata, foreign_keys=foreign_keys, primary_key=primary_key)
        t.create()
    return engine
Exemple #18
0
def dshape_to_table(name,
                    ds,
                    metadata=None,
                    foreign_keys=None,
                    primary_key=None):
    """
    Create a SQLAlchemy table from a datashape and a name

    Parameters
    ----------
    name : str
        Name of the table to create.
    ds : str or DataShape
        Datashape whose record measure supplies the columns.
    metadata : sa.MetaData, optional
        Metadata to attach the table to (a fresh one by default).
    foreign_keys : dict, optional
        Mapping of column name to referent column.
    primary_key : iterable, optional
        Column names forming the primary key.

    >>> dshape_to_table('bank', '{name: string, amount: int}') # doctest: +NORMALIZE_WHITESPACE
    Table('bank', MetaData(bind=None),
          Column('name', Text(), table=<bank>, nullable=False),
          Column('amount', Integer(), table=<bank>, nullable=False),
          schema=None)
    """

    if isinstance(ds, str):
        ds = dshape(ds)
    if not isrecord(ds.measure):
        raise TypeError('dshape measure must be a record type e.g., '
                        '"{a: int64, b: int64}". Input measure is %r' %
                        ds.measure)
    if metadata is None:
        metadata = sa.MetaData()
    if foreign_keys is None:
        foreign_keys = {}

    # Ensure every foreign key refers to a field present in ``ds``.
    validate_foreign_keys(ds, foreign_keys)

    cols = dshape_to_alchemy(ds, primary_key=primary_key or frozenset())
    # Foreign keys are appended as table-level constraints.
    cols.extend(
        sa.ForeignKeyConstraint([column_name], [referent])
        for column_name, referent in foreign_keys.items())
    t = sa.Table(name, metadata, *cols, schema=metadata.schema)
    return attach_schema(t, t.schema)
Exemple #19
0
def dshape_to_table(name, ds, metadata=None, foreign_keys=None,
                    primary_key=None):
    """
    Create a SQLAlchemy table from a datashape and a name

    Parameters
    ----------
    name : str
        Name of the table to create.
    ds : str or DataShape
        Datashape whose record measure supplies the columns.
    metadata : sa.MetaData, optional
        Metadata to attach the table to (a fresh one by default).
    foreign_keys : dict, optional
        Mapping of column name to referent column.
    primary_key : iterable, optional
        Column names forming the primary key.

    >>> dshape_to_table('bank', '{name: string, amount: int}') # doctest: +NORMALIZE_WHITESPACE
    Table('bank', MetaData(bind=None),
          Column('name', Text(), table=<bank>, nullable=False),
          Column('amount', Integer(), table=<bank>, nullable=False),
          schema=None)
    """

    if isinstance(ds, str):
        ds = dshape(ds)
    if not isrecord(ds.measure):
        raise TypeError('dshape measure must be a record type e.g., '
                        '"{a: int64, b: int64}". Input measure is %r' %
                        ds.measure)
    if metadata is None:
        metadata = sa.MetaData()
    if foreign_keys is None:
        foreign_keys = {}

    # Ensure every foreign key refers to a field present in ``ds``.
    validate_foreign_keys(ds, foreign_keys)

    cols = dshape_to_alchemy(ds, primary_key=primary_key or frozenset())
    # Foreign keys are appended as table-level constraints.
    cols.extend(sa.ForeignKeyConstraint([column_name], [referent])
                for column_name, referent in foreign_keys.items())
    t = sa.Table(name, metadata, *cols, schema=metadata.schema)
    return attach_schema(t, t.schema)
Exemple #20
0
    def __dir__(self):
        # Extend the default dir() with field accessors and
        # datashape-derived methods so interactive tab-completion works.
        result = dir(type(self))
        # NOTE(review): ``and`` binds tighter than ``or``, so this reads as
        # ``isrecord(...) or (isinstance(..., Map) and self.fields)`` --
        # the record branch does not require ``self.fields``; confirm that
        # is the intended grouping.
        if isrecord(self.dshape.measure) or isinstance(self.dshape.measure, datashape.Map) and self.fields:
            result.extend(map(valid_identifier, self.fields))

        result.extend(toolz.merge(schema_methods(self.dshape.measure), dshape_methods(self.dshape)))

        # Only expose names that are valid Python identifiers.
        return sorted(set(filter(isvalid_identifier, result)))
Exemple #21
0
def Data(data,
         dshape=None,
         name=None,
         fields=None,
         columns=None,
         schema=None,
         **kwargs):
    """Construct an interactive blaze expression from ``data``.

    A ``'uri::sub/path'`` string is split and the sub-path applied as
    successive field lookups on the result.  ``schema`` and ``dshape``
    are mutually exclusive; ``columns`` is deprecated in favor of
    ``fields``.  Unlike later variants, unnamed symbols always receive
    an auto-generated name from ``names``.
    """
    sub_uri = ''
    if isinstance(data, _strtypes):
        # 'uri::path/to/field' syntax: resolve the uri now, apply the
        # field path at the very end.
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data,
                        schema=schema,
                        dshape=dshape,
                        columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator)
            and not isinstance(data, tuple(not_an_iterator))):
        # Materialize one-shot iterators so the data can be scanned twice.
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated.  Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        # Discover a datashape, then overlay user-supplied field names.
        dshape = discover(data)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
        elif isscalar(dshape.measure) and fields:
            # Scalar rows: replicate the measure once per supplied field.
            types = (dshape.measure, ) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
        elif isrecord(dshape.measure) and fields:
            # Record rows: relabel the measure with the given field names.
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))

    ds = datashape.dshape(dshape)

    # Always give the symbol a name, generated if necessary.
    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        # Apply the '::'-separated sub-path as successive field lookups.
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
Exemple #22
0
def data(data_source, dshape=None, name=None, fields=None, schema=None, **kwargs):
    """Construct a ``_Data`` interactive expression from ``data_source``.

    ``schema`` and ``dshape`` are mutually exclusive; ``schema`` is
    promoted to ``var * schema``.  URI strings are resolved through
    ``resource``; one-shot iterators are materialized into tuples.  When
    no datashape is given, one is discovered and user-supplied ``fields``
    are overlaid on the discovered measure.
    """
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword" " arguments")

    # Unwrap an already-wrapped expression and rebuild from its raw data.
    if isinstance(data_source, _Data):
        return data(data_source.data, dshape, name, fields, schema, **kwargs)

    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)

    if isinstance(data_source, _strtypes):
        data_source = resource(data_source, schema=schema, dshape=dshape, **kwargs)

    if isinstance(data_source, Iterator) and not isinstance(data_source, tuple(not_an_iterator)):
        # Materialize one-shot iterators so the data can be scanned twice.
        data_source = tuple(data_source)
    if not dshape:
        # Discover a datashape, then overlay user-supplied field names.
        dshape = discover(data_source)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))
        elif isscalar(dshape.measure) and fields:
            # Scalar rows: replicate the measure once per supplied field.
            types = (dshape.measure,) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
        elif isrecord(dshape.measure) and fields:
            # Record rows: names must match; renaming goes via .relabel().
            ds = discover(data_source)
            assert isrecord(ds.measure)
            names = ds.measure.names
            if names != fields:
                raise ValueError(
                    "data column names %s\n"
                    "\tnot equal to fields parameter %s,\n"
                    "\tuse data(data_source).relabel(%s) to rename "
                    "fields" % (names, fields, ", ".join("%s=%r" % (k, v) for k, v in zip(names, fields)))
                )
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)
    return _Data(data_source, ds, name)
Exemple #23
0
def Data(data, dshape=None, name=None, fields=None, columns=None,
         schema=None, **kwargs):
    """Construct an interactive blaze expression from ``data``.

    Like the other ``Data`` variants but additionally verifies that a
    user-supplied datashape agrees with the schema the data object
    itself declares, raising ``TypeError`` on mismatch.

    NOTE(review): references ``unicode`` below, so this variant requires
    Python 2 (or a compatibility alias in scope) -- confirm.
    """
    sub_uri = ''
    if isinstance(data, _strtypes):
        # 'uri::path/to/field' syntax: resolve the uri now, apply the
        # field path at the very end.
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape,
                              columns=columns, **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        # Materialize one-shot iterators so the data can be scanned twice.
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated.  Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        # Discover a datashape, then overlay user-supplied field names.
        dshape = discover(data)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))
        elif isscalar(dshape.measure) and fields:
            # Scalar rows: replicate the measure once per supplied field.
            types = (dshape.measure,) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
        elif isrecord(dshape.measure) and fields:
            # Record rows: relabel the measure with the given field names.
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)

    # Cross-check a user-supplied datashape against the data's own schema.
    if (hasattr(data, 'schema')
         and isinstance(data.schema, (DataShape, str, unicode))
         and ds.measure != data.dshape.measure):
        raise TypeError('%s schema %s does not match schema %s' %
                        (type(data).__name__, data.schema,
                                              ds.measure))

    # Always give the symbol a name, generated if necessary.
    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        # Apply the '::'-separated sub-path as successive field lookups.
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
Exemple #24
0
def _csv_to_DataFrame(c, dshape=None, chunksize=None, **kwargs):
    """Read the CSV resource ``c`` into a pandas DataFrame.

    ``dshape`` supplies column names, dtypes, and date columns;
    ``chunksize`` is forwarded to ``pandas.read_csv`` (yielding a chunked
    reader).  Remaining ``kwargs`` are filtered down to valid
    ``read_csv`` keywords before the call.
    """
    # Tri-state header flag: explicit True/False, or let pandas infer.
    has_header = kwargs.pop('has_header', c.has_header)
    if has_header is False:
        header = None
    elif has_header is True:
        header = 0
    else:
        header = 'infer'

    # Delimiter preference: sep kwarg > delimiter kwarg > dialect > ','.
    sep = kwargs.pop('sep', kwargs.pop('delimiter', c.dialect.get('delimiter', ',')))
    encoding = kwargs.get('encoding', c.encoding)

    if dshape:
        # Derive pandas dtypes and date-parse columns from the datashape.
        dtypes, parse_dates = dshape_to_pandas(dshape)
        if isrecord(dshape.measure):
            names = kwargs.get('names', dshape.measure.names)
        else:
            names = kwargs.get('names')
    else:
        dtypes = parse_dates = names = None

    usecols = kwargs.pop('usecols', None)
    if parse_dates and usecols:
        # Only parse dates for columns we are actually reading.
        parse_dates = [col for col in parse_dates if col in usecols]

    # Infer compression from the file extension unless given explicitly.
    compression = kwargs.pop('compression',
            {'gz': 'gzip', 'bz2': 'bz2'}.get(ext(c.path)))

    # See read_csv docs for header for reasoning
    if names:
        # Peek at the first row to decide whether the file already carries
        # a header matching the expected names.
        try:
            found_names = pd.read_csv(c.path, encoding=encoding,
                                      compression=compression, nrows=1)
        except StopIteration:
            found_names = pd.read_csv(c.path, encoding=encoding,
                                      compression=compression)
    if names and header == 'infer':
        if [n.strip() for n in found_names] == [n.strip() for n in names]:
            header = 0
        # NOTE(review): non-raw regex string ('^\s*\D\w*\s*$') -- works but
        # emits invalid-escape warnings on modern Python; consider r'...'.
        elif (all(re.match('^\s*\D\w*\s*$', n) for n in found_names) and
                not all(dt == datashape.string for dt in dshape.measure.types)):
            header = 0
        else:
            header = None

    # Drop any kwargs read_csv would reject.
    kwargs2 = keyfilter(keywords(pandas.read_csv).__contains__, kwargs)
    return pandas.read_csv(c.path,
                             header=header,
                             sep=sep,
                             encoding=encoding,
                             dtype=dtypes,
                             parse_dates=parse_dates,
                             names=names,
                             compression=compression,
                             chunksize=chunksize,
                             usecols=usecols,
                             **kwargs2)
Exemple #25
0
def _csv_to_DataFrame(c, dshape=None, chunksize=None, **kwargs):
    """Read the CSV resource ``c`` into a pandas DataFrame.

    ``dshape`` supplies column names, dtypes, and date columns;
    ``chunksize`` is forwarded to ``pandas.read_csv`` (yielding a chunked
    reader).  Remaining ``kwargs`` are filtered down to valid
    ``read_csv`` keywords before the call.
    """
    # Tri-state header flag: explicit True/False, or let pandas infer.
    has_header = kwargs.pop('has_header', c.has_header)
    if has_header is False:
        header = None
    elif has_header is True:
        header = 0
    else:
        header = 'infer'

    # Delimiter preference: sep kwarg > delimiter kwarg > dialect > ','.
    sep = kwargs.pop('sep', kwargs.pop('delimiter', c.dialect.get('delimiter', ',')))
    encoding = kwargs.get('encoding', c.encoding)

    if dshape:
        # Derive pandas dtypes and date-parse columns from the datashape.
        dtypes, parse_dates = dshape_to_pandas(dshape)
        if isrecord(dshape.measure):
            names = kwargs.get('names', dshape.measure.names)
        else:
            names = kwargs.get('names')
    else:
        dtypes = parse_dates = names = None

    usecols = kwargs.pop('usecols', None)
    if parse_dates and usecols:
        # Only parse dates for columns we are actually reading.
        parse_dates = [col for col in parse_dates if col in usecols]

    # Infer compression from the file extension unless given explicitly.
    compression = kwargs.pop('compression',
            {'gz': 'gzip', 'bz2': 'bz2'}.get(ext(c.path)))

    # See read_csv docs for header for reasoning
    if names:
        # Peek at the first row to decide whether the file already carries
        # a header matching the expected names.
        try:
            found_names = pd.read_csv(c.path, encoding=encoding,
                                      compression=compression, nrows=1)
        except StopIteration:
            found_names = pd.read_csv(c.path, encoding=encoding,
                                      compression=compression)
    if names and header == 'infer':
        if [n.strip() for n in found_names] == [n.strip() for n in names]:
            header = 0
        # NOTE(review): non-raw regex string ('^\s*\D\w*\s*$') -- works but
        # emits invalid-escape warnings on modern Python; consider r'...'.
        elif (all(re.match('^\s*\D\w*\s*$', n) for n in found_names) and
                not all(dt == datashape.string for dt in dshape.measure.types)):
            header = 0
        else:
            header = None

    # Drop any kwargs read_csv would reject.
    kwargs2 = keyfilter(keywords(pandas.read_csv).__contains__, kwargs)
    return pandas.read_csv(c.path,
                             header=header,
                             sep=sep,
                             encoding=encoding,
                             dtype=dtypes,
                             parse_dates=parse_dates,
                             names=names,
                             compression=compression,
                             chunksize=chunksize,
                             usecols=usecols,
                             **kwargs2)
Exemple #26
0
def _csv_to_dataframe(c, dshape=None, chunksize=None, **kwargs):
    """Read the CSV resource ``c`` into a pandas DataFrame.

    ``dshape`` supplies column names, dtypes, and date columns;
    ``chunksize`` is forwarded to ``pd.read_csv`` (yielding a chunked
    reader).  Remaining ``kwargs`` are filtered down to valid
    ``read_csv`` keywords before the call.
    """
    # Tri-state header flag: explicit True/False, or let pandas infer.
    header = {
        False: None,
        True: 0
    }.get(kwargs.pop('has_header', c.has_header), 'infer')

    # Delimiter preference: sep kwarg > delimiter kwarg > dialect > ','.
    sep = kwargs.pop('sep',
                     kwargs.pop('delimiter', c.dialect.get('delimiter', ',')))
    encoding = kwargs.pop('encoding', c.encoding)

    if dshape:
        # Derive pandas dtypes and date-parse columns from the datashape.
        dtypes, parse_dates = dshape_to_pandas(dshape)
        if isrecord(dshape.measure):
            names = kwargs.get('names', dshape.measure.names)
        else:
            names = kwargs.get('names')
    else:
        dtypes = parse_dates = names = None

    usecols = kwargs.pop('usecols', None)
    if parse_dates and usecols:
        # Only parse dates for columns we are actually reading.
        parse_dates = [col for col in parse_dates if col in usecols]

    # See read_csv docs for header for reasoning
    if names:
        # Peek at the first row to decide whether the file already carries
        # a header matching the expected names.
        try:
            with c.open() as f:
                found_names = pd.read_csv(f,
                                          nrows=1,
                                          encoding=encoding,
                                          sep=sep)
        except StopIteration:
            with c.open() as f:
                found_names = pd.read_csv(f, encoding=encoding, sep=sep)
    if names and header == 'infer':
        if [n.strip() for n in found_names] == [n.strip() for n in names]:
            header = 0
        # NOTE(review): non-raw regex string ('^\s*\D\w*\s*$') -- works but
        # emits invalid-escape warnings on modern Python; consider r'...'.
        elif (all(re.match('^\s*\D\w*\s*$', n) for n in found_names)
              and not all(dt == datashape.string
                          for dt in dshape.measure.types)):
            header = 0
        else:
            header = None

    # Drop any kwargs read_csv would reject.
    kwargs = keyfilter(keywords(pd.read_csv).__contains__, kwargs)
    with c.open() as f:
        return pd.read_csv(f,
                           header=header,
                           sep=sep,
                           encoding=encoding,
                           dtype=dtypes,
                           parse_dates=parse_dates,
                           names=names,
                           chunksize=chunksize,
                           usecols=usecols,
                           **kwargs)
Exemple #27
0
 def __new__(cls, data, dshape, name=None):
     """Create the symbol; record-typed data gets an auto-generated name
     from ``names`` when ``name`` is falsy, scalar data stays unnamed."""
     return super(Symbol, cls).__new__(
         cls,
         data,
         dshape,
         name or (
             next(names)
             if isrecord(dshape.measure) else None
         ),
     )
Exemple #28
0
def create_from_datashape(group, ds, name=None, **kwargs):
    """Mirror the record datashape ``ds`` as groups/datasets under ``group``."""
    if not isrecord(ds):
        raise ValueError(
            "Trying to create an HDF5 file with non-record datashape failed\n"
            "Perhaps you forgot to specify a datapath?\n"
            "\tdshape: %s\n"
            "If you're using odo consider the following change\n"
            "\tBefore: odo(data, 'myfile.hdf5')\n"
            "\tAfter:  odo(data, 'myfile.hdf5::/datapath')" % ds)
    # Unwrap a one-element DataShape down to its measure.
    if isinstance(ds, DataShape) and len(ds) == 1:
        ds = ds[0]
    for field, field_ds in ds.dict.items():
        if isrecord(field_ds):
            # Nested records become subgroups, built recursively.
            create_from_datashape(group.require_group(field), field_ds,
                                  **kwargs)
        else:
            dataset_from_dshape(file=group.file,
                                datapath='/'.join([group.name, field]),
                                ds=field_ds, **kwargs)
Exemple #29
0
def test_mean():
    """Splitting a mean yields a sum/count chunk and a ratio aggregate."""
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.mean())

    assert chunk.schema == t.schema
    expected_chunk = summary(total=chunk.amount.sum(),
                             count=chunk.amount.count(),
                             keepdims=True)
    assert chunk_expr.isidentical(expected_chunk)
    assert isrecord(agg.dshape.measure)
    # `count` field accessed via item lookup rather than attribute.
    assert agg_expr.isidentical(agg.total.sum() / agg['count'].sum())
Exemple #30
0
    def __dir__(self):
        """Return type attributes plus valid field names and dshape methods."""
        attrs = set(dir(type(self)))
        measure = self.dshape.measure
        if isrecord(measure) and self.fields:
            attrs.update(map(valid_identifier, self.fields))
        attrs.update(toolz.merge(schema_methods(measure),
                                 dshape_methods(self.dshape)))
        return sorted(filter(isvalid_identifier, attrs))
Exemple #31
0
def test_mean():
    """Mean splits into a (total, count) chunk and a total/count aggregate."""
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.mean())

    assert chunk.schema == t.schema
    want = summary(total=chunk.amount.sum(),
                   count=chunk.amount.count(),
                   keepdims=True)
    assert chunk_expr.isidentical(want)

    assert isrecord(agg.dshape.measure)
    assert agg_expr.isidentical(agg.total.sum() / agg.count.sum())
Exemple #32
0
    def __dir__(self):
        """Attribute listing including field names and datashape methods."""
        result = list(dir(type(self)))
        measure = self.dshape.measure
        if isrecord(measure) and self.fields:
            result += [valid_identifier(f) for f in self.fields]
        result += list(toolz.merge(schema_methods(measure),
                                   dshape_methods(self.dshape)))
        return sorted({name for name in result if isvalid_identifier(name)})
Exemple #33
0
def jsonlines_to_sparksql(ctx, json, dshape=None, name=None, schema=None,
                          samplingRatio=0.25, **kwargs):
    """Load a JSON-lines file into SparkSQL and register it as a table."""
    # An explicit schema bypasses automated dshape inference entirely.
    if schema is None and dshape is not None:
        measure = dshape.measure
        schema = dshape_to_schema(measure if isrecord(measure) else dshape)
    srdd = ctx.jsonFile(json.path, schema=schema, samplingRatio=samplingRatio)
    register_table(ctx, srdd, name=name)
    return srdd
Exemple #34
0
def create_from_datashape(group, ds, name=None, **kwargs):
    """Build HDF5 groups/datasets under ``group`` matching the record ``ds``."""
    if not isrecord(ds):
        raise ValueError(
            "Trying to create an HDF5 file with non-record datashape failed\n"
            "Perhaps you forgot to specify a datapath?\n"
            "\tdshape: %s\n"
            "If you're using into consider the following change\n"
            "\tBefore: into('myfile.hdf5', data)\n"
            "\tAfter:  into('myfile.hdf5::/datapath', data)" % ds)
    if isinstance(ds, DataShape) and len(ds) == 1:
        ds = ds[0]  # drop the outer one-element DataShape wrapper
    for sub_name, sub_ds in ds.dict.items():
        if isrecord(sub_ds):
            # Recurse: nested records map onto subgroups.
            subgroup = group.require_group(sub_name)
            create_from_datashape(subgroup, sub_ds, **kwargs)
        else:
            dataset_from_dshape(file=group.file,
                                datapath='/'.join([group.name, sub_name]),
                                ds=sub_ds,
                                **kwargs)
Exemple #35
0
 def fields(self):
     """Return the field names implied by this expression's measure."""
     measure = self.dshape.measure
     if isinstance(measure, Record):
         return measure.names
     if isinstance(measure, datashape.Map):
         if not isrecord(measure.value):
             raise TypeError("Foreign key must reference a Record datashape")
         return measure.value.names
     # Fall back to the expression's own name, when one exists.
     if getattr(self, "_name", None) is not None:
         return [self._name]
     return []
Exemple #36
0
    def __init__(self,
                 data,
                 dshape=None,
                 name=None,
                 fields=None,
                 columns=None,
                 schema=None,
                 **kwargs):
        """Wrap ``data`` as an interactive expression.

        Parameters
        ----------
        data : object or str
            Concrete data, or a URI string resolved via ``resource``.
        dshape : DataShape or str, optional
            Full datashape of the data; discovered from ``data`` if omitted.
        name : str, optional
            Expression name; auto-generated from ``names`` if omitted.
        fields : list of str, optional
            Field names overriding/naming the discovered measure.
        columns : list of str, optional
            Deprecated alias for ``fields``.
        schema : DataShape or str, optional
            Measure only; mutually exclusive with ``dshape``.
        """
        # URI strings are resolved into a concrete data resource first.
        if isinstance(data, _strtypes):
            data = resource(data,
                            schema=schema,
                            dshape=dshape,
                            columns=columns,
                            **kwargs)
        if columns:
            warnings.warn("columns kwarg deprecated.  Use fields instead",
                          DeprecationWarning)
        if columns and not fields:
            fields = columns
        if schema and dshape:
            raise ValueError("Please specify one of schema= or dshape= keyword"
                             " arguments")
        # A schema is promoted to a full dshape by prepending a var dimension.
        if schema and not dshape:
            dshape = var * schema
        if dshape and isinstance(dshape, _strtypes):
            dshape = datashape.dshape(dshape)
        if not dshape:
            dshape = discover(data)
            types = None
            # When field names are supplied, rebuild the measure as a Record
            # pairing those names with the discovered types.
            if isinstance(dshape.measure, Tuple) and fields:
                types = dshape[1].dshapes
                schema = Record(list(zip(fields, types)))
                dshape = DataShape(*(dshape.shape + (schema, )))
            elif isscalar(dshape.measure) and fields:
                # Scalar measure: repeat the scalar type once per field.
                types = (dshape.measure, ) * int(dshape[-2])
                schema = Record(list(zip(fields, types)))
                dshape = DataShape(*(dshape.shape + (schema, )))
            elif isrecord(dshape.measure) and fields:
                # Record measure: keep the types, replace the names.
                types = dshape.measure.types
                schema = Record(list(zip(fields, types)))
                dshape = DataShape(*(dshape.shape + (schema, )))

        self.dshape = datashape.dshape(dshape)

        self.data = data

        # Guard against a resource whose own schema disagrees with ours.
        if (hasattr(data, 'schema') and isinstance(data.schema,
                                                   (DataShape, str, unicode))
                and self.schema != data.schema):
            raise TypeError('%s schema %s does not match %s schema %s' %
                            (type(data).__name__, data.schema,
                             type(self).__name__, self.schema))

        self._name = name or next(names)
Exemple #37
0
    def __dir__(self):
        """List attributes, adding valid field names and dshape methods."""
        measure = self.dshape.measure
        attrs = list(dir(type(self)))
        # `and` binds tighter than `or`: a record measure alone suffices;
        # the `self.fields` check applies only to the Map case.
        if isrecord(measure) or (isinstance(measure, datashape.Map) and
                                 self.fields):
            attrs.extend(map(valid_identifier, self.fields))
        attrs.extend(toolz.merge(schema_methods(measure),
                                 dshape_methods(self.dshape)))
        return sorted(set(filter(isvalid_identifier, attrs)))
Exemple #38
0
def test_std():
    """std splits into (sum, sum-of-squares, count) plus a sqrt aggregate."""
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.std())

    assert chunk.schema == t.schema
    expected_chunk = summary(x=chunk.amount.sum(),
                             x2=(chunk.amount ** 2).sum(),
                             n=chunk.amount.count(),
                             keepdims=True)
    assert chunk_expr.isidentical(expected_chunk)

    assert isrecord(agg.dshape.measure)
    assert agg_expr.isidentical(
        sqrt((agg.x2.sum() / (agg.n.sum())
              - (agg.x.sum() / (agg.n.sum())) ** 2)))
Exemple #39
0
def _csv_to_dataframe(c, dshape=None, chunksize=None, **kwargs):
    """Read the CSV resource ``c`` into a ``pandas.DataFrame``.

    Parameters
    ----------
    c : CSV
        A CSV resource exposing ``open()``, ``has_header``, ``dialect``,
        and ``encoding``.
    dshape : DataShape, optional
        Expected datashape; used to derive dtypes, date columns, and names.
    chunksize : int, optional
        If given, pandas returns an iterator of DataFrame chunks.
    **kwargs
        Extra keywords; anything ``pandas.read_csv`` accepts is forwarded,
        the rest is dropped.
    """
    # has_header tri-state: False -> no header row, True -> row 0,
    # anything else -> let pandas infer.
    header = {False: None, True: 0}.get(
        kwargs.pop('has_header', c.has_header), 'infer')

    sep = kwargs.pop(
        'sep', kwargs.pop('delimiter', c.dialect.get('delimiter', ',')))
    encoding = kwargs.pop('encoding', c.encoding)

    if dshape:
        dtypes, parse_dates = dshape_to_pandas(dshape)
        if isrecord(dshape.measure):
            names = kwargs.get('names', dshape.measure.names)
        else:
            names = kwargs.get('names')
    else:
        dtypes = parse_dates = names = None

    usecols = kwargs.pop('usecols', None)
    if parse_dates and usecols:
        # Only parse dates for columns we are actually reading.
        parse_dates = [col for col in parse_dates if col in usecols]

    # See read_csv docs for header for reasoning
    if names:
        try:
            with c.open() as f:
                found_names = pd.read_csv(f,
                                          nrows=1,
                                          encoding=encoding,
                                          sep=sep)
        except StopIteration:
            # Too few rows for nrows=1; retry reading the whole file.
            with c.open() as f:
                found_names = pd.read_csv(f, encoding=encoding, sep=sep)
    if names and header == 'infer':
        if [n.strip() for n in found_names] == [n.strip() for n in names]:
            header = 0
        # BUG FIX: the pattern was a plain string '^\s*\D\w*\s*$' whose
        # backslash escapes are invalid in str literals (DeprecationWarning,
        # SyntaxWarning on modern CPython); use a raw string instead.
        elif (all(re.match(r'^\s*\D\w*\s*$', n) for n in found_names)
              and not all(dt == datashape.string
                          for dt in dshape.measure.types)):
            header = 0
        else:
            header = None

    # Drop any kwargs pandas.read_csv does not understand.
    kwargs = keyfilter(keywords(pd.read_csv).__contains__, kwargs)
    with c.open() as f:
        return pd.read_csv(f,
                           header=header,
                           sep=sep,
                           encoding=encoding,
                           dtype=dtypes,
                           parse_dates=parse_dates,
                           names=names,
                           chunksize=chunksize,
                           usecols=usecols,
                           **kwargs)
Exemple #40
0
 def fields(self):
     """Return the column/field names implied by this expression's dshape."""
     measure = self.dshape.measure
     if isinstance(measure, Record):
         return measure.names
     if isinstance(measure, datashape.Map):
         if not isrecord(measure.value):
             raise TypeError('Foreign key must reference a '
                             'Record datashape')
         return measure.value.names
     # No record structure: fall back to the expression's own name.
     if getattr(self, '_name', None) is not None:
         return [self._name]
     return []
Exemple #41
0
def test_var():
    """var splits into (sum, sum-of-squares, count) plus a variance aggregate."""
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.var())

    assert chunk.schema == t.schema
    expected_chunk = summary(x=chunk.amount.sum(),
                             x2=(chunk.amount ** 2).sum(),
                             n=chunk.amount.count(),
                             keepdims=True)
    assert chunk_expr.isidentical(expected_chunk)

    assert isrecord(agg.dshape.measure)
    expected_agg = (agg.x2.sum() / (agg.n.sum())
                    - (agg.x.sum() / (agg.n.sum())) ** 2)
    assert agg_expr.isidentical(expected_agg)
Exemple #42
0
def discover_h5py_dataset(d):
    """Discover an h5py dataset's datashape, mapping ``object`` to ``string``."""
    ds = datashape.from_numpy(d.shape, d.dtype)
    measure = ds.measure
    if isrecord(measure):
        # Swap object-typed fields for strings inside the record.
        fixed = record_dshape_replace(measure, datashape.object_,
                                      datashape.string)
        return DataShape(*(ds.shape + (datashape.Record(list(fixed)),)))
    if ds == datashape.object_:
        return DataShape(*(ds.shape + (datashape.string,)))
    return ds
Exemple #43
0
def _csv_to_dataframe(c, dshape=None, chunksize=None, **kwargs):
    """Read the CSV resource ``c`` (possibly compressed) into a DataFrame.

    Parameters
    ----------
    c : CSV
        A CSV resource exposing ``path``, ``has_header``, ``dialect``,
        and ``encoding``.
    dshape : DataShape, optional
        Expected datashape; used to derive dtypes, date columns, and names.
    chunksize : int, optional
        If given, pandas returns an iterator of DataFrame chunks.
    **kwargs
        Extra keywords; anything ``pandas.read_csv`` accepts is forwarded,
        the rest is dropped.
    """
    # has_header tri-state: False -> no header, True -> row 0, else infer.
    header = {False: None, True: 0}.get(kwargs.pop("has_header", c.has_header), "infer")

    sep = kwargs.pop("sep", kwargs.pop("delimiter", c.dialect.get("delimiter", ",")))
    encoding = kwargs.pop("encoding", c.encoding)

    if dshape:
        dtypes, parse_dates = dshape_to_pandas(dshape)
        if isrecord(dshape.measure):
            names = kwargs.get("names", dshape.measure.names)
        else:
            names = kwargs.get("names")
    else:
        dtypes = parse_dates = names = None

    usecols = kwargs.pop("usecols", None)
    if parse_dates and usecols:
        # Only parse dates for the columns we actually read.
        parse_dates = [col for col in parse_dates if col in usecols]

    # Infer compression from the file extension unless given explicitly.
    compression = kwargs.pop("compression", {"gz": "gzip", "bz2": "bz2"}.get(ext(c.path)))

    # See read_csv docs for header for reasoning
    if names:
        try:
            found_names = pd.read_csv(c.path, encoding=encoding, compression=compression, nrows=1)
        except StopIteration:
            # Too few rows for nrows=1; retry reading the whole file.
            found_names = pd.read_csv(c.path, encoding=encoding, compression=compression)
    if names and header == "infer":
        if [n.strip() for n in found_names] == [n.strip() for n in names]:
            header = 0
        # BUG FIX: pattern was a plain string "^\s*\D\w*\s*$" with invalid
        # escape sequences (DeprecationWarning/SyntaxWarning on modern
        # CPython); use a raw string.
        elif all(re.match(r"^\s*\D\w*\s*$", n) for n in found_names) and not all(
            dt == datashape.string for dt in dshape.measure.types
        ):
            header = 0
        else:
            header = None

    # Drop any kwargs pandas.read_csv does not understand.
    kwargs = keyfilter(keywords(pd.read_csv).__contains__, kwargs)
    return pd.read_csv(
        c.path,
        header=header,
        sep=sep,
        encoding=encoding,
        dtype=dtypes,
        parse_dates=parse_dates,
        names=names,
        compression=compression,
        chunksize=chunksize,
        usecols=usecols,
        **kwargs
    )
Exemple #44
0
def discover_h5py_dataset(d):
    """Datashape of an h5py dataset; ``object`` measures become ``string``."""
    ds = datashape.from_numpy(d.shape, d.dtype)
    if isrecord(ds.measure):
        replaced = record_dshape_replace(ds.measure, datashape.object_,
                                         datashape.string)
        return DataShape(*(ds.shape + (datashape.Record(list(replaced)), )))
    # Non-record: only a bare object dshape needs rewriting.
    if ds == datashape.object_:
        return DataShape(*(ds.shape + (datashape.string, )))
    return ds
Exemple #45
0
def _split_agg(expr, leaf=None, chunk=None, agg=None, keepdims=True):
    """Build the aggregate-side summary for a split summary expression."""
    d = {}
    for name, val in zip(expr.fields, expr.values):
        a, ae = split(leaf, val, keepdims=False)[1]
        measure = a.dshape.measure
        if isscalar(measure):  # For simple reductions
            d[name] = ae._subs({a: agg[name]})
        elif isrecord(measure):  # For reductions like mean/var
            # Prefix intermediate field names with the output field name.
            renamed = ['%s_%s' % (name, field) for field in a.fields]
            namedict = dict(zip(a.fields, renamed))
            d[name] = ae._subs(toolz.merge({a: agg}, namedict))

    return summary(**d)
Exemple #46
0
def _split_agg(expr, leaf=None, chunk=None, agg=None, keepdims=True):
    """Assemble the aggregate summary of a split summary expression."""
    out = {}
    for name, val in zip(expr.fields, expr.values):
        _, (a, ae) = split(leaf, val, keepdims=False)
        measure = a.dshape.measure
        if isscalar(measure):  # For simple reductions
            out[name] = ae._subs({a: agg[name]})
        elif isrecord(measure):  # For reductions like mean/var
            prefixed = ['%s_%s' % (name, field) for field in a.fields]
            out[name] = ae._subs(toolz.merge({a: agg},
                                             dict(zip(a.fields, prefixed))))

    return summary(**out)
Exemple #47
0
def is_nested_record(measure):
    """Return True when the ``Record`` dshape `measure` has non-scalar fields.

    Examples
    --------
    >>> from datashape import dshape
    >>> is_nested_record(dshape('{a: int32, b: int32}').measure)
    False
    >>> is_nested_record(dshape('{a: var * ?float64, b: ?string}').measure)
    True
    """
    if not isrecord(measure):
        raise TypeError(
            "Input must be a Record type got %s of type %r"
            % (measure, type(measure).__name__))
    return any(not isscalar(t) for t in measure.types)
Exemple #48
0
 def fields(self):
     """Field names of the measure, unwrapping Option types first."""
     measure = self.dshape.measure
     if isinstance(measure, Option):
         measure = measure.ty
     if isinstance(measure, Record):
         return measure.names
     if isinstance(measure, datashape.Map):
         # The guard inspects the raw (un-unwrapped) measure's value.
         if not isrecord(self.dshape.measure.value):
             raise TypeError('Foreign key must reference a '
                             'Record datashape')
         return measure.value.names
     if getattr(self, '_name', None) is not None:
         return [self._name]
     return []
Exemple #49
0
def record_dshape_replace(dshape, old, new):
    """Recursively replace all instances of `old` with `new` in the record
    dshape `dshape`.

    Yields ``(name, type)`` pairs suitable for building a ``Record``.

    Examples
    --------
    >>> from datashape import Record, string, object_, dshape
    >>> ds = DataShape(Record([('a', 'int64'),
    ...                        ('b', 10 * Record([('c', 'object')])),
    ...                        ('d', 'int64')]))
    ...
    >>> Record(list(record_dshape_replace(ds, object_, string)))
    dshape("{a: int64, b: 10 * {c: object}, d: int64}")
    """
    assert isrecord(dshape), 'input dshape must be a record'

    for name, subshape in dshape.measure.fields:
        if subshape == old:
            yield name, new
        else:
            if isrecord(subshape):
                # NOTE(review): this yields a bare generator, not a
                # (name, subshape) pair like the other branches; a consumer
                # doing Record(list(...)) would likely break if this branch
                # ever fired — confirm intended behavior.
                yield record_dshape_replace(subshape, old, new)
            else:
                yield name, subshape
Exemple #50
0
def is_nested_record(measure):
    """Predicate: does the Record dshape `measure` contain non-scalar fields?

    Examples
    --------
    >>> from datashape import dshape
    >>> is_nested_record(dshape('{a: int32, b: int32}').measure)
    False
    >>> is_nested_record(dshape('{a: var * ?float64, b: ?string}').measure)
    True
    """
    if isrecord(measure):
        return not all(isscalar(t) for t in measure.types)
    raise TypeError('Input must be a Record type got %s of type %r' %
                    (measure, type(measure).__name__))
Exemple #51
0
def record_dshape_replace(dshape, old, new):
    """Recursively replace all instances of `old` with `new` in the record
    dshape `dshape`.

    Yields ``(name, type)`` pairs suitable for building a ``Record``.

    Examples
    --------
    >>> from datashape import Record, string, object_, dshape
    >>> ds = DataShape(Record([('a', 'int64'),
    ...                        ('b', 10 * Record([('c', 'object')])),
    ...                        ('d', 'int64')]))
    ...
    >>> Record(list(record_dshape_replace(ds, object_, string)))
    dshape("{a: int64, b: 10 * {c: object}, d: int64}")
    """
    assert isrecord(dshape), 'input dshape must be a record'

    for name, subshape in dshape.measure.fields:
        if subshape == old:
            yield name, new
        else:
            if isrecord(subshape):
                # NOTE(review): yields a bare generator instead of a
                # (name, subshape) pair like the sibling branches; consumers
                # building Record(list(...)) would likely break if this
                # branch ever fired — confirm intended behavior.
                yield record_dshape_replace(subshape, old, new)
            else:
                yield name, subshape
Exemple #52
0
def create_from_datashape(engine,
                          ds,
                          schema=None,
                          foreign_keys=None,
                          primary_key=None,
                          **kwargs):
    """Create one SQL table per field of the record datashape ``ds``."""
    assert isrecord(ds), 'datashape must be Record type, got %s' % ds
    metadata = sa.MetaData(engine, schema=schema)
    for table_name, table_ds in ds[0].dict.items():
        # Each record field becomes its own table in the target database.
        dshape_to_table(table_name,
                        table_ds,
                        metadata=metadata,
                        foreign_keys=foreign_keys,
                        primary_key=primary_key).create()
    return engine
Exemple #53
0
def concrete_type(ds):
    """ A type into which we can safely deposit streaming data

    >>> concrete_type('5 * int').__name__
    'ndarray'
    >>> concrete_type('var * {name: string, amount: int}').__name__
    'DataFrame'
    """
    if isinstance(ds, (str, unicode)):
        ds = dshape(ds)
    if not iscollection(ds):
        return type(ds)
    # 1-d records fit a DataFrame; multi-dim or scalar measures fit an array.
    if isrecord(ds.measure) and ndim(ds) == 1:
        return pd.DataFrame
    if isscalar(ds.measure) or ndim(ds) > 1:
        return np.ndarray
    return list
Exemple #54
0
def sort(child, key=None, ascending=True):
    """ Sort collection

    Parameters
    ----------
    key: string, list of strings, Expr
        What to sort by.  Either a single column string
        (``t.sort('amount')``), a list of column strings
        (``t.sort(['name', 'amount'])``), or a table expression
        (``t.sort(-t.amount)``).
    ascending: bool
        Determines order of the sort
    """
    # Non-record measures carry a single implicit column; drop the key.
    if not isrecord(child.dshape.measure):
        key = None
    return Sort(child,
                tuple(key) if isinstance(key, list) else key,
                ascending)
Exemple #55
0
def sort(child, key=None, ascending=True):
    """ Sort a collection

    Parameters
    ----------
    key : str, list of str, or Expr
        Defines by what you want to sort.

          * A single column string: ``t.sort('amount')``
          * A list of column strings: ``t.sort(['name', 'amount'])``
          * An expression: ``t.sort(-t.amount)``

    ascending : bool, optional
        Determines order of the sort
    """
    measure = child.dshape.measure
    if not isrecord(measure):
        # A single implicit column: ignore any supplied key.
        key = None
    if isinstance(key, list):
        key = tuple(key)  # lists are not hashable; expressions need tuples
    return Sort(child, key, ascending)
Exemple #56
0
def dshape_to_numpy(ds):
    """ Convert a datashape to a NumPy dtype

    Parameters
    ----------
    ds : DataShape
        The DataShape instance to convert

    Returns
    -------
    np.dtype

    Examples
    --------
    >>> dshape_to_numpy('int32')
    dtype('int32')
    >>> dshape_to_numpy('?int32')
    dtype('float32')

    >>> dshape_to_numpy('{name: string[5, "ascii"], amount: ?int32}')
    dtype([('name', 'S5'), ('amount', '<f4')])

    >>> dshape_to_numpy('(int32, float32)')
    dtype([('f0', '<i4'), ('f1', '<f4')])
    """
    if isinstance(ds, str):
        ds = dshape(ds)
    if isinstance(ds, DataShape):
        ds = ds.measure
    if isrecord(ds):
        # Records map to structured dtypes keyed by field name.
        fields = [(str(name), unit_to_dtype(typ))
                  for name, typ in zip(ds.names, ds.types)]
        return np.dtype(fields)
    if isinstance(ds, Tuple):
        # Tuples get positional f0, f1, ... field names.
        fields = [('f%d' % i, unit_to_dtype(typ))
                  for i, typ in enumerate(ds.parameters[0])]
        return np.dtype(fields)
    return unit_to_dtype(ds)
Exemple #57
0
def coerce_core(result, dshape, odo_kwargs=None):
    """Coerce data to a core data type."""
    if iscoretype(result):
        return result
    kwargs = odo_kwargs or {}
    if isscalar(dshape):
        return coerce_scalar(result, dshape, odo_kwargs=odo_kwargs)
    if istabular(dshape) and isrecord(dshape.measure):
        return into(DataFrame, result, **kwargs)
    if iscollection(dshape):
        dim = _dimensions(dshape)
        if dim == 1:
            return into(Series, result, **kwargs)
        if dim > 1:
            return into(np.ndarray, result, **kwargs)
        msg = "Expr with dshape dimensions < 1 should have been handled earlier: dim={}"
        raise ValueError(msg.format(str(dim)))
    raise ValueError("Expr does not evaluate to a core return type")
Exemple #58
0
def sparksql_dataframe_to_list(df, dshape=None, **kwargs):
    """Collect a SparkSQL DataFrame into a list of rows (or bare values)."""
    rows = df.collect()
    # Non-record collection measures unwrap the single column of each row.
    if dshape is None or not iscollection(dshape) or isrecord(dshape.measure):
        return rows
    return list(map(get(0), rows))