Exemple #1
0
    def _get_py(self, key):
        """Return the rows (and optionally columns) selected by ``key``.

        ``key`` is either a row selector on its own or a two-element
        ``(rows, cols)`` tuple.  The row selector may be an integer, a list
        or a slice.

        Returns a single element for an integer row selector, the result of
        ``nth_list`` for a list, and a lazy ``islice`` for a slice.

        Raises ``IndexError`` for any other kind of row selector and
        ``AssertionError`` when a tuple key does not have exactly two items.
        """
        if isinstance(key, tuple):
            assert len(key) == 2
            rows, cols = key
            ds = self.dshape.subshape[rows, cols]
            # A column slice is forwarded to ``_iter`` as ``usecols=None``;
            # any other column selector is normalised with ``listpack``.
            usecols = None if isinstance(cols, slice) else listpack(cols)
        else:
            rows = key
            ds = self.dshape.subshape[rows]
            usecols = None

        # When the selected shape still starts with a dimension, coerce each
        # element against the shape below that dimension.
        if isinstance(ds, DataShape) and isdimension(ds[0]):
            ds = ds.subshape[0]

        seq = self._iter(usecols=usecols)
        if isinstance(key, tuple) and isinstance(cols, _strtypes + _inttypes):
            # A single column was requested: keep item 0 of every row.
            seq = pluck(0, seq)
        seq = coerce(ds, seq)

        if isinstance(rows, compatibility._inttypes):
            line = nth(rows, seq)
            # First treat ``line`` as an iterator whose first element has
            # ``item()``; fall back to ``line.item()`` and then to ``line``.
            try:
                return next(line).item()
            except TypeError:
                try:
                    return line.item()
                except AttributeError:
                    return line
        elif isinstance(rows, list):
            return nth_list(rows, seq)
        elif isinstance(rows, slice):
            return it.islice(seq, rows.start, rows.stop, rows.step)
        else:
            # Wrap ``rows`` in a one-element tuple so that a tuple-valued
            # selector is reported via IndexError instead of breaking the
            # ``%`` formatting with a TypeError.
            raise IndexError("key %r is not valid" % (rows,))
Exemple #2
0
    def _get_py(self, key):
        """Return the rows (and optionally columns) selected by ``key``.

        ``key`` is either a row selector on its own or a two-element
        ``(rows, cols)`` tuple.  The row selector may be an integer, a list
        or a slice.

        Returns a single element for an integer row selector, the result of
        ``nth_list`` for a list, and a lazy ``islice`` for a slice.

        Raises ``IndexError`` for any other kind of row selector and
        ``AssertionError`` when a tuple key does not have exactly two items.
        """
        if isinstance(key, tuple):
            assert len(key) == 2
            rows, cols = key
            ds = self.dshape.subshape[rows, cols]
            # A column slice is forwarded to ``_iter`` as ``usecols=None``;
            # any other column selector is normalised with ``listpack``.
            usecols = None if isinstance(cols, slice) else listpack(cols)
        else:
            rows = key
            ds = self.dshape.subshape[rows]
            usecols = None

        # When the selected shape still starts with a dimension, coerce each
        # element against the shape below that dimension.
        if isinstance(ds, DataShape) and isdimension(ds[0]):
            ds = ds.subshape[0]

        seq = self._iter(usecols=usecols)
        if isinstance(key, tuple) and isinstance(cols, _strtypes + _inttypes):
            # A single column was requested: keep item 0 of every row.
            seq = pluck(0, seq)
        seq = coerce(ds, seq)

        if isinstance(rows, compatibility._inttypes):
            line = nth(rows, seq)
            # First treat ``line`` as an iterator whose first element has
            # ``item()``; fall back to ``line.item()`` and then to ``line``.
            try:
                return next(line).item()
            except TypeError:
                try:
                    return line.item()
                except AttributeError:
                    return line
        elif isinstance(rows, list):
            return nth_list(rows, seq)
        elif isinstance(rows, slice):
            return it.islice(seq, rows.start, rows.stop, rows.step)
        else:
            # Wrap ``rows`` in a one-element tuple so that a tuple-valued
            # selector is reported via IndexError instead of breaking the
            # ``%`` formatting with a TypeError.
            raise IndexError("key %r is not valid" % (rows,))
Exemple #3
0
def coerce_to_ordered(ds, data):
    """ Turn dict-based records in ``data`` into positional tuples

    ``ds`` is a datashape, or a string that is parsed with ``dshape``.
    Record fields are emitted in the order given by the record's ``names``;
    data that is already a list or tuple at the record level is returned
    untouched.

    >>> from datashape import dshape

    >>> coerce_to_ordered('{x: int, y: int}', {'x': 1, 'y': 2})
    (1, 2)

    >>> coerce_to_ordered('var * {x: int, y: int}',
    ...                  [{'x': 1, 'y': 2}, {'x': 10, 'y': 20}])
    ((1, 2), (10, 20))

    Idempotent
    >>> coerce_to_ordered('var * {x: int, y: int}',
    ...                   ((1, 2), (10, 20)))
    ((1, 2), (10, 20))
    """
    if isinstance(ds, _strtypes):
        ds = dshape(ds)

    head = ds[0]
    if isinstance(head, Record):
        # Already positional: nothing to reorder.
        if isinstance(data, (list, tuple)):
            return data
        ordered = (coerce_to_ordered(head[field], data[field])
                   for field in head.names)
        return tuple(ordered)

    if not isdimension(head):
        # Neither a record nor a dimension: pass the value through.
        return data

    # Leading dimension: coerce every row against the remaining shape.
    return tuple(coerce_to_ordered(ds.subarray(1), row) for row in data)
Exemple #4
0
def coerce_to_ordered(ds, data):
    """ Turn dict-based records in ``data`` into positional tuples

    ``ds`` is a datashape, or a string that is parsed with ``dshape``.
    Record fields are emitted in the order given by the record's ``names``;
    data that is already a list or tuple at the record level is returned
    untouched.

    >>> from datashape import dshape

    >>> coerce_to_ordered('{x: int, y: int}', {'x': 1, 'y': 2})
    (1, 2)

    >>> coerce_to_ordered('var * {x: int, y: int}',
    ...                  [{'x': 1, 'y': 2}, {'x': 10, 'y': 20}])
    ((1, 2), (10, 20))

    Idempotent
    >>> coerce_to_ordered('var * {x: int, y: int}',
    ...                   ((1, 2), (10, 20)))
    ((1, 2), (10, 20))
    """
    if isinstance(ds, _strtypes):
        ds = dshape(ds)

    head = ds[0]
    if isinstance(head, Record):
        # Already positional: nothing to reorder.
        if isinstance(data, (list, tuple)):
            return data
        ordered = (coerce_to_ordered(head[field], data[field])
                   for field in head.names)
        return tuple(ordered)

    if not isdimension(head):
        # Neither a record nor a dimension: pass the value through.
        return data

    # Leading dimension: coerce every row against the remaining shape.
    return tuple(coerce_to_ordered(ds.subarray(1), row) for row in data)
Exemple #5
0
def into_string(uri, b, dshape=None, **kwargs):
    """Create the resource named by ``uri`` and load ``b`` into it.

    When ``dshape`` is not given it is obtained with ``discover(b)``.  The
    resource is created with a zero-length leading dimension when the
    datashape starts with a dimension, and with the datashape itself
    otherwise; the full datashape is passed on as ``expected_dshape``.

    Returns whatever ``into`` returns for the new resource and ``b``.
    """
    if dshape is None:
        dshape = discover(b)

    if isdimension(dshape[0]):
        resource_ds = 0 * dshape.subshape[0]
    else:
        resource_ds = dshape

    target = resource(uri, dshape=resource_ds, expected_dshape=dshape,
                      **kwargs)
    return into(target, b, dshape=dshape, **kwargs)
Exemple #6
0
def into_string(uri, b, dshape=None, **kwargs):
    """Create the resource named by ``uri`` and load ``b`` into it.

    When ``dshape`` is not given it is obtained with ``discover(b)``.  The
    resource is created with a zero-length leading dimension when the
    datashape starts with a dimension, and with the datashape itself
    otherwise; the full datashape is passed on as ``expected_dshape``.

    Returns whatever ``into`` returns for the new resource and ``b``.
    """
    if dshape is None:
        dshape = discover(b)

    if isdimension(dshape[0]):
        resource_ds = 0 * dshape.subshape[0]
    else:
        resource_ds = dshape

    target = resource(uri, dshape=resource_ds, expected_dshape=dshape,
                      **kwargs)
    return into(target, b, dshape=dshape, **kwargs)
Exemple #7
0
def into_string(uri, b, **kwargs):
    """Create the resource named by ``uri`` and load ``b`` into it.

    A ``dshape`` keyword is removed from ``kwargs``; when it is missing or
    falsy the datashape comes from ``discover(b)`` instead.  The resource is
    created with a zero-length leading dimension when the datashape starts
    with a dimension, and with the datashape itself otherwise.

    Returns whatever ``into`` returns for the new resource and ``b``.
    """
    ds = kwargs.pop('dshape', None) or discover(b)
    resource_ds = 0 * ds.subshape[0] if isdimension(ds[0]) else ds

    target = resource(uri, dshape=resource_ds, expected_dshape=ds, **kwargs)
    return into(target, b, dshape=ds, **kwargs)
Exemple #8
0
def into_string(uri, b, **kwargs):
    """Create the resource named by ``uri`` and load ``b`` into it.

    When the caller gives no ``dshape`` keyword, one is derived from
    ``discover(b)``, with a leading dimension replaced by ``var``.  If
    discovery raises ``NotImplementedError`` the resource is created without
    a ``dshape`` keyword.

    Returns whatever ``into`` returns for the new resource and ``b``.
    """
    if 'dshape' not in kwargs:
        try:
            discovered = discover(b)
            if isdimension(discovered[0]):
                discovered = var * discovered.subshape[0]
        except NotImplementedError:
            # Best effort only: carry on without a datashape.
            pass
        else:
            kwargs['dshape'] = discovered

    target = resource(uri, **kwargs)
    return into(target, b, **kwargs)
Exemple #9
0
def into_string(uri, b, **kwargs):
    """Create the resource named by ``uri`` and load ``b`` into it.

    A ``dshape`` keyword is removed from ``kwargs``; when it is missing or
    falsy the datashape comes from ``discover(b)`` instead.  The resource is
    created with a zero-length leading dimension when the datashape starts
    with a dimension, and with the datashape itself otherwise.

    Returns whatever ``into`` returns for the new resource and ``b``.
    """
    ds = kwargs.pop('dshape', None) or discover(b)
    resource_ds = 0 * ds.subshape[0] if isdimension(ds[0]) else ds

    target = resource(uri, dshape=resource_ds, expected_dshape=ds, **kwargs)
    return into(target, b, dshape=ds, **kwargs)
Exemple #10
0
def discover_pymongo_collection(coll, n=50):
    """Discover the datashape of a collection from its first ``n`` documents.

    Fields whose value in the first sampled document is an ``ObjectId`` are
    removed from every sampled document before discovery.

    Parameters
    ----------
    coll : collection object providing ``find()`` and ``count()``
    n : int
        Number of documents to sample.

    Returns
    -------
    ``coll.count()`` multiplied by the discovered per-document datashape.

    Raises
    ------
    ValueError
        If the discovered datashape does not start with a dimension.
    IndexError
        If the sample is empty (there is no first document to inspect).
    """
    items = list(take(n, coll.find()))
    oid_cols = [k for k, v in items[0].items() if isinstance(v, ObjectId)]
    for item in items:
        for col in oid_cols:
            # A later document may lack a field that the first one has, so
            # remove it tolerantly instead of failing with KeyError.
            item.pop(col, None)

    ds = discover(items)

    if isdimension(ds[0]):
        return coll.count() * ds.subshape[0]
    else:
        raise ValueError("Consistent datashape not found")
Exemple #11
0
def dshape_to_alchemy(dshape, primary_key=frozenset()):
    """ Translate a datashape into SQLAlchemy types or columns

    A record datashape yields a list of ``sa.Column`` objects; a column is
    flagged as primary key when its name is in ``primary_key``.  Raises
    ``NotImplementedError`` when no rule matches.

    >>> dshape_to_alchemy('int')
    <class 'sqlalchemy.sql.sqltypes.Integer'>

    >>> dshape_to_alchemy('string')
    <class 'sqlalchemy.sql.sqltypes.Text'>

    >>> dshape_to_alchemy('{name: string, amount: int}')
    [Column('name', Text(), table=None, nullable=False), Column('amount', Integer(), table=None, nullable=False)]

    >>> dshape_to_alchemy('{name: ?string, amount: ?int}')
    [Column('name', Text(), table=None), Column('amount', Integer(), table=None)]
    """
    if isinstance(dshape, str):
        dshape = datashape.dshape(dshape)

    # Wrapper types: recurse into the wrapped datashape.
    if isinstance(dshape, Map):
        return dshape_to_alchemy(dshape.key.measure, primary_key=primary_key)
    if isinstance(dshape, Option):
        return dshape_to_alchemy(dshape.ty, primary_key=primary_key)

    # Direct lookup by the datashape's string form.
    type_name = str(dshape)
    if type_name in types:
        return types[type_name]

    if isinstance(dshape, datashape.Record):
        columns = []
        for name, typ in dshape.parameters[0]:
            column_type = dshape_to_alchemy(getattr(typ, 'ty', typ),
                                            primary_key=primary_key)
            columns.append(sa.Column(name,
                                     column_type,
                                     primary_key=name in primary_key,
                                     nullable=isinstance(typ[0], Option)))
        return columns

    if isinstance(dshape, datashape.DataShape):
        # Skip a leading dimension; otherwise use the first component.
        position = 1 if isdimension(dshape[0]) else 0
        return dshape_to_alchemy(dshape[position], primary_key=primary_key)

    if isinstance(dshape, datashape.String):
        fixlen = dshape[0].fixlen
        if fixlen is None:
            return sa.TEXT
        by_encoding = {'U': sa.Unicode, 'A': sa.String}
        assert dshape.encoding is not None
        string_type = by_encoding[dshape.encoding[0]]
        return string_type(length=fixlen)

    if isinstance(dshape, datashape.DateTime):
        has_tz = dshape.tz is not None
        return sa.DATETIME(timezone=has_tz)

    if isinstance(dshape, datashape.Decimal):
        return sa.NUMERIC(dshape.precision, dshape.scale)

    raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s" %
                              dshape)
Exemple #12
0
def dshape_to_alchemy(dshape, primary_key=frozenset()):
    """ Translate a datashape into SQLAlchemy types or columns

    A record datashape yields a list of ``sa.Column`` objects; a column is
    flagged as primary key when its name is in ``primary_key``.  Raises
    ``NotImplementedError`` when no rule matches.

    >>> dshape_to_alchemy('int')
    <class 'sqlalchemy.sql.sqltypes.Integer'>

    >>> dshape_to_alchemy('string')
    <class 'sqlalchemy.sql.sqltypes.Text'>

    >>> dshape_to_alchemy('{name: string, amount: int}')
    [Column('name', Text(), table=None, nullable=False), Column('amount', Integer(), table=None, nullable=False)]

    >>> dshape_to_alchemy('{name: ?string, amount: ?int}')
    [Column('name', Text(), table=None), Column('amount', Integer(), table=None)]
    """
    if isinstance(dshape, str):
        dshape = datashape.dshape(dshape)

    # Wrapper types: recurse into the wrapped datashape.
    if isinstance(dshape, Map):
        return dshape_to_alchemy(dshape.key.measure, primary_key=primary_key)
    if isinstance(dshape, Option):
        return dshape_to_alchemy(dshape.ty, primary_key=primary_key)

    # Direct lookup by the datashape's string form.
    type_name = str(dshape)
    if type_name in types:
        return types[type_name]

    if isinstance(dshape, datashape.Record):
        columns = []
        for name, typ in dshape.parameters[0]:
            column_type = dshape_to_alchemy(
                getattr(typ, "ty", typ), primary_key=primary_key
            )
            columns.append(
                sa.Column(
                    name,
                    column_type,
                    primary_key=name in primary_key,
                    nullable=isinstance(typ[0], Option),
                )
            )
        return columns

    if isinstance(dshape, datashape.DataShape):
        # Skip a leading dimension; otherwise use the first component.
        position = 1 if isdimension(dshape[0]) else 0
        return dshape_to_alchemy(dshape[position], primary_key=primary_key)

    if isinstance(dshape, datashape.String):
        fixlen = dshape[0].fixlen
        if fixlen is None:
            return sa.TEXT
        by_encoding = {"U": sa.Unicode, "A": sa.String}
        assert dshape.encoding is not None
        string_type = by_encoding[dshape.encoding[0]]
        return string_type(length=fixlen)

    if isinstance(dshape, datashape.DateTime):
        has_tz = dshape.tz is not None
        return sa.DATETIME(timezone=has_tz)

    if isinstance(dshape, datashape.Decimal):
        return sa.NUMERIC(dshape.precision, dshape.scale)

    raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s" % dshape)
Exemple #13
0
def rdd_to_sqlcontext(ctx, rdd, name=None, dshape=None, **kwargs):
    """ Apply a schema to ``rdd`` on ``ctx``, then register and cache it

    The schema is built by ``dshape_to_schema`` from ``dshape``, using its
    measure when the datashape starts with a dimension.  The table is
    registered under ``name``, or under the next value from ``_names`` when
    ``name`` is None.  Returns the object produced by ``ctx.applySchema``.

    ``dshape`` must be supplied: the default of None fails on the
    ``.parameters`` access.  Extra keyword arguments are accepted but not
    used here.
    """
    # TODO: assumes that we don't have e.g., 10 * 10 * {x: int, y: int}
    record_ds = dshape.measure if isdimension(dshape.parameters[0]) else dshape
    sdf = ctx.applySchema(rdd, dshape_to_schema(record_ds))

    table_name = next(_names) if name is None else name
    register_table(ctx, sdf, name=table_name)
    ctx.cacheTable(table_name)
    return sdf
Exemple #14
0
def dshape_to_alchemy(dshape):
    """ Translate a datashape into SQLAlchemy types or columns

    A record datashape yields a list of ``sa.Column`` objects.  Raises
    ``NotImplementedError`` when no rule matches.

    >>> dshape_to_alchemy('int')
    <class 'sqlalchemy.sql.sqltypes.Integer'>

    >>> dshape_to_alchemy('string')
    <class 'sqlalchemy.sql.sqltypes.Text'>

    >>> dshape_to_alchemy('{name: string, amount: int}')
    [Column('name', Text(), table=None, nullable=False), Column('amount', Integer(), table=None, nullable=False)]

    >>> dshape_to_alchemy('{name: ?string, amount: ?int}')
    [Column('name', Text(), table=None), Column('amount', Integer(), table=None)]
    """
    if isinstance(dshape, str):
        dshape = datashape.dshape(dshape)
    if isinstance(dshape, Option):
        return dshape_to_alchemy(dshape.ty)

    # Direct lookup by the datashape's string form.
    type_name = str(dshape)
    if type_name in types:
        return types[type_name]

    if isinstance(dshape, datashape.Record):
        columns = []
        for name, typ in dshape.parameters[0]:
            columns.append(sa.Column(name,
                                     dshape_to_alchemy(typ),
                                     nullable=isinstance(typ[0], Option)))
        return columns

    if isinstance(dshape, datashape.DataShape):
        # Skip a leading dimension; otherwise use the first component.
        position = 1 if isdimension(dshape[0]) else 0
        return dshape_to_alchemy(dshape[position])

    if isinstance(dshape, datashape.String):
        if dshape[0].fixlen is None:
            return sa.types.Text
        encoding = dshape.encoding
        if 'U' in encoding:
            return sa.types.Unicode(length=dshape[0].fixlen)
        if 'A' in encoding:
            return sa.types.String(length=dshape[0].fixlen)
        # Any other encoding falls through to the checks below.

    if isinstance(dshape, datashape.DateTime):
        return sa.types.DateTime(timezone=bool(dshape.tz))

    raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s" %
                              dshape)
Exemple #15
0
def deoption(ds):
    """ Strip an ``Option`` wrapper from a datashape, if there is one

    Strings are parsed with ``dshape`` first.  A ``DataShape`` that does not
    start with a dimension is unwrapped to its first component before the
    check; anything that is not an ``Option`` is returned as-is.

    >>> deoption('int32')
    ctype("int32")

    >>> deoption('?int32')
    ctype("int32")
    """
    if isinstance(ds, str):
        ds = dshape(ds)

    unwrappable = isinstance(ds, DataShape) and not isdimension(ds[0])
    if unwrappable:
        return deoption(ds[0])

    return ds.ty if isinstance(ds, Option) else ds
Exemple #16
0
def dshape_to_alchemy(dshape):
    """ Translate a datashape into SQLAlchemy types or columns

    A record datashape yields a list of ``sa.Column`` objects.  Raises
    ``NotImplementedError`` when no rule matches.

    >>> dshape_to_alchemy('int')
    <class 'sqlalchemy.sql.sqltypes.Integer'>

    >>> dshape_to_alchemy('string')
    <class 'sqlalchemy.sql.sqltypes.Text'>

    >>> dshape_to_alchemy('{name: string, amount: int}')
    [Column('name', Text(), table=None, nullable=False), Column('amount', Integer(), table=None, nullable=False)]

    >>> dshape_to_alchemy('{name: ?string, amount: ?int}')
    [Column('name', Text(), table=None), Column('amount', Integer(), table=None)]
    """
    if isinstance(dshape, str):
        dshape = datashape.dshape(dshape)
    if isinstance(dshape, Option):
        return dshape_to_alchemy(dshape.ty)

    # Direct lookup by the datashape's string form.
    type_name = str(dshape)
    if type_name in types:
        return types[type_name]

    if isinstance(dshape, datashape.Record):
        columns = []
        for name, typ in dshape.parameters[0]:
            columns.append(sa.Column(name,
                                     dshape_to_alchemy(typ),
                                     nullable=isinstance(typ[0], Option)))
        return columns

    if isinstance(dshape, datashape.DataShape):
        # Skip a leading dimension; otherwise use the first component.
        position = 1 if isdimension(dshape[0]) else 0
        return dshape_to_alchemy(dshape[position])

    if isinstance(dshape, datashape.String):
        fixlen = dshape[0].fixlen
        if fixlen is None:
            return sa.types.Text
        by_encoding = {'U': sa.types.Unicode, 'A': sa.types.String}
        assert dshape.encoding is not None
        string_type = by_encoding[dshape.encoding[0]]
        return string_type(length=fixlen)

    if isinstance(dshape, datashape.DateTime):
        return sa.types.DateTime(timezone=bool(dshape.tz))

    raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s"
                              % dshape)
Exemple #17
0
def dshape_to_schema(ds):
    """Convert datashape to SparkSQL type system.

    Strings are parsed with ``dshape`` first.  Records become a
    ``StructType``, datashapes starting with a dimension become an
    ``ArrayType``, and other types are looked up in ``dshape_to_sparksql``.

    Examples
    --------
    >>> print(dshape_to_schema('int32'))  # doctest: +SKIP
    IntegerType
    >>> print(dshape_to_schema('5 * int32'))  # doctest: +SKIP
    ArrayType(IntegerType,false)
    >>> print(dshape_to_schema('5 * ?int32'))  # doctest: +SKIP
    ArrayType(IntegerType,true)
    >>> print(dshape_to_schema('{name: string, amount: int32}'))  # doctest: +SKIP
    StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,false)))
    >>> print(dshape_to_schema('10 * {name: string, amount: ?int32}'))  # doctest: +SKIP
    ArrayType(StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,true))),false)

    Raises
    ------
    TypeError
        If ``ds`` is a ``Tuple`` datashape (a ``Record`` is required).
    NotImplementedError
        If no conversion rule matches ``ds``.
    """
    if isinstance(ds, str):
        return dshape_to_schema(dshape(ds))
    if isinstance(ds, Tuple):
        raise TypeError('Please provide a Record dshape for these column '
                        'types: %s' % (ds.dshapes,))
    if isinstance(ds, Record):
        # One StructField per record field; the third argument is True for
        # fields whose type is an Option.
        return StructType([
            StructField(name,
                        dshape_to_schema(deoption(typ)),
                        isinstance(typ, datashape.Option))
            for name, typ in ds.fields])
    if isinstance(ds, DataShape):
        if isdimension(ds[0]):
            elem = ds.subshape[0]
            # Unwrap a single-component DataShape so the Option check below
            # sees the element type itself.
            if isinstance(elem, DataShape) and len(elem) == 1:
                elem = elem[0]
            return ArrayType(dshape_to_schema(deoption(elem)),
                             isinstance(elem, Option))
        else:
            return dshape_to_schema(ds[0])
    if ds in dshape_to_sparksql:
        return dshape_to_sparksql[ds]
    # Name the offending datashape so the failure can be diagnosed.
    raise NotImplementedError("No SparkSQL type match for datashape: %s"
                              % (ds,))
Exemple #18
0
def ordered_index(ind, ds):
    """ Transform a named index into an ordered one

    Field names in ``ind`` are replaced by their position in the matching
    record of ``ds``; integers and slices are returned unchanged, and lists
    and tuples are converted element by element.  ``ds`` may be a datashape
    or a string that is parsed with ``dshape``.

    Raises ``NotImplementedError`` when no rule matches ``ind`` and ``ds``.

    >>> ordered_index(1, '3 * int')
    1
    >>> ordered_index('name', '{name: string, amount: int}')
    0
    >>> ordered_index((0, 0), '3 * {x: int, y: int}')
    (0, 0)
    >>> ordered_index([0, 1], '3 * {x: int, y: int}')
    [0, 1]
    >>> ordered_index(([0, 1], 'x'), '3 * {x: int, y: int}')
    ([0, 1], 0)
    >>> ordered_index((0, 'x'), '3 * {x: int, y: int}')
    (0, 0)
    >>> ordered_index((0, [0, 1]), '3 * {x: int, y: int}')
    (0, [0, 1])
    >>> ordered_index((0, ['x', 'y']), '3 * {x: int, y: int}')
    (0, [0, 1])
    """
    if isinstance(ds, _strtypes):
        ds = dshape(ds)
    if isinstance(ind, (int, slice)):
        return ind
    if isinstance(ind, list):
        return [ordered_index(i, ds) for i in ind]
    if isinstance(ind, _strtypes) and isinstance(ds[0], Record):
        return ds[0].names.index(ind)
    if isinstance(ind, tuple) and not ind:
        return ()
    if isdimension(ds[0]):
        # The first index addresses the dimension and is kept as-is; the
        # rest is resolved against the shape below that dimension.
        return (ind[0],) + tupleit(ordered_index(ind[1:], ds.subshape[0]))
    if isinstance(ind, tuple):
        return ((ordered_index(ind[0], ds),)
                + tupleit(ordered_index(ind[1:], ds.subshape[0])))
    # Both values are strings, so both placeholders must be %s; the previous
    # "%d" raised TypeError here instead of NotImplementedError.
    raise NotImplementedError("Rule for ind: %s, ds: %s not found"
                              % (str(ind), str(ds)))
Exemple #19
0
def ordered_index(ind, ds):
    """ Transform a named index into an ordered one

    Field names in ``ind`` are replaced by their position in the matching
    record of ``ds``; integers and slices are returned unchanged, and lists
    and tuples are converted element by element.  ``ds`` may be a datashape
    or a string that is parsed with ``dshape``.

    Raises ``NotImplementedError`` when no rule matches ``ind`` and ``ds``.

    >>> ordered_index(1, '3 * int')
    1
    >>> ordered_index('name', '{name: string, amount: int}')
    0
    >>> ordered_index((0, 0), '3 * {x: int, y: int}')
    (0, 0)
    >>> ordered_index([0, 1], '3 * {x: int, y: int}')
    [0, 1]
    >>> ordered_index(([0, 1], 'x'), '3 * {x: int, y: int}')
    ([0, 1], 0)
    >>> ordered_index((0, 'x'), '3 * {x: int, y: int}')
    (0, 0)
    >>> ordered_index((0, [0, 1]), '3 * {x: int, y: int}')
    (0, [0, 1])
    >>> ordered_index((0, ['x', 'y']), '3 * {x: int, y: int}')
    (0, [0, 1])
    """
    if isinstance(ds, _strtypes):
        ds = dshape(ds)
    if isinstance(ind, (int, slice)):
        return ind
    if isinstance(ind, list):
        return [ordered_index(i, ds) for i in ind]
    if isinstance(ind, _strtypes) and isinstance(ds[0], Record):
        return ds[0].names.index(ind)
    if isinstance(ind, tuple) and not ind:
        return ()
    if isdimension(ds[0]):
        # The first index addresses the dimension and is kept as-is; the
        # rest is resolved against the shape below that dimension.
        return (ind[0], ) + tupleit(ordered_index(ind[1:], ds.subshape[0]))
    if isinstance(ind, tuple):
        return ((ordered_index(ind[0], ds), ) +
                tupleit(ordered_index(ind[1:], ds.subshape[0])))
    # Both values are strings, so both placeholders must be %s; the previous
    # "%d" raised TypeError here instead of NotImplementedError.
    raise NotImplementedError("Rule for ind: %s, ds: %s not found" %
                              (str(ind), str(ds)))
Exemple #20
0
 def __init__(self, descriptors):
     """Keep ``descriptors`` on the instance.

     Asserts that the ``dshape`` of every descriptor starts with a
     dimension; the check is skipped when assertions are disabled.
     """
     assert all(isdimension(desc.dshape[0]) for desc in descriptors)
     self.descriptors = descriptors
Exemple #21
0
 def __init__(self, descriptors):
     """Keep ``descriptors`` on the instance.

     Asserts that the ``dshape`` of every descriptor starts with a
     dimension; the check is skipped when assertions are disabled.
     """
     assert all(isdimension(desc.dshape[0]) for desc in descriptors)
     self.descriptors = descriptors