Exemple #1
    def sparksql_to_ds(ss):
        """ Convert a SparkSQL type to the equivalent datashape type

        Types present in ``rev_types`` are translated by direct lookup.  An
        ``ArrayType`` becomes a ``var`` dimension over its element type and a
        ``StructType`` becomes a ``Record``; elements and fields that may
        contain nulls are wrapped in ``Option``.

        >>> sparksql_to_ds(IntegerType())  # doctest: +SKIP
        ctype("int64")

        >>> sparksql_to_ds(ArrayType(IntegerType(), False))  # doctest: +SKIP
        dshape("var * int64")

        >>> sparksql_to_ds(ArrayType(IntegerType(), True))  # doctest: +SKIP
        dshape("var * ?int64")

        >>> sparksql_to_ds(StructType([  # doctest: +SKIP
        ...                         StructField('name', StringType(), False),
        ...                         StructField('amount', IntegerType(), True)]))
        dshape("{ name : string, amount : ?int64 }")

        Raises
        ------
        NotImplementedError
            If ``ss`` is none of the SparkSQL types handled above.
        """
        if ss in rev_types:
            return rev_types[ss]
        if isinstance(ss, ArrayType):
            elem = sparksql_to_ds(ss.elementType)
            if ss.containsNull:
                return datashape.var * Option(elem)
            else:
                return datashape.var * elem
        if isinstance(ss, StructType):
            # One [name, type] pair per struct field; nullable fields are
            # marked optional.
            return dshape(Record([
                [field.name,
                 Option(sparksql_to_ds(field.dataType))
                 if field.nullable else sparksql_to_ds(field.dataType)]
                for field in ss.fields]))
        raise NotImplementedError("SparkSQL type not known %s" % ss)
Exemple #2
def schema_to_dshape(schema):
    """Translate a SparkSQL schema object into a datashape.

    The exact type of ``schema`` is first looked up in the
    ``sparksql_to_dshape`` mapping.  ``ArrayType`` and ``StructType`` are
    handled recursively: arrays become ``var * element`` and structs become a
    ``Record``, with ``Option`` wrapped around anything declared nullable.

    Raises
    ------
    NotImplementedError
        If ``schema`` is not a known type, an ``ArrayType`` or a
        ``StructType``.
    """
    if type(schema) in sparksql_to_dshape:
        return sparksql_to_dshape[type(schema)]
    if isinstance(schema, ArrayType):
        # Named ``element`` so the local does not shadow a ``dshape`` helper.
        element = schema_to_dshape(schema.elementType)
        return datashape.var * (Option(element)
                                if schema.containsNull else element)
    if isinstance(schema, StructType):
        fields = [(field.name, Option(schema_to_dshape(field.dataType))
                   if field.nullable else schema_to_dshape(field.dataType))
                  for field in schema.fields]
        return datashape.dshape(Record(fields))
    raise NotImplementedError('SparkSQL type not known %r' %
                              type(schema).__name__)
Exemple #3
def dshape_from_pandas(col):
    """Return the datashape measure describing a pandas column.

    * A column whose dtype is an instance of ``categorical`` maps to a
      ``Categorical`` built from the column's categories.
    * A datetime column (dtype kind ``'M'``) maps to ``Option(DateTime)``;
      a timezone, when the dtype carries one, is passed on as a string.
    * Any other dtype is converted with ``CType.from_numpy_dtype``; the
      ``object_`` measure is reported as ``string`` and measures listed in
      ``possibly_missing`` are wrapped in ``Option``.
    """
    if isinstance(col.dtype, categorical):
        return Categorical(col.cat.categories.tolist())

    if col.dtype.kind == 'M':
        zone = getattr(col.dtype, 'tz', None)
        # Pandas keeps the timezone as a tzinfo object; DataShape expects
        # its string form.
        return Option(DateTime(tz=None if zone is None else str(zone)))

    measure = datashape.CType.from_numpy_dtype(col.dtype)
    if measure == object_:
        measure = string
    if measure in possibly_missing:
        return Option(measure)
    return measure
Exemple #4
def discover_csv(c, nrows=1000, **kwargs):
    """Discover the datashape of a CSV source by sampling it with pandas.

    Parameters
    ----------
    c
        The CSV object handed to ``csv_to_DataFrame``.
    nrows : int, optional
        Number of rows to read for the sample.
    **kwargs
        Forwarded to ``csv_to_DataFrame`` for the first read.

    Returns
    -------
    ``datashape.var * Record`` with one field per column; a field is wrapped
    in ``Option`` when the sampled column contains nulls.
    """
    df = csv_to_DataFrame(c, nrows=nrows, **kwargs)
    df = coerce_datetimes(df)

    # If the labels are not the default 0..n-1 range and at least one of them
    # consists only of digits, '-' and '_', the "header" is taken to be a data
    # row and the file is re-read without a header.
    if (not list(df.columns) == list(range(len(df.columns)))
            and any(re.match(r'^[-\d_]*$', col) for col in df.columns)):
        df = csv_to_DataFrame(c, chunksize=50, has_header=False).get_chunk()
        df = coerce_datetimes(df)

    columns = [
        str(col) if not isinstance(col, (str, unicode)) else col
        for col in df.columns
    ]
    df.columns = [col.strip() for col in columns]

    # Replace np.nan with None.  Forces type string rather than float
    for col in df.columns:
        if df[col].count() == 0:
            df[col] = [None] * len(df)

    measure = discover(df).measure

    # Use Series.notnull to determine Option-ness
    measure2 = Record([
        [name,
         Option(typ)
         if (~df[name].notnull()).any() and not isinstance(typ, Option)
         else typ]
        for name, typ in zip(measure.names, measure.types)])

    return datashape.var * measure2
Exemple #5
class last(Reduction):
    """Last value encountered in ``column``.

    Useful for categorical data where an actual value must always be returned,
    not an average or other numerical calculation.

    Currently only supported for rasters, externally to this class.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. If the data type is floating point,
        ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    # The hooks below take no ``self``; they are declared static so they can
    # be reached through an instance as well as through the class.
    @staticmethod
    def _append(x, y, agg):
        raise NotImplementedError("last is currently implemented only for rasters")

    @staticmethod
    def _create(shape, array_module):
        raise NotImplementedError("last is currently implemented only for rasters")

    @staticmethod
    def _combine(aggs):
        raise NotImplementedError("last is currently implemented only for rasters")

    @staticmethod
    def _finalize(bases, **kwargs):
        raise NotImplementedError("last is currently implemented only for rasters")
Exemple #6
class _upsample(Reduction):
    """Special internal class used for upsampling"""
    _dshape = dshape(Option(ct.float64))

    @staticmethod
    def _finalize(bases, cuda=False, **kwargs):
        # ``cuda`` is accepted but not used; the first base is wrapped as-is.
        return xr.DataArray(bases[0], **kwargs)

    def inputs(self):
        return (extract(self.column),)

    @staticmethod
    def _create(shape, array_module):
        # Use uninitialized memory, the upsample function must explicitly set unused
        # values to nan
        return array_module.empty(shape, dtype='f8')

    @staticmethod
    def _append(x, y, agg, field):
        # not called, the upsample function must set agg directly
        pass

    @staticmethod
    def _append_cuda(x, y, agg, field):
        # not called, the upsample function must set agg directly
        pass

    @staticmethod
    def _combine(aggs):
        # Element-wise maximum across the stacked aggregates, ignoring NaNs.
        return np.nanmax(aggs, axis=0)
Exemple #7
 def _dtype(self):
     """Return the boolean measure of this expression.

     The result is ``Option(ct.bool_)`` when the discovered measure of either
     operand is an ``Option``, and plain ``ct.bool_`` otherwise.
     """
     # .schema / .datashape cannot be used directly because an operand may
     # be a bare integer, for example; discover() is applied to both sides.
     measures = (discover(self.lhs).measure, discover(self.rhs).measure)
     if any(isinstance(measure, Option) for measure in measures):
         return Option(ct.bool_)
     return ct.bool_
Exemple #8
class mode(Reduction):
    """Mode (most common value) of all the values encountered in ``column``.

    Useful for categorical data where an actual value must always be returned,
    not an average or other numerical calculation.

    Currently only supported for rasters, externally to this class.
    Implementing it for other glyph types would be difficult due to potentially
    unbounded data storage requirements to store indefinite point or line
    data per pixel.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. If the data type is floating point,
        ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    # The hooks below take no ``self``; they are declared static so they can
    # be reached through an instance as well as through the class.
    @staticmethod
    def _append(x, y, agg):
        raise NotImplementedError("mode is currently implemented only for rasters")

    @staticmethod
    def _create(shape, array_module):
        raise NotImplementedError("mode is currently implemented only for rasters")

    @staticmethod
    def _combine(aggs):
        raise NotImplementedError("mode is currently implemented only for rasters")

    @staticmethod
    def _finalize(bases, **kwargs):
        raise NotImplementedError("mode is currently implemented only for rasters")
Exemple #9
    def schema(self):
        """Schema of the joined table.

        Join-key columns of the left side come first, followed by the
        remaining left columns and then the remaining right columns.  Columns
        coming from the side that may have no match (left columns for
        ``how`` in ``'right'``/``'outer'``, right columns for ``how`` in
        ``'left'``/``'outer'``) are made ``Option``.

        NOTE(review): the examples read ``.schema`` as an attribute, so this
        is presumably exposed as a property by a decorator not visible here
        -- confirm.

        Examples
        --------
        >>> t = Symbol('t', 'var * {name: string, amount: int}')
        >>> s = Symbol('t', 'var * {name: string, id: int}')

        >>> join(t, s).schema
        dshape("{ name : string, amount : int32, id : int32 }")

        >>> join(t, s, how='left').schema
        dshape("{ name : string, amount : int32, id : ?int32 }")
        """
        option = lambda dt: dt if isinstance(dt, Option) else Option(dt)

        joined = [[name, dt] for name, dt in self.lhs.schema[0].parameters[0]
                  if name in self.on_left]

        left = [[name, dt] for name, dt in self.lhs.schema[0].parameters[0]
                if name not in self.on_left]

        right = [[name, dt] for name, dt in self.rhs.schema[0].parameters[0]
                 if name not in self.on_right]

        if self.how in ('right', 'outer'):
            left = [[name, option(dt)] for name, dt in left]
        if self.how in ('left', 'outer'):
            right = [[name, option(dt)] for name, dt in right]

        return dshape(Record(joined + left + right))
Exemple #10
class StrFind(ElemWise):
    """
    Find literal substring in string column.
    """

    # Names of the expression's arguments: the child expression and the
    # substring to look for.
    _arguments = '_child', 'sub'
    # The result is declared as an optional 64-bit integer.
    schema = Option(datashape.int64)
Exemple #11
def discover_dataframe(df):
    """Return the datashape of a pandas DataFrame.

    Each column dtype is converted with ``CType.from_numpy_dtype``; the
    ``object_`` measure is reported as ``string`` and measures listed in
    ``possibly_missing`` are wrapped in ``Option``.  The result is
    ``len(df) * Record`` with fields in column order.
    """
    names = list(df.columns)
    measures = []
    for np_dtype in df.dtypes:
        measure = datashape.CType.from_numpy_dtype(np_dtype)
        if measure == object_:
            measure = string
        measures.append(measure)

    fields = [
        (name, Option(measure) if measure in possibly_missing else measure)
        for name, measure in zip(names, measures)
    ]
    return len(df) * datashape.Record(fields)
Exemple #12
def _subexpr_optimize(expr):
    if expr.axis != tuple(range(expr._child.ndim)):
        raise ValueError("Cannot perform 'all' over an axis: %s")
    if expr.keepdims:
        raise ValueError("Cannot perform 'all' with keepdims=True")

    return (~expr._child).coerce(Option(int32)
                                 if isinstance(expr._child.dshape, Option) else
                                 int32).sum() == 0
Exemple #13
    def schema(self):
        """Return the measure of the shifted expression.

        The child's measure is returned unchanged when no shift is applied
        (``self.n`` is falsy) or when it is already an ``Option``; otherwise
        it is wrapped in ``Option``.
        """
        measure = self._child.schema.measure

        # if we are not shifting or we are already an Option type then return
        # the child's schema
        if not self.n or isinstance(measure, Option):
            return measure
        else:
            return Option(measure)
Exemple #14
class std(Reduction):
    """Reduction whose result is declared as an optional float64."""
    _dshape = dshape(Option(ct.float64))

    def _bases(self):
        """Return the three base reductions built over ``self.column``.

        NOTE(review): ``sum``, ``count`` and ``m2`` are resolved from the
        enclosing module; presumably they are sibling reductions rather than
        builtins -- confirm.
        """
        return tuple(reduction(self.column) for reduction in (sum, count, m2))

    def _build_finalize(self, dshape):
        """Return the finalizer; ``dshape`` is accepted but not consulted."""
        return finalize_std
Exemple #15
class FloatingReduction(Reduction):
    """Base class for reductions whose result is an optional float64."""
    _dshape = dshape(Option(ct.float64))

    # Both hooks take no ``self``; they are declared static so that looking
    # them up on an instance does not bind the instance as first argument.
    @staticmethod
    def _create(shape):
        """Return a float64 array of ``shape`` filled with NaN."""
        return np.full(shape, np.nan, dtype='f8')

    @staticmethod
    def _finalize(bases, **kwargs):
        """Wrap the first base aggregate in an ``xr.DataArray``."""
        return xr.DataArray(bases[0], **kwargs)
Exemple #16
def check_roundtrip_null_values(table_uri,
    """Check the data roundtrip through postgres using warp_prism to read the

    table_uri : str
        The uri to a unique table.
    data : iterable[any]
        The input data.
    dtype : str
        The dtype of the data.
    sqltype : type
        The sqlalchemy type of the data.
    null_values : dict[str, any]
        The value to coerce ``NULL`` to.
    astype : bool, optional
        Coerce the input data to the given dtype before making assertions about
        the output data.
    table = resource(table_uri, dshape=var * R['a':Option(dtype)])
    # Ensure that odo created the table correctly. If these fail the other
    # tests are not well defined.
    assert table.columns.keys() == ['a']
    assert isinstance(table.columns['a'].type, sqltype)
    table.insert().values([{'a': v} for v in data]).execute()

    arrays = to_arrays(table)
    assert len(arrays) == 1
    array, actual_mask = arrays['a']
    assert (actual_mask == mask).all()
    assert (array[mask] == data[mask]).all()

    output_dataframe = to_dataframe(table, null_values=null_values)
    if astype:
        data = data.astype(dshape(dtype).measure.to_numpy_dtype())
    expected_dataframe = pd.DataFrame({'a': data})
    expected_dataframe[~mask] = null_values.get(
        null_values_for_type[array.dtype if array.dtype.kind != 'M' else np.
Exemple #17
class FloatingReduction(Reduction):
    """Base classes for reductions that always have floating-point dtype."""
    _dshape = dshape(Option(ct.float64))

    # Both hooks take no ``self``; they are declared static so that looking
    # them up on an instance does not bind the instance as first argument.
    @staticmethod
    def _create(shape):
        """Return a float64 array of ``shape`` filled with NaN."""
        return np.full(shape, np.nan, dtype='f8')

    @staticmethod
    def _finalize(bases, **kwargs):
        """Wrap the first base aggregate in an ``xr.DataArray``."""
        return xr.DataArray(bases[0], **kwargs)
Exemple #18
class std(Reduction):
    """Standard deviation reduction; the result is an optional float64."""
    _dshape = dshape(Option(ct.float64))

    def _bases(self):
        # NOTE(review): ``sum`` and ``count`` are presumably sibling
        # reductions rather than the builtins -- confirm.
        return (sum(self.column), count(self.column), m2(self.column))

    # Takes no ``self``; declared static so instance lookup does not bind.
    @staticmethod
    def _finalize(bases, **kwargs):
        """Combine the base aggregates into ``sqrt(m2s / counts)``.

        ``bases`` must unpack into three aggregates (sums, counts, m2s); the
        sums are not used.  NumPy divide/invalid warnings are suppressed for
        the division.
        """
        sums, counts, m2s = bases
        with np.errstate(divide='ignore', invalid='ignore'):
            x = np.sqrt(m2s / counts)
        return xr.DataArray(x, **kwargs)
Exemple #19
    def schema(self):
        """Schema of the joined table.

        Join-key columns of the left side come first, followed by the
        remaining left columns and then the remaining right columns.  Columns
        coming from the side that may have no match are made ``Option``.

        NOTE(review): the examples read ``.schema`` as an attribute, so this
        is presumably exposed as a property by a decorator not visible here
        -- confirm.

        Examples
        --------
        >>> from blaze import symbol
        >>> t = symbol('t', 'var * {name: string, amount: int}')
        >>> s = symbol('t', 'var * {name: string, id: int}')

        >>> join(t, s).schema
        dshape("{name: string, amount: int32, id: int32}")

        >>> join(t, s, how='left').schema
        dshape("{name: string, amount: int32, id: ?int32}")

        Overlapping but non-joined fields append _left, _right

        >>> a = symbol('a', 'var * {x: int, y: int}')
        >>> b = symbol('b', 'var * {x: int, y: int}')
        >>> join(a, b, 'x').fields
        ['x', 'y_left', 'y_right']
        """
        option = lambda dt: dt if isinstance(dt, Option) else Option(dt)

        joined = [[name, dt] for name, dt in self.lhs.schema[0].parameters[0]
                  if name in self.on_left]

        left = [[name, dt] for name, dt in zip(
            self.lhs.fields, types_of_fields(self.lhs.fields, self.lhs))
                if name not in self.on_left]

        right = [[name, dt] for name, dt in zip(
            self.rhs.fields, types_of_fields(self.rhs.fields, self.rhs))
                 if name not in self.on_right]

        # Handle the overlapping but non-joined case: a name present on both
        # sides gets the side's suffix.  ``left`` and ``right`` already
        # exclude the join keys, so their names can be intersected directly.
        overlap = ({name for name, _ in left} &
                   {name for name, _ in right})
        left_suffix, right_suffix = self.suffixes
        left = [[name + left_suffix if name in overlap else name, dt]
                for name, dt in left]
        right = [[name + right_suffix if name in overlap else name, dt]
                 for name, dt in right]

        if self.how in ('right', 'outer'):
            left = [[name, option(dt)] for name, dt in left]
        if self.how in ('left', 'outer'):
            right = [[name, option(dt)] for name, dt in right]

        return dshape(Record(joined + left + right))
Exemple #20
def discover_sqlalchemy_selectable(t):
    """Return ``var * Record`` describing the columns of selectable ``t``.

    The record is first built from ``_process_columns(t.columns)``.  Each
    foreign key is then discovered against that record, and the entry of
    every column it covers is replaced with the discovered value.
    """
    # Position of each column within the record, keyed by column name.
    position = {str(key): idx for idx, key in enumerate(t.columns.keys())}
    fields = list(_process_columns(t.columns))

    discovered = [
        discover(fkey, t, parent_measure=Record(fields))
        for fkey in t.foreign_keys
    ]
    for name, column in merge(*discovered).items():
        slot = position[name]
        _, key_type = fields[slot]
        # A nullable foreign key means the (map) key of the column should
        # itself be an Option type.
        if isinstance(key_type, Option):
            column.key = Option(column.key)
        fields[slot] = (name, column)

    return var * Record(fields)
Exemple #21
class sum(FloatingReduction):
    """Sum of all elements in ``column``.

    Elements of resulting aggregate are nan if they are not updated.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. Column data type must be numeric.
        ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    # Cuda implementation
    def _build_bases(self, cuda=False):
        if cuda:
            # NOTE(review): ``any`` is presumably a sibling reduction rather
            # than the builtin -- confirm.
            return (_sum_zero(self.column), any(self.column))
        return (self, )

    @staticmethod
    def _finalize(bases, cuda=False, **kwargs):
        if cuda:
            sums, anys = bases
            # Cells that never received a value are reported as NaN.
            x = np.where(anys, sums, np.nan)
            return xr.DataArray(x, **kwargs)
        return xr.DataArray(bases[0], **kwargs)

    # Single pass CPU implementation
    # These methods will only be called if _build_bases returned (self,)
    @staticmethod
    def _append(x, y, agg, field):
        if not isnull(field):
            if isnull(agg[y, x]):
                # First value seen for this cell: store it ...
                agg[y, x] = field
            else:
                # ... later values accumulate.
                agg[y, x] += field

    @staticmethod
    def _combine(aggs):
        # A cell stays NaN only when it is NaN in every aggregate; NaNs that
        # sit next to real values are treated as 0 so they do not poison the
        # sum.
        missing_vals = np.isnan(aggs)
        all_empty = np.bitwise_and.reduce(missing_vals, axis=0)
        set_to_zero = missing_vals & ~all_empty
        return np.where(set_to_zero, 0, aggs).sum(axis=0)
Exemple #22
def safely_option(ds):
    """ Wrap certain types in an option type

    Strings are parsed with ``dshape`` first and a one-element ``DataShape``
    is unwrapped to its single member.  A ``Unit`` whose string form contains
    ``'int'`` or ``'date'`` is then wrapped in ``Option``; anything else is
    returned as is.

    >>> safely_option('int32')  # doctest: +SKIP
    ?int32
    >>> safely_option('?int32')  # doctest: +SKIP
    ?int32
    >>> safely_option('float64')  # doctest: +SKIP
    ctype("float64")
    """
    if isinstance(ds, _strtypes):
        ds = dshape(ds)
    if isinstance(ds, DataShape) and len(ds) == 1:
        ds = ds[0]
    # The parentheses matter: without them ``and`` binds tighter than ``or``
    # and any value whose text contains 'date' (already-optional types,
    # records with a "date" field, ...) would be wrapped as well.
    if isinstance(ds, Unit) and ('int' in str(ds) or 'date' in str(ds)):
        return Option(ds)
    return ds
Exemple #23
class count(Reduction):
    """Count reduction; the result is declared as an optional int32."""
    _dshape = dshape(Option(ct.int32))

    def validate(self, in_dshape):
        """Accept any input dshape; nothing is checked."""
        pass

    def _build_create(self, dshape):
        """Return a factory producing zero-filled arrays of the measure's dtype."""
        dtype = numpy_dtype(dshape.measure)
        return lambda shape: np.zeros(shape, dtype=dtype)

    def _build_append(self, dshape):
        return append_count

    def _build_combine(self, dshape):
        return combine_count

    def _build_finalize(self, dshape):
        return identity
Exemple #24
class std(Reduction):
    """Standard Deviation of all elements in ``column``.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. Column data type must be numeric.
        ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    def _build_bases(self, cuda=False):
        # NOTE(review): ``count`` is presumably a sibling reduction -- confirm.
        return (_sum_zero(self.column), count(self.column), m2(self.column))

    # Takes no ``self``; declared static so instance lookup does not bind.
    @staticmethod
    def _finalize(bases, cuda=False, **kwargs):
        """Combine the base aggregates into the standard deviation.

        ``bases`` must unpack into three aggregates (sums, counts, m2s); the
        sums are not used.  Cells with a zero count are reported as NaN.
        """
        sums, counts, m2s = bases
        # np.where evaluates both branches, so the division still runs for
        # empty cells; its warnings are silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            x = np.where(counts > 0, np.sqrt(m2s / counts), np.nan)
        return xr.DataArray(x, **kwargs)
Exemple #25
class var(Reduction):
    """Variance of all elements in ``column``.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. Column data type must be numeric.
        ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    def _bases(self):
        # NOTE(review): ``sum`` and ``count`` are presumably sibling
        # reductions rather than the builtins -- confirm.
        return (sum(self.column), count(self.column), m2(self.column))

    # Takes no ``self``; declared static so instance lookup does not bind.
    @staticmethod
    def _finalize(bases, **kwargs):
        """Combine the base aggregates into ``m2s / counts``.

        ``bases`` must unpack into three aggregates (sums, counts, m2s); the
        sums are not used.  NumPy divide/invalid warnings are suppressed for
        the division.
        """
        sums, counts, m2s = bases
        with np.errstate(divide='ignore', invalid='ignore'):
            x = m2s/counts
        return xr.DataArray(x, **kwargs)
Exemple #26
def discover_csv(c, nrows=1000, **kwargs):
    """Discover the datashape of a CSV source from a pandas sample.

    The first ``nrows`` rows are loaded with ``csv_to_dataframe`` (extra
    keyword arguments are forwarded) and passed through
    ``coerce_datetimes``.  Column labels are converted to stripped strings,
    and the result is ``datashape.var * Record`` in which every field whose
    sampled column contains nulls is an ``Option``.
    """
    df = coerce_datetimes(csv_to_dataframe(c, nrows=nrows, **kwargs))

    labels = []
    for label in df.columns:
        if not isinstance(label, (str, unicode)):
            label = str(label)
        labels.append(label)
    df.columns = [label.strip() for label in labels]

    # Replace np.nan with None. Forces type string rather than float
    for col in df.columns:
        if not df[col].count():
            df[col] = None

    discovered = discover(df).measure

    # Use Series.isnull to determine Option-ness
    fields = []
    for name, typ in zip(discovered.names, discovered.types):
        if df[name].isnull().any() and not isinstance(typ, Option):
            typ = Option(typ)
        fields.append([name, typ])

    return datashape.var * Record(fields)
Exemple #27
class sum(FloatingReduction):
    """Sum of all elements in ``column``.

    Elements of resulting aggregate are nan if they are not updated.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. Column data type must be numeric.
        ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    # The hooks below take no ``self``; they are declared static so that
    # looking them up on an instance does not bind the instance.
    @staticmethod
    def _append_int_field(x, y, agg, field):
        if np.isnan(agg[y, x]):
            # First value seen for this cell: store it ...
            agg[y, x] = field
        else:
            # ... later values accumulate.
            agg[y, x] += field

    @staticmethod
    def _append_float_field(x, y, agg, field):
        # NaN inputs are skipped entirely.
        if not np.isnan(field):
            if np.isnan(agg[y, x]):
                agg[y, x] = field
            else:
                agg[y, x] += field

    @staticmethod
    def _combine(aggs):
        # A cell stays NaN only when it is NaN in every aggregate; NaNs that
        # sit next to real values are treated as 0 so they do not poison the
        # sum.
        missing_vals = np.isnan(aggs)
        all_empty = np.bitwise_and.reduce(missing_vals, axis=0)
        set_to_zero = missing_vals & ~all_empty
        return np.where(set_to_zero, 0, aggs).sum(axis=0)
Exemple #28
def test_datetimetz_pandas():
    """Discovery of naive and timezone-aware datetime columns.

    Every datetime column is expected to be discovered as
    ``Option(DateTime)`` carrying the column's timezone name (``None`` for
    the naive column), both for the whole frame and column by column.
    """
    # Local import: an ordered mapping keeps the column order that the
    # expected record below relies on.
    from collections import OrderedDict

    df = pd.DataFrame(
        OrderedDict([
            ('naive', pd.date_range('2014', periods=5)),
            ('Europe/Moscow',
             pd.date_range('2014', periods=5, tz='Europe/Moscow')),
            ('UTC', pd.date_range('2014', periods=5, tz='UTC')),
            ('US/Eastern', pd.date_range('2014', periods=5, tz='US/Eastern')),
        ])
    )

    assert_dshape_equal(
        discover(df),
        5 * Record['naive':Option(DateTime(tz=None)),
                   'Europe/Moscow':Option(DateTime(tz='Europe/Moscow')),
                   'UTC':Option(DateTime(tz='UTC')),
                   'US/Eastern':Option(DateTime(tz='US/Eastern')), ])

    assert_dshape_equal(discover(df.naive), 5 * Option(DateTime(tz=None)))
    for tz in ('Europe/Moscow', 'UTC', 'US/Eastern'):
        assert_dshape_equal(discover(df[tz]), 5 * Option(DateTime(tz=tz)))
Exemple #29
def test_csv_missing_values():
    """A CSV column containing ``NA`` is discovered as an optional string."""
    content = 'name,val\nAlice,100\nNA,200'
    with filetext(content, extension='csv') as fn:
        name_measure = discover(CSV(fn)).measure.dict['name']
        assert name_measure == Option(string)
Exemple #30
def optionify(d):
    """Return ``d`` with every component made optional.

    A ``DataShape`` is rebuilt from its parameters, each processed
    recursively.  Any other value is wrapped in ``Option`` unless it already
    is one.
    """
    if isinstance(d, DataShape):
        wrapped = [optionify(part) for part in d.parameters]
        return DataShape(*wrapped)
    if isinstance(d, Option):
        return d
    return Option(d)