def sparksql_to_ds(ss):
    """ Convert SparkSQL type to datashape

    >>> sparksql_to_ds(IntegerType())  # doctest: +SKIP
    ctype("int64")

    >>> sparksql_to_ds(ArrayType(IntegerType(), False))  # doctest: +SKIP
    dshape("var * int64")

    >>> sparksql_to_ds(ArrayType(IntegerType(), True))  # doctest: +SKIP
    dshape("var * ?int64")

    >>> sparksql_to_ds(StructType([  # doctest: +SKIP
    ...     StructField('name', StringType(), False),
    ...     StructField('amount', IntegerType(), True)]))
    dshape("{ name : string, amount : ?int64 }")
    """
    if ss in rev_types:
        return rev_types[ss]
    if isinstance(ss, ArrayType):
        elem = sparksql_to_ds(ss.elementType)
        if ss.containsNull:
            return datashape.var * Option(elem)
        else:
            return datashape.var * elem
    if isinstance(ss, StructType):
        return dshape(Record([[field.name,
                               Option(sparksql_to_ds(field.dataType))
                               if field.nullable
                               else sparksql_to_ds(field.dataType)]
                              for field in ss.fields]))
    raise NotImplementedError("SparkSQL type not known %s" % ss)

def schema_to_dshape(schema):
    if type(schema) in sparksql_to_dshape:
        return sparksql_to_dshape[type(schema)]
    if isinstance(schema, ArrayType):
        dshape = schema_to_dshape(schema.elementType)
        return datashape.var * (Option(dshape)
                                if schema.containsNull else dshape)
    if isinstance(schema, StructType):
        fields = [(field.name,
                   Option(schema_to_dshape(field.dataType))
                   if field.nullable
                   else schema_to_dshape(field.dataType))
                  for field in schema.fields]
        return datashape.dshape(Record(fields))
    raise NotImplementedError('SparkSQL type not known %r' %
                              type(schema).__name__)

def dshape_from_pandas(col):
    if isinstance(col.dtype, categorical):
        return Categorical(col.cat.categories.tolist())
    elif col.dtype.kind == 'M':
        tz = getattr(col.dtype, 'tz', None)
        if tz is not None:
            # Pandas stores this as a pytz.tzinfo, but DataShape wants a
            # string.
            tz = str(tz)
        return Option(DateTime(tz=tz))

    dshape = datashape.CType.from_numpy_dtype(col.dtype)
    dshape = string if dshape == object_ else dshape
    return Option(dshape) if dshape in possibly_missing else dshape

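# A minimal usage sketch for dshape_from_pandas above -- illustrative only,
# and it assumes the module-level names the function relies on (categorical,
# possibly_missing, string, object_, Categorical, DateTime, Option) resolve
# as in the surrounding module.  A float64 column can hold NaN, so it is
# discovered as an Option type; a plain int64 column has no missing marker,
# so it stays unwrapped.
import numpy as np
import pandas as pd

floats = pd.Series([1.0, np.nan, 3.0])   # NaN-capable -> ?float64
ints = pd.Series([1, 2, 3])              # no missing marker -> int64
print(dshape_from_pandas(floats))        # expected: ?float64
print(dshape_from_pandas(ints))          # expected: int64
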
def discover_csv(c, nrows=1000, **kwargs):
    df = csv_to_DataFrame(c, nrows=nrows, **kwargs)
    df = coerce_datetimes(df)

    if (not list(df.columns) == list(range(len(df.columns))) and
            any(re.match(r'^[-\d_]*$', c) for c in df.columns)):
        df = csv_to_DataFrame(c, chunksize=50, has_header=False).get_chunk()
        df = coerce_datetimes(df)

    columns = [str(c) if not isinstance(c, (str, unicode)) else c
               for c in df.columns]
    df.columns = [c.strip() for c in columns]

    # Replace np.nan with None.  Forces type string rather than float
    for col in df.columns:
        if df[col].count() == 0:
            df[col] = [None] * len(df)

    measure = discover(df).measure

    # Use Series.notnull to determine Option-ness
    measure2 = Record([[name, Option(typ)
                        if (~df[name].notnull()).any() and
                        not isinstance(typ, Option)
                        else typ]
                       for name, typ in zip(measure.names, measure.types)])

    return datashape.var * measure2

class last(Reduction):
    """Last value encountered in ``column``.

    Useful for categorical data where an actual value must always be
    returned, not an average or other numerical calculation.

    Currently only supported for rasters, externally to this class.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. If the data type is floating
        point, ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    @staticmethod
    def _append(x, y, agg):
        raise NotImplementedError("last is currently implemented only for rasters")

    @staticmethod
    def _create(shape, array_module):
        raise NotImplementedError("last is currently implemented only for rasters")

    @staticmethod
    def _combine(aggs):
        raise NotImplementedError("last is currently implemented only for rasters")

    @staticmethod
    def _finalize(bases, **kwargs):
        raise NotImplementedError("last is currently implemented only for rasters")

class _upsample(Reduction):
    """Special internal class used for upsampling"""
    _dshape = dshape(Option(ct.float64))

    @staticmethod
    def _finalize(bases, cuda=False, **kwargs):
        return xr.DataArray(bases[0], **kwargs)

    @property
    def inputs(self):
        return (extract(self.column),)

    @staticmethod
    def _create(shape, array_module):
        # Use uninitialized memory, the upsample function must explicitly set
        # unused values to nan
        return array_module.empty(shape, dtype='f8')

    @staticmethod
    @ngjit
    def _append(x, y, agg, field):
        # not called, the upsample function must set agg directly
        pass

    @staticmethod
    @ngjit
    def _append_cuda(x, y, agg, field):
        # not called, the upsample function must set agg directly
        pass

    @staticmethod
    def _combine(aggs):
        return np.nanmax(aggs, axis=0)

def _dtype(self):
    # we can't simply use .schema or .datashape because we may have a bare
    # integer, for example
    lhs, rhs = discover(self.lhs).measure, discover(self.rhs).measure
    if isinstance(lhs, Option) or isinstance(rhs, Option):
        return Option(ct.bool_)
    return ct.bool_

class mode(Reduction):
    """Mode (most common value) of all the values encountered in ``column``.

    Useful for categorical data where an actual value must always be
    returned, not an average or other numerical calculation.

    Currently only supported for rasters, externally to this class.
    Implementing it for other glyph types would be difficult due to
    potentially unbounded data storage requirements to store indefinite point
    or line data per pixel.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. If the data type is floating
        point, ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    @staticmethod
    def _append(x, y, agg):
        raise NotImplementedError("mode is currently implemented only for rasters")

    @staticmethod
    def _create(shape, array_module):
        raise NotImplementedError("mode is currently implemented only for rasters")

    @staticmethod
    def _combine(aggs):
        raise NotImplementedError("mode is currently implemented only for rasters")

    @staticmethod
    def _finalize(bases, **kwargs):
        raise NotImplementedError("mode is currently implemented only for rasters")

def schema(self):
    """
    Examples
    --------
    >>> t = Symbol('t', 'var * {name: string, amount: int}')
    >>> s = Symbol('t', 'var * {name: string, id: int}')

    >>> join(t, s).schema
    dshape("{ name : string, amount : int32, id : int32 }")

    >>> join(t, s, how='left').schema
    dshape("{ name : string, amount : int32, id : ?int32 }")
    """
    option = lambda dt: dt if isinstance(dt, Option) else Option(dt)

    joined = [[name, dt] for name, dt in self.lhs.schema[0].parameters[0]
              if name in self.on_left]

    left = [[name, dt] for name, dt in self.lhs.schema[0].parameters[0]
            if name not in self.on_left]

    right = [[name, dt] for name, dt in self.rhs.schema[0].parameters[0]
             if name not in self.on_right]

    if self.how in ('right', 'outer'):
        left = [[name, option(dt)] for name, dt in left]
    if self.how in ('left', 'outer'):
        right = [[name, option(dt)] for name, dt in right]

    return dshape(Record(joined + left + right))

class StrFind(ElemWise):
    """
    Find literal substring in string column.
    """
    _arguments = '_child', 'sub'
    schema = Option(datashape.int64)

def discover_dataframe(df):
    obj = object_
    names = list(df.columns)
    dtypes = list(map(datashape.CType.from_numpy_dtype, df.dtypes))
    dtypes = [string if dt == obj else dt for dt in dtypes]
    odtypes = [Option(dt) if dt in possibly_missing else dt
               for dt in dtypes]
    schema = datashape.Record(list(zip(names, odtypes)))
    return len(df) * schema

def _subexpr_optimize(expr):
    if expr.axis != tuple(range(expr._child.ndim)):
        raise ValueError("Cannot perform 'all' over an axis: %s" %
                         (expr.axis,))
    if expr.keepdims:
        raise ValueError("Cannot perform 'all' with keepdims=True")
    measure = expr._child.dshape.measure
    return (~expr._child).coerce(
        Option(int32) if isinstance(measure, Option) else int32
    ).sum() == 0

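# The rewrite above relies on the identity all(x) <=> sum(~x) == 0; a tiny
# self-contained check of that identity with plain NumPy (illustrative only,
# not blaze's actual code path):
import numpy as np

for x in (np.array([True, True]),
          np.array([True, False]),
          np.array([], dtype=bool)):
    assert bool(x.all()) == ((~x).astype('int32').sum() == 0)
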
def schema(self):
    measure = self._child.schema.measure

    # if we are not shifting or we are already an Option type then return
    # the child's schema
    if not self.n or isinstance(measure, Option):
        return measure
    else:
        return Option(measure)

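# Why shifting yields an Option measure: the positions shifted in from beyond
# the edge have no source value.  A small pandas illustration, independent of
# the class above:
import pandas as pd

s = pd.Series([1, 2, 3])
print(s.shift(1).tolist())  # [nan, 1.0, 2.0] -- the first slot is missing
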
class std(Reduction):
    _dshape = dshape(Option(ct.float64))

    @property
    def _bases(self):
        return (sum(self.column), count(self.column), m2(self.column))

    def _build_finalize(self, dshape):
        return finalize_std

class FloatingReduction(Reduction):
    _dshape = dshape(Option(ct.float64))

    @staticmethod
    def _create(shape):
        return np.full(shape, np.nan, dtype='f8')

    @staticmethod
    def _finalize(bases, **kwargs):
        return xr.DataArray(bases[0], **kwargs)

def check_roundtrip_null_values(table_uri,
                                data,
                                dtype,
                                sqltype,
                                null_values,
                                mask,
                                *,
                                astype=False):
    """Check the data roundtrip through postgres using warp_prism to read the
    data

    Parameters
    ----------
    table_uri : str
        The uri to a unique table.
    data : iterable[any]
        The input data.
    dtype : str
        The dtype of the data.
    sqltype : type
        The sqlalchemy type of the data.
    null_values : dict[str, any]
        The value to coerce ``NULL`` to.
    mask : np.ndarray[bool]
        Boolean mask marking which elements of ``data`` are non-null.
    astype : bool, optional
        Coerce the input data to the given dtype before making assertions
        about the output data.
    """
    table = resource(table_uri, dshape=var * R['a':Option(dtype)])

    # Ensure that odo created the table correctly. If these fail the other
    # tests are not well defined.
    assert table.columns.keys() == ['a']
    assert isinstance(table.columns['a'].type, sqltype)

    table.insert().values([{'a': v} for v in data]).execute()

    arrays = to_arrays(table)
    assert len(arrays) == 1
    array, actual_mask = arrays['a']
    assert (actual_mask == mask).all()
    assert (array[mask] == data[mask]).all()

    output_dataframe = to_dataframe(table, null_values=null_values)
    if astype:
        data = data.astype(dshape(dtype).measure.to_numpy_dtype())
    expected_dataframe = pd.DataFrame({'a': data})
    expected_dataframe[~mask] = null_values.get(
        'a',
        null_values_for_type[
            array.dtype
            if array.dtype.kind != 'M' else
            np.dtype('datetime64[ns]')
        ],
    )
    pd.util.testing.assert_frame_equal(
        output_dataframe,
        expected_dataframe,
        check_dtype=False,
    )

class FloatingReduction(Reduction):
    """Base class for reductions that always have a floating-point dtype."""
    _dshape = dshape(Option(ct.float64))

    @staticmethod
    def _create(shape):
        return np.full(shape, np.nan, dtype='f8')

    @staticmethod
    def _finalize(bases, **kwargs):
        return xr.DataArray(bases[0], **kwargs)

class std(Reduction):
    _dshape = dshape(Option(ct.float64))

    @property
    def _bases(self):
        return (sum(self.column), count(self.column), m2(self.column))

    @staticmethod
    def _finalize(bases, **kwargs):
        sums, counts, m2s = bases
        with np.errstate(divide='ignore', invalid='ignore'):
            x = np.sqrt(m2s / counts)
        return xr.DataArray(x, **kwargs)

def schema(self):
    """
    Examples
    --------
    >>> from blaze import symbol
    >>> t = symbol('t', 'var * {name: string, amount: int}')
    >>> s = symbol('t', 'var * {name: string, id: int}')

    >>> join(t, s).schema
    dshape("{name: string, amount: int32, id: int32}")

    >>> join(t, s, how='left').schema
    dshape("{name: string, amount: int32, id: ?int32}")

    Overlapping but non-joined fields append _left, _right

    >>> a = symbol('a', 'var * {x: int, y: int}')
    >>> b = symbol('b', 'var * {x: int, y: int}')
    >>> join(a, b, 'x').fields
    ['x', 'y_left', 'y_right']
    """
    option = lambda dt: dt if isinstance(dt, Option) else Option(dt)

    joined = [[name, dt] for name, dt in self.lhs.schema[0].parameters[0]
              if name in self.on_left]

    left = [[name, dt] for name, dt in
            zip(self.lhs.fields, types_of_fields(self.lhs.fields, self.lhs))
            if name not in self.on_left]

    right = [[name, dt] for name, dt in
             zip(self.rhs.fields, types_of_fields(self.rhs.fields, self.rhs))
             if name not in self.on_right]

    # Handle overlapping but non-joined fields, e.g. join(a, b, 'x') where
    # both a and b also have a 'y' column
    left_other = [name for name, dt in left if name not in self.on_left]
    right_other = [name for name, dt in right if name not in self.on_right]
    overlap = set.intersection(set(left_other), set(right_other))

    left_suffix, right_suffix = self.suffixes
    left = [[name + left_suffix if name in overlap else name, dt]
            for name, dt in left]
    right = [[name + right_suffix if name in overlap else name, dt]
             for name, dt in right]

    if self.how in ('right', 'outer'):
        left = [[name, option(dt)] for name, dt in left]
    if self.how in ('left', 'outer'):
        right = [[name, option(dt)] for name, dt in right]

    return dshape(Record(joined + left + right))

def discover_sqlalchemy_selectable(t):
    ordering = {str(c): i for i, c in enumerate(c for c in t.columns.keys())}
    record = list(_process_columns(t.columns))
    fkeys = [discover(fkey, t, parent_measure=Record(record))
             for fkey in t.foreign_keys]
    for name, column in merge(*fkeys).items():
        index = ordering[name]
        _, key_type = record[index]
        # If the foreign key is nullable, the column (map) key
        # should be an Option type
        if isinstance(key_type, Option):
            column.key = Option(column.key)
        record[index] = (name, column)
    return var * Record(record)

class sum(FloatingReduction):
    """Sum of all elements in ``column``.

    Elements of resulting aggregate are nan if they are not updated.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. Column data type must be
        numeric. ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    # Cuda implementation
    def _build_bases(self, cuda=False):
        if cuda:
            return (_sum_zero(self.column), any(self.column))
        else:
            return (self,)

    @staticmethod
    def _finalize(bases, cuda=False, **kwargs):
        if cuda:
            sums, anys = bases
            x = np.where(anys, sums, np.nan)
            return xr.DataArray(x, **kwargs)
        else:
            return xr.DataArray(bases[0], **kwargs)

    # Single pass CPU implementation
    # These methods will only be called if _build_bases returned (self,)
    @staticmethod
    @ngjit
    def _append(x, y, agg, field):
        if not isnull(field):
            if isnull(agg[y, x]):
                agg[y, x] = field
            else:
                agg[y, x] += field

    @staticmethod
    def _combine(aggs):
        missing_vals = np.isnan(aggs)
        all_empty = np.bitwise_and.reduce(missing_vals, axis=0)
        set_to_zero = missing_vals & ~all_empty
        return np.where(set_to_zero, 0, aggs).sum(axis=0)

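# Worked example of the _combine logic above using plain NumPy: a pixel is
# NaN in the combined result only when it was missing in *every* chunk; a
# pixel missing in only some chunks is treated as 0 so the remaining sums
# survive.
import numpy as np

aggs = np.array([[1.0, np.nan, np.nan],
                 [2.0, 3.0,    np.nan]])
missing_vals = np.isnan(aggs)
all_empty = np.bitwise_and.reduce(missing_vals, axis=0)  # [False, False, True]
set_to_zero = missing_vals & ~all_empty
print(np.where(set_to_zero, 0, aggs).sum(axis=0))        # [3. 3. nan]
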
def safely_option(ds):
    """ Wrap certain types in an option type

    >>> safely_option('int32')
    ?int32
    >>> safely_option('?int32')
    ?int32

    >>> safely_option('float64')
    ctype("float64")
    """
    if isinstance(ds, _strtypes):
        ds = dshape(ds)
    if isinstance(ds, DataShape) and len(ds) == 1:
        ds = ds[0]
    if isinstance(ds, Unit) and ('int' in str(ds) or 'date' in str(ds)):
        return Option(ds)
    return ds

class count(Reduction):
    _dshape = dshape(Option(ct.int32))

    def validate(self, in_dshape):
        pass

    @memoize
    def _build_create(self, dshape):
        dtype = numpy_dtype(dshape.measure)
        return lambda shape: np.zeros(shape, dtype=dtype)

    def _build_append(self, dshape):
        return append_count

    def _build_combine(self, dshape):
        return combine_count

    def _build_finalize(self, dshape):
        return identity

class std(Reduction):
    """Standard Deviation of all elements in ``column``.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. Column data type must be
        numeric. ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    def _build_bases(self, cuda=False):
        return (_sum_zero(self.column), count(self.column), m2(self.column))

    @staticmethod
    def _finalize(bases, cuda=False, **kwargs):
        sums, counts, m2s = bases
        with np.errstate(divide='ignore', invalid='ignore'):
            x = np.where(counts > 0, np.sqrt(m2s / counts), np.nan)
        return xr.DataArray(x, **kwargs)

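# Sanity check of the finalize step above: sqrt(m2 / count), where m2 is the
# running sum of squared deviations, equals NumPy's population standard
# deviation.  Plain-NumPy sketch, separate from the datashader machinery:
import numpy as np

vals = np.array([1.0, 2.0, 4.0, 7.0])
m2 = ((vals - vals.mean()) ** 2).sum()
assert np.isclose(np.sqrt(m2 / vals.size), np.std(vals))
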
class var(Reduction):
    """Variance of all elements in ``column``.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. Column data type must be
        numeric. ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    @property
    def _bases(self):
        return (sum(self.column), count(self.column), m2(self.column))

    @staticmethod
    def _finalize(bases, **kwargs):
        sums, counts, m2s = bases
        with np.errstate(divide='ignore', invalid='ignore'):
            x = m2s / counts
        return xr.DataArray(x, **kwargs)

def discover_csv(c, nrows=1000, **kwargs):
    df = csv_to_dataframe(c, nrows=nrows, **kwargs)
    df = coerce_datetimes(df)

    columns = [str(c) if not isinstance(c, (str, unicode)) else c
               for c in df.columns]
    df.columns = [c.strip() for c in columns]

    # Replace np.nan with None.  Forces type string rather than float
    for col in df.columns:
        if not df[col].count():
            df[col] = None

    measure = discover(df).measure

    # Use Series.isnull to determine Option-ness
    measure = Record([[name, Option(typ)
                       if df[name].isnull().any() and
                       not isinstance(typ, Option)
                       else typ]
                      for name, typ in zip(measure.names, measure.types)])

    return datashape.var * measure

class sum(FloatingReduction):
    """Sum of all elements in ``column``.

    Elements of resulting aggregate are nan if they are not updated.

    Parameters
    ----------
    column : str
        Name of the column to aggregate over. Column data type must be
        numeric. ``NaN`` values in the column are skipped.
    """
    _dshape = dshape(Option(ct.float64))

    @staticmethod
    @ngjit
    def _append_int_field(x, y, agg, field):
        if np.isnan(agg[y, x]):
            agg[y, x] = field
        else:
            agg[y, x] += field

    @staticmethod
    @ngjit
    def _append_float_field(x, y, agg, field):
        if not np.isnan(field):
            if np.isnan(agg[y, x]):
                agg[y, x] = field
            else:
                agg[y, x] += field

    @staticmethod
    def _combine(aggs):
        missing_vals = np.isnan(aggs)
        all_empty = np.bitwise_and.reduce(missing_vals, axis=0)
        set_to_zero = missing_vals & ~all_empty
        return np.where(set_to_zero, 0, aggs).sum(axis=0)

def test_datetimetz_pandas():
    df = pd.DataFrame(
        OrderedDict([
            ('naive', pd.date_range('2014', periods=5)),
            ('Europe/Moscow', pd.date_range('2014', periods=5,
                                            tz='Europe/Moscow')),
            ('UTC', pd.date_range('2014', periods=5, tz='UTC')),
            ('US/Eastern', pd.date_range('2014', periods=5,
                                         tz='US/Eastern')),
        ]))

    assert_dshape_equal(
        discover(df),
        5 * Record['naive':Option(DateTime(tz=None)),
                   'Europe/Moscow':Option(DateTime(tz='Europe/Moscow')),
                   'UTC':Option(DateTime(tz='UTC')),
                   'US/Eastern':Option(DateTime(tz='US/Eastern'))])

    assert_dshape_equal(discover(df.naive), 5 * Option(DateTime(tz=None)))
    for tz in ('Europe/Moscow', 'UTC', 'US/Eastern'):
        assert_dshape_equal(discover(df[tz]), 5 * Option(DateTime(tz=tz)))

def test_csv_missing_values():
    with filetext('name,val\nAlice,100\nNA,200', extension='csv') as fn:
        csv = CSV(fn)
        assert discover(csv).measure.dict['name'] == Option(string)

def optionify(d):
    if isinstance(d, DataShape):
        return DataShape(*(optionify(i) for i in d.parameters))
    return d if isinstance(d, Option) else Option(d)

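# Usage sketch for optionify above, assuming datashape is importable: it
# wraps a unit type in Option and is idempotent on types that are already
# Options.
from datashape import Option, int32, string

assert optionify(int32) == Option(int32)
assert optionify(Option(int32)) == Option(int32)
assert optionify(string) == Option(string)
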