def expr_repr(expr, n=10):
    """Return a string representation of a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression to represent.
    n : int, optional
        Maximum number of rows to render for 1-d tabular/column
        expressions (default 10).

    Returns
    -------
    str
        A repr suitable for interactive display: the raw expression when
        no resources are bound, a computed value for scalars and small
        arrays, a truncated table for 1-d data, or a summary of the
        backing resource otherwise.
    """
    # Pure Expressions, not interactive
    if not expr._resources():
        return str(expr)

    # Scalars
    if ndim(expr) == 0 and isscalar(expr.dshape):
        return repr(coerce_scalar(compute(expr), str(expr.dshape)))

    # Tables
    if (ndim(expr) == 1 and
            (istabular(expr.dshape) or isscalar(expr.dshape.measure))):
        # BUG FIX: honor the ``n`` parameter; it was previously ignored in
        # favor of a hard-coded 10.
        return repr_tables(expr, n)

    # Smallish arrays
    if ndim(expr) >= 2 and numel(expr.shape) and numel(expr.shape) < 1000000:
        return repr(compute(expr))

    # Other: fall back to describing the bound resource(s).
    dat = expr._resources().values()
    if len(dat) == 1:
        dat = list(dat)[0]  # may be dict_values

    s = 'Data: %s' % dat
    if not isinstance(expr, Symbol):
        s += '\nExpr: %s' % str(expr)
    s += '\nDataShape: %s' % short_dshape(expr.dshape, nlines=7)

    return s
def bypixel(source, canvas, glyph, summary):
    """Bin ``source`` into the pixel grid described by ``canvas``.

    The source must discover to a tabular datashape; its record measure is
    validated against both ``glyph`` and ``summary`` before the aggregation
    pipeline runs.
    """
    dshape = discover(source)
    if not istabular(dshape):
        raise ValueError("source must be tabular")
    schema = dshape.measure
    # Validate glyph first, then summary — same order as before.
    for component in (glyph, summary):
        component.validate(schema)
    return pipeline(source, schema, canvas, glyph, summary)
def fastmsgpack_materialize(data, dshape, odo_kwargs):
    """Materialize decoded msgpack ``data`` according to ``dshape``.

    Tabular shapes become DataFrames, other collections become Series,
    scalars are coerced, and anything else passes through untouched.
    """
    # Order matters: a tabular dshape is also a collection, so the
    # DataFrame branch must be checked before the Series branch.
    if istabular(dshape):
        target = pd.DataFrame
    elif iscollection(dshape):
        target = pd.Series
    else:
        if isscalar(dshape):
            return coerce_scalar(data, str(dshape), odo_kwargs)
        return data
    return odo(data, target, **odo_kwargs)
def select_or_selectable_to_series(el, bind=None, dshape=None, **kwargs):
    """Convert a SQLAlchemy select/selectable into a pandas Series.

    Only handled for non-tabular results on the postgresql dialect; every
    other case defers to the generic conversion edge.
    """
    bind = _getbind(el, bind)
    # Keep the original check order: tabular-ness first, then dialect.
    if istabular(dshape) or bind.dialect.name != 'postgresql':
        # fall back to the general edge
        raise NotImplementedError()
    frame = to_dataframe(el, bind=bind)
    # Single-column result: the first column is the series.
    return frame.iloc[:, 0]
def coerce_core(result, dshape, odo_kwargs=None):
    """Coerce data to a core data type."""
    # Already a core type: nothing to do.
    if iscoretype(result):
        return result

    kwargs = odo_kwargs or {}

    if isscalar(dshape):
        return coerce_scalar(result, dshape, odo_kwargs=odo_kwargs)

    if istabular(dshape) and isrecord(dshape.measure):
        return into(DataFrame, result, **kwargs)

    if iscollection(dshape):
        dim = _dimensions(dshape)
        if dim == 1:
            return into(Series, result, **kwargs)
        if dim > 1:
            return into(np.ndarray, result, **kwargs)
        raise ValueError("Expr with dshape dimensions < 1 should have been handled earlier: dim={}".format(str(dim)))

    raise ValueError("Expr does not evaluate to a core return type")
def bypixel(source, canvas, glyph, agg):
    """Aggregate ``source`` onto a pixel grid.

    The input is binned into a grid whose shape and axes match ``canvas``;
    ``glyph`` maps rows to bins and ``agg`` reduces the rows within each
    bin.

    Parameters
    ----------
    source : pandas.DataFrame, dask.DataFrame
        Input datasource
    canvas : Canvas
    glyph : Glyph
    agg : Reduction
    """
    dshape = discover(source)
    if not istabular(dshape):
        raise ValueError("source must be tabular")
    schema = dshape.measure
    # Validate in the same order as before: glyph, then agg.
    for component in (glyph, agg):
        component.validate(schema)
    return bypixel.pipeline(source, schema, canvas, glyph, agg)
def coerce_core(result, dshape, odo_kwargs=None):
    """Coerce data to a core data type."""
    if iscoretype(result):
        # Nothing to convert.
        return result

    kwargs = odo_kwargs or {}

    if isscalar(dshape):
        return coerce_scalar(result, dshape, odo_kwargs=odo_kwargs)
    if istabular(dshape) and isrecord(dshape.measure):
        return into(DataFrame, result, **kwargs)
    if not iscollection(dshape):
        msg = "Expr does not evaluate to a core return type"
        raise ValueError(msg)

    dim = _dimensions(dshape)
    if dim == 1:
        return into(Series, result, **kwargs)
    if dim > 1:
        return into(np.ndarray, result, **kwargs)
    msg = "Expr with dshape dimensions < 1 should have been handled earlier: dim={}"
    raise ValueError(msg.format(str(dim)))
def _peek(expr):
    """Return a small, materialized preview of ``expr``.

    Unbound expressions are returned unchanged; scalars, 1-d tables and
    small arrays are computed; anything else yields the backing resource.
    """
    resources = expr._resources()

    # Pure Expressions, not interactive: some leaves have no bound data.
    if not set(resources.keys()).issuperset(expr._leaves()):
        return expr

    dshape = expr.dshape

    # Scalars
    if ndim(expr) == 0 and isscalar(dshape):
        return coerce_scalar(compute(expr), str(dshape))

    # Tables
    if ndim(expr) == 1:
        measure = dshape.measure
        if istabular(dshape) or isscalar(measure) or isinstance(measure, Map):
            return _peek_tables(expr, 10)

    # Smallish arrays
    size = numel(expr.shape)
    if ndim(expr) >= 2 and size and size < 1000000:
        return compute(expr)

    # Other: hand back the resource(s) themselves.
    dat = resources.values()
    if len(dat) == 1:
        dat = list(dat)[0]  # may be dict_values
    return dat
def _bound_symbol(cls, data_source, dshape, name, fields, schema, **kwargs):
    # Construct a ``cls`` instance binding ``data_source`` to a resolved
    # datashape and (optionally generated) name.  ``schema`` and ``dshape``
    # are mutually exclusive ways to describe the data's type; ``fields``
    # relabels the record measure's column names.
    if schema and dshape:
        raise ValueError(
            'Please specify one of schema= or dshape= keyword arguments',
        )
    if isinstance(data_source, BoundSymbol):
        # Already-bound symbol: recurse on its underlying data.
        return _bound_symbol(
            cls,
            data_source.data,
            dshape,
            name,
            fields,
            schema,
            **kwargs
        )
    if schema and not dshape:
        # A schema describes one row; promote it to a variable-length dshape.
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if isinstance(data_source, _strtypes):
        # A string data source is a URI; resolve it via the resource registry.
        data_source = resource(
            data_source,
            schema=schema,
            dshape=dshape,
            **kwargs
        )
    if (isinstance(data_source, Iterator) and
            not isinstance(data_source, tuple(not_an_iterator))):
        # Materialize one-shot iterators so the data can be re-read.
        data_source = tuple(data_source)
    if not dshape:
        dshape = discover(data_source)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            # Tuple measure + field names: build a record from positional types.
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))
        elif isscalar(dshape.measure) and fields:
            # Scalar measure + field names: the last fixed dimension becomes
            # the column axis, each column sharing the scalar type.
            types = (dshape.measure,) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
        elif isrecord(dshape.measure) and fields:
            # Record measure + field names: names must match the discovered
            # columns exactly; relabel() is the supported rename path.
            ds = discover(data_source)
            assert isrecord(ds.measure)
            names = ds.measure.names
            if names != fields:
                raise ValueError(
                    'data column names %s\n'
                    '\tnot equal to fields parameter %s,\n'
                    '\tuse data(data_source).relabel(%s) to rename '
                    'fields' % (
                        names,
                        fields,
                        ', '.join(
                            '%s=%r' % (k, v)
                            for k, v in zip(names, fields)
                        ),
                    ),
                )
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))
    ds = datashape.dshape(dshape)
    if name is generate:
        # NOTE(review): auto-naming appears reserved for tabular data;
        # non-tabular symbols stay anonymous — confirm against callers.
        if istabular(dshape):
            name = next(_names)
        else:
            name = None
    return cls(data_source, ds, name)
def _pd_from_dshape(x, dshape):
    """Build a pandas container from iterable ``x`` according to ``dshape``."""
    values = list(x)
    if istabular(dshape) or len(dshape.shape) > 1:
        return pd.DataFrame(values)
    # The series gets a random name so it never clashes in the later
    # pd.concat call; users will never see this name.
    return pd.Series(values, name=uuid4().hex)
def _bound_symbol(cls, data_source, dshape, name, fields, schema, **kwargs):
    # Bind ``data_source`` into a ``cls`` symbol with a resolved datashape.
    # ``schema`` (row type) and ``dshape`` (full type) are mutually
    # exclusive; ``fields`` supplies/validates column names.
    if schema and dshape:
        raise ValueError(
            'Please specify one of schema= or dshape= keyword arguments',
        )
    if isinstance(data_source, BoundSymbol):
        # Unwrap an already-bound symbol and rebind its raw data.
        return _bound_symbol(cls, data_source.data, dshape, name, fields,
                             schema, **kwargs)
    if schema and not dshape:
        # Promote a single-row schema to a variable-length dshape.
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if isinstance(data_source, _strtypes):
        # String sources are URIs resolved through the resource registry.
        data_source = resource(data_source, schema=schema, dshape=dshape,
                               **kwargs)
    if (isinstance(data_source, Iterator) and
            not isinstance(data_source, tuple(not_an_iterator))):
        # Materialize one-shot iterators so the data can be traversed again.
        data_source = tuple(data_source)
    if not dshape:
        dshape = discover(data_source)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            # Tuple measure: zip field names with the positional types.
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
        elif isscalar(dshape.measure) and fields:
            # Scalar measure: the last fixed dimension becomes columns,
            # all sharing the scalar type.
            types = (dshape.measure, ) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
        elif isrecord(dshape.measure) and fields:
            # Record measure: field names must match the discovered column
            # names exactly; renaming goes through relabel() instead.
            ds = discover(data_source)
            assert isrecord(ds.measure)
            names = ds.measure.names
            if names != fields:
                raise ValueError(
                    'data column names %s\n'
                    '\tnot equal to fields parameter %s,\n'
                    '\tuse data(data_source).relabel(%s) to rename '
                    'fields' % (
                        names,
                        fields,
                        ', '.join('%s=%r' % (k, v)
                                  for k, v in zip(names, fields)),
                    ),
                )
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
    ds = datashape.dshape(dshape)
    if name is generate:
        # NOTE(review): auto-generated names appear limited to tabular
        # data; non-tabular symbols stay unnamed — confirm with callers.
        if istabular(dshape):
            name = next(_names)
        else:
            name = None
    return cls(data_source, ds, name)