def post_compute(expr, query, scope=None):
    """ Execute SQLAlchemy query against SQLAlchemy engines

    If the result of compute is a SQLAlchemy query then it is likely that the
    data elements are themselves SQL objects which contain SQLAlchemy
    engines. We find these engines and, if they are all the same, run the
    query against these engines and return the result.
    """
    if not all(isinstance(val, (Engine, MetaData, Table))
               for val in scope.values()):
        return query

    engines = set(filter(None, map(engine_of, scope.values())))

    if not engines:
        return query

    if len(set(map(str, engines))) != 1:
        raise NotImplementedError("Expected single SQLAlchemy engine")

    engine = toolz.first(engines)

    with engine.connect() as conn:  # Perform query
        result = conn.execute(select(query)).fetchall()

    if isscalar(expr.dshape):
        return result[0][0]
    if isscalar(expr.dshape.measure):
        return [x[0] for x in result]
    return result
def post_compute(expr, query, scope=None):
    """ Execute SQLAlchemy query against SQLAlchemy engines

    If the result of compute is a SQLAlchemy query then it is likely that the
    data elements are themselves SQL objects which contain SQLAlchemy
    engines. We find these engines and, if they are all the same, run the
    query against these engines and return the result.
    """
    if not all(isinstance(val, (MetaData, Engine, Table))
               for val in scope.values()):
        return query

    engines = set(filter(None, map(engine_of, scope.values())))

    if not engines:
        return query

    if len(set(map(str, engines))) != 1:
        raise NotImplementedError("Expected single SQLAlchemy engine")

    engine = first(engines)

    with engine.connect() as conn:  # Perform query
        result = conn.execute(select(query)).fetchall()

    if isscalar(expr.dshape):
        return result[0][0]
    if isscalar(expr.dshape.measure):
        return [x[0] for x in result]
    return result
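# A hedged usage sketch for the two post_compute variants above, assuming a
# blaze-style compute() produced `expr`, `query`, and the `leaf` symbol, and
# that `engine_of` (from the surrounding module) extracts the engine bound to
# each value.  Only the SQLAlchemy calls are real API; MetaData(bind=...)
# additionally assumes a pre-2.0 SQLAlchemy.
#
#   import sqlalchemy as sa
#   engine = sa.create_engine('sqlite://')
#   metadata = sa.MetaData(bind=engine)
#   accounts = sa.Table('accounts', metadata, sa.Column('amount', sa.Integer))
#   post_compute(expr, query, scope={leaf: accounts})  # runs on `engine`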
def post_compute(e, q, d):
    """ Execute a query using MongoDB's aggregation pipeline

    The compute_up functions operate on Mongo Collection / list-of-dict
    queries. Once they're done we need to actually execute the query on
    MongoDB. We do this using the aggregation pipeline framework.

    http://docs.mongodb.org/manual/core/aggregation-pipeline/
    """
    # the incoming scope `d` is part of the post_compute interface but is
    # not used here
    project = {'$project': toolz.merge({'_id': 0},  # remove mongo identifier
                                       dict((col, 1) for col in e.fields))}
    q = q.append(project)

    if not e.dshape.shape:  # not a collection
        result = q.coll.aggregate(list(q.query))['result'][0]
        if isscalar(e.dshape.measure):
            return result[e._name]
        else:
            return get(e.fields, result)

    dicts = q.coll.aggregate(list(q.query))['result']

    if isscalar(e.dshape.measure):
        return list(pluck(e.fields[0], dicts, default=None))  # dicts -> values
    else:
        return list(pluck(e.fields, dicts, default=None))  # dicts -> tuples
def expr_repr(expr, n=10):
    # Pure Expressions, not interactive
    if not expr._resources():
        return str(expr)

    # Scalars
    if ndim(expr) == 0 and isscalar(expr.dshape):
        return repr(coerce_scalar(compute(expr), str(expr.dshape)))

    # Tables
    if (ndim(expr) == 1 and (istabular(expr.dshape) or
                             isscalar(expr.dshape.measure))):
        return repr_tables(expr, n)  # honor the n= parameter

    # Smallish arrays
    if ndim(expr) >= 2 and numel(expr.shape) and numel(expr.shape) < 1000000:
        return repr(compute(expr))

    # Other
    dat = expr._resources().values()
    if len(dat) == 1:
        dat = list(dat)[0]  # may be dict_values

    s = 'Data: %s' % dat
    if not isinstance(expr, Symbol):
        s += '\nExpr: %s' % str(expr)
    s += '\nDataShape: %s' % short_dshape(expr.dshape, nlines=7)

    return s
def _eq(self, other):
    if (isscalar(self.dshape.measure) and
            (not isinstance(other, Expr) or isscalar(other.dshape.measure))):
        return broadcast(Eq, self, other)
    else:
        return self.isidentical(other)
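# A hedged sketch of how _eq dispatches, assuming a blaze-style `symbol`:
# comparisons over a scalar measure build an elementwise Eq expression, while
# comparisons between record-shaped expressions fall back to structural
# identity.
#
#   t = symbol('t', 'var * {name: string, amount: int32}')
#   t.amount == 100   # scalar measure -> broadcast(Eq, t.amount, 100)
#   t == t            # record measure -> t.isidentical(t), i.e. True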
def compute_up(t, rdd, **kwargs):
    grouper = optimize(t.grouper, rdd)
    apply = optimize(t.apply, rdd)
    t = by(grouper, apply)
    if ((isinstance(t.apply, Reduction) and type(t.apply) in binops) or
        (isinstance(t.apply, Summary) and
         builtins.all(type(val) in binops for val in t.apply.values))):
        grouper, binop, combiner, initial = reduce_by_funcs(t)

        if isscalar(t.grouper.dshape.measure):
            keyfunc = lambda x: (x,)
        else:
            keyfunc = identity
        if isscalar(t.apply.dshape.measure):
            valfunc = lambda x: (x,)
        else:
            valfunc = identity
        unpack = lambda kv: keyfunc(kv[0]) + valfunc(kv[1])

        create = lambda v: binop(initial, v)

        return (rdd.keyBy(grouper)
                   .combineByKey(create, binop, combiner)
                   .map(unpack))
    else:
        raise NotImplementedError("By only implemented for common reductions."
                                  "\nGot %s" % type(t.apply))
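# A standalone, hedged sketch of the keyBy/combineByKey plan the branch above
# emits, written by hand for a grouped sum (a SparkContext `sc` is assumed;
# keyBy, combineByKey, and map are real RDD methods).
#
#   pairs = sc.parallelize([('Alice', 100), ('Bob', 200), ('Alice', 50)])
#   grouped = (pairs.keyBy(lambda row: row[0])                    # grouper
#                   .combineByKey(lambda row: row[1],             # create
#                                 lambda acc, row: acc + row[1],  # binop
#                                 lambda a, b: a + b))            # combiner
#   grouped.collect()  # -> [('Alice', 150), ('Bob', 200)] (order may vary)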
def _name(self):
    measure = self.dshape.measure
    if len(self._inputs) == 1 and isscalar(getattr(measure, 'key', measure)):
        child_measure = self._child.dshape.measure
        if isscalar(getattr(child_measure, 'key', child_measure)):
            return self._child._name
def post_compute(e, q, scope=None):
    """ Execute a query using MongoDB's aggregation pipeline

    The compute_up functions operate on Mongo Collection / list-of-dict
    queries. Once they're done we need to actually execute the query on
    MongoDB. We do this using the aggregation pipeline framework.

    http://docs.mongodb.org/manual/core/aggregation-pipeline/
    """
    # `scope` is part of the post_compute interface but is not used here
    project = {
        '$project': toolz.merge(
            {'_id': 0},  # remove mongo identifier
            dict((col, 1) for col in e.fields))
    }
    q = q.append(project)

    if not e.dshape.shape:  # not a collection
        result = q.coll.aggregate(list(q.query))['result'][0]
        if isscalar(e.dshape.measure):
            return result[e._name]
        else:
            return get(e.fields, result)

    dicts = q.coll.aggregate(list(q.query))['result']

    if isscalar(e.dshape.measure):
        return list(pluck(e.fields[0], dicts, default=None))  # dicts -> values
    else:
        return list(pluck(e.fields, dicts, default=None))  # dicts -> tuples
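# A hedged sketch of the pipeline the two Mongo variants above finish off:
# earlier compute_up steps accumulate stages, and the final $project strips
# _id and keeps the expression's fields.  The collection and field names
# below are made up, and the {'result': [...]} envelope assumes pymongo < 3.0.
#
#   pipeline = [
#       {'$match': {'amount': {'$gt': 100}}},              # from compute_up
#       {'$project': {'_id': 0, 'name': 1, 'amount': 1}},  # appended here
#   ]
#   db.accounts.aggregate(pipeline)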
def _name(self):
    measure = self.dshape.measure
    if len(self._inputs) == 1 and isscalar(getattr(measure, 'key', measure)):
        child_measure = self._child.dshape.measure
        if isscalar(getattr(child_measure, 'key', child_measure)):
            # memoize the result
            return _setattr(self, '_name', self._child._name)
def test_map():
    t = TableSymbol('t', '{name: string, amount: int32, id: int32}')
    inc = lambda x: x + 1
    assert isscalar(t['amount'].map(inc, schema='int').dshape.measure)

    s = t['amount'].map(inc, schema='{amount: int}')
    assert not isscalar(s.dshape.measure)
    assert s.dshape == dshape('var * {amount: int}')

    expr = (t[['name', 'amount']]
            .map(identity, schema='{name: string, amount: int}'))
    assert expr._name is None
def test_map():
    t = symbol('t', 'var * {name: string, amount: int32, id: int32}')
    inc = lambda x: x + 1
    assert isscalar(t['amount'].map(inc, schema='int').dshape.measure)

    s = t['amount'].map(inc, schema='{amount: int}')
    assert not isscalar(s.dshape.measure)
    assert s.dshape == dshape('var * {amount: int}')

    expr = (t[['name', 'amount']]
            .map(identity, schema='{name: string, amount: int}'))
    assert expr._name is None
def test_relabel():
    t = symbol('t', 'var * {name: string, amount: int32, id: int32}')

    rl = t.relabel({'name': 'NAME', 'id': 'ID'})
    rlc = t['amount'].relabel({'amount': 'BALANCE'})

    assert eval(str(rl)).isidentical(rl)

    print(rl.fields)
    assert rl.fields == ['NAME', 'amount', 'ID']

    assert not isscalar(rl.dshape.measure)
    assert isscalar(rlc.dshape.measure)
def test_relabel():
    t = TableSymbol('t', '{name: string, amount: int32, id: int32}')

    rl = t.relabel({'name': 'NAME', 'id': 'ID'})
    rlc = t['amount'].relabel({'amount': 'BALANCE'})

    assert eval(str(rl)).isidentical(rl)

    print(rl.fields)
    assert rl.fields == ['NAME', 'amount', 'ID']

    assert not isscalar(rl.dshape.measure)
    assert isscalar(rlc.dshape.measure)
def default_materialize(data, dshape, odo_kwargs):
    if iscollection(dshape):
        return odo(data, list, **odo_kwargs)
    if isscalar(dshape):
        return coerce_scalar(data, str(dshape), odo_kwargs)

    return data
def compserver(payload, serial):
    ns = payload.get('namespace', dict())
    compute_kwargs = payload.get('compute_kwargs') or {}
    odo_kwargs = payload.get('odo_kwargs') or {}
    dataset = _get_data()
    ns[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(payload['expr'], namespace=ns)
    assert len(expr._leaves()) == 1
    leaf = expr._leaves()[0]

    try:
        result = compute(expr, {leaf: dataset}, **compute_kwargs)

        if iscollection(expr.dshape):
            result = odo(result, list, **odo_kwargs)
        elif isscalar(expr.dshape):
            result = coerce_scalar(result, str(expr.dshape))
    except NotImplementedError as e:
        # 501: Not Implemented
        return ("Computation not supported:\n%s" % e, 501)
    except Exception as e:
        # 500: Internal Server Error
        return (
            "Computation failed with message:\n%s: %s" % (type(e).__name__, e),
            500,
        )

    return serial.dumps({
        'datashape': pprint(expr.dshape, width=0),
        'data': result,
        'names': expr.fields
    })
def unit_to_dtype(ds):
    """
    >>> unit_to_dtype('int32')
    dtype('int32')
    >>> unit_to_dtype('float64')
    dtype('float64')
    >>> unit_to_dtype('?int64')
    dtype('float64')
    >>> unit_to_dtype('string')
    dtype('O')
    >>> unit_to_dtype('?datetime')
    dtype('<M8[us]')
    """
    if isinstance(ds, str):
        ds = dshape(ds)
    if isinstance(ds, DataShape):
        ds = ds.measure
    if isinstance(ds, Option) and isscalar(ds) and isnumeric(ds):
        # nullable integers become floats so NaN can stand in for missing
        return unit_to_dtype(str(ds).replace('int', 'float').replace('?', ''))
    if isinstance(ds, Option) and isinstance(
            ds.ty, (type(date_), type(datetime_), type(string),
                    type(timedelta_))):
        ds = ds.ty
    if ds == string:
        return np.dtype('O')
    return to_numpy_dtype(ds)
def list_to_numpy(seq, dshape=None, **kwargs):
    if isinstance(element_of(seq), dict):
        seq = list(records_to_tuples(dshape, seq))
    if (seq and isinstance(seq[0], Iterable) and not ishashable(seq[0]) and
            not isscalar(dshape)):
        seq = list(map(tuple, seq))
    return np.array(seq, dtype=dshape_to_numpy(dshape))
def __getattr__(self, key):
    assert key != '_hash', \
        '%s should set _hash in __init__' % type(self).__name__
    try:
        return _attr_cache[(self, key)]
    except KeyError:
        pass
    try:
        result = object.__getattribute__(self, key)
    except AttributeError:
        fields = dict(zip(map(valid_identifier, self.fields), self.fields))

        # prefer the method if there's a field with the same name
        methods = toolz.merge(
            schema_methods(self.dshape.measure),
            dshape_methods(self.dshape)
        )
        if key in methods:
            func = methods[key]
            if func in method_properties:
                result = func(self)
            else:
                result = boundmethod(func, self)
        elif self.fields and key in fields:
            if isscalar(self.dshape.measure):  # t.foo.foo is t.foo
                result = self
            else:
                result = self[fields[key]]
        else:
            raise
    _attr_cache[(self, key)] = result
    return result
def compserver(dataset):
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 415)  # 415: Unsupported Media Type
    try:
        payload = json.loads(request.data.decode('utf-8'))
    except ValueError:
        return ("Bad JSON. Got %s " % request.data, 400)  # 400: Bad Request

    ns = payload.get('namespace', dict())
    ns[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(payload['expr'], namespace=ns)
    assert len(expr._leaves()) == 1
    leaf = expr._leaves()[0]

    try:
        result = compute(expr, {leaf: dataset})
    except Exception as e:
        return ("Computation failed with message:\n%s" % e, 500)

    if iscollection(expr.dshape):
        result = into(list, result)
    elif isscalar(expr.dshape):
        result = coerce_scalar(result, str(expr.dshape))

    return json.dumps({'datashape': str(expr.dshape), 'data': result},
                      default=json_dumps)
def __getattr__(self, key):
    if key == '_hash':
        raise AttributeError(key)
    try:
        return _attr_cache[(self, key)]
    except KeyError:
        pass
    try:
        result = object.__getattribute__(self, key)
    except AttributeError:
        fields = dict(zip(map(valid_identifier, self.fields), self.fields))
        if self.fields and key in fields:
            if isscalar(self.dshape.measure):  # t.foo.foo is t.foo
                result = self
            else:
                result = self[fields[key]]
        else:
            d = toolz.merge(schema_methods(self.dshape.measure),
                            dshape_methods(self.dshape))
            if key in d:
                func = d[key]
                if func in method_properties:
                    result = func(self)
                else:
                    result = boundmethod(func, self)
            else:
                raise
    _attr_cache[(self, key)] = result
    return result
def __getattr__(self, key):
    assert key != "_hash", \
        "%s expressions should set _hash in __init__" % type(self).__name__
    try:
        result = object.__getattribute__(self, key)
    except AttributeError:
        fields = dict(zip(map(valid_identifier, self.fields), self.fields))

        # prefer the method if there's a field with the same name
        methods = toolz.merge(schema_methods(self.dshape.measure),
                              dshape_methods(self.dshape))
        if key in methods:
            func = methods[key]
            if func in method_properties:
                result = func(self)
            else:
                result = boundmethod(func, self)
        elif self.fields and key in fields:
            if isscalar(self.dshape.measure):  # t.foo.foo is t.foo
                result = self
            else:
                result = self[fields[key]]
        else:
            raise

    # cache the attribute lookup, getattr will not be invoked again.
    setattr(self, key, result)
    return result
def compserver(serial_format):
    try:
        serial = _get_format(serial_format)
    except KeyError:
        return 'Unsupported serialization format', 404

    try:
        payload = serial.loads(request.data)
    except ValueError:
        return ("Bad data. Got %s " % request.data, 400)  # 400: Bad Request

    ns = payload.get('namespace', dict())
    dataset = _get_data()
    ns[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(payload['expr'], namespace=ns)
    assert len(expr._leaves()) == 1
    leaf = expr._leaves()[0]

    try:
        result = compute(expr, {leaf: dataset})

        if iscollection(expr.dshape):
            result = odo(result, list)
        elif isscalar(expr.dshape):
            result = coerce_scalar(result, str(expr.dshape))
    except NotImplementedError as e:
        # 501: Not Implemented
        return ("Computation not supported:\n%s" % e, 501)
    except Exception as e:
        # 500: Internal Server Error
        return ("Computation failed with message:\n%s" % e, 500)

    return serial.dumps({'datashape': str(expr.dshape), 'data': result})
def test_base():
    for expr, exclusions in expressions.items():
        if iscollection(expr.dshape):
            model = into(DataFrame,
                         into(np.ndarray,
                              expr._subs({t: Data(base, t.dshape)})))
        else:
            model = compute(expr._subs({t: Data(base, t.dshape)}))
        print('\nexpr: %s\n' % expr)
        for source in sources:
            if id(source) in map(id, exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            T = Data(source)
            if iscollection(expr.dshape):
                result = into(type(model), expr._subs({t: T}))
                if isscalar(expr.dshape.measure):
                    assert set(into(list, result)) == set(into(list, model))
                else:
                    assert df_eq(result, model)
            elif isrecord(expr.dshape):
                result = compute(expr._subs({t: T}))
                assert into(tuple, result) == into(tuple, model)
            else:
                result = compute(expr._subs({t: T}))
                try:
                    result = result.scalar()
                except AttributeError:
                    pass
                assert result == model
def compserver():
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 415)  # 415: Unsupported Media Type
    try:
        payload = json.loads(request.data.decode('utf-8'))
    except ValueError:
        return ("Bad JSON. Got %s " % request.data, 400)  # 400: Bad Request

    ns = payload.get('namespace', dict())
    dataset = _get_data()
    ns[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(payload['expr'], namespace=ns)
    assert len(expr._leaves()) == 1
    leaf = expr._leaves()[0]

    try:
        result = compute(expr, {leaf: dataset})

        if iscollection(expr.dshape):
            result = odo(result, list)
        elif isscalar(expr.dshape):
            result = coerce_scalar(result, str(expr.dshape))
    except NotImplementedError as e:
        # 501: Not Implemented
        return ("Computation not supported:\n%s" % e, 501)
    except Exception as e:
        # 500: Internal Server Error
        return ("Computation failed with message:\n%s" % e, 500)

    return json.dumps({'datashape': str(expr.dshape), 'data': result},
                      default=json_dumps)
def __getattr__(self, key):
    assert key != '_hash', \
        '%s should set _hash in __init__' % type(self).__name__
    try:
        result = object.__getattribute__(self, key)
    except AttributeError:
        fields = dict(zip(map(valid_identifier, self.fields), self.fields))

        # prefer the method if there's a field with the same name
        methods = toolz.merge(
            schema_methods(self.dshape.measure),
            dshape_methods(self.dshape)
        )
        if key in methods:
            func = methods[key]
            if func in method_properties:
                result = func(self)
            else:
                result = boundmethod(func, self)
        elif self.fields and key in fields:
            if isscalar(self.dshape.measure):  # t.foo.foo is t.foo
                result = self
            else:
                result = self[fields[key]]
        else:
            raise

    # cache the attribute lookup, getattr will not be invoked again.
    _setattr(self, key, result)
    return result
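# A hedged sketch of the lookup order in the __getattr__ variants above,
# assuming a blaze-style `symbol`.  Variants that consult `methods` first
# resolve a name collision in favor of the method; the ones that check
# `fields` first favor the field.  Either way the result is cached, so
# __getattr__ runs at most once per attribute name.
#
#   t = symbol('t', 'var * {name: string, count: int32}')
#   t.count      # collides with the count reduction; method-first variants
#                # return the bound method
#   t['count']   # indexing always reaches the field
#   t.name       # no collision, so this is sugar for t['name']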
def _get_field(self, key):
    for child in self.children:
        if key in child.fields:
            if isscalar(child.dshape.measure):
                return child
            else:
                return child[key]
def __getattr__(self, key):
    if key == '_hash':
        raise AttributeError(key)
    try:
        return _attr_cache[(self, key)]
    except KeyError:
        pass
    try:
        result = object.__getattribute__(self, key)
    except AttributeError:
        fields = dict(zip(map(valid_identifier, self.fields), self.fields))
        if self.fields and key in fields:
            if isscalar(self.dshape.measure):  # t.foo.foo is t.foo
                result = self
            else:
                result = self[fields[key]]
        else:
            d = toolz.merge(schema_methods(self.dshape.measure),
                            dshape_methods(self.dshape))
            if key in d:
                func = d[key]
                if func in method_properties:
                    result = func(self)
                else:
                    result = functools.update_wrapper(partial(func, self),
                                                      func)
            else:
                raise
    _attr_cache[(self, key)] = result
    return result
def _get_field(self, key):
    for arg in self.args:
        if key in arg.fields:
            if isscalar(arg.dshape.measure):
                return arg
            else:
                return arg[key]
def __getattr__(self, key):
    if key == '_hash':
        raise AttributeError(key)
    try:
        return _attr_cache[(self, key)]
    except KeyError:
        pass
    try:
        result = object.__getattribute__(self, key)
    except AttributeError:
        fields = dict(zip(map(valid_identifier, self.fields), self.fields))

        # prefer the method if there's a field with the same name
        methods = toolz.merge(schema_methods(self.dshape.measure),
                              dshape_methods(self.dshape))
        if key in methods:
            func = methods[key]
            if func in method_properties:
                result = func(self)
            else:
                result = boundmethod(func, self)
        elif self.fields and key in fields:
            if isscalar(self.dshape.measure):  # t.foo.foo is t.foo
                result = self
            else:
                result = self[fields[key]]
        else:
            raise
    _attr_cache[(self, key)] = result
    return result
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError('data column names %s\n'
                             '\tnot equal to fields parameter %s,\n'
                             '\tuse Data(data).relabel(%s) to rename fields'
                             % (names, fields,
                                ', '.join('%s=%r' % (k, v)
                                          for k, v in zip(names, fields))))
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
def _name(self):
    if not isscalar(self.dshape.measure):
        return None
    l, r = name(self.lhs), name(self.rhs)
    if l and not r:
        return l
    if r and not l:
        return r
def fastmsgpack_materialize(data, dshape, odo_kwargs):
    if istabular(dshape):
        return odo(data, pd.DataFrame, **odo_kwargs)
    if iscollection(dshape):
        return odo(data, pd.Series, **odo_kwargs)
    if isscalar(dshape):
        return coerce_scalar(data, str(dshape), odo_kwargs)
    return data
def compute_up(t, seq, **kwargs):
    if isscalar(t._child.dshape.measure) and t.key == t._child._name:
        key = identity
    elif isinstance(t.key, (str, unicode, tuple, list)):
        key = rowfunc(t._child[t.key])
    else:
        key = rrowfunc(optimize(t.key, seq), t._child)
    return sorted(seq, key=key, reverse=not t.ascending)
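# A hedged usage sketch for the sequence-backend sort above (blaze-style
# names assumed): a plain field key takes the identity/rowfunc fast paths.
#
#   t = symbol('t', 'var * {name: string, amount: int32}')
#   compute(t.sort('amount', ascending=False),
#           [('Alice', 100), ('Bob', 200)])
#   # -> [('Bob', 200), ('Alice', 100)]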
def test_distinct():
    (chunk, chunk_expr), (agg, agg_expr) = split(t, count(t.amount.distinct()))

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(chunk.amount.distinct())

    assert isscalar(agg.dshape.measure)
    assert agg_expr.isidentical(count(agg.distinct()))
def test_reductions():
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.nunique())
    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(chunk.amount.distinct())
    assert isscalar(agg.dshape.measure)
    assert agg_expr.isidentical(agg.distinct().count())

    (chunk, chunk_expr), (agg, agg_expr) = \
        split(t, t.amount.nunique(keepdims=True))
    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(chunk.amount.distinct())
    assert isscalar(agg.dshape.measure)
    assert agg_expr.isidentical(agg.distinct().count(keepdims=True))
def test_sum():
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.sum())

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(chunk.amount.sum(keepdims=True))

    assert isscalar(agg.dshape.measure)
    assert agg_expr.isidentical(sum(agg))
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)

    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    if isinstance(data, InteractiveSymbol):
        return Data(data.data, dshape, name, fields, columns, schema,
                    **kwargs)

    if isinstance(data, _strtypes):
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError('data column names %s\n'
                             '\tnot equal to fields parameter %s,\n'
                             '\tuse Data(data).relabel(%s) to rename '
                             'fields' % (names, fields,
                                         ', '.join('%s=%r' % (k, v)
                                                   for k, v in zip(names,
                                                                   fields))))
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)
    return InteractiveSymbol(data, ds, name)
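# A hedged usage sketch of the fields= handling shared by the Data variants
# above: a list of tuples discovers as a Tuple measure, which fields= converts
# to a Record (the exact discovered dshape below is an assumption).
#
#   d = Data([('Alice', 100), ('Bob', 200)], fields=['name', 'amount'])
#   d.dshape   # -> dshape("2 * {name: string, amount: int32}")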
def select_to_base(sel, dshape=None, **kwargs):
    engine = sel.bind  # TODO: get engine from select
    with engine.connect() as conn:
        result = conn.execute(sel)
        assert not dshape or isscalar(dshape)
        result = list(result)[0][0]
    return result
def test_by_sum():
    (chunk, chunk_expr), (agg, agg_expr) = \
        split(t, by(t.name, total=t.amount.sum()))

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(by(chunk.name, total=chunk.amount.sum()))

    assert not isscalar(agg.dshape.measure)
    assert agg_expr.isidentical(by(agg.name, total=agg.total.sum()))
def _name(self):
    if not isscalar(self.dshape.measure):
        return None
    l, r = name(self.lhs), name(self.rhs)
    if l and not r:
        return l
    if r and not l:
        return r
    if l == r:
        return l