def test_base():
    """Compare every expression computed on each source with a reference.

    The reference ("model") is computed from ``base``.  Sources listed in an
    expression's exclusion list (matched by identity) are skipped.
    """
    for expr, exclusions in expressions.items():
        reference_expr = expr._subs({t: Data(base, t.dshape)})
        if iscollection(expr.dshape):
            model = into(DataFrame, into(np.ndarray, reference_expr))
        else:
            model = compute(reference_expr)
        print('\nexpr: %s\n' % expr)

        for source in sources:
            # Exclusions are matched by object identity, not equality.
            if any(source is excluded for excluded in exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            bound = expr._subs({t: Data(source)})

            if iscollection(expr.dshape):
                result = into(type(model), bound)
                if isscalar(expr.dshape.measure):
                    # Order is not compared for scalar-measure collections.
                    assert set(into(list, result)) == set(into(list, model))
                else:
                    assert df_eq(result, model)
                continue

            result = compute(bound)
            if isrecord(expr.dshape):
                assert into(tuple, result) == into(tuple, model)
                continue

            # Unwrap results that expose ``.scalar()``; others are used as-is.
            try:
                result = result.scalar()
            except AttributeError:
                pass
            assert result == model
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Parameters
    ----------
    expr : expression
        Must contain data resources (``expr._resources()`` must be truthy).
    n : int, optional
        ``n + 1`` rows are requested from ``expr.head``.

    Returns
    -------
    For a non-collection expression, the result of ``compute(expr)``.
    Otherwise a DataFrame built from the head of the expression; when the
    measure is not a record the single column is named ``expr._name``.

    Raises
    ------
    ValueError
        If the expression does not contain data resources.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    # NOTE: the previous body repeated the ``not iscollection`` test here and
    # carried a second computation path after an if/elif/else in which every
    # branch returned; both were unreachable and have been dropped.
    head = expr.head(n + 1)
    df = odo(head, DataFrame)
    if not isrecord(expr.dshape.measure):
        df.columns = [expr._name]
    return df
def test_comprehensive():
    """Check that the RDD and SparkSQL backends agree with pandas.

    Each expression maps to a list of backends to skip for that expression.
    Collection results are compared as sets; other results are compared
    directly.
    """
    L = [[100, 1, 'Alice'],
         [200, 2, 'Bob'],
         [300, 3, 'Charlie'],
         [400, 4, 'Dan'],
         [500, 5, 'Edith']]
    df = DataFrame(L, columns=['amount', 'id', 'name'])
    rdd = into(sc, df)
    srdd = into(sqlContext, df)
    t = TableSymbol('t', '{amount: int64, id: int64, name: string}')

    expressions = {
        t: [],
        t['id']: [],
        t.id.max(): [],
        t.amount.sum(): [],
        t.amount + 1: [],
        sin(t.amount): [srdd],  # sparksql without hiveql doesn't support math
        exp(t.amount): [srdd],  # sparksql without hiveql doesn't support math
        t.amount > 50: [],
        t[t.amount > 50]: [],
        t.sort('name'): [],
        t.sort('name', ascending=False): [],
        t.head(3): [],
        t.name.distinct(): [],
        t[t.amount > 50]['name']: [],
        t.id.map(lambda x: x + 1, '{id: int}'): [srdd],  # no udfs yet
        by(t.name, t.amount.sum()): [],
        by(t.id, t.id.count()): [],
        by(t[['id', 'amount']], t.id.count()): [],
        by(t[['id', 'amount']], (t.amount + 1).sum()): [],
        by(t[['id', 'amount']], t.name.nunique()): [rdd, srdd],
        by(t.id, t.amount.count()): [],
        by(t.id, t.id.nunique()): [rdd, srdd],
        # by(t, t.count()): [],
        # by(t.id, t.count()): [df],
        t[['amount', 'id']]: [],
        t[['id', 'amount']]: [],
    }

    for e, exclusions in expressions.items():
        if rdd not in exclusions:
            if iscollection(e.dshape):
                assert into(set, compute(e, rdd)) == into(set, compute(e, df))
            else:
                assert compute(e, rdd) == compute(e, df)
        if srdd not in exclusions:
            if iscollection(e.dshape):
                assert into(set, compute(e, srdd)) == into(set, compute(e, df))
            else:
                # Compare the SparkSQL result here; the previous code
                # re-tested ``rdd`` in this branch.
                assert compute(e, srdd) == compute(e, df)
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Parameters
    ----------
    expr : expression
        Must contain data resources (``expr.resources()`` must be truthy).
    n : int, optional
        ``n + 1`` rows are requested from ``expr.head``.

    Returns
    -------
    For a non-collection expression, the result of ``compute(expr)``.
    Otherwise a DataFrame with columns ``expr.fields``; it is empty when the
    computed head has length zero.

    Raises
    ------
    ValueError
        If the expression does not contain data resources.
    """
    if not expr.resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)
    result = compute(head)
    # ``len`` rather than truthiness: the result type is backend dependent.
    if not len(result):
        return DataFrame(columns=expr.fields)
    # The old code re-tested ``iscollection`` here; it is always true on this
    # path, so the implicit ``None`` fall-through could never be taken.
    return into(DataFrame(columns=expr.fields), result)
def summary(keepdims=False, **kwargs):
    """Build a ``Summary`` expression from named reductions.

    Parameters
    ----------
    keepdims : bool, optional
        Forwarded to ``Summary``.
    **kwargs
        Mapping of output name to expression.  Items are sorted by name
        before being handed to ``Summary``.

    Returns
    -------
    Summary

    Raises
    ------
    ValueError
        When exactly one value is given, its common subexpression is not a
        collection, and walking down its inputs reaches a node that does not
        have exactly one ``Expr`` input.
    """
    items = sorted(kwargs.items(), key=first)
    names = tuple(map(first, items))
    values = tuple(map(toolz.second, items))
    child = common_subexpression(*values)

    if len(kwargs) == 1 and not iscollection(child.dshape):
        # Walk down single-input chains until a collection is reached.
        while not iscollection(child.dshape):
            children = [i for i in child._inputs if isinstance(i, Expr)]
            if len(children) == 1:
                child = children[0]
            else:
                raise ValueError(
                    "summary: could not find a collection-valued child; "
                    "%s has %d expression inputs, expected exactly 1"
                    % (child, len(children)))

    return Summary(child, names, values, keepdims=keepdims)
def compserver(serial_format):
    """Deserialize a request, compute its expression and serialize the result.

    Returns either the serialized payload or a ``(message, status)`` tuple
    describing the failure.
    """
    try:
        serial = _get_format(serial_format)
    except KeyError:
        return 'Unsupported serialization format', 404

    try:
        payload = serial.loads(request.data)
    except ValueError:
        # 400: Bad Request
        return ("Bad data. Got %s " % request.data, 400)

    namespace = payload.get('namespace', dict())
    dataset = _get_data()
    namespace[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(payload['expr'], namespace=namespace)
    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        result = compute(expr, {leaf: dataset})
        if iscollection(expr.dshape):
            result = odo(result, list)
        elif isscalar(expr.dshape):
            result = coerce_scalar(result, str(expr.dshape))
    except NotImplementedError as err:
        # 501: Not Implemented
        return ("Computation not supported:\n%s" % err, 501)
    except Exception as err:
        # 500: Internal Server Error
        return ("Computation failed with message:\n%s" % err, 500)

    response = {'datashape': str(expr.dshape), 'data': result}
    return serial.dumps(response)
def compserver(payload, serial):
    """Compute the expression described by ``payload`` and serialize it.

    ``payload`` may carry ``namespace``, ``compute_kwargs`` and ``odo_kwargs``
    entries in addition to the required ``expr`` tree.  On failure a
    ``(message, status)`` tuple is returned instead of serialized data.
    """
    namespace = payload.get('namespace', dict())
    # Missing or falsy kwargs entries are treated as "no extra kwargs".
    compute_kwargs = payload.get('compute_kwargs') or {}
    odo_kwargs = payload.get('odo_kwargs') or {}

    dataset = _get_data()
    namespace[':leaf'] = symbol('leaf', discover(dataset))
    expr = from_tree(payload['expr'], namespace=namespace)

    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        result = compute(expr, {leaf: dataset}, **compute_kwargs)
        if iscollection(expr.dshape):
            result = odo(result, list, **odo_kwargs)
        elif isscalar(expr.dshape):
            result = coerce_scalar(result, str(expr.dshape))
    except NotImplementedError as err:
        # 501: Not Implemented
        return ("Computation not supported:\n%s" % err, 501)
    except Exception as err:
        # 500: Internal Server Error
        message = "Computation failed with message:\n%s: %s" % (
            type(err).__name__, err)
        return (message, 500)

    return serial.dumps({
        'datashape': pprint(expr.dshape, width=0),
        'data': result,
        'names': expr.fields
    })
def default_materialize(data, dshape, odo_kwargs):
    """Materialize ``data`` according to ``dshape``.

    Collections are converted to a list with ``odo``, scalars go through
    ``coerce_scalar``, and anything else is returned unchanged.
    """
    if iscollection(dshape):
        materialized = odo(data, list, **odo_kwargs)
    elif isscalar(dshape):
        materialized = coerce_scalar(data, str(dshape), odo_kwargs)
    else:
        materialized = data
    return materialized
def comp(datasets, name):
    """Compute a JSON-encoded expression against the dataset called ``name``.

    Returns a JSON response with the name, datashape and data, or a
    ``(message, 404)`` tuple when the request or dataset name is invalid.
    """
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)

    try:
        data = json.loads(request.data)
    except ValueError:
        return ("Bad JSON. Got %s " % request.data, 404)

    try:
        dset = datasets[name]
    except KeyError:
        return ("Dataset %s not found" % name, 404)

    leaf = Symbol(name, discover(dset))
    namespace = data.get('namespace', dict())
    namespace[name] = leaf

    expr = from_tree(data['expr'], namespace=namespace)
    result = compute(expr, dset)
    if iscollection(expr.dshape):
        result = into(list, result)

    response = {'name': name,
                'datashape': str(expr.dshape),
                'data': result}
    return jsonify(response)
def compserver(dataset):
    """Compute a JSON-encoded expression against ``dataset``.

    Returns a JSON string with the datashape and data, or a
    ``(message, status)`` tuple on failure.
    """
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)

    try:
        payload = json.loads(request.data.decode('utf-8'))
    except ValueError:
        return ("Bad JSON. Got %s " % request.data, 404)

    namespace = payload.get('namespace', dict())
    namespace[':leaf'] = symbol('leaf', discover(dataset))
    expr = from_tree(payload['expr'], namespace=namespace)

    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        result = compute(expr, {leaf: dataset})
    except Exception as err:
        return ("Computation failed with message:\n%s" % err, 500)

    # The list conversion happens outside the try block, so its errors are
    # not turned into a 500 tuple here.
    if iscollection(expr.dshape):
        result = into(list, result)

    body = {'datashape': str(expr.dshape), 'data': result}
    return json.dumps(body, default=json_dumps)
def compserver():
    """Compute a JSON-encoded expression against the server's dataset.

    Returns a JSON string with the datashape and data, or a
    ``(message, status)`` tuple on failure.
    """
    if request.headers['content-type'] != 'application/json':
        # 415: Unsupported Media Type
        return ("Expected JSON data", 415)

    try:
        payload = json.loads(request.data.decode('utf-8'))
    except ValueError:
        # 400: Bad Request
        return ("Bad JSON. Got %s " % request.data, 400)

    namespace = payload.get('namespace', dict())
    dataset = _get_data()
    namespace[':leaf'] = symbol('leaf', discover(dataset))
    expr = from_tree(payload['expr'], namespace=namespace)

    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        result = compute(expr, {leaf: dataset})
        if iscollection(expr.dshape):
            result = odo(result, list)
        elif isscalar(expr.dshape):
            result = coerce_scalar(result, str(expr.dshape))
    except NotImplementedError as err:
        # 501: Not Implemented
        return ("Computation not supported:\n%s" % err, 501)
    except Exception as err:
        # 500: Internal Server Error
        return ("Computation failed with message:\n%s" % err, 500)

    body = {'datashape': str(expr.dshape), 'data': result}
    return json.dumps(body, default=json_dumps)
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Parameters
    ----------
    expr : expression
        Must contain data resources (``expr._resources()`` must be truthy).
    n : int, optional
        ``n + 1`` rows are requested from ``expr.head``.

    Returns
    -------
    For a non-collection expression, the result of ``compute(expr)``.
    Otherwise a DataFrame built from the head of the expression; when the
    measure is not a record the single column is named ``expr._name``.

    Raises
    ------
    ValueError
        If the expression does not contain data resources.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    # A second ``not iscollection`` branch (returning ``odo(head, object)``)
    # used to follow; it could never run after the return above.
    head = expr.head(n + 1)
    df = odo(head, DataFrame)
    if not isrecord(expr.dshape.measure):
        df.columns = [expr._name]
    return df
def fastmsgpack_materialize(data, dshape, odo_kwargs):
    """Materialize ``data`` into pandas containers according to ``dshape``.

    Tabular shapes become a ``DataFrame``, other collections a ``Series``,
    scalars go through ``coerce_scalar``; anything else is returned as-is.
    """
    if istabular(dshape):
        target = pd.DataFrame
    elif iscollection(dshape):
        target = pd.Series
    else:
        target = None

    if target is not None:
        return odo(data, target, **odo_kwargs)
    if isscalar(dshape):
        return coerce_scalar(data, str(dshape), odo_kwargs)
    return data
def test_isnan():
    """``isnan`` round-trips through ``str``/``eval`` and yields booleans."""
    # NOTE: the local must stay named ``t``; ``eval`` below resolves it.
    t = symbol('t', 'var * {name: string, amount: real, timestamp: ?date}')

    candidates = (t.amount.isnan(), ~t.amount.isnan())
    for expr in candidates:
        rebuilt = eval(str(expr))
        assert rebuilt.isidentical(expr)

    nan_shape = t.amount.isnan().dshape
    assert iscollection(nan_shape)
    assert 'bool' in str(t.amount.isnan().dshape)
def test_chunks_compute():
    """Computing on the chunked data ``cL`` matches computing on ``L``."""
    for e in (s, s + 1, s.max(), s.mean() + 1, s.head()):
        result = compute(e, {s: cL})
        expected = compute(e, {s: L})
        if iscollection(e.dshape):
            # Normalize both sides to lists before comparing collections.
            result, expected = into(list, result), into(list, expected)
        assert result == expected
def test_isnan():
    """``isnan`` round-trips through ``str``/``eval`` and yields booleans."""
    # Presumably imported so that ``eval(str(expr))`` can resolve the name
    # ``isnan`` -- confirm against the expression's string form.
    from blaze import isnan
    # NOTE: the local must stay named ``t``; ``eval`` below resolves it.
    t = TableSymbol('t', '{name: string, amount: int, timestamp: ?date}')

    candidates = (t.amount.isnan(), ~t.amount.isnan())
    for expr in candidates:
        rebuilt = eval(str(expr))
        assert rebuilt.isidentical(expr)

    nan_shape = t.amount.isnan().dshape
    assert iscollection(nan_shape)
    assert 'bool' in str(t.amount.isnan().dshape)
def test_isnan():
    """Check string round-tripping and the datashape of ``isnan``."""
    # Presumably imported so that ``eval(str(expr))`` can resolve the name
    # ``isnan`` -- confirm against the expression's string form.
    from blaze import isnan
    # NOTE: ``eval`` looks up ``t`` in this scope, so keep the name.
    t = TableSymbol("t", "{name: string, amount: real, timestamp: ?date}")

    plain = t.amount.isnan()
    negated = ~t.amount.isnan()
    for expr in [plain, negated]:
        text = str(expr)
        assert eval(text).isidentical(expr)

    assert iscollection(t.amount.isnan().dshape)
    assert "bool" in str(t.amount.isnan().dshape)
def summary(keepdims=False, axis=None, **kwargs):
    """Build a ``Summary`` expression from named reductions.

    ``kwargs`` maps output names to expressions; they are ordered by name.
    ``axis`` is normalized to a tuple: ``None`` means every axis of the
    child, sets and lists are converted, and any other non-tuple value is
    wrapped in a one-element tuple.
    """
    ordered = sorted(kwargs.items(), key=toolz.first)
    names = tuple(map(toolz.first, ordered))
    values = tuple(map(toolz.second, ordered))
    child = common_subexpression(*values)

    if len(kwargs) == 1:
        # Descend until a collection-valued expression is found.
        while not iscollection(child.dshape):
            children = [i for i in child._inputs if isinstance(i, Expr)]
            if len(children) == 1:
                child = children[0]
            else:
                child = common_subexpression(*children)

    if axis is None:
        axis = tuple(range(ndim(child)))
    elif isinstance(axis, (set, list)):
        axis = tuple(axis)
    elif not isinstance(axis, tuple):
        axis = (axis,)

    return Summary(child, names, values, keepdims=keepdims, axis=axis)
def summary(keepdims=False, axis=None, **kwargs):
    """Create a ``Summary`` of several named expressions.

    Names and values are taken from ``kwargs`` in name-sorted order.  The
    ``axis`` argument always reaches ``Summary`` as a tuple.
    """
    pairs = sorted(kwargs.items(), key=toolz.first)
    names = tuple(map(toolz.first, pairs))
    values = tuple(map(toolz.second, pairs))

    child = common_subexpression(*values)
    single_value = len(kwargs) == 1
    # Only a lone value triggers the search for a collection-valued ancestor.
    while single_value and not iscollection(child.dshape):
        expr_inputs = [i for i in child._inputs if isinstance(i, Expr)]
        if len(expr_inputs) == 1:
            child = expr_inputs[0]
        else:
            child = common_subexpression(*expr_inputs)

    if axis is None:
        normalized_axis = tuple(range(ndim(child)))
    elif isinstance(axis, (set, list)):
        normalized_axis = tuple(axis)
    elif isinstance(axis, tuple):
        normalized_axis = axis
    else:
        normalized_axis = (axis, )

    return Summary(child, names, values,
                   keepdims=keepdims, axis=normalized_axis)
def test_multiple_csv_files():
    """A glob over several CSV files computes like the in-memory rows."""
    d = {"mult1.csv": "name,val\nAlice,1\nBob,2",
         "mult2.csv": "name,val\nAlice,3\nCharlie,4"}
    data = [("Alice", 1), ("Bob", 2), ("Alice", 3), ("Charlie", 4)]

    with filetexts(d) as fns:
        r = resource("mult*.csv")
        s = symbol("s", discover(r))

        exprs = [s,
                 s.name,
                 s.name.nunique(),
                 s.name.count_values(),
                 s.val.mean()]
        for e in exprs:
            from_files = compute(e, {s: r})
            from_memory = compute(e, {s: data})
            if iscollection(e.dshape):
                # Collections are compared as sets.
                from_files = into(set, from_files)
                from_memory = into(set, from_memory)
            assert from_files == from_memory
def compserver():
    """Compute the expression in the JSON request and return it as JSON.

    Raises ``ServerException`` (status 404) when the request carries no JSON.
    """
    payload = request.json
    if not payload:
        raise ServerException('Expected JSON data', status_code=404)

    expr, result = _compserver(payload)
    if iscollection(expr.dshape):
        result = into(list, result)

    body = {
        'datashape': str(expr.dshape),
        'names': expr.fields,
        'data': result
    }
    return json.dumps(body, default=json_dumps)
def test_summary():
    """Exercise datashape, hashing, repr and child selection of ``summary``."""
    # NOTE: ``eval(str(s))`` resolves ``t`` in this scope; keep the name.
    t = TableSymbol("t", "{id: int32, name: string, amount: int32}")
    s = summary(total=t.amount.sum(), num=t.id.count())

    assert s.dshape == dshape("{num: int32, total: int64}")
    assert hash(s)
    assert eval(str(s)).isidentical(s)

    text = str(s)
    for fragment in ("summary(", "total=", "num=", str(t.amount.sum())):
        assert fragment in text

    single = summary(total=t.amount.sum())
    assert not single._child.isidentical(t.amount.sum())

    shifted = summary(total=t.amount.sum() + 1)
    assert iscollection(shifted._child.dshape)
def test_summary():
    """Exercise datashape, hashing, repr and child selection of ``summary``."""
    # NOTE: ``eval(str(s))`` resolves ``t`` in this scope; keep the name.
    t = symbol('t', 'var * {id: int32, name: string, amount: int32}')
    total = t.amount.sum()
    s = summary(total=total, num=t.id.count())

    assert s.dshape == dshape('{num: int32, total: int64}')
    assert hash(s)
    assert eval(str(s)).isidentical(s)

    rendered = str(s)
    assert 'summary(' in rendered
    assert 'total=' in rendered
    assert 'num=' in rendered
    assert str(t.amount.sum()) in rendered

    # A lone reduction is not itself used as the summary's child.
    assert not summary(total=t.amount.sum())._child.isidentical(
        t.amount.sum())
    assert iscollection(summary(total=t.amount.sum() + 1)._child.dshape)
def __getitem__(self, key):
    """Dispatch ``self[key]`` on the type of ``key``.

    * a string naming a field       -> that field
    * a collection-valued ``Expr``  -> a selection
    * a list of strings             -> a projection onto those fields
    * ints / slices / None / lists / ndarrays (or a tuple of them)
                                    -> a slice

    Raises
    ------
    ValueError
        If a list of names is not a subset of ``self.fields``, or if the key
        matches none of the cases above.
    """
    index_types = (int, slice, type(None), list, np.ndarray)

    if isinstance(key, _strtypes) and key in self.fields:
        return self._get_field(key)
    elif isinstance(key, Expr) and iscollection(key.dshape):
        return selection(self, key)
    elif (isinstance(key, list) and
          builtins.all(isinstance(k, _strtypes) for k in key)):
        if set(key).issubset(self.fields):
            return self._project(key)
        else:
            raise ValueError("Names %s not consistent with known names %s"
                             % (key, self.fields))
    # Use ``builtins.all`` here as in the branch above, so that a
    # module-level ``all`` cannot shadow the builtin.
    elif (isinstance(key, tuple) and
          builtins.all(isinstance(k, index_types) for k in key)):
        return sliceit(self, key)
    elif isinstance(key, index_types):
        return sliceit(self, (key,))
    raise ValueError("Not understood %s[%s]" % (self, key))
def test_multiple_csv_files():
    """A glob over several CSV files computes like the in-memory rows."""
    d = {'mult1.csv': 'name,val\nAlice,1\nBob,2',
         'mult2.csv': 'name,val\nAlice,3\nCharlie,4'}
    dta = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]

    with filetexts(d) as fns:
        r = data('mult*.csv')
        s = symbol('s', discover(r))

        exprs = [s,
                 s.name,
                 s.name.nunique(),
                 s.name.count_values(),
                 s.val.mean()]
        for e in exprs:
            from_files = compute(e, {s: r})
            from_memory = compute(e, {s: dta})
            if iscollection(e.dshape):
                # Collections are compared as sets.
                from_files = into(set, from_files)
                from_memory = into(set, from_memory)
            assert from_files == from_memory
def test_multiple_csv_files():
    """Results from a multi-file CSV resource equal those from plain rows."""
    file_contents = {'mult1.csv': 'name,val\nAlice,1\nBob,2',
                     'mult2.csv': 'name,val\nAlice,3\nCharlie,4'}
    rows = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]

    with filetexts(file_contents) as fns:
        r = resource('mult*.csv')
        s = symbol('s', discover(r))

        for e in (s, s.name, s.name.nunique(),
                  s.name.count_values(), s.val.mean()):
            a = compute(e, {s: r})
            b = compute(e, {s: rows})
            if iscollection(e.dshape):
                # Collections are compared as sets.
                a = into(set, a)
                b = into(set, b)
            assert a == b
def compserver():
    """Compute the request's expression using the format named by its
    content type.

    Returns the serialized result, or a ``(message, status)`` tuple on
    failure.
    """
    content_type = request.headers['content-type']
    matched = mimetype_regex.match(content_type)
    if matched is None:
        return 'Unsupported serialization format %s' % content_type, 415

    format_name = matched.groups()[0]
    try:
        serial = _get_format(format_name)
    except KeyError:
        return (
            "Unsupported serialization format '%s'" % format_name,
            415,
        )

    try:
        payload = serial.loads(request.data)
    except ValueError:
        # 400: Bad Request
        return ("Bad data. Got %s " % request.data, 400)

    namespace = payload.get('namespace', dict())
    dataset = _get_data()
    namespace[':leaf'] = symbol('leaf', discover(dataset))
    expr = from_tree(payload['expr'], namespace=namespace)

    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        result = compute(expr, {leaf: dataset})
        if iscollection(expr.dshape):
            result = odo(result, list)
        elif isscalar(expr.dshape):
            result = coerce_scalar(result, str(expr.dshape))
    except NotImplementedError as err:
        # 501: Not Implemented
        return ("Computation not supported:\n%s" % err, 501)
    except Exception as err:
        # 500: Internal Server Error
        return ("Computation failed with message:\n%s" % err, 500)

    return serial.dumps({
        'datashape': str(expr.dshape),
        'data': result,
        'names': expr.fields
    })
def concrete_type(ds):
    """ A type into which we can safely deposit streaming data

    Strings are parsed into a datashape first.  Non-collections map to
    ``type(ds)``.

    >>> concrete_type('5 * int').__name__
    'ndarray'
    >>> concrete_type('var * {name: string, amount: int}').__name__
    'DataFrame'
    """
    if isinstance(ds, (str, unicode)):
        ds = dshape(ds)
    if not iscollection(ds):
        return type(ds)

    dims = ndim(ds)
    measure = ds.measure
    if dims == 1 and isrecord(measure):
        return pd.DataFrame
    if dims > 1 or isscalar(measure):
        return np.ndarray
    return list
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Non-collections are computed directly.  Collections are returned as a
    DataFrame built from ``expr.head(n + 1)``; for a non-record measure the
    single column is named ``expr._name``.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    computed = compute(expr.head(n + 1))
    if len(computed) == 0:
        return DataFrame(columns=expr.fields)

    record_like = isrecord(expr.dshape.measure)
    frame = into(DataFrame, computed, dshape=expr.dshape)
    if not record_like:
        frame.columns = [expr._name]
    return frame
def __getitem__(self, key):
    """Dispatch ``self[key]`` on the type of ``key``.

    * a string naming a field       -> that field
    * a collection-valued ``Expr``  -> a selection
    * a list of strings             -> a projection onto those fields
    * an int / slice (or a tuple of them) -> a ``Slice``

    Raises
    ------
    ValueError
        If a list of names is not a subset of ``self.fields``, or if the key
        matches none of the cases above.
    """
    if isinstance(key, _strtypes) and key in self.fields:
        return self._get_field(key)
    elif isinstance(key, Expr) and iscollection(key.dshape):
        return selection(self, key)
    elif (isinstance(key, list) and
          builtins.all(isinstance(k, _strtypes) for k in key)):
        if set(key).issubset(self.fields):
            return self._project(key)
        else:
            raise ValueError('Names %s not consistent with known names %s'
                             % (key, self.fields))
    # Use ``builtins.all`` here as in the branch above, so that a
    # module-level ``all`` cannot shadow the builtin.
    elif (isinstance(key, tuple) and
          builtins.all(isinstance(k, (int, slice)) for k in key)):
        return Slice(self, key)
    elif isinstance(key, (slice, int)):
        return Slice(self, (key,))
    raise ValueError("Not understood %s[%s]" % (self, key))
def coerce_core(result, dshape, odo_kwargs=None):
    """Coerce data to a core data type."""
    if iscoretype(result):
        return result

    if isscalar(dshape):
        return coerce_scalar(result, dshape, odo_kwargs=odo_kwargs)

    if istabular(dshape) and isrecord(dshape.measure):
        return into(DataFrame, result, **(odo_kwargs or {}))

    if not iscollection(dshape):
        raise ValueError("Expr does not evaluate to a core return type")

    dim = _dimensions(dshape)
    if dim == 1:
        return into(Series, result, **(odo_kwargs or {}))
    if dim > 1:
        return into(np.ndarray, result, **(odo_kwargs or {}))
    raise ValueError("Expr with dshape dimensions < 1 should have been handled earlier: dim={}".format(str(dim)))
def test_base():
    """Compare every expression computed on each source with a reference.

    The reference ("model") is computed from ``base``.  Sources listed in an
    expression's exclusion list (matched by identity) are skipped.
    """
    for expr, exclusions in expressions.items():
        model = compute(expr._subs({t: Table(base, t.schema)}))
        print('\nexpr: %s\n' % expr)

        for source in sources:
            # Exclusions are matched by object identity, not equality.
            if any(source is excluded for excluded in exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            bound = expr._subs({t: Table(source)})

            if iscollection(expr.dshape):
                result = into(model, bound)
                if isscalar(expr.dshape.measure):
                    assert set(into([], result)) == set(into([], model))
                else:
                    assert df_eq(result, model)
                continue

            result = compute(bound)
            if isrecord(expr.dshape):
                assert into(tuple, result) == into(tuple, model)
            else:
                assert result == model
def test_base():
    """Check each source against a model computed from ``base``.

    A source is skipped for an expression when that very object appears in
    the expression's exclusion list.
    """
    for expr, exclusions in expressions.items():
        model = compute(expr._subs({t: Data(base, t.dshape)}))
        print('\nexpr: %s\n' % expr)

        excluded_ids = [id(item) for item in exclusions]
        for source in sources:
            if id(source) in excluded_ids:
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            T = Data(source)
            substituted = expr._subs({t: T})

            if iscollection(expr.dshape):
                result = into(model, substituted)
                if isscalar(expr.dshape.measure):
                    # Order is not compared for scalar-measure collections.
                    assert set(into([], result)) == set(into([], model))
                else:
                    assert df_eq(result, model)
            elif isrecord(expr.dshape):
                result = compute(substituted)
                assert into(tuple, result) == into(tuple, model)
            else:
                result = compute(substituted)
                assert result == model
def TableSymbol(name, dshape):
    """ A Symbol for Tabular data

    This is a leaf in the expression tree

    Examples
    --------

    >>> accounts = TableSymbol('accounts',
    ...                        '{name: string, amount: int, id: int}')
    >>> accounts.amount + 1
    accounts.amount + 1

    We define a TableSymbol with a name like ``accounts`` and the datashape of
    a single row, called a schema.
    """
    parsed = dshape
    if isinstance(parsed, _strtypes):
        parsed = datashape.dshape(parsed)
    # A bare row schema is promoted to a variable-length collection of rows.
    if not iscollection(parsed):
        parsed = datashape.var * parsed
    return symbol(name, parsed)
def compserver(datasets):
    """Compute a JSON-encoded expression against the named ``datasets``.

    Returns a JSON response with the datashape and data, or a
    ``(message, 404)`` tuple when the request is not valid JSON.
    """
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)

    try:
        data = json.loads(request.data)
    except ValueError:
        return ("Bad JSON. Got %s " % request.data, 404)

    # One symbol per dataset; a client-supplied namespace is merged on top.
    tree_ns = dict((name, Symbol(name, discover(datasets[name])))
                   for name in datasets)
    if 'namespace' in data:
        tree_ns = merge(tree_ns, data['namespace'])

    expr = from_tree(data['expr'], namespace=tree_ns)

    compute_ns = dict((Symbol(name, discover(datasets[name])), datasets[name])
                      for name in datasets)
    result = compute(expr, compute_ns)
    if iscollection(expr.dshape):
        result = into(list, result)

    response = {'datashape': str(expr.dshape), 'data': result}
    return jsonify(response)
def broadcast_collect(expr, Broadcastable=Broadcastable,
                      WantToBroadcast=WantToBroadcast):
    """ Collapse expression down using Broadcast - Tabular cases only

    Expressions of type Broadcastables are swallowed into Broadcast
    operations

    >>> t = Symbol('t', 'var * {x: int, y: int, z: int, when: datetime}')
    >>> expr = (t.x + 2*t.y).distinct()

    >>> broadcast_collect(expr)
    distinct(Broadcast(_children=(t,), _scalars=(t,), _scalar_expr=t.x + (2 * t.y)))
    """
    wants_broadcast = isinstance(expr, WantToBroadcast)
    if wants_broadcast and iscollection(expr.dshape):
        leaves = leaves_of_type(Broadcastable, expr)
        # Leaves are ordered by their string form.
        expr = broadcast(expr, sorted(leaves, key=str))

    # Recurse down
    inputs = expr._inputs
    collected = []
    for child in inputs:
        collected.append(
            broadcast_collect(child, Broadcastable, WantToBroadcast))
    return expr._subs(dict(zip(inputs, collected)))
def coerce_core(result, dshape, odo_kwargs=None):
    """Coerce data to a core data type."""
    if iscoretype(result):
        return result

    if isscalar(dshape):
        coerced = coerce_scalar(result, dshape, odo_kwargs=odo_kwargs)
    elif istabular(dshape) and isrecord(dshape.measure):
        coerced = into(DataFrame, result, **(odo_kwargs or {}))
    elif iscollection(dshape):
        dim = _dimensions(dshape)
        if dim < 1 or dim != dim:
            # Neither ``dim == 1`` nor ``dim > 1`` can hold.
            msg = "Expr with dshape dimensions < 1 should have been handled earlier: dim={}"
            raise ValueError(msg.format(str(dim)))
        container = Series if dim == 1 else np.ndarray
        coerced = into(container, result, **(odo_kwargs or {}))
    else:
        msg = "Expr does not evaluate to a core return type"
        raise ValueError(msg)
    return coerced
def broadcast_collect(expr, Broadcastable=Broadcastable,
                      WantToBroadcast=WantToBroadcast):
    """ Collapse expression down using Broadcast - Tabular cases only

    Expressions of type Broadcastables are swallowed into Broadcast
    operations

    >>> t = symbol('t', 'var * {x: int, y: int, z: int, when: datetime}')
    >>> expr = (t.x + 2*t.y).distinct()

    >>> broadcast_collect(expr)
    distinct(Broadcast(_children=(t,), _scalars=(t,), _scalar_expr=t.x + (2 * t.y)))
    """
    if isinstance(expr, WantToBroadcast) and iscollection(expr.dshape):
        ordered_leaves = sorted(leaves_of_type(Broadcastable, expr), key=str)
        expr = broadcast(expr, ordered_leaves)

    # Recurse down
    def collect(node):
        return broadcast_collect(node, Broadcastable, WantToBroadcast)

    children = [collect(i) for i in expr._inputs]
    substitutions = dict(zip(expr._inputs, children))
    return expr._subs(substitutions)
def compserver(datasets):
    """Evaluate the JSON expression in the request over ``datasets``.

    The response carries the expression's datashape and its computed data;
    malformed requests yield a ``(message, 404)`` tuple.
    """
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)

    try:
        data = json.loads(request.data)
    except ValueError:
        return ("Bad JSON. Got %s " % request.data, 404)

    tree_ns = {}
    for name in datasets:
        tree_ns[name] = Symbol(name, discover(datasets[name]))
    if 'namespace' in data:
        tree_ns = merge(tree_ns, data['namespace'])
    expr = from_tree(data['expr'], namespace=tree_ns)

    # Fresh symbols (not the ones in ``tree_ns``) key the compute namespace.
    compute_ns = {}
    for name in datasets:
        compute_ns[Symbol(name, discover(datasets[name]))] = datasets[name]

    result = compute(expr, compute_ns)
    if iscollection(expr.dshape):
        result = into(list, result)
    return jsonify({'datashape': str(expr.dshape), 'data': result})
See np.linalg.norm """ if ord is None or ord == 'fro': ord = 2 if ord == inf: return max(abs(expr), axis=axis, keepdims=keepdims) elif ord == -inf: return min(abs(expr), axis=axis, keepdims=keepdims) elif ord == 1: return sum(abs(expr), axis=axis, keepdims=keepdims) elif ord % 2 == 0: return sum(expr ** ord, axis=axis, keepdims=keepdims) ** (1.0 / ord) return sum(abs(expr) ** ord, axis=axis, keepdims=keepdims) ** (1.0 / ord) dshape_method_list.extend([ (iscollection, set([count, nelements])), (lambda ds: (iscollection(ds) and (isstring(ds) or isnumeric(ds) or isboolean(ds) or isdatelike(ds) or isinstance(ds, TimeDelta))), set([min, max])), (lambda ds: len(ds.shape) == 1, set([nrows, nunique])), (lambda ds: iscollection(ds) and isboolean(ds), set([any, all])), (lambda ds: iscollection(ds) and (isnumeric(ds) or isboolean(ds)), set([mean, sum, std, var, vnorm])), ]) method_properties.update([nrows])
def valid_reducer(expr):
    """Return whether ``expr`` is a non-collection whose measure is a scalar
    or a record that is not nested."""
    ds = expr.dshape
    measure = ds.measure
    if iscollection(ds):
        return False
    return (isscalar(measure) or
            (isrecord(measure) and not is_nested_record(measure)))
def schema(self):
    """Return ``self.dshape.subshape[0]`` for a collection datashape.

    Raises ``TypeError`` when the datashape is not a collection.
    """
    if not iscollection(self.dshape):
        raise TypeError("Non-tabular datashape, %s" % self.dshape)
    return self.dshape.subshape[0]
def sparksql_dataframe_to_list(df, dshape=None, **kwargs):
    """Collect ``df``; for a collection datashape whose measure is not a
    record, return only element 0 of each collected row."""
    rows = df.collect()
    if dshape is None:
        return rows
    if iscollection(dshape) and not isrecord(dshape.measure):
        take_first = get(0)
        return list(map(take_first, rows))
    return rows
def compute_up(t, seq, **kwargs):
    """Apply the row function of ``t`` to ``seq``.

    When the child of ``t`` is a collection the function is mapped over
    ``seq`` with ``deepmap``; otherwise it is applied to ``seq`` directly.
    """
    func = rowfunc(t)
    if not iscollection(t._child.dshape):
        return func(seq)
    depth = ndim(child(t))
    return deepmap(func, seq, n=depth)
def concrete_type(ds):
    """A type into which we can safely deposit streaming data.

    Parameters
    ----------
    ds : DataShape

    Returns
    -------
    type : type
        The concrete type corresponding to the DataShape `ds`

    Notes
    -----
    * This will return a Python type if possible
    * Option types are not handled specially. The base type of the option type
      is returned.

    Examples
    --------
    >>> concrete_type('5 * int')
    <class 'pandas.core.series.Series'>
    >>> concrete_type('var * {name: string, amount: int}')
    <class 'pandas.core.frame.DataFrame'>
    >>> concrete_type('float64')
    <... 'float'>
    >>> concrete_type('float32')
    <... 'float'>
    >>> concrete_type('int64')
    <... 'int'>
    >>> concrete_type('int32')
    <... 'int'>
    >>> concrete_type('uint8')
    <... 'int'>
    >>> concrete_type('bool')
    <... 'bool'>
    >>> concrete_type('complex[float64]')
    <... 'complex'>
    >>> concrete_type('complex[float32]')
    <... 'complex'>
    >>> concrete_type('?int64')
    <... 'int'>
    """
    if isinstance(ds, (str, unicode)):
        ds = dshape(ds)

    if not iscollection(ds):
        if not isscalar(ds.measure):
            return type(ds)
        # Use the wrapped ``ty`` attribute when the measure has one.
        measure = getattr(ds.measure, 'ty', ds.measure)
        families = ((integral, int),
                    (floating, float),
                    (boolean, bool),
                    (complexes, complex))
        for family, python_type in families:
            if measure in family.types:
                return python_type
        return ds.measure.to_numpy_dtype().type

    if ndim(ds) == 1:
        if isrecord(ds.measure):
            return pd.DataFrame
        return pd.Series
    if ndim(ds) > 1:
        return np.ndarray
    return list
def test_keepdims_equals_true_doesnt_mess_up_agg_shape():
    """The aggregate leaf produced by ``split`` has a collection datashape."""
    x = symbol('x', '10 * int')
    chunk_pair, agg_pair = split(x, x.sum(), keepdims=False)
    (chunk, chunk_expr) = chunk_pair
    (agg, agg_expr) = agg_pair
    assert iscollection(agg.dshape)
def valid_grouper(expr):
    """Return whether ``expr`` is a collection whose measure (or the
    measure's ``key`` attribute, when present) is a scalar, or whose measure
    is a record that is not nested."""
    ds = expr.dshape
    measure = ds.measure
    flat_record = lambda: (isrecord(measure) and
                           not is_nested_record(measure))
    return (iscollection(ds) and
            (isscalar(getattr(measure, 'key', measure)) or flat_record()))
return tuple(s) def ndim(expr): """ Number of dimensions of expression >>> symbol('s', '3 * var * int32').ndim 2 """ return len(shape(expr)) dshape_method_list.extend([ (lambda ds: True, set([apply])), (iscollection, set([shape, ndim])), (lambda ds: iscollection(ds) and isscalar(ds.measure), set([coerce])) ]) schema_method_list.extend([ (isscalar, set([label, relabel, coerce])), (isrecord, set([relabel])), (lambda ds: isinstance(ds, Option), {coalesce}), ]) method_properties.update([shape, ndim]) @dispatch(Expr) def discover(expr): return expr.dshape
return broadcast(USub, a) def _invert(a): return broadcast(Not, a) def isnan(expr): return broadcast(math.isnan, expr) from .expressions import dshape_method_list def isreal(ds): if isinstance(ds, DataShape) and len(ds) == 1: ds = ds[0] if isinstance(ds, Option): ds = ds.ty return isinstance(ds, Unit) and 'float' in str(ds) dshape_method_list.extend([ (lambda ds: iscollection(ds) and isscalar(ds.measure), set([ _eq, _ne, _lt, _le, _gt, _ge, _add, _radd, _mul, _rmul, _div, _rdiv, _floordiv, _rfloordiv, _sub, _rsub, _pow, _rpow, _mod, _rmod, _or, _ror, _and, _rand, _neg, _invert ])), (lambda ds: iscollection(ds) and isreal(ds.measure), set([isnan])) ])
""" Vector norm See np.linalg.norm """ if ord is None or ord == 'fro': ord = 2 if ord == inf: return max(abs(expr), axis=axis, keepdims=keepdims) elif ord == -inf: return min(abs(expr), axis=axis, keepdims=keepdims) elif ord == 1: return sum(abs(expr), axis=axis, keepdims=keepdims) elif ord % 2 == 0: return sum(expr**ord, axis=axis, keepdims=keepdims)**(1.0 / ord) return sum(abs(expr)**ord, axis=axis, keepdims=keepdims)**(1.0 / ord) dshape_method_list.extend([ (iscollection, set([count, nelements])), (lambda ds: (iscollection(ds) and (isstring(ds) or isnumeric(ds) or isboolean(ds) or isdatelike(ds))), set([min, max])), (lambda ds: len(ds.shape) == 1, set([nrows, nunique])), (lambda ds: iscollection(ds) and isboolean(ds), set([any, all])), (lambda ds: iscollection(ds) and (isnumeric(ds) or isboolean(ds)), set([mean, sum, std, var, vnorm])), ]) method_properties.update([nrows])
if field not in to_remove: new_fields.append(field) else: to_remove.remove(field) if to_remove: raise ValueError( 'fields %r were not in the fields of expr (%r)' % (sorted(to_remove), expr.fields), ) return expr[new_fields] dshape_method_list.extend([ (lambda ds: True, {apply}), (iscollection, {shape, ndim}), (lambda ds: iscollection(ds) and isscalar(ds.measure), {coerce}), (istabular, {drop_field}), ]) schema_method_list.extend([ (isscalar, {label, relabel, coerce}), (isrecord, {relabel}), (lambda ds: isinstance(ds, Option), {coalesce}), ]) method_properties.update([shape, ndim]) @dispatch(Expr) def discover(expr): return expr.dshape