def commodityTonnage_by_level(d, outdir, level):
    name = '-'.join(level)
    outpath = path.join(outdir, 'tonnage_by_{}.csv'.format(name))
    if replace or not path.exists(outpath):
        dr = bz.by(bz.merge(d[level], d.commodity, d.year, d.month, d.category),
                   commodityTonnage=d.commodityTonnage.sum())
        do = bz.by(bz.merge(d[level], d.commodity, d.category),
                   commodityTonnage=d.commodityTonnage.sum())  # d[[level, 'commodity']]
        save(do, outpath, replace)
        content, by_year_month = commodityTonnage_over_time(dr, outdir, level)
        commodityTonnage_by_year(dr, outdir, level)
        if content:
            commodityTonnage_by_month(by_year_month, outdir, level)
    return
def _odo_object(self):
    from blaze import by, merge, head
    table = self.binded_table
    if self.whereclauses:
        wheres = bind_list(self, self.whereclauses)
        table = table[reduce(lambda x, y: x and y, wheres)]
    tb = self.tables[self.table.name]
    self.tables[self.table.name] = table
    columns = bind_list(self, self.columns) or [table[_] for _ in table.fields]
    self.tables[self.table.name] = tb
    if self.groupclauses:
        groups = bind_list(self, self.groupclauses)
        groups = [table[_.fields[0]] for _ in groups]
        names = [_.fields[0] for _ in groups]
        groups = merge(*groups) if len(groups) > 1 else groups[0]
        table = by(groups,
                   **{c.fields[0]: c for c in columns if c.fields[0] not in names})
    if self.orderclauses:
        orders = bind_list(self, self.orderclauses)
        for order in reversed(orders):
            table = table.sort(*order)
    if self.limit:
        table = head(table, self.limit)
    return table[[_.fields[0] for _ in columns]]
def load_adjusted_array(self, columns, dates, assets, mask):
    expr = self._expr
    filtered = expr[expr[TS_FIELD_NAME] <= dates[0]]
    lower = odo(
        bz.by(
            filtered[SID_FIELD_NAME],
            timestamp=filtered[TS_FIELD_NAME].max(),
        ).timestamp.min(),
        pd.Timestamp,
        **self._odo_kwargs)
    if pd.isnull(lower):
        # If there is no lower date, just query for data in the date
        # range. It must all be null anyways.
        lower = dates[0]

    raw = odo(
        expr[(expr[TS_FIELD_NAME] >= lower) &
             (expr[TS_FIELD_NAME] <= dates[-1])],
        pd.DataFrame,
        **self._odo_kwargs)

    sids = raw.loc[:, SID_FIELD_NAME]
    raw.drop(sids[~(sids.isin(assets) | sids.notnull())].index, inplace=True)

    gb = raw.groupby(SID_FIELD_NAME)

    def mkseries(idx, raw_loc=raw.loc):
        vs = raw_loc[idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]].values
        return pd.Series(
            index=pd.DatetimeIndex(vs[:, 0]),
            data=vs[:, 1],
        )

    return EarningsCalendarLoader(
        dates,
        valmap(mkseries, gb.groups),
    ).load_adjusted_array(columns, dates, assets, mask)
def test_by(ctx, db, grouper, reducer, reduction):
    t = db.t
    expr = by(t[grouper], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
def build_training_set(
        rundb: str,
        topicsfile: str,
        index_path: str,
        axioms: typing.Sequence[PairwiseAxiom],
        max_rank=100):
    queries_by_id, rd, ctx = initialize(rundb, topicsfile, index_path, max_rank)
    system_query = rd[['system', 'query']].distinct().sort('query')

    from blaze import by, merge
    ranking_lengths = by(merge(rd.system, rd.query), n=rd.rank.count()).n
    cpair_count = int((ranking_lengths * ranking_lengths - ranking_lengths).sum())
    iter_count = cpair_count * len(axioms) * 2
    pbar = tqdm.tqdm(total=iter_count)

    def loop(i):
        for item in i:
            pbar.update()
            yield item

    for sys, qid in system_query:
        sqrun = rd[(rd['system'] == sys) & (rd['query'] == qid)]
        ranking = [ctx.c.get_document(did[0]) for did in sqrun.docid]
        query = queries_by_id[qid]
        part = build_training_set_for_one_ranking(ctx, axioms, query, sys,
                                                  ranking, loop)
        yield part
def test_multikey_by(ctx, db, reducer, reduction):
    t = db.t
    expr = by(t[['id', 'amount']], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
def create_plot(team="LAA", year=2012):
    expr = bz.by(db.Salaries.teamID,
                 avg=db.Salaries.salary.mean(),
                 max=db.Salaries.salary.max(),
                 ratio=db.Salaries.salary.max() / db.Salaries.salary.min())
    expr = expr.sort('ratio', ascending=False)
    df_salary_gb = into(pd.DataFrame, expr)
    source1 = into(ColumnDataSource, df_salary_gb[["teamID", "avg"]])
    plot1 = plt.figure(title="Salary ratio by team",
                       x_range=list(df_salary_gb["teamID"]))
    plot1.scatter(x="teamID", y="avg", source=source1, size=20)
    plot1.xaxis.major_label_orientation = np.pi/3

    df = into(pd.DataFrame, db.Salaries)
    df = df[df["teamID"] == team]
    df = df[df["yearID"] == year]
    df = df[["playerID", "salary"]].sort('salary')
    source_team = into(ColumnDataSource, df)
    p_team = plt.figure(title="Salary of players for %s during %s" % (team, year),
                        x_range=list(df["playerID"]))  # , tools=TOOLS)
    p_team.scatter(x="playerID", y="salary", source=source_team, size=20)
    p_team.xaxis.major_label_orientation = np.pi/3

    p = plt.gridplot([[plot1, p_team]])
    return p
def test_expr_client_interactive():
    c = Client('localhost:6363')
    t = bz_data(c)

    assert compute(t.accounts.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.accounts.name,
                                 min=t.accounts.amount.min(),
                                 max=t.accounts.amount.max())))
            == set([('Alice', 100, 100), ('Bob', 200, 200)]))
def test_spark_big_by(sc):
    tbig = symbol(
        'tbig', 'var * {name: string, sex: string[1], amount: int, id: int}')

    big_exprs = [
        by(tbig[['name', 'sex']], total=tbig['amount'].sum()),
        by(tbig[['name', 'sex']], total=(tbig['id'] + tbig['amount']).sum())]

    databig = [['Alice', 'F', 100, 1],
               ['Alice', 'F', 100, 3],
               ['Drew', 'F', 100, 4],
               ['Drew', 'M', 100, 5],
               ['Drew', 'M', 200, 5]]

    rddbig = sc.parallelize(databig)

    check_exprs_against_python(big_exprs, databig, rddbig)
def test_by_with_single_row():
    ct = bcolz.ctable([[1, 1, 3, 3], [1, 2, 3, 4]], names=list('ab'))
    t = symbol('t', discover(ct))
    subset = t[t.a == 3]
    expr = by(subset.a, b_sum=subset.b.sum())
    result = compute(expr, ct)
    expected = compute(expr, ct, optimize=False)
    tm.assert_frame_equal(result, expected)
def test_expr_client_interactive():
    ec = Client('localhost:6363', 'accounts')
    t = Table(ec)

    assert compute(t.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.name,
                                 min=t.amount.min(),
                                 max=t.amount.max())))
            == set([('Alice', 100, 100), ('Bob', 200, 200)]))
def test_by_with_date(ctx, db, attr):
    # TODO: investigate CSV writing precision between pandas 0.16.0 and 0.16.1
    # TODO: see if we can use odo to convert the dshape of an existing
    # DataFrame
    expr = by(getattr(db.dates.ds, attr), mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx),
                 pd.DataFrame).sort("mean").reset_index(drop=True)
    expected = compute(expr,
                       {db: {"dates": date_df}}).sort("mean").reset_index(drop=True)
    tm.assert_frame_equal(result, expected, check_dtype=False)
def test_expr_client_interactive():
    ec = ExprClient('localhost:5000', 'accounts_df')
    t = Table(ec)

    assert compute(t.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.name,
                                 min=t.amount.min(),
                                 max=t.amount.max())))
            == set([('Alice', 100, 100), ('Bob', 200, 200)]))
def _bar_any_by_year(func, **kwargs):
    grouping = bz.by(g.ds.production_year, val=func(g.ds.price))
    df = _order_years(grouping)
    p = Bar(df, 'production_year', values='val', legend=False, **kwargs)
    set_numerical_axis(p)
    return p
def test_by_with_date(ctx, db, attr):
    # TODO: investigate CSV writing precision between pandas 0.16.0 and 0.16.1
    # TODO: see if we can use odo to convert the dshape of an existing
    # DataFrame
    expr = by(getattr(db.dates.ds, attr), mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx),
                 pd.DataFrame).sort('mean').reset_index(drop=True)
    expected = compute(expr,
                       {db: {'dates': date_df}}).sort('mean').reset_index(drop=True)
    tm.assert_frame_equal(result, expected, check_dtype=False)
def commodityTonnage_by_year(d, outdir, level=None):
    df = odo.odo(d, pd.DataFrame)
    d = bz.Data(df, d.dshape)
    if level:
        if isinstance(level, list):
            expr = [l for l in level]
        else:
            expr = level
        name = '-'.join(level)
        outpath = path.join(outdir, 'tonnage_by_{}_year.csv'.format(name))
        if replace or not path.exists(outpath):
            do = bz.by(bz.merge(d[expr], d.year, d.commodity, d.category),
                       commodityTonnage=d.commodityTonnage.sum())
            save(do, outpath, replace)
    else:
        outpath = path.join(outdir, 'tonnage_by_year.csv')
        if replace or not path.exists(outpath):
            do = bz.by(bz.merge(d.year, d.commodity, d.category),
                       commodityTonnage=d.commodityTonnage.sum())
            save(do, outpath, replace)
    return
def test_groupby(sc):
    rddidx = sc.parallelize(data_idx)
    rddarc = sc.parallelize(data_arc)

    joined = join(t_arc, t_idx, "node_id")

    t = by(joined['name'], count=joined['node_id'].count())
    a = compute(t, {t_arc: rddarc, t_idx: rddidx})
    in_degree = dict(a.collect())
    assert in_degree == {'A': 1, 'C': 2}
def ffill_query_in_range(expr,
                         lower,
                         upper,
                         odo_kwargs=None,
                         ts_field=TS_FIELD_NAME,
                         sid_field=SID_FIELD_NAME):
    """Query a blaze expression in a given time range properly forward filling
    from values that fall before the lower date.

    Parameters
    ----------
    expr : Expr
        Bound blaze expression.
    lower : datetime
        The lower date to query for.
    upper : datetime
        The upper date to query for.
    odo_kwargs : dict, optional
        The extra keyword arguments to pass to ``odo``.
    ts_field : str, optional
        The name of the timestamp field in the given blaze expression.
    sid_field : str, optional
        The name of the sid field in the given blaze expression.

    Returns
    -------
    raw : pd.DataFrame
        A strict dataframe for the data in the given date range. This may
        start before the requested start date if a value is needed to ffill.
    """
    odo_kwargs = odo_kwargs or {}
    filtered = expr[expr[ts_field] <= lower]
    computed_lower = odo(
        bz.by(
            filtered[sid_field],
            timestamp=filtered[ts_field].max(),
        ).timestamp.min(),
        pd.Timestamp,
        **odo_kwargs
    )
    if pd.isnull(computed_lower):
        # If there is no lower date, just query for data in the date
        # range. It must all be null anyways.
        computed_lower = lower

    raw = odo(
        expr[
            (expr[ts_field] >= computed_lower) &
            (expr[ts_field] <= upper)
        ],
        pd.DataFrame,
        **odo_kwargs
    )

    raw.loc[:, ts_field] = raw.loc[:, ts_field].astype('datetime64[ns]')
    return raw
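# Hedged usage sketch, not part of the original source: one way
# ffill_query_in_range might be exercised against an in-memory frame. The
# column names ('sid', 'timestamp', 'value') and the sample rows are
# illustrative assumptions; bz.data is spelled bz.Data in older Blaze releases.
def _example_ffill_query_in_range():
    import blaze as bz
    import pandas as pd

    events = pd.DataFrame({
        'sid': [1, 1, 2],
        'timestamp': pd.to_datetime(['2014-01-01', '2014-01-05', '2014-01-03']),
        'value': [1.0, 2.0, 3.0],
    })
    expr = bz.data(events)
    # The latest pre-window row per sid is retained so callers can
    # forward-fill it into the requested window.
    return ffill_query_in_range(
        expr,
        lower=pd.Timestamp('2014-01-04'),
        upper=pd.Timestamp('2014-01-10'),
        ts_field='timestamp',
        sid_field='sid',
    )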
def test_group_by_map(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t[grouper], id_count=t.size.count())
    result = compute(expr, fkey, return_type='native')
    expected = """SELECT fkey.sym_id, count(fkey.size) AS id_count
    FROM fkey
    GROUP BY fkey.sym_id
    """
    assert normalize(str(result)) == normalize(expected)
def test_group_by_map(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t[grouper], id_count=t.size.count())
    result = compute(expr, fkey)
    expected = """SELECT fkey.sym_id, count(fkey.size) AS id_count
    FROM fkey
    GROUP BY fkey.sym_id
    """
    assert normalize(str(result)) == normalize(expected)
def test_foreign_key_group_by(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t.sym_id[grouper], avg_price=t.sym_id.price.mean())
    result = compute(expr, fkey, return_type='native')
    expected = """SELECT pkey.sym, avg(pkey.price) AS avg_price
    FROM pkey, fkey
    WHERE fkey.sym_id = pkey.id
    GROUP BY pkey.sym
    """
    assert normalize(str(result)) == normalize(expected)
def _base_stats(ds):
    df = odo(bz.by(ds.district,
                   sum_price=bz.sum(ds.price),
                   sum_area=bz.sum(ds.area),
                   count=bz.count(ds.price)),
             pd.DataFrame)
    df["avg_area"] = df["sum_area"] / df["count"]
    df["avg_price"] = df["sum_price"] / df["count"]
    df["avg_price_m2"] = df["sum_price"] / df["sum_area"]
    return df
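# Hedged usage sketch, not part of the original source: _base_stats expects a
# blaze expression exposing 'district', 'price' and 'area' columns; the rows
# below are fabricated purely for illustration.
def _example_base_stats():
    import blaze as bz
    import pandas as pd

    listings = pd.DataFrame({
        'district': ['north', 'north', 'south'],
        'price': [100.0, 150.0, 90.0],
        'area': [50.0, 60.0, 45.0],
    })
    # Returns one row per district with summed and averaged price, area,
    # and price per square metre.
    return _base_stats(bz.Data(listings))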
def test_compute_by_with_summary(iris_server, iris):
    test = iris_server
    t = TableSymbol('t', iris.dshape)
    expr = by(t.species, max=t.petal_length.max(), sum=t.petal_width.sum())
    tree = to_tree(expr)
    blob = json.dumps({'expr': tree})
    resp = test.post('/compute/iris.json',
                     data=blob,
                     content_type='application/json')
    assert 'OK' in resp.status
    result = json.loads(resp.data)['data']
    expected = compute(expr, iris)
    assert result == list(map(list, expected))
def test_foreign_key_group_by(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t.sym_id[grouper], avg_price=t.sym_id.price.mean())
    result = compute(expr, fkey)
    expected = """SELECT pkey.sym, avg(pkey.price) AS avg_price
    FROM pkey, fkey
    WHERE fkey.sym_id = pkey.id
    GROUP BY pkey.sym
    """
    assert normalize(str(result)) == normalize(expected)
def test_compute_by_with_summary(iris_server):
    test = iris_server
    t = symbol('t', discover(iris))
    expr = by(t.species, max=t.petal_length.max(), sum=t.petal_width.sum())
    tree = to_tree(expr)
    blob = json.dumps({'expr': tree})
    resp = test.post('/compute.json',
                     data=blob,
                     content_type='application/json')
    assert 'OK' in resp.status
    result = json.loads(resp.data.decode('utf-8'))['data']
    expected = compute(expr, iris)
    assert result == list(map(list, into(list, expected)))
def time_by(d, grouper='passenger_count', reducer='trip_time_in_secs',
            function='sum'):
    expr = by(getattr(d, grouper), s=getattr(getattr(d, reducer), function)())
    times = []
    for core_count in cores():
        if isinstance(d, pd.DataFrame):
            with timeit('cores: %d' % core_count, times):
                getattr(d.groupby(grouper)[reducer], function)()
        else:
            p = mp.Pool(core_count)
            with timeit('cores: %d' % core_count, times):
                compute(expr, map=p.map)
            p.close()
    return np.array(times)
def test_compute_by_with_summary(iris_server, serial):
    test = iris_server
    t = symbol('t', discover(iris))
    expr = by(t.species, max=t.petal_length.max(), sum=t.petal_width.sum())
    tree = to_tree(expr)
    blob = serial.dumps({'expr': tree})
    resp = test.post('/compute', data=blob, headers=mimetype(serial))
    assert 'OK' in resp.status
    tdata = serial.loads(resp.data)
    result = DataFrame(serial.data_loads(tdata['data'])).values
    expected = compute(expr, iris).values
    np.testing.assert_array_equal(result[:, 0], expected[:, 0])
    np.testing.assert_array_almost_equal(result[:, 1:], expected[:, 1:])
    assert list(tdata['names']) == ['species', 'max', 'sum']
def load_adjusted_array(self, columns, dates, assets, mask):
    expr = self._expr
    filtered = expr[expr[TS_FIELD_NAME] <= dates[0]]
    lower = odo(
        bz.by(
            filtered[SID_FIELD_NAME],
            timestamp=filtered[TS_FIELD_NAME].max(),
        ).timestamp.min(),
        pd.Timestamp,
        **self._odo_kwargs
    )
    if pd.isnull(lower):
        # If there is no lower date, just query for data in the date
        # range. It must all be null anyways.
        lower = dates[0]

    raw = odo(
        expr[
            (expr[TS_FIELD_NAME] >= lower) &
            (expr[TS_FIELD_NAME] <= dates[-1])
        ],
        pd.DataFrame,
        **self._odo_kwargs
    )

    sids = raw.loc[:, SID_FIELD_NAME]
    raw.drop(
        sids[~(sids.isin(assets) | sids.notnull())].index,
        inplace=True
    )

    gb = raw.groupby(SID_FIELD_NAME)

    def mkseries(idx, raw_loc=raw.loc):
        vs = raw_loc[
            idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]
        ].values
        return pd.Series(
            index=pd.DatetimeIndex(vs[:, 0]),
            data=vs[:, 1],
        )

    return EarningsCalendarLoader(
        dates,
        valmap(mkseries, gb.groups),
        dataset=self._dataset,
    ).load_adjusted_array(columns, dates, assets, mask)
def where(e, column):
    """Create the query to run against the resources.

    Parameters
    ----------
    e : Expr
        The baseline or deltas expression.
    column : BoundColumn
        The column to query for.

    Returns
    -------
    q : Expr
        The query to run for the given column.
    """
    colname = column.name
    pred = e[TS_FIELD_NAME] <= lower_dt
    schema = e[colname].schema.measure
    if isinstance(schema, Option):
        pred &= e[colname].notnull()
        schema = schema.ty
    if schema in floating:
        pred &= ~e[colname].isnan()
    filtered = e[pred]
    lower = filtered.timestamp.max()
    if have_sids:
        # If we have sids, then we need to take the earliest of the
        # greatest date that has a non-null value by sid.
        lower = bz.by(
            filtered[SID_FIELD_NAME],
            timestamp=lower,
        ).timestamp.min()

    lower = odo(lower, pd.Timestamp)
    if lower is pd.NaT:
        # If there is no lower date, just query for data in the date
        # range. It must all be null anyways.
        lower = lower_dt

    return e[
        (e[TS_FIELD_NAME] >= lower) &
        (e[TS_FIELD_NAME] <= upper_dt)
    ][added_query_fields + [colname]]
def where(e, column):
    """Create the query to run against the resources.

    Parameters
    ----------
    e : Expr
        The baseline or deltas expression.
    column : BoundColumn
        The column to query for.

    Returns
    -------
    q : Expr
        The query to run for the given column.
    """
    colname = column.name
    pred = e[TS_FIELD_NAME] <= lower_dt
    schema = e[colname].schema.measure
    if isinstance(schema, Option):
        pred &= e[colname].notnull()
        schema = schema.ty
    if schema in floating:
        pred &= ~e[colname].isnan()
    filtered = e[pred]
    lower = filtered.timestamp.max()
    if have_sids:
        # If we have sids, then we need to take the earliest of the
        # greatest date that has a non-null value by sid.
        lower = bz.by(
            filtered[SID_FIELD_NAME],
            timestamp=lower,
        ).timestamp.min()

    lower = odo(lower, pd.Timestamp)
    if lower is pd.NaT:
        # If there is no lower date, just query for data in the date
        # range. It must all be null anyways.
        lower = lower_dt

    return e[(e[TS_FIELD_NAME] >= lower) &
             (e[TS_FIELD_NAME] <= upper_dt)][added_query_fields + [colname]]
def test_compute_by_with_summary(iris_server, serial):
    test = iris_server
    t = symbol('t', discover(iris))
    expr = by(
        t.species,
        max=t.petal_length.max(),
        sum=t.petal_width.sum(),
    )
    tree = to_tree(expr)
    blob = serial.dumps({'expr': tree})
    resp = test.post(
        '/compute.{name}'.format(name=serial.name),
        data=blob,
    )
    assert 'OK' in resp.status
    result = DataFrame(serial.loads(resp.data)['data']).values
    expected = compute(expr, iris).values
    np.testing.assert_array_equal(result[:, 0], expected[:, 0])
    np.testing.assert_array_almost_equal(result[:, 1:], expected[:, 1:])
def lower_for_col(column):
    pred = e[TS_FIELD_NAME] <= lower_dt
    colname = column.name
    schema = e[colname].schema.measure
    if isinstance(schema, Option):
        pred &= e[colname].notnull()
        schema = schema.ty
    if schema in floating:
        pred &= ~e[colname].isnan()
    filtered = e[pred]
    lower = filtered[TS_FIELD_NAME].max()
    if have_sids:
        # If we have sids, then we need to take the earliest of the
        # greatest date that has a non-null value by sid.
        lower = bz.by(
            filtered[SID_FIELD_NAME],
            timestamp=lower,
        ).timestamp.min()
    return lower
        if isinstance(result, float):
            assert abs(result - expected) < 0.001
        else:
            assert result == expected


exprs = [
    t['amount'],
    t['amount'] == 100,
    t['amount'].truncate(150),
    t[t['name'] == 'Alice'],
    t[t['amount'] == 0],
    t[t['amount'] > 150],
    t['amount'] + t['id'],
    t['amount'] % t['id'],
    exp(t['amount']),
    by(t['name'], total=t['amount'].sum()),
    by(t['name'], total=(t['amount'] + 1).sum()),
    (t['amount'] * 1).label('foo'),
    t.map(lambda tup: tup[1] + tup[2], 'real'),
    t.like(name='Alice'),
    t['amount'].apply(identity, 'var * real', splittable=True),
    t['amount'].map(inc, 'int')]


def test_spark_basic(rdd):
    check_exprs_against_python(exprs, data, rdd)


def check_exprs_against_python(exprs, data, rdd):
    any_bad = False
    for expr in exprs:
result = df.groupby(['f0'])['f2'].sum()
print(result)
t_pandas = t_elapsed

# -- cytoolz --
with ctime(message='cytoolz over bcolz'):
    # In Memory Split-Apply-Combine
    # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby
    r = cytoolz.groupby(lambda row: row.f0, ct)
    result = valmap(compose(sum, pluck(2)), r)
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

# -- blaze + bcolz --
blaze_data = blz.Data(ct.rootdir)
expr = blz.by(blaze_data.f0, sum_f2=blaze_data.f2.sum())
with ctime(message='blaze over bcolz'):
    result = blz.compute(expr)
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

# -- bquery --
with ctime(message='bquery over bcolz'):
    result = ct.groupby(['f0'], ['f2'])
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

ct.cache_factor(['f0'], refresh=True)
with ctime(message='bquery over bcolz (factorization cached)'):
    result = ct.groupby(['f0'], ['f2'])
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
def _blaze(self, _selects, _wheres, _groups, _aggs, _offset, _limit, _sorts,
           _count, _q):
    import blaze as bz
    import datashape
    # TODO: Not caching blaze connections
    parameters = self.params.get('parameters', {})
    bzcon = bz.Data(
        self.params['url'] +
        ('::' + self.params['table'] if self.params.get('table') else ''),
        **parameters)
    table = bz.Symbol('table', bzcon.dshape)
    columns = table.fields
    query = table

    if _wheres:
        wh_re = re.compile(r'([^=><~!]+)([=><~!]{1,2})([\s\S]+)')
        wheres = None
        for where in _wheres:
            match = wh_re.search(where)
            if match is None:
                continue
            col, oper, val = match.groups()
            col = table[col]
            if oper in ['==', '=']:
                whr = (col == val)
            elif oper == '>=':
                whr = (col >= val)
            elif oper == '<=':
                whr = (col <= val)
            elif oper == '>':
                whr = (col > val)
            elif oper == '<':
                whr = (col < val)
            elif oper == '!=':
                whr = (col != val)
            elif oper == '~':
                whr = (col.like('*' + val + '*'))
            elif oper == '!~':
                whr = (~col.like('*' + val + '*'))
            wheres = whr if wheres is None else wheres & whr
        query = query if wheres is None else query[wheres]

    alias_cols = []
    if _groups and _aggs:
        byaggs = {'min': bz.min, 'max': bz.max, 'sum': bz.sum,
                  'count': bz.count, 'mean': bz.mean, 'nunique': bz.nunique}
        agg_re = re.compile(r'([^:]+):([aA-zZ]+)\(([^:]+)\)')
        grps = bz.merge(*[query[group] for group in _groups])
        aggs = {}
        for agg in _aggs:
            match = agg_re.search(agg)
            if match is None:
                continue
            name, oper, col = match.groups()
            alias_cols.append(name)
            aggs[name] = byaggs[oper](query[col])
        query = bz.by(grps, **aggs)

    if _q:
        wheres = None
        for col in columns:
            if isinstance(table[col].dshape.measure.ty,
                          datashape.coretypes.String):
                whr = table[col].like('*' + _q + '*')
                wheres = whr if wheres is None else wheres | whr
        if wheres is not None:
            query = query[wheres]

    count_query = query.count()

    if _sorts:
        order = {'asc': True, 'desc': False}
        sorts = []
        for sort in _sorts:
            col, odr = sort.partition(':')[::2]
            if col not in columns + alias_cols:
                continue
            sorts.append(col)
        if sorts:
            query = query.sort(sorts, ascending=order.get(odr, True))

    if _offset:
        _offset = int(_offset)
    if _limit:
        _limit = int(_limit)
    if _offset and _limit:
        _limit += _offset
    if _offset or _limit:
        query = query[_offset:_limit]

    if _selects:
        query = query[_selects]

    # TODO: Improve json, csv, html outputs using native odo
    result = {
        'query': query,
        'data': bz.odo(bz.compute(query, bzcon.data), pd.DataFrame),
    }
    if _count:
        count = bz.odo(bz.compute(count_query, bzcon.data), pd.DataFrame)
        result['count'] = count.iloc[0, 0]
    return result
def test_multikey_by(ctx, db, reducer, reduction):
    t = db.t
    expr = by(t[["id", "amount"]], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {"t": df}})
    assert set(map(frozenset, into(list, result))) == set(
        map(frozenset, into(list, expected)))
def test_grouper_with_arith(ctx, db):
    expr = by(db.t[['id', 'amount']], total=(db.t.amount + 1).sum())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert (list(map(set, into(list, result))) ==
            list(map(set, into(list, expected))))
def test_by_non_native_ops(ctx, db):
    expr = by(db.t.id, total=db.t.id.nunique())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert (list(map(set, into(list, result))) ==
            list(map(set, into(list, expected))))
def test_recursive_rowfunc_is_used(rdd):
    expr = by(t['name'], total=(2 * (t['amount'] + t['id'])).sum())
    expected = [('Alice', 2 * (101 + 53)),
                ('Bob', 2 * (202))]
    assert set(compute(expr, rdd).collect()) == set(expected)
def groupmeans(data, groups, numbers, cutoff=0.01, quantile=0.95, minsize=None):
    """
    Yields the significant differences in average between every pair of
    groups and numbers.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : ignore anything with prob > cutoff.
        cutoff=None ignores significance checks, speeding it up a LOT.
    quantile : number that represents target improvement. Defaults to .95.
        The ``diff`` returned is the % impact of everyone moving to the
        95th percentile
    minsize : each group should contain at least minsize values.
        If minsize=None, automatically set the minimum size to 1% of the
        dataset, or 10, whichever is larger.
    """
    if minsize is None:
        minsize = max(data.nrows / 100, 10)

    means = {col: data[col].mean() for col in numbers}
    results = []
    for group in groups:
        agg = {number: bz.mean(data[number]) for number in numbers}
        agg["#"] = bz.count(data)
        ave = bz.by(data[group], **agg).sort("#", ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave["#"]
        # Each group should contain at least minsize values
        biggies = sizes[sizes >= minsize].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue
            lo = bz.into(list, data[number][data[group] == sorted_cats.index[0]])
            hi = bz.into(list, data[number][data[group] == sorted_cats.index[-1]])
            _, prob = ttest_ind(
                np.ma.masked_array(lo, np.isnan(lo)),
                np.ma.masked_array(hi, np.isnan(hi)))
            if prob > cutoff:
                continue
            results.append({
                "group": group,
                "number": number,
                "prob": prob,
                "gain": (sorted_cats.iloc[-1] / means[number] - 1)[0],
                "biggies": ave.ix[biggies][number],
                "means": ave[[number, "#"]].sort_values(by=number),
            })

    results = pd.DataFrame(results)
    if len(results) > 0:
        results = results.set_index(["group", "number"])
    return results
from odo import odo

f = 300

# Load sqlite here
from sqlalchemy import create_engine
from sqlalchemy.engine import reflection
from sqlalchemy.orm import sessionmaker
from sqlalchemy import *

app = Sanic()

tables = data('sqlite:///../../db.db')
coin_table = tables['coins']

# Compress the information
coin_tables_comp = by(coin_table.name, sentences=coin_table.sentences)

# Prepare to use annoy
ctl = len(coin_tables_comp)
item_arr_set = []
item_names = [i[0] for i in coin_tables_comp]

# Initialize and fill annoy
t = AnnoyIndex(f)  # Length of item vector that will be indexed
for i in range(ctl):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_item(i, v)
    item_arr_set.append((item_names[i], i))

t.build(10)  # 10 trees
t.save('../../test.ann')
        expected = compute(expr, data)
        if not result == expected:
            print(result)
            print(expected)
        if isinstance(result, float):
            assert abs(result - expected) < 0.001
        else:
            assert result == expected


exprs = [
    t['amount'],
    t['amount'] == 100,
    t['amount'].truncate(150),
    t[t['name'] == 'Alice'],
    t[t['amount'] == 0],
    t[t['amount'] > 150],
    t['amount'] + t['id'],
    t['amount'] % t['id'],
    exp(t['amount']),
    by(t['name'], total=t['amount'].sum()),
    by(t['name'], total=(t['amount'] + 1).sum()),
    (t['amount'] * 1).label('foo'),
    t.map(lambda tup: tup[1] + tup[2], 'real'),
    t[t.name.like('Alice')],
    t['amount'].apply(identity, 'var * real', splittable=True),
    t['amount'].map(lambda x: x + 1, 'int'),
]

exprs = list(zip(map(str, exprs), exprs))


def tuplify(x):
    return tuple(x) if isinstance(x, list) else x
def test_by_with_date(ctx, db, attr):
    expr = by(getattr(db.dates.ds, attr), mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx), set)
    expected = odo(compute(expr, {db: {'dates': date_df}}), set)
    assert result == expected
def test_by_summary(db, ctx):
    t = db.t
    expr = by(t.name, mymin=t.amount.min(), mymax=t.amount.max())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert into(set, result) == into(set, expected)