Example #1
def commodityTonnage_by_level(d, outdir, level):
    name = '-'.join(level)
    outpath = path.join(outdir, 'tonnage_by_{}.csv'.format(name))
    if replace or not path.exists(outpath):
        dr = bz.by(bz.merge(d[level], d.commodity, d.year, d.month, d.category), commodityTonnage=d.commodityTonnage.sum())
        do = bz.by(bz.merge(d[level], d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum())
        #d[[level, 'commodity']]
        save(do, outpath, replace)
        content, by_year_month = commodityTonnage_over_time(dr, outdir, level)
        commodityTonnage_by_year(dr, outdir, level)
        if content:
            commodityTonnage_by_month(by_year_month, outdir, level)
    return
Example #2
    def _odo_object(self):
        from blaze import by, merge, head

        table = self.binded_table
        if self.whereclauses:
            wheres = bind_list(self, self.whereclauses)
            # Combine the where clauses with `&`; `and` would keep only the last clause.
            table = table[reduce(lambda x, y: x & y, wheres)]
        tb = self.tables[self.table.name]
        self.tables[self.table.name] = table
        columns = bind_list(self,
                            self.columns) or [table[_] for _ in table.fields]
        self.tables[self.table.name] = tb
        if self.groupclauses:
            groups = bind_list(self, self.groupclauses)
            groups = [table[_.fields[0]] for _ in groups]
            names = [_.fields[0] for _ in groups]
            groups = merge(*groups) if len(groups) > 1 else groups[0]
            table = by(
                groups, **{
                    c.fields[0]: c
                    for c in columns if c.fields[0] not in names
                })
        if self.orderclauses:
            orders = bind_list(self, self.orderclauses)
            # list.reverse() reverses in place and returns None; iterate a reversed view.
            for order in reversed(orders):
                table = table.sort(*order)
        if self.limit:
            table = head(table, self.limit)
        return table[[_.fields[0] for _ in columns]]
Example #3
    def load_adjusted_array(self, columns, dates, assets, mask):
        expr = self._expr
        filtered = expr[expr[TS_FIELD_NAME] <= dates[0]]
        lower = odo(
            bz.by(
                filtered[SID_FIELD_NAME],
                timestamp=filtered[TS_FIELD_NAME].max(),
            ).timestamp.min(), pd.Timestamp, **self._odo_kwargs)
        if pd.isnull(lower):
            # If there is no lower date, just query for data in the date
            # range. It must all be null anyways.
            lower = dates[0]

        raw = odo(
            expr[(expr[TS_FIELD_NAME] >= lower)
                 & (expr[TS_FIELD_NAME] <= dates[-1])], pd.DataFrame,
            **self._odo_kwargs)

        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(sids[~(sids.isin(assets) | sids.notnull())].index,
                 inplace=True)

        gb = raw.groupby(SID_FIELD_NAME)

        def mkseries(idx, raw_loc=raw.loc):
            vs = raw_loc[idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]].values
            return pd.Series(
                index=pd.DatetimeIndex(vs[:, 0]),
                data=vs[:, 1],
            )

        return EarningsCalendarLoader(
            dates,
            valmap(mkseries, gb.groups),
        ).load_adjusted_array(columns, dates, assets, mask)
Example #4
def test_by(ctx, db, grouper, reducer, reduction):
    t = db.t
    expr = by(t[grouper], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
Example #5
def build_training_set(
        rundb: str,
        topicsfile: str,
        index_path: str,
        axioms: typing.Sequence[PairwiseAxiom],
        max_rank=100):

    queries_by_id, rd, ctx = initialize(rundb, topicsfile, index_path, max_rank)

    system_query = rd[['system', 'query']].distinct().sort('query')

    from blaze import by, merge
    ranking_lengths = by(merge(rd.system, rd.query), n=rd.rank.count()).n
    cpair_count = int((ranking_lengths * ranking_lengths - ranking_lengths).sum())
    iter_count = cpair_count * len(axioms) * 2

    pbar = tqdm.tqdm(total=iter_count)

    def loop(i):
        for item in i:
            pbar.update()
            yield item


    for sys, qid in system_query:
        sqrun = rd[(rd['system'] == sys) & (rd['query'] == qid)]
        ranking = [ctx.c.get_document(did[0]) for did in sqrun.docid]
        query = queries_by_id[qid]

        part = build_training_set_for_one_ranking(ctx, axioms, query, sys, ranking, loop)

        yield part
Example #6
def test_multikey_by(ctx, db, reducer, reduction):
    t = db.t
    expr = by(t[['id', 'amount']], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
Example #7
def create_plot(team="LAA", year=2012):
    expr = bz.by(db.Salaries.teamID,
                 avg=db.Salaries.salary.mean(),
                 max=db.Salaries.salary.max(),
                 ratio=db.Salaries.salary.max() / db.Salaries.salary.min())
    expr = expr.sort('ratio', ascending=False)

    df_salary_gb = into(pd.DataFrame, expr)
    source1 = into(ColumnDataSource, df_salary_gb[["teamID", "avg"]])

    plot1 = plt.figure(title="Salary ratio by team", x_range=list(df_salary_gb["teamID"]))
    plot1.scatter(x="teamID", y="avg", source=source1, size=20)
    plot1.xaxis.major_label_orientation = np.pi/3

    df = into(pd.DataFrame, db.Salaries)
    df = df[df["teamID"] == team]
    df = df[df["yearID"] == year]

    df = df[["playerID","salary"]].sort('salary')
    source_team = into(ColumnDataSource, df)
    p_team = plt.figure(title="Salary of players for %s during %s" % (team, year),
                        x_range=list(df["playerID"]))#, tools=TOOLS)
    p_team.scatter(x="playerID", y="salary", source=source_team, size=20)
    p_team.xaxis.major_label_orientation = np.pi/3

    p = plt.gridplot([[plot1, p_team]])
    return p
Example #8
def test_multikey_by(ctx, db, reducer, reduction):
    t = db.t
    expr = by(t[['id', 'amount']], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
Example #9
def test_expr_client_interactive():
    c = Client('localhost:6363')
    t = bz_data(c)

    assert compute(t.accounts.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.accounts.name, min=t.accounts.amount.min(),
                                                  max=t.accounts.amount.max())))
            == set([('Alice', 100, 100), ('Bob', 200, 200)]))
Example #10
def test_spark_big_by(sc):
    tbig = symbol(
        'tbig', 'var * {name: string, sex: string[1], amount: int, id: int}')

    big_exprs = [
        by(tbig[['name', 'sex']], total=tbig['amount'].sum()),
        by(tbig[['name', 'sex']], total=(tbig['id'] + tbig['amount']).sum())]

    databig = [['Alice', 'F', 100, 1],
               ['Alice', 'F', 100, 3],
               ['Drew', 'F', 100, 4],
               ['Drew', 'M', 100, 5],
               ['Drew', 'M', 200, 5]]

    rddbig = sc.parallelize(databig)

    check_exprs_against_python(big_exprs, databig, rddbig)
Example #11
def test_by_with_single_row():
    ct = bcolz.ctable([[1, 1, 3, 3], [1, 2, 3, 4]], names=list('ab'))
    t = symbol('t', discover(ct))
    subset = t[t.a == 3]
    expr = by(subset.a, b_sum=subset.b.sum())
    result = compute(expr, ct)
    expected = compute(expr, ct, optimize=False)
    tm.assert_frame_equal(result, expected)
Example #12
def test_expr_client_interactive():
    ec = Client('localhost:6363', 'accounts')
    t = Table(ec)

    assert compute(t.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.name, min=t.amount.min(),
                                         max=t.amount.max()))) ==
            set([('Alice', 100, 100), ('Bob', 200, 200)]))
Example #13
def test_by_with_single_row():
    ct = bcolz.ctable([[1, 1, 3, 3], [1, 2, 3, 4]], names=list('ab'))
    t = symbol('t', discover(ct))
    subset = t[t.a == 3]
    expr = by(subset.a, b_sum=subset.b.sum())
    result = compute(expr, ct)
    expected = compute(expr, ct, optimize=False)
    tm.assert_frame_equal(result, expected)
Example #14
def test_by_with_date(ctx, db, attr):
    # TODO: investigate CSV writing precision between pandas 0.16.0 and 0.16.1
    # TODO: see if we can use odo to convert the dshape of an existing
    #       DataFrame
    expr = by(getattr(db.dates.ds, attr), mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx), pd.DataFrame).sort("mean").reset_index(drop=True)
    expected = compute(expr, {db: {"dates": date_df}}).sort("mean").reset_index(drop=True)
    tm.assert_frame_equal(result, expected, check_dtype=False)
Example #15
def test_expr_client_interactive():
    ec = ExprClient('localhost:5000', 'accounts_df')
    t = Table(ec)

    assert compute(t.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.name, min=t.amount.min(),
                                 max=t.amount.max()))) ==
            set([('Alice', 100, 100), ('Bob', 200, 200)]))
Example #16
def _bar_any_by_year(func, **kwargs):
    grouping = bz.by(g.ds.production_year, val=func(g.ds.price))

    df = _order_years(grouping)

    p = Bar(df, 'production_year', values='val', legend=False, **kwargs)
    set_numerical_axis(p)

    return p
Example #17
def test_by_with_date(ctx, db, attr):
    # TODO: investigate CSV writing precision between pandas 0.16.0 and 0.16.1
    # TODO: see if we can use odo to convert the dshape of an existing
    #       DataFrame
    expr = by(getattr(db.dates.ds, attr),
              mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx), pd.DataFrame).sort('mean').reset_index(drop=True)
    expected = compute(expr, {db: {'dates': date_df}}).sort('mean').reset_index(drop=True)
    tm.assert_frame_equal(result, expected, check_dtype=False)
Example #18
def commodityTonnage_by_year(d, outdir, level=None):
    df = odo.odo(d, pd.DataFrame)
    d = bz.Data(df, d.dshape)
    if level:
        if isinstance(level, list):
            expr = [l for l in level]
        else:
            expr = level
        name = '-'.join(level)
        outpath = path.join(outdir, 'tonnage_by_{}_year.csv'.format(name))
        if replace or not path.exists(outpath):
            do = bz.by(bz.merge(d[expr], d.year, d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum())
            save(do, outpath, replace)
    else:
        outpath = path.join(outdir, 'tonnage_by_year.csv')
        if replace or not path.exists(outpath):
            do = bz.by(bz.merge(d.year, d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum())
            save(do, outpath, replace)
    return
Example #19
def test_groupby(sc):
    rddidx = sc.parallelize(data_idx)
    rddarc = sc.parallelize(data_arc)

    joined = join(t_arc, t_idx, "node_id")

    t = by(joined['name'], count=joined['node_id'].count())
    a = compute(t, {t_arc: rddarc, t_idx: rddidx})
    in_degree = dict(a.collect())
    assert in_degree == {'A': 1, 'C': 2}
Example #20
def ffill_query_in_range(expr,
                         lower,
                         upper,
                         odo_kwargs=None,
                         ts_field=TS_FIELD_NAME,
                         sid_field=SID_FIELD_NAME):
    """Query a blaze expression in a given time range properly forward filling
    from values that fall before the lower date.

    Parameters
    ----------
    expr : Expr
        Bound blaze expression.
    lower : datetime
        The lower date to query for.
    upper : datetime
        The upper date to query for.
    odo_kwargs : dict, optional
        The extra keyword arguments to pass to ``odo``.
    ts_field : str, optional
        The name of the timestamp field in the given blaze expression.
    sid_field : str, optional
        The name of the sid field in the given blaze expression.

    Returns
    -------
    raw : pd.DataFrame
        A strict dataframe for the data in the given date range. This may
        start before the requested start date if a value is needed to ffill.
    """
    odo_kwargs = odo_kwargs or {}
    filtered = expr[expr[ts_field] <= lower]
    computed_lower = odo(
        bz.by(
            filtered[sid_field],
            timestamp=filtered[ts_field].max(),
        ).timestamp.min(),
        pd.Timestamp,
        **odo_kwargs
    )
    if pd.isnull(computed_lower):
        # If there is no lower date, just query for data in the date
        # range. It must all be null anyways.
        computed_lower = lower

    raw = odo(
        expr[
            (expr[ts_field] >= computed_lower) &
            (expr[ts_field] <= upper)
        ],
        pd.DataFrame,
        **odo_kwargs
    )
    raw.loc[:, ts_field] = raw.loc[:, ts_field].astype('datetime64[ns]')
    return raw
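
A minimal usage sketch for the helper above, assuming a blaze-backed events table whose timestamp and sid columns are named 'timestamp' and 'sid' (the resource URI and column names here are hypothetical):

import blaze as bz
import pandas as pd

# Hypothetical events resource; any odo/blaze-compatible URI would work.
events = bz.data('sqlite:///events.db::events')
raw = ffill_query_in_range(
    events,
    lower=pd.Timestamp('2014-01-02'),
    upper=pd.Timestamp('2014-01-31'),
    ts_field='timestamp',
    sid_field='sid',
)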
Example #21
def ffill_query_in_range(expr,
                         lower,
                         upper,
                         odo_kwargs=None,
                         ts_field=TS_FIELD_NAME,
                         sid_field=SID_FIELD_NAME):
    """Query a blaze expression in a given time range properly forward filling
    from values that fall before the lower date.

    Parameters
    ----------
    expr : Expr
        Bound blaze expression.
    lower : datetime
        The lower date to query for.
    upper : datetime
        The upper date to query for.
    odo_kwargs : dict, optional
        The extra keyword arguments to pass to ``odo``.
    ts_field : str, optional
        The name of the timestamp field in the given blaze expression.
    sid_field : str, optional
        The name of the sid field in the given blaze expression.

    Returns
    -------
    raw : pd.DataFrame
        A strict dataframe for the data in the given date range. This may
        start before the requested start date if a value is needed to ffill.
    """
    odo_kwargs = odo_kwargs or {}
    filtered = expr[expr[ts_field] <= lower]
    computed_lower = odo(
        bz.by(
            filtered[sid_field],
            timestamp=filtered[ts_field].max(),
        ).timestamp.min(),
        pd.Timestamp,
        **odo_kwargs
    )
    if pd.isnull(computed_lower):
        # If there is no lower date, just query for data in the date
        # range. It must all be null anyways.
        computed_lower = lower

    raw = odo(
        expr[
            (expr[ts_field] >= computed_lower) &
            (expr[ts_field] <= upper)
        ],
        pd.DataFrame,
        **odo_kwargs
    )
    raw.loc[:, ts_field] = raw.loc[:, ts_field].astype('datetime64[ns]')
    return raw
Example #22
def test_group_by_map(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t[grouper], id_count=t.size.count())
    result = compute(expr, fkey, return_type='native')
    expected = """SELECT
        fkey.sym_id,
        count(fkey.size) AS id_count
    FROM fkey
    GROUP BY fkey.sym_id
    """
    assert normalize(str(result)) == normalize(expected)
Example #23
def test_group_by_map(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t[grouper], id_count=t.size.count())
    result = compute(expr, fkey)
    expected = """SELECT
        fkey.sym_id,
        count(fkey.size) AS id_count
    FROM fkey
    GROUP BY fkey.sym_id
    """
    assert normalize(str(result)) == normalize(expected)
Example #24
def test_foreign_key_group_by(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t.sym_id[grouper], avg_price=t.sym_id.price.mean())
    result = compute(expr, fkey, return_type='native')
    expected = """SELECT
        pkey.sym,
        avg(pkey.price) AS avg_price
    FROM pkey, fkey
    WHERE fkey.sym_id = pkey.id
    GROUP BY pkey.sym
    """
    assert normalize(str(result)) == normalize(expected)
Example #25
def _base_stats(ds):
    df = odo(bz.by(ds.district,
            sum_price=bz.sum(ds.price),
            sum_area=bz.sum(ds.area),
            count=bz.count(ds.price)),
        pd.DataFrame)

    df["avg_area"] = df["sum_area"] / df["count"]
    df["avg_price"] = df["sum_price"] / df["count"]
    df["avg_price_m2"] = df["sum_price"] / df["sum_area"]

    return df
Example #26
def test_compute_by_with_summary(iris_server, iris):
    test = iris_server
    t = TableSymbol('t', iris.dshape)
    expr = by(t.species, max=t.petal_length.max(), sum=t.petal_width.sum())
    tree = to_tree(expr)
    blob = json.dumps({'expr': tree})
    resp = test.post('/compute/iris.json', data=blob,
                     content_type='application/json')
    assert 'OK' in resp.status
    result = json.loads(resp.data)['data']
    expected = compute(expr, iris)
    assert result == list(map(list, expected))
Example #27
def test_foreign_key_group_by(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t.sym_id[grouper], avg_price=t.sym_id.price.mean())
    result = compute(expr, fkey)
    expected = """SELECT
        pkey.sym,
        avg(pkey.price) AS avg_price
    FROM pkey, fkey
    WHERE fkey.sym_id = pkey.id
    GROUP BY pkey.sym
    """
    assert normalize(str(result)) == normalize(expected)
Example #28
def test_compute_by_with_summary(iris_server):
    test = iris_server
    t = symbol('t', discover(iris))
    expr = by(t.species, max=t.petal_length.max(), sum=t.petal_width.sum())
    tree = to_tree(expr)
    blob = json.dumps({'expr': tree})
    resp = test.post('/compute.json',
                     data=blob,
                     content_type='application/json')
    assert 'OK' in resp.status
    result = json.loads(resp.data.decode('utf-8'))['data']
    expected = compute(expr, iris)
    assert result == list(map(list, into(list, expected)))
Example #29
def time_by(d, grouper='passenger_count', reducer='trip_time_in_secs',
            function='sum'):
    expr = by(getattr(d, grouper), s=getattr(getattr(d, reducer), function)())
    times = []
    for core_count in cores():
        if isinstance(d, pd.DataFrame):
            with timeit('cores: %d' % core_count, times):
                getattr(d.groupby(grouper)[reducer], function)()
        else:
            p = mp.Pool(core_count)
            with timeit('cores: %d' % core_count, times):
                compute(expr, map=p.map)
            p.close()
    return np.array(times)
Example #30
def test_compute_by_with_summary(iris_server, serial):
    test = iris_server
    t = symbol('t', discover(iris))
    expr = by(t.species, max=t.petal_length.max(), sum=t.petal_width.sum())
    tree = to_tree(expr)
    blob = serial.dumps({'expr': tree})
    resp = test.post('/compute', data=blob, headers=mimetype(serial))
    assert 'OK' in resp.status
    tdata = serial.loads(resp.data)
    result = DataFrame(serial.data_loads(tdata['data'])).values
    expected = compute(expr, iris).values
    np.testing.assert_array_equal(result[:, 0], expected[:, 0])
    np.testing.assert_array_almost_equal(result[:, 1:], expected[:, 1:])
    assert list(tdata['names']) == ['species', 'max', 'sum']
Example #31
    def load_adjusted_array(self, columns, dates, assets, mask):
        expr = self._expr
        filtered = expr[expr[TS_FIELD_NAME] <= dates[0]]
        lower = odo(
            bz.by(
                filtered[SID_FIELD_NAME],
                timestamp=filtered[TS_FIELD_NAME].max(),
            ).timestamp.min(),
            pd.Timestamp,
            **self._odo_kwargs
        )
        if pd.isnull(lower):
            # If there is no lower date, just query for data in the date
            # range. It must all be null anyways.
            lower = dates[0]

        raw = odo(
            expr[
                (expr[TS_FIELD_NAME] >= lower) &
                (expr[TS_FIELD_NAME] <= dates[-1])
            ],
            pd.DataFrame,
            **self._odo_kwargs
        )

        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(
            sids[~(sids.isin(assets) | sids.notnull())].index,
            inplace=True
        )

        gb = raw.groupby(SID_FIELD_NAME)

        def mkseries(idx, raw_loc=raw.loc):
            vs = raw_loc[
                idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]
            ].values
            return pd.Series(
                index=pd.DatetimeIndex(vs[:, 0]),
                data=vs[:, 1],
            )

        return EarningsCalendarLoader(
            dates,
            valmap(mkseries, gb.groups),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)
Example #32
        def where(e, column):
            """Create the query to run against the resources.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            column : BoundColumn
                The column to query for.

            Returns
            -------
            q : Expr
                The query to run for the given column.
            """
            colname = column.name
            pred = e[TS_FIELD_NAME] <= lower_dt
            schema = e[colname].schema.measure
            if isinstance(schema, Option):
                pred &= e[colname].notnull()
                schema = schema.ty
            if schema in floating:
                pred &= ~e[colname].isnan()
            filtered = e[pred]
            lower = filtered.timestamp.max()

            if have_sids:
                # If we have sids, then we need to take the earliest of the
                # greatest date that has a non-null value by sid.
                lower = bz.by(
                    filtered[SID_FIELD_NAME],
                    timestamp=lower,
                ).timestamp.min()

            lower = odo(lower, pd.Timestamp)
            if lower is pd.NaT:
                # If there is no lower date, just query for data in the date
                # range. It must all be null anyways.
                lower = lower_dt

            return e[
                (e[TS_FIELD_NAME] >= lower) &
                (e[TS_FIELD_NAME] <= upper_dt)
            ][added_query_fields + [colname]]
Example #33
        def where(e, column):
            """Create the query to run against the resources.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            column : BoundColumn
                The column to query for.

            Returns
            -------
            q : Expr
                The query to run for the given column.
            """
            colname = column.name
            pred = e[TS_FIELD_NAME] <= lower_dt
            schema = e[colname].schema.measure
            if isinstance(schema, Option):
                pred &= e[colname].notnull()
                schema = schema.ty
            if schema in floating:
                pred &= ~e[colname].isnan()
            filtered = e[pred]
            lower = filtered.timestamp.max()

            if have_sids:
                # If we have sids, then we need to take the earliest of the
                # greatest date that has a non-null value by sid.
                lower = bz.by(
                    filtered[SID_FIELD_NAME],
                    timestamp=lower,
                ).timestamp.min()

            lower = odo(lower, pd.Timestamp)
            if lower is pd.NaT:
                # If there is no lower date, just query for data in the date
                # range. It must all be null anyways.
                lower = lower_dt

            return e[(e[TS_FIELD_NAME] >= lower)
                     & (e[TS_FIELD_NAME] <= upper_dt)][added_query_fields +
                                                       [colname]]
Example #34
def test_compute_by_with_summary(iris_server, serial):
    test = iris_server
    t = symbol('t', discover(iris))
    expr = by(
        t.species,
        max=t.petal_length.max(),
        sum=t.petal_width.sum(),
    )
    tree = to_tree(expr)
    blob = serial.dumps({'expr': tree})
    resp = test.post(
        '/compute.{name}'.format(name=serial.name),
        data=blob,
    )
    assert 'OK' in resp.status
    result = DataFrame(serial.loads(resp.data)['data']).values
    expected = compute(expr, iris).values
    np.testing.assert_array_equal(result[:, 0], expected[:, 0])
    np.testing.assert_array_almost_equal(result[:, 1:], expected[:, 1:])
Example #35
            def lower_for_col(column):
                pred = e[TS_FIELD_NAME] <= lower_dt
                colname = column.name
                schema = e[colname].schema.measure
                if isinstance(schema, Option):
                    pred &= e[colname].notnull()
                    schema = schema.ty
                if schema in floating:
                    pred &= ~e[colname].isnan()

                filtered = e[pred]
                lower = filtered[TS_FIELD_NAME].max()
                if have_sids:
                    # If we have sids, then we need to take the earliest of the
                    # greatest date that has a non-null value by sid.
                    lower = bz.by(
                        filtered[SID_FIELD_NAME],
                        timestamp=lower,
                    ).timestamp.min()
                return lower
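
To make the "earliest of the greatest timestamp per sid" step concrete, here is a small standalone sketch on in-memory data (the column names 'sid' and 'timestamp' are assumed stand-ins for SID_FIELD_NAME and TS_FIELD_NAME):

import blaze as bz
import pandas as pd
from odo import odo

events = pd.DataFrame({
    'sid': [1, 1, 2, 2],
    'timestamp': pd.to_datetime(
        ['2014-01-01', '2014-01-03', '2014-01-02', '2014-01-04']),
})
d = bz.data(events)
# Greatest timestamp per sid: sid 1 -> 2014-01-03, sid 2 -> 2014-01-04.
per_sid_max = bz.by(d.sid, timestamp=d.timestamp.max())
# Earliest of those maxima, i.e. the latest date covered by every sid.
lower = odo(per_sid_max.timestamp.min(), pd.Timestamp)
print(lower)  # 2014-01-03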
Example #36
def test_compute_by_with_summary(iris_server, serial):
    test = iris_server
    t = symbol('t', discover(iris))
    expr = by(t.species,
              max=t.petal_length.max(),
              sum=t.petal_width.sum())
    tree = to_tree(expr)
    blob = serial.dumps({'expr': tree})
    resp = test.post('/compute',
                     data=blob,
                     headers=mimetype(serial))
    assert 'OK' in resp.status
    tdata = serial.loads(resp.data)
    result = DataFrame(serial.data_loads(tdata['data'])).values
    expected = compute(expr, iris).values
    np.testing.assert_array_equal(result[:, 0],
                                  expected[:, 0])
    np.testing.assert_array_almost_equal(result[:, 1:],
                                         expected[:, 1:])
    assert list(tdata['names']) == ['species', 'max', 'sum']
Example #37
            if isinstance(result, float):
                assert abs(result - expected) < 0.001
            else:
                assert result == expected

exprs = [
    t['amount'],
    t['amount'] == 100,
    t['amount'].truncate(150),
    t[t['name'] == 'Alice'],
    t[t['amount'] == 0],
    t[t['amount'] > 150],
    t['amount'] + t['id'],
    t['amount'] % t['id'],
    exp(t['amount']),
    by(t['name'], total=t['amount'].sum()),
    by(t['name'], total=(t['amount'] + 1).sum()),
    (t['amount'] * 1).label('foo'),
    t.map(lambda tup: tup[1] + tup[2], 'real'),
    t.like(name='Alice'),
    t['amount'].apply(identity, 'var * real', splittable=True),
    t['amount'].map(inc, 'int')]


def test_spark_basic(rdd):
    check_exprs_against_python(exprs, data, rdd)


def check_exprs_against_python(exprs, data, rdd):
    any_bad = False
    for expr in exprs:
Example #38
    result = df.groupby(['f0'])['f2'].sum()
print(result)
t_pandas = t_elapsed

# -- cytoolz --
with ctime(message='cytoolz over bcolz'):
    # In Memory Split-Apply-Combine
    # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby
    r = cytoolz.groupby(lambda row: row.f0, ct)
    result = valmap(compose(sum, pluck(2)), r)
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

# -- blaze + bcolz --
blaze_data = blz.Data(ct.rootdir)
expr = blz.by(blaze_data.f0, sum_f2=blaze_data.f2.sum())
with ctime(message='blaze over bcolz'):
    result = blz.compute(expr)
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

# -- bquery --
with ctime(message='bquery over bcolz'):
    result = ct.groupby(['f0'], ['f2'])
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

ct.cache_factor(['f0'], refresh=True)
with ctime(message='bquery over bcolz (factorization cached)'):
    result = ct.groupby(['f0'], ['f2'])
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
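
As a tiny illustration of the split-apply-combine pattern used in the cytoolz block above, here is the same groupby/valmap/compose/pluck pipeline on plain tuples (the rows below are made up):

from cytoolz import compose, groupby, valmap
from cytoolz.curried import pluck

rows = [('a', 1, 10), ('a', 2, 20), ('b', 3, 30)]
grouped = groupby(lambda row: row[0], rows)      # split on the first field
sums = valmap(compose(sum, pluck(2)), grouped)   # sum the third field per group
print(sums)  # {'a': 30, 'b': 30}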
Example #39
    def _blaze(self, _selects, _wheres, _groups, _aggs, _offset, _limit,
               _sorts, _count, _q):
        import blaze as bz
        import datashape
        # TODO: Not caching blaze connections
        parameters = self.params.get('parameters', {})
        bzcon = bz.Data(
            self.params['url'] +
            ('::' + self.params['table'] if self.params.get('table') else ''),
            **parameters)
        table = bz.Symbol('table', bzcon.dshape)
        columns = table.fields
        query = table

        if _wheres:
            wh_re = re.compile(r'([^=><~!]+)([=><~!]{1,2})([\s\S]+)')
            wheres = None
            for where in _wheres:
                match = wh_re.search(where)
                if match is None:
                    continue
                col, oper, val = match.groups()
                col = table[col]
                if oper in ['==', '=']:
                    whr = (col == val)
                elif oper == '>=':
                    whr = (col >= val)
                elif oper == '<=':
                    whr = (col <= val)
                elif oper == '>':
                    whr = (col > val)
                elif oper == '<':
                    whr = (col < val)
                elif oper == '!=':
                    whr = (col != val)
                elif oper == '~':
                    whr = (col.like('*' + val + '*'))
                elif oper == '!~':
                    whr = (~col.like('*' + val + '*'))
                wheres = whr if wheres is None else wheres & whr
            query = query if wheres is None else query[wheres]

        alias_cols = []
        if _groups and _aggs:
            byaggs = {
                'min': bz.min,
                'max': bz.max,
                'sum': bz.sum,
                'count': bz.count,
                'mean': bz.mean,
                'nunique': bz.nunique
            }
            agg_re = re.compile(r'([^:]+):([aA-zZ]+)\(([^:]+)\)')
            grps = bz.merge(*[query[group] for group in _groups])
            aggs = {}
            for agg in _aggs:
                match = agg_re.search(agg)
                if match is None:
                    continue
                name, oper, col = match.groups()
                alias_cols.append(name)
                aggs[name] = byaggs[oper](query[col])
            query = bz.by(grps, **aggs)

        if _q:
            wheres = None
            for col in columns:
                if isinstance(table[col].dshape.measure.ty,
                              datashape.coretypes.String):
                    whr = table[col].like('*' + _q + '*')
                    wheres = whr if wheres is None else wheres | whr
            if wheres is not None:
                query = query[wheres]

        count_query = query.count()

        if _sorts:
            order = {'asc': True, 'desc': False}
            sorts = []
            for sort in _sorts:
                col, odr = sort.partition(':')[::2]
                if col not in columns + alias_cols:
                    continue
                sorts.append(col)
            if sorts:
                query = query.sort(sorts, ascending=order.get(odr, True))

        if _offset:
            _offset = int(_offset)
        if _limit:
            _limit = int(_limit)
        if _offset and _limit:
            _limit += _offset
        if _offset or _limit:
            query = query[_offset:_limit]

        if _selects:
            query = query[_selects]

        # TODO: Improve json, csv, html outputs using native odo
        result = {
            'query': query,
            'data': bz.odo(bz.compute(query, bzcon.data), pd.DataFrame),
        }
        if _count:
            count = bz.odo(bz.compute(count_query, bzcon.data), pd.DataFrame)
            result['count'] = count.iloc[0, 0]
        return result
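
For reference, a short sketch of the filter and aggregation string formats that the regular expressions above accept (the field names are hypothetical):

import re

wh_re = re.compile(r'([^=><~!]+)([=><~!]{1,2})([\s\S]+)')
agg_re = re.compile(r'([^:]+):([aA-zZ]+)\(([^:]+)\)')

print(wh_re.search('price>=100').groups())          # ('price', '>=', '100')
print(agg_re.search('total:sum(price)').groups())   # ('total', 'sum', 'price')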
Example #40
def test_multikey_by(ctx, db, reducer, reduction):
    t = db.t
    expr = by(t[["id", "amount"]], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {"t": df}})
    assert set(map(frozenset, into(list, result))) == set(map(frozenset, into(list, expected)))
Example #41
def test_grouper_with_arith(ctx, db):
    expr = by(db.t[['id', 'amount']], total=(db.t.amount + 1).sum())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert list(map(set, into(list, result))) == list(map(set, into(list,
                                                                    expected)))
Example #42
def test_by_non_native_ops(ctx, db):
    expr = by(db.t.id, total=db.t.id.nunique())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert list(map(set, into(list,
                              result))) == list(map(set, into(list, expected)))
Example #43
def test_recursive_rowfunc_is_used(rdd):
    expr = by(t['name'], total=(2 * (t['amount'] + t['id'])).sum())
    expected = [('Alice', 2 * (101 + 53)),
                ('Bob', 2 * (202))]
    assert set(compute(expr, rdd).collect()) == set(expected)
Example #44
def groupmeans(data, groups, numbers, cutoff=0.01, quantile=0.95, minsize=None):
    """
    Yields the significant differences in average between every pair of
    groups and numbers.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : ignore anything with prob > cutoff.
        cutoff=None ignores significance checks, speeding it up a LOT.
    quantile : number that represents target improvement. Defaults to .95.
        The ``diff`` returned is the % impact of everyone moving to the 95th
        percentile
    minsize : each group should contain at least minsize values.
        If minsize=None, automatically set the minimum size to
        1% of the dataset, or 10, whichever is larger.
    """

    if minsize is None:
        minsize = max(data.nrows / 100, 10)

    means = {col: data[col].mean() for col in numbers}
    results = []

    for group in groups:
        agg = {number: bz.mean(data[number]) for number in numbers}
        agg["#"] = bz.count(data)
        ave = bz.by(data[group], **agg).sort("#", ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave["#"]

        # Each group should contain at least minsize values
        biggies = sizes[sizes >= minsize].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue
            lo = bz.into(list, data[number][data[group] == sorted_cats.index[0]])
            hi = bz.into(list, data[number][data[group] == sorted_cats.index[-1]])
            _, prob = ttest_ind(np.ma.masked_array(lo, np.isnan(lo)), np.ma.masked_array(hi, np.isnan(hi)))
            if prob > cutoff:
                continue
            results.append(
                {
                    "group": group,
                    "number": number,
                    "prob": prob,
                    "gain": (sorted_cats.iloc[-1] / means[number] - 1)[0],
                    "biggies": ave.ix[biggies][number],
                    "means": ave[[number, "#"]].sort_values(by=number),
                }
            )

    results = pd.DataFrame(results)
    if len(results) > 0:
        results = results.set_index(["group", "number"])
    return results
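
A minimal call sketch for groupmeans, assuming the blaze/pandas versions this snippet was written for and an in-memory frame with one categorical and one numeric column (the column names and values are made up):

import blaze as bz
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'city': ['north', 'south'] * 50,          # hypothetical category column
    'sales': np.random.rand(100) * 100,       # hypothetical numeric column
})
results = groupmeans(bz.data(df), groups=['city'], numbers=['sales'], minsize=10)
print(results)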
Example #45
# Assumed imports for the modules used below (blaze, annoy, sanic, random).
import random

from annoy import AnnoyIndex
from blaze import by, data
from odo import odo
from sanic import Sanic
f = 300

# Load sqlite here
from sqlalchemy import create_engine
from sqlalchemy.engine import reflection
from sqlalchemy.orm import sessionmaker
from sqlalchemy import *

app = Sanic()

tables = data('sqlite:///../../db.db')
coin_table = tables['coins']

# Compress the information
coin_tables_comp = by(coin_table.name, sentences=coin_table.sentences)

# Prepare to use annoy
ctl = len(coin_tables_comp)
item_arr_set = []
item_names = [i[0] for i in coin_tables_comp]

# Initialize and fill annoy
t = AnnoyIndex(f)  # Length of item vector that will be indexed
for i in range(ctl):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_item(i, v)
    item_arr_set.append((item_names[i], i))

t.build(10)  # 10 trees
t.save('../../test.ann')
Example #46
        expected = compute(expr, data)
        if not result == expected:
            print(result)
            print(expected)
            if isinstance(result, float):
                assert abs(result - expected) < 0.001
            else:
                assert result == expected


exprs = [
    t['amount'], t['amount'] == 100, t['amount'].truncate(150),
    t[t['name'] == 'Alice'], t[t['amount'] == 0], t[t['amount'] > 150],
    t['amount'] + t['id'], t['amount'] % t['id'],
    exp(t['amount']),
    by(t['name'], total=t['amount'].sum()),
    by(t['name'],
       total=(t['amount'] + 1).sum()), (t['amount'] * 1).label('foo'),
    t.map(lambda tup: tup[1] + tup[2],
          'real'), t[t.name.like('Alice')], t['amount'].apply(identity,
                                                              'var * real',
                                                              splittable=True),
    t['amount'].map(lambda x: x + 1, 'int')
]

exprs = list(zip(map(str, exprs), exprs))


def tuplify(x):
    return tuple(x) if isinstance(x, list) else x
Example #47
def test_by_non_native_ops(ctx, db):
    expr = by(db.t.id, total=db.t.id.nunique())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert list(map(set, into(list, result))) == list(map(set, into(list,
                                                                    expected)))
Example #48
    result = df.groupby(['f0'])['f2'].sum()
print(result)
t_pandas = t_elapsed

# -- cytoolz --
with ctime(message='cytoolz over bcolz'):
    # In Memory Split-Apply-Combine
    # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby
    r = cytoolz.groupby(lambda row: row.f0, ct)
    result = valmap(compose(sum, pluck(2)), r)
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

# -- blaze + bcolz --
blaze_data = blz.Data(ct.rootdir)
expr = blz.by(blaze_data.f0, sum_f2=blaze_data.f2.sum())
with ctime(message='blaze over bcolz'):
    result = blz.compute(expr)
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

# -- bquery --
with ctime(message='bquery over bcolz'):
    result = ct.groupby(['f0'], ['f2'])
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
print(result)

ct.cache_factor(['f0'], refresh=True)
with ctime(message='bquery over bcolz (factorization cached)'):
    result = ct.groupby(['f0'], ['f2'])
print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2)))
Example #49
def test_grouper_with_arith(ctx, db):
    expr = by(db.t[['id', 'amount']], total=(db.t.amount + 1).sum())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert list(map(set, into(list,
                              result))) == list(map(set, into(list, expected)))
Example #50
def test_by_with_date(ctx, db, attr):
    expr = by(getattr(db.dates.ds, attr),
              mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx), set)
    expected = odo(compute(expr, {db: {'dates': date_df}}), set)
    assert result == expected
Example #51
def test_by_summary(db, ctx):
    t = db.t
    expr = by(t.name, mymin=t.amount.min(), mymax=t.amount.max())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert into(set, result) == into(set, expected)
Example #52
def test_by_summary(db, ctx):
    t = db.t
    expr = by(t.name, mymin=t.amount.min(), mymax=t.amount.max())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert into(set, result) == into(set, expected)