Example #1
def test_add_expanded_payload_has_effect(temp_add_server, serial):
    # Ensure that the expanded payload format actually passes the arguments
    # through to the resource constructor
    iris_path = example('iris-latin1.tsv')
    csv_kwargs = {'delimiter': '\t', 'encoding': 'iso-8859-1'}
    blob = serial.dumps({'iris': {'source': iris_path,
                                  'kwargs': csv_kwargs}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path, **csv_kwargs)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path, **csv_kwargs)})
    expr = t.iris.petal_length.sum()

    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))

    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path, **csv_kwargs)})
    assert result3 == expected3
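For reference, a short sketch (paths are placeholders, not taken from the test suite) contrasting the two payload shapes these tests exercise: the plain form maps a dataset name straight to a source, while the expanded form nests source and kwargs so extra arguments reach the resource constructor.

# Placeholder payloads; the structure mirrors the blobs serialized above.
plain_payload = {'iris': 'iris.csv'}
expanded_payload = {
    'iris': {
        'source': 'iris-latin1.tsv',
        'kwargs': {'delimiter': '\t', 'encoding': 'iso-8859-1'},
    },
}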
Example #2
def test_add_data_to_server(temp_add_server, serial):
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path)})
    expr = t.iris.petal_length.sum()

    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))

    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path)})
    assert result3 == expected3
Example #3
def sql_with_null(url):
    ds = dshape(""" var * {name: ?string,
                           sex: ?string,
                           amount: int,
                           id: int,
                           comment: ?string}
              """)
    rows = [('Alice', 'F', 100, 1, 'Alice comment'),
            (None, 'M', 300, 2, None),
            ('Drew', 'F', 100, 4, 'Drew comment'),
            ('Bob', 'M', 100, 5, 'Bob comment 2'),
            ('Drew', 'M', 200, 5, None),
            ('first', None, 300, 4, 'Missing info'),
            (None, None, 300, 6, None)]
    try:
        x = url % next(names)
        t = data(x, dshape=ds)
        print(x)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        assert t.dshape == ds
        t = data(odo(rows, t))
        try:
            yield t
        finally:
            drop(t)
Example #4
    def test_novel_deltas_macro(self):
        asset_info = asset_infos[0][0]
        base_dates = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-04')
        ])
        baseline = pd.DataFrame({
            'value': (0, 1),
            'asof_date': base_dates,
            'timestamp': base_dates,
        })
        expr = bz.data(baseline, name='expr', dshape=self.macro_dshape)
        deltas = bz.data(baseline, name='deltas', dshape=self.macro_dshape)
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(asset_info)
        expected_views = keymap(pd.Timestamp, {
            '2014-01-03': repeat_last_axis(
                np.array([10.0, 10.0, 10.0]),
                nassets,
            ),
            '2014-01-06': repeat_last_axis(
                np.array([10.0, 10.0, 11.0]),
                nassets,
            ),
        })

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])
        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
Example #5
def sql(url):
    ds = dshape('var * {A: string, B: int64}')
    try:
        t = data(url % next(names), dshape=ds)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        assert t.dshape == ds
        t = data(odo([('a', 1), ('b', 2)], t))
        try:
            yield t
        finally:
            drop(t)
Example #6
def sql_two_tables(url):
    dshape = 'var * {a: int32}'
    try:
        t = data(url % next(names), dshape=dshape)
        u = data(url % next(names), dshape=dshape)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield u, t
        finally:
            drop(t)
            drop(u)
Example #7
def sql_two_tables(url):
    dshape = 'var * {a: int32}'
    try:
        t = data(url % next(names), dshape=dshape)
        u = data(url % next(names), dshape=dshape)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield u, t
        finally:
            drop(t)
            drop(u)
Example #8
def sql(url):
    ds = dshape('var * {A: string, B: int64}')
    try:
        t = data(url % next(names), dshape=ds)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        assert t.dshape == ds
        t = data(odo([('a', 1), ('b', 2)], t))
        try:
            yield t
        finally:
            drop(t)
Example #9
    def test_deltas_only_one_delta_in_universe(self, asset_info):
        expr = bz.data(self.df, name='expr', dshape=self.dshape)
        deltas = pd.DataFrame({
            'sid': [65, 66],
            'asof_date': [self.dates[1], self.dates[0]],
            'timestamp': [self.dates[2], self.dates[1]],
            'value': [10, 11],
        })
        deltas = bz.data(deltas, name='deltas', dshape=self.dshape)
        expected_views = keymap(pd.Timestamp, {
            '2014-01-02': np.array([[0.0, 11.0, 2.0],
                                    [1.0, 2.0, 3.0]]),
            '2014-01-03': np.array([[10.0, 2.0, 3.0],
                                    [2.0, 3.0, 4.0]]),
            '2014-01-04': np.array([[2.0, 3.0, 4.0],
                                    [2.0, 3.0, 4.0]]),
        })

        nassets = len(asset_info)
        if nassets == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan]],
                expected_views,
            )

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                columns=[
                    'value',
                ],
                data=np.array([11, 10, 4]).repeat(len(asset_info.index)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
            )
            dates = self.dates
            dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )
Example #10
    def _bcolz(self, tblname, dbname=None, type=None, df=None, blaze=False):
        if type is None:
            type = self.type
        if dbname is None:
            dbname = self.name
        if df is None:
            # return the dataframe if it exists
            try:
                df = bcz.open(
                    os.path.expanduser(
                        os.path.join(cf.options.basedir, 'databases',
                                     "{}.{}.{}".format(type, dbname,
                                                       tblname))))
            except IOError:
                return None
            else:
                if len(df) == 0:
                    df = pd.DataFrame()
                    if blaze:
                        df = blz.data(df)
                else:
                    if blaze:
                        df = blz.data(df)
                    else:
                        df = df.todataframe()
                if not blaze and 'idx' in df.columns.values:
                    df.set_index('idx', drop=True, inplace=True)
                    df.index.name = None
                return df

        else:
            if not (df.index.dtype_str == 'int64') and not (df.empty):
                df = df.copy()
                df['idx'] = df.index
            if isinstance(df, pd.DataFrame):
                path = os.path.expanduser(
                    os.path.join(cf.options.basedir, 'databases',
                                 "{}.{}.{}".format(type, dbname, tblname)))
                if df.empty:
                    bcz.fromiter((),
                                 dtype=np.int32,
                                 mode='w',
                                 count=0,
                                 rootdir=path)
                else:
                    bcz.ctable.fromdataframe(df, mode='w', rootdir=path)

            if 'idx' in df.columns.values:
                del df
            return
Example #11
    def test_deltas_only_one_delta_in_universe(self, asset_info):
        expr = bz.data(self.df, name='expr', dshape=self.dshape)
        deltas = pd.DataFrame({
            'sid': [65, 66],
            'asof_date': [self.dates[1], self.dates[0]],
            'timestamp': [self.dates[2], self.dates[1]],
            'value': [10, 11],
        })
        deltas = bz.data(deltas, name='deltas', dshape=self.dshape)
        expected_views = keymap(
            pd.Timestamp, {
                '2014-01-02': np.array([[0.0, 11.0, 2.0], [1.0, 2.0, 3.0]]),
                '2014-01-03': np.array([[10.0, 2.0, 3.0], [2.0, 3.0, 4.0]]),
                '2014-01-04': np.array([[2.0, 3.0, 4.0], [2.0, 3.0, 4.0]]),
            })

        nassets = len(asset_info)
        if nassets == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan]],
                expected_views,
            )

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                columns=[
                    'value',
                ],
                data=np.array([11, 10, 4]).repeat(len(asset_info.index)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
            )
            dates = self.dates
            dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )
Example #12
    def _test_id(self, df, dshape, expected, finder, add):
        expr = bz.data(df, name='expr', dshape=dshape)
        loader = BlazeLoader()
        ds = from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        )
        p = Pipeline()
        for a in add:
            p.add(getattr(ds, a).latest, a)
        dates = self.dates

        with tmp_asset_finder() as finder:
            result = SimplePipelineEngine(
                loader,
                dates,
                finder,
            ).run_pipeline(p, dates[0], dates[-1])

        assert_frame_equal(
            result,
            _utc_localize_index_level_0(expected),
            check_dtype=False,
        )
Example #13
def test_url_csv_data(iris_local):
    iris_remote = data(iris_url)
    assert isinstance(iris_remote.data, URL(CSV))
    iris_remote_df = compute(iris_remote)
    assert isinstance(iris_remote_df, pd.DataFrame)
    iris_local_df = compute(iris_local)
    tm.assert_frame_equal(iris_remote_df, iris_local_df)
Example #14
    def test_custom_query_time_tz(self):
        df = self.df.copy()
        df['timestamp'] = (
            pd.DatetimeIndex(df['timestamp'], tz='EST') +
            timedelta(hours=8, minutes=44)).tz_convert('utc').tz_localize(None)
        df.ix[3:5, 'timestamp'] = pd.Timestamp('2014-01-01 13:45')
        expr = bz.data(df, name='expr', dshape=self.dshape)
        loader = BlazeLoader(data_query_time=time(8, 45), data_query_tz='EST')
        ds = from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        )
        p = Pipeline()
        p.add(ds.value.latest, 'value')
        p.add(ds.int_value.latest, 'int_value')
        dates = self.dates

        result = SimplePipelineEngine(
            loader,
            dates,
            self.asset_finder,
        ).run_pipeline(p, dates[0], dates[-1])

        expected = df.drop('asof_date', axis=1)
        expected['timestamp'] = expected['timestamp'].dt.normalize().astype(
            'datetime64[ns]', ).dt.tz_localize('utc')
        expected.ix[3:5, 'timestamp'] += timedelta(days=1)
        expected.set_index(['timestamp', 'sid'], inplace=True)
        expected.index = pd.MultiIndex.from_product((
            expected.index.levels[0],
            self.asset_finder.retrieve_all(expected.index.levels[1]),
        ))
        assert_frame_equal(result, expected, check_dtype=False)
Example #15
def initialize(rundb, topicsfile, index_path, max_rank):

    #buf = io.StringIO()

    #for fn in runfiles:
    #    with open(fn) as f:
    #        buf.write(f.read())
    #buf.seek(0)

    rcoll = TrecRunIndexedCollection(None)

    #rcoll.run.run_data = pd.read_csv(
    #    buf, sep='\s+', names=['query', 'q0', 'docid', 'rank', 'score', 'system', 'other'])
    rcoll.run.run_data = blaze.data(
        rundb
    )
    #rcoll.run.run_data.sort(['query', 'score'], inplace=True, ascending=[True, False])


    queries_by_id = None

    if index_path is not None:
        icoll = AnseriniLuceneCollection(index_path)
        coll = AutoDelegate(rcoll, icoll)
        queries_by_id = TrecRobustQueries(topicsfile, collection_for_processing=icoll)
    else:
        coll = rcoll

    ctx = RerankingContext(coll, Features(coll))
    rd = rcoll.run.run_data
    rd = rd[rd['rank'] <= max_rank]

    return queries_by_id, rd, ctx
Example #16
async def check_status(req):
    # verify that the upstream services are functional
    engine = sa.create_engine(app.config.dbc.uri)
    db = bz.data(engine)
    dbinfo = {
        'host': db.data.engine.url.host,
        'engine': db.data.engine.name,
        'tables': db.fields,
        'config': app.config.dbc
    }
    r = None
    wrkinfo = None
    try:
        r = rq.get(app.config.stats_svc)
        wrkinfo = r.json()
        r = {'error': None, 'data': None}
    except Exception as e:
        raise ServerError(str(e))
    return json({
        'data': {
            'db': dbinfo,
            'worker_url': app.config.stats_svc,
            'worker_status': wrkinfo
        }
    })
Example #17
def test_url_csv_data(iris_local):
    iris_remote = data(iris_url)
    assert isinstance(iris_remote.data, URL(CSV))
    iris_remote_df = compute(iris_remote)
    assert isinstance(iris_remote_df, pd.DataFrame)
    iris_local_df = compute(iris_local)
    tm.assert_frame_equal(iris_remote_df, iris_local_df)
Example #18
    def _test_id(self, df, dshape, expected, finder, add):
        expr = bz.data(df, name='expr', dshape=dshape)
        loader = BlazeLoader()
        ds = from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        )
        p = Pipeline()
        for a in add:
            p.add(getattr(ds, a).latest, a)
        dates = self.dates

        with tmp_asset_finder() as finder:
            result = SimplePipelineEngine(
                loader,
                dates,
                finder,
            ).run_pipeline(p, dates[0], dates[-1])

        assert_frame_equal(
            result,
            _utc_localize_index_level_0(expected),
            check_dtype=False,
        )
Example #19
def bind_expression_to_resources(expr, resources):
    """
    Bind a Blaze expression to resources.

    Parameters
    ----------
    expr : bz.Expr
        The expression to which we want to bind resources.
    resources : dict[bz.Symbol -> any]
        Mapping from the loadable terms of ``expr`` to actual data resources.

    Returns
    -------
    bound_expr : bz.Expr
        ``expr`` with bound resources.
    """
    # bind the resources into the expression
    if resources is None:
        resources = {}

    # _subs stands for substitute.  It's not actually private, blaze just
    # prefixes symbol-manipulation methods with underscores to prevent
    # collisions with data column names.
    return expr._subs(
        {k: bz.data(v, dshape=k.dshape)
         for k, v in iteritems(resources)})
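As a rough usage sketch (not from the original module), bind_expression_to_resources could be exercised with a hypothetical Blaze symbol and an in-memory DataFrame standing in for the loadable term and its resource:

import blaze as bz
import pandas as pd

# Hypothetical symbol and resource; the dshape matches the frame's columns.
prices = bz.symbol('prices', 'var * {sid: int64, close: float64}')
frame = pd.DataFrame({'sid': [1, 2], 'close': [10.0, 11.5]})

# Substitute the concrete data for the free symbol inside a larger expression.
bound = bind_expression_to_resources(prices.close.mean(), {prices: frame})
result = bz.compute(bound)  # evaluates against the bound DataFrame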
Example #20
    def test_tabular(self):
        name = 'expr'
        expr = bz.data(self.df, name=name, dshape=self.dshape)
        ds = from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        )
        self.assertEqual(ds.__name__, name)
        self.assertTrue(issubclass(ds, DataSet))

        self.assertIs(ds.value.dtype, float64_dtype)
        self.assertIs(ds.int_value.dtype, int64_dtype)

        self.assertTrue(np.isnan(ds.value.missing_value))
        self.assertEqual(ds.int_value.missing_value, 0)

        # test memoization
        self.assertIs(
            from_blaze(
                expr,
                loader=self.garbage_loader,
                no_deltas_rule=no_deltas_rules.ignore,
                missing_values=self.missing_values,
            ),
            ds,
        )
Example #21
def bind_expression_to_resources(expr, resources):
    """
    Bind a Blaze expression to resources.

    Parameters
    ----------
    expr : bz.Expr
        The expression to which we want to bind resources.
    resources : dict[bz.Symbol -> any]
        Mapping from the loadable terms of ``expr`` to actual data resources.

    Returns
    -------
    bound_expr : bz.Expr
        ``expr`` with bound resources.
    """
    # bind the resources into the expression
    if resources is None:
        resources = {}

    # _subs stands for substitute.  It's not actually private, blaze just
    # prefixes symbol-manipulation methods with underscores to prevent
    # collisions with data column names.
    return expr._subs({
        k: bz.data(v, dshape=k.dshape) for k, v in iteritems(resources)
    })
Example #22
    def test_tabular(self):
        name = 'expr'
        expr = bz.data(self.df, name=name, dshape=self.dshape)
        ds = from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        )
        self.assertEqual(ds.__name__, name)
        self.assertTrue(issubclass(ds, DataSet))

        self.assertIs(ds.value.dtype, float64_dtype)
        self.assertIs(ds.int_value.dtype, int64_dtype)

        self.assertTrue(np.isnan(ds.value.missing_value))
        self.assertEqual(ds.int_value.missing_value, 0)

        # test memoization
        self.assertIs(
            from_blaze(
                expr,
                loader=self.garbage_loader,
                no_deltas_rule=no_deltas_rules.ignore,
                missing_values=self.missing_values,
            ),
            ds,
        )
Example #23
def test_subsecond(sql_with_subsecond_dts):
    """Verify that `.second` returns a value with subsecond resolution and does not
    truncate to the second.
    """
    t = data(sql_with_subsecond_dts)
    result = compute(t.A.second, sql_with_subsecond_dts, return_type=pd.Series)
    assert_series_equal(result, pd.Series([0.042, 0.047], name='A_second'))
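A minimal, self-contained pandas sketch (not part of the fixture) showing how fractional values like the expected 0.042 and 0.047 arise when the sub-second component is kept rather than truncated:

import pandas as pd

# Invented timestamps with millisecond components.
dts = pd.Series(pd.to_datetime(['2014-01-01 00:00:00.042',
                                '2014-01-01 00:00:00.047']), name='A')
# Whole seconds plus the sub-second fraction, unlike the truncating .dt.second.
subseconds = dts.dt.second + dts.dt.microsecond / 1e6
print(subseconds.tolist())  # [0.042, 0.047]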
Example #24
def test_map_called_on_data_star():
    r = data(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]
Example #25
def test_swap_resources_into_scope():

    from blaze import data
    t = data([1, 2, 3], dshape='3 * int', name='t')
    scope = swap_resources_into_scope(t.head(2), {})

    assert t._resources()
    assert t in scope
Example #26
    def make_loader(cls, events, next_value_columns, previous_value_columns):
        import blaze as bz

        return BlazeEventsLoader(
            bz.data(events),
            next_value_columns,
            previous_value_columns,
        )
Example #27
def test_map_called_on_data_star():
    r = data(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]
Example #28
    def test_no_concrete_loader_defined(self):
        with self.assertRaisesRegexp(
                TypeError, re.escape(ABSTRACT_CONCRETE_LOADER_ERROR)):
            BlazeEventDataSetLoaderNoConcreteLoader(
                bz.data(
                    pd.DataFrame({
                        ANNOUNCEMENT_FIELD_NAME: dtx,
                        SID_FIELD_NAME: 0
                    })))
Example #29
    def start_requests(self):
        biz_path = getattr(self, 'biz_json', 'data/biz.json')
        biz = bz.data(biz_path)
        biz = bz.compute(biz[['id', 'url']])
        self.logger.info("%s start urls for ReviewsSpider", str(len(biz)))
        for biz_id, url in biz:
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True,
                                 meta={'id': biz_id})
Example #30
    def test_auto_deltas_fail_raise(self):
        loader = BlazeLoader()
        expr = bz.data(self.df, dshape=self.dshape)
        with self.assertRaises(ValueError) as e:
            from_blaze(
                expr,
                loader=loader,
                no_deltas_rule=no_deltas_rules.raise_,
            )
        self.assertIn(str(expr), str(e.exception))
Example #31
    def test_no_concrete_loader_defined(self):
        with self.assertRaisesRegexp(
                TypeError, re.escape(ABSTRACT_CONCRETE_LOADER_ERROR)
        ):
            BlazeEventDataSetLoaderNoConcreteLoader(
                bz.data(
                    pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx,
                                  SID_FIELD_NAME: 0})
                )
            )
Example #32
def sql_with_float(url):
    try:
        t = data(url % next(names), dshape='var * {c: float64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield t
        finally:
            drop(t)
Example #33
    def test_auto_deltas_fail_raise(self):
        loader = BlazeLoader()
        expr = bz.data(self.df, dshape=self.dshape)
        with self.assertRaises(ValueError) as e:
            from_blaze(
                expr,
                loader=loader,
                no_deltas_rule=no_deltas_rules.raise_,
            )
        self.assertIn(str(expr), str(e.exception))
Example #34
    def test_deltas_macro(self):
        asset_info = asset_infos[0][0]
        expr = bz.data(self.macro_df, name='expr', dshape=self.macro_dshape)
        deltas = bz.data(
            self.macro_df.iloc[:-1],
            name='deltas',
            dshape=self.macro_dshape,
        )
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(asset_info)
        expected_views = keymap(
            pd.Timestamp, {
                '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
                '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
            })

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value', ),
            )
            dates = self.dates
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )
Example #35
def sql_with_float(url):
    try:
        t = data(url % next(names), dshape='var * {c: float64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield t
        finally:
            drop(t)
Example #36
def make_dataset(df, name):
    """构造指定数据集名称的数据集"""
    old_dshape = discover(df)
    expr = blaze.data(df, _normalized_dshape(old_dshape), name)
    return from_blaze(expr,
                      loader=global_loader,
                      no_deltas_rule='ignore',
                      no_checkpoints_rule='ignore',
                      missing_values=make_default_missing_values_for_df(
                          df.dtypes))
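A hypothetical usage sketch, assuming this module's helpers (global_loader, _normalized_dshape, make_default_missing_values_for_df) are importable and the frame carries the usual sid/asof_date/timestamp columns; names and values here are invented:

import pandas as pd

# Illustrative event frame.
events = pd.DataFrame({
    'sid': [1, 1, 2],
    'asof_date': pd.to_datetime(['2014-01-01', '2014-01-02', '2014-01-01']),
    'timestamp': pd.to_datetime(['2014-01-01', '2014-01-02', '2014-01-01']),
    'value': [1.0, 2.0, 3.0],
})

MyData = make_dataset(events, 'MyData')  # a DataSet subclass named 'MyData'
term = MyData.value.latest               # usable as a Pipeline column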
Example #37
    def test_deltas_macro(self):
        asset_info = asset_infos[0][0]
        expr = bz.data(self.macro_df, name='expr', dshape=self.macro_dshape)
        deltas = bz.data(
            self.macro_df.iloc[:-1],
            name='deltas',
            dshape=self.macro_dshape,
        )
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(asset_info)
        expected_views = keymap(pd.Timestamp, {
            '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
            '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
        })

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            dates = self.dates
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )
Example #38
def sql_with_dts(url):
    try:
        t = data(url % next(names), dshape='var * {A: datetime}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([(d, ) for d in pd.date_range('2014-01-01', '2014-02-01')], t)
        try:
            yield t
        finally:
            drop(t)
Example #39
def sql_with_timedeltas(url):
    try:
        t = data(url % next(names), dshape='var * {N: timedelta}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([(timedelta(seconds=n), ) for n in range(10)], t)
        try:
            yield t
        finally:
            drop(t)
Example #40
def sqla(url):
    try:
        t = data(url % next(names), dshape='var * {A: ?string, B: ?int32}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([('a', 1), (None, 1), ('c', None)], t)
        try:
            yield t
        finally:
            drop(t)
Example #41
def sqlb(url):
    try:
        t = data(url % next(names), dshape='var * {A: string, B: int64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([('a', 1), ('b', 2)], t)
        try:
            yield t
        finally:
            drop(t)
Example #42
def big_sql(url):
    try:
        t = data(url % next(names), dshape='var * {A: string, B: int64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo(zip(list('a' * 100), list(range(100))), t)
        try:
            yield t
        finally:
            drop(t)
Example #43
def test_concat():
    d = {'a.csv': 'a,b\n1,2\n3,4',
         'b.csv': 'a,b\n5,6\n7,8'}

    with filetexts(d):
        a_rsc = data('a.csv')
        b_rsc = data('b.csv')

        a = symbol('a', discover(a_rsc))
        b = symbol('b', discover(b_rsc))

        tm.assert_frame_equal(
            odo(
                compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame,
            ),

            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype='int64').reshape(4, 2),
                         columns=list('ab')),
        )
Example #44
    def start_requests(self):
        zip_path = getattr(self, 'zip_csv', 'data/nyc_zip_codes.csv')
        zips = bz.data(zip_path)
        zips = bz.compute(zips.Zip_Code)
        url_str = "https://www.yelp.com/search?find_desc=Restaurants&find_loc={}"
        urls = [url_str.format(z) for z in zips]
        self.logger.info("%s start urls for BizInfoSpider", str(len(urls)))
        for url in urls:
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
Example #45
def test_concat():
    d = {'a.csv': 'a,b\n1,2\n3,4',
         'b.csv': 'a,b\n5,6\n7,8'}

    with filetexts(d):
        a_rsc = data('a.csv')
        b_rsc = data('b.csv')

        a = symbol('a', discover(a_rsc))
        b = symbol('b', discover(b_rsc))

        tm.assert_frame_equal(
            odo(
                compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame,
            ),

            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype='int64').reshape(4, 2),
                         columns=list('ab')),
        )
Example #46
    def pipeline_event_loader_args(self, dates):
        _, mapping = super(
            BlazeConsensusEstimatesLoaderTestCase,
            self,
        ).pipeline_event_loader_args(dates)
        frames = []
        for sid, df in iteritems(mapping):
            frame = df.copy()
            frame[SID_FIELD_NAME] = sid
            frames.append(frame)
        return bz.data(pd.concat(frames).reset_index(drop=True)),
Example #47
def test_csv_join():
    d = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
         'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(d):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))
        tm.assert_frame_equal(
            odo(
                compute(join(a, b, 'c'), {a: data_a, b: data_b}),
                pd.DataFrame,
            ),

            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4],
                                   [5, 3, 4, 6, 7]], dtype='int64'),
                         columns=list('cabde'))
        )
Example #48
def test_csv_join():
    d = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
         'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(d):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))
        tm.assert_frame_equal(
            odo(
                compute(join(a, b, 'c'), {a: data_a, b: data_b}),
                pd.DataFrame,
            ),

            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4],
                                   [5, 3, 4, 6, 7]], dtype='int64'),
                         columns=list('cabde'))
        )
Example #49
def sql_with_timedeltas(url):
    try:
        t = data(url % next(names), dshape='var * {N: timedelta}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([(timedelta(seconds=n),) for n in range(10)], t)
        try:
            yield t
        finally:
            drop(t)
Example #50
def sql_with_dts(url):
    try:
        t = data(url % next(names), dshape='var * {A: datetime}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([(d,) for d in pd.date_range('2014-01-01', '2014-02-01')], t)
        try:
            yield t
        finally:
            drop(t)
Example #51
def sqlb(url):
    try:
        t = data(url % next(names), dshape='var * {A: string, B: int64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([('a', 1), ('b', 2)], t)
        try:
            yield t
        finally:
            drop(t)
Example #52
def sqla(url):
    try:
        t = data(url % next(names), dshape='var * {A: ?string, B: ?int32}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([('a', 1), (None, 1), ('c', None)], t)
        try:
            yield t
        finally:
            drop(t)
Example #53
def big_sql(url):
    try:
        t = data(url % next(names), dshape='var * {A: string, B: int64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo(zip(list('a'*100), list(range(100))), t)
        try:
            yield t
        finally:
            drop(t)
Example #54
def test_groups():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')

        hdf = data('hdfstore://%s' % fn)
        assert dshape(discover(hdf)) == dshape(discover({'data': {'fixed': df}}))

        s = symbol('s', discover(hdf))

        assert list(compute(s.data.fixed, hdf).a) == [1, 2, 3, 4]

        hdf.data.close()
Example #55
def test_hdfstore():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/appendable', format='table')
        df.to_hdf(fn, '/fixed')

        hdf = data('hdfstore://%s' % fn)
        s = symbol('s', discover(hdf))

        assert isinstance(compute(s.fixed, hdf),
                          (pd.DataFrame, pd.io.pytables.Fixed))
        assert isinstance(compute(s.appendable, hdf),
                          (pd.io.pytables.AppendableFrameTable, Chunks))

        s = symbol('s', discover(df))
        f = data('hdfstore://%s::/fixed' % fn)
        a = data('hdfstore://%s::/appendable' % fn)
        assert isinstance(pre_compute(s, a), Chunks)

        hdf.data.close()
        f.data.parent.close()
        a.data.parent.close()
Example #56
    def test_complex_expr(self):
        expr = bz.data(self.df, dshape=self.dshape)
        # put an Add in the table
        expr_with_add = bz.transform(expr, value=expr.value + 1)

        # Test that we can have complex expressions with no deltas
        from_blaze(
            expr_with_add,
            deltas=None,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

        with self.assertRaises(TypeError):
            from_blaze(
                expr.value + 1,  # put an Add in the column
                deltas=None,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )

        deltas = bz.data(
            pd.DataFrame(columns=self.df.columns),
            dshape=self.dshape,
        )
        with self.assertRaises(TypeError):
            from_blaze(
                expr_with_add,
                deltas=deltas,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )

        with self.assertRaises(TypeError):
            from_blaze(
                expr.value + 1,
                deltas=deltas,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )
Example #57
    def pipeline_event_loader_args(self, dates):
        _, mapping = super(
            BlazeEarningsCalendarLoaderTestCase,
            self,
        ).pipeline_event_loader_args(dates)
        return (bz.data(pd.concat(
            pd.DataFrame({
                ANNOUNCEMENT_FIELD_NAME: df[ANNOUNCEMENT_FIELD_NAME],
                TS_FIELD_NAME: df[TS_FIELD_NAME],
                SID_FIELD_NAME: sid,
            })
            for sid, df in iteritems(mapping)
        ).reset_index(drop=True)),)