def test_add_expanded_payload_has_effect(temp_add_server, serial):
    # Ensure that the expanded payload format actually passes the arguments
    # through to the resource constructor
    iris_path = example('iris-latin1.tsv')
    csv_kwargs = {'delimiter': '\t', 'encoding': 'iso-8859-1'}
    blob = serial.dumps({'iris': {'source': iris_path, 'kwargs': csv_kwargs}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path, **csv_kwargs)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path, **csv_kwargs)})
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path, **csv_kwargs)})
    assert result3 == expected3

def test_add_data_to_server(temp_add_server, serial):
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path)})
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path)})
    assert result3 == expected3

def sql_with_null(url):
    ds = dshape(""" var * {name: ?string,
                           sex: ?string,
                           amount: int,
                           id: int,
                           comment: ?string}
                """)
    rows = [('Alice', 'F', 100, 1, 'Alice comment'),
            (None, 'M', 300, 2, None),
            ('Drew', 'F', 100, 4, 'Drew comment'),
            ('Bob', 'M', 100, 5, 'Bob comment 2'),
            ('Drew', 'M', 200, 5, None),
            ('first', None, 300, 4, 'Missing info'),
            (None, None, 300, 6, None)]
    try:
        x = url % next(names)
        t = data(x, dshape=ds)
        print(x)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        assert t.dshape == ds
        t = data(odo(rows, t))
        try:
            yield t
        finally:
            drop(t)

def test_novel_deltas_macro(self):
    asset_info = asset_infos[0][0]
    base_dates = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-04'),
    ])
    baseline = pd.DataFrame({
        'value': (0, 1),
        'asof_date': base_dates,
        'timestamp': base_dates,
    })
    expr = bz.data(baseline, name='expr', dshape=self.macro_dshape)
    deltas = bz.data(baseline, name='deltas', dshape=self.macro_dshape)
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-03': repeat_last_axis(
            np.array([10.0, 10.0, 10.0]),
            nassets,
        ),
        '2014-01-06': repeat_last_axis(
            np.array([10.0, 10.0, 11.0]),
            nassets,
        ),
    })
    cal = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-02'),
        pd.Timestamp('2014-01-03'),
        # omitting the 4th and 5th to simulate a weekend
        pd.Timestamp('2014-01-06'),
    ])
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )

def sql(url):
    ds = dshape('var * {A: string, B: int64}')
    try:
        t = data(url % next(names), dshape=ds)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        assert t.dshape == ds
        t = data(odo([('a', 1), ('b', 2)], t))
        try:
            yield t
        finally:
            drop(t)

def sql_two_tables(url):
    dshape = 'var * {a: int32}'
    try:
        t = data(url % next(names), dshape=dshape)
        u = data(url % next(names), dshape=dshape)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield u, t
        finally:
            drop(t)
            drop(u)

def test_deltas_only_one_delta_in_universe(self, asset_info):
    expr = bz.data(self.df, name='expr', dshape=self.dshape)
    deltas = pd.DataFrame({
        'sid': [65, 66],
        'asof_date': [self.dates[1], self.dates[0]],
        'timestamp': [self.dates[2], self.dates[1]],
        'value': [10, 11],
    })
    deltas = bz.data(deltas, name='deltas', dshape=self.dshape)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-02': np.array([[0.0, 11.0, 2.0],
                                [1.0, 2.0, 3.0]]),
        '2014-01-03': np.array([[10.0, 2.0, 3.0],
                                [2.0, 3.0, 4.0]]),
        '2014-01-04': np.array([[2.0, 3.0, 4.0],
                                [2.0, 3.0, 4.0]]),
    })
    nassets = len(asset_info)
    if nassets == 4:
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan]],
            expected_views,
        )

    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            columns=[
                'value',
            ],
            data=np.array([11, 10, 4]).repeat(len(asset_info.index)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
        )
        dates = self.dates
        dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )

def _bcolz(self, tblname, dbname=None, type=None, df=None, blaze=False):
    if type is None:
        type = self.type
    if dbname is None:
        dbname = self.name
    if df is None:
        # return the dataframe if it exists
        try:
            df = bcz.open(
                os.path.expanduser(
                    os.path.join(cf.options.basedir, 'databases',
                                 "{}.{}.{}".format(type, dbname, tblname))))
        except IOError:
            return None
        else:
            if len(df) == 0:
                df = pd.DataFrame()
                if blaze:
                    df = blz.data(df)
            else:
                if blaze:
                    df = blz.data(df)
                else:
                    df = df.todataframe()
            if not blaze and 'idx' in df.columns.values:
                df.set_index('idx', drop=True, inplace=True)
                df.index.name = None
            return df
    else:
        if not (df.index.dtype_str == 'int64') and not (df.empty):
            df = df.copy()
            df['idx'] = df.index
        if isinstance(df, pd.DataFrame):
            path = os.path.expanduser(
                os.path.join(cf.options.basedir, 'databases',
                             "{}.{}.{}".format(type, dbname, tblname)))
            if df.empty:
                bcz.fromiter((), dtype=np.int32, mode='w', count=0,
                             rootdir=path)
            else:
                bcz.ctable.fromdataframe(df, mode='w', rootdir=path)
        if 'idx' in df.columns.values:
            del df
        return

def _test_id(self, df, dshape, expected, finder, add):
    expr = bz.data(df, name='expr', dshape=dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    p = Pipeline()
    for a in add:
        p.add(getattr(ds, a).latest, a)
    dates = self.dates

    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    assert_frame_equal(
        result,
        _utc_localize_index_level_0(expected),
        check_dtype=False,
    )

def test_url_csv_data(iris_local):
    iris_remote = data(iris_url)
    assert isinstance(iris_remote.data, URL(CSV))
    iris_remote_df = compute(iris_remote)
    assert isinstance(iris_remote_df, pd.DataFrame)
    iris_local_df = compute(iris_local)
    tm.assert_frame_equal(iris_remote_df, iris_local_df)

def test_custom_query_time_tz(self):
    df = self.df.copy()
    df['timestamp'] = (
        pd.DatetimeIndex(df['timestamp'], tz='EST') +
        timedelta(hours=8, minutes=44)
    ).tz_convert('utc').tz_localize(None)
    df.ix[3:5, 'timestamp'] = pd.Timestamp('2014-01-01 13:45')
    expr = bz.data(df, name='expr', dshape=self.dshape)
    loader = BlazeLoader(data_query_time=time(8, 45), data_query_tz='EST')
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    p = Pipeline()
    p.add(ds.value.latest, 'value')
    p.add(ds.int_value.latest, 'int_value')
    dates = self.dates

    result = SimplePipelineEngine(
        loader,
        dates,
        self.asset_finder,
    ).run_pipeline(p, dates[0], dates[-1])

    expected = df.drop('asof_date', axis=1)
    expected['timestamp'] = expected['timestamp'].dt.normalize().astype(
        'datetime64[ns]',
    ).dt.tz_localize('utc')
    expected.ix[3:5, 'timestamp'] += timedelta(days=1)
    expected.set_index(['timestamp', 'sid'], inplace=True)
    expected.index = pd.MultiIndex.from_product((
        expected.index.levels[0],
        self.asset_finder.retrieve_all(expected.index.levels[1]),
    ))
    assert_frame_equal(result, expected, check_dtype=False)

def initialize(rundb, topicsfile, index_path, max_rank):
    #buf = io.StringIO()
    #for fn in runfiles:
    #    with open(fn) as f:
    #        buf.write(f.read())
    #buf.seek(0)
    rcoll = TrecRunIndexedCollection(None)
    #rcoll.run.run_data = pd.read_csv(
    #    buf, sep='\s+',
    #    names=['query', 'q0', 'docid', 'rank', 'score', 'system', 'other'])
    rcoll.run.run_data = blaze.data(rundb)
    #rcoll.run.run_data.sort(['query', 'score'], inplace=True,
    #                        ascending=[True, False])
    queries_by_id = None
    if index_path is not None:
        icoll = AnseriniLuceneCollection(index_path)
        coll = AutoDelegate(rcoll, icoll)
        queries_by_id = TrecRobustQueries(topicsfile,
                                          collection_for_processing=icoll)
    else:
        coll = rcoll
    ctx = RerankingContext(coll, Features(coll))
    rd = rcoll.run.run_data
    rd = rd[rd['rank'] <= max_rank]
    return queries_by_id, rd, ctx

async def check_status(req):
    # verify that the upstream services are functional
    engine = sa.create_engine(app.config.dbc.uri)
    db = bz.data(engine)
    dbinfo = {
        'host': db.data.engine.url.host,
        'engine': db.data.engine.name,
        'tables': db.fields,
        'config': app.config.dbc
    }
    r = None
    wrkinfo = None
    try:
        r = rq.get(app.config.stats_svc)
        wrkinfo = r.json()
        r = {'error': None, 'data': None}
    except Exception as e:
        raise ServerError(str(e))
    return json({
        'data': {
            'db': dbinfo,
            'worker_url': app.config.stats_svc,
            'worker_status': wrkinfo
        }
    })

def bind_expression_to_resources(expr, resources):
    """
    Bind a Blaze expression to resources.

    Parameters
    ----------
    expr : bz.Expr
        The expression to which we want to bind resources.
    resources : dict[bz.Symbol -> any]
        Mapping from the loadable terms of ``expr`` to actual data resources.

    Returns
    -------
    bound_expr : bz.Expr
        ``expr`` with bound resources.
    """
    # bind the resources into the expression
    if resources is None:
        resources = {}

    # _subs stands for substitute. It's not actually private, blaze just
    # prefixes symbol-manipulation methods with underscores to prevent
    # collisions with data column names.
    return expr._subs({
        k: bz.data(v, dshape=k.dshape)
        for k, v in iteritems(resources)
    })

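# A minimal usage sketch (added for illustration, not part of the original
# source). It assumes blaze and pandas are importable; the names `events`
# and `events_df` are hypothetical.
def _example_bind_expression_usage():
    import blaze as bz
    import pandas as pd

    # An abstract symbol standing in for a loadable term of a larger expression.
    events = bz.symbol('events', 'var * {sid: int64, value: float64}')
    # A concrete, in-memory resource to bind to that symbol.
    events_df = pd.DataFrame({'sid': [1, 2], 'value': [0.5, 1.5]})

    # Substitute the DataFrame for the symbol, then compute the bound expression.
    bound = bind_expression_to_resources(events.value.sum(),
                                         {events: events_df})
    return bz.compute(bound)
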
def test_tabular(self):
    name = 'expr'
    expr = bz.data(self.df, name=name, dshape=self.dshape)
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    self.assertEqual(ds.__name__, name)
    self.assertTrue(issubclass(ds, DataSet))

    self.assertIs(ds.value.dtype, float64_dtype)
    self.assertIs(ds.int_value.dtype, int64_dtype)

    self.assertTrue(np.isnan(ds.value.missing_value))
    self.assertEqual(ds.int_value.missing_value, 0)

    # test memoization
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ),
        ds,
    )

def test_subsecond(sql_with_subsecond_dts):
    """Verify that `.second` returns a value with subsecond resolution and
    does not truncate to the second.
    """
    t = data(sql_with_subsecond_dts)
    result = compute(t.A.second, sql_with_subsecond_dts, return_type=pd.Series)
    assert_series_equal(result, pd.Series([0.042, 0.047], name='A_second'))

def test_map_called_on_data_star():
    r = data(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]

def test_swap_resources_into_scope():
    from blaze import data
    t = data([1, 2, 3], dshape='3 * int', name='t')
    scope = swap_resources_into_scope(t.head(2), {})
    assert t._resources()
    assert t in scope

def make_loader(cls, events, next_value_columns, previous_value_columns):
    import blaze as bz
    return BlazeEventsLoader(
        bz.data(events),
        next_value_columns,
        previous_value_columns,
    )

def test_no_concrete_loader_defined(self):
    with self.assertRaisesRegexp(
            TypeError, re.escape(ABSTRACT_CONCRETE_LOADER_ERROR)):
        BlazeEventDataSetLoaderNoConcreteLoader(
            bz.data(
                pd.DataFrame({
                    ANNOUNCEMENT_FIELD_NAME: dtx,
                    SID_FIELD_NAME: 0,
                })
            )
        )

def start_requests(self):
    biz_path = getattr(self, 'biz_json', 'data/biz.json')
    biz = bz.data(biz_path)
    biz = bz.compute(biz[['id', 'url']])
    self.logger.info("%s start urls for ReviewsSpider", str(len(biz)))
    for biz_id, url in biz:
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True,
                             meta={'id': biz_id})

def test_auto_deltas_fail_raise(self):
    loader = BlazeLoader()
    expr = bz.data(self.df, dshape=self.dshape)
    with self.assertRaises(ValueError) as e:
        from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.raise_,
        )
    self.assertIn(str(expr), str(e.exception))

def sql_with_float(url):
    try:
        t = data(url % next(names), dshape='var * {c: float64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield t
        finally:
            drop(t)

def test_deltas_macro(self):
    asset_info = asset_infos[0][0]
    expr = bz.data(self.macro_df, name='expr', dshape=self.macro_dshape)
    deltas = bz.data(
        self.macro_df.iloc[:-1],
        name='deltas',
        dshape=self.macro_dshape,
    )
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )

    nassets = len(asset_info)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
        '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
    })

    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        dates = self.dates
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )

def make_dataset(df, name):
    """Construct a dataset with the given dataset name."""
    old_dshape = discover(df)
    expr = blaze.data(df, _normalized_dshape(old_dshape), name)
    return from_blaze(expr,
                      loader=global_loader,
                      no_deltas_rule='ignore',
                      no_checkpoints_rule='ignore',
                      missing_values=make_default_missing_values_for_df(
                          df.dtypes))

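# A minimal usage sketch (added for illustration, not part of the original
# source). The frame layout and the names `my_factor_df` / `MyData` / `score`
# are hypothetical; the frame is assumed to carry the sid/asof_date/timestamp
# columns that from_blaze-style loaders generally expect.
def _example_make_dataset_usage():
    import pandas as pd

    dates = pd.to_datetime(['2014-01-01', '2014-01-02'])
    my_factor_df = pd.DataFrame({
        'sid': [1, 1],
        'asof_date': dates,
        'timestamp': dates,
        'score': [0.5, 0.7],
    })
    # The returned DataSet exposes one column per value column of the frame.
    MyData = make_dataset(my_factor_df, 'MyData')
    return MyData.score.latest
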
def sql_with_dts(url):
    try:
        t = data(url % next(names), dshape='var * {A: datetime}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([(d,) for d in pd.date_range('2014-01-01', '2014-02-01')], t)
        try:
            yield t
        finally:
            drop(t)

def sql_with_timedeltas(url):
    try:
        t = data(url % next(names), dshape='var * {N: timedelta}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([(timedelta(seconds=n),) for n in range(10)], t)
        try:
            yield t
        finally:
            drop(t)

def sqla(url):
    try:
        t = data(url % next(names), dshape='var * {A: ?string, B: ?int32}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([('a', 1), (None, 1), ('c', None)], t)
        try:
            yield t
        finally:
            drop(t)

def sqlb(url):
    try:
        t = data(url % next(names), dshape='var * {A: string, B: int64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo([('a', 1), ('b', 2)], t)
        try:
            yield t
        finally:
            drop(t)

def big_sql(url):
    try:
        t = data(url % next(names), dshape='var * {A: string, B: int64}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        t = odo(zip(list('a' * 100), list(range(100))), t)
        try:
            yield t
        finally:
            drop(t)

def test_concat():
    d = {'a.csv': 'a,b\n1,2\n3,4',
         'b.csv': 'a,b\n5,6\n7,8'}
    with filetexts(d):
        a_rsc = data('a.csv')
        b_rsc = data('b.csv')

        a = symbol('a', discover(a_rsc))
        b = symbol('b', discover(b_rsc))

        tm.assert_frame_equal(
            odo(
                compute(concat(a, b), {a: a_rsc, b: b_rsc}),
                pd.DataFrame,
            ),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype='int64').reshape(4, 2),
                         columns=list('ab')),
        )

def start_requests(self):
    zip_path = getattr(self, 'zip_csv', 'data/nyc_zip_codes.csv')
    zips = bz.data(zip_path)
    zips = bz.compute(zips.Zip_Code)
    url_str = "https://www.yelp.com/search?find_desc=Restaurants&find_loc={}"
    urls = [url_str.format(z) for z in zips]
    self.logger.info("%s start urls for BizInfoSpider", str(len(urls)))
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeConsensusEstimatesLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    frames = []
    for sid, df in iteritems(mapping):
        frame = df.copy()
        frame[SID_FIELD_NAME] = sid
        frames.append(frame)
    return bz.data(pd.concat(frames).reset_index(drop=True)),

def test_csv_join():
    d = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
         'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(d):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))

        tm.assert_frame_equal(
            odo(
                compute(join(a, b, 'c'), {a: data_a, b: data_b}),
                pd.DataFrame,
            ),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4],
                                   [5, 3, 4, 6, 7]], dtype='int64'),
                         columns=list('cabde')),
        )

def test_groups():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')

        hdf = data('hdfstore://%s' % fn)
        assert (dshape(discover(hdf)) ==
                dshape(discover({'data': {'fixed': df}})))

        s = symbol('s', discover(hdf))
        assert list(compute(s.data.fixed, hdf).a) == [1, 2, 3, 4]
        hdf.data.close()

def test_hdfstore():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/appendable', format='table')
        df.to_hdf(fn, '/fixed')

        hdf = data('hdfstore://%s' % fn)
        s = symbol('s', discover(hdf))

        assert isinstance(compute(s.fixed, hdf),
                          (pd.DataFrame, pd.io.pytables.Fixed))
        assert isinstance(compute(s.appendable, hdf),
                          (pd.io.pytables.AppendableFrameTable, Chunks))

        s = symbol('s', discover(df))
        f = data('hdfstore://%s::/fixed' % fn)
        a = data('hdfstore://%s::/appendable' % fn)
        assert isinstance(pre_compute(s, a), Chunks)

        hdf.data.close()
        f.data.parent.close()
        a.data.parent.close()

def test_complex_expr(self):
    expr = bz.data(self.df, dshape=self.dshape)
    # put an Add in the table
    expr_with_add = bz.transform(expr, value=expr.value + 1)

    # Test that we can have complex expressions with no deltas
    from_blaze(
        expr_with_add,
        deltas=None,
        loader=self.garbage_loader,
        missing_values=self.missing_values,
    )

    with self.assertRaises(TypeError):
        from_blaze(
            expr.value + 1,  # put an Add in the column
            deltas=None,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

    deltas = bz.data(
        pd.DataFrame(columns=self.df.columns),
        dshape=self.dshape,
    )
    with self.assertRaises(TypeError):
        from_blaze(
            expr_with_add,
            deltas=deltas,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

    with self.assertRaises(TypeError):
        from_blaze(
            expr.value + 1,
            deltas=deltas,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeEarningsCalendarLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    return (bz.data(pd.concat(
        pd.DataFrame({
            ANNOUNCEMENT_FIELD_NAME: df[ANNOUNCEMENT_FIELD_NAME],
            TS_FIELD_NAME: df[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, df in iteritems(mapping)
    ).reset_index(drop=True)),)