def test_deltas(self, asset_info):
    """Deltas published one day later should overwrite the baseline rows."""
    expr = bz.Data(self.df, name='expr', dshape=self.dshape)
    deltas = bz.Data(self.df, dshape=self.dshape)
    # Bump every value by 10 and shift the delta timestamps forward a day.
    deltas = bz.Data(
        odo(
            bz.transform(
                deltas,
                value=deltas.value + 10,
                timestamp=deltas.timestamp + timedelta(days=1),
            ),
            pd.DataFrame,
        ),
        name='delta',
        dshape=self.dshape,
    )
    expected_views = keymap(pd.Timestamp, {
        '2014-01-02': np.array([[10.0, 11.0, 12.0],
                                [1.0, 2.0, 3.0]]),
        '2014-01-03': np.array([[11.0, 12.0, 13.0],
                                [2.0, 3.0, 4.0]]),
        '2014-01-04': np.array([[12.0, 13.0, 14.0],
                                [12.0, 13.0, 14.0]]),
    })
    nassets = len(asset_info)
    if nassets == 4:
        # The fourth asset never receives data; pad each view with NaNs.
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan]],
            expected_views,
        )
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([12] * nassets, [13] * nassets, [14] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        dates = self.dates
        dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
def test_deltas_only_one_delta_in_universe(self, asset_info):
    """Deltas touching only some sids must leave the other sids intact."""
    expr = bz.Data(self.df, name='expr', dshape=self.dshape)
    deltas = pd.DataFrame({
        'sid': [65, 66],
        'asof_date': [self.dates[1], self.dates[0]],
        'timestamp': [self.dates[2], self.dates[1]],
        'value': [10, 11],
    })
    deltas = bz.Data(deltas, name='deltas', dshape=self.dshape)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-02': np.array([[0.0, 11.0, 2.0],
                                [1.0, 2.0, 3.0]]),
        '2014-01-03': np.array([[10.0, 2.0, 3.0],
                                [2.0, 3.0, 4.0]]),
        '2014-01-04': np.array([[2.0, 3.0, 4.0],
                                [2.0, 3.0, 4.0]]),
    })
    nassets = len(asset_info)
    if nassets == 4:
        # Pad the views with NaN columns for the asset that has no data.
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan]],
            expected_views,
        )
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            data=np.array([11, 10, 4]).repeat(len(asset_info.index)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=['value'],
        )
        dates = self.dates
        dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
def test_id(self):
    """With no deltas, the pipeline output should mirror the raw frame."""
    expr = bz.Data(self.df, name='expr', dshape=self.dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule='ignore',
    )
    p = Pipeline()
    p.add(ds.value.latest, 'value')
    dates = self.dates
    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader, dates, finder,
        ).run_pipeline(p, dates[0], dates[-1])
        expected = self.df.drop('asof_date', axis=1).set_index(
            ['timestamp', 'sid'],
        )
        # Map the raw sids onto Asset objects so the indices compare equal.
        expected.index = pd.MultiIndex.from_product((
            expected.index.levels[0],
            finder.retrieve_all(expected.index.levels[1]),
        ))
        assert_frame_equal(result, expected, check_dtype=False)
def test_tabular(self):
    """``from_blaze`` on a whole table yields a memoized DataSet subclass."""
    name = 'expr'
    expr = bz.Data(self.df, name=name, dshape=self.dshape)
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )
    self.assertEqual(ds.__name__, name)
    self.assertTrue(issubclass(ds, DataSet))
    self.assertEqual(
        {c.name: c.dtype for c in ds.columns},
        {'sid': np.int64, 'value': np.float64},
    )
    # The datetime-typed metadata fields must not surface as columns.
    for field in ('timestamp', 'asof_date'):
        with self.assertRaises(AttributeError) as e:
            getattr(ds, field)
        self.assertIn("'%s'" % field, str(e.exception))
        self.assertIn("'datetime'", str(e.exception))
    # Memoization: a second identical call returns the very same class.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
        ),
        ds,
    )
def _test_id(self, df, dshape, expected, finder, add):
    """Shared helper: run a no-deltas pipeline over ``df`` and compare
    against ``expected``.

    NOTE(review): the ``finder`` parameter is immediately shadowed by the
    ``tmp_asset_finder`` context manager below and is never read — confirm
    whether callers rely on passing it.
    """
    expr = bz.Data(df, name='expr', dshape=dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    p = Pipeline()
    for column_name in add:
        p.add(getattr(ds, column_name).latest, column_name)
    dates = self.dates
    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader, dates, finder,
        ).run_pipeline(p, dates[0], dates[-1])
        assert_frame_equal(
            result,
            _utc_localize_index_level_0(expected),
            check_dtype=False,
        )
def plot(self):
    """Render a Bokeh termite plot for ``self.input_file``.

    Returns
    -------
    (script, div) : tuple of str
        Embeddable Bokeh components for the generated figure.
    """
    t = blz.Data(self.input_file)
    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())
    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = into(list, t['word'].distinct())
    topics = into(list, t['topic'].distinct())
    # Convert topics to strings so they can serve as a categorical axis.
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)
    data_source = ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=None)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    # p.xaxis().major_label_orientation = np.pi/3
    logging.info("generating termite plot for file %s" % self.input_file)
    script, div = components(p, CDN)
    return script, div
def test_custom_query_time_tz(self):
    """Timestamps shifted past the data query time in the query timezone
    should roll to the next session date."""
    df = self.df.copy()
    df['timestamp'] = (
        pd.DatetimeIndex(df['timestamp'], tz='EST') +
        timedelta(hours=8, minutes=44)
    ).tz_convert('utc').tz_localize(None)
    # .loc replaces the long-deprecated .ix; with the default integer index
    # the label slice 3:5 is inclusive, matching the old .ix behavior.
    df.loc[3:5, 'timestamp'] = pd.Timestamp('2014-01-01 13:45')
    expr = bz.Data(df, name='expr', dshape=self.dshape)
    loader = BlazeLoader(data_query_time=time(8, 45), data_query_tz='EST')
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )
    p = Pipeline()
    p.add(ds.value.latest, 'value')
    dates = self.dates
    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader, dates, finder,
        ).run_pipeline(p, dates[0], dates[-1])

        expected = df.drop('asof_date', axis=1)
        expected['timestamp'] = expected['timestamp'].dt.normalize().astype(
            'datetime64[ns]',
        )
        # Rows whose timestamp fell after the query time become visible
        # one day later.
        expected.loc[3:5, 'timestamp'] += timedelta(days=1)
        expected.set_index(['timestamp', 'sid'], inplace=True)
        expected.index = pd.MultiIndex.from_product((
            expected.index.levels[0],
            finder.retrieve_all(expected.index.levels[1]),
        ))
        assert_frame_equal(result, expected, check_dtype=False)
def bind_expression_to_resources(expr, resources):
    """
    Bind a Blaze expression to resources.

    Parameters
    ----------
    expr : bz.Expr
        The expression to which we want to bind resources.
    resources : dict[bz.Symbol -> any]
        Mapping from the atomic terms of ``expr`` to actual data resources.

    Returns
    -------
    bound_expr : bz.Expr
        ``expr`` with bound resources.
    """
    if resources is None:
        resources = {}
    # ``_subs`` stands for substitute; it is not actually private — blaze
    # prefixes its symbol-manipulation methods with underscores so they do
    # not collide with data column names.
    substitutions = {
        symbol: bz.Data(data, dshape=symbol.dshape)
        for symbol, data in iteritems(resources)
    }
    return expr._subs(substitutions)
def test_model_blaze():
    """A blaze Data source over a CSV adapts to the table-model protocol."""
    import blaze as bz
    csv_path = os.path.join(TDATA_ROOT, 'simple.csv')
    model = as_model(bz.Data(csv_path))
    assert model.header_shape == (1, 0)
    assert model.shape == (2, 3)
    assert materialize(model) == [[1, 2, 3], [4, 5, 6]]
    assert materialize_header(model, 0) == [['a', 'b', 'c']]
def test_id_macro_dataset(self):
    """A macro (sid-less) dataset broadcasts each value to every asset."""
    expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule='ignore',
    )
    p = Pipeline()
    p.add(ds.value.latest, 'value')
    dates = self.dates
    asset_info = asset_infos[0][0]
    # Pass ``equities`` by keyword: every other call site in this file uses
    # ``tmp_asset_finder(equities=...)``; positionally the frame would bind
    # to the wrong parameter.
    with tmp_asset_finder(equities=asset_info) as finder:
        result = SimplePipelineEngine(
            loader, dates, finder,
        ).run_pipeline(p, dates[0], dates[-1])

        nassets = len(asset_info)
        expected = pd.DataFrame(
            list(concatv([0] * nassets, [1] * nassets, [2] * nassets)),
            index=pd.MultiIndex.from_product((
                self.macro_df.timestamp,
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        assert_frame_equal(result, expected, check_dtype=False)
def blaze_from_uri(uri, enc):
    """Open ``uri`` as a blaze Data source using text encoding ``enc``."""
    import blaze
    # blaze expects bare paths, so strip a file:// scheme if present to
    # make file:// uris work uniformly.
    prefix = 'file://'
    if uri.startswith(prefix):
        uri = uri[len(prefix):]
    return blaze.Data(uri, encoding=enc)
def test_id_macro_dataset_multiple_columns(self):
    """
    input (df):
       asof_date  timestamp  other  value
    0 2014-01-01 2014-01-01      1      0
    3 2014-01-02 2014-01-02      2      1
    6 2014-01-03 2014-01-03      3      2

    output (expected):
                              other  value
    2014-01-01 Equity(65 [A])     1      0
               Equity(66 [B])     1      0
               Equity(67 [C])     1      0
    2014-01-02 Equity(65 [A])     2      1
               Equity(66 [B])     2      1
               Equity(67 [C])     2      1
    2014-01-03 Equity(65 [A])     3      2
               Equity(66 [B])     3      2
               Equity(67 [C])     3      2
    """
    df = self.macro_df.copy()
    df['other'] = df.value + 1
    # Extend the dshape record with the new column, typed like 'value'.
    fields = OrderedDict(self.macro_dshape.measure.fields)
    fields['other'] = fields['value']
    expr = bz.Data(df, name='expr', dshape=var * Record(fields))
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )
    p = Pipeline()
    p.add(ds.value.latest, 'value')
    p.add(ds.other.latest, 'other')
    dates = self.dates
    asset_info = asset_infos[0][0]
    with tmp_asset_finder(equities=asset_info) as finder:
        result = SimplePipelineEngine(
            loader, dates, finder,
        ).run_pipeline(p, dates[0], dates[-1])

        expected = pd.DataFrame(
            np.array([[0, 1], [1, 2], [2, 3]]).repeat(3, axis=0),
            index=pd.MultiIndex.from_product((
                df.timestamp,
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value', 'other'),
        ).sort_index(axis=1)
        assert_frame_equal(
            result,
            expected.sort_index(axis=1),
            check_dtype=False,
        )
def test_no_concrete_loader_defined(self):
    """Constructing the loader subclass without a concrete loader fails."""
    expected_pattern = re.escape(ABSTRACT_CONCRETE_LOADER_ERROR)
    with self.assertRaisesRegexp(TypeError, expected_pattern):
        BlazeEventDataSetLoaderNoConcreteLoader(
            bz.Data(
                pd.DataFrame({
                    ANNOUNCEMENT_FIELD_NAME: dtx,
                    SID_FIELD_NAME: 0,
                }),
            ),
        )
def test_auto_deltas_fail_raise(self):
    """With ``no_deltas_rule='raise'``, a missing deltas table is an error."""
    loader = BlazeLoader()
    expr = bz.Data(self.df, dshape=self.dshape)
    with self.assertRaises(ValueError) as e:
        from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.raise_,
        )
    # The failing expression should be named in the error message.
    self.assertIn(str(expr), str(e.exception))
def test_deltas_macro(self):
    """Deltas on a macro (sid-less) dataset broadcast across all assets."""
    asset_info = asset_infos[0][0]
    expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
    deltas = bz.Data(
        self.macro_df.iloc[:-1],
        name='deltas',
        dshape=self.macro_dshape,
    )
    # Shift the delta timestamps forward one day and bump values by 10.
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
        '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
    })
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        dates = self.dates
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
def _make_blaze_data_obj(resource, columns=None, datashape=None):
    """Wrap ``resource`` in a blaze Data object, optionally projecting it
    down to the given ``columns`` via a select."""
    if columns is None:
        data = resource
    else:
        data = select([resource.c[col] for col in columns])
    return bz.Data(data, dshape=datashape)
def test_complex_expr(self):
    """Complex table expressions are accepted only without deltas; complex
    column expressions and expression+deltas combinations raise."""
    expr = bz.Data(self.df, dshape=self.dshape)
    # put an Add in the table
    expr_with_add = bz.transform(expr, value=expr.value + 1)

    # A complex table expression with no deltas is allowed.
    from_blaze(
        expr_with_add,
        deltas=None,
        loader=self.garbage_loader,
        missing_values=self.missing_values,
    )

    # A complex *column* expression is rejected.
    with self.assertRaises(TypeError):
        from_blaze(
            expr.value + 1,  # put an Add in the column
            deltas=None,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

    deltas = bz.Data(
        pd.DataFrame(columns=self.df.columns),
        dshape=self.dshape,
    )
    # Combining a complex expression with deltas fails in either form.
    with self.assertRaises(TypeError):
        from_blaze(
            expr_with_add,
            deltas=deltas,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )
    with self.assertRaises(TypeError):
        from_blaze(
            expr.value + 1,
            deltas=deltas,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )
def test_blaze_data_no_fields(self):
    """A blaze Data source built from a DataFrame exposes its columns and
    index through the DataAdapter."""
    import blaze
    frame = pd.DataFrame(self._values)
    values = blaze.Data(frame)
    da = DataAdapter(values)
    assert_array_equal(da.values(), list(self._values.values()))
    expected_columns = ['first', 'second', 'third']
    self.assertEqual(da.columns, expected_columns)
    self.assertEqual(da.keys(), expected_columns)
    self.assertEqual(da.index, [0, 1, 2])
    xs, _ = DataAdapter.get_index_and_data(values, None)
    assert_array_equal([0, 1, 2], xs)
def loader_args(self, dates):
    """Replace the parent's mapping argument with a blaze Data expression
    built by concatenating one frame per sid."""
    _, mapping = super(
        BlazeEarningsCalendarLoaderTestCase,
        self,
    ).loader_args(dates)
    frames = (
        pd.DataFrame({
            ANNOUNCEMENT_FIELD_NAME: earning_dates,
            TS_FIELD_NAME: earning_dates.index,
            SID_FIELD_NAME: sid,
        })
        for sid, earning_dates in iteritems(mapping)
    )
    return (bz.Data(pd.concat(frames).reset_index(drop=True)),)
def test_auto_deltas_fail_warn(self):
    """With ``no_deltas_rule='warn'``, a missing deltas table warns once."""
    with warnings.catch_warnings(record=True) as ws:
        warnings.simplefilter('always')
        loader = BlazeLoader()
        expr = bz.Data(self.df, dshape=self.dshape)
        from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.warn,
        )
    # Exactly one NoDeltasWarning, naming the offending expression.
    self.assertEqual(len(ws), 1)
    warning = ws[0].message
    self.assertIsInstance(warning, NoDeltasWarning)
    self.assertIn(str(expr), str(warning))
def pipeline_event_loader_args(self, dates):
    """Replace the parent's mapping with a blaze Data expression made by
    concatenating one dividends frame per sid."""
    _, mapping = super(
        BlazeDividendsByPayDateLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    frames = (
        pd.DataFrame({
            PAY_DATE_FIELD_NAME: df[PAY_DATE_FIELD_NAME],
            TS_FIELD_NAME: df[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
            CASH_AMOUNT_FIELD_NAME: df[CASH_AMOUNT_FIELD_NAME],
        })
        for sid, df in iteritems(mapping)
    )
    return (bz.Data(pd.concat(frames).reset_index(drop=True)),)
def termite(modeled_corpus, plot_title="Termite plot", topn=15):
    """A Bokeh Termite Visualization for LDA results analysis.

    Parameters
    ----------
    input_file : str or pandas DataFrame
        A pandas dataframe from a topik model get_termite_data() containing
        columns "word", "topic" and "weight".
        May also be a string, in which case the string is a filename of a csv
        file with the above columns.
    title : str
        The title for your termite plot

    Examples
    --------
    >>> plot = termite(test_model_output, plot_title="My model results", topn=5)
    """
    prepared_model_vis_data = _termite_data(modeled_corpus, topn)
    t = blz.Data(prepared_model_vis_data)
    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Scale the weights into [0, 50] to define the size of the circle for
    # the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = into(list, t['word'].distinct())
    topics = into(list, t['topic'].distinct())
    # Convert topics to strings for the categorical x-axis.
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)
    data_source = sources.ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=plot_title)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    return p
def before_request():
    """Attach the open, non-period-priced flat listings to flask's ``g``."""
    ds = bz.Data(
        bz.resource('mongodb://localhost/scrapy::flat'),
        dshape="""var * { open: bool, price: real, price_period: ?string, area: real, url: string, city: string, district: string, project: string, rooms: ?int }""",
    )
    # Blaze expressions require ``==`` (not ``is``) to build the predicate.
    is_open = ds.open == True
    no_period = ds.price_period == None
    g.ds = ds[is_open & no_period]
def pipeline_event_loader_args(self, dates):
    """Replace the parent's mapping with a blaze Data expression made by
    concatenating one announcement frame per sid."""
    _, mapping = super(
        BlazeDividendsByAnnouncementDateTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    frames = (
        pd.DataFrame({
            ANNOUNCEMENT_FIELD_NAME: df[ANNOUNCEMENT_FIELD_NAME],
            TS_FIELD_NAME: df[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
            CASH_AMOUNT_FIELD_NAME: df[CASH_AMOUNT_FIELD_NAME],
            CURRENCY_FIELD_NAME: df[CURRENCY_FIELD_NAME],
            DIVIDEND_TYPE_FIELD_NAME: df[DIVIDEND_TYPE_FIELD_NAME],
        })
        for sid, df in iteritems(mapping)
    )
    return (bz.Data(pd.concat(frames).reset_index(drop=True)),)
def loader_args(self, dates):
    """Replace the parent's mapping with a blaze Data expression made by
    concatenating one buyback-authorization frame per sid."""
    _, mapping = super(
        BlazeShareBuybackAuthLoaderTestCase,
        self,
    ).loader_args(dates)
    frames = (
        pd.DataFrame({
            BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
            SHARE_COUNT_FIELD_NAME: frame[SHARE_COUNT_FIELD_NAME],
            TS_FIELD_NAME: frame[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, frame in iteritems(mapping)
    )
    return (bz.Data(pd.concat(frames).reset_index(drop=True)),)
def test_auto_deltas(self):
    """``from_blaze`` auto-discovers the ``<name>_deltas`` sibling field."""
    expr = bz.Data(
        {
            'ds': self.df,
            'ds_deltas': pd.DataFrame(columns=self.df.columns),
        },
        dshape=var * Record((
            ('ds', self.dshape.measure),
            ('ds_deltas', self.dshape.measure),
        )),
    )
    loader = BlazeLoader()
    ds = from_blaze(expr.ds, loader=loader)
    self.assertEqual(len(loader), 1)
    exprdata = loader[ds]
    # The registered data must be exactly the two fields of ``expr``.
    self.assertTrue(exprdata.expr.isidentical(expr.ds))
    self.assertTrue(exprdata.deltas.isidentical(expr.ds_deltas))
def before_request():
    """Attach the open, plausible car listings to flask's ``g``."""
    ds = bz.Data(
        bz.resource('mongodb://localhost/scrapy::car'),
        dshape="""var * { open: bool, production_year: int, mileage: ?int, price: real, price_period: ?string, url: string, brand: string, color: string }""",
    )
    # Blaze expressions require ``==`` (not ``is``) to build predicates.
    # Filter: open listing, fixed price, sane mileage, and at most 20
    # years old.
    g.ds = ds[
        (ds.open == True) &
        (ds.price_period == None) &
        (ds.mileage > 0) &
        (ds.mileage < 1e+6) &
        (ds.production_year > (dt.date.today().year - 20))
    ]
def test_non_numpy_field(self):
    """Fields with no numpy dtype become NonNumpyField placeholders that
    raise on normal attribute access."""
    expr = bz.Data(
        [],
        dshape=""" var * { a: datetime, asof_date: datetime, timestamp: datetime, }""",
    )
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )
    # Normal access raises, but the placeholder is still stored on the
    # class and reachable by bypassing the descriptor machinery.
    with self.assertRaises(AttributeError):
        ds.a
    self.assertIsInstance(
        object.__getattribute__(ds, 'a'),
        NonNumpyField,
    )
def test_column(self):
    """``from_blaze`` on a single column yields a memoized BoundColumn."""
    exprname = 'expr'
    expr = bz.Data(self.df, name=exprname, dshape=self.dshape)
    value = from_blaze(
        expr.value,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    self.assertEqual(value.name, 'value')
    self.assertIsInstance(value, BoundColumn)
    self.assertIs(value.dtype, float64_dtype)

    # Memoization: the same column expression yields the same BoundColumn.
    self.assertIs(
        from_blaze(
            expr.value,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ),
        value,
    )
    # Accessing the column through the dataset hits the same cache entry.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ).value,
        value,
    )
    # Walking back up the tree from the column reaches the same dataset.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ),
        value.dataset,
    )
    self.assertEqual(value.dataset.__name__, exprname)
def test_missing_asof(self):
    """Omitting the required ``asof_date`` column raises a TypeError that
    names both the missing column and the offending dshape."""
    expr = bz.Data(
        self.df.loc[:, ['sid', 'value', 'timestamp']],
        name='expr',
        dshape=""" var * { sid: ?int64, value: float64, timestamp: datetime, }""",
    )
    with self.assertRaises(TypeError) as e:
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
        )
    message = str(e.exception)
    self.assertIn("'asof_date'", message)
    self.assertIn(repr(str(expr.dshape.measure)), message)