def correct_commodities():
    src_dir = path.join(data_dir, 'agmarknet/by_commodity')
    init_dir = os.getcwd()
    os.chdir(src_dir)
    folders = glob.glob('*')
    csv_dir = os.getcwd()
    for folder in folders:
        os.chdir(path.join(csv_dir, folder))
        files = glob.glob('*_all.csv')
        for file in files:
            # Have to use resource to discover URIs
            csvr = odo.resource(path.join(csv_dir, folder, file))
            num_col = len(odo.discover(csvr)[1].types)
            ds = None
            if num_col == 9:
                ds = bz.dshape(
                    "var * {date: datetime, state: ?string, market: ?string, "
                    "commodity: ?string, variety: ?string, arrival: ?string, "
                    "min: ?string, max: ?string, modal: ?string}")
            elif num_col == 10:
                ds = bz.dshape(
                    "var * {date: datetime, state: ?string, market: ?string, "
                    "commodity: ?string, variety: ?string, arrival: ?string, "
                    "grade: ?string, min: ?string, max: ?string, modal: ?string}")
            else:
                ds = odo.discover(csvr)
            d = bz.Data(path.join(csv_dir, folder, file), dshape=ds)
            print(d.dshape.measure)
            d = bz.transform(d, commodity=d.commodity.map(lambda x: x.strip(),
                                                          'string'))
            d = bz.transform(d, commodity=d.commodity.map(
                lambda x: spelling_dict[x] if x in spelling_dict else x,
                'string'))
            print(d.dshape.measure)
            print(list(bz.compute(d.commodity)))
    os.chdir(init_dir)
def _ensure_timestamp_field(dataset_expr, deltas):
    """Verify that the baseline and deltas expressions have a timestamp field.

    If there is not a ``TS_FIELD_NAME`` on either of the expressions, it will
    be copied from the ``AD_FIELD_NAME``. If one is provided, then we will
    verify that it is the correct dshape.

    Parameters
    ----------
    dataset_expr : Expr
        The baseline expression.
    deltas : Expr or None
        The deltas expression if any was provided.

    Returns
    -------
    dataset_expr, deltas : Expr
        The new baseline and deltas expressions to use.
    """
    measure = dataset_expr.dshape.measure
    if TS_FIELD_NAME not in measure.names:
        dataset_expr = bz.transform(
            dataset_expr,
            **{TS_FIELD_NAME: dataset_expr[AD_FIELD_NAME]}
        )
        if deltas is not None:
            deltas = bz.transform(
                deltas,
                **{TS_FIELD_NAME: deltas[AD_FIELD_NAME]}
            )
    else:
        _check_datetime_field(TS_FIELD_NAME, measure)
    return dataset_expr, deltas
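# A minimal sketch (not from the source) of the column-copying pattern used by
# _ensure_timestamp_field above: bz.transform(expr, **{name: column}) appends a
# new column to an expression. The field names and the tiny frame below are
# hypothetical, standing in for the constants used above.
import blaze as bz
import pandas as pd

TS_FIELD_NAME = 'timestamp'   # assumed field names, for illustration only
AD_FIELD_NAME = 'asof_date'

frame = pd.DataFrame({
    'asof_date': pd.to_datetime(['2014-01-01', '2014-01-02']),
    'value': [1.0, 2.0],
})
expr = bz.data(frame)
if TS_FIELD_NAME not in expr.fields:
    # duplicate asof_date under the timestamp name, as the helper above does
    expr = bz.transform(expr, **{TS_FIELD_NAME: expr[AD_FIELD_NAME]})
print(bz.compute(expr))   # the result now carries both asof_date and timestamp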
def test_nested_transform():
    d = {'timestamp': [1379613528, 1379620047],
         'platform': ["Linux", "Windows"]}
    df = DataFrame(d)
    t = symbol('t', discover(df))
    t = transform(t, timestamp=t.timestamp.map(datetime.fromtimestamp,
                                               schema='datetime'))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    result = compute(expr, df)
    df['timestamp'] = df.timestamp.map(datetime.fromtimestamp)
    df['date'] = df.timestamp.map(lambda x: x.date())
    assert str(result) == str(df)
def test_nested_transform():
    d = {'timestamp': [1379613528, 1379620047],
         'platform': ["Linux", "Windows"]}
    df = DataFrame(d)
    t = symbol('t', discover(df))
    t = transform(t, timestamp=t.timestamp.map(datetime.fromtimestamp,
                                               schema='datetime'))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    result = compute(expr, df)
    df['timestamp'] = df.timestamp.map(datetime.fromtimestamp)
    df['date'] = df.timestamp.map(lambda x: x.date())
    tm.assert_frame_equal(result, df)
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))
    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        odo(compute(transformed.dist.max(), nyc), float) ==
        odo(compute(transformed.dist, nyc), pd.Series).max().item()
    )
def test_str_does_not_repr():
    # see GH issue #1240.
    d = Data([('aa', 1), ('b', 2)],
             name="ZZZ",
             dshape='2 * {a: string, b: int64}')
    expr = transform(d, c=d.a.strlen() + d.b)
    assert str(expr) == (
        "Merge(_child=ZZZ, children=(ZZZ, label(strlen(_child=ZZZ.a) + ZZZ.b, 'c')))"
    )
def plot(self, output_file="termite.html"):
    import blaze as blz
    from odo import into
    import pandas as pd
    import bokeh.plotting as plt
    from bokeh.models.sources import ColumnDataSource

    t = blz.Data(self.input_file)
    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)

    plt.output_file(output_file)
    data_source = ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=self.title)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    plt.show(p)
def plot(self, output_file="termite.html"):
    t = blz.Data(self.input_file)
    df = pd.read_csv(self.input_file)

    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)

    plt.output_file(output_file)
    data_source = ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=self.title)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    # p.xaxis().major_label_orientation = np.pi/3
    logging.info("generating termite plot for file %s" % self.input_file)
    plt.show(p)
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))
    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        compute(transformed.dist.max(), nyc, return_type=float) ==
        compute(transformed.dist, nyc, return_type=pd.Series).max()
    )
def plot(self):
    t = blz.Data(self.input_file)
    df = pd.read_csv(self.input_file)

    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)
    data_source = ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=None)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    # p.xaxis().major_label_orientation = np.pi/3
    logging.info("generating termite plot for file %s" % self.input_file)

    script, div = components(p, CDN)
    return script, div
def test_novel_deltas_macro(self):
    asset_info = asset_infos[0][0]
    base_dates = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-04'),
    ])
    baseline = pd.DataFrame({
        'value': (0, 1),
        'asof_date': base_dates,
        'timestamp': base_dates,
    })
    expr = bz.Data(baseline, name='expr', dshape=self.macro_dshape)
    deltas = bz.Data(baseline, name='deltas', dshape=self.macro_dshape)
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-03': repeat_last_axis(
            np.array([10.0, 10.0, 10.0]),
            nassets,
        ),
        '2014-01-06': repeat_last_axis(
            np.array([10.0, 10.0, 11.0]),
            nassets,
        ),
    })
    cal = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-02'),
        pd.Timestamp('2014-01-03'),
        # omitting the 4th and 5th to simulate a weekend
        pd.Timestamp('2014-01-06'),
    ])
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )
def insert_commodities(df, mongo_str, batch=False):
    ### TODO: load this list from commodities/selected_commodities.json
    selected_commodities = json.load(
        open(path.join(data_dir, 'commodities', 'selected_commodities.json')))
    to_strip = ''
    init_dir = os.getcwd()
    if batch:
        agmarknet_dir = path.join(data_dir, 'agmarknet', 'by_commodity')
        to_strip = '_stacked_localized.csv'
    else:
        agmarknet_dir = path.join(data_dir, 'agmarknet', 'by_date_and_commodity')
        to_strip = '_localized.csv'
    for cat, comm_list in selected_commodities.items():
        ### NOTE: following is just for testing
        if cat != 'Cereals':
            continue
        cat_folder = name_to_fs(cat).replace('-', '_')
        src_folder = path.join(agmarknet_dir, cat_folder, 'integrated')
        os.chdir(src_folder)
        files = glob.glob('*.csv')
        print(files)
        selected_files = []
        # TODO: test for Coffee and Tea
        print(comm_list)
        for comm in comm_list:
            # need to filter with every commodity name selected :/
            print(name_to_fs(comm))
            selected_files += list(filter(
                lambda x: name_to_fs(comm) == x.replace(to_strip, ''), files))
        print(selected_files)
        # map commodity name to filename?
        # filter with comm_list
        for filename in selected_files:
            print(filename)
            coll = filename.replace(to_strip, '')
            coll = coll.replace('-', '.')
            coll = coll.lower()
            coll = 'market.' + coll + '.varieties'
            print(coll)
            print('Inserting {0} into collection \"{1}\"..'.format(filename, coll))
            target = mongo_str + coll
            print(target)
            d = bz.Data(filename)
            #print('shape', d.dshape)
            #nrows = bz.compute(d.count())
            #print('rows', nrows)
            # renaming columns in blaze
            d = d.relabel(min='minPrice', max='maxPrice', modal='modalPrice')  #, arrival='commodityTonnage')#, commodity_translated='commodityTranslated')
            # NOTE: assumption of normal distribution since underlying price
            # discovery process not known:
            # -> modal price = mean price
            """
            df.rename(columns={'modal': 'minPrice'}, inplace=True)
            df.rename(columns={'modal': 'maxPrice'}, inplace=True)
            df.rename(columns={'modal': 'modalPrice'}, inplace=True)
            """
            d = bz.transform(d, varietyTonnage=d.commodityTonnage.map(
                lambda x: np.nan, 'float64'))
            odo.odo(d, target)
    os.chdir(init_dir)
    return
def test_dplyr_transform():
    df = DataFrame({'timestamp': pd.date_range('now', periods=5)})
    t = symbol('t', discover(df))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    lhs = compute(expr, df)
    rhs = pd.concat([df, Series(df.timestamp.map(lambda x: x.date()),
                                name='date').to_frame()], axis=1)
    tm.assert_frame_equal(lhs, rhs)
def test_dplyr_transform():
    df = DataFrame({'timestamp': pd.date_range('now', periods=5)})
    t = symbol('t', discover(df))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    lhs = compute(expr, df)
    rhs = pd.concat([df, Series(df.timestamp.map(lambda x: x.date()),
                                name='date').to_frame()], axis=1)
    assert str(lhs) == str(rhs)
def test_novel_deltas(self, asset_info):
    base_dates = pd.DatetimeIndex(
        [pd.Timestamp("2014-01-01"), pd.Timestamp("2014-01-04")])
    repeated_dates = base_dates.repeat(3)
    baseline = pd.DataFrame(
        {
            "sid": self.sids * 2,
            "value": (0, 1, 2, 1, 2, 3),
            "asof_date": repeated_dates,
            "timestamp": repeated_dates,
        }
    )
    expr = bz.Data(baseline, name="expr", dshape=self.dshape)
    deltas = bz.Data(baseline, name="deltas", dshape=self.dshape)
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    expected_views = keymap(
        pd.Timestamp,
        {
            "2014-01-03": np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0]]),
            "2014-01-06": np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [11.0, 12.0, 13.0]]),
        },
    )
    if len(asset_info) == 4:
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan, np.nan]],
            expected_views,
        )
        expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
    else:
        expected_output_buffer = [10, 11, 12, 11, 12, 13]
    cal = pd.DatetimeIndex(
        [
            pd.Timestamp("2014-01-01"),
            pd.Timestamp("2014-01-02"),
            pd.Timestamp("2014-01-03"),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp("2014-01-06"),
        ]
    )
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            expected_output_buffer,
            index=pd.MultiIndex.from_product(
                (sorted(expected_views.keys()),
                 finder.retrieve_all(asset_info.index))
            ),
            columns=("value",),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )
def test_deltas(self, asset_info):
    expr = bz.Data(self.df, name='expr', dshape=self.dshape)
    deltas = bz.Data(self.df, dshape=self.dshape)
    deltas = bz.Data(
        odo(
            bz.transform(
                deltas,
                value=deltas.value + 10,
                timestamp=deltas.timestamp + timedelta(days=1),
            ),
            pd.DataFrame,
        ),
        name='delta',
        dshape=self.dshape,
    )
    expected_views = keymap(pd.Timestamp, {
        '2014-01-02': np.array([[10.0, 11.0, 12.0],
                                [1.0, 2.0, 3.0]]),
        '2014-01-03': np.array([[11.0, 12.0, 13.0],
                                [2.0, 3.0, 4.0]]),
        '2014-01-04': np.array([[12.0, 13.0, 14.0],
                                [12.0, 13.0, 14.0]]),
    })
    nassets = len(asset_info)
    if nassets == 4:
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan]],
            expected_views,
        )
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([12] * nassets, [13] * nassets, [14] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        dates = self.dates
        dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
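# A minimal sketch (not from the source) of what the deltas construction in the
# tests above evaluates to: every value is bumped by 10 and every timestamp is
# pushed forward one day, then the lazy expression is materialized with odo.
# The small baseline frame below is hypothetical.
from datetime import timedelta

import blaze as bz
import pandas as pd
from odo import odo

baseline = pd.DataFrame({
    'value': [1.0, 2.0],
    'asof_date': pd.to_datetime(['2014-01-01', '2014-01-02']),
    'timestamp': pd.to_datetime(['2014-01-01', '2014-01-02']),
})
deltas = bz.data(baseline, name='deltas')
deltas = bz.transform(
    deltas,
    value=deltas.value + 10,
    timestamp=deltas.timestamp + timedelta(days=1),
)
# value becomes 11.0 / 12.0 and timestamp moves to 2014-01-02 / 2014-01-03
print(odo(deltas, pd.DataFrame))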
def test_transform_with_common_subexpression():
    df = DataFrame(np.random.rand(5, 2), columns=list('ab'))
    t = symbol('t', discover(df))
    expr = transform(t, c=t.a - t.a % 3, d=t.a % 3)
    result = compute(expr, df)
    expected = pd.concat(
        [df[c] for c in df.columns] +
        [pd.Series(df.a - df.a % 3, name='c'),
         pd.Series(df.a % 3, name='d')],
        axis=1,
    )
    tm.assert_frame_equal(result, expected)
def test_deltas(self, asset_info):
    expr = bz.Data(self.df, name='expr', dshape=self.dshape)
    deltas = bz.Data(self.df, dshape=self.dshape)
    deltas = bz.Data(
        odo(
            bz.transform(
                deltas,
                value=deltas.value + 10,
                timestamp=deltas.timestamp + timedelta(days=1),
            ),
            pd.DataFrame,
        ),
        name='delta',
        dshape=self.dshape,
    )
    expected_views = keymap(
        pd.Timestamp,
        {
            '2014-01-02': np.array([[10.0, 11.0, 12.0],
                                    [1.0, 2.0, 3.0]]),
            '2014-01-03': np.array([[11.0, 12.0, 13.0],
                                    [2.0, 3.0, 4.0]]),
            '2014-01-04': np.array([[12.0, 13.0, 14.0],
                                    [12.0, 13.0, 14.0]]),
        })
    nassets = len(asset_info)
    if nassets == 4:
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan]],
            expected_views,
        )
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([12] * nassets, [13] * nassets, [14] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        dates = self.dates
        dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
def test_transform_with_common_subexpression():
    df = DataFrame(np.random.rand(5, 2), columns=list('ab'))
    t = symbol('t', discover(df))
    expr = transform(t, c=t.a - t.a % 3, d=t.a % 3)
    result = compute(expr, df)
    expected = pd.concat(
        [df[c] for c in df.columns] +
        [
            pd.Series(df.a - df.a % 3, name='c'),
            pd.Series(df.a % 3, name='d')
        ],
        axis=1
    )
    tm.assert_frame_equal(result, expected)
def test_coerce_on_select(nyc):
    t = symbol('t', discover(nyc))
    t = t[(t.pickup_latitude >= 40.477399) &
          (t.pickup_latitude <= 40.917577) &
          (t.dropoff_latitude >= 40.477399) &
          (t.dropoff_latitude <= 40.917577) &
          (t.pickup_longitude >= -74.259090) &
          (t.pickup_longitude <= -73.700272) &
          (t.dropoff_longitude >= -74.259090) &
          (t.dropoff_longitude <= -73.700272) &
          (t.passenger_count < 6)]
    t = transform(t, pass_count=t.passenger_count + 1)
    result = compute(t.pass_count.coerce('float64'), nyc, return_type='native')
    s = odo(result, pd.Series)
    expected = compute(t, nyc, return_type=pd.DataFrame) \
        .passenger_count.astype('float64') + 1.0
    assert list(s) == list(expected)
def termite(modeled_corpus, plot_title="Termite plot", topn=15):
    """A Bokeh Termite Visualization for LDA results analysis.

    Parameters
    ----------
    input_file : str or pandas DataFrame
        A pandas dataframe from a topik model get_termite_data() containing
        columns "word", "topic" and "weight". May also be a string, in which
        case the string is a filename of a csv file with the above columns.
    title : str
        The title for your termite plot

    Examples
    --------
    >>> plot = termite(test_model_output, plot_title="My model results", topn=5)
    """
    prepared_model_vis_data = _termite_data(modeled_corpus, topn)
    t = blz.Data(prepared_model_vis_data)
    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)

    data_source = sources.ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=plot_title)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    return p
def _ad_as_ts(expr):
    """Duplicate the asof_date column as the timestamp column.

    Parameters
    ----------
    expr : Expr or None
        The expression to change the columns of.

    Returns
    -------
    transformed : Expr or None
        The transformed expression or None if ``expr`` is None.
    """
    return (None if expr is None else bz.transform(
        expr, **{TS_FIELD_NAME: expr[AD_FIELD_NAME]}))
def test_multiple_columns_in_transform(nyc):
    t = symbol('t', discover(nyc))
    t = t[(t.pickup_latitude >= 40.477399) &
          (t.pickup_latitude <= 40.917577) &
          (t.dropoff_latitude >= 40.477399) &
          (t.dropoff_latitude <= 40.917577) &
          (t.pickup_longitude >= -74.259090) &
          (t.pickup_longitude <= -73.700272) &
          (t.dropoff_longitude >= -74.259090) &
          (t.dropoff_longitude <= -73.700272) &
          (t.passenger_count < 6)]
    hours = t.trip_time_in_secs.coerce('float64') / 3600.0
    avg_speed_in_mph = t.trip_distance / hours
    d = transform(t,
                  avg_speed_in_mph=avg_speed_in_mph,
                  mycol=avg_speed_in_mph + 1)
    df = compute(d[d.avg_speed_in_mph <= 200], nyc, return_type=pd.DataFrame)
    assert not df.empty
def test_multiple_columns_in_transform(nyc):
    t = symbol('t', discover(nyc))
    t = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    hours = t.trip_time_in_secs.coerce('float64') / 3600.0
    avg_speed_in_mph = t.trip_distance / hours
    d = transform(t,
                  avg_speed_in_mph=avg_speed_in_mph,
                  mycol=avg_speed_in_mph + 1)
    df = compute(d[d.avg_speed_in_mph <= 200], nyc, return_type=pd.DataFrame)
    assert not df.empty
def _ad_as_ts(expr):
    """Duplicate the asof_date column as the timestamp column.

    Parameters
    ----------
    expr : Expr or None
        The expression to change the columns of.

    Returns
    -------
    transformed : Expr or None
        The transformed expression or None if ``expr`` is None.
    """
    return (
        None
        if expr is None else
        bz.transform(expr, **{TS_FIELD_NAME: expr[AD_FIELD_NAME]})
    )
def test_deltas_macro(self):
    asset_info = asset_infos[0][0]
    expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
    deltas = bz.Data(
        self.macro_df.iloc[:-1],
        name='deltas',
        dshape=self.macro_dshape,
    )
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(
        pd.Timestamp,
        {
            '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
            '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
        })
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        dates = self.dates
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
def termite(modeled_corpus, plot_title="Termite plot", topn=15):
    """A Bokeh Termite Visualization for LDA results analysis.

    Parameters
    ----------
    input_file : str or pandas DataFrame
        A pandas dataframe from a topik model get_termite_data() containing
        columns "word", "topic" and "weight". May also be a string, in which
        case the string is a filename of a csv file with the above columns.
    title : str
        The title for your termite plot

    Examples
    --------
    >>> plot = termite(test_model_output, plot_title="My model results", topn=5)
    """
    prepared_model_vis_data = _termite_data(modeled_corpus, topn)
    t = blz.Data(prepared_model_vis_data)
    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)

    data_source = sources.ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=plot_title)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    return p
def test_coerce_on_select(nyc):
    t = symbol('t', discover(nyc))
    t = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    t = transform(t, pass_count=t.passenger_count + 1)
    result = compute(t.pass_count.coerce('float64'), nyc, return_type='native')
    s = odo(result, pd.Series)
    expected = compute(t, nyc, return_type=pd.DataFrame) \
        .passenger_count.astype('float64') + 1.0
    assert list(s) == list(expected)
def test_deltas_macro(self):
    asset_info = asset_infos[0][0]
    expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
    deltas = bz.Data(
        self.macro_df.iloc[:-1],
        name='deltas',
        dshape=self.macro_dshape,
    )
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
        '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
    })
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        dates = self.dates
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
def test_str_does_not_repr():
    # see GH issue #1240.
    d = data(
        [('aa', 1), ('b', 2)],
        name="ZZZ",
        dshape='2 * {a: string, b: int64}',
    )
    expr = transform(d, c=d.a.str.len() + d.b)
    assert (
        normalize(str(expr)) == normalize("""
            Merge(
                args=(ZZZ, label(len(_child=ZZZ.a) + ZZZ.b, 'c')),
                _varargsexpr=VarArgsExpr(
                    _inputs=(ZZZ, label(len(_child=ZZZ.a) + ZZZ.b, 'c'))
                ),
                _shape=(2,)
            )
        """)
    )
def test_complex_expr(self):
    expr = bz.data(self.df, dshape=self.dshape)
    # put an Add in the table
    expr_with_add = bz.transform(expr, value=expr.value + 1)

    # Test that we can have complex expressions with no deltas
    from_blaze(
        expr_with_add,
        deltas=None,
        loader=self.garbage_loader,
        missing_values=self.missing_values,
    )

    with self.assertRaises(TypeError):
        from_blaze(
            expr.value + 1,  # put an Add in the column
            deltas=None,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

    deltas = bz.data(
        pd.DataFrame(columns=self.df.columns),
        dshape=self.dshape,
    )
    with self.assertRaises(TypeError):
        from_blaze(
            expr_with_add,
            deltas=deltas,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

    with self.assertRaises(TypeError):
        from_blaze(
            expr.value + 1,
            deltas=deltas,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )
def test_novel_deltas(self, asset_info):
    base_dates = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-04'),
    ])
    repeated_dates = base_dates.repeat(3)
    baseline = pd.DataFrame({
        'sid': self.sids * 2,
        'value': (0., 1., 2., 1., 2., 3.),
        'int_value': (0, 1, 2, 1, 2, 3),
        'asof_date': repeated_dates,
        'timestamp': repeated_dates,
    })
    expr = bz.data(baseline, name='expr', dshape=self.dshape)
    deltas = bz.data(
        odo(
            bz.transform(
                expr,
                value=expr.value + 10,
                timestamp=expr.timestamp + timedelta(days=1),
            ),
            pd.DataFrame,
        ),
        name='delta',
        dshape=self.dshape,
    )
    expected_views = keymap(pd.Timestamp, {
        '2014-01-03': np.array([[10.0, 11.0, 12.0],
                                [10.0, 11.0, 12.0],
                                [10.0, 11.0, 12.0]]),
        '2014-01-06': np.array([[10.0, 11.0, 12.0],
                                [10.0, 11.0, 12.0],
                                [11.0, 12.0, 13.0]]),
    })
    if len(asset_info) == 4:
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan, np.nan]],
            expected_views,
        )
        expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
    else:
        expected_output_buffer = [10, 11, 12, 11, 12, 13]
    cal = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-02'),
        pd.Timestamp('2014-01-03'),
        # omitting the 4th and 5th to simulate a weekend
        pd.Timestamp('2014-01-06'),
    ])
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            expected_output_buffer,
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )
# coding: utf-8
import pandas as pd
import blaze as bz

df = pd.DataFrame.from_csv('Tea_stacked_cleaned.csv', index_col=None)
df.head()
d = bz.Data('Tea_stacked_cleaned.csv')
d.shape
d.dshape
d = bz.transform(d, year=d.date.year, month=d.date.month)
d.head(5)
d.tail(5)
import odo
df = odo.odo(d, pd.DataFrame)
df.head(2)
df.groupby(['year', 'month'])
df.groupby(['year', 'month']).arrival
groups = df.groupby(['year', 'month'])
print(groups)
df.groupby(['year', 'month']).arrival.sum()
df.loc[df.year == 15]
df.loc[:, df.year == 15]
grouped = df.groupby(['year', 'month'])
for name, group in grouped:
    print(name)
    print(group)
get_ipython().magic('save trying.py 1-23')
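# A minimal sketch (not from the source) of the same monthly aggregation done
# without leaving blaze, instead of falling back to a pandas groupby: bz.by
# with bz.merge groups on the derived year/month columns. Assumes the CSV above
# has a 'date' column and a numeric 'arrival' column.
import blaze as bz

d = bz.Data('Tea_stacked_cleaned.csv')
d = bz.transform(d, year=d.date.year, month=d.date.month)
monthly_arrivals = bz.by(bz.merge(d.year, d.month), arrival=d.arrival.sum())
print(bz.compute(monthly_arrivals))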
def clean(files, mode, commodity_corrections, commodity_name_mapping):
    if mode == 'batch':
        #commodities = list(set(list(map(lambda x: x.split('_')[0], files))))
        #for commodity in commodities:
        for filename in files:
            #commodity = commodity.replace('(', '\(').replace(')', '\)')
            #filename = '{}_stacked.csv'.format(commodity)
            # Have to use resource to discover URIs
            csvr = bz.resource(filename)
            num_col = len(bz.discover(csvr)[1].types)
            print(num_col)
            ds = None
            if num_col == 12:
                ds = bz.dshape(
                    "var * {date: ?string, state: ?string, market: ?string, "
                    "category: ?string, commodity: ?string, variety: ?string, "
                    "arrival: ?float64, min: ?float64, max: ?float64, "
                    "modal: ?float64, originState: ?string, originMarket: ?string}")
            elif num_col == 13:
                ds = bz.dshape(
                    "var * {date: ?string, state: ?string, market: ?string, "
                    "category: ?string, commodity: ?string, variety: ?string, "
                    "arrival: ?float64, grade: ?string, min: ?float64, "
                    "max: ?float64, originState: ?string, originMarket: ?string, "
                    "modal: ?float64}")
            else:
                ds = bz.discover(csvr)
            d = bz.Data(filename, dshape=ds)
            ### Use Dask if Data loads index_col and/or header:
            # http://stackoverflow.com/questions/32716093/how-do-i-read-tabulator-separated-csv-in-blaze

            ### Fixes issue with two added months on date
            d = bz.transform(d, date=d.date.map(
                lambda x: datetime.strptime(x, '%d/%m/%Y').date(), 'date'))

            commodity = list(d.commodity.distinct())[0]
            varieties = list(d.variety.distinct())
            ### Removing unnecessary varieties
            ### TODO: test effectiveness
            if [commodity, 'Other'] == varieties:
                d = bz.transform(d, variety=d.variety.map(lambda x: commodity, 'string'))

            ### TODO: merge related filenames
            # if price > 100: divide by 100
            #d = bz.transform(d, min=d.min.map(lambda x: x/100 if x > 100 else x, 'float64'))
            #d = bz.transform(d, max=d.max.map(lambda x: x/100 if x > 100 else x, 'float64'))
            #d = bz.transform(d, max=d.modal.map(lambda x: x/100 if x > 100 else x, 'float64'))

            d = bz.transform(d, commodity=d.commodity.map(
                lambda x: commodity_corrections[x] if x in commodity_corrections else x,
                'string'))
            d = bz.transform(d, commodityTranslated=d.commodity.map(
                lambda x: commodity_name_mapping[x] if x in commodity_name_mapping else x,
                'string'))
            d = bz.transform(d, state=d.state.map(
                lambda x: state_corrections[x] if x in state_corrections else x,
                'string'))
            ### TODO: there must be a better solution for this
            #for key, val in market_corrections.items():
            #    d = bz.transform(d, market=d.market.map(lambda x: re.sub(key, val, x), 'string'))

            cleaned_fn = filename.replace('.csv', '_cleaned.csv')  # {}_stacked_cleaned.csv'.format(commodity)
            print(cleaned_fn)
            if not path.isdir('cleaned'):
                os.makedirs('cleaned')
            outpath = path.join('cleaned', cleaned_fn)
            print(outpath)
            if path.isfile(outpath):
                os.remove(outpath)
            odo.odo(d, outpath)
            #df_cleaned = odo.odo(d, pd.DataFrame)
            #df_cleaned.to_csv(outpath, index=False)
            # df = pd.DataFrame.from_csv('cleaned/Tea_stacked_cleaned.csv', header=False, index_col=None)
            # df['year'] = df['date'].apply(lambda x: datetime.strptime(x, "%d/%m/%Y").year)
            # df['month'] = df['date'].apply(lambda x: datetime.strptime(x, "%d/%m/%Y").month)
            # os.remove(filename)
    else:
        for filename in files:
            df = pd.DataFrame.from_csv(filename, header=False, index_col=False)
            num_col = len(df.columns)
            if num_col == 10:
                df.columns = ['date', 'state', 'market', 'category', 'commodity',
                              'variety', 'arrival', 'min', 'max', 'modal']
            elif num_col == 11:
                df.columns = ['date', 'state', 'market', 'category', 'commodity',
                              'variety', 'arrival', 'grade', 'min', 'max', 'modal']
            # apply the correction mappings (Series.replace takes a dict of replacements)
            df['state'] = df['state'].replace(state_corrections)
            df['commodity'] = df['commodity'].replace(commodity_corrections)
            df['commodity'] = df['commodity'].replace(commodity_name_mapping)
            cleaned_fn = filename.replace('.csv', '_cleaned.csv')
            if not path.isdir('cleaned'):
                os.makedirs('cleaned')
            outpath = path.join('cleaned', cleaned_fn)
            if path.isfile(outpath):
                os.remove(outpath)
            df.to_csv(outpath, index=False)
            ### TODO: use blaze to load commodity files, process them and save
            ### to disk a cleaned version
            #### "online" --> use pandas
    return
def test_novel_deltas_macro(self):
    asset_info = asset_infos[0][0]
    base_dates = pd.DatetimeIndex(
        [pd.Timestamp('2014-01-01'), pd.Timestamp('2014-01-04')])
    baseline = pd.DataFrame({
        'value': (0, 1),
        'asof_date': base_dates,
        'timestamp': base_dates,
    })
    expr = bz.Data(baseline, name='expr', dshape=self.macro_dshape)
    deltas = bz.Data(baseline, name='deltas', dshape=self.macro_dshape)
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(
        pd.Timestamp,
        {
            '2014-01-03': repeat_last_axis(
                np.array([10.0, 10.0, 10.0]),
                nassets,
            ),
            '2014-01-06': repeat_last_axis(
                np.array([10.0, 10.0, 11.0]),
                nassets,
            ),
        })
    cal = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-02'),
        pd.Timestamp('2014-01-03'),
        # omitting the 4th and 5th to simulate a weekend
        pd.Timestamp('2014-01-06'),
    ])
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )
def blaze_tutorial():
    accounts = bl.Symbol('accounts',
                         'var * {id: int, name: string, amount: int}')

    deadbeats = accounts[accounts.amount < 0].name

    list_ = [[1, 'Alice', 100],
             [2, 'Bob', -200],
             [3, 'Charlie', 300],
             [4, 'Denis', 400],
             [5, 'Edith', -500]]

    print(list(bl.compute(deadbeats, list_)))

    df_ = bl.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(bl.compute(deadbeats, df_))
    bl_df_dir = dir(df_)

    df_ = pd.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(df_[df_.amount < 0].name)
    pd_df_dir = dir(df_)

    print(len(bl_df_dir), len(pd_df_dir))
    print(len([d for d in bl_df_dir if d in pd_df_dir]))
    print([d for d in bl_df_dir if d not in pd_df_dir])
    print([d for d in pd_df_dir if d not in bl_df_dir])

    df_ = bl.Data([(1, 'Alice', 100),
                   (2, 'Bob', -200),
                   (3, 'Charlie', 300),
                   (4, 'Denis', 400),
                   (5, 'Edith', -500)],
                  fields=['id', 'name', 'balance'])

    print(repr(df_))
    print(repr(df_[df_.balance < 0]))
    print(repr(df_[df_.balance < 0].name))
    print(list(df_[df_.balance < 0].name))

    iris = bl.Data('examples/iris.csv')
    print(repr(iris))

    iris = bl.Data('sqlite:///examples/iris.db::iris')
    print(repr(iris))

    print(repr(bl.by(iris.species,
                     min=iris.petal_width.min(),
                     max=iris.petal_width.max())))

    result = bl.by(iris.species,
                   min=iris.petal_width.min(),
                   max=iris.petal_width.max())

    print(odo(result, bl.DataFrame))
    print(odo(result, pd.DataFrame))

    ### odo has weird issue with unicode filenames, apparently...
    name = 'output.csv'
    print(odo(result, bl.CSV(name)))

    print(repr(iris.sepal_length.mean()))
    print(repr(bl.mean(iris.sepal_length)))

    print(repr(bl.by(iris.species,
                     shortest=iris.petal_length.min(),
                     longest=iris.petal_length.max(),
                     average=iris.petal_length.mean())))

    print(repr(iris.head()))

    iris = bl.transform(iris,
                        sepal_ratio=iris.sepal_length / iris.sepal_width,
                        petal_ratio=iris.petal_length / iris.petal_width)
    print(repr(iris.head()))

    versicolor = iris[iris.species.like('%versicolor')]
    print(repr(versicolor))
    print((len(versicolor), len(versicolor.fields)))

    print(repr(iris.relabel(petal_length='PETAL-LENGTH',
                            petal_width='PETAL-WIDTH')))

    pd_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Joe', 'Bob'],
                          'amount': [100, 200, 300, 400],
                          'id': [1, 2, 3, 4]})

    # put the `df` DataFrame into a Blaze Data object
    bl_df = bl.DataFrame(pd_df)
    bl_dt = bl.Data(pd_df)

    print(repr(pd_df.amount * 2))
    print(repr(bl_df.amount * 2))
    print(repr(bl_dt.amount * 2))

    print(repr(pd_df[['id', 'amount']]))
    print(repr(bl_df[['id', 'amount']]))
    print(repr(bl_dt[['id', 'amount']]))

    print(repr(pd_df[pd_df.amount > 300]))
    print(repr(bl_df[bl_df.amount > 300]))
    print(repr(bl_dt[bl_dt.amount > 300]))

    print(repr(pd_df.groupby('name').amount.mean()))
    print(repr(pd_df.groupby(['name', 'id']).amount.mean()))
    print(repr(bl_df.groupby('name').amount.mean()))
    print(repr(bl_df.groupby(['name', 'id']).amount.mean()))
    print(repr(bl.by(bl_dt.name, amount=bl_dt.amount.mean())))
    print(repr(bl.by(bl.merge(bl_dt.name, bl_dt.id),
                     amount=bl_dt.amount.mean())))

    #pd.merge(pd_df, pd_df2, on='name')
    #bl.join(bl_dt, bl_dt2, 'name')

    print(repr(pd_df.amount.map(lambda x: x + 1)))
    print(repr(bl_df.amount.map(lambda x: x + 1)))
    print(repr(bl_dt.amount.map(lambda x: x + 1, 'int64')))

    print(repr(pd_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_dt.relabel(name='alias', amount='dollars')))

    print(repr(pd_df.drop_duplicates()))
    print(repr(bl_df.drop_duplicates()))
    print(repr(bl_dt.distinct()))

    print(repr(pd_df.name.drop_duplicates()))
    print(repr(bl_df.name.drop_duplicates()))
    print(repr(bl_dt.name.distinct()))

    print(repr(pd_df.amount.mean()))
    print(repr(bl_df.amount.mean()))
    print(repr(bl_dt.amount.mean()))

    print(repr(pd_df))
    print(repr(bl_df))
    print(repr(bl_dt))

    print(repr(pd_df.amount.value_counts()), '\n')
    print(repr(bl_df.amount.value_counts()), '\n')
    print(repr(bl_dt.amount.count_values()), '\n')

    print(repr(pd_df.dtypes), '\n')
    print(repr(bl_df.dtypes), '\n')
    print(repr(bl_df.columns), '\n')
    print(repr(bl_dt.dshape), '\n')

    print(repr(pd_df.amount.dtypes), '\n')
    print(repr(bl_df.amount.dtypes), '\n')
    print(repr(bl_dt.amount.dshape), '\n')

    print(type(pd_df), type(bl_df), type(bl_dt), '\n')

    os.remove('output.csv')
    for fn_ in glob.glob('*.csv.gz'):
        os.remove(fn_)
    return
def compute_stats(data_dir, filename, category, commodity):
    ### PROBLEM: some commodities are still spread over multiple files
    ### => solve by gathering stats over multiple files
    csvr = bz.resource(filename)
    ds = bz.discover(csvr)
    d = bz.Data(filename, dshape=ds)
    category = list(d.category)[0]
    #commodity = list(d.commodity)[0]
    d = bz.transform(d, year=d.date.year, month=d.date.month)

    outdir = path.join(data_dir, 'stats', category, commodity)
    if not path.isdir(outdir):
        os.makedirs(outdir)

    nas_by_commodity(d, commodity, outdir)

    ### NOTE: transform: add date and month column
    # problem here is that it's not simply calling a predefined function
    ### NEED PANDAS FOR THIS
    #bz.by(bz.merge(d.year, d.month), commodityTonnage=d.commodityTonnage.sum())
    start = time.time()
    df = odo.odo(d, pd.DataFrame)
    elapsed = time.time() - start
    print('{0} took {1} secs to be loaded into pandas df with odo'.format(
        commodity, np.round(elapsed, 2)))

    # TODO: read the latest date from config, or use yesterday's date
    date_range = pd.date_range('1/1/2002', time.strftime('%m/%d/%Y'))
    #df = fill_records(df, date_range)

    ### In what particular periods do NAs occur?
    # does data quality improve over time? NA ratio per (year, month)
    # => further group by commodity before saving the final dataframe
    get_loc_nas(df, commodity, [], ['state'], outdir)
    #get_loc_nas(df, commodity, [], ['state', 'district'], outdir)
    #get_loc_nas(df, commodity, [], ['state', 'district', 'market'], outdir)
    get_loc_nas(df, commodity, ['year', 'month'], ['state'], outdir)
    #get_loc_nas(df, commodity, ['year', 'month'], ['state', 'district'], outdir)
    #get_loc_nas(df, commodity, ['year', 'month'], ['state', 'district', 'market'], outdir)
    get_loc_nas(df, commodity, ['year'], ['state'], outdir)
    #get_loc_nas(df, commodity, ['year'], ['state', 'district'], outdir)
    #get_loc_nas(df, commodity, ['year'], ['state', 'district', 'market'], outdir)
    get_loc_nas(df, commodity, ['month'], ['state'], outdir)
    #get_loc_nas(df, commodity, ['month'], ['state', 'district'], outdir)
    #get_loc_nas(df, commodity, ['month'], ['state', 'district', 'market'], outdir)

    nas_over_time(df, commodity, ['year', 'month'], [], outdir)
    nas_over_time(df, commodity, ['year'], [], outdir)
    nas_over_time(df, commodity, ['month'], [], outdir)

    ### NOTE: taking into account that commodityTonnages are repeated:
    commodityTonnages = d[['date', 'state', 'district', 'market', 'category',
                           'commodity', 'commodityTonnage', 'year', 'month']].distinct()

    content, commodityTonnage_by_year_month = commodityTonnage_over_time(
        commodityTonnages, outdir)
    commodityTonnage_by_year(commodityTonnages, outdir)
    if content:
        commodityTonnage_by_month(commodityTonnage_by_year_month, outdir)

    #commodityTonnage_by_level(commodityTonnages, outdir, ['state', 'district', 'market'])
    #commodityTonnage_by_level(commodityTonnages, outdir, ['state', 'district'])
    commodityTonnage_by_level(commodityTonnages, outdir, ['state'])
    """
    commodityTonnage_by_month(commodityTonnage_by_year_month, outdir)
    commodityTonnage_by_year(commodityTonnages, outdir)

    # commodityTonnage by state,( district,) market
    commodityTonnage_by_market(commodityTonnages, outdir)
    commodityTonnage_by_district(commodityTonnages, outdir)
    commodityTonnage_by_state(commodityTonnages, outdir)
    """

    get_coverages(df, date_range, outdir, commodity)

    ### TODO:
    # display some of these statistics in the application
    """
    make nice printout

    to compute commodityTonnage stats by commodity, first have to drop the
    variety column and then remove all duplicates
    df.drop_duplicates() --> df.distinct()
    trick to work with blaze: fill unwanted columns (variety) with a unique
    string/float and call drop duplicates on the dataframe

    Questions per commodity (can be answered with single files):
        What is the variety with the most commodityTonnages per commodity?
            => commodityTonnages per variety
        What markets have the most commodityTonnages per commodity?
        What months have the most commodityTonnages per commodity?
        What weekdays have the most commodityTonnages per commodity?
        Average price per commodity?
            - by year
            - by month (seasonal phenomena?)
            - by year, month

    TODO:
        What is the NA ratio of commodityTonnages?
        Are there any particular patterns to this ratio?
            - by month? => DONE
            - by market? --> COULD USE THIS TO VISUALIZE DATA QUALITY!
            - by state?

    Questions that can't be answered with single files
    => use pymongo or odo to load multiple collections into a dataframe?
        - commodityTonnage tonnage by commodity: by(commodity, ==> commodityTonnage.sum()) (total)
        - commodityTonnage tonnage by category?
        - avg price by commodity: by(commodity, ==> modal.mean())
        - minimum modal price by commodity
        - maximum modal price by commodity
        - commodityTonnage tonnages by years (total)
        - commodityTonnage tonnages by months (accumulative)
        - commodityTonnage tonnages by commodity, years (total)
        - commodityTonnage tonnages by commodity, month (accumulative)
        - commodityTonnage tonnages by commodity, month, year

    TODO: test all of these in ipython first
    by(df.commodity, total_commodityTonnages=df.commodityTonnage.sum())
        - NA percentage for tonnages of commodity
    """
    return
def test_novel_deltas(self, asset_info):
    base_dates = pd.DatetimeIndex(
        [pd.Timestamp('2014-01-01'), pd.Timestamp('2014-01-04')])
    repeated_dates = base_dates.repeat(3)
    baseline = pd.DataFrame({
        'sid': self.sids * 2,
        'value': (0, 1, 2, 1, 2, 3),
        'asof_date': repeated_dates,
        'timestamp': repeated_dates,
    })
    expr = bz.Data(baseline, name='expr', dshape=self.dshape)
    deltas = bz.Data(baseline, name='deltas', dshape=self.dshape)
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    expected_views = keymap(
        pd.Timestamp,
        {
            '2014-01-03': np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0]]),
            '2014-01-06': np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [11.0, 12.0, 13.0]]),
        })
    if len(asset_info) == 4:
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan, np.nan]],
            expected_views,
        )
        expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
    else:
        expected_output_buffer = [10, 11, 12, 11, 12, 13]
    cal = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-02'),
        pd.Timestamp('2014-01-03'),
        # omitting the 4th and 5th to simulate a weekend
        pd.Timestamp('2014-01-06'),
    ])
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            expected_output_buffer,
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )