def test_novel_deltas_macro(self):
    asset_info = asset_infos[0][0]
    base_dates = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-04'),
    ])
    baseline = pd.DataFrame({
        'value': (0, 1),
        'asof_date': base_dates,
        'timestamp': base_dates,
    })
    expr = bz.Data(baseline, name='expr', dshape=self.macro_dshape)
    deltas = bz.Data(baseline, name='deltas', dshape=self.macro_dshape)
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-03': repeat_last_axis(
            np.array([10.0, 10.0, 10.0]),
            nassets,
        ),
        '2014-01-06': repeat_last_axis(
            np.array([10.0, 10.0, 11.0]),
            nassets,
        ),
    })
    cal = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-02'),
        pd.Timestamp('2014-01-03'),
        # omitting the 4th and 5th to simulate a weekend
        pd.Timestamp('2014-01-06'),
    ])
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )
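# A hedged sketch of what the macro test above expects from
# `repeat_last_axis` (a helper imported by these tests; this standalone
# equivalent is an assumption based only on how the test uses it): view a
# length-num_dates vector as a (num_dates, nassets) array so a sid-less
# macro column can be compared against per-asset expected views.
import numpy as np

def repeat_last_axis_sketch(array, count):
    # Broadcast (n,) -> (n, count) without copying; each row repeats.
    return np.broadcast_to(array[:, np.newaxis], array.shape + (count,))

repeat_last_axis_sketch(np.array([10.0, 10.0, 11.0]), 2)
# array([[10., 10.],
#        [10., 10.],
#        [11., 11.]])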
def test_novel_deltas(self, asset_info):
    base_dates = pd.DatetimeIndex(
        [pd.Timestamp("2014-01-01"), pd.Timestamp("2014-01-04")]
    )
    repeated_dates = base_dates.repeat(3)
    baseline = pd.DataFrame(
        {
            "sid": self.sids * 2,
            "value": (0, 1, 2, 1, 2, 3),
            "asof_date": repeated_dates,
            "timestamp": repeated_dates,
        }
    )
    expr = bz.Data(baseline, name="expr", dshape=self.dshape)
    deltas = bz.Data(baseline, name="deltas", dshape=self.dshape)
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    expected_views = keymap(
        pd.Timestamp,
        {
            "2014-01-03": np.array(
                [[10.0, 11.0, 12.0],
                 [10.0, 11.0, 12.0],
                 [10.0, 11.0, 12.0]]
            ),
            "2014-01-06": np.array(
                [[10.0, 11.0, 12.0],
                 [10.0, 11.0, 12.0],
                 [11.0, 12.0, 13.0]]
            ),
        },
    )
    if len(asset_info) == 4:
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan, np.nan]],
            expected_views,
        )
        expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
    else:
        expected_output_buffer = [10, 11, 12, 11, 12, 13]

    cal = pd.DatetimeIndex(
        [
            pd.Timestamp("2014-01-01"),
            pd.Timestamp("2014-01-02"),
            pd.Timestamp("2014-01-03"),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp("2014-01-06"),
        ]
    )

    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            expected_output_buffer,
            index=pd.MultiIndex.from_product(
                (
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )
            ),
            columns=("value",),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )
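# Why the calendars above omit 2014-01-04 and 2014-01-05: the deltas shift
# every timestamp forward one day, so the deltas for the 2014-01-04 baseline
# rows are stamped 2014-01-05, a simulated weekend. A self-contained sketch
# (toy names, plain numpy/pandas) of the lookup that decides when such a
# delta first becomes visible, mirroring the searchsorted pattern used by
# the loaders below:
import numpy as np
import pandas as pd

cal = pd.DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03',
                        '2014-01-06'])
delta_ts = np.array(['2014-01-02', '2014-01-05'], dtype='datetime64[ns]')
# Each delta surfaces on the first session at or after its timestamp; the
# weekend-stamped delta therefore lands on 2014-01-06, which is why the
# expected view for that day is the only one showing the shifted values.
print(cal[np.searchsorted(cal.values, delta_ts)])
# DatetimeIndex(['2014-01-02', '2014-01-06'], ...)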
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs = self[dataset]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )
    colnames = added_query_fields + list(map(getname, columns))

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    if checkpoints is not None:
        ts = checkpoints[TS_FIELD_NAME]
        checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
        if pd.isnull(checkpoints_ts):
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None
        else:
            materialized_checkpoints = odo(
                checkpoints[ts == checkpoints_ts][colnames],
                pd.DataFrame,
                **odo_kwargs
            )
            lower = checkpoints_ts
    else:
        materialized_checkpoints = pd.DataFrame(columns=colnames)
        lower = None

    materialized_expr = collect_expr(expr, lower)
    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr,
            ),
            ignore_index=True,
            copy=False,
        )
    materialized_deltas = (
        collect_expr(deltas, lower)
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    def last_in_date_group(df, reindex, have_sids=have_sids):
        idx = dates[dates.searchsorted(
            df[TS_FIELD_NAME].values.astype('datetime64[D]')
        )]
        if have_sids:
            idx = [idx, SID_FIELD_NAME]

        last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
            idx,
            sort=False,
        ).last()

        if have_sids:
            last_in_group = last_in_group.unstack()

        if reindex:
            if have_sids:
                cols = last_in_group.columns
                last_in_group = last_in_group.reindex(
                    index=dates,
                    columns=pd.MultiIndex.from_product(
                        (cols.levels[0], assets),
                        names=cols.names,
                    ),
                )
            else:
                last_in_group = last_in_group.reindex(dates)

        return last_in_group

    sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
    dense_output = last_in_date_group(sparse_output, reindex=True)
    dense_output.ffill(inplace=True)

    # Fill in missing values specified by each column. This is made
    # significantly more complex by the fact that we need to work around
    # two pandas issues:
    # 1) When we have sids, if there are no records for a given sid for any
    #    dates, pandas will generate a column full of NaNs for that sid.
    #    This means that some of the columns in `dense_output` are now
    #    float instead of the intended dtype, so we have to coerce back to
    #    our expected type and convert NaNs into the desired missing value.
    # 2) DataFrame.ffill assumes that receiving None as a fill-value means
    #    that no value was passed. Consequently, there's no way to tell
    #    pandas to replace NaNs in an object column with None using fillna,
    #    so we have to roll our own instead using df.where.
    for column in columns:
        # Special logic for strings since `fillna` doesn't work if the
        # missing value is `None`.
        if column.dtype == categorical_dtype:
            dense_output[column.name] = dense_output[
                column.name
            ].where(pd.notnull(dense_output[column.name]),
                    column.missing_value)
        else:
            # We need to execute `fillna` before `astype` in case the
            # column contains NaNs and needs to be cast to bool or int.
            # This is so that the NaNs are replaced first, since pandas
            # can't convert NaNs for those types.
            dense_output[column.name] = dense_output[
                column.name
            ].fillna(column.missing_value).astype(column.dtype)

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])
        adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1),
            fill_value=True,
            dtype=bool_dtype,
        )

    for column_idx, column in enumerate(columns):
        column_name = column.name
        yield column, AdjustedArray(
            column_view(
                dense_output[column_name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column_name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
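# A toy-sized illustration of the inlined `last_in_date_group` above (all
# frame and column names here are invented for the example; only the pandas
# calls mirror the real code): map each record forward to the first calendar
# date at or after its timestamp, keep the newest record per (date, sid),
# pivot sids into columns, then densify and forward-fill over the calendar.
import numpy as np
import pandas as pd

dates = pd.DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'])
records = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-01', '2014-01-01', '2014-01-03']),
    'sid': [1, 1, 2],
    'value': [0.0, 5.0, 7.0],  # sid 1 has two records on the same day
})
idx = dates[dates.searchsorted(records['timestamp'].values)]
dense = (
    records.drop('timestamp', axis=1)
    .groupby([idx, 'sid'], sort=False)
    .last()          # the later record per (date, sid) wins: sid 1 -> 5.0
    .unstack()       # one column per sid
    .reindex(dates)  # insert 2014-01-02, which has no new records
    .ffill()         # roll the last known value forward across dates
)
# dense['value'] now holds 5.0 for sid 1 on all three dates, and 7.0 for
# sid 2 on 2014-01-03 only, with NaN before its first record.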
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[
        dataset
    ]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
        {SID_FIELD_NAME} if have_sids else set()
    )
    requested_columns = set(map(getname, columns))
    colnames = sorted(added_query_fields | requested_columns)

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints,
        colnames,
        lower_dt,
        odo_kwargs,
    )

    materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr.get(),
            ),
            ignore_index=True,
            copy=False,
        )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                'datetime64[ns]',
            )

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    if AD_FIELD_NAME not in requested_columns:
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    sparse_deltas = last_in_date_group(
        non_novel_deltas,
        dates,
        assets,
        reindex=False,
        have_sids=have_sids,
    )
    dense_output = last_in_date_group(
        sparse_output,
        dates,
        assets,
        reindex=True,
        have_sids=have_sids,
    )
    ffill_across_cols(
        dense_output,
        columns,
        {c.name: c.name for c in columns},
    )

    # By default, no non-novel deltas are applied.
    def no_adjustments_from_deltas(*args):
        return {}

    adjustments_from_deltas = no_adjustments_from_deltas
    if have_sids:
        if apply_deltas_adjustments:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])
        if apply_deltas_adjustments:
            adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1),
            fill_value=True,
            dtype=bool_dtype,
        )

    return {
        column: AdjustedArray(
            column_view(
                dense_output[column.name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column.name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
        for column_idx, column in enumerate(columns)
    }
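# A small demonstration of the sid-less `column_view` used above:
# op.itemgetter(np.s_[:, np.newaxis]) is a concise spelling of
# `lambda a: a[:, np.newaxis]`. It turns a length-num_dates vector into a
# (num_dates, 1) column so macro (dataset.ndim == 1) data can flow through
# the same AdjustedArray machinery as per-sid data.
import numpy as np
import operator as op

column_view = op.itemgetter(np.s_[:, np.newaxis])
print(column_view(np.array([10.0, 10.0, 11.0])).shape)  # (3, 1)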
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs = self[dataset]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )
    colnames = added_query_fields + list(map(getname, columns))

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints,
        colnames,
        lower_dt,
        odo_kwargs,
    )

    materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr.get(),
            ),
            ignore_index=True,
            copy=False,
        )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    sparse_deltas = last_in_date_group(
        non_novel_deltas,
        dates,
        assets,
        reindex=False,
        have_sids=have_sids,
    )
    dense_output = last_in_date_group(
        sparse_output,
        dates,
        assets,
        reindex=True,
        have_sids=have_sids,
    )
    ffill_across_cols(
        dense_output,
        columns,
        {c.name: c.name for c in columns},
    )

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])
        adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1),
            fill_value=True,
            dtype=bool_dtype,
        )

    return {
        column: AdjustedArray(
            column_view(
                dense_output[column.name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column.name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
        for column_idx, column in enumerate(columns)
    }
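# This version delegates checkpoint selection to
# `get_materialized_checkpoints`, which presumably factors out the inline
# logic visible in the neighboring versions of this method. A hedged sketch
# of that core idea with toy data: take the newest checkpoint at or before
# the query's lower bound and use its timestamp as the new lower bound, so
# the baseline query can skip any rows older than the checkpoint.
import pandas as pd

checkpoint_ts = pd.DatetimeIndex(['2013-06-30', '2013-12-31', '2014-06-30'])
lower_dt = pd.Timestamp('2014-03-01')
lower = checkpoint_ts[checkpoint_ts <= lower_dt].max()
print(lower)  # 2013-12-31: rows before this are summarized by the checkpoint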
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs = self[dataset]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )
    colnames = added_query_fields + list(map(getname, columns))

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    if checkpoints is not None:
        ts = checkpoints[TS_FIELD_NAME]
        checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
        if pd.isnull(checkpoints_ts):
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None
        else:
            materialized_checkpoints = odo(
                checkpoints[ts == checkpoints_ts][colnames],
                pd.DataFrame,
                **odo_kwargs
            )
            lower = checkpoints_ts
    else:
        materialized_checkpoints = pd.DataFrame(columns=colnames)
        lower = None

    materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr.get(),
            ),
            ignore_index=True,
            copy=False,
        )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                'datetime64[ns]',
            )

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    def last_in_date_group(df, reindex, have_sids=have_sids):
        idx = dates[dates.searchsorted(
            df[TS_FIELD_NAME].values.astype('datetime64[D]')
        )]
        if have_sids:
            idx = [idx, SID_FIELD_NAME]

        last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
            idx,
            sort=False,
        ).last()

        if have_sids:
            last_in_group = last_in_group.unstack()

        if reindex:
            if have_sids:
                cols = last_in_group.columns
                last_in_group = last_in_group.reindex(
                    index=dates,
                    columns=pd.MultiIndex.from_product(
                        (cols.levels[0], assets),
                        names=cols.names,
                    ),
                )
            else:
                last_in_group = last_in_group.reindex(dates)

        return last_in_group

    sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
    dense_output = last_in_date_group(sparse_output, reindex=True)
    dense_output.ffill(inplace=True)

    # Fill in missing values specified by each column. This is made
    # significantly more complex by the fact that we need to work around
    # two pandas issues:
    # 1) When we have sids, if there are no records for a given sid for any
    #    dates, pandas will generate a column full of NaNs for that sid.
    #    This means that some of the columns in `dense_output` are now
    #    float instead of the intended dtype, so we have to coerce back to
    #    our expected type and convert NaNs into the desired missing value.
    # 2) DataFrame.ffill assumes that receiving None as a fill-value means
    #    that no value was passed. Consequently, there's no way to tell
    #    pandas to replace NaNs in an object column with None using fillna,
    #    so we have to roll our own instead using df.where.
    for column in columns:
        # Special logic for strings since `fillna` doesn't work if the
        # missing value is `None`.
        if column.dtype == categorical_dtype:
            dense_output[column.name] = dense_output[column.name].where(
                pd.notnull(dense_output[column.name]),
                column.missing_value,
            )
        else:
            # We need to execute `fillna` before `astype` in case the
            # column contains NaNs and needs to be cast to bool or int.
            # This is so that the NaNs are replaced first, since pandas
            # can't convert NaNs for those types.
            dense_output[column.name] = dense_output[column.name].fillna(
                column.missing_value,
            ).astype(column.dtype)

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])
        adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1),
            fill_value=True,
            dtype=bool_dtype,
        )

    return {
        column: AdjustedArray(
            column_view(
                dense_output[column.name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column.name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
        for column_idx, column in enumerate(columns)
    }
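# A self-contained illustration of the two pandas workarounds documented in
# the comments above, with toy values standing in for a column's
# `missing_value`:
import numpy as np
import pandas as pd

# (2) `fillna(None)` means "no value was passed", so object columns fall
#     back to `where`, exactly as the categorical branch above does:
labels = pd.Series(['a', np.nan, 'c'], dtype=object)
labels = labels.where(pd.notnull(labels), None)    # NaN -> None
# (1) NaN cannot be cast to int or bool, so fill first, then restore dtype:
ints = pd.Series([1.0, np.nan, 3.0]).fillna(-1).astype('int64')  # [1, -1, 3]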
def test_novel_deltas(self, asset_info):
    base_dates = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-04'),
    ])
    repeated_dates = base_dates.repeat(3)
    baseline = pd.DataFrame({
        'sid': self.sids * 2,
        'value': (0., 1., 2., 1., 2., 3.),
        'int_value': (0, 1, 2, 1, 2, 3),
        'asof_date': repeated_dates,
        'timestamp': repeated_dates,
    })
    expr = bz.data(baseline, name='expr', dshape=self.dshape)
    deltas = bz.data(
        odo(
            bz.transform(
                expr,
                value=expr.value + 10,
                timestamp=expr.timestamp + timedelta(days=1),
            ),
            pd.DataFrame,
        ),
        name='delta',
        dshape=self.dshape,
    )
    expected_views = keymap(pd.Timestamp, {
        '2014-01-03': np.array([[10.0, 11.0, 12.0],
                                [10.0, 11.0, 12.0],
                                [10.0, 11.0, 12.0]]),
        '2014-01-06': np.array([[10.0, 11.0, 12.0],
                                [10.0, 11.0, 12.0],
                                [11.0, 12.0, 13.0]]),
    })
    if len(asset_info) == 4:
        expected_views = valmap(
            lambda view: np.c_[view, [np.nan, np.nan, np.nan]],
            expected_views,
        )
        expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
    else:
        expected_output_buffer = [10, 11, 12, 11, 12, 13]

    cal = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-02'),
        pd.Timestamp('2014-01-03'),
        # omitting the 4th and 5th to simulate a weekend
        pd.Timestamp('2014-01-06'),
    ])

    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            expected_output_buffer,
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )
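# How the 4-asset variant of the expected views above is built: the fourth
# asset has no records, so each 3-column view is padded with an all-NaN
# column via np.c_, which stacks its arguments column-wise.
import numpy as np

view = np.array([[10.0, 11.0, 12.0],
                 [10.0, 11.0, 12.0],
                 [11.0, 12.0, 13.0]])
print(np.c_[view, [np.nan, np.nan, np.nan]].shape)  # (3, 4)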