def test_novel_deltas_macro(self):
    """
    Deltas whose timestamps land on omitted calendar sessions (a
    simulated weekend) should surface on the next available session.
    """
    asset_info = asset_infos[0][0]
    base_dates = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-04'),
    ])
    baseline = pd.DataFrame({
        'value': (0, 1),
        'asof_date': base_dates,
        'timestamp': base_dates,
    })
    expr = bz.Data(baseline, name='expr', dshape=self.macro_dshape)
    deltas = bz.Data(baseline, name='deltas', dshape=self.macro_dshape)
    # Shift each delta forward a day and bump its value by 10 so delta
    # rows are distinguishable from baseline rows.
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    n_assets = len(asset_info)
    raw_views = {
        '2014-01-03': np.array([10.0, 10.0, 10.0]),
        '2014-01-06': np.array([10.0, 10.0, 11.0]),
    }
    expected_views = {
        pd.Timestamp(day): repeat_last_axis(values, n_assets)
        for day, values in raw_views.items()
    }
    # Calendar omitting the 4th and 5th to simulate a weekend.
    cal = pd.DatetimeIndex(
        [pd.Timestamp('2014-01-0%d' % day) for day in (1, 2, 3, 6)]
    )
    with tmp_asset_finder(equities=asset_info) as finder:
        all_assets = finder.retrieve_all(asset_info.index)
        expected_output = pd.DataFrame(
            [10] * n_assets + [11] * n_assets,
            index=pd.MultiIndex.from_product(
                (sorted(expected_views.keys()), all_assets),
            ),
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )
def _pipeline_output_index(dates, assets, mask):
    """
    Create a MultiIndex for a pipeline output.

    Parameters
    ----------
    dates : pd.DatetimeIndex
        Row labels for ``mask``.
    assets : pd.Index
        Column labels for ``mask``.
    mask : np.ndarray[bool]
        Mask array indicating date/asset pairs that should be included in
        output index.

    Returns
    -------
    index : pd.MultiIndex
        MultiIndex containing (date, asset) pairs corresponding to ``True``
        values in ``mask``.
    """
    n_dates = len(dates)
    n_assets = len(assets)
    # Broadcast positional labels across the full (dates x assets) grid,
    # then keep only the positions where ``mask`` is True.
    row_labels = repeat_last_axis(arange(n_dates), n_assets)[mask]
    col_labels = repeat_first_axis(arange(n_assets), n_dates)[mask]
    return MultiIndex(
        levels=[dates, assets],
        labels=[row_labels, col_labels],
        # TODO: We should probably add names for these.
        names=[None, None],
        verify_integrity=False,
    )
def test_deltas_macro(self):
    """
    Deltas against a macro (non-asset-keyed) baseline should replace
    the baseline values on the session after their original timestamps.
    """
    asset_info = asset_infos[0][0]
    expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
    # Drop the last baseline row so the deltas cover only the dates we
    # expect to be adjusted.
    deltas = bz.Data(
        self.macro_df.iloc[:-1],
        name='deltas',
        dshape=self.macro_dshape,
    )
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    n_assets = len(asset_info)
    raw_views = {
        '2014-01-02': np.array([10.0, 1.0]),
        '2014-01-03': np.array([11.0, 2.0]),
    }
    expected_views = {
        pd.Timestamp(day): repeat_last_axis(values, n_assets)
        for day, values in raw_views.items()
    }
    with tmp_asset_finder(equities=asset_info) as finder:
        all_assets = finder.retrieve_all(asset_info.index)
        expected_output = pd.DataFrame(
            [10] * n_assets + [11] * n_assets,
            index=pd.MultiIndex.from_product(
                (sorted(expected_views.keys()), all_assets),
            ),
            columns=('value',),
        )
        dates = self.dates
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
def test_deltas_macro(self):
    """
    Macro deltas (one value per date, broadcast to every asset) should
    overwrite the baseline values one day after their timestamps.
    """
    asset_info = asset_infos[0][0]
    expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
    # The deltas exclude the final baseline row; each remaining row is
    # shifted forward one day with its value bumped by 10.
    deltas = bz.transform(
        bz.Data(
            self.macro_df.iloc[:-1],
            name='deltas',
            dshape=self.macro_dshape,
        ),
        value=bz.Data(
            self.macro_df.iloc[:-1],
            name='deltas',
            dshape=self.macro_dshape,
        ).value + 10,
        timestamp=bz.Data(
            self.macro_df.iloc[:-1],
            name='deltas',
            dshape=self.macro_dshape,
        ).timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
        '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
    })
    with tmp_asset_finder(equities=asset_info) as finder:
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=pd.MultiIndex.from_product((
                sorted(expected_views.keys()),
                finder.retrieve_all(asset_info.index),
            )),
            columns=('value',),
        )
        dates = self.dates
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=dates,
            start=dates[1],
            end=dates[-1],
            window_length=2,
            compute_fn=np.nanmax,
        )
def _to_narrow(self, data, mask, dates, assets):
    """
    Convert raw computed pipeline results into a DataFrame for public APIs.

    Parameters
    ----------
    data : dict[str -> ndarray[ndim=2]]
        Dict mapping column names to computed results.
    mask : ndarray[bool, ndim=2]
        Mask array of values to keep.
    dates : ndarray[datetime64, ndim=1]
        Row index for arrays `data` and `mask`
    assets : ndarray[int64, ndim=2]
        Column index for arrays `data` and `mask`

    Returns
    -------
    results : pd.DataFrame
        Indexed by a (date, asset) MultiIndex containing one entry per
        ``True`` value in ``mask``, with one column per key of ``data``;
        ``result.loc[(date, asset), colname]`` holds
        ``data[colname][date, asset]`` wherever the mask is set.
    """
    if not mask.any():
        # pandas cannot tz_localize an empty frame with a MultiIndex, so
        # build the empty result by hand.  This also skips masking every
        # array with a mask known to keep nothing.
        # ``dates[:0]`` preserves the pandas metadata of the date index.
        empty_columns = {
            name: array([], dtype=values.dtype)
            for name, values in iteritems(data)
        }
        return DataFrame(
            data=empty_columns,
            index=MultiIndex.from_arrays(
                [dates[:0], array([], dtype=object)],
            ),
        )

    n_dates = len(dates)
    n_assets = len(assets)
    assets_resolved = array(self._finder.retrieve_all(assets))
    # Broadcast dates across columns and assets across rows, keeping only
    # the entries that survive the mask.
    kept_dates = repeat_last_axis(dates.values, n_assets)[mask]
    kept_assets = repeat_first_axis(assets_resolved, n_dates)[mask]
    masked_columns = {
        name: values[mask] for name, values in iteritems(data)
    }
    return DataFrame(
        data=masked_columns,
        index=MultiIndex.from_arrays([kept_dates, kept_assets]),
    ).tz_localize('UTC', level=0)
def compute(self, today, assets, out, close):
    """
    Fill ``out`` with a per-asset linear-trend statistic of ``close``
    over the lookback window.

    Parameters
    ----------
    today : the session being computed (unused in the calculation)
    assets : column labels for ``close``; only its length is used
    out : 1d output buffer, one slot per asset
    close : 2d array of closes, shape (window_length, len(assets))
    """
    # Deviations of the time axis from the window midpoint, negated and
    # broadcast to one column per asset.
    # NOTE(review): assumes self._x is the 0..window_length-1 time index
    # and self._x_var its variance -- confirm against __init__.
    x_matrix = repeat_last_axis(
        (self.window_length - 1) / 2 - self._x,
        len(assets),
    )
    # Demean the closes per asset, ignoring NaNs when taking the mean.
    y_bar = np.nanmean(close, axis=0)
    y_bars = repeat_first_axis(y_bar, self.window_length)
    y_matrix = close - y_bars
    # Cross-product of deviations scaled by the x variance and the window
    # length; the leading ``-`` compensates for x_matrix holding
    # (midpoint - x) rather than (x - midpoint).  ``preprocess`` is
    # applied to the result before storing -- its semantics are defined
    # elsewhere in the project.
    out[:] = preprocess(-np.divide(
        (x_matrix * y_matrix).sum(axis=0) / self._x_var,
        self.window_length,
    ))
def _to_narrow(self, data, mask, dates, assets):
    """
    Convert raw computed pipeline results into a DataFrame for public APIs.

    Parameters
    ----------
    data : dict[str -> ndarray[ndim=2]]
        Dict mapping column names to computed results.
    mask : ndarray[bool, ndim=2]
        Mask array of values to keep.
    dates : ndarray[datetime64, ndim=1]
        Row index for arrays `data` and `mask`
    assets : ndarray[int64, ndim=2]
        Column index for arrays `data` and `mask`

    Returns
    -------
    results : pd.DataFrame
        The indices of `results` are as follows:

        index : two-tiered MultiIndex of (date, asset).
            Contains an entry for each (date, asset) pair corresponding to
            a `True` value in `mask`.
        columns : Index of str
            One column per entry in `data`.

    If mask[date, asset] is True, then result.loc[(date, asset), colname]
    will contain the value of data[colname][date, asset].
    """
    if not mask.any():
        # Manually handle the empty DataFrame case.  Without this branch,
        # pandas fails to tz_localize an empty dataframe with a
        # MultiIndex.  It also saves us the work of applying a known-empty
        # mask to each array.
        #
        # Slicing `dates` here to preserve pandas metadata.
        empty_dates = dates[:0]
        empty_assets = array([], dtype=object)
        return DataFrame(
            data={
                name: array([], dtype=arr.dtype)
                for name, arr in iteritems(data)
            },
            index=MultiIndex.from_arrays([empty_dates, empty_assets]),
        )

    resolved_assets = array(self._finder.retrieve_all(assets))
    # Broadcast dates down the columns and assets across the rows, then
    # keep only the entries selected by the mask.
    dates_kept = repeat_last_axis(dates.values, len(assets))[mask]
    assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask]
    return DataFrame(
        data={name: arr[mask] for name, arr in iteritems(data)},
        index=MultiIndex.from_arrays([dates_kept, assets_kept]),
    ).tz_localize('UTC', level=0)
def test_novel_deltas_macro(self):
    """
    When the trading calendar skips sessions, deltas whose timestamps
    fall on the skipped days should show up on the next session.
    """
    asset_info = asset_infos[0][0]
    base_dates = pd.DatetimeIndex(
        [pd.Timestamp('2014-01-01'), pd.Timestamp('2014-01-04')],
    )
    baseline = pd.DataFrame({
        'value': (0, 1),
        'asof_date': base_dates,
        'timestamp': base_dates,
    })
    expr = bz.Data(baseline, name='expr', dshape=self.macro_dshape)
    deltas = bz.Data(baseline, name='deltas', dshape=self.macro_dshape)
    # Each delta lands one day after its baseline row with +10 added so
    # adjusted values are easy to recognize in the expected views.
    deltas = bz.transform(
        deltas,
        value=deltas.value + 10,
        timestamp=deltas.timestamp + timedelta(days=1),
    )
    nassets = len(asset_info)
    expected_views = keymap(pd.Timestamp, {
        '2014-01-03': repeat_last_axis(
            np.array([10.0, 10.0, 10.0]),
            nassets,
        ),
        '2014-01-06': repeat_last_axis(
            np.array([10.0, 10.0, 11.0]),
            nassets,
        ),
    })
    cal = pd.DatetimeIndex([
        pd.Timestamp('2014-01-01'),
        pd.Timestamp('2014-01-02'),
        pd.Timestamp('2014-01-03'),
        # omitting the 4th and 5th to simulate a weekend
        pd.Timestamp('2014-01-06'),
    ])
    with tmp_asset_finder(equities=asset_info) as finder:
        output_index = pd.MultiIndex.from_product((
            sorted(expected_views.keys()),
            finder.retrieve_all(asset_info.index),
        ))
        expected_output = pd.DataFrame(
            list(concatv([10] * nassets, [11] * nassets)),
            index=output_index,
            columns=('value',),
        )
        self._run_pipeline(
            expr,
            deltas,
            expected_views,
            expected_output,
            finder,
            calendar=cal,
            start=cal[2],
            end=cal[-1],
            window_length=3,
            compute_fn=op.itemgetter(-1),
        )