def test_bad_input(self):
    """Check that a single-column loader rejects mismatched column requests.

    The loader is built for ``USEquityPricing.close`` only, so requesting a
    different column, or more than one column, is expected to raise
    ``ValueError``.
    """
    baseline = DataFrame(
        arange(100).reshape(self.ndates, self.nsids),
        index=self.dates,
        columns=self.sids,
    )
    close_loader = DataFrameLoader(USEquityPricing.close, baseline)

    rejected_requests = (
        # Wrong column.
        [USEquityPricing.open],
        # Too many columns.
        [USEquityPricing.open, USEquityPricing.close],
    )
    for requested_columns in rejected_requests:
        with self.assertRaises(ValueError):
            close_loader.load_adjusted_array(
                US_EQUITIES,
                requested_columns,
                self.dates,
                self.sids,
                self.mask,
            )
def init_class_fixtures(cls):
    """Build the shared dates, mock raw data and pipeline engine.

    ``cls.raw_data`` is a (dates x assets) frame of consecutive floats. It
    backs the ``close`` loader directly and, doubled, the ``volume`` loader.
    """
    super(ParameterizedFactorTestCase, cls).init_class_fixtures()

    cls.dates = date_range(
        '2015-02-01',
        '2015-02-28',
        freq=cls.env.trading_day,
        tz='UTC',
    )

    sids = cls.sids
    n_dates, n_sids = len(cls.dates), len(sids)
    values = arange(n_dates * n_sids, dtype=float).reshape(n_dates, n_sids)
    cls.raw_data = DataFrame(
        data=values,
        index=cls.dates,
        columns=cls.asset_finder.retrieve_all(sids),
    )

    loaders = {
        USEquityPricing.close: DataFrameLoader(
            USEquityPricing.close,
            cls.raw_data,
        ),
        USEquityPricing.volume: DataFrameLoader(
            USEquityPricing.volume,
            cls.raw_data * 2,
        ),
    }
    cls.engine = SimplePipelineEngine(
        loaders.__getitem__,
        cls.dates,
        cls.asset_finder,
    )
def _load_events(self, name_map, indexer, columns, dates, sids, mask):
    """Load each requested column by slotting event values into a grid.

    ``indexer`` must have shape ``(len(dates), len(sids))``. Each entry is
    the position of the event to use for that date/sid, or a negative value
    where no event is known. Returns a dict mapping each column to the
    result produced by a ``DataFrameLoader`` for that column.
    """
    grid_shape = (len(dates), len(sids))
    assert indexer.shape == grid_shape

    loaded = {}
    for column in columns:
        # Array holding the value for `column` for every event we have.
        event_values = self.events[name_map[column]]
        if len(event_values):
            # Slot event values into sid/date locations. The result has the
            # same 2D shape as `indexer`.
            grid = event_values[indexer]
            # Negative indexer entries mark locations with no known value.
            # Overwrite those with the column's missing value.
            grid[indexer < 0] = column.missing_value
        else:
            # No events at all: fill every date/sid with the missing value.
            # This is special-cased because the normal branch indexes with
            # -1 for missing values, which fails on an empty array.
            grid = np.full(
                grid_shape,
                column.missing_value,
                dtype=column.dtype,
            )

        # Delegate the actual array formatting logic to a DataFrameLoader.
        frame = pd.DataFrame(grid, index=dates, columns=sids)
        frame_loader = DataFrameLoader(column, frame, adjustments=None)
        loaded[column] = frame_loader.load_adjusted_array(
            [column], dates, sids, mask,
        )[column]
    return loaded
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """Load every requested column through a per-column DataFrameLoader.

    For each column, a frame is read from ``self.fundamentals_reader`` by
    column name and wrapped in a ``DataFrameLoader``. The per-column results
    are merged into one dict, which is returned.
    """
    loaded = {}
    for col in columns:
        frame = self.fundamentals_reader.read(col.name, dates, sids)
        per_column = DataFrameLoader(col, frame).load_adjusted_array(
            domain, [col], dates, sids, mask,
        )
        loaded.update(per_column)
    return loaded
def _load_events(self, name_map, indexer, columns, dates, sids, mask):
    """Load each requested column by slotting event values into a grid.

    ``indexer`` holds, for every date/sid location, the position of the
    event to use, or a negative value where no event is known. Returns a
    dict mapping each column to the result produced by a
    ``DataFrameLoader`` for that column.

    When there are no events at all for a column, every location is filled
    with ``c.missing_value``. Without this guard, indexing a zero-length
    array with the ``-1`` sentinel raises ``IndexError``.
    """
    # Local import: the module's top-level imports are not visible from
    # this block, and only `pd` is known to be in scope.
    import numpy as np

    def to_frame(array):
        return pd.DataFrame(array, index=dates, columns=sids)

    out = {}
    for c in columns:
        # Array holding the value for column `c` for every event we have.
        col_array = self.events[name_map[c]]
        if not len(col_array):
            # No events: nothing to index into, so build the all-missing
            # grid directly, with the same shape indexing would produce.
            raw = np.full(np.shape(indexer), c.missing_value, dtype=c.dtype)
        else:
            raw = col_array[indexer]
            # indexer will be -1 for locations where we don't have a known
            # value.
            raw[indexer < 0] = c.missing_value

        # Delegate the actual array formatting logic to a DataFrameLoader.
        loader = DataFrameLoader(c, to_frame(raw), adjustments=None)
        out[c] = loader.load_adjusted_array([c], dates, sids, mask)[c]
    return out
def test_baseline(self):
    """Check that windows over the loaded array match the baseline frame.

    Loads the first ten dates and sids 1-2 of ``close``, then compares each
    length-3 window of the adjusted array with the matching baseline rows.
    """
    baseline = DataFrame(
        arange(100).reshape(self.ndates, self.nsids),
        index=self.dates,
        columns=self.sids,
    )
    loader = DataFrameLoader(USEquityPricing.close, baseline)

    dates_slice = slice(None, 10, None)
    sids_slice = slice(1, 3, None)
    loaded = loader.load_adjusted_array(
        [USEquityPricing.close],
        self.dates[dates_slice],
        self.sids[sids_slice],
        self.mask[dates_slice, sids_slice],
    )
    # Exactly one column was requested, so exactly one array is expected.
    [adj_array] = loaded.values()

    expected_block = baseline.values[dates_slice, sids_slice]
    for offset, window in enumerate(adj_array.traverse(window_length=3)):
        assert_array_equal(window, expected_block[offset:offset + 3])
def test_baseline(self):
    """Check that windows over the loaded array match the baseline frame.

    Loads the first ten dates and sids 1-2 of ``close``, then compares each
    length-3 window of the adjusted array with the matching baseline rows.
    """
    baseline = DataFrame(
        arange(100).reshape(self.ndates, self.nsids),
        index=self.dates,
        columns=self.sids,
    )
    loader = DataFrameLoader(USEquityPricing.close, baseline)

    dates_slice = slice(None, 10, None)
    sids_slice = slice(1, 3, None)
    # NOTE(review): the result is unpacked directly, so this relies on
    # load_adjusted_array returning a one-element sequence of arrays --
    # confirm against the loader version this test targets.
    loaded = loader.load_adjusted_array(
        [USEquityPricing.close],
        self.dates[dates_slice],
        self.sids[sids_slice],
        self.mask[dates_slice, sids_slice],
    )
    [adj_array] = loaded

    expected_block = baseline.values[dates_slice, sids_slice]
    for offset, window in enumerate(adj_array.traverse(window_length=3)):
        assert_array_equal(window, expected_block[offset:offset + 3])
def _load_events(self, name_map, indexer, domain, columns, dates, sids, mask):
    """Load each requested column by slotting event values into a grid.

    ``indexer`` must have shape ``(len(dates), len(sids))``. Each entry is
    the position of the event to use for that date/sid, or a negative value
    where no event is known. ``domain`` is forwarded unchanged to the
    per-column ``DataFrameLoader``. Returns a dict keyed by column.
    """
    n_dates, n_sids = len(dates), len(sids)
    assert indexer.shape == (n_dates, n_sids)

    def values_for(column):
        # Array holding the value for `column` for every event we have.
        event_values = self.events[name_map[column]]
        if not len(event_values):
            # We don't have **any** events, so use the column's missing
            # value on every day for every sid. This is special-cased
            # because the normal path depends on indexing with -1 for
            # missing values, which fails if there are no events at all.
            return np.full(
                (n_dates, n_sids),
                column.missing_value,
                dtype=column.dtype,
            )
        # Slot event values into sid/date locations using `indexer`. The
        # result is a 2D array of the same shape as `indexer`.
        slotted = event_values[indexer]
        # Negative indexer entries mark locations without a known value.
        slotted[indexer < 0] = column.missing_value
        return slotted

    results = {}
    for column in columns:
        frame = pd.DataFrame(values_for(column), index=dates, columns=sids)
        # Delegate the actual array formatting logic to a DataFrameLoader.
        frame_loader = DataFrameLoader(column, frame, adjustments=None)
        per_column = frame_loader.load_adjusted_array(
            domain,
            [column],
            dates,
            sids,
            mask,
        )
        results[column] = per_column[column]
    return results
def init_class_fixtures(cls):
    """Set up dates, assets, mock close data, masks and expected results.

    Stores on the class: the date index, the pipeline start/end dates, the
    asset list, a mock raw-data frame, a bound ``run_pipeline`` callable,
    and two mask terms together with their expected boolean results.
    """
    super(StatisticalBuiltInsTestCase, cls).init_class_fixtures()

    dates = date_range(
        '2015-02-01',
        '2015-02-28',
        freq=cls.trading_calendar.day,
        tz='UTC',
    )
    cls.dates = dates

    # Using these start and end dates because they are a contiguous span of
    # 5 days (Monday - Friday) and they allow for plenty of days to look
    # back on when computing correlations and regressions.
    start_date_index, end_date_index = 14, 18
    cls.start_date_index = start_date_index
    cls.end_date_index = end_date_index
    cls.pipeline_start_date = dates[start_date_index]
    cls.pipeline_end_date = dates[end_date_index]
    num_days = end_date_index - start_date_index + 1
    cls.num_days = num_days

    sids = cls.sids
    assets = cls.asset_finder.retrieve_all(sids)
    cls.assets = assets
    my_asset_column = 0
    cls.my_asset_column = my_asset_column
    cls.my_asset = assets[my_asset_column]
    num_assets = len(assets)
    cls.num_assets = num_assets

    n_dates, n_sids = len(dates), len(sids)
    raw_data = DataFrame(
        data=arange(n_dates * n_sids, dtype=float64_dtype).reshape(
            n_dates, n_sids,
        ),
        index=dates,
        columns=assets,
    )
    cls.raw_data = raw_data

    # Using mock 'close' data here because the correlation and regression
    # built-ins use USEquityPricing.close as the input to their `Returns`
    # factors. Since there is no way to change that when constructing an
    # instance of these built-ins, we need to test with mock 'close' data
    # to most accurately reflect their true behavior and results.
    loaders = {
        USEquityPricing.close: DataFrameLoader(
            USEquityPricing.close,
            raw_data,
        ),
    }
    engine = SimplePipelineEngine(
        loaders.__getitem__,
        dates,
        cls.asset_finder,
    )
    cls.run_pipeline = engine.run_pipeline

    result_shape = (num_days, num_assets)

    cascade_bound = sids[-1] + dates[start_date_index].day
    cls.cascading_mask = AssetIDPlusDay() < cascade_bound
    cls.expected_cascading_mask_result = make_cascading_boolean_array(
        shape=result_shape,
    )

    cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
    cls.expected_alternating_mask_result = make_alternating_boolean_array(
        shape=result_shape,
    )

    cls.expected_no_mask_result = full(
        shape=result_shape,
        fill_value=True,
        dtype=bool_dtype,
    )
def setUpClass(cls):
    """Create a trading environment, three equities, mock data and an engine.

    Writes simple equity info for sids 1-3 into the environment, builds a
    (dates x assets) frame of consecutive floats, and wires ``close`` and
    ``volume`` loaders into a ``SimplePipelineEngine``.
    """
    env = TradingEnvironment()
    cls.env = env

    sids = Int64Index([1, 2, 3])
    cls.sids = sids
    dates = date_range(
        '2015-02-01',
        '2015-02-28',
        freq=env.trading_day,
        tz='UTC',
    )
    cls.dates = dates

    equity_info = make_simple_equity_info(
        sids,
        start_date=Timestamp('2015-01-31', tz='UTC'),
        end_date=Timestamp('2015-03-01', tz='UTC'),
    )
    env.write_data(equities_df=equity_info)
    cls.asset_finder = env.asset_finder

    n_dates, n_sids = len(dates), len(sids)
    cls.raw_data = DataFrame(
        data=arange(n_dates * n_sids, dtype=float).reshape(n_dates, n_sids),
        index=dates,
        columns=cls.asset_finder.retrieve_all(sids),
    )

    loaders = {
        USEquityPricing.close: DataFrameLoader(
            USEquityPricing.close,
            cls.raw_data,
        ),
        USEquityPricing.volume: DataFrameLoader(
            USEquityPricing.volume,
            cls.raw_data * 2,
        ),
    }
    cls.engine = SimplePipelineEngine(
        loaders.__getitem__,
        cls.dates,
        cls.asset_finder,
    )
def init_instance_fixtures(self):
    """Build split-adjusted close/volume views and their pipeline loaders.

    ``adj_closes`` and ``adj_volumes`` are copies of the raw frames in which
    the split asset's rows up to ``split_date`` are scaled by
    ``split_ratio``. The loaders wrap the *unadjusted* frames together with
    ``self.adjustments``.
    """
    super(ClosesAndVolumes, self).init_instance_fixtures()

    # NOTE(review): `.ix` was deprecated in pandas 0.20 and removed in 1.0.
    # It is kept here to preserve behaviour -- confirm the pinned pandas
    # version before upgrading.
    up_to_split = slice(None, self.split_date)

    # View of the data on/after the split.
    adjusted_closes = self.closes.copy()
    adjusted_closes.ix[up_to_split, self.split_asset] *= self.split_ratio
    self.adj_closes = adjusted_closes

    adjusted_volumes = self.volumes.copy()
    adjusted_volumes.ix[up_to_split, self.split_asset] *= self.split_ratio
    self.adj_volumes = adjusted_volumes

    self.pipeline_close_loader = DataFrameLoader(
        column=USEquityPricing.close,
        baseline=self.closes,
        adjustments=self.adjustments,
    )
    self.pipeline_volume_loader = DataFrameLoader(
        column=USEquityPricing.volume,
        baseline=self.volumes,
        adjustments=self.adjustments,
    )
def test_compute_with_adjustments(self):
    """Check that adjusted and unadjusted loaders give matching averages.

    ``low`` is loaded from a flat baseline with no adjustments. ``high`` is
    loaded from a baseline that has the inverse of three multiplicative
    adjustments pre-applied, together with those adjustments. Moving
    averages are computed for window lengths 1-3 over every start/stop pair
    and compared against slices of the corresponding baseline frames.
    """
    dates, assets = self.dates, self.assets
    low, high = USEquityPricing.low, USEquityPricing.high

    # (row index at which the adjustment is applied, multiplicative ratio)
    splits = [(3, 2.0), (10, 3.0), (16, 5.0)]

    adjustments = DataFrame.from_records([
        dict(
            kind=MULTIPLY,
            sid=assets[1],
            value=ratio,
            start_date=None,
            # Each adjustment ends the day before it is applied.
            end_date=dates[apply_idx - 1],
            apply_date=dates[apply_idx],
        )
        for apply_idx, ratio in splits
    ])

    low_base = DataFrame(self.make_frame(30.0))
    low_loader = DataFrameLoader(low, low_base.copy(), adjustments=None)

    # Pre-apply inverse of adjustments to the baseline.
    high_base = DataFrame(self.make_frame(30.0))
    for apply_idx, ratio in splits:
        high_base.iloc[:apply_idx, 1] /= ratio
    high_loader = DataFrameLoader(high, high_base, adjustments)

    loaders = {low: low_loader, high: high_loader}
    engine = SimplePipelineEngine(
        loaders.__getitem__,
        self.dates,
        self.asset_finder,
    )

    expected_frames = (('low', low_base), ('high', high_base))
    for window_length in range(1, 4):
        low_mavg = SimpleMovingAverage(
            inputs=[USEquityPricing.low],
            window_length=window_length,
        )
        high_mavg = SimpleMovingAverage(
            inputs=[USEquityPricing.high],
            window_length=window_length,
        )
        bounds = product_upper_triangle(range(window_length, len(dates)))
        for start, stop in bounds:
            pipeline = Pipeline(columns={'low': low_mavg, 'high': high_mavg})
            results = engine.run_pipeline(pipeline, dates[start], dates[stop])
            self.assertEqual(set(results.columns), {'low', 'high'})

            # +1 to include end date
            iloc_bounds = slice(start, stop + 1)
            for label, base in expected_frames:
                assert_frame_equal(
                    results.unstack()[label],
                    base.iloc[iloc_bounds],
                )
def setUp(self):
    """Create three short-lived equities, baseline closes and a split.

    Asset ``A`` receives a 0.5 split on its second trading date.
    ``adj_closes`` holds the split-adjusted view, and ``pipeline_loader``
    serves the unadjusted closes together with the adjustment.
    """
    env = trading.TradingEnvironment()
    self.env = env
    self.dates = date_range(
        '2014-01-01',
        '2014-02-01',
        freq=trading_day,
        tz='UTC',
    )

    # (sid, symbol, index of first date); each asset ends 3 dates later.
    asset_specs = ((1, 'A', 10), (2, 'B', 11), (3, 'C', 12))
    asset_info = DataFrame.from_records([
        {
            'sid': asset_sid,
            'symbol': symbol,
            'asset_type': 'equity',
            'start_date': self.dates[first],
            'end_date': self.dates[first + 3],
            'exchange': 'TEST',
        }
        for asset_sid, symbol, first in asset_specs
    ])
    self.first_asset_start = min(asset_info.start_date)
    self.last_asset_end = max(asset_info.end_date)
    env.write_data(equities_df=asset_info)
    finder = env.asset_finder
    self.asset_finder = finder

    sids = (1, 2, 3)
    self.assets = finder.retrieve_all(sids)

    # View of the baseline data.
    day_numbers = arange(1, len(self.dates) + 1)
    self.closes = DataFrame(
        {sid: day_numbers * sid for sid in sids},
        index=self.dates,
        dtype=float,
    )

    # Add a split for 'A' on its second date.
    self.split_asset = self.assets[0]
    self.split_date = self.split_asset.start_date + trading_day
    self.split_ratio = 0.5
    split_record = {
        'sid': self.split_asset.sid,
        'value': self.split_ratio,
        'kind': MULTIPLY,
        'start_date': Timestamp('NaT'),
        'end_date': self.split_date,
        'apply_date': self.split_date,
    }
    self.adjustments = DataFrame.from_records([split_record])

    # View of the data on/after the split.
    # NOTE(review): `.ix` was deprecated in pandas 0.20 and removed in 1.0.
    # It is kept here to preserve behaviour -- confirm the pinned pandas
    # version before upgrading.
    adjusted = self.closes.copy()
    adjusted.ix[:self.split_date, self.split_asset] *= self.split_ratio
    self.adj_closes = adjusted

    self.pipeline_loader = DataFrameLoader(
        column=USEquityPricing.close,
        baseline=self.closes,
        adjustments=self.adjustments,
    )
def test_adjustments(self):
    """Check adjustment formatting and what is handed to ``AdjustedArray``.

    Builds a loader with four adjustments that overlap the requested
    dates/sids and five that do not. Verifies that ``format_adjustments``
    keeps only the relevant ones, and that the patched ``AdjustedArray`` is
    called once with the expected data, mask and adjustments.
    """
    def adjustment(sid, start_date, end_date, apply_date, value, kind):
        return {
            'sid': sid,
            'start_date': start_date,
            'end_date': end_date,
            'apply_date': apply_date,
            'value': value,
            'kind': kind,
        }

    dates = self.dates
    baseline = DataFrame(
        arange(100).reshape(self.ndates, self.nsids),
        index=dates,
        columns=self.sids,
    )

    # Use the dates from index 10 on and sids 1-3.
    dates_slice = slice(10, None, None)
    sids_slice = slice(1, 4, None)

    # Adjustments that should actually affect the output.
    relevant_adjustments = [
        adjustment(1, None, dates[15], dates[16], 0.5, MULTIPLY),
        adjustment(2, dates[5], dates[15], dates[16], 1.0, ADD),
        adjustment(2, dates[15], dates[16], dates[17], 1.0, ADD),
        adjustment(3, dates[16], dates[17], dates[18], 99.0, OVERWRITE),
    ]

    # These adjustments shouldn't affect the output.
    irrelevant_adjustments = [
        # Sid Not Requested
        adjustment(0, dates[16], dates[17], dates[18], -9999.0, OVERWRITE),
        # Sid Unknown
        adjustment(9999, dates[16], dates[17], dates[18], -9999.0, OVERWRITE),
        # Date Not Requested
        adjustment(2, dates[1], dates[2], dates[3], -9999.0, OVERWRITE),
        # Date Before Known Data
        adjustment(
            2,
            dates[0] - (2 * trading_day),
            dates[0] - trading_day,
            dates[0] - trading_day,
            -9999.0,
            OVERWRITE,
        ),
        # Date After Known Data
        adjustment(
            2,
            dates[-1] + trading_day,
            dates[-1] + (2 * trading_day),
            dates[-1] + (3 * trading_day),
            -9999.0,
            OVERWRITE,
        ),
    ]

    loader = DataFrameLoader(
        USEquityPricing.close,
        baseline,
        adjustments=DataFrame(relevant_adjustments + irrelevant_adjustments),
    )

    expected_baseline = baseline.iloc[dates_slice, sids_slice]
    requested_dates = dates[dates_slice]
    requested_sids = self.sids[sids_slice]

    formatted_adjustments = loader.format_adjustments(
        requested_dates,
        requested_sids,
    )
    expected_formatted_adjustments = {
        6: [
            Float64Multiply(
                first_row=0, last_row=5, first_col=0, last_col=0, value=0.5,
            ),
            Float64Add(
                first_row=0, last_row=5, first_col=1, last_col=1, value=1.0,
            ),
        ],
        7: [
            Float64Add(
                first_row=5, last_row=6, first_col=1, last_col=1, value=1.0,
            ),
        ],
        8: [
            Float64Overwrite(
                first_row=6, last_row=7, first_col=2, last_col=2, value=99.0,
            ),
        ],
    }
    self.assertEqual(formatted_adjustments, expected_formatted_adjustments)

    mask = self.mask[dates_slice, sids_slice]
    with patch('zipline.pipeline.loaders.frame.AdjustedArray') as m:
        loader.load_adjusted_array(
            columns=[USEquityPricing.close],
            dates=requested_dates,
            assets=requested_sids,
            mask=mask,
        )

    self.assertEqual(m.call_count, 1)

    _, kwargs = m.call_args
    assert_array_equal(kwargs['data'], expected_baseline.values)
    assert_array_equal(kwargs['mask'], mask)
    self.assertEqual(kwargs['adjustments'], expected_formatted_adjustments)
def setUp(self):
    """Create three equities, baseline closes, a data portal and a split.

    The closes are written to a bcolz daily-bar file under
    ``self.tempdir``, with every OHLCV field set to the close values, and
    exposed through ``self.data_portal``. Asset ``A`` receives a 0.5 split
    on its second trading date.
    """
    env = trading.TradingEnvironment()
    self.env = env
    self.dates = date_range(
        '2014-01-01',
        '2014-02-01',
        freq=trading_day,
        tz='UTC',
    )

    # (sid, symbol, index of first date); each asset ends 3 dates later.
    asset_specs = ((1, 'A', 10), (2, 'B', 11), (3, 'C', 12))
    asset_info = DataFrame.from_records([
        {
            'sid': asset_sid,
            'symbol': symbol,
            'start_date': self.dates[first],
            'end_date': self.dates[first + 3],
            'exchange': 'TEST',
        }
        for asset_sid, symbol, first in asset_specs
    ])
    self.first_asset_start = min(asset_info.start_date)
    self.last_asset_end = max(asset_info.end_date)
    env.write_data(equities_df=asset_info)
    finder = env.asset_finder
    self.asset_finder = finder

    sids = (1, 2, 3)
    self.assets = finder.retrieve_all(sids)

    # View of the baseline data.
    day_numbers = arange(1, len(self.dates) + 1)
    self.closes = DataFrame(
        {sid: day_numbers * sid for sid in sids},
        index=self.dates,
        dtype=float,
    )

    # Create a data portal holding the data in self.closes
    price_fields = ("open", "high", "low", "close", "volume")
    data = {}
    for sid in sids:
        bar_columns = {
            field: self.closes[sid].values for field in price_fields
        }
        bar_columns["day"] = [day.value for day in self.dates]
        data[sid] = DataFrame(bar_columns)

    path = os.path.join(self.tempdir.path, "testdaily.bcolz")
    DailyBarWriterFromDataFrames(data).write(path, self.dates, data)

    self.data_portal = DataPortal(
        self.env,
        equity_daily_reader=BcolzDailyBarReader(path),
    )

    # Add a split for 'A' on its second date.
    self.split_asset = self.assets[0]
    self.split_date = self.split_asset.start_date + trading_day
    self.split_ratio = 0.5
    split_record = {
        'sid': self.split_asset.sid,
        'value': self.split_ratio,
        'kind': MULTIPLY,
        'start_date': Timestamp('NaT'),
        'end_date': self.split_date,
        'apply_date': self.split_date,
    }
    self.adjustments = DataFrame.from_records([split_record])

    # View of the data on/after the split.
    # NOTE(review): `.ix` was deprecated in pandas 0.20 and removed in 1.0.
    # It is kept here to preserve behaviour -- confirm the pinned pandas
    # version before upgrading.
    adjusted = self.closes.copy()
    adjusted.ix[:self.split_date, self.split_asset] *= self.split_ratio
    self.adj_closes = adjusted

    self.pipeline_loader = DataFrameLoader(
        column=USEquityPricing.close,
        baseline=self.closes,
        adjustments=self.adjustments,
    )
def initialize(context):
    """Run a one-day pipeline mixing bundle pricing with two custom columns.

    Builds in-memory frames for ``MyDataSet.column_A`` and
    ``MyDataSet.column_B``, serves them through ``DataFrameLoader``
    instances, runs a pipeline alongside ``USEquityPricing.close``, and
    prints the result sorted by price in descending order.
    """
    # NOTE(review): the custom frames cover 2018-01-01..2018-09-28 with a
    # tz-naive index, while the pipeline below runs on 2016-01-07 (UTC).
    # Confirm the custom columns are meant to have no data on that day.
    dates = pd.date_range('2018-01-01', '2018-09-28')

    finder = bundle_data.asset_finder
    sids = finder.sids
    assets = [sid(item) for item in sids]

    # Column A is a 2D array of consecutive numbers, 0 -> N-1.
    n_dates, n_assets = len(dates), len(assets)
    column_A_frame = pd.DataFrame(
        data=np.arange(n_dates * n_assets, dtype=float).reshape(
            n_dates, n_assets,
        ),
        index=dates,
        columns=sids,
    )

    # Column B is True for the first sid and False for the second.
    column_B_frame = pd.DataFrame(
        data={sids[0]: True, sids[1]: False},
        index=dates,
    )

    loaders = {
        MyDataSet.column_A: DataFrameLoader(
            MyDataSet.column_A,
            column_A_frame,
        ),
        MyDataSet.column_B: DataFrameLoader(
            MyDataSet.column_B,
            column_B_frame,
        ),
    }

    # Loader for pricing
    pricing_loader = USEquityPricingLoader(
        bundle_data.equity_daily_bar_reader,
        bundle_data.adjustment_reader,
    )

    def choose_loader(column):
        # Pricing columns come from the bundle; everything else must be
        # one of the custom columns registered above.
        if column in USEquityPricing.columns:
            return pricing_loader
        return loaders[column]

    # Set up pipeline engine
    engine = SimplePipelineEngine(
        get_loader=choose_loader,
        calendar=trading_calendar.all_sessions,
        asset_finder=bundle_data.asset_finder,
    )

    pipeline = Pipeline(
        columns={
            'price': USEquityPricing.close.latest,
            'col_A': MyDataSet.column_A.latest,
            'col_B': MyDataSet.column_B.latest
        },
        screen=StaticAssets(assets)
    )

    result = engine.run_pipeline(
        pipeline,
        pd.Timestamp('2016-01-07', tz='utc'),
        pd.Timestamp('2016-01-07', tz='utc')
    )
    result = result.sort_values(by=['price'], axis=0, ascending=False)
    print(result)
# In[10]:


class SignalData(DataSet):
    """Pipeline dataset exposing one float column of predictions."""

    predictions = Column(dtype=float)
    domain = US_EQUITIES


# ### Define Pipeline Loaders

# While the bundle’s OHLCV data can rely on the built-in `USEquityPricingLoader`, we need to define our own `zipline.pipeline.loaders.frame.DataFrameLoader`:

# In[11]:


# Maps the custom column to the loader that serves it from `predictions`.
signal_loader = {
    SignalData.predictions: DataFrameLoader(
        SignalData.predictions,
        predictions,
    ),
}


# In fact, we need to slightly modify the Zipline library’s source code to bypass the assumption that we will only load price data. To this end, we will add a `custom_loader` parameter to the `run_algorithm` and ensure that this loader is used when the `Pipeline` needs one of `SignalData`’s `Column` instances.

# ## Pipeline Setup

# Our Pipeline is going to have two Boolean columns that identify the assets we would like to trade as long and short positions.
#
# To get there, we first define a `CustomFactor` called `MLSignal` that just receives the current `SignalData.predictions`. The motivation is to allow us to use some of the convenient `Factor` methods designed to rank and filter securities.

# ### Custom ML Factor

# In[12]:
# Earlier attempts at building the frames, kept for reference:
#MarketCap_frame = df[['Date','marketcap', 'sid']].reset_index().set_index(['Date', 'sid'], append=True).sort_index().drop(columns=['index']).unstack().sort_index()
#MarketCap_frame.columns = MarketCap_frame.columns.droplevel()
#MarketCap_frame.index = pd.to_datetime(MarketCap_frame.index)
#MarketCap_frame.index = MarketCap_frame.index.tz_localize('UTC')
#MarketCap_frame = MarketCap_frame.sort_index().fillna(method='ffill')

#DE_frame = df[['Date','de', 'sid']]
#DE_frame.columns = DE_frame.columns.droplevel()
#DE_frame.index = pd.to_datetime(DE_frame.index)
#DE_frame.index = DE_frame.index.tz_localize('UTC')
#DE_frame = DE_frame.sort_index().fillna(method='ffill')

#MarketCap_frame = df[['marketcap', 'sid']].sort_index().fillna(method='ffill')
#DE_frame = df[['de', 'sid']].sort_index().fillna(method='ffill')

# Build a (Date x sid) frame of 'de' values: index by (Date, sid), drop the
# helper 'index' column that reset_index() adds, then pivot so each sid
# becomes a column. Duplicate (Date, sid) pairs are resolved with max.
DE_frame = (
    df[['Date', 'de', 'sid']]
    .reset_index()
    .set_index(['Date', 'sid'])
    .sort_index()
    .drop(columns=['index'])
)
DE_frame = DE_frame.pivot_table(
    values='de',
    index='Date',
    columns='sid',
    aggfunc='max',
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name='All',
)
# Forward-fill gaps. `.ffill()` replaces `fillna(method='ffill')`, which is
# deprecated in pandas 2.1+ and produces the same result.
DE_frame = DE_frame.sort_index().ffill()


class Fundamentals(DataSet):
    """Pipeline dataset with two float fundamentals columns."""

    DE = Column(dtype=float)
    MarketCap = Column(dtype=float)


# register the loaders
loaders[Fundamentals.DE] = DataFrameLoader(Fundamentals.DE, DE_frame)
# NOTE(review): every assignment to MarketCap_frame visible here is
# commented out -- confirm it is defined earlier in the file, otherwise the
# next line raises NameError.
loaders[Fundamentals.MarketCap] = DataFrameLoader(Fundamentals.MarketCap, MarketCap_frame)

df_loaders = loaders
# The pickle sits next to this module. pd.read_pickle must only be used on
# trusted files, since unpickling can execute arbitrary code.
df = pd.read_pickle(path.join(path.dirname(__file__), 'sharadar_with_sid.pkl'))


def _frame_by_sid(field):
    """Return ``df[field]`` pivoted to a (Date x sid) frame."""
    frame = (
        df[[field, 'sid']]
        .reset_index()
        .set_index(['Date', 'sid'])
        .unstack()
    )
    # unstack() leaves a two-level column index (field, sid); keep only sid.
    frame.columns = frame.columns.droplevel()
    return frame


MarketCap_frame = _frame_by_sid('MarketCap')
PriceToBook_frame = _frame_by_sid('P/B')
PriceToSales_frame = _frame_by_sid('P/S')
PriceToEarnings_frame = _frame_by_sid('P/E')


class Fundamentals(DataSet):
    """Pipeline dataset with four float fundamentals columns."""

    MarketCap = Column(dtype=float)
    PriceToBook = Column(dtype=float)
    PriceToSales = Column(dtype=float)
    PriceToEarnings = Column(dtype=float)


# register the loaders
loaders[Fundamentals.MarketCap] = DataFrameLoader(
    Fundamentals.MarketCap, MarketCap_frame,
)
loaders[Fundamentals.PriceToBook] = DataFrameLoader(
    Fundamentals.PriceToBook, PriceToBook_frame,
)
loaders[Fundamentals.PriceToSales] = DataFrameLoader(
    Fundamentals.PriceToSales, PriceToSales_frame,
)
loaders[Fundamentals.PriceToEarnings] = DataFrameLoader(
    Fundamentals.PriceToEarnings, PriceToEarnings_frame,
)
def test_adjustments(self):
    """Check adjustment formatting and what is handed to ``AdjustedArray``.

    Builds a loader with four adjustments that overlap the requested
    dates/sids and five that do not. Verifies that ``format_adjustments``
    keeps only the relevant ones, and that the patched ``AdjustedArray`` is
    called once with the expected data and adjustments.
    """
    def adjustment(sid, start_date, end_date, apply_date, value, kind):
        return {
            "sid": sid,
            "start_date": start_date,
            "end_date": end_date,
            "apply_date": apply_date,
            "value": value,
            "kind": kind,
        }

    dates = self.dates
    baseline = pd.DataFrame(
        np.arange(100).reshape(self.ndates, self.nsids),
        index=dates,
        columns=self.sids,
    )

    # Use the dates from index 10 on and sids 1-3.
    dates_slice = slice(10, None, None)
    sids_slice = slice(1, 4, None)

    # Adjustments that should actually affect the output.
    relevant_adjustments = [
        adjustment(1, None, dates[15], dates[16], 0.5, MULTIPLY),
        adjustment(2, dates[5], dates[15], dates[16], 1.0, ADD),
        adjustment(2, dates[15], dates[16], dates[17], 1.0, ADD),
        adjustment(3, dates[16], dates[17], dates[18], 99.0, OVERWRITE),
    ]

    # These adjustments shouldn't affect the output.
    irrelevant_adjustments = [
        # Sid Not Requested
        adjustment(0, dates[16], dates[17], dates[18], -9999.0, OVERWRITE),
        # Sid Unknown
        adjustment(9999, dates[16], dates[17], dates[18], -9999.0, OVERWRITE),
        # Date Not Requested
        adjustment(2, dates[1], dates[2], dates[3], -9999.0, OVERWRITE),
        # Date Before Known Data
        adjustment(
            2,
            dates[0] - (2 * self.trading_day),
            dates[0] - self.trading_day,
            dates[0] - self.trading_day,
            -9999.0,
            OVERWRITE,
        ),
        # Date After Known Data
        adjustment(
            2,
            dates[-1] + self.trading_day,
            dates[-1] + (2 * self.trading_day),
            dates[-1] + (3 * self.trading_day),
            -9999.0,
            OVERWRITE,
        ),
    ]

    loader = DataFrameLoader(
        USEquityPricing.close,
        baseline,
        adjustments=pd.DataFrame(
            relevant_adjustments + irrelevant_adjustments
        ),
    )

    expected_baseline = baseline.iloc[dates_slice, sids_slice]
    requested_dates = dates[dates_slice]
    requested_sids = self.sids[sids_slice]

    formatted_adjustments = loader.format_adjustments(
        requested_dates,
        requested_sids,
    )
    expected_formatted_adjustments = {
        6: [
            Float64Multiply(
                first_row=0, last_row=5, first_col=0, last_col=0, value=0.5,
            ),
            Float64Add(
                first_row=0, last_row=5, first_col=1, last_col=1, value=1.0,
            ),
        ],
        7: [
            Float64Add(
                first_row=5, last_row=6, first_col=1, last_col=1, value=1.0,
            ),
        ],
        8: [
            Float64Overwrite(
                first_row=6, last_row=7, first_col=2, last_col=2, value=99.0,
            ),
        ],
    }
    assert formatted_adjustments == expected_formatted_adjustments

    mask = self.mask[dates_slice, sids_slice]
    with patch("zipline.pipeline.loaders.frame.AdjustedArray") as m:
        loader.load_adjusted_array(
            US_EQUITIES,
            columns=[USEquityPricing.close],
            dates=requested_dates,
            sids=requested_sids,
            mask=mask,
        )

    assert m.call_count == 1

    _, kwargs = m.call_args
    assert_array_equal(kwargs["data"], expected_baseline.values)
    assert kwargs["adjustments"] == expected_formatted_adjustments
# NOTE(review): these two locals().update(...) calls presumably belong to
# the body of the `Fundamentals` class, whose header is outside this view
# -- confirm the enclosing scope and indentation before applying.
# One float Column per Sharadar fundamentals field ...
locals().update({
    name: Column(dtype=float) for name in sharadar_f1_top25
})
# ... and one object Column per Sharadar ticker field.
locals().update({
    name: Column(dtype=object) for name in sharadar_tickers
})

#class Fundamentals(DataSet):
#    for name in sharadar_f1_top25:
#        name = Column(dtype=float)

#class Fundamentals(DataSet):
    #DE = Column(dtype=float)
    #MarketCap = Column(dtype=float)
    #EUSD = Column(dtype=float)
    #DNC = Column(dtype=float)

# register the loaders
for names in (sharadar_f1_top25, sharadar_tickers):
    for name in names:
        column = Fundamentals.get_column(name)
        loaders[column] = DataFrameLoader(column, fundy_frames[name])

#loaders[Fundamentals.DE] = DataFrameLoader(Fundamentals.DE, DE_frame)
#loaders[Fundamentals.MarketCap] = DataFrameLoader(Fundamentals.MarketCap, MarketCap_frame)
#loaders[Fundamentals.EUSD] = DataFrameLoader(Fundamentals.EUSD, EUSD_frame)
#loaders[Fundamentals.DNC] = DataFrameLoader(Fundamentals.DNC, DNC_frame)

df_loaders = loaders
def test_adjustments(self):
    """Check adjustment formatting and what is handed to ``adjusted_array``.

    Builds a loader with four adjustments that overlap the requested
    dates/sids and five that do not. Verifies that ``format_adjustments``
    keeps only the relevant ones, and that the patched ``adjusted_array``
    is called once with the expected data, mask and adjustments.
    """
    def adjustment(sid, start_date, end_date, apply_date, value, kind):
        return {
            'sid': sid,
            'start_date': start_date,
            'end_date': end_date,
            'apply_date': apply_date,
            'value': value,
            'kind': kind,
        }

    dates = self.dates
    baseline = DataFrame(
        arange(100).reshape(self.ndates, self.nsids),
        index=dates,
        columns=self.sids,
    )

    # Use the dates from index 10 on and sids 1-3.
    dates_slice = slice(10, None, None)
    sids_slice = slice(1, 4, None)

    # Adjustments that should actually affect the output.
    relevant_adjustments = [
        adjustment(1, None, dates[15], dates[16], 0.5, MULTIPLY),
        adjustment(2, dates[5], dates[15], dates[16], 1.0, ADD),
        adjustment(2, dates[15], dates[16], dates[17], 1.0, ADD),
        adjustment(3, dates[16], dates[17], dates[18], 99.0, OVERWRITE),
    ]

    # These adjustments shouldn't affect the output.
    irrelevant_adjustments = [
        # Sid Not Requested
        adjustment(0, dates[16], dates[17], dates[18], -9999.0, OVERWRITE),
        # Sid Unknown
        adjustment(9999, dates[16], dates[17], dates[18], -9999.0, OVERWRITE),
        # Date Not Requested
        adjustment(2, dates[1], dates[2], dates[3], -9999.0, OVERWRITE),
        # Date Before Known Data
        adjustment(
            2,
            dates[0] - (2 * trading_day),
            dates[0] - trading_day,
            dates[0] - trading_day,
            -9999.0,
            OVERWRITE,
        ),
        # Date After Known Data
        adjustment(
            2,
            dates[-1] + trading_day,
            dates[-1] + (2 * trading_day),
            dates[-1] + (3 * trading_day),
            -9999.0,
            OVERWRITE,
        ),
    ]

    loader = DataFrameLoader(
        USEquityPricing.close,
        baseline,
        adjustments=DataFrame(relevant_adjustments + irrelevant_adjustments),
    )

    expected_baseline = baseline.iloc[dates_slice, sids_slice]
    requested_dates = dates[dates_slice]
    requested_sids = self.sids[sids_slice]

    formatted_adjustments = loader.format_adjustments(
        requested_dates,
        requested_sids,
    )
    expected_formatted_adjustments = {
        6: [
            Float64Multiply(
                first_row=0, last_row=5, first_col=0, last_col=0, value=0.5,
            ),
            Float64Add(
                first_row=0, last_row=5, first_col=1, last_col=1, value=1.0,
            ),
        ],
        7: [
            Float64Add(
                first_row=5, last_row=6, first_col=1, last_col=1, value=1.0,
            ),
        ],
        8: [
            Float64Overwrite(
                first_row=6, last_row=7, first_col=2, last_col=2, value=99.0,
            ),
        ],
    }
    self.assertEqual(formatted_adjustments, expected_formatted_adjustments)

    mask = self.mask[dates_slice, sids_slice]
    with patch('zipline.pipeline.loaders.frame.adjusted_array') as m:
        loader.load_adjusted_array(
            columns=[USEquityPricing.close],
            dates=requested_dates,
            assets=requested_sids,
            mask=mask,
        )

    self.assertEqual(m.call_count, 1)

    _, kwargs = m.call_args
    assert_array_equal(kwargs['data'], expected_baseline.values)
    assert_array_equal(kwargs['mask'], mask)
    self.assertEqual(kwargs['adjustments'], expected_formatted_adjustments)
def prepare_data(bundle_data):
    """
    Match fundamental data points to the asset objects of a data bundle.

    :param bundle_data: The data bundle that you ingested from SEP
    :return: A dictionary of loaders to be used within a data pipeline,
        and a DataSet class with the correct columns
    """
    # Names of the data points to use in the backtest. Each name must match
    # the name of the corresponding CSV file in processed_data/fundamentals.
    data_points = ['pe1', 'de', 'earnings_growth', 'marketcap']

    # Where the CSV files live.
    fundamentals_directory = '../processed_data/fundamentals/'
    pricing_directory = '../processed_data/pricing/daily/'

    # Ordered dict with the name of every security in the pricing directory.
    pricing_assets = helper_functions.get_pricing_securities(pricing_directory)

    # fundamental_assets: ordered dict with the name of every security in
    # the fundamentals directory. dates: the dates that directory is
    # indexed by.
    fundamental_assets, dates = helper_functions.get_dates(
        fundamentals_directory)

    # Securities present in both pricing_assets and fundamental_assets.
    tickers = helper_functions.get_tickers_in_both(pricing_assets,
                                                   fundamental_assets)

    date_stamps = helper_functions.convert_to_date_stamps(dates)

    # One dataframe per data point, keyed by the data point's name.
    data_frames = {
        point_name: helper_functions.make_frame(
            point_name, fundamentals_directory, tickers)
        for point_name in data_points
    }

    # Resolve the tickers to Asset objects and their sids. Only the first
    # frame's columns are used for the lookup, hence the `break`.
    for first_frame in data_frames.values():
        assets = bundle_data.asset_finder.lookup_symbols(
            list(first_frame.columns),
            as_of_date=None)
        sids = pd.Int64Index([asset.sid for asset in assets])
        break

    class MyDataSet(DataSet):
        """
        Dataset with one float Column per requested data point.

        The attributes have to be created inside the class body, before
        __new__() runs, so that MyDataSet converts the Column types into
        BoundColumn types.
        """
        for point in data_points:
            locals()[point] = Column(dtype=float)

    # Re-label every frame: the index becomes date_stamps instead of dates,
    # and the column names (currently strings) are replaced with sids.
    for frame in data_frames.values():
        frame.index = date_stamps
        frame.columns = sids

    # Map each BoundColumn of MyDataSet to a DataFrameLoader serving the
    # frame of the same name.
    data_frame_loaders = {}
    for attr, frame in data_frames.items():
        bound_column = getattr(MyDataSet, attr)
        data_frame_loaders[bound_column] = DataFrameLoader(bound_column, frame)

    return data_frame_loaders, MyDataSet