def test_inspect(self):
    """Compare ``AdjustedArray.inspect()`` against a literal expected report."""
    data = np.arange(15, dtype=float).reshape(5, 3)
    adj_array = AdjustedArray(
        data,
        {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
        float("nan"),
    )
    # TODO: CHECK WHY DO I NEED TO FIX THE INDENT IN THE EXPECTED?
    # NOTE(review): the blank lines and column padding inside this literal
    # were reconstructed from a whitespace-collapsed copy of the test --
    # confirm against the repr produced by the numpy version in use.
    expected = dedent(
        """\
        Adjusted Array (float64):

        Data:
        array([[ 0.,  1.,  2.],
               [ 3.,  4.,  5.],
               [ 6.,  7.,  8.],
               [ 9., 10., 11.],
               [12., 13., 14.]])

        Adjustments:
        {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
        """
    )
    got = adj_array.inspect()
    # The comparison is exact, so any whitespace difference fails the test.
    assert expected == got
def test_inspect(self):
    """Compare ``AdjustedArray.inspect()`` against a literal expected report."""
    data = arange(15, dtype=float).reshape(5, 3)
    adj_array = AdjustedArray(
        data,
        {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
        float('nan'),
    )

    # NOTE(review): the blank lines and column padding inside this literal
    # were reconstructed from a whitespace-collapsed copy of the test --
    # confirm against the repr produced by the numpy version in use.
    expected = dedent(
        """\
        Adjusted Array (float64):

        Data:
        array([[  0.,   1.,   2.],
               [  3.,   4.,   5.],
               [  6.,   7.,   8.],
               [  9.,  10.,  11.],
               [ 12.,  13.,  14.]])

        Adjustments:
        {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
        """
    )
    got = adj_array.inspect()
    # The comparison is exact, so any whitespace difference fails the test.
    self.assertEqual(expected, got)
def test_inspect(self):
    """Compare ``AdjustedArray.inspect()`` against a literal expected report.

    This variant passes ``NOMASK`` as the second constructor argument.
    """
    data = arange(15, dtype=float).reshape(5, 3)
    adj_array = AdjustedArray(
        data,
        NOMASK,
        {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
        float('nan'),
    )

    # NOTE(review): the blank lines and column padding inside this literal
    # were reconstructed from a whitespace-collapsed copy of the test --
    # confirm against the repr produced by the numpy version in use.
    expected = dedent(
        """\
        Adjusted Array (float64):

        Data:
        array([[  0.,   1.,   2.],
               [  3.,   4.,   5.],
               [  6.,   7.,   8.],
               [  9.,  10.,  11.],
               [ 12.,  13.,  14.]])

        Adjustments:
        {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
        """
    )
    got = adj_array.inspect()
    # The comparison is exact, so any whitespace difference fails the test.
    self.assertEqual(expected, got)
def test_array_views_arent_writable(self):
    """Assigning into any window yielded by ``traverse`` must raise ValueError."""
    baseline = arange(30, dtype=float).reshape(6, 5)
    adjusted = AdjustedArray(baseline, NOMASK, {}, float("nan"))

    window_iter = adjusted.traverse(3)
    for window in window_iter:
        # Each yielded window is expected to reject in-place writes.
        with self.assertRaises(ValueError):
            window[0, 0] = 5.0
def test_array_views_arent_writable(self):
    """Assigning into any window yielded by ``traverse`` must raise ValueError."""
    baseline = arange(30, dtype=float).reshape(6, 5)
    adjusted = AdjustedArray(baseline, {}, float("nan"))

    window_iter = adjusted.traverse(3)
    for window in window_iter:
        # Each yielded window is expected to reject in-place writes.
        with self.assertRaises(ValueError):
            window[0, 0] = 5.0
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """Build an ``AdjustedArray`` for each requested column.

    Columns are split by ``self._split_column_types`` into OHLCV columns
    (loaded from ``self.raw_price_reader`` and paired with adjustments from
    ``self.adjustments_reader``) and currency columns (filled with the
    reader's currency codes repeated once per date).

    Returns a dict mapping each column to its ``AdjustedArray``. ``mask`` is
    accepted but not used by this body.
    """
    # Callers pass the dates on which the algorithm will be *shown* data, so
    # we must return what was known at the **start** of each date. The
    # latest data known on day N is assumed to be day (N - 1)'s, hence every
    # query date is moved back by one trading session.
    shifted_dates = shift_dates(
        domain.all_sessions(), dates[0], dates[-1], shift=1,
    )
    first_session = shifted_dates[0]
    last_session = shifted_dates[-1]

    ohlcv_cols, currency_cols = self._split_column_types(columns)
    # Only ohlcv_cols / currency_cols may be used from this point on.
    del columns

    ohlcv_colnames = [col.name for col in ohlcv_cols]
    raw_ohlcv_arrays = self.raw_price_reader.load_raw_arrays(
        ohlcv_colnames, first_session, last_session, sids,
    )

    # Convert the raw arrays in place when needed. The shifted dates are
    # used so conversion rates line up with the dates the prices came from.
    self._inplace_currency_convert(
        ohlcv_cols, raw_ohlcv_arrays, shifted_dates, sids,
    )

    adjustments = self.adjustments_reader.load_pricing_adjustments(
        ohlcv_colnames, dates, sids,
    )

    out = {
        col: AdjustedArray(raw.astype(col.dtype), col_adjs, col.missing_value)
        for col, raw, col_adjs in zip(
            ohlcv_cols, raw_ohlcv_arrays, adjustments,
        )
    }

    n_dates = len(dates)
    for col in currency_cols:
        codes_1d = self.raw_price_reader.currency_codes(sids)
        out[col] = AdjustedArray(
            repeat_first_axis(codes_1d, n_dates),
            adjustments={},
            missing_value=None,
        )

    return out
def test_traverse_invalidating(self):
    """A ``copy=False`` traversal mutates the input and invalidates the array."""
    baseline = np.arange(5 * 3, dtype="f8").reshape(5, 3)
    pristine = baseline.copy()
    adjusted_array = AdjustedArray(
        baseline,
        {2: [Float64Multiply(0, 4, 0, 2, 2.0)]},
        float("nan"),
    )

    # Exhaust the iterator; the windows themselves are not inspected.
    for _window in adjusted_array.traverse(1, copy=False):
        pass

    # The array that was handed in has been doubled in place.
    assert_equal(baseline, pristine * 2)

    with pytest.raises(
        ValueError, match="cannot traverse invalidated AdjustedArray"
    ):
        adjusted_array.traverse(1)
def test_update_adjustments(self,
                            initial_adjustments,
                            adjustments_to_add,
                            expected_adjustments_with_append,
                            expected_adjustments_with_prepend):
    """Check ``update_adjustments`` for both the append and prepend methods."""
    cases = (
        ('append', expected_adjustments_with_append),
        ('prepend', expected_adjustments_with_prepend),
    )
    for how, wanted in cases:
        # A fresh array per method so one update cannot leak into the next.
        baseline = arange(30, dtype=float).reshape(6, 5)
        adjusted_array = AdjustedArray(
            baseline, initial_adjustments, float('nan'),
        )
        adjusted_array.update_adjustments(adjustments_to_add, how)
        self.assertEqual(adjusted_array.adjustments, wanted)
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """Load each requested column from the HDF store at ``self.data_path``.

    Parameters
    ----------
    domain
        Accepted for interface compatibility; not used by this body.
    columns
        Columns to load. Each one is read from ``"/data/" + column.name``.
    dates, sids
        Labels to look up in ``self.dates`` and ``self.assets``. Both are
        searched with ``np.searchsorted``, which assumes they are sorted.
    mask
        Boolean array combined with the found/not-found mask; every entry
        that ends up False is overwritten with ``column.missing_value``.

    Returns
    -------
    dict
        Mapping of column to ``AdjustedArray`` with no adjustments.

    Raises
    ------
    ValueError
        If the store has no entry for a requested column.
    """
    def locate(sorted_labels, wanted):
        # ``np.searchsorted`` returns insertion points in [0, len] and never
        # -1, so a "!= -1" test cannot detect missing labels. Clip the
        # positions into range and check that the label stored there really
        # equals the one requested.
        positions = np.searchsorted(sorted_labels, wanted)
        labels = np.asarray(sorted_labels)
        if len(labels) == 0:
            return positions, np.zeros(len(positions), dtype=bool)
        positions = np.minimum(positions, len(labels) - 1)
        return positions, labels[positions] == np.asarray(wanted)

    date_indexer, good_dates = locate(self.dates, dates)
    assets_indexer, good_assets = locate(self.assets, sids)

    # True only where both the date and the asset were found AND the caller's
    # mask allows the entry.
    mask = (good_assets & as_column(good_dates)) & mask
    out = {}
    with pd.HDFStore(self.data_path) as store:
        for column in columns:
            # Only the store lookup can raise the KeyError we translate.
            try:
                stored = store["/data/" + column.name]
            except KeyError:
                raise ValueError("Couldn't find loader for %s" % column.name)

            # Fancy indexing copies, so the assignment below does not touch
            # the values held by the store.
            data = stored.values[np.ix_(date_indexer, assets_indexer)]
            data[~mask] = column.missing_value

            out[column] = AdjustedArray(
                data=data,
                adjustments={},
                missing_value=column.missing_value,
            )
    return out
def test_bollinger_bands(self, window_length, k, mask_sid):
    """Run ``BollingerBands`` through a graph and compare all three bands."""
    closes = self.closes(mask_sid)

    graph = TermGraph({
        'f': BollingerBands(window_length=window_length, k=k),
    })
    # Every entry of the close array is marked as present.
    all_present = np.full_like(closes, True, dtype=bool)
    workspace = {
        USEquityPricing.close: AdjustedArray(closes, all_present, {}, np.nan),
    }
    outputs = self.run_graph(
        graph,
        initial_workspace=workspace,
        mask_sid=mask_sid,
    )
    result = outputs['f']

    expected_bands = self.expected(window_length, k, closes)
    expected_upper, expected_middle, expected_lower = expected_bands

    assert_equal(result.upper, expected_upper)
    assert_equal(result.middle, expected_middle)
    assert_equal(result.lower, expected_lower)
def test_overwrite_adjustment_cases(self,
                                    name,
                                    data,
                                    lookback,
                                    adjustments,
                                    missing_value,
                                    expected):
    """Traverse twice and compare every yielded window with ``expected``."""
    adjusted = AdjustedArray(data, NOMASK, adjustments, missing_value)

    # Two passes: the same AdjustedArray must be traversable more than once.
    for _pass in range(2):
        # zip_longest pads the shorter side, so a length mismatch reaches
        # check_arrays instead of being truncated away.
        pairs = zip_longest(adjusted.traverse(lookback), expected)
        for window, wanted in pairs:
            check_arrays(window, wanted)
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """
    Load data from our stored baseline.

    Exactly one column may be requested. Entries whose date or sid is not
    found in ``self.dates`` / ``self.assets``, or that ``mask`` excludes,
    are set to the column's missing value.
    """
    if len(columns) != 1:
        raise ValueError(
            "Can't load multiple columns with DataFrameLoader")

    column = columns[0]
    self._validate_input_column(column)

    row_positions = self.dates.get_indexer(dates)
    col_positions = self.assets.get_indexer(sids)

    # get_indexer marks labels it could not find with -1.
    dates_found = row_positions != -1
    assets_found = col_positions != -1

    # Pull the requested rows/columns out of the baseline.
    data = self.baseline[ix_(row_positions, col_positions)]

    # Blank out everything that did not match or that the caller masked.
    mask = (assets_found & as_column(dates_found)) & mask
    data[~mask] = column.missing_value

    adjusted = AdjustedArray(
        data=data,
        adjustments=self.format_adjustments(dates, sids),
        missing_value=column.missing_value,
    )
    return {column: adjusted}
def test_object1darrayoverwrite(self):
    """Traverse a LabelArray with ``Object1DArrayOverwrite`` adjustments.

    Four length-3 windows are pulled and each is compared with a slice of
    ``full_expected``, which is mutated in place between windows.
    """
    # Two-letter categories ("Aa", "Ab", ...) plus a "~"-prefixed twin of each.
    pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
    categories = pairs + ["~" + c for c in pairs]
    # One row per uppercase letter, one column per letter of "abc".
    baseline = LabelArray(
        np.array([["".join((r, c)) for c in "abc"] for r in ascii_uppercase]),
        None,
        categories,
    )
    full_expected = baseline.copy()

    def flip(cs):
        # Prefix "~" unless already present; None passes through unchanged.
        if cs is None:
            return None
        if cs[0] != "~":
            return "~" + cs
        return cs

    def make_overwrite(fr, lr, fc, lc):
        # Translate letter coordinates into zero-based row/column indices.
        fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
        fr -= ord("A")
        lr -= ord("A")
        fc -= ord("a")
        lc -= ord("a")

        # The overwrite values are the flipped entries of column ``fc`` only,
        # taken over rows fr..lr inclusive.
        return Object1DArrayOverwrite(
            fr,
            lr,
            fc,
            lc,
            baseline[fr : lr + 1, fc].map(flip),
        )

    overwrites = {
        3: [make_overwrite("A", "B", "a", "a")],
        4: [make_overwrite("A", "C", "b", "c")],
        5: [make_overwrite("D", "D", "a", "b")],
    }

    it = AdjustedArray(baseline, overwrites, None).traverse(3)

    # First window: compared against the untouched first three rows.
    window = next(it)
    expected = full_expected[:3]
    check_arrays(window, expected)

    # Second window: the key-3 overwrite is written into the expectation.
    window = next(it)
    full_expected[0:2, 0] = LabelArray(["~Aa", "~Ba"], None)
    expected = full_expected[1:4]
    check_arrays(window, expected)

    # Third window: only rows 2..4 of full_expected are compared here, so of
    # the three rows assigned below just the last one is checked.
    window = next(it)
    full_expected[0:3, 1:3] = LabelArray(
        [["~Ab", "~Ac"], ["~Bb", "~Bc"], ["~Cb", "~Cb"]], None
    )
    expected = full_expected[2:5]
    check_arrays(window, expected)

    # Fourth window: the key-5 overwrite is written into the expectation.
    window = next(it)
    full_expected[3, :2] = "~Da"
    expected = full_expected[3:6]
    check_arrays(window, expected)
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """Load securities fields, reindexed to ``dates`` x real sids.

    Returns a dict mapping each column to an ``AdjustedArray`` with no
    adjustments. ``domain`` and ``mask`` are not used by this body.
    """
    fields = [col.name for col in columns]
    real_sids = [self.zipline_sids_to_real_sids[sid] for sid in sids]

    # Empty frame that only carries the target shape and labels.
    reindex_like = pd.DataFrame(None, index=dates, columns=real_sids)
    reindex_like.index.name = "Date"

    securities = get_securities_reindexed_like(reindex_like, fields=fields)

    out = {}
    for column in columns:
        missing_value = MISSING_VALUES_BY_DTYPE[column.dtype]
        field_frame = securities.loc[column.name]

        if column.dtype == datetime64ns_dtype:
            # pd.to_datetime copes with NaNs in pandas 0.22, whereas
            # .astype(column.dtype) does not.
            converted = field_frame.apply(pd.to_datetime)
        else:
            converted = field_frame.astype(column.dtype)

        out[column] = AdjustedArray(
            converted.fillna(missing_value).values,
            adjustments={},
            missing_value=missing_value,
        )
    return out
def test_no_adjustments(
    self,
    name,
    data,
    lookback,
    adjustments,
    missing_value,
    perspective_offset,
    expected_output,
):
    """Traverse twice and compare yielded windows with ``expected_output``.

    ``perspective_offset`` is accepted but not used by this body.
    """
    adjusted = AdjustedArray(data, adjustments, missing_value)

    # Two passes: the same AdjustedArray must be traversable more than once.
    for _pass in range(2):
        # Plain zip: comparison stops at the shorter of the two sequences.
        for window, wanted in zip(adjusted.traverse(lookback), expected_output):
            check_arrays(window, wanted)
def load_adjusted_array(self, columns, dates, assets, mask):
    """
    Load data from our stored baseline.

    Exactly one column may be requested, and it must equal ``self.column``;
    anything else raises ValueError.
    """
    column = self.column
    if len(columns) != 1:
        raise ValueError(
            "Can't load multiple columns with DataFrameLoader")
    if columns[0] != column:
        raise ValueError("Can't load unknown column %s" % columns[0])

    row_positions = self.dates.get_indexer(dates)
    col_positions = self.assets.get_indexer(assets)

    # get_indexer marks labels it could not find with -1.
    dates_found = row_positions != -1
    assets_found = col_positions != -1

    # Requested rows/columns pulled out of the baseline.
    selected = self.baseline[ix_(row_positions, col_positions)]
    # False wherever the date or asset did not match, or the caller's mask
    # excludes the entry.
    combined_mask = (assets_found & dates_found[:, None]) & mask

    adjusted = AdjustedArray(
        data=selected,
        mask=combined_mask,
        adjustments=self.format_adjustments(dates, assets),
        missing_value=column.missing_value,
    )
    return {column: adjusted}
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """Load raw prices and adjustments, one ``AdjustedArray`` per column.

    Returns a dict mapping each column to its ``AdjustedArray``. ``mask`` is
    accepted but not used by this body.
    """
    # Callers pass the dates on which the algorithm will be shown data, so
    # we must return what was known at the start of each date. The latest
    # data known on day N is assumed to be day (N - 1)'s, hence the query
    # window is moved back by one day.
    first_session, last_session = shift_dates(
        domain.all_sessions(), dates[0], dates[-1], shift=1,
    )

    colnames = [col.name for col in columns]
    raw_arrays = self.raw_price_reader.load_raw_arrays(
        colnames, first_session, last_session, sids,
    )
    # Adjustments are requested for the original, unshifted dates.
    adjustments = self.adjustments_reader.load_pricing_adjustments(
        colnames, dates, sids,
    )

    return {
        col: AdjustedArray(raw.astype(col.dtype), col_adjs, col.missing_value)
        for col, raw, col_adjs in zip(columns, raw_arrays, adjustments)
    }
def load_adjusted_array(self, columns, dates, assets, mask):
    """Load raw prices and adjustments, one ``AdjustedArray`` per column.

    Returns a dict mapping each column to its ``AdjustedArray``; the same
    ``mask`` is passed to every array.
    """
    # Callers pass the dates on which the algorithm will be shown data, so
    # we must return what was known at the start of each date. The latest
    # data known on day N is assumed to be day (N - 1)'s, hence the query
    # window is moved back by one day.
    shifted_start, shifted_end = _shift_dates(
        self._calendar, dates[0], dates[-1], shift=1,
    )

    raw_arrays = self.raw_price_loader.load_raw_arrays(
        columns, shifted_start, shifted_end, assets,
    )
    # Adjustments are requested for the original, unshifted dates.
    adjustments = self.adjustments_loader.load_adjustments(
        columns, dates, assets,
    )

    adjusted_arrays = []
    for raw, col_adjs in zip(raw_arrays, adjustments):
        adjusted_arrays.append(AdjustedArray(raw, mask, col_adjs))

    return {col: arr for col, arr in zip(columns, adjusted_arrays)}
def test_bollinger_bands(self, window_length, k, mask_last_sid):
    """Check the upper, middle and lower ``BollingerBands`` outputs."""
    closes = self.closes(mask_last_sid=mask_last_sid)
    # Entries that are NaN in the close data are treated as masked out.
    mask = ~np.isnan(closes)
    bbands = BollingerBands(window_length=window_length, k=k)

    want_upper, want_middle, want_lower = self.expected_bbands(
        window_length, k, closes,
    )

    terms = {
        'upper': bbands.upper,
        'middle': bbands.middle,
        'lower': bbands.lower,
    }
    expected = {
        'upper': want_upper,
        'middle': want_middle,
        'lower': want_lower,
    }
    close_array = AdjustedArray(
        data=closes,
        mask=mask,
        adjustments={},
        missing_value=np.nan,
    )

    self.check_terms(
        terms=terms,
        expected=expected,
        initial_workspace={USEquityPricing.close: close_array},
        mask=self.build_mask(mask),
    )
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """Load Sharadar institutions fields, reindexed to ``dates`` x real sids.

    Returns a dict mapping each column to an ``AdjustedArray`` with no
    adjustments. ``domain`` and ``mask`` are not used by this body.
    """
    fields = [col.name for col in columns]
    real_sids = [self.zipline_sids_to_real_sids[sid] for sid in sids]

    # Empty frame that only carries the target shape and labels.
    reindex_like = pd.DataFrame(None, index=dates, columns=real_sids)
    reindex_like.index.name = "Date"

    try:
        institutions = get_sharadar_institutions_reindexed_like(
            reindex_like, fields=fields)
    except NoFundamentalData:
        # NOTE(review): the fallback frame is indexed by date only, while the
        # loop below selects rows by column name -- confirm this path works
        # as intended when no fundamental data is available.
        institutions = reindex_like

    out = {}
    for column in columns:
        missing_value = MISSING_VALUES_BY_DTYPE[column.dtype]
        typed = institutions.loc[column.name].astype(column.dtype)
        out[column] = AdjustedArray(
            typed.fillna(missing_value).values,
            adjustments={},
            missing_value=missing_value,
        )
    return out
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """Load Sharadar S&P 500 membership, reindexed to ``dates`` x real sids.

    Only ``columns[0]`` is loaded. Returns a one-entry dict mapping it to an
    ``AdjustedArray`` with no adjustments. ``domain`` and ``mask`` are not
    used by this body.
    """
    real_sids = [self.zipline_sids_to_real_sids[sid] for sid in sids]

    # All-False frame: target shape and labels, and the fallback result.
    reindex_like = pd.DataFrame(False, index=dates, columns=real_sids)
    reindex_like.index.name = "Date"

    try:
        in_sp500 = get_sharadar_sp500_reindexed_like(reindex_like)
    except NoFundamentalData:
        in_sp500 = reindex_like

    # This dataset has only one column.
    column = columns[0]
    missing_value = MISSING_VALUES_BY_DTYPE[column.dtype]

    values = in_sp500.astype(column.dtype).fillna(missing_value).values
    adjusted = AdjustedArray(
        values,
        adjustments={},
        missing_value=missing_value,
    )
    return {column: adjusted}
def test_bad_input(self):
    """A mask whose shape differs from the data's must raise ValueError."""
    # Raw string: ``\(`` is a regex escape, not a Python one. In a normal
    # string literal it is an invalid escape sequence that newer Python
    # versions warn about. The pattern value is unchanged.
    msg = r"Mask shape \(2, 3\) != data shape \(5, 5\)"
    data = arange(25).reshape(5, 5)
    # Deliberately 2x3, while the data is 5x5.
    bad_mask = array([[0, 1, 1], [0, 0, 1]], dtype=bool)

    with self.assertRaisesRegexp(ValueError, msg):
        AdjustedArray(data, bad_mask, {})
def test_object1darrayoverwrite(self):
    """Traverse a LabelArray with ``Object1DArrayOverwrite`` adjustments.

    Four length-3 windows are pulled and each is compared with a slice of
    ``full_expected``, which is mutated in place between windows.
    """
    # Two-letter categories ('Aa', 'Ab', ...) plus a '~'-prefixed twin of each.
    pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
    categories = pairs + ['~' + c for c in pairs]
    # One row per uppercase letter, one column per letter of 'abc'.
    baseline = LabelArray(
        array([[''.join((r, c)) for c in 'abc'] for r in ascii_uppercase]),
        None,
        categories,
    )
    full_expected = baseline.copy()

    def flip(cs):
        # Prefix '~' unless already present; None passes through unchanged.
        if cs is None:
            return None
        if cs[0] != '~':
            return '~' + cs
        return cs

    def make_overwrite(fr, lr, fc, lc):
        # Translate letter coordinates into zero-based row/column indices.
        fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
        fr -= ord('A')
        lr -= ord('A')
        fc -= ord('a')
        lc -= ord('a')

        # The overwrite values are the flipped entries of column ``fc`` only,
        # taken over rows fr..lr inclusive.
        return Object1DArrayOverwrite(
            fr, lr,
            fc, lc,
            baseline[fr:lr + 1, fc].map(flip),
        )

    overwrites = {
        3: [make_overwrite('A', 'B', 'a', 'a')],
        4: [make_overwrite('A', 'C', 'b', 'c')],
        5: [make_overwrite('D', 'D', 'a', 'b')],
    }

    it = AdjustedArray(baseline, overwrites, None).traverse(3)

    # First window: compared against the untouched first three rows.
    window = next(it)
    expected = full_expected[:3]
    check_arrays(window, expected)

    # Second window: the key-3 overwrite is written into the expectation.
    window = next(it)
    full_expected[0:2, 0] = LabelArray(['~Aa', '~Ba'], None)
    expected = full_expected[1:4]
    check_arrays(window, expected)

    # Third window: only rows 2..4 of full_expected are compared here, so of
    # the three rows assigned below just the last one is checked.
    window = next(it)
    full_expected[0:3, 1:3] = LabelArray(
        [['~Ab', '~Ac'], ['~Bb', '~Bc'], ['~Cb', '~Cb']], None)
    expected = full_expected[2:5]
    check_arrays(window, expected)

    # Fourth window: the key-5 overwrite is written into the expectation.
    window = next(it)
    full_expected[3, :2] = '~Da'
    expected = full_expected[3:6]
    check_arrays(window, expected)
def test_multiplicative_adjustments(self,
                                    name,
                                    data,
                                    lookback,
                                    adjustments,
                                    expected):
    """Traverse twice and compare every yielded window with ``expected``."""
    adjusted = AdjustedArray(data, NOMASK, adjustments)

    # Two passes: the same AdjustedArray must be traversable more than once.
    for _pass in range(2):
        # zip_longest pads the shorter side, so a length mismatch reaches
        # the comparison instead of being truncated away.
        pairs = zip_longest(adjusted.traverse(lookback), expected)
        for window, wanted in pairs:
            assert_array_equal(window, wanted)
def test_overwrite_adjustment_cases(self,
                                    name,
                                    data,
                                    lookback,
                                    adjustments,
                                    expected):
    """Traverse twice, checking each window's dtype and contents."""
    adjusted = AdjustedArray(data, NOMASK, adjustments)

    # Two passes: the same AdjustedArray must be traversable more than once.
    for _pass in range(2):
        pairs = zip_longest(adjusted.traverse(lookback), expected)
        for window, wanted in pairs:
            # Windows must keep the dtype of the input data.
            self.assertEqual(window.dtype, data.dtype)
            assert_array_equal(window, wanted)
def test_copy(self):
    """Copies stay usable and unmodified after the source is invalidated."""
    baseline = arange(5 * 3, dtype="f8").reshape(5, 3)
    pristine = baseline.copy()
    adjusted_array = AdjustedArray(
        baseline,
        {2: [Float64Multiply(0, 4, 0, 2, 2.0)]},
        float("nan"),
    )

    # Both copies are taken before any traversal happens.
    traverse_copy = adjusted_array.copy()
    clean_copy = adjusted_array.copy()

    source_windows = adjusted_array.traverse(2, copy=False)
    copy_windows = traverse_copy.traverse(2, copy=False)
    for from_source, from_copy in zip(source_windows, copy_windows):
        assert_equal(from_source, from_copy)

    # After a copy=False traversal the source refuses to be copied.
    with self.assertRaises(ValueError) as caught:
        adjusted_array.copy()
    assert_equal(
        str(caught.exception),
        'cannot copy invalidated AdjustedArray',
    )

    # The clean copy still holds the original data, even though the source
    # array had its data mutated in place.
    assert_equal(clean_copy.data, pristine)
    assert_equal(adjusted_array.data, pristine * 2)
def test_traverse_invalidating(self):
    """A ``copy=False`` traversal mutates the input and invalidates the array."""
    baseline = arange(5 * 3, dtype="f8").reshape(5, 3)
    pristine = baseline.copy()
    adjusted_array = AdjustedArray(
        baseline,
        {2: [Float64Multiply(0, 4, 0, 2, 2.0)]},
        float("nan"),
    )

    # Exhaust the iterator; the windows themselves are not inspected.
    for _window in adjusted_array.traverse(1, copy=False):
        pass

    # The array that was handed in has been doubled in place.
    assert_equal(baseline, pristine * 2)

    with self.assertRaises(ValueError) as caught:
        adjusted_array.traverse(1)

    assert_equal(
        str(caught.exception),
        'cannot traverse invalidated AdjustedArray',
    )
def test_update_adjustments(self,
                            initial_adjustments,
                            adjustments_to_add,
                            expected_adjustments_with_append,
                            expected_adjustments_with_prepend):
    """Check ``update_adjustments`` for both the append and prepend methods."""
    cases = (
        ('append', expected_adjustments_with_append),
        ('prepend', expected_adjustments_with_prepend),
    )
    for how, wanted in cases:
        # A fresh array per method so one update cannot leak into the next.
        baseline = arange(30, dtype=float).reshape(6, 5)
        adjusted_array = AdjustedArray(
            baseline,
            initial_adjustments,
            float('nan'),
        )
        adjusted_array.update_adjustments(adjustments_to_add, how)
        self.assertEqual(adjusted_array.adjustments, wanted)
def test_overwrite_adjustment_cases(
    self,
    name,
    baseline,
    lookback,
    adjustments,
    missing_value,
    perspective_offset,
    expected,
):
    """Traverse twice and compare every yielded window with ``expected``."""
    adjusted = AdjustedArray(baseline, adjustments, missing_value)

    # Two passes: the same AdjustedArray must be traversable more than once.
    for _pass in range(2):
        windows = adjusted.traverse(
            lookback, perspective_offset=perspective_offset
        )
        # zip_longest pads the shorter side, so a length mismatch reaches
        # check_arrays instead of being truncated away.
        for window, wanted in zip_longest(windows, expected):
            check_arrays(window, wanted)
def test_update_adjustments(
    self,
    initial_adjustments,
    adjustments_to_add,
    expected_adjustments_with_append,
    expected_adjustments_with_prepend,
):
    """Check ``update_adjustments`` for both the append and prepend methods."""
    cases = (
        ("append", expected_adjustments_with_append),
        ("prepend", expected_adjustments_with_prepend),
    )
    for how, wanted in cases:
        # A fresh array per method so one update cannot leak into the next.
        baseline = np.arange(30, dtype=float).reshape(6, 5)
        adjusted_array = AdjustedArray(
            baseline, initial_adjustments, float("nan")
        )
        adjusted_array.update_adjustments(adjustments_to_add, how)
        assert adjusted_array.adjustments == wanted
def test_invalid_lookback(self):
    """``traverse`` rejects window lengths that are too long or not positive."""
    baseline = arange(30, dtype=float).reshape(6, 5)
    adj_array = AdjustedArray(baseline, {}, float("nan"))

    # The data has 6 rows, so a window of 7 is too long.
    with self.assertRaises(WindowLengthTooLong):
        adj_array.traverse(7)

    for non_positive in (0, -1):
        with self.assertRaises(WindowLengthNotPositive):
            adj_array.traverse(non_positive)
def test_update_labels(self):
    """``update_labels`` must map both the baseline and adjustment values."""

    def add_suffix(label):
        return label + "-foo"

    original_labels = LabelArray(
        np.array(
            [
                ["aaa", "bbb", "ccc"],
                ["ddd", "eee", "fff"],
                ["ggg", "hhh", "iii"],
                ["jjj", "kkk", "lll"],
                ["mmm", "nnn", "ooo"],
            ]
        ),
        missing_value="",
    )
    adj_array = AdjustedArray(
        data=original_labels,
        adjustments={4: [ObjectOverwrite(2, 3, 0, 0, "ppp")]},
        missing_value="",
    )

    # The same layout with "-foo" appended to every label.
    suffixed_labels = LabelArray(
        np.array(
            [
                ["aaa-foo", "bbb-foo", "ccc-foo"],
                ["ddd-foo", "eee-foo", "fff-foo"],
                ["ggg-foo", "hhh-foo", "iii-foo"],
                ["jjj-foo", "kkk-foo", "lll-foo"],
                ["mmm-foo", "nnn-foo", "ooo-foo"],
            ]
        ),
        missing_value="",
    )
    expected_adj_array = AdjustedArray(
        data=suffixed_labels,
        adjustments={4: [ObjectOverwrite(2, 3, 0, 0, "ppp-foo")]},
        missing_value="",
    )

    adj_array.update_labels(add_suffix)

    # The mapped AdjustedArray must carry the expected baseline values and
    # the expected adjustment values.
    check_arrays(adj_array.data, expected_adj_array.data)
    assert adj_array.adjustments == expected_adj_array.adjustments
def test_multiplicative_adjustments(self,
                                    name,
                                    data,
                                    lookback,
                                    adjustments,
                                    missing_value,
                                    perspective_offset,
                                    expected):
    """Traverse twice and compare every yielded window with ``expected``."""
    adjusted = AdjustedArray(data, NOMASK, adjustments, missing_value)

    # Two passes: the same AdjustedArray must be traversable more than once.
    for _pass in range(2):
        windows = adjusted.traverse(
            lookback, perspective_offset=perspective_offset
        )
        # zip_longest pads the shorter side, so a length mismatch reaches
        # check_arrays instead of being truncated away.
        for window, wanted in zip_longest(windows, expected):
            check_arrays(window, wanted)
def test_update_labels(self):
    """``update_labels`` must map both the baseline and adjustment values."""

    def add_suffix(label):
        return label + '-foo'

    original_labels = LabelArray(
        array([
            ['aaa', 'bbb', 'ccc'],
            ['ddd', 'eee', 'fff'],
            ['ggg', 'hhh', 'iii'],
            ['jjj', 'kkk', 'lll'],
            ['mmm', 'nnn', 'ooo'],
        ]),
        missing_value='',
    )
    adj_array = AdjustedArray(
        data=original_labels,
        adjustments={4: [ObjectOverwrite(2, 3, 0, 0, 'ppp')]},
        missing_value='',
    )

    # The same layout with '-foo' appended to every label.
    suffixed_labels = LabelArray(
        array([
            ['aaa-foo', 'bbb-foo', 'ccc-foo'],
            ['ddd-foo', 'eee-foo', 'fff-foo'],
            ['ggg-foo', 'hhh-foo', 'iii-foo'],
            ['jjj-foo', 'kkk-foo', 'lll-foo'],
            ['mmm-foo', 'nnn-foo', 'ooo-foo'],
        ]),
        missing_value='',
    )
    expected_adj_array = AdjustedArray(
        data=suffixed_labels,
        adjustments={4: [ObjectOverwrite(2, 3, 0, 0, 'ppp-foo')]},
        missing_value='',
    )

    adj_array.update_labels(add_suffix)

    # The mapped AdjustedArray must carry the expected baseline values and
    # the expected adjustment values.
    check_arrays(adj_array.data, expected_adj_array.data)
    self.assertEqual(adj_array.adjustments, expected_adj_array.adjustments)
def test_invalid_lookback(self):
    """``traverse`` rejects window lengths that are too long or not positive."""
    baseline = arange(30, dtype=float).reshape(6, 5)
    adj_array = AdjustedArray(baseline, NOMASK, {}, float("nan"))

    # The data has 6 rows, so a window of 7 is too long.
    with self.assertRaises(WindowLengthTooLong):
        adj_array.traverse(7)

    for non_positive in (0, -1):
        with self.assertRaises(WindowLengthNotPositive):
            adj_array.traverse(non_positive)