def test_percentile_nasty_partitions(self):
    # Test percentile with nasty partitions: divide up 5 assets into
    # quartiles.
    # There isn't a nice mathematical definition of correct behavior here,
    # so for now we guarantee the behavior of numpy.nanpercentile.  This is
    # mostly for regression testing in case we write our own specialized
    # percentile calculation at some point in the future.
    data = arange(25, dtype=float).reshape(5, 5) % 4
    quartiles = range(4)
    filter_names = ['pct_' + str(q) for q in quartiles]

    graph = TermGraph({
        name: self.f.percentile_between(q * 25.0, (q + 1) * 25.0)
        for name, q in zip(filter_names, quartiles)
    })
    results = self.run_graph(
        graph,
        initial_workspace={self.f: data},
        mask=self.build_mask(ones((5, 5))),
    )

    for name, quartile in zip(filter_names, quartiles):
        result = results[name]
        lower = quartile * 25.0
        upper = (quartile + 1) * 25.0
        expected = and_(
            nanpercentile(data, lower, axis=1, keepdims=True) <= data,
            data <= nanpercentile(data, upper, axis=1, keepdims=True),
        )
        check_arrays(result, expected)
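# Standalone numpy sketch (illustration only, not part of the suite) of the
# expected-value computation above: `nanpercentile` with `keepdims=True`
# returns one cutoff per row, which then broadcasts against the (5, 5) data.
from numpy import arange, nanpercentile

demo_data = arange(25, dtype=float).reshape(5, 5) % 4  # rows are rotations of [0, 1, 2, 3, 0]
lower_cut = nanpercentile(demo_data, 25.0, axis=1, keepdims=True)  # shape (5, 1)
upper_cut = nanpercentile(demo_data, 50.0, axis=1, keepdims=True)  # shape (5, 1)
in_second_quartile = (lower_cut <= demo_data) & (demo_data <= upper_cut)  # shape (5, 5)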
def test_single_factor(self):
    loader = self.loader
    assets = self.assets
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )
    result_shape = (num_dates, num_assets) = (5, len(assets))
    dates = self.dates[10:10 + num_dates]

    factor = RollingSumDifference()
    expected_result = -factor.window_length

    # Since every asset will pass the screen, these should be equivalent.
    pipelines = [
        Pipeline(columns={'f': factor}),
        Pipeline(
            columns={'f': factor},
            screen=factor.eq(expected_result),
        ),
    ]

    for p in pipelines:
        result = engine.run_pipeline(p, dates[0], dates[-1])
        self.assertEqual(set(result.columns), {'f'})
        assert_multi_index_is_product(
            self, result.index, dates, assets
        )

        check_arrays(
            result['f'].unstack().values,
            full(result_shape, expected_result, dtype=float),
        )
def test_engine_with_multicolumn_loader(self):
    open_, close = USEquityPricing.open, USEquityPricing.close
    loader = MultiColumnLoader({
        open_: ConstantLoader(dates=self.dates,
                              assets=self.assets,
                              constants={open_: 1}),
        close: ConstantLoader(dates=self.dates,
                              assets=self.assets,
                              constants={close: 2}),
    })

    engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)
    factor = RollingSumDifference()

    result = engine.factor_matrix(
        {'f': factor},
        self.dates[2],
        self.dates[-1],
    )
    self.assertIsNotNone(result)
    self.assertEqual({'f'}, set(result.columns))
    # (open - close) * window_length = (1 - 2) * 3 = -3.
    # We skip the first 2 dates so that the first window is full.
    check_arrays(
        result['f'],
        Series([-3] * len(self.assets) * (len(self.dates) - 2)),
    )
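# Hypothetical sketch of the dispatch the MultiColumnLoader above implies:
# route each requested column to the loader registered for it.  The method
# name and signature here are assumptions for illustration, not necessarily
# the actual zipline API.
class DispatchingLoaderSketch(object):
    def __init__(self, loaders_by_column):
        self._loaders = loaders_by_column

    def load_adjusted_array(self, columns, dates, assets, mask):
        out = {}
        for column in columns:
            loader = self._loaders[column]  # KeyError for unregistered columns.
            out.update(loader.load_adjusted_array([column], dates, assets, mask))
        return out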
def test_engine_with_multicolumn_loader(self):
    open_ = USEquityPricing.open
    close = USEquityPricing.close
    volume = USEquityPricing.volume

    # Test for thirty days up to the second to last day that we think all
    # the assets existed.  If we test the last day of our calendar, no
    # assets will be in our output, because their end dates are all on or
    # before the final day of the calendar.
    dates_to_test = self.dates[-32:-2]

    constants = {open_: 1, close: 2, volume: 3}
    loader = ConstantLoader(
        constants=constants,
        dates=self.dates,
        assets=self.assets,
    )
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )

    sumdiff = RollingSumDifference()

    result = engine.run_pipeline(
        Pipeline(
            columns={
                "sumdiff": sumdiff,
                "open": open_.latest,
                "close": close.latest,
                "volume": volume.latest,
            },
        ),
        dates_to_test[0],
        dates_to_test[-1],
    )
    self.assertIsNotNone(result)
    self.assertEqual(
        {"sumdiff", "open", "close", "volume"},
        set(result.columns),
    )

    result_index = self.assets * len(dates_to_test)
    result_shape = (len(result_index),)
    check_arrays(
        result["sumdiff"],
        Series(index=result_index, data=full(result_shape, -3, dtype=float)),
    )

    for name, const in [("open", 1), ("close", 2), ("volume", 3)]:
        check_arrays(
            result[name],
            Series(index=result_index, data=full(result_shape, const, dtype=float)),
        )
def test_engine_with_multicolumn_loader(self):
    open_, close = USEquityPricing.open, USEquityPricing.close

    # Test for thirty days up to the second to last day that we think all
    # the assets existed.  If we test the last day of our calendar, no
    # assets will be in our output, because their end dates are all on or
    # before the final day of the calendar.
    dates_to_test = self.dates[-32:-2]

    loader = MultiColumnLoader({
        open_: ConstantLoader(dates=self.dates,
                              assets=self.assets,
                              constants={open_: 1}),
        close: ConstantLoader(dates=self.dates,
                              assets=self.assets,
                              constants={close: 2}),
    })

    engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)
    factor = RollingSumDifference()

    result = engine.factor_matrix(
        {'f': factor},
        dates_to_test[0],
        dates_to_test[-1],
    )
    self.assertIsNotNone(result)
    self.assertEqual({'f'}, set(result.columns))

    result_index = self.assets * len(dates_to_test)
    result_shape = (len(result_index),)
    check_arrays(
        result['f'],
        Series(index=result_index, data=full(result_shape, -3)),
    )
def test_rolling_and_nonrolling(self):
    open_ = USEquityPricing.open
    close = USEquityPricing.close
    volume = USEquityPricing.volume

    # Test for thirty days up to the last day that we think all
    # the assets existed.
    dates_to_test = self.dates[-30:]

    constants = {open_: 1, close: 2, volume: 3}
    loader = ConstantLoader(
        constants=constants,
        dates=self.dates,
        assets=self.assets,
    )
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )

    sumdiff = RollingSumDifference()

    result = engine.run_pipeline(
        Pipeline(
            columns={
                "sumdiff": sumdiff,
                "open": open_.latest,
                "close": close.latest,
                "volume": volume.latest,
            },
        ),
        dates_to_test[0],
        dates_to_test[-1],
    )
    self.assertIsNotNone(result)
    self.assertEqual(
        {"sumdiff", "open", "close", "volume"},
        set(result.columns),
    )

    result_index = self.assets * len(dates_to_test)
    result_shape = (len(result_index),)
    check_arrays(
        result["sumdiff"],
        Series(index=result_index, data=full(result_shape, -3, dtype=float)),
    )

    for name, const in [("open", 1), ("close", 2), ("volume", 3)]:
        check_arrays(
            result[name],
            Series(index=result_index, data=full(result_shape, const, dtype=float)),
        )
def test_masked_rankdata_2d(self,
                            seed_value,
                            method,
                            use_mask,
                            set_missing,
                            ascending):
    eyemask = ~eye(5, dtype=bool)
    nomask = ones((5, 5), dtype=bool)

    seed(seed_value)
    asfloat = (randn(5, 5) * seed_value)
    # Reinterpret the float data's bytes as datetime64[ns] so the same
    # inputs exercise the datetime code path.
    asdatetime = (asfloat).copy().view('datetime64[ns]')

    mask = eyemask if use_mask else nomask
    if set_missing:
        asfloat[:, 2] = nan
        asdatetime[:, 2] = np_NaT

    float_result = masked_rankdata_2d(
        data=asfloat,
        mask=mask,
        missing_value=nan,
        method=method,
        ascending=ascending,
    )
    datetime_result = masked_rankdata_2d(
        data=asdatetime,
        mask=mask,
        missing_value=np_NaT,
        method=method,
        ascending=ascending,
    )

    check_arrays(float_result, datetime_result)
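# A minimal float-only sketch of what masked ranking does, for reference:
# masked and NaN entries are pushed to the end of each row's ordering by
# treating them as +inf before ranking.  This is an assumption-level
# illustration, not zipline's masked_rankdata_2d.
import numpy as np
from scipy.stats import rankdata


def masked_rankdata_2d_sketch(data, mask, method='ordinal'):
    to_rank = np.where(mask & ~np.isnan(data), data, np.inf)
    return np.array([rankdata(row, method=method) for row in to_rank])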
def test_isnull_datetime_dtype(self):
    class DatetimeFactor(Factor):
        dtype = datetime64ns_dtype
        window_length = 0
        inputs = ()

    factor = DatetimeFactor()

    data = arange(25).reshape(5, 5).astype('datetime64[ns]')
    data[eye(5, dtype=bool)] = NaTns

    graph = TermGraph(
        {
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        }
    )

    results = self.run_graph(
        graph,
        initial_workspace={factor: data},
        mask=self.build_mask(ones((5, 5))),
    )
    check_arrays(results['isnull'], eye(5, dtype=bool))
    check_arrays(results['notnull'], ~eye(5, dtype=bool))
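# For reference, a plain-numpy way to build the same expected masks: NaT can
# be detected with numpy.isnat (numpy >= 1.13), which is what the `isnull`
# expectation above boils down to for datetime64 data.
import numpy as np

demo = np.arange(25).reshape(5, 5).astype('datetime64[ns]')
demo[np.eye(5, dtype=bool)] = np.datetime64('NaT')
assert (np.isnat(demo) == np.eye(5, dtype=bool)).all()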
def test_isnull_int_dtype(self, custom_missing_value):
    class CustomMissingValue(Factor):
        dtype = int64_dtype
        window_length = 0
        missing_value = custom_missing_value
        inputs = ()

    factor = CustomMissingValue()

    data = arange(25).reshape(5, 5)
    data[eye(5, dtype=bool)] = custom_missing_value

    graph = TermGraph(
        {
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        }
    )

    results = self.run_graph(
        graph,
        initial_workspace={factor: data},
        mask=self.build_mask(ones((5, 5))),
    )
    check_arrays(results['isnull'], eye(5, dtype=bool))
    check_arrays(results['notnull'], ~eye(5, dtype=bool))
def test_single_factor(self):
    loader = self.loader
    finder = self.asset_finder
    assets = self.assets
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )
    result_shape = (num_dates, num_assets) = (5, len(assets))
    dates = self.dates[10:10 + num_dates]

    factor = RollingSumDifference()
    expected_result = -factor.window_length

    # Since every asset will pass the screen, these should be equivalent.
    pipelines = [
        Pipeline(columns={'f': factor}),
        Pipeline(
            columns={'f': factor},
            screen=factor.eq(expected_result),
        ),
    ]

    for p in pipelines:
        result = engine.run_pipeline(p, dates[0], dates[-1])
        self.assertEqual(set(result.columns), {'f'})
        assert_multi_index_is_product(
            self, result.index, dates, finder.retrieve_all(assets)
        )

        check_arrays(
            result['f'].unstack().values,
            full(result_shape, expected_result, dtype=float),
        )
def test_rank_after_mask(self):
    # data = arange(25).reshape(5, 5).transpose() % 4
    data = array([[0, 1, 2, 3, 0],
                  [1, 2, 3, 0, 1],
                  [2, 3, 0, 1, 2],
                  [3, 0, 1, 2, 3],
                  [0, 1, 2, 3, 0]], dtype=float)
    mask_data = ~eye(5, dtype=bool)
    initial_workspace = {self.f: data, Mask(): mask_data}

    graph = TermGraph(
        {
            "ascending_nomask": self.f.rank(ascending=True),
            "ascending_mask": self.f.rank(ascending=True, mask=Mask()),
            "descending_nomask": self.f.rank(ascending=False),
            "descending_mask": self.f.rank(ascending=False, mask=Mask()),
        }
    )

    expected = {
        "ascending_nomask": array([[1.0, 3.0, 4.0, 5.0, 2.0],
                                   [2.0, 4.0, 5.0, 1.0, 3.0],
                                   [3.0, 5.0, 1.0, 2.0, 4.0],
                                   [4.0, 1.0, 2.0, 3.0, 5.0],
                                   [1.0, 3.0, 4.0, 5.0, 2.0]]),
        "descending_nomask": array([[4.0, 3.0, 2.0, 1.0, 5.0],
                                    [3.0, 2.0, 1.0, 5.0, 4.0],
                                    [2.0, 1.0, 5.0, 4.0, 3.0],
                                    [1.0, 5.0, 4.0, 3.0, 2.0],
                                    [4.0, 3.0, 2.0, 1.0, 5.0]]),
        # Diagonal should be all nans, and anything whose rank was less
        # than the diagonal in the unmasked calc should go down by 1.
        "ascending_mask": array([[nan, 2.0, 3.0, 4.0, 1.0],
                                 [2.0, nan, 4.0, 1.0, 3.0],
                                 [2.0, 4.0, nan, 1.0, 3.0],
                                 [3.0, 1.0, 2.0, nan, 4.0],
                                 [1.0, 2.0, 3.0, 4.0, nan]]),
        "descending_mask": array([[nan, 3.0, 2.0, 1.0, 4.0],
                                  [2.0, nan, 1.0, 4.0, 3.0],
                                  [2.0, 1.0, nan, 4.0, 3.0],
                                  [1.0, 4.0, 3.0, nan, 2.0],
                                  [4.0, 3.0, 2.0, 1.0, nan]]),
    }

    results = self.run_graph(
        graph,
        initial_workspace,
        mask=self.build_mask(ones((5, 5))),
    )
    for method in results:
        check_arrays(expected[method], results[method])
def check(terms):
    results = self.run_terms(
        terms,
        initial_workspace={self.f: data},
        mask=self.build_mask(ones((5, 5))),
    )
    for method in terms:
        check_arrays(results[method], expected_ranks[method])
def check_output(self, expr, expected):
    result = expr._compute(
        [self.fake_raw_data[input_] for input_ in expr.inputs],
        self.mask.index,
        self.mask.columns,
        self.mask.values,
    )
    check_arrays(result, expected)
def test_rolling_and_nonrolling(self):
    open_ = USEquityPricing.open
    close = USEquityPricing.close
    volume = USEquityPricing.volume

    # Test for thirty days up to the last day that we think all
    # the assets existed.
    dates_to_test = self.dates[-30:]

    constants = {open_: 1, close: 2, volume: 3}
    loader = PrecomputedLoader(
        constants=constants,
        dates=self.dates,
        sids=self.asset_ids,
    )
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )

    sumdiff = RollingSumDifference()

    result = engine.run_pipeline(
        Pipeline(
            columns={
                'sumdiff': sumdiff,
                'open': open_.latest,
                'close': close.latest,
                'volume': volume.latest,
            },
        ),
        dates_to_test[0],
        dates_to_test[-1],
    )
    self.assertIsNotNone(result)
    self.assertEqual(
        {'sumdiff', 'open', 'close', 'volume'},
        set(result.columns),
    )

    result_index = self.asset_ids * len(dates_to_test)
    result_shape = (len(result_index),)
    check_arrays(
        result['sumdiff'],
        Series(
            index=result_index,
            data=full(result_shape, -3, dtype=float),
        ),
    )
    for name, const in [('open', 1), ('close', 2), ('volume', 3)]:
        check_arrays(
            result[name],
            Series(
                index=result_index,
                data=full(result_shape, const, dtype=float),
            ),
        )
def check(terms):
    graph = TermGraph(terms)
    results = self.run_graph(
        graph,
        initial_workspace={f: data},
        mask=self.build_mask(ones((5, 5))),
    )
    for method in terms:
        check_arrays(results[method], expected_ranks[method])
def test_notnan(self):
    data = self.randn_data(seed=10)
    diag = eye(*data.shape, dtype=bool)
    data[diag] = nan

    results = self.run_graph(
        TermGraph({'notnan': self.f.notnan()}),
        initial_workspace={self.f: data},
    )
    check_arrays(results['notnan'], ~diag)
def test_isfinite(self):
    data = self.randn_data(seed=10)
    data[:, 0] = nan
    data[:, 2] = inf
    data[:, 4] = -inf

    results = self.run_graph(
        TermGraph({'isfinite': self.f.isfinite()}),
        initial_workspace={self.f: data},
    )
    check_arrays(results['isfinite'], isfinite(data))
def test_top_and_bottom(self):
    data = self.randn_data(seed=5)  # Fix a seed for determinism.

    mask_data = ones_like(data, dtype=bool)
    mask_data[:, 0] = False

    nan_data = data.copy()
    nan_data[:, 0] = nan

    mask = Mask()
    workspace = {self.f: data, mask: mask_data}

    methods = ['top', 'bottom']
    counts = 2, 3, 10
    term_combos = list(product(methods, counts, [True, False]))

    def termname(method, count, masked):
        return '_'.join([method, str(count), 'mask' if masked else ''])

    # Add a term for each permutation of top/bottom, count, and
    # mask/no_mask.
    terms = {}
    for method, count, masked in term_combos:
        kwargs = {'N': count}
        if masked:
            kwargs['mask'] = mask
        term = getattr(self.f, method)(**kwargs)
        terms[termname(method, count, masked)] = term

    results = self.run_graph(TermGraph(terms), initial_workspace=workspace)

    def expected_result(method, count, masked):
        # Ranking with a mask is equivalent to ranking with nans applied on
        # the masked values.
        to_rank = nan_data if masked else data

        if method == 'top':
            return rowwise_rank(-to_rank) < count
        elif method == 'bottom':
            return rowwise_rank(to_rank) < count

    for method, count, masked in term_combos:
        result = results[termname(method, count, masked)]

        # Check that `min(c, num_assets)` assets passed each day.
        passed_per_day = result.sum(axis=1)
        check_arrays(
            passed_per_day,
            full_like(passed_per_day, min(count, data.shape[1])),
        )

        expected = expected_result(method, count, masked)
        check_arrays(result, expected)
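# Sketch of the `rowwise_rank` helper assumed by the expectations above: the
# classic double argsort gives each element's zero-based rank within its row.
# This assumes distinct values per row (true almost surely for randn data);
# a real implementation would pick an explicit tie-breaking method.
def rowwise_rank_sketch(a):
    return a.argsort(axis=1).argsort(axis=1)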
def test_sequenced_filter_order_independent(self):
    data = self.arange_data() % 5
    results = self.run_terms(
        {
            # Sequencing is equivalent to &ing for commutative filters.
            'sequenced': (1.5 < self.f).then(self.f < 3.5),
            'anded': (1.5 < self.f) & (self.f < 3.5),
        },
        initial_workspace={self.f: data},
    )
    expected = (1.5 < data) & (data < 3.5)
    check_arrays(results['sequenced'], expected)
    check_arrays(results['anded'], expected)
def test_engine_with_multicolumn_loader(self):
    open_ = USEquityPricing.open
    close = USEquityPricing.close
    volume = USEquityPricing.volume

    # Test for thirty days up to the second to last day that we think all
    # the assets existed.  If we test the last day of our calendar, no
    # assets will be in our output, because their end dates are all on or
    # before the final day of the calendar.
    dates_to_test = self.dates[-32:-2]

    constants = {open_: 1, close: 2, volume: 3}
    loader = ConstantLoader(
        constants=constants,
        dates=self.dates,
        assets=self.assets,
    )
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )

    sumdiff = RollingSumDifference()

    result = engine.run_pipeline(
        Pipeline(
            columns={
                'sumdiff': sumdiff,
                'open': open_.latest,
                'close': close.latest,
                'volume': volume.latest,
            },
        ),
        dates_to_test[0],
        dates_to_test[-1],
    )
    self.assertIsNotNone(result)
    self.assertEqual(
        {'sumdiff', 'open', 'close', 'volume'},
        set(result.columns),
    )
    result_index = self.assets * len(dates_to_test)
    result_shape = (len(result_index),)
    check_arrays(
        result['sumdiff'],
        Series(index=result_index, data=full(result_shape, -3, dtype=float)),
    )
    for name, const in [('open', 1), ('close', 2), ('volume', 3)]:
        check_arrays(
            result[name],
            Series(index=result_index, data=full(result_shape, const, dtype=float)),
        )
def test_rank_after_mask(self, name, factor_dtype):
    f = F(dtype=factor_dtype)
    # data = arange(25).reshape(5, 5).transpose() % 4
    data = array([[0, 1, 2, 3, 0],
                  [1, 2, 3, 0, 1],
                  [2, 3, 0, 1, 2],
                  [3, 0, 1, 2, 3],
                  [0, 1, 2, 3, 0]], dtype=factor_dtype)
    mask_data = ~eye(5, dtype=bool)
    initial_workspace = {f: data, Mask(): mask_data}

    graph = TermGraph({
        "ascending_nomask": f.rank(ascending=True),
        "ascending_mask": f.rank(ascending=True, mask=Mask()),
        "descending_nomask": f.rank(ascending=False),
        "descending_mask": f.rank(ascending=False, mask=Mask()),
    })

    expected = {
        "ascending_nomask": array([[1., 3., 4., 5., 2.],
                                   [2., 4., 5., 1., 3.],
                                   [3., 5., 1., 2., 4.],
                                   [4., 1., 2., 3., 5.],
                                   [1., 3., 4., 5., 2.]]),
        "descending_nomask": array([[4., 3., 2., 1., 5.],
                                    [3., 2., 1., 5., 4.],
                                    [2., 1., 5., 4., 3.],
                                    [1., 5., 4., 3., 2.],
                                    [4., 3., 2., 1., 5.]]),
        # Diagonal should be all nans, and anything whose rank was less
        # than the diagonal in the unmasked calc should go down by 1.
        "ascending_mask": array([[nan, 2., 3., 4., 1.],
                                 [2., nan, 4., 1., 3.],
                                 [2., 4., nan, 1., 3.],
                                 [3., 1., 2., nan, 4.],
                                 [1., 2., 3., 4., nan]]),
        "descending_mask": array([[nan, 3., 2., 1., 4.],
                                  [2., nan, 1., 4., 3.],
                                  [2., 1., nan, 4., 3.],
                                  [1., 4., 3., nan, 2.],
                                  [4., 3., 2., 1., nan]]),
    }

    results = self.run_graph(
        graph,
        initial_workspace,
        mask=self.build_mask(ones((5, 5))),
    )
    for method in results:
        check_arrays(expected[method], results[method])
def test_masking(self, dtype, missing_value, window_length):
    missing_value = value_with_dtype(dtype, missing_value)
    baseline_ints = arange(15).reshape(5, 3)
    baseline = baseline_ints.astype(dtype)
    mask = (baseline_ints % 2).astype(bool)
    masked_baseline = where(mask, baseline, missing_value)

    array = AdjustedArray(
        baseline,
        mask,
        adjustments={},
        missing_value=missing_value,
    )

    gen_expected = moving_window(masked_baseline, window_length)
    gen_actual = array.traverse(window_length)
    for expected, actual in zip(gen_expected, gen_actual):
        check_arrays(expected, actual)
def test_masking(self, dtype, missing_value, window_length):
    missing_value = coerce_to_dtype(dtype, missing_value)
    baseline_ints = arange(15).reshape(5, 3)
    baseline = baseline_ints.astype(dtype)
    mask = (baseline_ints % 2).astype(bool)
    masked_baseline = where(mask, baseline, missing_value)

    array = AdjustedArray(
        baseline,
        mask,
        adjustments={},
        missing_value=missing_value,
    )

    gen_expected = moving_window(masked_baseline, window_length)
    gen_actual = array.traverse(window_length)
    for expected, actual in zip(gen_expected, gen_actual):
        check_arrays(expected, actual)
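# Assumption-level sketch of the `moving_window` helper used to build the
# expected values above: yield every contiguous `window`-row slab of the
# array, oldest first, mirroring what AdjustedArray.traverse produces when
# there are no adjustments.
def moving_window_sketch(arr, window):
    for lo in range(len(arr) - window + 1):
        yield arr[lo:lo + window]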
def test_bottom(self):
    counts = 2, 3, 10
    data = self.randn_data(seed=5)  # Arbitrary seed choice.
    results = self.run_terms(
        terms={'bottom_' + str(c): self.f.bottom(c) for c in counts},
        initial_workspace={self.f: data},
    )
    for c in counts:
        result = results['bottom_' + str(c)]

        # Check that `min(c, num_assets)` assets passed each day.
        passed_per_day = result.sum(axis=1)
        check_arrays(
            passed_per_day,
            full_like(passed_per_day, min(c, data.shape[1])),
        )

        # Check that the bottom `c` assets passed.
        expected = rowwise_rank(data) < c
        check_arrays(result, expected)
def test_bottom(self):
    counts = 2, 3, 10
    data = self.randn_data(seed=5)  # Arbitrary seed choice.
    results = self.run_graph(
        TermGraph({'bottom_' + str(c): self.f.bottom(c) for c in counts}),
        initial_workspace={self.f: data},
    )
    for c in counts:
        result = results['bottom_' + str(c)]

        # Check that `min(c, num_assets)` assets passed each day.
        passed_per_day = result.sum(axis=1)
        check_arrays(
            passed_per_day,
            full_like(passed_per_day, min(c, data.shape[1])),
        )

        # Check that the bottom `c` assets passed.
        expected = rowwise_rank(data) < c
        check_arrays(result, expected)
def test_top(self):
    counts = 2, 3, 10
    data = self.randn_data(seed=5)  # Arbitrary seed choice.
    results = self.run_terms(
        terms={'top_' + str(c): self.f.top(c) for c in counts},
        initial_workspace={self.f: data},
    )
    for c in counts:
        result = results['top_' + str(c)]

        # Check that `min(c, num_assets)` assets passed each day.
        passed_per_day = result.sum(axis=1)
        check_arrays(
            passed_per_day,
            full_like(passed_per_day, min(c, data.shape[1])),
        )

        # Check that the top `c` assets passed.
        expected = rowwise_rank(-data) < c
        check_arrays(result, expected)
def test_percentile_after_mask(self):
    f_input = eye(5)
    g_input = arange(25, dtype=float).reshape(5, 5)
    initial_mask = self.build_mask(ones((5, 5)))

    custom_mask = self.f < 1
    without_mask = self.g.percentile_between(80, 100)
    with_mask = self.g.percentile_between(80, 100, mask=custom_mask)

    graph = TermGraph(
        {
            'custom_mask': custom_mask,
            'without': without_mask,
            'with': with_mask,
        }
    )
    results = self.run_graph(
        graph,
        initial_workspace={self.f: f_input, self.g: g_input},
        mask=initial_mask,
    )

    # First should pass everything but the diagonal.
    check_arrays(results['custom_mask'], ~eye(5, dtype=bool))

    # Second should pass the largest value each day.  Each row is strictly
    # increasing, so we always select the last value.
    expected_without = array(
        [[0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1]],
        dtype=bool,
    )
    check_arrays(results['without'], expected_without)

    # When sequencing, we should remove the diagonal as an option before
    # computing percentiles.  On the last day, we should get the
    # second-largest value, rather than the largest.
    expected_with = array(
        [[0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 1, 0]],  # Different from previous!
        dtype=bool,
    )
    check_arrays(results['with'], expected_with)
def test_sequenced_filter_order_dependent(self):
    first = self.f < 1
    f_input = eye(5)

    second = self.g.percentile_between(80, 100)
    g_input = arange(25, dtype=float).reshape(5, 5)

    initial_mask = self.build_mask(ones((5, 5)))

    terms = {
        'first': first,
        'second': second,
        'sequenced': first.then(second),
    }
    results = self.run_terms(
        terms,
        initial_workspace={self.f: f_input, self.g: g_input},
        mask=initial_mask,
    )

    # First should pass everything but the diagonal.
    check_arrays(results['first'], ~eye(5, dtype=bool))

    # Second should pass the largest value each day.  Each row is strictly
    # increasing, so we always select the last value.
    expected_second = array(
        [[0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1]],
        dtype=bool,
    )
    check_arrays(results['second'], expected_second)

    # When sequencing, we should remove the diagonal as an option before
    # computing percentiles.  On the last day, we should get the
    # second-largest value, rather than the largest.
    expected_sequenced = array(
        [[0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 1, 0]],  # Different from previous!
        dtype=bool,
    )
    check_arrays(results['sequenced'], expected_sequenced)
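# Quick plain-numpy check of the comment above: on the last row of g_input,
# the values are [20, 21, 22, 23, 24] and the diagonal entry is 24, so
# masking it out first leaves 23 as the only value in the 80th-100th
# percentile band.
import numpy as np

last_row = np.arange(20.0, 25.0)
surviving = last_row[:-1]  # Diagonal entry (24) masked out first.
cutoff = np.nanpercentile(surviving, 80.0)  # 22.4 with linear interpolation.
assert (surviving >= cutoff).tolist() == [False, False, False, True]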
def test_multiple_rolling_factors(self):
    loader = self.loader
    finder = self.asset_finder
    assets = self.assets
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )
    shape = num_dates, num_assets = (5, len(assets))
    dates = self.dates[10:10 + num_dates]

    short_factor = RollingSumDifference(window_length=3)
    long_factor = RollingSumDifference(window_length=5)
    high_factor = RollingSumDifference(
        window_length=3,
        inputs=[USEquityPricing.open, USEquityPricing.high],
    )

    pipeline = Pipeline(
        columns={
            'short': short_factor,
            'long': long_factor,
            'high': high_factor,
        }
    )
    results = engine.run_pipeline(pipeline, dates[0], dates[-1])

    self.assertEqual(set(results.columns), {'short', 'high', 'long'})
    assert_multi_index_is_product(
        self, results.index, dates, finder.retrieve_all(assets)
    )

    # row-wise sum over an array whose values are all (1 - 2)
    check_arrays(
        results['short'].unstack().values,
        full(shape, -short_factor.window_length),
    )
    check_arrays(
        results['long'].unstack().values,
        full(shape, -long_factor.window_length),
    )
    # row-wise sum over an array whose values are all (1 - 3)
    check_arrays(
        results['high'].unstack().values,
        full(shape, -2 * high_factor.window_length),
    )
def check_output(self, expr, expected):
    result = expr.compute_from_arrays(
        [self.fake_raw_data[input_] for input_ in expr.inputs],
        self.mask,
    )
    check_arrays(result, expected)
def test_percentile_between(self):
    quintiles = range(5)
    filter_names = ['pct_' + str(q) for q in quintiles]
    # Materialize the pairs so we can iterate over them more than once.
    iter_quintiles = list(zip(filter_names, quintiles))

    graph = TermGraph({
        name: self.f.percentile_between(q * 20.0, (q + 1) * 20.0)
        for name, q in zip(filter_names, quintiles)
    })

    # Test with 5 columns and no NaNs.
    eye5 = eye(5, dtype=float64)
    results = self.run_graph(
        graph,
        initial_workspace={self.f: eye5},
        mask=self.build_mask(ones((5, 5))),
    )
    for name, quintile in iter_quintiles:
        result = results[name]
        if quintile < 4:
            # There are four 0s and one 1 in each row, so the first 4
            # quintiles should be all the locations with zeros in the input
            # array.
            check_arrays(result, ~eye5.astype(bool))
        else:
            # The top quintile should match the sole 1 in each row.
            check_arrays(result, eye5.astype(bool))

    # Test with 6 columns, no NaNs, and one masked entry per day.
    eye6 = eye(6, dtype=float64)
    mask = array([[1, 1, 1, 1, 1, 0],
                  [0, 1, 1, 1, 1, 1],
                  [1, 0, 1, 1, 1, 1],
                  [1, 1, 0, 1, 1, 1],
                  [1, 1, 1, 0, 1, 1],
                  [1, 1, 1, 1, 0, 1]], dtype=bool)
    results = self.run_graph(
        graph,
        initial_workspace={self.f: eye6},
        mask=self.build_mask(mask),
    )
    for name, quintile in iter_quintiles:
        result = results[name]
        if quintile < 4:
            # Should keep all values that were 0 in the base data and were
            # 1 in the mask.
            check_arrays(result, mask & (~eye6.astype(bool)))
        else:
            # Should keep all the 1s in the base data.
            check_arrays(result, eye6.astype(bool))

    # Test with 6 columns, no mask, and one NaN per day.  Should have the
    # same outcome as if we had masked the NaNs.
    # In particular, the NaNs should never pass any filters.
    eye6_withnans = eye6.copy()
    putmask(eye6_withnans, ~mask, nan)
    results = self.run_graph(
        graph,
        initial_workspace={self.f: eye6_withnans},
        mask=self.build_mask(ones((6, 6))),
    )
    for name, quintile in iter_quintiles:
        result = results[name]
        if quintile < 4:
            # Should keep all values that were 0 in the base data and are
            # not NaN (`mask` marks the non-NaN locations).
            check_arrays(result, mask & (~eye6.astype(bool)))
        else:
            # Should keep all the 1s in the base data.
            check_arrays(result, eye6.astype(bool))
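# Standalone numpy illustration of the quintile cutoffs the test relies on:
# each row of eye(5) holds four 0s and one 1, so every percentile boundary
# below 80% interpolates among the 0s, and only the top quintile's band
# reaches the 1.
import numpy as np

row = np.array([1.0, 0.0, 0.0, 0.0, 0.0])
for q in range(5):
    lo = np.nanpercentile(row, q * 20.0)
    hi = np.nanpercentile(row, (q + 1) * 20.0)
    passed = (lo <= row) & (row <= hi)
    if q < 4:
        assert passed.tolist() == [False, True, True, True, True]
    else:
        assert passed.tolist() == [True, False, False, False, False]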