def init_class_fixtures(cls):
    """
    Set up the class-level fixtures used by the statistical method tests.

    Records the pipeline date window, the retrieved assets, the cascading
    and alternating masks together with the boolean arrays they are
    expected to produce, and the column used as factor input.
    """
    super(StatisticalMethodsTestCase, cls).init_class_fixtures()

    # These start and end indices are used because they are a contiguous
    # span of 5 days (Monday - Friday) and they leave plenty of earlier days
    # to look back on when computing correlations and regressions.
    first_index = 14
    last_index = 18

    all_days = cls.trading_days
    cls.dates = all_days
    cls.start_date_index = first_index
    cls.end_date_index = last_index
    cls.pipeline_start_date = cls.trading_days[first_index]
    cls.pipeline_end_date = cls.trading_days[last_index]

    sids = cls.sids
    retrieved_assets = cls.asset_finder.retrieve_all(sids)
    own_column = 0
    cls.assets = retrieved_assets
    cls.my_asset_column = own_column
    cls.my_asset = retrieved_assets[own_column]

    # Both bounds are inclusive, hence the `+ 1`.
    day_count = last_index - first_index + 1
    asset_count = len(retrieved_assets)
    cls.num_days = day_count
    cls.num_assets = asset_count

    # Every expected mask result has one row per day and one column per
    # asset.
    result_shape = (day_count, asset_count)

    cls.cascading_mask = (
        AssetIDPlusDay() < (sids[-1] + all_days[first_index].day)
    )
    cls.expected_cascading_mask_result = make_cascading_boolean_array(
        shape=result_shape,
    )

    cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
    cls.expected_alternating_mask_result = make_alternating_boolean_array(
        shape=result_shape,
    )

    cls.expected_no_mask_result = full(
        shape=result_shape,
        fill_value=True,
        dtype=bool_dtype,
    )

    # Random input for factors.
    cls.col = TestingDataSet.float_col
def init_class_fixtures(cls):
    """
    Set up the class-level fixtures used by the statistical built-in tests.

    Builds a February 2015 date index, mock 'close' data backed by a
    DataFrameLoader, a `run_pipeline` callable bound to a
    SimplePipelineEngine over that data, and the masks together with the
    boolean arrays they are expected to produce.
    """
    super(StatisticalBuiltInsTestCase, cls).init_class_fixtures()

    calendar_day = cls.trading_calendar.day
    all_dates = date_range(
        '2015-02-01', '2015-02-28', freq=calendar_day, tz='UTC',
    )
    cls.dates = all_dates

    # These start and end indices are used because they are a contiguous
    # span of 5 days (Monday - Friday) and they leave plenty of earlier days
    # to look back on when computing correlations and regressions.
    first_index = 14
    last_index = 18
    cls.start_date_index = first_index
    cls.end_date_index = last_index
    cls.pipeline_start_date = all_dates[first_index]
    cls.pipeline_end_date = all_dates[last_index]

    # Both bounds are inclusive, hence the `+ 1`.
    day_count = last_index - first_index + 1
    cls.num_days = day_count

    sids = cls.sids
    retrieved_assets = cls.asset_finder.retrieve_all(sids)
    own_column = 0
    cls.assets = retrieved_assets
    cls.my_asset_column = own_column
    cls.my_asset = retrieved_assets[own_column]

    asset_count = len(retrieved_assets)
    cls.num_assets = asset_count

    # One row per date and one column per sid, filled with consecutive
    # floats starting at zero.
    row_count = len(all_dates)
    column_count = len(sids)
    flat_values = arange(row_count * column_count, dtype=float64_dtype)
    mock_close = DataFrame(
        data=flat_values.reshape(row_count, column_count),
        index=all_dates,
        columns=retrieved_assets,
    )
    cls.raw_data = mock_close

    # Using mock 'close' data here because the correlation and regression
    # built-ins use USEquityPricing.close as the input to their `Returns`
    # factors. Since there is no way to change that when constructing an
    # instance of these built-ins, we need to test with mock 'close' data
    # to most accurately reflect their true behavior and results.
    close_loader = DataFrameLoader(USEquityPricing.close, mock_close)
    loaders_by_column = {USEquityPricing.close: close_loader}
    engine = SimplePipelineEngine(
        loaders_by_column.__getitem__,
        all_dates,
        cls.asset_finder,
    )
    cls.run_pipeline = engine.run_pipeline

    # Every expected mask result has one row per day and one column per
    # asset.
    result_shape = (day_count, asset_count)

    cls.cascading_mask = (
        AssetIDPlusDay() < (sids[-1] + all_dates[first_index].day)
    )
    cls.expected_cascading_mask_result = make_cascading_boolean_array(
        shape=result_shape,
    )

    cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
    cls.expected_alternating_mask_result = make_alternating_boolean_array(
        shape=result_shape,
    )

    cls.expected_no_mask_result = full(
        shape=result_shape,
        fill_value=True,
        dtype=bool_dtype,
    )
def test_factor_with_multiple_outputs(self):
    """
    Check the outputs of a multi-output factor under different masks.

    `MultipleOutputs` is run with a cascading mask, an alternating mask and
    no mask at all. For every run, each output column must equal the
    matching entry of `self.constants` wherever the expected mask is true
    and NaN everywhere else. When a mask is supplied it is also added to
    the pipeline so that its own values can be verified.
    """
    dates = self.dates[5:10]
    assets = self.assets
    asset_ids = self.asset_ids
    constants = self.constants
    num_dates = len(dates)
    num_assets = len(assets)

    # Deliberately not called `open`, which would shadow the builtin.
    open_column = USEquityPricing.open
    close_column = USEquityPricing.close

    engine = SimplePipelineEngine(
        lambda column: self.loader, self.dates, self.asset_finder,
    )

    def create_expected_results(expected_value, mask):
        # `expected_value` where the mask is true, NaN where it is false.
        expected_values = where(mask, expected_value, nan)
        return DataFrame(expected_values, index=dates, columns=assets)

    cascading_mask = AssetIDPlusDay() < (asset_ids[-1] + dates[0].day)
    expected_cascading_mask_result = make_cascading_boolean_array(
        shape=(num_dates, num_assets),
    )

    alternating_mask = (AssetIDPlusDay() % 2).eq(0)
    expected_alternating_mask_result = make_alternating_boolean_array(
        shape=(num_dates, num_assets), first_value=False,
    )

    expected_no_mask_result = full(
        shape=(num_dates, num_assets), fill_value=True, dtype=bool_dtype,
    )

    masks = cascading_mask, alternating_mask, NotSpecified
    expected_mask_results = (
        expected_cascading_mask_result,
        expected_alternating_mask_result,
        expected_no_mask_result,
    )
    output_columns = (
        ('open_price', open_column),
        ('close_price', close_column),
    )

    for mask, expected_mask in zip(masks, expected_mask_results):
        open_price, close_price = MultipleOutputs(mask=mask)
        pipeline = Pipeline(
            columns={'open_price': open_price, 'close_price': close_price},
        )
        has_mask = mask is not NotSpecified
        if has_mask:
            pipeline.add(mask, 'mask')

        results = engine.run_pipeline(pipeline, dates[0], dates[-1])

        # The mask column does not depend on which output is inspected, so
        # it is verified once per pipeline run rather than once per output.
        if has_mask:
            mask_results = results['mask'].unstack()
            check_arrays(mask_results.values, expected_mask)

        for colname, case_column in output_columns:
            output_results = results[colname].unstack()
            output_expected = create_expected_results(
                constants[case_column], expected_mask,
            )
            assert_frame_equal(output_results, output_expected)
def test_masked_single_column_output(self):
    """
    Tests for masking custom factors that compute a 1D out.
    """
    start_date = self.pipeline_start_date
    end_date = self.pipeline_end_date
    input_col = self.col

    candidate_masks = (
        (AssetIDPlusDay() % 2).eq(0),
        AssetIDPlusDay() < (self.sids[-1] + start_date.day),
    )
    # The masks are listed as factor inputs below, so flag them as window
    # safe first.
    for candidate in candidate_masks:
        candidate.window_safe = True

    for active_mask in candidate_masks:
        class SingleColumnOutput(CustomFactor):
            window_length = 1
            inputs = [input_col, active_mask]
            window_safe = True
            ndim = 1

            def compute(self, today, assets, out, col, mask):
                # With ndim set to 1, `out` must be a singleton array,
                # while `col` must have one entry per asset that passed
                # the mask given to this factor.
                assert out.shape == (1,)
                assert col.shape == (1, mask.sum())
                out[:] = col.sum()

        # Single column output factors cannot be added as pipeline columns,
        # so the output is inspected through a second factor that consumes
        # it.
        class UsesSingleColumnInput(CustomFactor):
            window_length = 1
            inputs = [
                input_col,
                active_mask,
                SingleColumnOutput(mask=active_mask),
            ]

            def compute(self,
                        today,
                        assets,
                        out,
                        col,
                        mask,
                        single_column_output):
                # The single column value must match the sum of `col` over
                # the entries selected by the mask it was computed with.
                assert single_column_output.shape == (1, 1)
                observed_value = single_column_output[0][0]
                assert observed_value == where(mask, col, 0).sum()

        # All assertions about shapes and values live in the `compute`
        # methods above; running the pipeline is what triggers them.
        pipeline = Pipeline(
            columns={'uses_single_column_input': UsesSingleColumnInput()},
        )
        self.run_pipeline(pipeline, start_date, end_date)
def test_single_column_output(self):
    """
    Tests for custom factors that compute a 1D out.
    """
    start_date = self.pipeline_start_date
    end_date = self.pipeline_end_date

    alternating_mask = (AssetIDPlusDay() % 2).eq(0)
    cascading_mask = AssetIDPlusDay() < (self.sids[-1] + start_date.day)

    class SingleColumnOutput(CustomFactor):
        window_length = 1
        inputs = [self.col]
        window_safe = True
        ndim = 1

        def compute(self, today, assets, out, col):
            # With ndim set to 1, `out` must be a singleton array while
            # `col` keeps its regular size.
            assert out.shape == (1, )
            assert col.shape == (1, 3)
            out[:] = col.sum()

    # Single column output factors cannot be added as pipeline columns, so
    # the output is inspected through a second factor that consumes it.
    class UsesSingleColumnOutput(CustomFactor):
        window_length = 1
        inputs = [SingleColumnOutput()]

        def compute(self, today, assets, out, single_column_output):
            # The single column input must always have exactly one column,
            # regardless of any mask passed to `UsesSingleColumnOutput`.
            assert single_column_output.shape == (1, 1)

    for mask in (alternating_mask, cascading_mask):
        unmasked_factor = UsesSingleColumnOutput()
        masked_factor = UsesSingleColumnOutput(mask=mask)
        pipeline = Pipeline(
            columns={
                'uses_single_column_output': unmasked_factor,
                'uses_single_column_output_masked': masked_factor,
            },
        )

        # All assertions about shapes live in the `compute` methods above;
        # running the pipeline is what triggers them.
        self.run_pipeline(pipeline, start_date, end_date)
def test_masked_factor(self):
    """
    Test that a Custom Factor computes the correct values when passed a
    mask. The mask/filter should be applied prior to computing any values,
    as opposed to computing the factor across the entire universe of
    assets. Any assets that are filtered out should be filled with missing
    values.
    """
    loader = self.loader
    dates = self.dates[5:8]
    assets = self.assets
    asset_ids = self.asset_ids
    constants = self.constants
    first_day, last_day = dates[0], dates[-1]
    result_shape = (len(dates), len(assets))

    open_column = USEquityPricing.open
    close_column = USEquityPricing.close

    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )

    factor1_value = constants[open_column]
    factor2_value = 3.0 * (constants[open_column] - constants[close_column])

    def expected_frame(value, mask):
        # `value` where the mask is true, NaN where it is false.
        return DataFrame(where(mask, value, nan), index=dates, columns=assets)

    def run_and_check_mask(columns, expected_mask):
        # Run a pipeline over `columns`, verify its 'mask' column and hand
        # back both the full results and the unstacked mask.
        results = engine.run_pipeline(
            Pipeline(columns=columns), first_day, last_day,
        )
        unstacked_mask = results['mask'].unstack()
        check_arrays(unstacked_mask.values, expected_mask)
        return results, unstacked_mask

    # Each case pairs a mask with the boolean array it should produce.
    cases = (
        (
            AssetIDPlusDay() < (asset_ids[-1] + first_day.day),
            make_cascading_boolean_array(shape=result_shape),
        ),
        (
            (AssetIDPlusDay() % 2).eq(0),
            make_alternating_boolean_array(
                shape=result_shape, first_value=False,
            ),
        ),
    )

    for mask, expected_mask in cases:
        # Test running a pipeline with a single masked factor.
        columns = {'factor1': OpenPrice(mask=mask), 'mask': mask}
        results, mask_results = run_and_check_mask(columns, expected_mask)
        assert_frame_equal(
            results['factor1'].unstack(),
            expected_frame(factor1_value, mask_results),
        )

        # Test running a pipeline with a second factor. This ensures that
        # adding another factor to the pipeline with a different window
        # length does not cause any unexpected behavior, especially when
        # both factors share the same mask.
        columns['factor2'] = RollingSumDifference(mask=mask)
        results, mask_results = run_and_check_mask(columns, expected_mask)

        factor1_results = results['factor1'].unstack()
        factor2_results = results['factor2'].unstack()
        assert_frame_equal(
            factor1_results, expected_frame(factor1_value, mask_results),
        )
        assert_frame_equal(
            factor2_results, expected_frame(factor2_value, mask_results),
        )