def fast_corr(m0, m1):
    """Column-wise Pearson correlation between two 2-D arrays.

    Column ``j`` of the result is the correlation between ``m0[:, j]`` and
    ``m1[:, j]``, computed with NaN-aware reductions.

    Parameters
    ----------
    m0, m1 : np.ndarray
        Arrays of identical shape ``(N, M)``. NaN marks a missing
        observation.

    Returns
    -------
    np.ndarray
        Array of length ``M``. An entry is NaN when more than
        ``int(0.25 * N)`` rows of that column are missing in either input.
    """
    nan = np.nan
    isnan = np.isnan
    N, M = m0.shape
    out = np.full(M, nan)
    allowed_missing_count = int(0.25 * N)

    # Pairwise deletion: a row contributes to a column's statistics only if
    # BOTH inputs are present there.  The covariance and both standard
    # deviations must be taken over this same set of rows, otherwise the
    # ratio is not a correlation (it can even leave [-1, 1]).
    missing = isnan(m0) | isnan(m1)  # shape: (N, M)
    dependent = np.where(missing, nan, m0)  # shape: (N, M)
    independent = np.where(missing, nan, m1)  # shape: (N, M)

    ind_residual = independent - nanmean(independent, axis=0)  # shape: (N, M)
    # The residual has zero mean over the valid rows, so multiplying by the
    # un-centred dependent values still yields the covariance.
    covariances = nanmean(ind_residual * dependent, axis=0)  # shape: (M,)

    # corr(x, y) = cov(x, y) / std(x) / std(y)
    np.divide(covariances, nanstd(dependent, axis=0), out=out)
    np.divide(out, nanstd(independent, axis=0), out=out)

    # Columns with too many missing rows are reported as NaN.
    nanlocs = missing.sum(axis=0) > allowed_missing_count
    out[nanlocs] = nan
    return out
def compute(self, today, asset_ids, out, close):
    """Write a per-column score built from an RSI term and a band term.

    For every column of ``close`` the score starts at zero, is adjusted by
    how far the RSI lies outside ``[RSI_LOWER, RSI_UPPER]`` and by how far
    the last price lies outside ``mean +/- BBSTD * std``, and is finally
    negated when ``TREND_FOLLOW`` is truthy.  Results are stored in ``out``
    in place.
    """
    # RSI from the average positive / negative row-to-row changes.
    price_changes = np.diff(close, axis=0)
    ups = nanmean(np.clip(price_changes, 0, np.inf), axis=0)
    downs = abs(nanmean(np.clip(price_changes, -np.inf, 0), axis=0))

    rsi = np.zeros(len(out))
    evaluate(
        "100 - (100 / (1 + (ups / downs)))",
        local_dict={'ups': ups, 'downs': downs},
        global_dict={},
        out=rsi,
    )

    # Bands around the column mean.
    band_width = BBSTD * nanstd(close, axis=0)
    centre = nanmean(close, axis=0)
    upper_band = centre + band_width
    lower_band = centre - band_width

    for i, strength in enumerate(rsi):
        out[i] = 0

        # RSI contribution: positive below the lower threshold, negative
        # above the upper one, nothing in between.
        if strength < RSI_LOWER:
            out[i] += RSI_LOWER / strength
        elif strength > RSI_UPPER:
            out[i] -= strength / RSI_UPPER

        # Band contribution, based on the most recent price of the column.
        last_price = close[-1, i]
        if last_price < lower_band[i]:
            out[i] += lower_band[i] / last_price
        elif last_price > upper_band[i]:
            out[i] -= last_price / upper_band[i]

        if TREND_FOLLOW:
            out[i] *= -1
def get_simple_transform(self, asset, transform_name, dt, data_frequency,
                         bars=None):
    """Compute a simple trailing-window statistic for one asset.

    Parameters
    ----------
    asset : object
        Passed to ``self.get_history_window`` as a one-element list and
        used to select the matching column of the returned window.
    transform_name : str
        ``"returns"``, ``"mavg"``, ``"stddev"`` or ``"vwap"``.
    dt : object
        Forwarded unchanged to ``self.get_history_window``.
    data_frequency : str
        ``"minute"`` requests ``"1m"`` bars, with the bar count taken from
        ``self._get_minute_count_for_transform(dt, bars)``; any other value
        requests ``bars`` bars of ``"1d"`` data.
    bars : int, optional
        Window length. Required for every transform except ``"returns"``.

    Returns
    -------
    The computed value. ``"vwap"`` yields NaN when the summed volume is
    zero. An unrecognised ``transform_name`` yields ``None``.

    Raises
    ------
    ValueError
        If ``bars`` is None and ``transform_name`` is not ``"returns"``.
    """
    if transform_name == "returns":
        # returns is always calculated over the last 2 days, regardless
        # of the simulation's data frequency.
        hst = self.get_history_window(
            [asset], dt, 2, "1d", "price", ffill=True
        )[asset]
        return (hst.iloc[-1] - hst.iloc[0]) / hst.iloc[0]

    if bars is None:
        raise ValueError("bars cannot be None!")

    if data_frequency == "minute":
        freq_str = "1m"
        calculated_bar_count = int(
            self._get_minute_count_for_transform(dt, bars))
    else:
        freq_str = "1d"
        calculated_bar_count = bars

    price_arr = self.get_history_window(
        [asset], dt, calculated_bar_count, freq_str, "price", ffill=True
    )[asset]

    if transform_name == "mavg":
        return nanmean(price_arr)
    elif transform_name == "stddev":
        return nanstd(price_arr, ddof=1)
    elif transform_name == "vwap":
        volume_arr = self.get_history_window(
            [asset], dt, calculated_bar_count, freq_str, "volume",
            ffill=True
        )[asset]

        vol_sum = nansum(volume_arr)
        # Check for zero volume explicitly.  Catching ZeroDivisionError is
        # not reliable here: NumPy scalars do not raise on division by
        # zero, they emit a RuntimeWarning and produce nan/inf instead.
        if vol_sum == 0:
            return np.nan
        return nansum(price_arr * volume_arr) / vol_sum

    # Unknown transform names fall through without a value.
    return None
def get_simple_transform(self, asset, transform_name, dt, data_frequency,
                         bars=None):
    """Return one trailing-window transform value for a single asset.

    Parameters
    ----------
    asset : object
        Handed to ``self.get_history_window`` inside a one-element list;
        also the key used to pull the asset's column out of the result.
    transform_name : str
        One of ``"returns"``, ``"mavg"``, ``"stddev"``, ``"vwap"``.
    dt : object
        Forwarded as-is to ``self.get_history_window``.
    data_frequency : str
        With ``"minute"`` the window uses ``"1m"`` bars and a bar count of
        ``int(self._get_minute_count_for_transform(dt, bars))``; otherwise
        it uses ``"1d"`` bars and ``bars`` itself.
    bars : int, optional
        Window length; mandatory unless ``transform_name`` is
        ``"returns"``.

    Returns
    -------
    The transform value. ``"vwap"`` is NaN when total volume is zero, and
    ``None`` is returned for a ``transform_name`` that is not handled.

    Raises
    ------
    ValueError
        When ``bars`` is None for any transform other than ``"returns"``.
    """
    if transform_name == "returns":
        # returns is always calculated over the last 2 days, regardless
        # of the simulation's data frequency.
        hst = self.get_history_window(
            [asset], dt, 2, "1d", "price", ffill=True
        )[asset]

        return (hst.iloc[-1] - hst.iloc[0]) / hst.iloc[0]

    if bars is None:
        raise ValueError("bars cannot be None!")

    if data_frequency == "minute":
        freq_str = "1m"
        calculated_bar_count = int(self._get_minute_count_for_transform(
            dt, bars
        ))
    else:
        freq_str = "1d"
        calculated_bar_count = bars

    price_arr = self.get_history_window(
        [asset], dt, calculated_bar_count, freq_str, "price", ffill=True
    )[asset]

    if transform_name == "mavg":
        return nanmean(price_arr)
    elif transform_name == "stddev":
        return nanstd(price_arr, ddof=1)
    elif transform_name == "vwap":
        volume_arr = self.get_history_window(
            [asset], dt, calculated_bar_count, freq_str, "volume",
            ffill=True
        )[asset]

        vol_sum = nansum(volume_arr)

        # An explicit zero test replaces ``except ZeroDivisionError``:
        # dividing NumPy scalars by zero only warns and yields nan/inf, so
        # the handler could not be relied on to produce NaN.
        if vol_sum == 0:
            return np.nan

        return nansum(price_arr * volume_arr) / vol_sum

    # No branch matched: callers receive None for unknown transform names.
    return None
def compute(self, today, assets, out, close, k): difference = k * nanstd(close, axis=0) out.middle = middle = nanmean(close, axis=0) out.upper = middle + difference out.lower = middle - difference
def compute(self, today, assets, out, returns, annualization_factor): out[:] = nanstd(returns, axis=0) * (annualization_factor**.5)
def compute(self, today, assets, out, returns, annualization_factor): out[:] = nanstd(returns, axis=0) * (annualization_factor ** .5)
def zscore(row):
    """Standardise ``row``: subtract its NaN-aware mean, divide by its
    NaN-aware standard deviation."""
    centred = row - nanmean(row)
    spread = nanstd(row)
    return centred / spread
class FactorTestCase(BasePipelineTestCase):
    """Tests for Factor methods: rank, isnull/notnull, demean/zscore and
    quantiles, plus the ``compute`` functions of RSI and Returns."""

    def init_instance_fixtures(self):
        super(FactorTestCase, self).init_instance_fixtures()
        # Shared factor instance used by tests that don't need a specific
        # dtype.
        self.f = F()

    def test_bad_input(self):
        # An unrecognised rank method name must be rejected.
        with self.assertRaises(UnknownRankMethod):
            self.f.rank("not a real rank method")

    @parameter_space(method_name=['isnan', 'notnan', 'isfinite'])
    def test_float64_only_ops(self, method_name):
        # The listed methods must raise TypeError on a datetime factor.
        class NotFloat(Factor):
            dtype = datetime64ns_dtype
            inputs = ()
            window_length = 0

        nf = NotFloat()
        meth = getattr(nf, method_name)
        with self.assertRaises(TypeError):
            meth()

    @parameter_space(custom_missing_value=[-1, 0])
    def test_isnull_int_dtype(self, custom_missing_value):
        # isnull/notnull on an int64 factor follow the factor's own
        # missing_value, which is placed on the diagonal of the input.
        class CustomMissingValue(Factor):
            dtype = int64_dtype
            window_length = 0
            missing_value = custom_missing_value
            inputs = ()

        factor = CustomMissingValue()

        data = arange(25).reshape(5, 5)
        data[eye(5, dtype=bool)] = custom_missing_value

        graph = TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        })

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))

    def test_isnull_datetime_dtype(self):
        # Same as the int64 case, with NaT on the diagonal.
        class DatetimeFactor(Factor):
            dtype = datetime64ns_dtype
            window_length = 0
            inputs = ()

        factor = DatetimeFactor()

        data = arange(25).reshape(5, 5).astype('datetime64[ns]')
        data[eye(5, dtype=bool)] = NaTns

        graph = TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        })

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))

    @for_each_factor_dtype
    def test_rank_ascending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0],
                      [1, 2, 3, 0, 1],
                      [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3],
                      [0, 1, 2, 3, 0]], dtype=factor_dtype)

        expected_ranks = {
            'ordinal': array([[1., 3., 4., 5., 2.],
                              [2., 4., 5., 1., 3.],
                              [3., 5., 1., 2., 4.],
                              [4., 1., 2., 3., 5.],
                              [1., 3., 4., 5., 2.]]),
            'average': array([[1.5, 3., 4., 5., 1.5],
                              [2.5, 4., 5., 1., 2.5],
                              [3.5, 5., 1., 2., 3.5],
                              [4.5, 1., 2., 3., 4.5],
                              [1.5, 3., 4., 5., 1.5]]),
            'min': array([[1., 3., 4., 5., 1.],
                          [2., 4., 5., 1., 2.],
                          [3., 5., 1., 2., 3.],
                          [4., 1., 2., 3., 4.],
                          [1., 3., 4., 5., 1.]]),
            'max': array([[2., 3., 4., 5., 2.],
                          [3., 4., 5., 1., 3.],
                          [4., 5., 1., 2., 4.],
                          [5., 1., 2., 3., 5.],
                          [2., 3., 4., 5., 2.]]),
            'dense': array([[1., 2., 3., 4., 1.],
                            [2., 3., 4., 1., 2.],
                            [3., 4., 1., 2., 3.],
                            [4., 1., 2., 3., 4.],
                            [1., 2., 3., 4., 1.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={f: data},
                mask=self.build_mask(ones((5, 5))),
            )
            for method in terms:
                check_arrays(results[method], expected_ranks[method])

        check({meth: f.rank(method=meth) for meth in expected_ranks})
        check({
            meth: f.rank(method=meth, ascending=True)
            for meth in expected_ranks
        })
        # Not passing a method should default to ordinal.
        check({'ordinal': f.rank()})
        check({'ordinal': f.rank(ascending=True)})

    @for_each_factor_dtype
    def test_rank_descending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0],
                      [1, 2, 3, 0, 1],
                      [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3],
                      [0, 1, 2, 3, 0]], dtype=factor_dtype)

        expected_ranks = {
            'ordinal': array([[4., 3., 2., 1., 5.],
                              [3., 2., 1., 5., 4.],
                              [2., 1., 5., 4., 3.],
                              [1., 5., 4., 3., 2.],
                              [4., 3., 2., 1., 5.]]),
            'average': array([[4.5, 3., 2., 1., 4.5],
                              [3.5, 2., 1., 5., 3.5],
                              [2.5, 1., 5., 4., 2.5],
                              [1.5, 5., 4., 3., 1.5],
                              [4.5, 3., 2., 1., 4.5]]),
            'min': array([[4., 3., 2., 1., 4.],
                          [3., 2., 1., 5., 3.],
                          [2., 1., 5., 4., 2.],
                          [1., 5., 4., 3., 1.],
                          [4., 3., 2., 1., 4.]]),
            'max': array([[5., 3., 2., 1., 5.],
                          [4., 2., 1., 5., 4.],
                          [3., 1., 5., 4., 3.],
                          [2., 5., 4., 3., 2.],
                          [5., 3., 2., 1., 5.]]),
            'dense': array([[4., 3., 2., 1., 4.],
                            [3., 2., 1., 4., 3.],
                            [2., 1., 4., 3., 2.],
                            [1., 4., 3., 2., 1.],
                            [4., 3., 2., 1., 4.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={f: data},
                mask=self.build_mask(ones((5, 5))),
            )
            for method in terms:
                check_arrays(results[method], expected_ranks[method])

        check({
            meth: f.rank(method=meth, ascending=False)
            for meth in expected_ranks
        })
        # Not passing a method should default to ordinal.
        check({'ordinal': f.rank(ascending=False)})

    @for_each_factor_dtype
    def test_rank_after_mask(self, name, factor_dtype):

        f = F(dtype=factor_dtype)
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0],
                      [1, 2, 3, 0, 1],
                      [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3],
                      [0, 1, 2, 3, 0]], dtype=factor_dtype)
        mask_data = ~eye(5, dtype=bool)
        initial_workspace = {f: data, Mask(): mask_data}

        graph = TermGraph({
            "ascending_nomask": f.rank(ascending=True),
            "ascending_mask": f.rank(ascending=True, mask=Mask()),
            "descending_nomask": f.rank(ascending=False),
            "descending_mask": f.rank(ascending=False, mask=Mask()),
        })

        expected = {
            "ascending_nomask": array([[1., 3., 4., 5., 2.],
                                       [2., 4., 5., 1., 3.],
                                       [3., 5., 1., 2., 4.],
                                       [4., 1., 2., 3., 5.],
                                       [1., 3., 4., 5., 2.]]),
            "descending_nomask": array([[4., 3., 2., 1., 5.],
                                        [3., 2., 1., 5., 4.],
                                        [2., 1., 5., 4., 3.],
                                        [1., 5., 4., 3., 2.],
                                        [4., 3., 2., 1., 5.]]),
            # Diagonal should be all nans, and anything whose rank was less
            # than the diagonal in the unmasked calc should go down by 1.
            "ascending_mask": array([[nan, 2., 3., 4., 1.],
                                     [2., nan, 4., 1., 3.],
                                     [2., 4., nan, 1., 3.],
                                     [3., 1., 2., nan, 4.],
                                     [1., 2., 3., 4., nan]]),
            "descending_mask": array([[nan, 3., 2., 1., 4.],
                                      [2., nan, 1., 4., 3.],
                                      [2., 1., nan, 4., 3.],
                                      [1., 4., 3., nan, 2.],
                                      [4., 3., 2., 1., nan]]),
        }

        results = self.run_graph(
            graph,
            initial_workspace,
            mask=self.build_mask(ones((5, 5))),
        )
        for method in results:
            check_arrays(expected[method], results[method])

    @for_each_factor_dtype
    def test_grouped_rank_ascending(self, name, factor_dtype=float64_dtype):

        f = F(dtype=factor_dtype)
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0],
                      [1, 2, 3, 0, 1],
                      [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3],
                      [0, 1, 2, 3, 0]], dtype=factor_dtype)

        # Generated with:
        # classifier_data = arange(25).reshape(5, 5).transpose() % 2
        classifier_data = array([[0, 1, 0, 1, 0],
                                 [1, 0, 1, 0, 1],
                                 [0, 1, 0, 1, 0],
                                 [1, 0, 1, 0, 1],
                                 [0, 1, 0, 1, 0]], dtype=int64_dtype)
        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        expected_grouped_ranks = {
            'ordinal': array([[1., 1., 3., 2., 2.],
                              [1., 2., 3., 1., 2.],
                              [2., 2., 1., 1., 3.],
                              [2., 1., 1., 2., 3.],
                              [1., 1., 3., 2., 2.]]),
            'average': array([[1.5, 1., 3., 2., 1.5],
                              [1.5, 2., 3., 1., 1.5],
                              [2.5, 2., 1., 1., 2.5],
                              [2.5, 1., 1., 2., 2.5],
                              [1.5, 1., 3., 2., 1.5]]),
            'min': array([[1., 1., 3., 2., 1.],
                          [1., 2., 3., 1., 1.],
                          [2., 2., 1., 1., 2.],
                          [2., 1., 1., 2., 2.],
                          [1., 1., 3., 2., 1.]]),
            'max': array([[2., 1., 3., 2., 2.],
                          [2., 2., 3., 1., 2.],
                          [3., 2., 1., 1., 3.],
                          [3., 1., 1., 2., 3.],
                          [2., 1., 3., 2., 2.]]),
            'dense': array([[1., 1., 2., 2., 1.],
                            [1., 2., 2., 1., 1.],
                            [2., 2., 1., 1., 2.],
                            [2., 1., 1., 2., 2.],
                            [1., 1., 2., 2., 1.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={
                    f: data,
                    c: classifier_data,
                    str_c: string_classifier_data,
                },
                mask=self.build_mask(ones((5, 5))),
            )

            for method in terms:
                check_arrays(results[method], expected_grouped_ranks[method])

        # Not specifying the value of ascending param should default to True
        check({
            meth: f.rank(method=meth, groupby=c)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=c, ascending=True)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c, ascending=True)
            for meth in expected_grouped_ranks
        })

        # Not passing a method should default to ordinal
        check({'ordinal': f.rank(groupby=c)})
        check({'ordinal': f.rank(groupby=str_c)})
        check({'ordinal': f.rank(groupby=c, ascending=True)})
        check({'ordinal': f.rank(groupby=str_c, ascending=True)})

    @for_each_factor_dtype
    def test_grouped_rank_descending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0],
                      [1, 2, 3, 0, 1],
                      [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3],
                      [0, 1, 2, 3, 0]], dtype=factor_dtype)

        # Generated with:
        # classifier_data = arange(25).reshape(5, 5).transpose() % 2
        classifier_data = array([[0, 1, 0, 1, 0],
                                 [1, 0, 1, 0, 1],
                                 [0, 1, 0, 1, 0],
                                 [1, 0, 1, 0, 1],
                                 [0, 1, 0, 1, 0]], dtype=int64_dtype)

        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        expected_grouped_ranks = {
            'ordinal': array([[2., 2., 1., 1., 3.],
                              [2., 1., 1., 2., 3.],
                              [1., 1., 3., 2., 2.],
                              [1., 2., 3., 1., 2.],
                              [2., 2., 1., 1., 3.]]),
            'average': array([[2.5, 2., 1., 1., 2.5],
                              [2.5, 1., 1., 2., 2.5],
                              [1.5, 1., 3., 2., 1.5],
                              [1.5, 2., 3., 1., 1.5],
                              [2.5, 2., 1., 1., 2.5]]),
            'min': array([[2., 2., 1., 1., 2.],
                          [2., 1., 1., 2., 2.],
                          [1., 1., 3., 2., 1.],
                          [1., 2., 3., 1., 1.],
                          [2., 2., 1., 1., 2.]]),
            'max': array([[3., 2., 1., 1., 3.],
                          [3., 1., 1., 2., 3.],
                          [2., 1., 3., 2., 2.],
                          [2., 2., 3., 1., 2.],
                          [3., 2., 1., 1., 3.]]),
            'dense': array([[2., 2., 1., 1., 2.],
                            [2., 1., 1., 2., 2.],
                            [1., 1., 2., 2., 1.],
                            [1., 2., 2., 1., 1.],
                            [2., 2., 1., 1., 2.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={
                    f: data,
                    c: classifier_data,
                    str_c: string_classifier_data,
                },
                mask=self.build_mask(ones((5, 5))),
            )

            for method in terms:
                check_arrays(results[method], expected_grouped_ranks[method])

        check({
            meth: f.rank(method=meth, groupby=c, ascending=False)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c, ascending=False)
            for meth in expected_grouped_ranks
        })

        # Not passing a method should default to ordinal
        check({'ordinal': f.rank(groupby=c, ascending=False)})
        check({'ordinal': f.rank(groupby=str_c, ascending=False)})

    @parameterized.expand([
        # Test cases computed by doing:
        # from numpy.random import seed, randn
        # from talib import RSI
        # seed(seed_value)
        # data = abs(randn(15, 3))
        # expected = [RSI(data[:, i])[-1] for i in range(3)]
        (100, array([41.032913785966, 51.553585468393, 51.022005016446])),
        (101, array([43.506969935466, 46.145367530182, 50.57407044197])),
        (102, array([46.610102205934, 47.646892444315, 52.13182788538])),
    ])
    def test_rsi(self, seed_value, expected):

        rsi = RSI()

        today = datetime64(1, 'ns')
        assets = arange(3)

        seed(seed_value)  # Seed so we get deterministic results.
        test_data = abs(randn(15, 3))

        out = empty((3, ), dtype=float)
        rsi.compute(today, assets, out, test_data)

        check_allclose(expected, out)

    @parameterized.expand([
        (100, 15),
        (101, 4),
        (102, 100),
    ])
    def test_returns(self, seed_value, window_length):

        returns = Returns(window_length=window_length)

        today = datetime64(1, 'ns')
        assets = arange(3)

        seed(seed_value)  # Seed so we get deterministic results.
        test_data = abs(randn(window_length, 3))

        # Calculate the expected returns
        expected = (test_data[-1] - test_data[0]) / test_data[0]

        out = empty((3, ), dtype=float)
        returns.compute(today, assets, out, test_data)

        check_allclose(expected, out)

    def gen_ranking_cases():
        # Called once at class-definition time (no ``self``) to build the
        # parameter tuples for test_masked_rankdata_2d.
        seeds = range(int(1e4), int(1e5), int(1e4))
        methods = ('ordinal', 'average')
        use_mask_values = (True, False)
        set_missing_values = (True, False)
        ascending_values = (True, False)
        return product(
            seeds,
            methods,
            use_mask_values,
            set_missing_values,
            ascending_values,
        )

    @parameterized.expand(gen_ranking_cases())
    def test_masked_rankdata_2d(self,
                                seed_value,
                                method,
                                use_mask,
                                set_missing,
                                ascending):
        # Ranking float data and the same bytes viewed as datetime64[ns]
        # must give identical results.
        eyemask = ~eye(5, dtype=bool)
        nomask = ones((5, 5), dtype=bool)

        seed(seed_value)
        asfloat = (randn(5, 5) * seed_value)
        asdatetime = (asfloat).copy().view('datetime64[ns]')

        mask = eyemask if use_mask else nomask
        if set_missing:
            asfloat[:, 2] = nan
            asdatetime[:, 2] = NaTns

        # Forward the parameterised sort direction so that both the
        # ascending and the descending cases are actually exercised.
        float_result = masked_rankdata_2d(
            data=asfloat,
            mask=mask,
            missing_value=nan,
            method=method,
            ascending=ascending,
        )
        datetime_result = masked_rankdata_2d(
            data=asdatetime,
            mask=mask,
            missing_value=NaTns,
            method=method,
            ascending=ascending,
        )

        check_arrays(float_result, datetime_result)

    def test_normalizations_hand_computed(self):
        """
        Test the hand-computed example in factor.demean.
        """
        f = self.f
        m = Mask()
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        factor_data = array(
            [[1.0, 2.0, 3.0, 4.0],
             [1.5, 2.5, 3.5, 1.0],
             [2.0, 3.0, 4.0, 1.5],
             [2.5, 3.5, 1.0, 2.0]],
        )
        filter_data = array(
            [[False, True, True, True],
             [True, False, True, True],
             [True, True, False, True],
             [True, True, True, False]],
            dtype=bool,
        )
        classifier_data = array(
            [[1, 1, 2, 2],
             [1, 1, 2, 2],
             [1, 1, 2, 2],
             [1, 1, 2, 2]],
            dtype=int64_dtype,
        )
        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        terms = {
            'vanilla': f.demean(),
            'masked': f.demean(mask=m),
            'grouped': f.demean(groupby=c),
            'grouped_str': f.demean(groupby=str_c),
            'grouped_masked': f.demean(mask=m, groupby=c),
            'grouped_masked_str': f.demean(mask=m, groupby=str_c),
        }
        expected = {
            'vanilla': array(
                [[-1.500, -0.500, 0.500, 1.500],
                 [-0.625, 0.375, 1.375, -1.125],
                 [-0.625, 0.375, 1.375, -1.125],
                 [0.250, 1.250, -1.250, -0.250]],
            ),
            'masked': array(
                [[nan, -1.000, 0.000, 1.000],
                 [-0.500, nan, 1.500, -1.000],
                 [-0.166, 0.833, nan, -0.666],
                 [0.166, 1.166, -1.333, nan]],
            ),
            'grouped': array(
                [[-0.500, 0.500, -0.500, 0.500],
                 [-0.500, 0.500, 1.250, -1.250],
                 [-0.500, 0.500, 1.250, -1.250],
                 [-0.500, 0.500, -0.500, 0.500]],
            ),
            'grouped_masked': array(
                [[nan, 0.000, -0.500, 0.500],
                 [0.000, nan, 1.250, -1.250],
                 [-0.500, 0.500, nan, 0.000],
                 [-0.500, 0.500, 0.000, nan]]
            )
        }
        # Changing the classifier dtype shouldn't affect anything.
        expected['grouped_str'] = expected['grouped']
        expected['grouped_masked_str'] = expected['grouped_masked']

        graph = TermGraph(terms)
        results = self.run_graph(
            graph,
            initial_workspace={
                f: factor_data,
                c: classifier_data,
                str_c: string_classifier_data,
                m: filter_data,
            },
            mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
        )

        for key, (res, exp) in dzip_exact(results, expected).items():
            check_allclose(
                res,
                exp,
                # The hand-computed values aren't very precise (in particular,
                # we truncate repeating decimals at 3 places) This is just
                # asserting that the example isn't misleading by being totally
                # wrong.
                atol=0.001,
                err_msg="Mismatch for %r" % key
            )

    @parameter_space(
        seed_value=range(1, 2),
        normalizer_name_and_func=[
            ('demean', lambda row: row - nanmean(row)),
            ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
        ],
        add_nulls_to_factor=(False, True,),
    )
    def test_normalizations_randomized(self,
                                       seed_value,
                                       normalizer_name_and_func,
                                       add_nulls_to_factor):

        name, func = normalizer_name_and_func

        shape = (7, 7)

        # All Trues.
        nomask = self.ones_mask(shape=shape)
        # Falses on main diagonal.
        eyemask = self.eye_mask(shape=shape)
        # Falses on other diagonal.
        eyemask90 = rot90(eyemask)
        # Falses on both diagonals.
        xmask = eyemask & eyemask90

        # Block of random data.
        factor_data = self.randn_data(seed=seed_value, shape=shape)
        if add_nulls_to_factor:
            factor_data = where(eyemask, factor_data, nan)

        # Cycles of 0, 1, 2, 0, 1, 2, ...
        classifier_data = (
            (self.arange_data(shape=shape, dtype=int64_dtype) + seed_value) % 3
        )
        # With -1s on main diagonal.
        classifier_data_eyenulls = where(eyemask, classifier_data, -1)
        # With -1s on opposite diagonal.
        classifier_data_eyenulls90 = where(eyemask90, classifier_data, -1)
        # With -1s on both diagonals.
        classifier_data_xnulls = where(xmask, classifier_data, -1)

        f = self.f
        c = C()
        c_with_nulls = OtherC()
        m = Mask()
        method = getattr(f, name)
        terms = {
            'vanilla': method(),
            'masked': method(mask=m),
            'grouped': method(groupby=c),
            'grouped_with_nulls': method(groupby=c_with_nulls),
            'both': method(mask=m, groupby=c),
            'both_with_nulls': method(mask=m, groupby=c_with_nulls),
        }

        expected = {
            'vanilla': apply_along_axis(func, 1, factor_data,),
            'masked': where(
                eyemask,
                grouped_apply(factor_data, eyemask, func),
                nan,
            ),
            'grouped': grouped_apply(
                factor_data,
                classifier_data,
                func,
            ),
            # If the classifier has nulls, we should get NaNs in the
            # corresponding locations in the output.
            'grouped_with_nulls': where(
                eyemask90,
                grouped_apply(factor_data, classifier_data_eyenulls90, func),
                nan,
            ),
            # Passing a mask with a classifier should behave as though the
            # classifier had nulls where the mask was False.
            'both': where(
                eyemask,
                grouped_apply(
                    factor_data,
                    classifier_data_eyenulls,
                    func,
                ),
                nan,
            ),
            'both_with_nulls': where(
                xmask,
                grouped_apply(
                    factor_data,
                    classifier_data_xnulls,
                    func,
                ),
                nan,
            )
        }

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={
                f: factor_data,
                c: classifier_data,
                c_with_nulls: classifier_data_eyenulls90,
                Mask(): eyemask,
            },
            mask=self.build_mask(nomask),
        )

    @parameter_space(method_name=['demean', 'zscore'])
    def test_cant_normalize_non_float(self, method_name):
        # Normalizers must reject non-float64 factors with the exact
        # message asserted below.
        class DateFactor(Factor):
            dtype = datetime64ns_dtype
            inputs = ()
            window_length = 0

        d = DateFactor()
        with self.assertRaises(TypeError) as e:
            getattr(d, method_name)()

        errmsg = str(e.exception)
        expected = (
            "{normalizer}() is only defined on Factors of dtype float64,"
            " but it was called on a Factor of dtype datetime64[ns]."
        ).format(normalizer=method_name)

        self.assertEqual(errmsg, expected)

    @parameter_space(seed=[1, 2, 3])
    def test_quantiles_unmasked(self, seed):
        permute = partial(permute_rows, seed)

        shape = (6, 6)

        # Shuffle the input rows to verify that we don't depend on the order.
        # Take the log to ensure that we don't depend on linear scaling or
        # integrality of inputs
        factor_data = permute(log1p(arange(36, dtype=float).reshape(shape)))

        f = self.f

        # Apply the same shuffle we applied to the input rows to our
        # expectations. Doing it this way makes it obvious that our
        # expectation corresponds to our input, while still testing against
        # a range of input orderings.
        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
        self.check_terms(
            terms={
                '2': f.quantiles(bins=2),
                '3': f.quantiles(bins=3),
                '6': f.quantiles(bins=6),
            },
            initial_workspace={
                f: factor_data,
            },
            expected={
                # The values in the input are all increasing, so the first half
                # of each row should be in the bottom bucket, and the second
                # half should be in the top bucket.
                '2': permuted_array([[0, 0, 0, 1, 1, 1],
                                     [0, 0, 0, 1, 1, 1],
                                     [0, 0, 0, 1, 1, 1],
                                     [0, 0, 0, 1, 1, 1],
                                     [0, 0, 0, 1, 1, 1],
                                     [0, 0, 0, 1, 1, 1]]),
                # Similar for three buckets.
                '3': permuted_array([[0, 0, 1, 1, 2, 2],
                                     [0, 0, 1, 1, 2, 2],
                                     [0, 0, 1, 1, 2, 2],
                                     [0, 0, 1, 1, 2, 2],
                                     [0, 0, 1, 1, 2, 2],
                                     [0, 0, 1, 1, 2, 2]]),
                # In the limiting case, we just have every column different.
                '6': permuted_array([[0, 1, 2, 3, 4, 5],
                                     [0, 1, 2, 3, 4, 5],
                                     [0, 1, 2, 3, 4, 5],
                                     [0, 1, 2, 3, 4, 5],
                                     [0, 1, 2, 3, 4, 5],
                                     [0, 1, 2, 3, 4, 5]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    @parameter_space(seed=[1, 2, 3])
    def test_quantiles_masked(self, seed):
        permute = partial(permute_rows, seed)

        # 7 x 7 so that we divide evenly into 2/3/6-tiles after including the
        # nan value in each row.
        shape = (7, 7)

        # Shuffle the input rows to verify that we don't depend on the order.
        # Take the log to ensure that we don't depend on linear scaling or
        # integrality of inputs
        factor_data = permute(log1p(arange(49, dtype=float).reshape(shape)))
        factor_data_w_nans = where(
            permute(rot90(self.eye_mask(shape=shape))),
            factor_data,
            nan,
        )
        mask_data = permute(self.eye_mask(shape=shape))

        f = F()
        f_nans = OtherF()
        m = Mask()

        # Apply the same shuffle we applied to the input rows to our
        # expectations. Doing it this way makes it obvious that our
        # expectation corresponds to our input, while still testing against
        # a range of input orderings.
        permuted_array = compose(permute, partial(array, dtype=int64_dtype))

        self.check_terms(
            terms={
                '2_masked': f.quantiles(bins=2, mask=m),
                '3_masked': f.quantiles(bins=3, mask=m),
                '6_masked': f.quantiles(bins=6, mask=m),
                '2_nans': f_nans.quantiles(bins=2),
                '3_nans': f_nans.quantiles(bins=3),
                '6_nans': f_nans.quantiles(bins=6),
            },
            initial_workspace={
                f: factor_data,
                f_nans: factor_data_w_nans,
                m: mask_data,
            },
            expected={
                # Expected results here are the same as in
                # test_quantiles_unmasked, except with diagonals of -1s
                # interpolated to match the effects of masking and/or input
                # nans.
                '2_masked': permuted_array([[-1, 0, 0, 0, 1, 1, 1],
                                            [0, -1, 0, 0, 1, 1, 1],
                                            [0, 0, -1, 0, 1, 1, 1],
                                            [0, 0, 0, -1, 1, 1, 1],
                                            [0, 0, 0, 1, -1, 1, 1],
                                            [0, 0, 0, 1, 1, -1, 1],
                                            [0, 0, 0, 1, 1, 1, -1]]),
                '3_masked': permuted_array([[-1, 0, 0, 1, 1, 2, 2],
                                            [0, -1, 0, 1, 1, 2, 2],
                                            [0, 0, -1, 1, 1, 2, 2],
                                            [0, 0, 1, -1, 1, 2, 2],
                                            [0, 0, 1, 1, -1, 2, 2],
                                            [0, 0, 1, 1, 2, -1, 2],
                                            [0, 0, 1, 1, 2, 2, -1]]),
                '6_masked': permuted_array([[-1, 0, 1, 2, 3, 4, 5],
                                            [0, -1, 1, 2, 3, 4, 5],
                                            [0, 1, -1, 2, 3, 4, 5],
                                            [0, 1, 2, -1, 3, 4, 5],
                                            [0, 1, 2, 3, -1, 4, 5],
                                            [0, 1, 2, 3, 4, -1, 5],
                                            [0, 1, 2, 3, 4, 5, -1]]),
                '2_nans': permuted_array([[0, 0, 0, 1, 1, 1, -1],
                                          [0, 0, 0, 1, 1, -1, 1],
                                          [0, 0, 0, 1, -1, 1, 1],
                                          [0, 0, 0, -1, 1, 1, 1],
                                          [0, 0, -1, 0, 1, 1, 1],
                                          [0, -1, 0, 0, 1, 1, 1],
                                          [-1, 0, 0, 0, 1, 1, 1]]),
                '3_nans': permuted_array([[0, 0, 1, 1, 2, 2, -1],
                                          [0, 0, 1, 1, 2, -1, 2],
                                          [0, 0, 1, 1, -1, 2, 2],
                                          [0, 0, 1, -1, 1, 2, 2],
                                          [0, 0, -1, 1, 1, 2, 2],
                                          [0, -1, 0, 1, 1, 2, 2],
                                          [-1, 0, 0, 1, 1, 2, 2]]),
                '6_nans': permuted_array([[0, 1, 2, 3, 4, 5, -1],
                                          [0, 1, 2, 3, 4, -1, 5],
                                          [0, 1, 2, 3, -1, 4, 5],
                                          [0, 1, 2, -1, 3, 4, 5],
                                          [0, 1, -1, 2, 3, 4, 5],
                                          [0, -1, 1, 2, 3, 4, 5],
                                          [-1, 0, 1, 2, 3, 4, 5]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    def test_quantiles_uneven_buckets(self):
        # Bin counts that do not divide the four unmasked values per row.
        permute = partial(permute_rows, 5)
        shape = (5, 5)

        factor_data = permute(log1p(arange(25, dtype=float).reshape(shape)))
        mask_data = permute(self.eye_mask(shape=shape))

        f = F()
        m = Mask()

        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
        self.check_terms(
            terms={
                '3_masked': f.quantiles(bins=3, mask=m),
                '7_masked': f.quantiles(bins=7, mask=m),
            },
            initial_workspace={
                f: factor_data,
                m: mask_data,
            },
            expected={
                '3_masked': permuted_array([[-1, 0, 0, 1, 2],
                                            [0, -1, 0, 1, 2],
                                            [0, 0, -1, 1, 2],
                                            [0, 0, 1, -1, 2],
                                            [0, 0, 1, 2, -1]]),
                '7_masked': permuted_array([[-1, 0, 2, 4, 6],
                                            [0, -1, 2, 4, 6],
                                            [0, 2, -1, 4, 6],
                                            [0, 2, 4, -1, 6],
                                            [0, 2, 4, 6, -1]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    def test_quantile_helpers(self):
        # quartiles/quintiles/deciles must return the very same term object
        # as quantiles() with 4/5/10 bins, and a mask must yield a
        # different term.
        f = self.f
        m = Mask()

        self.assertIs(f.quartiles(), f.quantiles(bins=4))
        self.assertIs(f.quartiles(mask=m), f.quantiles(bins=4, mask=m))
        self.assertIsNot(f.quartiles(), f.quartiles(mask=m))

        self.assertIs(f.quintiles(), f.quantiles(bins=5))
        self.assertIs(f.quintiles(mask=m), f.quantiles(bins=5, mask=m))
        self.assertIsNot(f.quintiles(), f.quintiles(mask=m))

        self.assertIs(f.deciles(), f.quantiles(bins=10))
        self.assertIs(f.deciles(mask=m), f.quantiles(bins=10, mask=m))
        self.assertIsNot(f.deciles(), f.deciles(mask=m))