def test_percentile_after_mask(self):
    """
    Applying a mask to ``percentile_between`` should remove the masked
    entries before percentiles are computed.
    """
    identity_data = eye(5)
    ramp_data = arange(25, dtype=float).reshape(5, 5)
    base_mask = self.build_mask(ones((5, 5)))

    # True everywhere except the diagonal (where ``eye`` is 1).
    off_diagonal = self.f < 1
    top_quintile = self.g.percentile_between(80, 100)
    top_quintile_masked = self.g.percentile_between(
        80, 100, mask=off_diagonal,
    )

    outputs = self.run_graph(
        TermGraph({
            'custom_mask': off_diagonal,
            'without': top_quintile,
            'with': top_quintile_masked,
        }),
        initial_workspace={self.f: identity_data, self.g: ramp_data},
        mask=base_mask,
    )

    # The mask itself should pass everything but the diagonal.
    check_arrays(outputs['custom_mask'], ~eye(5, dtype=bool))

    # Unmasked: each row of ``ramp_data`` is strictly increasing, so the
    # top quintile is always the last column.
    last_column_only = array([[0, 0, 0, 0, 1]] * 5, dtype=bool)
    check_arrays(outputs['without'], last_column_only)

    # Masked: the diagonal is removed as an option before percentiles are
    # computed, so on the last day the second-largest value is selected
    # rather than the largest.
    expected_masked = array(
        [[0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 1, 0]],  # Different from the unmasked case!
        dtype=bool,
    )
    check_arrays(outputs['with'], expected_masked)
def test_single_factor(self):
    """
    Test dependency resolution for a single factor.

    Each equivalent spelling of the factor should resolve to the factor,
    its two inputs, and AssetExists(), with 4 extra rows of each input
    (window_length - 1).
    """
    def check_output(graph):
        resolution_order = list(graph.ordered())

        # The factor, its two inputs, and AssetExists().
        self.assertEqual(len(resolution_order), 4)
        self.check_dependency_order(resolution_order)
        self.assertIn(AssetExists(), resolution_order)
        self.assertIn(SomeDataSet.foo, resolution_order)
        self.assertIn(SomeDataSet.bar, resolution_order)
        self.assertIn(SomeFactor(), resolution_order)

        # Use the public ``extra_rows`` accessor (as in
        # test_single_factor_instance_args) rather than reaching into the
        # underlying networkx node dicts via the legacy ``.node``
        # attribute, which was removed in networkx 2.4.
        self.assertEqual(graph.extra_rows[SomeDataSet.foo], 4)
        self.assertEqual(graph.extra_rows[SomeDataSet.bar], 4)

    for foobar in gen_equivalent_factors():
        check_output(TermGraph(to_dict([foobar])))
def test_bottom(self):
    """
    ``bottom(N)`` should pass exactly the N lowest values in each row,
    or every value when N exceeds the number of assets.
    """
    asset_counts = (2, 3, 10)
    data = self.randn_data(seed=5)  # Arbitrary seed choice.
    num_assets = data.shape[1]

    outputs = self.run_graph(
        TermGraph({
            'bottom_' + str(count): self.f.bottom(count)
            for count in asset_counts
        }),
        initial_workspace={self.f: data},
    )

    ranks = rowwise_rank(data)
    for count in asset_counts:
        result = outputs['bottom_' + str(count)]

        # `min(count, num_assets)` assets should pass each day.
        passed_per_day = result.sum(axis=1)
        check_arrays(
            passed_per_day,
            full_like(passed_per_day, min(count, num_assets)),
        )

        # Exactly the `count` lowest-ranked assets should pass.
        check_arrays(result, ranks < count)
def test_single_factor_instance_args(self):
    """
    Test dependency resolution for a single factor with arguments passed
    to the constructor.
    """
    bar, buzz = SomeDataSet.bar, SomeDataSet.buzz
    factor = SomeFactor([bar, buzz], window_length=5)
    graph = TermGraph(to_dict([factor]))
    resolution_order = list(graph.ordered())

    # The factor, its two inputs, and AssetExists().
    self.assertEqual(len(resolution_order), 4)
    self.check_dependency_order(resolution_order)

    for term in (AssetExists(), bar, buzz, factor):
        self.assertIn(term, resolution_order)

    # A window_length of 5 requires 4 extra rows of each dependency.
    for term in (AssetExists(), bar, buzz):
        self.assertEqual(graph.extra_rows[term], 4)
def test_all(self):
    """
    ``All(window_length=N)`` should be False for the (N - 1) days after
    any False value in its input.
    """
    data = array(
        [[1, 1, 1, 1, 1, 1],
         [0, 1, 1, 1, 1, 1],
         [1, 0, 1, 1, 1, 1],
         [1, 1, 0, 1, 1, 1],
         [1, 1, 1, 0, 1, 1],
         [1, 1, 1, 1, 0, 1],
         [1, 1, 1, 1, 1, 0]],
        dtype=bool,
    )

    class Input(Filter):
        inputs = ()
        window_length = 0

    # With a window_length of N, 0's are "sticky" for the (N - 1) days
    # after the 0 in the base data.
    #
    # Note that, the way ``self.run_graph`` works, we compute the same
    # number of output rows for all inputs, so we only get the last 4
    # outputs for window_length=3 even though we have enough input data
    # to compute 5 rows.
    expected = {
        '3': array(
            [[0, 0, 0, 1, 1, 1],
             [1, 0, 0, 0, 1, 1],
             [1, 1, 0, 0, 0, 1],
             [1, 1, 1, 0, 0, 0]],
            dtype=bool,
        ),
        '4': array(
            [[0, 0, 0, 1, 1, 1],
             [0, 0, 0, 0, 1, 1],
             [1, 0, 0, 0, 0, 1],
             [1, 1, 0, 0, 0, 0]],
            dtype=bool,
        ),
    }

    results = self.run_graph(
        TermGraph({
            window: All(inputs=[Input()], window_length=int(window))
            for window in expected
        }),
        initial_workspace={Input(): data},
        mask=self.build_mask(ones(shape=data.shape)),
    )

    for window, expected_result in expected.items():
        check_arrays(results[window], expected_result)
def test_isnull_datetime_dtype(self):
    """
    ``isnull``/``notnull`` on a datetime-dtype factor should detect NaT
    entries.
    """
    class DatetimeFactor(Factor):
        dtype = datetime64ns_dtype
        window_length = 0
        inputs = ()

    factor = DatetimeFactor()

    # A 5x5 grid of timestamps with NaT along the diagonal.
    diagonal = eye(5, dtype=bool)
    data = arange(25).reshape(5, 5).astype('datetime64[ns]')
    data[diagonal] = NaTns

    results = self.run_graph(
        TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        }),
        initial_workspace={factor: data},
        mask=self.build_mask(ones((5, 5))),
    )

    check_arrays(results['isnull'], diagonal)
    check_arrays(results['notnull'], ~diagonal)
def test_isnull_int_dtype(self, custom_missing_value):
    """
    ``isnull``/``notnull`` on an int-dtype factor should detect its
    configured missing value.
    """
    class CustomMissingValue(Factor):
        dtype = int64_dtype
        window_length = 0
        missing_value = custom_missing_value
        inputs = ()

    factor = CustomMissingValue()

    # A 5x5 grid of ints with the missing value along the diagonal.
    diagonal = eye(5, dtype=bool)
    data = arange(25).reshape(5, 5)
    data[diagonal] = custom_missing_value

    results = self.run_graph(
        TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        }),
        initial_workspace={factor: data},
        mask=self.build_mask(ones((5, 5))),
    )

    check_arrays(results['isnull'], diagonal)
    check_arrays(results['notnull'], ~diagonal)
def test_percentile_between(self):
    """
    Test quintile ``percentile_between`` filters against a full mask, a
    partial mask, and NaN inputs.
    """
    quintiles = range(5)
    filter_names = ['pct_' + str(q) for q in quintiles]
    # Materialize as a list: a bare ``zip`` iterator would be exhausted
    # after the first assertion loop, silently skipping the later ones.
    iter_quintiles = list(zip(filter_names, quintiles))
    graph = TermGraph({
        name: self.f.percentile_between(q * 20.0, (q + 1) * 20.0)
        for name, q in iter_quintiles
    })

    # Test with 5 columns and no NaNs.
    eye5 = eye(5, dtype=float64)
    results = self.run_graph(
        graph,
        initial_workspace={self.f: eye5},
        mask=self.build_mask(ones((5, 5))),
    )
    for name, quintile in iter_quintiles:
        result = results[name]
        if quintile < 4:
            # There are four 0s and one 1 in each row, so the first 4
            # quintiles should be all the locations with zeros in the input
            # array.
            check_arrays(result, ~eye5.astype(bool))
        else:
            # The top quintile should match the sole 1 in each row.
            check_arrays(result, eye5.astype(bool))

    # Test with 6 columns, no NaNs, and one masked entry per day.
    eye6 = eye(6, dtype=float64)
    mask = array(
        [[1, 1, 1, 1, 1, 0],
         [0, 1, 1, 1, 1, 1],
         [1, 0, 1, 1, 1, 1],
         [1, 1, 0, 1, 1, 1],
         [1, 1, 1, 0, 1, 1],
         [1, 1, 1, 1, 0, 1]],
        dtype=bool,
    )
    results = self.run_graph(
        graph,
        initial_workspace={self.f: eye6},
        mask=self.build_mask(mask),
    )
    for name, quintile in iter_quintiles:
        result = results[name]
        if quintile < 4:
            # Should keep all values that were 0 in the base data and were
            # 1 in the mask.
            check_arrays(result, mask & (~eye6.astype(bool)))
        else:
            # Should keep all the 1s in the base data.
            check_arrays(result, eye6.astype(bool))

    # Test with 6 columns, no mask, and one NaN per day. Should have the
    # same outcome as if we had masked the NaNs.
    # In particular, the NaNs should never pass any filters.
    eye6_withnans = eye6.copy()
    putmask(eye6_withnans, ~mask, nan)
    results = self.run_graph(
        graph,
        # Pass the NaN-laden data under a full mask; the original
        # erroneously re-ran the previous (masked, NaN-free) scenario.
        initial_workspace={self.f: eye6_withnans},
        mask=self.build_mask(ones((6, 6))),
    )
    for name, quintile in iter_quintiles:
        result = results[name]
        if quintile < 4:
            # Should keep all values that were 0 in the base data and are
            # not NaN.
            check_arrays(result, mask & (~eye6.astype(bool)))
        else:
            # Should keep all the 1s in the base data.
            check_arrays(result, eye6.astype(bool))
def test_normalizations_hand_computed(self):
    """
    Test the hand-computed example in factor.demean.

    Runs ``demean`` in every combination of mask/groupby and compares
    against hand-computed expected outputs.
    """
    f = self.f
    m = Mask()
    c = C()
    # Same classifier values as ``c``, but with a string/categorical dtype;
    # grouping behavior should be identical.
    str_c = C(dtype=categorical_dtype, missing_value=None)

    factor_data = array(
        [[1.0, 2.0, 3.0, 4.0],
         [1.5, 2.5, 3.5, 1.0],
         [2.0, 3.0, 4.0, 1.5],
         [2.5, 3.5, 1.0, 2.0]],
    )
    # One False per row (moving diagonal): the masked-out entry should
    # come back as NaN and be excluded from the row mean.
    filter_data = array(
        [[False, True, True, True],
         [True, False, True, True],
         [True, True, False, True],
         [True, True, True, False]],
        dtype=bool,
    )
    # Two groups per row: columns {0, 1} are group 1, columns {2, 3} are
    # group 2.
    classifier_data = array(
        [[1, 1, 2, 2],
         [1, 1, 2, 2],
         [1, 1, 2, 2],
         [1, 1, 2, 2]],
        dtype=int64_dtype,
    )
    string_classifier_data = LabelArray(
        classifier_data.astype(str).astype(object),
        missing_value=None,
    )

    terms = {
        'vanilla': f.demean(),
        'masked': f.demean(mask=m),
        'grouped': f.demean(groupby=c),
        'grouped_str': f.demean(groupby=str_c),
        'grouped_masked': f.demean(mask=m, groupby=c),
        'grouped_masked_str': f.demean(mask=m, groupby=str_c),
    }
    expected = {
        # Row mean subtracted from every entry.
        'vanilla': array(
            [[-1.500, -0.500, 0.500, 1.500],
             [-0.625, 0.375, 1.375, -1.125],
             [-0.625, 0.375, 1.375, -1.125],
             [0.250, 1.250, -1.250, -0.250]],
        ),
        # Masked entries are NaN and excluded from the row mean.
        # (Repeating decimals are truncated at 3 places.)
        'masked': array(
            [[nan, -1.000, 0.000, 1.000],
             [-0.500, nan, 1.500, -1.000],
             [-0.166, 0.833, nan, -0.666],
             [0.166, 1.166, -1.333, nan]],
        ),
        # Demeaned within each classifier group per row.
        'grouped': array(
            [[-0.500, 0.500, -0.500, 0.500],
             [-0.500, 0.500, 1.250, -1.250],
             [-0.500, 0.500, 1.250, -1.250],
             [-0.500, 0.500, -0.500, 0.500]],
        ),
        # Group demeaning with the masked entry excluded.
        'grouped_masked': array(
            [[nan, 0.000, -0.500, 0.500],
             [0.000, nan, 1.250, -1.250],
             [-0.500, 0.500, nan, 0.000],
             [-0.500, 0.500, 0.000, nan]]
        )
    }
    # Changing the classifier dtype shouldn't affect anything.
    expected['grouped_str'] = expected['grouped']
    expected['grouped_masked_str'] = expected['grouped_masked']

    graph = TermGraph(terms)
    results = self.run_graph(
        graph,
        initial_workspace={
            f: factor_data,
            c: classifier_data,
            str_c: string_classifier_data,
            m: filter_data,
        },
        mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
    )

    for key, (res, exp) in dzip_exact(results, expected).items():
        check_allclose(
            res,
            exp,
            # The hand-computed values aren't very precise (in particular,
            # we truncate repeating decimals at 3 places) This is just
            # asserting that the example isn't misleading by being totally
            # wrong.
            atol=0.001,
            err_msg="Mismatch for %r" % key)
def test_normalizations(self,
                        seed_value,
                        normalizer_name_and_func,
                        add_nulls_to_factor):
    """
    Test a normalization method (e.g. demean/zscore) against a reference
    implementation (``grouped_apply`` of ``func``), in every combination
    of mask/groupby/nulls.
    """
    name, func = normalizer_name_and_func

    shape = (7, 7)

    # All Trues.
    nomask = self.ones_mask(shape=shape)
    # Falses on main diagonal.
    eyemask = self.eye_mask(shape=shape)
    # Falses on other diagonal.
    eyemask_T = eyemask.T
    # Falses on both diagonals.
    xmask = eyemask & eyemask_T

    # Block of random data.
    factor_data = self.randn_data(seed=seed_value, shape=shape)
    if add_nulls_to_factor:
        factor_data = where(eyemask, factor_data, nan)

    # Cycles of 0, 1, 2, 0, 1, 2, ...
    classifier_data = (
        (self.arange_data(shape=shape, dtype=int) + seed_value) % 3
    )
    # With -1s on main diagonal.
    classifier_data_eyenulls = where(eyemask, classifier_data, -1)
    # With -1s on opposite diagonal.
    classifier_data_eyenulls_T = where(eyemask_T, classifier_data, -1)
    # With -1s on both diagonals.
    classifier_data_xnulls = where(xmask, classifier_data, -1)

    f = self.f
    c = C()
    c_with_nulls = OtherC()
    m = Mask()
    # Look up the normalization method under test (e.g. ``f.demean``).
    method = getattr(f, name)
    terms = {
        'vanilla': method(),
        'masked': method(mask=m),
        'grouped': method(groupby=c),
        'grouped_with_nulls': method(groupby=c_with_nulls),
        'both': method(mask=m, groupby=c),
        'both_with_nulls': method(mask=m, groupby=c_with_nulls),
    }

    expected = {
        # No mask/groupby: apply ``func`` independently to each row.
        'vanilla': apply_along_axis(
            func,
            1,
            factor_data,
        ),
        # Masked entries are excluded from the computation and come back
        # as NaN.
        'masked': where(
            eyemask,
            grouped_apply(factor_data, eyemask, func),
            nan,
        ),
        'grouped': grouped_apply(
            factor_data,
            classifier_data,
            func,
        ),
        # If the classifier has nulls, we should get NaNs in the
        # corresponding locations in the output.
        'grouped_with_nulls': where(
            eyemask_T,
            grouped_apply(factor_data, classifier_data_eyenulls_T, func),
            nan,
        ),
        # Passing a mask with a classifier should behave as though the
        # classifier had nulls where the mask was False.
        'both': where(
            eyemask,
            grouped_apply(
                factor_data,
                classifier_data_eyenulls,
                func,
            ),
            nan,
        ),
        'both_with_nulls': where(
            xmask,
            grouped_apply(
                factor_data,
                classifier_data_xnulls,
                func,
            ),
            nan,
        )
    }

    graph = TermGraph(terms)
    results = self.run_graph(
        graph,
        initial_workspace={
            f: factor_data,
            c: classifier_data,
            c_with_nulls: classifier_data_eyenulls_T,
            # NOTE(review): ``Mask()`` here presumably compares equal to
            # ``m`` above (terms appear to be keyed by construction
            # parameters) — confirm against Term's equality semantics.
            Mask(): eyemask,
        },
        mask=self.build_mask(nomask),
    )

    for key in expected:
        check_arrays(expected[key], results[key])