def test_reuse_atomic_terms(self):
    """
    Check that an input shared by two factors is resolved only once.

    ``SomeDataSet.bar`` feeds both factors, so the ordered graph is expected
    to hold six terms, with ``AssetExists()`` first and every input placed
    ahead of the factor that consumes it.
    """
    first = SomeFactor([SomeDataSet.foo, SomeDataSet.bar])
    second = SomeOtherFactor([SomeDataSet.bar, SomeDataSet.buzz])

    order = list(TermGraph(to_dict([first, second])).ordered())

    # Six entries means ``bar`` was not duplicated.
    self.assertEqual(len(order), 6)

    position = {term: order.index(term) for term in order}
    self.assertEqual(position[AssetExists()], 0)

    # (input, consumer) pairs: each input must be scheduled before its
    # consumer.
    dependencies = (
        (SomeDataSet.foo, first),
        (SomeDataSet.bar, first),
        (SomeDataSet.bar, second),
        (SomeDataSet.buzz, second),
    )
    for upstream, downstream in dependencies:
        self.assertLess(position[upstream], position[downstream])
def test_single_factor_instance_args(self):
    """
    Check dependency resolution for one factor whose inputs and
    window_length are passed to the constructor.
    """
    bar = SomeDataSet.bar
    buzz = SomeDataSet.buzz

    graph = TermGraph(to_dict([SomeFactor([bar, buzz], window_length=5)]))
    order = list(graph.ordered())

    # The factor, its two inputs, and AssetExists().
    self.assertEqual(len(order), 4)
    root, first_input, second_input, factor = order

    self.assertIs(root, AssetExists())
    self.assertEqual(graph.extra_rows[AssetExists()], 4)

    # The two inputs may come in either order.
    self.assertEqual({first_input, second_input}, {bar, buzz})
    self.assertEqual(factor, SomeFactor([bar, buzz], window_length=5))

    for loadable in (bar, buzz):
        self.assertEqual(graph.extra_rows[loadable], 4)
def test_reuse_loadable_terms(self):
    """
    Check that a loadable term shared by two factors appears only once in
    the dependency graph.
    """
    factors = [
        SomeFactor([SomeDataSet.foo, SomeDataSet.bar]),
        SomeOtherFactor([SomeDataSet.bar, SomeDataSet.buzz]),
    ]
    order = list(TermGraph(to_dict(factors)).ordered())

    # ``bar`` is shared, so six entries are expected, all of them distinct.
    self.assertEqual(len(order), 6)
    self.assertEqual(len(set(order)), 6)

    self.check_dependency_order(order)
def test_reuse_atomic_terms(self):
    """
    Check that the ``bar`` input used by both factors is not duplicated in
    the dependency graph.
    """
    first = SomeFactor([SomeDataSet.foo, SomeDataSet.bar])
    second = SomeOtherFactor([SomeDataSet.bar, SomeDataSet.buzz])
    graph = TermGraph(to_dict([first, second]))

    ordered_terms = list(graph.ordered())

    # Both the raw length and the de-duplicated length must be six.
    expected_count = 6
    self.assertEqual(len(ordered_terms), expected_count)
    self.assertEqual(len(set(ordered_terms)), expected_count)

    self.check_dependency_order(ordered_terms)
def test_bollinger_bands(self, window_length, k, mask_sid):
    """
    Compare the three BollingerBands outputs with ``self.expected``.

    The close prices from ``self.closes(mask_sid)`` are placed in the
    initial workspace, wrapped in an AdjustedArray.
    """
    closes = self.closes(mask_sid)

    # An all-True boolean array shaped like ``closes`` is passed as the
    # second AdjustedArray argument.
    all_true = np.full_like(closes, True, dtype=bool)
    workspace = {
        USEquityPricing.close: AdjustedArray(closes, all_true, {}, np.nan),
    }

    bands = BollingerBands(window_length=window_length, k=k)
    result = self.run_graph(
        TermGraph({'f': bands}),
        initial_workspace=workspace,
        mask_sid=mask_sid,
    )['f']

    upper, middle, lower = self.expected(window_length, k, closes)

    assert_equal(result.upper, upper)
    assert_equal(result.middle, middle)
    assert_equal(result.lower, lower)
def test_percentile_nasty_partitions(self):
    """
    Regression test: quartile filters over data that partitions awkwardly.

    Five assets are split into quartiles. There is no clean mathematical
    definition of the right answer here, so the test pins the behavior of
    ``numpy.nanpercentile``. That guards against regressions if a
    specialized percentile calculation is written later.
    """
    data = arange(25, dtype=float).reshape(5, 5) % 4

    # (filter name, lower percentile, upper percentile) for each quartile.
    bounds = [
        ('pct_' + str(q), q * 25.0, (q + 1) * 25.0)
        for q in range(4)
    ]

    graph = TermGraph({
        name: self.f.percentile_between(lower, upper)
        for name, lower, upper in bounds
    })
    results = self.run_graph(
        graph,
        initial_workspace={self.f: data},
        mask=self.build_mask(ones((5, 5))),
    )

    for name, lower, upper in bounds:
        expected = and_(
            nanpercentile(data, lower, axis=1, keepdims=True) <= data,
            data <= nanpercentile(data, upper, axis=1, keepdims=True),
        )
        check_arrays(results[name], expected)
def test_window_safe(self, factor_len):
    """
    Check a CustomFactor that takes a Filter as a windowed input.

    The filter data is all True, so summing each column over a window of
    ``factor_len`` rows should give ``factor_len`` everywhere.
    """
    # All-True data set of (days, securities).
    data = full(self.default_shape, True, dtype=bool)

    class InputFilter(Filter):
        inputs = ()
        window_length = 0

    class TestFactor(CustomFactor):
        dtype = float64_dtype
        inputs = (InputFilter(), )
        window_length = factor_len

        def compute(self, today, assets, out, filter_):
            # Sum each column of the window.
            out[:] = np_sum(filter_, axis=0)

    graph = TermGraph({'windowsafe': TestFactor()})
    results = self.run_graph(
        graph,
        initial_workspace={InputFilter(): data},
    )

    # The expected output has (days - factor_len + 1) rows and one column
    # per security.
    n_days = self.default_shape[0]
    n_rows = n_days - factor_len + 1
    expected = full(
        (n_rows, self.default_shape[1]),
        factor_len,
        dtype=float64,
    )
    check_arrays(results['windowsafe'], expected)
def test_at_least_N(self):
    """
    Check AtLeastN for N = 1..4 over a window of 4, alongside Any and All.

    With a window_length of K, AtLeastN should return 1 if N or more 1's
    exist in the lookback window. This smoothing filter gives customizable
    "stickiness".
    """
    data = array([[1, 1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1, 0],
                  [1, 1, 1, 1, 0, 0],
                  [1, 1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0, 0],
                  [1, 0, 0, 0, 0, 0]], dtype=bool)

    # Expected output when at least 1, 2, 3 or 4 of the 4 rows in the
    # window must be True.
    at_least_1 = array([[1, 1, 1, 1, 1, 1],
                        [1, 1, 1, 1, 1, 1],
                        [1, 1, 1, 1, 1, 0],
                        [1, 1, 1, 1, 0, 0]], dtype=bool)
    at_least_2 = array([[1, 1, 1, 1, 1, 1],
                        [1, 1, 1, 1, 1, 0],
                        [1, 1, 1, 1, 0, 0],
                        [1, 1, 1, 0, 0, 0]], dtype=bool)
    at_least_3 = array([[1, 1, 1, 1, 1, 0],
                        [1, 1, 1, 1, 0, 0],
                        [1, 1, 1, 0, 0, 0],
                        [1, 1, 0, 0, 0, 0]], dtype=bool)
    at_least_4 = array([[1, 1, 1, 1, 0, 0],
                        [1, 1, 1, 0, 0, 0],
                        [1, 1, 0, 0, 0, 0],
                        [1, 0, 0, 0, 0, 0]], dtype=bool)

    class Input(Filter):
        inputs = ()
        window_length = 0

    terms = {
        'AllButOne': AtLeastN(inputs=[Input()], window_length=4, N=3),
        'AllButTwo': AtLeastN(inputs=[Input()], window_length=4, N=2),
        'AnyEquiv': AtLeastN(inputs=[Input()], window_length=4, N=1),
        'AllEquiv': AtLeastN(inputs=[Input()], window_length=4, N=4),
        'Any': Any(inputs=[Input()], window_length=4),
        'All': All(inputs=[Input()], window_length=4)
    }

    results = self.run_graph(
        TermGraph(terms),
        initial_workspace={Input(): data},
        mask=self.build_mask(ones(shape=data.shape)),
    )

    # Any matches N=1 and All matches N=4.
    checks = (
        ('Any', at_least_1),
        ('AnyEquiv', at_least_1),
        ('AllButTwo', at_least_2),
        ('AllButOne', at_least_3),
        ('All', at_least_4),
        ('AllEquiv', at_least_4),
    )
    for term_name, expected in checks:
        check_arrays(results[term_name], expected)
def check_terms(self, terms, expected, initial_workspace, mask):
    """
    Build a TermGraph from ``terms``, run it with ``initial_workspace`` and
    ``mask``, and compare every result with its entry in ``expected``.
    """
    results = self.run_graph(TermGraph(terms), initial_workspace, mask)

    # dzip_exact pairs the values of both mappings under each key.
    paired = dzip_exact(results, expected)
    for _, (actual, wanted) in paired.items():
        check_arrays(actual, wanted)
def check(terms):
    """
    Run ``terms`` against the enclosing ``data`` and compare each result
    with the same-named entry of ``expected_ranks``.
    """
    results = self.run_graph(
        TermGraph(terms),
        initial_workspace={self.f: data},
        mask=self.build_mask(ones((5, 5))),
    )
    for name in terms:
        check_arrays(results[name], expected_ranks[name])
def test_notnan(self):
    """
    ``notnan()`` is expected to be False exactly on the diagonal, where
    NaNs are written into the input.
    """
    data = self.randn_data(seed=10)
    nan_locations = eye(*data.shape, dtype=bool)
    data[nan_locations] = nan

    graph = TermGraph({'notnan': self.f.notnan()})
    results = self.run_graph(graph, initial_workspace={self.f: data})

    check_arrays(results['notnan'], ~nan_locations)
def test_isfinite(self):
    """
    ``isfinite()`` should agree with numpy's ``isfinite`` on data holding
    NaN, +inf and -inf columns.
    """
    data = self.randn_data(seed=10)
    # Overwrite three columns with the non-finite values.
    for column, value in ((0, nan), (2, inf), (4, -inf)):
        data[:, column] = value

    graph = TermGraph({'isfinite': self.f.isfinite()})
    results = self.run_graph(graph, initial_workspace={self.f: data})

    check_arrays(results['isfinite'], isfinite(data))
def test_any(self):
    """
    Check ``Any`` with window lengths of 3 and 4.
    """
    # FUN FACT: The inputs and outputs here are exactly the negation of
    # the inputs and outputs for test_all. This isn't a coincidence.
    #
    # By de Morgan's Laws, we have::
    #
    #     ~(a & b) == (~a | ~b)
    #
    # negating both sides, we have::
    #
    #     (a & b) == ~(~a | ~b)
    #
    # Since all(a, b) is isomorphic to (a & b), and any(a, b) is isomorphic
    # to (a | b), we have::
    #
    #     all(a, b) == ~(any(~a, ~b))
    #
    data = array([[0, 0, 0, 0, 0, 0],
                  [1, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0],
                  [0, 0, 0, 1, 0, 0],
                  [0, 0, 0, 0, 1, 0],
                  [0, 0, 0, 0, 0, 1]], dtype=bool)

    # With a window_length of N, 1's should be "sticky" for the (N - 1)
    # days after the 1 in the base data.

    # Note that, the way ``self.run_graph`` works, we compute the same
    # number of output rows for all inputs, so we only get the last 4
    # outputs for window length 3 even though we have enough input data to
    # compute 5 rows.
    expected_by_window = {
        3: array([[1, 1, 1, 0, 0, 0],
                  [0, 1, 1, 1, 0, 0],
                  [0, 0, 1, 1, 1, 0],
                  [0, 0, 0, 1, 1, 1]], dtype=bool),
        4: array([[1, 1, 1, 0, 0, 0],
                  [1, 1, 1, 1, 0, 0],
                  [0, 1, 1, 1, 1, 0],
                  [0, 0, 1, 1, 1, 1]], dtype=bool),
    }

    class Input(Filter):
        inputs = ()
        window_length = 0

    windows = (3, 4)
    terms = {
        str(length): Any(inputs=[Input()], window_length=length)
        for length in windows
    }

    results = self.run_graph(
        TermGraph(terms),
        initial_workspace={Input(): data},
        mask=self.build_mask(ones(shape=data.shape)),
    )

    for length in windows:
        check_arrays(results[str(length)], expected_by_window[length])
def test_top_and_bottom(self):
    """
    Check ``top`` and ``bottom`` for several counts, with and without a
    mask term.
    """
    data = self.randn_data(seed=5)  # Fix a seed for determinism.

    # The mask term is False in the first column. ``nan_data`` carries NaN
    # in that same column and is the reference input for the masked cases.
    mask_data = ones_like(data, dtype=bool)
    mask_data[:, 0] = False
    nan_data = data.copy()
    nan_data[:, 0] = nan

    mask = Mask()
    workspace = {self.f: data, mask: mask_data}

    # Every combination of top/bottom, count, and mask/no_mask.
    term_combos = list(product(['top', 'bottom'], (2, 3, 10), [True, False]))

    def termname(method, count, masked):
        suffix = 'mask' if masked else ''
        return '_'.join((method, str(count), suffix))

    def build_term(method, count, masked):
        kwargs = {'N': count}
        if masked:
            kwargs['mask'] = mask
        return getattr(self.f, method)(**kwargs)

    terms = {termname(*combo): build_term(*combo) for combo in term_combos}

    results = self.run_graph(TermGraph(terms), initial_workspace=workspace)

    def expected_result(method, count, masked):
        # Ranking with a mask is equivalent to ranking with nans applied on
        # the masked values.
        values = nan_data if masked else data
        if method == 'top':
            return rowwise_rank(-values) < count
        if method == 'bottom':
            return rowwise_rank(values) < count

    for method, count, masked in term_combos:
        result = results[termname(method, count, masked)]

        # Check that `min(c, num_assets)` assets passed each day.
        passed_per_day = result.sum(axis=1)
        wanted_per_day = full_like(
            passed_per_day,
            min(count, data.shape[1]),
        )
        check_arrays(passed_per_day, wanted_per_day)

        check_arrays(result, expected_result(method, count, masked))
def check(terms):
    """
    Run ``terms`` against the enclosing factor and classifier data and
    compare each result with the same-named entry of
    ``expected_grouped_ranks``.
    """
    workspace = {
        f: data,
        c: classifier_data,
        str_c: string_classifier_data,
    }
    results = self.run_graph(
        TermGraph(terms),
        initial_workspace=workspace,
        mask=self.build_mask(ones((5, 5))),
    )
    for name in terms:
        check_arrays(results[name], expected_grouped_ranks[name])
def test_percentile_after_mask(self):
    """
    Check that ``percentile_between`` applies its ``mask`` argument before
    computing percentiles.
    """
    f_input = eye(5)
    g_input = arange(25, dtype=float).reshape(5, 5)

    # ``f`` is the identity matrix, so ``f < 1`` is False on the diagonal.
    custom_mask = self.f < 1

    terms = {
        'custom_mask': custom_mask,
        'without': self.g.percentile_between(80, 100),
        'with': self.g.percentile_between(80, 100, mask=custom_mask),
    }

    results = self.run_graph(
        TermGraph(terms),
        initial_workspace={self.f: f_input, self.g: g_input},
        mask=self.build_mask(ones((5, 5))),
    )

    # The mask itself should pass everything but the diagonal.
    check_arrays(results['custom_mask'], ~eye(5, dtype=bool))

    # Without the mask, the largest value passes each day. Each row is
    # strictly increasing, so that is always the last column.
    expected_without = array([[0, 0, 0, 0, 1]] * 5, dtype=bool)
    check_arrays(results['without'], expected_without)

    # With the mask, the diagonal is removed as an option before
    # percentiles are computed. On the last day the largest value sits on
    # the diagonal, so the second-largest is selected instead.
    expected_with = array(
        [[0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 1, 0]],  # Only this row differs from the unmasked case.
        dtype=bool,
    )
    check_arrays(results['with'], expected_with)
def test_isnan(self):
    """
    ``isnan()`` and ``isnull()`` are both expected to be True exactly on
    the diagonal, where NaNs are written into the input.
    """
    data = self.randn_data(seed=10)
    nan_locations = eye(*data.shape, dtype=bool)
    data[nan_locations] = nan

    graph = TermGraph({
        'isnan': self.f.isnan(),
        'isnull': self.f.isnull(),
    })
    results = self.run_graph(graph, initial_workspace={self.f: data})

    for name in ('isnan', 'isnull'):
        check_arrays(results[name], nan_locations)
def test_rank_after_mask(self, name, factor_dtype):
    """
    Check ascending and descending ``rank`` with and without a mask that
    excludes the diagonal.
    """
    f = F(dtype=factor_dtype)

    # Same values as ``arange(25).reshape(5, 5).transpose() % 4``.
    data = array([[0, 1, 2, 3, 0],
                  [1, 2, 3, 0, 1],
                  [2, 3, 0, 1, 2],
                  [3, 0, 1, 2, 3],
                  [0, 1, 2, 3, 0]], dtype=factor_dtype)
    mask_data = ~eye(5, dtype=bool)
    initial_workspace = {f: data, Mask(): mask_data}

    terms = {
        "ascending_nomask": f.rank(ascending=True),
        "ascending_mask": f.rank(ascending=True, mask=Mask()),
        "descending_nomask": f.rank(ascending=False),
        "descending_mask": f.rank(ascending=False, mask=Mask()),
    }

    ascending_nomask = array([[1., 3., 4., 5., 2.],
                              [2., 4., 5., 1., 3.],
                              [3., 5., 1., 2., 4.],
                              [4., 1., 2., 3., 5.],
                              [1., 3., 4., 5., 2.]])
    descending_nomask = array([[4., 3., 2., 1., 5.],
                               [3., 2., 1., 5., 4.],
                               [2., 1., 5., 4., 3.],
                               [1., 5., 4., 3., 2.],
                               [4., 3., 2., 1., 5.]])
    # Diagonal should be all nans, and anything whose rank was greater
    # than the diagonal's in the unmasked calc should go down by 1.
    ascending_mask = array([[nan, 2., 3., 4., 1.],
                            [2., nan, 4., 1., 3.],
                            [2., 4., nan, 1., 3.],
                            [3., 1., 2., nan, 4.],
                            [1., 2., 3., 4., nan]])
    descending_mask = array([[nan, 3., 2., 1., 4.],
                             [2., nan, 1., 4., 3.],
                             [2., 1., nan, 4., 3.],
                             [1., 4., 3., nan, 2.],
                             [4., 3., 2., 1., nan]])

    expected = {
        "ascending_nomask": ascending_nomask,
        "descending_nomask": descending_nomask,
        "ascending_mask": ascending_mask,
        "descending_mask": descending_mask,
    }

    results = self.run_graph(
        TermGraph(terms),
        initial_workspace,
        mask=self.build_mask(ones((5, 5))),
    )
    for term_name in results:
        check_arrays(expected[term_name], results[term_name])
def test_single_factor(self):
    """
    Check dependency resolution for every factor yielded by
    ``gen_equivalent_factors()``.
    """
    def check_output(graph):
        order = list(graph.ordered())

        self.assertEqual(len(order), 4)
        self.check_dependency_order(order)

        # All four terms must be present in the resolution order.
        required = (
            AssetExists(),
            SomeDataSet.foo,
            SomeDataSet.bar,
            SomeFactor(),
        )
        for term in required:
            self.assertIn(term, order)

        for column in (SomeDataSet.foo, SomeDataSet.bar):
            self.assertEqual(graph.node[column]['extra_rows'], 4)

    for foobar in gen_equivalent_factors():
        check_output(TermGraph(to_dict([foobar])))
def test_bottom(self):
    """
    Check ``bottom(c)`` for several counts against a row-wise rank of the
    input data.
    """
    counts = 2, 3, 10
    data = self.randn_data(seed=5)  # Arbitrary seed choice.

    names = {c: 'bottom_' + str(c) for c in counts}
    graph = TermGraph({names[c]: self.f.bottom(c) for c in counts})
    results = self.run_graph(graph, initial_workspace={self.f: data})

    for c in counts:
        result = results[names[c]]

        # Check that `min(c, num_assets)` assets passed each day.
        passed_per_day = result.sum(axis=1)
        wanted_per_day = full_like(passed_per_day, min(c, data.shape[1]))
        check_arrays(passed_per_day, wanted_per_day)

        # Check that the bottom `c` assets passed.
        check_arrays(result, rowwise_rank(data) < c)
def test_all(self):
    """
    Check ``All`` with window lengths of 3 and 4.
    """
    data = array([[1, 1, 1, 1, 1, 1],
                  [0, 1, 1, 1, 1, 1],
                  [1, 0, 1, 1, 1, 1],
                  [1, 1, 0, 1, 1, 1],
                  [1, 1, 1, 0, 1, 1],
                  [1, 1, 1, 1, 0, 1],
                  [1, 1, 1, 1, 1, 0]], dtype=bool)

    # With a window_length of N, 0's should be "sticky" for the (N - 1)
    # days after the 0 in the base data.

    # Note that, the way ``self.run_graph`` works, we compute the same
    # number of output rows for all inputs, so we only get the last 4
    # outputs for window length 3 even though we have enough input data to
    # compute 5 rows.
    window_3_expected = array([[0, 0, 0, 1, 1, 1],
                               [1, 0, 0, 0, 1, 1],
                               [1, 1, 0, 0, 0, 1],
                               [1, 1, 1, 0, 0, 0]], dtype=bool)
    window_4_expected = array([[0, 0, 0, 1, 1, 1],
                               [0, 0, 0, 0, 1, 1],
                               [1, 0, 0, 0, 0, 1],
                               [1, 1, 0, 0, 0, 0]], dtype=bool)

    class Input(Filter):
        inputs = ()
        window_length = 0

    graph = TermGraph({
        '3': All(inputs=[Input()], window_length=3),
        '4': All(inputs=[Input()], window_length=4),
    })
    results = self.run_graph(
        graph,
        initial_workspace={Input(): data},
        mask=self.build_mask(ones(shape=data.shape)),
    )

    for key, expected in (('3', window_3_expected), ('4', window_4_expected)):
        check_arrays(results[key], expected)
def test_isnull_datetime_dtype(self):
    """
    Check ``isnull``/``notnull`` on a datetime64[ns] factor whose diagonal
    is set to ``NaTns``.
    """
    class DatetimeFactor(Factor):
        dtype = datetime64ns_dtype
        window_length = 0
        inputs = ()

    factor = DatetimeFactor()

    diagonal = eye(5, dtype=bool)
    data = arange(25).reshape(5, 5).astype('datetime64[ns]')
    data[diagonal] = NaTns

    terms = {
        'isnull': factor.isnull(),
        'notnull': factor.notnull(),
    }
    results = self.run_graph(
        TermGraph(terms),
        initial_workspace={factor: data},
        mask=self.build_mask(ones((5, 5))),
    )

    # Only the diagonal entries are null.
    check_arrays(results['isnull'], diagonal)
    check_arrays(results['notnull'], ~diagonal)
def test_isnull_int_dtype(self, custom_missing_value):
    """
    Check ``isnull``/``notnull`` on an int64 factor whose missing value is
    ``custom_missing_value``, written onto the diagonal of the input.
    """
    class CustomMissingValue(Factor):
        dtype = int64_dtype
        window_length = 0
        missing_value = custom_missing_value
        inputs = ()

    factor = CustomMissingValue()

    diagonal = eye(5, dtype=bool)
    data = arange(25).reshape(5, 5)
    data[diagonal] = custom_missing_value

    terms = {
        'isnull': factor.isnull(),
        'notnull': factor.notnull(),
    }
    results = self.run_graph(
        TermGraph(terms),
        initial_workspace={factor: data},
        mask=self.build_mask(ones((5, 5))),
    )

    # Only the diagonal entries hold the missing value.
    check_arrays(results['isnull'], diagonal)
    check_arrays(results['notnull'], ~diagonal)
def test_normalizations_hand_computed(self):
    """
    Check ``demean`` against hand-computed values, covering the plain,
    masked, grouped, and grouped-and-masked cases, with both an integer and
    a string classifier.
    """
    f = self.f
    m = Mask()
    c = C()
    str_c = C(dtype=categorical_dtype, missing_value=None)

    factor_data = array([[1.0, 2.0, 3.0, 4.0],
                         [1.5, 2.5, 3.5, 1.0],
                         [2.0, 3.0, 4.0, 1.5],
                         [2.5, 3.5, 1.0, 2.0]])
    # False on the diagonal.
    filter_data = array([[False, True, True, True],
                         [True, False, True, True],
                         [True, True, False, True],
                         [True, True, True, False]], dtype=bool)
    # First two columns are group 1, last two are group 2, on every row.
    classifier_data = array([[1, 1, 2, 2]] * 4, dtype=int64_dtype)
    string_classifier_data = LabelArray(
        classifier_data.astype(str).astype(object),
        missing_value=None,
    )

    terms = {
        'vanilla': f.demean(),
        'masked': f.demean(mask=m),
        'grouped': f.demean(groupby=c),
        'grouped_str': f.demean(groupby=str_c),
        'grouped_masked': f.demean(mask=m, groupby=c),
        'grouped_masked_str': f.demean(mask=m, groupby=str_c),
    }

    vanilla = array([[-1.500, -0.500, 0.500, 1.500],
                     [-0.625, 0.375, 1.375, -1.125],
                     [-0.625, 0.375, 1.375, -1.125],
                     [0.250, 1.250, -1.250, -0.250]])
    masked = array([[nan, -1.000, 0.000, 1.000],
                    [-0.500, nan, 1.500, -1.000],
                    [-0.166, 0.833, nan, -0.666],
                    [0.166, 1.166, -1.333, nan]])
    grouped = array([[-0.500, 0.500, -0.500, 0.500],
                     [-0.500, 0.500, 1.250, -1.250],
                     [-0.500, 0.500, 1.250, -1.250],
                     [-0.500, 0.500, -0.500, 0.500]])
    grouped_masked = array([[nan, 0.000, -0.500, 0.500],
                            [0.000, nan, 1.250, -1.250],
                            [-0.500, 0.500, nan, 0.000],
                            [-0.500, 0.500, 0.000, nan]])

    expected = {
        'vanilla': vanilla,
        'masked': masked,
        'grouped': grouped,
        'grouped_masked': grouped_masked,
        # Changing the classifier dtype shouldn't affect anything.
        'grouped_str': grouped,
        'grouped_masked_str': grouped_masked,
    }

    workspace = {
        f: factor_data,
        c: classifier_data,
        str_c: string_classifier_data,
        m: filter_data,
    }
    results = self.run_graph(
        TermGraph(terms),
        initial_workspace=workspace,
        mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
    )

    for key, (res, exp) in dzip_exact(results, expected).items():
        # The hand-computed values aren't very precise (in particular,
        # repeating decimals are truncated at 3 places). This only asserts
        # that the example isn't misleading by being totally wrong.
        check_allclose(
            res,
            exp,
            atol=0.001,
            err_msg="Mismatch for %r" % key,
        )
def test_normalizations(self,
                        seed_value,
                        normalizer_name_and_func,
                        add_nulls_to_factor):
    """
    Check a normalization method of ``self.f`` against a reference function
    applied row-wise, with and without a mask and a ``groupby`` classifier.

    Parameters
    ----------
    seed_value
        Seed for the random factor data; also offsets the 0, 1, 2 cycle of
        the classifier data.
    normalizer_name_and_func
        ``(name, func)`` pair. ``name`` is the method looked up on
        ``self.f``; ``func`` is the reference implementation.
    add_nulls_to_factor
        When true, the factor data is NaN wherever ``eyemask`` is False.
    """
    name, func = normalizer_name_and_func

    shape = (7, 7)

    # All Trues.
    nomask = self.ones_mask(shape=shape)
    # Falses on main diagonal.
    eyemask = self.eye_mask(shape=shape)
    # Falses on the other (anti-) diagonal. The columns are mirrored with a
    # slice; a transpose would not do, because a mask that is False only on
    # the main diagonal equals its own transpose.
    eyemask_anti = eyemask[:, ::-1]
    # Falses on both diagonals.
    xmask = eyemask & eyemask_anti

    # Block of random data.
    factor_data = self.randn_data(seed=seed_value, shape=shape)
    if add_nulls_to_factor:
        factor_data = where(eyemask, factor_data, nan)

    # Cycles of 0, 1, 2, 0, 1, 2, ...
    classifier_data = (
        (self.arange_data(shape=shape, dtype=int) + seed_value) % 3
    )
    # With -1s on main diagonal.
    classifier_data_eyenulls = where(eyemask, classifier_data, -1)
    # With -1s on opposite diagonal.
    classifier_data_antinulls = where(eyemask_anti, classifier_data, -1)
    # With -1s on both diagonals.
    classifier_data_xnulls = where(xmask, classifier_data, -1)

    f = self.f
    c = C()
    c_with_nulls = OtherC()
    m = Mask()
    method = getattr(f, name)
    terms = {
        'vanilla': method(),
        'masked': method(mask=m),
        'grouped': method(groupby=c),
        'grouped_with_nulls': method(groupby=c_with_nulls),
        'both': method(mask=m, groupby=c),
        'both_with_nulls': method(mask=m, groupby=c_with_nulls),
    }

    expected = {
        'vanilla': apply_along_axis(func, 1, factor_data),
        'masked': where(
            eyemask,
            grouped_apply(factor_data, eyemask, func),
            nan,
        ),
        'grouped': grouped_apply(factor_data, classifier_data, func),
        # If the classifier has nulls, we should get NaNs in the
        # corresponding locations in the output.
        'grouped_with_nulls': where(
            eyemask_anti,
            grouped_apply(factor_data, classifier_data_antinulls, func),
            nan,
        ),
        # Passing a mask with a classifier should behave as though the
        # classifier had nulls where the mask was False.
        'both': where(
            eyemask,
            grouped_apply(factor_data, classifier_data_eyenulls, func),
            nan,
        ),
        'both_with_nulls': where(
            xmask,
            grouped_apply(factor_data, classifier_data_xnulls, func),
            nan,
        ),
    }

    graph = TermGraph(terms)
    results = self.run_graph(
        graph,
        initial_workspace={
            f: factor_data,
            c: classifier_data,
            c_with_nulls: classifier_data_antinulls,
            Mask(): eyemask,
        },
        mask=self.build_mask(nomask),
    )

    for key in expected:
        check_arrays(expected[key], results[key])
def test_percentile_between(self):
    """
    Check quintile ``percentile_between`` filters in three scenarios:

    1. 5 columns, no NaNs, nothing masked.
    2. 6 columns, no NaNs, one masked entry per day.
    3. 6 columns, nothing masked, one NaN per day (same positions as the
       masked entries of scenario 2, so the same outcome is expected).
    """
    quintiles = range(5)
    filter_names = ['pct_' + str(q) for q in quintiles]
    # Materialize the pairs. A bare ``zip`` is a one-shot iterator on
    # Python 3, which would leave every verification loop after the first
    # with nothing to iterate over.
    iter_quintiles = list(zip(filter_names, quintiles))

    graph = TermGraph({
        name: self.f.percentile_between(q * 20.0, (q + 1) * 20.0)
        for name, q in iter_quintiles
    })

    def check_quintiles(results, lower_expected, top_expected):
        # The first four quintiles share one expectation; the top quintile
        # has its own.
        for name, quintile in iter_quintiles:
            if quintile < 4:
                check_arrays(results[name], lower_expected)
            else:
                check_arrays(results[name], top_expected)

    # Test with 5 columns and no NaNs.
    eye5 = eye(5, dtype=float64)
    results = self.run_graph(
        graph,
        initial_workspace={self.f: eye5},
        mask=self.build_mask(ones((5, 5))),
    )
    # There are four 0s and one 1 in each row, so the first 4 quintiles
    # should be all the locations with zeros in the input array, and the
    # top quintile should match the sole 1 in each row.
    check_quintiles(results, ~eye5.astype(bool), eye5.astype(bool))

    # Test with 6 columns, no NaNs, and one masked entry per day.
    eye6 = eye(6, dtype=float64)
    mask = array([[1, 1, 1, 1, 1, 0],
                  [0, 1, 1, 1, 1, 1],
                  [1, 0, 1, 1, 1, 1],
                  [1, 1, 0, 1, 1, 1],
                  [1, 1, 1, 0, 1, 1],
                  [1, 1, 1, 1, 0, 1]], dtype=bool)

    # Lower quintiles keep all values that were 0 in the base data and
    # were 1 in the mask; the top quintile keeps all the 1s in the base
    # data.
    lower_expected = mask & (~eye6.astype(bool))
    top_expected = eye6.astype(bool)

    results = self.run_graph(
        graph,
        initial_workspace={self.f: eye6},
        mask=self.build_mask(mask),
    )
    check_quintiles(results, lower_expected, top_expected)

    # Test with 6 columns, no mask, and one NaN per day. Should have the
    # same outcome as if we had masked the NaNs.
    # In particular, the NaNs should never pass any filters.
    eye6_withnans = eye6.copy()
    putmask(eye6_withnans, ~mask, nan)

    results = self.run_graph(
        graph,
        initial_workspace={self.f: eye6_withnans},
        mask=self.build_mask(ones((6, 6))),
    )
    check_quintiles(results, lower_expected, top_expected)