def compute(self, today, assets, out, returns, returns_slice):
    """Verify that the returns slice is a single-column window whose
    values match the corresponding column of the original returns
    factor from which it was derived.
    """
    # The slice must be exactly one column over the same lookback window.
    assert returns_slice.shape == (self.window_length, 1)
    # NOTE(review): `sids` and `my_asset_column` are closure variables
    # supplied by the enclosing test scope — confirm against the caller.
    assert returns.shape == (self.window_length, len(sids))
    check_arrays(returns_slice[:, 0], returns[:, my_asset_column])
def test_make_cascading_boolean_array(self):
    """make_cascading_boolean_array should build a triangular mask whose
    upper-left region holds ``first_value``.
    """
    # Default first_value: True above the anti-diagonal.
    check_arrays(
        make_cascading_boolean_array((3, 3)),
        array([[True, True, False],
               [True, False, False],
               [False, False, False]]),
    )
    # first_value=False inverts the pattern.
    check_arrays(
        make_cascading_boolean_array((3, 3), first_value=False),
        array([[False, False, True],
               [False, True, True],
               [True, True, True]]),
    )
    # Degenerate shapes: single row, single column, and zero columns.
    check_arrays(
        make_cascading_boolean_array((1, 3)),
        array([[True, True, False]]),
    )
    check_arrays(
        make_cascading_boolean_array((3, 1)),
        array([[False], [False], [False]]),
    )
    check_arrays(
        make_cascading_boolean_array((3, 0)),
        empty((3, 0), dtype=bool_dtype),
    )
def test_window_safe(self, factor_len):
    """A boolean Filter should be usable as a window-safe CustomFactor
    input: summing an all-True input over ``factor_len`` days yields
    ``factor_len`` in every output cell.
    """
    # All-True data set of shape (days, securities).
    data = full(self.default_shape, True, dtype=bool)

    class InputFilter(Filter):
        inputs = ()
        window_length = 0

    class TestFactor(CustomFactor):
        dtype = float64_dtype
        inputs = (InputFilter(),)
        window_length = factor_len

        def compute(self, today, assets, out, filter_):
            # Column-wise sum over the lookback window.
            out[:] = np_sum(filter_, axis=0)

    results = self.run_graph(
        TermGraph({'windowsafe': TestFactor()}),
        initial_workspace={InputFilter(): data},
    )

    # With n total days we can compute (n - factor_len + 1) output rows.
    n_days = self.default_shape[0]
    output_shape = ((n_days - factor_len + 1), self.default_shape[1])
    check_arrays(
        results['windowsafe'],
        full(output_shape, factor_len, dtype=float64),
    )
def test_setitem_array(self):
    """LabelArray.__setitem__ should support row, column, and whole-array
    writes from another LabelArray.
    """
    arr = LabelArray(self.strs, missing_value=None)
    orig_arr = arr.copy()

    # Write a row.
    self.assertFalse(
        (arr[0] == arr[1]).all(),
        "This test doesn't test anything because rows 0"
        " and 1 are already equal!"
    )
    arr[0] = arr[1]
    for col in range(arr.shape[1]):
        self.assertEqual(arr[0, col], arr[1, col])

    # Write a column.
    self.assertFalse(
        (arr[:, 0] == arr[:, 1]).all(),
        "This test doesn't test anything because columns 0"
        " and 1 are already equal!"
    )
    arr[:, 0] = arr[:, 1]
    for row in range(arr.shape[0]):
        self.assertEqual(arr[row, 0], arr[row, 1])

    # Write the whole array, restoring the original contents.
    arr[:] = orig_arr
    check_arrays(arr, orig_arr)
def test_single_factor(self):
    """A pipeline with a single factor — with or without an all-passing
    screen — should yield the factor's expected value for every asset
    and date.
    """
    loader = self.loader
    assets = self.assets
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )
    result_shape = (num_dates, num_assets) = (5, len(assets))
    dates = self.dates[10:10 + num_dates]

    factor = RollingSumDifference()
    expected_result = -factor.window_length

    # Since every asset will pass the screen, these should be equivalent.
    pipelines = [
        Pipeline(columns={'f': factor}),
        Pipeline(
            columns={'f': factor},
            screen=factor.eq(expected_result),
        ),
    ]

    for pipeline in pipelines:
        result = engine.run_pipeline(pipeline, dates[0], dates[-1])
        self.assertEqual(set(result.columns), {'f'})
        assert_multi_index_is_product(self, result.index, dates, assets)
        check_arrays(
            result['f'].unstack().values,
            full(result_shape, expected_result, dtype=float),
        )
def test_percentile_nasty_partitions(self):
    """Percentile filters on awkward partitions (5 assets split into
    quartiles) should match numpy.nanpercentile.

    There isn't a nice mathematical definition of correct behavior
    here, so for now we pin the behavior of numpy.nanpercentile as a
    regression guard in case we write our own specialized percentile
    calculation at some point in the future.
    """
    data = arange(25, dtype=float).reshape(5, 5) % 4
    quartiles = range(4)
    filter_names = ['pct_' + str(q) for q in quartiles]

    graph = TermGraph({
        name: self.f.percentile_between(q * 25.0, (q + 1) * 25.0)
        for name, q in zip(filter_names, quartiles)
    })
    results = self.run_graph(
        graph,
        initial_workspace={self.f: data},
        mask=self.build_mask(ones((5, 5))),
    )

    for name, quartile in zip(filter_names, quartiles):
        result = results[name]
        lower = quartile * 25.0
        upper = (quartile + 1) * 25.0
        # A cell passes iff it lies within [lower, upper] percentiles
        # of its row.
        expected = and_(
            nanpercentile(data, lower, axis=1, keepdims=True) <= data,
            data <= nanpercentile(data, upper, axis=1, keepdims=True),
        )
        check_arrays(result, expected)
def test_isnull_datetime_dtype(self):
    """isnull()/notnull() on a datetime factor should flag exactly the
    NaT entries.
    """
    class DatetimeFactor(Factor):
        dtype = datetime64ns_dtype
        window_length = 0
        inputs = ()

    factor = DatetimeFactor()

    # 5x5 datetimes with NaT placed along the diagonal.
    data = arange(25).reshape(5, 5).astype('datetime64[ns]')
    data[eye(5, dtype=bool)] = NaTns

    graph = TermGraph({
        'isnull': factor.isnull(),
        'notnull': factor.notnull(),
    })
    results = self.run_graph(
        graph,
        initial_workspace={factor: data},
        mask=self.build_mask(ones((5, 5))),
    )
    check_arrays(results['isnull'], eye(5, dtype=bool))
    check_arrays(results['notnull'], ~eye(5, dtype=bool))
def test_overwrite_adjustment_cases(self, name, data, lookback, adjustments, missing_value, expected):
    """Traversing an AdjustedArray with overwrite adjustments should
    yield the expected sequence of windows, and the array should be
    re-usable across traversals.
    """
    adjusted = AdjustedArray(data, NOMASK, adjustments, missing_value)
    for _ in range(2):  # Iterate 2x to ensure adjusted_arrays are re-usable.
        window_iter = adjusted.traverse(lookback)
        for yielded, expected_yield in zip_longest(window_iter, expected):
            check_arrays(yielded, expected_yield)
def test_percentile_nasty_partitions(self):
    """Percentile filters on awkward partitions (5 assets split into
    quartiles) should match numpy.nanpercentile.

    There isn't a nice mathematical definition of correct behavior
    here, so for now we pin the behavior of numpy.nanpercentile as a
    regression guard in case we write our own specialized percentile
    calculation at some point in the future.
    """
    data = arange(25, dtype=float).reshape(5, 5) % 4
    quartiles = range(4)
    filter_names = ['pct_' + str(q) for q in quartiles]

    graph = TermGraph({
        name: self.f.percentile_between(q * 25.0, (q + 1) * 25.0)
        for name, q in zip(filter_names, quartiles)
    })
    results = self.run_graph(
        graph,
        initial_workspace={self.f: data},
        mask=self.build_mask(ones((5, 5))),
    )

    for name, quartile in zip(filter_names, quartiles):
        result = results[name]
        lower = quartile * 25.0
        upper = (quartile + 1) * 25.0
        # A cell passes iff it lies within [lower, upper] percentiles
        # of its row.
        expected = and_(
            nanpercentile(data, lower, axis=1, keepdims=True) <= data,
            data <= nanpercentile(data, upper, axis=1, keepdims=True),
        )
        check_arrays(result, expected)
def test_input_dates_provided_by_default(self):
    """CustomFactors should receive the window's dates automatically via
    an InputDates() input.
    """
    loader = self.loader
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )

    class TestFactor(CustomFactor):
        inputs = [InputDates(), USEquityPricing.close]
        window_length = 10
        dtype = datetime64ns_dtype

        def compute(self, today, assets, out, dates, closes):
            first, last = dates[[0, -1], 0]
            # The final date in the window should be the current day.
            assert last == today.asm8
            assert len(dates) == len(closes) == self.window_length
            out[:] = first

    p = Pipeline(columns={'t': TestFactor()})
    results = engine.run_pipeline(p, self.dates[9], self.dates[10])

    # All results are the same, so just grab one column.
    column = results.unstack().iloc[:, 0].values
    check_arrays(column, self.dates[:2].values)
def test_window_safe(self, factor_len):
    """A boolean Filter should be usable as a window-safe CustomFactor
    input: summing an all-True input over ``factor_len`` days yields
    ``factor_len`` in every output cell.
    """
    # All-True data set of shape (days, securities).
    data = full(self.default_shape, True, dtype=bool)

    class InputFilter(Filter):
        inputs = ()
        window_length = 0

    class TestFactor(CustomFactor):
        dtype = float64_dtype
        inputs = (InputFilter(),)
        window_length = factor_len

        def compute(self, today, assets, out, filter_):
            # Column-wise sum over the lookback window.
            out[:] = np_sum(filter_, axis=0)

    results = self.run_graph(
        TermGraph({'windowsafe': TestFactor()}),
        initial_workspace={InputFilter(): data},
    )

    # With n total days we can compute (n - factor_len + 1) output rows.
    n_days = self.default_shape[0]
    output_shape = ((n_days - factor_len + 1), self.default_shape[1])
    check_arrays(
        results['windowsafe'],
        full(output_shape, factor_len, dtype=float64),
    )
def test_masked_rankdata_2d(self, seed_value, method, use_mask, set_missing, ascending):
    """masked_rankdata_2d should rank float data and the same bits
    viewed as datetimes identically.
    """
    eyemask = ~eye(5, dtype=bool)
    nomask = ones((5, 5), dtype=bool)

    seed(seed_value)
    asfloat = (randn(5, 5) * seed_value)
    # Identical bit patterns reinterpreted as datetimes.
    asdatetime = (asfloat).copy().view('datetime64[ns]')

    mask = eyemask if use_mask else nomask
    if set_missing:
        asfloat[:, 2] = nan
        asdatetime[:, 2] = NaTns

    # NOTE(review): the `ascending` parameter is not forwarded — both
    # calls pass ascending=True.  Presumably intentional since the two
    # results are only compared to each other; verify.
    float_result = masked_rankdata_2d(
        data=asfloat,
        mask=mask,
        missing_value=nan,
        method=method,
        ascending=True,
    )
    datetime_result = masked_rankdata_2d(
        data=asdatetime,
        mask=mask,
        missing_value=NaTns,
        method=method,
        ascending=True,
    )
    check_arrays(float_result, datetime_result)
def test_rolling_and_nonrolling(self):
    """A pipeline mixing a windowed factor with windowless ``latest``
    terms should compute both correctly over the same date range.
    """
    open_ = USEquityPricing.open
    close = USEquityPricing.close
    volume = USEquityPricing.volume

    # Test for thirty days up to the last day that we think all
    # the assets existed.
    dates_to_test = self.dates[-30:]

    constants = {open_: 1, close: 2, volume: 3}
    loader = PrecomputedLoader(
        constants=constants,
        dates=self.dates,
        sids=self.asset_ids,
    )
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )

    sumdiff = RollingSumDifference()
    result = engine.run_pipeline(
        Pipeline(
            columns={
                "sumdiff": sumdiff,
                "open": open_.latest,
                "close": close.latest,
                "volume": volume.latest,
            }
        ),
        dates_to_test[0],
        dates_to_test[-1],
    )
    self.assertIsNotNone(result)
    self.assertEqual(
        {"sumdiff", "open", "close", "volume"}, set(result.columns)
    )

    result_index = self.asset_ids * len(dates_to_test)
    result_shape = (len(result_index),)
    check_arrays(
        result["sumdiff"],
        Series(index=result_index, data=full(result_shape, -3, dtype=float)),
    )
    for name, const in [("open", 1), ("close", 2), ("volume", 3)]:
        check_arrays(
            result[name],
            Series(index=result_index, data=full(result_shape, const, dtype=float)),
        )
def test_isnull_int_dtype(self, custom_missing_value):
    """isnull()/notnull() on an int factor should flag exactly the cells
    equal to the factor's custom missing value.
    """
    class CustomMissingValue(Factor):
        dtype = int64_dtype
        window_length = 0
        missing_value = custom_missing_value
        inputs = ()

    factor = CustomMissingValue()

    # 5x5 ints with the missing value placed along the diagonal.
    data = arange(25).reshape(5, 5)
    data[eye(5, dtype=bool)] = custom_missing_value

    graph = TermGraph({
        'isnull': factor.isnull(),
        'notnull': factor.notnull(),
    })
    results = self.run_graph(
        graph,
        initial_workspace={factor: data},
        mask=self.build_mask(ones((5, 5))),
    )
    check_arrays(results['isnull'], eye(5, dtype=bool))
    check_arrays(results['notnull'], ~eye(5, dtype=bool))
def test_no_adjustments(self, name, data, lookback, adjustments, missing_value, expected_output):
    """With no effective adjustments, traversal should yield the plain
    moving windows; the array should be re-usable across traversals.
    """
    adjusted = AdjustedArray(data, NOMASK, adjustments, missing_value)
    for _ in range(2):  # Iterate 2x ensure adjusted_arrays are re-usable.
        for yielded, expected_yield in zip(adjusted.traverse(lookback),
                                           expected_output):
            check_arrays(yielded, expected_yield)
def check_output(self, expr, expected):
    """Compute ``expr`` against the fake raw data under the shared mask
    and compare the result with ``expected``.
    """
    inputs = [self.fake_raw_data[input_] for input_ in expr.inputs]
    result = expr._compute(
        inputs,
        self.mask.index,
        self.mask.columns,
        self.mask.values,
    )
    check_arrays(result, expected)
def test_any(self):
    """Any() should stay True for (window_length - 1) days after a 1.

    FUN FACT: The inputs and outputs here are exactly the negation of
    the inputs and outputs for test_all above.  This isn't a
    coincidence: by De Morgan's Laws, ~(a & b) == (~a | ~b), and since
    all(a, b) is isomorphic to (a & b) and any(a, b) to (a | b), we
    have all(a, b) == ~any(~a, ~b).
    """
    data = array([[0, 0, 0, 0, 0, 0],
                  [1, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0],
                  [0, 0, 0, 1, 0, 0],
                  [0, 0, 0, 0, 1, 0],
                  [0, 0, 0, 0, 0, 1]], dtype=bool)

    # With a window_length of N, 1's should be "sticky" for the (N - 1)
    # days after the 1 in the base data.  Note that, the way
    # ``self.run_graph`` works, we compute the same number of output
    # rows for all inputs, so we only get the last 4 outputs for
    # expected_3 even though we have enough input data to compute 5.
    expected_3 = array([[1, 1, 1, 0, 0, 0],
                        [0, 1, 1, 1, 0, 0],
                        [0, 0, 1, 1, 1, 0],
                        [0, 0, 0, 1, 1, 1]], dtype=bool)
    expected_4 = array([[1, 1, 1, 0, 0, 0],
                        [1, 1, 1, 1, 0, 0],
                        [0, 1, 1, 1, 1, 0],
                        [0, 0, 1, 1, 1, 1]], dtype=bool)

    class Input(Filter):
        inputs = ()
        window_length = 0

    results = self.run_graph(
        TermGraph({
            '3': Any(inputs=[Input()], window_length=3),
            '4': Any(inputs=[Input()], window_length=4),
        }),
        initial_workspace={Input(): data},
        mask=self.build_mask(ones(shape=data.shape)),
    )
    check_arrays(results['3'], expected_3)
    check_arrays(results['4'], expected_4)
def check_terms(self, terms, expected, initial_workspace, mask):
    """
    Compile the given terms into a TermGraph, compute it with
    initial_workspace, and compare the results with ``expected``.
    """
    results = self.run_graph(TermGraph(terms), initial_workspace, mask)
    # dzip_exact raises if the two dicts don't share exactly one key set.
    pairs = dzip_exact(results, expected)
    for key in pairs:
        actual_result, expected_result = pairs[key]
        check_arrays(actual_result, expected_result)
def test_rolling_and_nonrolling(self):
    """A pipeline mixing a windowed factor with windowless ``latest``
    terms should compute both correctly over the same date range.
    """
    open_ = USEquityPricing.open
    close = USEquityPricing.close
    volume = USEquityPricing.volume

    # Test for thirty days up to the last day that we think all
    # the assets existed.
    dates_to_test = self.dates[-30:]

    constants = {open_: 1, close: 2, volume: 3}
    loader = PrecomputedLoader(
        constants=constants,
        dates=self.dates,
        sids=self.asset_ids,
    )
    engine = SimplePipelineEngine(
        lambda column: loader, self.dates, self.asset_finder,
    )

    sumdiff = RollingSumDifference()
    result = engine.run_pipeline(
        Pipeline(
            columns={
                'sumdiff': sumdiff,
                'open': open_.latest,
                'close': close.latest,
                'volume': volume.latest,
            },
        ),
        dates_to_test[0],
        dates_to_test[-1],
    )
    self.assertIsNotNone(result)
    self.assertEqual(
        {'sumdiff', 'open', 'close', 'volume'},
        set(result.columns),
    )

    result_index = self.asset_ids * len(dates_to_test)
    result_shape = (len(result_index),)
    check_arrays(
        result['sumdiff'],
        Series(index=result_index, data=full(result_shape, -3, dtype=float)),
    )
    for name, const in [('open', 1), ('close', 2), ('volume', 3)]:
        check_arrays(
            result[name],
            Series(index=result_index, data=full(result_shape, const, dtype=float)),
        )
def check(terms):
    """Run ``terms`` against the shared workspace and verify each rank
    method's output against ``expected_ranks``.
    """
    # `f`, `data`, and `expected_ranks` are closure variables from the
    # enclosing test.
    results = self.run_graph(
        TermGraph(terms),
        initial_workspace={f: data},
        mask=self.build_mask(ones((5, 5))),
    )
    for method_name in terms:
        check_arrays(results[method_name], expected_ranks[method_name])
def test_factor_with_multiple_outputs(self):
    """A factor with multiple outputs should respect its (optional)
    mask for every output column.
    """
    dates = self.dates[5:10]
    assets = self.assets
    asset_ids = self.asset_ids
    constants = self.constants
    num_dates = len(dates)
    num_assets = len(assets)
    open_column = USEquityPricing.open
    close_column = USEquityPricing.close
    engine = SimplePipelineEngine(
        lambda column: self.loader, self.dates, self.asset_finder,
    )

    def create_expected_results(expected_value, mask):
        # Masked-out cells become nan in the expected frame.
        expected_values = where(mask, expected_value, nan)
        return DataFrame(expected_values, index=dates, columns=assets)

    cascading_mask = AssetIDPlusDay() < (asset_ids[-1] + dates[0].day)
    expected_cascading_mask_result = make_cascading_boolean_array(
        shape=(num_dates, num_assets),
    )
    alternating_mask = (AssetIDPlusDay() % 2).eq(0)
    expected_alternating_mask_result = make_alternating_boolean_array(
        shape=(num_dates, num_assets),
        first_value=False,
    )
    expected_no_mask_result = full(
        shape=(num_dates, num_assets),
        fill_value=True,
        dtype=bool_dtype,
    )

    masks = cascading_mask, alternating_mask, NotSpecified
    expected_mask_results = (
        expected_cascading_mask_result,
        expected_alternating_mask_result,
        expected_no_mask_result,
    )
    for mask, expected_mask in zip(masks, expected_mask_results):
        open_price, close_price = MultipleOutputs(mask=mask)
        pipeline = Pipeline(
            columns={'open_price': open_price, 'close_price': close_price},
        )
        if mask is not NotSpecified:
            pipeline.add(mask, 'mask')

        results = engine.run_pipeline(pipeline, dates[0], dates[-1])
        for colname, case_column in (('open_price', open_column),
                                     ('close_price', close_column)):
            if mask is not NotSpecified:
                mask_results = results['mask'].unstack()
                check_arrays(mask_results.values, expected_mask)
            output_results = results[colname].unstack()
            output_expected = create_expected_results(
                constants[case_column],
                expected_mask,
            )
            assert_frame_equal(output_results, output_expected)
def test_isfinite(self):
    """isfinite() should be False exactly where data is nan, inf, or
    -inf.
    """
    data = self.randn_data(seed=10)
    # Poison three columns with the three non-finite values.
    data[:, 0] = nan
    data[:, 2] = inf
    data[:, 4] = -inf
    results = self.run_graph(
        TermGraph({'isfinite': self.f.isfinite()}),
        initial_workspace={self.f: data},
    )
    check_arrays(results['isfinite'], isfinite(data))
def test_any(self):
    """Any() should stay True for (window_length - 1) days after a 1.

    FUN FACT: The inputs and outputs here are exactly the negation of
    the inputs and outputs for test_all above.  This isn't a
    coincidence: by De Morgan's Laws, ~(a & b) == (~a | ~b), and since
    all(a, b) is isomorphic to (a & b) and any(a, b) to (a | b), we
    have all(a, b) == ~any(~a, ~b).
    """
    data = array([[0, 0, 0, 0, 0, 0],
                  [1, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0],
                  [0, 0, 0, 1, 0, 0],
                  [0, 0, 0, 0, 1, 0],
                  [0, 0, 0, 0, 0, 1]], dtype=bool)

    # With a window_length of N, 1's should be "sticky" for the (N - 1)
    # days after the 1 in the base data.  ``self.run_graph`` computes
    # the same number of output rows for all inputs, so we only get the
    # last 4 outputs for expected_3 even though we have enough input
    # data to compute 5 rows.
    expected_3 = array([[1, 1, 1, 0, 0, 0],
                        [0, 1, 1, 1, 0, 0],
                        [0, 0, 1, 1, 1, 0],
                        [0, 0, 0, 1, 1, 1]], dtype=bool)
    expected_4 = array([[1, 1, 1, 0, 0, 0],
                        [1, 1, 1, 1, 0, 0],
                        [0, 1, 1, 1, 1, 0],
                        [0, 0, 1, 1, 1, 1]], dtype=bool)

    class Input(Filter):
        inputs = ()
        window_length = 0

    results = self.run_graph(
        TermGraph({
            '3': Any(inputs=[Input()], window_length=3),
            '4': Any(inputs=[Input()], window_length=4),
        }),
        initial_workspace={Input(): data},
        mask=self.build_mask(ones(shape=data.shape)),
    )
    check_arrays(results['3'], expected_3)
    check_arrays(results['4'], expected_4)
def test_top_and_bottom(self):
    """top(N)/bottom(N) should pass exactly min(N, num_assets) assets
    per day, and masking should behave like nans in the input.
    """
    data = self.randn_data(seed=5)  # Fix a seed for determinism.
    mask_data = ones_like(data, dtype=bool)
    mask_data[:, 0] = False
    nan_data = data.copy()
    nan_data[:, 0] = nan

    mask = Mask()
    workspace = {self.f: data, mask: mask_data}

    methods = ['top', 'bottom']
    counts = 2, 3, 10
    term_combos = list(product(methods, counts, [True, False]))

    def termname(method, count, masked):
        return '_'.join([method, str(count), 'mask' if masked else ''])

    # Add a term for each permutation of top/bottom, count, and
    # mask/no_mask.
    terms = {}
    for method, count, masked in term_combos:
        kwargs = {'N': count}
        if masked:
            kwargs['mask'] = mask
        terms[termname(method, count, masked)] = getattr(self.f, method)(**kwargs)

    results = self.run_graph(TermGraph(terms), initial_workspace=workspace)

    def expected_result(method, count, masked):
        # Ranking with a mask is equivalent to ranking with nans applied
        # on the masked values.
        to_rank = nan_data if masked else data
        if method == 'top':
            return rowwise_rank(-to_rank) < count
        elif method == 'bottom':
            return rowwise_rank(to_rank) < count

    for method, count, masked in term_combos:
        result = results[termname(method, count, masked)]

        # Check that `min(c, num_assets)` assets passed each day.
        passed_per_day = result.sum(axis=1)
        check_arrays(
            passed_per_day,
            full_like(passed_per_day, min(count, data.shape[1])),
        )

        check_arrays(result, expected_result(method, count, masked))
def test_overwrite_adjustment_cases(self, name, baseline, lookback, adjustments, missing_value, perspective_offset, expected):
    """Overwrite adjustments traversed with a perspective offset should
    yield the expected windows; the array should be re-usable across
    traversals.
    """
    adjusted = AdjustedArray(baseline, adjustments, missing_value)
    for _ in range(2):  # Iterate 2x to ensure adjusted_arrays are re-usable.
        window_iter = adjusted.traverse(
            lookback,
            perspective_offset=perspective_offset,
        )
        for yielded, expected_yield in zip_longest(window_iter, expected):
            check_arrays(yielded, expected_yield)
def test_rank_after_mask(self, name, factor_dtype):
    """rank() should compute ranks after applying the mask: masked cells
    become nan and lower-ranked unmasked cells shift down by one.
    """
    f = F(dtype=factor_dtype)
    # data = arange(25).reshape(5, 5).transpose() % 4
    data = array([[0, 1, 2, 3, 0],
                  [1, 2, 3, 0, 1],
                  [2, 3, 0, 1, 2],
                  [3, 0, 1, 2, 3],
                  [0, 1, 2, 3, 0]], dtype=factor_dtype)
    mask_data = ~eye(5, dtype=bool)
    initial_workspace = {f: data, Mask(): mask_data}

    graph = TermGraph({
        "ascending_nomask": f.rank(ascending=True),
        "ascending_mask": f.rank(ascending=True, mask=Mask()),
        "descending_nomask": f.rank(ascending=False),
        "descending_mask": f.rank(ascending=False, mask=Mask()),
    })

    expected = {
        "ascending_nomask": array([[1., 3., 4., 5., 2.],
                                   [2., 4., 5., 1., 3.],
                                   [3., 5., 1., 2., 4.],
                                   [4., 1., 2., 3., 5.],
                                   [1., 3., 4., 5., 2.]]),
        "descending_nomask": array([[4., 3., 2., 1., 5.],
                                    [3., 2., 1., 5., 4.],
                                    [2., 1., 5., 4., 3.],
                                    [1., 5., 4., 3., 2.],
                                    [4., 3., 2., 1., 5.]]),
        # Diagonal should be all nans, and anything whose rank was less
        # than the diagonal in the unmasked calc should go down by 1.
        "ascending_mask": array([[nan, 2., 3., 4., 1.],
                                 [2., nan, 4., 1., 3.],
                                 [2., 4., nan, 1., 3.],
                                 [3., 1., 2., nan, 4.],
                                 [1., 2., 3., 4., nan]]),
        "descending_mask": array([[nan, 3., 2., 1., 4.],
                                  [2., nan, 1., 4., 3.],
                                  [2., 1., nan, 4., 3.],
                                  [1., 4., 3., nan, 2.],
                                  [4., 3., 2., 1., nan]]),
    }

    results = self.run_graph(
        graph,
        initial_workspace,
        mask=self.build_mask(ones((5, 5))),
    )
    for method in results:
        check_arrays(expected[method], results[method])
def test_multiplicative_adjustments(self, name, data, lookback, adjustments, missing_value, perspective_offset, expected):
    """Multiplicative adjustments traversed with a perspective offset
    should yield the expected windows; the array should be re-usable
    across traversals.
    """
    adjusted = AdjustedArray(data, NOMASK, adjustments, missing_value)
    for _ in range(2):  # Iterate 2x to ensure adjusted_arrays are re-usable.
        window_iter = adjusted.traverse(
            lookback,
            perspective_offset=perspective_offset,
        )
        for yielded, expected_yield in zip_longest(window_iter, expected):
            check_arrays(yielded, expected_yield)
def test_map_can_only_return_none_if_missing_value_is_none(self):
    """map() may return None only when the array's missing value is
    None; otherwise it must raise TypeError.
    """
    # Should work: None is a legal output when missing_value is None.
    la = LabelArray(self.strs, missing_value=None)
    result = la.map(lambda x: None)
    check_arrays(
        result,
        LabelArray(np.full_like(self.strs, None), missing_value=None),
    )

    # With a non-None missing value, mapping everything to None fails.
    la = LabelArray(self.strs, missing_value="__MISSING__")
    with self.assertRaises(TypeError):
        la.map(lambda x: None)
def test_normalize_to_query_time(self, expected, tz, dates):
    """normalize_timestamp_to_query_time should be independent of row
    order.

    Order matters in pandas 0.18.2.  Prior to that, using tz_convert on
    a DatetimeIndex with DST/EST timestamps mixed resulted in some of
    them being an hour off (1 hour past midnight).
    """
    for scrambler in self.combos:
        df = pd.DataFrame({"timestamp": dates[scrambler]})
        result = normalize_timestamp_to_query_time(
            df,
            time(8, 45),
            tz,
            inplace=False,
            ts_field='timestamp',
        )
        timestamps = result['timestamp'].values
        check_arrays(np.sort(timestamps), np.sort(expected[scrambler]))
def test_compare_to_str(self, compval, shape, array_astype, missing_value):
    """Comparison and substring operations on a LabelArray should match
    the equivalent numpy operations, with missing entries always False.
    """
    strs = self.strs.reshape(shape).astype(array_astype)
    if missing_value is None:
        # As of numpy 1.9.2, object array != None returns just False
        # instead of an array, with a deprecation warning saying the
        # behavior will change in the future.  Work around that by just
        # using the ufunc.
        notmissing = np.not_equal(strs, missing_value)
    else:
        if not isinstance(missing_value, array_astype):
            missing_value = array_astype(missing_value, 'utf-8')
        notmissing = (strs != missing_value)

    arr = LabelArray(strs, missing_value=missing_value)

    if not isinstance(compval, array_astype):
        compval = array_astype(compval, 'utf-8')

    # arr.missing_value should behave like NaN.
    check_arrays(arr == compval, (strs == compval) & notmissing)
    check_arrays(arr != compval, (strs != compval) & notmissing)

    np_startswith = np.vectorize(lambda elem: elem.startswith(compval))
    check_arrays(arr.startswith(compval), np_startswith(strs) & notmissing)

    np_endswith = np.vectorize(lambda elem: elem.endswith(compval))
    check_arrays(arr.endswith(compval), np_endswith(strs) & notmissing)

    np_contains = np.vectorize(lambda elem: compval in elem)
    check_arrays(arr.has_substring(compval), np_contains(strs) & notmissing)
def check(terms):
    """Run ``terms`` against the shared workspace and verify each
    grouped-rank method's output against ``expected_grouped_ranks``.
    """
    # `f`, `c`, `str_c`, and the *_data values are closure variables
    # from the enclosing test.
    results = self.run_graph(
        TermGraph(terms),
        initial_workspace={
            f: data,
            c: classifier_data,
            str_c: string_classifier_data,
        },
        mask=self.build_mask(ones((5, 5))),
    )
    for method_name in terms:
        check_arrays(results[method_name],
                     expected_grouped_ranks[method_name])
def test_notnan(self):
    """notnan()/notnull() should be False exactly where the data holds
    nan (here, along the diagonal).
    """
    data = self.randn_data(seed=10)
    diag = eye(*data.shape, dtype=bool)
    data[diag] = nan
    results = self.run_graph(
        TermGraph({
            'notnan': self.f.notnan(),
            'notnull': self.f.notnull(),
        }),
        initial_workspace={self.f: data},
    )
    check_arrays(results['notnan'], ~diag)
    check_arrays(results['notnull'], ~diag)
def test_object1darrayoverwrite(self):
    """Object1DArrayOverwrite adjustments should be applied to label
    windows as traversal scrolls past each adjustment date.
    """
    pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
    categories = pairs + ["~" + c for c in pairs]
    baseline = LabelArray(
        np.array([["".join((r, c)) for c in "abc"] for r in ascii_uppercase]),
        None,
        categories,
    )
    full_expected = baseline.copy()

    def flip(cs):
        # Prefix a label with '~' unless already flipped; None stays None.
        if cs is None:
            return None
        if cs[0] != "~":
            return "~" + cs
        return cs

    def make_overwrite(fr, lr, fc, lc):
        # Build an overwrite over the inclusive row range [fr, lr] and
        # column range [fc, lc], given as letters.
        fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
        fr -= ord("A")
        lr -= ord("A")
        fc -= ord("a")
        lc -= ord("a")
        return Object1DArrayOverwrite(
            fr, lr, fc, lc,
            baseline[fr:lr + 1, fc].map(flip),
        )

    overwrites = {
        3: [make_overwrite("A", "B", "a", "a")],
        4: [make_overwrite("A", "C", "b", "c")],
        5: [make_overwrite("D", "D", "a", "b")],
    }

    it = AdjustedArray(baseline, overwrites, None).traverse(3)

    window = next(it)
    check_arrays(window, full_expected[:3])

    window = next(it)
    full_expected[0:2, 0] = LabelArray(["~Aa", "~Ba"], None)
    check_arrays(window, full_expected[1:4])

    window = next(it)
    full_expected[0:3, 1:3] = LabelArray(
        [["~Ab", "~Ac"], ["~Bb", "~Bc"], ["~Cb", "~Cb"]], None
    )
    check_arrays(window, full_expected[2:5])

    window = next(it)
    full_expected[3, :2] = "~Da"
    check_arrays(window, full_expected[3:6])
def test_object1darrayoverwrite(self):
    """Object1DArrayOverwrite adjustments should be applied to label
    windows as traversal scrolls past each adjustment date.
    """
    pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
    categories = pairs + ['~' + c for c in pairs]
    baseline = LabelArray(
        array([[''.join((r, c)) for c in 'abc'] for r in ascii_uppercase]),
        None,
        categories,
    )
    full_expected = baseline.copy()

    def flip(cs):
        # Prefix a label with '~' unless already flipped; None stays None.
        if cs is None:
            return None
        if cs[0] != '~':
            return '~' + cs
        return cs

    def make_overwrite(fr, lr, fc, lc):
        # Build an overwrite over the inclusive row range [fr, lr] and
        # column range [fc, lc], given as letters.
        fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
        fr -= ord('A')
        lr -= ord('A')
        fc -= ord('a')
        lc -= ord('a')
        return Object1DArrayOverwrite(
            fr, lr, fc, lc,
            baseline[fr:lr + 1, fc].map(flip),
        )

    overwrites = {
        3: [make_overwrite('A', 'B', 'a', 'a')],
        4: [make_overwrite('A', 'C', 'b', 'c')],
        5: [make_overwrite('D', 'D', 'a', 'b')],
    }

    it = AdjustedArray(baseline, overwrites, None).traverse(3)

    window = next(it)
    check_arrays(window, full_expected[:3])

    window = next(it)
    full_expected[0:2, 0] = LabelArray(['~Aa', '~Ba'], None)
    check_arrays(window, full_expected[1:4])

    window = next(it)
    full_expected[0:3, 1:3] = LabelArray(
        [['~Ab', '~Ac'], ['~Bb', '~Bc'], ['~Cb', '~Cb']], None
    )
    check_arrays(window, full_expected[2:5])

    window = next(it)
    full_expected[3, :2] = '~Da'
    check_arrays(window, full_expected[3:6])
def test_rank_after_mask(self, name, factor_dtype):
    """rank() should compute ranks after applying the mask: masked cells
    become nan and lower-ranked unmasked cells shift down by one.
    """
    f = F(dtype=factor_dtype)
    # data = arange(25).reshape(5, 5).transpose() % 4
    data = array([[0, 1, 2, 3, 0],
                  [1, 2, 3, 0, 1],
                  [2, 3, 0, 1, 2],
                  [3, 0, 1, 2, 3],
                  [0, 1, 2, 3, 0]], dtype=factor_dtype)
    mask_data = ~eye(5, dtype=bool)
    initial_workspace = {f: data, Mask(): mask_data}

    graph = TermGraph({
        "ascending_nomask": f.rank(ascending=True),
        "ascending_mask": f.rank(ascending=True, mask=Mask()),
        "descending_nomask": f.rank(ascending=False),
        "descending_mask": f.rank(ascending=False, mask=Mask()),
    })

    expected = {
        "ascending_nomask": array([[1., 3., 4., 5., 2.],
                                   [2., 4., 5., 1., 3.],
                                   [3., 5., 1., 2., 4.],
                                   [4., 1., 2., 3., 5.],
                                   [1., 3., 4., 5., 2.]]),
        "descending_nomask": array([[4., 3., 2., 1., 5.],
                                    [3., 2., 1., 5., 4.],
                                    [2., 1., 5., 4., 3.],
                                    [1., 5., 4., 3., 2.],
                                    [4., 3., 2., 1., 5.]]),
        # Diagonal should be all nans, and anything whose rank was less
        # than the diagonal in the unmasked calc should go down by 1.
        "ascending_mask": array([[nan, 2., 3., 4., 1.],
                                 [2., nan, 4., 1., 3.],
                                 [2., 4., nan, 1., 3.],
                                 [3., 1., 2., nan, 4.],
                                 [1., 2., 3., 4., nan]]),
        "descending_mask": array([[nan, 3., 2., 1., 4.],
                                  [2., nan, 1., 4., 3.],
                                  [2., 1., nan, 4., 3.],
                                  [1., 4., 3., nan, 2.],
                                  [4., 3., 2., 1., nan]]),
    }

    results = self.run_graph(
        graph,
        initial_workspace,
        mask=self.build_mask(ones((5, 5))),
    )
    for method in results:
        check_arrays(expected[method], results[method])
def test_object1darrayoverwrite(self):
    """Object1DArrayOverwrite adjustments should be applied to label
    windows as traversal scrolls past each adjustment date.
    """
    pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
    categories = pairs + ['~' + c for c in pairs]
    baseline = LabelArray(
        array([[''.join((r, c)) for c in 'abc'] for r in ascii_uppercase]),
        None,
        categories,
    )
    full_expected = baseline.copy()

    def flip(cs):
        # Prefix a label with '~' unless already flipped; None stays None.
        if cs is None:
            return None
        if cs[0] != '~':
            return '~' + cs
        return cs

    def make_overwrite(fr, lr, fc, lc):
        # Build an overwrite over the inclusive row range [fr, lr] and
        # column range [fc, lc], given as letters.
        fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
        fr -= ord('A')
        lr -= ord('A')
        fc -= ord('a')
        lc -= ord('a')
        return Object1DArrayOverwrite(
            fr, lr, fc, lc,
            baseline[fr:lr + 1, fc].map(flip),
        )

    overwrites = {
        3: [make_overwrite('A', 'B', 'a', 'a')],
        4: [make_overwrite('A', 'C', 'b', 'c')],
        5: [make_overwrite('D', 'D', 'a', 'b')],
    }

    it = AdjustedArray(baseline, overwrites, None).traverse(3)

    window = next(it)
    check_arrays(window, full_expected[:3])

    window = next(it)
    full_expected[0:2, 0] = LabelArray(['~Aa', '~Ba'], None)
    check_arrays(window, full_expected[1:4])

    window = next(it)
    full_expected[0:3, 1:3] = LabelArray([['~Ab', '~Ac'],
                                          ['~Bb', '~Bc'],
                                          ['~Cb', '~Cb']], None)
    check_arrays(window, full_expected[2:5])

    window = next(it)
    full_expected[3, :2] = '~Da'
    check_arrays(window, full_expected[3:6])
def test_masking(self, dtype, missing_value, window_length):
    """Cells masked out of an AdjustedArray should read back as the
    missing value in every window yielded by traverse().
    """
    missing_value = coerce_to_dtype(dtype, missing_value)
    baseline_ints = arange(15).reshape(5, 3)
    baseline = baseline_ints.astype(dtype)
    # Keep odd cells; even cells are masked out.
    mask = (baseline_ints % 2).astype(bool)
    masked_baseline = where(mask, baseline, missing_value)

    array = AdjustedArray(
        baseline,
        mask,
        adjustments={},
        missing_value=missing_value,
    )

    expected_windows = moving_window(masked_baseline, window_length)
    actual_windows = array.traverse(window_length)
    for expected, actual in zip(expected_windows, actual_windows):
        check_arrays(expected, actual)
def test_all(self):
    """All() should stay False for (window_length - 1) days after a 0.
    """
    data = array([[1, 1, 1, 1, 1, 1],
                  [0, 1, 1, 1, 1, 1],
                  [1, 0, 1, 1, 1, 1],
                  [1, 1, 0, 1, 1, 1],
                  [1, 1, 1, 0, 1, 1],
                  [1, 1, 1, 1, 0, 1],
                  [1, 1, 1, 1, 1, 0]], dtype=bool)

    # With a window_length of N, 0's should be "sticky" for the (N - 1)
    # days after the 0 in the base data.  Note that, the way
    # ``self.run_graph`` works, we compute the same number of output
    # rows for all inputs, so we only get the last 4 outputs for
    # expected_3 even though we have enough input data to compute 5.
    expected_3 = array([[0, 0, 0, 1, 1, 1],
                        [1, 0, 0, 0, 1, 1],
                        [1, 1, 0, 0, 0, 1],
                        [1, 1, 1, 0, 0, 0]], dtype=bool)
    expected_4 = array([[0, 0, 0, 1, 1, 1],
                        [0, 0, 0, 0, 1, 1],
                        [1, 0, 0, 0, 0, 1],
                        [1, 1, 0, 0, 0, 0]], dtype=bool)

    class Input(Filter):
        inputs = ()
        window_length = 0

    results = self.run_graph(
        TermGraph({
            '3': All(inputs=[Input()], window_length=3),
            '4': All(inputs=[Input()], window_length=4),
        }),
        initial_workspace={Input(): data},
        mask=self.build_mask(ones(shape=data.shape)),
    )
    check_arrays(results['3'], expected_3)
    check_arrays(results['4'], expected_4)
def test_update_labels(self):
    """update_labels should map both the baseline LabelArray and the
    values stored inside pending adjustments.
    """
    data = np.array([
        ["aaa", "bbb", "ccc"],
        ["ddd", "eee", "fff"],
        ["ggg", "hhh", "iii"],
        ["jjj", "kkk", "lll"],
        ["mmm", "nnn", "ooo"],
    ])
    label_array = LabelArray(data, missing_value="")
    adj_array = AdjustedArray(
        data=label_array,
        adjustments={4: [ObjectOverwrite(2, 3, 0, 0, "ppp")]},
        missing_value="",
    )

    expected_data = np.array([
        ["aaa-foo", "bbb-foo", "ccc-foo"],
        ["ddd-foo", "eee-foo", "fff-foo"],
        ["ggg-foo", "hhh-foo", "iii-foo"],
        ["jjj-foo", "kkk-foo", "lll-foo"],
        ["mmm-foo", "nnn-foo", "ooo-foo"],
    ])
    expected_label_array = LabelArray(expected_data, missing_value="")
    expected_adj_array = AdjustedArray(
        data=expected_label_array,
        adjustments={4: [ObjectOverwrite(2, 3, 0, 0, "ppp-foo")]},
        missing_value="",
    )

    adj_array.update_labels(lambda x: x + "-foo")

    # Check that the mapped AdjustedArray has the expected baseline
    # values and adjustment values.
    check_arrays(adj_array.data, expected_adj_array.data)
    assert adj_array.adjustments == expected_adj_array.adjustments