def test_relabel_strings(self, relabel_func, labelarray_dtype):
    """`c.relabel(f)` on a string classifier should apply `f` elementwise
    to every label, matching a numpy vectorized application of `f`.
    """
    class C(Classifier):
        inputs = ()
        dtype = categorical_dtype
        missing_value = None
        window_length = 0

    c = C()

    raw = np.asarray(
        [['a', 'aa', 'aaa', 'abab'],
         ['bab', 'aba', 'aa', 'bb'],
         ['a', 'aba', 'abaa', 'abaab'],
         ['a', 'aa', 'aaa', 'aaaa']],
        dtype=labelarray_dtype,
    )
    # Reference result: relabel_func applied to each element via numpy.
    # otypes=[object] keeps string results from being truncated.
    raw_relabeled = np.vectorize(relabel_func, otypes=[object])(raw)

    data = LabelArray(raw, missing_value=None)

    terms = {
        'relabeled': c.relabel(relabel_func),
    }
    expected_results = {
        'relabeled': LabelArray(raw_relabeled, missing_value=None),
    }

    self.check_terms(
        terms,
        expected_results,
        initial_workspace={c: data},
        mask=self.build_mask(self.ones_mask(shape=data.shape)),
    )
def test_narrow_condense_back_to_valid_size(self):
    """Raw inputs longer than a code-width boundary should still be
    stored narrowly when the *deduplicated* category set fits.
    """
    # 2**8 + 1 raw values, but only one unique category -> still uint8.
    categories = ['a'] * (2**8 + 1)
    arr = LabelArray(categories, missing_value=categories[0])
    assert_equal(arr.itemsize, 1)
    self.check_roundtrip(arr)

    # longer than int16 but still fits when deduped
    categories = self.create_categories(16, plus_one=False)
    categories.append(categories[0])
    arr = LabelArray(categories, missing_value=categories[0])
    assert_equal(arr.itemsize, 2)
    self.check_roundtrip(arr)
def test_map(self, f):
    """Mapping a callable over a LabelArray should agree with applying
    the same callable to the raw strings via numpy.
    """
    raw = np.array(
        [['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
         ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
         ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
         ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']],
        dtype=object,
    )
    labels = LabelArray(raw, missing_value=None)
    via_numpy = np.vectorize(f)(raw)
    via_labelarray = labels.map(f).as_string_array()
    assert_equal(via_numpy, via_labelarray)
def test_slicing_preserves_attributes(self, slice_):
    """A slice of a LabelArray must itself be a LabelArray and must share
    (not copy) the parent's categories, reverse mapping, and missing value.
    """
    parent = LabelArray(self.strs.reshape((9, 3)), missing_value='')
    child = parent[slice_]
    self.assertIsInstance(child, LabelArray)
    self.assertIs(child.categories, parent.categories)
    self.assertIs(child.reverse_categories, parent.reverse_categories)
    self.assertIs(child.missing_value, parent.missing_value)
def to_workspace_value(self, result, assets):
    """
    Called with the result of a pipeline. This needs to return an object
    which can be put into the workspace to continue doing computations.

    This is the inverse of :func:`~gateway.pipeline.term.Term.postprocess`.
    """
    # Integer classifiers round-trip through the default implementation.
    if self.dtype == int64_dtype:
        return super(Classifier, self).to_workspace_value(result, assets)

    assert isinstance(result.values, pd.Categorical), (
        'Expected a Categorical, got %r.' % type(result.values)
    )
    # Rebuild the Categorical with missing_value added to its categories
    # so the superclass can fill unrepresented entries with it.
    # NOTE(review): assumes the superclass fills gaps with missing_value
    # during reindexing -- confirm against the base implementation.
    with_missing = pd.Series(
        data=pd.Categorical(
            result.values,
            result.values.categories.union([self.missing_value]),
        ),
        index=result.index,
    )
    return LabelArray(
        super(Classifier, self).to_workspace_value(
            with_missing,
            assets,
        ),
        self.missing_value,
    )
def test_string_eq(self, compval, labelarray_dtype):
    """`c.eq(compval)` on a string classifier should match elementwise
    `==` against the LabelArray for the same comparison value.
    """
    compval = labelarray_dtype.type(compval)

    class C(Classifier):
        dtype = categorical_dtype
        missing_value = ''
        inputs = ()
        window_length = 0

    c = C()

    # There's no significance to the values here other than that they
    # contain a mix of the comparison value and other values.
    data = LabelArray(
        np.asarray(
            [['', 'a', 'ab', 'ba'],
             ['z', 'ab', 'a', 'ab'],
             ['aa', 'ab', '', 'ab'],
             ['aa', 'a', 'ba', 'ba']],
            dtype=labelarray_dtype,
        ),
        missing_value='',
    )

    self.check_terms(
        terms={
            'eq': c.eq(compval),
        },
        expected={
            'eq': (data == compval),
        },
        initial_workspace={c: data},
        mask=self.build_mask(self.ones_mask(shape=data.shape)),
    )
def test_string_isnull(self, mv):
    """`isnull`/`notnull` on a string classifier should flag exactly the
    positions equal to the configured missing value `mv`.
    """
    class C(Classifier):
        dtype = categorical_dtype
        missing_value = mv
        inputs = ()
        window_length = 0

    c = C()

    # There's no significance to the values here other than that they
    # contain a mix of missing and non-missing values.
    raw = np.asarray(
        [['', 'a', 'ab', 'ba'],
         ['z', 'ab', 'a', 'ab'],
         ['aa', 'ab', '', 'ab'],
         ['aa', 'a', 'ba', 'ba']],
        dtype=categorical_dtype,
    )
    data = LabelArray(raw, missing_value=mv)

    self.check_terms(
        terms={
            'isnull': c.isnull(),
            'notnull': c.notnull()
        },
        expected={
            'isnull': np.equal(raw, mv),
            'notnull': np.not_equal(raw, mv),
        },
        initial_workspace={c: data},
        mask=self.build_mask(self.ones_mask(shape=data.shape)),
    )
def test_relabel_missing_value_interactions(self, missing_value):
    """Relabeling must skip positions that are already missing, and a
    relabel function returning the missing value produces missing output.
    """
    mv = missing_value

    class C(Classifier):
        inputs = ()
        dtype = categorical_dtype
        missing_value = mv
        window_length = 0

    c = C()

    def relabel_func(s):
        # 'B' maps to the missing value; everything else is doubled.
        if s == 'B':
            return mv
        return ''.join([s, s])

    raw = np.asarray(
        [['A', 'B', 'C', mv],
         [mv, 'A', 'B', 'C'],
         ['C', mv, 'A', 'B'],
         ['B', 'C', mv, 'A']],
        dtype=categorical_dtype,
    )
    data = LabelArray(raw, missing_value=mv)

    # Already-missing cells stay missing; 'B' cells become missing.
    expected_relabeled_raw = np.asarray(
        [['AA', mv, 'CC', mv],
         [mv, 'AA', mv, 'CC'],
         ['CC', mv, 'AA', mv],
         [mv, 'CC', mv, 'AA']],
        dtype=categorical_dtype,
    )

    terms = {
        'relabeled': c.relabel(relabel_func),
    }
    expected_results = {
        'relabeled': LabelArray(expected_relabeled_raw, missing_value=mv),
    }

    self.check_terms(
        terms,
        expected_results,
        initial_workspace={c: data},
        mask=self.build_mask(self.ones_mask(shape=data.shape)),
    )
def check_roundtrip(arr):
    """Assert that rebuilding a LabelArray from its own string form is
    lossless (the string representations match before and after).
    """
    rebuilt = LabelArray(
        arr.as_string_array(),
        arr.missing_value,
    )
    assert_equal(arr.as_string_array(), rebuilt.as_string_array())
def test_compare_to_str(self, compval, shape, array_astype, missing_value):
    """Scalar comparisons and substring predicates on a LabelArray should
    agree with numpy on the raw strings, except that missing positions
    always compare False.
    """
    strs = self.strs.reshape(shape).astype(array_astype)
    if missing_value is None:
        # As of numpy 1.9.2, object array != None returns just False
        # instead of an array, with a deprecation warning saying the
        # behavior will change in the future. Work around that by just
        # using the ufunc.
        notmissing = np.not_equal(strs, missing_value)
    else:
        # Coerce the missing value to the array's string type (bytes vs
        # unicode) so the comparison below is well-defined.
        if not isinstance(missing_value, array_astype):
            missing_value = array_astype(missing_value, 'utf-8')
        notmissing = (strs != missing_value)

    arr = LabelArray(strs, missing_value=missing_value)

    if not isinstance(compval, array_astype):
        compval = array_astype(compval, 'utf-8')

    # arr.missing_value should behave like NaN.
    check_arrays(
        arr == compval,
        (strs == compval) & notmissing,
    )
    check_arrays(
        arr != compval,
        (strs != compval) & notmissing,
    )

    np_startswith = np.vectorize(lambda elem: elem.startswith(compval))
    check_arrays(
        arr.startswith(compval),
        np_startswith(strs) & notmissing,
    )

    np_endswith = np.vectorize(lambda elem: elem.endswith(compval))
    check_arrays(
        arr.endswith(compval),
        np_endswith(strs) & notmissing,
    )

    np_contains = np.vectorize(lambda elem: compval in elem)
    check_arrays(
        arr.has_substring(compval),
        np_contains(strs) & notmissing,
    )
def test_compare_to_str_array(self, missing_value):
    """Array-vs-array comparisons (full 2D, row vector, column vector)
    should match numpy semantics, with missing positions comparing False.
    """
    strs = self.strs
    shape = strs.shape
    arr = LabelArray(strs, missing_value=missing_value)

    if missing_value is None:
        # As of numpy 1.9.2, object array != None returns just False
        # instead of an array, with a deprecation warning saying the
        # behavior will change in the future. Work around that by just
        # using the ufunc.
        notmissing = np.not_equal(strs, missing_value)
    else:
        notmissing = (strs != missing_value)

    check_arrays(arr.not_missing(), notmissing)
    check_arrays(arr.is_missing(), ~notmissing)

    # The arrays are equal everywhere, but comparisons against the
    # missing_value should always produce False
    check_arrays(strs == arr, notmissing)
    check_arrays(strs != arr, np.zeros_like(strs, dtype=bool))

    # NOTE(review): both helpers ignore their `dtype` parameter and fill
    # with `strs.dtype` instead -- possibly intentional, worth confirming.
    def broadcastable_row(value, dtype):
        return np.full((shape[0], 1), value, dtype=strs.dtype)

    def broadcastable_col(value, dtype):
        return np.full((1, shape[1]), value, dtype=strs.dtype)

    # Test comparison between arr and a like-shape 2D array, a column
    # vector, and a row vector.
    for comparator, dtype, value in product((eq, ne),
                                            (bytes, unicode, object),
                                            set(self.rowvalues)):
        check_arrays(
            comparator(arr, np.full_like(strs, value)),
            comparator(strs, value) & notmissing,
        )
        check_arrays(
            comparator(arr, broadcastable_row(value, dtype=dtype)),
            comparator(strs, value) & notmissing,
        )
        check_arrays(
            comparator(arr, broadcastable_col(value, dtype=dtype)),
            comparator(strs, value) & notmissing,
        )
def as_labelarray(initial_dtype, missing_value, array):
    """
    Curried wrapper around LabelArray, that round-trips the input data
    through `initial_dtype` first.
    """
    coerced = array.astype(initial_dtype)
    coerced_missing = initial_dtype.type(missing_value)
    return LabelArray(coerced, missing_value=coerced_missing)
def manual_narrow_condense_back_to_valid_size_slow(self):
    """This test is really slow so we don't want it run by default.
    """
    # tests that we don't try to create an 'int24' (which is meaningless)
    cats = self.create_categories(24, plus_one=False)
    cats.append(cats[0])
    labels = LabelArray(cats, missing_value=cats[0])
    assert_equal(labels.itemsize, 4)
    self.check_roundtrip(labels)
def test_narrow_code_storage(self):
    """LabelArray should pick the narrowest unsigned-integer code storage
    that holds all categories, both when categories are given explicitly
    and when they are inferred from the data.
    """
    # (bits, plus_one, expected itemsize, also check inference from data)
    cases = [
        (8, False, 1, True),    # exactly fills uint8
        (8, True, 2, False),    # one category too many for uint8
        (16, False, 2, True),   # exactly fills uint16
        (16, True, 4, True),    # one category too many for uint16
    ]
    for bits, plus_one, expected_itemsize, check_inference in cases:
        categories = self.create_categories(bits, plus_one=plus_one)

        # Explicit categories with empty data.
        explicit = LabelArray(
            [],
            missing_value=categories[0],
            categories=categories,
        )
        self.assertEqual(explicit.itemsize, expected_itemsize)
        self.check_roundtrip(explicit)

        if check_inference:
            # Categories inferred from the data itself.
            inferred = LabelArray(categories, missing_value=categories[0])
            self.assertEqual(inferred.itemsize, expected_itemsize)
            self.check_roundtrip(inferred)
def test_reversability_categorical(self):
    """`to_workspace_value` should invert `postprocess` for a categorical
    classifier: a categorical pipeline output round-trips back to the
    original LabelArray.
    """
    class F(Classifier):
        inputs = ()
        window_length = 0
        dtype = categorical_dtype
        missing_value = '<missing>'

    f = F()
    column_data = LabelArray(
        np.array(
            [['a', f.missing_value],
             ['b', f.missing_value],
             ['c', 'd']],
        ),
        missing_value=f.missing_value,
    )

    assert_equal(
        f.postprocess(column_data.ravel()),
        pd.Categorical(
            ['a', f.missing_value, 'b', f.missing_value, 'c', 'd'],
        ),
    )

    # only include the non-missing data
    pipeline_output = pd.Series(
        data=['a', 'b', 'c', 'd'],
        index=pd.MultiIndex.from_arrays([
            [pd.Timestamp('2014-01-01'),
             pd.Timestamp('2014-01-02'),
             pd.Timestamp('2014-01-03'),
             pd.Timestamp('2014-01-03')],
            [0, 0, 0, 1],
        ]),
        dtype='category',
    )

    assert_equal(
        f.to_workspace_value(pipeline_output, pd.Index([0, 1])),
        column_data,
    )
def test_string_not_equal(self, compval, missing, labelarray_dtype):
    """`c != compval` should be True exactly where the label differs from
    `compval` AND is not the missing value.
    """
    compval = labelarray_dtype.type(compval)

    class C(Classifier):
        dtype = categorical_dtype
        missing_value = missing
        inputs = ()
        window_length = 0

    c = C()

    # There's no significance to the values here other than that they
    # contain a mix of the comparison value and other values.
    data = LabelArray(
        np.asarray(
            [['', 'a', 'ab', 'ba'],
             ['z', 'ab', 'a', 'ab'],
             ['aa', 'ab', '', 'ab'],
             ['aa', 'a', 'ba', 'ba']],
            dtype=labelarray_dtype,
        ),
        missing_value=missing,
    )

    # Expected via integer codes: not equal to compval's code (-1 when
    # compval isn't a category) and not equal to the missing value's code.
    expected = (
        (data.as_int_array() != data.reverse_categories.get(compval, -1)) &
        (data.as_int_array() != data.reverse_categories[C.missing_value])
    )

    self.check_terms(
        terms={
            'ne': c != compval,
        },
        expected={
            'ne': expected,
        },
        initial_workspace={c: data},
        mask=self.build_mask(self.ones_mask(shape=data.shape)),
    )
def test_map_can_only_return_none_if_missing_value_is_none(self):
    """`map` may produce None labels only when None is the array's
    missing value; otherwise returning None must raise TypeError.
    """
    # With missing_value=None, mapping everything to None is legal.
    labels = LabelArray(self.strs, missing_value=None)
    mapped = labels.map(lambda x: None)
    check_arrays(
        mapped,
        LabelArray(np.full_like(self.strs, None), missing_value=None),
    )

    # With any other missing value, returning None is an error.
    labels = LabelArray(self.strs, missing_value="__MISSING__")
    with self.assertRaises(TypeError):
        labels.map(lambda x: None)
def test_map_shrinks_code_storage_if_possible(self):
    """Mapping down to a tiny category set should shrink the integer
    code storage (uint16 -> uint8 here).
    """
    arr = LabelArray(
        # Drop the last value so we fit in a uint16 with None as a missing
        # value.
        self.create_categories(16, plus_one=False)[:-1],
        missing_value=None,
    )
    self.assertEqual(arr.itemsize, 2)

    def either_A_or_B(s):
        # Deterministic two-way choice based on the parity of the
        # character-code sum.
        if sum(ord(c) for c in s) % 2:
            return 'B'
        return 'A'

    result = arr.map(either_A_or_B)
    self.assertEqual(set(result.categories), {'A', 'B', None})
    self.assertEqual(result.itemsize, 1)
    assert_equal(
        np.vectorize(either_A_or_B)(arr.as_string_array()),
        result.as_string_array(),
    )
def test_setitem_array(self):
    """Assigning a row, a column, or the whole array into a LabelArray
    should behave like the equivalent ndarray assignment.
    """
    arr = LabelArray(self.strs, missing_value=None)
    orig_arr = arr.copy()

    # Write a row.
    self.assertFalse(
        (arr[0] == arr[1]).all(),
        "This test doesn't test anything because rows 0"
        " and 1 are already equal!"
    )
    arr[0] = arr[1]
    for i in range(arr.shape[1]):
        self.assertEqual(arr[0, i], arr[1, i])

    # Write a column.
    self.assertFalse(
        (arr[:, 0] == arr[:, 1]).all(),
        "This test doesn't test anything because columns 0"
        " and 1 are already equal!"
    )
    arr[:, 0] = arr[:, 1]
    for i in range(arr.shape[0]):
        self.assertEqual(arr[i, 0], arr[i, 1])

    # Write the whole array.
    arr[:] = orig_arr
    check_arrays(arr, orig_arr)
def test_copy_categories_list(self):
    """regression test for #1927
    """
    original = ['a', 'b', 'c']
    LabelArray(
        [None, 'a', 'b', 'c'],
        missing_value=None,
        categories=original,
    )
    # before #1927 we didn't take a copy and would insert the missing value
    # (None) into the list
    assert_equal(original, ['a', 'b', 'c'])
def test_map_ignores_missing_value(self, missing):
    """`map` should leave missing positions untouched and apply the
    callable only to non-missing labels.
    """
    raw = np.array([missing, 'B', 'C'], dtype=object)
    labels = LabelArray(raw, missing_value=missing)

    def next_char(c):
        return chr(ord(c) + 1)

    mapped = labels.map(next_char)
    expected = LabelArray([missing, 'C', 'D'], missing_value=missing)
    assert_equal(mapped.as_string_array(), expected.as_string_array())
def _normalize_array(data, missing_value):
    """
    Coerce buffer data for an AdjustedArray into a standard scalar
    representation, returning the coerced array and a dict of arguments to
    pass to np.view to use when providing a user-facing view of the
    underlying data.

    - float* data is coerced to float64 with viewtype float64.
    - int32, int64, and uint32 are converted to int64 with viewtype int64.
    - datetime[*] data is coerced to int64 with a viewtype of
      datetime64[ns].
    - bool_ data is coerced to uint8 with a viewtype of bool_.
    - categorical (string) data is wrapped in a LabelArray.

    Parameters
    ----------
    data : np.ndarray or LabelArray
        The array to normalize. LabelArrays are passed through unchanged.
    missing_value : object
        Used only for categorical dtypes, where it must be an instance of
        ``LabelArray.SUPPORTED_SCALAR_TYPES``; ignored otherwise.

    Returns
    -------
    coerced, view_kwargs : (np.ndarray or LabelArray, dict)
        The normalized data and the kwargs describing how to view it.

    Raises
    ------
    TypeError
        If ``missing_value`` is invalid for a categorical array, or if
        ``data`` has an unsupported dtype.
    ValueError
        If datetime data cannot be represented as datetime64[ns].
    """
    # LabelArrays are already in normalized form.
    if isinstance(data, LabelArray):
        return data, {}

    data_dtype = data.dtype
    if data_dtype in BOOL_DTYPES:
        return data.astype(uint8), {'dtype': dtype(bool_)}
    elif data_dtype in FLOAT_DTYPES:
        return data.astype(float64), {'dtype': dtype(float64)}
    elif data_dtype in INT_DTYPES:
        return data.astype(int64), {'dtype': dtype(int64)}
    elif is_categorical(data_dtype):
        if not isinstance(missing_value, LabelArray.SUPPORTED_SCALAR_TYPES):
            raise TypeError(
                "Invalid missing_value for categorical array.\n"
                "Expected None, bytes or unicode. Got %r." % missing_value,
            )
        return LabelArray(data, missing_value), {}
    elif data_dtype.kind == 'M':
        try:
            # Store datetimes as int64 nanoseconds; callers re-view the
            # result as datetime64[ns] via the returned kwargs.
            outarray = data.astype('datetime64[ns]').view('int64')
            return outarray, {'dtype': datetime64ns_dtype}
        except OverflowError:
            raise ValueError("AdjustedArray received a datetime array "
                             "not representable as datetime64[ns].\n"
                             "Min Date: %s\n"
                             "Max Date: %s\n" % (data.min(), data.max()))
    else:
        raise TypeError("Don't know how to construct AdjustedArray "
                        "on data of type %s." % data_dtype)
def test_infer_categories(self):
    """
    Test that categories are inferred in sorted order if they're not
    explicitly passed.
    """
    arr1d = LabelArray(self.strs, missing_value='')
    codes1d = arr1d.as_int_array()
    self.assertEqual(arr1d.shape, self.strs.shape)
    self.assertEqual(arr1d.shape, codes1d.shape)

    categories = arr1d.categories
    unique_rowvalues = set(self.rowvalues)

    # There should be an entry in categories for each unique row value, and
    # each integer stored in the data array should be an index into
    # categories.
    self.assertEqual(list(categories), sorted(set(self.rowvalues)))
    self.assertEqual(
        set(codes1d.ravel()),
        set(range(len(unique_rowvalues)))
    )
    # Each category's code should select exactly the positions holding
    # that category in the raw strings.
    for idx, value in enumerate(arr1d.categories):
        check_arrays(
            self.strs == value,
            arr1d.as_int_array() == idx,
        )

    # It should be equivalent to pass the same set of categories manually.
    arr1d_explicit_categories = LabelArray(
        self.strs,
        missing_value='',
        categories=arr1d.categories,
    )
    check_arrays(arr1d, arr1d_explicit_categories)

    # Inference should be shape-independent: 2D and 3D reshapes of the
    # same data produce the same categories and consistent codes.
    for shape in (9, 3), (3, 9), (3, 3, 3):
        strs2d = self.strs.reshape(shape)
        arr2d = LabelArray(strs2d, missing_value='')
        codes2d = arr2d.as_int_array()

        self.assertEqual(arr2d.shape, shape)
        check_arrays(arr2d.categories, categories)

        for idx, value in enumerate(arr2d.categories):
            check_arrays(strs2d == value, codes2d == idx)
def test_element_of_strings(self, container_type, labelarray_dtype):
    """`c.element_of(choices)` should match membership of each raw label
    in `choices`, across several container types and contents.
    """
    missing = labelarray_dtype.type("not in the array")

    class C(Classifier):
        dtype = categorical_dtype
        missing_value = missing
        inputs = ()
        window_length = 0

    c = C()

    raw = np.asarray(
        [['', 'a', 'ab', 'ba'],
         ['z', 'ab', 'a', 'ab'],
         ['aa', 'ab', '', 'ab'],
         ['aa', 'a', 'ba', 'ba']],
        dtype=labelarray_dtype,
    )
    data = LabelArray(raw, missing_value=missing)

    # Choice sets covering: empty, partial overlap, duplicates, every
    # category, a value not in the array, and many unmatched values.
    choices = [
        container_type(choices)
        for choices in [
            [],
            ['a', ''],
            ['a', 'a', 'a', 'ab', 'a'],
            set(data.reverse_categories) - {missing},
            ['random value', 'ab'],
            ['_' * i for i in range(30)],
        ]
    ]

    def make_expected(choice_set):
        # Reference membership computed elementwise on the raw strings.
        return np.vectorize(choice_set.__contains__, otypes=[bool])(raw)

    terms = {str(i): c.element_of(s) for i, s in enumerate(choices)}
    expected = {str(i): make_expected(s) for i, s in enumerate(choices)}

    self.check_terms(
        terms=terms,
        expected=expected,
        initial_workspace={c: data},
        mask=self.build_mask(self.ones_mask(shape=data.shape)),
    )
def expected_latest(self, column, slice_):
    """
    Build the expected output frame for `column` over the calendar rows
    selected by `slice_`.

    Parameters
    ----------
    column : BoundColumn-like
        Pipeline column whose loader supplies the raw values.
    slice_ : slice
        Row selection applied to both the calendar and the loaded values.

    Returns
    -------
    pd.DataFrame
        For string-dtype columns ('O'/'S'/'U'), a categorical frame built
        from a LabelArray; otherwise a plain DataFrame of the sliced
        values indexed by the sliced calendar and the assets.
    """
    loader = self.engine.get_loader(column)

    index = self.calendar[slice_]
    columns = self.assets
    values = loader.values(column.dtype, self.calendar, self.sids)[slice_]

    if column.dtype.kind in ('O', 'S', 'U'):
        # For string columns, we expect a categorical in the output.
        return LabelArray(
            values,
            missing_value=column.missing_value,
        ).as_categorical_frame(
            index=index,
            columns=columns,
        )

    # Reuse the already-computed values/index/columns rather than calling
    # loader.values a second time and re-slicing the calendar/assets, as
    # the original did redundantly.
    return DataFrame(
        values,
        index=index,
        columns=columns,
    )
def test_reject_ufuncs(self):
    """
    The internal values of a LabelArray should be opaque to numpy ufuncs.

    Test that all ufuncs fail.
    """
    l = LabelArray(self.strs, '')
    ints = np.arange(len(l))

    with warnings.catch_warnings():
        # Some ufuncs return NotImplemented, but warn that they will fail
        # in the future. Both outcomes are fine, so ignore the warnings.
        warnings.filterwarnings(
            'ignore',
            message="unorderable dtypes.*",
            category=DeprecationWarning,
        )
        warnings.filterwarnings(
            'ignore',
            message="elementwise comparison failed.*",
            category=FutureWarning,
        )
        for func in all_ufuncs():
            # Different ufuncs vary between returning NotImplemented and
            # raising a TypeError when provided with unknown dtypes.
            # This is a bit unfortunate, but still better than silently
            # accepting an int array.
            try:
                if func.nin == 1:
                    ret = func(l)
                elif func.nin == 2:
                    ret = func(l, ints)
                else:
                    self.fail("Who added a ternary ufunc !?!")
            except TypeError:
                pass
            else:
                self.assertIs(ret, NotImplemented)
def test_map_never_increases_code_storage_size(self):
    # This tests a pathological case where a user maps an impure function
    # that returns a different label on every invocation, which in a naive
    # implementation could cause us to need to **increase** the size of our
    # codes after a map.
    #
    # This doesn't happen, however, because we guarantee that the user's
    # mapping function will be called on each unique category exactly once,
    # which means we can never increase the number of categories in the
    # LabelArray after mapping.

    # Using all but one of the categories so that we still fit in a uint8
    # with an extra category for None as a missing value.
    categories = self.create_categories(8, plus_one=False)[:-1]
    larger_categories = self.create_categories(16, plus_one=False)

    # Double the length of the categories so that we have to increase the
    # required size after our map.
    categories_twice = categories + categories

    arr = LabelArray(categories_twice, missing_value=None)
    assert_equal(arr.itemsize, 1)

    gen_unique_categories = iter(larger_categories)

    def new_string_every_time(c):
        # Return a new unique category every time so that every result is
        # different.
        return next(gen_unique_categories)

    result = arr.map(new_string_every_time)

    # Result should still be of size 1.
    assert_equal(result.itemsize, 1)

    # Result should be the first `len(categories)` entries from the larger
    # categories, repeated twice.
    expected = LabelArray(
        larger_categories[:len(categories)] * 2,
        missing_value=None,
    )
    assert_equal(result.as_string_array(), expected.as_string_array())
def test_setitem_scalar(self, val, missing_value):
    """Scalar assignment into a LabelArray should work for known labels
    (and the missing value) and raise ValueError for unknown ones.
    """
    arr = LabelArray(self.strs, missing_value=missing_value)

    if not arr.has_label(val):
        # Sanity-check that the only unknown labels the parameterization
        # produces are the sentinel string or None-with-non-None-missing.
        self.assertTrue(
            (val == 'not in the array')
            or (val is None and missing_value is not None)
        )
        # Assigning an unknown label must fail for any indexer.
        for slicer in [(0, 0), (0, 1), 1]:
            with self.assertRaises(ValueError):
                arr[slicer] = val
        return

    # Single-element writes.
    arr[0, 0] = val
    self.assertEqual(arr[0, 0], val)

    arr[0, 1] = val
    self.assertEqual(arr[0, 1], val)

    # Row write: if val is the missing value, the row reads as missing.
    arr[1] = val
    if val == missing_value:
        self.assertTrue(arr.is_missing()[1].all())
    else:
        self.assertTrue((arr[1] == val).all())
        self.assertTrue((arr[1].as_string_array() == val).all())

    # Column write.
    arr[:, -1] = val
    if val == missing_value:
        self.assertTrue(arr.is_missing()[:, -1].all())
    else:
        self.assertTrue((arr[:, -1] == val).all())
        self.assertTrue((arr[:, -1].as_string_array() == val).all())

    # Whole-array write.
    arr[:] = val
    if val == missing_value:
        self.assertTrue(arr.is_missing().all())
    else:
        self.assertFalse(arr.is_missing().any())
        self.assertTrue((arr == val).all())
def test_winsorize_hand_computed(self):
    """
    Test the hand-computed example in factor.winsorize.
    """
    f = self.f
    m = Mask()
    c = C()
    str_c = C(dtype=categorical_dtype, missing_value=None)

    factor_data = array([[1., 2., 3., 4., 5., 6.],
                         [1., 8., 27., 64., 125., 216.],
                         [6., 5., 4., 3., 2., 1.]])
    filter_data = array(
        [[False, True, True, True, True, True],
         [True, False, True, True, True, True],
         [True, True, False, True, True, True]],
        dtype=bool,
    )
    classifier_data = array(
        [[1, 1, 1, 2, 2, 2],
         [1, 1, 1, 2, 2, 2],
         [1, 1, 1, 2, 2, 2]],
        dtype=int64_dtype,
    )
    # Same grouping as classifier_data, but with string labels, to check
    # that grouping behaves identically regardless of classifier dtype.
    string_classifier_data = LabelArray(
        classifier_data.astype(str).astype(object),
        missing_value=None,
    )

    terms = {
        'winsor_1': f.winsorize(
            min_percentile=0.33,
            max_percentile=0.67,
        ),
        'winsor_2': f.winsorize(
            min_percentile=0.49,
            max_percentile=1,
        ),
        'winsor_3': f.winsorize(
            min_percentile=0,
            max_percentile=.67,
        ),
        'masked': f.winsorize(
            min_percentile=0.33,
            max_percentile=0.67,
            mask=m,
        ),
        'grouped': f.winsorize(
            min_percentile=0.34,
            max_percentile=0.66,
            groupby=c,
        ),
        'grouped_str': f.winsorize(
            min_percentile=0.34,
            max_percentile=0.66,
            groupby=str_c,
        ),
        'grouped_masked': f.winsorize(
            min_percentile=0.34,
            max_percentile=0.66,
            mask=m,
            groupby=c,
        ),
        'grouped_masked_str': f.winsorize(
            min_percentile=0.34,
            max_percentile=0.66,
            mask=m,
            groupby=str_c,
        ),
    }
    expected = {
        'winsor_1': array([[2., 2., 3., 4., 5., 5.],
                           [8., 8., 27., 64., 125., 125.],
                           [5., 5., 4., 3., 2., 2.]]),
        'winsor_2': array([[3.0, 3., 3., 4., 5., 6.],
                           [27., 27., 27., 64., 125., 216.],
                           [6.0, 5., 4., 3., 3., 3.]]),
        'winsor_3': array([[1., 2., 3., 4., 5., 5.],
                           [1., 8., 27., 64., 125., 125.],
                           [5., 5., 4., 3., 2., 1.]]),
        'masked': array([[nan, 3., 3., 4., 5., 5.],
                         [27., nan, 27., 64., 125., 125.],
                         [5.0, 5., nan, 3., 2., 2.]]),
        'grouped': array([[2., 2., 2., 5., 5., 5.],
                          [8., 8., 8., 125., 125., 125.],
                          [5., 5., 5., 2., 2., 2.]]),
        'grouped_masked': array([[nan, 2., 3., 5., 5., 5.],
                                 [1.0, nan, 27., 125., 125., 125.],
                                 [6.0, 5., nan, 2., 2., 2.]]),
    }
    # Changing the classifier dtype shouldn't affect anything.
    expected['grouped_str'] = expected['grouped']
    expected['grouped_masked_str'] = expected['grouped_masked']

    self.check_terms(
        terms,
        expected,
        initial_workspace={
            f: factor_data,
            c: classifier_data,
            str_c: string_classifier_data,
            m: filter_data,
        },
        mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
        check=partial(check_allclose, atol=0.001),
    )
def test_normalizations_hand_computed(self):
    """
    Test the hand-computed example in factor.demean.
    """
    f = self.f
    m = Mask()
    c = C()
    str_c = C(dtype=categorical_dtype, missing_value=None)

    factor_data = array(
        [[1.0, 2.0, 3.0, 4.0],
         [1.5, 2.5, 3.5, 1.0],
         [2.0, 3.0, 4.0, 1.5],
         [2.5, 3.5, 1.0, 2.0]],
    )
    filter_data = array(
        [[False, True, True, True],
         [True, False, True, True],
         [True, True, False, True],
         [True, True, True, False]],
        dtype=bool,
    )
    classifier_data = array(
        [[1, 1, 2, 2],
         [1, 1, 2, 2],
         [1, 1, 2, 2],
         [1, 1, 2, 2]],
        dtype=int64_dtype,
    )
    # Same grouping as classifier_data, but with string labels, to check
    # that grouping behaves identically regardless of classifier dtype.
    string_classifier_data = LabelArray(
        classifier_data.astype(str).astype(object),
        missing_value=None,
    )

    terms = {
        'vanilla': f.demean(),
        'masked': f.demean(mask=m),
        'grouped': f.demean(groupby=c),
        'grouped_str': f.demean(groupby=str_c),
        'grouped_masked': f.demean(mask=m, groupby=c),
        'grouped_masked_str': f.demean(mask=m, groupby=str_c),
    }
    expected = {
        'vanilla': array(
            [[-1.500, -0.500, 0.500, 1.500],
             [-0.625, 0.375, 1.375, -1.125],
             [-0.625, 0.375, 1.375, -1.125],
             [0.250, 1.250, -1.250, -0.250]],
        ),
        'masked': array(
            [[nan, -1.000, 0.000, 1.000],
             [-0.500, nan, 1.500, -1.000],
             [-0.166, 0.833, nan, -0.666],
             [0.166, 1.166, -1.333, nan]],
        ),
        'grouped': array(
            [[-0.500, 0.500, -0.500, 0.500],
             [-0.500, 0.500, 1.250, -1.250],
             [-0.500, 0.500, 1.250, -1.250],
             [-0.500, 0.500, -0.500, 0.500]],
        ),
        'grouped_masked': array(
            [[nan, 0.000, -0.500, 0.500],
             [0.000, nan, 1.250, -1.250],
             [-0.500, 0.500, nan, 0.000],
             [-0.500, 0.500, 0.000, nan]]
        )
    }
    # Changing the classifier dtype shouldn't affect anything.
    expected['grouped_str'] = expected['grouped']
    expected['grouped_masked_str'] = expected['grouped_masked']

    self.check_terms(
        terms,
        expected,
        initial_workspace={
            f: factor_data,
            c: classifier_data,
            str_c: string_classifier_data,
            m: filter_data,
        },
        mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
        # The hand-computed values aren't very precise (in particular,
        # we truncate repeating decimals at 3 places) This is just
        # asserting that the example isn't misleading by being totally
        # wrong.
        check=partial(check_allclose, atol=0.001),
    )