Beispiel #1
0
    def test_setitem_array(self):
        arr = LabelArray(self.strs, missing_value=None)
        orig_arr = arr.copy()

        # Write a row.
        self.assertFalse(
            (arr[0] == arr[1]).all(),
            "This test doesn't test anything because rows 0"
            " and 1 are already equal!"
        )
        arr[0] = arr[1]
        for i in range(arr.shape[1]):
            self.assertEqual(arr[0, i], arr[1, i])

        # Write a column.
        self.assertFalse(
            (arr[:, 0] == arr[:, 1]).all(),
            "This test doesn't test anything because columns 0"
            " and 1 are already equal!"
        )
        arr[:, 0] = arr[:, 1]
        for i in range(arr.shape[0]):
            self.assertEqual(arr[i, 0], arr[i, 1])

        # Write the whole array.
        arr[:] = orig_arr
        check_arrays(arr, orig_arr)
Beispiel #2
0
    def test_string_classifiers_produce_categoricals(self):
        """
        Test that string-based classifiers produce pandas categoricals as their
        outputs.
        """
        col = TestingDataSet.categorical_col
        pipe = Pipeline(columns={'c': col.latest})

        run_dates = self.trading_days[-10:]
        start_date, end_date = run_dates[[0, -1]]

        result = self.run_pipeline(pipe, start_date, end_date)
        assert isinstance(result.c.values, Categorical)

        expected_raw_data = self.raw_expected_values(
            col,
            start_date,
            end_date,
        )
        expected_labels = LabelArray(expected_raw_data, col.missing_value)
        expected_final_result = expected_labels.as_categorical_frame(
            index=run_dates,
            columns=self.asset_finder.retrieve_all(self.asset_finder.sids),
        )
        assert_frame_equal(result.c.unstack(), expected_final_result)
Beispiel #3
0
    def test_object1darrayoverwrite(self):
        pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
        categories = pairs + ["~" + c for c in pairs]
        baseline = LabelArray(
            np.array([["".join((r, c)) for c in "abc"] for r in ascii_uppercase]),
            None,
            categories,
        )
        full_expected = baseline.copy()

        def flip(cs):
            if cs is None:
                return None
            if cs[0] != "~":
                return "~" + cs
            return cs

        def make_overwrite(fr, lr, fc, lc):
            fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
            fr -= ord("A")
            lr -= ord("A")
            fc -= ord("a")
            lc -= ord("a")

            return Object1DArrayOverwrite(
                fr,
                lr,
                fc,
                lc,
                baseline[fr : lr + 1, fc].map(flip),
            )

        overwrites = {
            3: [make_overwrite("A", "B", "a", "a")],
            4: [make_overwrite("A", "C", "b", "c")],
            5: [make_overwrite("D", "D", "a", "b")],
        }

        it = AdjustedArray(baseline, overwrites, None).traverse(3)

        window = next(it)
        expected = full_expected[:3]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:2, 0] = LabelArray(["~Aa", "~Ba"], None)
        expected = full_expected[1:4]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:3, 1:3] = LabelArray(
            [["~Ab", "~Ac"], ["~Bb", "~Bc"], ["~Cb", "~Cb"]], None
        )
        expected = full_expected[2:5]
        check_arrays(window, expected)

        window = next(it)
        full_expected[3, :2] = "~Da"
        expected = full_expected[3:6]
        check_arrays(window, expected)
Beispiel #4
0
    def test_setitem_array(self):
        arr = LabelArray(self.strs, missing_value=None)
        orig_arr = arr.copy()

        # Write a row.
        self.assertFalse(
            (arr[0] == arr[1]).all(),
            "This test doesn't test anything because rows 0"
            " and 1 are already equal!"
        )
        arr[0] = arr[1]
        for i in range(arr.shape[1]):
            self.assertEqual(arr[0, i], arr[1, i])

        # Write a column.
        self.assertFalse(
            (arr[:, 0] == arr[:, 1]).all(),
            "This test doesn't test anything because columns 0"
            " and 1 are already equal!"
        )
        arr[:, 0] = arr[:, 1]
        for i in range(arr.shape[0]):
            self.assertEqual(arr[i, 0], arr[i, 1])

        # Write the whole array.
        arr[:] = orig_arr
        check_arrays(arr, orig_arr)
Beispiel #5
0
    def test_relabel_strings(self, relabel_func, labelarray_dtype):
        class C(Classifier):
            inputs = ()
            dtype = categorical_dtype
            missing_value = None
            window_length = 0

        c = C()

        raw = np.asarray(
            [['a', 'aa', 'aaa', 'abab'], ['bab', 'aba', 'aa', 'bb'],
             ['a', 'aba', 'abaa', 'abaab'], ['a', 'aa', 'aaa', 'aaaa']],
            dtype=labelarray_dtype,
        )
        raw_relabeled = np.vectorize(relabel_func, otypes=[object])(raw)

        data = LabelArray(raw, missing_value=None)

        terms = {
            'relabeled': c.relabel(relabel_func),
        }
        expected_results = {
            'relabeled': LabelArray(raw_relabeled, missing_value=None),
        }

        self.check_terms(
            terms,
            expected_results,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #6
0
    def test_object1darrayoverwrite(self):
        pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
        categories = pairs + ['~' + c for c in pairs]
        baseline = LabelArray(
            array([[''.join((r, c)) for c in 'abc'] for r in ascii_uppercase]),
            None,
            categories,
        )
        full_expected = baseline.copy()

        def flip(cs):
            if cs is None:
                return None
            if cs[0] != '~':
                return '~' + cs
            return cs

        def make_overwrite(fr, lr, fc, lc):
            fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
            fr -= ord('A')
            lr -= ord('A')
            fc -= ord('a')
            lc -= ord('a')

            return Object1DArrayOverwrite(
                fr,
                lr,
                fc,
                lc,
                baseline[fr:lr + 1, fc].map(flip),
            )

        overwrites = {
            3: [make_overwrite('A', 'B', 'a', 'a')],
            4: [make_overwrite('A', 'C', 'b', 'c')],
            5: [make_overwrite('D', 'D', 'a', 'b')],
        }

        it = AdjustedArray(baseline, overwrites, None).traverse(3)

        window = next(it)
        expected = full_expected[:3]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:2, 0] = LabelArray(['~Aa', '~Ba'], None)
        expected = full_expected[1:4]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:3, 1:3] = LabelArray(
            [['~Ab', '~Ac'], ['~Bb', '~Bc'], ['~Cb', '~Cb']], None)
        expected = full_expected[2:5]
        check_arrays(window, expected)

        window = next(it)
        full_expected[3, :2] = '~Da'
        expected = full_expected[3:6]
        check_arrays(window, expected)
    def test_object1darrayoverwrite(self):
        pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
        categories = pairs + ['~' + c for c in pairs]
        baseline = LabelArray(
            array([[''.join((r, c)) for c in 'abc'] for r in ascii_uppercase]),
            None,
            categories,
        )
        full_expected = baseline.copy()

        def flip(cs):
            if cs is None:
                return None
            if cs[0] != '~':
                return '~' + cs
            return cs

        def make_overwrite(fr, lr, fc, lc):
            fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
            fr -= ord('A')
            lr -= ord('A')
            fc -= ord('a')
            lc -= ord('a')

            return Object1DArrayOverwrite(
                fr, lr,
                fc, lc,
                baseline[fr:lr + 1, fc].map(flip),
            )

        overwrites = {
            3: [make_overwrite('A', 'B', 'a', 'a')],
            4: [make_overwrite('A', 'C', 'b', 'c')],
            5: [make_overwrite('D', 'D', 'a', 'b')],
        }

        it = AdjustedArray(baseline, overwrites, None).traverse(3)

        window = next(it)
        expected = full_expected[:3]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:2, 0] = LabelArray(['~Aa', '~Ba'], None)
        expected = full_expected[1:4]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:3, 1:3] = LabelArray([['~Ab', '~Ac'],
                                              ['~Bb', '~Bc'],
                                              ['~Cb', '~Cb']], None)
        expected = full_expected[2:5]
        check_arrays(window, expected)

        window = next(it)
        full_expected[3, :2] = '~Da'
        expected = full_expected[3:6]
        check_arrays(window, expected)
Beispiel #8
0
    def test_string_elementwise_predicates(self, compval, missing,
                                           labelarray_dtype):
        if labelarray_dtype == bytes_dtype:
            compval = compval.encode('utf-8')
            missing = missing.encode('utf-8')

            startswith_re = b'^' + compval + b'.*'
            endswith_re = b'.*' + compval + b'$'
            substring_re = b'.*' + compval + b'.*'
        else:
            startswith_re = '^' + compval + '.*'
            endswith_re = '.*' + compval + '$'
            substring_re = '.*' + compval + '.*'

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['', 'a', 'ab', 'ba'], ['z', 'ab', 'a', 'ab'],
                 ['aa', 'ab', '', 'ab'], ['aa', 'a', 'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        terms = {
            'startswith': c.startswith(compval),
            'endswith': c.endswith(compval),
            'has_substring': c.has_substring(compval),
            # Equivalent filters using regex matching.
            'startswith_re': c.matches(startswith_re),
            'endswith_re': c.matches(endswith_re),
            'has_substring_re': c.matches(substring_re),
        }

        expected = {
            'startswith': (data.startswith(compval) & (data != missing)),
            'endswith': (data.endswith(compval) & (data != missing)),
            'has_substring': (data.has_substring(compval) & (data != missing)),
        }
        for key in list(expected):
            expected[key + '_re'] = expected[key]

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #9
0
    def test_map_ignores_missing_value(self, missing):
        data = np.array([missing, 'B', 'C'], dtype=object)
        la = LabelArray(data, missing_value=missing)

        def increment_char(c):
            return chr(ord(c) + 1)

        result = la.map(increment_char)
        expected = LabelArray([missing, 'C', 'D'], missing_value=missing)
        assert_equal(result.as_string_array(), expected.as_string_array())
Beispiel #10
0
    def test_narrow_condense_back_to_valid_size(self):
        categories = ['a'] * (2**8 + 1)
        arr = LabelArray(categories, missing_value=categories[0])
        assert_equal(arr.itemsize, 1)
        self.check_roundtrip(arr)

        # longer than int16 but still fits when deduped
        categories = self.create_categories(16, plus_one=False)
        categories.append(categories[0])
        arr = LabelArray(categories, missing_value=categories[0])
        assert_equal(arr.itemsize, 2)
        self.check_roundtrip(arr)
    def test_relabel_missing_value_interactions(self, missing_value):

        mv = missing_value

        class C(Classifier):
            inputs = ()
            dtype = categorical_dtype
            missing_value = mv
            window_length = 0

        c = C()

        def relabel_func(s):
            if s == "B":
                return mv
            return "".join([s, s])

        raw = np.asarray(
            [
                ["A", "B", "C", mv],
                [mv, "A", "B", "C"],
                ["C", mv, "A", "B"],
                ["B", "C", mv, "A"],
            ],
            dtype=categorical_dtype,
        )
        data = LabelArray(raw, missing_value=mv)

        expected_relabeled_raw = np.asarray(
            [
                ["AA", mv, "CC", mv],
                [mv, "AA", mv, "CC"],
                ["CC", mv, "AA", mv],
                [mv, "CC", mv, "AA"],
            ],
            dtype=categorical_dtype,
        )

        terms = {
            "relabeled": c.relabel(relabel_func),
        }
        expected_results = {
            "relabeled": LabelArray(expected_relabeled_raw, missing_value=mv),
        }

        self.check_terms(
            terms,
            expected_results,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #12
0
    def test_map(self, f):
        data = np.array(
            [['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
             ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
             ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
             ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']],
            dtype=object,
        )
        la = LabelArray(data, missing_value=None)

        numpy_transformed = np.vectorize(f)(data)
        la_transformed = la.map(f).as_string_array()

        assert_equal(numpy_transformed, la_transformed)
Beispiel #13
0
    def test_map_can_only_return_none_if_missing_value_is_none(self):

        # Should work.
        la = LabelArray(self.strs, missing_value=None)
        result = la.map(lambda x: None)

        check_arrays(
            result,
            LabelArray(np.full_like(self.strs, None), missing_value=None),
        )

        la = LabelArray(self.strs, missing_value="__MISSING__")
        with self.assertRaises(TypeError):
            la.map(lambda x: None)
Beispiel #14
0
    def test_compare_to_str(self,
                            compval,
                            shape,
                            array_astype,
                            missing_value):

        strs = self.strs.reshape(shape).astype(array_astype)
        if missing_value is None:
            # As of numpy 1.9.2, object array != None returns just False
            # instead of an array, with a deprecation warning saying the
            # behavior will change in the future.  Work around that by just
            # using the ufunc.
            notmissing = np.not_equal(strs, missing_value)
        else:
            if not isinstance(missing_value, array_astype):
                missing_value = array_astype(missing_value, 'utf-8')
            notmissing = (strs != missing_value)

        arr = LabelArray(strs, missing_value=missing_value)

        if not isinstance(compval, array_astype):
            compval = array_astype(compval, 'utf-8')

        # arr.missing_value should behave like NaN.
        check_arrays(
            arr == compval,
            (strs == compval) & notmissing,
        )
        check_arrays(
            arr != compval,
            (strs != compval) & notmissing,
        )

        np_startswith = np.vectorize(lambda elem: elem.startswith(compval))
        check_arrays(
            arr.startswith(compval),
            np_startswith(strs) & notmissing,
        )

        np_endswith = np.vectorize(lambda elem: elem.endswith(compval))
        check_arrays(
            arr.endswith(compval),
            np_endswith(strs) & notmissing,
        )

        np_contains = np.vectorize(lambda elem: compval in elem)
        check_arrays(
            arr.has_substring(compval),
            np_contains(strs) & notmissing,
        )
Beispiel #15
0
    def test_compare_to_str(self,
                            compval,
                            shape,
                            array_astype,
                            missing_value):

        strs = self.strs.reshape(shape).astype(array_astype)
        if missing_value is None:
            # As of numpy 1.9.2, object array != None returns just False
            # instead of an array, with a deprecation warning saying the
            # behavior will change in the future.  Work around that by just
            # using the ufunc.
            notmissing = np.not_equal(strs, missing_value)
        else:
            if not isinstance(missing_value, array_astype):
                missing_value = array_astype(missing_value, 'utf-8')
            notmissing = (strs != missing_value)

        arr = LabelArray(strs, missing_value=missing_value)

        if not isinstance(compval, array_astype):
            compval = array_astype(compval, 'utf-8')

        # arr.missing_value should behave like NaN.
        check_arrays(
            arr == compval,
            (strs == compval) & notmissing,
        )
        check_arrays(
            arr != compval,
            (strs != compval) & notmissing,
        )

        np_startswith = np.vectorize(lambda elem: elem.startswith(compval))
        check_arrays(
            arr.startswith(compval),
            np_startswith(strs) & notmissing,
        )

        np_endswith = np.vectorize(lambda elem: elem.endswith(compval))
        check_arrays(
            arr.endswith(compval),
            np_endswith(strs) & notmissing,
        )

        np_contains = np.vectorize(lambda elem: compval in elem)
        check_arrays(
            arr.has_substring(compval),
            np_contains(strs) & notmissing,
        )
Beispiel #16
0
    def test_map(self, f):
        data = np.array(
            [['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
             ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
             ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
             ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']],
            dtype=object,
        )
        la = LabelArray(data, missing_value=None)

        numpy_transformed = np.vectorize(f)(data)
        la_transformed = la.map(f).as_string_array()

        assert_equal(numpy_transformed, la_transformed)
 def test_slicing_preserves_attributes(self, slice_):
     arr = LabelArray(self.strs.reshape((9, 3)), missing_value="")
     sliced = arr[slice_]
     assert isinstance(sliced, LabelArray)
     assert sliced.categories is arr.categories
     assert sliced.reverse_categories is arr.reverse_categories
     assert sliced.missing_value is arr.missing_value
    def to_workspace_value(self, result, assets):
        """
        Called with the result of a pipeline. This needs to return an object
        which can be put into the workspace to continue doing computations.

        This is the inverse of :func:`~zipline.pipeline.term.Term.postprocess`.
        """
        if self.dtype == int64_dtype:
            return super(Classifier, self).to_workspace_value(result, assets)

        assert isinstance(result.values,
                          pd.Categorical), ('Expected a Categorical, got %r.' %
                                            type(result.values))
        with_missing = pd.Series(
            data=pd.Categorical(
                result.values,
                result.values.categories.union([self.missing_value]),
            ),
            index=result.index,
        )
        return LabelArray(
            super(Classifier, self).to_workspace_value(
                with_missing,
                assets,
            ),
            self.missing_value,
        )
    def test_map(self, f):
        data = np.array(
            [
                ["E", "GHIJ", "HIJKLMNOP", "DEFGHIJ"],
                ["CDE", "ABCDEFGHIJKLMNOPQ", "DEFGHIJKLMNOPQRS", "ABCDEFGHIJK"],
                ["DEFGHIJKLMNOPQR", "DEFGHI", "DEFGHIJ", "FGHIJK"],
                ["EFGHIJKLM", "EFGHIJKLMNOPQRS", "ABCDEFGHI", "DEFGHIJ"],
            ],
            dtype=object,
        )
        la = LabelArray(data, missing_value=None)

        numpy_transformed = np.vectorize(f)(data)
        la_transformed = la.map(f).as_string_array()

        assert_equal(numpy_transformed, la_transformed)
Beispiel #20
0
 def test_slicing_preserves_attributes(self, slice_):
     arr = LabelArray(self.strs.reshape((9, 3)), missing_value='')
     sliced = arr[slice_]
     self.assertIsInstance(sliced, LabelArray)
     self.assertIs(sliced.categories, arr.categories)
     self.assertIs(sliced.reverse_categories, arr.reverse_categories)
     self.assertIs(sliced.missing_value, arr.missing_value)
Beispiel #21
0
    def test_string_isnull(self, mv):
        class C(Classifier):
            dtype = categorical_dtype
            missing_value = mv
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of missing and non-missing values.
        raw = np.asarray(
            [['', 'a', 'ab', 'ba'], ['z', 'ab', 'a', 'ab'],
             ['aa', 'ab', '', 'ab'], ['aa', 'a', 'ba', 'ba']],
            dtype=categorical_dtype,
        )
        data = LabelArray(raw, missing_value=mv)

        self.check_terms(
            terms={
                'isnull': c.isnull(),
                'notnull': c.notnull()
            },
            expected={
                'isnull': np.equal(raw, mv),
                'notnull': np.not_equal(raw, mv),
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #22
0
    def test_string_eq(self, compval, labelarray_dtype):

        compval = labelarray_dtype.type(compval)

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = ''
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['', 'a', 'ab', 'ba'], ['z', 'ab', 'a', 'ab'],
                 ['aa', 'ab', '', 'ab'], ['aa', 'a', 'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value='',
        )

        self.check_terms(
            terms={
                'eq': c.eq(compval),
            },
            expected={
                'eq': (data == compval),
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
    def test_relabel_missing_value_interactions(self, missing_value):

        mv = missing_value

        class C(Classifier):
            inputs = ()
            dtype = categorical_dtype
            missing_value = mv
            window_length = 0

        c = C()

        def relabel_func(s):
            if s == 'B':
                return mv
            return ''.join([s, s])

        raw = np.asarray(
            [['A', 'B', 'C', mv],
             [mv, 'A', 'B', 'C'],
             ['C', mv, 'A', 'B'],
             ['B', 'C', mv, 'A']],
            dtype=categorical_dtype,
        )
        data = LabelArray(raw, missing_value=mv)

        expected_relabeled_raw = np.asarray(
            [['AA', mv, 'CC', mv],
             [mv, 'AA', mv, 'CC'],
             ['CC', mv, 'AA', mv],
             [mv, 'CC', mv, 'AA']],
            dtype=categorical_dtype,
        )

        terms = {
            'relabeled': c.relabel(relabel_func),
        }
        expected_results = {
            'relabeled': LabelArray(expected_relabeled_raw, missing_value=mv),
        }

        self.check_terms(
            terms,
            expected_results,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #24
0
 def check_roundtrip(arr):
     assert_equal(
         arr.as_string_array(),
         LabelArray(
             arr.as_string_array(),
             arr.missing_value,
         ).as_string_array(),
     )
 def manual_narrow_condense_back_to_valid_size_slow(self):
     """This test is really slow so we don't want it run by default."""
     # tests that we don't try to create an 'int24' (which is meaningless)
     categories = self.create_categories(24, plus_one=False)
     categories.append(categories[0])
     arr = LabelArray(categories, missing_value=categories[0])
     assert arr.itemsize == 4
     self.check_roundtrip(arr)
Beispiel #26
0
    def test_infer_categories(self):
        """
        Test that categories are inferred in sorted order if they're not
        explicitly passed.
        """
        arr1d = LabelArray(self.strs, missing_value='')
        codes1d = arr1d.as_int_array()
        self.assertEqual(arr1d.shape, self.strs.shape)
        self.assertEqual(arr1d.shape, codes1d.shape)

        categories = arr1d.categories
        unique_rowvalues = set(self.rowvalues)

        # There should be an entry in categories for each unique row value, and
        # each integer stored in the data array should be an index into
        # categories.
        self.assertEqual(list(categories), sorted(set(self.rowvalues)))
        self.assertEqual(
            set(codes1d.ravel()),
            set(range(len(unique_rowvalues)))
        )
        for idx, value in enumerate(arr1d.categories):
            check_arrays(
                self.strs == value,
                arr1d.as_int_array() == idx,
            )

        # It should be equivalent to pass the same set of categories manually.
        arr1d_explicit_categories = LabelArray(
            self.strs,
            missing_value='',
            categories=arr1d.categories,
        )
        check_arrays(arr1d, arr1d_explicit_categories)

        for shape in (9, 3), (3, 9), (3, 3, 3):
            strs2d = self.strs.reshape(shape)
            arr2d = LabelArray(strs2d, missing_value='')
            codes2d = arr2d.as_int_array()

            self.assertEqual(arr2d.shape, shape)
            check_arrays(arr2d.categories, categories)

            for idx, value in enumerate(arr2d.categories):
                check_arrays(strs2d == value, codes2d == idx)
Beispiel #27
0
    def test_compare_to_str_array(self, missing_value):
        strs = self.strs
        shape = strs.shape
        arr = LabelArray(strs, missing_value=missing_value)

        if missing_value is None:
            # As of numpy 1.9.2, object array != None returns just False
            # instead of an array, with a deprecation warning saying the
            # behavior will change in the future.  Work around that by just
            # using the ufunc.
            notmissing = np.not_equal(strs, missing_value)
        else:
            notmissing = (strs != missing_value)

        check_arrays(arr.not_missing(), notmissing)
        check_arrays(arr.is_missing(), ~notmissing)

        # The arrays are equal everywhere, but comparisons against the
        # missing_value should always produce False
        check_arrays(strs == arr, notmissing)
        check_arrays(strs != arr, np.zeros_like(strs, dtype=bool))

        def broadcastable_row(value, dtype):
            return np.full((shape[0], 1), value, dtype=strs.dtype)

        def broadcastable_col(value, dtype):
            return np.full((1, shape[1]), value, dtype=strs.dtype)

        # Test comparison between arr and a like-shap 2D array, a column
        # vector, and a row vector.
        for comparator, dtype, value in product((eq, ne),
                                                (bytes, unicode, object),
                                                set(self.rowvalues)):
            check_arrays(
                comparator(arr, np.full_like(strs, value)),
                comparator(strs, value) & notmissing,
            )
            check_arrays(
                comparator(arr, broadcastable_row(value, dtype=dtype)),
                comparator(strs, value) & notmissing,
            )
            check_arrays(
                comparator(arr, broadcastable_col(value, dtype=dtype)),
                comparator(strs, value) & notmissing,
            )
Beispiel #28
0
    def test_compare_to_str_array(self, missing_value):
        strs = self.strs
        shape = strs.shape
        arr = LabelArray(strs, missing_value=missing_value)

        if missing_value is None:
            # As of numpy 1.9.2, object array != None returns just False
            # instead of an array, with a deprecation warning saying the
            # behavior will change in the future.  Work around that by just
            # using the ufunc.
            notmissing = np.not_equal(strs, missing_value)
        else:
            notmissing = (strs != missing_value)

        check_arrays(arr.not_missing(), notmissing)
        check_arrays(arr.is_missing(), ~notmissing)

        # The arrays are equal everywhere, but comparisons against the
        # missing_value should always produce False
        check_arrays(strs == arr, notmissing)
        check_arrays(strs != arr, np.zeros_like(strs, dtype=bool))

        def broadcastable_row(value, dtype):
            return np.full((shape[0], 1), value, dtype=strs.dtype)

        def broadcastable_col(value, dtype):
            return np.full((1, shape[1]), value, dtype=strs.dtype)

        # Test comparison between arr and a like-shap 2D array, a column
        # vector, and a row vector.
        for comparator, dtype, value in product((eq, ne),
                                                (bytes, unicode, object),
                                                set(self.rowvalues)):
            check_arrays(
                comparator(arr, np.full_like(strs, value)),
                comparator(strs, value) & notmissing,
            )
            check_arrays(
                comparator(arr, broadcastable_row(value, dtype=dtype)),
                comparator(strs, value) & notmissing,
            )
            check_arrays(
                comparator(arr, broadcastable_col(value, dtype=dtype)),
                comparator(strs, value) & notmissing,
            )
Beispiel #29
0
def as_labelarray(initial_dtype, missing_value, array):
    """
    Curried wrapper around LabelArray, that round-trips the input data through
    `initial_dtype` first.
    """
    return LabelArray(
        array.astype(initial_dtype),
        missing_value=initial_dtype.type(missing_value),
    )
Beispiel #30
0
    def test_narrow_code_storage(self):
        create_categories = self.create_categories
        check_roundtrip = self.check_roundtrip

        # uint8
        categories = create_categories(8, plus_one=False)

        arr = LabelArray(
            categories,
            missing_value=categories[0],
            categories=categories,
        )
        self.assertEqual(arr.itemsize, 1)
        check_roundtrip(arr)

        # uint8 inference
        arr = LabelArray(categories, missing_value=categories[0])
        self.assertEqual(arr.itemsize, 1)
        check_roundtrip(arr)

        # just over uint8
        categories = create_categories(8, plus_one=True)
        arr = LabelArray(
            categories,
            missing_value=categories[0],
            categories=categories,
        )
        self.assertEqual(arr.itemsize, 2)
        check_roundtrip(arr)

        # fits in uint16
        categories = create_categories(16, plus_one=False)
        arr = LabelArray(
            categories,
            missing_value=categories[0],
            categories=categories,
        )
        self.assertEqual(arr.itemsize, 2)
        check_roundtrip(arr)

        # uint16 inference
        arr = LabelArray(categories, missing_value=categories[0])
        self.assertEqual(arr.itemsize, 2)
        check_roundtrip(arr)

        # just over uint16
        categories = create_categories(16, plus_one=True)
        arr = LabelArray(
            categories,
            missing_value=categories[0],
            categories=categories,
        )
        self.assertEqual(arr.itemsize, 4)
        check_roundtrip(arr)

        # uint32 inference
        arr = LabelArray(categories, missing_value=categories[0])
        self.assertEqual(arr.itemsize, 4)
        check_roundtrip(arr)
Beispiel #31
0
    def test_map_never_increases_code_storage_size(self):
        # This tests a pathological case where a user maps an impure function
        # that returns a different label on every invocation, which in a naive
        # implementation could cause us to need to **increase** the size of our
        # codes after a map.
        #
        # This doesn't happen, however, because we guarantee that the user's
        # mapping function will be called on each unique category exactly once,
        # which means we can never increase the number of categories in the
        # LabelArray after mapping.

        # Using all but one of the categories so that we still fit in a uint8
        # with an extra category for None as a missing value.
        categories = self.create_categories(8, plus_one=False)[:-1]

        larger_categories = self.create_categories(16, plus_one=False)

        # Double the length of the categories so that we have to increase the
        # required size after our map.
        categories_twice = categories + categories

        arr = LabelArray(categories_twice, missing_value=None)
        assert_equal(arr.itemsize, 1)

        gen_unique_categories = iter(larger_categories)

        def new_string_every_time(c):
            # Return a new unique category every time so that every result is
            # different.
            return next(gen_unique_categories)

        result = arr.map(new_string_every_time)

        # Result should still be of size 1.
        assert_equal(result.itemsize, 1)

        # Result should be the first `len(categories)` entries from the larger
        # categories, repeated twice.
        expected = LabelArray(
            larger_categories[:len(categories)] * 2,
            missing_value=None,
        )
        assert_equal(result.as_string_array(), expected.as_string_array())
Beispiel #32
0
 def _compute(self, inputs, assets, dates, mask):
     constant = self.params["const"]
     out = full(mask.shape, constant, dtype=self.dtype)
     if self.dtype == object:
         return LabelArray(
             out,
             categories=[constant],
             missing_value=self.missing_value,
         )
     return out
Beispiel #33
0
def _normalize_array(data, missing_value):
    """
    Coerce buffer data for an AdjustedArray into a standard scalar
    representation, returning the coerced array and a dict of argument to pass
    to np.view to use when providing a user-facing view of the underlying data.

    - float* data is coerced to float64 with viewtype float64.
    - int32, int64, and uint32 are converted to int64 with viewtype int64.
    - datetime[*] data is coerced to int64 with a viewtype of datetime64[ns].
    - bool_ data is coerced to uint8 with a viewtype of bool_.

    Parameters
    ----------
    data : np.ndarray

    Returns
    -------
    coerced, view_kwargs : (np.ndarray, np.dtype)
        The input ``data`` array coerced to the appropriate pipeline type.
        This may return the original array or a view over the same data.
    """
    if isinstance(data, LabelArray):
        return data, {}

    data_dtype = data.dtype
    if data_dtype in BOOL_DTYPES:
        return data.astype(uint8, copy=False), {'dtype': dtype(bool_)}
    elif data_dtype in FLOAT_DTYPES:
        return data.astype(float64, copy=False), {'dtype': dtype(float64)}
    elif data_dtype in INT_DTYPES:
        return data.astype(int64, copy=False), {'dtype': dtype(int64)}
    elif is_categorical(data_dtype):
        if not isinstance(missing_value, LabelArray.SUPPORTED_SCALAR_TYPES):
            raise TypeError(
                "Invalid missing_value for categorical array.\n"
                "Expected None, bytes or unicode. Got %r." % missing_value,
            )
        return LabelArray(data, missing_value), {}
    elif data_dtype.kind == 'M':
        try:
            outarray = data.astype('datetime64[ns]', copy=False).view('int64')
            return outarray, {'dtype': datetime64ns_dtype}
        except OverflowError:
            raise ValueError(
                "AdjustedArray received a datetime array "
                "not representable as datetime64[ns].\n"
                "Min Date: %s\n"
                "Max Date: %s\n"
                % (data.min(), data.max())
            )
    else:
        raise TypeError(
            "Don't know how to construct AdjustedArray "
            "on data of type %s." % data_dtype
        )
Beispiel #34
0
    def test_map_can_only_return_none_if_missing_value_is_none(self):
        # Should work.
        la = LabelArray(self.strs, missing_value=None)
        result = la.map(lambda x: None)

        check_arrays(
            result,
            LabelArray(np.full_like(self.strs, None), missing_value=None),
        )

        la = LabelArray(self.strs, missing_value="__MISSING__")
        with self.assertRaises(TypeError):
            la.map(lambda x: None)
Beispiel #35
0
    def test_reversability_categorical(self):
        class F(Classifier):
            inputs = ()
            window_length = 0
            dtype = categorical_dtype
            missing_value = '<missing>'

        f = F()
        column_data = LabelArray(
            np.array(
                [['a', f.missing_value],
                 ['b', f.missing_value],
                 ['c', 'd']],
            ),
            missing_value=f.missing_value,
        )

        assert_equal(
            f.postprocess(column_data.ravel()),
            pd.Categorical(
                ['a', f.missing_value, 'b', f.missing_value, 'c', 'd'],
            ),
        )

        # only include the non-missing data
        pipeline_output = pd.Series(
            data=['a', 'b', 'c', 'd'],
            index=pd.MultiIndex.from_arrays([
                [pd.Timestamp('2014-01-01'),
                 pd.Timestamp('2014-01-02'),
                 pd.Timestamp('2014-01-03'),
                 pd.Timestamp('2014-01-03')],
                [0, 0, 0, 1],
            ]),
            dtype='category',
        )

        assert_equal(
            f.to_workspace_value(pipeline_output, pd.Index([0, 1])),
            column_data,
        )
Beispiel #36
0
    def test_update_labels(self):
        data = np.array(
            [
                ["aaa", "bbb", "ccc"],
                ["ddd", "eee", "fff"],
                ["ggg", "hhh", "iii"],
                ["jjj", "kkk", "lll"],
                ["mmm", "nnn", "ooo"],
            ]
        )
        label_array = LabelArray(data, missing_value="")

        adj_array = AdjustedArray(
            data=label_array,
            adjustments={4: [ObjectOverwrite(2, 3, 0, 0, "ppp")]},
            missing_value="",
        )

        expected_data = np.array(
            [
                ["aaa-foo", "bbb-foo", "ccc-foo"],
                ["ddd-foo", "eee-foo", "fff-foo"],
                ["ggg-foo", "hhh-foo", "iii-foo"],
                ["jjj-foo", "kkk-foo", "lll-foo"],
                ["mmm-foo", "nnn-foo", "ooo-foo"],
            ]
        )
        expected_label_array = LabelArray(expected_data, missing_value="")

        expected_adj_array = AdjustedArray(
            data=expected_label_array,
            adjustments={4: [ObjectOverwrite(2, 3, 0, 0, "ppp-foo")]},
            missing_value="",
        )

        adj_array.update_labels(lambda x: x + "-foo")

        # Check that the mapped AdjustedArray has the expected baseline
        # values and adjustment values.
        check_arrays(adj_array.data, expected_adj_array.data)
        assert adj_array.adjustments == expected_adj_array.adjustments
    def test_reversability_categorical(self):
        class F(Classifier):
            inputs = ()
            window_length = 0
            dtype = categorical_dtype
            missing_value = '<missing>'

        f = F()
        column_data = LabelArray(
            np.array(
                [['a', f.missing_value],
                 ['b', f.missing_value],
                 ['c', 'd']],
            ),
            missing_value=f.missing_value,
        )

        assert_equal(
            f.postprocess(column_data.ravel()),
            pd.Categorical(
                ['a', f.missing_value, 'b', f.missing_value, 'c', 'd'],
            ),
        )

        # only include the non-missing data
        pipeline_output = pd.Series(
            data=['a', 'b', 'c', 'd'],
            index=pd.MultiIndex.from_arrays([
                [pd.Timestamp('2014-01-01'),
                 pd.Timestamp('2014-01-02'),
                 pd.Timestamp('2014-01-03'),
                 pd.Timestamp('2014-01-03')],
                [0, 0, 0, 1],
            ]),
            dtype='category',
        )

        assert_equal(
            f.to_workspace_value(pipeline_output, pd.Index([0, 1])),
            column_data,
        )
    def test_string_not_equal(self, compval, missing, labelarray_dtype):

        compval = labelarray_dtype.type(compval)

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [
                    ["", "a", "ab", "ba"],
                    ["z", "ab", "a", "ab"],
                    ["aa", "ab", "", "ab"],
                    ["aa", "a", "ba", "ba"],
                ],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        expected = (data.as_int_array() != data.reverse_categories.get(
            compval, -1)) & (data.as_int_array() !=
                             data.reverse_categories[C.missing_value])

        self.check_terms(
            terms={
                "ne": c != compval,
            },
            expected={
                "ne": expected,
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #39
0
    def test_peer_count(self, dtype_and_missing, use_mask):
        class C(Classifier):
            dtype = dtype_and_missing[0]
            missing_value = dtype_and_missing[1]
            inputs = ()
            window_length = 0

        c = C()

        if dtype_and_missing[0] == int64_dtype:
            data = np.array(
                [[1, 1, -1, 2, 1, -1], [2, 1, 3, 2, 2, 2],
                 [-1, 1, 10, 10, 10, -1], [3, 3, 3, 3, 3, 3]],
                dtype=int64_dtype,
            )
        else:
            data = LabelArray(
                [['a', 'a', None, 'b', 'a', None],
                 ['b', 'a', 'c', 'b', 'b', 'b'],
                 [None, 'a', 'aa', 'aa', 'aa', None],
                 ['c', 'c', 'c', 'c', 'c', 'c']],
                missing_value=None,
            )

        if not use_mask:
            mask = self.build_mask(self.ones_mask(shape=data.shape))
            expected = np.array(
                [[3, 3, np.nan, 1, 3, np.nan], [4, 1, 1, 4, 4, 4],
                 [np.nan, 1, 3, 3, 3, np.nan], [6, 6, 6, 6, 6, 6]], )
        else:
            # Punch a couple holes in the mask to check that we handle the mask
            # correctly.
            mask = self.build_mask(
                np.array([[1, 1, 1, 1, 0, 1], [1, 1, 1, 1, 1, 0],
                          [1, 1, 1, 1, 1, 1], [1, 1, 0, 0, 1, 1]],
                         dtype='bool'))
            expected = np.array(
                [[2, 2, np.nan, 1, np.nan, np.nan], [3, 1, 1, 3, 3, np.nan],
                 [np.nan, 1, 3, 3, 3, np.nan], [4, 4, np.nan, np.nan, 4, 4]], )

        terms = {
            'peer_counts': c.peer_count(),
        }
        expected_results = {
            'peer_counts': expected,
        }

        self.check_terms(
            terms=terms,
            expected=expected_results,
            initial_workspace={c: data},
            mask=mask,
        )
Beispiel #40
0
    def rand_categoricals(self, categories, seed, shape, missing_value=None):
        """Build a block of random categorical data.

        Categories should not include ``missing_value``.
        """
        categories = list(categories) + [missing_value]
        data = np.random.RandomState(seed).choice(categories, shape)
        return LabelArray(
            data,
            missing_value=missing_value,
            categories=categories,
        )
Beispiel #41
0
    def test_string_not_equal(self, compval, missing, labelarray_dtype):

        compval = labelarray_dtype.type(compval)

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['',    'a',  'ab', 'ba'],
                 ['z',  'ab',   'a', 'ab'],
                 ['aa', 'ab',    '', 'ab'],
                 ['aa',  'a',  'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        expected = (
            (data.as_int_array() != data.reverse_categories.get(compval, -1)) &
            (data.as_int_array() != data.reverse_categories[C.missing_value])
        )

        self.check_terms(
            terms={
                'ne': c != compval,
            },
            expected={
                'ne': expected,
            },
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #42
0
    def test_map_shrinks_code_storage_if_possible(self):
        arr = LabelArray(
            # Drop the last value so we fit in a uint16 with None as a missing
            # value.
            self.create_categories(16, plus_one=False)[:-1],
            missing_value=None,
        )

        self.assertEqual(arr.itemsize, 2)

        def either_A_or_B(s):
            return ('A', 'B')[sum(ord(c) for c in s) % 2]

        result = arr.map(either_A_or_B)

        self.assertEqual(set(result.categories), {'A', 'B', None})
        self.assertEqual(result.itemsize, 1)

        assert_equal(
            np.vectorize(either_A_or_B)(arr.as_string_array()),
            result.as_string_array(),
        )
Beispiel #43
0
    def test_setitem_scalar(self, val, missing_value):
        arr = LabelArray(self.strs, missing_value=missing_value)

        if not arr.has_label(val):
            self.assertTrue(
                (val == 'not in the array')
                or (val is None and missing_value is not None)
            )
            for slicer in [(0, 0), (0, 1), 1]:
                with self.assertRaises(ValueError):
                    arr[slicer] = val
            return

        arr[0, 0] = val
        self.assertEqual(arr[0, 0], val)

        arr[0, 1] = val
        self.assertEqual(arr[0, 1], val)

        arr[1] = val
        if val == missing_value:
            self.assertTrue(arr.is_missing()[1].all())
        else:
            self.assertTrue((arr[1] == val).all())
            self.assertTrue((arr[1].as_string_array() == val).all())

        arr[:, -1] = val
        if val == missing_value:
            self.assertTrue(arr.is_missing()[:, -1].all())
        else:
            self.assertTrue((arr[:, -1] == val).all())
            self.assertTrue((arr[:, -1].as_string_array() == val).all())

        arr[:] = val
        if val == missing_value:
            self.assertTrue(arr.is_missing().all())
        else:
            self.assertFalse(arr.is_missing().any())
            self.assertTrue((arr == val).all())
Beispiel #44
0
    def test_map_requires_f_to_return_a_string_or_none(self, f):
        la = LabelArray(self.strs, missing_value=None)

        with self.assertRaises(TypeError):
            la.map(f)
Beispiel #45
0
    def test_string_elementwise_predicates(self,
                                           compval,
                                           missing,
                                           labelarray_dtype):
        if labelarray_dtype == bytes_dtype:
            compval = compval.encode('utf-8')
            missing = missing.encode('utf-8')

            startswith_re = b'^' + compval + b'.*'
            endswith_re = b'.*' + compval + b'$'
            substring_re = b'.*' + compval + b'.*'
        else:
            startswith_re = '^' + compval + '.*'
            endswith_re = '.*' + compval + '$'
            substring_re = '.*' + compval + '.*'

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['',    'a',  'ab', 'ba'],
                 ['z',  'ab',   'a', 'ab'],
                 ['aa', 'ab',    '', 'ab'],
                 ['aa',  'a',  'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        terms = {
            'startswith': c.startswith(compval),
            'endswith': c.endswith(compval),
            'has_substring': c.has_substring(compval),
            # Equivalent filters using regex matching.
            'startswith_re': c.matches(startswith_re),
            'endswith_re': c.matches(endswith_re),
            'has_substring_re': c.matches(substring_re),
        }

        expected = {
            'startswith': (data.startswith(compval) & (data != missing)),
            'endswith': (data.endswith(compval) & (data != missing)),
            'has_substring': (data.has_substring(compval) & (data != missing)),
        }
        for key in list(expected):
            expected[key + '_re'] = expected[key]

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )