Beispiel #1
0
    def test_string_elementwise_predicates(self, compval, missing,
                                           labelarray_dtype):
        if labelarray_dtype == bytes_dtype:
            compval = compval.encode('utf-8')
            missing = missing.encode('utf-8')

            startswith_re = b'^' + compval + b'.*'
            endswith_re = b'.*' + compval + b'$'
            substring_re = b'.*' + compval + b'.*'
        else:
            startswith_re = '^' + compval + '.*'
            endswith_re = '.*' + compval + '$'
            substring_re = '.*' + compval + '.*'

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['', 'a', 'ab', 'ba'], ['z', 'ab', 'a', 'ab'],
                 ['aa', 'ab', '', 'ab'], ['aa', 'a', 'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        terms = {
            'startswith': c.startswith(compval),
            'endswith': c.endswith(compval),
            'has_substring': c.has_substring(compval),
            # Equivalent filters using regex matching.
            'startswith_re': c.matches(startswith_re),
            'endswith_re': c.matches(endswith_re),
            'has_substring_re': c.matches(substring_re),
        }

        expected = {
            'startswith': (data.startswith(compval) & (data != missing)),
            'endswith': (data.endswith(compval) & (data != missing)),
            'has_substring': (data.has_substring(compval) & (data != missing)),
        }
        for key in list(expected):
            expected[key + '_re'] = expected[key]

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #2
0
    def test_compare_to_str(self,
                            compval,
                            shape,
                            array_astype,
                            missing_value):

        strs = self.strs.reshape(shape).astype(array_astype)
        if missing_value is None:
            # As of numpy 1.9.2, object array != None returns just False
            # instead of an array, with a deprecation warning saying the
            # behavior will change in the future.  Work around that by just
            # using the ufunc.
            notmissing = np.not_equal(strs, missing_value)
        else:
            if not isinstance(missing_value, array_astype):
                missing_value = array_astype(missing_value, 'utf-8')
            notmissing = (strs != missing_value)

        arr = LabelArray(strs, missing_value=missing_value)

        if not isinstance(compval, array_astype):
            compval = array_astype(compval, 'utf-8')

        # arr.missing_value should behave like NaN.
        check_arrays(
            arr == compval,
            (strs == compval) & notmissing,
        )
        check_arrays(
            arr != compval,
            (strs != compval) & notmissing,
        )

        np_startswith = np.vectorize(lambda elem: elem.startswith(compval))
        check_arrays(
            arr.startswith(compval),
            np_startswith(strs) & notmissing,
        )

        np_endswith = np.vectorize(lambda elem: elem.endswith(compval))
        check_arrays(
            arr.endswith(compval),
            np_endswith(strs) & notmissing,
        )

        np_contains = np.vectorize(lambda elem: compval in elem)
        check_arrays(
            arr.has_substring(compval),
            np_contains(strs) & notmissing,
        )
Beispiel #3
0
    def test_compare_to_str(self,
                            compval,
                            shape,
                            array_astype,
                            missing_value):

        strs = self.strs.reshape(shape).astype(array_astype)
        if missing_value is None:
            # As of numpy 1.9.2, object array != None returns just False
            # instead of an array, with a deprecation warning saying the
            # behavior will change in the future.  Work around that by just
            # using the ufunc.
            notmissing = np.not_equal(strs, missing_value)
        else:
            if not isinstance(missing_value, array_astype):
                missing_value = array_astype(missing_value, 'utf-8')
            notmissing = (strs != missing_value)

        arr = LabelArray(strs, missing_value=missing_value)

        if not isinstance(compval, array_astype):
            compval = array_astype(compval, 'utf-8')

        # arr.missing_value should behave like NaN.
        check_arrays(
            arr == compval,
            (strs == compval) & notmissing,
        )
        check_arrays(
            arr != compval,
            (strs != compval) & notmissing,
        )

        np_startswith = np.vectorize(lambda elem: elem.startswith(compval))
        check_arrays(
            arr.startswith(compval),
            np_startswith(strs) & notmissing,
        )

        np_endswith = np.vectorize(lambda elem: elem.endswith(compval))
        check_arrays(
            arr.endswith(compval),
            np_endswith(strs) & notmissing,
        )

        np_contains = np.vectorize(lambda elem: compval in elem)
        check_arrays(
            arr.has_substring(compval),
            np_contains(strs) & notmissing,
        )
    def test_string_elementwise_predicates(self, compval, missing,
                                           labelarray_dtype):
        if labelarray_dtype == bytes_dtype:
            compval = compval.encode("utf-8")
            missing = missing.encode("utf-8")

            startswith_re = b"^" + compval + b".*"
            endswith_re = b".*" + compval + b"$"
            substring_re = b".*" + compval + b".*"
        else:
            startswith_re = "^" + compval + ".*"
            endswith_re = ".*" + compval + "$"
            substring_re = ".*" + compval + ".*"

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [
                    ["", "a", "ab", "ba"],
                    ["z", "ab", "a", "ab"],
                    ["aa", "ab", "", "ab"],
                    ["aa", "a", "ba", "ba"],
                ],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        terms = {
            "startswith": c.startswith(compval),
            "endswith": c.endswith(compval),
            "has_substring": c.has_substring(compval),
            # Equivalent filters using regex matching.
            "startswith_re": c.matches(startswith_re),
            "endswith_re": c.matches(endswith_re),
            "has_substring_re": c.matches(substring_re),
        }

        expected = {
            "startswith": (data.startswith(compval) & (data != missing)),
            "endswith": (data.endswith(compval) & (data != missing)),
            "has_substring": (data.has_substring(compval) & (data != missing)),
        }
        for key in list(expected):
            expected[key + "_re"] = expected[key]

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )
Beispiel #5
0
    def test_string_elementwise_predicates(self,
                                           compval,
                                           missing,
                                           labelarray_dtype):
        if labelarray_dtype == bytes_dtype:
            compval = compval.encode('utf-8')
            missing = missing.encode('utf-8')

            startswith_re = b'^' + compval + b'.*'
            endswith_re = b'.*' + compval + b'$'
            substring_re = b'.*' + compval + b'.*'
        else:
            startswith_re = '^' + compval + '.*'
            endswith_re = '.*' + compval + '$'
            substring_re = '.*' + compval + '.*'

        class C(Classifier):
            dtype = categorical_dtype
            missing_value = missing
            inputs = ()
            window_length = 0

        c = C()

        # There's no significance to the values here other than that they
        # contain a mix of the comparison value and other values.
        data = LabelArray(
            np.asarray(
                [['',    'a',  'ab', 'ba'],
                 ['z',  'ab',   'a', 'ab'],
                 ['aa', 'ab',    '', 'ab'],
                 ['aa',  'a',  'ba', 'ba']],
                dtype=labelarray_dtype,
            ),
            missing_value=missing,
        )

        terms = {
            'startswith': c.startswith(compval),
            'endswith': c.endswith(compval),
            'has_substring': c.has_substring(compval),
            # Equivalent filters using regex matching.
            'startswith_re': c.matches(startswith_re),
            'endswith_re': c.matches(endswith_re),
            'has_substring_re': c.matches(substring_re),
        }

        expected = {
            'startswith': (data.startswith(compval) & (data != missing)),
            'endswith': (data.endswith(compval) & (data != missing)),
            'has_substring': (data.has_substring(compval) & (data != missing)),
        }
        for key in list(expected):
            expected[key + '_re'] = expected[key]

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={c: data},
            mask=self.build_mask(self.ones_mask(shape=data.shape)),
        )