def test_compare_to_str(self, compval, shape, array_astype, missing_value):
    """
    Check LabelArray string comparisons against plain numpy equivalents.

    ``self.strs`` is reshaped to ``shape`` and cast with ``array_astype``;
    the result is wrapped in a LabelArray using ``missing_value``.  Each of
    ``==``, ``!=``, ``startswith``, ``endswith`` and ``has_substring`` on the
    LabelArray is then compared (via ``check_arrays``) with the same
    predicate computed directly on the raw array and masked so that
    positions equal to the missing value are always False.

    Parameters
    ----------
    compval
        Value every element is compared against.  Coerced with
        ``array_astype(compval, 'utf-8')`` when it is not already an
        instance of ``array_astype``.
    shape
        Shape passed to ``self.strs.reshape``.
    array_astype
        Type used both for ``ndarray.astype`` and for coercing scalars.
    missing_value
        Missing value handed to LabelArray; ``None`` is passed through
        without coercion.

    NOTE(review): the arguments are presumably supplied by a parametrizing
    decorator that is not visible in this chunk -- confirm.
    """
    raw = self.strs.reshape(shape).astype(array_astype)

    # Coerce the missing value to the array's scalar type, leaving None
    # untouched.
    needs_coercion = (
        missing_value is not None
        and not isinstance(missing_value, array_astype)
    )
    if needs_coercion:
        missing_value = array_astype(missing_value, 'utf-8')

    if missing_value is None:
        # As of numpy 1.9.2, ``object array != None`` returns a bare False
        # rather than an array (with a deprecation warning that this will
        # change), so call the ufunc directly instead of the operator.
        present = np.not_equal(raw, missing_value)
    else:
        present = (raw != missing_value)

    labels = LabelArray(raw, missing_value=missing_value)

    if not isinstance(compval, array_astype):
        compval = array_astype(compval, 'utf-8')

    # Each entry pairs the operation applied to the LabelArray with the
    # reference computation applied to the raw numpy array.  Both sides are
    # callables so nothing is evaluated until its own check runs.
    cases = (
        (
            lambda la: la == compval,
            lambda nd: nd == compval,
        ),
        (
            lambda la: la != compval,
            lambda nd: nd != compval,
        ),
        (
            lambda la: la.startswith(compval),
            np.vectorize(lambda elem: elem.startswith(compval)),
        ),
        (
            lambda la: la.endswith(compval),
            np.vectorize(lambda elem: elem.endswith(compval)),
        ),
        (
            lambda la: la.has_substring(compval),
            np.vectorize(lambda elem: compval in elem),
        ),
    )

    # arr.missing_value should behave like NaN: every predicate is False
    # wherever the underlying value is missing.
    for on_labels, on_raw in cases:
        check_arrays(
            on_labels(labels),
            on_raw(raw) & present,
        )
def test_string_elementwise_predicates(self, compval, missing, labelarray_dtype):
    """
    Check a Classifier's string predicates against LabelArray's own.

    Builds ``startswith``, ``endswith`` and ``has_substring`` terms from a
    Classifier, plus ``matches`` terms using equivalent regular
    expressions, and hands them to ``self.check_terms`` together with
    expected results computed from the LabelArray methods, masked to
    exclude entries equal to ``missing``.

    Parameters
    ----------
    compval
        String used as the prefix / suffix / substring.  Encoded to UTF-8
        bytes when ``labelarray_dtype == bytes_dtype``.
    missing
        Missing value for both the Classifier and the LabelArray; encoded
        alongside ``compval``.
    labelarray_dtype
        dtype used to build the underlying numpy array of labels.

    NOTE(review): ``compval`` is embedded in the regexes unescaped, so it is
    assumed to contain no regex metacharacters -- confirm against the
    parameter values supplied by the (not visible) decorator.
    """
    # Regex building blocks must match the string type of the labels.
    if labelarray_dtype == bytes_dtype:
        compval = compval.encode('utf-8')
        missing = missing.encode('utf-8')
        line_start, line_end, anything = b'^', b'$', b'.*'
    else:
        line_start, line_end, anything = '^', '$', '.*'

    startswith_re = line_start + compval + anything
    endswith_re = anything + compval + line_end
    substring_re = anything + compval + anything

    class C(Classifier):
        dtype = categorical_dtype
        missing_value = missing
        inputs = ()
        window_length = 0

    classifier = C()

    # There's no significance to the values here other than that they
    # contain a mix of the comparison value and other values.
    raw_labels = np.asarray(
        [['', 'a', 'ab', 'ba'],
         ['z', 'ab', 'a', 'ab'],
         ['aa', 'ab', '', 'ab'],
         ['aa', 'a', 'ba', 'ba']],
        dtype=labelarray_dtype,
    )
    data = LabelArray(raw_labels, missing_value=missing)

    terms = {
        'startswith': classifier.startswith(compval),
        'endswith': classifier.endswith(compval),
        'has_substring': classifier.has_substring(compval),
    }
    # Equivalent filters using regex matching.
    terms['startswith_re'] = classifier.matches(startswith_re)
    terms['endswith_re'] = classifier.matches(endswith_re)
    terms['has_substring_re'] = classifier.matches(substring_re)

    # Expected output of each plain predicate: the LabelArray method's own
    # result, restricted to entries that are not the missing value.
    expected = {}
    for predicate in ('startswith', 'endswith', 'has_substring'):
        matched = getattr(data, predicate)(compval)
        expected[predicate] = (matched & (data != missing))

    # Every regex variant shares its plain counterpart's expected result.
    expected.update(
        {name + '_re': result for name, result in list(expected.items())}
    )

    workspace = {classifier: data}
    mask = self.build_mask(self.ones_mask(shape=data.shape))

    self.check_terms(
        terms=terms,
        expected=expected,
        initial_workspace=workspace,
        mask=mask,
    )