def setUp(self):
    """Create the random Strings/Categorical fixtures used by the tests.

    Two batches of random words are concatenated into ``base_words``; ``N``
    of them are sampled (with ``ak.randint``) into ``strings`` and mirrored
    as a NumPy array and a Categorical.  "Gremlin" variants append the
    awkward values ``'"'``, ``' '`` and ``''`` to the same data.
    """
    self.maxDiff = None
    ArkoudaTest.setUp(self)

    uniform_words = ak.random_strings_uniform(1, 10, UNIQUE,
                                              characters='printable')
    lognormal_words = ak.random_strings_lognormal(2, 0.25, UNIQUE,
                                                  characters='printable')
    np_gremlins = np.array(['"', ' ', ''])
    self.gremlins = ak.array(np_gremlins)

    # Server-side and client-side copies of the same word list.
    self.base_words = ak.concatenate((uniform_words, lognormal_words))
    self.np_base_words = np.hstack(
        (uniform_words.to_ndarray(), lognormal_words.to_ndarray()))

    picks = ak.randint(0, self.base_words.size, N)
    self.strings = self.base_words[picks]
    self.test_strings = self.strings.to_ndarray()
    self.cat = ak.Categorical(self.strings)

    # Character frequencies over every base word: x holds the characters,
    # w their counts; both are handed to the delimiter helper.
    char_counts = Counter(''.join(self.base_words.to_ndarray()))
    x, w = tuple(zip(*char_counts.items()))
    self.delim = self._get_delimiter(x, w, np_gremlins)
    self.akset = set(ak.unique(self.strings).to_ndarray())

    # Same fixtures again, with the gremlins appended.
    self.gremlins_base_words = ak.concatenate(
        (self.base_words, self.gremlins))
    self.gremlins_strings = ak.concatenate(
        (self.base_words[picks], self.gremlins))
    self.gremlins_test_strings = self.gremlins_strings.to_ndarray()
    self.gremlins_cat = ak.Categorical(self.gremlins_strings)
def testEquality(self):
    """Check element-wise ``==`` and ``!=`` between Categoricals."""
    original = self._getCategorical()
    duplicate = self._getCategorical()
    different = self._getRandomizedCategorical()

    # Identically built Categoricals match everywhere; the randomized one
    # is expected to differ at every position.
    self.assertTrue((original == duplicate).all())
    self.assertTrue((original != different).all())

    # Categoricals with different category sets compare by value.
    left = ak.Categorical(ak.array(['a', 'b', 'c', 'a', 'b']))
    right = ak.Categorical(ak.array(['a', 'x', 'c', 'y', 'b']))
    observed = (left == right)
    expected = ak.array([True, False, True, False, True])
    self.assertTrue((observed == expected).all())
def testUnique(self):
    """Check that ``unique()`` on the randomized Categorical matches the
    expected values, compared as NumPy arrays."""
    randomized = self._getRandomizedCategorical()

    expected = ak.Categorical(
        ak.array(['non-string', 'string3', 'string1', 'non-string2',
                  'string'])).to_ndarray()
    actual = randomized.unique().to_ndarray()

    self.assertTrue((expected == actual).all())
def test_coargsort_categorical(self):
    """Compare ``ak.coargsort`` results on Strings, Categoricals and mixed
    key lists for every member of ``ak.SortingAlgorithm``."""
    words = ak.array(['a', 'b', 'a', 'b', 'c'])
    categorical = ak.Categorical(words)
    coded = ak.Categorical.from_codes(
        codes=ak.array([0, 1, 0, 1, 2]),
        categories=ak.array(['a', 'b', 'c']))

    for algorithm in ak.SortingAlgorithm:
        # Reference ordering obtained from the plain Strings.
        words_perm = ak.coargsort([words], algorithm)
        words_sorted = words[words_perm].to_ndarray()

        # coargsort on categorical
        perm = ak.coargsort([categorical], algorithm)
        categorical_sorted = categorical[perm].to_ndarray()
        self.assertTrue((words_sorted == categorical_sorted).all())

        # coargsort on categorical.from_codes
        # coargsort sorts using codes, the order isn't guaranteed, only grouping
        coded_perm = ak.coargsort([coded], algorithm)
        coded_sorted = coded[coded_perm].to_ndarray()
        self.assertTrue(
            (['a', 'a', 'b', 'b', 'c'] == coded_sorted).all())

        # coargsort on 2 categoricals (one from_codes)
        perm = ak.coargsort([categorical, coded], algorithm)
        categorical_sorted = categorical[perm].to_ndarray()
        self.assertTrue((words_sorted == categorical_sorted).all())

        # coargsort on mixed strings and categoricals
        mixed_perm = ak.coargsort([categorical, words, coded], algorithm)
        mixed_sorted = coded[mixed_perm].to_ndarray()
        self.assertTrue((words_sorted == mixed_sorted).all())
def test_multi_level_categorical(self):
    """Check two-key ``ak.GroupBy(...).nunique`` for Strings, Categoricals
    and a mix of both against one expected dictionary."""
    words = ak.array(['a', 'b', 'a', 'b', 'c'])
    categorical = ak.Categorical(words)
    coded = ak.Categorical.from_codes(
        codes=ak.array([0, 1, 0, 1, 2]),
        categories=ak.array(['a', 'b', 'c']))
    positions = ak.arange(words.size)
    expected = {('a', 'a'): 2, ('b', 'b'): 2, ('c', 'c'): 1}

    # list of 2 strings
    grouping = ak.GroupBy([words, words])
    labels, values = grouping.nunique(positions)
    self.assertDictEqual(expected, to_tuple_dict(labels, values))

    # list of 2 cats (one from_codes)
    grouping = ak.GroupBy([categorical, coded])
    labels, values = grouping.nunique(positions)
    self.assertDictEqual(expected, to_tuple_dict(labels, values))

    # One cat (from_codes) and one string
    grouping = ak.GroupBy([coded, words])
    labels, values = grouping.nunique(positions)
    self.assertDictEqual(expected, to_tuple_dict(labels, values))
def test_categorical_registration_suite(self):
    """
    Test register, is_registered, attach, unregister, unregister_categorical_by_name
    """
    cleanup()  # Make sure we start with a clean registry

    labels = [f"my_cat {i}" for i in range(1, 11)]
    c = ak.Categorical(ak.array(labels))
    self.assertFalse(c.is_registered(), "test_me should be unregistered")

    # Keep the register() call inline: binding its result to another name
    # could hold a second reference and stop `c = None` below from
    # releasing the object.
    self.assertTrue(
        c.register("test_me").is_registered(),
        "test_me categorical should be registered")

    c = None  # Should trigger destructor, but survive server deletion because it is registered
    self.assertTrue(c is None, "The reference to `c` should be None")

    # Re-attach by name and confirm the registration is still visible.
    c = ak.Categorical.attach("test_me")
    self.assertTrue(
        c.is_registered(),
        "test_me categorical should be registered after attach")

    c.unregister()
    self.assertFalse(c.is_registered(), "test_me should be unregistered")

    self.assertTrue(
        c.register("another_name").name == "another_name"
        and c.is_registered())

    # Test static unregister_by_name
    ak.Categorical.unregister_categorical_by_name("another_name")
    self.assertFalse(c.is_registered(),
                     "another_name should be unregistered")

    # now mess with the subcomponents directly to test is_registered mis-match logic
    c.register("another_name")
    unregister_pdarray_by_name("another_name.codes")
    with pytest.raises(RegistrationError):
        c.is_registered()
def _get_ak_gremlins(self):
    """Return a ``self.Gremlins`` bundle holding the gremlin-extended
    fixtures.

    The bundle is built, in order, from the base words plus gremlins, the
    sampled strings plus gremlins, the NumPy copy of those strings, and a
    Categorical created from them.
    """
    with_gremlins_words = ak.concatenate((self.base_words, self.gremlins))
    sampled = self.base_words[self.choices]
    with_gremlins_strings = ak.concatenate((sampled, self.gremlins))
    as_numpy = with_gremlins_strings.to_ndarray()
    as_categorical = ak.Categorical(with_gremlins_strings)
    return self.Gremlins(with_gremlins_words, with_gremlins_strings,
                         as_numpy, as_categorical)
def testIn1d(self):
    """Check ``ak.in1d`` with Categorical/Categorical and
    Categorical/Strings arguments, and the TypeError raised for a
    pdarray second argument."""
    mod_three = [i % 3 for i in range(10)]
    mod_two = [i % 2 for i in range(10)]

    strings_three = ak.array(['String {}'.format(i) for i in mod_three])
    strings_two = ak.array(['String {}'.format(i) for i in mod_two])
    cat_three = ak.Categorical(strings_three)
    cat_two = ak.Categorical(strings_two)

    # Only the "String 0" / "String 1" entries also occur in the second set.
    expected = ak.array([x < 2 for x in mod_three])

    self.assertTrue((expected == ak.in1d(cat_three, cat_two)).all())
    self.assertTrue((expected == ak.in1d(cat_three, strings_two)).all())

    with self.assertRaises(TypeError) as cm:
        ak.in1d(cat_three, ak.randint(0, 5, 5))
    expected_message = (
        'type of argument "test" must be one of (Strings, Categorical); got '
        + 'arkouda.pdarrayclass.pdarray instead')
    self.assertEqual(expected_message, cm.exception.args[0])
def test_nunique_types(self):
    """Run ``GroupBy.nunique`` over every key/value pairing of Strings,
    Categorical, int array and a mixed tuple, expecting all-ones counts."""
    words = ak.array(['a', 'b', 'a', 'b', 'c'])
    categorical = ak.Categorical(words)
    numbers = ak.array([5, 3, 5, 3, 1])
    expected = ak.array([1, 1, 1])

    # Try GroupBy.nunique with every combination of types, including mixed
    candidates = (words, categorical, numbers,
                  (words, categorical, numbers))
    for group_key in candidates:
        grouping = ak.GroupBy(group_key)
        for values in candidates:
            _, counts = grouping.nunique(values)
            self.assertTrue((counts == expected).all())
def testBaseCategorical(self):
    """Check codes, segments, categories, size and objtype of a Categorical
    built from ``'string 1'`` .. ``'string 10'``."""
    words = ak.array(['string {}'.format(i) for i in range(1, 11)])
    categorical = ak.Categorical(words)

    expected_codes = ak.array([7, 5, 9, 8, 2, 1, 4, 0, 3, 6])
    self.assertTrue((expected_codes == categorical.codes).all())

    expected_segments = ak.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    self.assertTrue((expected_segments == categorical.segments).all())

    expected_categories = ak.array([
        'string 8', 'string 6', 'string 5', 'string 9', 'string 7',
        'string 2', 'string 10', 'string 1', 'string 4', 'string 3',
    ])
    self.assertTrue((expected_categories == categorical.categories).all())

    self.assertEqual(10, categorical.size)
    self.assertEqual('category', categorical.objtype)
def setUp(self):
    """Create the random Strings/Categorical fixtures (plain and with
    gremlins) and print a handful of them for inspection."""
    self.maxDiff = None
    ArkoudaTest.setUp(self)

    base_words1 = ak.random_strings_uniform(1, 10, UNIQUE,
                                            characters='printable')
    base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE,
                                              characters='printable')
    gremlin_values = ak.array(['"', ' ', ''])
    self.gremlins = gremlin_values

    self.base_words = ak.concatenate((base_words1, base_words2))
    self.np_base_words = np.hstack(
        (base_words1.to_ndarray(), base_words2.to_ndarray()))

    picks = ak.randint(0, self.base_words.size, N)
    self.strings = self.base_words[picks]
    self.test_strings = self.strings.to_ndarray()
    self.cat = ak.Categorical(self.strings)

    # Pick the delimiter at random, weighted by how often each character
    # occurs across the base words.
    char_counts = Counter(''.join(self.base_words.to_ndarray()))
    x, w = tuple(zip(*char_counts.items()))
    weights = np.array(w) / sum(w)
    self.delim = np.random.choice(x, p=weights)
    self.akset = set(ak.unique(self.strings).to_ndarray())

    # Gremlin-extended copies of the fixtures.
    words_with_gremlins = ak.concatenate(
        (base_words1, base_words2, gremlin_values))
    self.gremlins_base_words = words_with_gremlins
    self.gremlins_strings = ak.concatenate(
        (words_with_gremlins[picks], gremlin_values))
    self.gremlins_test_strings = self.gremlins_strings.to_ndarray()
    self.gremlins_cat = ak.Categorical(self.gremlins_strings)

    # Diagnostic output, emitted in a fixed order.
    third = N // 3
    print("=================In Class will check===========================")
    print("")
    print(str(base_words1))
    print("After base_word1 ")
    print("")
    print(str(self.strings))
    print("After Print strings")
    print(str(self.test_strings))
    print("")
    print("After Print teststrings")
    print(str(self.strings[third]))
    print("")
    print("After Print strings[N//3]")
    print(str(self.test_strings[third]))
    print("")
    print("After Print test_strings[N//3]")
def testBaseCategorical(self):
    """Check the attributes of the default Categorical and the ValueError
    raised when a Categorical is built from a non-Strings input."""
    categorical = self._getCategorical()

    expected_codes = ak.array([7, 5, 9, 8, 2, 1, 4, 0, 3, 6])
    self.assertTrue((expected_codes == categorical.codes).all())

    expected_segments = ak.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    self.assertTrue((expected_segments == categorical.segments).all())

    expected_categories = ak.array([
        'string 8', 'string 6', 'string 5', 'string 9', 'string 7',
        'string 2', 'string 10', 'string 1', 'string 4', 'string 3',
    ])
    self.assertTrue((expected_categories == categorical.categories).all())

    self.assertEqual(10, categorical.size)
    self.assertEqual('category', categorical.objtype)

    # A pdarray is not an accepted input for the constructor.
    with self.assertRaises(ValueError) as cm:
        ak.Categorical(ak.arange(0, 5, 10))
    self.assertEqual(
        'Categorical: inputs other than Strings not yet supported',
        cm.exception.args[0])
def test_in_place_info(self):
    """
    Tests the class level info method for pdarray, String, and Categorical
    """
    def registered_flags(obj):
        # One 'registered' flag per component listed in obj.info() (JSON).
        return [sym['registered'] for sym in json.loads(obj.info())]

    cleanup()

    # pdarray
    my_pda = ak.ones(10, ak.int64)
    self.assertFalse(
        any(registered_flags(my_pda)),
        msg='no components of my_pda should be registered before register call')
    my_pda.register('my_pda')
    self.assertTrue(
        all(registered_flags(my_pda)),
        msg='all components of my_pda should be registered after register call')

    # Strings
    my_str = ak.random_strings_uniform(1, 10, UNIQUE,
                                       characters='printable')
    self.assertFalse(
        any(registered_flags(my_str)),
        msg='no components of my_str should be registered before register call')
    my_str.register('my_str')
    self.assertTrue(
        all(registered_flags(my_str)),
        msg='all components of my_str should be registered after register call')

    # Categorical
    my_cat = ak.Categorical(
        ak.array([f"my_cat {i}" for i in range(1, 11)]))
    self.assertFalse(
        any(registered_flags(my_cat)),
        msg='no components of my_cat should be registered before register call')
    my_cat.register('my_cat')
    self.assertTrue(
        all(registered_flags(my_cat)),
        msg='all components of my_cat should be registered after register call')

    cleanup()
def testConcatenate(self):
    """Check ``Categorical.concatenate`` and ``ak.concatenate`` on
    Categoricals: result type and size, ordered and unordered modes, and
    length-one inputs."""
    first = self._getCategorical('string', 51)
    second = self._getCategorical('string-two', 51)

    combined = first.concatenate([second])
    self.assertEqual('category', combined.objtype)
    self.assertIsInstance(combined, ak.Categorical)
    self.assertEqual(100, combined.size)

    # Since Categorical.concatenate uses Categorical.from_codes method, confirm
    # that both permutation and segments are None
    self.assertFalse(combined.permutation)
    self.assertFalse(combined.segments)

    combined = ak.concatenate([first, first], ordered=False)
    self.assertEqual('category', combined.objtype)
    self.assertIsInstance(combined, ak.Categorical)
    self.assertEqual(100, combined.size)

    # Since Categorical.concatenate uses Categorical.from_codes method, confirm
    # that both permutation and segments are None
    self.assertFalse(combined.permutation)
    self.assertFalse(combined.segments)

    # Concatenate two Categoricals with different categories, and test result against original strings
    strings_a = ak.array(['abc', 'de', 'abc', 'fghi', 'de'])
    strings_b = ak.array(['jkl', 'mno', 'fghi', 'abc', 'fghi', 'mno'])
    cat_a = ak.Categorical(strings_a)
    cat_b = ak.Categorical(strings_b)

    # Ordered concatenation
    strings_ordered = ak.concatenate([strings_a, strings_b], ordered=True)
    cats_ordered = ak.concatenate([cat_a, cat_b], ordered=True)
    self.assertTrue((ak.Categorical(strings_ordered) == cats_ordered).all())

    # Unordered (but still deterministic) concatenation
    strings_unordered = ak.concatenate([strings_a, strings_b], ordered=False)
    cats_unordered = ak.concatenate([cat_a, cat_b], ordered=False)
    self.assertTrue(
        (ak.Categorical(strings_unordered) == cats_unordered).all())

    # Tiny concatenation
    # Used to fail when length of array was less than numLocales
    # CI uses 2 locales, so try with length-1 arrays
    tiny_a = ak.Categorical(ak.array(['a']))
    tiny_b = ak.Categorical(ak.array(['b']))
    tiny_joined = ak.concatenate((tiny_a, tiny_b), ordered=False)
    tiny_expected = ak.Categorical(ak.array(['a', 'b']))
    self.assertTrue((tiny_joined == tiny_expected).all())
def setUp(self):
    """Create the random Strings/Categorical fixtures used by the tests.

    Sets ``base_words`` (two concatenated batches of random words),
    ``np_base_words`` (the same words as a NumPy array), ``strings`` /
    ``test_strings`` (``N`` sampled words, server-side and as NumPy),
    ``cat`` (a Categorical of the sample) and ``delim`` (a character drawn
    at random, weighted by its frequency in the base words).
    """
    ArkoudaTest.setUp(self)

    base_words1 = ak.random_strings_uniform(0, 10, UNIQUE,
                                            characters='printable')
    base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE,
                                              characters='printable')
    self.base_words = ak.concatenate((base_words1, base_words2))
    self.np_base_words = np.hstack(
        (base_words1.to_ndarray(), base_words2.to_ndarray()))

    choices = ak.randint(0, self.base_words.size, N)
    self.strings = self.base_words[choices]
    self.test_strings = self.strings.to_ndarray()
    self.cat = ak.Categorical(self.strings)

    # Transfer the words to the client with to_ndarray() before joining,
    # so the join works on plain strings instead of iterating the
    # server-side Strings object element by element.
    all_chars = ''.join(self.base_words.to_ndarray())
    x, w = tuple(zip(*Counter(all_chars).items()))
    self.delim = np.random.choice(x, p=(np.array(w) / sum(w)))
def test_unused_categories_logic(self):
    """
    Test that Categoricals built from_codes and from slices that have unused categories behave correctly
    """
    def as_list(arr):
        # Client-side list copy of an arkouda array.
        return arr.to_ndarray().tolist()

    def unique_set(arr):
        # Set of the distinct values of an arkouda array.
        return set(ak.unique(arr).to_ndarray().tolist())

    s = ak.array([str(i) for i in range(10)])
    s12 = s[1:3]
    cat = ak.Categorical(s)
    cat12 = cat[1:3]

    # A sliced Categorical must behave like the equally sliced Strings.
    self.assertListEqual(as_list(ak.in1d(s, s12)),
                         as_list(ak.in1d(cat, cat12)))
    self.assertSetEqual(unique_set(s12), unique_set(cat12))

    # Same checks for a Categorical built from codes 1 and 2 over all of s.
    cat_from_codes = ak.Categorical.from_codes(ak.array([1, 2]), s)
    self.assertListEqual(as_list(ak.in1d(s, s12)),
                         as_list(ak.in1d(cat, cat_from_codes)))
    self.assertSetEqual(unique_set(s12), unique_set(cat_from_codes))
def _getRandomizedCategorical(self) -> ak.Categorical:
    """Return a ten-element Categorical with repeated, unsorted values."""
    values = [
        'string', 'string1', 'non-string', 'non-string2', 'string',
        'non-string', 'string3', 'non-string2', 'string', 'non-string',
    ]
    return ak.Categorical(ak.array(values))
def testGroup(self):
    """Check that ``group()`` on a Categorical of ``'string 1'`` ..
    ``'string 10'`` yields the expected index array."""
    words = ak.array(['string {}'.format(i) for i in range(1, 11)])
    categorical = ak.Categorical(words)

    expected = ak.array([7, 5, 4, 8, 6, 1, 9, 0, 3, 2])
    self.assertTrue((expected == categorical.group()).all())
def testContains(self):
    """Check that every element of the Categorical contains ``'string'``."""
    words = ak.array(['string {}'.format(i) for i in range(1, 11)])
    categorical = ak.Categorical(words)

    matches = categorical.contains('string')
    self.assertTrue(matches.all())
def _getCategorical(self, prefix: str = 'string', size: int = 11) -> ak.Categorical:
    """Return a Categorical of ``'<prefix> 1'`` .. ``'<prefix> <size - 1>'``.

    Parameters
    ----------
    prefix : str
        Text placed before each number.
    size : int
        Exclusive upper bound of the numbering; the result holds
        ``size - 1`` entries.
    """
    labels = []
    for i in range(1, size):
        labels.append('{} {}'.format(prefix, i))
    return ak.Categorical(ak.array(labels))
# Script-style driver: build random Strings and a matching Categorical, then
# run the indexing checks against them.
print("Running test from string_test.__main__")
# with open(__file__, 'r') as f:
#     base_words = np.array(f.read().split())
# test_strings = np.random.choice(base_words, N, replace=True)
# strings = ak.array(test_strings)

base_words1 = ak.random_strings_uniform(1, 10, UNIQUE, characters='printable')
base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable')
gremlins = ak.array(['"', ' ', ''])

# The server-side concatenation must agree with the NumPy one.
base_words = ak.concatenate((base_words1, base_words2))
np_base_words = np.hstack(
    (base_words1.to_ndarray(), base_words2.to_ndarray()))
assert compare_strings(base_words.to_ndarray(), np_base_words)

choices = ak.randint(0, base_words.size, N)
strings = base_words[choices]
test_strings = strings.to_ndarray()
cat = ak.Categorical(strings)
print("strings =", strings)
print("categorical =", cat)
print("Generation and concatenate passed")

# int index
trailing_indices = range(-len(gremlins), 0)
run_test_index(strings, test_strings, cat, trailing_indices)
print("int index passed")

# slice
run_test_slice(strings, test_strings, cat)
print("slice passed")

# pdarray int index
run_test_pdarray_index(strings, test_strings, cat)
print("pdarray int index passed")
def _get_categorical(self):
    """Return a new Categorical built from ``self.strings``."""
    source = self.strings
    return ak.Categorical(source)
def _getCategorical(self) -> ak.Categorical:
    """Return a Categorical of ``'string 1'`` .. ``'string 10'``."""
    labels = []
    for i in range(1, 11):
        labels.append('string {}'.format(i))
    return ak.Categorical(ak.array(labels))