Exemple #1
0
 def setUp(self):
     """Build the random Strings/Categorical fixtures used by the tests.

     Two batches of random printable words are generated and concatenated
     into ``self.base_words``; ``N`` random indices into that array select
     ``self.strings``.  A parallel set of "gremlins" fixtures appends the
     quote, single-space and empty-string edge cases to the same data.
     """
     self.maxDiff = None
     ArkoudaTest.setUp(self)

     uniform_words = ak.random_strings_uniform(
         1, 10, UNIQUE, characters='printable')
     lognormal_words = ak.random_strings_lognormal(
         2, 0.25, UNIQUE, characters='printable')

     # Edge-case strings, kept both as a numpy array and as an arkouda array.
     raw_gremlins = np.array(['"', ' ', ''])
     self.gremlins = ak.array(raw_gremlins)

     self.base_words = ak.concatenate((uniform_words, lognormal_words))
     self.np_base_words = np.hstack(
         (uniform_words.to_ndarray(), lognormal_words.to_ndarray()))

     picks = ak.randint(0, self.base_words.size, N)
     self.strings = self.base_words[picks]
     self.test_strings = self.strings.to_ndarray()
     self.cat = ak.Categorical(self.strings)

     # Character frequencies over all base words feed the delimiter choice.
     char_counts = Counter(''.join(self.base_words.to_ndarray()))
     symbols, weights = tuple(zip(*char_counts.items()))
     self.delim = self._get_delimiter(symbols, weights, raw_gremlins)

     self.akset = set(ak.unique(self.strings).to_ndarray())

     # Same fixtures again, with the gremlins appended at the end.
     self.gremlins_base_words = ak.concatenate(
         (self.base_words, self.gremlins))
     self.gremlins_strings = ak.concatenate(
         (self.base_words[picks], self.gremlins))
     self.gremlins_test_strings = self.gremlins_strings.to_ndarray()
     self.gremlins_cat = ak.Categorical(self.gremlins_strings)
    def testEquality(self):
        """Check element-wise ``==`` and ``!=`` between Categoricals."""
        original = self._getCategorical()
        duplicate = self._getCategorical()
        different = self._getRandomizedCategorical()

        # Identically built Categoricals match everywhere; the randomized
        # one is expected to differ at every position.
        matches = original == duplicate
        self.assertTrue(matches.all())
        mismatches = original != different
        self.assertTrue(mismatches.all())

        # Positions 1 and 3 hold different values in the two inputs.
        left = ak.Categorical(ak.array(['a', 'b', 'c', 'a', 'b']))
        right = ak.Categorical(ak.array(['a', 'x', 'c', 'y', 'b']))
        actual = left == right
        expected = ak.array([True, False, True, False, True])
        self.assertTrue((actual == expected).all())
Exemple #3
0
    def testUnique(self):
        """Compare ``unique()`` of the randomized Categorical to a fixed list."""
        randomized = self._getRandomizedCategorical()

        expected_values = ak.array(
            ['non-string', 'string3', 'string1', 'non-string2', 'string'])
        expected = ak.Categorical(expected_values).to_ndarray()
        actual = randomized.unique().to_ndarray()

        self.assertTrue((expected == actual).all())
Exemple #4
0
    def test_coargsort_categorical(self):
        """Run ``ak.coargsort`` on Strings, Categoricals and mixes of both.

        Every member of ``ak.SortingAlgorithm`` is tried; sorting the plain
        Strings array provides the reference ordering.
        """
        strings = ak.array(['a', 'b', 'a', 'b', 'c'])
        from_strings = ak.Categorical(strings)
        from_codes = ak.Categorical.from_codes(
            codes=ak.array([0, 1, 0, 1, 2]),
            categories=ak.array(['a', 'b', 'c']))

        for algorithm in ak.SortingAlgorithm:
            reference_perm = ak.coargsort([strings], algorithm)
            reference = strings[reference_perm].to_ndarray()

            # coargsort on categorical
            perm = ak.coargsort([from_strings], algorithm)
            ordered = from_strings[perm].to_ndarray()
            self.assertTrue((reference == ordered).all())

            # coargsort on categorical.from_codes
            # coargsort sorts using codes, the order isn't guaranteed, only grouping
            codes_perm = ak.coargsort([from_codes], algorithm)
            codes_ordered = from_codes[codes_perm].to_ndarray()
            self.assertTrue(
                (['a', 'a', 'b', 'b', 'c'] == codes_ordered).all())

            # coargsort on 2 categoricals (one from_codes)
            perm = ak.coargsort([from_strings, from_codes], algorithm)
            ordered = from_strings[perm].to_ndarray()
            self.assertTrue((reference == ordered).all())

            # coargsort on mixed strings and categoricals
            mixed_perm = ak.coargsort(
                [from_strings, strings, from_codes], algorithm)
            mixed_ordered = from_codes[mixed_perm].to_ndarray()
            self.assertTrue((reference == mixed_ordered).all())
Exemple #5
0
    def test_multi_level_categorical(self):
        """Group by two keys (Strings and/or Categorical) and check nunique.

        The same expected label -> count mapping must come out whether the
        two keys are both Strings, both Categoricals, or one of each.
        """
        strings = ak.array(['a', 'b', 'a', 'b', 'c'])
        from_strings = ak.Categorical(strings)
        from_codes = ak.Categorical.from_codes(
            codes=ak.array([0, 1, 0, 1, 2]),
            categories=ak.array(['a', 'b', 'c']))
        values = ak.arange(strings.size)
        expected = {('a', 'a'): 2, ('b', 'b'): 2, ('c', 'c'): 1}

        key_combinations = (
            [strings, strings],          # list of 2 strings
            [from_strings, from_codes],  # list of 2 cats (one from_codes)
            [from_codes, strings],       # one cat (from_codes) and one string
        )
        for keys in key_combinations:
            grouping = ak.GroupBy(keys)
            labels, counts = grouping.nunique(values)
            self.assertDictEqual(expected, to_tuple_dict(labels, counts))
Exemple #6
0
    def test_categorical_registration_suite(self):
        """
        Exercise register, is_registered, attach, unregister and
        unregister_categorical_by_name on a Categorical.
        """
        # Start from an empty registry.
        cleanup()

        labels = [f"my_cat {i}" for i in range(1, 11)]
        c = ak.Categorical(ak.array(labels))
        self.assertFalse(c.is_registered(), "test_me should be unregistered")
        # The chained call is deliberate: no extra local keeps the object
        # alive once `c` is rebound below.
        self.assertTrue(c.register("test_me").is_registered(),
                        "test_me categorical should be registered")

        # Should trigger destructor, but survive server deletion because it is registered
        c = None
        self.assertTrue(c is None, "The reference to `c` should be None")

        c = ak.Categorical.attach("test_me")
        self.assertTrue(c.is_registered(),
                        "test_me categorical should be registered after attach")

        c.unregister()
        self.assertFalse(c.is_registered(), "test_me should be unregistered")

        name_matches = c.register("another_name").name == "another_name"
        self.assertTrue(name_matches and c.is_registered())

        # Test static unregister_by_name
        ak.Categorical.unregister_categorical_by_name("another_name")
        self.assertFalse(c.is_registered(),
                         "another_name should be unregistered")

        # now mess with the subcomponents directly to test is_registered mis-match logic
        c.register("another_name")
        unregister_pdarray_by_name("another_name.codes")
        with pytest.raises(RegistrationError):
            c.is_registered()
Exemple #7
0
 def _get_ak_gremlins(self):
     """Return a ``self.Gremlins`` bundle of fixtures with gremlins appended.

     The bundle holds, in order: base words + gremlins, sampled strings +
     gremlins, the ndarray form of the latter, and its Categorical.
     """
     words_with_gremlins = ak.concatenate((self.base_words, self.gremlins))
     sampled = self.base_words[self.choices]
     strings_with_gremlins = ak.concatenate((sampled, self.gremlins))
     as_ndarray = strings_with_gremlins.to_ndarray()
     as_categorical = ak.Categorical(strings_with_gremlins)
     return self.Gremlins(words_with_gremlins, strings_with_gremlins,
                          as_ndarray, as_categorical)
Exemple #8
0
    def testIn1d(self):
        """Check ``ak.in1d`` with Categorical/Strings inputs and a bad type."""
        mod_three = [i % 3 for i in range(10)]
        mod_two = [i % 2 for i in range(10)]

        strings_one = ak.array([f'String {i}' for i in mod_three])
        strings_two = ak.array([f'String {i}' for i in mod_two])
        cat_one = ak.Categorical(strings_one)
        cat_two = ak.Categorical(strings_two)

        # The second array only contains 'String 0' and 'String 1'.
        expected = ak.array([value < 2 for value in mod_three])

        for second in (cat_two, strings_two):
            self.assertTrue((expected == ak.in1d(cat_one, second)).all())

        # An integer pdarray as the second argument is rejected.
        with self.assertRaises(TypeError) as cm:
            ak.in1d(cat_one, ak.randint(0, 5, 5))
        expected_message = (
            'type of argument "test" must be one of (Strings, Categorical); got '
            'arkouda.pdarrayclass.pdarray instead')
        self.assertEqual(expected_message, cm.exception.args[0])
Exemple #9
0
 def test_nunique_types(self):
     """Call ``GroupBy.nunique`` with every key/value type combination.

     Keys and values are drawn from a Strings array, its Categorical, an
     int array, and a tuple of all three; each group must report exactly
     one unique value.
     """
     strings = ak.array(['a', 'b', 'a', 'b', 'c'])
     categorical = ak.Categorical(strings)
     ints = ak.array([5, 3, 5, 3, 1])
     expected = ak.array([1, 1, 1])

     # Try GroupBy.nunique with every combination of types, including mixed
     candidates = (strings, categorical, ints, (strings, categorical, ints))
     for group_keys in candidates:
         grouping = ak.GroupBy(group_keys)
         for values in candidates:
             _, counts = grouping.nunique(values)
             self.assertTrue((counts == expected).all())
Exemple #10
0
    def testBaseCategorical(self):
        """Check codes, segments, categories, size and objtype of a Categorical."""
        source = ak.array([f'string {i}' for i in range(1, 11)])
        cat = ak.Categorical(source)

        expected_codes = ak.array([7, 5, 9, 8, 2, 1, 4, 0, 3, 6])
        self.assertTrue((expected_codes == cat.codes).all())

        expected_segments = ak.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        self.assertTrue((expected_segments == cat.segments).all())

        expected_categories = ak.array([
            'string 8', 'string 6', 'string 5', 'string 9', 'string 7',
            'string 2', 'string 10', 'string 1', 'string 4', 'string 3'
        ])
        self.assertTrue((expected_categories == cat.categories).all())

        self.assertEqual(10, cat.size)
        self.assertEqual('category', cat.objtype)
Exemple #11
0
 def setUp(self):
     """Build the random Strings/Categorical fixtures and print a few of them.

     Two batches of random printable words form ``self.base_words``; ``N``
     random indices select ``self.strings``.  The gremlins variants append
     the quote, single-space and empty-string edge cases.  The trailing
     prints dump selected fixtures to stdout for inspection.
     """
     self.maxDiff = None
     ArkoudaTest.setUp(self)

     uniform_words = ak.random_strings_uniform(
         1, 10, UNIQUE, characters='printable')
     lognormal_words = ak.random_strings_lognormal(
         2, 0.25, UNIQUE, characters='printable')
     edge_cases = ak.array(['"', ' ', ''])
     self.gremlins = edge_cases

     self.base_words = ak.concatenate((uniform_words, lognormal_words))
     self.np_base_words = np.hstack(
         (uniform_words.to_ndarray(), lognormal_words.to_ndarray()))

     picks = ak.randint(0, self.base_words.size, N)
     self.strings = self.base_words[picks]
     self.test_strings = self.strings.to_ndarray()
     self.cat = ak.Categorical(self.strings)

     # Pick the delimiter at random, weighted by character frequency.
     char_counts = Counter(''.join(self.base_words.to_ndarray()))
     symbols, weights = tuple(zip(*char_counts.items()))
     self.delim = np.random.choice(
         symbols, p=(np.array(weights) / sum(weights)))

     self.akset = set(ak.unique(self.strings).to_ndarray())

     # Gremlins are appended last, so `picks` still index the same words.
     words_with_gremlins = ak.concatenate(
         (uniform_words, lognormal_words, edge_cases))
     self.gremlins_base_words = words_with_gremlins
     self.gremlins_strings = ak.concatenate(
         (words_with_gremlins[picks], edge_cases))
     self.gremlins_test_strings = self.gremlins_strings.to_ndarray()
     self.gremlins_cat = ak.Categorical(self.gremlins_strings)

     # Diagnostic output of the generated fixtures.
     third = N // 3
     print("=================In Class will check===========================")
     print("")
     print(str(uniform_words))
     print("After base_word1 ")
     print("")
     print(str(self.strings))
     print("After Print strings")
     print(str(self.test_strings))
     print("")
     print("After Print teststrings")
     print(str(self.strings[third]))
     print("")
     print("After Print strings[N//3]")
     print(str(self.test_strings[third]))
     print("")
     print("After Print test_strings[N//3]")
Exemple #12
0
    def testBaseCategorical(self):
        """Check basic Categorical attributes and the non-Strings input error."""
        cat = self._getCategorical()

        expected_codes = ak.array([7, 5, 9, 8, 2, 1, 4, 0, 3, 6])
        self.assertTrue((expected_codes == cat.codes).all())

        expected_segments = ak.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        self.assertTrue((expected_segments == cat.segments).all())

        expected_categories = ak.array([
            'string 8', 'string 6', 'string 5', 'string 9', 'string 7',
            'string 2', 'string 10', 'string 1', 'string 4', 'string 3',
        ])
        self.assertTrue((expected_categories == cat.categories).all())

        self.assertEqual(10, cat.size)
        self.assertEqual('category', cat.objtype)

        # Building a Categorical from an integer pdarray must be rejected.
        with self.assertRaises(ValueError) as cm:
            ak.Categorical(ak.arange(0, 5, 10))
        self.assertEqual(
            'Categorical: inputs other than Strings not yet supported',
            cm.exception.args[0])
Exemple #13
0
    def test_in_place_info(self):
        """
        Tests the instance-level info method for pdarray, Strings, and
        Categorical before and after the object is registered.
        """

        def check_registration(obj, name):
            # Parse obj.info() as JSON and collect each entry's 'registered' flag.
            def registered_flags():
                return [sym['registered'] for sym in json.loads(obj.info())]

            self.assertFalse(
                any(registered_flags()),
                msg=f'no components of {name} should be registered before register call')
            obj.register(name)
            self.assertTrue(
                all(registered_flags()),
                msg=f'all components of {name} should be registered after register call')

        cleanup()

        my_pda = ak.ones(10, ak.int64)
        check_registration(my_pda, 'my_pda')

        my_str = ak.random_strings_uniform(
            1, 10, UNIQUE, characters='printable')
        check_registration(my_str, 'my_str')

        my_cat = ak.Categorical(
            ak.array([f"my_cat {i}" for i in range(1, 11)]))
        check_registration(my_cat, 'my_cat')

        cleanup()
    def testConcatenate(self):
        """
        Verify Categorical concatenation through both ``Categorical.concatenate``
        and ``ak.concatenate``.

        Covers: type/size of the result, absence of permutation and segments
        on the result, agreement with concatenating the underlying Strings
        (ordered and unordered), and length-1 inputs.
        """
        # The size assertions below rely on each helper call yielding 50 items.
        catOne = self._getCategorical('string', 51)
        catTwo = self._getCategorical('string-two', 51)

        resultCat = catOne.concatenate([catTwo])
        self.assertEqual('category', resultCat.objtype)
        self.assertIsInstance(resultCat, ak.Categorical)
        self.assertEqual(100, resultCat.size)

        # Since Categorical.concatenate uses Categorical.from_codes method, confirm
        # that both permutation and segments are None.  assertIsNone checks exactly
        # that; assertFalse would also accept any other falsy value and would have
        # to evaluate the truthiness of an array if one were present.
        self.assertIsNone(resultCat.permutation)
        self.assertIsNone(resultCat.segments)

        resultCat = ak.concatenate([catOne, catOne], ordered=False)
        self.assertEqual('category', resultCat.objtype)
        self.assertIsInstance(resultCat, ak.Categorical)
        self.assertEqual(100, resultCat.size)

        # Since Categorical.concatenate uses Categorical.from_codes method, confirm
        # that both permutation and segments are None
        self.assertIsNone(resultCat.permutation)
        self.assertIsNone(resultCat.segments)

        # Concatenate two Categoricals with different categories, and test result against original strings
        s1 = ak.array(['abc', 'de', 'abc', 'fghi', 'de'])
        s2 = ak.array(['jkl', 'mno', 'fghi', 'abc', 'fghi', 'mno'])
        c1 = ak.Categorical(s1)
        c2 = ak.Categorical(s2)
        # Ordered concatenation
        s12ord = ak.concatenate([s1, s2], ordered=True)
        c12ord = ak.concatenate([c1, c2], ordered=True)
        self.assertTrue((ak.Categorical(s12ord) == c12ord).all())
        # Unordered (but still deterministic) concatenation
        s12unord = ak.concatenate([s1, s2], ordered=False)
        c12unord = ak.concatenate([c1, c2], ordered=False)
        self.assertTrue((ak.Categorical(s12unord) == c12unord).all())

        # Tiny concatenation
        # Used to fail when length of array was less than numLocales
        # CI uses 2 locales, so try with length-1 arrays
        a = ak.Categorical(ak.array(['a']))
        b = ak.Categorical(ak.array(['b']))
        c = ak.concatenate((a, b), ordered=False)
        ans = ak.Categorical(ak.array(['a', 'b']))
        self.assertTrue((c == ans).all())
Exemple #15
0
 def setUp(self):
     """Build the random Strings/Categorical fixtures used by the tests.

     Two batches of random printable words are concatenated into
     ``self.base_words``; ``N`` random indices into it select
     ``self.strings``, from which the ndarray and Categorical fixtures are
     derived.  ``self.delim`` is a single character drawn at random,
     weighted by how often each character occurs in the base words.
     """
     ArkoudaTest.setUp(self)
     base_words1 = ak.random_strings_uniform(0,
                                             10,
                                             UNIQUE,
                                             characters='printable')
     base_words2 = ak.random_strings_lognormal(2,
                                               0.25,
                                               UNIQUE,
                                               characters='printable')
     self.base_words = ak.concatenate((base_words1, base_words2))
     self.np_base_words = np.hstack(
         (base_words1.to_ndarray(), base_words2.to_ndarray()))
     choices = ak.randint(0, self.base_words.size, N)
     self.strings = self.base_words[choices]
     self.test_strings = self.strings.to_ndarray()
     self.cat = ak.Categorical(self.strings)
     # Pull the words to the client before joining: str.join needs a local
     # iterable of str, and iterating the server-side Strings object directly
     # is either unsupported or costs one request per element.
     local_words = self.base_words.to_ndarray()
     x, w = tuple(zip(*Counter(''.join(local_words)).items()))
     self.delim = np.random.choice(x, p=(np.array(w) / sum(w)))
    def test_unused_categories_logic(self):
        """
        Check that Categoricals carrying unused categories (built by slicing
        or via from_codes) give the same in1d/unique results as plain Strings.
        """

        def as_list(arr):
            # Bring an arkouda array to the client as a Python list.
            return arr.to_ndarray().tolist()

        strings = ak.array([str(i) for i in range(10)])
        strings_slice = strings[1:3]
        cat = ak.Categorical(strings)
        cat_slice = cat[1:3]

        # Sliced Categorical versus sliced Strings.
        self.assertListEqual(as_list(ak.in1d(strings, strings_slice)),
                             as_list(ak.in1d(cat, cat_slice)))
        self.assertSetEqual(set(as_list(ak.unique(strings_slice))),
                            set(as_list(ak.unique(cat_slice))))

        # from_codes Categorical using only codes 1 and 2 of the ten categories.
        cat_from_codes = ak.Categorical.from_codes(ak.array([1, 2]), strings)
        self.assertListEqual(as_list(ak.in1d(strings, strings_slice)),
                             as_list(ak.in1d(cat, cat_from_codes)))
        self.assertSetEqual(set(as_list(ak.unique(strings_slice))),
                            set(as_list(ak.unique(cat_from_codes))))
Exemple #17
0
 def _getRandomizedCategorical(self) -> ak.Categorical:
     """Return a Categorical over a fixed ten-item list with repeated values."""
     values = [
         'string', 'string1', 'non-string', 'non-string2', 'string',
         'non-string', 'string3', 'non-string2', 'string', 'non-string',
     ]
     return ak.Categorical(ak.array(values))
Exemple #18
0
 def testGroup(self):
     """Check the permutation returned by ``Categorical.group()``."""
     source = ak.array([f'string {i}' for i in range(1, 11)])
     cat = ak.Categorical(source)
     expected = ak.array([7, 5, 4, 8, 6, 1, 9, 0, 3, 2])
     self.assertTrue((expected == cat.group()).all())
Exemple #19
0
 def testContains(self):
     """Every 'string N' value must report that it contains 'string'."""
     source = ak.array([f'string {i}' for i in range(1, 11)])
     cat = ak.Categorical(source)
     hits = cat.contains('string')
     self.assertTrue(hits.all())
 def _getCategorical(self,
                     prefix: str = 'string',
                     size: int = 11) -> ak.Categorical:
     """Return a Categorical of '<prefix> <i>' labels for i in 1..size-1."""
     labels = [f'{prefix} {i}' for i in range(1, size)]
     return ak.Categorical(ak.array(labels))
Exemple #21
0
    # Script-style smoke test: generate random strings, build a Categorical
    # from them, then run the index/slice helper checks on both.
    print("Running test from string_test.__main__")
    # Disabled alternative that built the test data from this file's own words:
    # with open(__file__, 'r') as f:
    #     base_words = np.array(f.read().split())
    # test_strings = np.random.choice(base_words, N, replace=True)
    # strings = ak.array(test_strings)

    # Two batches of random printable words (uniform and lognormal generators).
    base_words1 = ak.random_strings_uniform(1, 10, UNIQUE, characters='printable')
    base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable')
    # Edge-case strings: a double quote, a single space and the empty string.
    gremlins = ak.array(['"', ' ', ''])
    base_words = ak.concatenate((base_words1, base_words2))
    np_base_words = np.hstack((base_words1.to_ndarray(), base_words2.to_ndarray()))
    # Server-side concatenation must agree with numpy's hstack of the same data.
    # NOTE(review): compare_strings is defined elsewhere; presumably an
    # element-wise comparison - confirm.
    assert(compare_strings(base_words.to_ndarray(), np_base_words))
    # N random indices into base_words select the strings under test.
    choices = ak.randint(0, base_words.size, N)
    strings = base_words[choices]
    test_strings = strings.to_ndarray()
    cat = ak.Categorical(strings)
    print("strings =", strings)
    print("categorical =", cat)
    print("Generation and concatenate passed")

    # int index
    # The range covers the negative indices -len(gremlins) .. -1.
    run_test_index(strings, test_strings, cat, range(-len(gremlins), 0))
    print("int index passed")

    # slice
    run_test_slice(strings, test_strings, cat)
    print("slice passed")

    # pdarray int index
    run_test_pdarray_index(strings, test_strings, cat)
    print("pdarray int index passed")
Exemple #22
0
 def _get_categorical(self):
     """Return a new Categorical built from ``self.strings``."""
     source = self.strings
     return ak.Categorical(source)
Exemple #23
0
 def _getCategorical(self) -> ak.Categorical:
     """Return a Categorical over 'string 1' through 'string 10'."""
     labels = [f'string {i}' for i in range(1, 11)]
     return ak.Categorical(ak.array(labels))