def test_peel(self):
    """Exercise ``peel`` with several delimiter representations and verify raw values.

    Covers the fixture delimiter passed in three forms, the "gremlins"
    fixture with awkward delimiters, and a small hand-written series whose
    underlying values are compared against ``convert_to_ord`` output.
    """
    # Same delimiter, three representations: as stored on the fixture,
    # as a numpy string scalar, and as encoded bytes. Each conversion is
    # applied just before its own run.
    as_is = lambda d: d  # noqa: E731
    to_bytes = lambda d: str(d).encode()  # noqa: E731
    for convert in (as_is, np.str_, to_bytes):
        run_test_peel(self.strings, self.test_strings, convert(self.delim))

    # Gremlins delimiters: the empty delimiter must be rejected with
    # ValueError; the quote and single-space delimiters run normally.
    gremlins = self._get_ak_gremlins()
    with self.assertRaises(ValueError):
        run_test_peel(gremlins.gremlins_strings, gremlins.gremlins_test_strings, '')
    for awkward_delim in ('"', ' '):
        run_test_peel(
            gremlins.gremlins_strings, gremlins.gremlins_test_strings, awkward_delim
        )

    # Fixed input so the underlying values can be compared exactly.
    source = pd.Series(["k1:v1", "k2:v2", "k3:v3", "no_colon"])
    strings = ak.from_series(source, "string")

    # convert_to_ord turns the Python strings into the null-terminated byte
    # values that the server is expected to hold for the Strings object.
    want_values = convert_to_ord(source.to_list())
    got_values = strings._comp_to_ndarray("values").tolist()
    self.assertListEqual(want_values, got_values)

    # Peel on ":"; a string without the delimiter yields an empty left
    # part and the whole string on the right.
    left, right = strings.peel(":")
    want_left = convert_to_ord(["k1", "k2", "k3", ""])
    want_right = convert_to_ord(["v1", "v2", "v3", "no_colon"])
    for want, piece in ((want_left, left), (want_right, right)):
        self.assertListEqual(want, piece._comp_to_ndarray("values").tolist())
def test_peel_delimiter_length_issue(self):
    """Regression test for Issue 838: ``peel`` with a long (25-character) delimiter.

    Two input sets are used because, per the original note, each variant
    was observed to fail independently at one point.
    """
    delim = "-" * 25  # 25 dashes as delimiter

    # (input strings, expected left parts, expected right parts)
    scenarios = (
        (
            [f"abc{delim}xyz", f"small{delim}dog", f"blue{delim}hat", "last"],
            ["abc", "small", "blue", ""],
            ["xyz", "dog", "hat", "last"],
        ),
        # Slight permutation of the first scenario.
        (
            [f"abc{delim}xyz", f"small{delim}dog", "last"],
            ["abc", "small", ""],
            ["xyz", "dog", "last"],
        ),
    )

    for raw, want_left, want_right in scenarios:
        strings = ak.from_series(pd.Series(raw))
        left, right = strings.peel(delim)
        got_left = left.to_ndarray().tolist()
        got_right = right.to_ndarray().tolist()
        self.assertListEqual(want_left, got_left)
        self.assertListEqual(want_right, got_right)
def test_from_series(self):
    """Verify ``ak.from_series`` across pandas dtypes and its error paths.

    Checks that string series become ``ak.Strings``; that int, float, bool
    and datetime series become ``ak.pdarray`` with the asserted dtype
    (datetimes are asserted to come back as int64); that a non-Series
    argument raises ``TypeError``; and that an unsupported dtype (int8)
    raises ``ValueError`` with the expected message.
    """
    # --- strings -------------------------------------------------------
    strings = ak.from_series(
        pd.Series(['a', 'b', 'c', 'd', 'e'], dtype="string"))
    self.assertIsInstance(strings, ak.Strings)
    self.assertEqual(5, len(strings))

    # The builtin ``str`` is used instead of ``np.str``: the latter was only
    # an alias of the builtin and has been removed from NumPy (1.24+).
    objects = ak.from_series(pd.Series(['a', 'b', 'c', 'd', 'e']), dtype=str)
    self.assertIsInstance(objects, ak.Strings)
    self.assertEqual(str, objects.dtype)

    objects = ak.from_series(pd.Series(['a', 'b', 'c', 'd', 'e']))
    self.assertIsInstance(objects, ak.Strings)
    self.assertEqual(str, objects.dtype)

    # --- integers ------------------------------------------------------
    p_array = ak.from_series(pd.Series(np.random.randint(0, 10, 10)))
    self.assertIsInstance(p_array, ak.pdarray)
    self.assertEqual(np.int64, p_array.dtype)

    # Object-typed series need the target dtype spelled out.
    p_i_objects_array = ak.from_series(
        pd.Series(np.random.randint(0, 10, 10), dtype='object'),
        dtype=np.int64)
    self.assertIsInstance(p_i_objects_array, ak.pdarray)
    self.assertEqual(np.int64, p_i_objects_array.dtype)

    # --- floats --------------------------------------------------------
    p_array = ak.from_series(
        pd.Series(np.random.uniform(low=0.0, high=1.0, size=10)))
    self.assertIsInstance(p_array, ak.pdarray)
    self.assertEqual(np.float64, p_array.dtype)

    p_f_objects_array = ak.from_series(
        pd.Series(np.random.uniform(low=0.0, high=1.0, size=10),
                  dtype='object'),
        dtype=np.float64)
    self.assertIsInstance(p_f_objects_array, ak.pdarray)
    self.assertEqual(np.float64, p_f_objects_array.dtype)

    # --- booleans ------------------------------------------------------
    p_array = ak.from_series(
        pd.Series(np.random.choice([True, False], size=10)))
    self.assertIsInstance(p_array, ak.pdarray)
    self.assertEqual(bool, p_array.dtype)

    # Builtin ``bool`` replaces the deprecated/removed ``np.bool`` alias.
    p_b_objects_array = ak.from_series(
        pd.Series(np.random.choice([True, False], size=10),
                  dtype='object'),
        dtype=bool)
    self.assertIsInstance(p_b_objects_array, ak.pdarray)
    self.assertEqual(bool, p_b_objects_array.dtype)

    # --- datetimes (asserted to arrive as int64) -------------------------
    p_array = ak.from_series(pd.Series([dt.datetime(2016, 1, 1, 0, 0, 1)]))
    self.assertIsInstance(p_array, ak.pdarray)
    self.assertEqual(np.int64, p_array.dtype)

    p_array = ak.from_series(pd.Series([np.datetime64('2018-01-01')]))
    self.assertIsInstance(p_array, ak.pdarray)
    self.assertEqual(np.int64, p_array.dtype)

    p_array = ak.from_series(
        pd.Series(
            pd.to_datetime([
                '1/1/2018',
                np.datetime64('2018-01-01'),
                dt.datetime(2018, 1, 1)
            ])))
    self.assertIsInstance(p_array, ak.pdarray)
    self.assertEqual(np.int64, p_array.dtype)

    # --- error paths ---------------------------------------------------
    # The message checks sit after each ``with`` block so they actually run
    # once the expected exception has been captured.
    with self.assertRaises(TypeError) as cm:
        ak.from_series(np.ones(100))
    self.assertEqual(
        ('type of argument "series" must be pandas.core.series.Series; ' +
         'got numpy.ndarray instead'), cm.exception.args[0])

    with self.assertRaises(ValueError) as cm:
        ak.from_series(
            pd.Series(np.random.randint(0, 10, 10), dtype=np.int8))
    self.assertEqual(
        ('dtype int8 is unsupported. Supported dtypes are bool, ' +
         'float64, int64, string, datetime64[ns], and timedelta64[ns]'),
        cm.exception.args[0])