def test_categorical_set_categories():
    """Check ``cat.set_categories`` against pandas for added and removed categories."""
    source = pd.Categorical(['a', 'a', 'b', 'c', 'a'],
                            categories=['a', 'b', 'c'])
    pandas_series = pd.Series(source)
    series = Series.from_categorical(source)

    # first case adds a category ('d'); second case removes one ('c')
    cases = (
        ('a', 'b', 'c', 'd'),
        ('a', 'b'),
    )
    for new_categories in cases:
        # hand each call its own fresh list, as separate literals would
        expect = pandas_series.cat.set_categories(list(new_categories))
        got = series.cat.set_categories(list(new_categories))
        assert_eq(expect, got)
def test_categorical_set_categories():
    """Verify that setting categories on a categorical series agrees with pandas.

    Two scenarios are exercised in order: extending the category set with a
    new label, then shrinking it so that an existing label is dropped.
    """
    base_cat = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    reference = pd.Series(base_cat)
    candidate = Series.from_categorical(base_cat)

    def compare(*labels):
        # build an independent list for each side of the comparison
        expect = reference.cat.set_categories(list(labels))
        got = candidate.cat.set_categories(list(labels))
        assert_eq(expect, got)

    # adding category
    compare("a", "b", "c", "d")
    # removing category
    compare("a", "b")
def _make_dictionary_series(self):
    """Make a dictionary-encoded series from this node.

    The dictionary referenced by ``self.field_schema['dictionary']['id']``
    is looked up in ``self.schema['dictionaries']``. Its first column's
    ``DATA`` entries become the categories, and ``self.data`` is passed as
    the codes of the resulting series.

    Returns
    -------
    The result of ``Series.from_categorical`` for the categories and codes.

    Raises
    ------
    MetadataParsingError
        If the dictionary index type is not ``'int'``, or if the schema
        holds no dictionary with the referenced id.
    """
    assert self.is_dictionary
    # create dictionary-encoded column
    dict_meta = self.field_schema['dictionary']
    dictid = dict_meta['id']    # start from 1
    if dict_meta['indexType']['name'] != 'int':
        msg = 'non integer type index for dictionary'
        raise MetadataParsingError(msg)
    ordered = dict_meta['isOrdered']
    # find dictionary
    for dictionary in self.schema['dictionaries']:
        if dictionary['id'] == dictid:
            break
    else:
        # Without this branch a missing id would silently fall through
        # with the last dictionary in the list (or fail with a NameError
        # when the list is empty) instead of reporting the real problem.
        msg = 'dictionary id {} not found in schema'.format(dictid)
        raise MetadataParsingError(msg)
    categories = dictionary['data']['columns'][0]['DATA']
    # make dummy categorical: it only carries the categories and ordering;
    # the actual codes are supplied separately below
    cat = pd.Categorical([], categories=categories, ordered=ordered)
    # make the series
    return Series.from_categorical(cat, codes=self.data)
def test_categorical_unique_count(nelem):
    """Check ``unique_count`` of a categorical column against pandas.

    ``nelem`` is the number of random alphanumeric characters drawn (with a
    fixed seed) to build the categorical data.
    """
    from string import ascii_letters, digits

    # create categorical series
    np.random.seed(12)
    alphabet = list(ascii_letters + digits)
    samples = np.random.choice(alphabet, nelem)
    pd_cat = pd.Categorical(pd.Series(samples, dtype='category'))

    # gdf
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_unique_count = gdf['a'].unique_count()

    # pandas
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    expected_count = len(pdf['a'].unique())

    # verify
    assert gdf_unique_count == expected_count
def test_categorical_unique(num_elements):
    """Check that the sorted unique values of a categorical column match pandas.

    ``num_elements`` is the number of random alphanumeric characters drawn
    (with a fixed seed) to build the categorical data.
    """
    from string import ascii_letters, digits

    # create categorical series
    np.random.seed(12)
    symbols = list(ascii_letters + digits)
    drawn = np.random.choice(symbols, num_elements)
    pd_cat = pd.Categorical(pd.Series(drawn, dtype='category'))

    # gdf
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_unique = gdf['a'].unique()
    gdf_unique_sorted = np.sort(gdf_unique)

    # pandas
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    pdf_unique = pdf['a'].unique()
    pdf_unique_sorted = np.sort(pdf_unique)

    # verify
    np.testing.assert_array_equal(pdf_unique_sorted, gdf_unique_sorted)
def test_categorical_value_counts(num_elements):
    """Check ``value_counts`` of a categorical column against pandas.

    Both results are converted to plain dictionaries before comparison, so
    only the value -> count mapping is compared, not the ordering.
    """
    from string import ascii_letters, digits

    # create categorical series
    np.random.seed(12)
    population = list(ascii_letters + digits)
    picks = np.random.choice(population, num_elements)
    pd_cat = pd.Categorical(pd.Series(picks, dtype='category'))

    # gdf
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_value_counts = gdf['a'].value_counts()

    # pandas
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    pdf_value_counts = pdf['a'].value_counts()

    # verify
    pandas_dict = pdf_value_counts.to_dict()
    gdf_dict = gdf_value_counts.to_pandas().to_dict()
    assert pandas_dict == gdf_dict