def test_include_na(self):
    # Covers three inputs: NaN mixed with labels (with and without
    # dummy_na) and a list holding only NaN.
    values = ['a', 'b', np.nan]

    without_na = get_dummies(values, sparse=self.sparse)
    want_without_na = DataFrame({
        'a': {0: 1.0, 1: 0.0, 2: 0.0},
        'b': {0: 0.0, 1: 1.0, 2: 0.0},
    })
    assert_frame_equal(without_na, want_without_na)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    with_na = get_dummies(values, dummy_na=True, sparse=self.sparse)
    indicator_columns = {
        nan: {0: 0.0, 1: 0.0, 2: 1.0},
        'a': {0: 1.0, 1: 0.0, 2: 0.0},
        'b': {0: 0.0, 1: 1.0, 2: 0.0},
    }
    want_with_na = DataFrame(indicator_columns).reindex_axis(
        ['a', 'b', nan], 1)
    # hack (NaN handling in assert_index_equal)
    want_with_na.columns = with_na.columns
    assert_frame_equal(with_na, want_with_na)

    # Only the raw values are compared for the all-NaN input.
    only_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
    want_only_na = DataFrame(Series(1.0, index=[0]), columns=[nan])
    tm.assert_numpy_array_equal(only_na.values, want_only_na.values)
def test_basic_types(self):
    # GH 10531
    letters = list('abc')
    letter_series = Series(letters)
    frame = DataFrame({'a': [0, 1, 0, 1, 2],
                       'b': ['A', 'A', 'B', 'C', 'C'],
                       'c': [2, 3, 3, 3, 2]})

    # Expected container and block classes depend on the sparse flag.
    if self.sparse:
        want_frame_cls = SparseDataFrame
        want_block_cls = pd.core.internals.SparseBlock
    else:
        want_frame_cls = DataFrame
        want_block_cls = pd.core.internals.FloatBlock

    for source in (letters, letter_series):
        self.assertEqual(
            type(get_dummies(source, sparse=self.sparse)), want_frame_cls)

    encoded = get_dummies(frame, sparse=self.sparse, columns=frame.columns)
    self.assertEqual(type(encoded), want_frame_cls)

    # Each dummy column produced from 'a' is checked for its block class.
    encoded = get_dummies(frame, sparse=self.sparse, columns=['a'])
    for label in ('a_0', 'a_1', 'a_2'):
        first_block = encoded[[label]]._data.blocks[0]
        self.assertEqual(type(first_block), want_block_cls)
def test_basic_types(self):
    # GH 10531
    letters = list('abc')
    letter_series = Series(letters)
    frame = DataFrame({'a': [0, 1, 0, 1, 2],
                       'b': ['A', 'A', 'B', 'C', 'C'],
                       'c': [2, 3, 3, 3, 2]})

    indicator = DataFrame({'a': [1, 0, 0],
                           'b': [0, 1, 0],
                           'c': [0, 0, 1]},
                          dtype='uint8',
                          columns=list('abc'))

    # Pick the comparison helper (and expected container) for the mode.
    if self.sparse:
        indicator = indicator.to_sparse(fill_value=0, kind='integer')
        check = tm.assert_sp_frame_equal
    else:
        check = tm.assert_frame_equal

    for source in (letters, letter_series):
        encoded = get_dummies(source, sparse=self.sparse)
        check(encoded, indicator)

    # Encoding every column: all eight resulting columns are uint8.
    encoded = get_dummies(frame, sparse=self.sparse, columns=frame.columns)
    tm.assert_series_equal(encoded.get_dtype_counts(),
                           Series({'uint8': 8}))

    # Encoding only 'a': the other two columns keep their own dtypes.
    encoded = get_dummies(frame, sparse=self.sparse, columns=['a'])
    want_counts = Series({'uint8': 3, 'int64': 1,
                          'object': 1}).sort_values()
    got_counts = encoded.get_dtype_counts().sort_values()
    tm.assert_series_equal(got_counts, want_counts)
def test_basic(self):
    # The same three labels are encoded from a list, a Series and a
    # Series with a custom index.
    letters = list('abc')
    plain_series = Series(letters)
    indexed_series = Series(letters, list('ABC'))

    indicator = DataFrame({
        'a': {0: 1.0, 1: 0.0, 2: 0.0},
        'b': {0: 0.0, 1: 1.0, 2: 0.0},
        'c': {0: 0.0, 1: 0.0, 2: 1.0},
    })
    for source in (letters, plain_series):
        assert_frame_equal(get_dummies(source), indicator)

    # The indexed input is compared against the same frame, relabelled.
    indicator.index = list('ABC')
    assert_frame_equal(get_dummies(indexed_series), indicator)
def test_basic(self):
    # The same three labels are encoded from a list, a Series and a
    # Series with a custom index.
    letters = list("abc")
    plain_series = Series(letters)
    indexed_series = Series(letters, list("ABC"))

    indicator = DataFrame(
        {
            "a": {0: 1.0, 1: 0.0, 2: 0.0},
            "b": {0: 0.0, 1: 1.0, 2: 0.0},
            "c": {0: 0.0, 1: 0.0, 2: 1.0},
        }
    )
    for source in (letters, plain_series):
        assert_frame_equal(get_dummies(source), indicator)

    # The indexed input is compared against the same frame, relabelled.
    indicator.index = list("ABC")
    assert_frame_equal(get_dummies(indexed_series), indicator)
def test_dataframe_dummies_with_na(self):
    # frame is self.df itself (not a copy), so the all-NaN row added
    # below is written to the fixture.
    frame = self.df
    frame.loc[3, :] = [np.nan, np.nan, np.nan]

    with_na_order = ['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
    without_na_order = ['C', 'A_a', 'A_b', 'B_b', 'B_c']

    encoded = get_dummies(frame, dummy_na=True)
    wanted = DataFrame({'C': [1, 2, 3, np.nan],
                        'A_a': [1., 0, 1, 0],
                        'A_b': [0., 1, 0, 0],
                        'A_nan': [0., 0, 0, 1],
                        'B_b': [1., 1, 0, 0],
                        'B_c': [0., 0, 1, 0],
                        'B_nan': [0., 0, 0, 1]})
    wanted = wanted[with_na_order]
    assert_frame_equal(encoded, wanted)

    # Without dummy_na the expected frame is the same minus *_nan columns.
    encoded = get_dummies(frame, dummy_na=False)
    wanted = wanted[without_na_order]
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_prefix_sep(self):
    # prefix_sep is exercised as a single string, a list and a dict.
    frame = self.df

    encoded = get_dummies(frame, prefix_sep="..")
    wanted = DataFrame(
        {
            "C": [1, 2, 3],
            "A..a": [1.0, 0, 1],
            "A..b": [0.0, 1, 0],
            "B..b": [1.0, 1, 0],
            "B..c": [0.0, 0, 1],
        }
    )
    wanted = wanted[["C", "A..a", "A..b", "B..b", "B..c"]]
    assert_frame_equal(encoded, wanted)

    # Per-column separators: only the B columns change their labels.
    relabel = {"B..b": "B__b", "B..c": "B__c"}
    encoded = get_dummies(frame, prefix_sep=["..", "__"])
    wanted = wanted.rename(columns=relabel)
    assert_frame_equal(encoded, wanted)

    encoded = get_dummies(frame, prefix_sep={"A": "..", "B": "__"})
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_prefix_sep(self):
    # prefix_sep is exercised as a single string, a list and a dict.
    frame = self.df

    encoded = get_dummies(frame, prefix_sep='..')
    wanted = DataFrame({
        'C': [1, 2, 3],
        'A..a': [1., 0, 1],
        'A..b': [0., 1, 0],
        'B..b': [1., 1, 0],
        'B..c': [0., 0, 1],
    })
    wanted = wanted[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
    assert_frame_equal(encoded, wanted)

    # Per-column separators: only the B columns change their labels.
    relabel = {'B..b': 'B__b', 'B..c': 'B__c'}
    encoded = get_dummies(frame, prefix_sep=['..', '__'])
    wanted = wanted.rename(columns=relabel)
    assert_frame_equal(encoded, wanted)

    encoded = get_dummies(frame, prefix_sep={'A': '..', 'B': '__'})
    assert_frame_equal(encoded, wanted)
def test_get_dummies(self):
    # Check that get_dummies on a 'Label' column built from the second
    # level of the panel index labels yields the same values as
    # make_axis_dummies(self.panel, 'minor').
    from pandas.core.reshape import get_dummies, make_axis_dummies

    # Writes a new 'Label' column onto the self.panel fixture.
    self.panel['Label'] = self.panel.index.labels[1]
    minor_dummies = make_axis_dummies(self.panel, 'minor')
    dummies = get_dummies(self.panel['Label'])

    # assertTrue replaces the deprecated ``assert_`` alias, which was
    # removed from unittest in Python 3.12.
    self.assertTrue(np.array_equal(dummies.values, minor_dummies.values))
def test_just_na(self):
    # A single NaN is encoded from a list, a Series and an indexed Series;
    # each result must be empty while keeping the input's index.
    na_values = [np.nan]
    na_series = Series(na_values)
    na_series_labelled = Series(na_values, index=["A"])

    from_list = get_dummies(na_values)
    from_series = get_dummies(na_series)
    from_labelled = get_dummies(na_series_labelled)

    for encoded in (from_list, from_series, from_labelled):
        self.assertEqual(encoded.empty, True)

    index_checks = (
        (from_list, [0]),
        (from_series, [0]),
        (from_labelled, ["A"]),
    )
    for encoded, want_index in index_checks:
        self.assertEqual(encoded.index.tolist(), want_index)
def test_dataframe_dummies_subset(self):
    # Only column 'A' is encoded; 'B' and 'C' are expected unchanged.
    frame = self.df
    encoded = get_dummies(frame, prefix=['from_A'], columns=['A'])

    wanted_columns = {
        'from_A_a': [1., 0, 1],
        'from_A_b': [0., 1, 0],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    }
    wanted = DataFrame(wanted_columns)
    assert_frame_equal(encoded, wanted)
def test_just_na(self):
    # A single NaN is encoded from a list, a Series and an indexed Series;
    # each result must be empty while keeping the input's index.
    na_values = [np.nan]
    na_series = Series(na_values)
    na_series_labelled = Series(na_values, index=['A'])

    from_list = get_dummies(na_values, sparse=self.sparse)
    from_series = get_dummies(na_series, sparse=self.sparse)
    from_labelled = get_dummies(na_series_labelled, sparse=self.sparse)

    for encoded in (from_list, from_series, from_labelled):
        self.assertEqual(encoded.empty, True)

    index_checks = (
        (from_list, [0]),
        (from_series, [0]),
        (from_labelled, ['A']),
    )
    for encoded, want_index in index_checks:
        self.assertEqual(encoded.index.tolist(), want_index)
def split_test_train(pitcher_df, date, date_col='date'):
    '''Split pitcher data into training/testing features and targets.

    Categorical (non-numeric) feature columns are binarized into their own
    indicator columns with ``get_dummies``.  Rows whose ``date_col`` value
    is before ``date`` go to the training set; rows on or after ``date`` go
    to the testing set.

    Input Args:
        pitcher_df: Pandas dataframe containing pitch data (one or more
            pitchers); must contain a ``pitch_type`` column and ``date_col``
        date: cutoff for splitting test/train (e.g. a string in the form
            yyyy-mm-dd); compared against ``date_col`` with ``<`` / ``>=``
        date_col: name of the column holding the pitch date

    Output:
        Dictionary containing:
            train_data: Pandas feature DF for training data
            train_targets: Pandas Series of training data targets (pitch_type)
            test_data: Pandas feature DF for testing data
            test_targets: Pandas Series of testing data targets (pitch_type)'''
    # Public import location; ``pandas.core.reshape`` is a private path.
    from pandas import get_dummies

    # Build the row masks from the untouched date column.  Both comparisons
    # are kept explicit so rows with a missing date fall into neither set.
    is_train = pitcher_df[date_col] < date
    is_test = pitcher_df[date_col] >= date

    # Remove the target and the date BEFORE encoding: a string-typed date
    # column would otherwise be one-hot encoded and no longer exist under
    # its own name.
    features = get_dummies(pitcher_df.drop(['pitch_type', date_col], axis=1))
    targets = pitcher_df['pitch_type']

    return {
        'train_data': features[is_train],
        'train_targets': targets[is_train].astype('category'),
        'test_data': features[is_test],
        'test_targets': targets[is_test].astype('category'),
    }
def test_dataframe_dummies_mix_default(self):
    # Default call on the mixed fixture: 'C' stays first, followed by the
    # dummy columns for A and B.
    frame = self.df
    encoded = get_dummies(frame)

    column_order = ["C", "A_a", "A_b", "B_b", "B_c"]
    wanted = DataFrame(
        {
            "C": [1, 2, 3],
            "A_a": [1.0, 0, 1],
            "A_b": [0.0, 1, 0],
            "B_b": [1.0, 1, 0],
            "B_c": [0.0, 0, 1],
        }
    )
    wanted = wanted[column_order]
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_all_obj(self):
    # Only the 'A' and 'B' columns of the fixture are passed in.
    object_columns = self.df[['A', 'B']]
    encoded = get_dummies(object_columns, sparse=self.sparse)

    wanted = DataFrame({
        'A_a': [1., 0, 1],
        'A_b': [0., 1, 0],
        'B_b': [1., 1, 0],
        'B_c': [0., 0, 1],
    })
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_mix_default(self):
    # Default call on the mixed fixture: 'C' stays first, followed by the
    # dummy columns for A and B.
    frame = self.df
    encoded = get_dummies(frame)

    column_order = ['C', 'A_a', 'A_b', 'B_b', 'B_c']
    wanted = DataFrame({
        'C': [1, 2, 3],
        'A_a': [1., 0, 1],
        'A_b': [0., 1, 0],
        'B_b': [1., 1, 0],
        'B_c': [0., 0, 1],
    })
    wanted = wanted[column_order]
    assert_frame_equal(encoded, wanted)
def test_include_na(self):
    # Covers three inputs: NaN mixed with labels (with and without
    # dummy_na) and a list holding only NaN.
    values = ["a", "b", np.nan]

    without_na = get_dummies(values)
    want_without_na = DataFrame(
        {
            "a": {0: 1.0, 1: 0.0, 2: 0.0},
            "b": {0: 0.0, 1: 1.0, 2: 0.0},
        }
    )
    assert_frame_equal(without_na, want_without_na)

    with_na = get_dummies(values, dummy_na=True)
    indicator_columns = {
        nan: {0: 0.0, 1: 0.0, 2: 1.0},
        "a": {0: 1.0, 1: 0.0, 2: 0.0},
        "b": {0: 0.0, 1: 1.0, 2: 0.0},
    }
    want_with_na = DataFrame(indicator_columns).reindex_axis(
        ["a", "b", nan], 1)
    # hack (NaN handling in assert_index_equal)
    want_with_na.columns = with_na.columns
    assert_frame_equal(with_na, want_with_na)

    # Only the raw values are compared for the all-NaN input.
    only_na = get_dummies([nan], dummy_na=True)
    want_only_na = DataFrame(Series(1.0, index=[0]), columns=[nan])
    assert_array_equal(only_na.values, want_only_na.values)
def test_unicode(self):
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata

    plain_e = 'e'
    accented_e = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    letters = [plain_e, accented_e, accented_e]

    encoded = get_dummies(letters, prefix='letter')

    # The accented column label is built from the looked-up character.
    accented_label = u('letter_%s') % accented_e
    wanted = DataFrame({
        'letter_e': {0: 1.0, 1: 0.0, 2: 0.0},
        accented_label: {0: 0.0, 1: 1.0, 2: 1.0},
    })
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_prefix_str(self):
    # not that you should do this...
    # A single string prefix is shared by both encoded columns, so the
    # expected frame carries the label 'bad_b' twice.
    frame = self.df
    encoded = get_dummies(frame, prefix='bad', sparse=self.sparse)

    rows = [
        [1, 1., 0., 1., 0.],
        [2, 0., 1., 1., 0.],
        [3, 1., 0., 0., 1.],
    ]
    labels = ['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']
    wanted = DataFrame(rows, columns=labels)
    assert_frame_equal(encoded, wanted)
def test_datafrmae_dummies_prefix_str(self):
    # not that you should do this...
    # NOTE(review): the method name misspells "dataframe"; it is left
    # unchanged here so the test keeps its existing identifier.
    # A single string prefix is shared by both encoded columns, so the
    # expected frame carries the label "bad_b" twice.
    frame = self.df
    encoded = get_dummies(frame, prefix="bad")

    rows = [
        [1, 1.0, 0.0, 1.0, 0.0],
        [2, 0.0, 1.0, 1.0, 0.0],
        [3, 1.0, 0.0, 0.0, 1.0],
    ]
    labels = ["C", "bad_a", "bad_b", "bad_b", "bad_c"]
    wanted = DataFrame(rows, columns=labels)
    assert_frame_equal(encoded, wanted)
def test_basic_drop_first_one_level(self):
    # Test the case that categorical variable only has one level.
    # With drop_first=True the expected frame has an index but no columns.
    letters = list('aaa')
    plain_series = Series(letters)
    indexed_series = Series(letters, list('ABC'))

    wanted = DataFrame(index=np.arange(3))
    for source in (letters, plain_series):
        encoded = get_dummies(source, sparse=self.sparse, drop_first=True)
        assert_frame_equal(encoded, wanted)

    wanted = DataFrame(index=list('ABC'))
    encoded = get_dummies(indexed_series, sparse=self.sparse,
                          drop_first=True)
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_prefix_dict(self):
    # Prefixes are supplied as a mapping from column name to prefix.
    prefix_by_column = {'A': 'from_A', 'B': 'from_B'}
    frame = DataFrame({
        'A': ['a', 'b', 'a'],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    })

    encoded = get_dummies(frame, prefix=prefix_by_column)

    wanted_columns = {
        'from_A_a': [1., 0, 1],
        'from_A_b': [0., 1, 0],
        'from_B_b': [1., 1, 0],
        'from_B_c': [0., 0, 1],
        'C': [1, 2, 3],
    }
    wanted = DataFrame(wanted_columns)
    assert_frame_equal(encoded, wanted)
def test_unicode(self):
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata

    plain_e = "e"
    accented_e = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
    letters = [plain_e, accented_e, accented_e]

    encoded = get_dummies(letters, prefix="letter")

    # The accented column label is built from the looked-up character.
    accented_label = u("letter_%s") % accented_e
    wanted = DataFrame(
        {
            "letter_e": {0: 1.0, 1: 0.0, 2: 0.0},
            accented_label: {0: 0.0, 1: 1.0, 2: 1.0},
        }
    )
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_drop_first_with_categorical(self):
    # frame is self.df itself (not a copy), so the 'cat' column added
    # below is written to the fixture.
    frame = self.df
    frame['cat'] = pd.Categorical(['x', 'y', 'y'])

    encoded = get_dummies(frame, sparse=self.sparse, drop_first=True)

    # One dummy per encoded column remains after drop_first.
    column_order = ['C', 'A_b', 'B_c', 'cat_y']
    wanted = DataFrame({
        'C': [1, 2, 3],
        'A_b': [0., 1, 0],
        'B_c': [0., 0, 1],
        'cat_y': [0., 1, 1],
    })
    wanted = wanted[column_order]
    assert_frame_equal(encoded, wanted)
def test_include_na(self):
    # Covers three inputs: NaN mixed with labels (with and without
    # dummy_na) and a list holding only NaN.
    values = ['a', 'b', np.nan]

    without_na = get_dummies(values)
    want_without_na = DataFrame({
        'a': {0: 1.0, 1: 0.0, 2: 0.0},
        'b': {0: 0.0, 1: 1.0, 2: 0.0},
    })
    assert_frame_equal(without_na, want_without_na)

    with_na = get_dummies(values, dummy_na=True)
    indicator_columns = {
        nan: {0: 0.0, 1: 0.0, 2: 1.0},
        'a': {0: 1.0, 1: 0.0, 2: 0.0},
        'b': {0: 0.0, 1: 1.0, 2: 0.0},
    }
    want_with_na = DataFrame(indicator_columns).reindex_axis(
        ['a', 'b', nan], 1)
    # hack (NaN handling in assert_index_equal)
    want_with_na.columns = with_na.columns
    assert_frame_equal(with_na, want_with_na)

    only_na = get_dummies([nan], dummy_na=True)
    want_only_na = DataFrame({nan: {0: 1.0}})
    # hack (NaN handling in assert_index_equal)
    want_only_na.columns = only_na.columns
    assert_frame_equal(only_na, want_only_na)
def test_basic(self):
    # The same three labels are encoded from a list, a Series and a
    # Series with a custom index; the expected dummies are uint8.
    letters = list('abc')
    plain_series = Series(letters)
    indexed_series = Series(letters, list('ABC'))

    indicator = DataFrame({
        'a': {0: 1, 1: 0, 2: 0},
        'b': {0: 0, 1: 1, 2: 0},
        'c': {0: 0, 1: 0, 2: 1},
    }, dtype=np.uint8)
    for source in (letters, plain_series):
        encoded = get_dummies(source, sparse=self.sparse)
        assert_frame_equal(encoded, indicator)

    # The indexed input is compared against the same frame, relabelled.
    indicator.index = list('ABC')
    encoded = get_dummies(indexed_series, sparse=self.sparse)
    assert_frame_equal(encoded, indicator)
def test_dataframe_dummies_prefix_str(self):
    # not that you should do this...
    # A single string prefix is shared by both encoded columns, so the
    # expected frame carries the label 'bad_b' twice.
    frame = self.df
    encoded = get_dummies(frame, prefix='bad', sparse=self.sparse)

    rows = [
        [1, 1, 0, 1, 0],
        [2, 0, 1, 1, 0],
        [3, 1, 0, 0, 1],
    ]
    labels = ['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']
    wanted = DataFrame(rows, columns=labels, dtype=np.uint8)
    # 'C' is not a dummy column, so it is cast back to int64.
    wanted = wanted.astype({"C": np.int64})
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_subset(self):
    # Only column 'A' is encoded; 'B' and 'C' are expected unchanged.
    frame = self.df
    encoded = get_dummies(frame, prefix=['from_A'], columns=['A'],
                          sparse=self.sparse)

    wanted = DataFrame({
        'from_A_a': [1, 0, 1],
        'from_A_b': [0, 1, 0],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    })
    # Only the dummy columns are expected as uint8.
    dummy_labels = ['from_A_a', 'from_A_b']
    wanted[dummy_labels] = wanted[dummy_labels].astype(np.uint8)
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_prefix_sep(self):
    # prefix_sep is exercised as a single string, a list and a dict.
    frame = self.df

    encoded = get_dummies(frame, prefix_sep='..', sparse=self.sparse)
    wanted = DataFrame({
        'C': [1, 2, 3],
        'A..a': [1, 0, 1],
        'A..b': [0, 1, 0],
        'B..b': [1, 1, 0],
        'B..c': [0, 0, 1],
    })
    wanted = wanted[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
    # Every column after 'C' is a dummy column and is expected as uint8.
    dummy_labels = wanted.columns[1:]
    wanted[dummy_labels] = wanted[dummy_labels].astype(np.uint8)
    assert_frame_equal(encoded, wanted)

    # Per-column separators: only the B columns change their labels.
    relabel = {'B..b': 'B__b', 'B..c': 'B__c'}
    encoded = get_dummies(frame, prefix_sep=['..', '__'],
                          sparse=self.sparse)
    wanted = wanted.rename(columns=relabel)
    assert_frame_equal(encoded, wanted)

    encoded = get_dummies(frame, prefix_sep={'A': '..', 'B': '__'},
                          sparse=self.sparse)
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_drop_first_with_na(self):
    # frame is self.df itself (not a copy), so the all-NaN row added
    # below is written to the fixture.
    frame = self.df
    frame.loc[3, :] = [np.nan, np.nan, np.nan]

    encoded = get_dummies(frame, dummy_na=True, sparse=self.sparse,
                          drop_first=True)
    wanted = DataFrame({
        'C': [1, 2, 3, np.nan],
        'A_b': [0, 1, 0, 0],
        'A_nan': [0, 0, 0, 1],
        'B_c': [0, 0, 1, 0],
        'B_nan': [0, 0, 0, 1],
    })
    # Only the dummy columns are expected as uint8.
    dummy_labels = ['A_b', 'A_nan', 'B_c', 'B_nan']
    wanted[dummy_labels] = wanted[dummy_labels].astype(np.uint8)
    wanted = wanted[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']]
    assert_frame_equal(encoded, wanted)

    # Without dummy_na the expected frame is the same minus *_nan columns.
    encoded = get_dummies(frame, dummy_na=False, sparse=self.sparse,
                          drop_first=True)
    wanted = wanted[['C', 'A_b', 'B_c']]
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_prefix_sep(self):
    # prefix_sep is exercised as a single string, a list and a dict.
    frame = self.df

    encoded = get_dummies(frame, prefix_sep='..', sparse=self.sparse)
    wanted_columns = {
        'C': [1, 2, 3],
        'A..a': [1., 0, 1],
        'A..b': [0., 1, 0],
        'B..b': [1., 1, 0],
        'B..c': [0., 0, 1],
    }
    wanted = DataFrame(wanted_columns)
    wanted = wanted[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
    assert_frame_equal(encoded, wanted)

    # Per-column separators: only the B columns change their labels.
    relabel = {'B..b': 'B__b', 'B..c': 'B__c'}
    encoded = get_dummies(frame, prefix_sep=['..', '__'],
                          sparse=self.sparse)
    wanted = wanted.rename(columns=relabel)
    assert_frame_equal(encoded, wanted)

    separators = {'A': '..', 'B': '__'}
    encoded = get_dummies(frame, prefix_sep=separators, sparse=self.sparse)
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_subset(self):
    # Only column 'A' is encoded; 'B' and 'C' are expected unchanged.
    frame = self.df
    encoded = get_dummies(frame, prefix=['from_A'], columns=['A'],
                          sparse=self.sparse)

    wanted_columns = {
        'from_A_a': [1, 0, 1],
        'from_A_b': [0, 1, 0],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    }
    wanted = DataFrame(wanted_columns)
    # Only the dummy columns are expected as uint8.
    dummy_labels = ['from_A_a', 'from_A_b']
    wanted[dummy_labels] = wanted[dummy_labels].astype(np.uint8)
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_preserve_categorical_dtype(self):
    # GH13854
    # Runs once for an unordered and once for an ordered categorical; the
    # expected columns are a CategoricalIndex over all three categories,
    # including the unused 'z'.
    for is_ordered in (False, True):
        source = pd.Categorical(list("xy"), categories=list("xyz"),
                                ordered=is_ordered)
        encoded = get_dummies(source)

        values = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8)
        labels = pd.CategoricalIndex(source.categories,
                                     categories=source.categories,
                                     ordered=is_ordered)
        wanted = DataFrame(values, columns=labels)

        tm.assert_frame_equal(encoded, wanted)
def test_unicode(self):
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata

    plain_e = 'e'
    accented_e = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    letters = [plain_e, accented_e, accented_e]

    encoded = get_dummies(letters, prefix='letter', sparse=self.sparse)

    # The accented column label is built from the looked-up character.
    accented_label = u('letter_%s') % accented_e
    wanted = DataFrame({
        'letter_e': {0: 1, 1: 0, 2: 0},
        accented_label: {0: 0, 1: 1, 2: 1},
    }, dtype=np.uint8)
    assert_frame_equal(encoded, wanted)
def test_basic_drop_first_NA(self):
    # Test NA handling together with drop_first
    values = ['a', 'b', np.nan]

    without_na = get_dummies(values, sparse=self.sparse, drop_first=True)
    want_without_na = DataFrame({'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8)
    assert_frame_equal(without_na, want_without_na)

    with_na = get_dummies(values, dummy_na=True, sparse=self.sparse,
                          drop_first=True)
    indicator_columns = {
        'b': {0: 0, 1: 1, 2: 0},
        nan: {0: 0, 1: 0, 2: 1},
    }
    want_with_na = DataFrame(indicator_columns,
                             dtype=np.uint8).reindex_axis(['b', nan], 1)
    assert_frame_equal(with_na, want_with_na)

    # An all-NaN input is expected to give a frame with no columns.
    only_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
                          drop_first=True)
    want_only_na = DataFrame(index=np.arange(1))
    assert_frame_equal(only_na, want_only_na)
def _add_categorical_dummies(self, panel, cat_mappings):
    """
    Add categorical dummies to panel

    For every name in ``self._x_effects`` the column ``panel[name]`` is
    expanded with ``get_dummies``; depending on the dummy-dropping flags
    one dummy column is excluded; the remaining columns are passed through
    ``_convertDummies``, prefixed with ``'<name>_'`` and joined onto
    ``panel``.

    Parameters
    ----------
    panel : object indexable by effect name and supporting ``join``
        presumably a DataFrame -- TODO confirm against callers
    cat_mappings : mapping
        Looked up with ``.get(effect)``.  A non-empty entry is inverted
        (value -> key) and used to translate the configured dropped value
        into a dummy column label.

    Returns
    -------
    DataFrame
    """
    # Nothing to add when no x-effects are configured.
    if not self._x_effects:
        return panel

    # True from the start when entity effects are on and not all dummies
    # are wanted, so the first effect already loses one dummy column.
    dropped_dummy = (self._entity_effects and not self._use_all_dummies)

    for effect in self._x_effects:
        self.log('-- Adding fixed effect dummies for %s' % effect)

        dummies = get_dummies(panel[effect])

        # Invert the mapping (value -> key) when one exists for this effect.
        val_map = cat_mappings.get(effect)
        if val_map:
            val_map = dict((v, k) for k, v in compat.iteritems(val_map))

        # dropped_dummy is set to True below and never reset, so once this
        # branch has run it is taken for every later effect as well.
        if dropped_dummy or not self._use_all_dummies:
            if effect in self._dropped_dummies:
                # Explicitly configured dummy to drop for this effect.
                to_exclude = mapped_name = self._dropped_dummies.get(effect)

                # Translate the configured value through the inverted map
                # to obtain the dummy column label.
                if val_map:
                    mapped_name = val_map[to_exclude]
            else:
                # No explicit choice: drop the first dummy column.
                to_exclude = mapped_name = dummies.columns[0]

            if mapped_name not in dummies.columns:  # pragma: no cover
                raise Exception('%s not in %s' % (to_exclude, dummies.columns))

            self.log('-- Excluding dummy for %s: %s' % (effect, to_exclude))

            # Keep every dummy column except the excluded one.
            dummies = dummies.filter(dummies.columns.difference([mapped_name]))
            dropped_dummy = True

        dummies = _convertDummies(dummies, cat_mappings.get(effect))
        dummies = dummies.add_prefix('%s_' % effect)
        panel = panel.join(dummies)

    return panel
def test_dataframe_dummies_prefix_list(self):
    # Prefixes are supplied as a list, one per encoded column.
    prefix_list = ['from_A', 'from_B']
    frame = DataFrame({
        'A': ['a', 'b', 'a'],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    })

    encoded = get_dummies(frame, prefix=prefix_list, sparse=self.sparse)

    wanted = DataFrame({
        'C': [1, 2, 3],
        'from_A_a': [1, 0, 1],
        'from_A_b': [0, 1, 0],
        'from_B_b': [1, 1, 0],
        'from_B_c': [0, 0, 1],
    })
    # All columns after the first are cast to uint8 before reordering.
    dummy_labels = wanted.columns[1:]
    wanted[dummy_labels] = wanted[dummy_labels].astype(np.uint8)
    wanted = wanted[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']]
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_prefix_dict(self):
    # Prefixes are supplied as a mapping from column name to prefix.
    prefix_by_column = {'A': 'from_A', 'B': 'from_B'}
    source_columns = {
        'A': ['a', 'b', 'a'],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    }
    frame = DataFrame(source_columns)

    encoded = get_dummies(frame, prefix=prefix_by_column)

    wanted_columns = {
        'from_A_a': [1., 0, 1],
        'from_A_b': [0., 1, 0],
        'from_B_b': [1., 1, 0],
        'from_B_c': [0., 0, 1],
        'C': [1, 2, 3],
    }
    wanted = DataFrame(wanted_columns)
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_with_categorical(self):
    # frame is self.df itself (not a copy), so the 'cat' column added
    # below is written to the fixture.
    frame = self.df
    frame['cat'] = pd.Categorical(['x', 'y', 'y'])

    encoded = get_dummies(frame, sparse=self.sparse)

    wanted = DataFrame({
        'C': [1, 2, 3],
        'A_a': [1, 0, 1],
        'A_b': [0, 1, 0],
        'B_b': [1, 1, 0],
        'B_c': [0, 0, 1],
        'cat_x': [1, 0, 0],
        'cat_y': [0, 1, 1],
    })
    # Only the dummy columns are expected as uint8.
    dummy_labels = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']
    wanted[dummy_labels] = wanted[dummy_labels].astype(np.uint8)
    wanted = wanted[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']]
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_with_categorical(self):
    # frame is self.df itself (not a copy), so the 'cat' column added
    # below is written to the fixture.
    frame = self.df
    frame['cat'] = pd.Categorical(['x', 'y', 'y'])

    encoded = get_dummies(frame)

    column_order = ['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']
    wanted_columns = {
        'C': [1, 2, 3],
        'A_a': [1., 0, 1],
        'A_b': [0., 1, 0],
        'B_b': [1., 1, 0],
        'B_c': [0., 0, 1],
        'cat_x': [1., 0, 0],
        'cat_y': [0., 1, 1],
    }
    wanted = DataFrame(wanted_columns)
    wanted = wanted[column_order]
    assert_frame_equal(encoded, wanted)
def add_claim_vectorization(self, df_model):
    """Attach claim-text vector features and primary-class dummies.

    Steps, as performed below:

    1. ``df_model`` is pickled to ``'../data/df_model.pkl'`` and that path
       is handed to a ``PatentTokenizer``, which builds the claim vectors.
    2. The claim vectors become a DataFrame (indexed like ``df_model``,
       one column per vectorizer feature name) and are inner-merged onto
       ``df_model`` by index.
    3. The ``'Number'``, ``'Title'``, ``'Abstract'`` and ``'Claims'``
       columns are dropped.
    4. ``'Primary Class'`` is replaced by its dummy columns (prefix
       ``'primary_class'``, including a NaN indicator).

    Side effects: writes the pickle file, and sets
    ``self.patent_tokenizer`` and ``self.patent_vectors``.

    Parameters
    ----------
    df_model : DataFrame
        Must contain the columns named above.

    Returns
    -------
    DataFrame
        The merged feature frame described in steps 2-4.
    """
    # Parenthesised single-argument form prints the same text on both
    # Python 2 and Python 3 (the bare print statement is Python-2 only).
    print("adding claim vectorization...")

    file_path = '../data/df_model.pkl'
    df_model.to_pickle(file_path)

    patent_tokenizer = PatentTokenizer()
    patent_tokenizer.set_df(file_path)
    patent_tokenizer.set_vectors()
    patent_claims_vectorizer = patent_tokenizer.get_claims_vectorizer()
    patent_claims_vectors = patent_tokenizer.get_claims_vectors()

    # NOTE(review): the attribute named ``patent_tokenizer`` stores the
    # claims *vectorizer*, not the tokenizer object; kept as-is because
    # other code may rely on it -- confirm.
    self.patent_tokenizer = patent_claims_vectorizer
    self.patent_vectors = patent_claims_vectors

    claims_features_name = patent_claims_vectorizer.get_feature_names()
    df_claims_features = pd.DataFrame(data=patent_claims_vectors,
                                      index=df_model.index,
                                      columns=claims_features_name)
    df_merge = pd.merge(df_model, df_claims_features, how='inner',
                        left_index=True, right_index=True)

    non_needed = ['Number', 'Title', 'Abstract', 'Claims']
    df_temp = df_merge.drop(non_needed, axis=1)

    dummy_class = get_dummies(df_temp['Primary Class'], dummy_na=True,
                              prefix='primary_class')
    df_rf_features = pd.merge(df_temp, dummy_class, how='inner',
                              left_index=True, right_index=True)
    df_rf_features = df_rf_features.drop(['Primary Class'], axis=1)
    return df_rf_features
def test_dataframe_dummies_prefix_list(self):
    # Prefixes are supplied as a list, one per encoded column.
    prefix_list = ['from_A', 'from_B']
    source_columns = {
        'A': ['a', 'b', 'a'],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    }
    frame = DataFrame(source_columns)

    encoded = get_dummies(frame, prefix=prefix_list, sparse=self.sparse)

    column_order = ['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
    wanted_columns = {
        'C': [1, 2, 3],
        'from_A_a': [1., 0, 1],
        'from_A_b': [0., 1, 0],
        'from_B_b': [1., 1, 0],
        'from_B_c': [0., 0, 1],
    }
    wanted = DataFrame(wanted_columns)
    wanted = wanted[column_order]
    assert_frame_equal(encoded, wanted)
# Script: fit an ExtraTreesRegressor on the restaurant training CSV and
# plot cross-validated predictions against the measured revenue.
import pandas as pd
from sklearn.linear_model import *
from sklearn.ensemble import *
try:
    # scikit-learn >= 0.18; ``sklearn.cross_validation`` was removed in 0.20.
    from sklearn.model_selection import cross_val_score, cross_val_predict
except ImportError:
    from sklearn.cross_validation import cross_val_score, cross_val_predict
# Kept so any other code referring to ``reshape`` still resolves; the
# dummy encoding below uses the public ``pd.get_dummies`` instead.
import pandas.core.reshape as reshape

df = pd.read_csv("C:\\train.csv")

# Drop the first column plus the 'City' and 'Type' columns.
df.drop(df.columns[0], axis=1, inplace=True)
df.drop('City', axis=1, inplace=True)
df.drop('Type', axis=1, inplace=True)

# Reduce 'Open Date' to its year.
h = df.loc[:, 'Open Date']
h = pd.to_datetime(h)
h = h.apply((lambda x: x.year))
df.loc[:, 'Open Date'] = h

# Replace 'City Group' with its dummy columns, placed first in the frame.
g = pd.get_dummies(df.loc[:, 'City Group'])
df.drop('City Group', axis=1, inplace=True)
df = pd.concat([g, df], axis=1)

labels = df["revenue"].values
# Features are every column except the last two.
# NOTE(review): presumably 'revenue' is the last column, which means the
# column before it is excluded as well -- confirm this is intended.
features = df[df.columns[0:(len(df.columns) - 2)]].values

et = ExtraTreesRegressor(n_estimators=200)
predicted = cross_val_predict(et, features, labels, n_jobs=4, cv=10)

# NOTE(review): ``plt`` is never imported in this script (presumably
# ``matplotlib.pyplot``), so the next line raises NameError as written --
# add the import at the top of the file.
fig, ax = plt.subplots()
ax.scatter(df.loc[:, 'revenue'], predicted)
#ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
def test_dataframe_dummies_prefix_bad_length(self):
    # A one-element prefix list is passed for the fixture frame; the call
    # is expected to raise ValueError.
    short_prefix = ['too few']
    with tm.assertRaises(ValueError):
        get_dummies(self.df, prefix=short_prefix)
def test_dataframe_dummies_all_obj(self):
    # Only the 'A' and 'B' columns of the fixture are passed in.
    object_columns = self.df[['A', 'B']]
    encoded = get_dummies(object_columns)

    wanted = DataFrame({
        'A_a': [1., 0, 1],
        'A_b': [0., 1, 0],
        'B_b': [1., 1, 0],
        'B_c': [0., 0, 1],
    })
    assert_frame_equal(encoded, wanted)
def test_dataframe_dummies_prefix_sep_bad_length(self):
    # A one-element prefix_sep list is passed for the fixture frame; the
    # call is expected to raise ValueError.
    short_sep = ['bad']
    with tm.assertRaises(ValueError):
        get_dummies(self.df, prefix_sep=short_sep, sparse=self.sparse)
def test_dataframe_dummies_drop_first(self):
    # Only 'A' and 'B' are passed in; with drop_first=True one dummy per
    # column is expected.
    object_columns = self.df[['A', 'B']]
    encoded = get_dummies(object_columns, sparse=self.sparse,
                          drop_first=True)

    wanted = DataFrame({
        'A_b': [0., 1, 0],
        'B_c': [0., 0, 1],
    })
    assert_frame_equal(encoded, wanted)