def test_dummy_sparse(self):
    """Compare the sparse dummies with ``categorical`` output per index level.

    Level 0 of ``self.data.index`` is always checked.  Level 1 is checked
    only when the grouping reports more than one group name.
    """
    index = self.data.index
    grouper = self.grouping

    grouper.dummy_sparse()
    level0 = categorical(index.get_level_values(0).values, drop=True)
    np.testing.assert_equal(grouper._dummies.toarray(), level0)

    # Nothing further to verify when there is only one group name.
    if len(grouper.group_names) <= 1:
        return

    grouper.dummy_sparse(level=1)
    level1 = categorical(index.get_level_values(1).values, drop=True)
    np.testing.assert_equal(grouper._dummies.toarray(), level1)
def test_rec_issue302():
    """Encode a one-field record array without passing ``col``.

    The result must keep the ``group`` field and gain one float indicator
    field per distinct value (``group_10`` and ``group_11``).
    """
    records = np.rec.fromrecords([[10], [11]], names="group")
    result = tools.categorical(records)

    wanted_dtype = [("group", int), ("group_10", float), ("group_11", float)]
    wanted_rows = [(10, 1.0, 0.0), (11, 0.0, 1.0)]
    wanted = np.rec.array(wanted_rows, dtype=wanted_dtype)

    assert_array_equal(result, wanted)
def test_issue302():
    """Encode the ``group`` field of a two-field record array.

    Both original fields must be kept, followed by the float indicator
    fields ``group_10`` and ``group_11``.
    """
    records = np.rec.fromrecords([[10, 12], [11, 13]],
                                 names=['group', 'whatever'])
    result = tools.categorical(records, col=['group'])

    wanted_dtype = [('group', int), ('whatever', int),
                    ('group_10', float), ('group_11', float)]
    wanted = np.rec.array([(10, 12, 1.0, 0.0), (11, 13, 0.0, 1.0)],
                          dtype=wanted_dtype)

    assert_array_equal(result, wanted)
def test_fvalue_const_only():
    """The HC1 robust fit must report a non-NaN ``fvalue``.

    The design is a dummy matrix whose first column is overwritten with
    ones, and the model is fitted with ``hasconst=True``.
    """
    np.random.seed(12345)
    labels = np.random.randint(0, 3, size=30)

    design = categorical(labels, drop=True)
    design[:, 0] = 1  # first column is replaced by a column of ones

    signal = np.dot(design, [1., 2., 3.])
    response = signal + np.random.normal(size=30)

    fitted = OLS(response, design, hasconst=True).fit(cov_type='HC1')
    assert not np.isnan(fitted.fvalue)
def test_const_indicator():
    """R-squared must agree between two parameterisations of the same model.

    One fit uses a prepended constant plus all dummy columns but the first;
    the other uses the full dummy matrix with ``hasconst=True``.
    """
    np.random.seed(12345)
    labels = np.random.randint(0, 3, size=30)
    dummies = categorical(labels, drop=True)
    response = np.dot(dummies, [1., 2., 3.]) + np.random.normal(size=30)

    with_const = add_constant(dummies[:, 1:], prepend=True)
    fit_const = OLS(response, with_const).fit()
    fit_dummies = OLS(response, dummies, hasconst=True).fit()

    assert_almost_equal(fit_const.rsquared, fit_dummies.rsquared, 12)
def test_structarray2d_drop(self):
    """Encode the ``instrument`` field of ``self.structdes`` with ``drop=True``.

    The call must emit the "recarray support" ``FutureWarning``.  The last
    five fields of the result must equal ``self.dummy`` and the result must
    have eight fields.
    """
    with pytest.warns(FutureWarning, match="recarray support"):
        encoded = tools.categorical(self.structdes, col='instrument',
                                    drop=True)

    field_names = encoded.dtype.names
    tail = np.column_stack([encoded[name] for name in field_names[-5:]])

    assert_array_equal(tail, self.dummy)
    assert_equal(len(encoded.dtype.names), 8)
def test_issue302():
    """Encode ``group`` in a two-field record array and compare exactly.

    The expected record array keeps ``group`` and ``whatever`` and appends
    the float fields ``group_10`` and ``group_11``.
    """
    source = np.rec.fromrecords([[10, 12], [11, 13]],
                                names=["group", "whatever"])

    expected_fields = [
        ("group", int),
        ("whatever", int),
        ("group_10", float),
        ("group_11", float),
    ]
    expected_rows = [(10, 12, 1.0, 0.0), (11, 13, 0.0, 1.0)]

    actual = tools.categorical(source, col=["group"])
    expected = np.rec.array(expected_rows, dtype=expected_fields)
    assert_array_equal(actual, expected)
def test_issue302():
    """Encode ``group`` in a two-field record array, expecting a warning.

    The call must emit the "recarray support" ``FutureWarning`` and return
    the original fields followed by ``group_10`` and ``group_11``.
    """
    records = np.rec.fromrecords([[10, 12], [11, 13]],
                                 names=['group', 'whatever'])

    with pytest.warns(FutureWarning, match="recarray support"):
        result = tools.categorical(records, col=['group'])

    wanted_dtype = [('group', int), ('whatever', int),
                    ('group_10', float), ('group_11', float)]
    wanted = np.rec.array([(10, 12, 1.0, 0.0), (11, 13, 0.0, 1.0)],
                          dtype=wanted_dtype)
    assert_array_equal(result, wanted)
def test_const_indicator():
    """Both parameterisations must agree on R-squared and on ``k_constant``.

    One fit uses a prepended constant plus all dummy columns but the first;
    the other uses the full dummy matrix with ``hasconst=True``.  Each model
    must report ``k_constant == 1``.
    """
    np.random.seed(12345)
    labels = np.random.randint(0, 3, size=30)
    dummies = categorical(labels, drop=True)
    response = np.dot(dummies, [1., 2., 3.]) + np.random.normal(size=30)

    explicit_const = add_constant(dummies[:, 1:], prepend=True)
    fit_explicit = OLS(response, explicit_const).fit()
    fit_implicit = OLS(response, dummies, hasconst=True).fit()

    assert_almost_equal(fit_explicit.rsquared, fit_implicit.rsquared, 12)
    assert fit_implicit.model.data.k_constant == 1
    assert fit_explicit.model.data.k_constant == 1
def test_categorical_pandas_errors(string_var):
    """Check the errors raised for bad ``col`` arguments.

    Covers three cases: ``string_var`` with ``col='unknown'``, a DataFrame
    built from ``string_var`` with ``col=None``, and the same DataFrame
    with ``col='unknown'``.
    """
    name_mismatch = 'data.name does not match col'
    with pytest.raises(ValueError, match=name_mismatch):
        tools.categorical(string_var, 'unknown')

    frame = pd.DataFrame(string_var)

    bad_col_type = 'col must be a str or int'
    with pytest.raises(TypeError, match=bad_col_type):
        tools.categorical(frame, None)

    missing_column = ('Column \'unknown\' not found in '
                      'data')
    with pytest.raises(ValueError, match=missing_column):
        tools.categorical(frame, 'unknown')
def test_categorical_pandas_errors(string_var):
    """Verify the exception type and message for three invalid calls.

    The calls are: ``string_var`` with ``col="unknown"``, a DataFrame built
    from ``string_var`` with ``col=None``, and that DataFrame with
    ``col="unknown"``.
    """
    with pytest.raises(ValueError, match="data.name does not match col"):
        tools.categorical(string_var, "unknown")

    as_frame = pd.DataFrame(string_var)
    frame_cases = (
        (TypeError, "col must be a str or int", None),
        (ValueError, "Column 'unknown' not found in " "data", "unknown"),
    )
    for exc_type, message, bad_col in frame_cases:
        with pytest.raises(exc_type, match=message):
            tools.categorical(as_frame, bad_col)
def fit_floating_gravity(data, deg=2, **kwargs):
    """Fit the floating gravity model by weighted least squares.

    Parameters
    ----------
    data : object
        Observations exposing ``delta_g``, ``level_1``, ``level_2`` and
        ``runn``, plus a ``drop_duplicates`` method (presumably a pandas
        DataFrame -- confirm against callers).
    deg : int, optional
        Number of polynomial-in-height columns added to the design matrix.
    **kwargs
        Forwarded unchanged to ``WLS``.

    Returns
    -------
    tuple
        ``(df, results)``: the stacked observation frame and the fitted
        ``WLS`` results.
    """
    # Stack both levels: zeros paired with level_1, delta_g with level_2.
    stacked = pd.DataFrame({
        'g': np.concatenate((np.zeros_like(data.delta_g), data.delta_g)),
        'h': np.concatenate((data['level_1'], data['level_2'])) / 1000,
        'ci': np.tile(data.runn, 2)
    })
    df = (
        stacked
        .drop_duplicates(['ci', 'h', 'g'])
        .sort_values(['ci', 'g'], ascending=[True, False])
        .reset_index(drop=True)
    )

    # Observation vector.
    endog = np.asarray(df['g'])

    # Design matrix: dummy columns for ``ci`` first, then the powers
    # h**1 .. h**deg (the constant column of the Vandermonde matrix is cut).
    height_powers = np.vander(df['h'].values, N=deg + 1, increasing=True)
    height_powers = height_powers[:, 1:]
    run_dummies = categorical(df['ci'].values, drop=True)
    design = np.hstack((run_dummies, height_powers))

    # Column labels: one 'h(...)' label per unique (level_1, runn) pair,
    # followed by one lowercase letter per polynomial term.
    first_levels = data.drop_duplicates(['level_1', 'runn']).level_1
    offset_labels = [
        'h({:,.3f})'.format(level / 1000) for level in np.asarray(first_levels)
    ]
    poly_labels = list(ascii_lowercase[:deg])
    labels = np.append(offset_labels, poly_labels)
    exog = pd.DataFrame(design, columns=labels)

    results = WLS(endog, exog, **kwargs).fit()
    return df, results
def test_array1d_drop(self):
    """Encode ``self.string_var`` with ``drop=True``.

    The result must equal ``self.dummy`` and have five columns.
    """
    dummies = tools.categorical(self.string_var, drop=True)
    assert_array_equal(dummies, self.dummy)

    n_columns = dummies.shape[1]
    assert_equal(n_columns, 5)
def test_structarray1d(self):
    """Encode a one-field structured view of the ``instrument`` column.

    The last five fields of the result must equal ``self.dummy`` and the
    result must have six fields in total.
    """
    single_field = [("var1", "f4")]
    column = self.structdes["instrument"].view(dtype=single_field)

    encoded = tools.categorical(column)
    names = encoded.dtype.names
    tail = np.column_stack([encoded[name] for name in names[-5:]])

    assert_array_equal(tail, self.dummy)
    assert_equal(len(encoded.dtype.names), 6)
from sklearn.tree import DecisionTreeClassifier, export_graphviz #imported data set sf = pd.read_csv( "/Users/bonbon/downloads/USF/2019 Fall/MSIS678_DataWarehousing/Project/San_Francisco_Communitywide_Greenhouse_Gas_Inventory.csv" ) #print(sf.to_string()) #print(sf_train.to_string()) # X = sf.drop('Emissions_mtCO2e', 1) Sector_General_np = np.array(sf['Sector_General']) (Sector_General_cat, Sector_General_cat_dict) = stattools.categorical(Sector_General_np, drop=True, dictnames=True) inv_map = {v: k for k, v in Sector_General_cat_dict.items()} Sector_General_cat_pd = sf['Sector_General'].apply(lambda r: inv_map[r]) print(Sector_General_cat_pd.to_string()) Sector_Detail1_np = np.array(sf['Sector_Detail1']) (Sector_Detail1_cat, Sector_Detail1_cat_dict) = stattools.categorical(Sector_Detail1_np, drop=True, dictnames=True) inv_map = {v: k for k, v in Sector_Detail1_cat_dict.items()} Sector_Detail1_cat_pd = sf['Sector_Detail1'].apply(lambda r: inv_map[r]) Sector_Detail2_np = np.array(sf['Sector_Detail2']) (Sector_Detail2_cat,
def test_structarray2d_drop(self):
    """Encode the ``str_instr`` field of ``self.structdes`` with ``drop=True``.

    The last five fields of the result must equal ``self.dummy`` and the
    result must have eight fields.
    """
    encoded = tools.categorical(self.structdes, col='str_instr', drop=True)

    last_five = encoded.dtype.names[-5:]
    tail = np.column_stack([encoded[field] for field in last_five])

    assert_array_equal(tail, self.dummy)
    assert_equal(len(encoded.dtype.names), 8)
def test_categorical_errors(string_var):
    """Check that invalid ``col`` values raise ``ValueError``.

    A tuple ``col`` and a dict ``col`` must each fail with the message
    listed beside them.
    """
    cases = (
        ((0, 1), 'Can only convert one column'),
        ({'a': 1}, 'data.name does not match col'),
    )
    for bad_col, message in cases:
        with pytest.raises(ValueError, match=message):
            tools.categorical(string_var, bad_col)
def test_array1d(self):
    """Encode ``self.instr`` without dropping the source column.

    The last five columns must equal ``self.dummy`` and the result must
    have six columns.
    """
    encoded = tools.categorical(self.instr)
    dummy_part = encoded[:, -5:]

    assert_array_equal(dummy_part, self.dummy)
    assert_equal(encoded.shape[1], 6)
def test_recarray2d_error(self):
    """``col=None`` on two stacked copies of ``self.recdes`` must fail.

    The call is expected to emit the "recarray support" ``FutureWarning``
    and to raise ``IndexError``.
    """
    stacked = np.c_[self.recdes, self.recdes]

    expect_error = pytest.raises(IndexError, match='col is None and the input')
    expect_warning = pytest.warns(FutureWarning, match="recarray support")
    # The error check is the outer context, the warning check the inner one.
    with expect_error, expect_warning:
        tools.categorical(stacked, col=None)
def test_structarray1d_drop(self):
    """Encode a one-field bytes view of ``str_instr`` with ``drop=True``.

    All fields of the result, stacked column-wise, must equal
    ``self.dummy``, and the result must have exactly five fields.
    """
    # "S10" is the same 10-byte string dtype as the former "a10"; the "a"
    # alias is deprecated as of NumPy 2.0, so the "S" spelling is used.
    instr = self.structdes["str_instr"].view(dtype=[("var1", "S10")])
    dum = tools.categorical(instr, drop=True)

    test_dum = np.column_stack([dum[name] for name in dum.dtype.names])
    assert_array_equal(test_dum, self.dummy)
    assert_equal(len(dum.dtype.names), 5)
def test_structarray2dint(self):
    """Encode ``self.structdes`` selecting the column by integer (``col=3``).

    The last five fields must equal ``self.dummy`` and the result must
    have nine fields.
    """
    encoded = tools.categorical(self.structdes, col=3)

    trailing = encoded.dtype.names[-5:]
    tail = np.column_stack([encoded[field] for field in trailing])

    assert_array_equal(tail, self.dummy)
    assert_equal(len(encoded.dtype.names), 9)
def test_recarray2d(self):
    """Encode the ``str_instr`` field of ``self.recdes``.

    The last five fields must equal ``self.dummy`` and the result must
    have nine fields.
    """
    encoded = tools.categorical(self.recdes, col="str_instr")

    # Collect the trailing five fields into a plain 2-D array for comparison.
    columns = [encoded[field] for field in encoded.dtype.names[-5:]]
    tail = np.column_stack(columns)

    assert_array_equal(tail, self.dummy)
    assert_equal(len(encoded.dtype.names), 9)
def test_array2d(self):
    """Encode column 2 of ``self.des``, ``self.instr``, ``self.des`` stacked.

    The last five columns must equal ``self.dummy`` and the result must
    have ten columns.
    """
    stacked = np.column_stack((self.des, self.instr, self.des))
    encoded = tools.categorical(stacked, col=2)

    dummy_part = encoded[:, -5:]
    assert_array_equal(dummy_part, self.dummy)
    assert_equal(encoded.shape[1], 10)
def test_rec_issue302():
    """Encode a single-field record array with the default ``col``.

    The output must contain ``group`` plus the float indicator fields
    ``group_10`` and ``group_11``.
    """
    source = np.rec.fromrecords([[10], [11]], names='group')
    actual = tools.categorical(source)

    expected_fields = [('group', int),
                       ('group_10', float),
                       ('group_11', float)]
    expected = np.rec.array([(10, 1.0, 0.0), (11, 0.0, 1.0)],
                            dtype=expected_fields)
    assert_array_equal(actual, expected)
def test_arraylike1d_drop(self):
    """Encode the ``instrument`` column passed as a list, with ``drop=True``.

    The result must equal ``self.dummy`` and have five columns.
    """
    values = self.structdes['instrument'].tolist()
    encoded = tools.categorical(values, drop=True)

    assert_array_equal(encoded, self.dummy)
    assert_equal(encoded.shape[1], 5)
def test_arraylike2d_drop(self):
    """Encode column 2 of ``self.structdes`` passed as a list, ``drop=True``.

    The last five columns must equal ``self.dummy`` and the result must
    have eight columns.
    """
    rows = self.structdes.tolist()
    encoded = tools.categorical(rows, col=2, drop=True)

    dummy_part = encoded[:, -5:]
    assert_array_equal(dummy_part, self.dummy)
    assert_equal(encoded.shape[1], 8)
def test_arraylike1d(self):
    """Encode the ``instrument`` column passed as a plain list.

    The last five columns must equal ``self.dummy`` and the result must
    have six columns.
    """
    values = self.structdes['instrument'].tolist()
    encoded = tools.categorical(values)

    dummy_part = encoded[:, -5:]
    assert_array_equal(dummy_part, self.dummy)
    assert_equal(encoded.shape[1], 6)
def test_recarray1d(self):
    """Encode the ``str_instr`` column viewed as a ``np.recarray``.

    The last five fields must equal ``self.dummy`` and the result must
    have six fields.
    """
    column = self.structdes["str_instr"].view(np.recarray)
    encoded = tools.categorical(column)

    trailing = encoded.dtype.names[-5:]
    tail = np.column_stack([encoded[field] for field in trailing])

    assert_array_equal(tail, self.dummy)
    assert_equal(len(encoded.dtype.names), 6)
def test_recarray2d_error(self):
    """``col=None`` on two stacked copies of ``self.recdes`` must fail.

    The call is expected to raise ``IndexError`` with a message starting
    "col is None and the input".
    """
    stacked = np.c_[self.recdes, self.recdes]
    expected_message = 'col is None and the input'
    with pytest.raises(IndexError, match=expected_message):
        tools.categorical(stacked, col=None)
def test_structarray2d_drop(self):
    """Encode ``str_instr`` in ``self.structdes`` and drop the source field.

    The last five fields must equal ``self.dummy`` and the result must
    have eight fields.
    """
    result = tools.categorical(self.structdes, col="str_instr", drop=True)

    columns = [result[field] for field in result.dtype.names[-5:]]
    stacked = np.column_stack(columns)

    assert_array_equal(stacked, self.dummy)
    assert_equal(len(result.dtype.names), 8)
def test_recarray1d_drop(self):
    """Encode the ``instrument`` column as a ``np.recarray`` with ``drop=True``.

    All fields of the result, stacked column-wise, must equal
    ``self.dummy``, and the result must have five fields.
    """
    column = self.structdes['instrument'].view(np.recarray)
    encoded = tools.categorical(column, drop=True)

    every_field = encoded.dtype.names
    stacked = np.column_stack([encoded[field] for field in every_field])

    assert_array_equal(stacked, self.dummy)
    assert_equal(len(encoded.dtype.names), 5)
# 3 Married <=50K 0.00000 # 4 Married <=50K 0.00000 # ============================================================================= # ============================================================================= # For simplicity, we save the Income variable as y. # y = adult_tr[['Income']] y = adult_tr[['Income']] # y was created with only one column, [18761 rows x 1 columns] # We have a categorical variable, Marital status, among our predictors. # The CART model implemented in the sklearn package needs categorical variables converted to a dummy variable form. # Thus, we will make a series of dummy variables for Marital status using the categorical() command. # ============================================================================= mar_np = np.array(adult_tr['Marital status']) #mar_np created - We turn the variable Marital status into an array using array(), mar_cat = sm.categorical(mar_np, drop=True) mar_cat_dict = stattools.categorical(mar_np, dictnames=True) #Now, we need to add the newly made dummy variables back into the X variables. mar_cat_pd = pd.DataFrame(mar_cat) #we converted the mar_cat matrix into a data frame using the DataFrame() command X = pd.concat((adult_tr[['Cap_Gains_Losses']], mar_cat_pd), axis=1) # ============================================================================= # We then use the concat() command to attach the predictor variable Cap_Gains_Losses to # the data frame of dummy variables that represent marital status. We save the result as X. # ============================================================================= # ============================================================================= # Data is like this # 18749 0.000000 0.0 1.0 0.0 0.0 0.0 # 18750 0.010550 0.0 0.0 1.0 0.0 0.0 # 18751 1.000000 0.0 1.0 0.0 0.0 0.0 # 18752 0.362489 0.0 1.0 0.0 0.0 0.0 #
def test_array1d_col_error(self):
    """A dict passed as ``col`` for ``self.instr`` must raise ``TypeError``."""
    invalid_col = {'a': 1}
    expected_message = 'col must be a str, int or None'
    with pytest.raises(TypeError, match=expected_message):
        tools.categorical(self.instr, col=invalid_col)
def test_recarray1d(self):
    """Encode ``str_instr`` viewed as a record array, keeping the source.

    The last five fields must equal ``self.dummy`` and the result must
    have six fields.
    """
    rec_column = self.structdes['str_instr'].view(np.recarray)
    result = tools.categorical(rec_column)

    columns = [result[field] for field in result.dtype.names[-5:]]
    stacked = np.column_stack(columns)

    assert_array_equal(stacked, self.dummy)
    assert_equal(len(result.dtype.names), 6)
def test_array2d_drop(self):
    """Encode column 2 of the stacked design with ``drop=True``.

    The input is ``self.des``, ``self.instr``, ``self.des`` stacked
    column-wise.  The last five columns must equal ``self.dummy`` and the
    result must have nine columns.
    """
    stacked = np.column_stack((self.des, self.instr, self.des))
    encoded = tools.categorical(stacked, col=2, drop=True)

    dummy_part = encoded[:, -5:]
    assert_array_equal(dummy_part, self.dummy)
    assert_equal(encoded.shape[1], 9)
def test_recarray2dint(self):
    """Encode ``self.recdes`` by integer column (``col=3``), with a warning.

    The call must emit the "recarray support" ``FutureWarning``.  The last
    five fields must equal ``self.dummy`` and the result must have nine
    fields.
    """
    with pytest.warns(FutureWarning, match="recarray support"):
        encoded = tools.categorical(self.recdes, col=3)

    trailing = encoded.dtype.names[-5:]
    tail = np.column_stack([encoded[field] for field in trailing])

    assert_array_equal(tail, self.dummy)
    assert_equal(len(encoded.dtype.names), 9)
def test_structarray1d(self):
    """Encode ``instrument`` re-viewed as a single ``f4`` field named ``var1``.

    The last five fields must equal ``self.dummy`` and the result must
    have six fields.
    """
    view_dtype = [('var1', 'f4')]
    column = self.structdes['instrument'].view(dtype=view_dtype)
    result = tools.categorical(column)

    columns = [result[field] for field in result.dtype.names[-5:]]
    stacked = np.column_stack(columns)

    assert_array_equal(stacked, self.dummy)
    assert_equal(len(result.dtype.names), 6)
def test_recarray2d(self):
    """Encode ``str_instr`` in ``self.recdes``, keeping the source field.

    The last five fields must equal ``self.dummy`` and the result must
    have nine fields.
    """
    result = tools.categorical(self.recdes, col='str_instr')

    # Stack the trailing five fields into a 2-D array for comparison.
    last_five = result.dtype.names[-5:]
    stacked = np.column_stack([result[field] for field in last_five])

    assert_array_equal(stacked, self.dummy)
    assert_equal(len(result.dtype.names), 9)
def test_recarray1d_drop(self):
    """Encode ``instrument`` viewed as a record array, dropping the source.

    Every field of the result, stacked column-wise, must equal
    ``self.dummy``, and the result must have five fields.
    """
    rec_column = self.structdes["instrument"].view(np.recarray)
    result = tools.categorical(rec_column, drop=True)

    columns = [result[field] for field in result.dtype.names]
    stacked = np.column_stack(columns)

    assert_array_equal(stacked, self.dummy)
    assert_equal(len(result.dtype.names), 5)
def test_structarray1d_drop(self):
    """Encode a one-field bytes view of ``str_instr`` with ``drop=True``.

    Every field of the result, stacked column-wise, must equal
    ``self.dummy``, and the result must have exactly five fields.
    """
    # 'S10' names the same 10-byte string dtype as the former 'a10'; the
    # 'a' alias is deprecated as of NumPy 2.0, so the 'S' spelling is used.
    instr = self.structdes['str_instr'].view(dtype=[('var1', 'S10')])
    dum = tools.categorical(instr, drop=True)

    test_dum = np.column_stack([dum[name] for name in dum.dtype.names])
    assert_array_equal(test_dum, self.dummy)
    assert_equal(len(dum.dtype.names), 5)
import numpy as np
import statsmodels.tools.tools as stattools
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Load the training data from a local file.
adult_tr = pd.read_csv(
    "/Users/chriskehl/Library/CloudStorage/iCloud Drive/Documents/data_files/data_science/data_sets/website_data_sets/adult_ch6_training"
)

# Response: the Income column, kept as a one-column DataFrame.
y = adult_tr[['Income']]

# Dummy-encode marital status. With dictnames=True the call returns the
# dummy matrix together with a dictionary of names.
mar_np = np.array(adult_tr['Marital status'])
mar_cat, mar_cat_dict = stattools.categorical(
    mar_np, drop=True, dictnames=True
)

# Predictors: Cap_Gains_Losses followed by the dummy columns.
mar_cat_pd = pd.DataFrame(mar_cat)
X = pd.concat([adult_tr[['Cap_Gains_Losses']], mar_cat_pd], axis=1)

# Labels for the columns of X, in order.
X_names = [
    "Cap_Gains_Losses",
    "Divorced",
    "Married",
    "Never-Married",
    "Separated",
    "Widowed",
]

# Labels for the levels of y.
y_names = ["<=50K", ">50K"]