def test_dummy_sparse(self):
        """Compare the grouping's sparse dummies with ``categorical`` output.

        Index level 0 is always checked; level 1 is checked only when the
        grouping has more than one group name.
        """
        frame = self.data

        # Default call, compared against the dummies of index level 0.
        self.grouping.dummy_sparse()
        level0_labels = frame.index.get_level_values(0).values
        expected = categorical(level0_labels, drop=True)
        np.testing.assert_equal(self.grouping._dummies.toarray(), expected)

        if len(self.grouping.group_names) <= 1:
            return

        # Second grouping level, requested explicitly.
        self.grouping.dummy_sparse(level=1)
        level1_labels = frame.index.get_level_values(1).values
        expected = categorical(level1_labels, drop=True)
        np.testing.assert_equal(self.grouping._dummies.toarray(), expected)
Beispiel #2
0
def test_rec_issue302():
    """A one-field record array gains a float dummy field per distinct value.

    Regression check named after issue 302.
    """
    records = np.rec.fromrecords([[10], [11]], names="group")
    expected_dtype = [
        ("group", int),
        ("group_10", float),
        ("group_11", float),
    ]
    expected = np.rec.array(
        [(10, 1.0, 0.0), (11, 0.0, 1.0)], dtype=expected_dtype
    )
    actual = tools.categorical(records)
    assert_array_equal(actual, expected)
Beispiel #3
0
def test_issue302():
    """Dummies for 'group' are appended after the untouched fields.

    Regression check named after issue 302; ``col`` is passed as a list.
    """
    records = np.rec.fromrecords(
        [[10, 12], [11, 13]], names=['group', 'whatever']
    )
    expected_dtype = [
        ('group', int),
        ('whatever', int),
        ('group_10', float),
        ('group_11', float),
    ]
    expected = np.rec.array(
        [(10, 12, 1.0, 0.0), (11, 13, 0.0, 1.0)], dtype=expected_dtype
    )
    actual = tools.categorical(records, col=['group'])
    assert_array_equal(actual, expected)
Beispiel #4
0
def test_fvalue_const_only():
    """The HC1 fit's ``fvalue`` is not NaN once column 0 is set to ones."""
    np.random.seed(12345)
    labels = np.random.randint(0, 3, size=30)
    design = categorical(labels, drop=True)
    # Overwrite the first dummy column so it acts as a constant.
    design[:, 0] = 1
    signal = np.dot(design, [1., 2., 3.])
    response = signal + np.random.normal(size=30)
    fitted = OLS(response, design, hasconst=True).fit(cov_type='HC1')
    assert not np.isnan(fitted.fvalue)
Beispiel #5
0
def test_const_indicator():
    """R-squared matches between a full dummy set and constant + dummies."""
    np.random.seed(12345)
    labels = np.random.randint(0, 3, size=30)
    dummies = categorical(labels, drop=True)
    signal = np.dot(dummies, [1., 2., 3.])
    response = signal + np.random.normal(size=30)

    # Explicit constant plus all but the first dummy column.
    with_constant = add_constant(dummies[:, 1:], prepend=True)
    fit_constant = OLS(response, with_constant).fit()
    # Full dummy set, flagged as containing a constant.
    fit_dummies = OLS(response, dummies, hasconst=True).fit()

    assert_almost_equal(fit_constant.rsquared, fit_dummies.rsquared, 12)
def test_fvalue_const_only():
    """Fit OLS with HC1 covariance and require a non-NaN ``fvalue``."""
    np.random.seed(12345)
    codes = np.random.randint(0, 3, size=30)
    exog = categorical(codes, drop=True)
    exog[:, 0] = 1  # first column becomes a column of ones
    coefficients = [1., 2., 3.]
    endog = np.dot(exog, coefficients) + np.random.normal(size=30)
    result = OLS(endog, exog, hasconst=True).fit(cov_type='HC1')
    fvalue_is_nan = np.isnan(result.fvalue)
    assert not fvalue_is_nan
Beispiel #7
0
 def test_structarray2d_drop(self):
     """Dropping 'instrument' from the structured design leaves 8 fields.

     The call must emit the "recarray support" FutureWarning.
     """
     with pytest.warns(FutureWarning, match="recarray support"):
         result = tools.categorical(
             self.structdes, col='instrument', drop=True
         )
     dummy_fields = result.dtype.names[-5:]
     stacked = np.column_stack([result[name] for name in dummy_fields])
     assert_array_equal(stacked, self.dummy)
     assert_equal(len(result.dtype.names), 8)
Beispiel #8
0
def test_issue302():
    """Encode only the 'group' field of a two-field record array.

    Regression check named after issue 302.
    """
    field_names = ["group", "whatever"]
    source = np.rec.fromrecords([[10, 12], [11, 13]], names=field_names)
    result = tools.categorical(source, col=["group"])

    wanted_rows = [(10, 12, 1.0, 0.0), (11, 13, 0.0, 1.0)]
    wanted_dtype = [
        ("group", int),
        ("whatever", int),
        ("group_10", float),
        ("group_11", float),
    ]
    wanted = np.rec.array(wanted_rows, dtype=wanted_dtype)
    assert_array_equal(result, wanted)
def test_const_indicator():
    """Both parameterisations of the dummy design give the same R-squared."""
    np.random.seed(12345)
    groups = np.random.randint(0, 3, size=30)
    indicator = categorical(groups, drop=True)
    noise_free = np.dot(indicator, [1., 2., 3.])
    y = noise_free + np.random.normal(size=30)

    res_const = OLS(y, add_constant(indicator[:, 1:], prepend=True)).fit()
    res_indicator = OLS(y, indicator, hasconst=True).fit()

    # Agreement is required to 12 decimal places.
    assert_almost_equal(res_const.rsquared, res_indicator.rsquared, 12)
Beispiel #10
0
def test_issue302():
    """Encode 'group' in a record array while expecting a FutureWarning.

    Regression check named after issue 302; the call must warn with a
    message matching "recarray support".
    """
    records = np.rec.fromrecords(
        [[10, 12], [11, 13]], names=['group', 'whatever']
    )
    with pytest.warns(FutureWarning, match="recarray support"):
        encoded = tools.categorical(records, col=['group'])

    target_dtype = [
        ('group', int),
        ('whatever', int),
        ('group_10', float),
        ('group_11', float),
    ]
    target = np.rec.array(
        [(10, 12, 1.0, 0.0), (11, 13, 0.0, 1.0)], dtype=target_dtype
    )
    assert_array_equal(encoded, target)
def test_const_indicator():
    """Dummy-only and constant+dummy fits agree and both report one constant."""
    np.random.seed(12345)
    labels = np.random.randint(0, 3, size=30)
    dummies = categorical(labels, drop=True)
    response = np.dot(dummies, [1., 2., 3.]) + np.random.normal(size=30)

    exog_with_const = add_constant(dummies[:, 1:], prepend=True)
    fit_with_const = OLS(response, exog_with_const).fit()
    fit_dummies = OLS(response, dummies, hasconst=True).fit()

    assert_almost_equal(fit_with_const.rsquared, fit_dummies.rsquared, 12)
    # Each model's data object should record exactly one constant.
    assert fit_dummies.model.data.k_constant == 1
    assert fit_with_const.model.data.k_constant == 1
Beispiel #12
0
def test_const_indicator():
    """Check R-squared equality and ``k_constant == 1`` for both designs."""
    np.random.seed(12345)
    categories = np.random.randint(0, 3, size=30)
    full_dummies = categorical(categories, drop=True)
    mean_part = np.dot(full_dummies, [1., 2., 3.])
    endog = mean_part + np.random.normal(size=30)

    # Design 1: prepend a constant and leave out the first dummy column.
    reduced = add_constant(full_dummies[:, 1:], prepend=True)
    res_reduced = OLS(endog, reduced).fit()
    # Design 2: all dummy columns, declared as containing a constant.
    res_full = OLS(endog, full_dummies, hasconst=True).fit()

    assert_almost_equal(res_reduced.rsquared, res_full.rsquared, 12)
    assert res_full.model.data.k_constant == 1
    assert res_reduced.model.data.k_constant == 1
Beispiel #13
0
def test_categorical_pandas_errors(string_var):
    """Invalid ``col`` arguments for pandas input raise the expected errors."""
    # A column name that differs from the input's own name is rejected.
    with pytest.raises(ValueError, match='data.name does not match col'):
        tools.categorical(string_var, 'unknown')

    frame = pd.DataFrame(string_var)

    # For a DataFrame, ``col`` may not be None ...
    with pytest.raises(TypeError, match='col must be a str or int'):
        tools.categorical(frame, None)

    # ... and must name a column that is present.
    missing_column = 'Column \'unknown\' not found in data'
    with pytest.raises(ValueError, match=missing_column):
        tools.categorical(frame, 'unknown')
Beispiel #14
0
def test_categorical_pandas_errors(string_var):
    """Check the error raised for each invalid ``col`` with pandas input."""
    name_mismatch = "data.name does not match col"
    with pytest.raises(ValueError, match=name_mismatch):
        tools.categorical(string_var, "unknown")

    as_frame = pd.DataFrame(string_var)

    bad_col_type = "col must be a str or int"
    with pytest.raises(TypeError, match=bad_col_type):
        tools.categorical(as_frame, None)

    column_missing = "Column 'unknown' not found in data"
    with pytest.raises(ValueError, match=column_missing):
        tools.categorical(as_frame, "unknown")
Beispiel #15
0
def fit_floating_gravity(data, deg=2, **kwargs):
    """Fit the floating gravity model to the observations.

    Each row of ``data`` is expanded into two observations: ``g = 0`` at
    ``level_1`` and ``g = delta_g`` at ``level_2`` (heights divided by
    1000), both tagged with the row's ``runn``. Duplicate rows are removed
    and the rest is ordered by run, then by descending ``g``.

    Parameters
    ----------
    data
        Table exposing ``delta_g``, ``level_1``, ``level_2`` and ``runn``
        columns and a ``drop_duplicates`` method (presumably a pandas
        DataFrame -- confirm against callers).
    deg : int, optional
        Polynomial degree in height; powers 1..deg enter the design
        matrix (the constant column of the Vandermonde matrix is dropped).
    **kwargs
        Passed through unchanged to ``WLS``.

    Returns
    -------
    tuple
        ``(df, results)``: the stacked observation frame and the object
        returned by ``WLS(...).fit()``.
    """
    # Stack both levels of every measurement into one long table.
    stacked = pd.DataFrame({
        'g': np.concatenate((np.zeros_like(data.delta_g), data.delta_g)),
        'h': np.concatenate((data['level_1'], data['level_2'])) / 1000,
        'ci': np.tile(data.runn, 2),
    })
    stacked = (
        stacked.drop_duplicates(['ci', 'h', 'g'])
        .sort_values(['ci', 'g'], ascending=[True, False])
        .reset_index(drop=True)
    )

    # Observation vector.
    observations = np.asarray(stacked.g)

    # Design matrix: dummy columns for the run ids, then h**1 .. h**deg.
    height_powers = np.vander(
        stacked.h.values, N=deg + 1, increasing=True
    )[:, 1:]
    run_dummies = categorical(stacked.ci.values, drop=True)

    # Label the unknowns: one 'h(<level_1 / 1000>)' per distinct
    # (level_1, runn) pair, then one lowercase letter per polynomial term.
    first_levels = data.drop_duplicates(['level_1', 'runn']).level_1
    offset_names = [
        'h({:,.3f})'.format(level / 1000)
        for level in np.asarray(first_levels)
    ]
    poly_names = list(ascii_lowercase[:deg])
    design = pd.DataFrame(
        np.hstack((run_dummies, height_powers)),
        columns=np.append(offset_names, poly_names),
    )

    # Fit the weighted least squares model.
    results = WLS(observations, design, **kwargs).fit()

    return stacked, results
Beispiel #16
0
 def test_array1d_drop(self):
     """With ``drop=True`` the result equals the five-column dummy matrix."""
     dummies = tools.categorical(self.string_var, drop=True)
     assert_array_equal(dummies, self.dummy)
     n_columns = dummies.shape[1]
     assert_equal(n_columns, 5)
Beispiel #17
0
 def test_structarray1d(self):
     """A single-field structured view gains five dummy fields (6 total)."""
     single_field = [("var1", "f4")]
     instrument = self.structdes["instrument"].view(dtype=single_field)
     encoded = tools.categorical(instrument)
     # The dummy fields are the last five of the result.
     columns = [encoded[name] for name in encoded.dtype.names[-5:]]
     assert_array_equal(np.column_stack(columns), self.dummy)
     assert_equal(len(encoded.dtype.names), 6)
Beispiel #18
0
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Load the greenhouse-gas inventory data set from a hard-coded local path.
sf = pd.read_csv(
    "/Users/bonbon/downloads/USF/2019 Fall/MSIS678_DataWarehousing/Project/San_Francisco_Communitywide_Greenhouse_Gas_Inventory.csv"
)
#print(sf.to_string())

#print(sf_train.to_string())

# X = sf.drop('Emissions_mtCO2e', 1)

# Encode 'Sector_General': categorical(..., dictnames=True) returns the
# encoded array together with a dictionary describing the categories.
Sector_General_np = np.array(sf['Sector_General'])
(Sector_General_cat,
 Sector_General_cat_dict) = stattools.categorical(Sector_General_np,
                                                  drop=True,
                                                  dictnames=True)
# Swap keys and values so each column value can be looked up directly
# (presumably label -> dummy-column index; verify against categorical()).
inv_map = {v: k for k, v in Sector_General_cat_dict.items()}
Sector_General_cat_pd = sf['Sector_General'].apply(lambda r: inv_map[r])
print(Sector_General_cat_pd.to_string())

# Same encoding for 'Sector_Detail1'. NOTE: inv_map is rebound here, so it
# only ever reflects the most recently encoded column.
Sector_Detail1_np = np.array(sf['Sector_Detail1'])
(Sector_Detail1_cat,
 Sector_Detail1_cat_dict) = stattools.categorical(Sector_Detail1_np,
                                                  drop=True,
                                                  dictnames=True)
inv_map = {v: k for k, v in Sector_Detail1_cat_dict.items()}
Sector_Detail1_cat_pd = sf['Sector_Detail1'].apply(lambda r: inv_map[r])

# Raw values of 'Sector_Detail2', prepared for the same encoding.
Sector_Detail2_np = np.array(sf['Sector_Detail2'])
(Sector_Detail2_cat,
Beispiel #19
0
 def test_structarray2d_drop(self):
     """Dropping 'str_instr' leaves 8 fields, the last five being dummies."""
     encoded = tools.categorical(self.structdes, col='str_instr', drop=True)
     trailing = encoded.dtype.names[-5:]
     dummy_block = np.column_stack([encoded[field] for field in trailing])
     assert_array_equal(dummy_block, self.dummy)
     assert_equal(len(encoded.dtype.names), 8)
Beispiel #20
0
def test_categorical_errors(string_var):
    """A tuple or a dict passed as ``col`` raises ``ValueError``."""
    # Tuple of two columns.
    multi_column = 'Can only convert one column'
    with pytest.raises(ValueError, match=multi_column):
        tools.categorical(string_var, (0, 1))

    # Dict in place of a column name.
    name_mismatch = 'data.name does not match col'
    with pytest.raises(ValueError, match=name_mismatch):
        tools.categorical(string_var, {'a': 1})
Beispiel #21
0
 def test_array1d(self):
     """Without ``drop`` the result has 6 columns, the last five dummies."""
     encoded = tools.categorical(self.instr)
     dummy_part = encoded[:, -5:]
     assert_array_equal(dummy_part, self.dummy)
     assert_equal(encoded.shape[1], 6)
Beispiel #22
0
 def test_recarray2d_error(self):
     """``col=None`` on a two-column record array raises ``IndexError``.

     The call is also wrapped in a "recarray support" FutureWarning check.
     """
     doubled = np.c_[self.recdes, self.recdes]
     expect_error = pytest.raises(
         IndexError, match='col is None and the input'
     )
     expect_warning = pytest.warns(FutureWarning, match="recarray support")
     with expect_error, expect_warning:
         tools.categorical(doubled, col=None)
Beispiel #23
0
 def test_structarray1d_drop(self):
     """With ``drop=True`` only the five dummy fields remain."""
     view_dtype = [("var1", "a10")]
     instrument = self.structdes["str_instr"].view(dtype=view_dtype)
     encoded = tools.categorical(instrument, drop=True)
     # Every remaining field is a dummy, so stack them all.
     fields = [encoded[name] for name in encoded.dtype.names]
     assert_array_equal(np.column_stack(fields), self.dummy)
     assert_equal(len(encoded.dtype.names), 5)
Beispiel #24
0
 def test_structarray2dint(self):
     """Selecting the column by integer position (3) yields 9 fields."""
     encoded = tools.categorical(self.structdes, col=3)
     last_five = encoded.dtype.names[-5:]
     dummy_block = np.column_stack([encoded[name] for name in last_five])
     assert_array_equal(dummy_block, self.dummy)
     assert_equal(len(encoded.dtype.names), 9)
Beispiel #25
0
 def test_recarray2d(self):
     """Encoding 'str_instr' in the record array appends five dummy fields."""
     encoded = tools.categorical(self.recdes, col="str_instr")
     # Gather the trailing five fields into a plain 2-D array to compare.
     field_names = encoded.dtype.names
     dummy_block = np.column_stack(
         [encoded[name] for name in field_names[-5:]]
     )
     assert_array_equal(dummy_block, self.dummy)
     assert_equal(len(field_names), 9)
Beispiel #26
0
 def test_array2d(self):
     """Encoding column 2 of a stacked 2-D array gives 10 columns."""
     stacked = np.column_stack((self.des, self.instr, self.des))
     encoded = tools.categorical(stacked, col=2)
     assert_array_equal(encoded[:, -5:], self.dummy)
     assert_equal(encoded.shape[1], 10)
Beispiel #27
0
def test_rec_issue302():
    """Encode a single-field record array without passing ``col``.

    Regression check named after issue 302.
    """
    source = np.rec.fromrecords([[10], [11]], names='group')
    result = tools.categorical(source)

    wanted_rows = [(10, 1.0, 0.0), (11, 0.0, 1.0)]
    wanted_dtype = [('group', int), ('group_10', float), ('group_11', float)]
    wanted = np.rec.array(wanted_rows, dtype=wanted_dtype)
    assert_array_equal(result, wanted)
Beispiel #28
0
 def test_arraylike1d_drop(self):
     """A plain list with ``drop=True`` gives the five-column dummy matrix."""
     values = self.structdes['instrument'].tolist()
     encoded = tools.categorical(values, drop=True)
     assert_array_equal(encoded, self.dummy)
     assert_equal(encoded.shape[1], 5)
Beispiel #29
0
 def test_arraylike2d_drop(self):
     """A list-of-rows input with column 2 dropped has 8 columns."""
     rows = self.structdes.tolist()
     encoded = tools.categorical(rows, col=2, drop=True)
     dummy_part = encoded[:, -5:]
     assert_array_equal(dummy_part, self.dummy)
     assert_equal(encoded.shape[1], 8)
Beispiel #30
0
 def test_arraylike1d(self):
     """A plain list without ``drop`` gives 6 columns, the last five dummies."""
     values = self.structdes['instrument'].tolist()
     encoded = tools.categorical(values)
     dummy_part = encoded[:, -5:]
     assert_array_equal(dummy_part, self.dummy)
     assert_equal(encoded.shape[1], 6)
Beispiel #31
0
 def test_recarray1d(self):
     """A recarray view of 'str_instr' is encoded into 6 fields."""
     instrument = self.structdes["str_instr"].view(np.recarray)
     encoded = tools.categorical(instrument)
     trailing = encoded.dtype.names[-5:]
     dummy_block = np.column_stack([encoded[name] for name in trailing])
     assert_array_equal(dummy_block, self.dummy)
     assert_equal(len(encoded.dtype.names), 6)
Beispiel #32
0
 def test_recarray2d_error(self):
     """``col=None`` on a two-column record array raises ``IndexError``."""
     doubled = np.c_[self.recdes, self.recdes]
     message = 'col is None and the input'
     with pytest.raises(IndexError, match=message):
         tools.categorical(doubled, col=None)
Beispiel #33
0
 def test_structarray2d_drop(self):
     """Encode 'str_instr' with ``drop=True`` and expect 8 fields in total."""
     result = tools.categorical(self.structdes, col="str_instr", drop=True)
     names = result.dtype.names
     # The five dummy fields sit at the end of the structured result.
     columns = [result[name] for name in names[-5:]]
     assert_array_equal(np.column_stack(columns), self.dummy)
     assert_equal(len(names), 8)
Beispiel #34
0
 def test_recarray1d_drop(self):
     """A recarray view with ``drop=True`` keeps only the five dummy fields."""
     instrument = self.structdes['instrument'].view(np.recarray)
     encoded = tools.categorical(instrument, drop=True)
     all_fields = [encoded[name] for name in encoded.dtype.names]
     assert_array_equal(np.column_stack(all_fields), self.dummy)
     assert_equal(len(encoded.dtype.names), 5)
Beispiel #35
0
# 3        Married  <=50K           0.00000
# 4        Married  <=50K           0.00000
# =============================================================================
# =============================================================================
# For simplicity, we save the Income variable as y.
# y = adult_tr[['Income']]
y = adult_tr[['Income']]
# y is a one-column frame, [18761 rows x 1 columns].
# Marital status is a categorical predictor. The CART model implemented in
# the sklearn package needs categorical variables in dummy-variable form,
# so a set of dummy variables is built with categorical().
# =============================================================================
# Turn the Marital status column into a plain array.
mar_np = np.array(adult_tr['Marital status'])
# With dictnames=True, categorical() returns an (array, dict) pair. Unpack it
# in a single call so that mar_cat_dict holds the dictionary itself rather
# than the whole tuple, and the data is encoded only once.
(mar_cat, mar_cat_dict) = stattools.categorical(mar_np,
                                                drop=True,
                                                dictnames=True)
# Convert the dummy matrix into a data frame so it can be joined to the
# other predictors.
mar_cat_pd = pd.DataFrame(mar_cat)
# Attach Cap_Gains_Losses to the dummy-variable columns; the result is X.
X = pd.concat((adult_tr[['Cap_Gains_Losses']], mar_cat_pd), axis=1)
# =============================================================================
# We then use the concat() command to attach the predictor variable Cap_Gains_Losses to
# the data frame of dummy variables that represent marital status. We save the result as X.
# =============================================================================
# =============================================================================
# Data is like this
# 18749          0.000000  0.0  1.0  0.0  0.0  0.0
# 18750          0.010550  0.0  0.0  1.0  0.0  0.0
# 18751          1.000000  0.0  1.0  0.0  0.0  0.0
# 18752          0.362489  0.0  1.0  0.0  0.0  0.0
#
Beispiel #36
0
 def test_array1d_drop(self):
     """Encode ``string_var`` with ``drop=True`` and expect five columns."""
     encoded = tools.categorical(self.string_var, drop=True)
     assert_array_equal(encoded, self.dummy)
     column_count = encoded.shape[1]
     assert_equal(column_count, 5)
Beispiel #37
0
 def test_array1d_col_error(self):
     """A dict passed as ``col`` raises ``TypeError``."""
     message = 'col must be a str, int or None'
     with pytest.raises(TypeError, match=message):
         tools.categorical(self.instr, col={'a': 1})
Beispiel #38
0
 def test_recarray1d(self):
     """Encode a recarray view of 'str_instr'; expect 6 fields in total."""
     rec_view = self.structdes['str_instr'].view(np.recarray)
     result = tools.categorical(rec_view)
     names = result.dtype.names
     columns = [result[name] for name in names[-5:]]
     assert_array_equal(np.column_stack(columns), self.dummy)
     assert_equal(len(names), 6)
Beispiel #39
0
 def test_array2d_drop(self):
     """Dropping encoded column 2 of the stacked array leaves 9 columns."""
     stacked = np.column_stack((self.des, self.instr, self.des))
     encoded = tools.categorical(stacked, col=2, drop=True)
     assert_array_equal(encoded[:, -5:], self.dummy)
     assert_equal(encoded.shape[1], 9)
Beispiel #40
0
 def test_recarray2dint(self):
     """Integer ``col=3`` on the record array warns and yields 9 fields."""
     with pytest.warns(FutureWarning, match="recarray support"):
         encoded = tools.categorical(self.recdes, col=3)
     last_five = encoded.dtype.names[-5:]
     dummy_block = np.column_stack([encoded[name] for name in last_five])
     assert_array_equal(dummy_block, self.dummy)
     assert_equal(len(encoded.dtype.names), 9)
Beispiel #41
0
 def test_structarray1d(self):
     """Encode a one-field structured view; expect 6 fields in the result."""
     view_dtype = [('var1', 'f4')]
     instrument = self.structdes['instrument'].view(dtype=view_dtype)
     result = tools.categorical(instrument)
     names = result.dtype.names
     columns = [result[name] for name in names[-5:]]
     assert_array_equal(np.column_stack(columns), self.dummy)
     assert_equal(len(names), 6)
Beispiel #42
0
 def test_array1d(self):
     """Encode the 1-D ``instr`` array; expect 6 columns ending in dummies."""
     result = tools.categorical(self.instr)
     trailing_columns = result[:, -5:]
     assert_array_equal(trailing_columns, self.dummy)
     assert_equal(result.shape[1], 6)
Beispiel #43
0
 def test_recarray2d(self):
     """Encode 'str_instr' of the record array; expect 9 fields in total."""
     result = tools.categorical(self.recdes, col='str_instr')
     # Stack the trailing five fields into a 2-D array for comparison.
     trailing = result.dtype.names[-5:]
     dummy_block = np.column_stack([result[field] for field in trailing])
     assert_array_equal(dummy_block, self.dummy)
     assert_equal(len(result.dtype.names), 9)
Beispiel #44
0
 def test_array2d_drop(self):
     """Encode and drop column 2 of a 2-D array; expect 9 columns."""
     design = np.column_stack((self.des, self.instr, self.des))
     result = tools.categorical(design, col=2, drop=True)
     dummy_part = result[:, -5:]
     assert_array_equal(dummy_part, self.dummy)
     assert_equal(result.shape[1], 9)
Beispiel #45
0
 def test_structarray2dint(self):
     """Encode column index 3 of the structured design; expect 9 fields."""
     result = tools.categorical(self.structdes, col=3)
     names = result.dtype.names
     columns = [result[name] for name in names[-5:]]
     assert_array_equal(np.column_stack(columns), self.dummy)
     assert_equal(len(names), 9)
Beispiel #46
0
 def test_recarray1d_drop(self):
     """Encode a recarray view with ``drop=True``; expect five fields."""
     rec_view = self.structdes["instrument"].view(np.recarray)
     result = tools.categorical(rec_view, drop=True)
     names = result.dtype.names
     stacked = np.column_stack([result[name] for name in names])
     assert_array_equal(stacked, self.dummy)
     assert_equal(len(names), 5)
Beispiel #47
0
 def test_structarray1d_drop(self):
     """Encode a one-field structured view with ``drop=True``; 5 fields."""
     single_field = [('var1', 'a10')]
     instrument = self.structdes['str_instr'].view(dtype=single_field)
     result = tools.categorical(instrument, drop=True)
     names = result.dtype.names
     stacked = np.column_stack([result[name] for name in names])
     assert_array_equal(stacked, self.dummy)
     assert_equal(len(names), 5)
import numpy as np
import statsmodels.tools.tools as stattools
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Load the adult training data from a hard-coded local path.
adult_tr = pd.read_csv(
    "/Users/chriskehl/Library/CloudStorage/iCloud Drive/Documents/data_files/data_science/data_sets/website_data_sets/adult_ch6_training"
)

# Keep the Income column as a one-column frame y.
y = adult_tr[['Income']]

# Build dummy variables for Marital status. With dictnames=True,
# categorical() returns the encoded array together with a dictionary.
mar_np = np.array(adult_tr['Marital status'])
(mar_cat, mar_cat_dict) = stattools.categorical(mar_np,
                                                drop=True,
                                                dictnames=True)

# Join the dummy variables with Cap_Gains_Losses to form the predictors X.
mar_cat_pd = pd.DataFrame(mar_cat)
X = pd.concat((adult_tr[['Cap_Gains_Losses']], mar_cat_pd), axis=1)

# Display names for the columns of X.
# NOTE(review): the dummy-column labels are hard-coded and assume the order
# produced by categorical(); confirm against mar_cat_dict.
X_names = [
    "Cap_Gains_Losses", "Divorced", "Married", "Never-Married", "Separated",
    "Widowed"
]

# Labels for the levels of y.
y_names = ["<=50K", ">50K"]