Example #1
def setupClass(cls):
    data = sm.datasets.anes96.load()
    data.exog = sm.add_constant(data.exog, prepend=False)
    cls.res1 = MNLogit(data.endog, data.exog).fit_regularized(
        method="l1", alpha=0, disp=0, acc=1e-15, maxiter=1000,
        trim_mode='auto', auto_trim_tol=0.01)
    cls.res2 = MNLogit(data.endog, data.exog).fit(disp=0, tol=1e-15)
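With alpha=0 the L1 penalty vanishes, so the regularized fit should reproduce the plain maximum-likelihood estimates. A minimal self-contained sketch of that comparison (the tolerance is an assumption, not taken from any test suite):

import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import MNLogit

data = sm.datasets.anes96.load()
exog = sm.add_constant(data.exog, prepend=False)
res_l1 = MNLogit(data.endog, exog).fit_regularized(
    method="l1", alpha=0, disp=0, acc=1e-15, maxiter=1000)
res_mle = MNLogit(data.endog, exog).fit(disp=0)
# With no penalty, the two estimates agree up to optimizer tolerance.
assert np.allclose(res_l1.params, res_mle.params, atol=1e-4)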
Example #2
def setupClass(cls):
    cls.kvars = 4  # Number of variables
    cls.m = 3  # Number of unregularized parameters
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog, prepend=True)
    alpha = np.array([0, 0, 0, 10])
    cls.res_reg = MNLogit(data.endog, data.exog).fit_regularized(
        method="l1", alpha=alpha, disp=0, acc=1e-15, maxiter=2000,
        trim_mode='auto')
    # Actually drop the last column and do an unregularized fit
    exog_no_PSI = data.exog[:, :cls.m]
    cls.res_unreg = MNLogit(data.endog, exog_no_PSI).fit(
        disp=0, tol=1e-15)
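alpha may also be a per-coefficient array: here only the fourth column (PSI) carries a penalty, so trimming should zero its coefficient and the remaining estimates should approximate a fit that omits the column entirely. A rough self-contained sketch of that comparison (np.asarray is used because recent statsmodels versions return pandas objects from load()):

import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import MNLogit

data = sm.datasets.spector.load()
exog = np.asarray(sm.add_constant(data.exog, prepend=True))
alpha = np.array([0, 0, 0, 10])  # penalize only the last column (PSI)
res_reg = MNLogit(data.endog, exog).fit_regularized(
    method="l1", alpha=alpha, disp=0, acc=1e-15, trim_mode='auto')
res_unreg = MNLogit(data.endog, exog[:, :3]).fit(disp=0)
# The first three penalized coefficients should be close to the
# three-regressor fit, with the PSI coefficient trimmed to zero.
print(res_reg.params)
print(res_unreg.params)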
Example #3
def setupClass(cls):
    data = sm.datasets.anes96.load()
    cls.data = data
    exog = data.exog
    exog = sm.add_constant(exog, prepend=False)
    mymodel = MNLogit(data.endog, exog)
    cls.res1 = mymodel.fit(method="lbfgs", disp=0, maxiter=50000,
                           # m=12, pgtol=1e-7, factr=1e3,  # 5 failures
                           # m=20, pgtol=1e-8, factr=1e2,  # 3 failures
                           # m=30, pgtol=1e-9, factr=1e1,  # 1 failure
                           m=40, pgtol=1e-10, factr=5e0,
                           loglike_and_score=mymodel.loglike_and_score)
    res2 = Anes()
    res2.mnlogit_basezero()
    cls.res2 = res2
Example #4
def test_margeff_dummy(self):
    data = self.data
    vote = data.data['vote']
    exog = np.column_stack((data.exog, vote))
    exog = sm.add_constant(exog, prepend=False)
    res = MNLogit(data.endog, exog).fit(method="newton", disp=0)
    me = res.get_margeff(dummy=True)
    assert_almost_equal(me.margeff, self.res2.margeff_dydx_dummy_overall, 6)
    assert_almost_equal(me.margeff_se,
                        self.res2.margeff_dydx_dummy_overall_se, 6)
    me = res.get_margeff(dummy=True, method="eydx")
    assert_almost_equal(me.margeff, self.res2.margeff_eydx_dummy_overall, 5)
    assert_almost_equal(me.margeff_se,
                        self.res2.margeff_eydx_dummy_overall_se, 6)
Example #5
def setupClass(cls):
    from results.results_discrete import Anes
    data = sm.datasets.anes96.load()
    cls.data = data
    exog = data.exog
    exog = sm.add_constant(exog, prepend=False)
    cls.res1 = MNLogit(data.endog, exog).fit(method="newton", disp=0)
    res2 = Anes()
    res2.mnlogit_basezero()
    cls.res2 = res2
Example #6
# Note: the iris data has a perfect separation problem, so add some noise to it.

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from statsmodels.discrete.discrete_model import MNLogit

iris = datasets.load_iris()
x = iris.data  # use all four features
x = np.c_[np.ones(x.shape[0]), x]  # prepend an intercept column
print(x.shape)
print(x[:4])
y = iris.target
y[1:5] = 2   # relabel a few observations so no class is perfectly separable
y[-5:] = 1
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, test_size=0.33, random_state=50)  # hold out a third for testing
print(xTest.shape)
logitModel = MNLogit(yTrain, xTrain)
rs = logitModel.fit(maxiter=200)
print(rs.params)
print(rs.summary())
yPred = rs.predict()

print(yPred, yPred.shape, '\n\n')


def probtolabelPred(yPred):
    # Convert predicted class probabilities to hard labels.
    print(yPred[:10])
    labs = np.argmax(yPred, axis=1)
    return labs


predLabs = probtolabelPred(yPred)
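The example only scores the training rows; a small follow-up, reusing the names defined above, evaluates on the held-out split (rs.predict accepts new exog and returns one probability column per class):

yPredTest = rs.predict(xTest)            # class probabilities for the held-out rows
testLabs = np.argmax(yPredTest, axis=1)  # most probable class per row
print("test accuracy:", np.mean(testLabs == yTest))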
Example #7
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from statsmodels.discrete.discrete_model import MNLogit


def multicategorical_continuous(dependent_var: pd.Series,
                                independent_var: pd.Series,
                                list_columns_names_export: list):
    
    dependent_var_name = dependent_var.name
    independent_var_name = independent_var.name
    # The dict keys are literal column labels, matched by the groupby below.
    long_data = pd.DataFrame.from_dict({"dependent_var_name": dependent_var,
                                        "independent_var_name": independent_var},
                                       orient="columns")
    
    groupby_stats = long_data \
        .groupby("dependent_var_name") \
        .agg(["min", "max", "median", "mean", "count"]) \
        .droplevel(level=0, axis=1) \
        .rename({"count": "nonNA_count"}, axis=1) \
        .reset_index(drop=False) \
        .melt(id_vars="dependent_var_name",
              var_name="indicator") \
        .rename({"dependent_var_name": "dependent_var_modality"},
                axis=1)\
        .round({"value": 2})
    ###############
    
    ############# ANOVA
    from scipy.stats import f_oneway
    
    wide_data = [col.dropna() for _, col in
                 long_data.pivot(index=None,
                                 columns="dependent_var_name",
                                 values="independent_var_name").items()]
    
    anova = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "indicator": "anova_oneway",
        "value": f_oneway(*wide_data).pvalue},
        orient="index",
    ).transpose()
    #############
    
    ########## MODEL
    dependent_var_cat = dependent_var.astype(CategoricalDtype(ordered=False))
    # Reference modality = the most frequent level of the dependent variable.
    ref_modality_dependent = groupby_stats \
        .loc[lambda df: df["indicator"] == "nonNA_count", :] \
        .loc[lambda df: df["value"] == df["value"].max(), :] \
        .iloc[0, :]["dependent_var_modality"]
    new_levels = [ref_modality_dependent] + pd.CategoricalIndex(dependent_var_cat) \
        .remove_categories(ref_modality_dependent).categories.tolist()
    # reorder_categories returns a new Series (no inplace argument).
    dependent_var_cat = dependent_var_cat.cat.reorder_categories(new_levels)
    
    X = independent_var.rename(independent_var_name).to_frame().assign(intercept=1)
    model = MNLogit(dependent_var_cat, X)
    results = model.fit()
    
    params = results.params
    params.columns = dependent_var_cat.cat.categories[1:]
    params = params.rename_axis("dependent_var_modality", axis=1) \
        .rename_axis("independent_var", axis=0) \
        .drop("intercept", axis=0) \
        .melt(var_name="dependent_var_modality") \
        .assign(indicator="coeffs_LogR")
    
    ########### LRT
    LRT = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "indicator": "pvalue_LRT_LogR",
        "value": results.llr_pvalue
    }, orient="index").transpose()
    
    ########## pvalues model
    pvalues = results.pvalues
    pvalues.columns = dependent_var_cat.cat.categories[1:]
    pvalues = pvalues.rename_axis("independent_var_name") \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .melt(id_vars="independent_var_name",
              var_name="dependent_var_modality") \
        .assign(indicator="pvalue_coeff_LogR") \
        .drop("independent_var_name", axis=1)
    
    ####### conf int param model
    conf_int = results.conf_int() \
        .reset_index(level=1, drop=False) \
        .rename({"level_1": "independent_var_name"}, axis=1) \
        .rename_axis("dependent_var_modality", axis=0) \
        .loc[lambda df: df["independent_var_name"] != "intercept", :] \
        .reset_index(drop=False) \
        .rename({"lower": "coeff_LogR_lb", "upper": "coeff_LogR_ub"}, axis=1) \
        .melt(id_vars=["dependent_var_modality", "independent_var_name"],
              value_vars=["coeff_LogR_lb", "coeff_LogR_ub"],
              var_name="indicator") \
        .drop("independent_var_name", axis=1)
    multicategorical_continuous = pd.concat([groupby_stats, params, pvalues, conf_int, LRT, anova], axis=0) \
        .assign(ref_modality_dependent=ref_modality_dependent,
                ref_modality_independent=np.nan,
                independent_var_modality=np.nan,
                independent_var_name=independent_var_name,
                dependent_var_name=dependent_var_name)
    return multicategorical_continuous[list_columns_names_export]
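A hypothetical call, reusing the iris data from the earlier example; the export list below just enumerates the columns the function produces and is illustrative, not a fixed contract:

import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
species = pd.Series(iris.target, name="species") \
    .map({0: "setosa", 1: "versicolor", 2: "virginica"})
sepal_length = pd.Series(iris.data[:, 0], name="sepal_length")
cols = ["dependent_var_name", "dependent_var_modality", "indicator", "value",
        "independent_var_name", "independent_var_modality",
        "ref_modality_dependent", "ref_modality_independent"]
print(multicategorical_continuous(species, sepal_length, cols).head())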
Example #8
import pandas as pd
from pandas.api.types import CategoricalDtype
from scipy.stats import chi2_contingency
from statsmodels.discrete.discrete_model import MNLogit


def multicategorical_multicategorical(dependent_var: pd.Series,
                                      independent_var: pd.Series,
                                      list_columns_names_export: list):
    dependent_var_name = dependent_var.name
    independent_var_name = independent_var.name
    crosscount = pd.crosstab(index=dependent_var,
                             columns=independent_var,
                             margins=True,
                             margins_name="overall_margin")
    ref_modality_independent = crosscount.transpose() \
        .loc[lambda df: df["overall_margin"] == df["overall_margin"].drop("overall_margin").max(), :] \
        .index[0]
    independent_dummies = pd.get_dummies(independent_var, drop_first=False) \
        .drop(ref_modality_independent, axis=1) \
        .assign(intercept=1)
    
    dependent_var_cat = dependent_var.astype(CategoricalDtype(ordered=False))
    ref_modality_dependent = crosscount \
        .loc[lambda df: df["overall_margin"] == df["overall_margin"].drop("overall_margin").max(), :] \
        .index[0]
    new_levels = [ref_modality_dependent] + pd.CategoricalIndex(dependent_var_cat) \
        .remove_categories(ref_modality_dependent) \
        .categories.tolist()
    # reorder_categories returns a new Series (no inplace argument).
    dependent_var_cat = dependent_var_cat.cat.reorder_categories(new_levels)
    
    # MODEL
    # model = Logit(dependent_var_cat.cat.codes, independent_dummies)
    model = MNLogit(dependent_var_cat, independent_dummies)
    results = model.fit()
    
    params = results.params
    params.columns = dependent_var_cat.cat.categories[1:]
    params = params.rename_axis("dependent_var_modality") \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .melt(id_vars="dependent_var_modality",
              var_name="independent_var_modality") \
        .assign(indicator="coeffs_LogR")
    
    pvalues = results.pvalues
    pvalues.columns = dependent_var_cat.cat.categories[1:]
    pvalues = pvalues.rename_axis("dependent_var_modality") \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .melt(id_vars="dependent_var_modality",
              var_name="independent_var_modality") \
        .assign(indicator="pvalue_coeff_LogR") \
        .round({"independent_var_modality": 2})
    
    conf_int = results.conf_int() \
        .reset_index(level=1, drop=False) \
        .rename({"level_1": "independent_var_modality"}, axis=1) \
        .loc[lambda df: df["independent_var_modality"] != "intercept", :] \
        .rename_axis("dependent_var_modality") \
        .reset_index(drop=False) \
        .rename({"lower": "coeff_LogR_lb",
                 "upper": "coeff_LogR_ub"},
                axis=1) \
        .melt(id_vars=["dependent_var_modality", "independent_var_modality"],
              value_vars=["coeff_LogR_lb", "coeff_LogR_ub"],
              var_name="indicator")
    
    LRT = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "independent_var_modality": "overall_margin",
        "indicator": "pvalue_LRT_LogR",
        "value": results.llr_pvalue},
        orient="index",
    ).transpose()
    chi2_pvalue = chi2_contingency(crosscount.drop("overall_margin", axis=0) \
                                   .drop("overall_margin", axis=1))[1]
    
    chi2_crosscount = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "independent_var_modality": "overall_margin",
        "indicator": "pvalue_chisquare_crosscount",
        "value": chi2_pvalue},
        orient="index",
    ).transpose()
    multicategorical_multicategorical = pd.concat([params, pvalues, conf_int, LRT, chi2_crosscount], axis=0) \
        .assign(ref_modality_dependent=ref_modality_dependent,
                ref_modality_independent=ref_modality_independent,
                independent_var_name=independent_var_name,
                dependent_var_name=dependent_var_name)
    return multicategorical_multicategorical[list_columns_names_export]
Example #9
rf.score(X_test, y_test)


sorted_importance_index = rf.feature_importances_.argsort()  # features ordered by importance

"""Compute Standard Errors"""
from statsmodels.discrete.discrete_model import Logit, MNLogit

top_topics = mean_diff.topic.values
tmp = y.copy()
for i in range(10):
    y_31 = pd.Series(np.where(tmp == top_topics[i], 1, 0))
    sm_logit = Logit(endog=y_31, exog=X.reset_index(drop=True))
    print(top_topics[i], '\n', sm_logit.fit().summary())
MN_logit = MNLogit(endog=y.astype(str), exog=X)
print(MN_logit.fit(method='nm', maxiter=5000, maxfun=5000).summary())

"""Make Predictions at the means"""
sample_data = pd.DataFrame(index=range(0, 50),
                           columns=["asian_proportion", "latinx_proportion",
                                    "log_income", "log_price"])
sample_data[["asian_proportion", "latinx_proportion", "log_income", "log_price"]] = \
    df[["asian_proportion", "latinx_proportion", "log_income", "log_price"]].mean().values.reshape(1, -1)
b_min = df.black_proportion.min()
b_max = df.black_proportion.max()
sample_data["black_proportion"] = range(1, 51)
# Sweep black_proportion over its observed range in 50 steps.
sample_data.black_proportion = b_min + (b_max - b_min) / 50 * sample_data.black_proportion
sample_data = sample_data[["black_proportion", "asian_proportion", "latinx_proportion",
                           "log_income", "log_price"]]
predicted_values = LR.predict_proba(sample_data.values)
[np.argmax(x) for x in predicted_values]

df.to_csv('data/data514.csv')