def setupClass(cls):
    """Fit an MNLogit model on the ANES96 data with L-BFGS and attach both
    the fitted results (res1) and the reference results (res2) to the class.
    """
    anes = sm.datasets.anes96.load()
    cls.data = anes
    design = sm.add_constant(anes.exog, prepend=False)
    model = MNLogit(anes.endog, design)
    # Tight lbfgs settings: looser combinations (m=12/pgtol=1e-7/factr=1e3,
    # m=20/1e-8/1e2, m=30/1e-9/1e1) left 5, 3 and 1 comparison failures
    # respectively, so use m=40 with very small tolerances.
    cls.res1 = model.fit(
        method="lbfgs",
        disp=0,
        maxiter=50000,
        m=40,
        pgtol=1e-10,
        factr=5e0,
        loglike_and_score=model.loglike_and_score,
    )
    reference = Anes()
    reference.mnlogit_basezero()
    cls.res2 = reference
Note IRis data has perfect separation problem. add some noise to it!!! ''' iris = datasets.load_iris() x = iris.data # lets use the first two features!! x = np.c_[np.ones(x.shape[0]), x] print(x.shape) print(x[:4]) y = iris.target y[1:5] = 2 y[-5:] = 1 xTrain, xTest, yTrain, yTest = train_test_split( x, y, test_size=0.33, random_state=50) # equal size splits! print(xTest.shape) logitModel = MNLogit(yTrain, xTrain) rs = logitModel.fit(maxiter=200) print(rs.params) print(rs.summary()) yPred = rs.predict() print(yPred, yPred.shape, '\n\n') def probtolabelPred(yPred): print(yPred[:10]) labs = np.argmax(yPred, axis=1) return labs predLabs = probtolabelPred(yPred) print(predLabs)
def multicategorical_continuous(dependent_var: pd.Series,
                                independent_var: pd.Series,
                                list_columns_names_export: list):
    """Cross a multi-category dependent variable with a continuous one.

    Returns a long-format DataFrame combining, per dependent modality,
    descriptive statistics of the continuous variable, a one-way ANOVA
    p-value, and multinomial-logit coefficients / p-values / confidence
    bounds / LRT p-value, restricted to ``list_columns_names_export``.

    Fixes vs. previous version: ``Series.iteritems()`` and
    ``reorder_categories(..., inplace=True)`` were removed in pandas 2.0;
    ``np.NaN`` was removed in NumPy 2.0.
    """
    from scipy.stats import f_oneway

    dependent_var_name = dependent_var.name
    independent_var_name = independent_var.name

    long_data = pd.DataFrame.from_dict(
        {"dependent_var_name": dependent_var,
         "independent_var_name": independent_var},
        orient="columns")

    # Descriptive stats of the continuous variable, one row per
    # (dependent modality, indicator).
    groupby_stats = long_data \
        .groupby("dependent_var_name") \
        .agg(["min", "max", "median", "mean", "count"]) \
        .droplevel(level=0, axis=1) \
        .rename({"count": "nonNA_count"}, axis=1) \
        .reset_index(drop=False) \
        .melt(id_vars="dependent_var_name", var_name="indicator") \
        .rename({"dependent_var_name": "dependent_var_modality"}, axis=1) \
        .round({"value": 2})

    # One-way ANOVA across the dependent modalities.
    # Series.iteritems() was removed in pandas 2.0 -> use items().
    wide_data = [col.dropna()
                 for _, col in long_data.pivot(
                     index=None,
                     columns="dependent_var_name",
                     values="independent_var_name").items()]
    anova = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "indicator": "anova_oneway",
        "value": f_oneway(*wide_data).pvalue},
        orient="index").transpose()

    # Multinomial logit with the most frequent modality as reference level.
    dependent_var_cat = dependent_var.astype(CategoricalDtype(ordered=False))
    ref_modality_dependent = groupby_stats \
        .loc[lambda df: df["indicator"] == "nonNA_count", :] \
        .loc[lambda df: df["value"] == df["value"].max(), :] \
        .iloc[0, :]["dependent_var_modality"]
    new_levels = [ref_modality_dependent] + \
        pd.CategoricalIndex(dependent_var_cat) \
        .remove_categories(ref_modality_dependent).categories.tolist()
    # reorder_categories(..., inplace=True) was removed in pandas 2.0:
    # reassign the reordered series instead.
    dependent_var_cat = dependent_var_cat.cat.reorder_categories(new_levels)

    X = independent_var.rename(independent_var_name) \
        .to_frame().assign(intercept=1)
    model = MNLogit(dependent_var_cat, X)
    results = model.fit()

    # Coefficients, labelled by non-reference dependent modality.
    params = results.params
    params.columns = dependent_var_cat.cat.categories[1:]
    params = params.rename_axis("dependent_var_modality", axis=1) \
        .rename_axis("independent_var", axis=0) \
        .drop("intercept", axis=0) \
        .melt(var_name="dependent_var_modality") \
        .assign(indicator="coeffs_LogR")

    # Likelihood-ratio test p-value of the whole model.
    LRT = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "indicator": "pvalue_LRT_LogR",
        "value": results.llr_pvalue}, orient="index").transpose()

    # Per-coefficient p-values.
    pvalues = results.pvalues
    pvalues.columns = dependent_var_cat.cat.categories[1:]
    pvalues = pvalues.rename_axis("independent_var_name") \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .melt(id_vars="independent_var_name",
              var_name="dependent_var_modality") \
        .assign(indicator="pvalue_coeff_LogR") \
        .drop("independent_var_name", axis=1)

    # Coefficient confidence bounds, intercept rows excluded.
    conf_int = results.conf_int() \
        .reset_index(level=1, drop=False) \
        .rename({"level_1": "independent_var_name"}, axis=1) \
        .rename_axis("dependent_var_modality", axis=0) \
        .loc[lambda df: df["independent_var_name"] != "intercept", :] \
        .reset_index(drop=False) \
        .rename({"lower": "coeff_LogR_lb", "upper": "coeff_LogR_ub"}, axis=1) \
        .melt(id_vars=["dependent_var_modality", "independent_var_name"],
              value_vars=["coeff_LogR_lb", "coeff_LogR_ub"],
              var_name="indicator") \
        .drop("independent_var_name", axis=1)

    multicategorical_continuous = pd.concat(
        [groupby_stats, params, pvalues, conf_int, LRT, anova], axis=0) \
        .assign(ref_modality_dependent=ref_modality_dependent,
                ref_modality_independent=np.nan,
                independent_var_modality=np.nan,
                independent_var_name=independent_var_name,
                dependent_var_name=dependent_var_name)
    return multicategorical_continuous[list_columns_names_export]
def multicategorical_multicategorical(dependent_var: pd.Series,
                                      independent_var: pd.Series,
                                      list_columns_names_export: list):
    """Cross two multi-category variables.

    Fits a multinomial logit of ``dependent_var`` on dummies of
    ``independent_var`` (most frequent modality of each used as the
    reference) and returns a long-format DataFrame with coefficients,
    p-values, confidence bounds, the LRT p-value and a chi-square test
    of the cross-count table, restricted to
    ``list_columns_names_export``.

    Fix vs. previous version: ``reorder_categories(..., inplace=True)``
    was removed in pandas 2.0.
    """
    dependent_var_name = dependent_var.name
    independent_var_name = independent_var.name

    crosscount = pd.crosstab(index=dependent_var, columns=independent_var,
                             margins=True, margins_name="overall_margin")

    # Reference modality of the independent variable = most frequent one.
    ref_modality_independent = crosscount.transpose() \
        .loc[lambda df: df["overall_margin"] ==
             df["overall_margin"].drop("overall_margin").max(), :] \
        .index[0]
    independent_dummies = pd.get_dummies(independent_var, drop_first=False) \
        .drop(ref_modality_independent, axis=1) \
        .assign(intercept=1)

    # Reference modality of the dependent variable = most frequent one;
    # put it first so MNLogit treats it as the base outcome.
    dependent_var_cat = dependent_var.astype(CategoricalDtype(ordered=False))
    ref_modality_dependent = \
        crosscount.loc[lambda df: df["overall_margin"] ==
                       df["overall_margin"].drop("overall_margin").max(), :] \
        .index[0]
    new_levels = [ref_modality_dependent] + \
        pd.CategoricalIndex(dependent_var_cat) \
        .remove_categories(ref_modality_dependent) \
        .categories.tolist()
    # reorder_categories(..., inplace=True) was removed in pandas 2.0:
    # reassign the reordered series instead.
    dependent_var_cat = dependent_var_cat.cat.reorder_categories(new_levels)

    # MODEL
    # model = Logit(dependent_var_cat.cat.codes, independent_dummies)
    model = MNLogit(dependent_var_cat, independent_dummies)
    results = model.fit()

    # Coefficients, labelled by non-reference dependent modality.
    params = results.params
    params.columns = dependent_var_cat.cat.categories[1:]
    params = params.rename_axis("dependent_var_modality") \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .melt(id_vars="dependent_var_modality",
              var_name="independent_var_modality") \
        .assign(indicator="coeffs_LogR")

    # Per-coefficient p-values.
    # NOTE(review): rounding the "independent_var_modality" label column
    # looks like a typo for {"value": 2}; kept as-is to preserve output —
    # confirm the intent.
    pvalues = results.pvalues
    pvalues.columns = dependent_var_cat.cat.categories[1:]
    pvalues = pvalues.rename_axis("dependent_var_modality") \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .melt(id_vars="dependent_var_modality",
              var_name="independent_var_modality") \
        .assign(indicator="pvalue_coeff_LogR") \
        .round({"independent_var_modality": 2})

    # Coefficient confidence bounds, intercept rows excluded.
    conf_int = results.conf_int() \
        .reset_index(level=1, drop=False) \
        .rename({"level_1": "independent_var_modality"}, axis=1) \
        .loc[lambda df: df["independent_var_modality"] != "intercept", :] \
        .rename_axis("dependent_var_modality") \
        .reset_index(drop=False) \
        .rename({"lower": "coeff_LogR_lb", "upper": "coeff_LogR_ub"}, axis=1) \
        .melt(id_vars=["dependent_var_modality", "independent_var_modality"],
              value_vars=["coeff_LogR_lb", "coeff_LogR_ub"],
              var_name="indicator")

    # Likelihood-ratio test p-value of the whole model.
    LRT = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "independent_var_modality": "overall_margin",
        "indicator": "pvalue_LRT_LogR",
        "value": results.llr_pvalue},
        orient="index").transpose()

    # Chi-square independence test on the margin-free cross-count table.
    chi2_pvalue = chi2_contingency(
        crosscount.drop("overall_margin", axis=0)
                  .drop("overall_margin", axis=1))[1]
    chi2_crosscount = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "independent_var_modality": "overall_margin",
        "indicator": "pvalue_chisquare_crosscount",
        "value": chi2_pvalue},
        orient="index").transpose()

    multicategorical_multicategorical = pd.concat(
        [params, pvalues, conf_int, LRT, chi2_crosscount], axis=0) \
        .assign(ref_modality_dependent=ref_modality_dependent,
                ref_modality_independent=ref_modality_independent,
                independent_var_name=independent_var_name,
                dependent_var_name=dependent_var_name)
    return multicategorical_multicategorical[list_columns_names_export]
rf.score(X_test, y_test)
sorted_import_index = rf.feature_importances_.argsort()

"""Compute Standard Errors"""
from statsmodels.discrete.discrete_model import Logit, MNLogit

top_topics = mean_diff.topic.values
tmp = y.copy()
# One-vs-rest binary logit per top topic; print each fitted summary.
for i in range(10):
    y_31 = pd.Series(np.where(tmp == top_topics[i], 1, 0))
    sm_logit = Logit(endog=y_31, exog=X.reset_index(drop=True))
    print(top_topics[i], '\n', sm_logit.fit().summary())

# Multinomial logit over all topics, fitted with Nelder-Mead.
MN_logit = MNLogit(endog=y.astype(str), exog=X)
MN_logit.fit(method='nm', maxiter=5000, maxfun=5000).summary()

"""Make Predictions at the means"""
# Hold the other covariates at their sample means while sweeping
# black_proportion over 50 evenly spaced values.
mean_cols = ["asian_proportion", "latinx_proportion",
             "log_income", "log_price"]
sample_data = pd.DataFrame(index=range(0, 50), columns=mean_cols)
sample_data[mean_cols] = df[mean_cols].mean().values.reshape(1, -1)
b_min = df.black_proportion.min()
b_max = df.black_proportion.max()
sample_data["black_proportion"] = range(1, 51)
# NOTE(review): this sweep spans (b_max-b_min)/50 .. (b_max-b_min) without
# adding b_min back — confirm the missing offset is intentional.
sample_data.black_proportion = (
    (b_max - b_min) / 50 * sample_data.black_proportion)
sample_data = sample_data[["black_proportion", "asian_proportion",
                           "latinx_proportion", "log_income", "log_price"]]
predicted_values = LR.predict_proba(sample_data.values)
[np.argmax(x) for x in predicted_values]

df.to_csv('data/data514.csv')