def Reg32():
    y, X = dmatrices('Dec ~ Markt + Altru_1 + Altru_2 + Geld + Müll',
                     df_ama, return_type='dataframe')
    probit = sm.Probit(y, X, missing='drop')
    res = probit.fit()
    print(res.summary())
def train_probit_across_obs(obs, states, alpha=10.0, verbosity=0):
    """
    :param states: n x k matrix, k = num elements in latent binary vector
    :param obs: n x j matrix, j = num elements in observation/feature binary vector
    :return: k+1 x j weight matrix
    """
    states = sm.tools.add_constant(states, prepend=False)
    weight_columns = np.zeros((states.shape[1], obs.shape[1]))
    for column in range(obs.shape[1]):
        if verbosity == 1:
            print('{0} '.format(column), end="")
            if column % 10 == 0:
                print()
        obs_column = obs[:, column]
        print('num 1\'s in obs_column: sum(obs_column)= {0}'.format(np.sum(obs_column)))
        if verbosity > 1:
            print('obs_column {0}: {1}'.format(obs_column.shape, obs_column))
        probit_model = sm.Probit(obs_column, states)
        fresult = probit_model.fit_regularized(method='l1', alpha=alpha)
        weight_columns[:, column] = fresult.params
        if verbosity > 1:
            print('fresult.params: {0}'.format(fresult.params))
            print('weight_columns:\n{0}'.format(weight_columns))
    return weight_columns
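# A minimal usage sketch for train_probit_across_obs, under the assumption
# that numpy (np) and statsmodels.api (sm) are imported at module level as the
# function requires. The data below are synthetic and purely illustrative.
rng = np.random.default_rng(0)
states_demo = rng.integers(0, 2, size=(200, 3)).astype(float)   # n=200, k=3
signal = states_demo @ np.array([1.5, -1.0, 0.5])
obs_demo = ((signal[:, None] + rng.normal(size=(200, 4))) > 0).astype(float)  # j=4
weights = train_probit_across_obs(obs_demo, states_demo, alpha=1.0)
print(weights.shape)  # (k+1) x j = (4, 4): one L1-regularized probit per column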
def Reg45():
    y, X = dmatrices('Dec ~ Marktgeschehen*Markt', df_ama, return_type='dataframe')
    probit = sm.Probit(y, X, missing='drop')
    res = probit.fit()
    print(res.summary())
def models_pattern(data, matrix):
    y = np.array(data['survived'])
    ones = np.ones(len(matrix[0]))
    X = sm.add_constant(np.column_stack((matrix[0], ones)))
    for ele in matrix[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))

    logit_model = sm.Logit(y, X)
    logit_res = logit_model.fit(maxiter=2000)
    print(logit_res.summary())
    print(logit_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()

    probit_model = sm.Probit(y, X)
    probit_res = probit_model.fit(maxiter=2000)
    print(probit_res.summary())
    print(probit_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()

    linear_model = sm.OLS(y, X)
    linear_res = linear_model.fit()  # OLS.fit() is non-iterative and takes no maxiter
    result = 0.
    for array in X:
        for i, item in enumerate(array):
            result += linear_res.params[i] * item
    result /= len(X)
    print('Linear function value: {}'.format(result))
    print(linear_res.summary())
    print(linear_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()
def Reg44():
    y, X = dmatrices('Dec ~ WiStu*Markt + Student', df_ama, return_type='dataframe')
    probit = sm.Probit(y, X, missing='drop')
    res = probit.fit()
    print(res.summary())
def Reg43():
    y, X = dmatrices('Dec ~ FMIS_Index*Markt + Altru_1 + Altru_2',
                     df_ama, return_type='dataframe')
    probit = sm.Probit(y, X, missing='drop')
    res = probit.fit()
    print(res.summary())
def fit_psychometric_curve(log_file, plot=False, thresholds=(1, 4)):
    import statsmodels.api as sm

    df = pd.read_table(log_file)
    df = df[df.phase == 9]
    df = df.pivot_table(index=['trial_nr'],
                        values=['choice', 'certainty', 'n1', 'n2', 'prob1', 'prob2'])
    df = df[~df.choice.isnull()]
    df['log(risky/safe)'] = np.log(df['n1'] / df['n2'])

    ix = df.prob1 == 1.0
    print(df)
    if ix.sum() > 0:
        df.loc[ix, 'log(risky/safe)'] = np.log(df.loc[ix, 'n2'] / df.loc[ix, 'n1'])
        df.loc[ix, 'chose risky'] = df.loc[ix, 'choice'] == 2
    if (~ix).sum() > 0:
        df.loc[~ix, 'log(risky/safe)'] = np.log(df.loc[~ix, 'n1'] / df.loc[~ix, 'n2'])
        df.loc[~ix, 'chose risky'] = df.loc[~ix, 'choice'] == 1
    df['chose risky'] = df['chose risky'].astype(bool)

    if plot:
        import seaborn as sns
        import matplotlib.pyplot as plt
        fac = sns.lmplot('log(risky/safe)', 'chose risky', data=df, logistic=True)
        for color, x in zip(sns.color_palette()[:4], [np.log(1. / .55)]):
            plt.axvline(x, color=color, ls='--')
        plt.gcf().set_size_inches(14, 6)
        plt.axhline(.5, c='k', ls='--')
        x = np.linspace(0, 1.5, 17)
        plt.xticks(x, [f'{e:0.2f}' for e in np.exp(x)], rotation='vertical')
        plt.xlim(0, 1.5)
        plt.show()

    # Fit probit
    df['intercept'] = 1
    try:
        m = sm.Probit(df['chose risky'], df[['intercept', 'log(risky/safe)']])
        r = m.fit()
        x_lower = (ss.norm.ppf(.2) - r.params.intercept) / r.params['log(risky/safe)']
        x_upper = (ss.norm.ppf(.8) - r.params.intercept) / r.params['log(risky/safe)']
    except Exception as e:
        print("Problem with calibration, using standard values")
        x_lower = np.log(thresholds[0])
        x_upper = np.log(thresholds[1])

    print(f'Original bounds: {np.exp(x_lower)}, {np.exp(x_upper)}')
    x_lower = np.exp(np.max((x_lower, np.log(thresholds[0]))))
    x_upper = np.exp(np.min((x_upper, np.log(thresholds[1]))))
    print(f'Final bounds: {x_lower}, {x_upper}')

    return x_lower, x_upper
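# The calibration step above inverts the fitted probit: solving
# Phi(a + b*x) = p for x gives x = (Phi^-1(p) - a) / b. A standalone, hedged
# illustration with made-up coefficients (a and b are placeholders, not values
# from any real fit; numpy as np and scipy.stats as ss are assumed imported):
a, b = -0.3, 1.2                        # hypothetical intercept and slope
for p in (0.2, 0.8):
    x_p = (ss.norm.ppf(p) - a) / b      # log-ratio where P(chose risky) = p
    print(p, np.exp(x_p))               # back on the risky/safe ratio scale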
def Reg53():
    y, X = dmatrices(
        'Dec ~ FMIS_Index + pol_rechts + Gespendet + Geld + weiblich + Alter + '
        'Selbst + Sozial + Marktberuf + Akademiker',
        df_Markt, return_type='dataframe')
    probit = sm.Probit(y, X, missing='drop')
    res = probit.fit()
    print(res.summary())
def Reg51():
    y, X = dmatrices(
        'Dec ~ Altru_1 + Altru_2 + FMIS_Index + Marktgeschehen + pol_rechts + '
        'Gespendet + Geld + weiblich + Alter + Selbst + Sozial + Marktberuf + '
        'WiStu + NatStu + SoStu + JurStu + Akademiker',
        df_Markt, return_type='dataframe')
    probit = sm.Probit(y, X, missing='drop')
    res = probit.fit()
    print(res.summary())
def start_values(init_dict, data_frame, option):
    """The function selects the start values for the minimization process."""
    if not isinstance(init_dict, dict):
        msg = 'The input object ({}) for specifying the start values is not a dictionary.' \
            .format(init_dict)
        raise UserError(msg)
    indicator = init_dict['ESTIMATION']['indicator']
    dep = init_dict['ESTIMATION']['dependent']

    if option == 'init':
        # Set coefficients equal to the true init file values
        x0 = init_dict['AUX']['init_values'][:-6]
    elif option == 'auto':
        try:
            # Estimate beta1 and beta0:
            beta = []
            sd_ = []
            for i in [1.0, 0.0]:
                Y = data_frame[dep][data_frame[indicator] == i]
                if i == 1:
                    order = init_dict['TREATED']['order']
                else:
                    order = init_dict['UNTREATED']['order']
                X = data_frame[[init_dict['varnames'][j - 1] for j in order]][
                    i == data_frame[indicator]]
                ols_results = sm.OLS(Y, X).fit()
                beta += [ols_results.params]
                sd_ += [np.sqrt(ols_results.scale), 0.0]

            # Estimate gamma via Probit
            Z = data_frame[[
                init_dict['varnames'][j - 1] for j in init_dict['CHOICE']['order']
            ]]
            probitRslt = sm.Probit(data_frame[indicator], Z).fit(disp=0)
            gamma = probitRslt.params

            # Adjust estimated cost-benefit shifter and intercept coefficients,
            # then arrange the starting values
            x0 = np.concatenate((beta[0], beta[1], gamma, sd_))
            check_start_values(x0)
        except (PerfectSeparationError, ValueError, UserError):
            msg = 'The estimation process was not able to provide automatic start ' \
                  'values due to perfect separation.\n' \
                  'The initialization specifications are used as start values ' \
                  'during the further process.'
            # Set coefficients equal to the true init file values
            x0 = init_dict['AUX']['init_values'][:-6]
            init_dict['ESTIMATION']['warning'] = msg
            option = 'init'

    x0 = start_value_adjustment(x0, init_dict, option)
    x0 = np.array(x0)

    return x0
def start_values(init_dict, data_frame, option):
    """The function selects the start values for the minimization process."""
    if not isinstance(init_dict, dict):
        raise AssertionError()
    numbers = [init_dict['AUX']['num_covars_out'], init_dict['AUX']['num_covars_cost']]

    if option == 'init':
        # Set coefficients equal to the true init file values
        x0 = init_dict['AUX']['init_values'][:2 * numbers[0] + numbers[1]]
        sd_ = None
    elif option == 'auto':
        try:
            # Estimate beta1 and beta0:
            beta = []
            sd_ = []
            for i in [0.0, 1.0]:
                Y = data_frame.Y[data_frame.D == i]
                X = data_frame.filter(regex=r'^X\_')[data_frame.D == i]
                ols_results = sm.OLS(Y, X).fit()
                beta += [ols_results.params]
                sd_ += [np.sqrt(ols_results.scale)]

            # Estimate gamma via probit
            X = data_frame.filter(regex=r'^X\_')
            Z = (data_frame.filter(regex=r'^Z\_')).drop('Z_0', axis=1)
            XZ = np.concatenate((X, Z), axis=1)
            probitRslt = sm.Probit(data_frame.D, XZ).fit(disp=0)
            gamma = probitRslt.params
            gamma_const = np.subtract(np.subtract(beta[1][0], beta[0][0]), gamma[0])
            if len(init_dict['COST']['all']) == 1:
                gamma = [gamma_const]
            else:
                gamma = np.concatenate(([gamma_const], gamma[-(numbers[1] - 1):]))

            # Arrange starting values
            x0 = np.concatenate((beta[1], beta[0]))
            x0 = np.concatenate((x0, gamma))
        except (PerfectSeparationError, ValueError):
            msg = 'The estimation process was not able to provide automatic start ' \
                  'values due to perfect separation.\n' \
                  'The initialization specifications are used as start values ' \
                  'during the further process.'
            # Set coefficients equal to the true init file values
            x0 = init_dict['AUX']['init_values'][:2 * numbers[0] + numbers[1]]
            sd_ = None
            init_dict['ESTIMATION']['warning'] = msg
            option = 'init'

    x0, start = provide_cholesky_decom(init_dict, x0, option, sd_)
    init_dict['AUX']['starting_values'] = x0[:]
    init_dict['AUX']['start_values'] = start
    x0 = np.array(x0)

    return x0
def berry_table_5(df: pd.DataFrame) -> str:
    y = df['active_next_period']
    x = df[['geo_mean_pop', 'distance', 'distance_squared', 'city2']]
    x = sm.add_constant(x)
    number_of_variables = x.shape[1]
    x_less_city2 = df[['geo_mean_pop', 'distance', 'distance_squared']]
    x_less_city2 = sm.add_constant(x_less_city2)

    # run probit with the full set of variables
    probit_mod = sm.Probit(y, x)
    probit_res = probit_mod.fit()

    # run probit without city2
    probit_mod = sm.Probit(y, x_less_city2)
    probit_res_less_city2 = probit_mod.fit()

    # generate a container for a table
    table = []
    for i in range(number_of_variables):
        if probit_res.params.index.values[i] == 'city2':
            table.append([
                '{}'.format(x.columns.values[i]),
                '{:.2f}\n({:.2f})'.format(probit_res.params.values[i],
                                          probit_res.bse.values[i]),
                '--\n--'
            ])
        else:
            table.append([
                '{}'.format(x.columns.values[i]),
                '{:.2f}\n({:.2f})'.format(probit_res.params.values[i],
                                          probit_res.bse.values[i]),
                '{:.2f}\n({:.2f})'.format(probit_res_less_city2.params.values[i],
                                          probit_res_less_city2.bse.values[i])
            ])

    # set header
    headers = ['Variable',
               '(1) Probit\nParameters\n(Std. Error)',
               '(2) Probit\nParameters\n(Std. Error)']
    return tabulate(table, headers, tablefmt="latex", numalign="right", floatfmt=".2f")
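# A hedged usage sketch for berry_table_5 on fabricated data, assuming pandas
# (pd), numpy (np), statsmodels.api (sm) and tabulate are imported as above.
# Column names match what the function expects; all values are arbitrary.
rng = np.random.default_rng(42)
n = 300
demo = pd.DataFrame({
    'geo_mean_pop': rng.normal(10, 2, n),
    'distance': rng.uniform(0, 5, n),
    'city2': rng.integers(0, 2, n).astype(float),
})
demo['distance_squared'] = demo['distance'] ** 2
demo['active_next_period'] = (
    0.2 * demo['geo_mean_pop'] - 0.5 * demo['distance'] + rng.normal(size=n) > 1.5
).astype(int)
print(berry_table_5(demo))  # LaTeX table comparing the two probit columns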
def start_values(init_dict, data_frame, option):
    """The function selects the start values for the minimization process."""
    if not isinstance(init_dict, dict):
        msg = ("The input object ({}) for specifying the start values is not a "
               "dictionary.".format(init_dict))
        raise UserError(msg)
    indicator = init_dict["ESTIMATION"]["indicator"]
    dep = init_dict["ESTIMATION"]["dependent"]

    if option == "init":
        # Set coefficients equal to the true init file values
        x0 = init_dict["AUX"]["init_values"][:-6]
    elif option == "auto":
        try:
            # Estimate beta1 and beta0:
            beta = []
            sd_ = []
            for i in [1.0, 0.0]:
                Y = data_frame[dep][data_frame[indicator] == i]
                if i == 1:
                    order = init_dict["TREATED"]["order"]
                else:
                    order = init_dict["UNTREATED"]["order"]
                X = data_frame[order][i == data_frame[indicator]]
                ols_results = sm.OLS(Y, X).fit()
                beta += [ols_results.params]
                sd_ += [np.sqrt(ols_results.scale), 0.0]

            # Estimate gamma via Probit
            Z = data_frame[init_dict["CHOICE"]["order"]]
            probitRslt = sm.Probit(data_frame[indicator], Z).fit(disp=0)
            gamma = probitRslt.params

            # Adjust estimated cost-benefit shifter and intercept coefficients,
            # then arrange the starting values
            x0 = np.concatenate((beta[0], beta[1], gamma, sd_))
            check_start_values(x0)
        except (PerfectSeparationError, ValueError, UserError):
            msg = ("The estimation process was not able to provide automatic "
                   "start values due to perfect separation.\n"
                   "The initialization specifications are used as start "
                   "values during the further process.")
            # Set coefficients equal to the true init file values
            x0 = init_dict["AUX"]["init_values"][:-6]
            init_dict["ESTIMATION"]["warning"] = msg
            option = "init"

    x0 = start_value_adjustment(x0, init_dict, option)
    x0 = np.array(x0)

    return x0
def het_test_probit(results):
    """
    Wald test for Probit
    --------------------
    H0: homoscedasticity
    HA: heteroscedasticity

    Parameters
    ----------
    results : Probit results instance

    Returns
    -------
    Wald test statistic
    p-value
    Degrees of freedom
        The number of restrictions, which equals the number of
        explanatory variables, excluding a constant term

    References
    ----------
    The test is based on
    (1) Wooldridge 2010, section 15.5.3
    (2) https://www.statalist.org/forums/forum/general-stata-discussion/general/1292180-test-for-heteroskedasticity-in-logit-probit-models
    """
    yhat = results.predict(linear=True)   # original fitted values
    exog_var = results.model.exog         # original exog
    exog_df = pd.DataFrame(exog_var)      # convert to DataFrame

    try:
        # drop a column of a constant if any
        tt = exog_df.nunique()
        idx_1 = list(tt).index(1.0)
        exog_df = exog_df.drop(idx_1, axis=1)
    except ValueError:
        pass

    num_para = exog_df.shape[1]           # number of non-constant parameters
    # X = np.exp(yhat).reshape(len(yhat), 1) * exog_df.values
    X = yhat.reshape(len(yhat), 1) * exog_df.values
    endog = results.model.endog
    exog = np.column_stack((results.model.exog, X))

    res_test = sm.Probit(endog, exog).fit(disp=False)
    A = np.identity(len(res_test.params))
    A = A[-num_para:, :]
    s = res_test.wald_test(A)

    return print('H0: homoscedasticity\nHA: heteroscedasticity\n',
                 '\nWald test:', "%#2.3f" % s.statistic[0][0],
                 '\np-value:', "%#7.3f" % s.pvalue,
                 '\ndf freedom:', "%#3.0f" % s.df_denom)
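# A minimal usage sketch for het_test_probit, assuming pandas (pd), numpy (np)
# and statsmodels.api (sm) are imported as the function requires. The spector
# dataset ships with statsmodels; numpy arrays are passed so that the
# reshape call inside the function works as written.
spector = sm.datasets.spector.load_pandas()
spector_exog = sm.add_constant(spector.exog).to_numpy()
spector_res = sm.Probit(spector.endog.to_numpy(), spector_exog).fit(disp=False)
het_test_probit(spector_res)  # prints the Wald statistic, p-value and df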
def estimate_treatment_propensity(dict_, data, logit, show_output=False):
    """
    This function estimates the propensity of selecting into treatment for
    both treated and untreated individuals based on instruments Z. Z subsumes
    all the observable components that influence the treatment decision, e.g.
    the decision to enroll in college (D = 1) or not (D = 0).

    Propensity scores are estimated via Logit (default) or Probit.

    Parameters
    ----------
    dict_: dict
        Estimation dictionary. Returned by grmpy.read(init_file).
    data: pandas.DataFrame
        Data set to perform the estimation on. Specified
        under dict_["ESTIMATION"]["file"].
    logit: bool
        Probability model for the choice equation.
        If True: logit, else: probit.
    show_output: bool
        If True, intermediate outputs of the estimation process are displayed.

    Returns
    -------
    data: pandas.DataFrame
        Input data augmented with the propensity score (range [0, 1]).
        Values closer to 1 denote a higher inclination to treatment.
    """
    D = data[dict_["ESTIMATION"]["indicator"]].values
    Z = data[dict_["CHOICE"]["order"]]

    if logit is True:
        logitRslt = sm.Logit(D, Z).fit(disp=0)
        prop_score = logitRslt.predict(Z)
        if show_output is True:
            print(logitRslt.summary())
    else:
        probitRslt = sm.Probit(D, Z).fit(disp=0)
        prop_score = probitRslt.predict(Z)
        if show_output is True:
            print(probitRslt.summary())

    data.loc[:, "prop_score"] = prop_score  # prop_score.values

    return data
def asGLM(X, y):
    '''
    Frequentist Probit model

    Inputs:
        - X: Feature matrix (DxN)
        - y: Observations (binary vector of length N)
    '''
    clf = sm.Probit(y, X.T)
    clf_ti = clf.fit()
    print('Coefficients: ')
    print(clf_ti.params)
    print('CI: ')
    print(clf_ti.conf_int())
    return clf_ti
def dump_probit_results(rel_matchups):
    break_q = 0.66
    filenames = {True: ExportedFiles.over_probit, False: ExportedFiles.under_probit}
    for gid, gdf in rel_matchups.groupby(
            rel_matchups[style_pair_vals[0]].pipe(lambda s: s > s.quantile(break_q))):
        html_str = (
            sm.Probit(gdf["win"], gdf[style_pair_vals[1:]].assign(const=1))
            .fit(cov_type="HC1")
            .summary()
            .tables[1]
            .as_html()
        )
        with open(filenames[gid], "w") as fp:
            fp.write(html_str)
def SPProbit(context):
    # Fetch the relevant data from the Context
    args = context.args
    # args.inputData holds the data sent by the previous node
    df = args.inputData
    featureColumns = args.featureColumns
    labelColumn = args.labelColumn
    features = df[featureColumns].values
    label = df[labelColumn].values

    probit_mod = sm.Probit(label, features, missing=args.missing)
    probit_res = probit_mod.fit(method=args.method)

    return probit_res
def compute(self, method='logistic'):
    """
    Compute propensity score and measures of goodness-of-fit

    Parameters
    ----------
    method : str
        Propensity score estimation method. Either 'logistic' or 'probit'
    """
    predictors = sm.add_constant(self.covariates, prepend=False)
    if method == 'logistic':
        model = sm.Logit(self.treatment, predictors).fit(disp=False,
                                                         warn_convergence=True)
    elif method == 'probit':
        model = sm.Probit(self.treatment, predictors).fit(disp=False,
                                                          warn_convergence=True)
    else:
        raise ValueError('Unrecognized method')
    return model.predict()
def plot_probit(model, trim_pct, probit_index_loc=0):
    '''Return plot of probit for the specification used in *model*, using the
    full dataset.'''
    data = pd.read_csv('./data/' + model_json['data'])
    exog_vars = ' + '.join(model.coeffs_final[probit_index_loc].index)
    Y, X = dmatrices(model_json['y_name'] + ' ~ ' + exog_vars, data)
    probit_result = sm.Probit(Y, X).fit()
    Xb = np.sort(probit_result.fittedvalues)
    p_hat = np.sort(probit_result.predict())

    # To make plots comparable, standardize Xb, then rescale to the mean and
    # variance of the index.
    μ_Xb = Xb.mean()
    σ_Xb = Xb.std()
    Xb_standardized = (Xb - μ_Xb) / σ_Xb
    index = model.index_final[probit_index_loc]
    μ_index = index.mean()
    σ_index = index.std()
    Xb_scaled = Xb_standardized * σ_index + μ_index

    # Align limits with ASF limits.
    xmin = np.percentile(model.index_final[probit_index_loc], trim_pct)
    xmax = np.percentile(model.index_final[probit_index_loc], 100 - trim_pct)

    fig = plt.figure()
    fig.add_subplot(1, 1, 1)
    plt.xlim(get_lim(xmin, xmax))
    plt.ylim((0, PRED_MAX))
    if probit_index_loc == 0:
        plt.xlabel('Rescaled probit index 1')
    elif probit_index_loc == 1:
        plt.xlabel('Rescaled probit index 2')
    plt.ylabel(labels["y"])
    plt.plot(Xb_scaled, p_hat)
    return fig
def WiStu_plot():
    y, X = dmatrices('Dec ~ WiStu*Markt + Student', df_ama, return_type='dataframe')
    probit = sm.Probit(y, X, missing='drop')
    res = probit.fit()
    # print(res.summary())

    # --------------------------------------
    def f_Markt(VI):
        return res.predict([1, VI, 1, VI, 1])

    def f_Baseline(VI):
        return res.predict([1, VI, 0, 0, 1])

    # --------------- Preparation ------------
    VI_list = list(range(0, 2))
    y_Markt = [None] * len(VI_list)
    y_Base = [None] * len(VI_list)
    for i in VI_list:
        y_Markt[i] = f_Markt(i)
    for i in VI_list:
        y_Base[i] = f_Baseline(i)

    # -------------- Estimated Likelihood ----------
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    plt.xticks(np.arange(0, 2, step=1))
    ax.plot(VI_list, y_Markt, color='tab:blue', linewidth=2)
    ax.plot(VI_list, y_Base, color='tab:orange', linewidth=2)
    ax.axhline(y=df_ama['Dec'].mean(), color='gray', linewidth=1, linestyle='--')
    ax.axvline(x=df_ama.WiStu.mean(), color='gray', linewidth=1, linestyle='--')
    # ax.set_title('Likelihood of unfair decision dependent on FMIS')
    ax.set_ylabel('Estimated Probability of Fair Decision')
    ax.set_xlabel('Economics Student')
    ax.legend(labels=['Market', 'Baseline'])
    # plt.savefig(sav_dir + '\WiStu_plot.png', bbox_inches="tight")
    plt.show()
def compute(self, method='logistic'):
    """
    Compute propensity score and measures of goodness-of-fit

    Parameters
    ----------
    method : str
        Propensity score estimation method. Either 'logistic' or 'probit'
    """
    predictions = None
    if method == 'logistic':
        # I've had a ton of issues with the default SM solver and with others
        # (including bfgs, which I thought was working but then started giving
        # me 0.5 for everything) - for example, singular matrix errors. I don't
        # need the things like p-values that SM gives over sklearn, but I do
        # want a more robust implementation, so I'm switching to sklearn here
        # at least.
        # There's some useful material at
        # https://stackoverflow.com/questions/24924755/logit-estimator-in-statsmodels-and-sklearn
        # A high C value means to hardly regularize at all - I'm not
        # standardizing the data, so I don't want to drop features incorrectly
        # because of different scales (some regularization is needed because
        # of how the solver works).
        lr = LogisticRegression(C=1e9, fit_intercept=True)
        lr.fit(self.covariates, self.treatment)
        self.model = lr
        # index 1 because we want the probability of a 1
        predictions = lr.predict_proba(self.covariates)[:, 1]
        # old
        # model = sm.Logit(self.treatment, predictors).fit_regularized(alpha=0.001, disp=False, warn_convergence=True)
        # model = sm.Logit(self.treatment, predictors).fit(method='bfgs', disp=False, warn_convergence=True)
        # model = sm.Logit(self.treatment, predictors).fit(disp=False, warn_convergence=True)
        # model = sm.Logit(self.treatment, predictors).fit(disp=True, warn_convergence=True, maxiter=500)
    elif method == 'probit':
        predictors = sm.add_constant(self.covariates, prepend=False)
        model = sm.Probit(self.treatment, predictors).fit(disp=False,
                                                          warn_convergence=True)
        self.model = model
        predictions = model.predict()
    else:
        raise ValueError('Unrecognized method')
    return predictions
def stats(predictor, response, model):
    ## Applies the statistical model you specify to the input variables; the
    ## code for each statistical model is given in the chain of if statements.
    predictor = np.asarray(predictor)
    response = np.asarray(response)
    # statsmodels expects the endogenous (response) variable first, followed
    # by the exogenous (predictor) variables
    if model == 'logit':
        model = sm.Logit(response, predictor)
    elif model == 'lsr':
        model = sm.OLS(response, predictor)
    elif model == "probit":
        model = sm.Probit(response, predictor)
    elif model == "gls":
        model = sm.GLS(response, predictor)
    elif model == "glsar":
        model = sm.GLSAR(response, predictor)
    elif model == "quantreg":
        model = sm.QuantReg(response, predictor)
    else:
        pass
    model = model.fit()
    print(model.summary())
def FMIS_plot():
    y, X = dmatrices('Dec ~ FMIS_Index*Markt', df_ama, return_type='dataframe')
    probit = sm.Probit(y, X, missing='drop')
    res = probit.fit()
    # print(res.summary())

    # ------------------------------------------------------------------
    def f_Markt(FMIS):
        return res.predict([1, FMIS, 1, FMIS])

    def f_Baseline(FMIS):
        return res.predict([1, FMIS, 0, 0])

    # ------------- Preparation --------------------------------
    FMIS_list = list(range(1, 12))
    y_Markt = [None] * len(FMIS_list)
    y_Base = [None] * len(FMIS_list)
    for i in FMIS_list:
        y_Markt[i - 1] = f_Markt(i)
    for i in FMIS_list:
        y_Base[i - 1] = f_Baseline(i)

    # ---------------- Estimated Likelihood ----------------------
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(FMIS_list, y_Markt, color='tab:blue', linewidth=2)
    ax.plot(FMIS_list, y_Base, color='tab:orange', linewidth=2)
    ax.axhline(y=df_ama['Dec'].mean(), color='gray', linewidth=1, linestyle='--')
    ax.axvline(x=df_ama.FMIS_Index.mean(), color='gray', linewidth=1, linestyle='--')
    # ax.set_title('Likelihood of unfair decision dependent on FMIS')
    ax.set_ylabel('Estimated Probability of Fair Decision')
    ax.set_xlabel('Fair Market Ideology Index')
    ax.legend(labels=['Market', 'Baseline'])
    # plt.savefig(sav_dir + '\FMIS_plot.png', bbox_inches="tight")
    plt.show()
def estimate_treatment_propensity(D, Z, logit, show_output):
    """
    This function estimates the propensity of selecting into treatment for
    both treated and untreated individuals based on instruments Z. Z subsumes
    all the observable components that influence the treatment decision, e.g.
    the decision to enroll in college (D = 1) or not (D = 0).

    Propensity scores are estimated via Logit (default) or Probit.
    """
    if logit is True:
        logitRslt = sm.Logit(D, Z).fit(disp=0)
        ps = logitRslt.predict(Z)
        if show_output is True:
            print(logitRslt.summary())
    else:
        probitRslt = sm.Probit(D, Z).fit(disp=0)
        ps = probitRslt.predict(Z)
        if show_output is True:
            print(probitRslt.summary())

    return ps.values
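# A hedged usage sketch for estimate_treatment_propensity on synthetic data,
# assuming numpy (np), pandas (pd) and statsmodels.api (sm) imports as above.
# The column names and coefficients are made up for illustration.
rng = np.random.default_rng(1)
Z = pd.DataFrame({'const': 1.0, 'z1': rng.normal(size=500)})
D = pd.Series((0.8 * Z['z1'] + rng.normal(size=500) > 0).astype(float))
ps = estimate_treatment_propensity(D, Z, logit=False, show_output=False)
print(ps.min(), ps.max())  # propensity scores lie in [0, 1]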
def _fit_twostep(self):
    ########################################################################
    # PRIVATE METHOD
    # Fits using the Heckman two-step procedure from Heckman (1979).
    ########################################################################

    ## prep data
    Y, X, Z = self.get_datamats()

    ## Step 1: probit for the selection equation
    step1model = sm.Probit(self.treated, Z)
    step1res = step1model.fit(disp=False)
    step1_fitted = np.atleast_2d(step1res.fittedvalues).T
    step1_varcov = step1res.cov_params()
    inverse_mills = norm.pdf(step1_fitted) / norm.cdf(step1_fitted)

    ## Step 2: OLS on the selected sample, augmented with the inverse Mills ratio
    W = np.hstack((X, inverse_mills[self.treated]))
    step2model = sm.OLS(Y, W)
    step2res = step2model.fit()
    params = step2res.params[:-1]
    betaHat_inverse_mills = step2res.params[-1]

    ## Compute standard errors
    # Compute the estimated error variance of the censored regression
    delta = np.multiply(inverse_mills, inverse_mills + step1_fitted)[self.treated]
    sigma2Hat = step2res.resid.dot(step2res.resid) / self.nobs_uncensored + \
        (betaHat_inverse_mills**2 * sum(delta)) / self.nobs_uncensored
    sigma2Hat = sigma2Hat[0]
    sigmaHat = np.sqrt(sigma2Hat)
    rhoHat = betaHat_inverse_mills / sigmaHat

    # Compute standard errors of the beta estimates of the censored regression
    delta_1d = delta.T[0]
    Q = rhoHat**2 * ((W.T * delta_1d).dot(Z[self.treated])).dot(step1_varcov).dot(
        (Z[self.treated].T * delta_1d).dot(W))
    WT_W_inv = np.linalg.inv(W.T.dot(W))
    WT_R = W.T * (1 - rhoHat**2 * delta_1d)
    normalized_varcov_all = WT_W_inv.dot(WT_R.dot(W) + Q).dot(WT_W_inv)
    del WT_W_inv
    del WT_R
    del delta_1d

    normalized_varcov = normalized_varcov_all[:-1, :-1]
    varcov_all = sigma2Hat * normalized_varcov_all
    varcov = varcov_all[:-1, :-1]
    stderr_all = np.sqrt(np.diag(varcov_all))
    stderr = stderr_all[:-1]
    stderr_betaHat_inverse_mills = stderr_all[-1]

    ## store results
    results = HeckmanResults(self, params, normalized_varcov, sigma2Hat,
                             select_res=step1res,
                             params_inverse_mills=betaHat_inverse_mills,
                             stderr_inverse_mills=stderr_betaHat_inverse_mills,
                             var_reg_error=sigma2Hat,
                             corr_eqnerrors=rhoHat,
                             method='twostep')

    return results
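# A condensed, self-contained sketch of the two-step idea above on synthetic
# data (all names here are illustrative, not part of the class, and the
# standard-error machinery is omitted). Step 1 fits a probit for selection;
# step 2 adds the inverse Mills ratio to the outcome OLS.
import numpy as np
import statsmodels.api as sm
from scipy.stats import norm

rng = np.random.default_rng(0)
n = 1000
Z = sm.add_constant(rng.normal(size=(n, 1)))      # selection covariates
X = sm.add_constant(rng.normal(size=(n, 1)))      # outcome covariates
u, e = rng.multivariate_normal([0, 0], [[1, .5], [.5, 1]], size=n).T
selected = (Z @ np.array([0.5, 1.0]) + u) > 0     # selection equation
y = X @ np.array([1.0, 2.0]) + e                  # outcome equation

step1 = sm.Probit(selected.astype(float), Z).fit(disp=False)
xb = step1.fittedvalues                           # linear index Z*gamma
imr = norm.pdf(xb) / norm.cdf(xb)                 # inverse Mills ratio
W = np.column_stack((X[selected], imr[selected]))
step2 = sm.OLS(y[selected], W).fit()
print(step2.params)  # outcome betas plus the Mills-ratio coefficient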
# MyProbit subclasses GenericLikelihoodModel; only the log-likelihood needs
# to be supplied.
class MyProbit(GenericLikelihoodModel):
    def loglike(self, params):
        exog = self.exog
        endog = self.endog
        q = 2 * endog - 1
        return stats.norm.logcdf(q * np.dot(exog, params)).sum()


# Estimate the model and print a summary:
sm_probit_manual = MyProbit(endog, exog).fit()
print(sm_probit_manual.summary())

# Compare your Probit implementation to ``statsmodels``' "canned"
# implementation:
sm_probit_canned = sm.Probit(endog, exog).fit()

print(sm_probit_canned.params)
print(sm_probit_manual.params)

print(sm_probit_canned.cov_params())
print(sm_probit_manual.cov_params())

# Notice that the ``GenericLikelihoodModel`` class provides numerical
# differentiation, so we didn't have to provide Hessian or Score functions
# in order to calculate the covariance estimates.


# ## Example 2: Negative Binomial Regression for Count Data
def result_statsmodels_probit():
    endog, exog = generate_test_data()
    result = sm.Probit(endog, exog).fit()
    return result
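# A hedged sketch of a pytest-style test that could consume the fixture above.
# The fixture registration (@pytest.fixture) and generate_test_data are assumed
# to live elsewhere in the test module; the assertions are illustrative only.
def test_probit_result_is_sane(result_statsmodels_probit):
    import numpy as np
    result = result_statsmodels_probit
    assert np.all(np.isfinite(result.params))       # estimates converged
    predicted = result.predict()
    assert np.all((predicted >= 0) & (predicted <= 1))  # probabilities in [0, 1]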
logit_res = logit_mod.fit(disp=0)
print("Parameters: ", logit_res.params)

# Marginal Effects
margeff = logit_res.get_margeff()
print(margeff.summary())

# As in all the discrete data models presented below, we can print a nice
# summary of results:
print(logit_res.summary())

# ## Probit Model
probit_mod = sm.Probit(spector_data.endog, spector_data.exog)
probit_res = probit_mod.fit()
probit_margeff = probit_res.get_margeff()
print("Parameters: ", probit_res.params)
print("Marginal effects: ")
print(probit_margeff.summary())

# ## Multinomial Logit

# Load data from the American National Election Studies:
anes_data = sm.datasets.anes96.load()
anes_exog = anes_data.exog
anes_exog = sm.add_constant(anes_exog)

# Inspect the data:
from __future__ import print_function
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel

data = sm.datasets.spector.load()
data.exog = sm.add_constant(data.exog, prepend=False)

# in this dir
probit_mod = sm.Probit(data.endog, data.exog)
probit_res = probit_mod.fit()
loglike = probit_mod.loglike
score = probit_mod.score
mod = GenericLikelihoodModel(data.endog, data.exog * 2, loglike, score)
res = mod.fit(method="nm", maxiter=500)


def probitloglike(params, endog, exog):
    """
    Log likelihood for the probit
    """
    q = 2 * endog - 1
    X = exog
    return np.add.reduce(stats.norm.logcdf(q * np.dot(X, params)))


mod = GenericLikelihoodModel(data.endog, data.exog, loglike=probitloglike)
res = mod.fit(method="nm", fargs=(data.endog, data.exog), maxiter=500)
print(res)