def reduce_multi_model(orig_fitted, base_string, res, df, fit=None):
    """orig_fitted = an object returned from calling .fit() on a statsmodels logit model
    base_string = the right hand side of the formula used to estimate orig_fitted
    res = the string for the column name in df that has the classes
    df = the pandas dataframe from which orig_fitted was estimated
    ==========
    Returns a fitted logistic regression model, and the base string used to
    estimate the model. If at least one variable has a p-value > 0.05, this
    function will remove the variable with the worst p-value, estimate a new
    logistic regression, and repeat the process until no more insignificant
    variables can be removed."""
    #Check the class of the function inputs
    assert isinstance(base_string, str)
    assert isinstance(res, str)
    assert isinstance(df, pd.DataFrame)
    #Try to reduce the number of variables in the original model
    new_bvars = whittle_multi_model_vars(orig_fitted, base_string)
    #Initialize a variable for the smallest model
    small_model = orig_fitted
    #Initialize a variable for the smallest model base_string
    small_base = base_string
    node_variables = isolate_node_cols(df)
    while new_bvars is not None: #If a reduced set of variables has been found
        model_vars = combat_multi_collinearity(df, new_bvars, node_variables, max_cond=2000)
        new_base = " + ".join(model_vars) #Create a string of all variables used in the multivariate regression
        new_fstring = res + " ~ " + "0 + " + new_base #Create the new formula string
        try: #Try to fit a new logistic regression model
            #Use the if...else statement to accommodate various optimization methods
            if fit is None:
                new_model = smf.logit(new_fstring, data=df).fit(maxiter=2000, disp=False)
            else:
                new_model = smf.logit(new_fstring, data=df).fit(method=fit, maxiter=2000, disp=False)
            #Assign small_base to the smallest identified set of base variables so far
            small_base = " + ".join(new_bvars)
            #Assign small_model to the model with the smallest set of base variables so far
            small_model = new_model
            #Search for new base variables
            new_bvars = whittle_multi_model_vars(new_model, new_base)
        except Exception:
            #If the model could not be fit, stop searching;
            #without this, the loop would never terminate
            new_bvars = None
    return small_model, small_base
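# Hedged usage sketch for reduce_multi_model: it assumes the module's helper
# functions (whittle_multi_model_vars, combat_multi_collinearity,
# isolate_node_cols) are available, and all column names below are hypothetical.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.normal(size=500),
                   "x2": rng.normal(size=500),
                   "x3": rng.normal(size=500)})
df["choice"] = (0.9 * df["x1"] + rng.logistic(size=500) > 0).astype(int)

base_string = "0 + x1 + x2 + x3"
full_model = smf.logit("choice ~ " + base_string, data=df).fit(maxiter=2000, disp=False)
small_model, small_base = reduce_multi_model(full_model, base_string, "choice", df)
print(small_base)             # surviving right-hand-side variables
print(small_model.summary())  # reduced-model results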
def RunLogisticModels(live): """Runs regressions that predict sex. live: DataFrame of pregnancy records """ #live = linear.ResampleRowsWeighted(live) df = live[live.prglngth>30] # df = JoinFemResp(df) df['boy'] = (df.babysex==1).astype(int) df['isyoung'] = (df.agepreg<20).astype(int) df['isold'] = (df.agepreg<35).astype(int) df['season'] = (((df.datend+1) % 12) / 3).astype(int) # run the simple model model = smf.logit('boy ~ agepreg', data=df) results = model.fit() print('nobs', results.nobs) print(type(results)) SummarizeResults(results) # run the complex model model = smf.logit('boy ~ agepreg + hpagelb + birthord + C(race)', data=df) results = model.fit() print('nobs', results.nobs) print(type(results)) SummarizeResults(results) # make the scatter plot exog = pandas.DataFrame(model.exog, columns=model.exog_names) endog = pandas.DataFrame(model.endog, columns=[model.endog_names]) xs = exog['agepreg'] lo = results.fittedvalues o = np.exp(lo) p = o / (o+1) #thinkplot.Scatter(xs, p, alpha=0.1) #thinkplot.Show() # compute accuracy actual = endog['boy'] baseline = actual.mean() predict = (results.predict() >= 0.5) true_pos = predict * actual true_neg = (1 - predict) * (1 - actual) acc = (sum(true_pos) + sum(true_neg)) / len(actual) print(acc, baseline) columns = ['agepreg', 'hpagelb', 'birthord', 'race'] new = pandas.DataFrame([[35, 39, 3, 1]], columns=columns) y = results.predict(new) print(y)
def calculate_odds_ratio(genotypes, phen_vector1, phen_vector2, reg_type, covariates, response='', phen_vector3=''):
    """
    Runs the regression for a specific phenotype vector relative to the genotype data and covariates.

    :param genotypes: a DataFrame containing the genotype information
    :param phen_vector1: an array containing the phenotype vector
    :param phen_vector2: an array containing the secondary phenotype vector (MaxAgeAtCPT)
    :param covariates: a string containing all desired covariates
    :type genotypes: pandas DataFrame
    :type phen_vector1: numpy array
    :type phen_vector2: numpy array
    :type covariates: string

    .. note::
        The covariates must be a string that is delimited by '+', not a list.
        If you are using a list of covariates and would like to convert it to the pyPhewas format, use the following::

            l = ['genotype', 'age'] # a list of your covariates
            covariates = '+'.join(l) # pyPhewas format

        The covariates that are listed here *must* be headers to your genotype CSV file.
    """
    data = genotypes
    data['y'] = phen_vector1
    data['MaxAgeAtCPT'] = phen_vector2
    # guard so the default empty string does not break the .any() check
    has_phe3 = not isinstance(phen_vector3, str) and phen_vector3.any()
    if response:
        f = response + ' ~ y + genotype + ' + covariates
        if has_phe3:
            data['phe'] = phen_vector3
            f = response + ' ~ y + phe + genotype + ' + covariates
    else:
        f = 'genotype ~ y + ' + covariates
        if has_phe3:
            data['phe'] = phen_vector3
            f = 'genotype ~ y + phe + ' + covariates
    try:
        if reg_type == 0:
            logreg = smf.logit(f, data).fit(method='bfgs', disp=False)
            p = logreg.pvalues.y
            odds = logreg.params.y
            conf = logreg.conf_int()
            od = [-math.log10(p), logreg.params.y, '[%s,%s]' % (conf[0]['y'], conf[1]['y'])]
        else:
            # NOTE: this branch currently mirrors the logit fit above; judging
            # by the name `linreg`, a linear model may have been intended here
            linreg = smf.logit(f, data).fit(method='bfgs', disp=False)
            p = linreg.pvalues.y
            odds = linreg.params.y
            conf = linreg.conf_int()
            od = [-math.log10(p), linreg.params.y, '[%s,%s]' % (conf[0]['y'], conf[1]['y'])]
    except Exception:
        odds = 0
        p = np.nan
        od = [np.nan, np.nan, np.nan]
    return (odds, p, od)
def LogisticRegressionExample(): """Runs a simple example of logistic regression and prints results. """ y = np.array([0, 1, 0, 1]) x1 = np.array([0, 0, 0, 1]) x2 = np.array([0, 1, 1, 1]) beta = [-1.5, 2.8, 1.1] log_o = beta[0] + beta[1] * x1 + beta[2] * x2 print(log_o) o = np.exp(log_o) print(o) p = o / (o+1) print(p) like = y * p + (1-y) * (1-p) print(like) print(np.prod(like)) df = pandas.DataFrame(dict(y=y, x1=x1, x2=x2)) results = smf.logit('y ~ x1 + x2', data=df).fit() print(results.summary())
def logistic_regression(self, use_glm=True):
    """
    (b) It seems the only statistically significant predictor is Lag2.
    How disappointing...
    """
    formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
    model = (
        smf.glm(formula, data=self.df, family=sm.families.Binomial())
        if use_glm
        else smf.logit(formula, data=self.transformedDF)
    )
    result = model.fit()
    if use_glm:
        probs = result.fittedvalues
        """Beware: the prob here is the probability of the level coded 0,
        so we flip it with the lambda function below"""
        pred_values = probs.map(lambda x: 0 if x > 0.5 else 1)
    else:
        """The probability of being 1"""
        probs = Series(result.predict(sm.add_constant(
            self.df[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]])))
        pred_values = probs.map(lambda x: 1 if x > 0.5 else 0)
    """
    (c) Percentage of correct predictions: (54+557)/(54+557+48+430) = 56.1%.
    In weeks when the market goes up, the logistic regression is right most of
    the time: 557/(557+48) = 92.1%. In weeks when the market goes down, it is
    right only 54/(430+54) = 11.2% of the time.
    """
    tp.output_table(pred_values.values, self.transformedDF[self.y_col].values)
def logistic_model(data, explanatory_variables, response_variable, maxiter=35, verbose=True):
    explanatory_vars = ' + '.join(explanatory_variables)
    formula = response_variable + ' ~ ' + explanatory_vars
    try:
        model = smf.logit(formula=formula, data=data).fit(maxiter=maxiter)
    except Exception:
        print('Error "' + str(sys.exc_info()[1]) + '" while processing model', formula)
        model = None
    if verbose and model is not None:
        print()
        print('MODEL:', formula, '\n')
        print(model.summary())
        print()
        # odds ratios with 95% confidence intervals
        print("Odds Ratios")
        params = model.params
        conf = model.conf_int()
        conf['OR'] = params
        conf.columns = ['Lower CI', 'Upper CI', 'Odds Ratios']
        print(numpy.exp(conf))
    return model
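# Minimal usage sketch for logistic_model on synthetic data; the column names
# here are illustrative, not from the original source.
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
d = pd.DataFrame({"x1": rng.normal(size=300), "x2": rng.normal(size=300)})
d["outcome"] = (0.8 * d["x1"] - 0.5 * d["x2"] + rng.logistic(size=300) > 0).astype(int)

m = logistic_model(d, ["x1", "x2"], "outcome")  # prints the summary and odds ratios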
def _corr(self, sel, suffix):
    # assumes matplotlib.pyplot is imported as plt at module level
    formula = 'model_accuracy ~ human_accuracy'
    logreg = smf.logit(formula=formula, data=sel).fit()
    summ = logreg.summary()
    if self.html is None:
        print(summ)
    else:
        summ = summ.as_html().replace('class="simpletable"',
                                      'class="simpletable table"')
    sel = sel.rename(columns={'human_accuracy': 'human accuracy',
                              'model_accuracy': 'model accuracy'})
    sns.lmplot('human accuracy', 'model accuracy', data=sel, x_jitter=.01,
               y_jitter=.05, logistic=True, truncate=True)
    bins = np.digitize(sel['human accuracy'], np.arange(.05, 1, .1))
    #bins[bins==11] = 10
    count = sel['model accuracy'].groupby(bins).count()
    mean = sel['model accuracy'].groupby(bins).mean()
    # sns.plt was removed from seaborn; use matplotlib's pyplot directly
    plt.scatter(.1 * mean.index, mean, s=10 * count, c='.15',
                linewidths=0, alpha=.8)
    plt.title(models.NICE_NAMES[self.model_name])
    plt.xlim([-.1, 1.1])
    plt.ylim([-.1, 1.1])
    self.show(pref='corr_sil', suffix=self.model_name + '_' + suffix,
              caption=suffix + summ)
def run_logits(grouped, formula, var): for code, group in grouped: country = get_country(code).ljust(14) model = smf.logit(formula, data=group) results = model.fit(disp=False) nobs, param, stars = extract_res(results, var=var) arrow = '<--' if stars and param > 0 else '' print(country, nobs, '%0.3g'%param, stars, arrow, sep='\t')
def log_reg(formula, df):
    try:
        model1 = smf.logit(formula=formula, data=df).fit()
        print(model1.summary())
    except Exception:
        print("+" * 40)
        print("bad formula")
        print("+" * 40)
def logistic_regression_test():
    # read_csv with index_col=0 replaces the removed DataFrame.from_csv
    df = pandas.read_csv('./generated_logistic_data.csv', index_col=0)
    generated_model = smf.logit('y ~ variable_a + variable_b + variable_c', df)
    generated_fit = generated_model.fit()
    roc_data = sklearn.metrics.roc_curve(df['y'], generated_fit.predict(df))
    auc = sklearn.metrics.auc(roc_data[0], roc_data[1])
    print(generated_fit.summary())
    print("AUC score: {0}".format(auc))
    assert auc > .8, 'AUC should be significantly above random'
def fit_model(formula, model_file): """ Saves a model :param formula: formula for the model :param model_file: name of file to save the model to """ data = load_data() model = logit(formula=formula, data=data) fitted = model.fit() fitted.save(model_file)
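# The saved pickle can be read back with statsmodels' top-level load helper;
# a minimal sketch, assuming "model.pkl" (a hypothetical name) was written by
# fit_model above.
import statsmodels.api as sm

fitted = sm.load("model.pkl")  # hypothetical file name
print(fitted.summary())
print(fitted.params)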
def generate_model(df): ''' Create a logistic regression model from loans data based on fields FICO.score, Interest.Rate, and Interest.below12 :param df: a dataframe with fields for the independent vars fico and interest and the dependent var discrete_rate :return: a fitted logistic model ''' model = smf.logit(formula='discrete_rate ~ fico + interest', data=df) fitted_model = model.fit() return fitted_model
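# Hedged usage sketch for generate_model; the CSV path is hypothetical and the
# frame must already contain the discrete_rate, fico, and interest columns
# named in the docstring.
import pandas as pd

loans = pd.read_csv("loans.csv")  # hypothetical path
fitted = generate_model(loans)
print(fitted.summary())
# predicted probability for one hypothetical applicant
print(fitted.predict(pd.DataFrame({"fico": [720], "interest": [10.0]})))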
def logRegR(self, event): # would have to mess with Patsy formula parser to get more powerful... # too much work dlg = wx.TextEntryDialog(self.parent, "Enter the linear regression formula") if dlg.ShowModal() == wx.ID_OK: model = smf.logit(formula=dlg.GetValue(), data=self.parent.data.data) results = model.fit() self.parent.write("\n" + str(results.summary()) + "\n") sns.regplot(results.predict(), model.endog, ci=False, y_jitter=0.2) plt.show() dlg.Destroy()
def check_initial_specification(dataframe, result_string, new_var, min_specification, fit_word=None): assert isinstance(dataframe, pd.DataFrame) #Make sure dataframe is a pandas dataframe. assert isinstance(result_string, str) #Make sure the result_string is actually a string assert isinstance(new_var, list) #Make sure new_var is a list assert isinstance(min_specification, str) #Make sure the min_specification is a string base_vars = min_specification.split(" + ") #Extract the variables used in the minimum specification if "0" in base_vars: #Remove any zeros from the variables used in the minimum specification base_vars.remove("0") #Initialize starting values for the optimization start_vals = np.random.rand(len(base_vars + new_var)) #Create the formula string for the logistic regression fString = result_string + " ~ " + min_specification + " + " + " + ".join(new_var) #Make sure the matrix for the logistic regression is invertible if not check_full_rank(dataframe, base_vars + new_var): #If not, raise an error raise Exception("The base model plus {} is not of full rank.".format(new_var)) #Fit the logistic regression if fit_word is None: model = smf.logit(fString, data=dataframe).fit(start_params = start_vals, maxiter=2000, disp=False) else: model = smf.logit(fString, data=dataframe).fit(method=fit_word, start_params = start_vals, maxiter=2000, disp=False) if not model.mle_retvals["converged"]: #Check if the model converged #If it did not, raise an error raise Exception("The model for {} did not converge".format(new_var)) lowest_pval = model.pvalues[new_var[0]] #Initialize a value for the lowest p-value for orig_var in new_var: #Iterate through the new variables current_pval = model.pvalues[orig_var] #If the current variables p-value is less than the lowest p-value if current_pval < lowest_pval: #Keep track of this number lowest_pval = current_pval return lowest_pval
def fit_model(y, formula, df): from statsmodels.formula.api import ols, logit # If you have a dichotomous variable then # we're going to run a logistic regression if df[y].nunique() == 2: lm = logit(formula, df).fit() # otherwise we'll run an ordinary least # squares regression else: lm = ols(formula, df).fit() return lm
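# Sketch showing how this fit_model dispatches on the outcome's cardinality;
# synthetic data with illustrative names only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({"x": rng.normal(size=200)})
demo["y_binary"] = (demo["x"] + rng.normal(size=200) > 0).astype(int)
demo["y_cont"] = 2 * demo["x"] + rng.normal(size=200)

lm_logit = fit_model("y_binary", "y_binary ~ x", demo)  # two unique values -> logit
lm_ols = fit_model("y_cont", "y_cont ~ x", demo)        # continuous -> ols
print(type(lm_logit).__name__, type(lm_ols).__name__)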
def log_regression(wine_set): # # examining the data before recoding # print(wine_set["sulphates"].describe()) # wine_set["sulphates_c"] = pd.qcut(wine_set["sulphates"], 4) # print(wine_set.groupby("sulphates_c").size()) # print() # # # print(wine_set["alcohol"].describe()) # wine_set["alcohol_c"] = pd.qcut(wine_set["alcohol"], 4) # print(wine_set.groupby("alcohol_c").size()) # print() # # print(wine_set["quality"].describe()) # wine_set["quality_c"] = pd.qcut(wine_set["quality"], 3) # print(wine_set.groupby("quality_c").size()) # print() # recode quality into 2 groups: 0:{3,4,5,6}, 1:{7,8,9} recode = {3: 0, 4: 0, 5:0, 6:0, 7:1, 8:1, 9:1} wine_set['quality_c'] = wine_set['quality'].map(recode) # recode sulphates into 2 groups: 0: <= mean, 1: > mean def sulphates_to_cat(x): if x['sulphates'] <= wine_set['sulphates'].mean(): return 0 else: return 1 wine_set['sulphates_c'] = wine_set.apply(lambda x: sulphates_to_cat(x), axis=1) # recode alcohol into 2 groups: 0: <= mean , 1: > mean def alcohol_to_cat(x): if x['alcohol'] <= wine_set['alcohol'].mean(): return 0 else: return 1 wine_set['alcohol_c'] = wine_set.apply(lambda x: alcohol_to_cat(x), axis=1) # print(wine_set.head(10)) # logistic regression for sulphates+alcohol -> quality print ("Logistic regression model for the association between wine's quality and sulphates&alcohol") model1 = smf.logit(formula="quality_c ~ sulphates_c + alcohol_c", data=wine_set) results1 = model1.fit() print(results1.summary()) # odds ratios with 95% confidence intervals print("\nConfidence intervals") conf = results1.conf_int() conf['Odds ratio'] = results1.params conf.columns = ['Lower conf.int.', 'Upper conf.int.', 'Odds ratio'] print(numpy.exp(conf))
def report_logitreg(formula, data, verbose=True): """Fit logistic regression, print a report, and return the fit object.""" results = smf.logit(formula, data=data).fit() summary = results.summary() margeff = results.get_margeff().summary() if verbose: report = """ {summary}\n\n {margeff}\n""".format(summary=summary,margeff=margeff) print(report) return results
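# Minimal sketch exercising report_logitreg on synthetic data (the names are
# illustrative); verbose=False suppresses the printed report.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
demo = pd.DataFrame({"x": rng.normal(size=500)})
demo["y"] = (1.5 * demo["x"] + rng.logistic(size=500) > 0).astype(int)

fit = report_logitreg("y ~ x", demo)                    # prints summary + marginal effects
quiet = report_logitreg("y ~ x", demo, verbose=False)   # returns the fit silently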
def test_log_regression():
    """Tests the results of logistic regression. Explore on the beta coefficient
    """
    run = load_in_dataframe(2)
    run_added = add_gainlossratio(run)
    run_final = organize_columns(run_added)
    #fit the logistic regression line
    fitted = logit("respcat ~ gain + loss", run_final).fit()
    #get the parameters (.values replaces the deprecated .as_matrix())
    fitted_params = fitted.params.values
    test_fitted_params = log_regression(run_final).values
    assert_array_equal(fitted_params, test_fitted_params)
def logistic_fit(self, glm_fit=True):
    '''
    The logit function reports an error when y (Direction) is not transformed
    to 0/1, so glm looks easier to use.
    '''
    formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
    if glm_fit is True:
        model = smf.glm(formula, data=self.df, family=sm.families.Binomial())
    else:
        # In fact, this function has wrong fittedvalues, but its predicted values are still right.
        model = smf.logit(formula, data=self.df)
    result = model.fit()
    print(result.summary())
    # In logit fit there are errors here. Not sure why...
    if glm_fit:
        self.output_binary_table(result, result.fittedvalues, model.endog.astype(int), glm_fit)
def test_compare_logit(self): vs = Independence() family = Binomial() Y = 1 * (np.random.normal(size=100) < 0) X1 = np.random.normal(size=100) X2 = np.random.normal(size=100) X3 = np.random.normal(size=100) groups = np.random.randint(0, 4, size=100) D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3}) md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None, groups=groups, family=family, covstruct=vs).fit() sml = sm.logit("Y ~ X1 + X2 + X3", data=D).fit() assert_almost_equal(sml.params.values, md.params, decimal=10)
def test_compare_logit(self): vs = Independence() family = Binomial() Y = 1*(np.random.normal(size=100) < 0) X1 = np.random.normal(size=100) X2 = np.random.normal(size=100) X3 = np.random.normal(size=100) groups = np.random.randint(0, 4, size=100) D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3}) mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D, family=family, cov_struct=vs) rslt1 = mod1.fit() mod2 = sm.logit("Y ~ X1 + X2 + X3", data=D) rslt2 = mod2.fit() assert_almost_equal(rslt1.params, rslt2.params, decimal=10)
def log_regression(run_final): """Do logistic regression on train cols to predict the subject's decision Parameters ---------- run : data.frame behavioral data frame with organized columns Returns ------- logit_pars : logistic regression result summary the logistic regression result summary """ # do logistic regression x = logit("respcat ~ gain + loss", run_final).fit() # check the summary print(x.summary()) #store the parameters of logistic regression logit_pars = x.params return logit_pars
else:
    auc_value = auc(-final_sample.ix[:, i], final_sample.default_flag)
gini_coeff = 2 * auc_value - 1
print("The Gini Coefficient for transformed factor %s is %s" % (i, gini_coeff))

regf1 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + yrs_in_bus_tran + debt_to_ebitda_rto_tran + debt_to_tnw_rto_tran + cur_rto_tran'
regf2 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + yrs_in_bus_tran + debt_to_ebitda_rto_tran + debt_to_tnw_rto_tran + net_margin_rto_tran'
regf3 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + debt_to_ebitda_rto_tran + debt_to_tnw_rto_tran + cur_rto_tran + net_margin_rto_tran'
#regf4 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + yrs_in_bus_tran + debt_to_ebitda_rto_tran + net_margin_rto_tran + cur_rto_tran' # need some transformation
regf4 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + yrs_in_bus_tran_bin + debt_to_ebitda_rto_tran + net_margin_rto_tran + cur_rto_tran' # use woe transform for yrs_in_bus
regf5 = 'default_flag ~ dsc_tran + C(_tot_sales2) + yrs_in_bus_tran_bin + debt_to_ebitda_rto_tran + net_margin_rto_tran + cur_rto_tran' # use woe transform for yrs_in_bus

# final sample excludes 2014 data
regm1 = smf.logit(formula=str(regf1), data=final_sample).fit()
auc1 = auc(regm1.predict(), final_sample.default_flag)
auc_preddata = auc(regm1.predict(f121314), f121314.default_flag)
print("The AUC for current model m1 is: %s, and AUC for OOT data is %s" % (auc1, auc_preddata))

regm2 = smf.logit(formula=str(regf2), data=final_sample).fit()
auc2 = auc(regm2.predict(), final_sample.default_flag)
auc_preddata = auc(regm2.predict(f121314), f121314.default_flag)
print("The AUC for current model m2 is: %s, and AUC for OOT data is %s" % (auc2, auc_preddata))

regm3 = smf.logit(formula=str(regf3), data=final_sample).fit()
auc3 = auc(regm3.predict(), final_sample.default_flag)
auc_preddata = auc(regm3.predict(f121314), f121314.default_flag)
print("The AUC for current model m3 is: %s, and AUC for OOT data is %s" % (auc3, auc_preddata))
from sklearn.linear_model import LogisticRegression train_data.columns X = train_data.loc[: , 'Age': 'Married'] y = train_data.loc[: , 'Defaulter_Flag'] model = LogisticRegression() model = model.fit(X,y) import statsmodels.formula.api as smf train_data.columns logitfit = smf.logit(formula = 'Defaulter_Flag ~ Age + YOE + Gender + Married', data = train_data).fit() logitfit.summary() # summary of the model logitfit.predict() # predict logitfit.pred_table() # confusion matrix threshold = 0.5 predicted = logitfit.predict(test_data.loc[: , 'Age': 'Married']) predicted_choice = (predicted > threshold).astype(int) Confusion_matrix = pd.crosstab(test_data.Defaulter_Flag, predicted_choice, rownames=['Defaulter_Flag_count'], colnames=["Predicted_count"]) Confusion_matrix Accuracy = (Confusion_matrix[0][0] + Confusion_matrix[1][1])/ (Confusion_matrix[0][0] + Confusion_matrix[1][1] + Confusion_matrix[0][1] + Confusion_matrix[1][0])
""" LOGISTIC_BANK """ import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns bank = pd.read_csv( "C:\\EXCELR\\NOTES WRITTEN\\SOLVING_ASSIGNMENTS\\Logistic Regression\\solution\\bank_data.csv" ) bank.head(5) bank.shape bank.isnull().sum() import statsmodels.formula.api as smf logit_model = smf.logit('ATTORNEY~CLMAGE+LOSS+CLMINSUR+CLMSEX+SEATBELT', data=claimants).fit() logit_model.summary() corrmat = bank.corr() top_corr_features = corrmat.index plt.figure(figsize=(10, 10)) #plot heat map g = sns.heatmap(bank[top_corr_features].corr(), annot=True, cmap="RdYlGn") bank["pdays"] = 1 bank.pdays ################################################################################################# import pandas as pd import numpy as np from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2
# recode the primary vote to a binary variable (Obama vs. Romney)
def obamaORomney3(x):
    if x == 1:
        return 1
    else:
        return 0

result2_sorted['prevote_primvwho'] = result2_sorted["prevote_primvwho"].apply(
    lambda x: obamaORomney3(x))

# logistic regression with economic opinion and income group
lreg1 = smf.logit(
    formula='prevote_primvwho ~ econ_opnion_c + incgroup_prepost_c',
    data=result2_sorted).fit()
print(lreg1.summary())

# odds ratios
print("Odds Ratios")
print(numpy.exp(lreg1.params))

# odd ratios with 95% confidence intervals
params = lreg1.params
conf = lreg1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(numpy.exp(conf))

# ----------------------------------------------------------------------------#
# Decision Tree
select_col = np.where(select_col == -99999, floor_value,
                      np.where(select_col == 99999, cap_value, select_col))  # replace -99999/99999
select_col_after_fc = np.where(select_col <= floor_value, floor_value,
                               np.where(select_col >= cap_value, cap_value, select_col))  # floor/cap
select_col_after_fc_impute = np.where(np.isnan(select_col_after_fc), impute_value,
                                      select_col_after_fc)  # impute
select_col_after_fc_impute_normalized = (select_col_after_fc_impute - mean_value) / std_value
indata[var_x + '_tran'] = select_col_after_fc_impute
indata[var_x + '_normalized'] = select_col_after_fc_impute_normalized

tran_vars = [x for x in list(indata) if '_tran' in x]
normalized_vars = [x for x in list(indata) if '_normalized' in x]
return indata.ix[:, tran_vars + normalized_vars].describe()

## applying the function to gcca_test to do transformation and normalization
gcca_test_tran_summary = normalize_test(gcca_test, normalized_summary_matrix)
tran_vars = [x + '_tran' for x in model_factors]
f = 'df ~ ' + ' + '.join(tran_vars)
f = 'df ~ net_mrgn_rto_tran + debt_srvc_cov_rto_tran + C(yrs_in_business_b) + debt_to_ebitda_rto_tran + cur_rto_tran'
m1 = smf.logit(formula=str(f), data=gcca_dev).fit()
print(auc(m1.predict(gcca_dev), gcca_dev.df) * 2 - 1)
print(auc(m1.predict(gcca_test), gcca_test.df) * 2 - 1)
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 9 13:59:07 2016

@author: emg
"""
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from variable_manipulation import df
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()

df = pd.read_csv(
    '/Users/emg/Google Drive/MSc SRMS/MSc Disseration/ass_reddit_comments_may_2015/practice_5_25.csv'
)

# make subreddits numeric
df['subreddit'] = pd.to_numeric(df['subreddit'], errors='coerce')  # doesn't work

subnums = {"N/A": 0.0, "AskSocialScience": 1.0, "AskStatistics": 2.0}
df['sub_num'] = df['subreddit'].apply(subnums.get)

prac = df.loc[:, ['mod', 'score', 'sub_num']]
prac = df.loc[:, ['mod', 'score', 'subreddit']]

lreg1 = smf.logit(formula='mod ~ sub_num', data=df).fit()
print(lreg1.summary())
# print(np.exp(clf.coef_)) #odds ratio # print(clf.coef_) #relationship print('method 2') # logit only accepts 0/1 as target values df_train.BOOKED.replace([1, 2], [0, 1], inplace=True) est = sm.Logit(y, X) est2 = est.fit() print(est2.summary()) params = est2.params conf = est2.conf_int() conf['Odds Ratio'] = params conf.columns = ['5%', '95%', 'Odds Ratio'] print(np.exp(conf)) ''' alternate https://pythonfordatascience.org/logistic-regression-python/ ''' model = smf.logit( formula="BOOKED~ C(PSYYR2)+ C(IRSEX)+ C(EDUCCAT2)+ C(IRMARIT)+ C(CATAG3)+ C(NEWRACE2)+ C(GOVTPROG)+ C(EMPSTATY)+ C(HVYDRK2)+ C(MJOFLAG)+ C(SUMFLAG)", data=df_train).fit() model.summary() model_odds = pd.DataFrame(np.exp(model.params), columns=['OR']) model_odds['z-value'] = model.pvalues model_odds[['2.5%', '97.5%']] = np.exp(model.conf_int()) print(model_odds)
plt.boxplot(df_affairs["rating"]) # No outlier is present ######Need log transform age for outlier########### x=np.log(df_affairs["age"]) plt.boxplot(x) #####Split the data into train and test############# from sklearn.model_selection import train_test_split train_data,test_data=train_test_split(df_affairs,test_size=0.3) train_data=train_data.reset_index() test_data=test_data.reset_index() train_data=train_data.drop(["index"],axis=1) test_data=test_data.drop(["index"],axis=1) ########Building the model############ import statsmodels.formula.api as sm train_data.isnull().sum() m1=sm.logit("AF~np.log(age)+yearsmarried+religiousness+education+occupation+rating+gender_female+gender_male+children_no+children_yes", data=train_data).fit() m1.summary() m1.summary2() #AIC=486 train_pred=m1.predict(train_data) from scipy import stats import scipy.stats as st st.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df) train_data["train_pred"]=np.zeros(420) train_data.loc[train_pred>=0.5,"train_pred"]=1
listtrain = pd.read_csv(r'H:\python\kaggle\06coupons\coupon_list_train.csv')  # coupon_id_hash level
listtrain.columns = [x.lower() for x in listtrain.columns]

# coupon_area_train.csv - the coupon listing area for the training set coupons
areatrain = pd.read_csv(r'H:\python\kaggle\06coupons\coupon_area_train.csv')
areatrain.columns = [x.lower() for x in areatrain.columns]

# coupon_detail_train.csv - the purchase log of users buying coupons during the training set time period. You are not provided this table for the test set period.
detailtrain = pd.read_csv(r'H:\python\kaggle\06coupons\coupon_detail_train.csv')  # purchaseid level
detailtrain.columns = [x.lower() for x in detailtrain.columns]

# coupon_visit_train.csv - the viewing log of users browsing coupons during the training set time period. You are not provided this table for the test set period.
visittrain = pd.read_csv(r'H:\python\kaggle\06coupons\coupon_visit_train.csv')
visittrain.columns = [x.lower() for x in visittrain.columns]

# sheet_name replaces the deprecated sheetname argument
capsule = pd.read_excel(r'H:\python\kaggle\06coupons\jpn_2_english.xlsx', sheet_name='capsule')
capsule.columns = [x.lower() for x in capsule.columns]
genre = pd.read_excel(r'H:\python\kaggle\06coupons\jpn_2_english.xlsx', sheet_name='genre')
genre.columns = [x.lower() for x in genre.columns]

# user list file
userlist = pd.read_csv(r'H:\python\kaggle\06coupons\user_list.csv')
userlist.columns = [x.lower() for x in userlist.columns]

# merge visit table with coupon info
train_data = pd.merge(listtrain, visittrain, left_on='coupon_id_hash',
                      right_on='view_coupon_id_hash', how='inner')

f = 'purchase_flg ~ C(genre_name) + price_rate + discount_price + C(usable_date_mon)'
logfit = smf.logit(formula=str(f), data=train_data).fit()
marker='o', edgecolors='r', facecolors='none') plt.ylim([0, 80000]) plt.xlim([0, 2800]) plt.legend(('default', 'no default'), loc='upper right') # 6 - What can you infer from this plot? # It appears that the balance is more correlated with default than income ''' PART II - LOGISTIC REGRESSION ''' # 1 - Run a logistic regression on the balance variable # 2 - Is the beta value associated with balance significant? balance = smf.logit('default ~ balance', data=train).fit() balance.summary() np.exp(balance.params.balance) # Beta is significant! # 2 - Predict the probability of default for someone with a balance of $1.2k and $2k prob = balance.predict({'balance': [1200, 2000]}) # What does beta mean? Let's create some plots to find out! x = np.linspace(test.balance.min(), test.balance.max(), 500) beta = [balance.params.Intercept, balance.params.balance] y = np.exp(beta[0] + beta[1] * x) / (1 + np.exp(beta[0] + beta[1] * x)) odds = np.exp(beta[0] + beta[1] * x) log_odds = beta[0] + beta[1] * x
data.shape
new_data['y'] = y
new_data.shape
new_data.columns
"""
(['education', 'age', 'rating', 'religiousness', 'occupation', 'yearsmarried',
  'gender', 'children', 'y'],
 dtype='object')
"""
from sklearn.model_selection import train_test_split
train, test = train_test_split(new_data, test_size=0.3)

import statsmodels.formula.api as smf
logit_model = smf.logit(
    'y~education+age+rating+religiousness+occupation+yearsmarried+gender+children',
    data=train).fit()
logit_model.summary()
"""
Log-Likelihood: -200.07
LL-Null: -225.77
LLR p-value: 2.201e-08
=================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        2.4451      1.091      2.241      0.025       0.307       4.583
education       -0.0456      0.062     -0.731      0.465      -0.168       0.077
age             -0.0558      0.023     -2.443      0.015      -0.101      -0.011
rating          -0.4970      0.113     -4.411      0.000      -0.718      -0.276
###############################################################################
"""
Running a smf.logit regression to understand the coefficient weights of the
features chosen for the models tested previously. There I identified the
following variables that will increase chances of survival (they will have
negative coefficients since I am using isDead): bk5_only, isMarried, book4,
dateOfBirth. By doing so I can have a better idea as to what is good and what
is bad for survival.
"""

# plain column names suffice in the formula since data=got_df is passed
log_got_p = smf.logit(formula="""isDead ~ alive_by_age +
                                          hm_books +
                                          popularity +
                                          numDeadRelations +
                                          bk5_only +
                                          isNoble +
                                          isMarried +
                                          book4 +
                                          bk1_only +
                                          dateOfBirth""",
                      data=got_df)

results_logistic_full = log_got_p.fit()
results_logistic_full.summary()

###############################################################################
##################### BEST MODEL IN TERMS OF AUC ##############################
###############################################################################
"""
This is the model I created with specific features that gave me the best Test
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd

df = pd.read_csv('nes.dat', sep=r'\s+')
df = df[['presvote', 'year', 'income', 'black']]
df = df[df['presvote'] < 3]  # keep only votes for the two major parties
# recode the 1,2 votes to 0,1: a yes/no for whether the vote went Republican
df['vote'] = df['presvote'].map(lambda x: x - 1)
df = df.drop('presvote', axis=1)
df = df.dropna()
df2 = df[df['year'] == 1992]
mdlm = smf.logit("vote ~ income", df2)
mdlmf = mdlm.fit()
print(mdlmf.summary())
def predict_ci(fitted, df, alpha=0.05):
    """
    Compute predicted probabilities with confidence intervals based on a
    logistic regression model

    Parameters
    ----------
    fitted  A logistic regression model fitted using the statsmodels formula interface
    df      A pandas dataframe with input data for prediction
    alpha   Significance level (0-1). Default is 0.05

    Returns
    -------
    A dataframe with probability predictions and lower and upper confidence bounds

    Example
    -------
    import numpy as np
    import statsmodels.formula.api as smf
    import pandas as pd
    import matplotlib.pyplot as plt

    # simulate data
    np.random.seed(1)
    x1 = np.arange(100)
    x2 = pd.Series(["a", "b", "c", "a"], dtype="category") \
           .sample(100, replace=True).reset_index(drop=True)
    y = (x1 * 0.5 + np.random.normal(size=100, scale=10) > 30).astype(int)
    df = pd.DataFrame({"y": y, "x1": x1, "x2": x2})

    # estimate the model
    model = smf.logit(formula="y ~ x1 + x2", data=df).fit()
    model.summary()
    pred = predict_ci(model, df)

    plt.clf()
    plt.plot(x1, pred["prediction"])
    plt.plot(x1, pred["2.5%"], color='black', linestyle="--", linewidth=0.5)
    plt.plot(x1, pred["97.5%"], color='black', linestyle="--", linewidth=0.5)
    plt.show()
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("alpha must be a numeric value between 0 and 1")

    # generate prediction
    prediction = fitted.predict(df)

    # adding a fake endogenous variable
    df = df.copy()  # making a full copy
    df["__endog__"] = 1
    form = "__endog__ ~ " + fitted.model.formula.split("~", 1)[1]
    df = smf.logit(formula=form, data=df).exog

    low, high = [alpha / 2, 1 - (alpha / 2)]
    Xb = np.dot(df, fitted.params)
    se = np.sqrt((df.dot(fitted.cov_params()) * df).sum(-1))
    me = norm.ppf(high) * se  # norm is scipy.stats.norm
    lb = np.exp(Xb - me)
    ub = np.exp(Xb + me)

    return pd.DataFrame({
        "prediction": prediction,
        f"{low*100}%": lb / (1 + lb),
        f"{high*100}%": ub / (1 + ub),
    })
affairs.affairs.value_counts()

# categorize into 0 and 1 for logistic regression
affairs["Att_val"] = np.zeros(601)
# converting the affairs count to a binary variable
affairs.loc[affairs.affairs >= 1, "Att_val"] = 1
affairs.drop(["affairs"], axis=1, inplace=True)

# encoding the string values
affairs.iloc[:, 0:1].columns
affairs["gender"] = pd.get_dummies(affairs["gender"])
affairs["children"] = pd.get_dummies(affairs["children"])

# model for logistic regression
import statsmodels.formula.api as sm
logit_model = sm.logit('Att_val~age+yearsmarried+religiousness+rating', data=affairs).fit()
logit_model.summary()

y_pred = logit_model.predict(affairs)
y_pred_val = y_pred
affairs["y_pred"] = y_pred
plt.hist(y_pred)
affairs.loc[y_pred >= 0.5, "y_pred"] = 1
affairs.loc[y_pred < 0.5, "y_pred"] = 0

from sklearn.metrics import classification_report
print(classification_report(affairs.Att_val, affairs.y_pred))  # classification report
'''
def sm_logit( df, f=None, features=None, outcome='outcome_field', add_constant=True, categorical=None, maxiter=35, reg_method=None, reg_alpha=10, missing='raise', #log_trans=None, #sort_by='z', #outcome_behavior=None, subset=None, method='newton', ): """reg_method: regularization method. None (default), 'l1' or 'l1_cvxopt_cp'. reg_alpha: weight to apply regularization penalty. Default: 1.0. higher alpha = more coeff equal to zero missing: what to do with rows with missing values. 'raise' (default) or 'drop'. """ if features is not None: df = df[features] else: features = df.columns.tolist() #if add_constant: # df = sm.tools.add_constant(df, prepend=False, has_constant='raise') if f is None: these_features = [x for x in features if x != outcome] if categorical is not None: f = '{} ~ '.format(outcome) + ' + '.join([ 'C({})'.format(x) if x in categorical else x for x in these_features ]) else: f = '{} ~ '.format(outcome) + ' + '.join( [x for x in these_features]) # debug print(f) if reg_method is not None: # if subset is not None: # df = df.loc[subset, :] # y, X = patsy.dmatrices(f, df, return_type='dataframe') # reg_alpha = reg_alpha * np.ones(X.shape[1]) # reg_alpha[X.columns.tolist().index('Intercept')] = 0 # results_log = sm.Logit(y, X, missing=missing).fit_regularized(method=reg_method, alpha=reg_alpha) results_log = smf.logit(f, df, subset=subset, missing=missing).fit_regularized( method=reg_method, alpha=reg_alpha) else: #results_log = sm.Logit.from_formula(f, df, missing='raise').fit(maxiter=maxiter) results_log = smf.logit(f, df, subset=subset, missing=missing).fit(maxiter=maxiter, method=method) #print_sm_logit_results(results_log, sort_by=sort_by, log_trans=log_trans, outcome_behavior=outcome_behavior) return results_log
featuresdf['status_group_relabel'] = featuresdf.apply(lambda row: relabel(row), axis=1) # do a table for status group and show the mean population pivot1 = featuresdf.pivot_table( index="status_group", values=["amount_tsh", "population", "gps_height"], aggfunc=[np.mean, np.median, statistics.mode]) ax = pivot1.plot.bar(rot=0) plt.show() for i in yvals: lreg1 = smf.logit(formula=i + '~ amount_tsh + population', data=featuresdf).fit() print(lreg1.summary()) print('') # odds ratios print("Odds Ratios") print(np.exp(lreg1.params)) # odd ratios with 95% confidence intervals params = lreg1.params conf = lreg1.conf_int() conf['OR'] = params conf.columns = ['Lower CI', 'Upper CI', 'OR'] print(np.exp(conf))
def regression(df, rf):
    model = smf.logit(data=df, formula=rf).fit()
    print(model.summary())
bank1 = pd.concat([ag_log,bank1],axis=1) #concating log_age column with the dataset bank1.drop(["age"],inplace = True, axis=1) #since age column is insignificant and not needed x= bank1.iloc[:,0:42]#creating a new object with only taking the features or i/p variables #x.drop(["y"],inplace= True , axis=1) model1= sm.logit("y_yes~x", data = bank1 ).fit() model1.summary() model1.summary2() ## AIC: 21644.8803 #Removing all the insignificant columns which are not needed ## Majorly insignificant colmns is being removed and seeing if the insignificance of the variables are removed x.iloc[:,23].name x0= x.drop(["default_yes"],axis=1) model2= sm.logit("y_yes~x0", data = bank1).fit() model2.summary()
data = {"x1":x1.flatten(), "x2":x2.flatten(), "y_discrete":y_discrete.flatten(), "y_auxiliar":y_auxiliar.flatten()} df = pd.DataFrame(data) # Plot the data. What type is it? What should we expect from it? fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(df['x1'], df['x2'], df['y_discrete'], label="y measured", color="k") ax.set_xlabel('x1') ax.set_ylabel('x2') ax.set_zlabel('y') plt.legend(loc="upper left", fontsize=10, numpoints=1) plt.show() # Ordinary Least Squares = Linear Regression model = smf.logit(formula="y_discrete ~ x1 + x2", data=df) fitted_model = model.fit() coeffs = fitted_model.params print fitted_model.summary() print "The model obtained is y = 1./(1 + exp(-({0} + {1}*x1 + {1}*x2)))".format(*coeffs) print coeffs # Plot the data. What type is it? What should we expect from it? y_model = fitted_model.predict(df[["x1","x2"]]) y_prediction = np.where(y_model<0.5, 0, 1) fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(df['x1'], df['x2'], df['y_discrete'], label="y measured", color="k") ax.plot_trisurf(df['x1'], df['x2'], y_model, label="Model", color="w", alpha=0.25) ax.scatter(df['x1'], df['x2'], y_prediction + 0.05, label="Prediction", color="r") ax.set_xlabel('x1')
## Generating auc_score based on the optimal KNN model
print('knn_auc_train Score', knn_train_auc_score.round(4))  # score - 0.852
print('knn_auc_test Score', knn_test_auc_score.round(4))    # score - 0.828

###############################################################################
## Classification with logistic
###############################################################################

# plain column names suffice in the formula since data=got is passed
logistic_full = smf.logit(formula="""isAlive ~ male +
                                               age +
                                               isMarried +
                                               isNoble +
                                               book1_A_Game_Of_Thrones +
                                               book3_A_Storm_Of_Swords +
                                               book4_A_Feast_For_Crows +
                                               popularity +
                                               house_alive_pct +
                                               title_alive_pct""",
                          data=got)

results_logistic_full = logistic_full.fit()
results_logistic_full.summary()
results_logistic_full.pvalues

###############################################################################
# Hyperparameter Tuning with Logistic Regression
# RainTomorrow: dependent variable; the remaining columns: independent variables
# split into train / test datasets to guard against overfitting
train, test = train_test_split(data2, test_size=0.3, random_state=42)  # shuffle the data, then hold out 30%
print(data.shape, train.shape, test.shape)

# classification model
my_formula = 'RainTomorrow ~ MinTemp + MaxTemp + Rainfall....'
col_select = '+'.join(train.columns.difference(['RainTomorrow']))
my_formula = 'RainTomorrow ~' + col_select
print(my_formula)

# build the model for classification; fit on the training data
#model = smf.glm(formula=my_formula, data=train, family=sm.families.Binomial()).fit()
model = smf.logit(formula=my_formula, data=train).fit()

#print(model.summary())
#print(model.params())
print('predicted:', np.rint(model.predict(test)[:5]))  # predict on the test data
print('actual:', test['RainTomorrow'][:5])

# classification accuracy
conf_mat = model.pred_table()
print(conf_mat)
print((conf_mat[0][0] + conf_mat[1][1]) / len(train))  # fixed: sum the diagonal of the confusion matrix

from sklearn.metrics import accuracy_score
pred = model.predict(test)
print('classification accuracy : ', accuracy_score(test['RainTomorrow'], np.around(pred)))
def binning(oldDF, dataSeries, binList, newColName, labelList, deleteOldColumn): columnForBins = pandas.cut(x = dataSeries, bins = binList, labels = labelList).to_frame() columnForBins.columns = [newColName] df_new = pandas.concat([oldDF,columnForBins],axis = 1) if deleteOldColumn is True: df_new = df_new.drop(dataSeries.name,axis=1) return df_new dataForFrame = {'IPP': data['incomeperperson'], 'IUR': data['internetuserate'], 'LE': data['lifeexpectancy']} df = pandas.DataFrame(data = dataForFrame).dropna() df = binning(df, df['LE'], [0, 70, 100], 'BinnedLE', ['0-70','70-100'], True) df = df.replace(to_replace={'BinnedLE' : {'0-70' : 0, '70-100' : 1}}) # logistic regression with Income per Person lreg1 = smf.logit(formula = 'BinnedLE ~ IPP', data = df).fit() print (lreg1.summary()) # odd ratios with 95% confidence intervals params = lreg1.params conf1 = lreg1.conf_int() conf1['OR'] = params conf1.columns = ['Lower CI', 'Upper CI', 'OR'] print (numpy.exp(conf1)) # logistic regression with IUR lreg2 = smf.logit(formula = 'BinnedLE ~ IUR', data = df).fit() print (lreg2.summary()) # odd ratios with 95% confidence intervals params = lreg2.params
sub1['hh_income_c'] = sub1['hh_income'] - sub1['hh_income'].mean() sub1['hh_income_c'].describe() sub1['hh_income_c'] = sub1['hh_income_c'] / 10000 sub1['hh_income_c'].describe() reg1 = smf.ols('neg_outlook ~ hh_income_c', data=sub1).fit() print(reg1.summary()) #Adj. R-squared = -0.000 #household income insignificant reg2 = smf.ols('neg_outlook ~ soc_class', data=sub1).fit() print(reg2.summary()) #Adj. R-squared = 0.002 reg3 = smf.ols('neg_outlook ~ soc_class + C(ethnicity)', data=sub1).fit() print(reg3.summary()) #Adj. R-squared = 0.002 -> 0.045 lreg1 = smf.logit(formula='neg_outlook ~ hh_income_c', data=sub1).fit() print(lreg1.summary()) #household income insignificant print("Odds Ratios") print(numpy.exp(lreg1.params)) lreg2 = smf.logit(formula='neg_outlook ~ soc_class', data=sub1).fit() print(lreg2.summary()) params = lreg2.params conf = lreg2.conf_int() conf['OR'] = params conf.columns = ['Lower CI', 'Upper CI', 'OR'] print(numpy.exp(conf)) lreg3 = smf.logit(formula='neg_outlook ~ soc_class + C(ethnicity)', data=sub1).fit()
election.iloc[:, 2:] = election.iloc[:, 2:].apply(lambda x: x.fillna(x.mean()))
election.PR = election.PR.fillna(election.PR.mean())
election.iloc[:, 3:] = election.iloc[:, 3:].apply(lambda x: x.fillna(x.mean()))

# Checking if we have na values or not
election.isnull().sum()  # No null values

from scipy import stats
import scipy.stats as st
st.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

#Model building
import statsmodels.formula.api as sm
logit_model = sm.logit('Result~Year+AS+PR', data=election).fit()

#summary
logit_model.summary()
y_pred = logit_model.predict(election)

election["pred_prob"] = y_pred
# Creating a new column for storing the predicted class of Result
# filling all the cells with zeroes
election["Att_val"] = 0
# taking threshold value as 0.5; prob values at or above it are treated as class 1
election.loc[y_pred >= 0.5, "Att_val"] = 1
import statsmodels.api as sm from statsmodels.formula.api import logit, probit, poisson, ols print(sm.datasets.fair.SOURCE) print(sm.datasets.fair.NOTE) dta = sm.datasets.fair.load_pandas().data dta['affair'] = (dta['affairs'] > 0).astype(float) print(dta.head(10)) print(dta.describe()) affair_mod = logit( "affair ~ occupation + educ + occupation_husb" "+ rate_marriage + age + yrs_married + children" " + religious", dta).fit() print(affair_mod.summary()) # How well are we predicting? affair_mod.pred_table() # The coefficients of the discrete choice model do not tell us much. What we're after is marginal effects. mfx = affair_mod.get_margeff() print(mfx.summary()) respondent1000 = dta.iloc[1000] print(respondent1000)
#We now apply these equations to our variables to get new columns #Note: we don't include longitude as our linear regression model showed no strong association #between it and the crater diameter data2['LATITUDE_BIN'] = data2['LATITUDE'].apply(lambda x: georegion(x)) data2['CRATER_SIZE_BIN'] = data2['DIAMETER'].apply(lambda x: cratersize(x)) data2['NUMBER_LAYERS_BIN'] = data2['NUMBER_LAYERS'].apply(lambda x: layers(x)) data2['DEPTH_BIN'] = data2['DEPTH'].apply(lambda x: depth(x)) data2.head(5) #now we'll look at our logistic regression just using our primary variable print('Modelling between crater size and latitude bin') model1 = smf.logit(formula='CRATER_SIZE_BIN ~ LATITUDE_BIN',data=data2).fit() print(model1.summary()) print('Odds Ratios') print(numpy.exp(model1.params)) #odds ratios with 95% confidence intervals params = model1.params conf = model1.conf_int() conf['OR'] = params conf.columns = ['Lower CI', 'Upper CI', 'OR'] numpy.exp(conf) print('Modelling between crater size and latitude bin and layers') model2 = smf.logit(formula='CRATER_SIZE_BIN ~ LATITUDE_BIN + NUMBER_LAYERS_BIN',data=data2).fit() print(model2.summary()) print('Odds Ratios')
plt.plot(mean_income_by_educ, 'o', alpha = 0.5) # Plot the predictions pred = results.predict(df) plt.plot(df['educ'], pred, label='Age 30') # Label axes plt.xlabel('Education (years)') plt.ylabel('Income (1986 $)') plt.legend() plt.show() # Predicting a binary variable # Let's use logistic regression to predict a binary variable. Specifically, we'll use age, sex, and education level to predict support for legalizing cannabis (marijuana) in the U.S. # In the GSS dataset, the variable grass records the answer to the question "Do you think the use of marijuana should be made legal or not?" # Recode grass gss['grass'].replace(2, 0, inplace=True) # Run logistic regression results = smf.logit('grass~ age + age2 + educ + educ2 + C(sex)', data = gss).fit() results.params
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import scipy.stats as stats

# set the random seed:
np.random.seed(1234567)

y = stats.binom.rvs(1, 0.5, size=100)
x = stats.norm.rvs(0, 1, size=100) + 2 * y
sim_data = pd.DataFrame({'y': y, 'x': x})

# estimation:
reg_lin = smf.ols(formula='y ~ x', data=sim_data)
results_lin = reg_lin.fit()
reg_logit = smf.logit(formula='y ~ x', data=sim_data)
results_logit = reg_logit.fit(disp=0)
reg_probit = smf.probit(formula='y ~ x', data=sim_data)
results_probit = reg_probit.fit(disp=0)

# calculate partial effects:
PE_lin = np.repeat(results_lin.params['x'], 100)

xb_logit = results_logit.fittedvalues
factor_logit = stats.logistic.pdf(xb_logit)
PE_logit = results_logit.params['x'] * factor_logit

xb_probit = results_probit.fittedvalues
factor_probit = stats.norm.pdf(xb_probit)
PE_probit = results_probit.params['x'] * factor_probit
new_data.columns """ we need o add y output variable to the new data set """ data.shape new_data['y'] = y new_data.shape new_data.columns from sklearn.model_selection import train_test_split train, test = train_test_split(new_data, test_size=0.3) import statsmodels.formula.api as smf logit_model = smf.logit( 'y~duration+balance+pdays+poutsuccess+previous+campaign+con_unknown+housing+con_cellular+joretired', data=train).fit() logit_model.summary() """ Logit Regression Results ============================================================================== Dep. Variable: y No. Observations: 31647 Model: Logit Df Residuals: 31636 Method: MLE Df Model: 10 Date: Sat, 07 Dec 2019 Pseudo R-squ.: 0.2941 Time: 06:05:38 Log-Likelihood: -8074.2 converged: True LL-Null: -11438. LLR p-value: 0.000 ================================================================================ coef std err z P>|z| [0.025 0.975] --------------------------------------------------------------------------------
def lr_pvalue(null, full, df):
    # compare two nested models, null and full
    # df should be the difference in the number of parameters
    # return the p-value for the deviance (2 * difference in log-likelihood)
    lrstat = -2 * null.llf + 2 * full.llf
    return stats.chi2.sf(lrstat, df=df)  # using chi-square

if __name__ == "__main__":
    # subsample
    samplesize = 1541
    thres = float(0.05) / float(samplesize)
    p_lr = []
    # load SNP data
    df = pandas.read_csv("mycsv.csv")
    df = df.loc[np.random.choice(df.index, samplesize, replace=False)]
    # null model with just sex
    nullmodel = logit("PHENOTYPE ~ SEX", data=df).fit(disp=False)
    for i in range(0, 1110):
        mymodel = logit("PHENOTYPE ~ snp_%d + SEX" % i, data=df).fit(disp=False)
        p = lr_pvalue(nullmodel, mymodel, 1)
        if p < thres:
            p_lr.append(['snp_%d' % i, 'p-value %.5g' % p])
    with open('question5.txt', 'wt') as fout:
        for line in p_lr:
            fout.writelines(str(line))
            fout.write("\n")
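# Hedged sketch of lr_pvalue with two nested logit fits on synthetic data;
# the column names are illustrative, not from the SNP file above.
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.formula.api import logit

rng = np.random.default_rng(7)
demo = pd.DataFrame({"SEX": rng.integers(0, 2, 800),
                     "snp": rng.integers(0, 3, 800)})
xb = -0.5 + 0.3 * demo["SEX"] + 0.4 * demo["snp"]
demo["PHENOTYPE"] = rng.binomial(1, 1 / (1 + np.exp(-xb)))

null = logit("PHENOTYPE ~ SEX", data=demo).fit(disp=False)
full = logit("PHENOTYPE ~ snp + SEX", data=demo).fit(disp=False)
print(lr_pvalue(null, full, df=1))  # one extra parameter in the full model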
bank.columns
bank.corr()
banks = pd.DataFrame(bank)
banks.shape

import statsmodels.formula.api as sm
logit_model = sm.logit('yy~age+balance+day+duration+campaign+pdays+previous+jobs+maritals+educations+defaults+housings+loans+contacts+months', data=banks).fit()
logit_model.summary()

logit_model2 = sm.logit('yy~age+balance+day+duration+campaign+pdays+previous+maritals+educations+defaults+housings+loans+contacts+months', data=banks).fit()
logit_model2.summary()

y_pred = logit_model2.predict(banks)
y_pred
banks['pred_prob'] = y_pred
banks
banks["Att_val"] = 0
banks.loc[y_pred >= 0.5, "Att_val"] = 1  # fixed typo: was 'bankss'
banks.Att_val

from sklearn.metrics import classification_report
classification_report(banks.Att_val, banks.yy)

confusion_matrix = pd.crosstab(banks['yy'], banks.Att_val)
confusion_matrix
accuracy = (39150 + 1112) / (39150 + 772 + 4177 + 1112)
sum(so) as p_stout
from pitching
group by playerid) p
on h.playerid = p.playerid;'''

new = pandas.read_sql(data, con)
con.close()

##############################################################
###  Start with considering as many explanatory variables  ###
##############################################################

model1 = logit('inducted ~ p_wins + p_loss + p_shout + p_saves + p_stout', data=new).fit()
# model1 is our fitted model.
print(model1.summary())

#==============================================================================
#                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
#------------------------------------------------------------------------------
#Intercept     -2.3188      0.310     -7.477      0.000        -2.927    -1.711
#p_wins         0.0273      0.006      4.377      0.000         0.015     0.039
#p_loss        -0.0346      0.007     -4.957      0.000        -0.048    -0.021
#p_shout        0.0193      0.022      0.886      0.376        -0.023     0.062
#p_saves    -5.344e-06      0.004     -0.001      0.999        -0.008     0.008
#p_stout        0.0005      0.000      1.430      0.153        -0.000     0.001
#==============================================================================
data['posipoli'] = data.apply(lambda row: POSIPOLI(row), axis=1) # check the new positive polity score variable print('Check positive polity score counts:') posipolicheck = data['posipoli'].value_counts(sort=False, dropna=False) print (posipolicheck) print() # center quantitative IVs for regression analysis data['incomeperperson_c'] = (data['incomeperperson'] - data['incomeperperson'].mean()) data['employrate_c'] = (data['employrate'] - data['employrate'].mean()) # logistic regression with posipoli print('Logistic regression for positive polity score and urban rate:') lreg1 = smf.logit(formula = 'posipoli ~ urbanrate', data = data).fit() print (lreg1.summary()) print() # odds ratios print ("Odds Ratios for Positive Polity Score and Urban Rate:") print (numpy.exp(lreg1.params)) print() # odd ratios with 95% confidence intervals print('Odds Ratios for Positive Polity Score and Urban Rate with 95% Confidence Intervals:') params = lreg1.params conf = lreg1.conf_int() conf['OR'] = params conf.columns = ['Lower CI', 'Upper CI', 'OR'] print (numpy.exp(conf)) print()
print(urbanization_threshold) # Set binary flag that urbanization rate is greater than the threshold def urbanrate_higher_than_threshold(urbanrate): if urbanrate > urbanization_threshold: return 1 else: return 0 subset['high_urbanrate'] = subset['urbanrate'].apply(urbanrate_higher_than_threshold) counts = subset.groupby('high_urbanrate').size() print(counts) # logistic regression with society type lreg1 = smf.logit(formula = 'high_income ~ full_democracy', data = subset).fit() print (lreg1.summary()) # odd ratios with 95% confidence intervals params = lreg1.params conf = lreg1.conf_int() conf['OR'] = params conf.columns = ['Lower CI', 'Upper CI', 'OR'] print (np.exp(conf)) # logistic regression with society type and urbanization rate lreg2 = smf.logit(formula = 'high_income ~ full_democracy + high_urbanrate', data = subset).fit() print (lreg2.summary()) # odd ratios with 95% confidence intervals params = lreg2.params
# https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
import statsmodels.api as sm
import statsmodels.formula.api as smf

# the Logit class lives in statsmodels.api, not the formula API
logit_model = sm.Logit(y_train, X_train)
results = logit_model.fit()
print(results.summary2())

# Column names for reference above
df_column_name = pd.DataFrame(list(df_full_data.drop(['Attrition'], axis=1).columns.values))
df_column_name.index = np.arange(1, len(df_column_name) + 1)
df_column_name

# note: the new clustering dataset has slightly different columns
#model= smf.logit(formula="Attrition~ ClusterSegment + Age + DailyRate + EnvironmentSatisfaction + JobInvolvement + JobSatisfaction + NumCompaniesWorked + RelationshipSatisfaction + TotalWorkingYears + TrainingTimesLastYear + WorkLifeBalance + OverTime + PerformanceRating + MaritalStatus_Divorced + MaritalStatus_Married + MaritalStatus_Single + DistanceFromHomeRange_1_4 + DistanceFromHomeRange_5_9 + DistanceFromHomeRange_10_19 + DistanceFromHomeRange_20_30 + DistanceFromHomeRange_Over30 + NumCompaniesWorkedRange_0_2 + NumCompaniesWorkedRange_3_5 + NumCompaniesWorkedRange_6_10 + NumCompaniesWorkedRange_10over + YearsAtCompanyRange_0_2 + YearsAtCompanyRange_3_5 + YearsAtCompanyRange_6_10 + YearsAtCompanyRange_10over", data= df_full_data).fit(method='lbfgs')
model = smf.logit(formula="Attrition~ Age + DailyRate + EnvironmentSatisfaction + JobInvolvement + JobSatisfaction + RelationshipSatisfaction + TotalWorkingYears + TrainingTimesLastYear + WorkLifeBalance + OverTime + PerformanceRating + MaritalStatus_Divorced + MaritalStatus_Married + MaritalStatus_Single", data=df_full_data).fit()
model.summary()

# GETTING THE ODDS RATIOS, Z-VALUE, AND 95% CI
model_odds = pd.DataFrame(np.exp(model.params), columns=['OR'])
model_odds['z-value'] = model.pvalues
model_odds[['2.5%', '97.5%']] = np.exp(model.conf_int())
model_odds

"""# Model 6: Neural Network"""

# Random seeds
np.random.seed(123)
rn.seed(123)
tf.set_random_seed(123)
data['bin2alcohol'] = data.apply(lambda row: bin2alcohol(row), axis=1)

# create binary female employment rate
def bin2femalemployee(row):
    if row['femaleemployrate'] <= 50:
        return 0
    elif row['femaleemployrate'] > 50:
        return 1

#Apply the new variable bin2femalemployee to the gapminder dataset
data['bin2femalemployee'] = data.apply(lambda row: bin2femalemployee(row), axis=1)

##############################################################################
# LOGISTIC REGRESSION
##############################################################################

# logistic regression with binary breast cancer per 100th women
lreg1 = smf.logit(formula='bin2cancer ~ bin2alcohol', data=data).fit()
print(lreg1.summary())

# odds ratios
print("Odds Ratios")
print(np.exp(lreg1.params))

# odd ratios with 95% confidence intervals
print('Logistic regression with binary alcohol consumption')
print('Odd ratios with 95% confidence intervals')
params = lreg1.params
conf = lreg1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(np.exp(conf))

print("\n-----------------------------\n")

# logistic regression with binary income per person and binary alcohol consumption
# df.info() # df.Species.unique() # filtering for two species # df_subset = df[(df.Species == "versicolor") | (df.Species == "virginica")].copy() # print(df_subset.Species.unique())\ # # df_subset.Species = df_subset.Species.map({"versicolor": 1, "virginica": 0}) # # df_subset.rename(columns={"Sepal.Length": "Sepal_Length", "Sepal.Width": "Sepal_Width", # "Petal.Length": "Petal_Length", "Petal.Width": "Petal_Width"}, inplace=True) # # creating a model # model = smf.logit("Species ~ Petal_Length + Petal_Width",data=df_subset) # result = model.fit() # # print(result.summary()) # # # predicting response values # # df_new = pd.DataFrame({"Petal_Length": np.random.randn(20)*0.5 + 5, # "Petal_Width": np.random.randn(20)*0.5 + 1.7}) # df_new["P-Species"] = result.predict(df_new) # df_new["P-Species"].head(3) dataset = sm.datasets.get_rdataset("biopsy", package="MASS").data dataset.rename(columns={"class": "Class"}, inplace=True) dataset.Class = dataset.Class.map({"benign": 0, "malignant": 1}) model = smf.logit("Class ~ V1", data=dataset) result = model.fit() print(result.prsquared)