import warnings

import numpy as np
import pandas as pd
import statsmodels.discrete.discrete_model as dm
from statsmodels.tools.sm_exceptions import ConvergenceWarning


def gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels):
    Y = [sample_condition_labels[i] for i in Y]
    # Pad both conditions with pseudo-observations so a degenerate input
    # cannot be perfectly separated.
    y_pred = np.append(y_pred, [0, 0, 1, 1])
    Y.extend([condition_labels[0], condition_labels[1],
              condition_labels[0], condition_labels[1]])
    # get_dummies returns boolean dummies in pandas >= 2.0; cast for statsmodels.
    Y = pd.get_dummies(Y).astype(int)
    Y['intercept'] = 1
    logit = dm.Logit(y_pred, Y[['intercept', condition_labels[1]]])
    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        try:
            logit_mod = logit.fit(disp=0)
            # Use positional .iloc; plain integer indexing of a labeled
            # Series is deprecated in modern pandas.
            logit_pvalue = logit_mod.pvalues.iloc[1]
            logit_coef = logit_mod.params.iloc[1]
        except ConvergenceWarning:
            logit_mod, logit_pvalue, logit_coef = "NC", 1, "NC"
    if logit_pvalue == 0:
        # np.float was removed from NumPy; use the builtin float.
        logit_pvalue = np.finfo(float).tiny
    logit_results = {'pvalue': logit_pvalue,
                     'coef': logit_coef,
                     'model': logit_mod}
    return logit_results
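# Usage sketch (illustrative, not from the source): drive gmm_logit_test with
# synthetic cluster assignments and GMM posterior probabilities. The label
# names and toy values below are assumptions.
if __name__ == '__main__':
    sample_condition_labels = {0: 'ctrl', 1: 'ctrl', 2: 'treat', 3: 'treat'}
    condition_labels = ['ctrl', 'treat']
    assignments = [0, 1, 2, 3]                   # sample indices
    posteriors = np.array([0.1, 0.2, 0.8, 0.9])  # per-sample probabilities
    out = gmm_logit_test(assignments, posteriors,
                         sample_condition_labels, condition_labels)
    print(out['pvalue'], out['coef'])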
def _fit_logit(self, X, y, keep_significant, sig_value=0.05):
    f = sm.Logit(y, X).fit()
    if keep_significant:
        sig_cols = list(f.params[f.pvalues <= sig_value].index)
        return f, sig_cols
    return f
def fit_model_disaster(df, model):
    '''Fit a logistic regression model for whether a disaster occurred.

    Parameters
    ----------
    df: dataframe
        Contains columns of outcome and predictors.
    model: str
        A formula string used to construct a design matrix.
        Ex. 'has_disaster ~ C(iso3) + C(region_id) + year_id + cz_prop'

    Returns
    -------
    modeled_df: dataframe
        Contains draws of predicted events. 1 indicates the event occurs;
        0 indicates otherwise.
    '''
    # Build the response vector y and design matrix X from the model formula.
    y, X = dmatrices(model, df, return_type='dataframe')
    # Construct and fit the logit model.
    logit = sm.Logit(y, X)
    result = logit.fit()
    modeled_df = df.copy()
    # Predicted probability from the fitted model.
    modeled_df['predicted_prob'] = result.predict()
    # Create columns to store the sampled Bernoulli draws.
    modeled_df = modeled_df.reindex(columns=list(modeled_df.columns) + DRAW_COLS)
    # Draw random samples from a Bernoulli distribution with the predicted
    # probability as the success parameter.
    modeled_df[DRAW_COLS] = np.array([bernoulli.rvs(p, size=DRAW)
                                      for p in modeled_df.predicted_prob.values])
    return modeled_df
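# Usage sketch (illustrative, not from the source): fit_model_disaster relies
# on module-level DRAW and DRAW_COLS constants, so toy values are defined here.
import numpy as np
import pandas as pd
from patsy import dmatrices
from scipy.stats import bernoulli
import statsmodels.api as sm

DRAW = 10
DRAW_COLS = ['draw_%d' % i for i in range(DRAW)]

rng = np.random.RandomState(0)
toy = pd.DataFrame({'has_disaster': rng.binomial(1, 0.3, 100),
                    'year_id': rng.randint(2000, 2010, 100),
                    'cz_prop': rng.rand(100)})
out = fit_model_disaster(toy, 'has_disaster ~ year_id + cz_prop')
print(out[DRAW_COLS].mean(axis=1).head())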
def backward_selection(df, dv, regression=True, alpha=.05):
    cols_dropped = [dv]
    # OLS for a continuous outcome, Logit for a binary one.
    model_class = lm.OLS if regression else sm.Logit
    while True:
        model = model_class(endog=np.array(df[dv]),
                            exog=np.array(df.drop(cols_dropped, axis=1)))
        results = model.fit()
        pvalues = list(results.pvalues)
        # Drop the predictor with the largest p-value, if it exceeds alpha.
        drop_index = pvalues.index(max(pvalues))
        col_drop = df.drop(cols_dropped, axis=1).columns[drop_index]
        print(col_drop + '-' + str(pvalues[drop_index]))
        if pvalues[drop_index] > alpha:
            cols_dropped.append(col_drop)
        else:
            break
    cols_dropped.remove(dv)
    return cols_dropped
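# Usage sketch (illustrative, not from the source): backward selection on
# synthetic data where only x1 drives y. Assumes the aliases used above, e.g.
# import statsmodels.regression.linear_model as lm and
# import statsmodels.discrete.discrete_model as sm.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo = pd.DataFrame(rng.randn(500, 3), columns=['x1', 'x2', 'x3'])
demo['y'] = 2.0 * demo['x1'] + rng.randn(500)
dropped = backward_selection(demo, 'y', regression=True)
print('dropped:', dropped)  # expect the noise columns x2 and x3 here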
def __init__(self):
    from .results.results_glm import Lbw

    self.res2 = Lbw()
    self.res1 = GLM(self.res2.endog, self.res2.exog,
                    family=sm.families.Binomial()).fit()

    modd = discrete.Logit(self.res2.endog, self.res2.exog)
    self.resd = modd.fit(start_params=self.res1.params * 0.9, disp=False)
def standard_ci_sm(X, y, active, leftout_indices, alpha=0.1):
    import statsmodels.discrete.discrete_model as sm

    XE = X[:, active]
    X2, y2 = XE[leftout_indices, :], y[leftout_indices]
    logit = sm.Logit(y2, X2)
    result = logit.fit(disp=0)
    # conf_int returns one (lower, upper) row per coefficient.
    LU = result.conf_int(alpha=alpha)
    return LU.T
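# Usage sketch (illustrative, not from the source): confidence intervals for
# two active coefficients on a held-out half of simulated data.
import numpy as np

rng = np.random.RandomState(0)
n, p = 400, 5
X_sim = rng.randn(n, p)
y_sim = rng.binomial(1, 1.0 / (1.0 + np.exp(-X_sim[:, 0])))
active = np.array([0, 1])       # coefficients of interest
leftout = np.arange(n // 2, n)  # rows held out from selection
print(standard_ci_sm(X_sim, y_sim, active, leftout, alpha=0.1))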
def check_betas(data):
    # Fit a univariate logit of the first column on each remaining column
    # and collect the slope coefficients.
    empirical_betas = []
    for i in range(data.shape[1] - 1):
        model = sm.Logit(data.iloc[:, 0],
                         sm.tools.add_constant(data.iloc[:, i + 1]))
        result = model.fit(disp=0)
        # .iloc: integer indexing of a labeled Series is deprecated.
        empirical_betas.append(result.params.iloc[1])
    return np.array(empirical_betas)
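# Usage sketch (illustrative, not from the source): recover per-feature slopes
# from simulated data whose first column is the binary outcome. Assumes
# import statsmodels.api as sm, as in the function above.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
X_cols = rng.randn(300, 3)
logits = 1.2 * X_cols[:, 0] - 0.7 * X_cols[:, 1]
y_col = rng.binomial(1, 1.0 / (1.0 + np.exp(-logits)))
frame = pd.DataFrame(np.column_stack([y_col, X_cols]),
                     columns=['y', 'x1', 'x2', 'x3'])
print(check_betas(frame))  # expect positive, negative, and near-zero slopes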
def setup_class(cls):
    endog_bin = (endog > endog.mean()).astype(int)
    cls.cov_type = 'cluster'

    mod1 = GLM(endog_bin, exog, family=families.Binomial())
    cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

    mod1 = smd.Logit(endog_bin, exog)
    cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
def fit(self, X, Y):
    self.X = sm.add_constant(X)
    self.Y = Y
    self.model = dis_mod.Logit(self.Y.values, self.X)
    try:
        self.fitted_model = self.model.fit()
        return
    except np.linalg.LinAlgError:
        print('LinAlgError while fitting the logit model; '
              'skipping this set and moving on.')
        return -1
def fit_logit_model(self, X_df, y):
    '''Fit a logit model.

    Args:
        X_df: the independent variables (covariates and/or aggregated
            genotypes) for 1 gene.
        y (numpy.ndarray): values are 0/1 (unaffected/affected).

    Returns:
        logit_result (statsmodels.discrete.discrete_model.BinaryResultsWrapper):
            contains results from fitting the logit regression model.
    '''
    logit_model = sm.Logit(y, X_df.transpose())
    logit_result = logit_model.fit(method='bfgs')
    return logit_result
def regression(cps_alldata):
    cps_alldata['intercept'] = np.ones(len(cps_alldata))
    model = sm.Logit(endog=cps_alldata.ss_indicator,
                     exog=cps_alldata[['Aged_yn', 'Disabled_yn', 'Widowed_yn',
                                       'ssi_yn', 'sur_yn', 'vet_yn', 'paw_yn',
                                       'hed_yn', 'hcsp_yn', 'hfdval', 'mcare',
                                       'mcaid', 'uc_yn', 'wc_yn', 'intercept']])
    results = model.fit()
    print(results.summary())
    ypred = results.predict()
    cps_alldata['Prob_Received'] = ypred
    return cps_alldata
def q2():
    print()
    print()
    print()
    data = pd.read_csv(
        "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
    )
    # Turn the string Sex attribute into 0/1 dummy columns
    dummy_ranks = pd.get_dummies(data["Sex"], prefix="Sex")
    # Save class labels
    classLabels = data["Survived"]
    # Remove attributes that don't help with classification or that have
    # already been encoded (Sex)
    data = data.drop(["PassengerId", "Name", "Ticket", "Cabin", "Embarked",
                      "Sex", "Survived"], axis=1)
    # .ix was removed from pandas; keep only the Sex_female dummy column,
    # cast to int for statsmodels
    data = data.join(dummy_ranks["Sex_female"].astype(int))
    # Create training and testing data (80/20 split);
    # sklearn.cross_validation was renamed to sklearn.model_selection
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        data, classLabels, test_size=0.2)
    # Create and train logistic model
    logit = discrete_model.Logit(y_train, x_train, missing='drop')
    fitted = logit.fit()
    # Predict testing data
    result = fitted.predict(x_test)
    result = np.asarray(result)
    y_test = np.asarray(y_test)
    total = 0
    correct = 0
    for i in range(0, len(result)):
        if not math.isnan(result[i]):
            total += 1
            if round(result[i]) == y_test[i]:
                correct += 1
    accuracy = correct / float(total)
    print("Accuracy of Logistic Regression: " + str(accuracy))
def logistic_with_interactions(self, treatment_var, interaction_vars, y_var,
                               other_vars=None, use_bootstrapped=False):
    """
    Uses the statsmodels package to run logistic regressions on the treatment
    variable, up to two interaction terms, and optionally other covariates.
    Uses either the downsampled or the bootstrapped data.

    Inputs:
        treatment_var = string with treatment var name
        interaction_vars = a list of up to two variables to interact with
            the treatment
        y_var = outcome variable (labels)
        other_vars = list of other covariates to add to the specification
        use_bootstrapped = default False, uses the .df_downsampled data;
            if True uses .bootstrapped
    Outputs:
        The summary table with the regression results.
    """
    if use_bootstrapped:
        df = self.bootstrapped.copy()
    else:
        df = self.df_downsampled.copy()
    # If two interaction variables are given, also build their pairwise product.
    if len(interaction_vars) == 2:
        for i in range(len(interaction_vars) - 1):
            df[interaction_vars[i] + 'x' + interaction_vars[i + 1]] = (
                df[interaction_vars[i]] * df[interaction_vars[i + 1]])
            interaction_vars.append(interaction_vars[i] + 'x' +
                                    interaction_vars[i + 1])
    # Interact the treatment with every interaction variable.
    treatment_ints = [treatment_var]
    for i in interaction_vars:
        df['Tx' + i] = df[treatment_var] * df[i]
        treatment_ints.append('Tx' + i)
    if other_vars is not None:
        X = df[[*treatment_ints, *interaction_vars, *other_vars]]
    else:
        X = df[[*treatment_ints, *interaction_vars]]
    y = df[[y_var]]
    LogitSM = sm.Logit(np.asarray(y.astype(int)), X.astype(int))
    return LogitSM.fit().summary()
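# Standalone sketch (illustrative, not from the source): the same
# treatment-by-covariate interaction construction on simulated data, outside
# the class. The column names T, x1, and y are assumptions.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(0)
n = 500
demo = pd.DataFrame({'T': rng.randint(0, 2, n), 'x1': rng.randint(0, 2, n)})
lin = -0.5 + 0.8 * demo['T'] + 0.3 * demo['x1'] + 0.5 * demo['T'] * demo['x1']
demo['y'] = rng.binomial(1, 1.0 / (1.0 + np.exp(-lin)))
demo['Txx1'] = demo['T'] * demo['x1']  # treatment x covariate interaction
X = sm.add_constant(demo[['T', 'x1', 'Txx1']])
print(sm.Logit(demo['y'], X).fit(disp=0).summary())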
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression

# read in the data & create matrices
df = pd.read_csv('https://stats.idre.ucla.edu/stat/data/binary.csv')
y, X = dmatrices('admit ~ gre + gpa + C(rank)', df, return_type='dataframe')
#y = ravel.column_or_1d(y, warn=True)
#y = y.reshape(-1, 1)
y
X

# sklearn output
model = LogisticRegression(fit_intercept=True, C=1e9)
mdl = model.fit(X, np.ravel(y))
model.coef_

# statsmodels Logit
logit = sm.Logit(y, X)
logit.fit().params

#%%
import statsmodels.api as sm
from sklearn.datasets import make_blobs

x, y = make_blobs(n_samples=50, n_features=2, cluster_std=5.0,
                  centers=[(0, 0), (2, 2)], shuffle=False, random_state=12)
x
logit_model = sm.Logit(y, sm.add_constant(x)).fit()
print(logit_model.summary2())
CPS_dataset.disability = np.where(CPS_dataset.pediseye == 'Yes', 1,
                                  CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisout == 'Yes', 1,
                                  CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisphy == 'Yes', 1,
                                  CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisrem == 'Yes', 1,
                                  CPS_dataset.disability)

# Regression
CPS_dataset['intercept'] = np.ones(len(CPS_dataset))
CPS_dataset['indicator'] = CPS_dataset.WIC_infant
CPS_dataset['infant'] = np.where(CPS_dataset.a_age < 1, 1, 0)
model = sm.Logit(endog=CPS_dataset.indicator,
                 exog=CPS_dataset[['intercept', 'hfdval', 'cov_hi', 'ch_mc',
                                   'infant', 'fwsval']])
logit_res = model.fit()
print(logit_res.summary())
probs = logit_res.predict()
CPS_dataset['probs'] = probs

# CPS total benefits and Administrative total benefits
state_benefit = {}
state_recipients = {}
CPS_dataset['WIC_val'] = 0
for fip in Admin_totals.Fips:
    this_state = (CPS_dataset.gestfips == fip)
    CPS_dataset.loc[this_state & (CPS_dataset.indicator == 1),
def analyze_data(dataset, ysets, xsets, type='LinR', normalize=False):
    #dataset = fix_dataset(dset[ysets+xsets[0]])
    if type == 'LinR':
        regre_type = 'Linear Regression'
    elif type == 'LogR':
        regre_type = 'Logistic Regression'
    else:
        print('Error: unknown regression method {:s}'.format(type))
        quit()
    old_rsqr = 0
    old_fstat = 10e20
    for y in ysets:
        print('#' * 78)
        print('\t\t\t\t\t\t', y)
        print('#' * 78)
        cnt = 0
        for x in xsets:
            # my method up above takes care of missing or unusable values
            dmodel = fix_dataset(dataset[[y] + x])
            Y = dmodel[y]
            print()
            print('#' * 80)
            print('##### Testing x set {:d}'.format(cnt + 1))
            print('##### Using {:s} on dependent variable {:s}'.format(
                regre_type, y))
            print('#' * 80)
            print('\t\tX or independent variables:\n', x)
            print('#' * 80)
            print()
            X = dmodel[x]
            if normalize:
                #X = pd.DataFrame(minmaxscale(X, axis=1), columns=x)
                X = pd.DataFrame(minmaxscale(X, axis=0), columns=x,
                                 index=dmodel.index)
            X2 = sm.add_constant(X)
            if type == 'LinR':
                est = sm.OLS(Y, X2)
                est2 = est.fit()
                rsqr = est2.rsquared
                if rsqr > old_rsqr:
                    old_rsqr = rsqr
                pvals = est2.pvalues
                fval = est2.fvalue
                print('R-squared:', rsqr)
                print('P-values:\n', pvals)
                find_significant(x, pvals)
                print('F-value:\n', fval)
                print(est2.summary())
                vif = calculate_vif(X2)
                print('VIF:\n', vif)
            elif type == 'LogR':
                model = dis_mod.Logit(Y, X2)
                model2 = model.fit()
                loglikly = calculate_log_like(x, model2.params)
                print(model2.summary())
                print('Pseudo R-squared:', model2.prsquared)
                print('The log likelihoods are:')
                show_labeled_list(loglikly, x)
                print('p-value for {:s}: {:f}'.format(x[0],
                                                      model2.pvalues.loc[x[0]]))
                # Predict on the linear-predictor scale; a positive value
                # corresponds to a predicted probability above 0.5.
                y_pred = model2.predict(X2, linear=True)
                yp = [1 if e > 0 else 0 for e in y_pred]
                plot_confusion_matrix(
                    Y, yp, classes=['A', 'NA'],
                    title='Confusion matrix, without normalization')
                vif = calculate_vif(X2)
                print('VIF:\n', vif)
                plt.show()
            cnt += 1
        print()
        print()
    return
# In[21]:

import statsmodels.discrete.discrete_model as sm
import statsmodels.formula.api as smf
import statsmodels.api as sma
from sklearn.model_selection import train_test_split

x = sma.add_constant(x)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=True, random_state=1)

results = sm.Logit(y_train, x_train).fit()
y_pred = pd.DataFrame(results.predict(x_test))

compare = pd.concat([y_test, y_pred], axis=1)
compare.columns = ['y_test', 'y_pred']
compare.loc[compare['y_pred'] >= 0.5, 'y_pred_r'] = 1
compare.loc[compare['y_pred'] < 0.5, 'y_pred_r'] = 0
compare['match'] = compare['y_pred_r'] == compare['y_test']

print(results.summary())
print('')
print('This model predicted default with ' +
      str(len(compare[compare['match'] == True]) / len(compare) * 100) +
      '% accuracy')
print('')
print(results.params)

# In[22]:
plt.hist(y_train)  # explore skewness
plt.title('Histogram for labels')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()
print("in the train set, we have ", y_train.sum(), "Class 1, ",
      len(y_train) - y_train.sum(),
      "Class 0. The skewness is not severe")

## logistic regression
### The first model we want to try is logistic regression. Logistic regression
### is simple and it can give us a test of how our features work.

### scale the data set first
from sklearn import preprocessing
x_train_s = preprocessing.scale(x_train)
x_test_s = preprocessing.scale(x_test)

import statsmodels.discrete.discrete_model as sm
logit2 = sm.Logit(y_train, x_train_s).fit(maxiter=200)
print(logit2.summary())
print(logit2.pvalues)
[(e1, e2) for (e1, e2) in zip(logit2.pvalues, x_train.columns)]

from sklearn.linear_model import LogisticRegression
lor = LogisticRegression()
lor.get_params()
param = {'C': [0.1, 1, 5, 50, 100]}

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(lor, param_grid=param, cv=10)
grid_search.fit(x_train_s, y_train)
lor_cv = grid_search.cv_results_
lor_cv_score_mean = lor_cv['mean_test_score']
lor_cv_score_sd = lor_cv['std_test_score']
lor_cv_c = ["0.1", "1", "5", "50", "100"]
plt.plot(lor_cv_c, lor_cv_score_mean, color='blue')
skLogitModel.fit(X_trainData, y_trainData)
skYPredict = skLogitModel.predict(X_trainData)
falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_trainData,
                                                            skYPredict)
# compute Area Under the Curve (AUC) using the trapezoidal rule
area = auc(falsePositiveRate, truePositiveRate)

crosstab = pd.crosstab(y_trainData, skYPredict,
                       rownames=['True'], colnames=['Predicted'],
                       margins=True)
print("-----------\"Confusion Matrix\"-------------")
print(crosstab)

print("-----------\"Params\"-------------")
statLogitModel = sm.Logit(y_trainData, X_trainData).fit_regularized()
print(statLogitModel.params)
print("-----------\"P-values\"-------------")
print(statLogitModel.pvalues)

plt.plot(falsePositiveRate, truePositiveRate, color='red',
         label='ROC' + str(area))
plt.plot([0, 1], [0, 1], linestyle='dotted')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC & AUC')
plt.legend()
plt.show()
def model(self):
    y, X = dmatrices('Y ~ X', self.data, return_type='dataframe')
    logit = sm.Logit(y, X)
    result = logit.fit()
    print(result.summary())
CPS_dataset.cov_hi = CPS_dataset.cov_hi.astype(int)

# Regression
dummies = pd.get_dummies(CPS_dataset['a_mjind'], drop_first=True)
CPS_dataset = pd.concat([CPS_dataset, dummies], axis=1)
# Make the most likely age to receive benefits the mid-forties
CPS_dataset['age_squared'] = ((CPS_dataset.a_age - 47) *
                              (CPS_dataset.a_age - 47)) * -1
CPS_dataset['intercept'] = np.ones(len(CPS_dataset))
CPS_dataset['indicator'] = CPS_dataset.wc_yn
model = sm.Logit(endog=CPS_dataset.indicator,
                 exog=CPS_dataset[['intercept', 'Armed Forces', 'Construction',
                                   'Educational and health services',
                                   'Financial activities', 'Information',
                                   'Leisure and hospitality', 'Manufacturing',
                                   'Mining', 'Other services',
                                   'Professional and business services',
                                   'Public administration',
                                   'Transportation and utilities',
                                   'Wholesale and retail trade', 'age_squared',
                                   'dis_cs', 'dis_hp', 'finc_dis', 'cov_hi',
                                   'a_sex']])
logit_res = model.fit()
probs = logit_res.predict()
targets = CPS_dataset.wc_yn
print('Accuracy score for logistic regression estimation',
      accuracy(targets, probs))
CPS_dataset['probs'] = probs

# CPS total benefits and Administrative total benefits
CPS_totalb = (CPS_dataset.wc_val[CPS_dataset.indicator == 1] *
pd.Series(boots).hist()
pd.Series(f1).hist()
pd.Series(f1_sub).hist()
plt.title("F1 distribution of selected features")

#### find out the p-values of the logistic regression
### the matrix outputs NaN because features are correlated
import statsmodels.discrete.discrete_model as sm

y_train = pd.DataFrame(y_train)
y_train.reset_index(inplace=True)
y_train = y_train.squeeze()["Survived"]
logit = sm.Logit(y_train, X_trans[feature_subset].dropna())
f = logit.fit()
print(f.params)
print(f.summary())

#### based on the p-values I should definitely exclude:
# - small_fam (1) --> highly correlated with large family (skeptical)
# - 1class (1) --> skeptical
# - B cabin (0.9)
# - E_S (embarked at S) (0.7)
# - No cabin (0.2)
# - fem-family (1)
# - F cabin (0.2)
### Fare_family, 1class
feature_2 = ['Age', 'large_fam', 'female', '3class', "1class",
y_train = np.array(y_train)
X_test = np.array(x_test)
y_test = np.array(y_test)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_true = y_test
y_pred = clf.predict(X_test)
y_hat = clf.predict_proba(X_test)[:, 1]
y_hat = np.ravel(y_hat)

fprs, tprs, thresholds = metrics.roc_curve(y_true, y_hat)
plt.plot(fprs, tprs, 'k--', lw=2)
plt.scatter(fprs, tprs, c='k', marker='x', s=50)
plt.plot(np.arange(-.05, 1.05, .01), np.arange(-.05, 1.05, .01),
         '--', color='lightgray')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0 - .05, 1 + .05])
plt.ylim([0 - .05, 1 + .05])
plt.show()

print(metrics.roc_auc_score(y_test, y_hat))
print(metrics.confusion_matrix(y_true, y_pred))

res = sm.Logit(y_train, X_train).fit(method='bfgs')
print(res.summary())
# =========================== 6.1 Load the dataset ===========================
data = loadtxt(trainName, delimiter=',')
X = data[:, 0:2]
y = data[:, 2]

# ================ 6.2 Separate positive and negative examples ================
pos = where(y == 1)
neg = where(y == 0)
m, n = X.shape
y.shape = (m, 1)

# ======================= 6.3 Calculate the parameters =======================
paramFile = open(paramName, 'w')

# 6.3.1 Calculate based on sequencing depth and breadth
it = np.ones(shape=(m, n + 1))
it[:, 1:n + 1] = X
try:
    logit = sm.Logit(y, it)
    theta = logit.fit().params
#except:
#    iniTheta = np.ones(shape=(m, n+1))
#    iniTheta[:, 1:n+1] = X
#    theta = decorated_cost(it, y, n)
except Exception:
    sys.stderr.write(
        "Warning! Perfect separation found. Couldn't optimize the regression "
        "parameters.\n")
    sys.stderr.write(
        "    Target genome is too different from the training genomes; to "
        "avoid errors, consider training with another set.\n")
    theta = [-34.738273, 550.229, 1080.350]
paramFile.write(str(theta[0]) + "," + str(theta[1]) + "," + str(theta[2]) + "\n")
recon = [x_te_recon, y_te_recon, z_te_recon]
# group counterfactual reconstruction data
recon_cf = [x_te_recon_cf, y_te_recon_cf, z_te_recon_cf]

# sample and convert to numpy
x_te_recon, y_te_recon, z_te_recon = [
    torch_sample_np(data) for data in [x_te_recon, y_te_recon, z_te_recon]]
x_te_recon_cf, y_te_recon_cf, z_te_recon_cf = [
    torch_sample_np(data)
    for data in [x_te_recon_cf, y_te_recon_cf, z_te_recon_cf]]

# show graph to evaluate reconstruction output
# compare_recon(x_te, np.array(np.concatenate([y_te_recon_cf, a_te[:, np.newaxis], z_te_recon_cf, x_te_recon], 1)))

# Fit models using the train set
input_tr = np.concatenate(
    (np.ones(len(a_tr))[:, np.newaxis], x_tr, a_tr[:, np.newaxis]), 1)

# logistic regression
LR = sm.Logit(y_tr, input_tr)
LR_fit = LR.fit()

RF = RandomForestClassifier(n_estimators=10, max_depth=4)
RF.fit(input_tr, y_tr)

# ------------------ test model accuracy on the test set
input_te = np.concatenate(
    (np.ones(len(a_te))[:, np.newaxis], x_te, a_te[:, np.newaxis]), 1)


def get_accuracies(input):
    # normal LR
    LR_ypred_te = LR_fit.predict(input)
    # normal LR with fixed a
    input_te_adjusted = input.copy()
    input_te_adjusted[:, -1] = 0
    LR_adj_ypred_te = LR_fit.predict(input_te_adjusted)
from patsy import dmatrices

dftrain, dftest = train_test_split(dataframenew, test_size=0.2)
y_train1, x_train1 = dmatrices(formula1, data=dftrain, return_type='dataframe')
y_test1, x_test1 = dmatrices(formula1, data=dftest, return_type='dataframe')
y_train1num = np.squeeze(y_train1)
x_train1num = np.squeeze(x_train1)
y_test1num = np.squeeze(y_test1)
x_test1num = np.squeeze(x_test1)
x_train1.columns

# Use this Train Test Split without changing Features

# In[598]:

import statsmodels.discrete.discrete_model as sm

model1 = sm.Logit(y_train1, x_train1)
res = model1.fit()
res.summary()

# In[599]:

print(res.summary().as_latex())

# In[600]:

from statsmodels.nonparametric.kde import KDEUnivariate

kde_res1 = KDEUnivariate(res.predict())
kde_res1.fit()
plt.figure(figsize=(9, 6))
plt.plot(kde_res1.support, kde_res1.density)
plt.fill_between(kde_res1.support, kde_res1.density, alpha=0.2)
# plotting
fig, ax = plt.subplots()
plt.scatter(failures_freq.index, failures_freq, c='red', s=20)
plt.scatter(no_failures_freq.index, np.zeros(len(no_failures_freq)),
            c='blue', s=40)
plt.xlabel('X: Temperature')
plt.ylabel('Number of Failures')
ax.grid()

# get the data in the correct format
y, X = dmatrices('Y ~ X', data, return_type='dataframe')

# build the model
logit = sm.Logit(y, X)
result = logit.fit()

# summarize the model
print(result.summary(), '\n')
print('Parameters: ', result.params, '\n')

# Predict the response variable of the model given exogenous variables.
yhat = logit.predict(result.params, exog=None, linear=False)
yhatsum = yhat**5

fig, ax = plt.subplots()
plt.plot(yhat, c='red')
ax.set_ylabel('Probability of failures')
ax.set_xlabel('Temperature')
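# Illustrative follow-up (not from the source): evaluate the fitted curve at
# new temperatures by passing a fresh design matrix as exog; the temperature
# grid is an assumption.
import numpy as np

temp_grid = np.linspace(30, 90, 7)
# The design matrix must match the 'Y ~ X' layout: intercept column, then X.
exog_new = np.column_stack([np.ones_like(temp_grid), temp_grid])
p_fail = logit.predict(result.params, exog=exog_new)
for t, p in zip(temp_grid, p_fail):
    print('T = {:4.1f} -> P(failure) = {:.3f}'.format(t, p))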
df.dropna(axis=0, inplace=True)
df.shape
df.isnull().sum(axis=0)

# model
df['Diabetes'] = np.where(df.Diabetes == 2, 1, 0)
df['Gender'] = np.where(df.Gender == 'Male', 1, 0)
y = df.Diabetes
x = df.iloc[:, :9]
df.head()
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y,
                                                    test_size=0.3,
                                                    random_state=0)
model = sm.Logit(y_train, x_train)
result = model.fit()
result.summary2()

# drop the insignificant column
del x_train['Alkphos']
del x_test['Alkphos']
model1 = sm.Logit(y_train, x_train)
result1 = model1.fit()
result1.summary2()

# parameters of the model
result1.params
# odds ratios
np.exp(result1.params)
CPS_dataset.disability = np.where(CPS_dataset.pedisphy == 'Yes', 1,
                                  CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisrem == 'Yes', 1,
                                  CPS_dataset.disability)

# Regression
# Did you make enough money during your base period?
CPS_dataset['ptotval'] = np.where(CPS_dataset.ptotval > 5200, 1, 0)
CPS_dataset['intercept'] = np.ones(len(CPS_dataset))
CPS_dataset['fmoop'] = np.where(CPS_dataset.fmoop > 0, 1, 0)
CPS_dataset['F_MV'] = np.where(CPS_dataset.f_mv_fs > 0, 1, 0)
CPS_dataset['indicator'] = CPS_dataset.uc_yn
model = sm.Logit(endog=CPS_dataset.indicator,
                 exog=CPS_dataset[['intercept', 'weuemp', 'ptotval',
                                   'pruntype', 'a_explf', 'lkweeks',
                                   'lkstrch', 'F_MV', 'disability']])
logit_res = model.fit()
probs = logit_res.predict()
targets = CPS_dataset.uc_yn
print('Accuracy score for logistic regression estimation',
      accuracy(targets, probs))
CPS_dataset['probs'] = probs

# CPS total benefits and Administrative total benefits
state_benefit = {}
state_recipients = {}
for fip in Admin_totals.Fips:
    this_state = (CPS_dataset.gestfips == fip)
for ret in ret_list:
    sig = Data.loc[:, retention] == ret
    x = Data.loc[sig, ad_cols].fillna(0)
    y = Data.loc[sig, target]
    # Drop all-zero columns so the design matrix has full rank.
    zero_cols = x.columns[x.sum() == 0]
    x.drop(columns=zero_cols, inplace=True)
    x['intercept'] = 1
    if is_sklearn:
        model = LR(**lr_d)
        model.fit(x, y)
        score = model.score(x, y)
        aic_list.append((ret, -score, model))
        print('Score of model is %s' % score)
    else:
        try:
            model = sm.Logit(y, x).fit()
        except Exception:
            # Fall back to a derivative-free optimizer if the default fails.
            model = sm.Logit(y, x).fit(method='powell')
        aic_list.append((ret, model.aic, model))
        print('F1 Score is: ', f1_score(y, model.predict() >= 0.5))

ret, aic, model = min(aic_list, key=operator.itemgetter(1))
print(ret, aic)

# Slicing Based on Retention Period and Cleaning Columns
ret_Data_df = Data.loc[(Data.loc[:, retention] == ret) |
                       Data.retention.isna(), :].fillna(0)
rm_cols = set(pk_cols + useless_cols).intersection(ret_Data_df.columns)
ret_Data_df.drop(columns=list(rm_cols), inplace=True)
org_cols = list(set(ret_Data_df.columns).intersection(nmws.columns))