def forward_selected(train_data, target):
    remaining = set(train_data.columns)
    remaining.remove(target)
    remaining.remove('intercept')
    selected = ['intercept']
    current_score, best_new_score = float("inf"), float("inf")
    while remaining and current_score == best_new_score:
        scores_candidates = []
        for candidate in remaining:
            #formula = "{} ~ {} + 1".format(target, ' + '.join(selected + [candidate]))
            score = smf.Logit(train_data[target], train_data[selected + [candidate]]).fit().bic
            #score = smf.logit(formula, train_data).fit().bic
            scores_candidates.append((score, candidate))
        scores_candidates.sort(reverse=True)
        print(scores_candidates)
        best_new_score, best_candidate = scores_candidates.pop()
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    #formula = "{} ~ {} + 1".format(target, ' + '.join(selected))
    model = smf.Logit(train_data[target], train_data[selected]).fit()
    return model
def logRegress(logger, df):
    '''Performs logistic regression

    This function gets the logistic regression coefficients for a
    dataframe that is passed in.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
        df {dataframe} -- input dataframe where first column is 'sud'
    '''
    resultsDF = None
    try:
        print("Performing Logistic Regression...")
        train_cols = df.columns[1:]
        logit = sm.Logit(df['sud'], df[train_cols])
        result = logit.fit()

        # Get odds, which are assessed by coeff[race/agebin/sex/setting]
        params = result.params
        conf = result.conf_int()
        conf['OR'] = params
        conf.columns = ['2.5%', '97.5%', 'OR']
        CI_OR_df = np.exp(conf)
        resultsDF = CI_OR_df[['OR']].join(CI_OR_df.loc[:, :'97.5%'])
    except Exception as e:
        logger.error('logRegress failed because of {}'.format(e))

    return resultsDF
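# A minimal usage sketch for logRegress above (data and names are hypothetical).
# The first column of the frame must be the binary outcome 'sud'; every other
# column is used as a regressor, so include a constant column explicitly. It
# assumes the imports logRegress relies on (numpy as np, statsmodels.api as sm)
# are already in scope.
import logging
import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)
rng = np.random.default_rng(0)
demo = pd.DataFrame({
    'sud': rng.integers(0, 2, size=200),      # binary outcome
    'intercept': 1.0,                         # constant term
    'age': rng.normal(40, 10, size=200),      # example covariate
})
odds_table = logRegress(logger, demo)         # columns: OR, 2.5%, 97.5%
print(odds_table)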
def predict_winner_looser_housing(df_hh):
    df_hh['hh_income_2'] = df_hh['hh_income'] ** 2
    df_hh['natural_gas:accommodation_size'] = df_hh['accommodation_size'] * df_hh['natural_gas']
    df_hh['domestic_fuel:accommodation_size'] = df_hh['accommodation_size'] * df_hh['domestic_fuel']
    df_hh['winner'] = 0 + 1 * (df_hh['housing_expenditures_increase'] < 55 * df_hh['nb_beneficiaries'])  # weird to put 55, but works better

    variables = ['hh_income', 'hh_income_2', 'consumption_units', 'nb_beneficiaries',
                 'natural_gas',
                 #'domestic_fuel:accommodation_size', 'natural_gas:accommodation_size',
                 'domestic_fuel', 'accommodation_size',
                 'age_18_24', 'age_25_34', 'age_35_49', 'age_50_64']
    variables_ols = ['natural_gas', 'domestic_fuel', 'accommodation_size',
                     'age_18_24', 'age_25_34', 'age_35_49', 'age_50_64']

    logit = smf.Logit(df_hh['winner'], df_hh[variables]).fit()
    probit = smf.Probit(df_hh['winner'], df_hh[variables]).fit()
    ols = smf.ols(formula='winner ~ \
        natural_gas + domestic_fuel + accommodation_size + \
        age_18_24 + age_25_34 + age_35_49 + age_50_64',
        data=df_hh).fit()
    # natural_gas * accommodation_size + domestic_fuel * accommodation_size + \

    clf = tree.DecisionTreeClassifier(max_depth=3)
    clf = clf.fit(df_hh[variables], df_hh['winner'])
    # regr = tree.DecisionTreeRegressor(max_depth=3)
    # regr.fit(df_hh[variables], df_hh['winner'])

    return logit, probit, ols, clf
def CreateLogReg(x, y):
    """ Returns the logistic regression """
    X2 = add_constant(x)
    est = sm.Logit(y, X2)
    est2 = est.fit()
    return est2
def Backward_Elimination(result, threshold, train_y, train_x):
    rem = []  # columns removed so far
    while np.amax(result.pvalues) > threshold:
        value_name = pd.Series(list(result.pvalues.values), index=result.pvalues.index)
        rem.append(value_name.idxmax())
        train_xx = train_x.drop(rem, axis=1)
        model = sm.Logit(train_y, train_xx)
        result = model.fit()
    return result
def fit_firth(y, X, start_vec=None, step_limit=1000, convergence_limit=0.0001):
    logit_model = smf.Logit(y, X)

    if start_vec is None:
        start_vec = np.zeros(X.shape[1])

    beta_iterations = []
    beta_iterations.append(start_vec)
    for i in range(0, step_limit):
        pi = logit_model.predict(beta_iterations[i])
        W = np.diagflat(np.multiply(pi, 1 - pi))
        var_covar_mat = np.linalg.pinv(-logit_model.hessian(beta_iterations[i]))

        # build hat matrix
        rootW = np.sqrt(W)
        H = np.dot(np.transpose(X), np.transpose(rootW))
        H = np.matmul(var_covar_mat, H)
        H = np.matmul(np.dot(rootW, X), H)

        # penalised score
        U = np.matmul(np.transpose(X),
                      y - pi + np.multiply(np.diagonal(H), 0.5 - pi))
        new_beta = beta_iterations[i] + np.matmul(var_covar_mat, U)

        # step halving
        j = 0
        while firth_likelihood(new_beta, logit_model) > firth_likelihood(beta_iterations[i], logit_model):
            new_beta = beta_iterations[i] + 0.5 * (new_beta - beta_iterations[i])
            j = j + 1
            if (j > step_limit):
                raise Exception('Firth regression failed')
                return None

        beta_iterations.append(new_beta)
        if i > 0 and (np.linalg.norm(beta_iterations[i] - beta_iterations[i - 1]) < convergence_limit):
            break

    return_fit = None
    if np.linalg.norm(beta_iterations[i] - beta_iterations[i - 1]) >= convergence_limit:
        raise Exception('Firth regression failed')
    else:
        # Calculate stats
        fitll = -firth_likelihood(beta_iterations[-1], logit_model)
        beta = beta_iterations[-1]
        bse = np.sqrt(np.diagonal(-logit_model.hessian(beta_iterations[-1])))

        # Wald test
        pvalues = 2 * (1 - stats.norm.cdf(np.abs(beta / bse)))

        return_fit = beta, bse, fitll, pvalues, i

    return return_fit
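# Hedged usage sketch for fit_firth above, on synthetic data (names are
# hypothetical). It assumes the imports the function relies on are in scope:
# numpy as np, scipy.stats as stats, an smf alias providing Logit, and a
# firth_likelihood(beta, model) helper returning the negative penalised
# log-likelihood.
import numpy as np

rng = np.random.default_rng(1)
X_demo = np.column_stack([np.ones(100), rng.normal(size=(100, 2))])
y_demo = (X_demo[:, 1] + rng.normal(scale=1.0, size=100) > 0).astype(float)
beta, bse, fitll, pvalues, n_iter = fit_firth(y_demo, X_demo)
print(beta, pvalues, n_iter)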
def test_mode():
    df = get_data()
    data = df.drop(columns=['PassengerId', 'Ticket', 'Name', 'Cabin'])
    y, X = dmatrices("Survived ~ Pclass+Sex+Age+SibSp+Parch+Fare+Embarked", df, return_type='dataframe')
    #logit = sm.Logit(data["Survived"], data[])
    logit = sm.Logit(y, X)
    result = logit.fit()
    print(result.summary2())
def regressor(y, X, model_type=model_type):
    if model_type == "linear":
        regressor = sm.OLS(y, X).fit()
    elif model_type == "logistic":
        regressor = sm.Logit(y, X).fit()
    else:
        print("\nWrong Model Type : " + model_type + "\nLinear model type is selected.")
        model_type = "linear"
        regressor = sm.OLS(y, X).fit()
    return regressor
def test_firth_likelihood(self):
    p = np.loadtxt(P_BINARY)
    m = np.loadtxt(M)
    firth_vars = np.loadtxt(FIRTH_VARS)
    mod = smf.Logit(p, m)
    fll = firth_likelihood(firth_vars, mod)
    self.assertAlmostEqual(fll, 97.13375906431875)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fll = firth_likelihood(firth_vars + 100, mod)
    self.assertAlmostEqual(fll, np.inf)
def backwardElimination(x, y, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.Logit(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x
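# Hedged usage sketch for backwardElimination above (synthetic data; names are
# hypothetical): x is a NumPy design matrix whose first column is the constant,
# y is a binary vector, and sl is the p-value threshold for dropping a column.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(2)
x_demo = np.column_stack([np.ones(200), rng.normal(size=(200, 4))])
y_demo = (x_demo[:, 1] + rng.normal(scale=0.5, size=200) > 0).astype(int)
x_reduced = backwardElimination(x_demo, y_demo, sl=0.05)
print(x_reduced.shape)  # only the columns whose p-values stayed below sl remain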
def binarylogit(df):
    """
    Specifies a binary logistic model which aims simply to determine
    whether or not the room is occupied.
    """
    print("""
    BINARY LOGISTIC MODEL
    ___________________________________
    """)
    # Stores the time for output.
    query_time = df.counts_time.max()

    # Trains a logit model. Prints the coefficients and their significance levels.
    y = df["counts_truth_is_occupied"]
    X = np.array(df["counts_associated"]).reshape(-1, 1)
    log = sm.Logit(y, X).fit()
    print(log.summary())

    log = LogisticRegression()
    log.fit(X, y)
    pseudo_r = log.score(X, y)
    print("Model score: ", pseudo_r)

    # Splits the dataset into 60% training and 40% testing.
    df_train, df_test = train_test_split(df, test_size=0.4, random_state=5)

    # Trains model on the training set.
    y_train = df_train["counts_truth_is_occupied"]
    X_train = np.array(df_train["counts_associated"]).reshape(-1, 1)
    log_train = LogisticRegression()
    log_train.fit(X_train, y_train)

    # Tests model on the test set.
    y_test = df_test["counts_truth_is_occupied"]
    X_test = np.array(df_test["counts_associated"]).reshape(-1, 1)
    predicted = log_train.predict(X_test)
    probs = log_train.predict_proba(X_test)

    # Prints accuracy score, confusion matrix, classification report and MSE.
    ascore = metrics.accuracy_score(y_test, predicted)
    print("Accuracy score (1 = perfect prediction) ", ascore)
    print("Confusion matrix:\n", metrics.confusion_matrix(y_test, predicted))
    print("Residual sum of squares: %.2f" % np.mean((log_train.predict(X_test) - y_test) ** 2))

    return [pseudo_r, 0, ascore, query_time]
def forward_select(data, response):
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = float('inf'), float('inf')
    while remaining:
        aic_with_candidates = []
        for candidate in remaining:
            aic = smf.Logit(data[response], data[selected + [candidate]]).fit().aic
            aic_with_candidates.append((aic, candidate))
        aic_with_candidates.sort(reverse=True)
        best_new_score, best_candidate = aic_with_candidates.pop()
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print('aic is {}, continuing!'.format(current_score))
        else:
            print('forward selection over!')
            break
    return selected
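# Hedged usage sketch for forward_select above (synthetic data; names are
# hypothetical). The frame holds the binary response plus candidate regressors;
# an explicit constant column can be offered as one of the candidates. It
# assumes the smf alias used inside the function resolves to a Logit class.
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
demo = pd.DataFrame(rng.normal(size=(300, 3)), columns=['x1', 'x2', 'x3'])
demo['const'] = 1.0
demo['y'] = (demo['x1'] - demo['x2'] + rng.normal(size=300) > 0).astype(int)
print(forward_select(demo, response='y'))  # prints the selected column names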
def trained_pipeline(self):
    np.random.seed(44)
    log_reg = sm.Logit(self.y_train, self.X_train)
    param_array = np.zeros(len(self.independent_variables) + 1)
    iterations = 5
    for i in range(0, iterations):
        model = log_reg.fit(maxiter=5000, avextol=.0001, epsilon=.1, full_output=1, disp=0)
        params = list(model.params)
        param_array = [a + b for a, b in zip(param_array, params)]
    start = [p / iterations for p in param_array]
    model = log_reg.fit(start_params=start, maxiter=5000, avextol=.0001, epsilon=.1, full_output=1, disp=1)
    return model
def fit_lineage_effect(lin, c, k):
    """Fits the model `k ~ Wa` using binomial error with logit link.
    W are the lineages (either a projection of samples, or cluster indicators)
    and covariates.
    Returns the index of the most significant lineage

    Args:
        lin (numpy.array)
            Population structure matrix or lineage association
            binary matrix (n, k)
        c (numpy.array)
            Covariants matrix (n, j)
        k (numpy.array)
            Variant presence-absence vector (n, 1)

    Returns:
        max_lineage (int or None)
            Index of the most significant lineage or None if it could not fit
    """
    if c.shape[0] == lin.shape[0]:
        X = np.concatenate((np.ones(lin.shape[0]).reshape(-1, 1),
                            lin,
                            c),
                           axis=1)
    else:
        X = np.concatenate((np.ones(lin.shape[0]).reshape(-1, 1),
                            lin),
                           axis=1)

    lineage_mod = smf.Logit(k, X)
    try:
        lineage_res = lineage_mod.fit(method='newton', disp=False)

        wald_test = np.divide(np.absolute(lineage_res.params), lineage_res.bse)
        # excluding intercept and covariates
        max_lineage = np.argmax(wald_test[1:lin.shape[1] + 1])
    # In case regression fails
    except (statsmodels.tools.sm_exceptions.PerfectSeparationError,
            np.linalg.LinAlgError):
        max_lineage = None

    return max_lineage
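# Hedged usage sketch for fit_lineage_effect above, with synthetic inputs shaped
# as in the docstring (lin: n x k lineage matrix, c: n x j covariates, k: length-n
# presence/absence vector). Names are hypothetical; it assumes the imports the
# function relies on (numpy as np, an smf alias providing Logit, statsmodels)
# are in scope.
import numpy as np

rng = np.random.default_rng(4)
lin_demo = rng.integers(0, 2, size=(150, 3)).astype(float)
cov_demo = rng.normal(size=(150, 2))
k_demo = (lin_demo[:, 0] + rng.normal(scale=0.7, size=150) > 0.5).astype(float)
print(fit_lineage_effect(lin_demo, cov_demo, k_demo))  # index of the strongest lineage, or None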
def trained_model(train_features, train_outcomes):
    np.random.seed(44)
    log_reg = sm.Logit(train_outcomes, train_features)
    param_array = np.zeros(len(train_features) + 1)
    iterations = 5
    for i in range(0, iterations):
        model = log_reg.fit(maxiter=5000, avextol=0.0001, epsilon=0.1, full_output=1, disp=0)
        params = list(model.params)
        param_array = [a + b for a, b in zip(param_array, params)]
    start = [p / iterations for p in param_array]
    model = log_reg.fit(
        start_params=start,
        maxiter=5000,
        avextol=0.0001,
        epsilon=0.1,
        full_output=1,
        disp=1,
    )
    return model
def test_fit_firth(self):
    p = np.loadtxt(P_BINARY)
    m = np.loadtxt(M)
    mod = smf.Logit(p, m)
    start_vec = np.zeros(m.shape[1])
    start_vec[0] = np.log(np.mean(p) / (1 - np.mean(p)))

    (intercept, kbeta, beta, bse, fitll) = fit_firth(mod, start_vec, m, p)
    self.assertAlmostEqual(intercept, 0.13954805021495864)
    self.assertAlmostEqual(kbeta, -0.31901219992017243)
    tbeta = [1.9588025, 0.7251749, -0.5605268, -0.5396909,
             0.0594742, -0.2001795, -1.4873298, 0.5050208]
    self.assertTrue(abs((np.array(beta) - np.array(tbeta)).max()) < 1E-7)
    self.assertAlmostEqual(bse, 2.848207537910185)
    self.assertAlmostEqual(fitll, -58.249948818380204)

    fitll = fit_firth(mod, start_vec, m, p,
                      step_limit=10, convergence_limit=1E-10)
    self.assertEqual(fitll, None)
def logit(RV, df_norm, keys):
    #%%
    '''
    X contains all precursor data, incl train and test
    X_train, y_train are split up by TrainIsTrue
    Prediction is made for whole timeseries
    '''
    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]

    X = df_norm[keys]
    X = add_constant(X)
    y = RV.RV_bin_fit

    # Get training years
    TrainIsTrue = df_norm['TrainIsTrue']
    # Get mask to make only prediction for RV_mask dates
    pred_mask = df_norm['RV_mask']

    model_set = sm.Logit(y[TrainIsTrue], X[TrainIsTrue], disp=0)
    try:
        model = model_set.fit(disp=0, maxfun=60)
        prediction = model.predict(X[pred_mask])
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in str(err):
            model = model_set.fit(method='bfgs', disp=0)
            prediction = model.predict(X[pred_mask])
        else:
            raise
    except Exception as e:
        print(e)
        model = model_set.fit(method='bfgs', disp=0)
        prediction = model.predict(X)
    #%%
    return prediction, model
data.replace({'diagnosis': 'M'}, 1, inplace=True)

from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
data[data.columns] = minmax.fit_transform(data.values)

## Model Building
features = [i for i in data.columns if i != 'diagnosis']
x = data[features]
y = pd.DataFrame(data['diagnosis'])

import statsmodels.api as sm  # provides sm.Logit

model = sm.Logit(y, x)
result = model.fit(method='ncg')
print(result.summary())

# ROC Curve
from sklearn.metrics import roc_curve, auc
x['predict'] = result.predict(x)
fpr, tpr, thresholds = roc_curve(y, x['predict'])
roc_auc = auc(fpr, tpr)
print("area under the ROC curve: %f" % roc_auc)

# Optimal Cutoff
i = np.arange(len(tpr))
roc = pd.DataFrame({
    'fpr': pd.Series(fpr, index=i),
predict2 = lr.predict(age2)
predict2
##### For age=105 the predicted value is greater than one.
##### From this linear regression we cannot interpret whether a person buys or not.

################ Lab: Logistic Regression ######################
# Dataset: Product Sales Data/Product_sales.csv
sales = pd.read_csv("C:\\Koti\\data science\\DS_batch1\\datasets\\Product_sales.csv")

# Build a logistic regression between Age and buying
import statsmodels.formula.api as sm
logit = sm.Logit(sales['Bought'], sales['Age'])
logit
result = logit.fit()
result
result.summary2()

### Confidence interval of each coefficient
print(result.conf_int())

# One more way of fitting the model
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(sales[["Age"]], sales["Bought"])

# A 4 year old customer, will he buy the product?
trainingSet = data1

cat_vars = ['Sex', 'Embarked']
data_vars = trainingSet.columns.values.tolist()
to_keep = [i for i in data_vars if i not in cat_vars]
data_final = trainingSet[to_keep]
data_final.columns.values

data_final_vars = data_final.columns.values.tolist()
y = ['Survived']
X = [i for i in data_final_vars if i not in y]

cols = ["Age", "Sex_male", "Sex_female"]
X = data_final[cols]
y = data_final['Survived']

model = sm.Logit(y, X)
result = model.fit()
print(result.summary())

logreg = LogisticRegression()
logreg.fit(X, y)
y_pred = logreg.predict(X)
print(y_pred)
print('Accuracy of logistic regression classifier on training set: {:.2f}'.format(logreg.score(X, y)))

# Transform categorical data into dummies
cat_vars = ['Sex', 'Embarked']
for var in cat_vars:
dataset = df[df['gender'] == 2]
dataset_data = np.array(dataset[['loan', 'months']])
predicted_2 = model.predict(dataset_data)

df_gender = list(df.gender)
j = 0
for i in range(0, len(df_gender) - 1):
    if df_gender[i] == 2:
        df_gender[i] = predicted_2[j]
        j += 1
df.gender = df_gender

# SM requires that intercept be manually entered
intercept = [1] * len(df)
df['intercept'] = intercept

x = df[['months', 'group', 'female', 'Asia', 'North_America',
        'South_America', 'Europe', 'Africa', 'intercept']]
y = np.array(df[['expired']])
y = y.ravel()

# To determine % change from coefficients:
# coefs = sm.Logit(y, x).fit().params.values
# for i in coefs:
#     print(abs(math.exp(i) - 1))

print(sm.Logit(y, x).fit().summary2())
calc_prob = model.predict_proba(X_holdout)[:, 1]

Final_Output = df_holdout
Final_Output['Probability_of_Attrition'] = calc_prob
Final_Output['Final_Prediction'] = final
Final_Output.head()

# Save Dataframe to CSV
Final_Output.to_csv("Final_Output.csv")

# Exploration of Log Reg Coefficients
# Reference used:
# https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
import statsmodels.api as sm
import statsmodels.formula.api as smf

logit_model = sm.Logit(y_train, X_train)  # the Logit class lives in statsmodels.api
results = logit_model.fit()
print(results.summary2())

# Column names for reference above
df_column_name = pd.DataFrame(list(df_full_data.drop(['Attrition'], axis=1).columns.values))
df_column_name.index = np.arange(1, len(df_column_name) + 1)
df_column_name

import statsmodels.api as sm
import statsmodels.formula.api as smf

# logit_model = smf.Logit(y_train, X_train)
# results = logit_model.fit()
# print(results.summary2())

model = smf.logit(formula="Attrition~ Age + DailyRate + DistanceFromHome + EnvironmentSatisfaction + JobInvolvement + JobSatisfaction + NumCompaniesWorked + RelationshipSatisfaction + TotalWorkingYears + TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany + MaritalStatus_Divorced + MaritalStatus_Married + MaritalStatus_Single + OverTime_No + OverTime_Yes", data=df_full_data).fit()
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

import statsmodels.api as sm  # provides the Logit class

model = sm.Logit(y_train, X_train)
result = model.fit()
# Estimate a standard random utility model
# Description

from __future__ import division

import statsmodels.api as sm

from sc_4_1_build_final_data import select_variables_final_dataset

data = select_variables_final_dataset(weekend=False, selection=0)
data_suburb = data.query('option_dt == 0')
data_suburb['excess_cost_vp'] = data_suburb['cost_vp'] - data_suburb['cost_tc']
data_suburb['excess_consumption_vp'] = data_suburb['income'] - data_suburb['excess_cost_vp']

variables = ['excess_consumption_vp']

logit = sm.Logit(data_suburb['option_vp'], data_suburb[variables]).fit()
print(logit.summary())
params = logit.params
print(logit.get_margeff().summary())

# probit = sm.Probit(data['option_downtown'], data[variables]).fit()
# print(probit.summary())

# Work on a nested logit (suburb vs. city center, then vp vs. tc)
plot_roc(lr_prob)

# STATSMODELS
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from sklearn import linear_model, feature_selection, preprocessing
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=5)
model = sm.Logit(y_train, add_constant(x_train)).fit()
model.summary()

# Spark
from __future__ import print_function

import sys

from pyspark.sql import SparkSession
print("X:", type(X)) print(X.columns) model = smf.OLS(y, X) result = model.fit() result.summary() model = smf.OLS.from_formula('quality ~ alcohol', data=dataset) results = model.fit() print(results.params) #Classification using stats model. dataset['rate_code'] = (dataset['quality'] > 4).astype(np.float32) y, X = dmatrices('rate_code ~ alcohol', data=dataset) sns.distplot(X[y[:, 0] > 0, 1]) sns.distplot(X[y[:, 0] == 0, 1]) model = smf.Logit(y, X) result = model.fit() result.summary2() yhat = result.predict(X) sns.distplot(yhat[y[:, 0] > 0]) sns.distplot(yhat[y[:, 0] == 0]) yhat = result.predict(X) > 0.955 print(sklearn.metrics.classification_report(y, yhat)) #Classification using sklrean logistic regression. model = sklearn.linear_model.LogisticRegression() y, X = dmatrices( 'rate_code ~ alcohol + sulphates + citric_acid + fixed_acidity', data=dataset)
# auc
print('auc : %.3f' % auc(fpr, tpr))

# ROC curve
plt.plot(fpr, tpr, 'k--', label='ROC (area = %.3f)' % auc(fpr, tpr))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Roc Curve')
plt.legend()
plt.show()

dat4x = dat4.drop(['churn'], axis=1)
dat4y = dat4.churn
X_train, X_test, y_train, y_test = train_test_split(dat4x, dat4y, test_size=0.3)
model4 = smf.Logit(y_train, X_train).fit()
y_test_proba_4 = model4.predict(X_test)
# The AUC is as high as 0.913; is something wrong?
FuncScore(y_test, y_test_proba_4)

#%%
# 2.2.4 Use LASSO and Ridge
# Check the class ratio: about 4:3, close to 1:1
print(dat3.churn.value_counts())
# Use grid-search cross-validation
# Train / test split
from sklearn.model_selection import GridSearchCV

dat5x = dat3.drop(['churn'], axis=1)
dat5y = dat3.churn
dummy = pd.get_dummies(x['Gender'])  #########
x = pd.concat((x[['Age', 'EstimatedSalary']], dummy[['Female']]), axis=1)

# add constant!
x = statsmodels.api.add_constant(x)  #####################

# split into training and validation data
import sklearn.model_selection
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, Y, train_size=0.8)

# logistic regression
import statsmodels.api as sm
model = sm.Logit(y_train, x_train)  # upper case
result = model.fit()
# the fitted parameters are all stored in result
result.summary()  # 0.42
y_pre = result.predict(x_test)


def check(x):
    if x >= 0.5:
        i = 1
    else:
        i = 0
    return (i)


y_pred = y_pre.map(check)
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

df = pd.read_excel('Final_Fluview_Practical_dataset.xlsx')
df_regress = df[['Virus Strain', 'Age', 'Gender', 'Hospitalized?', 'Swine Contact?', 'Attended Agricultural Event?']]
print(df_regress[df_regress.isna().any(axis=1)])
df_regress = df_regress.dropna()

for column in df_regress:
    print(column, df_regress[column].unique())

df_regress['Virus Strain'] = df_regress['Virus Strain'].map({'Influenza A H3N2v': 1, 'Influenza A H1N1v': 0, 'Influenza A H1N2v': 0, 'Influenza A H7N2': 0})
df_regress['Age'] = df_regress['Age'].map({'<18 Years': 0, '>=18 Years': 1})
df_regress['Gender'] = df_regress['Gender'].map({'Male': 0, 'male': 0, 'Female': 1, 'female': 1})
df_regress['Hospitalized?'] = df_regress['Hospitalized?'].map({'No': 0, 'no': 0, 'Yes': 1, 'yes': 1})
df_regress['Swine Contact?'] = df_regress['Swine Contact?'].map({'No': 0, 'no': 0, 'Yes': 1, 'yes': 1})
df_regress['Attended Agricultural Event?'] = df_regress['Attended Agricultural Event?'].map({'No': 0, 'no': 0, 'Yes': 1, 'yes': 1})

for column in df_regress:
    print(column, df_regress[column].unique())

endog = df_regress['Virus Strain']
exog = df_regress[['Age', 'Gender', 'Hospitalized?', 'Swine Contact?', 'Attended Agricultural Event?']]
exog = sm.add_constant(exog)
endog = endog.values
exog = exog.values
print(sum(endog))

logit = sm.Logit(endog, exog)
result = logit.fit()
print(result.summary())
# Create boxplots showing team salary distribution by world series wins
box_data = binary[['std_salary', 'WSWin_Y']]
bp = box_data.boxplot(by='WSWin_Y')
ax = plt.gca()
plt.title('Salary Distribution For \n World Series Winners (1) vs. Non Winners (0)')
plt.suptitle("")
ax.set_ylabel('Standardized Salary')
ax.set_xlabel('World Series Winner? 1 = Yes, 0 = No')

# To quantify the relationship between standardized salary and world series wins, I ran a logistic regression,
# which is the appropriate model for a situation where the dependent variable is binary. The model yields a
# positive coefficient for standardized salary, but interestingly, the coefficient is not statistically
# significant at the 95% confidence level. It might be at the 90% confidence level, but here it is not.
# Contrast that with the winning percentage OLS regression above, where the coefficient was statistically
# significant. To me, this says that the case for salary as a determinant of world series championships is
# perhaps weaker than it is for winning percentage.

# In[56]:

# Logistic regression of WS Win on Standardized Salary
logit = sm.Logit(binary['WSWin_Y'], binary['std_salary'])
log_result = logit.fit()
print(log_result.summary())

# (4) Conclusion
#
# At the beginning of this project, I set out to understand the relationship between salary compensation and
# performance in baseball, ultimately using winning percentage and world series wins as measures of performance.
# Using python pandas and inferential statistics, I discovered that there is in fact a significant relationship
# between a standardized salary metric and winning percentage, though not one as strong as I had hypothesized.
# There is a nominally positive relationship also between salary paid and world series wins, although it is not
# significant at the 95% level.
#
# My advice for general managers in baseball, based on this data, is that to have a shot at the world series you
# want to have a salary that is above the average compared to other teams in the league (see the box plot above).
# After that, it's hard to say what improves your odds, as having an even higher salary level doesn't necessarily
# do so. Further research might look at key injuries in a season, the relationship between player performance and
# salary, or other factors.

# (5) Potential Limitations
#
# There are definitely limitations to this analysis which prevent it from *proving* that salary drives winning.
# First, as mentioned above, the inferential statistics methods I used do indicate 'statistical significance' at
# a 95% level. But at a higher level of confidence, we might fail to reject the null hypothesis that the
# coefficient for salary in this specification is actually zero. That is even clearer in the second exercise
# where I looked at salary as a driver of world series wins. In that case, we could not reject the null
# hypothesis.
#
# As my first reviewer also pointed out, there is also the limitation in statistics that "correlation does not
# imply causation." It is possible that instead of salary driving performance, the causal link could run the
# other direction: teams that start winning have to pay players more to keep them. You could imagine a team of
# young players that becomes successful, and as a result payroll becomes very expensive. We did not look within
# the dataset to see if this was true, and there may be data limitations which prevent us from doing so.