def _fit_backward(self):
    y_train = pd.Series(self._model.model.endog.copy(),
                        name=self.dependent_variable,
                        index=self._observations_idx)
    X_train = pd.DataFrame(self._model.model.exog,
                           columns=self._model.model.exog_names,
                           index=self._observations_idx)
    model = Logit(y_train, X_train, missing='drop')
    results = model.fit(**self._model_params)
    max_pvalue = results.pvalues.drop('Intercept').max()
    while max_pvalue > self.sig_level_removal:
        x_to_drop = results.pvalues.drop('Intercept').idxmax()
        X_train = X_train.drop(x_to_drop, axis=1)
        model = Logit(y_train, X_train, missing='drop')
        results = model.fit(**self._model_params)
        max_pvalue = results.pvalues.drop('Intercept').max()
    self._model = results
    return
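# A minimal standalone sketch of the same backward-elimination loop, for
# context. The synthetic data, the column names ('Intercept', 'x1'..'x4'),
# and the 0.05 removal threshold are illustrative assumptions, not part of
# the original class.
import numpy as np
import pandas as pd
from statsmodels.api import Logit

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 4)), columns=['x1', 'x2', 'x3', 'x4'])
X.insert(0, 'Intercept', 1.0)
y = pd.Series((X['x1'] + rng.normal(size=200) > 0).astype(int), name='y')

results = Logit(y, X, missing='drop').fit(disp=0)
while results.pvalues.drop('Intercept').max() > 0.05:
    worst = results.pvalues.drop('Intercept').idxmax()  # least significant
    X = X.drop(worst, axis=1)
    results = Logit(y, X, missing='drop').fit(disp=0)
print(results.summary())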
import logging

import numpy as np
from statsmodels.api import Logit
from statsmodels.tools.sm_exceptions import PerfectSeparationError


def _fit_logit(X, y):
    metadata = {}
    lm = Logit(y, X)
    try:
        flm = lm.fit(method='bfgs')
        logging.info(flm.summary())
        output = format_output(flm)  # format_output: module-level helper
        metadata = {
            'summary': str(flm.summary()),
            'summary2': str(flm.summary2()),
        }
    except (np.linalg.LinAlgError, PerfectSeparationError, ValueError) as e:
        # Perfect separation or singular matrix - use NaN
        logging.warning(e)
        output = {
            col: {
                "coef": None,
                "std_err": None,
                "t_values": None,
                "p_values": None,
            }
            for col in X.columns
        }
    return output, metadata
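# A quick way to see the failure mode this guards against: perfectly
# separated data. Depending on the statsmodels version, fitting either
# raises PerfectSeparationError or only emits a warning, which is why the
# except clause above is broad. Illustrative sketch, not part of the module.
import pandas as pd

x = pd.DataFrame({'x': np.arange(10.0), 'const': 1.0})
y_sep = (x['x'] >= 5).astype(float)  # y is fully determined by x
try:
    res = Logit(y_sep, x).fit(disp=0)
    print('converged params:', res.params.values)
except Exception as e:  # PerfectSeparationError in many statsmodels versions
    print(type(e).__name__, e)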
from functools import reduce

import numpy
import pandas
from statsmodels.api import Logit


def get_trained_logit_model():
    """
    In 'data/training_data/' specific ETFs were visually inspected and white
    noise (0) and not white noise (1) were assigned. This data is loaded here
    to train the logistic parameters, but you can use this functionality as a
    template to train your own

    :ARGS:

        :class:`NoneType`

    :RETURNS:

        a fitted :class:`statsmodels.Logit` Logistic regression that has
        been fit to the trained data
    """
    f = pandas.ExcelFile('../data/training_data/Trained Data.xlsx')
    data = reduce(lambda a, b: numpy.vstack([a, b]),
                  map(lambda x: f.parse(x, index_col=0)[['ln_chg', 'Y']],
                      f.sheet_names))
    data = pandas.DataFrame(data, columns=['ln_chg', 'Y'])

    # add an intercept for the model (required by statsmodels.api.Logit)
    data['intercept'] = 1.0

    # fit the model
    logit_model = Logit(endog=data['Y'], exog=data[['intercept', 'ln_chg']])
    return logit_model.fit()
def model_fit(store_path, X_df_path, y_df_path, feature_key="Gender",
              X_cols=None, testing=False, include_prc=False):
    X_cols = X_cols if X_cols is not None else []  # avoid mutable default
    if testing:
        # If testing, just print the X and y columns
        print(store_path, X_df_path, y_df_path, feature_key, X_cols)
        return feature_key, ({"llf": 0.1}, "TEMP SUMMARY")
    # Not testing. Fit the models and return the measures
    print(store_path, X_df_path, y_df_path, feature_key, X_cols)
    X = pd.read_hdf(store_path, key=X_df_path, columns=X_cols)
    y = pd.read_hdf(store_path, key=y_df_path)
    print("Created dataframes, feature_key=%s" % feature_key)
    print("X.shape = %s, y.shape = %s" % (X.shape, y.shape))
    model = Logit(y, X)
    res = model.fit()
    predict = res.predict()
    measures = get_all_eval_measures(predict, model.endog,
                                     include_prc=include_prc)
    measures["llf"] = res.llf
    measures["aic"] = res.aic
    measures["bic"] = res.bic
    measures["prsquared"] = res.prsquared
    measures["df_model"] = res.df_model
    return feature_key, (measures, res.summary2())
def run_LR(model_dir, trainSet, testSet, timestep):
    # get shape
    H, W, C = trainSet.shape[1], trainSet.shape[2], trainSet.shape[3]
    train_len, test_len = trainSet.shape[0], testSet.shape[0]

    # get XY features
    trainX, trainY = getXSYS(trainSet, timestep)
    testX, testY = getXSYS(testSet, timestep)
    print('Train set shape: X/Y', trainX.shape, trainY.shape)
    print('Test set shape: X/Y', testX.shape, testY.shape)

    # check data imbalance: counts of 0/1 cells in the binary input grid
    neg, pos = np.bincount(trainX.flatten())
    weight_ratio = neg / pos
    print('Weight ratio:', round(weight_ratio, 5))

    # logit
    logit_model = Logit(trainY, trainX)
    result = logit_model.fit()
    print(result.summary2())

    # LR
    logreg = LogisticRegression(
        class_weight={1: weight_ratio})  # balance pos/neg in training set
    logreg.fit(trainX, trainY)
    predY = logreg.predict(testX)
    y_true = testY.reshape((-1, H, W, C))
    y_pred = predY.reshape((-1, H, W, C))
    print('#Positive predictions: ', y_pred[y_pred != 0].shape[0], '\n')

    return y_true, y_pred
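# If the label imbalance is what matters, the same ratio can be computed from
# trainY directly, and sklearn can derive balanced weights automatically.
# Illustrative sketch with an assumed binary 1-D label array:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0, 0, 0, 1, 0, 1])
neg, pos = np.bincount(y)
print('manual neg/pos ratio:', neg / pos)
print('sklearn balanced weights:',
      compute_class_weight(class_weight='balanced',
                           classes=np.array([0, 1]), y=y))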
def model_fit(y, X, X_cols, y_col, feature_key="Gender",
              testing=False, include_prc=False):
    if testing:
        # If testing, just print the X and y columns
        print(X_cols, y_col)
        return feature_key, ({"llf": 0.1}, "TEMP SUMMARY")
    # Not testing. Fit the models and return the measures
    print(feature_key, X.shape, X_cols, y.shape, y_col)
    X = pd.DataFrame(X, columns=X_cols)
    y = pd.Series(y, name=y_col)
    print("Created dataframes.")
    model = Logit(y, X)
    res = model.fit()
    measures = get_all_eval_measures(res, model.endog,
                                     include_prc=include_prc)
    return feature_key, (measures, res.summary2())
def validate_data_predictors(data, outcome, predictors, probabilities,
                             survival_time=False):
    """Validates that for each predictor column, all values are within the
    range 0-1

    Notes
    -----
    If a predictor has probability `True`, checks that the column
    `data[predictor]` has all values in the appropriate range. If a predictor
    has probability `False`, converts all values in that column with logistic
    regression

    Parameters
    ----------
    data : pd.DataFrame
        the data set
    outcome : str
        the column to use as 'outcome'
    predictors : list(str)
        the list of predictors for the analysis
    probabilities : list(bool)
        list marking whether a predictor is a probability
    survival_time : bool
        if the analysis is a survival time analysis
    """
    for i in range(0, len(predictors)):
        if probabilities[i]:
            # validate that any predictors with probability TRUE are b/t 0 and 1
            if (max(data[predictors[i]]) > 1) or (min(data[predictors[i]]) < 0):
                raise ValueError("{val} must be between 0 and 1".format(
                    val=repr(predictors[i])))
        else:
            if survival_time:
                from statsmodels.sandbox.cox import CoxPH  # TODO
            else:
                from statsmodels.api import Logit
                # predictor is not a probability, convert with logistic
                # regression; fitted probabilities replace the raw values
                # (results objects have no .y_pred attribute, so use predict())
                model = Logit(data[outcome], data[predictors[i]])
                data[predictors[i]] = model.fit().predict()
    return data
def logit_fit(x_data, y, name='train'):
    """Fit a logistic regression and plot the Gini and KS curves.

    Parameters
    ----------
    x_data: DataFrame, cleaned feature matrix of the training data; a
        constant term is added automatically
    y: Series or 1d array, the target variable
    name: name of the trained model

    Returns
    ----------
    result: the result object returned by statsmodels.api.Logit.fit()
    model_eval: ModelEval, model-evaluation object"""
    model_data = add_constant(x_data)
    logit_reg = Logit(y, model_data)
    result = logit_reg.fit(disp=False)

    prob = result.predict(model_data)
    model_eval = ModelEval(-prob, y, name, plot=False)

    a = "************************************"
    print(a + " " + name + " " + a)
    print(result.summary2())
    model_eval.giniks_plot()
    return result, model_eval
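# ModelEval and giniks_plot are project-specific helpers. For reference, the
# Gini coefficient relates to ROC AUC by Gini = 2*AUC - 1, and KS is the
# maximum gap between the class-conditional score CDFs; a minimal sketch with
# sklearn/scipy on made-up scores:
import numpy as np
from scipy.stats import ks_2samp
from sklearn.metrics import roc_auc_score

y = np.array([0, 0, 1, 1, 0, 1])
prob = np.array([0.2, 0.4, 0.8, 0.6, 0.1, 0.9])
auc = roc_auc_score(y, prob)
print('Gini =', 2 * auc - 1)
print('KS =', ks_2samp(prob[y == 1], prob[y == 0]).statistic)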
def fit(self, X, y, print_detail=False):
    """Stepwise logistic regression. Uses the Score test for entry and the
    Wald test for removal.

    Parameters
    ----------
    X: array-like, n_sample * p_features. Feature matrix; a constant term
        is added automatically
    y: array-like, the target variable
    print_detail: bool, whether to print the details of each selection step

    Returns
    -----------
    result: same type as the return value of statsmodels.api.Logit's fit
        method; the model selected by the stepwise procedure."""

    def score_test(Xtest, y_true, y_predict):
        """Score test for a variable entering at a forward step. Assumes the
        new variable sits in the last column: Xtest holds vars_old (already
        fitted, giving predictions y_predict) plus var_new (the variable
        under test). The Score test fixes the new coefficient at 0, so Xtest
        contains its data but the fitted parameters were computed without it."""
        u = np.dot(Xtest.T, y_true - y_predict)  # first derivative (score)
        h = np.dot(Xtest.T
                   * (y_predict * (1 - y_predict)).values.reshape(len(y_predict)),
                   Xtest)  # second derivative (information)
        score = np.dot(np.dot(u.T, np.linalg.inv(h)), u)  # scalar statistic
        p_value = chi2.sf(score, 1)  # Score statistic is chi2 with 1 df
        return score, p_value

    def print_wrap(*obj):
        if print_detail:
            print(*obj)

    X = add_constant(X)
    xenter = ['const']
    xwait = list(X.columns.drop('const'))
    logit_mod = Logit(y, X[xenter])
    logit_res = logit_mod.fit(disp=0)
    y_predict = logit_res.predict(X[xenter])
    step = 0
    stopped = False
    while xwait:  # stop condition 1: all variables entered the model
        # entry test
        score = pd.Series(dtype=float, name='Score')
        pvalue = pd.Series(dtype=float, name='P>chi2')
        for xname in xwait:
            tmpX = X[xenter + [xname]]
            score[xname], pvalue[xname] = score_test(tmpX, y, y_predict)
        step += 1
        print_wrap("step {}: Variables Entry test:\n".format(step),
                   pd.concat([score, pvalue], axis=1))
        if pvalue.min() <= self.entry:  # enter the most significant variable
            xin = pvalue.idxmin()
            xenter.append(xin)
            xwait.remove(xin)
            print_wrap("step {0}: {1} entered.\n".format(step, xin))
        else:  # stop condition 2: no variable meets the entry criterion
            print_wrap("Stopped 2: No vars can get entered any more.\n")
            break

        # remove test
        while True:  # reaching here means a new variable just entered
            logit_mod = Logit(y, X[xenter])
            logit_res = logit_mod.fit(disp=0)
            y_predict = logit_res.predict(X[xenter])
            test = logit_res.wald_test_terms().dframe  # Wald test
            pvalue = test['P>chi2'].iloc[1:]  # constant excluded from test
            step += 1
            print_wrap("step {}: Variables remove test:\n".format(step), test)
            if pvalue.max() < self.stay:
                print_wrap("step {}: No Variables removed:\n".format(step))
                break  # all variables are significant, remove none
            xout = pvalue.idxmax()
            xenter.remove(xout)
            xwait.append(xout)
            print_wrap("step {0}: {1} removed.\n".format(step, xout))
            # stop condition 3: the variable that just entered got removed
            if xin == xout:
                print_wrap("Stopped 3: last var entered also got removed.\n")
                stopped = True
                break
        if stopped:
            break
    else:
        print_wrap("Stopped 1: all var available got entered.\n")
    return Logit(y, X[xenter]).fit(disp=0)
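# For reference, the Score test used above, with fitted probabilities p_hat
# from the model *without* the candidate variable (its coefficient fixed at 0):
#
#     u = X' (y - p_hat)                       (score vector)
#     H = X' diag(p_hat * (1 - p_hat)) X       (information matrix)
#     S = u' H^{-1} u  ~  chi2(1)  under H0
#
# which is exactly the u, h, and score computed in score_test().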
# =============================================================================
# 1    199032
# 0    199032
#
# Now the class is balanced
# =============================================================================

# =============================================================================
# Model building using logistic regression after SMOTE
# =============================================================================

from statsmodels.api import Logit
import statsmodels.api as sm

Train_X = sm.add_constant(Train_X)
Test_X = sm.add_constant(Test_X)

M1 = Logit(Train_Y, Train_X)  # Model definition
M1_Model = M1.fit()  # Model building
M1_Model.summary()  # Model output/summary

# Prediction and Validation
Test_X['Test_Prob'] = M1_Model.predict(
    Test_X)  # Store probability predictions in "Test_X" df
Test_X.columns

# Classify 0 or 1 based on 0.5 cutoff
Test_X['Test_Class'] = np.where(Test_X['Test_Prob'] >= 0.5, 1, 0)
Test_X.columns
# Test_X['Test_Class'].value_counts() / len(Test_X)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(Test_X['Test_Class'], Test_Y))  # 0.9914445887901876
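# confusion_matrix and classification_report are imported above but never
# used; a natural follow-up on the same predictions (assuming Test_Y and
# Test_X['Test_Class'] as defined above):
print(confusion_matrix(Test_Y, Test_X['Test_Class']))
print(classification_report(Test_Y, Test_X['Test_Class']))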
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
# Note I have to add constants to the `exog` matrix. The prepend = True
# argument prevents a warning about future change to the default argument.
logit_model = GLM(male, sm.add_constant(hw_exog, prepend=True),
                  family=sm.families.Binomial(sm.families.links.logit))
logit_model.fit().summary()

# Get the coefficient parameters.
logit_pars = logit_model.fit().params

# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend=True))
logit_model2.fit().summary()

# Get the coefficient parameters
logit_pars2 = logit_model2.fit().params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM': logit_pars, 'Logit': logit_pars2})

# Draw a separating line in the [height, weight]-space.
# The line will separate the space into predicted-male
# and predicted-female regions.
# Get the intercept and slope of the line based on the logit coefficients
intercept = -logit_pars['const'] / logit_pars['x2']
slope = -logit_pars['x1'] / logit_pars['x2']
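# The boundary line follows from setting the log-odds to zero (p = 0.5),
# with x1 = Height and x2 = Weight:
#
#     const + x1*Height + x2*Weight = 0
#     Weight = -const/x2 - (x1/x2) * Height
#
# which is exactly the intercept and slope computed above.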
def fit_model(X, y, co=0.1):
    sm = Logit((y.clip(0, 1) > co).astype(float), X.clip(0, 1),
               missing='drop')
    return sm.fit(disp=False)
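# A minimal usage sketch, assuming fit_model as defined above; the column
# names and the synthetic values in [0, 1] are illustrative:
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
X = pd.DataFrame({'const': 1.0,
                  'tf1': rng.uniform(0, 1, 100),
                  'tf2': rng.uniform(0, 1, 100)})
y = pd.Series(rng.uniform(0, 1, 100))
res = fit_model(X, y, co=0.1)  # y is binarized at the cutoff co
print(res.params)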
print("params") print(paramsnone) print("maxscore") print(scorenone) print("###################################") print("params") print(l2params) print("maxscore") print(l2score) print("###################################") print("params") print(l1params) print("maxscore") print(l1score) from statsmodels.api import Logit, add_constant print("Statistics with constant") logit_modelb = Logit(y_train, add_constant(X_train)) result = logit_modelb.fit() print("Summary") print(result.summary2()) print( " ########################################################################" ) print("Statistics without intercept") logit_model = Logit(y_train, X_train) result = logit_model.fit() print("Summary") print(result.summary2())
Test_X = full_raw_data.loc[full_raw_data['Source'] == 'Test'].drop(
    ['Source', 'Loan_Status'], axis=1).copy()
Test_Y = full_raw_data.loc[full_raw_data['Source'] == 'Test']
Test_Y = Test_Y['Loan_Status'].copy()
Test_Y.shape

###########################
# Model Building
###########################

# Build logistic regression model (using statsmodels package/library)
# And drop the insignificant variables

from statsmodels.api import Logit

M1 = Logit(Train_Y, Train_X)  # (Dep_Var, Indep_Vars) - model definition
M1_Model = M1.fit()  # This is model building
M1_Model.summary()  # This is model output/summary

Cols_to_drop = ['Dependents_3+']
M2 = Logit(Train_Y, Train_X.drop(Cols_to_drop, axis=1))
M2_Model = M2.fit()
M2_Model.summary()

Cols_to_drop.append('Self_Employed_Yes')
M3 = Logit(Train_Y, Train_X.drop(Cols_to_drop, axis=1))
M3_Model = M3.fit()
M3_Model.summary()

Cols_to_drop.append('Gender_Male')
M4 = Logit(Train_Y, Train_X.drop(Cols_to_drop, axis=1))
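# Each drop above is chosen by inspecting the summary; printing the p-values
# sorted makes the next candidate explicit. Sketch assuming the pattern
# continues with M4 fit like the earlier models:
M4_Model = M4.fit()
print(M4_Model.pvalues.sort_values(ascending=False))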
import copy

import numpy as np
import pandas as pd
from statsmodels.api import Logit


class LogisticRegression:
    def __init__(self, endog_name_f=None, exog_name_f=None, data_f=None,
                 add_constant_f=True, scale_vars_list_f=None,
                 interaction_name_f=None, convert_bool_dict_f=None,
                 convert_ord_list_f=None, cat_col_omit_dict_f=None,
                 hier_model_vars_dict_f=None, hier_exog_var_names_f=None,
                 classification_threshold_f=0.5, **kwds):
        # None defaults avoid sharing one mutable default across instances
        self.endog_name = endog_name_f
        self.exog_name = exog_name_f
        self.data = data_f.reindex()
        self.add_constant = add_constant_f
        self.interaction_name = interaction_name_f or list()
        self.convert_bool_dict = convert_bool_dict_f or dict()
        self.convert_ord_list = convert_ord_list_f or list()
        self.hier_model_vars_dict = hier_model_vars_dict_f or dict()
        self.hier_exog_var_names = hier_exog_var_names_f or list()
        self.cat_col_names = list()
        self.cat_col_omit_dict = cat_col_omit_dict_f or dict()
        self.cat_col_drop_names = list()
        self.dummy_col_omit_list = list()
        self.scale_vars_list = scale_vars_list_f or list()
        self.classification_threshold = classification_threshold_f
        self.exog_name_model = None
        self.model_data = None
        self.model = None
        self.model_result = None
        self.est_coef = dict()
        self.exog_matrix = None
        self.endog_matrix = None
        self.fitted_values = None

        self.refresh_model_data()

    def check_for_exog_conflict(self):
        t_bool_ord = set(self.convert_bool_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_cat_bool = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_bool_dict.keys()))
        t_cat_ord = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_hier_exog = set(self.exog_name).intersection(
            set(self.hier_exog_var_names))

        if len(t_bool_ord) > 0:
            print('WARNING appearing in both boolean and ordinal variable '
                  'lists: %s' % ', '.join(t_bool_ord))
        if len(t_cat_ord) > 0:
            print('WARNING appearing in both categorical and ordinal '
                  'variable lists: %s, ignoring categorical'
                  % ', '.join(t_cat_ord))
        if len(t_cat_bool) > 0:
            print('WARNING appearing in both categorical and boolean '
                  'variable lists: %s, ignoring categorical'
                  % ', '.join(t_cat_bool))
        if len(t_hier_exog) > 0:
            print('WARNING appearing in both exogenous and hierarchical '
                  'exogenous variable lists: %s' % ', '.join(t_hier_exog))

    def convert_cat_to_dummies(self):
        # get list of exogenous variables that are categorical and need to
        # be converted
        self.cat_col_names = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys()))
                and (x not in self.convert_ord_list)
                and (self.data[x].dtype == 'O'))
        ]
        prefix_sep = '_'
        for x in self.cat_col_names:
            if x not in list(self.cat_col_omit_dict.keys()):
                # omit the modal level of each categorical by default
                self.cat_col_omit_dict.update(
                    {x: self.data[x].mode(dropna=True).values[0]})
        self.cat_col_drop_names = [
            k + prefix_sep + v for k, v in self.cat_col_omit_dict.items()
        ]
        if len(self.cat_col_names) > 0:
            return pd.get_dummies(self.data[self.cat_col_names],
                                  prefix_sep=prefix_sep,
                                  columns=self.cat_col_names, dtype=bool)
        else:
            return None

    def convert_to_bool(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for k, v in self.convert_bool_dict.items():
            t_col_names.append(k + '_' + v + '_TF')
            t_df = pd.concat([t_df, self.data[k] == v], axis=1)
        t_df.columns = t_col_names
        return t_df

    def convert_to_ordinal(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for c in self.convert_ord_list:
            t_col_names.append(c + '_ORD')
            t_df = pd.concat([t_df, self.data[c].astype(int)], axis=1)
        t_df.columns = t_col_names
        return t_df

    def create_hier_vars(self):
        t_df = pd.DataFrame()
        for c in self.hier_model_vars_dict.keys():
            ext = self.hier_model_vars_dict[c]['external_model']
            t_model = LogisticRegression(
                endog_name_f=ext.endog_name,
                exog_name_f=ext.exog_name,
                data_f=ext.data,
                add_constant_f=ext.add_constant,
                scale_vars_list_f=ext.scale_vars_list,
                convert_ord_list_f=ext.convert_ord_list,
                convert_bool_dict_f=ext.convert_bool_dict,
                cat_col_omit_dict_f=ext.cat_col_omit_dict,
                interaction_name_f=ext.interaction_name,
                classification_threshold_f=self.hier_model_vars_dict[c]
                ['classification_threshold'])
            t_model.create_model_object()
            t_pred_prob, t_pred_class = ext.make_predictions(
                pred_data=t_model.exog_matrix,
                select_coef=self.hier_model_vars_dict[c]['select_coef'])
            t_col_names = list(t_df.columns) + [c, c + '_TF']
            t_df = pd.concat([t_df, t_pred_prob, t_pred_class], axis=1)
            t_df.columns = t_col_names
        return t_df

    def create_interactions(self):
        def create_dummy_df(data_f, v1, v2, drop_list_f):
            prefix_sep = '_'
            if (data_f[v1].dtype == bool) and (data_f[v2].dtype == bool):
                # both bool - create interaction effect directly
                t_df = pd.DataFrame(data_f[v1] & data_f[v2],
                                    columns=[v1 + ' * ' + v2 + '_INT'])
                return t_df, ({v1: None}, {v2: None})
            elif (data_f[v1].dtype != bool) and (data_f[v2].dtype != bool):
                # both categorical
                v1_dummies = pd.get_dummies(data_f[v1],
                                            prefix_sep=prefix_sep, dtype=bool)
                v1_omit = (data_f[v1].mode(dropna=True).values[0]
                           if v1 not in list(drop_list_f.keys())
                           else drop_list_f[v1])
                v2_dummies = pd.get_dummies(data_f[v2],
                                            prefix_sep=prefix_sep, dtype=bool)
                v2_omit = (data_f[v2].mode(dropna=True).values[0]
                           if v2 not in list(drop_list_f.keys())
                           else drop_list_f[v2])
                t_df = pd.DataFrame(index=data_f.index)
                for c1 in [x for x in v1_dummies.columns if x != v1_omit]:
                    for c2 in [x for x in v2_dummies.columns if x != v2_omit]:
                        t_df = pd.concat([
                            t_df,
                            pd.DataFrame(v1_dummies[c1] & v2_dummies[c2],
                                         columns=[c1 + ' * ' + c2 + '_INT'])
                        ], axis=1)
                return t_df, ({v1: v1_omit}, {v2: v2_omit})
            else:
                # one bool
                if data_f[v1].dtype == bool:
                    vb, vd = v1, v2
                else:
                    vb, vd = v2, v1
                vd_dummies = pd.get_dummies(data_f[vd],
                                            prefix_sep=prefix_sep, dtype=bool)
                vd_omit = (data_f[vd].mode(dropna=True).values[0]
                           if vd not in list(drop_list_f.keys())
                           else drop_list_f[vd])
                t_df = pd.DataFrame(index=data_f.index)
                for c in [x for x in vd_dummies.columns if x != vd_omit]:
                    t_df = pd.concat([
                        t_df,
                        pd.DataFrame(data_f[vb] & vd_dummies[c],
                                     columns=[vb + ' * ' + c + '_INT'])
                    ], axis=1)
                return t_df, ({vb: None}, {vd: None})

        t_all_data = pd.concat([
            self.data,
            self.model_data[np.setdiff1d(self.model_data.columns,
                                         self.data.columns)]
        ], axis=1)
        t_df = pd.DataFrame(index=self.data.index)
        t_dummy_col_omit_list = list()
        for int_act_col1, int_act_col2 in self.interaction_name:
            t_dummy, t_dummy_omit = create_dummy_df(
                data_f=t_all_data, v1=int_act_col1, v2=int_act_col2,
                drop_list_f=self.cat_col_omit_dict)
            t_df = pd.concat([t_df, t_dummy], axis=1)
            t_dummy_col_omit_list.append(t_dummy_omit)
            del t_dummy, t_dummy_omit
        self.dummy_col_omit_list = t_dummy_col_omit_list
        return t_df

    def code_variables(self):
        # get new variable matrices
        if len(self.convert_bool_dict) > 0:
            df_bool_f = self.convert_to_bool()
        else:
            df_bool_f = None
        if len(self.convert_ord_list) > 0:
            df_ord_f = self.convert_to_ordinal()
        else:
            df_ord_f = None
        df_cat_f = self.convert_cat_to_dummies()
        return df_bool_f, df_ord_f, df_cat_f

    def refresh_model_data(self):
        df_bool_f, df_ord_f, df_cat_f = self.code_variables()
        self.check_for_exog_conflict()
        t_remain_exog = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys()))
                and (x not in list(self.convert_ord_list))
                and (x not in self.cat_col_names))
        ]
        if df_cat_f is not None:
            df_cat_f_dropped_omit = df_cat_f[[
                c for c in df_cat_f.columns
                if c not in self.cat_col_drop_names
            ]]
        else:
            df_cat_f_dropped_omit = None

        self.model_data = pd.concat([
            self.data[self.endog_name], self.data[t_remain_exog], df_bool_f,
            df_ord_f, df_cat_f_dropped_omit
        ], axis=1)

        # add predictions for fold based on estimation of lower model
        if len(self.hier_model_vars_dict) > 0:
            df_hier_f = self.create_hier_vars()
            self.data[df_hier_f.columns] = df_hier_f
            self.model_data[self.hier_exog_var_names] = df_hier_f[
                self.hier_exog_var_names]

        # add interaction variables
        if len(self.interaction_name) > 0:
            df_interaction_f = self.create_interactions()
            self.model_data[list(df_interaction_f.columns)] = df_interaction_f

        self.exog_name_model = [
            x for x in self.model_data if x != self.endog_name
        ]

    def create_model_object(self):
        model_mat = copy.deepcopy(self.model_data)

        # convert booleans to floats explicitly
        for c in model_mat.columns:
            if model_mat[c].dtype == bool:
                model_mat[c] = model_mat[c].astype(float)

        # scale specified vars to N(0,1)
        for c in self.scale_vars_list:
            try:
                xbar = model_mat[c].mean()
                s = model_mat[c].std()
                model_mat[c] = model_mat[c].apply(lambda x: (x - xbar) / s)
                del xbar, s
            except KeyError:
                print('Warning: specified variable to scale, %s, is not '
                      'included in model covariates' % c)

        # drop rows with na
        model_mat.dropna(inplace=True)

        # add constant if needed
        if self.add_constant:
            model_mat = pd.concat([
                pd.DataFrame(data=[1] * model_mat.shape[0],
                             index=model_mat.index, columns=['const']),
                model_mat
            ], axis=1)

        self.endog_matrix = model_mat[self.endog_name]
        self.exog_matrix = model_mat[[
            c for c in model_mat.columns if c != self.endog_name
        ]]
        self.model = Logit(endog=self.endog_matrix, exog=self.exog_matrix)

    def estimate_model(self):
        self.refresh_model_data()
        self.create_model_object()
        self.model_result = self.model.fit()
        self.est_coef.update(
            dict(zip(list(self.exog_matrix.columns),
                     self.model_result._results.params)))
        self.make_predictions()  # predict values of training data
        print(self.model_result.summary())

    def make_predictions(self, pred_data=None, select_coef=None):
        def utility_calc(coef_fff, data_fff):
            return np.matmul(np.array(data_fff),
                             np.array(coef_fff).reshape(len(coef_fff),
                                                        1)).flatten()

        def matrix_pred_calc(coef_ff, data_ff):
            u = utility_calc(coef_ff, data_ff)
            return np.exp(u) / (1 + np.exp(u))

        def classify_pred(prob_ff, threshold_ff):
            return prob_ff > threshold_ff

        if pred_data is None:
            if select_coef is None:
                self.fitted_values = self.model_result.predict(
                    self.exog_matrix)
                return self.fitted_values, classify_pred(
                    self.fitted_values, self.classification_threshold)
            else:
                t_pred = pd.Series(
                    matrix_pred_calc(
                        coef_ff=[self.est_coef.get(key)
                                 for key in select_coef],
                        data_ff=self.exog_matrix[select_coef]),
                    index=self.exog_matrix.index)
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
        else:
            if select_coef is None:
                # select only the columns the model was estimated on
                t_pred = self.model_result.predict(pred_data[[
                    x for x in pred_data.columns
                    if x in list(self.est_coef.keys())
                ]])
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
            else:
                t_pred = pd.Series(
                    matrix_pred_calc(
                        coef_ff=[self.est_coef.get(key)
                                 for key in select_coef],
                        data_ff=pred_data[select_coef]),
                    index=pred_data.index)
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
# ## Model

# ### Logistic Regression

# In[ ]:

titanic_ = add_constant(titanic)

# In[ ]:

model_ = Logit(titanic_['Survived'], titanic_.drop(['Survived'], axis=1))
result = model_.fit()
result.summary()

# In[ ]:

odd_ratio = np.exp(result.params)
odd_ratio

# ### Extract the target variable

# Build a dataframe with X as the inputs and y as the target (Survived)

# In[ ]:

y = titanic.Survived.copy()  # copy "y" column values out
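# The exponentiated coefficients above are odds ratios; the confidence
# intervals can be exponentiated the same way (assuming result as fit above):

# In[ ]:

odd_ratio_ci = np.exp(result.conf_int())
odd_ratio_ci.columns = ['2.5%', '97.5%']
odd_ratio_ci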
for poly in polys[target_gene]:
    in_central = poly.contains_points(
        atlas_coords.ix[:, ['X', 'Z'], time_point].T)
    not_expr = atlas_expr.ix[:, target_gene, time_point] < co
    in_central |= not_expr
    print(sum(in_central))
    #in_central = (x_coord < 45)
    #in_central = x_coord_scale < 0.6

    #fitter = logistic.LogisticRegression(fit_intercept=False)
    #fitter.fit(X.ix[in_central, :], y.ix[in_central] > co)
    sm_fitter = Logit(y.ix[in_central].clip(0, 1),
                      X.ix[in_central].clip(0, 1))
    sm_fit = sm_fitter.fit()

    Y_tmp = atlas_expr.ix[in_central, target_gene, time_point].copy()
    Y_tmp /= Y_tmp.max()
    Y_tmp = 1.0 * (Y_tmp > .5)

    all_regs = atlas_expr.ix[:, all_regs, time_point].count(axis=1) > 0
    all_regs = all_regs.index[all_regs]

    #if True:
    #if (poly == poly1) or (poly == poly2) or (poly == poly12):
    if target_gene == 'hb':
        #best_tfs = ['bcdP', 'hkb', 'hkb2', 'KrP', 'bcdP2', 'const']
        #best_tfs = ['bcdP', 'bcdP2', 'gtP', 'kni', 'hkb', 'KrP', 'const']
        #best_tfs = atlas_expr.major_axis
        best_tfs = {
def fit_model(df, formula, title="Full", fp=None, filename="Model",
              save=False):
    """
    Function to fit model, collect stats and save predictions and model.

    df: dataframe
    formula: formula
    title: title of model (Default: "Full")
    fp: File pointer (Default: None)
    filename: Model and data file prefix ("Model")
    save: Whether to save predictions, model or both or none
        ["Both", "Data", "Model", False] (Default: False)
    """
    if df.shape[0] < 10:
        print("Too few instances. Skipping. Make sure you have at least 10 instances.")
        return None, None
    print("Modelling Model[%s] with instances %s" % (title, df.shape[0]))
    print("Using formula:\n %s" % (formula))
    print("Generating patsy matrices")
    y, X = patsy.dmatrices(formula, df, return_type="dataframe")
    print("Initializing model")
    model = Logit(y, X)
    print("Fitting model")
    res = model.fit()
    print(title, "\n", res.summary2())
    print("Confusion Matrix:", res.pred_table())
    precision = ems.precision(res.pred_table())
    recall = ems.recall(res.pred_table())
    accuracy = ems.accuracy(res.pred_table())
    f_score = ems.fscore_measure(res.pred_table())
    rmse = ems.rmse(res.predict(), model.endog)
    mae = ems.mae(res.predict(), model.endog)
    auc = ems.auc(res.predict(), model.endog)
    prc = ems.prc(res.predict(), model.endog)
    prc_filename = "%s.pdf" % filename
    plot_prc(prc, prc_filename)
    evaluation_metrics = ("[Model Measures]: Confusion Matrix: %s\n"
                          "RMSE: %s\tMAE: %s\tAUC: %s\n"
                          "Precision: %s\tRecall: %s\tAccuracy: %s\tF1-Score: %s\n"
                          "PRC:\n%s" % (res.pred_table(), rmse, mae, auc,
                                        precision, recall, accuracy, f_score,
                                        prc_filename))
    print(evaluation_metrics)
    print("[save=%s]" % save, "" if save else "Not",
          "Saving Model to %s" % filename)
    if fp is not None:
        print("Modelling Model[%s] with instances %s" % (title, df.shape[0]),
              file=fp)
        print("Using formula:\n %s" % (formula), file=fp)
        print(title, "\n", res.summary2(), file=fp)
        print(evaluation_metrics, file=fp)
        print("[save=%s]" % save, "" if save else "Not",
              "Saving Model to %s" % filename, file=fp)
    model_save, data_save = False, False
    if save == "Both":
        model_save, data_save = True, True
    if save == "Model" or model_save:
        model_file = "%s.pkl" % filename
        res.save(model_file, remove_data=True)  # Save model
    if save == "Data" or data_save:
        data_file = "%s.data.txt" % filename  # Include predictions
        print("df.index", df.index)
        save_data(df[["from_id", "is_self_cite"]], res.predict(),
                  filename=data_file)
    print("Done Saving")
    return model, res
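# A minimal usage sketch, assuming fit_model as defined above; the dataframe
# and the patsy formula are illustrative (ems, plot_prc, and save_data are
# project helpers the function depends on):
model, res = fit_model(df, "is_self_cite ~ from_id", title="SelfCite",
                       filename="self_cite_model", save=False)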
def logregress_loose(X, y, *args, **kwargs):
    X = list(zip(*(_series(x) for x in X)))
    y = _series(y)
    model = Logit(y, X)
    result = model.fit(*args, **kwargs)
    return result.summary()
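# A minimal usage sketch, assuming logregress_loose and its _series helper
# (assumed here to coerce its argument to a 1-d sequence) as defined above;
# the data are illustrative. Extra args pass through to Logit.fit():
x1 = [0.1, 0.5, 0.9, 0.3, 0.8, 0.2]
x2 = [1, 1, 0, 1, 0, 1]
y = [0, 1, 1, 0, 1, 0]
print(logregress_loose([x1, x2], y, disp=0))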
class LogisticRegression:
    def __init__(self, endog_name_f=None, exog_name_f=None, data_f=None,
                 add_constant_f=True, scale_vars_list_f=None,
                 interaction_name_f=None, convert_bool_dict_f=None,
                 convert_ord_list_f=None, cat_col_omit_dict_f=None, **kwds):
        # None defaults avoid sharing one mutable default across instances
        self.endog_name = endog_name_f
        self.exog_name = exog_name_f
        self.data = data_f.reindex()
        self.add_constant = add_constant_f
        self.interaction_name = interaction_name_f or list()
        self.convert_bool_dict = convert_bool_dict_f or dict()
        self.convert_ord_list = convert_ord_list_f or list()
        self.cat_col_names = list()
        self.cat_col_omit_dict = cat_col_omit_dict_f or dict()
        self.cat_col_drop_names = list()
        self.dummy_col_omit_list = list()
        self.scale_vars_list = scale_vars_list_f or list()
        self.exog_name_model = None
        self.model_data = None
        self.model = None
        self.model_result = None

        self.refresh_model_data()

    def check_for_exog_conflict(self):
        t_bool_ord = set(self.convert_bool_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_cat_bool = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_bool_dict.keys()))
        t_cat_ord = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_ord_list))

        if len(t_bool_ord) > 0:
            print('WARNING appearing in both boolean and ordinal variable '
                  'lists: %s' % ', '.join(t_bool_ord))
        if len(t_cat_ord) > 0:
            print('WARNING appearing in both categorical and ordinal '
                  'variable lists: %s, ignoring categorical'
                  % ', '.join(t_cat_ord))
        if len(t_cat_bool) > 0:
            print('WARNING appearing in both categorical and boolean '
                  'variable lists: %s, ignoring categorical'
                  % ', '.join(t_cat_bool))

    def convert_cat_to_dummies(self):
        # get list of exogenous variables that are categorical and need to
        # be converted
        self.cat_col_names = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys()))
                and (x not in self.convert_ord_list)
                and (self.data[x].dtype == 'O'))
        ]
        prefix_sep = '_'
        for x in self.cat_col_names:
            if x not in list(self.cat_col_omit_dict.keys()):
                # omit the modal level of each categorical by default
                self.cat_col_omit_dict.update(
                    {x: self.data[x].mode(dropna=True).values[0]})
        self.cat_col_drop_names = [
            k + prefix_sep + v for k, v in self.cat_col_omit_dict.items()
        ]
        if len(self.cat_col_names) > 0:
            return pd.get_dummies(self.data[self.cat_col_names],
                                  prefix_sep=prefix_sep,
                                  columns=self.cat_col_names, dtype=bool)
        else:
            return None

    def convert_to_bool(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for k, v in self.convert_bool_dict.items():
            t_col_names.append(k + '_' + v + '_TF')
            t_df = pd.concat([t_df, self.data[k] == v], axis=1)
        t_df.columns = t_col_names
        return t_df

    def convert_to_ordinal(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for c in self.convert_ord_list:
            t_col_names.append(c + '_ORD')
            t_df = pd.concat([t_df, self.data[c].astype(int)], axis=1)
        t_df.columns = t_col_names
        return t_df

    def create_interactions(self):
        def create_dummy_df(data_f, v1, v2, drop_list_f):
            prefix_sep = '_'
            if (data_f[v1].dtype == bool) and (data_f[v2].dtype == bool):
                # both bool - create interaction effect directly
                t_df = pd.DataFrame(data_f[v1] & data_f[v2],
                                    columns=[v1 + ' * ' + v2 + '_INT'])
                return t_df, ({v1: None}, {v2: None})
            elif (data_f[v1].dtype != bool) and (data_f[v2].dtype != bool):
                # both categorical
                v1_dummies = pd.get_dummies(data_f[v1],
                                            prefix_sep=prefix_sep, dtype=bool)
                v1_omit = (data_f[v1].mode(dropna=True).values[0]
                           if v1 not in list(drop_list_f.keys())
                           else drop_list_f[v1])
                v2_dummies = pd.get_dummies(data_f[v2],
                                            prefix_sep=prefix_sep, dtype=bool)
                v2_omit = (data_f[v2].mode(dropna=True).values[0]
                           if v2 not in list(drop_list_f.keys())
                           else drop_list_f[v2])
                t_df = pd.DataFrame(index=data_f.index)
                for c1 in [x for x in v1_dummies.columns if x != v1_omit]:
                    for c2 in [x for x in v2_dummies.columns if x != v2_omit]:
                        t_df = pd.concat([
                            t_df,
                            pd.DataFrame(v1_dummies[c1] & v2_dummies[c2],
                                         columns=[c1 + ' * ' + c2 + '_INT'])
                        ], axis=1)
                return t_df, ({v1: v1_omit}, {v2: v2_omit})
            else:
                # one bool
                if data_f[v1].dtype == bool:
                    vb, vd = v1, v2
                else:
                    vb, vd = v2, v1
                vd_dummies = pd.get_dummies(data_f[vd],
                                            prefix_sep=prefix_sep, dtype=bool)
                vd_omit = (data_f[vd].mode(dropna=True).values[0]
                           if vd not in list(drop_list_f.keys())
                           else drop_list_f[vd])
                t_df = pd.DataFrame(index=data_f.index)
                for c in [x for x in vd_dummies.columns if x != vd_omit]:
                    t_df = pd.concat([
                        t_df,
                        pd.DataFrame(data_f[vb] & vd_dummies[c],
                                     columns=[vb + ' * ' + c + '_INT'])
                    ], axis=1)
                return t_df, ({vb: None}, {vd: None})

        t_df = pd.DataFrame(index=self.data.index)
        t_dummy_col_omit_list = list()
        for int_act_col1, int_act_col2 in self.interaction_name:
            t_dummy, t_dummy_omit = create_dummy_df(self.data, int_act_col1,
                                                    int_act_col2,
                                                    self.cat_col_omit_dict)
            t_df = pd.concat([t_df, t_dummy], axis=1)
            t_dummy_col_omit_list.append(t_dummy_omit)
            del t_dummy, t_dummy_omit
        self.dummy_col_omit_list = t_dummy_col_omit_list
        return t_df

    def code_variables(self):
        # get new variable matrices
        if len(self.convert_bool_dict) > 0:
            df_bool_f = self.convert_to_bool()
        else:
            df_bool_f = None
        if len(self.convert_ord_list) > 0:
            df_ord_f = self.convert_to_ordinal()
        else:
            df_ord_f = None
        df_cat_f = self.convert_cat_to_dummies()
        if len(self.interaction_name) > 0:
            df_interaction_f = self.create_interactions()
        else:
            df_interaction_f = None
        return df_bool_f, df_ord_f, df_cat_f, df_interaction_f

    def refresh_model_data(self):
        df_bool_f, df_ord_f, df_cat_f, df_interaction_f = \
            self.code_variables()
        self.check_for_exog_conflict()
        t_remain_exog = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys()))
                and (x not in list(self.convert_ord_list))
                and (x not in self.cat_col_names))
        ]
        if df_cat_f is not None:
            df_cat_f_dropped_omit = df_cat_f[[
                c for c in df_cat_f.columns
                if c not in self.cat_col_drop_names
            ]]
        else:
            df_cat_f_dropped_omit = None

        self.model_data = pd.concat([
            self.data[self.endog_name], self.data[t_remain_exog], df_bool_f,
            df_ord_f, df_cat_f_dropped_omit, df_interaction_f
        ], axis=1)
        self.exog_name_model = [
            x for x in self.model_data if x != self.endog_name
        ]

    def create_model_object(self):
        model_mat = copy.deepcopy(self.model_data)

        # convert booleans to floats explicitly
        for c in model_mat.columns:
            if model_mat[c].dtype == bool:
                model_mat[c] = model_mat[c].astype(float)

        # scale specified vars to N(0,1)
        for c in self.scale_vars_list:
            try:
                xbar = model_mat[c].mean()
                s = model_mat[c].std()
                model_mat[c] = model_mat[c].apply(lambda x: (x - xbar) / s)
                del xbar, s
            except KeyError:
                print('Warning: specified variable to scale, %s, is not '
                      'included in model covariates' % c)

        # drop rows with na
        model_mat.dropna(inplace=True)

        # add constant if needed
        if self.add_constant:
            model_mat = pd.concat([
                pd.DataFrame(data=[1] * model_mat.shape[0],
                             index=model_mat.index, columns=['const']),
                model_mat
            ], axis=1)

        self.model = Logit(endog=model_mat[self.endog_name],
                           exog=model_mat[[
                               c for c in model_mat.columns
                               if c != self.endog_name
                           ]])

    def estimate_model(self):
        self.refresh_model_data()
        self.create_model_object()
        self.model_result = self.model.fit()
        print(self.model_result.summary())
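# A minimal usage sketch of the class above; the dataframe and column names
# are illustrative assumptions:
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
df = pd.DataFrame({'y': rng.integers(0, 2, 200).astype(float),
                   'x1': rng.normal(size=200),
                   'grp': rng.choice(['a', 'b', 'c'], size=200)})
lr = LogisticRegression(endog_name_f='y', exog_name_f=['x1', 'grp'],
                        data_f=df, add_constant_f=True)
lr.estimate_model()  # dummy-codes 'grp', adds 'const', prints the summary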