import numpy as np
import pandas as pd
from IPython.display import display
from pandas.api.types import is_numeric_dtype
from scipy.special import expit
import statsmodels.formula.api as smf
from statsmodels.api import Logit, OLS, add_constant
from statsmodels.tools.sm_exceptions import PerfectSeparationError


def prs_betaci(q, prs, df):
    # Relies on module-level globals: is_bin, models, covariates, phe_code.
    (q0, q1) = q
    we_print = (q0 == 2)
    # pandas has 99 as the highest; we have 1 as the highest
    q0 = df[prs].quantile((100 - q0) / 100.0)
    q1 = df[prs].quantile((100 - q1) / 100.0)
    q40 = df[prs].quantile(0.4)
    q60 = df[prs].quantile(0.6)
    # Individuals in the target quantile bin or in the 40-60% reference bin.
    iids = df.index[((q0 <= df[prs]) & (df[prs] <= q1))
                    | ((q40 <= df[prs]) & (df[prs] <= q60))]
    if is_bin:
        data = np.vstack((
            expit(models['PRS']['COVAR']['train'].predict(df.loc[iids, covariates])),
            (q0 <= df.loc[iids, prs]) & (df.loc[iids, prs] <= q1),
        )).T
        try:
            m = Logit(df.loc[iids, phe_code], data).fit(disp=0)
        except PerfectSeparationError:
            return None, (None, None), None
        b = np.exp(m.params.iloc[1])
        ci = np.abs(np.exp(m.conf_int().iloc[1, :].values) - b)
    else:
        data = np.vstack((
            models['PRS']['COVAR']['train'].predict(df.loc[iids, covariates]),
            (q0 <= df.loc[iids, prs]) & (df.loc[iids, prs] <= q1),
        )).T
        # OLS.fit() does not take a disp argument.
        m = OLS(df.loc[iids, phe_code], data).fit()
        b = m.params.iloc[1]
        ci = np.abs(m.conf_int().iloc[1, :].values - b)
    if we_print:
        print(b, [b - ci[0], b + ci[1]])
    return b, ci, df.loc[(q0 <= df[prs]) & (df[prs] <= q1), phe_code].mean()
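
# A minimal sketch of how prs_betaci might be called, assuming the
# module-level names it relies on (is_bin, models, covariates, phe_code)
# have already been set up by the surrounding pipeline. The quantile pair
# below compares the top 2% of the PRS distribution against the 40-60%
# reference bin; column names are illustrative:
#
#     beta, ci, prevalence = prs_betaci((2, 0), 'PRS', df)
#     if beta is not None:
#         # beta is an odds ratio for a binary trait, an effect size otherwise
#         print(f'{beta:.2f} [{beta - ci[0]:.2f}, {beta + ci[1]:.2f}]')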


class BinaryLogisticRegression:
    """
    Class for binary logistic regression models based on the excellent
    statsmodels package.

    Parameters
    ----------
    method : 'enter' or 'backward'
        Method of predictor selection
    include_constant : bool (CURRENTLY UNAVAILABLE)
        Whether to include a constant in the model
    classification_cutoff : float
        Minimum predicted probability required to assign the value 1
    sig_level_entry : float (CURRENTLY UNAVAILABLE)
        Maximum significance level at which a predictor enters the model
    sig_level_removal : float
        Significance level above which a predictor is removed from the model

    Attributes
    ----------
    predictions : pd.Series
        Predicted values
    classification_table : pd.DataFrame
        A classification table
    precision_and_recall : pd.DataFrame
        Table with precision, recall, and F1-score of the model
    variables_excluded : list
        Variables excluded because of zero variance
    variables_included : list
        Variables included in the model
    N : int
        Number of observations included in the model
    r2_pseudo_macfadden : float
        McFadden's pseudo coefficient of determination
    r2_pseudo_cox_snell : float
        Cox & Snell's pseudo coefficient of determination
    r2_pseudo_nagelkerke : float
        Nagelkerke's pseudo coefficient of determination
    loglikelihood : float
        -2LL
    coefficients : pd.Series
        Regression coefficients
    coefficients_sterrors : pd.Series
        Standard errors of regression coefficients
    coefficients_wald_statistics : pd.Series
        Wald statistics of regression coefficients
    coefficients_zvalues : pd.Series
        z-statistics of regression coefficients
    coefficients_pvalues : pd.Series
        P-values of regression coefficients
    coefficients_exp : pd.Series
        e ** regression coefficients
    """
    def __init__(
            self,
            method='enter',
            include_constant=True,
            classification_cutoff=0.5,
            sig_level_entry=0.05,
            sig_level_removal=0.05,
    ):
        self.method = method.lower().strip()
        self.include_constant = include_constant
        self.classification_cutoff = classification_cutoff
        self.sig_level_entry = sig_level_entry
        self.sig_level_removal = sig_level_removal
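
    # A minimal usage sketch of the constructor options (values illustrative):
    #
    #     model = BinaryLogisticRegression(method='backward',
    #                                      classification_cutoff=0.6,
    #                                      sig_level_removal=0.01)
    #
    # With method='backward', predictors whose p-values exceed
    # sig_level_removal are dropped one at a time after the initial fit.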

    def fit(self,
            data,
            formula,
            categorical_variables=None,
            max_iterations=100,
            show_results=True,
            confidence_intervals=True,
            use_patsy_notation=False,
            n_decimals=3):
        """
        Fit the model to the given data using a formula.

        Parameters
        ----------
        data : pd.DataFrame
            Data to fit the model to
        formula : str
            Formula of the model specification, e.g. 'y ~ x1 + x2';
            should be passed either in Patsy (statsmodels) notation
            or using the following rules:
            '*' for interaction of the variables,
            ':' for interaction & main effects, i.e. 'y ~ x:z' equals
            'y ~ x + z + x*z' (unlike the Patsy notation).
            If you use Patsy notation, please specify the parameter
            use_patsy_notation=True.
        categorical_variables : list
            List of names of the variables that should be considered
            categorical. These variables will be automatically converted
            into sets of dummy variables. If you use this option, please
            make sure that you don't have nested variable names (e.g.
            'imdb' and 'imdb_rate' at the same time), otherwise the
            procedure will be incorrect.
        max_iterations : int
            Maximum number of iterations for convergence
        show_results : bool
            Whether to show the results of the analysis
        confidence_intervals : bool
            Whether to include the coefficients' confidence intervals
            in the summary table
        use_patsy_notation : bool
            Turn this on if you use strictly Patsy's rules to define
            the formula.
            See more: https://patsy.readthedocs.io/en/latest/quickstart.html
        n_decimals : int
            Number of digits to round the displayed results to

        Returns
        -------
        self
            The current instance of the BinaryLogisticRegression class
        """
        self._data = data.copy()
        self.categorical_variables = categorical_variables
        self._show_ci = confidence_intervals
        self.max_iterations = max_iterations

        if '=' in formula:
            formula = formula.replace('=', '~')

        # In the custom notation, '*' and ':' are swapped relative to Patsy.
        if not use_patsy_notation:
            formula = formula.replace('*', '^').replace(':', '*').replace('^', ':')

        self.formula = formula
        self.dependent_variable = self.formula.split('~')[0].strip()

        # get_categories is assumed to be provided elsewhere in this package.
        dep_cats = get_categories(self._data[self.dependent_variable])
        self._dep_cats = dep_cats

        if len(dep_cats) != 2:
            raise ValueError(
                'A dependent variable should have exactly 2 unique categories. '
                f'The provided variable has {len(dep_cats)}.')

        self._mapper = {dep_cats[0]: 0, dep_cats[1]: 1}
        self._inv_mapper = {0: dep_cats[0], 1: dep_cats[1]}

        if not is_numeric_dtype(self._data[self.dependent_variable]):
            self._data[self.dependent_variable] = self._data[
                self.dependent_variable].map(self._mapper).astype(int)

        # Won't work correctly if some variables have nested names
        # (e.g. kinopoisk_rate and kinopoisk_rate_count).
        if categorical_variables is not None:
            if not isinstance(categorical_variables, list):
                raise ValueError(
                    'Categorical variables should be passed as a list. '
                    f'Type {type(categorical_variables)} was passed instead.')
            else:
                for variable in categorical_variables:
                    formula = formula.replace(variable, f'C({variable})')

        self._optimizer = 'newton'
        try:
            self._model = smf.logit(formula=formula, data=self._data).fit(
                maxiter=self.max_iterations,
                warn_convergence=False,
                disp=False,
                method=self._optimizer,
                full_output=True)
        except np.linalg.LinAlgError:
            # Fall back to a gradient-based optimizer if Newton fails.
            self._optimizer = 'bfgs'
            self._model = smf.logit(formula=formula, data=self._data).fit(
                maxiter=self.max_iterations,
                warn_convergence=False,
                disp=False,
                method=self._optimizer,
                full_output=True)

        self._model_params = {
            'maxiter': self.max_iterations,
            'warn_convergence': False,
            'disp': False,
            'method': self._optimizer,
            'full_output': True
        }

        self._observations_idx = list(self._model.fittedvalues.index)
        self.variables_excluded = self._identify_variables_without_variation()

        if len(self.variables_excluded) > 0:
            y = pd.Series(self._model.model.endog.copy(),
                          index=self._observations_idx,
                          name=self.dependent_variable)
            X = self._remove_variables_without_variation()
            self._model = Logit(y, X, missing='drop').fit(**self._model_params)
            self.variables_excluded = [
                BinaryLogisticRegression._translate_from_patsy_notation(x)
                for x in self.variables_excluded
            ]

        if self.method == 'backward':
            self._fit_backward()

        self._get_statistics_from_model()
        self.predictions = self.predict()
        self.classification_table = self.get_classification_table()
        self.precision_and_recall = self.get_precision_and_recall()

        if show_results:
            self.show_results(n_decimals)

        if len(self.variables_excluded) > 0:
            print('------------------\n')
            print('The following variables were excluded due to zero variance: '
                  f"{'; '.join(self.variables_excluded)}")

        return self
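
    # A sketch of the two formula notations accepted by fit(); variable
    # names are illustrative. In the default custom notation, ':' expands
    # to main effects plus the interaction, while '*' denotes the bare
    # interaction (the reverse of Patsy):
    #
    #     model.fit(data, 'y ~ x:z')                             # fits y ~ x + z + x*z
    #     model.fit(data, 'y ~ x*z', use_patsy_notation=True)    # same model, Patsy rules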

    def _fit_backward(self):
        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)

        model = Logit(y_train, X_train, missing='drop')
        results = model.fit(**self._model_params)

        max_pvalue = results.pvalues.drop('Intercept').max()

        # Drop the least significant predictor and refit until all remaining
        # p-values are at or below the removal threshold.
        while max_pvalue > self.sig_level_removal:
            x_to_drop = results.pvalues.drop('Intercept').idxmax()
            X_train = X_train.drop(x_to_drop, axis=1)
            model = Logit(y_train, X_train, missing='drop')
            results = model.fit(**self._model_params)
            max_pvalue = results.pvalues.drop('Intercept').max()

        self._model = results
        return

    def _identify_variables_without_variation(self):
        if self.include_constant:
            mask = self._model.model.exog.var(axis=0)[1:] == 0
        else:
            mask = self._model.model.exog.var(axis=0) == 0
        variables_included = [
            x for x in list(self._model.params.index) if x != 'Intercept'
        ]
        return list(np.array(variables_included)[mask])

    def _remove_variables_without_variation(self):
        X = pd.DataFrame(self._model.model.exog,
                         columns=self._model.model.exog_names,
                         index=self._observations_idx)
        X = X.drop(self.variables_excluded, axis=1)
        return X

    @staticmethod
    def _translate_from_patsy_notation(effect):
        effect = (effect
                  .replace(':', ' * ')
                  .replace('C(', '')
                  .replace('T.', '')
                  .replace('[', ' = "')
                  .replace(']', '"')
                  .replace(')', ''))
        return effect

    def _get_statistics_from_model(self):
        self.N = self._model.nobs
        self.r2_pseudo_macfadden = self._model.prsquared
        # llr = 2 * (llf - llnull), so this is 1 - exp(2 * (llnull - llf) / N).
        self.r2_pseudo_cox_snell = 1 - np.exp(-self._model.llr / self.N)
        # Nagelkerke rescales Cox & Snell by its maximum attainable value.
        self.r2_pseudo_nagelkerke = self.r2_pseudo_cox_snell / (
            1 - np.exp(2 * self._model.llnull / self.N))
        self.loglikelihood = -2 * self._model.llf
        self.coefficients = self._model.params.copy()
        self.coefficients_sterrors = self._model.bse.copy()
        self.coefficients_wald_statistics = self._model.tvalues.copy()**2
        self.coefficients_zvalues = self._model.tvalues.copy()
        self.coefficients_pvalues = self._model.pvalues.copy()
        self.coefficients_exp = self.coefficients.apply(np.exp)

        variables_included = [
            x for x in list(self.coefficients.index) if x != 'Intercept'
        ]
        self._variables_included_patsy = variables_included.copy()
        variables_included = [
            BinaryLogisticRegression._translate_from_patsy_notation(x)
            for x in variables_included
        ]
        self.variables_included = variables_included

        if self.include_constant:
            self._params_idx = ['Constant'] + variables_included
        else:
            self._params_idx = variables_included.copy()

        for stats in [
                self.coefficients, self.coefficients_pvalues,
                self.coefficients_sterrors, self.coefficients_zvalues,
                self.coefficients_wald_statistics, self.coefficients_exp
        ]:
            stats.index = self._params_idx

        return

    def summary(self):
        """
        Summary table with the requested information related to
        regression coefficients.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        statistics = [
            self.coefficients, self.coefficients_sterrors,
            self.coefficients_wald_statistics, self.coefficients_pvalues,
            self.coefficients_exp
        ]
        columns = ['B', 'Std. Error', 'Wald', 'p-value', 'Exp(B)']

        if self._show_ci:
            statistics.append(self.coefficients_confidence_interval)
            columns.extend(list(self.coefficients_confidence_interval.columns))

        statistics = pd.concat(statistics, axis=1)
        statistics.columns = columns
        statistics.index = self._params_idx
        return statistics

    @property
    def coefficients_confidence_interval(self):
        ci = self._model.conf_int()
        ci.index = self._params_idx
        ci.columns = ['LB CI (95%)', 'UB CI (95%)']
        return ci
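
    # The pseudo-R2 formulas implemented above, written out (LL1 = model
    # log-likelihood, LL0 = null log-likelihood, N = number of observations):
    #
    #     McFadden:     R2_MF = 1 - LL1 / LL0
    #     Cox & Snell:  R2_CS = 1 - exp(2 * (LL0 - LL1) / N)
    #     Nagelkerke:   R2_N  = R2_CS / (1 - exp(2 * LL0 / N))
    #
    # Nagelkerke divides Cox & Snell by its maximum attainable value, so a
    # perfect model reaches R2_N = 1 while R2_CS is bounded below 1.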

    def show_results(self, n_decimals):
        """
        Show the results of the analysis in a readable form.

        Parameters
        ----------
        n_decimals : int
            Number of digits to round the displayed results to
        """
        phrase = 'method {}'

        print('\nLOGISTIC REGRESSION SUMMARY\n')
        if self._model.mle_retvals['converged']:
            print('Estimation converged successfully.')
        else:
            print('Estimation did NOT converge.')
            print('Please increase the number of iterations.')
        print('------------------\n')
        print('Dependent variable encoding')
        display(self.get_dependent_variable_codes().style
                .set_caption(phrase.format('.get_dependent_variable_codes()')))
        print('------------------\n')
        print('Model summary')
        display(self.summary_r2().style
                .set_caption(phrase.format('.summary_r2()'))
                .set_precision(n_decimals))
        print('------------------\n')
        print('Classification table')
        display(self.get_classification_table().style
                .set_caption(phrase.format('.get_classification_table()'))
                .set_precision(n_decimals))
        print('------------------\n')
        print('Precision and recall')
        display(self.get_precision_and_recall().style
                .set_caption(phrase.format('.get_precision_and_recall()'))
                .set_precision(n_decimals))
        print('------------------\n')
        print('Coefficients')
        display(self.summary().style
                .format(None, na_rep="")
                .set_caption(phrase.format('.summary()'))
                .set_precision(n_decimals))

    def summary_r2(self):
        """
        Summary table with information related to the pseudo coefficients
        of determination.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        ll = self.loglikelihood
        mf = self.r2_pseudo_macfadden
        cs = self.r2_pseudo_cox_snell
        nk = self.r2_pseudo_nagelkerke
        statistics = [[ll, mf, cs, nk]]
        columns = [
            '-2 Log likelihood',
            "McFadden's Pseudo R2",
            "Cox & Snell's Pseudo R2",
            "Nagelkerke's Pseudo R2",
        ]
        statistics = pd.DataFrame(statistics, columns=columns, index=[''])
        return statistics

    def get_dependent_variable_codes(self):
        """
        Get information on how the categories of the dependent variable
        were encoded.

        Returns
        -------
        pd.DataFrame
            A table explaining the encoding
        """
        mapper = self._mapper
        result = pd.DataFrame(
            [list(mapper.items())[0],
             list(mapper.items())[1]],
            columns=['Original value', 'Model value'],
            index=['', ' '])
        return result

    def get_classification_table(self):
        """
        Get a classification table.

        Returns
        -------
        pd.DataFrame
            A classification table
        """
        all_categories = self._dep_cats
        classification = pd.DataFrame(self._model.pred_table(),
                                      columns=self._dep_cats,
                                      index=self._dep_cats)
        classification.index.name = 'Observed'
        classification.columns.name = 'Predicted'
        classification['All'] = classification.sum(axis=1)
        classification.loc['All'] = classification.sum()
        n = classification.loc['All', 'All']

        # Convert the margins to percentages: the row margin becomes percent
        # correct per observed category, the column margin becomes percent
        # predicted per category, and the bottom-right cell becomes overall
        # accuracy.
        for category in all_categories:
            classification.loc[category, 'All'] = classification.loc[
                category, category] / classification.loc[category, 'All'] * 100
            classification.loc[
                'All', category] = classification.loc['All', category] / n * 100
        classification.loc['All', 'All'] = np.diagonal(
            classification.loc[all_categories, all_categories]).sum() / n * 100

        classification.index = all_categories + ['Percent predicted']
        classification.index.name = 'Observed'
        classification.columns = all_categories + ['Percent correct']
        classification.columns.name = 'Predicted'
        return classification
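
    # The resulting table has observed categories in rows and predicted
    # categories in columns; for example (numbers purely illustrative):
    #
    #     Predicted            no    yes   Percent correct
    #     Observed
    #     no                  120     30              80.0
    #     yes                  20     80              80.0
    #     Percent predicted  56.0   44.0              80.0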

    def get_precision_and_recall(self):
        """
        Estimate precision, recall, and F-score for all the categories.

        Returns
        -------
        pd.DataFrame
            A table with the estimated metrics
        """
        preds = self.classification_table.iloc[:-1, :-1]
        results = []
        categories = list(preds.index)

        for current_category in categories:
            idx = [cat for cat in categories if cat != current_category]
            tp = preds.loc[current_category, current_category]
            fp = preds.loc[idx, current_category].sum()
            fn = preds.loc[current_category, idx].sum()
            # Guard against division by zero when nothing was predicted
            # as the current category.
            if tp + fp == 0:
                precision = 0
            else:
                precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            if precision + recall != 0:
                f1 = 2 * (precision * recall) / (precision + recall)
            else:
                f1 = 0
            results.append([precision, recall, f1])

        results = pd.DataFrame(results,
                               index=categories,
                               columns=['Precision', 'Recall', 'F score'])
        results.loc['Mean'] = results.mean()
        return results

    def predict(
            self,
            data=None,
            group_membership=True,
            probability=False,
            logit=False,
            add_to_data=False,
    ):
        """
        Predict values of the dependent variable using the fitted model.

        Parameters
        ----------
        data : pd.DataFrame
            Data for prediction; may be omitted if you want to predict
            values for the same data that were used to fit the model
        group_membership : bool
            Whether to predict each observation's membership to the
            categories of the dependent variable
        probability : bool
            Whether to predict the exact probability
        logit : bool
            Whether to predict the logit value
        add_to_data : bool
            Whether to merge predictions with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Predictions
        """
        name_memb = f'{self.dependent_variable} (predicted)'
        name_prob = f'{self.dependent_variable} (predicted prob.)'
        name_logit = f'{self.dependent_variable} (predicted logit)'
        all_columns = [name_memb, name_prob, name_logit]

        columns_to_show = []
        if group_membership:
            columns_to_show.append(name_memb)
        if probability:
            columns_to_show.append(name_prob)
        if logit:
            columns_to_show.append(name_logit)

        cutoff = self.classification_cutoff

        if data is None:
            data_init = self._data.copy()
            logit_values = self._model.fittedvalues
            prob = expit(logit_values)
            memb = (prob >= cutoff).astype(int).map(self._inv_mapper)
            result = pd.DataFrame(index=self._observations_idx,
                                  columns=all_columns)
            result[name_memb] = memb
            result[name_prob] = prob
            result[name_logit] = logit_values
            result = result[columns_to_show]
            if add_to_data:
                return pd.concat([data_init, result], axis=1)
            else:
                return result
        else:
            # smf.logit is used here because the bool parameter `logit`
            # shadows the module-level function of the same name.
            aux_model = smf.logit(self.formula, data).fit(**self._model_params)
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [
                BinaryLogisticRegression._translate_from_patsy_notation(x)
                for x in aux_data_cols
            ]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            aux_X = add_constant(aux_data[self.variables_included].copy())
            aux_y = aux_model.model.endog.copy()
            aux_model = Logit(aux_y, aux_X,
                              missing='drop').fit(**self._model_params)
            logit_values = aux_model.fittedvalues
            prob = expit(logit_values)
            memb = (prob >= cutoff).astype(int).map(self._inv_mapper)
            result = pd.DataFrame(index=aux_data_idx, columns=all_columns)
            result[name_memb] = memb
            result[name_prob] = prob
            result[name_logit] = logit_values
            result = result[columns_to_show]
            if add_to_data:
                return pd.concat([data, result], axis=1)
            else:
                return result
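
    # Sketch of predict() on the training data and on new data. Note that,
    # as implemented, new data must also contain the dependent variable,
    # because an auxiliary model is fitted to build the design matrix:
    #
    #     preds = model.predict(probability=True, logit=True)
    #     preds_new = model.predict(new_df, add_to_data=True)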

    def save_independent_variables(self, data=None, add_to_data=False):
        """
        Produce values of the independent variables retained in the
        fitted model.
        This option is useful if you don't create dummy variables or
        interaction effects manually but want to use them in further
        analysis. Only variables retained in the model are returned
        (those shown in a summary table).

        Parameters
        ----------
        data : pd.DataFrame
            Data for which independent variables are requested; may be
            omitted if you want to save values for the same data that
            were used to fit the model
        add_to_data : bool
            Whether to merge the new values with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Values of the independent variables
        """
        if data is None:
            data = self._data.copy()
            if self.include_constant:
                result = self._model.model.exog[:, 1:].copy()
            else:
                result = self._model.model.exog.copy()
            columns = [x for x in self.variables_included if x != 'Constant']
            result = pd.DataFrame(result,
                                  columns=columns,
                                  index=self._observations_idx)
        else:
            aux_model = smf.logit(self.formula, data).fit(**self._model_params)
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [
                BinaryLogisticRegression._translate_from_patsy_notation(x)
                for x in aux_data_cols
            ]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            result = aux_data[self.variables_included]

        if add_to_data:
            result = pd.concat([data, result], axis=1)
        return result

    def save_residuals(self,
                       unstandardized=True,
                       standardized=False,
                       logit=False,
                       deviance=False,
                       add_to_data=False):
        """
        Produce values of various residuals. Residuals are returned only
        for the data used to fit the model.

        Parameters
        ----------
        unstandardized : bool
            Whether to save unstandardized (raw) residuals
        standardized : bool
            Whether to save standardized (z-score) residuals
        logit : bool
            Whether to save logit residuals
        deviance : bool
            Whether to save deviance residuals
        add_to_data : bool
            Whether to merge the new values with the data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Requested residuals
        """
        # Build the requested column names from the boolean arguments,
        # e.g. 'unstandardized' -> 'Unstandard. res.'.
        columns_to_show = [
            f'{k.capitalize().replace("ized", ".").replace("eted", ".").replace("_", " ")} res.'
            for k, v in vars().items() if v is True and k != 'add_to_data'
        ]

        result = []

        res_unstand = self._model.resid_response
        res_unstand.name = 'Unstandard. res.'
        res_stand = self._model.resid_pearson
        res_stand.name = 'Standard. res.'
        res_deviance = self._model.resid_dev
        res_deviance.name = 'Deviance res.'

        preds_prob = self.predict(group_membership=False, probability=True)
        res_logit = res_unstand / (preds_prob * (1 - preds_prob)).iloc[:, 0]
        res_logit.name = 'Logit res.'

        result.extend([res_unstand, res_stand, res_deviance, res_logit])
        result = pd.concat(result, axis=1)
        result = result[columns_to_show].copy()

        if add_to_data:
            result = pd.concat([self._data, result], axis=1)
        return result
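

# A quick end-to-end smoke test (a sketch, not part of the library API).
# It assumes get_categories() is importable in this module, as the class
# requires; the data are synthetic and purely illustrative.
if __name__ == '__main__':
    rng = np.random.default_rng(42)
    n = 500
    demo = pd.DataFrame({
        'x1': rng.normal(size=n),
        'x2': rng.normal(size=n),
    })
    # Generate the outcome from a known logistic model so that the fitted
    # coefficients should land near (0.5, 1.0, -0.8).
    true_logit = 0.5 + 1.0 * demo['x1'] - 0.8 * demo['x2']
    demo['y'] = np.where(rng.uniform(size=n) < expit(true_logit), 'yes', 'no')

    blr = BinaryLogisticRegression(method='backward')
    blr.fit(demo, 'y ~ x1 + x2', show_results=False)
    print(blr.summary())
    print(blr.get_classification_table())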