def __run_binomial__(self, *args, **kws):
    """Fit one binomial GLM per formula and return the results as a table.

    Keyword arguments read from ``kws``:

    :param formulas: mapping ``label -> formula``. When missing or empty, a
        single model labelled ``'Model 1'`` is built from ``kws['formula']``.
    :param formula: formula used only when ``formulas`` is not supplied.
    :param family: text stored in the ``family`` column. Defaults to
        ``'binomial'`` because every model fitted here uses
        ``family.Binomial()``.
    :return: ``pd.DataFrame`` with one row per formula and the columns
        ``label``, ``formula``, ``family``, ``mod`` (the model object),
        ``fit`` (the fitted result), ``summ1``/``summ2``/``summ3`` (outputs of
        the ``__get_summary*__`` helpers), ``Obs``, ``aic``, ``bic`` and ``r2``
        (``1 - deviance / null_deviance``).

    Positional ``*args`` are accepted but not used.
    """
    formulas = kws.get('formulas', None)
    if not formulas:
        formulas = {'Model 1': kws.get('formula', None)}

    # The recorded family must describe the model that is actually fitted; the
    # previous default of 'gaussian' mislabelled every binomial fit.
    family_label = kws.get("family", 'binomial')

    # Collect the per-model rows and concatenate once instead of growing the
    # table inside the loop.
    rows = []
    for label, formula in formulas.items():
        mod = glm(formula, data=self.data, family=family.Binomial())
        fit = mod.fit()
        rows.append(pd.DataFrame({
            'label': label,
            "formula": formula,
            'family': family_label,
            "mod": [mod],
            "fit": [fit],
            "summ1": [self.__get_summary1__(fit)],
            "summ2": [self.__get_summary2__(fit)],
            "summ3": [self.__get_summary3__(mod, fit)],
            'Obs': fit.nobs,
            'aic': fit.aic,
            'bic': fit.bic,
            'r2': 1 - (fit.deviance / fit.null_deviance),
        }))
    return pd.concat(rows, axis=0, ignore_index=True)
from functools import partial from logging import getLogger import numpy as np import pandas as pd from patsy import NAAction, build_design_matrices, dmatrices from regularized_glm import penalized_IRLS from scipy.special import logsumexp from statsmodels.api import families from statsmodels.tsa.tsatools import lagmat FAMILY = families.Binomial() logger = getLogger(__name__) def fit_discrete_state_transition(speed, is_replay, penalty=1E-5, speed_knots=None, diagonal=None): """Estimate the predicted probablity of replay given speed and whether it was a replay in the previous time step. p(I_t | I_t-1, v_t-1) p_I_0, p_I_1 in Long Tao's code Parameters ---------- speed : ndarray, shape (n_time,)
def fit(self, X, y, X_test, feval=None, cat_feats=None, exclude_columns=None,
        epochs=16, batch_size=128, oof2csv=False, plot=False):
    """Cross-validate ``self.estimator`` and store out-of-fold / test predictions.

    For every split produced by ``self.folds`` the estimator is fitted on the
    training part, the validation part is predicted into the out-of-fold
    vector and ``X_test`` is predicted into one column per fold.

    # TODO: rank-based blending

    :param X: training features (DataFrame or array-like); row labels are
        replaced by a fresh 0..n-1 index on a copy.
    :param y: training target (Series or array indexable by integer arrays).
    :param X_test: test features with the same columns as ``X``.
    :param feval: metric called as ``feval(y_true, y_score)``; defaults to
        ``roc_auc_score``.
    :param cat_feats: categorical feature indices passed to LightGBM/CatBoost.
    :param exclude_columns: columns dropped from both ``X`` and ``X_test``.
    :param epochs: epochs for the Keras branches.
    :param batch_size: batch size for the Keras branches.
    :param oof2csv: when truthy, write OOF + averaged test predictions to CSV.
    :param plot: when truthy, collect per-fold feature importances and pass
        them to ``self.plot_importances``.
    :return: ``None``. Results are stored on ``self`` as ``oof_preds``,
        ``sub_preds`` (mean over folds), ``sub_preds_rank``, ``score`` and
        ``feature_importance_df``.
    """
    # Normalise inputs on copies: fold indices are positional, so a clean
    # RangeIndex is needed, but the caller's objects must not be mutated.
    if isinstance(y, pd.Series):
        y = y.reset_index(drop=True)
    if isinstance(X, pd.DataFrame):
        X = X.reset_index(drop=True)
    else:
        X = pd.DataFrame(X)
    # Convert X_test from X_test itself (it used to be rebuilt from X).
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.reset_index(drop=True)
    else:
        X_test = pd.DataFrame(X_test)

    # Metric used for the out-of-fold score.
    feval = feval if feval else roc_auc_score

    # Drop unwanted features.
    if exclude_columns:
        feats = X.columns.difference(exclude_columns)
        X, X_test = X[feats], X_test[feats]

    # Name of the metric, only used in the log line.
    score_name = getattr(feval, '__name__', type(feval).__name__)

    # Total number of folds (repeated splitters keep n_splits in cvargs).
    if hasattr(self.folds, 'n_splits'):
        num_cv = self.folds.n_splits
    else:
        num_cv = self.folds.cvargs['n_splits'] * self.folds.n_repeats

    oof_preds = np.zeros(X.shape[0])
    sub_preds = np.zeros((X_test.shape[0], num_cv))
    self.feature_importance_df = pd.DataFrame()
    fold_importances = []

    for n_fold, (train_idx, valid_idx) in enumerate(self.folds.split(X, y), 1):
        print("\n\033[94mFold %s started at %s\033[0m" % (n_fold, time.ctime()))

        X_train, y_train = X.iloc[train_idx], y[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

        if not hasattr(self.estimator, 'fit'):
            print("该算法无fit方法")
            break

        eval_set = [(X_train, y_train), (X_valid, y_valid)]
        # Object used for prediction in this fold; only the GLM branch
        # replaces it with something other than self.estimator.
        model = self.estimator

        if 'LGBMClassifier' in self.model_type:
            model.fit(
                X_train, y_train,
                eval_set=eval_set,
                categorical_feature=cat_feats if cat_feats else 'auto',
                eval_metric='auc',
                early_stopping_rounds=self.early_stopping_rounds,
                verbose=self.verbose)
        elif 'LGBMRegressor' in self.model_type:
            model.fit(
                X_train, y_train,
                eval_set=eval_set,
                categorical_feature=cat_feats if cat_feats else 'auto',
                early_stopping_rounds=self.early_stopping_rounds,
                verbose=self.verbose)
        elif 'XGBClassifier' in self.model_type:
            model.fit(
                X_train, y_train,
                eval_set=eval_set,
                eval_metric='auc',
                early_stopping_rounds=self.early_stopping_rounds,
                verbose=self.verbose)
        elif 'XGBRegressor' in self.model_type:
            model.fit(
                X_train, y_train,
                eval_set=eval_set,
                early_stopping_rounds=self.early_stopping_rounds,
                verbose=self.verbose)
        elif ('CatBoostClassifier' in self.model_type
              or 'CatBoostRegressor' in self.model_type):
            model.fit(
                X_train, y_train,
                eval_set=eval_set,
                cat_features=cat_feats,
                use_best_model=True,
                plot=True,
                early_stopping_rounds=self.early_stopping_rounds,
                verbose=self.verbose)
        elif ('RGFClassifier' in self.model_type
              or 'RGFRegressor' in self.model_type):
            # NOTE(review): no fitting happens for RGF models, so the
            # prediction below runs on whatever state the estimator already
            # has -- confirm whether a plain fit() call is intended here.
            pass
        elif ('KerasClassifier' in self.model_type
              or 'KerasRegressor' in self.model_type):
            # https://www.cnblogs.com/flyu6/p/7691106.html
            # Keras expects validation_data as an (x_val, y_val) tuple, not
            # the list-of-pairs eval_set used by the boosting libraries.
            model.fit(X_train, y_train,
                      epochs=epochs,
                      batch_size=batch_size,
                      validation_data=(X_valid, y_valid))
        elif self.model_type == 'GLM':
            # TODO: support other GLM families
            # Keep the fitted results object for prediction. self.estimator
            # is left untouched so the next fold still passes the fit check.
            model = GLM(y_train, X_train, family=families.Binomial()).fit()
        else:
            # Native scikit-learn style estimator.
            print('Sklearn Fitting ...')
            model.fit(X_train, y_train)

        # Store predictions.
        # TODO: multi-class needs a different column selection
        if hasattr(model, 'predict_proba'):
            oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]
            sub_preds[:, n_fold - 1] = model.predict_proba(X_test)[:, 1]
        else:
            oof_preds[valid_idx] = model.predict(X_valid)
            sub_preds[:, n_fold - 1] = model.predict(X_test)

        if plot and hasattr(model, 'feature_importances_'):
            fold_importances.append(pd.DataFrame({
                "feature": X.columns,
                "importance": model.feature_importances_,
                "fold": n_fold,
            }))

    if fold_importances:
        # Newest fold first, matching the order the table used to be built in.
        self.feature_importance_df = pd.concat(fold_importances[::-1])

    # Publish results.
    self.oof_preds = oof_preds
    self.sub_preds = sub_preds.mean(1)
    self.sub_preds_rank = (
        pd.DataFrame(sub_preds).rank().mean(1) / sub_preds.shape[0])  # works for AUC

    # Best effort: a failing metric must not discard the predictions.
    try:
        self.score = feval(y, self.oof_preds)
    except Exception as e:
        self.score = 0
        print('Error feval:', e)

    print("\n\033[94mCV Score %s: %s ended at %s\033[0m" %
          (score_name, self.score, time.ctime()))

    # Persist OOF predictions followed by the plain-average test predictions.
    if oof2csv:
        pd.Series(np.append(self.oof_preds, self.sub_preds), name='oof').to_csv(
            'OOF %s %.4f.csv' % (time.ctime(), self.score), index=False)

    # Feature importance output; skipped when nothing was collected.
    if plot and not self.feature_importance_df.empty:
        self.feature_importance_df.sort_values(
            ['fold', 'importance'], axis=0, ascending=False, inplace=True)
        self.plot_importances(self.feature_importance_df, len(X.columns))
def predict(design_matrix, coefficients):
    """Map a design matrix and coefficients to the binomial mean response.

    The linear predictor ``design_matrix @ squeeze(coefficients)`` is passed
    through the inverse of the link of a default-constructed
    ``families.Binomial()``.

    Parameters
    ----------
    design_matrix : array-like supporting ``@``
    coefficients : array-like; singleton dimensions are squeezed away

    Returns
    -------
    Result of applying the inverse link to the linear predictor.
    """
    inverse_link = families.Binomial().link.inverse
    linear_predictor = design_matrix @ np.squeeze(coefficients)
    return inverse_link(linear_predictor)
def fit(self, X, y, X_test, feval=None, cat_feats=None, exclude_columns=None,
        epochs=16, batch_size=128, oof2csv=None):
    """Cross-validate ``self.clf`` and return out-of-fold and test predictions.

    For every split produced by ``self.folds`` the model is fitted on the
    training part only, the validation part is predicted into the out-of-fold
    vector and the prediction for ``X_test`` is averaged over the folds.

    # TODO: rank-based blending

    :param X: training features; indexed with ``.columns`` and ``.iloc``.
    :param y: training target; indexed with ``.iloc``.
    :param X_test: test features with the same columns as ``X``.
    :param feval: metric called as ``feval(y_true, y_score)``; defaults to
        ``roc_auc_score``.
    :param cat_feats: categorical feature indices passed to LightGBM/CatBoost.
    :param exclude_columns: columns dropped from both ``X`` and ``X_test``.
    :param epochs: epochs for the Keras branches.
    :param batch_size: batch size for the Keras branches.
    :param oof2csv: optional path prefix; when truthy the OOF predictions
        followed by the test predictions are written to
        ``oof2csv + time.ctime()``.
    :return: tuple ``(oof_preds, sub_preds)``; also stored on ``self`` as
        ``oof_preds`` and ``test_preds``.
    """
    # Metric used for the out-of-fold score.
    feval = feval if feval else roc_auc_score

    # Drop unwanted features.
    if exclude_columns:
        feats = X.columns.difference(exclude_columns)
    else:
        feats = X.columns
    X, X_test = X[feats], X_test[feats]

    # Total number of fits. Repeated splitters keep n_splits in cvargs and
    # run it n_repeats times, so both factors are needed for a true average.
    if hasattr(self.folds, 'n_splits'):
        num_folds = self.folds.n_splits
    else:
        num_folds = (self.folds.cvargs['n_splits']
                     * getattr(self.folds, 'n_repeats', 1))

    oof_preds = np.zeros(len(X))
    sub_preds = np.zeros(len(X_test))
    self.feature_importance_df = pd.DataFrame()

    for n_fold, (train_idx, valid_idx) in enumerate(self.folds.split(X, y), 1):
        print("\n\033[94mFold %s started at %s\033[0m" % (n_fold, time.ctime()))

        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        if not hasattr(self.clf, 'fit'):
            print("该算法无fit方法")
            break

        eval_set = [(X_train, y_train), (X_valid, y_valid)]
        # Object used for prediction in this fold; only the GLM branch
        # replaces it with something other than self.clf.
        model = self.clf

        if 'LGBMClassifier' in self.model_type:
            model.fit(X_train, y_train,
                      eval_set=eval_set,
                      categorical_feature=cat_feats if cat_feats else 'auto',
                      eval_metric='auc',
                      early_stopping_rounds=100,
                      verbose=100)
        elif 'LGBMRegressor' in self.model_type:
            model.fit(X_train, y_train,
                      eval_set=eval_set,
                      categorical_feature=cat_feats if cat_feats else 'auto',
                      eval_metric='l2',
                      early_stopping_rounds=100,
                      verbose=100)
        elif 'XGBClassifier' in self.model_type:
            model.fit(X_train, y_train,
                      eval_set=eval_set,
                      eval_metric='auc',
                      early_stopping_rounds=100,
                      verbose=100)
        elif 'XGBRegressor' in self.model_type:
            model.fit(X_train, y_train,
                      eval_set=eval_set,
                      eval_metric='rmse',
                      early_stopping_rounds=100,
                      verbose=100)
        elif 'CatBoostClassifier' in self.model_type:
            model.fit(X_train, y_train,
                      eval_set=eval_set,
                      cat_features=cat_feats,
                      use_best_model=True,
                      plot=True,
                      early_stopping_rounds=100,
                      verbose=100)
        elif 'CatBoostRegressor' in self.model_type:
            model.fit(X_train, y_train,
                      eval_set=eval_set,
                      cat_features=cat_feats,
                      use_best_model=True,
                      plot=True,
                      early_stopping_rounds=100,
                      verbose=0)
        elif ('RGFClassifier' in self.model_type
              or 'RGFRegressor' in self.model_type):
            # NOTE(review): no fitting happens for RGF models, so the
            # prediction below runs on whatever state the model already has
            # -- confirm whether a plain fit() call is intended here.
            pass
        elif ('KerasClassifier' in self.model_type
              or 'KerasRegressor' in self.model_type):
            # https://www.cnblogs.com/flyu6/p/7691106.html
            # Keras expects validation_data as an (x_val, y_val) tuple, not
            # the list-of-pairs eval_set used by the boosting libraries.
            model.fit(X_train, y_train,
                      epochs=epochs,
                      batch_size=batch_size,
                      validation_data=(X_valid, y_valid))
        elif self.model_type == 'GLM':
            # TODO: support other GLM families
            # Keep the fitted results object for prediction. self.clf is left
            # untouched so the next fold still passes the fit check.
            model = GLM(y_train, X_train, family=families.Binomial()).fit()
        else:
            # Native scikit-learn style estimator: fit on the training fold
            # only, otherwise the validation rows leak into the OOF scores.
            model.fit(X_train, y_train)

        # Store predictions.
        # TODO: multi-class needs a different column selection
        if hasattr(model, 'predict_proba'):
            oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]
            sub_preds += model.predict_proba(X_test)[:, 1] / num_folds
        else:
            oof_preds[valid_idx] = model.predict(X_valid)
            sub_preds += model.predict(X_test) / num_folds

        if hasattr(model, 'feature_importances_'):
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = model.feature_importances_
            fold_importance_df["fold"] = n_fold
            self.feature_importance_df = pd.concat(
                [self.feature_importance_df, fold_importance_df], axis=0)

    # Best effort: a failing metric must not discard the predictions.
    try:
        score = feval(y, oof_preds)
        score_name = getattr(feval, '__name__', type(feval).__name__)
    except Exception as e:
        score = score_name = None
        print('Error feval:', e)

    print("\n\033[94mOOF %s: %s end at %s\n\033[0m" % (score_name, score, time.ctime()))

    if hasattr(self.clf, 'feature_importances_'):
        self.plot_importances(self.feature_importance_df)

    self.oof_preds = oof_preds
    self.test_preds = sub_preds

    if oof2csv:
        pd.Series(oof_preds.tolist() + sub_preds.tolist(), name='oof').to_csv(
            oof2csv + time.ctime(), index=False)

    return oof_preds, sub_preds
def _fit_unmatched_regression(self, statmatch):
    """Fit a probit-link binomial GLM of treatment on the design matrix.

    :param statmatch: object providing ``treated`` (used as the response) and
        ``design_matrix`` (used as the regressors).
    :return: whatever ``GLM(...).fit()`` returns for that model.
    """
    # NOTE(review): the link is passed as a class, not an instance. Recent
    # statsmodels releases appear to require an instance -- confirm against
    # the pinned statsmodels version before changing.
    probit_binomial = families.Binomial(families.links.probit)
    model = GLM(statmatch.treated, statmatch.design_matrix,
                family=probit_binomial)
    return model.fit()