def fit_reg(covariate, treated, weights=pd.Series(dtype=float)):
    treated = add_constant(treated)
    if not weights.any():
        reg = GLM(covariate, treated)
    else:
        # The original built the identical unweighted model in both branches;
        # passing the weights as freq_weights is the likely intent (assumption).
        reg = GLM(covariate, treated, freq_weights=weights)
    res = reg.fit()
    return res
def fit_reg(covariate, treated, weights=pd.Series(dtype=float)):
    # newer statsmodels expects a link instance rather than the class
    family = families.Binomial(families.links.logit())
    if not weights.any():
        reg = GLM(covariate, treated, family=family)
    else:
        # The original passed sigma=weights (a GLS argument, not a GLM one)
        # in the *unweighted* branch; var_weights in the weighted branch is
        # the likely intent (assumption).
        reg = GLM(covariate, treated, family=family, var_weights=weights)
    res = reg.fit()
    return res
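# Illustrative call to fit_reg above, on synthetic data. Everything here is
# made up for the sketch (not from the original project); `covariate` must be
# binary since the family is Binomial.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
treated = pd.Series(rng.integers(0, 2, size=200).astype(float))
covariate = pd.Series(rng.binomial(1, 0.3 + 0.2 * treated).astype(float))
weights = pd.Series(rng.uniform(0.5, 1.5, size=200))

res = fit_reg(covariate, treated, weights)
print(res.params)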
def _fit_matched_regression(self, statmatch):
    has_match = np.isfinite(statmatch.matches)
    treated_index = has_match[has_match].index
    match_index = np.asarray(statmatch.matches[has_match], dtype=np.int32)
    # Index.append expects an Index, so wrap the array of match positions
    regression_index = treated_index.append(pd.Index(match_index))
    # newer statsmodels expects a link instance rather than the class
    family = families.Binomial(families.links.probit())
    # .ix was removed from pandas; .loc is the label-based equivalent
    reg = GLM(statmatch.treated.loc[regression_index],
              statmatch.design_matrix.loc[regression_index],
              family=family)
    return reg.fit()
def _create_propensity_scores(self, treated, design_matrix, link_type='logit'):
    if link_type == 'logit':
        link = families.links.logit()
    elif link_type == 'probit':
        link = families.links.probit()
    else:
        # the original left `link` undefined for any other value
        raise ValueError("link_type must be 'logit' or 'probit'")
    family = families.Binomial(link)
    reg = GLM(treated, design_matrix, family=family)
    fitted_reg = reg.fit()
    return fitted_reg
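# Standalone sketch of the propensity-score fit above, on synthetic data.
# All names and values are illustrative; Binomial() defaults to the logit
# link, so no explicit link object is needed here.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
design_matrix = sm.add_constant(rng.normal(size=(500, 2)))
true_betas = np.array([0.2, 1.0, -0.5])
p_treat = 1.0 / (1.0 + np.exp(-design_matrix @ true_betas))
treated = rng.binomial(1, p_treat)

fitted_reg = sm.GLM(treated, design_matrix,
                    family=sm.families.Binomial()).fit()
propensity_scores = fitted_reg.fittedvalues  # P(treated = 1 | X)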
def estimate_movement_std(position_info):
    MODEL_FORMULA = 'position ~ lagged_position - 1'
    response, design_matrix = dmatrices(MODEL_FORMULA, position_info)
    fit = GLM(response, design_matrix, family=families.Gaussian()).fit()
    return np.sqrt(fit.scale)
def one_cluster(formula, feature, covs, coef, method=OLS,
                _pat=re.compile(r"\+\s*CpG")):
    """Used when we have a "cluster" with 1 probe."""
    c = covs.copy()
    # remove the CpG term from the formula
    formula = _pat.sub("", formula)
    if isinstance(feature, CountFeature):
        c['methylation'] = feature.methylated
        c['counts'] = feature.counts
        c = c[c['counts'] > 0]
        try:
            return get_ptc(
                GLM.from_formula(formula, data=c, exposure=c['counts'],
                                 family=Poisson()).fit(),
                coef)
        except PerfectSeparationError:
            return dict(p=np.nan, t=np.nan, coef=np.nan, covar=coef)
    else:
        c['methylation'] = feature.values
        res = method.from_formula(formula, data=c).fit()
        return get_ptc(res, coef)
def glm_fit(spikes, design_matrix, ind):
    '''Fits the Poisson model to the spikes from a neuron.

    Parameters
    ----------
    spikes : array_like
    design_matrix : array_like or pandas DataFrame
    ind : int

    Returns
    -------
    fitted_model : object or NaN
        Returns the statsmodels results object if successful. If the
        weighted fit in the IRLS procedure fails, returns NaN.
    '''
    try:
        logger.debug('\t\t...Neuron #{}'.format(ind + 1))
        # `missing='drop'` is the statsmodels spelling; the original passed
        # drop='missing', which GLM does not recognize
        return GLM(spikes.reindex(design_matrix.index), design_matrix,
                   family=families.Poisson(),
                   missing='drop').fit(maxiter=30)
    except np.linalg.LinAlgError:
        warn('Data is poorly scaled for neuron #{}'.format(ind + 1))
        return np.nan
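# Hypothetical call to glm_fit above with simulated Poisson spikes. The data
# are synthetic, and the sketch assumes the module-level `logger` used by
# glm_fit is configured; `spikes` must be indexed like `design_matrix`.
import numpy as np
import pandas as pd

n_samples = 1000
design_matrix = pd.DataFrame({'intercept': np.ones(n_samples),
                              'stimulus': np.linspace(0, 1, n_samples)})
rate = np.exp(-2.0 + 1.5 * design_matrix['stimulus'])
spikes = pd.Series(np.random.poisson(rate), index=design_matrix.index)

fitted_model = glm_fit(spikes, design_matrix, ind=0)
print(getattr(fitted_model, 'params', fitted_model))  # params, or NaN on failure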
def fit(self, treated, design_matrix, design_matrix_header):
    """Run logit or probit and set treated, design_matrix, and pscore."""
    # Convert to pandas data structures
    treated = pd.Series(treated)
    design_matrix = pd.DataFrame(design_matrix)

    # Fit propensity score
    link = families.links.logit()
    family = families.Binomial(link)
    reg = GLM(treated, design_matrix, family=family)
    fitted_reg = reg.fit()
    pscore = fitted_reg.fittedvalues

    # Store values for later reference
    self.header = design_matrix_header
    self.treated = treated
    self.design_matrix = design_matrix
    self.pscore = pscore
def estimate_movement_variance(position, lagged_position, speed):
    data = {
        'position': position,
        'lagged_position': lagged_position
    }
    MODEL_FORMULA = 'position ~ lagged_position - 1'
    response, design_matrix = dmatrices(MODEL_FORMULA, data)
    fit = GLM(response, design_matrix, family=families.Gaussian()).fit()
    # NOTE: `speed` is accepted but unused, and the return value is a
    # standard deviation (sqrt of the Gaussian scale), despite the name.
    return np.sqrt(fit.scale)
def fit_glm_model(spikes, design_matrix, penalty=1E-5):
    '''Fits a penalized Poisson model to the spikes from a neuron.

    Parameters
    ----------
    spikes : array_like
    design_matrix : array_like or pandas DataFrame
    penalty : float, optional
        Ridge (L2) penalty applied to every coefficient except the intercept.

    Returns
    -------
    fitted_model : statsmodels results
    '''
    # `missing='drop'` is the statsmodels spelling; the original passed
    # drop='missing', which GLM does not recognize
    model = GLM(spikes, design_matrix, family=families.Poisson(),
                missing='drop')
    regularization_weights = np.ones((design_matrix.shape[1],)) * penalty
    regularization_weights[0] = 0.0  # leave the intercept unpenalized
    # L1_wt=0 makes the elastic-net penalty pure ridge
    return model.fit_regularized(alpha=regularization_weights, L1_wt=0)
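# Illustrative use of fit_glm_model above on synthetic data: the first design
# column is the intercept, which the penalty vector leaves unregularized.
import numpy as np
import pandas as pd

n_samples = 2000
design_matrix = pd.DataFrame({'intercept': np.ones(n_samples),
                              'speed': np.random.uniform(0, 1, n_samples)})
spikes = np.random.poisson(np.exp(-1.0 + 0.8 * design_matrix['speed']))

results = fit_glm_model(spikes, design_matrix, penalty=1E-3)
print(results.params)  # roughly [-1.0, 0.8], shrunk slightly by the ridge penalty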
def estimate_movement_std(position):
    '''Estimates the movement standard deviation based on position.

    WARNING: Use on the original position, not interpolated position.

    Parameters
    ----------
    position : ndarray, shape (n_time, n_position_dim)

    Returns
    -------
    movement_std : ndarray, shape (n_position_dim,)
    '''
    position = atleast_2d(position)
    is_nan = np.any(np.isnan(position), axis=1)
    position = position[~is_nan]
    movement_std = []
    for p in position.T:
        # regress each position sample on its successor; the residual scale
        # of the Gaussian fit estimates the per-step variance
        fit = GLM(p[:-1], p[1:], family=families.Gaussian()).fit()
        movement_std.append(np.sqrt(fit.scale))
    return np.array(movement_std)
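# Quick synthetic check of estimate_movement_std above: for a 2-D random walk
# with i.i.d. Gaussian steps, the recovered std should match the step std.
import numpy as np

true_std = 3.0
steps = np.random.normal(0, true_std, size=(5000, 2))
position = np.cumsum(steps, axis=0)
print(estimate_movement_std(position))  # approximately [3.0, 3.0]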
n_time, n_trials = 1500, 1000
sampling_frequency = 1500  # the original also defined an unused SAMPLING_FREQUENCY duplicate

# Firing rate starts at 5 Hz and switches to 10 Hz
firing_rate = np.ones((n_time, n_trials)) * 10
firing_rate[:n_time // 2, :] = 5
spike_train = simulate_poisson_process(firing_rate, sampling_frequency)

time = (np.arange(0, n_time)[:, np.newaxis] / sampling_frequency *
        np.ones((1, n_trials)))
trial_id = np.arange(n_trials)[np.newaxis, :] * np.ones((n_time, 1))

# Fit a spline model to the firing rate
design_matrix = dmatrix('bs(time, df=5)', dict(time=time.ravel()))
fit = GLM(spike_train.ravel(), design_matrix,
          family=families.Poisson()).fit()

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
axes[0].pcolormesh(np.unique(time), np.unique(trial_id), spike_train.T,
                   cmap='viridis')
axes[0].set_xlabel('Time')
axes[0].set_ylabel('Trials')
axes[0].set_title('Simulated Spikes')

conditional_intensity = fit.mu
axes[1].plot(np.unique(time), firing_rate[:, 0],
             linestyle='--')  # call truncated in the source; closed minimally here
def _fit_unmatched_regression(self, statmatch):
    # newer statsmodels expects a link instance rather than the class
    family = families.Binomial(families.links.probit())
    reg = GLM(statmatch.treated, statmatch.design_matrix, family=family)
    return reg.fit()
def fit_speed_model(speed, lagged_speed):
    response, design_matrix = dmatrices(
        FORMULA, dict(speed=speed, lagged_speed=lagged_speed))
    results = GLM(response, design_matrix, family=FAMILY).fit()
    return results.params, results.scale
def fit_speed_model(speed, lagged_speed):
    FORMULA = 'speed ~ lagged_speed - 1'
    response, design_matrix = dmatrices(
        FORMULA, dict(speed=speed, lagged_speed=lagged_speed))
    # newer statsmodels expects a link instance rather than the class
    family = families.Gaussian(link=families.links.log())
    return GLM(response, design_matrix, family=family).fit()
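# Illustrative call to fit_speed_model above. The log link means
# E[speed] = exp(beta * lagged_speed), so the speeds are simulated positive;
# all values here are made up for the sketch.
import numpy as np

lagged_speed = np.random.uniform(0.1, 2.0, size=1000)
speed = np.exp(0.5 * lagged_speed) + np.random.normal(0, 0.05, size=1000)

results = fit_speed_model(speed, lagged_speed)
print(results.params)  # close to [0.5]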
class OOF(object):
    """Out-of-fold prediction.

    # TODO: support regression

    Notes (translated from the original Chinese comments):
    - LightGBM grows trees leaf by leaf; XGBoost grows them level by level.
      https://blog.csdn.net/friyal/article/details/82758532
    - CatBoost always builds complete binary trees with mirrored nodes
      (oblivious/symmetric trees), which it claims helps avoid overfitting,
      improves reliability, and greatly speeds up prediction.
    - CatBoost turns each category's frequency of occurrence, plus a prior
      hyperparameter, into new numerical features.
      https://blog.csdn.net/linxid/article/details/80723811
    """
    _params = {
        'metric': 'auc',
        'learning_rate': 0.01,
        'n_estimators': 30000,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'class_weight': 'balanced',
        'scale_pos_weight': 1,
        'random_state': 2019,
        'verbosity': -1
    }
    lgb = LGBMClassifier(n_jobs=16, **_params)  # TODO: move the stock models into their own module
    xgb = XGBClassifier()
    cat = CatBoostClassifier(n_estimators=20000,
                             learning_rate=0.05,
                             loss_function='Logloss',
                             eval_metric='AUC',
                             random_state=2019)

    def __init__(self, estimator=None, folds=None, early_stopping_rounds=300,
                 verbose=100):
        # lgb takes `metric`; xgb takes `eval_metric`
        self.estimator = self.lgb if estimator is None else estimator
        # shuffle/random_state must be keywords in recent scikit-learn;
        # RepeatedStratifiedKFold is also supported
        self.folds = folds if folds else StratifiedKFold(
            5, shuffle=True, random_state=2019)
        self.model_type = self.estimator.__repr__()
        self.early_stopping_rounds = early_stopping_rounds
        self.verbose = verbose
        # self.estimator_agrs = self.getfullargspec(self.estimator.fit).args if hasattr(self.estimator, 'fit') else None

    def fit(self, X, y, X_test, feval=None, cat_feats=None,
            exclude_columns=None, epochs=16, batch_size=128, oof2csv=False,
            plot=False):
        """
        # TODO: rank-based blending
        :param X: index must be unique
        :param y:
        :param X_test:
        :param feval: e.g. roc_auc_score(y_true, y_score)
        :param cat_feats: categorical feature indices
        :param exclude_columns: only used by the nn models
        :param epochs:
        :param batch_size:
        :return:
        """
        # coerce the inputs to DataFrames
        if isinstance(y, pd.Series):
            y.reset_index(drop=True, inplace=True)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            # the original wrapped X here, which would have scored the
            # training set instead of the test set
            X_test = pd.DataFrame(X_test)
        else:
            X.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)

        # OOF scoring function
        feval = feval if feval else roc_auc_score

        # drop excluded features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
            X, X_test = X[feats], X_test[feats]

        # score name
        if hasattr(feval, '__repr__'):
            score_name = feval.__repr__().split()[1]
        else:
            score_name = None

        # number of CV folds
        if hasattr(self.folds, 'n_splits'):
            num_cv = self.folds.n_splits
        else:
            num_cv = self.folds.cvargs['n_splits'] * self.folds.n_repeats

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(X.shape[0])
        sub_preds = np.zeros((X_test.shape[0], num_cv))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx, valid_idx) in enumerate(
                self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" %
                  (n_fold, time.ctime()))
            X_train, y_train = X.iloc[train_idx], y[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

            if not hasattr(self.estimator, 'fit'):
                print("The estimator has no fit method")
                break

            eval_set = [(X_train, y_train), (X_valid, y_valid)]
            if 'LGBMClassifier' in self.model_type:
                self.estimator.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    categorical_feature=cat_feats if cat_feats else 'auto',
                    eval_metric='auc',
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)
            elif 'LGBMRegressor' in self.model_type:
                # reg_objs = ['regression_l1', 'regression_l2', 'huber', 'fair', 'poisson', 'quantile', 'mape', 'gamma', 'tweedie']
                self.estimator.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    categorical_feature=cat_feats if cat_feats else 'auto',
                    # eval_metric='l2',
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)
            elif 'XGBClassifier' in self.model_type:
                self.estimator.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    eval_metric='auc',
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)
            elif 'XGBRegressor' in self.model_type:
                self.estimator.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    # eval_metric='rmse',
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)
            elif 'CatBoostClassifier' in self.model_type:
                self.estimator.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    cat_features=cat_feats,
                    use_best_model=True,
                    plot=True,
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)
            elif 'CatBoostRegressor' in self.model_type:
                self.estimator.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    cat_features=cat_feats,
                    use_best_model=True,
                    plot=True,
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)
            elif 'RGFClassifier' in self.model_type:
                pass
            elif 'RGFRegressor' in self.model_type:
                pass
            # https://www.cnblogs.com/flyu6/p/7691106.html
            elif 'KerasClassifier' in self.model_type:
                self.estimator.fit(X_train, y_train,
                                   epochs=epochs,
                                   batch_size=batch_size,
                                   validation_data=eval_set)
            elif 'KerasRegressor' in self.model_type:
                self.estimator.fit(X_train, y_train,
                                   epochs=epochs,
                                   batch_size=batch_size,
                                   validation_data=eval_set)
            elif self.model_type == 'GLM':
                # TODO: support other model types
                # The original overwrote self.estimator with its own
                # train-set predictions; fitting the fold and predicting
                # valid/test directly is the likely intent (assumption).
                glm_results = GLM(y_train, X_train,
                                  family=families.Binomial()).fit()
                oof_preds[valid_idx] = glm_results.predict(X_valid)
                sub_preds[:, n_fold - 1] = glm_results.predict(X_test)
                continue
            else:
                # plain sklearn estimator
                print('Sklearn Fitting ...')
                self.estimator.fit(X_train, y_train)

            # compute and store the predictions
            # TODO: adjust for multiclass problems
            if hasattr(self.estimator, 'predict_proba'):
                oof_preds[valid_idx] = self.estimator.predict_proba(
                    X_valid)[:, 1]
                sub_preds[:, n_fold - 1] = self.estimator.predict_proba(
                    X_test)[:, 1]
            else:
                oof_preds[valid_idx] = self.estimator.predict(X_valid)
                sub_preds[:, n_fold - 1] = self.estimator.predict(X_test)

            if plot and hasattr(self.estimator, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = X.columns
                fold_importance_df["importance"] = \
                    self.estimator.feature_importances_
                fold_importance_df["fold"] = n_fold
                # DataFrame.append was removed in pandas 2.0
                self.feature_importance_df = pd.concat(
                    [fold_importance_df, self.feature_importance_df])

        # expose the results
        self.oof_preds = oof_preds
        self.sub_preds = sub_preds.mean(1)
        self.sub_preds_rank = pd.DataFrame(sub_preds).rank().mean(
            1) / sub_preds.shape[0]

        # score the OOF predictions
        try:
            self.score = feval(y, self.oof_preds)
        except Exception as e:
            self.score = 0
            print('Error feval:', e)
        print("\n\033[94mCV Score %s: %s ended at %s\033[0m" %
              (score_name, self.score, time.ctime()))

        # save the plain-average predictions
        if oof2csv:
            pd.Series(np.append(self.oof_preds, self.sub_preds),
                      name='oof').to_csv(
                          'OOF %s %.4f.csv' % (time.ctime(), self.score),
                          index=False)

        # optionally plot feature importances
        if plot:
            self.feature_importance_df.sort_values(
                ['fold', 'importance'], ascending=False, inplace=True)
            self.plot_importances(self.feature_importance_df, len(X.columns))

    def plot_importances(self, df, topk=64):
        """Display/plot feature importance."""
        assert "feature" in df.columns and "importance" in df.columns, \
            'df needs ["feature", "importance"] columns'
        data = (df[["feature", "importance"]]
                .groupby("feature")
                .mean()
                .reset_index()
                .sort_values("importance", ascending=False))[:topk]
        self.feature_importance_df_agg = data
        plt.figure(figsize=(12, topk // 4))
        sns.barplot(x="importance", y="feature",
                    data=data.assign(feature='col_' + data.feature.astype(str)))
        plt.title('Features (avg over folds)')
        plt.tight_layout()
        plt.savefig('importances.png')
# Logistic regression of sex on height and weight
# Sex is coded in the binary variable `male`.

# LHS binary variable
male = (heights_weights['Gender'] == 'Male') * 1

# Matrix of predictor variables: height and weight from the data frame
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial family w/ the logit link.
# Note I have to add constants to the `exog` matrix. The prepend=True
# argument prevents a warning about a future change to the default argument.
logit_model = GLM(male,
                  sm.add_constant(hw_exog, prepend=True),
                  family=sm.families.Binomial(sm.families.links.logit()))
logit_fit = logit_model.fit()  # fit once and reuse, rather than refitting
logit_fit.summary()

# Get the coefficient parameters.
logit_pars = logit_fit.params

# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend=True))
logit_fit2 = logit_model2.fit()
logit_fit2.summary()

# Get the coefficient parameters
logit_pars2 = logit_fit2.params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM': logit_pars, 'Logit': logit_pars2})
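# Self-contained check of the GLM/Logit equivalence claimed above, using
# synthetic data (the heights_weights frame is not defined in this snippet).
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
exog = sm.add_constant(rng.normal(size=(300, 2)), prepend=True)
prob = 1.0 / (1.0 + np.exp(-exog @ np.array([0.3, 1.0, -0.7])))
endog = rng.binomial(1, prob)

glm_pars = sm.GLM(endog, exog, family=sm.families.Binomial()).fit().params
logit_pars = sm.Logit(endog, exog).fit(disp=0).params
print(np.allclose(glm_pars, logit_pars, atol=1e-4))  # should print True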
X = pd.DataFrame({
    '$R$': race,
    '$I$': income,
    '$C$': crime,
    '$E$': industry,
    '$N$': neighborhood
})
X.corr()

from statsmodels.api import GLM
import statsmodels.api as sm

X['$1/I$'] = 1. / X['$I$']
model = GLM(X['$C$'], X[['$1/I$']], family=sm.families.Gamma())
result = model.fit()
result.summary()

races = {0: 'african-american', 1: 'hispanic', 2: 'asian', 3: 'white'}
X['race'] = X['$R$'].map(races)
race_dummies = pd.get_dummies(X['race'])
X[race_dummies.columns] = race_dummies

X_restricted = X[X['$E$'] == 0]
model = OLS(X_restricted['$C$'], X_restricted[race_dummies.columns])
result = model.fit()
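# Standalone sketch of the Gamma GLM above on synthetic data (all names and
# values assumed). With the family's default inverse-power link,
# E[C] = 1 / (beta * (1/I)) = I / beta, so the fit should recover a beta
# of about 0.5 here.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(2)
income = rng.uniform(1.0, 10.0, size=400)
mean_crime = income / 0.5
crime = rng.gamma(shape=2.0, scale=mean_crime / 2.0)

model = sm.GLM(crime, pd.DataFrame({'inv_income': 1.0 / income}),
               family=sm.families.Gamma())
print(model.fit().params)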
class LogitRegression(object):
    """Patsy wrapper for logit model estimation and prediction.

    Example usage:

        # construct and estimate model using patsy formula
        # uses the cps pickle file under dataset processor
        cps["EarnedWage"] = (cps.WageIncomeLastYear > 0).astype(int)
        model = LogitRegression(
            "EarnedWage ~ C(Race)", cps, freq_weights=cps.Weight
        )

        # print model summary
        print(model)

        # return predicted probability of working for blacks
        prob_works = model.predict(
            pandas.DataFrame({
                "Race": ["Black"]
            })
        )
    """

    def __init__(self, formula=None, data=None, link=logit, **kwargs):
        if formula:
            y, X = patsy.dmatrices(formula, data, 1)
            self._y_design_info = y.design_info
            self._X_design_info = X.design_info
            self._model = GLM(y, X, family=Binomial(link), **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._link = link
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._link = link

    def __repr__(self):
        return str(self._fit.summary()) if self._fit \
            else "Logistic regression"

    def predict(self, data, linear=False):
        if len(data) == 0:
            return []
        (X, ) = patsy.build_design_matrices([self._X_design_info], data)
        if not linear:
            return self._link.inverse(
                self._link(), linear_transform(numpy.asarray(X), self._betas))
        else:
            return linear_transform(numpy.asarray(X), self._betas)

    def draw(self, data, rand_engine):
        prediction = self.predict(data)
        return rand_engine.binomial(1, prediction)

    def to_pickle(self, filename):
        pickle.dump((self._y_design_info, self._X_design_info, self._betas,
                     self._link), open(filename, "wb"))

    @staticmethod
    def read_pickle(filename):
        y_design_info, X_design_info, betas, link = pickle.load(
            open(filename, "rb"))
        logit_regression = LogitRegression()
        logit_regression._y_design_info = y_design_info
        logit_regression._X_design_info = X_design_info
        logit_regression._betas = betas
        logit_regression._link = link
        return logit_regression

    def __add__(self, other):
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret
class LogitRegression(object):
    """Patsy wrapper for logit model estimation and prediction.

    Example usage:

        # construct and estimate model using patsy formula
        # uses the cps pickle file under dataset processor
        cps["EarnedWage"] = (cps.WageIncomeLastYear > 0).astype(int)
        model = LogitRegression(
            "EarnedWage ~ C(Race)", cps, freq_weights=cps.Weight
        )

        # print model summary
        print(model)

        # return predicted probability of working for blacks
        prob_works = model.predict(
            pd.DataFrame({
                "Race": ["Black"]
            })
        )
    """

    def __init__(self, formula=None, data=None, **kwargs):
        # convert all variables raised to a power to float64; this prevents
        # mis-specification of probabilities when the original variable was
        # stored as a narrower integer/float and overflows
        # (guard on `formula`, since the original ran the regex even when
        # formula was None, which would raise a TypeError)
        if formula and isinstance(data, pd.DataFrame):
            power_vars = list(set(
                re.findall(r'(?<=power\().+?(?=,)', formula)))
            for var in power_vars:
                data[var] = data[var].astype('float64')
        if formula:
            y, X = patsy.dmatrices(formula, data, 1)
            self._y_design_info = y.design_info
            self._X_design_info = X.design_info
            self._model = GLM(y, X, family=Binomial(), **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._link = logit
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._link = logit

    def __repr__(self):
        return str(self._fit.summary()) if self._fit \
            else "Logistic regression"

    def predict(self, data, linear=False):
        if len(data) == 0:
            return []
        # identify power-transformed variables from the design matrix (via
        # the 'power' flag) and convert them to float64; this prevents
        # mis-specification of probabilities in cases of variable overflow
        # (if the original var was compressed to a narrower integer/float)
        power_vars = list(set(
            re.search(r'(?<=power\().+?(?=,)', column).group()
            for column in self._X_design_info.column_names
            if 'power' in column))
        for var in power_vars:
            data[var] = data[var].astype('float64')
        (X, ) = patsy.build_design_matrices([self._X_design_info], data)
        if not linear:
            return self._link.inverse(
                self._link(), linear_transform(np.asarray(X), self._betas))
        else:
            return linear_transform(np.asarray(X), self._betas)

    def draw(self, data, rand_engine):
        prediction = self.predict(data)
        return rand_engine.binomial(1, prediction)

    def to_pickle(self, filename):
        with open(filename, "wb") as f:
            pickle.dump((self._y_design_info, self._X_design_info,
                         self._betas, self._link), f)

    @staticmethod
    def read_pickle(filename):
        y_design_info, X_design_info, betas, link = pickle.load(
            open(filename, "rb"))
        logit_regression = LogitRegression()
        logit_regression._y_design_info = y_design_info
        logit_regression._X_design_info = X_design_info
        logit_regression._betas = betas
        logit_regression._link = link
        return logit_regression

    def __add__(self, other):
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret
class OOF(object):
    """Out-of-fold prediction.

    # TODO: support regression

    Notes (translated from the original Chinese comments):
    - LightGBM grows trees leaf by leaf; XGBoost grows them level by level.
      https://blog.csdn.net/friyal/article/details/82758532
    - CatBoost always builds complete binary trees with mirrored nodes
      (oblivious/symmetric trees), which it claims helps avoid overfitting,
      improves reliability, and greatly speeds up prediction.
    - CatBoost turns each category's frequency of occurrence, plus a prior
      hyperparameter, into new numerical features.
      https://blog.csdn.net/linxid/article/details/80723811
    """
    _params = {'metric': 'auc',
               'learning_rate': 0.01,
               'n_estimators': 30000,
               'subsample': 0.8,
               'colsample_bytree': 0.8,
               'class_weight': 'balanced',
               'scale_pos_weight': 1,
               'random_state': 2019,
               'verbosity': -1}
    lgb = LGBMClassifier(n_jobs=16, **_params)
    xgb = XGBClassifier()
    cat = CatBoostClassifier(n_estimators=20000, learning_rate=0.05,
                             loss_function='Logloss', eval_metric='AUC',
                             random_state=2019)

    def __init__(self, clf=None, folds=None):
        self.clf = clf if clf else self.lgb
        # shuffle/random_state must be keywords in recent scikit-learn;
        # RepeatedStratifiedKFold is also supported
        self.folds = folds if folds else StratifiedKFold(
            5, shuffle=True, random_state=2019)
        self.model_type = self.clf.__repr__()
        # self.clf_agrs = self.getfullargspec(self.clf.fit).args if hasattr(self.clf, 'fit') else None

    def fit(self, X, y, X_test, feval=None, cat_feats=None,
            exclude_columns=None, epochs=16, batch_size=128, oof2csv=None):
        """
        # TODO: rank-based blending
        :param X:
        :param y:
        :param X_test:
        :param feval: e.g. roc_auc_score(y_true, y_score)
        :param cat_feats: categorical feature indices
        :param exclude_columns: only used by the nn models
        :param epochs:
        :param batch_size:
        :return:
        """
        # OOF scoring function
        feval = feval if feval else roc_auc_score
        # drop excluded features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
        else:
            feats = X.columns
        X, X_test = X[feats], X_test[feats]

        if hasattr(self.folds, 'n_splits'):
            num_folds = self.folds.n_splits
        else:
            num_folds = self.folds.cvargs['n_splits']

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(len(X))
        sub_preds = np.zeros(len(X_test))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx, valid_idx) in enumerate(
                self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" %
                  (n_fold, time.ctime()))
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

            if not hasattr(self.clf, 'fit'):
                print("The estimator has no fit method")
                break

            eval_set = [(X_train, y_train), (X_valid, y_valid)]
            if 'LGBMClassifier' in self.model_type:
                self.clf.fit(X_train, y_train, eval_set=eval_set,
                             categorical_feature=cat_feats if cat_feats else 'auto',
                             eval_metric='auc',
                             early_stopping_rounds=100, verbose=100)
            elif 'LGBMRegressor' in self.model_type:
                self.clf.fit(X_train, y_train, eval_set=eval_set,
                             categorical_feature=cat_feats if cat_feats else 'auto',
                             eval_metric='l2',
                             early_stopping_rounds=100, verbose=100)
            elif 'XGBClassifier' in self.model_type:
                self.clf.fit(X_train, y_train, eval_set=eval_set,
                             eval_metric='auc',
                             early_stopping_rounds=100, verbose=100)
            elif 'XGBRegressor' in self.model_type:
                self.clf.fit(X_train, y_train, eval_set=eval_set,
                             eval_metric='rmse',
                             early_stopping_rounds=100, verbose=100)
            elif 'CatBoostClassifier' in self.model_type:
                self.clf.fit(X_train, y_train, eval_set=eval_set,
                             cat_features=cat_feats, use_best_model=True,
                             plot=True,
                             early_stopping_rounds=100, verbose=100)
            elif 'CatBoostRegressor' in self.model_type:
                self.clf.fit(X_train, y_train, eval_set=eval_set,
                             cat_features=cat_feats, use_best_model=True,
                             plot=True,
                             early_stopping_rounds=100, verbose=0)
            elif 'RGFClassifier' in self.model_type:
                pass
            elif 'RGFRegressor' in self.model_type:
                pass
            # https://www.cnblogs.com/flyu6/p/7691106.html
            elif 'KerasClassifier' in self.model_type:
                self.clf.fit(X_train, y_train, epochs=epochs,
                             batch_size=batch_size, validation_data=eval_set)
            elif 'KerasRegressor' in self.model_type:
                self.clf.fit(X_train, y_train, epochs=epochs,
                             batch_size=batch_size, validation_data=eval_set)
            elif self.model_type == 'GLM':
                # TODO: support other model types
                # The original overwrote self.clf with its own train-set
                # predictions; fitting the fold and predicting valid/test
                # directly is the likely intent (assumption).
                glm_results = GLM(y_train, X_train,
                                  family=families.Binomial()).fit()
                oof_preds[valid_idx] = glm_results.predict(X_valid)
                sub_preds += glm_results.predict(X_test) / num_folds
                continue
            else:
                # plain sklearn estimator; the original fit on the full
                # (X, y), which leaks the validation fold -- training on the
                # fold matches the other branches (assumption of intent)
                self.clf.fit(X_train, y_train)

            # compute and store the predictions
            # TODO: adjust for multiclass problems
            if hasattr(self.clf, 'predict_proba'):
                oof_preds[valid_idx] = self.clf.predict_proba(X_valid)[:, 1]
                sub_preds += self.clf.predict_proba(X_test)[:, 1] / num_folds
            else:
                oof_preds[valid_idx] = self.clf.predict(X_valid)
                sub_preds += self.clf.predict(X_test) / num_folds

            if hasattr(self.clf, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = feats
                fold_importance_df["importance"] = self.clf.feature_importances_
                fold_importance_df["fold"] = n_fold
                self.feature_importance_df = pd.concat(
                    [self.feature_importance_df, fold_importance_df], axis=0)

        try:
            score = feval(y, oof_preds)
            score_name = feval.__repr__().split()[1]
        except Exception as e:
            score = score_name = None
            print('Error feval:', e)
        print("\n\033[94mOOF %s: %s end at %s\n\033[0m" %
              (score_name, score, time.ctime()))

        if hasattr(self.clf, 'feature_importances_'):
            self.plot_importances(self.feature_importance_df)

        self.oof_preds = oof_preds
        self.test_preds = sub_preds
        if oof2csv:
            pd.Series(oof_preds.tolist() + sub_preds.tolist(),
                      name='oof').to_csv(oof2csv + time.ctime(), index=False)
        return oof_preds, sub_preds

    def plot_importances(self, df, topk=64):
        """Display/plot feature importance."""
        assert "feature" in df.columns and "importance" in df.columns, \
            'df needs ["feature", "importance"] columns'
        data = (df[["feature", "importance"]]
                .groupby("feature")
                .mean()
                .reset_index()
                .sort_values("importance", ascending=False))[:topk]
        plt.figure(figsize=(12, int(topk / 4)))
        sns.barplot(x="importance", y="feature",
                    data=data.assign(feature=data.feature.astype(str)))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        plt.savefig('lgbm_importances.png')