コード例 #1
0
ファイル: models.py プロジェクト: DiogoFerrari/pandasci
 def __run_binomial__(self, *args, **kws):
     """Fit one binomial GLM per formula and collect the results in a table.

     Keyword arguments read here:
         formulas: mapping of model label -> formula. When missing or
             empty, a single model labelled 'Model 1' is built from
             ``formula``.
         formula: formula used only when ``formulas`` is not supplied.
         family: copied verbatim into the 'family' column (defaults to
             'gaussian'); the fit itself always uses ``family.Binomial()``.

     Returns:
         A DataFrame with one row per fitted model holding the label,
         formula, model and fit objects, the three summaries, and the
         nobs / AIC / BIC / deviance-based r2 statistics of the fit.
     """
     model_specs = kws.get('formulas', None) or {'Model 1': kws.get('formula', None)}
     family_label = kws.get("family", 'gaussian')

     results = pd.DataFrame()
     for name, spec in model_specs.items():
         model = glm(spec, data=self.data, family=family.Binomial())
         fitted = model.fit()
         row = pd.DataFrame({
             'label': name,
             "formula": spec,
             'family': family_label,
             # Objects are wrapped in one-element lists so each occupies
             # a single cell of the one-row frame.
             "mod": [model],
             "fit": [fitted],
             "summ1": [self.__get_summary1__(fitted)],
             "summ2": [self.__get_summary2__(fitted)],
             "summ3": [self.__get_summary3__(model, fitted)],
             'Obs': fitted.nobs,
             'aic': fitted.aic,
             'bic': fitted.bic,
             # One minus the ratio of residual to null deviance.
             'r2': 1 - (fitted.deviance / fitted.null_deviance),
             # 'rmse':np.sqrt(np.mean((self['y']-fit.predict())**2))
         })
         results = pd.concat([results, row], axis=0, ignore_index=True)
     return results
from functools import partial
from logging import getLogger

import numpy as np
import pandas as pd
from patsy import NAAction, build_design_matrices, dmatrices
from regularized_glm import penalized_IRLS
from scipy.special import logsumexp
from statsmodels.api import families
from statsmodels.tsa.tsatools import lagmat

# Binomial GLM family instance, created once when the module is imported.
# NOTE(review): presumably shared by the GLM fits in this module -- its
# users are not visible in this excerpt.
FAMILY = families.Binomial()

# Module-level logger named after this module.
logger = getLogger(__name__)


def fit_discrete_state_transition(speed,
                                  is_replay,
                                  penalty=1E-5,
                                  speed_knots=None,
                                  diagonal=None):
    """Estimate the predicted probablity of replay given speed and whether
    it was a replay in the previous time step.

    p(I_t | I_t-1, v_t-1)

    p_I_0, p_I_1 in Long Tao's code

    Parameters
    ----------
    speed : ndarray, shape (n_time,)
コード例 #3
0
ファイル: OOF.py プロジェクト: Bayeshijiu/DataMining
    def fit(self,
            X,
            y,
            X_test,
            feval=None,
            cat_feats=None,
            exclude_columns=None,
            epochs=16,
            batch_size=128,
            oof2csv=False,
            plot=False):
        """Run cross-validated training and collect OOF / test predictions.

        # TODO: rank-based blending

        :param X: training features; converted to a DataFrame if needed,
            otherwise its index is reset in place
        :param y: training target; a Series has its index reset in place
        :param X_test: test features, predicted once per fold
        :param feval: metric called as ``feval(y_true, y_score)``;
            defaults to ``roc_auc_score``
        :param cat_feats: indices of the categorical features
        :param exclude_columns: columns dropped from both X and X_test
        :param epochs: Keras models only
        :param batch_size: Keras models only
        :param oof2csv: when truthy, write the OOF predictions followed by
            the averaged test predictions to a CSV file
        :param plot: when truthy, collect per-fold feature importances and
            pass them to ``self.plot_importances``
        :return: None. Results are stored on ``self`` as ``oof_preds``,
            ``sub_preds``, ``sub_preds_rank``, ``score`` and
            ``feature_importance_df``.
        """
        # Normalise the inputs to positionally indexed pandas objects.
        if isinstance(y, pd.Series):
            y.reset_index(drop=True, inplace=True)

        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            # Bug fix: this used to be pd.DataFrame(X), which silently
            # replaced the test set with the training data.
            X_test = pd.DataFrame(X_test)
        else:
            X.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)

        # OOF evaluation function
        feval = feval if feval else roc_auc_score

        # Drop unwanted features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
            X, X_test = X[feats], X_test[feats]

        # Metric name for the log line: second token of the repr, e.g.
        # "<function roc_auc_score at 0x...>". Falls back to None when the
        # repr has a single token instead of raising IndexError.
        repr_tokens = repr(feval).split()
        score_name = repr_tokens[1] if len(repr_tokens) > 1 else None

        # Number of folds; splitters without ``n_splits`` are expected to
        # expose ``cvargs`` and ``n_repeats`` instead.
        if hasattr(self.folds, 'n_splits'):
            num_cv = self.folds.n_splits
        else:
            num_cv = self.folds.cvargs['n_splits'] * self.folds.n_repeats

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(X.shape[0])
        sub_preds = np.zeros((X_test.shape[0], num_cv))  # one column per fold
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx,
                     valid_idx) in enumerate(self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" %
                  (n_fold, time.ctime()))

            X_train, y_train = X.iloc[train_idx], y[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

            if not hasattr(self.estimator, 'fit'):
                print("该算法无fit方法")
                break

            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            if 'LGBMClassifier' in self.model_type:
                self.estimator.fit(
                    X_train,
                    y_train,
                    eval_set=eval_set,
                    categorical_feature=cat_feats if cat_feats else 'auto',
                    eval_metric='auc',
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)
            elif 'LGBMRegressor' in self.model_type:
                # reg_objs = ['regression_l1', 'regression_l2', 'huber', 'fair', 'poisson', 'quantile', 'mape', 'gamma', 'tweedie']
                self.estimator.fit(
                    X_train,
                    y_train,
                    eval_set=eval_set,
                    categorical_feature=cat_feats if cat_feats else 'auto',
                    # eval_metric='l2',
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)

            elif 'XGBClassifier' in self.model_type:
                self.estimator.fit(
                    X_train,
                    y_train,
                    eval_set=eval_set,
                    eval_metric='auc',
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)
            elif 'XGBRegressor' in self.model_type:
                self.estimator.fit(
                    X_train,
                    y_train,
                    eval_set=eval_set,
                    # eval_metric='rmse',
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)

            elif ('CatBoostClassifier' in self.model_type
                  or 'CatBoostRegressor' in self.model_type):
                # Both CatBoost variants are fitted with identical arguments.
                self.estimator.fit(
                    X_train,
                    y_train,
                    eval_set=eval_set,
                    cat_features=cat_feats,
                    use_best_model=True,
                    plot=True,
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=self.verbose)

            elif 'RGFClassifier' in self.model_type:
                pass
            elif 'RGFRegressor' in self.model_type:
                pass

            # https://www.cnblogs.com/flyu6/p/7691106.html
            elif ('KerasClassifier' in self.model_type
                  or 'KerasRegressor' in self.model_type):
                # Both Keras wrappers are fitted with identical arguments.
                self.estimator.fit(X_train,
                                   y_train,
                                   epochs=epochs,
                                   batch_size=batch_size,
                                   validation_data=eval_set)

            elif self.model_type == 'GLM':
                # TODO: support other model families
                # NOTE(review): this rebinds self.estimator to the value
                # returned by predict(X), so the predict calls below are
                # made on that value rather than on a model -- confirm
                # the intended behaviour before relying on this branch.
                self.estimator = GLM(y_train,
                                     X_train,
                                     family=families.Binomial())
                self.estimator = self.estimator.fit().predict(X)
            else:
                # Plain scikit-learn style estimator
                print('Sklearn Fitting ...')
                self.estimator.fit(X_train, y_train)

            # Compute and store the predictions
            # TODO: needs changes for multi-class problems
            if hasattr(self.estimator, 'predict_proba'):
                oof_preds[valid_idx] = self.estimator.predict_proba(
                    X_valid)[:, 1]
                sub_preds[:, n_fold -
                          1] = self.estimator.predict_proba(X_test)[:, 1]
            else:
                oof_preds[valid_idx] = self.estimator.predict(X_valid)
                sub_preds[:, n_fold - 1] = self.estimator.predict(X_test)

            if plot and hasattr(self.estimator, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = X.columns
                fold_importance_df[
                    "importance"] = self.estimator.feature_importances_
                fold_importance_df["fold"] = n_fold
                # DataFrame.append was removed in pandas 2.0; pd.concat
                # with the same operand order gives the same frame.
                self.feature_importance_df = pd.concat(
                    [fold_importance_df, self.feature_importance_df])

        # Publish the results
        self.oof_preds = oof_preds
        self.sub_preds = sub_preds.mean(1)
        self.sub_preds_rank = pd.DataFrame(sub_preds).rank().mean(
            1) / sub_preds.shape[0]  # auc work

        try:
            self.score = feval(y, self.oof_preds)
        except Exception as e:
            # Best effort: a failing metric must not discard the predictions.
            self.score = 0
            print('Error feval:', e)

        print("\n\033[94mCV Score %s: %s ended at %s\033[0m" %
              (score_name, self.score, time.ctime()))

        # Save the plain-average predictions (OOF followed by test)
        if oof2csv:
            pd.Series(np.append(self.oof_preds, self.sub_preds),
                      name='oof').to_csv('OOF %s %.4f.csv' %
                                         (time.ctime(), self.score),
                                         index=False)

        # Optionally plot the feature importances. Skipped when nothing was
        # collected (estimator without feature_importances_), where sorting
        # by the missing columns would raise KeyError.
        if plot and not self.feature_importance_df.empty:
            # axis/ascending must be keywords: pandas 2.0 made every
            # argument after ``by`` keyword-only.
            self.feature_importance_df.sort_values(by=['fold', 'importance'],
                                                   axis=0,
                                                   ascending=False,
                                                   inplace=True)
            self.plot_importances(self.feature_importance_df, len(X.columns))
コード例 #4
0
def predict(design_matrix, coefficients):
    """Map a linear predictor through the inverse Binomial link.

    The linear predictor is ``design_matrix @ coefficients`` after
    ``np.squeeze`` has removed length-one axes from ``coefficients``.
    The link is the default one of ``families.Binomial()``.
    """
    inverse_link = families.Binomial().link.inverse
    linear_predictor = design_matrix @ np.squeeze(coefficients)
    return inverse_link(linear_predictor)
コード例 #5
0
ファイル: __init__.py プロジェクト: mindis/tql-Python
    def fit(self, X, y, X_test, feval=None, cat_feats=None, exclude_columns=None, epochs=16, batch_size=128,
            oof2csv=None):
        """Run cross-validated training and return OOF / test predictions.

        # TODO: rank-based blending

        :param X: training features (indexed with ``.iloc`` and by column)
        :param y: training target (indexed with ``.iloc``)
        :param X_test: test features, predicted once per fold
        :param feval: metric called as ``feval(y_true, y_score)``;
            defaults to ``roc_auc_score``
        :param cat_feats: indices of the categorical features
        :param exclude_columns: columns dropped from both X and X_test
        :param epochs: Keras models only
        :param batch_size: Keras models only
        :param oof2csv: file-name prefix; when truthy the OOF predictions
            followed by the test predictions are written to
            ``oof2csv + time.ctime()``
        :return: tuple ``(oof_preds, sub_preds)``; the same arrays are also
            stored as ``self.oof_preds`` and ``self.test_preds``
        """
        # OOF evaluation function
        feval = feval if feval else roc_auc_score

        # Drop unwanted features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
        else:
            feats = X.columns

        X, X_test = X[feats], X_test[feats]

        if hasattr(self.folds, 'n_splits'):
            num_folds = self.folds.n_splits
        else:
            # Fix: a repeated splitter yields n_splits * n_repeats folds, so
            # the test-prediction average has to divide by that total.
            num_folds = self.folds.cvargs['n_splits'] * getattr(self.folds, 'n_repeats', 1)

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(len(X))
        sub_preds = np.zeros(len(X_test))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx, valid_idx) in enumerate(self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" % (n_fold, time.ctime()))

            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

            if not hasattr(self.clf, 'fit'):
                print("该算法无fit方法")
                break

            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            if 'LGBMClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             categorical_feature=cat_feats if cat_feats else 'auto',
                             eval_metric='auc',
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'LGBMRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             categorical_feature=cat_feats if cat_feats else 'auto',
                             eval_metric='l2',
                             early_stopping_rounds=100,
                             verbose=100)

            elif 'XGBClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             eval_metric='auc',
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'XGBRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             eval_metric='rmse',
                             early_stopping_rounds=100,
                             verbose=100)

            elif 'CatBoostClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             cat_features=cat_feats,
                             use_best_model=True,
                             plot=True,
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'CatBoostRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             cat_features=cat_feats,
                             use_best_model=True,
                             plot=True,
                             early_stopping_rounds=100,
                             verbose=0)

            elif 'RGFClassifier' in self.model_type:
                pass
            elif 'RGFRegressor' in self.model_type:
                pass

            # https://www.cnblogs.com/flyu6/p/7691106.html
            elif 'KerasClassifier' in self.model_type or 'KerasRegressor' in self.model_type:
                # Both Keras wrappers are fitted with identical arguments.
                self.clf.fit(X_train, y_train,
                             epochs=epochs,
                             batch_size=batch_size,
                             validation_data=eval_set)

            elif self.model_type == 'GLM':
                # TODO: support other model families
                # NOTE(review): this rebinds self.clf to the value returned
                # by predict(X), so the predict calls below are made on that
                # value rather than on a model -- confirm the intended
                # behaviour before relying on this branch.
                self.clf = GLM(y_train, X_train, family=families.Binomial())
                self.clf = self.clf.fit().predict(X)
            else:
                # Plain scikit-learn style estimator.
                # Bug fix: fit on the fold's training split only. Fitting on
                # the full (X, y) leaked the validation rows into the model
                # and made the out-of-fold predictions meaningless.
                self.clf.fit(X_train, y_train)

            # Compute and store the predictions
            # TODO: needs changes for multi-class problems
            if hasattr(self.clf, 'predict_proba'):
                oof_preds[valid_idx] = self.clf.predict_proba(X_valid)[:, 1]
                sub_preds += self.clf.predict_proba(X_test)[:, 1] / num_folds
            else:
                oof_preds[valid_idx] = self.clf.predict(X_valid)
                sub_preds += self.clf.predict(X_test) / num_folds

            if hasattr(self.clf, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = feats
                fold_importance_df["importance"] = self.clf.feature_importances_
                fold_importance_df["fold"] = n_fold
                # axis must be a keyword: pandas 2.0 made it keyword-only.
                self.feature_importance_df = pd.concat([self.feature_importance_df, fold_importance_df], axis=0)

        try:
            score = feval(y, oof_preds)
            score_name = feval.__repr__().split()[1]
        except Exception as e:
            # Best effort: a failing metric must not discard the predictions.
            score = score_name = None
            print('Error feval:', e)

        print("\n\033[94mOOF %s: %s end at %s\n\033[0m" % (score_name, score, time.ctime()))

        if hasattr(self.clf, 'feature_importances_'):
            self.plot_importances(self.feature_importance_df)

        self.oof_preds = oof_preds
        self.test_preds = sub_preds
        if oof2csv:
            pd.Series(oof_preds.tolist() + sub_preds.tolist(), name='oof').to_csv(oof2csv + time.ctime(), index=False)

        return oof_preds, sub_preds
コード例 #6
0
 def _fit_unmatched_regression(self, statmatch):
     """Fit a Binomial GLM with a probit link and return the fit result.

     The response is ``statmatch.treated`` and the regressors are
     ``statmatch.design_matrix``.
     """
     probit_binomial = families.Binomial(families.links.probit)
     model = GLM(statmatch.treated,
                 statmatch.design_matrix,
                 family=probit_binomial)
     return model.fit()