コード例 #1
0
    def __init__(self, formula=None, data=None, **kwargs):
        """Fit a binomial GLM from a patsy formula, or build an empty model.

        Parameters
        ----------
        formula : str, optional
            Patsy formula. When falsy, nothing is fitted and every fitted
            attribute is set to None.
        data : optional
            Data the formula is evaluated against. When it is a pandas
            DataFrame, the columns named inside ``power(...)`` terms of the
            formula are converted to float64 in place on the passed frame.
        **kwargs
            Forwarded unchanged to ``GLM``.
        """
        # Convert all variables raised to a power to float64. This prevents
        # mis-specification of probabilities in cases of variable overflow
        # (if the original var was compressed to a smaller bit integer/float).
        # The formula guard matters: re.findall raises TypeError on None, so
        # the "no formula" path used to crash whenever a DataFrame was passed.
        if formula and isinstance(data, pd.DataFrame):
            power_vars = set(re.findall(r'(?<=power\().+?(?=,)', formula))
            for var in power_vars:
                data[var] = data[var].astype('float64')

        if formula:
            y, X = patsy.dmatrices(formula, data, 1)
            self._y_design_info = y.design_info
            self._X_design_info = X.design_info
            self._model = GLM(y, X, family=Binomial(), **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._link = logit
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._link = logit
コード例 #2
0
def fit_reg(covariate, treated, weights=None):
    """Fit a GLM of ``covariate`` on ``treated`` plus a constant term.

    Parameters
    ----------
    covariate : array_like
        Response passed to ``GLM`` as the first argument.
    treated : array_like
        Regressor; a constant column is added with ``add_constant``.
    weights : optional
        Accepted for interface compatibility only. The original code built
        the same unweighted model in both of its branches, so this argument
        has never influenced the fit.

    Returns
    -------
    The object returned by ``GLM(...).fit()``.
    """
    # The default used to be ``pd.Series()``, which is evaluated once at
    # definition time and shared by every call; None avoids that.
    treated = add_constant(treated)
    return GLM(covariate, treated).fit()
コード例 #3
0
def fit_reg(covariate, treated, weights=None):
    """Fit a binomial (logit link) GLM of ``covariate`` on ``treated``.

    Parameters
    ----------
    covariate : array_like
        Response passed to ``GLM`` as the first argument.
    treated : array_like
        Regressor passed to ``GLM`` as the second argument.
    weights : optional
        Object with an ``any()`` method (e.g. a pandas Series). It is
        forwarded to ``GLM`` only when it has at least one truthy entry.

    Returns
    -------
    The object returned by ``GLM(...).fit()``.
    """
    link = families.links.logit
    family = families.Binomial(link)
    # The original test was inverted: the weights were forwarded only when
    # they were empty or all zero and ignored when actually supplied.
    if weights is not None and weights.any():
        # NOTE(review): ``sigma`` is kept from the original call; the
        # statsmodels GLM documentation lists ``freq_weights`` and
        # ``var_weights`` as its weight arguments -- confirm which is meant.
        reg = GLM(covariate, treated, family=family, sigma=weights)
    else:
        reg = GLM(covariate, treated, family=family)
    res = reg.fit()
    return res
コード例 #4
0
    def _fit_matched_regression(self, statmatch):
        """Fit a probit regression restricted to matched observations.

        Rows of ``statmatch.matches`` holding a finite value are the ones
        that have a match. The regression sample is the index of those rows
        followed by the match values themselves, cast to int32.

        Parameters
        ----------
        statmatch
            Object exposing ``matches``, ``treated`` and ``design_matrix``.

        Returns
        -------
        The object returned by ``GLM(...).fit()``.
        """
        has_match = np.isfinite(statmatch.matches)
        treated_index = has_match[has_match].index
        match_index = np.asarray(statmatch.matches[has_match], dtype=np.int32)
        regression_index = treated_index.append(match_index)

        link = families.links.probit
        family = families.Binomial(link)
        # ``.ix`` was removed in pandas 1.0; ``.loc`` is its label-based
        # replacement.
        # NOTE(review): assumes the match values are index labels rather
        # than row positions -- confirm against the matching code.
        reg = GLM(statmatch.treated.loc[regression_index],
                  statmatch.design_matrix.loc[regression_index],
                  family=family)
        return reg.fit()
コード例 #5
0
    def _create_propensity_scores(self,
                                  treated,
                                  design_matrix,
                                  link_type='logit'):
        if link_type == 'logit':
            link = families.links.logit
        elif link_type == 'probit':
            link = families.links.probit

        family = families.Binomial(link)
        reg = GLM(treated, design_matrix, family=family)
        fitted_reg = reg.fit()
        return fitted_reg
コード例 #6
0
def estimate_movement_std(position_info):
    """Return the square root of the scale of a lag-1 Gaussian GLM.

    ``position`` is regressed on ``lagged_position`` without an intercept;
    both names are looked up in ``position_info`` by the formula.
    """
    response, design_matrix = dmatrices(
        'position ~ lagged_position - 1', position_info)
    model = GLM(response, design_matrix, family=families.Gaussian())
    return np.sqrt(model.fit().scale)
コード例 #7
0
ファイル: crystal.py プロジェクト: LeonardJ09/crystal
def one_cluster(formula,
                feature,
                covs,
                coef,
                method=OLS,
                _pat=re.compile(r"\+\s*CpG")):
    """Model a "cluster" that consists of a single probe.

    The ``+ CpG`` term is stripped from ``formula`` before fitting. For a
    ``CountFeature`` a Poisson GLM with ``counts`` as exposure is fitted on
    the rows with a positive count; otherwise ``method.from_formula`` is
    used on ``feature.values``.

    Parameters
    ----------
    formula : str
        Model formula; any ``+ CpG`` term is removed.
    feature
        Either a ``CountFeature`` (``methylated`` and ``counts`` are read)
        or an object exposing ``values``.
    covs
        Covariate table; it is copied, so the caller's object is untouched.
    coef
        Passed to ``get_ptc`` and reported as ``covar`` on failure.
    method
        Model class with a ``from_formula`` constructor (default ``OLS``).
    _pat
        Pre-compiled pattern for the ``+ CpG`` term; not meant to be passed.

    Returns
    -------
    The result of ``get_ptc``, or a dict of NaN values when the Poisson fit
    raises ``PerfectSeparationError``.
    """
    # The pattern is a raw string: "\+" and "\s" are invalid escapes in a
    # normal string literal and trigger a warning on current Python versions.
    c = covs.copy()
    # remove the CpG in the formula
    formula = _pat.sub("", formula)
    if isinstance(feature, CountFeature):
        c['methylation'] = feature.methylated
        c['counts'] = feature.counts
        c = c[c['counts'] > 0]
        try:
            return get_ptc(
                GLM.from_formula(formula,
                                 data=c,
                                 exposure=c['counts'],
                                 family=Poisson()).fit(), coef)
        except PerfectSeparationError:
            return dict(p=np.nan, t=np.nan, coef=np.nan, covar=coef)
    else:
        c['methylation'] = feature.values
        res = method.from_formula(formula, data=c).fit()
        return get_ptc(res, coef)
コード例 #8
0
def glm_fit(spikes, design_matrix, ind):
    '''Fits the Poisson model to the spikes from a neuron

    Parameters
    ----------
    spikes : array_like
        Must support ``reindex``; it is aligned to ``design_matrix.index``.
    design_matrix : array_like or pandas DataFrame
    ind : int
        Zero-based neuron index, used only in log and warning messages.

    Returns
    -------
    fitted_model : object or NaN
        Returns the statsmodel object if successful. If the model fails in
        the weighted fit in the IRLS procedure, the model returns NaN.

    '''
    logger.debug('\t\t...Neuron #{}'.format(ind + 1))
    try:
        # ``missing='drop'`` is the documented statsmodels option for
        # discarding rows with missing values; the previous ``drop='missing'``
        # keyword is not a GLM option.
        return GLM(spikes.reindex(design_matrix.index),
                   design_matrix,
                   family=families.Poisson(),
                   missing='drop').fit(maxiter=30)
    except np.linalg.LinAlgError:
        # Public path of the same exception class; ``np.linalg.linalg`` is
        # a private module.
        warn('Data is poorly scaled for neuron #{}'.format(ind + 1))
        return np.nan
コード例 #9
0
 def fit(self, treated, design_matrix, design_matrix_header):
     """Fit a logit propensity model and store its inputs and fitted values.

     ``treated`` is wrapped in a pandas Series and ``design_matrix`` in a
     DataFrame before fitting. On success the instance receives ``header``,
     ``treated``, ``design_matrix`` and ``pscore`` (the fitted values).
     """
     # Convert to pandas data structures
     treated_series = pd.Series(treated)
     design_frame = pd.DataFrame(design_matrix)

     # Fit the propensity score model
     logit_family = families.Binomial(families.links.logit)
     results = GLM(treated_series, design_frame, family=logit_family).fit()

     # Store values for later reference (only reached if the fit succeeded)
     self.header = design_matrix_header
     self.treated = treated_series
     self.design_matrix = design_frame
     self.pscore = results.fittedvalues
コード例 #10
0
    def __init__(self, formula=None, data=None, link=logit, **kwargs):
        """Fit a binomial GLM from a patsy formula, or build an empty model.

        With a falsy ``formula`` nothing is fitted: every fitted attribute
        is None and only ``link`` is remembered. Extra keyword arguments are
        forwarded to ``GLM``.
        """
        if not formula:
            # Unfitted placeholder.
            self._y_design_info = self._X_design_info = None
            self._model = self._fit = self._betas = None
            self._link = link
            return

        response, design = patsy.dmatrices(formula, data, 1)
        self._y_design_info = response.design_info
        self._X_design_info = design.design_info
        self._model = GLM(response, design, family=Binomial(link), **kwargs)
        self._fit = self._model.fit()
        self._betas = self._fit.params
        self._link = link
コード例 #11
0
def estimate_movement_variance(position, lagged_position, speed):
    """Return the square root of the scale of a lag-1 Gaussian GLM.

    ``position`` is regressed on ``lagged_position`` without an intercept.
    ``speed`` is accepted but not used by this function.
    """
    variables = dict(position=position, lagged_position=lagged_position)
    response, design_matrix = dmatrices(
        'position ~ lagged_position - 1', variables)
    results = GLM(response, design_matrix, family=families.Gaussian()).fit()
    return np.sqrt(results.scale)
コード例 #12
0
ファイル: sorted_spikes.py プロジェクト: madkinsz/mst-decoder
def fit_glm_model(spikes, design_matrix, penalty=1E-5):
    '''Fits a regularized Poisson model to the spikes from a neuron

    Parameters
    ----------
    spikes : array_like
    design_matrix : array_like or pandas DataFrame
        Must expose ``shape``; one penalty weight is created per column.
    penalty : float, optional
        Penalty weight applied to every column except the first, which is
        left unpenalized.

    Returns
    -------
    fitted_model : statsmodel results

    '''
    # ``missing='drop'`` is the documented statsmodels option for discarding
    # rows with missing values; the previous ``drop='missing'`` keyword is
    # not a GLM option.
    model = GLM(spikes,
                design_matrix,
                family=families.Poisson(),
                missing='drop')
    regularization_weights = np.ones((design_matrix.shape[1], )) * penalty
    regularization_weights[0] = 0.0
    return model.fit_regularized(alpha=regularization_weights, L1_wt=0)
コード例 #13
0
ファイル: crystal.py プロジェクト: LeonardJ09/crystal
def one_cluster(formula, feature, covs, coef, method=OLS,
                _pat=re.compile(r"\+\s*CpG")):
    """Model a "cluster" that consists of a single probe.

    The ``+ CpG`` term is stripped from ``formula`` before fitting. For a
    ``CountFeature`` a Poisson GLM with ``counts`` as exposure is fitted on
    the rows with a positive count; otherwise ``method.from_formula`` is
    used on ``feature.values``.

    Parameters
    ----------
    formula : str
        Model formula; any ``+ CpG`` term is removed.
    feature
        Either a ``CountFeature`` (``methylated`` and ``counts`` are read)
        or an object exposing ``values``.
    covs
        Covariate table; it is copied, so the caller's object is untouched.
    coef
        Passed to ``get_ptc`` and reported as ``covar`` on failure.
    method
        Model class with a ``from_formula`` constructor (default ``OLS``).
    _pat
        Pre-compiled pattern for the ``+ CpG`` term; not meant to be passed.

    Returns
    -------
    The result of ``get_ptc``, or a dict of NaN values when the Poisson fit
    raises ``PerfectSeparationError``.
    """
    # The pattern is a raw string: "\+" and "\s" are invalid escapes in a
    # normal string literal and trigger a warning on current Python versions.
    c = covs.copy()
    # remove the CpG in the formula
    formula = _pat.sub("", formula)
    if isinstance(feature, CountFeature):
        c['methylation'] = feature.methylated
        c['counts'] = feature.counts
        c = c[c['counts'] > 0]
        try:
            return get_ptc(GLM.from_formula(formula, data=c,
                                        exposure=c['counts'],
                                        family=Poisson()).fit(), coef)
        except PerfectSeparationError:
            return dict(p=np.nan, t=np.nan, coef=np.nan, covar=coef)
    else:
        c['methylation'] = feature.values
        res = method.from_formula(formula, data=c).fit()
        return get_ptc(res, coef)
def estimate_movement_std(position):
    '''Estimate a movement standard deviation for each position dimension.

    WARNING: Need to use on original position, not interpolated position.

    Rows containing a NaN are dropped. Then, for every column, a Gaussian
    GLM is fitted with the column minus its last sample as response and the
    column minus its first sample as regressor; the square root of the
    fitted scale is reported.

    Parameters
    ----------
    position : ndarray, shape (n_time, n_position_dim)

    Returns
    -------
    movement_std : ndarray, shape (n_position_dim,)

    '''
    position = atleast_2d(position)
    has_nan = np.any(np.isnan(position), axis=1)
    position = position[~has_nan]

    def _column_std(column):
        results = GLM(column[:-1], column[1:],
                      family=families.Gaussian()).fit()
        return np.sqrt(results.scale)

    return np.array([_column_std(column) for column in position.T])
コード例 #15
0
# Simulation size: number of time bins per trial and number of trials.
n_time, n_trials = 1500, 1000
# NOTE(review): SAMPLING_FREQUENCY duplicates sampling_frequency and is not
# used in the visible part of this script -- confirm before removing.
SAMPLING_FREQUENCY = 1500
sampling_frequency = 1500

# Firing rate starts at 5 Hz and switches to 10 Hz
# (first half of the time bins is 5, the rest stays at 10)
firing_rate = np.ones((n_time, n_trials)) * 10
firing_rate[:n_time // 2, :] = 5
spike_train = simulate_poisson_process(firing_rate, sampling_frequency)
# Time of each bin, repeated across trials -> array of (n_time, n_trials)
time = (np.arange(0, n_time)[:, np.newaxis] / sampling_frequency * np.ones(
    (1, n_trials)))
# Trial number of each bin, repeated across time -> (n_time, n_trials)
trial_id = (np.arange(n_trials)[np.newaxis, :] * np.ones((n_time, 1)))

# Fit a spline model to the firing rate
# (all trials are flattened into one long response vector)
design_matrix = dmatrix('bs(time, df=5)', dict(time=time.ravel()))
fit = GLM(spike_train.ravel(), design_matrix, family=families.Poisson()).fit()

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: the simulated spikes, time on x and trial on y.
axes[0].pcolormesh(np.unique(time),
                   np.unique(trial_id),
                   spike_train.T,
                   cmap='viridis')
axes[0].set_xlabel('Time')
axes[0].set_ylabel('Trials')
axes[0].set_title('Simulated Spikes')
# Fitted mean of the Poisson GLM for every flattened (time, trial) sample.
conditional_intensity = fit.mu

axes[1].plot(np.unique(time),
             firing_rate[:, 0],
             linestyle='--',
コード例 #16
0
 def _fit_unmatched_regression(self, statmatch):
     """Fit a probit GLM of ``statmatch.treated`` on its full design matrix."""
     probit_family = families.Binomial(families.links.probit)
     model = GLM(statmatch.treated,
                 statmatch.design_matrix,
                 family=probit_family)
     return model.fit()
コード例 #17
0
def fit_speed_model(speed, lagged_speed):
    """Fit the module-level ``FORMULA``/``FAMILY`` GLM to the speed data.

    Returns the fitted parameters and the fitted scale as a 2-tuple.
    """
    variables = {'speed': speed, 'lagged_speed': lagged_speed}
    response, design_matrix = dmatrices(FORMULA, variables)
    fitted = GLM(response, design_matrix, family=FAMILY).fit()
    return fitted.params, fitted.scale
コード例 #18
0
def fit_speed_model(speed, lagged_speed):
    """Fit a log-link Gaussian GLM of ``speed`` on ``lagged_speed``.

    The model has no intercept. The fitted results object is returned.
    """
    variables = {'speed': speed, 'lagged_speed': lagged_speed}
    response, design_matrix = dmatrices('speed ~ lagged_speed - 1', variables)
    log_gaussian = families.Gaussian(link=families.links.log)
    model = GLM(response, design_matrix, family=log_gaussian)
    return model.fit()
コード例 #19
0
ファイル: OOF.py プロジェクト: Bayeshijiu/DataMining
    def fit(self,
            X,
            y,
            X_test,
            feval=None,
            cat_feats=None,
            exclude_columns=None,
            epochs=16,
            batch_size=128,
            oof2csv=False,
            plot=False):
        """Train the estimator fold by fold and store OOF/test predictions.

        Results are stored on the instance: ``oof_preds``, ``sub_preds``
        (mean over folds), ``sub_preds_rank``, ``score`` and
        ``feature_importance_df``.

        # TODO: rank blending
        :param X: training features; the index must be unique. A DataFrame
            is re-indexed in place, anything else is wrapped in a DataFrame.
        :param y: training target; a Series is re-indexed in place.
        :param X_test: test features, handled like ``X``.
        :param feval: metric called as ``feval(y_true, y_score)``; defaults
            to ``roc_auc_score``.
        :param cat_feats: indices of the categorical features.
        :param exclude_columns: columns dropped from ``X`` and ``X_test``.
        :param epochs: used by the Keras branches only.
        :param batch_size: used by the Keras branches only.
        :param oof2csv: when true, write OOF and test predictions to a CSV.
        :param plot: when true, collect and plot feature importances.
        :return: None
        """
        # Normalise the inputs to DataFrames with a clean 0..n-1 index
        if isinstance(y, pd.Series):
            y.reset_index(drop=True, inplace=True)

        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            # Bug fix: this used to wrap X again, so the "test" predictions
            # were silently computed on the training data.
            X_test = pd.DataFrame(X_test)
        else:
            X.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)

        # OOF evaluation function
        feval = feval if feval else roc_auc_score

        # Remove unwanted features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
            X, X_test = X[feats], X_test[feats]

        # Score name, taken from the second token of the metric's repr
        if hasattr(feval, '__repr__'):
            score_name = feval.__repr__().split()[1]
        else:
            score_name = None

        # Number of CV rounds (repeated splitters expose cvargs/n_repeats)
        if hasattr(self.folds, 'n_splits'):
            num_cv = self.folds.n_splits
        else:
            num_cv = self.folds.cvargs['n_splits'] * self.folds.n_repeats

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(X.shape[0])
        sub_preds = np.zeros((X_test.shape[0], num_cv))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx,
                     valid_idx) in enumerate(self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" %
                  (n_fold, time.ctime()))

            X_train, y_train = X.iloc[train_idx], y[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

            if not hasattr(self.estimator, 'fit'):
                print("该算法无fit方法")
                break
            else:
                # Dispatch on the estimator's repr to pick fit arguments
                if 'LGBMClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        categorical_feature=cat_feats if cat_feats else 'auto',
                        eval_metric='auc',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'LGBMRegressor' in self.model_type:
                    # reg_objs = ['regression_l1', 'regression_l2', 'huber', 'fair', 'poisson', 'quantile', 'mape', 'gamma', 'tweedie']
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        categorical_feature=cat_feats if cat_feats else 'auto',
                        # eval_metric='l2',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'XGBClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        eval_metric='auc',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'XGBRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        # eval_metric='rmse',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'CatBoostClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        cat_features=cat_feats,
                        use_best_model=True,
                        plot=True,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'CatBoostRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        cat_features=cat_feats,
                        use_best_model=True,
                        plot=True,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'RGFClassifier' in self.model_type:
                    pass
                elif 'RGFRegressor' in self.model_type:
                    pass

                # https://www.cnblogs.com/flyu6/p/7691106.html
                elif 'KerasClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(X_train,
                                       y_train,
                                       epochs=epochs,
                                       batch_size=batch_size,
                                       validation_data=eval_set)
                elif 'KerasRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(X_train,
                                       y_train,
                                       epochs=epochs,
                                       batch_size=batch_size,
                                       validation_data=eval_set)

                elif self.model_type == 'GLM':
                    # TODO: support for other models
                    # NOTE(review): after this branch self.estimator holds
                    # the array returned by predict(X), so the predict calls
                    # below cannot work on it -- confirm the intended
                    # behaviour before relying on this branch.
                    self.estimator = GLM(y_train,
                                         X_train,
                                         family=families.Binomial())
                    self.estimator = self.estimator.fit().predict(X)
                else:
                    # Plain sklearn-style estimators
                    print('Sklearn Fitting ...')
                    self.estimator.fit(X_train, y_train)

                # Compute and store the predictions
                # TODO: needs changes for multi-class problems
                if hasattr(self.estimator, 'predict_proba'):
                    oof_preds[valid_idx] = self.estimator.predict_proba(
                        X_valid)[:, 1]
                    sub_preds[:, n_fold -
                              1] = self.estimator.predict_proba(X_test)[:, 1]
                else:
                    oof_preds[valid_idx] = self.estimator.predict(X_valid)
                    sub_preds[:, n_fold - 1] = self.estimator.predict(X_test)

            if plot and hasattr(self.estimator, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = X.columns
                fold_importance_df[
                    "importance"] = self.estimator.feature_importances_
                fold_importance_df["fold"] = n_fold
                # DataFrame.append was removed in pandas 2.0; concat keeps
                # the same row order (newest fold first).
                self.feature_importance_df = pd.concat(
                    [fold_importance_df, self.feature_importance_df])

        # Store the results
        self.oof_preds = oof_preds
        self.sub_preds = sub_preds.mean(1)
        self.sub_preds_rank = pd.DataFrame(sub_preds).rank().mean(
            1) / sub_preds.shape[0]  # auc work

        try:
            self.score = feval(y, self.oof_preds)
        except Exception as e:
            self.score = 0
            print('Error feval:', e)

        print("\n\033[94mCV Score %s: %s ended at %s\033[0m" %
              (score_name, self.score, time.ctime()))

        # The score in the file name is the one of the plain average
        if oof2csv:
            pd.Series(np.append(self.oof_preds, self.sub_preds),
                      name='oof').to_csv('OOF %s %.4f.csv' %
                                         (time.ctime(), self.score),
                                         index=False)

        # Optionally output the feature importances
        if plot:
            # Keyword arguments: pandas 2.0 made axis/ascending keyword-only
            self.feature_importance_df.sort_values(['fold', 'importance'],
                                                   axis=0,
                                                   ascending=False,
                                                   inplace=True)
            self.plot_importances(self.feature_importance_df, len(X.columns))
コード例 #20
0
ファイル: OOF.py プロジェクト: Bayeshijiu/DataMining
class OOF(object):
    """Out-of-fold prediction helper.

    Wraps an estimator and a cross-validation splitter; ``fit`` trains the
    estimator on every fold and stores out-of-fold and test predictions.

    # TODO: support regression

    Background notes kept from the original author:
    LightGBM builds trees node by node; XGBoost builds them level by level.
    https://blog.csdn.net/friyal/article/details/82758532
    CatBoost always uses complete binary trees whose nodes are mirrored
    (symmetric trees). CatBoost states that symmetric trees help to avoid
    overfitting, increase reliability and greatly speed up prediction.
        It computes how often a category occurs and, together with
        hyper-parameters, derives new numerical features from it.
    # https://blog.csdn.net/linxid/article/details/80723811
    """
    _params = {
        'metric': 'auc',
        'learning_rate': 0.01,
        'n_estimators': 30000,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'class_weight': 'balanced',
        'scale_pos_weight': 1,
        'random_state': 2019,
        'verbosity': -1
    }
    # Default estimators, created once when the class body is executed.
    lgb = LGBMClassifier(n_jobs=16, **_params)  # TODO: move common models to a separate module
    xgb = XGBClassifier()
    cat = CatBoostClassifier(n_estimators=20000,
                             learning_rate=0.05,
                             loss_function='Logloss',
                             eval_metric='AUC',
                             random_state=2019)

    def __init__(self,
                 estimator=None,
                 folds=None,
                 early_stopping_rounds=300,
                 verbose=100):
        """Store the estimator, the CV splitter and the fit options.

        :param estimator: model to train; defaults to the class-level ``lgb``.
        :param folds: CV splitter; defaults to a shuffled 5-fold
            ``StratifiedKFold`` with ``random_state=2019``.
        :param early_stopping_rounds: forwarded to boosting ``fit`` calls.
        :param verbose: forwarded to boosting ``fit`` calls.
        """
        # Metric is set via `metric` for lgb and `eval_metric` for xgb
        self.estimator = self.lgb if estimator is None else estimator
        # Keyword arguments: shuffle/random_state are keyword-only in
        # current scikit-learn. RepeatedStratifiedKFold is supported too.
        self.folds = folds if folds else StratifiedKFold(
            n_splits=5, shuffle=True, random_state=2019)
        # fit() dispatches on this repr string
        self.model_type = self.estimator.__repr__()

        self.early_stopping_rounds = early_stopping_rounds
        self.verbose = verbose
        # self.estimator_agrs = self.getfullargspec(self.estimator.fit).args if hasattr(self.estimator, 'fit') else None

    def fit(self,
            X,
            y,
            X_test,
            feval=None,
            cat_feats=None,
            exclude_columns=None,
            epochs=16,
            batch_size=128,
            oof2csv=False,
            plot=False):
        """Train the estimator fold by fold and store OOF/test predictions.

        Results are stored on the instance: ``oof_preds``, ``sub_preds``
        (mean over folds), ``sub_preds_rank``, ``score`` and
        ``feature_importance_df``.

        # TODO: rank blending
        :param X: training features; the index must be unique. A DataFrame
            is re-indexed in place, anything else is wrapped in a DataFrame.
        :param y: training target; a Series is re-indexed in place.
        :param X_test: test features, handled like ``X``.
        :param feval: metric called as ``feval(y_true, y_score)``; defaults
            to ``roc_auc_score``.
        :param cat_feats: indices of the categorical features.
        :param exclude_columns: columns dropped from ``X`` and ``X_test``.
        :param epochs: used by the Keras branches only.
        :param batch_size: used by the Keras branches only.
        :param oof2csv: when true, write OOF and test predictions to a CSV.
        :param plot: when true, collect and plot feature importances.
        :return: None
        """
        # Normalise the inputs to DataFrames with a clean 0..n-1 index
        if isinstance(y, pd.Series):
            y.reset_index(drop=True, inplace=True)

        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            # Bug fix: this used to wrap X again, so the "test" predictions
            # were silently computed on the training data.
            X_test = pd.DataFrame(X_test)
        else:
            X.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)

        # OOF evaluation function
        feval = feval if feval else roc_auc_score

        # Remove unwanted features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
            X, X_test = X[feats], X_test[feats]

        # Score name, taken from the second token of the metric's repr
        if hasattr(feval, '__repr__'):
            score_name = feval.__repr__().split()[1]
        else:
            score_name = None

        # Number of CV rounds (repeated splitters expose cvargs/n_repeats)
        if hasattr(self.folds, 'n_splits'):
            num_cv = self.folds.n_splits
        else:
            num_cv = self.folds.cvargs['n_splits'] * self.folds.n_repeats

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(X.shape[0])
        sub_preds = np.zeros((X_test.shape[0], num_cv))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx,
                     valid_idx) in enumerate(self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" %
                  (n_fold, time.ctime()))

            X_train, y_train = X.iloc[train_idx], y[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

            if not hasattr(self.estimator, 'fit'):
                print("该算法无fit方法")
                break
            else:
                # Dispatch on the estimator's repr to pick fit arguments
                if 'LGBMClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        categorical_feature=cat_feats if cat_feats else 'auto',
                        eval_metric='auc',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'LGBMRegressor' in self.model_type:
                    # reg_objs = ['regression_l1', 'regression_l2', 'huber', 'fair', 'poisson', 'quantile', 'mape', 'gamma', 'tweedie']
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        categorical_feature=cat_feats if cat_feats else 'auto',
                        # eval_metric='l2',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'XGBClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        eval_metric='auc',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'XGBRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        # eval_metric='rmse',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'CatBoostClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        cat_features=cat_feats,
                        use_best_model=True,
                        plot=True,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'CatBoostRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        cat_features=cat_feats,
                        use_best_model=True,
                        plot=True,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'RGFClassifier' in self.model_type:
                    pass
                elif 'RGFRegressor' in self.model_type:
                    pass

                # https://www.cnblogs.com/flyu6/p/7691106.html
                elif 'KerasClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(X_train,
                                       y_train,
                                       epochs=epochs,
                                       batch_size=batch_size,
                                       validation_data=eval_set)
                elif 'KerasRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(X_train,
                                       y_train,
                                       epochs=epochs,
                                       batch_size=batch_size,
                                       validation_data=eval_set)

                elif self.model_type == 'GLM':
                    # TODO: support for other models
                    # NOTE(review): after this branch self.estimator holds
                    # the array returned by predict(X), so the predict calls
                    # below cannot work on it -- confirm the intended
                    # behaviour before relying on this branch.
                    self.estimator = GLM(y_train,
                                         X_train,
                                         family=families.Binomial())
                    self.estimator = self.estimator.fit().predict(X)
                else:
                    # Plain sklearn-style estimators
                    print('Sklearn Fitting ...')
                    self.estimator.fit(X_train, y_train)

                # Compute and store the predictions
                # TODO: needs changes for multi-class problems
                if hasattr(self.estimator, 'predict_proba'):
                    oof_preds[valid_idx] = self.estimator.predict_proba(
                        X_valid)[:, 1]
                    sub_preds[:, n_fold -
                              1] = self.estimator.predict_proba(X_test)[:, 1]
                else:
                    oof_preds[valid_idx] = self.estimator.predict(X_valid)
                    sub_preds[:, n_fold - 1] = self.estimator.predict(X_test)

            if plot and hasattr(self.estimator, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = X.columns
                fold_importance_df[
                    "importance"] = self.estimator.feature_importances_
                fold_importance_df["fold"] = n_fold
                # DataFrame.append was removed in pandas 2.0; concat keeps
                # the same row order (newest fold first).
                self.feature_importance_df = pd.concat(
                    [fold_importance_df, self.feature_importance_df])

        # Store the results
        self.oof_preds = oof_preds
        self.sub_preds = sub_preds.mean(1)
        self.sub_preds_rank = pd.DataFrame(sub_preds).rank().mean(
            1) / sub_preds.shape[0]  # auc work

        try:
            self.score = feval(y, self.oof_preds)
        except Exception as e:
            self.score = 0
            print('Error feval:', e)

        print("\n\033[94mCV Score %s: %s ended at %s\033[0m" %
              (score_name, self.score, time.ctime()))

        # The score in the file name is the one of the plain average
        if oof2csv:
            pd.Series(np.append(self.oof_preds, self.sub_preds),
                      name='oof').to_csv('OOF %s %.4f.csv' %
                                         (time.ctime(), self.score),
                                         index=False)

        # Optionally output the feature importances
        if plot:
            # Keyword arguments: pandas 2.0 made axis/ascending keyword-only
            self.feature_importance_df.sort_values(['fold', 'importance'],
                                                   axis=0,
                                                   ascending=False,
                                                   inplace=True)
            self.plot_importances(self.feature_importance_df, len(X.columns))

    def plot_importances(self, df, topk=64):
        """Display/plot feature importance.

        Averages ``importance`` per ``feature``, keeps the ``topk`` largest,
        stores them in ``feature_importance_df_agg`` and saves a bar plot to
        ``importances.png``.

        :param df: frame with at least "feature" and "importance" columns.
        :param topk: number of features to keep and plot.
        """
        assert "feature" in df.columns and "importance" in df.columns, '无["feature", "importance"]'

        # Keyword arguments: pandas 2.0 made axis/ascending keyword-only
        data = (df[["feature", "importance"
                    ]].groupby("feature").mean().reset_index().sort_values(
                        "importance", axis=0, ascending=False))[:topk]

        self.feature_importance_df_agg = data
        plt.figure(figsize=(12, topk // 4))
        sns.barplot(x="importance",
                    y="feature",
                    data=data.assign(feature='col_' +
                                     data.feature.astype(str)))
        plt.title('Features (avg over folds)')
        plt.tight_layout()
        plt.savefig('importances.png')
コード例 #21
0
# Logistic regression of sex on height and weight
# Sex is coded in the binary variable `male`.

# LHS binary variable
male = (heights_weights['Gender'] == 'Male') * 1

# Matrix of predictor variables: height and weight from data frame
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
# Note I have to add constants to the `exog` matrix. The prepend = True
# argument prevents a warning about future change to the default argument.
logit_model = GLM(male,
                  sm.add_constant(hw_exog, prepend=True),
                  family=sm.families.Binomial(sm.families.links.logit))
# Fit once and reuse the result; the model used to be fitted a second time
# just to read the parameters.
logit_fit = logit_model.fit()
logit_fit.summary()

# Get the coefficient parameters.
logit_pars = logit_fit.params

# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend=True))
logit_fit2 = logit_model2.fit()
logit_fit2.summary()

# Get the coefficient parameters
logit_pars2 = logit_fit2.params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM': logit_pars, 'Logit': logit_pars2})
コード例 #22
0
ファイル: ch2.py プロジェクト: ANB2/Will_it_Python
plt.savefig('height_weight_lowess.png')

# Logistic regression of sex on height and weight
# Sex is coded in the binary variable `male`.

# LHS binary variable
male = (heights_weights['Gender'] == 'Male') * 1

# Matrix of predictor variables: height and weight from data frame
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
# Note I have to add constants to the `exog` matrix. The prepend = True
# argument prevents a warning about future change to the default argument.
logit_model = GLM(male, sm.add_constant(hw_exog, prepend = True), family = sm.families.Binomial(sm.families.links.logit))
# Fit once and reuse the result; the model used to be fitted a second time
# just to read the parameters.
logit_fit = logit_model.fit()
logit_fit.summary()

# Get the coefficient parameters.
logit_pars = logit_fit.params


# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend = True))
logit_fit2 = logit_model2.fit()
logit_fit2.summary()

# Get the coefficient parameters
logit_pars2 = logit_fit2.params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM' : logit_pars, 'Logit' : logit_pars2})
コード例 #23
0
# Collect the variables into one frame; column names are the LaTeX symbols.
X = pd.DataFrame({
    '$R$': race,
    '$I$': income,
    '$C$': crime,
    '$E$': industry,
    '$N$': neighborhood
})

# Pairwise correlations between the columns.
X.corr()

from statsmodels.api import GLM
import statsmodels.api as sm

# Gamma-family GLM of C on 1/I as the only regressor (no constant column).
X['$1/I$'] = 1. / X['$I$']
model = GLM(X['$C$'], X[['$1/I$']], family=sm.families.Gamma())
result = model.fit()
result.summary()

# Map the integer race codes to readable labels.
races = {0: 'african-american', 1: 'hispanic', 2: 'asian', 3: 'white'}

X['race'] = X['$R$'].map(races)

# One indicator column per race label, attached to the frame.
race_dummies = pd.get_dummies(X['race'])

X[race_dummies.columns] = race_dummies

# Keep only the rows where E equals 0.
X_restricted = X[X['$E$'] == 0]

# OLS of C on the race indicator columns, restricted sample.
# NOTE(review): OLS is not imported in the visible part of this script --
# confirm it is imported elsewhere.
model = OLS(X_restricted['$C$'], X_restricted[race_dummies.columns])
result = model.fit()
コード例 #24
0
class LogitRegression(object):
    """Patsy wrapper for logit model estimation and prediction.

    Example usage:

    # construct and estimate model using patsy formula
    # uses the cps pickle file under dataset processor
    cps["EarnedWage"] = (cps.WageIncomeLastYear > 0).astype(int)
    model = LogitRegression(
        "EarnedWage ~ C(Race)",
        cps,
        freq_weights=cps.Weight
    )

    # print model summary
    print(model)

    # return predicted probability of working for blacks
    prob_works = model.predict(
        pandas.DataFrame({
            "Race": ["Black"]
        })
    )
    """

    def __init__(self, formula=None, data=None, link=logit, **kwargs):
        """Estimate the model, or create an empty shell when no formula is given.

        :param formula: patsy formula; when falsy nothing is estimated and
            every attribute except the link is set to ``None``
        :param data: data handed to ``patsy.dmatrices`` together with the
            formula
        :param link: link class passed to ``Binomial`` and used by
            ``predict`` to map the linear predictor back
        :param kwargs: forwarded unchanged to ``GLM``
        """
        if formula:
            # The third argument is patsy's ``eval_env``; the call must stay
            # in this frame so the depth of 1 keeps pointing at our caller.
            y, X = patsy.dmatrices(formula, data, 1)
            self._y_design_info = y.design_info
            self._X_design_info = X.design_info
            self._model = GLM(y, X, family=Binomial(link), **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._link = link
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._link = link

    def __repr__(self):
        """Return the fit summary, or a placeholder when nothing was fitted."""
        return str(self._fit.summary()) if self._fit                           \
            else "Logistic regression"

    def predict(self, data, linear=False):
        """Predict from ``data`` with the stored coefficients.

        :param data: data from which the design matrix is rebuilt using the
            stored design info; empty data yields ``[]``
        :param linear: when true return the output of ``linear_transform``
            as is, otherwise pass it through the inverse of the link
        :return: ``[]`` for empty input, else the predictions
        """
        if len(data) == 0:
            return []

        (X, ) = patsy.build_design_matrices([self._X_design_info], data)
        linear_prediction = linear_transform(numpy.asarray(X), self._betas)

        if linear:
            return linear_prediction

        # ``self._link`` holds a link *class*, so it is instantiated before
        # its ``inverse`` method is applied.
        return self._link.inverse(self._link(), linear_prediction)

    def draw(self, data, rand_engine):
        """Draw 0/1 outcomes via ``rand_engine.binomial(1, prediction)``."""
        prediction = self.predict(data)

        return rand_engine.binomial(1, prediction)

    def to_pickle(self, filename):
        """Pickle the design infos, the coefficients and the link to ``filename``.

        The fitted model object itself is not stored.
        """
        # A context manager guarantees the file is flushed and closed even
        # when pickling fails.
        with open(filename, "wb") as handle:
            pickle.dump((self._y_design_info, self._X_design_info,
                         self._betas, self._link), handle)

    @staticmethod
    def read_pickle(filename):
        """Rebuild a (prediction-only) model from a file written by ``to_pickle``.

        NOTE(review): ``pickle.load`` can execute arbitrary code; only read
        files from trusted sources.
        """
        with open(filename, "rb") as handle:
            y_design_info, X_design_info, betas, link = pickle.load(handle)

        logit_regression = LogitRegression()
        logit_regression._y_design_info = y_design_info
        logit_regression._X_design_info = X_design_info
        logit_regression._betas = betas
        logit_regression._link = link

        return logit_regression

    def __add__(self, other):
        """Return a shallow copy whose coefficients are ``self + other``."""
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        """Return a shallow copy whose coefficients are ``self - other``."""
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        """Return a shallow copy whose coefficients are multiplied by ``other``."""
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret
コード例 #25
0
class LogitRegression(object):
    """Patsy wrapper for logit model estimation and prediction.

    Example usage:

    # construct and estimate model using patsy formula
    # uses the cps pickle file under dataset processor
    cps["EarnedWage"] = (cps.WageIncomeLastYear > 0).astype(int)
    model = LogitRegression(
        "EarnedWage ~ C(Race)",
        cps,
        freq_weights=cps.Weight
    )

    # print model summary
    print(model)

    # return predicted probability of working for blacks
    prob_works = model.predict(
        pd.DataFrame({
            "Race": ["Black"]
        })
    )
    """

    # Captures the first argument of every ``power(<var>, ...)`` call.
    _POWER_VAR_PATTERN = re.compile(r'(?<=power\().+?(?=,)')

    @classmethod
    def _power_variables(cls, texts):
        """Return the sorted, de-duplicated variables raised to a power in ``texts``.

        Strings without a ``power(`` call contribute nothing, and a string
        with several calls (e.g. an interaction term) contributes all of them.
        """
        found = set()
        for text in texts:
            found.update(cls._POWER_VAR_PATTERN.findall(text))
        return sorted(found)

    def __init__(self, formula=None, data=None, **kwargs):
        """Estimate the model, or create an empty shell when no formula is given.

        :param formula: patsy formula; when falsy nothing is estimated
        :param data: data handed to ``patsy.dmatrices``; if it is a
            DataFrame, columns raised to a power in the formula are
            converted to float64 **in place**
        :param kwargs: forwarded unchanged to ``GLM``
        """
        # convert all variables raised to a power to float64
        # this prevents mis-specification of probabilities in cases of variable overflow
        # (if the original var was compressed to a smaller bit integer/float)
        # The formula guard avoids running the regex on ``None``.
        if formula and isinstance(data, pd.DataFrame):
            for var in self._power_variables([formula]):
                data[var] = data[var].astype('float64')

        if formula:
            # The third argument is patsy's ``eval_env``; the call must stay
            # in this frame so the depth of 1 keeps pointing at our caller.
            y, X = patsy.dmatrices(formula, data, 1)
            self._y_design_info = y.design_info
            self._X_design_info = X.design_info
            self._model = GLM(y, X, family=Binomial(), **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._link = logit
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._link = logit

    def __repr__(self):
        """Return the fit summary, or a placeholder when nothing was fitted."""
        return str(self._fit.summary()) if self._fit                           \
            else "Logistic regression"

    def predict(self, data, linear=False):
        """Predict from ``data`` with the stored coefficients.

        :param data: data from which the design matrix is rebuilt using the
            stored design info; empty data yields ``[]``.  Columns raised to
            a power in the design are converted to float64 **in place**.
        :param linear: when true return the output of ``linear_transform``
            as is, otherwise pass it through the inverse of the link
        :return: ``[]`` for empty input, else the predictions
        """
        if len(data) == 0:
            return []

        # identifies exponential variables from the design matrix (via the 'power' flag) and converts to float64
        # this prevents mis-specification of probabilities in cases of variable overflow
        # (if the original var was compressed to a smaller bit integer/float)
        # Column names that merely contain "power" (e.g. "horsepower") are
        # skipped instead of crashing on a failed regex search.
        for var in self._power_variables(self._X_design_info.column_names):
            data[var] = data[var].astype('float64')

        (X, ) = patsy.build_design_matrices([self._X_design_info], data)
        linear_prediction = linear_transform(np.asarray(X), self._betas)

        if linear:
            return linear_prediction

        # ``self._link`` holds a link *class*, so it is instantiated before
        # its ``inverse`` method is applied.
        return self._link.inverse(self._link(), linear_prediction)

    def draw(self, data, rand_engine):
        """Draw 0/1 outcomes via ``rand_engine.binomial(1, prediction)``."""
        prediction = self.predict(data)

        return rand_engine.binomial(1, prediction)

    def to_pickle(self, filename):
        """Pickle the design infos, the coefficients and the link to ``filename``.

        The fitted model object itself is not stored.
        """
        with open(filename, "wb") as f:
            pickle.dump((self._y_design_info, self._X_design_info, self._betas,
                         self._link), f)

    @staticmethod
    def read_pickle(filename):
        """Rebuild a (prediction-only) model from a file written by ``to_pickle``.

        NOTE(review): ``pickle.load`` can execute arbitrary code; only read
        files from trusted sources.
        """
        # Context manager so the handle is closed even if unpickling fails.
        with open(filename, "rb") as f:
            y_design_info, X_design_info, betas, link = pickle.load(f)

        logit_regression = LogitRegression()
        logit_regression._y_design_info = y_design_info
        logit_regression._X_design_info = X_design_info
        logit_regression._betas = betas
        logit_regression._link = link

        return logit_regression

    def __add__(self, other):
        """Return a shallow copy whose coefficients are ``self + other``."""
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        """Return a shallow copy whose coefficients are ``self - other``."""
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        """Return a shallow copy whose coefficients are multiplied by ``other``."""
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret
コード例 #26
0
ファイル: __init__.py プロジェクト: mindis/tql-Python
class OOF(object):
    """Out-of-fold prediction.

    TODO: support regression.

    Background notes:
    LightGBM builds nodes one at a time; XGBoost builds nodes one level at a time.
    https://blog.csdn.net/friyal/article/details/82758532
    CatBoost always uses complete binary trees whose nodes are mirrored
    (symmetric trees).  CatBoost claims symmetric trees help avoid
    overfitting, increase reliability and greatly speed up prediction.
        It computes the frequency of each category, adds hyper-parameters
        and generates new numerical features.
    https://blog.csdn.net/linxid/article/details/80723811
    """
    _params = {'metric': 'auc',
               'learning_rate': 0.01,
               'n_estimators': 30000,
               'subsample': 0.8,
               'colsample_bytree': 0.8,
               'class_weight': 'balanced',  ##
               'scale_pos_weight': 1,  ##
               'random_state': 2019,
               'verbosity': -1}
    # Ready-made estimators.  They are class attributes, so every OOF
    # instance that falls back to ``lgb`` shares the same estimator object.
    lgb = LGBMClassifier(n_jobs=16, **_params)
    xgb = XGBClassifier()
    cat = CatBoostClassifier(n_estimators=20000, learning_rate=0.05, loss_function='Logloss', eval_metric='AUC',
                             random_state=2019)

    def __init__(self, clf=None, folds=None):
        """Store the estimator and the splitter.

        :param clf: estimator to cross-validate; defaults to the class-level
            ``lgb`` LightGBM classifier
        :param folds: splitter providing ``split(X, y)``; defaults to a
            shuffled 5-fold ``StratifiedKFold`` (RepeatedStratifiedKFold is
            also supported)
        """
        self.clf = clf if clf else self.lgb
        # Keyword arguments: recent scikit-learn releases reject
        # ``shuffle`` / ``random_state`` passed positionally.
        self.folds = folds if folds else StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
        # The estimator's repr is used in ``fit`` to pick library-specific
        # fitting arguments.
        self.model_type = self.clf.__repr__()
        # self.clf_agrs = self.getfullargspec(self.clf.fit).args if hasattr(self.clf, 'fit') else None

    def fit(self, X, y, X_test, feval=None, cat_feats=None, exclude_columns=None, epochs=16, batch_size=128,
            oof2csv=None):
        """Run K-fold training and collect out-of-fold and test predictions.

        For every split produced by ``self.folds`` the estimator is fitted on
        the training part, its predictions for the validation part are
        written into the out-of-fold vector, and its predictions for
        ``X_test`` are averaged over the folds.  The results are also stored
        in ``self.oof_preds`` and ``self.test_preds``.

        TODO: rank-based blending.

        :param X: training features (indexed by column labels and ``.iloc``)
        :param y: training target (indexed with ``.iloc``)
        :param X_test: test features containing the same columns as ``X``
        :param feval: metric called as ``feval(y_true, y_score)``; defaults
            to ``roc_auc_score``
        :param cat_feats: indices of categorical features
        :param exclude_columns: columns dropped from ``X`` and ``X_test``
        :param epochs: number of epochs (Keras models only)
        :param batch_size: batch size (Keras models only)
        :param oof2csv: if given, path prefix of a CSV file that receives the
            OOF predictions followed by the test predictions
        :return: tuple ``(oof_preds, sub_preds)``
        """
        # OOF evaluation function
        feval = feval if feval else roc_auc_score

        # Drop unwanted features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
        else:
            feats = X.columns

        X, X_test = X[feats], X_test[feats]

        if hasattr(self.folds, 'n_splits'):
            num_folds = self.folds.n_splits
        else:
            num_folds = self.folds.cvargs['n_splits']

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(len(X))
        sub_preds = np.zeros(len(X_test))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx, valid_idx) in enumerate(self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" % (n_fold, time.ctime()))

            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

            if not hasattr(self.clf, 'fit'):
                print("该算法无fit方法")
                break

            # Shared by every branch that supports a validation set.
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            if 'LGBMClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             categorical_feature=cat_feats if cat_feats else 'auto',
                             eval_metric='auc',
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'LGBMRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             categorical_feature=cat_feats if cat_feats else 'auto',
                             eval_metric='l2',
                             early_stopping_rounds=100,
                             verbose=100)

            elif 'XGBClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             eval_metric='auc',
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'XGBRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             eval_metric='rmse',
                             early_stopping_rounds=100,
                             verbose=100)

            elif 'CatBoostClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             cat_features=cat_feats,
                             use_best_model=True,
                             plot=True,
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'CatBoostRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             cat_features=cat_feats,
                             use_best_model=True,
                             plot=True,
                             early_stopping_rounds=100,
                             verbose=0)

            elif 'RGFClassifier' in self.model_type:
                pass
            elif 'RGFRegressor' in self.model_type:
                pass

            # https://www.cnblogs.com/flyu6/p/7691106.html
            # NOTE(review): Keras documents ``validation_data`` as a single
            # ``(x_val, y_val)`` tuple; passing the two-element list looks
            # wrong -- confirm before relying on the Keras branches.
            elif 'KerasClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             epochs=epochs,
                             batch_size=batch_size,
                             validation_data=eval_set)
            elif 'KerasRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             epochs=epochs,
                             batch_size=batch_size,
                             validation_data=eval_set)

            elif self.model_type == 'GLM':
                # TODO: support other models
                # NOTE(review): ``model_type`` is a repr, so this comparison
                # looks unreachable; the branch also replaces ``self.clf``
                # with an array of predictions, which the code below cannot
                # call ``predict`` on.  Left unchanged pending a decision.
                self.clf = GLM(y_train, X_train, family=families.Binomial())
                self.clf = self.clf.fit().predict(X)
            else:
                # Native scikit-learn models.  Fit on the training part of
                # the fold only: fitting on the full data would leak the
                # validation rows into the out-of-fold predictions.
                self.clf.fit(X_train, y_train)

            # Compute and store the predictions
            # TODO: needs changes for multi-class
            if hasattr(self.clf, 'predict_proba'):
                oof_preds[valid_idx] = self.clf.predict_proba(X_valid)[:, 1]
                sub_preds += self.clf.predict_proba(X_test)[:, 1] / num_folds
            else:
                oof_preds[valid_idx] = self.clf.predict(X_valid)
                sub_preds += self.clf.predict(X_test) / num_folds

            if hasattr(self.clf, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = feats
                fold_importance_df["importance"] = self.clf.feature_importances_
                fold_importance_df["fold"] = n_fold
                # ``axis`` must be a keyword in pandas >= 2.0.
                self.feature_importance_df = pd.concat([self.feature_importance_df, fold_importance_df], axis=0)

        # Scoring is best-effort: a failing metric is reported, not raised.
        try:
            score = feval(y, oof_preds)
        except Exception as e:
            score = score_name = None
            print('Error feval:', e)
        else:
            # Use the callable's name rather than parsing its repr, which
            # could fail and throw away a score that was computed fine.
            score_name = getattr(feval, '__name__', type(feval).__name__)

        print("\n\033[94mOOF %s: %s end at %s\n\033[0m" % (score_name, score, time.ctime()))

        if hasattr(self.clf, 'feature_importances_'):
            self.plot_importances(self.feature_importance_df)

        self.oof_preds = oof_preds
        self.test_preds = sub_preds
        if oof2csv:
            pd.Series(oof_preds.tolist() + sub_preds.tolist(), name='oof').to_csv(oof2csv + time.ctime(), index=False)

        return oof_preds, sub_preds

    def plot_importances(self, df, topk=64):
        """Plot the fold-averaged importance of the ``topk`` best features.

        The figure is saved as ``lgbm_importances.png``.

        :param df: frame with at least the columns ``feature`` and
            ``importance``
        :param topk: number of features shown
        """
        assert "feature" in df.columns and "importance" in df.columns, '无["feature", "importance"]'
        # Keyword arguments: pandas >= 2.0 no longer accepts ``axis`` and
        # ``ascending`` positionally.
        data = (df[["feature", "importance"]]
                .groupby("feature")
                .mean()
                .reset_index()
                .sort_values("importance", axis=0, ascending=False))[:topk]

        plt.figure(figsize=(12, int(topk / 4)))
        sns.barplot(x="importance", y="feature", data=data.assign(feature=data.feature.astype(str)))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        plt.savefig('lgbm_importances.png')
コード例 #27
0
ファイル: __init__.py プロジェクト: mindis/tql-Python
    def fit(self, X, y, X_test, feval=None, cat_feats=None, exclude_columns=None, epochs=16, batch_size=128,
            oof2csv=None):
        """Run K-fold training and collect out-of-fold and test predictions.

        For every split produced by ``self.folds`` the estimator is fitted on
        the training part, its predictions for the validation part are
        written into the out-of-fold vector, and its predictions for
        ``X_test`` are averaged over the folds.  The results are also stored
        in ``self.oof_preds`` and ``self.test_preds``.

        TODO: rank-based blending.

        :param X: training features (indexed by column labels and ``.iloc``)
        :param y: training target (indexed with ``.iloc``)
        :param X_test: test features containing the same columns as ``X``
        :param feval: metric called as ``feval(y_true, y_score)``; defaults
            to ``roc_auc_score``
        :param cat_feats: indices of categorical features
        :param exclude_columns: columns dropped from ``X`` and ``X_test``
        :param epochs: number of epochs (Keras models only)
        :param batch_size: batch size (Keras models only)
        :param oof2csv: if given, path prefix of a CSV file that receives the
            OOF predictions followed by the test predictions
        :return: tuple ``(oof_preds, sub_preds)``
        """
        # OOF evaluation function
        feval = feval if feval else roc_auc_score

        # Drop unwanted features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
        else:
            feats = X.columns

        X, X_test = X[feats], X_test[feats]

        if hasattr(self.folds, 'n_splits'):
            num_folds = self.folds.n_splits
        else:
            num_folds = self.folds.cvargs['n_splits']

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(len(X))
        sub_preds = np.zeros(len(X_test))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx, valid_idx) in enumerate(self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" % (n_fold, time.ctime()))

            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

            if not hasattr(self.clf, 'fit'):
                print("该算法无fit方法")
                break

            # Shared by every branch that supports a validation set.
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            if 'LGBMClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             categorical_feature=cat_feats if cat_feats else 'auto',
                             eval_metric='auc',
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'LGBMRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             categorical_feature=cat_feats if cat_feats else 'auto',
                             eval_metric='l2',
                             early_stopping_rounds=100,
                             verbose=100)

            elif 'XGBClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             eval_metric='auc',
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'XGBRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             eval_metric='rmse',
                             early_stopping_rounds=100,
                             verbose=100)

            elif 'CatBoostClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             cat_features=cat_feats,
                             use_best_model=True,
                             plot=True,
                             early_stopping_rounds=100,
                             verbose=100)
            elif 'CatBoostRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             eval_set=eval_set,
                             cat_features=cat_feats,
                             use_best_model=True,
                             plot=True,
                             early_stopping_rounds=100,
                             verbose=0)

            elif 'RGFClassifier' in self.model_type:
                pass
            elif 'RGFRegressor' in self.model_type:
                pass

            # https://www.cnblogs.com/flyu6/p/7691106.html
            # NOTE(review): Keras documents ``validation_data`` as a single
            # ``(x_val, y_val)`` tuple; passing the two-element list looks
            # wrong -- confirm before relying on the Keras branches.
            elif 'KerasClassifier' in self.model_type:
                self.clf.fit(X_train, y_train,
                             epochs=epochs,
                             batch_size=batch_size,
                             validation_data=eval_set)
            elif 'KerasRegressor' in self.model_type:
                self.clf.fit(X_train, y_train,
                             epochs=epochs,
                             batch_size=batch_size,
                             validation_data=eval_set)

            elif self.model_type == 'GLM':
                # TODO: support other models
                # NOTE(review): this branch replaces ``self.clf`` with an
                # array of predictions, which the code below cannot call
                # ``predict`` on.  Left unchanged pending a decision.
                self.clf = GLM(y_train, X_train, family=families.Binomial())
                self.clf = self.clf.fit().predict(X)
            else:
                # Native scikit-learn models.  Fit on the training part of
                # the fold only: fitting on the full data would leak the
                # validation rows into the out-of-fold predictions.
                self.clf.fit(X_train, y_train)

            # Compute and store the predictions
            # TODO: needs changes for multi-class
            if hasattr(self.clf, 'predict_proba'):
                oof_preds[valid_idx] = self.clf.predict_proba(X_valid)[:, 1]
                sub_preds += self.clf.predict_proba(X_test)[:, 1] / num_folds
            else:
                oof_preds[valid_idx] = self.clf.predict(X_valid)
                sub_preds += self.clf.predict(X_test) / num_folds

            if hasattr(self.clf, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = feats
                fold_importance_df["importance"] = self.clf.feature_importances_
                fold_importance_df["fold"] = n_fold
                # ``axis`` must be a keyword in pandas >= 2.0.
                self.feature_importance_df = pd.concat([self.feature_importance_df, fold_importance_df], axis=0)

        # Scoring is best-effort: a failing metric is reported, not raised.
        try:
            score = feval(y, oof_preds)
        except Exception as e:
            score = score_name = None
            print('Error feval:', e)
        else:
            # Use the callable's name rather than parsing its repr, which
            # could fail and throw away a score that was computed fine.
            score_name = getattr(feval, '__name__', type(feval).__name__)

        print("\n\033[94mOOF %s: %s end at %s\n\033[0m" % (score_name, score, time.ctime()))

        if hasattr(self.clf, 'feature_importances_'):
            self.plot_importances(self.feature_importance_df)

        self.oof_preds = oof_preds
        self.test_preds = sub_preds
        if oof2csv:
            pd.Series(oof_preds.tolist() + sub_preds.tolist(), name='oof').to_csv(oof2csv + time.ctime(), index=False)

        return oof_preds, sub_preds