Code Example #1
File: cleaners.py Project: KevinLiao159/klearn
 def _transform(self, X):
     """
     Returns
     ----------
     X: same type as X, with squished values
     """
     self._type_check(X)
     # HACK: v stack median, mad
     try:
         X = np.vstack(
             [
                 force_array(X),
                 self.median,
                 self.mad
             ]
         )
     except ValueError:
         X = np.hstack(
             [
                 force_array(X),
                 self.median,
                 self.mad
             ]
         )
     X = np.apply_along_axis(func1d=self._squish, axis=0, arr=X)
     if self.is_dataframe:
         X = pd.DataFrame(X, columns=self.df_cols, index=self.df_idx)
     return X
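The vstack trick above stacks each column's statistics beneath its data so np.apply_along_axis sees both at once. A minimal self-contained sketch of the same pattern (squish here is an illustrative stand-in for klearn's _squish):

import numpy as np

X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [100.0, 30.0]])                      # outlier in column 0
median = np.median(X, axis=0)                      # per-column median
mad = np.median(np.abs(X - median), axis=0)        # per-column MAD

# stack the stats as extra rows so each column carries its own parameters
stacked = np.vstack([X, median, mad])

def squish(col, z=3.0):
    # the last two entries of the column are its median and MAD
    values, med, mad_ = col[:-2], col[-2], col[-1]
    return np.clip(values, med - z * mad_, med + z * mad_)

print(np.apply_along_axis(squish, axis=0, arr=stacked))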
Code Example #2
File: dispatch.py Project: KevinLiao159/klearn
    def predict_proba(self, X):
        check_is_fitted(self, 'model_dict')
        # NOTE: let's say we respect dataframe
        if not isinstance(X, (pd.DataFrame, pd.Series)):
            X = pd.DataFrame(force_array(X))

        # predict on dispatcher and get group
        group_new = self.dispatcher.predict(X)
        group_new = self.le.transform(group_new)
        index_dict = \
            {
                group: np.where(group_new == group)[0]
                for group in self.unique_groups
            }
        # predict by group
        proba_dfs = []
        for (group, index) in index_dict.items():
            if len(index):
                df_proba = pd.DataFrame(self.model_dict[group].predict_proba(
                    X.iloc[index]),
                                        index=index)
                proba_dfs.append(df_proba)
        # concat all predictions into one dataframe
        df_proba = pd.concat(proba_dfs)
        return force_array(df_proba.sort_index())
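A self-contained sketch of the dispatch-and-reassemble pattern used by predict_proba above: rows are routed to per-group models, predicted separately, then re-ordered with sort_index to restore the original row order (fake_predict_proba stands in for the fitted models in model_dict):

import numpy as np
import pandas as pd

X = pd.DataFrame({'x': np.arange(6.0)})
group_new = np.array([0, 1, 0, 1, 1, 0])                       # dispatcher output
index_dict = {g: np.where(group_new == g)[0] for g in np.unique(group_new)}

def fake_predict_proba(g, X_part):
    # stand-in for self.model_dict[g].predict_proba(X_part)
    p = np.full(len(X_part), 0.2 + 0.6 * g)
    return np.column_stack([1 - p, p])

proba_dfs = [pd.DataFrame(fake_predict_proba(g, X.iloc[idx]), index=idx)
             for g, idx in index_dict.items() if len(idx)]
df_proba = pd.concat(proba_dfs).sort_index()                   # original row order
print(df_proba.values)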
Code Example #3
def ts_train_test_split(X=None, y=None, groups=None, train_window=None,
                        buff_window=380, test_window=365):
    """
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups (dates) : array-like, index, shape (n_samples, )
        with datetime type
        NOTE: dates have to be sorted (ascending)

    train_window : int or None, default = None
        The number of days in the training window. If None, the training
        window starts at the earliest date in groups.

    buff_window : int, default = 380 (days)
        The number of days to skip between the end of the training window
        and the start of the testing window.

    test_window : int, default = 365 (days)
        The number of days in the testing window

    Returns
    -------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    # check group
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))
    # get min/max date
    dates = groups
    # min_d = min(dates)
    max_d = max(dates)
    # get test start date:
    test_start = max_d - timedelta(days=test_window)
    # get train end date
    train_end = test_start - timedelta(days=buff_window)
    # get train start date
    if train_window is None:
        train_start = min(dates)
    else:  # train_window is given
        train_start = train_end - timedelta(days=train_window)
    # get id
    train_start_id = np.searchsorted(a=dates, v=train_start, side='left')
    train_end_id = np.searchsorted(a=dates, v=train_end, side='right')
    test_start_id = np.searchsorted(a=dates, v=test_start, side='left')
    end_id = np.searchsorted(a=dates, v=max_d, side='right')
    # train, test list
    train = np.arange(start=train_start_id, stop=train_end_id, step=1)
    test = np.arange(start=test_start_id, stop=end_id, step=1)
    return [tuple((train, test))]
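A usage sketch for ts_train_test_split, assuming the function above and its helpers (force_array, check_consistent_length, timedelta) are in scope; groups must be sorted ascending because the split relies on np.searchsorted:

import numpy as np

dates = np.arange('2015-01-01', '2019-01-01', dtype='datetime64[D]')   # sorted daily dates
X = np.random.rand(len(dates), 3)

(train_idx, test_idx), = ts_train_test_split(
    X=X, groups=dates, train_window=None, buff_window=380, test_window=365)
X_train, X_test = X[train_idx], X[test_idx]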
Code Example #4
def mean_absolute_percentage_error(y_true, y_pred, robust=False):
    """mean_absolute_percentage_error

    Use case:
        y is expressed in percent and we want to take pct into account

    Formula:
        mean_absolute_percentage_error = \
            mean(abs((y_true - y_pred) / y_true)) * 100

    Parameters
    ----------
    y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Estimated target values.

    robust : bool, if True, use median, otherwise, mean
        Default is False

    Returns
    -------
    loss : float or ndarray of floats
        A non-negative floating point value (the best value is 0.0)
    """
    y_true = force_array(y_true)
    y_pred = force_array(y_pred)
    if robust:
        loss = np.median(np.abs((y_true - y_pred) / y_true)) * 100
    else:  # use mean
        loss = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return loss
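A quick worked example of the formula above: both absolute percentage errors are 10%, so the mean (and the median) is 10. The commented line assumes the function above is in scope:

import numpy as np

y_true = np.array([100.0, 200.0])
y_pred = np.array([110.0, 180.0])
print(np.mean(np.abs((y_true - y_pred) / y_true)) * 100)    # ~10.0
# mean_absolute_percentage_error(y_true, y_pred)            # same result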
Code Example #5
File: cleaners.py Project: KevinLiao159/klearn
 def _transform(self, X):
     """
     Returns
     -------
     X:  array-like, type and shape are preserved
         after truncating outliers and replacing them with quantile values
     """
     self._type_check(X)
     # HACK: v stack median, threshold, hi, lo
     try:
         X = np.vstack(
             [
                 force_array(X),
                 self.median,
                 self.outliers_thres,
                 self.hi_replacement,
                 self.lo_replacement
             ]
         )
     except ValueError:
         X = np.hstack(
             [
                 force_array(X),
                 self.median,
                 self.outliers_thres,
                 self.hi_replacement,
                 self.lo_replacement
             ]
         )
     X = np.apply_along_axis(func1d=self._replace, axis=0, arr=X)
     if self.is_dataframe:
         X = pd.DataFrame(X, columns=self.df_cols, index=self.df_idx)
     return X
Code Example #6
    def scatterplot(self, x, y, group=None, legend_name=None):
        """plot scatter plot

        Parameters
        ----------
        x: a vector, x is one column(feature) of a data

        y: a vector, y is another column(feature) of a data

        group: a vector, group is the label of data, it should have
            the same length as x, y

        legend_name: a name, or a dictionary of names, match the number of
            unique values in group
            eg. {
                    label_one: 'type one',
                    label_two: 'type two'
                }

        Returns
        -------
        A renderable plot
        """
        # check length
        x = force_array(x)
        y = force_array(y)
        check_consistent_length(x, y)
        if group is not None:
            group = force_array(group)
            check_consistent_length(x, y, group)
            # get all unique values from group
            unique_groups = np.unique(group)

            # check other args
            if legend_name is not None:
                check_consistent_length(unique_groups, list(legend_name))
            elif legend_name is None:
                legend_name = {v: v for v in unique_groups}

            # store data
            data = []
            for grp in unique_groups:
                data.append(
                    go.Scattergl(x=x[group == grp],
                                 y=y[group == grp],
                                 name=legend_name[grp],
                                 mode='markers'))
        elif group is None:
            trace = go.Scattergl(x=x, y=y, mode='markers')
            data = [trace]

        layout = go.Layout(title=self.title,
                           yaxis=self.yaxis,
                           xaxis=self.xaxis,
                           width=self.width,
                           height=self.height,
                           margin=go.layout.Margin(t=65, b=60, pad=4))
        fig = go.Figure(data=data, layout=layout)
        plotly.offline.iplot(fig)
Code Example #7
def _select_top_and_bottom(y_true,
                           y_score,
                           percentile=10,
                           interpolation='midpoint'):
    """
    Select truth values, predictions, scores of the top and bottom observations

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, ]
        True binary labels in binary label indicators.

    y_score : array, shape = [n_samples] or [n_samples, 2]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions.

    percentile : float, default 10 (the 10% quantile), 0 <= percentile <= 100
        the top and bottom quantile(s) to select from all observations

    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        New in version 0.18.0.
        This optional parameter specifies the interpolation method to use,\
        when the desired quantile lies between two data points i and j:
        linear: i + (j - i) * fraction, where fraction is the fractional part\
        of the index surrounded by i and j.
        lower: i.
        higher: j.
        nearest: i or j whichever is nearest.
        midpoint: (i + j) / 2.

    Returns
    -------
    y_true_ext : array, shape = [n_samples] or [n_samples, ]
        True binary labels in binary label indicators of top and bottom

    y_score_ext : array, shape = [n_samples] or [n_samples, 2]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions of top and bottom.

    y_pred_ext : array, shape = [n_samples] or [n_samples, ]
        Target prediction, can either be 1 or 0, top is always 1 and bottom\
        is always 0.
    """
    y_true = force_array(y_true)
    y_score = force_array(y_score)
    upperQ = np.percentile(y_score[:, 1],
                           q=(100 - percentile),
                           interpolation=interpolation)
    lowerQ = np.percentile(y_score[:, 1],
                           q=percentile,
                           interpolation=interpolation)
    top_bottom_filter = (y_score[:, 1] >= upperQ) | (y_score[:, 1] <= lowerQ)

    y_true_ext = y_true[top_bottom_filter]
    y_score_ext = y_score[top_bottom_filter]
    y_pred_ext = y_score_ext[:, 1] >= 0.5
    return y_true_ext, y_score_ext, y_pred_ext
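A self-contained sketch of the selection logic above on synthetic scores: keep only rows whose positive-class score falls in the top or bottom percentile band (here with NumPy's default interpolation; the real function also passes an interpolation method, and y_score would come from predict_proba):

import numpy as np

y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0])
pos = np.linspace(0.05, 0.95, 10)                    # synthetic positive-class scores
y_score = np.column_stack([1 - pos, pos])            # shape (n_samples, 2)

percentile = 10
upperQ = np.percentile(y_score[:, 1], q=100 - percentile)
lowerQ = np.percentile(y_score[:, 1], q=percentile)
keep = (y_score[:, 1] >= upperQ) | (y_score[:, 1] <= lowerQ)
print(y_true[keep], y_score[keep, 1], (y_score[keep, 1] >= 0.5).astype(int))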
Code Example #8
File: transformers.py Project: KevinLiao159/klearn
    def fit(self, X, y):
        """Pipeline: RFECV, standardizer, standardizer, PCA

        Parameters
        ----------
        X: array-like, shape [n_samples, n_features]

        y: array, shape[n_samples,].
        """
        if self.copy:
            X, y = X.copy(), y.copy()
        X, y = force_array(X), force_array(y)
        self._fit(X, y)
        return self
Code Example #9
    def split(self, X=None, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.

        groups (dates) : array-like, index, shape (n_samples, )
            with datetime type
            NOTE: dates have to be sorted (ascending)

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        # check group
        if groups is None:
            raise ValueError('You have to pass in groups')
        if X is not None:
            check_consistent_length(force_array(X), force_array(groups))
        # get dates
        dates = groups
        tot_len = self.train_window + self.buff_window + self.test_window
        min_d = min(dates)
        max_d = max(dates)
        day_shift = ((max_d - min_d).days - tot_len)/(self.n_splits - 1)

        starts = \
            [min_d + timedelta(days=day_shift*i) for i in range(self.n_splits)]

        for start in starts:
            end_train, start_test, end = self._return_winds(start)
            # get id
            start_id = np.searchsorted(a=dates, v=start, side='left')
            end_train_id = np.searchsorted(a=dates, v=end_train, side='right')
            start_test_id = np.searchsorted(a=dates, v=start_test, side='left')
            end_id = np.searchsorted(a=dates, v=end, side='right')
            yield (
                np.arange(start=start_id, stop=end_train_id, step=1),          # noqa
                np.arange(start=start_test_id, stop=end_id, step=1))           # noqa
Code Example #10
File: transformers.py Project: KevinLiao159/klearn
    def transform(self, X):
        """Merge reserved df with PCA

        Parameters
        ----------
        X: array-like, shape [n_samples, n_features]

        Returns
        -------
        X : new array with dimension reduction,
          shape [n_samples, n_features]
        """
        check_is_fitted(self, 'pca')
        if self.copy:
            X = X.copy()
        X = force_array(X)
        # implement RFECV transform method
        X_reserved = X[:, self.rfecv.support_]
        if sum(self.cols_for_pca) > 1:
            # standardize then PCA-transform the selected columns
            X_pca = \
                self.pca.transform(
                    self.standardizer.fit_transform(X[:, self.cols_for_pca])
                    )
            return np.hstack((X_reserved, X_pca))
        else:
            # special case: no feature is eliminated
            return X_reserved
Code Example #11
File: modifiers.py Project: KevinLiao159/klearn
 def _fit(self, X, y, *args, **kwargs):
     """
     private method to train n base models for n folds of cv
     fit method should never get called
     """
     # get list of folds of indices
     self.folds = list(check_cv(self.cv).split(X, y))
      # Parallelization
     parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
     if isinstance(X, pd.DataFrame):
         if not isinstance(y, (pd.Series, pd.DataFrame)):
             y = pd.DataFrame(y)
         self.fitted_models = parallel(
             delayed(fit_model)(model=deepcopy(self.model),
                                X=X.iloc[train],
                                y=y.iloc[train],
                                *args,
                                **kwargs) for train, test in self.folds)
     else:  # X is not a dataframe
         self.fitted_models = parallel(
             delayed(fit_model)(model=deepcopy(self.model),
                                X=X[train],
                                y=force_array(y)[train],
                                *args,
                                **kwargs) for train, test in self.folds)
     # train model with full 100% data
     if self.full_train:
         self.full_fitted_model = fit_model(model=deepcopy(self.model),
                                            X=X,
                                            y=y,
                                            *args,
                                            **kwargs)
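A self-contained sketch of the per-fold parallel fitting pattern used in _fit above, built from sklearn's check_cv and joblib; fit_clone is an illustrative stand-in for klearn's fit_model helper:

import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import check_cv

def fit_clone(model, X, y):
    # fit a fresh copy of the estimator on one training fold
    return clone(model).fit(X, y)

X = np.random.rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)
model = LogisticRegression()

folds = list(check_cv(5, y, classifier=True).split(X, y))
fitted_models = Parallel(n_jobs=2)(
    delayed(fit_clone)(model, X[train], y[train]) for train, test in folds)
print(len(fitted_models))    # one fitted model per fold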
Code Example #12
File: cleaners.py Project: KevinLiao159/klearn
 def transform(self, X):
     """
     Returns
     ----------
     X: same type as X, with all finite values
     """
     self._type_check(X)
     # remove inf and fillna
     X = force_array(
         pd.DataFrame(force_array(X))
         .replace([-np.inf, np.inf], np.nan)
         .fillna(np.float32(self.fill))
     )
     if self.is_dataframe:
         X = pd.DataFrame(X, columns=self.df_cols, index=self.df_idx)
     return X
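The core of the cleaner above as a stand-alone pandas chain: +/-inf becomes NaN, then NaN is filled with a constant (the fill value 0 here is arbitrary):

import numpy as np
import pandas as pd

X = np.array([[1.0, np.inf],
              [-np.inf, np.nan]])
X_clean = (pd.DataFrame(X)
           .replace([-np.inf, np.inf], np.nan)
           .fillna(np.float32(0))
           .values)
print(X_clean)    # [[1. 0.] [0. 0.]]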
Code Example #13
File: cleaners.py Project: KevinLiao159/klearn
 def fit(self, X, y=None):
     """
     Compute median, top, and bottom quantile replacement value
     for each column
     """
     # Reset internal state before fitting
     self._reset()
      # compute median, outliers threshold, hi, lo
     self.median = np.median(a=force_array(X), axis=0)
     self.outliers_thres = np.apply_along_axis(
         func1d=self._get_outliers_thres,
         axis=0,
         arr=force_array(X)
     )
     self.hi_replacement = np.percentile(a=force_array(X), q=self.hi, axis=0)    # noqa
     self.lo_replacement = np.percentile(a=force_array(X), q=self.lo, axis=0)    # noqa
     return self
Code Example #14
 def transform(self, X, y=None):
     self._type_check(X, y)
     X = np.apply_along_axis(func1d=self._transform_per_feature,
                             axis=0,
                             arr=force_array(X))
     if self.is_dataframe:
         X = pd.DataFrame(X, columns=self.df_cols, index=self.df_idx)
     return X
Code Example #15
File: modifiers.py Project: KevinLiao159/klearn
 def transform(self, X, *args, **kwargs):
     if self.proba:
         y_hat = self.model.predict_proba(X, *args, **kwargs)[:, 1]
     else:
         y_hat = self.model.predict(X, *args, **kwargs)
     # reshape 1d array for horizontal merge in feature union
     y_hat = force_array(y_hat)
     y_hat = np.reshape(y_hat, newshape=(y_hat.shape[0], 1))
     return y_hat
Code Example #16
 def transform(self, X, y=None):
     check_is_fitted(self, 'features_selected')
     self._type_check(X, y)
     X = force_array(X)[:, self.features_selected]
     if self.is_dataframe:
         X = pd.DataFrame(X,
                          columns=self.df_cols[self.support_],
                          index=self.df_idx)
     return X
Code Example #17
File: cleaners.py Project: KevinLiao159/klearn
 def fit(self, X, y=None):
     """
     Compute median and mad for each column. Assuming data is
     inf-free
     """
     # Reset internal state before fitting
     self._reset()
      # compute median and mad for each column
     self.median = np.apply_along_axis(
         func1d=self._calc_median_abs_deviation,
         axis=0,
         arr=force_array(X)
     )[0]
     self.mad = np.apply_along_axis(
         func1d=self._calc_median_abs_deviation,
         axis=0,
         arr=force_array(X)
     )[1]
     return self
Code Example #18
File: dispatch.py Project: KevinLiao159/klearn
    def fit(self, X, y=None, *args, **kwargs):
        # NOTE: let's say we respect dataframe
        if not isinstance(X, (pd.DataFrame, pd.Series)):
            X = pd.DataFrame(force_array(X))
        if not isinstance(y, (pd.DataFrame, pd.Series)):
            y = pd.DataFrame(force_array(y))

        # First, fit dispatcher and get group
        if self.supervise_cutoff is None:
            self._fit_unsupervise(X, *args, **kwargs)
        else:  # supervise
            self._fit_supervise(X, y, *args, **kwargs)

        # Second, fit Label encoder
        self.le = LabelEncoder().fit(self.group)
        self.group = self.le.transform(self.group)
        self.unique_groups = np.unique(self.group)
        # Third, get a model dictionary for each group of data
        self.model_dict = \
            {
                group: self.model_list[i][-1]
                for i, group in enumerate(self.unique_groups)
            }
        # Next, get a dict of index arrays per group
        index_dict = \
            {
                group: np.where(self.group == group)[0]
                for group in self.unique_groups
            }
        # Parallelize and fit downstream models
        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
        func = delayed(fit_model)
        fitted_model_list = parallel(
            func(self.model_dict[group], X.iloc[index], y.iloc[index])
            for (group, index) in index_dict.items())
        # update models
        fitted_model_list = iter(fitted_model_list)
        self.model_dict = {
            group: next(fitted_model_list)
            for group in self.unique_groups
        }
        return self
Code Example #19
 def transform(self, X, y=None):
     self._type_check(X, y)
     X = force_array(X)
     # map col name to index if X is dataframe
     if self.is_dataframe:
         self.col_thres_dict = {
             np.where(self.df_cols == key)[0][0]: value
             for key, value in self.col_thres_dict.items()
         }
     # handle single vector X
     if X.ndim == 1:
         X = self._label_encode(X, self.col_thres_dict[0])
     else:  # for 2d array
         # iter thru columns
         for key, threshold in self.col_thres_dict.items():
             X[:, key] = self._label_encode(X[:, key], threshold)
     # finalized
     if self.is_dataframe:
         X = pd.DataFrame(X, columns=self.df_cols, index=self.df_idx)
     return X
Code Example #20
File: ensemble.py Project: KevinLiao159/klearn
 def _fit(self, X, y, *args, **kwargs):
     """
     private method to train n base models for last fold of cv
     """
     # get list of folds of indices
     self.last_fold = list(check_cv(self.cv).split(X, y))[-1]
     self.in_fold = self.last_fold[0]
     self.out_of_fold = self.last_fold[-1]
      # Parallelization
     parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
     if isinstance(X, pd.DataFrame):
         if not isinstance(y, (pd.Series, pd.DataFrame)):
             y = pd.DataFrame(y)
         self.fitted_models = parallel(delayed(fit_model)(
             model=deepcopy(model),
             X=X.iloc[self.in_fold],
             y=y.iloc[self.in_fold],
             *args,
             **kwargs
             ) for (_, model) in self.base_models
         )
     else:  # X is not a dataframe
         self.fitted_models = parallel(delayed(fit_model)(
             model=deepcopy(model),
             X=X[self.in_fold],
             y=force_array(y)[self.in_fold],
             *args,
             **kwargs
             ) for (_, model) in self.base_models
         )
     # train model with full 100% data
     if self.full_train:
         self.full_fitted_models = parallel(delayed(fit_model)(
             model=deepcopy(model),
             X=X,
             y=y,
             *args,
             **kwargs
             ) for (_, model) in self.base_models
         )
Code Example #21
 def fit(self, X, y=None):
     self._type_check(X, y)
     X = force_array(X)
     # data transformation and standardization
     if self.transformation == 'auto':
         transformer_pipe = Pipeline([
             ('inf_handler', InfHandler(strategy='max', refit=True)),
             ('imputer', Imputer(missing_values="NaN", strategy='mean')),
             ('normalize_transformer',
              NormalizeTransformer(
                  skewness_threshold=self.skewness_threshold,
                  abs_deviations_threshold=self.abs_deviations_threshold,
                  k=self.k)), ('standard_scaler', StandardScaler())
         ])
         X = transformer_pipe.fit_transform(X)
     # Initialization
     n_features = X.shape[1]
      support_ = np.ones(n_features, dtype=bool)  # np.bool alias removed in newer NumPy
     features = np.arange(n_features)[support_]
     # Elimination
     # first pass
     vifs = np.array(
         [variance_inflation_factor(X, i) for i in range(n_features)])
     # recursive pruning
     while vifs.max() > self.vif_threshold:  # there is at least one feature
         # remaining features
         support_[features[np.argsort(vifs)[-1]]] = False
         features = np.arange(n_features)[support_]
         # get vifs for the remaining features
         vifs = np.array([
             variance_inflation_factor(X[:, features], i)
             for i in range(X[:, features].shape[1])
         ])
     # set final attributes
     self.support_ = support_
     self.n_features_ = support_.sum()
     self.features = np.arange(n_features)[support_]
     self.vifs = vifs
     return self
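A self-contained sketch of the recursive VIF pruning loop above: repeatedly drop the feature with the largest variance inflation factor until every remaining VIF falls below the threshold (data and threshold are illustrative):

import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.RandomState(0)
x0 = rng.rand(200)
X = np.column_stack([x0, x0 + 0.01 * rng.rand(200), rng.rand(200)])  # col 1 ~ col 0

vif_threshold = 5.0
support = np.ones(X.shape[1], dtype=bool)
while True:
    features = np.where(support)[0]
    vifs = np.array([variance_inflation_factor(X[:, features], i)
                     for i in range(len(features))])
    if vifs.max() <= vif_threshold or len(features) == 1:
        break
    support[features[np.argmax(vifs)]] = False        # drop the worst offender

print(np.where(support)[0], vifs)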
Code Example #22
File: transformers.py Project: KevinLiao159/klearn
    def transform(self, X):
        """Return final transformed df_X

        Parameters
        ----------
        X: array-like, shape [n_samples, n_features]

        Returns
        -------
        X : new array with dimension reduction,
            shape [n_samples, n_features]
        """
        check_is_fitted(self, 'rfecv')
        if self.copy:
            X = X.copy()
        X = force_array(X)
        # first step - PCA transformation
        X_pca = self.pca.transform(self.standardizer.fit_transform(X))
        if self.pca.n_components_ > 1:
            X_pruned = self.rfecv.transform(X_pca)
            return X_pruned
        else:
            return X_pca
Code Example #23
File: modifiers.py Project: KevinLiao159/klearn
 def fit_transform(self, X, y, *args, **kwargs):
     """
     fit_transform method gets called when the ensemble is fitted to data
     It implements _fit to fit base models for different folds and output
     out-of-sample predictions
     """
     # call _fit
     self._fit(X, y, *args, **kwargs)
      # generate out-of-sample predictions and preserve the same order!
     proba_dfs = []
     if isinstance(X, pd.DataFrame):
         for i, (train, test) in enumerate(self.folds):
             df_proba = pd.DataFrame(
                 {
                     'proba':
                     self.fitted_models[i].predict_proba(X.iloc[test])[:, 1]
                 },  # noqa
                 index=test)
             proba_dfs.append(df_proba)
     else:  # X is not a dataframe
         for i, (train, test) in enumerate(self.folds):
             df_proba = pd.DataFrame(
                 {
                     'proba': self.fitted_models[i].predict_proba(
                         X[test])[:, 1]
                 },  # noqa
                 index=test)
             proba_dfs.append(df_proba)
     # concat dfs and revert to origin order
     df_pred = pd.concat(proba_dfs).sort_index()
     # get y_out_of_sample
     y_out_of_sample = force_array(df_pred).reshape((len(df_pred), 1))
     # if need to convert to predict
     if not self.proba:
         y_out_of_sample = y_out_of_sample > 0.5
     return y_out_of_sample
Code Example #24
File: cleaners.py Project: KevinLiao159/klearn
    def fit(self, X, y=None):
        """
        Compute inf-free, nan-free, max, min, mean, and median
        for each column

        Parameters
        ----------
        X: array-like, it only allows int, float, and bool
        """
        # Reset internal state before fitting
        self._reset()
        # make X inf-free
        X = force_array(
            pd.DataFrame(force_array(X)).replace([-np.inf, np.inf], np.nan)
        )
        # HACK: for dealing with bool
        X = np.asarray(X, dtype='float32')
        # compute inf-free, nan-free, max, min, mean, and median for cols
        self.max = np.nanmax(a=force_array(X), axis=0)
        self.min = np.nanmin(a=force_array(X), axis=0)
        self.mean = np.nanmean(a=force_array(X), axis=0)
        self.median = np.nanmedian(a=force_array(X), axis=0)
        return self
Code Example #25
File: ensemble.py Project: KevinLiao159/klearn
 def predict(self, X, *args, **kwargs):
     df_proba = self.predict_proba(X, *args, **kwargs)[:, 1]
     df_pred = df_proba > 0.5
     return force_array(df_pred)
Code Example #26
File: ensemble.py Project: KevinLiao159/klearn
def _base_model_cross_val(model, X, y, cv=None, proba=True, *args, **kwargs):
    """
    A private function that trains each base model for each fold
    and outputs fitted base models, its out-of-fold predictions,
    and array of y (in same order of out-of-fold predictions)
    for fitting ensembler

    Parameters
    ----------
    model : object, base model

    X : array-like, or dataframe

    y : array-like, or dataframe

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

    proba : bool, if True, model will implement predict_proba when it
            gets called

    Returns
    -------
    list of fitted model for each fold, Xt(out-of-fold pred),
        y(matched with Xt)
    """
    # get list of folds of indices
    all_folds = list(check_cv(cv).split(X, y))
    # check data type
    if not isinstance(X, (pd.DataFrame, pd.Series)):
        X = pd.DataFrame(force_array(X))
    if not isinstance(y, (pd.DataFrame, pd.Series)):
        y = pd.DataFrame(force_array(y))
    # iterate each train-fold and fit base model
    fitted_models = [
        fit_model(
            model=deepcopy(model),
            X=X.iloc[train],
            y=y.iloc[train],
            *args,
            **kwargs
        ) for train, test in all_folds
    ]
    # generate out-of-sample predictions and preserve the same order!
    proba_dfs = []
    for i, (train, test) in enumerate(all_folds):
        df_proba = pd.DataFrame(
            {'proba': fitted_models[i].predict_proba(X.iloc[test])[:, 1]},       # noqa
            index=test
        )
        proba_dfs.append(df_proba)
    # concat dfs, sort index, and record index
    df_out_of_sample = pd.concat(proba_dfs).sort_index()
    idx = df_out_of_sample.index.values
    # get pred_out_of_sample
    pred_out_of_sample = \
        force_array(df_out_of_sample).reshape((len(df_out_of_sample), 1))
    # if need to convert to predict
    if not proba:
        pred_out_of_sample = pred_out_of_sample > 0.5
    # get y matched with pred_out_of_sample
    y_out_of_sample = y.iloc[idx]

    return fitted_models, pred_out_of_sample, y_out_of_sample
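For comparison, sklearn's cross_val_predict returns the same kind of out-of-fold probabilities, already in the original row order, which is the effect the index bookkeeping above reproduces by hand:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X = np.random.rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)

oof_proba = cross_val_predict(
    LogisticRegression(), X, y, cv=3, method='predict_proba')[:, 1]
print(oof_proba.shape)    # (100,) -- one out-of-fold probability per sample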
Code Example #27
File: trainers.py Project: KevinLiao159/klearn
    def evaluate(self, X=None, y=None,
                 kind='prediction',
                 scoring=None,
                 aggregator=None,
                 **score_kwargs):
        """
        This is a convenient method for quick evaluating out-of-sample scores

        Parameters
        ----------
        X : X is NOT required

        y : y has to be the same y passed in its train method

        kind : str, one of ['prediction', 'proba']. If 'prediction' is chosen,
            then it will score prediction against out of sample targets
            If 'proba' is chosen, then it will score proba against
            out of sample targets

        scoring : dictionary with {metrics name: metrics callable}
            eg. {'accuracy': sklearn.metrics.accuracy_score}
            Default is accuracy

        aggregator: a function or a callable, to aggregate a vector

        **score_kwargs : this is passed to metrics callable

        Returns
        -------
        score_dict : a dictionary of score
            eg. {
                    'accuracy': [0.84, 0.92, 0.86, 0.78],
                    'roc_auc': [0.72, 0.77, 0.73, 0.69]
                }
        """
        allowed_kind = ['prediction', 'proba']
        if kind not in allowed_kind:
            raise ValueError('kind must be one of {}'.format(allowed_kind))
        if kind == 'prediction':
            check_has_set_attr(self, 'preds_dict')
            y_hat_dict = self.preds_dict
        else:  # kind == 'proba'
            check_has_set_attr(self, 'probas_dict')
            y_hat_dict = self.probas_dict
            for i, y_probas in y_hat_dict.items():
                if np.ndim(y_probas) == 2:
                    y_hat_dict[i] = y_probas[:, -1]
        # check y
        if y is None:
            raise ValueError('You must pass in y')
        else:
            y = force_array(y)
        # check scoring
        if scoring is None:
            scoring = {'accuracy': accuracy_score}
        # score out of sample
        score_dict = {}
        for name, score in scoring.items():
            # get scores for every fold
            scores_list = [
                score(y[self.cv[i][1]], y_hat_dict[i], **score_kwargs)
                for i in range(len(self.cv))
            ]
            # save scores with score name in score_dict
            score_dict = {
                **score_dict,
                **{name: scores_list}
            }
        # aggregator
        if aggregator:
            score_dict = {
                name: aggregator(scores)
                for (name, scores) in score_dict.items()
            }
        return score_dict
Code Example #28
def ts_predefined_split(X=None, y=None, groups=None,
                        test_fold=None,
                        train_window=20 * 52,
                        buff_window=1 * 52,
                        test_window=1 * 52):
    """
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    test_fold : array-like, a list of timestamps marking the beginning of
        each rolling test fold. Default is None, in which case the latest
        available start date is used, i.e. end_date - test_window

    groups (dates) : array-like, index, shape (n_samples, )
        with datetime type
        NOTE: for the time being, it will convert to 'datetime64[D]'
            we could support 'datetime64[ns]' in the future

    train_window : int, default = 20 * 52 (weeks)
        The number of weeks in the training window

    buff_window : int, default = 1 * 52 (weeks)
        The number of weeks to skip between the end of the training window
        and the start of the testing window.

    test_window : int, default = 1 * 52 (weeks)
        The number of weeks in the testing window

    Returns
    -------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    # check group
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))
    # get min/max date
    dates = np.sort(np.array(groups, dtype='datetime64[D]'))
    min_d = dates[0]
    max_d = dates[-1]
    # get test_fold
    if test_fold is None:
        test_start = max_d - np.timedelta64(test_window, 'W')
        test_fold = [test_start]
    else:  # if test_fold is NOT None
        if not isinstance(test_fold, (list, tuple, np.ndarray, pd.Index)):
            test_fold = [test_fold]
    # sort test_fold
    test_fold = np.sort(np.array(test_fold, dtype='datetime64[D]'))
    # check the last test fold
    last_test_start = test_fold[-1]
    if last_test_start >= max_d:
        raise ValueError('No testing data available for the last fold! '
                         'Please re-enter parameters!')
    # check the first training fold
    first_train_end = test_fold[0] - np.timedelta64(buff_window, 'W')
    if first_train_end <= min_d:
        raise ValueError('No training data available for the first fold! '
                         'Please re-enter parameters!')
    # generate index for rolling window folds
    for test_start in test_fold:
        # NOTE: one day will be missing if the window spans a leap year
        # e.g. 2004-12-31 is skipped
        # get test_end
        test_end = test_start + np.timedelta64(test_window, 'W')
        test_end = np.min([max_d, test_end])
        # get train_end
        train_end = test_start - np.timedelta64(buff_window, 'W')
        # get train_start
        train_start = train_end - np.timedelta64(train_window, 'W')
        train_start = np.max([min_d, train_start])
        # get id
        train_start_idx = np.searchsorted(a=dates, v=train_start, side='left')
        train_end_idx = np.searchsorted(a=dates, v=train_end, side='right')
        test_start_idx = np.searchsorted(a=dates, v=test_start, side='left')
        test_end_idx = np.searchsorted(a=dates, v=test_end, side='right')
        yield (np.arange(start=train_start_idx, stop=train_end_idx, step=1),
               np.arange(start=test_start_idx, stop=test_end_idx, step=1))
Code Example #29
def group_train_test_split(X=None, y=None, groups=None,
                           train_size=None, random_state=None):
    """
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups : array-like, index, shape (n_samples, )

    train_size : float, int, or None, default None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to 0.80.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    # check group
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))
    # get unique groups and n_groups
    groups = force_array(groups)
    unique_groups = np.unique(groups)
    n_groups = len(unique_groups)
    # convert train_size to int
    if train_size is None:
        train_size = 0.80
    if train_size < 1:
        train_size = int(train_size * n_groups)
    # sample groups_train
    if random_state:
        np.random.seed(random_state)
    groups_train = np.random.choice(
        a=unique_groups,
        size=train_size,
        replace=False
    )
    # train, test list
    train_filter = force_array(pd.DataFrame(groups).isin(groups_train))
    test_filter = np.logical_not(train_filter)
    train = np.where(train_filter)[0]
    test = np.where(test_filter)[0]
    return [tuple((train, test))]
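A usage sketch for group_train_test_split, assuming the function above and its helpers (force_array, check_consistent_length) are in scope; whole groups land on one side of the split, never both:

import numpy as np

groups = np.repeat(np.arange(10), 20)           # 10 groups, 20 rows each
X = np.random.rand(len(groups), 3)

(train_idx, test_idx), = group_train_test_split(
    X=X, groups=groups, train_size=0.8, random_state=0)
assert not set(groups[train_idx]) & set(groups[test_idx])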
Code Example #30
def _select_top_and_bottom(y_true,
                           y_score,
                           top=50,
                           bottom=50,
                           interpolation='midpoint'):
    """
    Select truth values, predictions, scores of the top and bottom observations

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, ]
        True binary labels in binary label indicators.

    y_score : array, shape = [n_samples, ]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions.

    top, bottom : float, int, or None, default 50.
        If int, it filters the top/bottom n samples
        If float, it should be between 0.0 and 1.0 and it filters the
        top/bottom fraction of each class's data

    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        New in version 0.18.0.
        This optional parameter specifies the interpolation method to use,\
        when the desired quantile lies between two data points i and j:
        linear: i + (j - i) * fraction, where fraction is the fractional part\
        of the index surrounded by i and j.
        lower: i.
        higher: j.
        nearest: i or j whichever is nearest.
        midpoint: (i + j) / 2.

    Returns
    -------
    y_true_ext : array, shape = [n_samples] or [n_samples, ]
        True binary labels in binary label indicators of top and bottom

    y_score_ext : array, shape = [n_samples] or [n_samples, 2]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions of top and bottom.

    y_pred_ext : array, shape = [n_samples] or [n_samples, ]
        Target prediction, can either be 1 or 0, top is always 1 and bottom\
        is always 0.
    """
    # check input
    check_consistent_length(y_true, y_score)
    y_true = force_array(y_true)
    y_score = force_array(y_score)
    n_samples_class_zero = np.sum(y_score < 0.5)
    n_samples_class_one = np.sum(y_score >= 0.5)
    # convert float to int for top and bottom
    if isinstance(top, float):
        if not (0 < top <= 1.0):
            raise ValueError('Warning! top is out of the range (0, 1.0)')
        top = int(round(top * n_samples_class_one))
    if isinstance(bottom, float):
        if not (0 < bottom <= 1.0):
            raise ValueError('Warning! bottom is out of the range (0, 1.0)')
        bottom = int(round(bottom * n_samples_class_zero))
    # filter top and bottom
    top_idx = np.argsort(y_score)[::-1][:top]
    bottom_idx = np.argsort(y_score)[:bottom]
    filter_idx = np.sort(np.concatenate([top_idx, bottom_idx]))
    # filtering
    y_true_ext = y_true[filter_idx]
    y_score_ext = y_score[filter_idx]
    y_pred_ext = y_score_ext >= 0.5
    return y_true_ext, y_score_ext, y_pred_ext