Example #1
def ts_train_test_split(X=None, y=None, groups=None, train_window=None,
                        buff_window=380, test_window=365):
    """
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups (dates) : array-like, index, shape (n_samples,)
        with datetime type
        NOTE: dates have to be sorted (ascending)

    train_window : int or None, default = None
        The number of days in the training window. If None, the training
        window starts at the earliest available date.

    buff_window : int, default = 380 (days)
        The number of days to skip between the end of the training window
        and the start of the testing window.

    test_window : int, default = 365 (days)
        The number of days in the testing window

    Returns
    -------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    # check group
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))
    # get min/max date
    dates = groups
    # min_d = min(dates)
    max_d = max(dates)
    # get test start date:
    test_start = max_d - timedelta(days=test_window)
    # get train end date
    train_end = test_start - timedelta(days=buff_window)
    # get train start date
    if train_window is None:
        train_start = min(dates)
    else:  # train_window is given
        train_start = train_end - timedelta(days=train_window)
    # get id
    train_start_id = np.searchsorted(a=dates, v=train_start, side='left')
    train_end_id = np.searchsorted(a=dates, v=train_end, side='right')
    test_start_id = np.searchsorted(a=dates, v=test_start, side='left')
    end_id = np.searchsorted(a=dates, v=max_d, side='right')
    # train, test list
    train = np.arange(start=train_start_id, stop=train_end_id, step=1)
    test = np.arange(start=test_start_id, stop=end_id, step=1)
    return [tuple((train, test))]
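A minimal usage sketch for ts_train_test_split, assuming the function above and its module-level imports (numpy as np, datetime.timedelta) are in scope; the date range and window sizes below are illustrative only.

# Hypothetical usage -- three years of daily, ascending dates
# (np.searchsorted assumes the dates are sorted).
from datetime import datetime, timedelta

dates = [datetime(2015, 1, 1) + timedelta(days=d) for d in range(3 * 365)]

cv = ts_train_test_split(groups=dates, buff_window=30, test_window=90)
train_idx, test_idx = cv[0]
# the training window ends 30 days before the 90-day test window starts
print(dates[train_idx[-1]], dates[test_idx[0]], dates[test_idx[-1]])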
Example #2
 def _train(self, X, y):
     # check X, y
     check_consistent_length(X, y)
     self._type_check(X, y)
     if not self.is_dataframe:
         if not isinstance(X, (pd.DataFrame, pd.Series)):
             X = ensure_2d_array(X, axis=1)
             X = pd.DataFrame(X)
         if not isinstance(y, (pd.DataFrame, pd.Series)):
             y = ensure_2d_array(y, axis=1)
             y = pd.DataFrame(y)
     # check cv
     self.cv = list(self.cv.split(X, y))
     # parallel
     parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
     func = delayed(fit_model)
     fitted_model_list = parallel(
         func(
             model,
             X.iloc[self.cv[i][0]],
             y.iloc[self.cv[i][0]])
         for (i, model) in self.model_dict.items()
     )
     # update models
     fitted_model_list = iter(fitted_model_list)
     self.model_dict = {
         i: next(fitted_model_list)
         for i in range(len(self.cv))
     }
     if self.verbose > 0:
         logger.info('Training is done!')
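The core of _train -- one model fitted per CV fold through joblib -- can be reproduced standalone. A minimal sketch assuming scikit-learn and joblib are available; fit_model here is a local stand-in, not the library's helper.

# Standalone sketch: fit one model per fold in parallel with joblib.
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

def fit_model(model, X, y):                     # stand-in helper
    return model.fit(X, np.ravel(y))

X = np.random.rand(100, 3)
y = (X[:, 0] > 0.5).astype(int)
cv = list(KFold(n_splits=3).split(X, y))
model_dict = {i: clone(LogisticRegression()) for i in range(len(cv))}

fitted = Parallel(n_jobs=1)(
    delayed(fit_model)(model, X[cv[i][0]], y[cv[i][0]])
    for i, model in model_dict.items())
model_dict = dict(zip(range(len(cv)), fitted))  # fold index -> fitted model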
Example #3
 def _save(self, X, y,
           save_models=True,
           save_predictions=True,
           save_probas=True):
     # check fitted
     check_has_set_attr(self, 'is_trained')
     # check X, y
     check_consistent_length(X, y)
     # force 2-D arrays so 1-D inputs need no special handling
     X = ensure_2d_array(X, axis=1)
     y = ensure_2d_array(y, axis=1)
     # check locations
     if self.save_location is None:
         logger.warning('Warning! Nothing gets saved. '
                        'Please reset save_location '
                        'if you want to write results to disk')
     # save object
     self.preds_dict = {}
     self.probas_dict = {}
     for i, model in self.model_dict.items():
         # save model
         if self.models_location and save_models:
             self.save_model(model, name='model_{}'.format(i))
         # predict
         if hasattr(model, 'predict'):
             self.preds_dict = {
                 **self.preds_dict,
                 **{i: model.predict(X[self.cv[i][1]])}
             }
         else:
             logger.warning('Model does NOT implement predict')
         # predict_proba
         if hasattr(model, 'predict_proba'):
             self.probas_dict = {
                 **self.probas_dict,
                 **{i: model.predict_proba(X[self.cv[i][1]])}
             }
         else:
             logger.warning('Model does NOT implement predict_proba')
     # collect data
     if self.preds_dict:
         preds_list = list(self.preds_dict.values())
         self.pred_out_of_sample = np.vstack(preds_list)
         # save pred
         if self.predictions_location and save_predictions:
             self.save_prediction(self.pred_out_of_sample)
     if self.probas_dict:
         probas_list = list(self.probas_dict.values())
         self.proba_out_of_sample = np.vstack(probas_list)
         # save probas
         if self.probas_location and save_probas:
             self.save_proba(self.proba_out_of_sample)
     if self.verbose > 0:
         logger.info('Saving is done')
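The predict/predict_proba branches above rely on plain hasattr checks to decide what each model can produce. A small standalone illustration with two scikit-learn estimators (the estimators are my choice, not the library's):

# Sketch: probing estimator capabilities the way _save does.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

for est in (LogisticRegression(), LinearSVC()):
    print(type(est).__name__,
          hasattr(est, 'predict'),         # True for both
          hasattr(est, 'predict_proba'))   # False for LinearSVC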
Example #4
    def split(self, X=None, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.

        groups (dates) : array-like, index, shape (n_samples, )
            with datetime type
            NOTE: dates have to be sorted (ascending)

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        # check group
        if groups is None:
            raise ValueError('You have to pass in groups')
        if X is not None:
            check_consistent_length(force_array(X), force_array(groups))
        # get dates
        dates = groups
        tot_len = self.train_window + self.buff_window + self.test_window
        min_d = min(dates)
        max_d = max(dates)
        day_shift = ((max_d - min_d).days - tot_len)/(self.n_splits - 1)

        starts = \
            [min_d + timedelta(days=day_shift*i) for i in range(self.n_splits)]

        for start in starts:
            end_train, start_test, end = self._return_winds(start)
            # get id
            start_id = np.searchsorted(a=dates, v=start, side='left')
            end_train_id = np.searchsorted(a=dates, v=end_train, side='right')
            start_test_id = np.searchsorted(a=dates, v=start_test, side='left')
            end_id = np.searchsorted(a=dates, v=end, side='right')
            yield (
                np.arange(start=start_id, stop=end_train_id, step=1),          # noqa
                np.arange(start=start_test_id, stop=end_id, step=1))           # noqa
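Each yielded fold is just a pair of index ranges obtained from np.searchsorted over the sorted dates: side='left' keeps the boundary date at the start of a window and side='right' keeps it at the end. A self-contained numpy sketch of that mapping (the dates are arbitrary):

# Standalone sketch: converting date boundaries into positional indices.
import numpy as np

dates = np.arange('2020-01-01', '2020-01-11', dtype='datetime64[D]')  # sorted
start = np.datetime64('2020-01-02')
end_train = np.datetime64('2020-01-05')

start_id = np.searchsorted(dates, start, side='left')           # 1
end_train_id = np.searchsorted(dates, end_train, side='right')  # 5
print(dates[np.arange(start_id, end_train_id)])
# 2020-01-02 ... 2020-01-05 -- both boundary dates are included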
Example #5
 def _train(self, X, y):
     # check X, y
     check_consistent_length(X, y)
     X = ensure_2d_array(X, axis=1)
     y = ensure_2d_array(y, axis=1)
     # check cv
     self.cv = list(self.cv.split(X, y))
     # parallel
     parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
     func = delayed(fit_model)
     fitted_model_list = parallel(
         func(model, X[self.cv[i][0]], y[self.cv[i][0]])
         for (i, model) in self.model_dict.items()
     )
     # update models
     fitted_model_list = iter(fitted_model_list)
     self.model_dict = {
         i: next(fitted_model_list)
         for i in range(len(self.cv))
     }
     if self.verbose > 0:
         logger.info('Training is done!')
Example #6
    def scatterplot(self, x, y, group=None, legend_name=None):
        """plot scatter plot

        Parameters
        ----------
        x : array-like, one column (feature) of the data

        y : array-like, another column (feature) of the data

        group : array-like, the label of each data point; it must have
            the same length as x and y

        legend_name : dict mapping each unique value in group to a display
            name, eg. {
                    label_one: 'type one',
                    label_two: 'type two'
                }

        Returns
        -------
        A renderable plot
        """
        # check length
        x = force_array(x)
        y = force_array(y)
        check_consistent_length(x, y)
        if group is not None:
            group = force_array(group)
            check_consistent_length(x, y, group)
            # get all unique values from group
            unique_groups = np.unique(group)

            # check other args
            if legend_name is not None:
                check_consistent_length(unique_groups, list(legend_name))
            elif legend_name is None:
                legend_name = {v: v for v in unique_groups}

            # store data
            data = []
            for grp in unique_groups:
                data.append(
                    go.Scattergl(x=x[group == grp],
                                 y=y[group == grp],
                                 name=legend_name[grp],
                                 mode='markers'))
        elif group is None:
            trace = go.Scattergl(x=x, y=y, mode='markers')
            data = [trace]

        layout = go.Layout(title=self.title,
                           yaxis=self.yaxis,
                           xaxis=self.xaxis,
                           width=self.width,
                           height=self.height,
                           margin=go.layout.Margin(t=65, b=60, pad=4))
        fig = go.Figure(data=data, layout=layout)
        plotly.offline.iplot(fig)
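A self-contained sketch of the grouped Scattergl traces the method builds, assuming plotly is installed; the data, labels, and title below are invented.

# Minimal grouped scatter with plotly graph_objs (illustrative data).
import numpy as np
import plotly
import plotly.graph_objs as go

x = np.random.rand(50)
y = np.random.rand(50)
group = np.random.choice(['a', 'b'], size=50)
legend_name = {'a': 'type one', 'b': 'type two'}

data = [go.Scattergl(x=x[group == grp], y=y[group == grp],
                     name=legend_name[grp], mode='markers')
        for grp in np.unique(group)]
fig = go.Figure(data=data, layout=go.Layout(title='scatter demo'))
plotly.offline.plot(fig)  # iplot(fig) inside a notebook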
Example #7
def ts_predefined_split(X=None, y=None, groups=None,
                        test_fold=None,
                        train_window=20 * 52,
                        buff_window=1 * 52,
                        test_window=1 * 52):
    """
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    test_fold : array-like, a list of timestamps marking the beginning of
        each rolling test fold. Default is None, in which case the latest
        available start date is used, i.e. end_date - test_window

    groups (dates) : array-like, index, shape (n_samples, )
        with datetime type
        NOTE: for the time being, it will convert to 'datetime64[D]'
            we could support 'datetime64[ns]' in the future

    train_window : int, default = 20 * 52 (weeks)
        The number of weeks in the training window

    buff_window : int, default = 1 * 52 (weeks)
        The number of weeks to skip between the end of the training window
        and the start of the testing window.

    test_window : int, default = 1 * 52 (weeks)
        The number of weeks in the testing window

    Returns
    -------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    # check group
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))
    # get min/max date
    dates = np.sort(np.array(groups, dtype='datetime64[D]'))
    min_d = dates[0]
    max_d = dates[-1]
    # get test_fold
    if test_fold is None:
        test_start = max_d - np.timedelta64(test_window, 'W')
        test_fold = [test_start]
    else:  # if test_fold is NOT None
        if not isinstance(test_fold, (list, tuple, np.ndarray, pd.Index)):
            test_fold = [test_fold]
    # sort test_fold
    test_fold = np.sort(np.array(test_fold, dtype='datetime64[D]'))
    # check the last test fold
    last_test_start = test_fold[-1]
    if last_test_start >= max_d:
        raise ValueError('No testing data available for the last fold! '
                         'Please re-enter parameters!')
    # check the first training fold
    first_train_end = test_fold[0] - np.timedelta64(buff_window, 'W')
    if first_train_end <= min_d:
        raise ValueError('No training data available for the first fold! '
                         'Please re-enter parameters!')
    # generate index for rolling window folds
    for test_start in test_fold:
        # NOTE: one day goes missing when the fold spans a leap year
        # (e.g. 2004-12-31 is skipped)
        # get test_end
        test_end = test_start + np.timedelta64(test_window, 'W')
        test_end = np.min([max_d, test_end])
        # get train_end
        train_end = test_start - np.timedelta64(buff_window, 'W')
        # get train_start
        train_start = train_end - np.timedelta64(train_window, 'W')
        train_start = np.max([min_d, train_start])
        # get id
        train_start_idx = np.searchsorted(a=dates, v=train_start, side='left')
        train_end_idx = np.searchsorted(a=dates, v=train_end, side='right')
        test_start_idx = np.searchsorted(a=dates, v=test_start, side='left')
        test_end_idx = np.searchsorted(a=dates, v=test_end, side='right')
        yield (np.arange(start=train_start_idx, stop=train_end_idx, step=1),
               np.arange(start=test_start_idx, stop=test_end_idx, step=1))
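A usage sketch of ts_predefined_split with two explicit fold starts, assuming the function above and its imports (numpy as np, pandas as pd) are in scope; the dates and window sizes are invented and deliberately small.

# Hypothetical usage -- two rolling folds over two years of daily dates.
import pandas as pd

dates = pd.date_range('2019-01-01', '2020-12-31', freq='D')
folds = ts_predefined_split(
    groups=dates,
    test_fold=['2020-06-01', '2020-09-01'],         # start of each test fold
    train_window=26, buff_window=2, test_window=8)  # in weeks

for train_idx, test_idx in folds:
    print(dates[train_idx[0]], dates[train_idx[-1]],
          dates[test_idx[0]], dates[test_idx[-1]])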
Example #8
def group_train_test_split(X=None, y=None, groups=None,
                           train_size=None, random_state=None):
    """
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups : array-like, index, shape (n_samples, )

    train_size : float, int, or None, default None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to 0.80.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    # check group
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))
    # get unique groups and n_groups
    groups = force_array(groups)
    unique_groups = np.unique(groups)
    n_groups = len(unique_groups)
    # convert train_size to int
    if train_size is None:
        train_size = 0.80
    if train_size < 1:
        train_size = int(train_size * n_groups)
    # sample groups_train
    if random_state is not None:  # honour a seed of 0 as well
        np.random.seed(random_state)
    groups_train = np.random.choice(
        a=unique_groups,
        size=train_size,
        replace=False
    )
    # train, test list
    train_filter = force_array(pd.DataFrame(groups).isin(groups_train))
    test_filter = np.logical_not(train_filter)
    train = np.where(train_filter)[0]
    test = np.where(test_filter)[0]
    return [tuple((train, test))]
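A usage sketch showing the point of the group split: every group ends up entirely in train or entirely in test. The group labels are invented, and the function above plus its helpers (force_array behaving like np.asarray) are assumed to be importable.

# Hypothetical usage -- five entities, four rows each.
import numpy as np

groups = np.repeat(['A', 'B', 'C', 'D', 'E'], 4)   # 20 rows, 5 groups
train, test = group_train_test_split(groups=groups,
                                     train_size=0.6,
                                     random_state=7)[0]
# no entity appears on both sides of the split
assert not set(groups[train]) & set(groups[test])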
Example #9
def _select_top_and_bottom(y_true,
                           y_score,
                           top=50,
                           bottom=50,
                           interpolation='midpoint'):
    """
    Select truth values, predictions, scores of the top and bottom observations

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, ]
        True binary labels in binary label indicators.

    y_score : array, shape = [n_samples, ]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions.

    top, bottom : float, int, or None, default 50.
        If int, the top/bottom n samples are kept.
        If float, it should be between 0.0 and 1.0 and the top/bottom
        fraction of each class is kept.

    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        New in version 0.18.0.
        This optional parameter specifies the interpolation method to use
        when the desired quantile lies between two data points i and j:
        linear: i + (j - i) * fraction, where fraction is the fractional
        part of the index surrounded by i and j.
        lower: i.
        higher: j.
        nearest: i or j whichever is nearest.
        midpoint: (i + j) / 2.

    Returns
    -------
    y_true_ext : array, shape = [n_samples] or [n_samples, ]
        True binary labels in binary label indicators of top and bottom

    y_score_ext : array, shape = [n_samples] or [n_samples, 2]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions of top and bottom.

    y_pred_ext : array, shape = [n_samples] or [n_samples, ]
        Target prediction, can either be 1 or 0; top is always 1 and
        bottom is always 0.
    """
    # check input
    check_consistent_length(y_true, y_score)
    y_true = force_array(y_true)
    y_score = force_array(y_score)
    n_samples_class_zero = np.sum(y_score < 0.5)
    n_samples_class_one = np.sum(y_score >= 0.5)
    # convert float to int for top and bottom
    if isinstance(top, float):
        if not (0 < top <= 1.0):
            raise ValueError('Warning! top is out of the range (0, 1.0)')
        top = int(round(top * n_samples_class_one))
    if isinstance(bottom, float):
        if not (0 < bottom <= 1.0):
            raise ValueError('Warning! bottom is out of the range (0, 1.0)')
        bottom = int(round(bottom * n_samples_class_zero))
    # filter top and bottom
    top_idx = np.argsort(y_score)[::-1][:top]
    bottom_idx = np.argsort(y_score)[:bottom]
    filter_idx = np.sort(np.concatenate([top_idx, bottom_idx]))
    # filtering
    y_true_ext = y_true[filter_idx]
    y_score_ext = y_score[filter_idx]
    y_pred_ext = y_score_ext >= 0.5
    return y_true_ext, y_score_ext, y_pred_ext
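The selection itself is a small argsort trick: take the highest- and lowest-scoring rows, then restore the original order. A self-contained numpy sketch with made-up scores:

# Standalone sketch of the top/bottom filtering used above.
import numpy as np

y_score = np.array([0.95, 0.10, 0.60, 0.40, 0.88, 0.05, 0.72, 0.30])
top, bottom = 2, 2

top_idx = np.argsort(y_score)[::-1][:top]   # two highest scores
bottom_idx = np.argsort(y_score)[:bottom]   # two lowest scores
keep = np.sort(np.concatenate([top_idx, bottom_idx]))
print(y_score[keep])            # [0.95 0.1  0.88 0.05]
y_pred = y_score[keep] >= 0.5   # top rows predict 1, bottom rows predict 0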
Example #10
def gravity_evaluate(df_true, df_score, level='date', scoring=None,
                     aggregator=None, **score_kwargs):
        """
        This is a wrapper function for quick scoring.
        NOTE: it is specifically for gravity research.

        Parameters
        ----------
        df_true : dataframe, gravity outcomes data with gravity index

        df_score : dataframe, gravity trainer out of sample probas with
            gravity index

        level : str, one of ['date', 'tradingitemid']

        scoring : dictionary with {metrics name: metrics callable}
            eg. {'accuracy': sklearn.metrics.accuracy_score}
            Default is top_bottom_accuracy_score

        aggregator : callable, used to aggregate each per-level score vector
            into a single value

        **score_kwargs : this is passed to metrics callable

        Returns
        -------
        score_dict : a dictionary of score
            eg. {
                    'level': ['2007-01-05', '2007-01-12', '2007-01-19'],
                    'accuracy': [0.84, 0.92, 0.86],
                    'roc_auc': [0.72, 0.77, 0.73]
                }
        """
        allowed_level = ['date', 'tradingitemid']
        if level not in allowed_level:
            raise ValueError('level must be one of {}'.format(allowed_level))
        # check input data
        check_consistent_length(df_true, df_score)
        check_gravity_index(df_true)
        check_gravity_index(df_score)
        # check ndim of df_score
        if np.ndim(df_score) == 2:
            df_join = df_score.iloc[:, -1:].join(df_true, how='left')
        else:  # else if ndim is 1
            df_join = df_score.join(df_true, how='left')
        # check scoring
        if scoring is None:
            scoring = {'accuracy': top_bottom_accuracy_score}
        # score out of sample
        score_dict = \
            {level: df_join.index.get_level_values(level).unique().values}
        for name, score in scoring.items():
            # get scores for every point on level
            scores_list = df_join.groupby(level=level).apply(
                lambda df: score(
                    df.iloc[:, 1],
                    df.iloc[:, 0],
                    **score_kwargs)
            ).values
            # save scores with score name in score_dict
            score_dict = {
                **score_dict,
                **{name: scores_list}
            }
        # aggregator
        if aggregator:
            score_dict = {
                name: aggregator(scores)
                for (name, scores) in score_dict.items() if name != level
            }
        return score_dict
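Stripped of the gravity-specific checks, the scoring loop is a groupby/apply over the joined frame. A generic, self-contained sketch of that pattern with pandas and scikit-learn (the data, column names, and metric are placeholders):

# Standalone sketch: score a metric per date group.
import pandas as pd
from sklearn.metrics import accuracy_score

df = pd.DataFrame({
    'date': ['2007-01-05'] * 4 + ['2007-01-12'] * 4,
    'proba': [0.9, 0.2, 0.7, 0.4, 0.1, 0.8, 0.6, 0.3],
    'truth': [1, 0, 1, 1, 0, 1, 0, 0],
}).set_index('date')

scores = df.groupby(level='date').apply(
    lambda g: accuracy_score(g['truth'], g['proba'] >= 0.5))
score_dict = {'date': scores.index.values, 'accuracy': scores.values}
print(score_dict)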
Example #11
    def _save(self, X, y,
              save_models=True,
              save_predictions=True,
              save_probas=True):
        # check fitted
        check_has_set_attr(self, 'is_trained')
        # check X, y
        check_consistent_length(X, y)
        if not self.is_dataframe:
            if not isinstance(X, (pd.DataFrame, pd.Series)):
                X = ensure_2d_array(X, axis=1)
                X = pd.DataFrame(X)
            if not isinstance(y, (pd.DataFrame, pd.Series)):
                y = ensure_2d_array(y, axis=1)
                y = pd.DataFrame(y)
        # check locations
        if self.save_location is None:
            logger.warning('Warning! Nothing gets saved. '
                           'Please reset save_location '
                           'if you want to write results to disk')
        # save object
        self.preds_dict = {}
        self.probas_dict = {}
        for i, model in self.model_dict.items():
            # save model
            if save_models:
                self.save_model(model, name='model_{}'.format(i))
            # pred
            if hasattr(model, 'predict'):
                self.preds_dict = {
                    **self.preds_dict,
                    **{
                        i: pd.DataFrame(
                            model.predict(X.iloc[self.cv[i][1]]),
                            index=X.iloc[self.cv[i][1]].index
                        )
                    }
                }
            else:
                logger.warning('Model does NOT implement predict')
            # probas
            if hasattr(model, 'predict_proba'):
                self.probas_dict = {
                    **self.probas_dict,
                    **{
                        i: pd.DataFrame(
                            model.predict_proba(X.iloc[self.cv[i][1]]),
                            index=X.iloc[self.cv[i][1]].index
                        )
                    }
                }
            else:
                logger.warning('Model does NOT implement predict_proba')

        if self.preds_dict:
            preds_list = list(self.preds_dict.values())
            self.pred_out_of_sample = \
                pd.concat(preds_list, verify_integrity=True).sort_index()
            # save pred
            if self.predictions_location and save_predictions:
                self.save_prediction(self.pred_out_of_sample)
        if self.probas_dict:
            probas_list = list(self.probas_dict.values())
            self.proba_out_of_sample = \
                pd.concat(probas_list, verify_integrity=True).sort_index()
            # save probas
            if self.probas_location and save_probas:
                self.save_proba(self.proba_out_of_sample)
        if self.verbose > 0:
            logger.info('Saving is done')
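The verify_integrity=True flag in the pd.concat calls above is what guards against a row being predicted by more than one fold: overlapping indices raise a ValueError instead of silently duplicating rows. A small sketch with made-up per-fold frames:

# Sketch: verify_integrity rejects overlapping fold indices.
import pandas as pd

fold_0 = pd.DataFrame({'pred': [0.2, 0.7]}, index=[0, 1])
fold_1 = pd.DataFrame({'pred': [0.4, 0.9]}, index=[2, 3])
ok = pd.concat([fold_0, fold_1], verify_integrity=True).sort_index()

overlapping = pd.DataFrame({'pred': [0.5]}, index=[1])  # index 1 reused
try:
    pd.concat([fold_0, overlapping], verify_integrity=True)
except ValueError as err:
    print('duplicate index rejected:', err)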