Exemple #1
0
 def _save(self, X, y,
           save_models=True,
           save_predictions=True,
           save_probas=True):
     """
     Gather the out-of-sample outputs of every model in ``model_dict``
     and write models / predictions / probas to disk when the matching
     location is configured

     Parameters
     ----------
     X : features; coerced with ``ensure_2d_array(X, axis=1)`` and then
         indexed with ``self.cv[key][1]`` for every model key

     y : targets; only used for the length check against X and coerced
         the same way as X

     save_models : bool, models are written only if this is truthy and
         ``self.models_location`` is truthy

     save_predictions : bool, stacked predictions are written only if
         this is truthy and ``self.predictions_location`` is truthy

     save_probas : bool, stacked probas are written only if this is
         truthy and ``self.probas_location`` is truthy

     Side effects
     ------------
     Resets ``preds_dict`` and ``probas_dict``; sets
     ``pred_out_of_sample`` / ``proba_out_of_sample`` when the matching
     dictionary is not empty
     """
     # guard: presumably fails when the trainer has not been trained
     # yet -- confirm against check_has_set_attr
     check_has_set_attr(self, 'is_trained')
     check_consistent_length(X, y)
     # single axis input is not handled further down, force 2d
     X = ensure_2d_array(X, axis=1)
     y = ensure_2d_array(y, axis=1)
     if self.save_location is None:
         logger.warning('Warning! Nothing gets saved. '
                        'Please reset save_location '
                        'if you want to write results to disk')
     # start from empty containers on every call
     self.preds_dict = {}
     self.probas_dict = {}
     for fold, fitted in self.model_dict.items():
         if self.models_location and save_models:
             self.save_model(fitted, name='model_{}'.format(fold))
         # self.cv[fold][1] selects the rows scored by this model
         if not hasattr(fitted, 'predict'):
             logger.warning('Model does NOT implement predict')
         else:
             self.preds_dict[fold] = fitted.predict(X[self.cv[fold][1]])
         if not hasattr(fitted, 'predict_proba'):
             logger.warning('Model does NOT implement predict_proba')
         else:
             self.probas_dict[fold] = \
                 fitted.predict_proba(X[self.cv[fold][1]])
     # stack the per-model outputs in dictionary order
     if self.preds_dict:
         self.pred_out_of_sample = \
             np.vstack(list(self.preds_dict.values()))
         if self.predictions_location and save_predictions:
             self.save_prediction(self.pred_out_of_sample)
     if self.probas_dict:
         self.proba_out_of_sample = \
             np.vstack(list(self.probas_dict.values()))
         if self.probas_location and save_probas:
             self.save_proba(self.proba_out_of_sample)
     if self.verbose > 0:
         logger.info('Saving is done')
Exemple #2
0
 def __init__(self,
              estimator,
              pct_threshold=1.0,
              groups=None,
              scoring=None,
              cv=None,
              n_jobs=1,
              verbose=0,
              pre_dispatch='2*n_jobs'):
     """
     Store the constructor arguments on the instance, unchanged

     Parameters
     ----------
     estimator : object checked with
         ``check_has_set_attr(estimator, 'fit')`` before anything is
         stored (presumably rejects objects without ``fit`` -- confirm
         against the helper)

     pct_threshold, groups, scoring, cv, n_jobs, verbose, pre_dispatch :
         kept as same-named attributes; nothing is validated or
         converted here
     """
     check_has_set_attr(estimator, 'fit')
     # what gets fitted and how it is scored
     self.estimator = estimator
     self.scoring = scoring
     self.pct_threshold = pct_threshold
     # how the data is split
     self.cv = cv
     self.groups = groups
     # execution settings
     self.n_jobs = n_jobs
     self.pre_dispatch = pre_dispatch
     self.verbose = verbose
Exemple #3
0
    def evaluate(self, X=None, y=None,
                 level='date',
                 scoring=None,
                 aggregator=None,
                 **score_kwargs):
        """
        This is a convenient method for quick evaluating out-of-sample scores
        from gravity research. The score will be calculated on per level basis

        NOTE it's designed specifically for gravity research. It can be further
            refactored into new forms for other research

        NOTE it does NOT support 'kind', because it always assume using probas
            for scoring evaluations

        Parameters
        ----------
        X : X is NOT required

        y : y has to be the same y passed in its train method

        level : str, one of ['date', 'tradingitemid']

        scoring : dictionary with {metrics name: metrics callable}
            eg. {'accuracy': sklearn.metrics.accuracy_score}
            Default is top_bottom_accuracy_score
            Every callable is invoked as
            ``score(y_column, proba_column, **score_kwargs)``

        aggregator: a function or a callable, to aggregate a vector

        **score_kwargs : this is passed to metrics callable

        Returns
        -------
        score_dict : a dictionary of score. Without aggregator it also
            holds the group labels under the key given by ``level``,
            in the same order as the scores
            eg. with level='date' {
                    'date': ['2007-01-05', '2007-01-12', '2007-01-19'],
                    'accuracy': [0.84, 0.92, 0.86],
                    'roc_auc': [0.72, 0.77, 0.73]
                }
            With aggregator, the label entry is dropped and every
            metric maps to ``aggregator(scores)``

        Raises
        ------
        ValueError : if level is not allowed or y is None
        """
        allowed_level = ['date', 'tradingitemid']
        if level not in allowed_level:
            raise ValueError('level must be one of {}'.format(allowed_level))
        # check y
        if y is None:
            raise ValueError('You must pass in y')
        check_gravity_index(y)
        # join out of sample probas with out of sample ground truth
        check_has_set_attr(self, 'proba_out_of_sample')
        check_gravity_index(self.proba_out_of_sample)
        probas = self.proba_out_of_sample
        if np.ndim(probas) == 2:
            # keep only the last column, as a one-column frame
            probas = probas.iloc[:, -1:]
        elif hasattr(probas, 'to_frame'):
            # a 1-D pandas Series has no ``join`` method; promote it to
            # a one-column frame first
            probas = probas.to_frame()
        # column 0 is the proba, column 1 is the first column of y
        df_join = probas.join(y, how='left')
        # check scoring
        if scoring is None:
            scoring = {'accuracy': top_bottom_accuracy_score}
        # score out of sample
        grouped = df_join.groupby(level=level)
        # take the labels from the groupby itself: groupby orders its keys,
        # while ``index.unique()`` keeps order of appearance, so labels read
        # from the raw index can be misaligned with the per-group scores
        score_dict = {level: grouped.size().index.values}
        for name, score in scoring.items():
            # get scores for every point on level
            score_dict[name] = grouped.apply(
                lambda df, score=score: score(
                    df.iloc[:, 1],
                    df.iloc[:, 0],
                    **score_kwargs)
            ).values
        # aggregator
        if aggregator:
            score_dict = {
                name: aggregator(scores)
                for (name, scores) in score_dict.items() if name != level
            }
        return score_dict
Exemple #4
0
    def _save(self, X, y,
              save_models=True,
              save_predictions=True,
              save_probas=True):
        """
        Gather the out-of-sample outputs of every model in ``model_dict``
        as index-aligned DataFrames and write models / predictions /
        probas to disk when the matching location is configured

        Parameters
        ----------
        X : features. When ``self.is_dataframe`` is falsy and X is not a
            pandas object, it is coerced with ``ensure_2d_array`` and
            wrapped in a DataFrame. Rows are selected positionally with
            ``X.iloc[self.cv[key][1]]``

        y : targets; only used for the length check against X and
            coerced the same way as X

        save_models : bool, models are written only if this is truthy and
            ``self.models_location`` is truthy

        save_predictions : bool, predictions are written only if this is
            truthy and ``self.predictions_location`` is truthy

        save_probas : bool, probas are written only if this is truthy and
            ``self.probas_location`` is truthy

        Side effects
        ------------
        Resets ``preds_dict`` and ``probas_dict``; sets
        ``pred_out_of_sample`` / ``proba_out_of_sample`` (concatenated and
        sorted by index) when the matching dictionary is not empty

        Raises
        ------
        ValueError : raised by ``pd.concat(..., verify_integrity=True)``
            when the per-model outputs have overlapping index values
        """
        # check fitted
        check_has_set_attr(self, 'is_trained')
        # check X, y
        check_consistent_length(X, y)
        if not self.is_dataframe:
            if not isinstance(X, (pd.DataFrame, pd.Series)):
                X = pd.DataFrame(ensure_2d_array(X, axis=1))
            if not isinstance(y, (pd.DataFrame, pd.Series)):
                y = pd.DataFrame(ensure_2d_array(y, axis=1))
        # check locations
        if self.save_location is None:
            logger.warning('Warning! Nothing gets saved. '
                           'Please reset save_location '
                           'if you want to write results to disk')
        # save object
        self.preds_dict = {}
        self.probas_dict = {}
        for i, model in self.model_dict.items():
            # save model -- only when a location is configured, matching
            # the guards used for predictions and probas below
            if self.models_location and save_models:
                self.save_model(model, name='model_{}'.format(i))
            # pred
            if hasattr(model, 'predict'):
                # slice once, reuse for both the values and the index
                X_test = X.iloc[self.cv[i][1]]
                self.preds_dict[i] = pd.DataFrame(
                    model.predict(X_test),
                    index=X_test.index
                )
            else:
                logger.warning('Model does NOT implement predict')
            # probas
            if hasattr(model, 'predict_proba'):
                X_test = X.iloc[self.cv[i][1]]
                self.probas_dict[i] = pd.DataFrame(
                    model.predict_proba(X_test),
                    index=X_test.index
                )
            else:
                logger.warning('Model does NOT implement predict_proba')

        if self.preds_dict:
            preds_list = list(self.preds_dict.values())
            self.pred_out_of_sample = \
                pd.concat(preds_list, verify_integrity=True).sort_index()
            # save pred
            if self.predictions_location and save_predictions:
                self.save_prediction(self.pred_out_of_sample)
        if self.probas_dict:
            probas_list = list(self.probas_dict.values())
            self.proba_out_of_sample = \
                pd.concat(probas_list, verify_integrity=True).sort_index()
            # save probas
            if self.probas_location and save_probas:
                self.save_proba(self.proba_out_of_sample)
        if self.verbose > 0:
            logger.info('Saving is done')
Exemple #5
0
    def evaluate(self, X=None, y=None,
                 kind='prediction',
                 scoring=None,
                 aggregator=None,
                 **score_kwargs):
        """
        This is a convenient method for quick evaluating out-of-sample scores

        Parameters
        ----------
        X : X is NOT required

        y : y has to be the same y passed in its train method

        kind : str, one of ['prediction', 'proba']. If 'prediction' is chosen,
            then it will score prediction against out of sample targets
            If 'proba' is chosen, then it will score proba against
            out of sample targets. For 2-D probas only the last column
            is scored

        scoring : dictionary with {metrics name: metrics callable}
            eg. {'accuracy': sklearn.metrics.accuracy_score}
            Default is accuracy
            Every callable is invoked as
            ``score(y_fold, y_hat_fold, **score_kwargs)``

        aggregator: a function or a callable, to aggregate a vector

        **score_kwargs : this is passed to metrics callable

        Returns
        -------
        score_dict : a dictionary of score, one value per fold
            eg. {
                    'accuracy': [0.84, 0.92, 0.86, 0.78],
                    'roc_auc': [0.72, 0.77, 0.73, 0.69]
                }
            With aggregator, every metric maps to ``aggregator(scores)``

        Raises
        ------
        ValueError : if kind is not allowed or y is None
        """
        allowed_kind = ['prediction', 'proba']
        if kind not in allowed_kind:
            raise ValueError('kind must be one of {}'.format(allowed_kind))
        if kind == 'prediction':
            check_has_set_attr(self, 'preds_dict')
            y_hat_dict = self.preds_dict
        else:  # kind == 'proba'
            check_has_set_attr(self, 'probas_dict')
            # build a new mapping so the stored probas keep all their
            # columns; np.ndim (not np.dim) reports the array rank
            y_hat_dict = {
                i: y_probas[:, -1] if np.ndim(y_probas) == 2 else y_probas
                for i, y_probas in self.probas_dict.items()
            }
        # check y
        if y is None:
            raise ValueError('You must pass in y')
        y = force_array(y)
        # check scoring
        if scoring is None:
            scoring = {'accuracy': accuracy_score}
        # score out of sample
        score_dict = {}
        for name, score in scoring.items():
            # get scores for every folds; self.cv[i][1] selects the rows
            # of y that fold i was evaluated on
            score_dict[name] = [
                score(y[self.cv[i][1]], y_hat_dict[i], **score_kwargs)
                for i in range(len(self.cv))
            ]
        # aggregator
        if aggregator:
            score_dict = {
                name: aggregator(scores)
                for (name, scores) in score_dict.items()
            }
        return score_dict
Exemple #6
0
 def get_probas_dict(self):
     """
     Return ``self.probas_dict``

     ``check_has_set_attr`` runs first; presumably it fails when the
     attribute has not been set yet -- confirm against the helper
     """
     attr = 'probas_dict'
     check_has_set_attr(self, attr)
     return getattr(self, attr)
Exemple #7
0
 def get_out_of_sample_probas(self):
     """
     Return ``self.proba_out_of_sample``

     ``check_has_set_attr`` runs first; presumably it fails when the
     attribute has not been set yet -- confirm against the helper
     """
     attr = 'proba_out_of_sample'
     check_has_set_attr(self, attr)
     return getattr(self, attr)
Exemple #8
0
 def get_out_of_sample_predictions(self):
     """
     Return ``self.pred_out_of_sample``

     ``check_has_set_attr`` runs first; presumably it fails when the
     attribute has not been set yet -- confirm against the helper
     """
     attr = 'pred_out_of_sample'
     check_has_set_attr(self, attr)
     return getattr(self, attr)
Exemple #9
0
 def get_trained_model_dict(self):
     """
     Return ``self.model_dict`` once the ``is_trained`` check passes

     Unlike ``get_model_dict`` the guard is on ``is_trained`` rather
     than on ``model_dict`` itself; presumably ``check_has_set_attr``
     fails when the attribute is not set -- confirm against the helper
     """
     check_has_set_attr(self, 'is_trained')
     trained_models = self.model_dict
     return trained_models
Exemple #10
0
 def get_model_dict(self):
     """
     Return ``self.model_dict``

     ``check_has_set_attr`` runs first; presumably it fails when the
     attribute has not been set yet -- confirm against the helper
     """
     attr = 'model_dict'
     check_has_set_attr(self, attr)
     return getattr(self, attr)
Exemple #11
0
 def save_proba(self, proba, name='proba'):
     """
     Write ``proba`` to ``<probas_location>/<name>.pkl``

     Parameters
     ----------
     proba : object handed to ``save_object``

     name : file name without the ``.pkl`` extension

     The target directory is created first, parents included, and an
     already existing directory is not an error
     """
     check_has_set_attr(self, 'probas_location')
     target_dir = self.probas_location
     pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)
     filename = '{}.pkl'.format(name)
     save_object(proba, os.path.join(target_dir, filename))
Exemple #12
0
 def save_model(self, model, name='model'):
     """
     Write ``model`` to ``<models_location>/<name>.pkl``

     Parameters
     ----------
     model : object handed to ``save_object``

     name : file name without the ``.pkl`` extension

     The target directory is created first, parents included, and an
     already existing directory is not an error
     """
     check_has_set_attr(self, 'models_location')
     target_dir = self.models_location
     pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)
     filename = '{}.pkl'.format(name)
     save_object(model, os.path.join(target_dir, filename))