def _save(self, X, y, save_models=True, save_predictions=True,
          save_probas=True):
    # check fitted
    check_has_set_attr(self, 'is_trained')
    # check X, y
    check_consistent_length(X, y)
    # coerce to 2-d so downstream code never handles single-axis arrays
    X = ensure_2d_array(X, axis=1)
    y = ensure_2d_array(y, axis=1)
    # check locations
    if self.save_location is None:
        logger.warning('Warning! Nothing gets saved. '
                       'Please reset save_location '
                       'if you want to write results to disk')
    # save object
    self.preds_dict = {}
    self.probas_dict = {}
    for i, model in self.model_dict.items():
        # save model
        if self.models_location and save_models:
            self.save_model(model, name='model_{}'.format(i))
        # predict
        if hasattr(model, 'predict'):
            self.preds_dict = {
                **self.preds_dict,
                **{i: model.predict(X[self.cv[i][1]])}
            }
        else:
            logger.warning('Model does NOT implement predict')
        # predict_proba
        if hasattr(model, 'predict_proba'):
            self.probas_dict = {
                **self.probas_dict,
                **{i: model.predict_proba(X[self.cv[i][1]])}
            }
        else:
            logger.warning('Model does NOT implement predict_proba')
    # collect data
    if self.preds_dict:
        preds_list = list(self.preds_dict.values())
        self.pred_out_of_sample = np.vstack(preds_list)
        # save pred
        if self.predictions_location and save_predictions:
            self.save_prediction(self.pred_out_of_sample)
    if self.probas_dict:
        probas_list = list(self.probas_dict.values())
        self.proba_out_of_sample = np.vstack(probas_list)
        # save probas
        if self.probas_location and save_probas:
            self.save_proba(self.proba_out_of_sample)
    if self.verbose > 0:
        logger.info('Saving is done')
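# NOTE (assumption inferred from the indexing above, not stated in the source):
# self.cv is expected to be an indexable collection of (train_indices,
# test_indices) pairs, so self.cv[i][1] selects the out-of-sample rows for
# fold i. A minimal sketch of building such a structure with scikit-learn,
# for illustration only:
#
#     from sklearn.model_selection import KFold
#     cv = list(KFold(n_splits=5).split(X))  # [(train_idx, test_idx), ...]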
def __init__(self, estimator, pct_threshold=1.0, groups=None,
             scoring=None, cv=None, n_jobs=1, verbose=0,
             pre_dispatch='2*n_jobs'):
    check_has_set_attr(estimator, 'fit')
    self.estimator = estimator
    self.pct_threshold = pct_threshold
    self.groups = groups
    self.scoring = scoring
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.pre_dispatch = pre_dispatch
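# Construction sketch (illustrative only; `CrossValidationTrainer` stands in
# for this class's actual name, and `cv` is the (train_idx, test_idx) list
# sketched above):
#
#     from sklearn.ensemble import RandomForestClassifier
#     trainer = CrossValidationTrainer(
#         estimator=RandomForestClassifier(),  # must implement fit
#         cv=cv,
#         n_jobs=-1,
#         verbose=1,
#     )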
def evaluate(self, X=None, y=None, level='date', scoring=None,
             aggregator=None, **score_kwargs):
    """
    Convenience method for quickly evaluating out-of-sample scores
    from gravity research. Scores are calculated on a per-level basis.

    NOTE it is designed specifically for gravity research. It can be
    refactored into new forms for other research.

    NOTE it does NOT support 'kind', because it always assumes probas
    are used for scoring evaluations.

    Parameters
    ----------
    X : X is NOT required

    y : y has to be the same y passed to the train method

    level : str, one of ['date', 'tradingitemid']

    scoring : dictionary of {metric name: metric callable},
        e.g. {'accuracy': sklearn.metrics.accuracy_score}.
        Default is top_bottom_accuracy_score

    aggregator : a function or callable used to aggregate a vector

    **score_kwargs : passed through to each metric callable

    Returns
    -------
    score_dict : a dictionary of scores, e.g.
        {
            'level': ['2007-01-05', '2007-01-12', '2007-01-19'],
            'accuracy': [0.84, 0.92, 0.86],
            'roc_auc': [0.72, 0.77, 0.73]
        }
    """
    allowed_level = ['date', 'tradingitemid']
    if level not in allowed_level:
        raise ValueError('level must be one of {}'.format(allowed_level))
    # check y
    if y is None:
        raise ValueError('You must pass in y')
    else:
        check_gravity_index(y)
    # join out-of-sample probas with out-of-sample ground truth
    check_has_set_attr(self, 'proba_out_of_sample')
    check_gravity_index(self.proba_out_of_sample)
    # check ndim of self.proba_out_of_sample
    if np.ndim(self.proba_out_of_sample) == 2:
        df_join = self.proba_out_of_sample.iloc[:, -1:].join(y, how='left')
    else:  # ndim is 1
        df_join = self.proba_out_of_sample.join(y, how='left')
    # check scoring
    if scoring is None:
        scoring = {'accuracy': top_bottom_accuracy_score}
    # score out of sample
    score_dict = \
        {level: df_join.index.get_level_values(level).unique().values}
    for name, score in scoring.items():
        # get scores for every point on the level
        scores_list = df_join.groupby(level=level).apply(
            lambda df: score(
                df.iloc[:, 1], df.iloc[:, 0], **score_kwargs)
        ).values
        # save scores under the score name in score_dict
        score_dict = {
            **score_dict,
            **{name: scores_list}
        }
    # aggregator
    if aggregator:
        score_dict = {
            name: aggregator(scores)
            for (name, scores) in score_dict.items()
            if name != level
        }
    return score_dict
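# Usage sketch for the gravity-research evaluate (illustrative only; assumes
# a trained instance named `trainer` and a y whose index carries a 'date'
# level, per check_gravity_index):
#
#     score_dict = trainer.evaluate(
#         y=y,                  # same y passed to the train method
#         level='date',
#         scoring={'accuracy': top_bottom_accuracy_score},
#         aggregator=np.mean,   # collapse per-date scores to one number
#     )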
def _save(self, X, y, save_models=True, save_predictions=True,
          save_probas=True):
    # check fitted
    check_has_set_attr(self, 'is_trained')
    # check X, y
    check_consistent_length(X, y)
    if not self.is_dataframe:
        if not isinstance(X, (pd.DataFrame, pd.Series)):
            X = ensure_2d_array(X, axis=1)
            X = pd.DataFrame(X)
        if not isinstance(y, (pd.DataFrame, pd.Series)):
            y = ensure_2d_array(y, axis=1)
            y = pd.DataFrame(y)
    # check locations
    if self.save_location is None:
        logger.warning('Warning! Nothing gets saved. '
                       'Please reset save_location '
                       'if you want to write results to disk')
    # save object
    self.preds_dict = {}
    self.probas_dict = {}
    for i, model in self.model_dict.items():
        # save model
        if save_models:
            self.save_model(model, name='model_{}'.format(i))
        # pred
        if hasattr(model, 'predict'):
            self.preds_dict = {
                **self.preds_dict,
                **{
                    i: pd.DataFrame(
                        model.predict(X.iloc[self.cv[i][1]]),
                        index=X.iloc[self.cv[i][1]].index
                    )
                }
            }
        else:
            logger.warning('Model does NOT implement predict')
        # probas
        if hasattr(model, 'predict_proba'):
            self.probas_dict = {
                **self.probas_dict,
                **{
                    i: pd.DataFrame(
                        model.predict_proba(X.iloc[self.cv[i][1]]),
                        index=X.iloc[self.cv[i][1]].index
                    )
                }
            }
        else:
            logger.warning('Model does NOT implement predict_proba')
    if self.preds_dict:
        preds_list = list(self.preds_dict.values())
        self.pred_out_of_sample = \
            pd.concat(preds_list, verify_integrity=True).sort_index()
        # save pred
        if self.predictions_location and save_predictions:
            self.save_prediction(self.pred_out_of_sample)
    if self.probas_dict:
        probas_list = list(self.probas_dict.values())
        self.proba_out_of_sample = \
            pd.concat(probas_list, verify_integrity=True).sort_index()
        # save probas
        if self.probas_location and save_probas:
            self.save_proba(self.proba_out_of_sample)
    if self.verbose > 0:
        logger.info('Saving is done')
def evaluate(self, X=None, y=None, kind='prediction', scoring=None,
             aggregator=None, **score_kwargs):
    """
    Convenience method for quickly evaluating out-of-sample scores.

    Parameters
    ----------
    X : X is NOT required

    y : y has to be the same y passed to the train method

    kind : str, one of ['prediction', 'proba'].
        If 'prediction' is chosen, predictions are scored against
        out-of-sample targets.
        If 'proba' is chosen, probas are scored against out-of-sample
        targets.

    scoring : dictionary of {metric name: metric callable},
        e.g. {'accuracy': sklearn.metrics.accuracy_score}.
        Default is accuracy

    aggregator : a function or callable used to aggregate a vector

    **score_kwargs : passed through to each metric callable

    Returns
    -------
    score_dict : a dictionary of scores, e.g.
        {
            'accuracy': [0.84, 0.92, 0.86, 0.78],
            'roc_auc': [0.72, 0.77, 0.73, 0.69]
        }
    """
    allowed_kind = ['prediction', 'proba']
    if kind not in allowed_kind:
        raise ValueError('kind must be one of {}'.format(allowed_kind))
    if kind == 'prediction':
        check_has_set_attr(self, 'preds_dict')
        y_hat_dict = self.preds_dict
    else:  # kind == 'proba'
        check_has_set_attr(self, 'probas_dict')
        y_hat_dict = self.probas_dict
    for i, y_probas in y_hat_dict.items():
        # keep only the last (positive-class) column of a 2-d proba array
        if np.ndim(y_probas) == 2:
            y_hat_dict[i] = y_probas[:, -1]
    # check y
    if y is None:
        raise ValueError('You must pass in y')
    else:
        y = force_array(y)
    # check scoring
    if scoring is None:
        scoring = {'accuracy': accuracy_score}
    # score out of sample
    score_dict = {}
    for name, score in scoring.items():
        # get scores for every fold
        scores_list = [
            score(y[self.cv[i][1]], y_hat_dict[i], **score_kwargs)
            for i in range(len(self.cv))
        ]
        # save scores under the score name in score_dict
        score_dict = {
            **score_dict,
            **{name: scores_list}
        }
    # aggregator
    if aggregator:
        score_dict = {
            name: aggregator(scores)
            for (name, scores) in score_dict.items()
        }
    return score_dict
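# Usage sketch for the fold-wise evaluate (illustrative only; assumes a
# trained instance named `trainer`):
#
#     from sklearn.metrics import roc_auc_score
#     score_dict = trainer.evaluate(
#         y=y,
#         kind='proba',                       # score probas, fold by fold
#         scoring={'roc_auc': roc_auc_score},
#         aggregator=np.mean,                 # average across folds
#     )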
def get_probas_dict(self):
    # check fitted
    check_has_set_attr(self, 'probas_dict')
    return self.probas_dict
def get_out_of_sample_probas(self):
    # check fitted
    check_has_set_attr(self, 'proba_out_of_sample')
    return self.proba_out_of_sample
def get_out_of_sample_predictions(self):
    # check fitted
    check_has_set_attr(self, 'pred_out_of_sample')
    return self.pred_out_of_sample
def get_trained_model_dict(self):
    # check fitted
    check_has_set_attr(self, 'is_trained')
    return self.model_dict
def get_model_dict(self):
    # check fitted
    check_has_set_attr(self, 'model_dict')
    return self.model_dict
def save_proba(self, proba, name='proba'):
    check_has_set_attr(self, 'probas_location')
    pathlib.Path(self.probas_location).mkdir(parents=True, exist_ok=True)
    filepath = os.path.join(self.probas_location, '{}.pkl'.format(name))
    save_object(proba, filepath)
def save_model(self, model, name='model'):
    check_has_set_attr(self, 'models_location')
    pathlib.Path(self.models_location).mkdir(parents=True, exist_ok=True)
    filepath = os.path.join(self.models_location, '{}.pkl'.format(name))
    save_object(model, filepath)
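# Usage sketch (illustrative only): save_model pickles one fitted model under
# models_location, creating the directory if needed. The direct attribute
# assignment below is hypothetical; how models_location is normally set is
# not shown in this section:
#
#     trainer.models_location = '/tmp/models'      # hypothetical path
#     trainer.save_model(fitted_model, name='model_0')
#     # -> writes /tmp/models/model_0.pkl via save_object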