def save_start_time(self, seed):
    """Write the current timestamp to the start-time file for ``seed``.

    Args:
        seed: Value passed to ``self._get_start_time_filename`` to obtain
            the target file name.

    Returns:
        The file name returned by ``self._get_start_time_filename(seed)``,
        i.e. the file the timestamp was written to.
    """
    # Capture the time first so that resolving the file name does not
    # delay the recorded value.
    now = time.time()
    target = self._get_start_time_filename(seed)
    write_file(target, str(now))
    return target
def load_model_and_predict(model_file, X, y=None, task="classification",
                           scoring=None, save_prediction_types='dataframe',
                           backend=None, output_path='tmp'):
    """Predict on a dataset with a saved model; save predictions and scores.

    Args:
        model_file: str
            Path to the trained model file (loaded with ``joblib.load``).
        X: array-like
            The data to predict on. Can be for example a list, or an array.
        y: array-like, optional (default: None)
            The true targets. When given, they are paired with the
            predictions in the saved output and, if ``scoring`` is set,
            used to compute scores.
        task: str, 'classification' or 'regression' (default: classification)
            Model's task type. ``None`` means "infer from the model type".
        scoring: str or callable or a list of them or None (default: None)
            A string (see model evaluation documentation) or a scorer
            callable object / function with signature
            ``scorer(estimator, X, y)``. If None, a default list matching
            the task is used.
        save_prediction_types: str or [str] (default: dataframe)
            Output format(s) for the saved predictions. Each entry must be
            in ``backend.valid_predictions_type``; the original docs list
            ["npy", "txt", "dataframe"].
        backend: Backend object (default: None)
            MLBackend object which defined output_path, environment
            configuration, save_predictions, and so on. If None, one is
            created with ``create_ml_backend(output_path=output_path)``.
        output_path: str (default: 'tmp')
            Output path of PredictPipeline, if is None or 'tmp', use the
            default output path:
            '/tmp/amlearn/task_%pid/output_%timestamp'.

    Returns:
        predictions: np.array
            Raw predictions from the trained model (``predict_proba``,
            ``decision_function`` or ``predict`` output for classifiers,
            ``predict`` output for regressors).

    Raises:
        TypeError: If the model is neither a RegressorMixin nor a
            ClassifierMixin, or its type does not match ``task``.
        ValueError: If ``task`` or a prediction type is unknown.
    """
    if backend is None:
        backend = create_ml_backend(output_path=output_path)

    # NOTE(review): joblib.load unpickles arbitrary objects; only load model
    # files that come from a trusted source.
    model = joblib.load(model_file)

    if isinstance(model, RegressorMixin):
        if task == 'regression' or task is None:
            task = 'regression'
            if scoring is None:
                scoring = [
                    'r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'
                ]
        else:
            raise TypeError('Model type of model_file is "regression", '
                            'but the task parameter is not. Please make '
                            'sure these two match.')
    elif isinstance(model, ClassifierMixin):
        if task == 'classification' or task is None:
            task = 'classification'
            if scoring is None:
                scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall']
        else:
            raise TypeError('Model type of model_file is "classification",'
                            'but the task parameter is not. Please make '
                            'sure these two match.')
    else:
        raise TypeError('Model must be instance of RegressorMixin or '
                        'ClassifierMixin.')

    if task == 'classification':
        if hasattr(model, 'predict_proba'):
            predictions = model.predict_proba(X)
        elif hasattr(model, 'decision_function'):
            predictions = model.decision_function(X)
        else:
            predictions = model.predict(X)
        # Only 2-D outputs (e.g. predict_proba) have a second column to pick;
        # 1-D outputs from decision_function / predict are used as they are
        # instead of failing on ``[:, 1]``.
        saved_predictions = \
            predictions[:, 1] if np.ndim(predictions) > 1 else predictions
    elif task == 'regression':
        predictions = model.predict(X)
        saved_predictions = predictions
    else:
        raise ValueError('task only support classification or regression')

    targets_and_predictions = \
        np.array(list(zip(y, saved_predictions))) \
        if y is not None else saved_predictions

    if scoring and y is not None:
        scores, _ = calc_scores(X=X, y=y, estimator=model, scoring=scoring)
        write_file(
            os.path.join(backend.output_path, 'scores.txt'),
            '{}\n{}'.format(
                ','.join(['dataset'] + list(scores.keys())),
                ','.join(['predict'] + list(map(str, scores.values())))))

    if not isinstance(save_prediction_types, list_like()):
        save_prediction_types = [save_prediction_types]
    for predict_type in save_prediction_types:
        if predict_type not in backend.valid_predictions_type:
            raise ValueError('predict_type {} is unknown, '
                             'Possible values are {}'.format(
                                 predict_type,
                                 backend.valid_predictions_type))
        save_predictions = getattr(
            backend, 'save_predictions_as_{}'.format(predict_type))
        save_predictions(targets_and_predictions, subdir='')
    return predictions
def _fit_cv(self, X, y, random_state=None, scoring=None, cv_num=1,
            cv_params=None, val_size=0.3, save_model=False, save_score=True,
            save_prediction=False, prediction_types='dataframe',
            save_feature_importances=True, save_train_val_idx=False,
            **fit_params):
    """Fit ``self.classifier`` with cross validation and save the outputs.

    With ``cv_num > 1`` the fit goes through ``cross_validate``; otherwise a
    single train/validation fit goes through ``self._fit``. The best fold
    (highest ``test_<self.score_name>``) is stored in ``self.best_score_``,
    ``self.best_model_`` and ``self.best_model_tag_``.

    Args:
        X: Training data. Indexed with ``[]`` if it is an ``np.ndarray``,
            with ``.iloc`` otherwise.
        y: Targets, indexed like ``X``.
        random_state: Seed passed to ``np.random.seed`` (cv) or ``self._fit``.
        scoring: Scoring specification forwarded to the fitting helper.
        cv_num: Number of folds. If ``cv_params`` also has 'cv_num', the
            larger of the two is used.
        cv_params: Extra keyword arguments for ``cross_validate``. The dict
            is copied, never modified in place.
        val_size: Validation size forwarded to ``self._fit``.
        save_model, save_score, save_prediction, save_feature_importances,
        save_train_val_idx: Select which artifacts are written under
            ``<backend.output_path>/<self.imblearn_tag>/cv_<i>``.
        prediction_types: str or list of str, each of which must be in
            ``self.backend.valid_predictions_type``.
        **fit_params: Forwarded to the fitting helper. Non-None 'X_val' and
            'y_val' entries are used as the prediction set when saving
            predictions.

    Returns:
        tuple: ``(results, scorers)`` as returned by the fitting helper.

    Raises:
        ValueError: If a requested prediction type is unknown.
    """
    if not self.imblearn:
        self.backend.logger.info('Start Cross Validation.')
    cv_start_time = time.time()

    # Work on a private copy: the pops below must not leak into the caller's
    # dict, which may be reused for several calls (e.g. one per imblearn
    # subset).
    cv_params = {} if cv_params is None else dict(cv_params)

    # If user's cv_params contains 'cv_num' parameter, use the max value
    # between function parameter 'cv_num' and cv_params's 'cv_num'.
    if 'cv_num' in cv_params:
        cv_num = max(cv_num, cv_params.pop('cv_num'))
    cv_params.pop('scoring', None)
    return_train_score = cv_params.get('return_train_score', True)

    if cv_num > 1:
        np.random.seed(random_state)
        classifier_params = \
            appropriate_kwargs(fit_params, self.classifier.fit)
        results, scorers = \
            cross_validate(estimator=self.classifier, scoring=scoring,
                           fit_params=classifier_params, X=X, y=y,
                           cv=cv_num, **cv_params)
    else:
        results, scorers = self._fit(X, y, self.classifier,
                                     val_size=val_size,
                                     return_train_score=return_train_score,
                                     random_state=random_state,
                                     scoring=scoring, **fit_params)
        cv_num = 1

    # TODO: now if scoring is more than one, score_name only can be the
    # first of them.
    self.score_name = self.score_name if hasattr(self, 'score_name') \
        else list(scorers.keys())[0]
    model_tags = [''] if cv_num == 1 \
        else ["cv_{}".format(i) for i in range(cv_num)]
    # key=... compares scores only, so ties never compare estimators.
    self.best_score_, (self.best_model_, self.best_model_tag_) = \
        max(zip(results['test_{}'.format(self.score_name)],
                zip(results['estimators'], model_tags)),
            key=lambda x: x[0])

    if not self.imblearn:
        self.backend.logger.info(
            "\tCV classification finish in {:.4f} seconds.".format(
                time.time() - cv_start_time))

    if save_model or save_score or save_train_val_idx or save_prediction \
            or save_feature_importances:
        score_names = list(scorers.keys())
        header = ','.join(['dataset'] + score_names)

        imblearn_output_path = \
            os.path.join(self.backend.output_path, self.imblearn_tag)
        create_path(imblearn_output_path)
        if save_score:
            write_file(
                os.path.join(imblearn_output_path, 'mean_scores.txt'),
                '{}\n{}\n{}'.format(
                    header,
                    ','.join(['test'] + [
                        str(np.mean(results['test_{}'.format(name)]))
                        for name in score_names
                    ]),
                    # Train scores are only read when they were requested;
                    # otherwise the line is the placeholder -1.
                    ','.join(['train'] + [
                        str(np.mean(results['train_{}'.format(name)]))
                        for name in score_names
                    ]) if return_train_score else -1))

        check_path_while_saving(self.backend.output_path)

        if save_prediction and \
                not isinstance(prediction_types, list_like()):
            prediction_types = [prediction_types]

        for cv_idx in range(cv_num):
            sub_path = os.path.join(self.imblearn_tag,
                                    "cv_{}".format(cv_idx))
            cv_output_path = \
                os.path.join(self.backend.output_path, sub_path)
            create_path(cv_output_path)

            if save_score:
                write_file(
                    os.path.join(cv_output_path, 'scores.txt'),
                    '{}\n{}\n{}'.format(
                        header,
                        ','.join(['test'] + [
                            str(results['test_{}'.format(name)][cv_idx])
                            for name in score_names
                        ]),
                        ','.join(['train'] + [
                            str(results['train_{}'.format(name)][cv_idx])
                            for name in score_names
                        ]) if return_train_score else -1))

            score_model = results['estimators'][cv_idx]
            if save_model:
                self.backend.save_model(score_model, sub_path)
            if save_feature_importances:
                self.backend.save_json(
                    self.feature_importances_dict(score_model), sub_path,
                    name='feature_importances')

            if save_train_val_idx:
                train_idx = results['indices'][cv_idx][0]
                val_idx = results['indices'][cv_idx][1]
                write_file(os.path.join(cv_output_path, 'train_idx.txt'),
                           "\n".join(list(map(str, train_idx))))
                write_file(os.path.join(cv_output_path, 'val_idx.txt'),
                           "\n".join(list(map(str, val_idx))))

            if save_prediction:
                # An explicit validation set counts only when it is really
                # there; callers may pass X_val=None / y_val=None to mean
                # "no separate validation set".
                if fit_params.get('X_val') is not None \
                        and fit_params.get('y_val') is not None:
                    test_X = fit_params['X_val']
                    test_y = fit_params['y_val']
                else:
                    fold_val_idx = results['indices'][cv_idx][1]
                    test_X = X[fold_val_idx] \
                        if isinstance(X, np.ndarray) \
                        else X.iloc[fold_val_idx]
                    test_y = y[fold_val_idx] \
                        if isinstance(y, np.ndarray) \
                        else y.iloc[fold_val_idx]

                if hasattr(score_model, 'predict_proba'):
                    predictions = score_model.predict_proba(test_X)
                elif hasattr(score_model, 'decision_function'):
                    predictions = score_model.decision_function(test_X)
                else:
                    predictions = score_model.predict(test_X)

                # Only 2-D outputs have a second column to select; 1-D
                # outputs (decision_function / predict) are used directly.
                saved_predictions = predictions[:, 1] \
                    if np.ndim(predictions) > 1 else predictions
                targets_and_predictions = \
                    np.array(list(zip(test_y, saved_predictions)))

                for predict_type in prediction_types:
                    if predict_type not in \
                            self.backend.valid_predictions_type:
                        raise ValueError(
                            'predict_type {} is unknown, '
                            'Possible values are {}'.format(
                                predict_type,
                                self.backend.valid_predictions_type))
                    save_predictions = getattr(
                        self.backend,
                        'save_predictions_as_{}'.format(predict_type))
                    save_predictions(targets_and_predictions, sub_path)

    return results, scorers
def _fit_imblearn(self, X, y, random_state=None, scoring=None,
                  imblearn_method=None, imblearn_params=None, cv_num=1,
                  cv_params=None, val_size=0.3, save_model=True,
                  save_score=True, save_prediction=True,
                  prediction_types='dataframe', save_train_val_idx=True,
                  save_feature_importances=True, **fit_params):
    """Resample the data with imblearn and run ``_fit_cv`` on every subset.

    The data is resampled by ``ImblearnPreprocessor().fit``. A 2-D result is
    treated as one subset; a 3-D result as ``X.shape[0]`` subsets. Each
    subset is fitted with ``self._fit_cv`` under the tag
    ``imblearn_<idx>``. The best-scoring model over all subsets ends up in
    ``self.best_score_``, ``self.best_model_`` and ``self.best_model_tag_``.

    Args:
        X: Training data. Must support ``.loc`` when 'train_idx'/'val_idx'
            are given in ``fit_params``.
        y: Targets, indexed like ``X``.
        random_state: Seed forwarded to imblearn (unless ``imblearn_params``
            already sets 'random_state') and to ``_fit_cv``.
        scoring: Scoring specification forwarded to ``_fit_cv``.
        imblearn_method: Resampling method name (default 'EasyEnsemble').
        imblearn_params: Parameters for the resampler. The dict is copied,
            never modified in place. Defaults to
            ``{"random_state": random_state, "n_subsets": 3}``.
        cv_num, cv_params, val_size: Forwarded to ``_fit_cv``.
        save_model, save_score, save_prediction, prediction_types,
        save_train_val_idx, save_feature_importances: Forwarded to
            ``_fit_cv``; ``save_score`` also controls the overall
            ``mean_scores.txt`` in ``self.backend.output_path``.
        **fit_params: Forwarded to ``_fit_cv``. If both 'train_idx' and
            'val_idx' are present and ``cv_num <= 1``, they split ``X``/``y``
            into a training part (resampled) and a fixed validation part.

    Returns:
        self

    Raises:
        ValueError: If the resampled ``X`` is neither 2-D nor 3-D.
    """
    self.backend.logger.info('Start Imblearn.')
    imblearn_start_time = time.time()
    imblearn = ImblearnPreprocessor()

    if imblearn_method is None:
        imblearn_method = 'EasyEnsemble'

    # Copy so the random_state default is not written into the caller's dict.
    imblearn_params = \
        {"random_state": random_state, "n_subsets": 3} \
        if imblearn_params is None else dict(imblearn_params)
    imblearn_params.setdefault('random_state', random_state)

    if 'train_idx' in fit_params and 'val_idx' in fit_params \
            and cv_num <= 1:
        train_idx = fit_params['train_idx']
        val_idx = fit_params['val_idx']
        X_val = copy(X.loc[val_idx])
        y_val = copy(y.loc[val_idx])
        X = X.loc[train_idx]
        y = y.loc[train_idx]
    else:
        X_val = None
        y_val = None

    X, y = imblearn.fit(X, y, imblearn_method, imblearn_params)

    score_model_list = list()
    # get the imblearn n_subsets num from X shape.
    if len(X.shape) == 2:
        n_subsets = 1
        X = [X]
        y = [y]
    elif len(X.shape) == 3:
        n_subsets = X.shape[0]
    else:
        raise ValueError("imblearn result error!")

    # Stack the subsets once instead of re-copying everything per subset.
    X_subsets = np.array(X)
    y_subsets = np.array(y)

    self.backend.logger.info(
        '\tData imblearn finished in {:.4f} seconds.'.format(
            time.time() - imblearn_start_time))

    # Forward the fixed validation set only when there is one: a None
    # 'X_val' entry would be taken by _fit_cv as a real validation set, and
    # passing X_val/y_val next to **fit_params could collide with keys
    # already present there.
    cv_fit_params = dict(fit_params)
    if X_val is not None and y_val is not None:
        cv_fit_params['X_val'] = X_val
        cv_fit_params['y_val'] = y_val

    all_results = dict()
    return_train_score = cv_params.get('return_train_score', True) \
        if cv_params is not None else True

    for imblearn_idx in range(n_subsets):
        self.backend.logger.info(
            'Start imblearn_{} classification.'.format(imblearn_idx))
        start_time = time.time()
        X_imb = X_subsets[imblearn_idx, :, :]
        y_imb = y_subsets[imblearn_idx, :]
        self.imblearn_tag = 'imblearn_{}'.format(imblearn_idx)

        results, scorers = self._fit_cv(
            X=X_imb, y=y_imb, random_state=random_state, scoring=scoring,
            cv_params=cv_params, cv_num=cv_num, val_size=val_size,
            save_model=save_model, save_score=save_score,
            save_prediction=save_prediction,
            prediction_types=prediction_types,
            save_feature_importances=save_feature_importances,
            save_train_val_idx=save_train_val_idx, **cv_fit_params)

        # Collect per-fold scores of every subset in plain lists owned by
        # this function, so accumulation always concatenates (never adds
        # element-wise) and never modifies ``results``.
        for score_name in scorers.keys():
            test_key = 'test_{}'.format(score_name)
            train_key = 'train_{}'.format(score_name)
            if test_key in all_results:
                if return_train_score:
                    all_results[train_key].extend(results[train_key])
                all_results[test_key].extend(results[test_key])
            else:
                all_results[train_key] = list(results[train_key]) \
                    if return_train_score else [-1]
                all_results[test_key] = list(results[test_key])

        score_model_list.append(
            (self.best_score_,
             (self.best_model_,
              "imblearn_{}_{}".format(imblearn_idx,
                                      self.best_model_tag_))))
        self.backend.logger.info(
            "\tImblearn_{} classification finish in {:.4f} seconds.".format(
                imblearn_idx, time.time() - start_time))

    if save_score:
        # NOTE(review): looks like a leftover debug print; kept so console
        # output stays as before - consider routing it to the logger.
        print([
            all_results['test_{}'.format(score_name)]
            for score_name in scorers.keys()
        ])
        write_file(
            os.path.join(self.backend.output_path, 'mean_scores.txt'),
            '{}\n{}\n{}'.format(
                ','.join(['dataset'] + list(scorers.keys())),
                ','.join(['test'] + [
                    str(np.mean(np.array(
                        all_results['test_{}'.format(score_name)])))
                    for score_name in scorers.keys()
                ]),
                ','.join(['train'] + [
                    str(np.mean(np.array(
                        all_results['train_{}'.format(score_name)])))
                    for score_name in scorers.keys()
                ])))

    # key=... compares scores only, so ties never compare model objects.
    self.best_score_, (self.best_model_, self.best_model_tag_) = \
        max(score_model_list, key=lambda x: x[0])
    self.backend.logger.info('Whole classification finish in {:.4f} '
                             'seconds.'.format(time.time() -
                                               imblearn_start_time))
    return self
def _fit_cv(self, X, y, val_size=0.3, random_state=None, scoring=None,
            cv_num=1, cv_params=None, save_train_val_idx=True,
            save_model=True, save_score=True, save_prediction=True,
            prediction_types='dataframe', save_feature_importances=True,
            **fit_params):
    """Fit ``self.regressor`` with cross validation and save the outputs.

    With ``cv_num > 1`` the fit goes through ``cross_validate``; otherwise a
    single train/validation fit goes through ``self._fit``. The best fold
    (highest ``test_<self.score_name>``) is stored in ``self.best_score_``,
    ``self.best_model_`` and ``self.best_model_tag_``.

    Args:
        X: Training data. Indexed with ``[]`` if it is an ``np.ndarray``,
            with ``.iloc`` otherwise.
        y: Targets, indexed like ``X``.
        val_size: Validation size forwarded to ``self._fit``.
        random_state: Seed for ``np.random.seed`` (cv, skipped when it is
            ``False``) or for ``self._fit``.
        scoring: Scoring specification forwarded to the fitting helper.
        cv_num: Number of folds. If ``cv_params`` also has 'cv_num', the
            larger of the two is used.
        cv_params: Extra keyword arguments for ``cross_validate``. The dict
            is copied, never modified in place.
        save_train_val_idx, save_model, save_score, save_prediction,
        save_feature_importances: Select which artifacts are written under
            ``<backend.output_path>/cv_<i>``.
        prediction_types: str or list of str, each of which must be in
            ``self.backend.valid_predictions_type``.
        **fit_params: Forwarded to the fitting helper.

    Returns:
        self

    Raises:
        ValueError: If a requested prediction type is unknown.
    """
    self.backend.logger.info('Start Cross Validation.')
    cv_start_time = time.time()

    # Work on a private copy so the pops below do not alter the caller's
    # dict.
    cv_params = {} if cv_params is None else dict(cv_params)

    # If user's cv_params contains 'cv_num' parameter, use the max value
    # between function parameter 'cv_num' and cv_params's 'cv_num'.
    if 'cv_num' in cv_params:
        cv_num = max(cv_num, cv_params.pop('cv_num'))
    cv_params.pop('scoring', None)
    return_train_score = cv_params.get('return_train_score', True)

    if cv_num > 1:
        # random_state=False means "leave numpy's global RNG alone".
        if random_state is not False:
            np.random.seed(random_state)
        results, scorers = \
            cross_validate(estimator=self.regressor, scoring=scoring,
                           fit_params=fit_params, X=X, y=y, cv=cv_num,
                           **cv_params)
    else:
        results, scorers = self._fit(
            X, y, self.regressor, val_size=val_size,
            return_train_score=return_train_score,
            random_state=random_state, scoring=scoring, **fit_params)
        cv_num = 1

    # TODO: now if scorers list length is more than 1, score_name only can
    # be the first of them.
    self.score_name = self.score_name if hasattr(self, 'score_name') \
        else list(scorers.keys())[0]
    model_tags = [''] if cv_num == 1 \
        else ["cv_{}".format(i) for i in range(cv_num)]
    # key=... compares scores only, so ties never compare estimators.
    self.best_score_, (self.best_model_, self.best_model_tag_) = \
        max(zip(results['test_{}'.format(self.score_name)],
                zip(results['estimators'], model_tags)),
            key=lambda x: x[0])

    self.backend.logger.info(
        "\tCV regression finish in {:.4f} seconds.".format(
            time.time() - cv_start_time))

    score_names = list(scorers.keys())
    header = ','.join(['dataset'] + score_names)

    if save_score:
        write_file(
            os.path.join(self.backend.output_path, 'mean_scores.txt'),
            '{}\n{}\n{}'.format(
                header,
                ','.join(['test'] + [
                    str(np.mean(results['test_{}'.format(name)]))
                    for name in score_names
                ]),
                # Train scores are only read when they were requested;
                # otherwise the line is the placeholder -1.
                ','.join(['train'] + [
                    str(np.mean(results['train_{}'.format(name)]))
                    for name in score_names
                ]) if return_train_score else -1))

    if save_prediction and not isinstance(prediction_types, list_like()):
        prediction_types = [prediction_types]

    for cv_idx in range(cv_num):
        cv_tag = "cv_{}".format(cv_idx)
        cv_output_path = os.path.join(self.backend.output_path, cv_tag)
        create_path(cv_output_path, merge=True)

        if save_score:
            write_file(
                os.path.join(cv_output_path, 'scores.txt'),
                '{}\n{}\n{}'.format(
                    header,
                    ','.join(['test'] + [
                        str(results['test_{}'.format(name)][cv_idx])
                        for name in score_names
                    ]),
                    ','.join(['train'] + [
                        str(results['train_{}'.format(name)][cv_idx])
                        for name in score_names
                    ]) if return_train_score else -1))

        score_model = results['estimators'][cv_idx]
        if save_model:
            self.backend.save_model(score_model, cv_tag)
        if save_feature_importances:
            self.backend.save_json(
                self.feature_importances_dict(score_model), cv_tag,
                name='feature_importances')

        if save_train_val_idx:
            train_idx = results['indices'][cv_idx][0]
            val_idx = results['indices'][cv_idx][1]
            write_file(os.path.join(cv_output_path, 'train_idx.txt'),
                       "\n".join(list(map(str, train_idx))))
            write_file(os.path.join(cv_output_path, 'val_idx.txt'),
                       "\n".join(list(map(str, val_idx))))

        if save_prediction:
            fold_val_idx = results['indices'][cv_idx][1]
            # ndarray inputs have no .iloc; index them directly.
            val_X = X[fold_val_idx] if isinstance(X, np.ndarray) \
                else X.iloc[fold_val_idx]
            val_y = y[fold_val_idx] if isinstance(y, np.ndarray) \
                else y.iloc[fold_val_idx]
            predictions = score_model.predict(val_X)
            targets_and_predictions = \
                np.array(list(zip(val_y, predictions)))

            for predict_type in prediction_types:
                if predict_type not in self.backend.valid_predictions_type:
                    raise ValueError(
                        'predict_type {} is unknown, '
                        'Possible values are {}'.format(
                            predict_type,
                            self.backend.valid_predictions_type))
                save_predictions = getattr(
                    self.backend,
                    'save_predictions_as_{}'.format(predict_type))
                save_predictions(targets_and_predictions, cv_tag)
    return self