def _fe_fit_and_score(estimator, X, y, scorers, train, test, verbose,
                      parameters, fit_params, return_train_score=False,
                      return_parameters=False, fold_specific_X_extractor=None):
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if fold_specific_X_extractor:
        # extend by fold-specific features
        X_train_additional = fold_specific_X_extractor(train)
        if X_train_additional is not None:
            X_train = np.concatenate([X_train, X_train_additional], axis=1)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    test_scores = [_score(estimator, X_test, y_test, scorer)
                   for scorer in scorers]
    if return_train_score:
        train_scores = [_score(estimator, X_train, y_train, scorer)
                        for scorer in scorers]

    if verbose > 2:
        msg += ", scores=%s" % test_scores
    if verbose > 1:
        print("[CV] %s %s" % ((64 - len(msg)) * '.', msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params=None):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if num_samples(train) == 0 or num_samples(test) == 0:
        raise RuntimeError(
            'Cross validation error in fit_estimator. The total data set '
            'contains %d elements, which were split into a training set '
            'of %d elements and a test set of %d elements. Unfortunately, '
            'you can\'t have a %s set with 0 elements.' % (
                num_samples(X), num_samples(train), num_samples(test),
                'training' if num_samples(train) == 0 else 'test'))

    # adjust length of sample weights
    n_samples = num_samples(X)
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, np.asarray(v)[train]
                        if hasattr(v, '__len__') and len(v) == n_samples
                        else v)
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    # fit and score
    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    test_score = _score(estimator, X_test, y_test, scorer)
    train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    msmbuilder_api = is_msmbuilder_estimator(estimator)
    n_samples_test = num_samples(X_test, is_nested=msmbuilder_api)
    n_samples_train = num_samples(X_train, is_nested=msmbuilder_api)

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return (test_score, n_samples_test, train_score, n_samples_train,
            scoring_time)
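# The helper above slices any fit parameter whose length matches the data (for
# example sample_weight) down to the training indices before fitting. Below is a
# minimal sketch of that indexing using only public scikit-learn API; the data,
# weights and estimator are illustrative assumptions, not taken from the snippet.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_demo = rng.randn(20, 3)
y_demo = rng.randint(0, 2, 20)
sample_weight = np.linspace(0.5, 1.5, 20)
train_idx = np.arange(15)

clf_demo = LogisticRegression()
# Same idea as the dict comprehension above: per-sample params are indexed by `train`.
clf_demo.fit(X_demo[train_idx], y_demo[train_idx],
             sample_weight=sample_weight[train_idx])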
def _rfe_train_test(rfe, estimator, X, y, train, test, scorer):
    """Return the train and test scores for a fit across one fold."""
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    train_score = rfe._fit(
        X_train, y_train,
        lambda estimator, features: _score(
            estimator, X_train[:, features], y_train, scorer)).scores_
    test_score = rfe._fit(
        X_train, y_train,
        lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer)).scores_

    return train_score, test_score
def _fit_and_score_grid(estimator, X, y, scorer, train, test, grid,
                        fit_params, error_score='raise'):
    '''Fit the estimator on one train/test split and score it for every
    parameter setting in ``grid``.'''
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        estimator.fit(X_train, y_train, **fit_params)
    except Exception:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            scores = [error_score] * len(grid)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        origParams = estimator.get_params()
        scores = [_score(estimator.set_params(**params), X_test, y_test,
                         scorer)
                  for params in grid]
        estimator.set_params(**origParams)

    return scores
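# Hypothetical usage sketch for the helper above: build `grid` with scikit-learn's
# public ParameterGrid and score one KFold split. The estimator, scorer and data
# here are illustrative assumptions, not part of the original snippet.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import KFold, ParameterGrid

X_demo, y_demo = load_iris(return_X_y=True)
grid = list(ParameterGrid({'C': [0.1, 1.0, 10.0]}))
scorer = get_scorer('accuracy')
train_idx, test_idx = next(iter(KFold(n_splits=3, shuffle=True,
                                      random_state=0).split(X_demo)))
scores = _fit_and_score_grid(LogisticRegression(max_iter=200), X_demo, y_demo,
                             scorer, train_idx, test_idx, grid, fit_params={})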
def evaluate(self, scoring='accuracy'):
    from sklearn.model_selection._validation import _score
    sample_x = np.array(
        [tuple(x) for x in self._Xtest[self._ftnames].values])
    return _score(self._estimator, sample_x,
                  self._Xtest.rate.values.ravel().tolist(),
                  check_scoring(self._estimator, scoring=scoring))
def _multi_time_fit(random_state, baf, X, y, scorer):
    """Fit a clone of ``baf`` with the given random state and score it."""
    baf_i = clone(baf)
    baf_i.random_state = random_state
    baf_i.refit = True
    baf_i._fit(X, y)
    return (baf_i.support_,
            _score(baf_i.estimator_, baf_i.transform(X), y, scorer),
            baf_i.score_)
def evaluate(self, dataset, pipelines):
    if not self.is_valid(dataset):
        raise AssertionError("Dataset is not appropriate for evaluation")
    # This is a bit awkward, but we need to check whether at least one pipeline
    # has to be run before loading the data. If at least one pipeline needs to
    # be run, we have to load all the data.
    # We might need a better granularity, if we query the DB.
    run_pipes = {}
    for subject in dataset.subject_list:
        run_pipes.update(
            self.results.not_yet_computed(pipelines, dataset, subject))
    if len(run_pipes) != 0:

        # get the data
        X, y, metadata = self.paradigm.get_data(dataset)

        # encode labels
        le = LabelEncoder()
        y = le.fit_transform(y)

        # extract metadata
        groups = metadata.subject.values
        sessions = metadata.session.values

        scorer = get_scorer(self.paradigm.scoring)

        # perform leave one subject out CV
        cv = LeaveOneGroupOut()
        for train, test in cv.split(X, y, groups):

            subject = groups[test[0]]
            # now we can check if this subject has results
            run_pipes = self.results.not_yet_computed(pipelines, dataset,
                                                      subject)

            # iterate over pipelines
            for name, clf in run_pipes.items():
                t_start = time()
                model = deepcopy(clf).fit(X[train], y[train])
                duration = time() - t_start

                # we eval on each session
                for session in np.unique(sessions[test]):
                    ix = sessions[test] == session
                    score = _score(model, X[test[ix]], y[test[ix]], scorer)

                    res = {
                        "time": duration,
                        "dataset": dataset,
                        "subject": subject,
                        "session": session,
                        "score": score,
                        "n_samples": len(train),
                        "n_channels": X.shape[1],
                        "pipeline": name,
                    }

                    yield res
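# Minimal public-API sketch of the same leave-one-group-out scoring loop as the
# evaluate() generator above; the data, groups and classifier are illustrative only.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import LeaveOneGroupOut

rng = np.random.RandomState(42)
X_demo = rng.randn(60, 4)
y_demo = rng.randint(0, 2, 60)
groups_demo = np.repeat(np.arange(6), 10)  # six "subjects", ten samples each
scorer = get_scorer('accuracy')

for train_idx, test_idx in LeaveOneGroupOut().split(X_demo, y_demo, groups_demo):
    model = LogisticRegression().fit(X_demo[train_idx], y_demo[train_idx])
    print(groups_demo[test_idx][0],
          scorer(model, X_demo[test_idx], y_demo[test_idx]))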
def _baf_single_fit(train, test, baf, estimator, X, y, scorer, random_state):
    """Fit a clone of ``baf`` on one train split and score it on the test split."""
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    baf_i = clone(baf)
    baf_i.random_state = random_state
    baf_i._fit(X_train, y_train)
    return (baf_i.support_,
            _score(baf_i.estimator_, baf_i.transform(X_test), y_test, scorer),
            baf_i.score_)
def _rfa_single_fit(rfa, estimator, X, y, train, test, scorer):
    """Return the score for a fit across one fold."""
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    return rfa._fit(
        X_train, y_train,
        lambda estimator, features: _score(estimator, X_test[:, features],
                                           y_test, scorer)).scores_
def _score(self, X, y, scoring=None, clf=None):
    from sklearn.model_selection._validation import _score
    if scoring is None:
        scoring = self._scorer
    if clf is None:
        clf = self._estimator
    return _score(clf, X, y, check_scoring(clf, scoring=scoring))
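# The wrapper above leans on the private sklearn.model_selection._validation._score;
# a roughly equivalent sketch using only the public check_scoring API (the private
# call signature varies between scikit-learn versions, so this is the safer pattern):
from sklearn.metrics import check_scoring

def score_with_public_api(clf, X, y, scoring=None):
    scorer = check_scoring(clf, scoring=scoring)
    # A scorer is a callable with signature scorer(estimator, X, y).
    return scorer(clf, X, y)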
def _scoring(self, net, X_test, y_test):
    """Resolve scoring and apply it to data."""
    scorer = check_scoring(net, self.scoring)
    scores = _score(
        estimator=net,
        X_test=X_test,
        y_test=y_test,
        scorer=scorer,
        is_multimetric=False,
    )
    return scores
def _scoring(self, net, X_test, y_test):
    """Resolve scoring and apply it to data.

    Use cached prediction instead of running inference again, if available.
    """
    scorer = check_scoring(net, self.scoring)
    scores = _score(
        estimator=net,
        X_test=X_test,
        y_test=y_test,
        scorer=scorer,
        is_multimetric=False,
    )
    return scores
def score_explicit(self, clf, X_train, y_train, X_test, y_test):
    scorer = get_scorer(self.paradigm.scoring)
    t_start = time()
    try:
        model = clf.fit(X_train, y_train)
        score = _score(model, X_test, y_test, scorer)
    except ValueError as e:
        if self.error_score == "raise":
            raise e
        score = self.error_score
    duration = time() - t_start
    return score, duration
def _scoring(self, net, X_test, y_test):
    """Resolve scoring and apply it to data.

    Use cached prediction instead of running inference again, if available.
    """
    scorer = check_scoring(net, self.scoring_)
    scores = _score(
        estimator=net,
        X_test=X_test,
        y_test=y_test,
        scorer=scorer,
        is_multimetric=False,
    )
    return scores
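# The three _scoring variants above belong to skorch-style scoring callbacks. A
# hedged usage sketch of the public callback they support (parameter names follow
# common skorch usage and are assumptions, not verified against a specific release):
from skorch.callbacks import EpochScoring

valid_acc = EpochScoring(scoring='accuracy', lower_is_better=False, on_train=False)
# net = NeuralNetClassifier(MyModule, callbacks=[valid_acc]); net.fit(X, y)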
def _incremental_fit_estimator(estimator, X, y, classes, train, test,
                               train_sizes, scorer, verbose):
    """Train estimator on training subsets incrementally and compute scores."""
    train_scores, test_scores = [], []
    partitions = zip(train_sizes, np.split(train, train_sizes)[:-1])
    for n_train_samples, partial_train in partitions:
        train_subset = train[:n_train_samples]
        # NOTE: wrapper patch
        X_train, y_train = _patch_split(estimator, X, y, train_subset)
        X_partial_train, y_partial_train = _patch_split(estimator, X, y,
                                                        partial_train)
        X_test, y_test = _patch_split(estimator, X, y, test, train_subset)
        if y_partial_train is None:
            estimator.partial_fit(X_partial_train, classes=classes)
        else:
            estimator.partial_fit(X_partial_train, y_partial_train,
                                  classes=classes)
        train_scores.append(_score(estimator, X_train, y_train, scorer))
        test_scores.append(_score(estimator, X_test, y_test, scorer))
    return np.array((train_scores, test_scores)).T
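# Sketch of the incremental pattern _incremental_fit_estimator implements, using
# only public API (SGDClassifier supports partial_fit); the chunk boundaries and
# data below are illustrative assumptions.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import get_scorer

X_demo, y_demo = load_digits(return_X_y=True)
train_idx, test_idx = np.arange(1500), np.arange(1500, len(y_demo))
clf_inc = SGDClassifier(random_state=0)
scorer = get_scorer('accuracy')
classes = np.unique(y_demo)

boundaries = [0, 300, 600, 900, 1500]
for lo, hi in zip(boundaries[:-1], boundaries[1:]):
    # Feed only the new chunk to partial_fit, then score on the full held-out split.
    clf_inc.partial_fit(X_demo[train_idx[lo:hi]], y_demo[train_idx[lo:hi]],
                        classes=classes)
    print(hi, scorer(clf_inc, X_demo[test_idx], y_demo[test_idx]))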
def _wrap_score(y_pred, y_true, scorers, is_multimetric):
    start_time = time.time()
    results = {}
    # we use -1 to signify missing predictions because numpy has no integer NaN
    if np.any(y_pred < 0):
        if is_multimetric:
            for name in scorers:
                results[name] = np.nan
        else:
            results["score"] = np.nan
    else:
        estimator = _MockEstimator(y_pred)
        results = _score(estimator, None, y_true, scorers, is_multimetric)
    score_time = time.time() - start_time
    return results, score_time
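# _MockEstimator is referenced above but not shown. A minimal sketch of what such
# a wrapper could look like (hypothetical; just enough for scorers that only call
# predict on the estimator):
class _MockEstimator:
    def __init__(self, y_pred):
        self._y_pred = y_pred

    def predict(self, X):
        # Ignore X and return the cached predictions that were passed in.
        return self._y_pred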
def main(inputs, infile_estimator, outfile_eval, infile1=None, infile2=None):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter
    infile_estimator : str
        File path to trained estimator input
    outfile_eval : str
        File path to save the evaluation results, tabular
    infile1 : str
        File path to dataset containing features
    infile2 : str
        File path to dataset containing target values
    """
    warnings.filterwarnings('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    X_test, y_test = _get_X_y(params, infile1, infile2)

    # load model
    estimator = load_model_from_h5(infile_estimator)
    estimator = clean_params(estimator)

    # handle scorer, convert to scorer dict
    scoring = params['scoring']
    scorer = get_scoring(scoring)
    if not isinstance(scorer, (dict, list)):
        scorer = [scoring['primary_scoring']]
    scorer = _check_multimetric_scoring(estimator, scoring=scorer)

    if hasattr(estimator, 'evaluate'):
        scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer)
    else:
        scores = _score(estimator, X_test, y_test, scorer)

    # handle output
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_eval, sep='\t', header=True, index=False)
def score_explicit(self, clf, X_train, y_train, X_test, y_test):
    if not self.mne_labels:
        # convert labels if array; keep them if epochs and mne_labels is set
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)
    scorer = get_scorer(self.paradigm.scoring)
    t_start = time()
    try:
        model = clf.fit(X_train, y_train)
        score = _score(model, X_test, y_test, scorer)
    except ValueError as e:
        if self.error_score == "raise":
            raise e
        score = self.error_score
    duration = time() - t_start
    return score, duration
def test_score(self, X, y, cv, scoring):
    '''Return test scores of the estimator.'''
    # test scores
    data_splits = _split_cv(X, y=y, cv=cv, random_state=self.seed)
    # get_scorers = _validation._check_multimetric_scoring
    # scorer, _ = get_scorers(self.estimator, scoring=scoring)
    # is_multimetric = not callable(scorer)
    scorer = self._get_scorer(scoring)
    is_multimetric = not callable(scorer)
    scores = []
    for item in data_splits:
        x0 = item[0][1]
        y0 = item[1][1]
        scores.append(
            _validation._score(self.estimator, x0, y0, scorer,
                               is_multimetric))
    scores = pd.DataFrame(scores).reset_index(drop=True)
    return scores
def _score_on_validation_sets(
    self,
    estimator: BaseEstimator,
    datasets: Dict[str, Tuple[pd.DataFrame, pd.Series]],
) -> Dict[str, Dict[str, Any]]:
    results = defaultdict(dict)
    if not datasets:
        return results

    if callable(self.scoring):
        scorers = self.scoring
    elif self.scoring is None or isinstance(self.scoring, str):
        scorers = check_scoring(estimator, self.scoring)
    else:
        scorers = _check_multimetric_scoring(estimator, self.scoring)

    for key, X_y_tuple in datasets.items():
        X_test, y_test = X_y_tuple
        start_time = time()
        try:
            test_scores = _score(estimator, X_test, y_test, scorers)
        except Exception:
            if isinstance(scorers, dict):
                test_scores = {k: np.nan for k in scorers}
            else:
                test_scores = np.nan
            warnings.warn(
                f"Scoring on validation set {key} failed. The score(s) for "
                f"this set will be set to nan. Details: \n"
                f"{format_exc()}",
                UserWarning,
            )
        score_time = time() - start_time
        results[key]["score_time"] = score_time

        if not isinstance(test_scores, dict):
            test_scores = {"score": test_scores}
        for name in test_scores:
            results[key][f"test_{name}"] = test_scores[name]

    return results
def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
    """Return the score for a fit across one fold."""
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    oversampler = SMOTE(ratio='minority', random_state=None, k=None,
                        k_neighbors=5, m=None, m_neighbors=10, out_step=0.5,
                        kind='regular', svm_estimator=None, n_jobs=1)
    X_train, y_train = oversampler.fit_sample(X_train, y_train)
    return rfe._fit(
        X_train, y_train,
        lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer)).scores_
def nested_fit_and_score(estimator, X, y, scorer, train, test, verbose=1,
                         parameters=None, fit_params=None,
                         return_train_score=False, return_times=False,
                         error_score='raise'):
    """Fit the estimator on one CV split and score it, with nested logging."""
    from sklearn.externals.joblib.logger import short_format_time

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if verbose > 1:
        LOG.info(
            'CV iteration: Xtrain=%d, Ytrain=%d/%d -- Xtest=%d, Ytest=%d/%d.',
            len(X_train), len(X_train) - sum(y_train), sum(y_train),
            len(X_test), len(X_test) - sum(y_test), sum(y_test))

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            # keep the summary dict well-defined when the fit fails
            acc_score = error_score
            LOG.warning(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r", error_score, e)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = time.time() - start_time

        test_score = None
        score_time = 0.0
        if len(set(y_test)) > 1:
            test_score = _score(estimator, X_test, y_test, scorer)
            score_time = time.time() - start_time - fit_time
        else:
            LOG.warning(
                'Test set has no positive labels, scoring has been skipped '
                'in this loop.')

        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

        acc_score = _score(estimator, X_test, y_test,
                           check_scoring(estimator, scoring='accuracy'))

    if verbose > 0:
        total_time = score_time + fit_time
        if test_score is not None:
            LOG.info('Iteration took %s, score=%f, accuracy=%f.',
                     short_format_time(total_time), test_score, acc_score)
        else:
            LOG.info('Iteration took %s, score=None, accuracy=%f.',
                     short_format_time(total_time), acc_score)

    ret = {'test': {'score': test_score, 'accuracy': acc_score}}

    if return_train_score:
        ret['train'] = {'score': train_score}

    if return_times:
        ret['times'] = [fit_time, score_time]

    return ret, estimator
def _model_fit_and_score(estimator_str, X, y, scorer, train, test, verbose,
                         parameters, fit_params, return_train_score=False,
                         return_parameters=False, return_n_test_samples=False,
                         return_times=False, error_score='raise'):
    """Build the named estimator, fit it on one CV split and score it."""
    if verbose > 1:
        msg = '[CV model=%s]' % estimator_str.upper()
        if parameters is not None:
            msg += ' %s' % (', '.join('%s=%s' % (k, v)
                                      for k, v in parameters.items()))
        LOG.info("%s %s", msg, (89 - len(msg)) * '.')

    estimator = _clf_build(estimator_str)

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = time.time() - start_time

        scorer = check_scoring(estimator, scoring=scorer)

        test_score = _score(estimator, X_test, y_test, scorer)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg,
                                    logger.short_format_time(total_time))
        LOG.info(end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append((estimator_str, parameters))
    return ret
def main(inputs, infile_estimator, infile1, infile2, outfile_result, outfile_object=None, outfile_weights=None, groups=None, ref_seq=None, intervals=None, targets=None, fasta_path=None): """ Parameter --------- inputs : str File path to galaxy tool parameter infile_estimator : str File path to estimator infile1 : str File path to dataset containing features infile2 : str File path to dataset containing target values outfile_result : str File path to save the results, either cv_results or test result outfile_object : str, optional File path to save searchCV object outfile_weights : str, optional File path to save deep learning model weights groups : str File path to dataset containing groups labels ref_seq : str File path to dataset containing genome sequence file intervals : str File path to dataset containing interval file targets : str File path to dataset compressed target bed file fasta_path : str File path to dataset containing fasta file """ warnings.simplefilter('ignore') with open(inputs, 'r') as param_handler: params = json.load(param_handler) # load estimator with open(infile_estimator, 'rb') as estimator_handler: estimator = load_model(estimator_handler) # swap hyperparameter swapping = params['experiment_schemes']['hyperparams_swapping'] swap_params = _eval_swap_params(swapping) estimator.set_params(**swap_params) estimator_params = estimator.get_params() # store read dataframe object loaded_df = {} input_type = params['input_options']['selected_input'] # tabular input if input_type == 'tabular': header = 'infer' if params['input_options']['header1'] else None column_option = (params['input_options']['column_selector_options_1'] ['selected_column_selector_option']) if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: c = params['input_options']['column_selector_options_1']['col1'] else: c = None df_key = infile1 + repr(header) df = pd.read_csv(infile1, sep='\t', header=header, parse_dates=True) loaded_df[df_key] = df X = read_columns(df, c=c, c_option=column_option).astype(float) # sparse input elif input_type == 'sparse': X = mmread(open(infile1, 'r')) # fasta_file input elif input_type == 'seq_fasta': pyfaidx = get_module('pyfaidx') sequences = pyfaidx.Fasta(fasta_path) n_seqs = len(sequences.keys()) X = np.arange(n_seqs)[:, np.newaxis] for param in estimator_params.keys(): if param.endswith('fasta_path'): estimator.set_params( **{param: fasta_path}) break else: raise ValueError( "The selected estimator doesn't support " "fasta file input! 
Please consider using " "KerasGBatchClassifier with " "FastaDNABatchGenerator/FastaProteinBatchGenerator " "or having GenomeOneHotEncoder/ProteinOneHotEncoder " "in pipeline!") elif input_type == 'refseq_and_interval': path_params = { 'data_batch_generator__ref_genome_path': ref_seq, 'data_batch_generator__intervals_path': intervals, 'data_batch_generator__target_path': targets } estimator.set_params(**path_params) n_intervals = sum(1 for line in open(intervals)) X = np.arange(n_intervals)[:, np.newaxis] # Get target y header = 'infer' if params['input_options']['header2'] else None column_option = (params['input_options']['column_selector_options_2'] ['selected_column_selector_option2']) if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: c = params['input_options']['column_selector_options_2']['col2'] else: c = None df_key = infile2 + repr(header) if df_key in loaded_df: infile2 = loaded_df[df_key] else: infile2 = pd.read_csv(infile2, sep='\t', header=header, parse_dates=True) loaded_df[df_key] = infile2 y = read_columns( infile2, c=c, c_option=column_option, sep='\t', header=header, parse_dates=True) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() if input_type == 'refseq_and_interval': estimator.set_params( data_batch_generator__features=y.ravel().tolist()) y = None # end y # load groups if groups: groups_selector = (params['experiment_schemes']['test_split'] ['split_algos']).pop('groups_selector') header = 'infer' if groups_selector['header_g'] else None column_option = \ (groups_selector['column_selector_options_g'] ['selected_column_selector_option_g']) if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: c = groups_selector['column_selector_options_g']['col_g'] else: c = None df_key = groups + repr(header) if df_key in loaded_df: groups = loaded_df[df_key] groups = read_columns( groups, c=c, c_option=column_option, sep='\t', header=header, parse_dates=True) groups = groups.ravel() # del loaded_df del loaded_df # handle memory memory = joblib.Memory(location=CACHE_DIR, verbose=0) # cache iraps_core fits could increase search speed significantly if estimator.__class__.__name__ == 'IRAPSClassifier': estimator.set_params(memory=memory) else: # For iraps buried in pipeline new_params = {} for p, v in estimator_params.items(): if p.endswith('memory'): # for case of `__irapsclassifier__memory` if len(p) > 8 and p[:-8].endswith('irapsclassifier'): # cache iraps_core fits could increase search # speed significantly new_params[p] = memory # security reason, we don't want memory being # modified unexpectedly elif v: new_params[p] = None # handle n_jobs elif p.endswith('n_jobs'): # For now, 1 CPU is suggested for iprasclassifier if len(p) > 8 and p[:-8].endswith('irapsclassifier'): new_params[p] = 1 else: new_params[p] = N_JOBS # for security reason, types of callback are limited elif p.endswith('callbacks'): for cb in v: cb_type = cb['callback_selection']['callback_type'] if cb_type not in ALLOWED_CALLBACKS: raise ValueError( "Prohibited callback type: %s!" 
% cb_type) estimator.set_params(**new_params) # handle scorer, convert to scorer dict scoring = params['experiment_schemes']['metrics']['scoring'] scorer = get_scoring(scoring) scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer) # handle test (first) split test_split_options = (params['experiment_schemes'] ['test_split']['split_algos']) if test_split_options['shuffle'] == 'group': test_split_options['labels'] = groups if test_split_options['shuffle'] == 'stratified': if y is not None: test_split_options['labels'] = y else: raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") X_train, X_test, y_train, y_test, groups_train, groups_test = \ train_test_split_none(X, y, groups, **test_split_options) exp_scheme = params['experiment_schemes']['selected_exp_scheme'] # handle validation (second) split if exp_scheme == 'train_val_test': val_split_options = (params['experiment_schemes'] ['val_split']['split_algos']) if val_split_options['shuffle'] == 'group': val_split_options['labels'] = groups_train if val_split_options['shuffle'] == 'stratified': if y_train is not None: val_split_options['labels'] = y_train else: raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") X_train, X_val, y_train, y_val, groups_train, groups_val = \ train_test_split_none(X_train, y_train, groups_train, **val_split_options) # train and eval if hasattr(estimator, 'validation_data'): if exp_scheme == 'train_val_test': estimator.fit(X_train, y_train, validation_data=(X_val, y_val)) else: estimator.fit(X_train, y_train, validation_data=(X_test, y_test)) else: estimator.fit(X_train, y_train) if hasattr(estimator, 'evaluate'): scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True) else: scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True) # handle output for name, score in scores.items(): scores[name] = [score] df = pd.DataFrame(scores) df = df[sorted(df.columns)] df.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False) memory.clear(warn=False) if outfile_object: main_est = estimator if isinstance(estimator, pipeline.Pipeline): main_est = estimator.steps[-1][-1] if hasattr(main_est, 'model_') \ and hasattr(main_est, 'save_weights'): if outfile_weights: main_est.save_weights(outfile_weights) del main_est.model_ del main_est.fit_params del main_est.model_class_ del main_est.validation_data if getattr(main_est, 'data_generator_', None): del main_est.data_generator_ with open(outfile_object, 'wb') as output_handler: pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, split_progress=None, candidate_progress=None, error_score=np.nan, online_train_val_split=False): """Fit estimator and compute scores for a given dataset split. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape (n_samples, n_features) The data to fit. y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None The target variable to try to predict in the case of supervised learning. scorer : A single callable or dict mapping scorer name to the callable If it is a single callable, the return value for ``train_scores`` and ``test_scores`` is a single float. For a dict, it should be one mapping the scorer name to the scorer callable object / function. The callable object / fn should have signature ``scorer(estimator, X, y)``. train : array-like of shape (n_train_samples,) Indices of training samples. test : array-like of shape (n_test_samples,) Indices of test samples. verbose : int The verbosity level. error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : bool, default=False Compute and return score on training set. return_parameters : bool, default=False Return parameters that has been used for the estimator. split_progress : {list, tuple} of int, default=None A list or tuple of format (<current_split_id>, <total_num_of_splits>). candidate_progress : {list, tuple} of int, default=None A list or tuple of format (<current_candidate_id>, <total_number_of_candidates>). return_n_test_samples : bool, default=False Whether to return the ``n_test_samples``. return_times : bool, default=False Whether to return the fit/score times. return_estimator : bool, default=False Whether to return the fitted estimator. Returns ------- result : dict with the following attributes train_scores : dict of scorer name -> float Score on training set (for all the scorers), returned only if `return_train_score` is `True`. test_scores : dict of scorer name -> float Score on testing set (for all the scorers). n_test_samples : int Number of test samples. fit_time : float Time spent for fitting in seconds. score_time : float Time spent for scoring in seconds. parameters : dict or None The parameters that have been evaluated. estimator : estimator object The fitted estimator. fit_failed : bool The estimator failed to fit. """ if not isinstance(error_score, numbers.Number) and error_score != 'raise': raise ValueError( "error_score must be the string 'raise' or a numeric value. 
" "(Hint: if using 'raise', please make sure that it has been " "spelled correctly.)") progress_msg = "" if verbose > 2: if split_progress is not None: progress_msg = f" {split_progress[0]+1}/{split_progress[1]}" if candidate_progress and verbose > 9: progress_msg += (f"; {candidate_progress[0]+1}/" f"{candidate_progress[1]}") if verbose > 1: if parameters is None: params_msg = '' else: sorted_keys = sorted(parameters) # Ensure deterministic o/p params_msg = (', '.join(f'{k}={parameters[k]}' for k in sorted_keys)) if verbose > 9: start_msg = f"[CV{progress_msg}] START {params_msg}" print(f"{start_msg}{(80 - len(start_msg)) * '.'}") # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = _check_fit_params(X, fit_params, train) if parameters is not None: # clone after setting parameters in case any parameters # are estimators (like pipeline steps) # because pipeline doesn't clone steps in fit cloned_parameters = {} for k, v in parameters.items(): cloned_parameters[k] = clone(v, safe=False) estimator = estimator.set_params(**cloned_parameters) start_time = time.time() if online_train_val_split: # inject the train and test data into the corresponding Subset selectors set_train_estim = False set_test_estim = False for estim in estimator: if set_train_estim and set_test_estim: break if isinstance(estim, TrainSubset): estim.date_range = [train] set_train_estim = True if isinstance(estim, TestSubset): estim.date_range = [test] set_test_estim = True if isinstance(estim, CVSubset) and isinstance( estim, EvalOnlyWrapper): estim.set_range(test) set_test_estim = True if isinstance(estim, CVSubset) and isinstance( estim, TrainOnlyWrapper): estim.set_range(train) set_train_estim = True if not set_train_estim or not set_test_estim: raise ValueError( "when specifying online learning a KeepTrain and KeepTest have to be in the pipeline" ) else: X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) result = {} try: if online_train_val_split: estimator = estimator.train() estimator.fit(X, y, **fit_params) else: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: # Note fit time as time until error fit_time = time.time() - start_time score_time = 0.0 if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): if isinstance(scorer, dict): test_scores = {name: error_score for name in scorer} if return_train_score: train_scores = test_scores.copy() else: test_scores = error_score if return_train_score: train_scores = error_score warnings.warn( "Estimator fit failed. The score on this train-test" " partition for these parameters will be set to %f. 
" "Details: \n%s" % (error_score, format_exc()), FitFailedWarning) result["fit_failed"] = True y_sample_len = len(test) else: result["fit_failed"] = False fit_time = time.time() - start_time estimator.eval() if online_train_val_split: # select estimator without the classifier and transform x and y # to retrieve y_test _, y_prime = estimator[:-1].transform(X, y) if isinstance(y_prime, pd.DataFrame) and len(y_prime.columns) == 1: y_prime = y_prime.T.values.squeeze() y_sample_len = len(y_prime) test_scores = _score(estimator, X, y_prime, scorer) else: test_scores = _score(estimator, X_test, y_test, scorer, error_score) score_time = time.time() - start_time - fit_time if return_train_score: if online_train_val_split: estimator.train() _, y_prime = estimator[:-1].transform(X, y) if isinstance(y_prime, pd.DataFrame) and len( y_prime.columns) == 1: y_prime = y_prime.T.values.squeeze() train_scores = _score(estimator, X, y_prime, scorer) estimator.eval() else: train_scores = _score(estimator, X_train, y_train, scorer, error_score) if verbose > 1: total_time = score_time + fit_time end_msg = f"[CV{progress_msg}] END " result_msg = params_msg + (";" if params_msg else "") if verbose > 2 and isinstance(test_scores, dict): for scorer_name in sorted(test_scores): result_msg += f" {scorer_name}: (" if return_train_score: scorer_scores = train_scores[scorer_name] result_msg += f"train={scorer_scores:.3f}, " result_msg += f"test={test_scores[scorer_name]:.3f})" result_msg += f" total time={logger.short_format_time(total_time)}" # Right align the result_msg end_msg += "." * (80 - len(end_msg) - len(result_msg)) end_msg += result_msg print(end_msg) result["test_scores"] = test_scores if return_train_score: result["train_scores"] = train_scores if return_n_test_samples: if online_train_val_split: result["n_test_samples"] = y_sample_len else: result["n_test_samples"] = _num_samples(X_test) if return_times: result["fit_time"] = fit_time result["score_time"] = score_time if return_parameters: result["parameters"] = parameters if return_estimator: result["estimator"] = estimator return result
def fit_and_score_te_oracle(estimator, X, y, w, p, t, scorer, train, test, parameters=None, fit_params=None, return_train_score=False, return_parameters=False, return_times=False, return_estimator=False, error_score=np.nan, return_test_score_only=False): """Fit estimator and compute scores for a given dataset split, using oracle knowledge of treatment effects. Based on sklearn.model_selection._validation _fit_and_score, adapted to allow more inputs (treatments and treatment effects) Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape (n_samples, n_features) The features to fit to y : array-like of shape (n_samples,) or (n_samples, ) The outcome variable w: array-like of shape (n_samples,) The treatment indicator p: array-like of shape (n_samples,) The treatment propensity t: array-like of shape (n_samples,) the true treatment effect to evaluate against scorer : A single callable or dict mapping scorer name to the callable If it is a single callable, the return value for ``train_scores`` and ``test_scores`` is a single float. For a dict, it should be one mapping the scorer name to the scorer callable object / function. The callable object / fn should have signature ``scorer(estimator, X, y)``. train : array-like of shape (n_train_samples,) Indices of training samples. test : array-like of shape (n_test_samples,) Indices of test samples. error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : bool, default=False Compute and return score on training set. return_parameters : bool, default=False Return parameters that has been used for the estimator. return_times : bool, default=False Whether to return the fit/score times. return_estimator : bool, default=False Whether to return the fitted estimator. return_test_score_only: bool, default=False Whether to only return a test score Returns ------- train_scores : dict of scorer name -> float Score on training set (for all the scorers), returned only if `return_train_score` is `True`. test_scores : float or dict of scorer name -> float If return_test_score_only and scorer == str, then returns only test score. Otherwise, s on testing set (for all the scorers) n_test_samples : int Number of test samples. fit_time : float Time spent for fitting in seconds. score_time : float Time spent for scoring in seconds. parameters : dict or None The parameters that have been evaluated. 
estimator : estimator object The fitted estimator """ if not isinstance(estimator, BaseTEModel): raise ValueError("This method works only for BaseTEModel") scorers, _ = _check_multimetric_scoring(estimator, scoring=scorer) # Adjust length of sample weights (if ant) fit_params = fit_params if fit_params is not None else {} fit_params = _check_fit_params(X, fit_params, train) train_scores = {} if parameters is not None: # clone after setting parameters in case any parameters # are estimators (like pipeline steps) # because pipeline doesn't clone steps in fit cloned_parameters = {} for k, v in parameters.items(): cloned_parameters[k] = clone(v, safe=False) estimator = estimator.set_params(**cloned_parameters) start_time = time.time() X_train, y_train, w_train, p_train, t_train = _safe_split_te( X, y, w, p, t, train) X_test, y_test, w_test, p_test, t_test = _safe_split_te( X, y, w, p, t, test) try: estimator.fit(X_train, y_train, w_train, p_train, **fit_params) except Exception as e: if return_test_score_only: if error_score == 'raise': raise else: return np.nan # Note fit time as time until error fit_time = time.time() - start_time score_time = 0.0 if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): if isinstance(scorer, dict): test_scores = {name: error_score for name in scorer} if return_train_score: train_scores = test_scores.copy() else: test_scores = error_score if return_train_score: train_scores = error_score warnings.warn( "Estimator fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%s" % (error_score, format_exc()), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") else: fit_time = time.time() - start_time try: test_scores = _score(estimator, X_test, t_test, scorers) except Exception: if return_test_score_only: if error_score == 'raise': raise else: return np.nan score_time = time.time() - start_time - fit_time if return_test_score_only: if type(scorer) == str: return test_scores['score'] else: return test_scores if return_train_score: train_scores = _score(estimator, X_train, t_train, scorers) ret = [train_scores, test_scores] if return_train_score else [test_scores] if return_times: ret.extend([fit_time, score_time]) if return_parameters: ret.append(parameters) if return_estimator: ret.append(estimator) return ret
def _fit_and_score_keras2(method, X, y, scorer, train, test, verbose, parameters, fit_params, type="Classification", return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, error_score='raise'): """Fit estimator and compute scores for a given dataset split for KerasClassifier and KerasRegressor. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape at least 2D The data to fit. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. scorer : A single callable or dict mapping scorer name to the callable If it is a single callable, the return value for ``train_scores`` and ``test_scores`` is a single float. For a dict, it should be one mapping the scorer name to the scorer callable object / function. The callable object / fn should have signature ``scorer(estimator, X, y)``. train : array-like, shape (n_train_samples,) Indices of training samples. test : array-like, shape (n_test_samples,) Indices of test samples. verbose : integer The verbosity level. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : boolean, optional, default: False Compute and return score on training set. return_parameters : boolean, optional, default: False Return parameters that has been used for the estimator. return_n_test_samples : boolean, optional, default: False Whether to return the ``n_test_samples`` return_times : boolean, optional, default: False Whether to return the fit/score times. session : Keras backend with a tensorflow session attached The keras backend session for applying K.clear_session() after the classifier or regressor has been train and scored given the split. This is mainly required to avoid posible Out Of Memory errors with tensorflow not deallocating the GPU memory after each iteration of the Cross Validation. Returns ------- train_scores : dict of scorer name -> float, optional Score on training set (for all the scorers), returned only if `return_train_score` is `True`. test_scores : dict of scorer name -> float, optional Score on testing set (for all the scorers). n_test_samples : int Number of test samples. fit_time : float Time spent for fitting in seconds. score_time : float Time spent for scoring in seconds. parameters : dict or None, optional The parameters that have been evaluated. 
""" from keras import backend as K import tensorflow as tf tf.logging.set_verbosity( tf.logging.ERROR) # This is useful to avoid the info log of tensorflow # The next 4 lines are for avoiding tensorflow to allocate all the GPU memory config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) K.set_session(sess) if verbose > 1: if parameters is None: msg = '' else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) test_scores = {} train_scores = {} estimator = None if type == "Classification": from keras.wrappers.scikit_learn import KerasClassifier estimator = KerasClassifier(build_fn=method, verbose=0) else: from keras.wrappers.scikit_learn import KerasRegressor estimator = KerasRegressor(build_fn=method, verbose=0) if parameters is not None: estimator.set_params(**parameters) start_time = time.time() X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) is_multimetric = not callable(scorer) n_scorers = len(scorer.keys()) if is_multimetric else 1 try: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: # Note fit time as time until error fit_time = time.time() - start_time score_time = 0.0 if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): if is_multimetric: test_scores = dict( zip(scorer.keys(), [ error_score, ] * n_scorers)) if return_train_score: train_scores = dict( zip(scorer.keys(), [ error_score, ] * n_scorers)) else: test_scores = error_score if return_train_score: train_scores = error_score warnings.warn( "Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") else: fit_time = time.time() - start_time # _score will return dict if is_multimetric is True test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric) score_time = time.time() - start_time - fit_time if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer, is_multimetric) if verbose > 2: if is_multimetric: for scorer_name, score in test_scores.items(): msg += ", %s=%s" % (scorer_name, score) else: msg += ", score=%s" % test_scores if verbose > 1: total_time = score_time + fit_time end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_scores, test_scores] if return_train_score else [test_scores] if return_n_test_samples: ret.append(_num_samples(X_test)) if return_times: ret.extend([fit_time, score_time]) if return_parameters: ret.append(parameters) # The estimator is erased del estimator # We assign the keras backend # Clean the session K.clear_session() # The garbage collector is called in order to ensure that the estimator is erased from memory for i in range(15): gc.collect() return ret
def main(inputs, infile_estimator, infile1, infile2, outfile_result, outfile_object=None, outfile_weights=None, outfile_y_true=None, outfile_y_preds=None, groups=None, ref_seq=None, intervals=None, targets=None, fasta_path=None): """ Parameter --------- inputs : str File path to galaxy tool parameter infile_estimator : str File path to estimator infile1 : str File path to dataset containing features infile2 : str File path to dataset containing target values outfile_result : str File path to save the results, either cv_results or test result outfile_object : str, optional File path to save searchCV object outfile_weights : str, optional File path to save deep learning model weights outfile_y_true : str, optional File path to target values for prediction outfile_y_preds : str, optional File path to save deep learning model weights groups : str File path to dataset containing groups labels ref_seq : str File path to dataset containing genome sequence file intervals : str File path to dataset containing interval file targets : str File path to dataset compressed target bed file fasta_path : str File path to dataset containing fasta file """ warnings.simplefilter('ignore') with open(inputs, 'r') as param_handler: params = json.load(param_handler) # load estimator with open(infile_estimator, 'rb') as estimator_handler: estimator = load_model(estimator_handler) estimator = clean_params(estimator) # swap hyperparameter swapping = params['experiment_schemes']['hyperparams_swapping'] swap_params = _eval_swap_params(swapping) estimator.set_params(**swap_params) estimator_params = estimator.get_params() # store read dataframe object loaded_df = {} input_type = params['input_options']['selected_input'] # tabular input if input_type == 'tabular': header = 'infer' if params['input_options']['header1'] else None column_option = (params['input_options']['column_selector_options_1'] ['selected_column_selector_option']) if column_option in [ 'by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name' ]: c = params['input_options']['column_selector_options_1']['col1'] else: c = None df_key = infile1 + repr(header) df = pd.read_csv(infile1, sep='\t', header=header, parse_dates=True) loaded_df[df_key] = df X = read_columns(df, c=c, c_option=column_option).astype(float) # sparse input elif input_type == 'sparse': X = mmread(open(infile1, 'r')) # fasta_file input elif input_type == 'seq_fasta': pyfaidx = get_module('pyfaidx') sequences = pyfaidx.Fasta(fasta_path) n_seqs = len(sequences.keys()) X = np.arange(n_seqs)[:, np.newaxis] for param in estimator_params.keys(): if param.endswith('fasta_path'): estimator.set_params(**{param: fasta_path}) break else: raise ValueError( "The selected estimator doesn't support " "fasta file input! 
Please consider using " "KerasGBatchClassifier with " "FastaDNABatchGenerator/FastaProteinBatchGenerator " "or having GenomeOneHotEncoder/ProteinOneHotEncoder " "in pipeline!") elif input_type == 'refseq_and_interval': path_params = { 'data_batch_generator__ref_genome_path': ref_seq, 'data_batch_generator__intervals_path': intervals, 'data_batch_generator__target_path': targets } estimator.set_params(**path_params) n_intervals = sum(1 for line in open(intervals)) X = np.arange(n_intervals)[:, np.newaxis] # Get target y header = 'infer' if params['input_options']['header2'] else None column_option = (params['input_options']['column_selector_options_2'] ['selected_column_selector_option2']) if column_option in [ 'by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name' ]: c = params['input_options']['column_selector_options_2']['col2'] else: c = None df_key = infile2 + repr(header) if df_key in loaded_df: infile2 = loaded_df[df_key] else: infile2 = pd.read_csv(infile2, sep='\t', header=header, parse_dates=True) loaded_df[df_key] = infile2 y = read_columns(infile2, c=c, c_option=column_option, sep='\t', header=header, parse_dates=True) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() if input_type == 'refseq_and_interval': estimator.set_params(data_batch_generator__features=y.ravel().tolist()) y = None # end y # load groups if groups: groups_selector = (params['experiment_schemes']['test_split'] ['split_algos']).pop('groups_selector') header = 'infer' if groups_selector['header_g'] else None column_option = \ (groups_selector['column_selector_options_g'] ['selected_column_selector_option_g']) if column_option in [ 'by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name' ]: c = groups_selector['column_selector_options_g']['col_g'] else: c = None df_key = groups + repr(header) if df_key in loaded_df: groups = loaded_df[df_key] groups = read_columns(groups, c=c, c_option=column_option, sep='\t', header=header, parse_dates=True) groups = groups.ravel() # del loaded_df del loaded_df # cache iraps_core fits could increase search speed significantly memory = joblib.Memory(location=CACHE_DIR, verbose=0) main_est = get_main_estimator(estimator) if main_est.__class__.__name__ == 'IRAPSClassifier': main_est.set_params(memory=memory) # handle scorer, convert to scorer dict scoring = params['experiment_schemes']['metrics']['scoring'] scorer = get_scoring(scoring) scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer) # handle test (first) split test_split_options = ( params['experiment_schemes']['test_split']['split_algos']) if test_split_options['shuffle'] == 'group': test_split_options['labels'] = groups if test_split_options['shuffle'] == 'stratified': if y is not None: test_split_options['labels'] = y else: raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") X_train, X_test, y_train, y_test, groups_train, groups_test = \ train_test_split_none(X, y, groups, **test_split_options) exp_scheme = params['experiment_schemes']['selected_exp_scheme'] # handle validation (second) split if exp_scheme == 'train_val_test': val_split_options = ( params['experiment_schemes']['val_split']['split_algos']) if val_split_options['shuffle'] == 'group': val_split_options['labels'] = groups_train if val_split_options['shuffle'] == 'stratified': if y_train is not None: val_split_options['labels'] = y_train else: raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") 
X_train, X_val, y_train, y_val, groups_train, groups_val = \ train_test_split_none(X_train, y_train, groups_train, **val_split_options) # train and eval if hasattr(estimator, 'config') and hasattr(estimator, 'model_type'): if exp_scheme == 'train_val_test': estimator.fit(X_train, y_train, validation_data=(X_val, y_val)) else: estimator.fit(X_train, y_train, validation_data=(X_test, y_test)) else: estimator.fit(X_train, y_train) if isinstance(estimator, KerasGBatchClassifier): scores = {} steps = estimator.prediction_steps batch_size = estimator.batch_size data_generator = estimator.data_generator_ scores, predictions, y_true = _evaluate_keras_and_sklearn_scores( estimator, data_generator, X_test, y=y_test, sk_scoring=scorer, steps=steps, batch_size=batch_size, return_predictions=bool(outfile_y_true)) else: scores = {} if hasattr(estimator, 'model_') \ and hasattr(estimator.model_, 'metrics_names'): batch_size = estimator.batch_size score_results = estimator.model_.evaluate(X_test, y=y_test, batch_size=batch_size, verbose=0) metrics_names = estimator.model_.metrics_names if not isinstance(metrics_names, list): scores[metrics_names] = score_results else: scores = dict(zip(metrics_names, score_results)) if hasattr(estimator, 'predict_proba'): predictions = estimator.predict_proba(X_test) else: predictions = estimator.predict(X_test) y_true = y_test sk_scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True) scores.update(sk_scores) # handle output if outfile_y_true: try: pd.DataFrame(y_true).to_csv(outfile_y_true, sep='\t', index=False) pd.DataFrame(predictions).astype(np.float32).to_csv( outfile_y_preds, sep='\t', index=False, float_format='%g', chunksize=10000) except Exception as e: print("Error in saving predictions: %s" % e) # handle output for name, score in scores.items(): scores[name] = [score] df = pd.DataFrame(scores) df = df[sorted(df.columns)] df.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False) memory.clear(warn=False) if outfile_object: main_est = estimator if isinstance(estimator, Pipeline): main_est = estimator.steps[-1][-1] if hasattr(main_est, 'model_') \ and hasattr(main_est, 'save_weights'): if outfile_weights: main_est.save_weights(outfile_weights) del main_est.model_ del main_est.fit_params del main_est.model_class_ main_est.callbacks = [] if getattr(main_est, 'data_generator_', None): del main_est.data_generator_ with open(outfile_object, 'wb') as output_handler: pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
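# Illustrative command-line wrapper for the main() above.  This is a sketch
# only: every flag name below is an assumption (the real Galaxy tool wrapper
# may use different option names), and in the actual tool this would live
# under an `if __name__ == '__main__':` guard.  ref_seq/intervals/targets/
# fasta_path flags are omitted for brevity.
def _example_cli():
    import argparse

    aparser = argparse.ArgumentParser()
    aparser.add_argument('--inputs', dest='inputs', required=True)
    aparser.add_argument('--estimator', dest='infile_estimator', required=True)
    aparser.add_argument('--infile1', dest='infile1', required=True)
    aparser.add_argument('--infile2', dest='infile2', required=True)
    aparser.add_argument('--outfile_result', dest='outfile_result', required=True)
    aparser.add_argument('--outfile_object', dest='outfile_object', default=None)
    aparser.add_argument('--outfile_weights', dest='outfile_weights', default=None)
    aparser.add_argument('--outfile_y_true', dest='outfile_y_true', default=None)
    aparser.add_argument('--outfile_y_preds', dest='outfile_y_preds', default=None)
    aparser.add_argument('--groups', dest='groups', default=None)
    args = aparser.parse_args()

    main(args.inputs, args.infile_estimator, args.infile1, args.infile2,
         args.outfile_result, outfile_object=args.outfile_object,
         outfile_weights=args.outfile_weights,
         outfile_y_true=args.outfile_y_true,
         outfile_y_preds=args.outfile_y_preds, groups=args.groups)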
def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer): X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) return rfe._fit( X_train, y_train, lambda estimator, features: _score( estimator, X_test[:, features], y_test, scorer)).scores_
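# Sketch of how _rfe_single_fit is typically driven (mirroring what RFECV does
# internally): one call per CV fold, then the per-fold score curves are
# averaged to decide how many features to keep.  It relies on the private
# RFE._fit API, which has changed across scikit-learn releases, so treat this
# as an illustration under that assumption rather than a stable recipe.
def _example_rfe_cv():
    import numpy as np
    from sklearn.base import clone
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import check_scoring
    from sklearn.model_selection import KFold

    X, y = load_iris(return_X_y=True)
    base_est = LogisticRegression(max_iter=1000)
    rfe = RFE(base_est, n_features_to_select=1, step=1)
    acc_scorer = check_scoring(base_est, scoring='accuracy')

    fold_scores = [
        _rfe_single_fit(clone(rfe), clone(base_est), X, y, train, test,
                        acc_scorer)
        for train, test in KFold(n_splits=3, shuffle=True,
                                 random_state=0).split(X, y)]
    # one entry per elimination step, averaged over folds
    return np.mean(fold_scores, axis=0)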
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, error_score='raise', return_estimator=False, return_idx=False): """Fit estimator and compute scores for a given dataset split. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape at least 2D The data to fit. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. scorer : A single callable or dict mapping scorer name to the callable If it is a single callable, the return value for ``train_scores`` and ``test_scores`` is a single float. For a dict, it should be one mapping the scorer name to the scorer callable object / function. The callable object / fn should have signature ``scorer(estimator, X, y)``. train : array-like, shape (n_train_samples,) Indices of training samples. test : array-like, shape (n_test_samples,) Indices of test samples. verbose : integer The verbosity level. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : boolean, optional, default: False Compute and return score on training set. return_parameters : boolean, optional, default: False Return parameters that has been used for the estimator. return_n_test_samples : boolean, optional, default: False Whether to return the ``n_test_samples`` return_times : boolean, optional, default: False Whether to return the fit/score times. Returns ------- train_scores : dict of scorer name -> float, optional Score on training set (for all the scorers), returned only if `return_train_score` is `True`. test_scores : dict of scorer name -> float, optional Score on testing set (for all the scorers). n_test_samples : int Number of test samples. fit_time : float Time spent for fitting in seconds. score_time : float Time spent for scoring in seconds. parameters : dict or None, optional The parameters that have been evaluated. 
""" if verbose > 1: if parameters is None: msg = '' else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) test_scores = {} train_scores = {} if parameters is not None: estimator.set_params(**parameters) start_time = time.time() # do it for each patient X_train, y_train, X_test, y_test = _safe_split_multi( estimator, X, y, train, test) is_multimetric = not callable(scorer) n_scorers = len(scorer.keys()) if is_multimetric else 1 try: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: # Note fit time as time until error fit_time = time.time() - start_time score_time = 0.0 if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): if is_multimetric: test_scores = dict(zip(scorer.keys(), [error_score, ] * n_scorers)) if return_train_score: train_scores = dict(zip(scorer.keys(), [error_score, ] * n_scorers)) else: test_scores = error_score if return_train_score: train_scores = error_score warnings.warn("Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") else: fit_time = time.time() - start_time # _score will return dict if is_multimetric is True test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric) score_time = time.time() - start_time - fit_time if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer, is_multimetric) if verbose > 2: if is_multimetric: for scorer_name, score in test_scores.items(): msg += ", %s=%s" % (scorer_name, score) else: msg += ", score=%s" % test_scores if verbose > 1: total_time = score_time + fit_time end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_scores, test_scores] if return_train_score else [test_scores] if return_n_test_samples: ret.append(_num_samples(X_test)) if return_times: ret.extend([fit_time, score_time]) if return_parameters: ret.append(parameters) if return_estimator: ret.append(estimator) if return_idx: ret.extend([train, test]) return ret
def _do_train_test_split_val(searcher, X, y, params, error_score='raise', primary_scoring=None, groups=None, outfile=None): """ do train test split, searchCV validates on the train and then use the best_estimator_ to evaluate on the test Returns -------- Fitted SearchCV object """ train_test_split = try_get_attr('galaxy_ml.model_validations', 'train_test_split') split_options = params['outer_split'] # splits if split_options['shuffle'] == 'stratified': split_options['labels'] = y X, X_test, y, y_test = train_test_split(X, y, **split_options) elif split_options['shuffle'] == 'group': if groups is None: raise ValueError("No group based CV option was choosen for " "group shuffle!") split_options['labels'] = groups if y is None: X, X_test, groups, _ =\ train_test_split(X, groups, **split_options) else: X, X_test, y, y_test, groups, _ =\ train_test_split(X, y, groups, **split_options) else: if split_options['shuffle'] == 'None': split_options['shuffle'] = None X, X_test, y, y_test =\ train_test_split(X, y, **split_options) if error_score == 'raise': searcher.fit(X, y, groups=groups) else: warnings.simplefilter('always', FitFailedWarning) with warnings.catch_warnings(record=True) as w: try: searcher.fit(X, y, groups=groups) except ValueError: pass for warning in w: print(repr(warning.message)) scorer_ = searcher.scorer_ if isinstance(scorer_, collections.Mapping): is_multimetric = True else: is_multimetric = False best_estimator_ = getattr(searcher, 'best_estimator_') # TODO Solve deep learning models in pipeline if best_estimator_.__class__.__name__ == 'KerasGBatchClassifier': test_score = best_estimator_.evaluate(X_test, scorer=scorer_, is_multimetric=is_multimetric) else: test_score = _score(best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric) if not is_multimetric: test_score = {primary_scoring: test_score} for key, value in test_score.items(): test_score[key] = [value] result_df = pd.DataFrame(test_score) result_df.to_csv(path_or_buf=outfile, sep='\t', header=True, index=False) return searcher
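# Hypothetical call showing the `params['outer_split']` layout consumed by
# _do_train_test_split_val.  Only the 'shuffle' key is read explicitly above;
# the remaining options (test_size, random_state, ...) are assumptions that
# are simply forwarded to galaxy_ml's train_test_split.
def _example_outer_split_eval(searcher, X, y, outfile):
    params_demo = {
        'outer_split': {
            'shuffle': 'stratified',   # 'group' and 'None' are handled above too
            'test_size': 0.25,
            'random_state': 42,
        },
    }
    return _do_train_test_split_val(
        searcher, X, y, params_demo,
        error_score=np.NaN,
        primary_scoring='accuracy',
        outfile=outfile)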
def _fit_and_score_multisignal(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, error_score='raise', logger=logger): """Fit estimator and compute scores for a given dataset split. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape at least 2D The data to fit. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. scorer : A single callable or dict mapping scorer name to the callable If it is a single callable, the return value for ``train_scores`` and ``test_scores`` is a single float. For a dict, it should be one mapping the scorer name to the scorer callable object / function. The callable object / fn should have signature ``scorer(estimator, X, y)``. train : array-like, shape (n_train_samples,) Indices of training samples. test : array-like, shape (n_test_samples,) Indices of test samples. verbose : integer The verbosity level. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : boolean, optional, default: False Compute and return score on training set. return_parameters : boolean, optional, default: False Return parameters that has been used for the estimator. return_n_test_samples : boolean, optional, default: False Whether to return the ``n_test_samples`` return_times : boolean, optional, default: False Whether to return the fit/score times. Returns ------- train_scores : dict of scorer name -> float, optional Score on training set (for all the scorers), returned only if `return_train_score` is `True`. test_scores : dict of scorer name -> float, optional Score on testing set (for all the scorers). n_test_samples : int Number of test samples. fit_time : float Time spent for fitting in seconds. score_time : float Time spent for scoring in seconds. parameters : dict or None, optional The parameters that have been evaluated. 
""" if verbose > 1: if parameters is None: msg = '' else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) logger.info("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) test_scores = {} train_scores = {} if parameters is not None: estimator.set_params(**parameters) start_time = time.time() X_train, y_train = _safe_split_multisignal(estimator, X, y, train) X_test, y_test = _safe_split_multisignal(estimator, X, y, test, train) is_multimetric = not callable(scorer) n_scorers = len(scorer.keys()) if is_multimetric else 1 try: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: # Note fit time as time until error fit_time = time.time() - start_time score_time = 0.0 if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): if is_multimetric: test_scores = dict(zip(scorer.keys(), [error_score, ] * n_scorers)) if return_train_score: train_scores = dict(zip(scorer.keys(), [error_score, ] * n_scorers)) else: test_scores = error_score if return_train_score: train_scores = error_score logger.warning("Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") else: fit_time = time.time() - start_time # _score will return dict if is_multimetric is True test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric) score_time = time.time() - start_time - fit_time if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer, is_multimetric) if verbose > 2: if is_multimetric: for scorer_name, score in test_scores.items(): msg += ", %s=%s" % (scorer_name, score) else: msg += ", score=%s" % test_scores if verbose > 1: total_time = score_time + fit_time end_msg = "%s, total=%s" % (msg, short_format_time(total_time)) logger.info("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_scores, test_scores] if return_train_score else [test_scores] if return_n_test_samples: ret.append(_num_samples(X_test)) if return_times: ret.extend([fit_time, score_time]) if return_parameters: ret.append(parameters) return ret
def main(inputs, infile_estimator, infile1, infile2, outfile_result, outfile_object=None, groups=None): """ Parameter --------- inputs : str File path to galaxy tool parameter infile_estimator : str File path to estimator infile1 : str File path to dataset containing features infile2 : str File path to dataset containing target values outfile_result : str File path to save the results, either cv_results or test result outfile_object : str, optional File path to save searchCV object groups : str File path to dataset containing groups labels """ warnings.simplefilter('ignore') with open(inputs, 'r') as param_handler: params = json.load(param_handler) if groups: (params['search_schemes']['options']['cv_selector'] ['groups_selector']['infile_g']) = groups params_builder = params['search_schemes']['search_params_builder'] input_type = params['input_options']['selected_input'] if input_type == 'tabular': header = 'infer' if params['input_options']['header1'] else None column_option = (params['input_options']['column_selector_options_1'] ['selected_column_selector_option']) if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: c = params['input_options']['column_selector_options_1']['col1'] else: c = None X = read_columns( infile1, c=c, c_option=column_option, sep='\t', header=header, parse_dates=True).astype(float) else: X = mmread(open(infile1, 'r')) header = 'infer' if params['input_options']['header2'] else None column_option = (params['input_options']['column_selector_options_2'] ['selected_column_selector_option2']) if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: c = params['input_options']['column_selector_options_2']['col2'] else: c = None y = read_columns( infile2, c=c, c_option=column_option, sep='\t', header=header, parse_dates=True) y = y.ravel() optimizer = params['search_schemes']['selected_search_scheme'] optimizer = getattr(model_selection, optimizer) options = params['search_schemes']['options'] splitter, groups = get_cv(options.pop('cv_selector')) options['cv'] = splitter options['n_jobs'] = N_JOBS primary_scoring = options['scoring']['primary_scoring'] options['scoring'] = get_scoring(options['scoring']) if options['error_score']: options['error_score'] = 'raise' else: options['error_score'] = np.NaN if options['refit'] and isinstance(options['scoring'], dict): options['refit'] = primary_scoring if 'pre_dispatch' in options and options['pre_dispatch'] == '': options['pre_dispatch'] = None with open(infile_estimator, 'rb') as estimator_handler: estimator = load_model(estimator_handler) memory = joblib.Memory(location=CACHE_DIR, verbose=0) # cache iraps_core fits could increase search speed significantly if estimator.__class__.__name__ == 'IRAPSClassifier': estimator.set_params(memory=memory) else: for p, v in estimator.get_params().items(): if p.endswith('memory'): if len(p) > 8 and p[:-8].endswith('irapsclassifier'): # cache iraps_core fits could increase search # speed significantly new_params = {p: memory} estimator.set_params(**new_params) elif v: new_params = {p: None} estimator.set_params(**new_params) elif p.endswith('n_jobs'): new_params = {p: 1} estimator.set_params(**new_params) param_grid = _eval_search_params(params_builder) searcher = optimizer(estimator, param_grid, **options) # do train_test_split do_train_test_split = params['train_test_split'].pop('do_split') if do_train_test_split == 'yes': # make sure refit is chosen if not 
options['refit']: raise ValueError("Refit must be `True` for shuffle splitting!") split_options = params['train_test_split'] # splits if split_options['shuffle'] == 'stratified': split_options['labels'] = y X, X_test, y, y_test = train_test_split(X, y, **split_options) elif split_options['shuffle'] == 'group': if not groups: raise ValueError("No group based CV option was " "chosen for group shuffle!") split_options['labels'] = groups X, X_test, y, y_test, groups, _ =\ train_test_split(X, y, groups, **split_options) else: if split_options['shuffle'] == 'None': split_options['shuffle'] = None X, X_test, y, y_test =\ train_test_split(X, y, **split_options) # end train_test_split if options['error_score'] == 'raise': searcher.fit(X, y, groups=groups) else: warnings.simplefilter('always', FitFailedWarning) with warnings.catch_warnings(record=True) as w: try: searcher.fit(X, y, groups=groups) except ValueError: pass for warning in w: print(repr(warning.message)) if do_train_test_split == 'no': # save results cv_results = pandas.DataFrame(searcher.cv_results_) cv_results = cv_results[sorted(cv_results.columns)] cv_results.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False) # output test result using best_estimator_ else: best_estimator_ = searcher.best_estimator_ if isinstance(options['scoring'], collections.Mapping): is_multimetric = True else: is_multimetric = False test_score = _score(best_estimator_, X_test, y_test, options['scoring'], is_multimetric=is_multimetric) if not is_multimetric: test_score = {primary_scoring: test_score} for key, value in test_score.items(): test_score[key] = [value] result_df = pandas.DataFrame(test_score) result_df.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False) memory.clear(warn=False) if outfile_object: with open(outfile_object, 'wb') as output_handler: pickle.dump(searcher, output_handler, pickle.HIGHEST_PROTOCOL)
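# Small sketch of the optimizer resolution used in the search main() above:
# `selected_search_scheme` is a string naming a class in sklearn.model_selection
# ('GridSearchCV', 'RandomizedSearchCV', ...), and getattr turns that string
# into the class itself before the tool options are splatted in.  The toy
# estimator, grid and data below are placeholders.
def _example_optimizer_resolution():
    from sklearn import model_selection
    from sklearn.datasets import load_iris
    from sklearn.svm import SVC

    optimizer_cls = getattr(model_selection, 'GridSearchCV')
    searcher_demo = optimizer_cls(SVC(), {'C': [0.1, 1.0, 10.0]},
                                  cv=5, scoring='accuracy', refit=True,
                                  error_score=float('nan'), n_jobs=1)
    searcher_demo.fit(*load_iris(return_X_y=True))
    return searcher_demo.best_params_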