def split_cv(*arrays, y=None, groups=None, cv=3, random_state=None):
    '''Supervised splitting.

    arrays : 2d arrays
        arrays to be split, usually X
    y : 1d array
        class labels; if None, do not stratify
    groups : array-like
        split by groups
    cv : int
        number of splits

    Returns
    -------
    generator of lists containing the split arrays, shape = [m * n * k], e.g. for one fold
    [(0train, 0test), (1train, 1test), ...]
    m - index of fold [0 : cv-1]
    n - index of variable/array [0 : n_arrays-1]
    k - index of train(0)/test(1) set [0:1]
    '''
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    validation.check_consistent_length(*arrays, y, groups)
    arrays = list(arrays)

    if cv == 1:
        if y is not None:
            arrays.append(y)
        return [[(i, i) for i in arrays]]

    # get cross validator
    if y is not None:
        arrays.append(y)
        cv = check_cv(cv, y=y, classifier=True)
    else:
        cv = check_cv(cv, classifier=False)
    # set random state
    if hasattr(cv, 'random_state'):
        cv.random_state = random_state

    # reset_index pandas df or series
    arrays = _reset_index(*arrays)
    arrays = indexable(*arrays)
    # get indexing method
    train_test = ([
        (safe_indexing(i, train_index), safe_indexing(i, test_index))
        for i in arrays
    ] for train_index, test_index in cv.split(arrays[0], y, groups))
    return train_test
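# Usage sketch for split_cv above (assumes its helpers -- validation, _reset_index,
# safe_indexing, indexable, check_cv -- are importable in this module):
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=30, n_features=4, random_state=0)

# One list per fold; each list holds a (train, test) pair per array, X first, then y.
for fold in split_cv(X, y=y, cv=3):
    (X_train, X_test), (y_train, y_test) = fold
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)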
def permutations(estimator, X, y, cv=None, n_permutations=100, random_state=0,
                 scoring=None):
    """Permutation test score.

    This follows the sklearn API of sklearn.model_selection.permutation_test_score,
    modified accordingly to accommodate filtering of features using a correlation
    matrix before running cross-validation with the model.
    """
    Xs, ys = indexable(X, y)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # corr = CorrMatrix()
    # corr.fit(X, y)
    # Xs, ys = corr.transform()
    score = _permutations(clone(estimator), Xs, ys, cv, scorer)
    permutation_scores = np.zeros(n_permutations)
    for i in range(n_permutations):
        # corr_p = CorrMatrix()
        # corr_p.fit(X, y)
        # Xp, yp = corr_p.transform()
        yp = _safe_indexing(y, random_state.permutation(len(y)))
        permutation_scores[i] = _permutations(clone(estimator), Xs, yp, cv, scorer)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue
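# Minimal usage sketch for the permutations helper above (assumes the local
# _permutations helper plus sklearn's indexable, check_cv, check_scoring,
# check_random_state and _safe_indexing are already imported):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=60, n_features=5, random_state=0)
score, perm_scores, pvalue = permutations(
    LogisticRegression(max_iter=1000), X, y,
    cv=3, n_permutations=50, random_state=0, scoring='accuracy')
print('observed score %.3f, permutation p-value %.3f' % (score, pvalue))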
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
        process. If it is an object then it is an object to be used as a
        cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or unbalance) the dataset target as needed
    groups : array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                     X=features,
                                     y=target,
                                     scorer=scorer,
                                     train=train,
                                     test=test,
                                     verbose=0,
                                     parameters=None,
                                     fit_params=sample_weight_dict)
                      for train, test in cv_iter]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
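# Hypothetical call to _wrapped_cross_val_score (assumes TPOT's set_sample_weight
# and TimeoutException helpers, and a private sklearn _fit_and_score that returns a
# list whose first element is the test score, as the slicing above expects):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=100, n_features=6, random_state=0)
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
mean_cv_score = _wrapped_cross_val_score(pipeline, X, y, cv=5,
                                         scoring_function='accuracy')
print(mean_cv_score)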
def fit(self, X, y):
    cv = check_cv(self.cv, y, classifier=True)
    self.estimators_ = []
    self.scores = []
    self.score = 0
    for train, valid in cv.split(X, y):
        score1 = 0
        test = len(y[valid])
        print(X[train].shape)
        print(y[train].shape)
        # Early stopping is monitored on the training fold as written.
        clf = lgb.LGBMClassifier(**self.lgb_params).fit(
            X[train], y[train],
            eval_set=[(X[train], y[train])],
            early_stopping_rounds=15)
        for i in range(0, test):
            # predict expects a 2-D array, so keep the single row 2-D
            yt = clf.predict(X[valid][i:i + 1, :])[0]
            if yt == y[valid][i]:
                score1 += 1
        score1 = score1 / test
        print(score1)
        self.scores.append(score1)
        self.estimators_.append(clf)
    self.score = sum(self.scores) / len(self.scores)
    return self
def permutation_test_score(estimator, X, y, groups=None, cv=None, n_permutations=100, n_jobs=1, random_state=0, verbose=0, scoring=None): """ Evaluate the significance of a cross-validated score with permutations, as in test 1 of [Ojala2010]_. A modification of original sklearn's permutation test score function to evaluate p-value outside this function, so that the score can be reused from outside. .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier Performance. The Journal of Machine Learning Research (2010) vol. 11 """ X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) scorer = check_scoring(estimator, scoring=scoring) random_state = check_random_state(random_state) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( delayed(_permutation_test_score)( clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer) for _ in range(n_permutations)) permutation_scores = np.array(permutation_scores) return permutation_scores
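# Sketch of evaluating the p-value outside the function, as its docstring suggests
# (assumes the private _permutation_test_score and _shuffle helpers are in scope;
# the baseline score here is the unpermuted cross-validated mean):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score as sk_cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=80, n_features=10, random_state=0)
clf = SVC(kernel='linear')
perm_scores = permutation_test_score(clf, X, y, cv=5, n_permutations=100,
                                     random_state=0, scoring='accuracy')
score = sk_cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean()
pvalue = (np.sum(perm_scores >= score) + 1.0) / (len(perm_scores) + 1.0)
print(score, pvalue)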
def test_fit_and_score_return_dict(self): # Scoring accuracy_scorer = make_scorer(accuracy_score, normalize='weighted') # Test estimator dumb = DummyClassifier(strategy='constant', constant=1) # Test custom scorer bagAccScorer = BagScorer(accuracy_scorer, sparse=True) # Rename for easier parameters X = self.train_bags y = self.train_labels scoring = {'bag-scorer': bagAccScorer} estimator = dumb groups = None cv = 3 n_jobs = 3 verbose = 0 pre_dispatch = 6 fit_params = None return_estimator = True error_score = 'raise' return_train_score = True parameters = None # Test _fit_and_score method X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) scorers = _check_multimetric_scoring(estimator, scoring=scoring) # Use one cross-validation split generator = cv.split(X, y, groups) # Get training and test split of training data train, test = next(generator) # Generate scores using BagScorer scores = _fit_and_score(clone(estimator), X, y, scorers, train, test, verbose, parameters, fit_params, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, return_n_test_samples=False, error_score=error_score) # Returned dictionary contains keys self.assertIn('train_scores', scores.keys()) self.assertIn('test_scores', scores.keys()) self.assertIn('fit_time', scores.keys()) self.assertIn('score_time', scores.keys()) self.assertIn('estimator', scores.keys()) return None
def fit_and_save(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=True, parameters=dict(), uuid='', url='http://127.0.0.1:8000'): import json, requests, numpy from sklearn.model_selection._validation import cross_validate X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring) _base_scores = [0. for _ in range(cv.get_n_splits(X, y, groups))] cv_score = {} cv_score.update( {'train_%s' % s: numpy.array(_base_scores) for s in scorers}) cv_score.update( {'test_%s' % s: numpy.array(_base_scores) for s in scorers}) cv_score.update({'fit_time': _base_scores, 'score_time': _base_scores}) try: cv_score = cross_validate(estimator, X, y, groups, scorers, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score) error = None except Exception as e: error = '{}: {}'.format(type(e).__name__, str(e)) try: for k, v in cv_score.items(): if type(v) == type(numpy.array([])): cv_score[k] = v.tolist() response = requests.post('{url}/grids/{uuid}/results'.format( url=url, uuid=uuid), data={ 'gridsearch': uuid, 'params': json.dumps(parameters), 'errors': error, 'cv_data': json.dumps(cv_score) }) except requests.exceptions.ConnectionError as e: response = None if response is None: return return response
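# Hypothetical call to fit_and_save (assumes an older scikit-learn where
# _check_multimetric_scoring returns a (scorers, is_multimetric) tuple, and a
# results service listening at the given URL; the uuid below is a placeholder):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=40, n_features=5, random_state=0)
response = fit_and_save(LogisticRegression(max_iter=200), X, y,
                        scoring='accuracy', cv=3,
                        parameters={'C': 1.0}, uuid='example-uuid',
                        url='http://127.0.0.1:8000')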
def cross_val_predict(estimator, X, y=None, groups=None, cv='warn',
                      n_jobs=None, verbose=0, fit_params=None,
                      pre_dispatch='2*n_jobs', method='predict'):
    """
    Minor modifications and simplifications brought to the sklearn function in
    order to allow for application with a non-partition CV scheme.
    """
    # y is expected to be a pandas Series/DataFrame so its index can be reused below.
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _ in prediction_blocks]
    predictions = np.concatenate(predictions)
    test_indices = np.concatenate([indices_i
                                   for _, indices_i in prediction_blocks])
    test_index = [y.index[_] for _ in test_indices]

    if y.ndim == 1:
        return pd.Series(predictions, index=test_index)
    elif y.ndim > 1:
        return pd.DataFrame(predictions, index=test_index)
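# Usage sketch for the pandas-aware cross_val_predict above (assumes sklearn's
# private _fit_and_predict plus joblib's Parallel/delayed are imported here):
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)
preds = cross_val_predict(DecisionTreeClassifier(random_state=0), X, y, cv=5)
print(preds.head())  # a pandas Series aligned with the original sample index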
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): """ Evaluate a score by cross-validation """ if not isinstance(scoring, (list, tuple)): scoring = [scoring] X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) splits = list(cv.split(X, y, groups)) scorer = [check_scoring(estimator, scoring=s) for s in scoring] # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, verbose, None, fit_params) for train, test in splits) group_order = [] if hasattr(cv, 'groups'): group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits] return np.squeeze(np.array(scores)), group_order
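# Usage sketch for the multi-scorer cross_val_score variant above (assumes the
# _fit_and_score used here accepts a list of scorers; group_order stays empty
# unless the CV splitter exposes a .groups attribute):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=60, n_features=5, random_state=0)
scores, group_order = cross_val_score(LogisticRegression(max_iter=500), X, y,
                                      scoring=['accuracy', 'roc_auc'], cv=3)
print(scores.shape, group_order)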
def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. Parameters ---------- X : array-like of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like of shape (n_samples, n_output) \ or (n_samples,), default=None Target relative to X for classification or regression; None for unsupervised learning. groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). **fit_params : dict of str -> object Parameters passed to the ``fit`` method of the estimator """ estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) scorers, self.multimetric_ = _check_multimetric_scoring( self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) self._run_search(X, y, cv) return self
def learning_curve(estimator, X, mixed_y, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=1, pre_dispatch="all", verbose=0, shuffle=False, random_state=None): """Learning curve.""" if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): raise ValueError("An estimator must support the partial_fit interface " "to exploit incremental learning") # TODO: wrapper patch, key hard coding? _y = mixed_y['classifier'] if isinstance(mixed_y, dict) else mixed_y X, y, groups = indexable(X, _y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) # Store it as list as we will be iterating over the list multiple times cv_iter = list(cv.split(X, y, groups)) scorer = check_scoring(estimator, scoring=scoring) n_max_training_samples = len(cv_iter[0][0]) # Because the lengths of folds can be significantly different, it is # not guaranteed that we use all of the available training data when we # use the first 'n_max_training_samples' samples. train_sizes_abs = _translate_train_sizes(train_sizes, n_max_training_samples) n_unique_ticks = train_sizes_abs.shape[0] if verbose > 0: print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) if shuffle: rng = check_random_state(random_state) cv_iter = ((rng.permutation(train), test) for train, test in cv_iter) if exploit_incremental_learning: classes = np.unique(y) if is_classifier(estimator) else None out = parallel(delayed(_incremental_fit_estimator)( clone(estimator), X, mixed_y, classes, train, test, train_sizes_abs, scorer, verbose) for train, test in cv_iter) else: train_test_proportions = [] for train, test in cv_iter: for n_train_samples in train_sizes_abs: train_test_proportions.append((train[:n_train_samples], test)) out = parallel(delayed(_fit_and_score)( clone(estimator), X, mixed_y, scorer, train, test, verbose, parameters=None, fit_params=None, return_train_score=True) for train, test in train_test_proportions) out = np.array(out) n_cv_folds = out.shape[0] // n_unique_ticks out = out.reshape(n_cv_folds, n_unique_ticks, 2) out = np.asarray(out).transpose((2, 1, 0)) return train_sizes_abs, out[0], out[1]
def _get_cv(self, y_tr): cv = check_cv(self.cv, y_tr, classifier=is_classifier(self.estimator)) return cv
def fit(self, X, y, groups=None): """Actual fitting, performing the search over parameters.""" results = dict() best_index = None best_parameters = None for bracket_idx in range(self.num_brackets - 1, -1, -1): successive_halving_steps = bracket_idx + 1 # TODO: num_arms should be different estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) base_estimator = clone(self.estimator) arms_pulled = 0 if 'mean_test_score' in results: arms_pulled = len(results['mean_test_score']) res = self._successive_halving(X, y, groups, cv, self.eta, successive_halving_steps - 1, self.num_brackets - 1) bracket_results, bracket_best_index, bracket_best_parameters = res for key, values in bracket_results.items(): if key not in results: results[key] = values else: results[key] = np.append(results[key], values) if best_index is None: best_index = bracket_best_index + arms_pulled best_parameters = bracket_best_parameters elif bracket_results['mean_test_score'][ bracket_best_index] > results['mean_test_score'][ best_index]: best_index = bracket_best_index + arms_pulled best_parameters = bracket_best_parameters self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best_parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def regress(exp: Experiment, field, estimator, cv=RepeatedSortedStratifiedKFold(3, 1), params=None): '''Evaluate regression during cross validation. Parameters ---------- field : str column name in the sample metadata, which contains the variable we want to predict. estimator : estimator object implementing `fit` and `predict` scikit-learn estimator. e.g. :class:`sklearn.ensemble.RandomForestRegressor` cv : int, cross-validation generator or an iterable similar to the `cv` parameter in :class:`sklearn.model_selection.GridSearchCV` params : dict of string to sequence, or sequence of such For example, the output of :class:`sklearn.model_selection.ParameterGrid` or :class:`sklearn.model_selection.ParameterSampler`. By default, it uses whatever default parameters of the `estimator` set in `scikit-learn` Yields ------ pandas.DataFrame The result of prediction per sample for a given parameter set. It contains the following columns: - Y_TRUE: the true value for the samples - SAMPLE: sample IDs - CV: which split of the cross validation - Y_PRED: the predicted value for the samples ''' X = exp.data y = exp.sample_metadata[field] cv = check_cv(cv, y, classifier=is_classifier(estimator)) if params is None: # use sklearn default param values for the given estimator params = [{}] for param in params: logger.debug('run regression with parameters: %r' % param) dfs = [] for i, (train, test) in enumerate(cv.split(X, y)): # deep copy the model by clone to avoid the impact from last iteration of fit. model = clone(estimator) model = model.set_params(**param) model.fit(X[train], y[train]) pred = model.predict(X[test]) df = pd.DataFrame({ 'Y_PRED': pred, 'Y_TRUE': y[test].values, 'SAMPLE': y[test].index.values, 'CV': i }) dfs.append(df) yield pd.concat(dfs, axis=0).reset_index(drop=True)
def _check_input_parameters(self, X, y, groups): if (self.budget_on != 'n_samples' and self.budget_on not in self.estimator.get_params()): raise ValueError( 'Cannot budget on parameter {} which is not supported ' 'by estimator {}'.format(self.budget_on, self.estimator.__class__.__name__)) if isinstance(self.max_budget, str) and self.max_budget != 'auto': raise ValueError( "max_budget must be either 'auto' or a positive number") if self.max_budget != 'auto' and self.max_budget <= 0: raise ValueError( "max_budget must be either 'auto' or a positive number") if isinstance(self.r_min, str) and self.r_min != 'auto': raise ValueError( "r_min must be either 'auto' or a positive number no greater " "than max_budget.") if self.r_min != 'auto' and self.r_min <= 0: raise ValueError( "r_min must be either 'auto' or a positive number no greater " "than max_budget.") if self.force_exhaust_budget and self.r_min != 'auto': raise ValueError( 'r_min must be set to auto if force_exhaust_budget is True.') self.r_min_ = self.r_min if self.r_min_ == 'auto': if self.budget_on == 'n_samples': cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) n_splits = cv.get_n_splits(X, y, groups) # please see https://gph.is/1KjihQe for a justification magic_factor = 2 self.r_min_ = n_splits * magic_factor if is_classifier(self.estimator): n_classes = np.unique(y).shape[0] self.r_min_ *= n_classes else: self.r_min_ = 1 self.max_budget_ = self.max_budget if self.max_budget_ == 'auto': if self.budget_on == 'n_samples': self.max_budget_ = X.shape[0] else: self.max_budget_ = 20 # FIXME # n_candidates * r_min?? if self.r_min_ > self.max_budget_: raise ValueError( 'r_min_={} is greater than max_budget_={}.'.format( self.r_min_, self.max_budget_))
def fit(self, X, y=None, groups=None, **fit_params): """Run fit with all sets of parameters. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. **fit_params : dict of string -> object Parameters passed to the ``fit`` method of the estimator """ cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) scorers, self.multimetric_ = _check_multimetric_scoring( self.estimator, scoring=self.scoring) score_function = partial( cross_val_score, X=X, y=y, groups=groups, scoring=self.scoring, cv=cv, n_jobs=self.n_jobs, verbose=self.verbose, fit_params=fit_params) self.f = partial( _fit_score, mdl=self.estimator, param_names=self.param_names, score_function=score_function) self.objective = SingleObjective( self.f, self.batch_size, self.objective_name) self._init_design_chooser() self.run_optimization(max_iter=self.max_iter, verbosity=self.verbosity) self.best_index_ = self.Y.argmin() self.best_params_ = dict(zip(self.param_names, 10 ** self.X[self.best_index_])) self.best_score_ = self.Y[self.Y.argmin()] # Store the only scorer not as a dict for single metric evaluation self.scorer_ = scorers if self.multimetric_ else scorers['score'] if self.refit: self.best_estimator_ = clone(self.estimator).set_params( **self.best_params_) if y is not None: self.best_estimator_.fit(X, y, **fit_params) else: self.best_estimator_.fit(X, **fit_params) return self
def fit(self, X, y): y_labels = self._get_labels(y, self.n_classes) cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator)) self.estimators_ = [] for train, _ in cv.split(X, y_labels): self.estimators_.append( clone(self.estimator).fit(X[train], y_labels[train]) ) return self
def transform(self, X, y=None): cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) X_prob = np.zeros((X.shape[0], self.n_classes)) X_pred = np.zeros(X.shape[0]) for estimator, (_, test) in zip(self.estimators_, cv.split(X)): X_prob[test] = estimator.predict_proba(X[test]) X_pred[test] = estimator.predict(X[test]) return np.hstack([X_prob, np.array([X_pred]).T])
def fit(self, X, y): y_labels = self._get_labels(y) cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator)) self.estimators_ = [] for train, _ in cv.split(X, y_labels): self.estimators_.append( clone(self.estimator).fit(X[train], y_labels[train]) ) return self
def fit(self, X, y, **fit_params): cv = check_cv(self.cv, y, classifier=False) self.estimators_ = [] for train, valid in cv.split(X, y): self.estimators_.append( xgb.XGBRegressor(**self.xgb_params).fit(X[train], y[train], eval_set=[(X[valid], y[valid])], **self.fit_params)) return self
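# One way to use the per-fold ensemble fitted above: average the predictions of the
# XGBRegressor trained on each CV fold (the helper name below is an assumption; the
# surrounding wrapper class is not shown here):
import numpy as np

def predict_with_fold_ensemble(estimators_, X):
    # Mean prediction over the boosters fitted on each fold.
    return np.mean([est.predict(X) for est in estimators_], axis=0)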
def fit(self, X, y):
    print("Random forest fitting started")
    cv = check_cv(self.cv, y, classifier=False)
    self.estimators_ = []
    self.scores = []
    for train, valid in cv.split(X, y):
        model = RandomForestRegressor(**self.rf_params).fit(
            X.iloc[train], y.iloc[train])
        self.estimators_.append(model)
        score = rmsple(y.iloc[valid], model.predict(X.iloc[valid]))
        self.scores.append(score)
    return self
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
        process. If it is an object then it is an object to be used as a
        cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or unbalance) the dataset target as needed
    groups : array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                     X=features,
                                     y=target,
                                     scorer=scorer,
                                     train=train,
                                     test=test,
                                     verbose=0,
                                     parameters=None,
                                     fit_params=sample_weight_dict)
                      for train, test in cv_iter]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
def cross_validate(estimator, X, mixed_y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score="warn"): """Evaluate metric(s) by cross-validation and also record fit/score times.""" # TODO: wrapper patch, key hard coding? _y = mixed_y['classifier'] if isinstance(mixed_y, dict) else mixed_y X, y, groups = indexable(X, _y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel( delayed(_fit_and_score)( clone(estimator), X, mixed_y, scorers, train, test, verbose, None, fit_params, return_train_score=return_train_score, return_times=True) for train, test in cv.split(X, y, groups)) if return_train_score: train_scores, test_scores, fit_times, score_times = zip(*scores) train_scores = _aggregate_score_dicts(train_scores) else: test_scores, fit_times, score_times = zip(*scores) test_scores = _aggregate_score_dicts(test_scores) # TODO: replace by a dict in 0.21 ret = DeprecationDict() if return_train_score == 'warn' else {} ret['fit_time'] = np.array(fit_times) ret['score_time'] = np.array(score_times) for name in scorers: ret['test_%s' % name] = np.array(test_scores[name]) if return_train_score: key = 'train_%s' % name ret[key] = np.array(train_scores[name]) if return_train_score == 'warn': message = ( 'You are accessing a training score ({!r}), ' 'which will not be available by default ' 'any more in 0.21. If you need training scores, ' 'please set return_train_score=True').format(key) # warn on key access ret.add_warning(key, message, FutureWarning) return ret
def create_features(self):
    self.n_class = 5
    self.cv = 5
    estimator = self.get_rfc()
    estimators = []
    # `train`, `test` and `y` are assumed to be available in the enclosing scope
    # (e.g. loaded elsewhere as DataFrames/Series).
    y_labels = self._get_labels(y)
    cv = check_cv(self.cv, y_labels, classifier=is_classifier(estimator))
    for tr_idx, _ in cv.split(train, y_labels):
        estimators.append(
            clone(estimator).fit(train.loc[tr_idx], y_labels[tr_idx]))

    train_prob = np.zeros([train.shape[0], self.n_class])
    train_pred = np.zeros(train.shape[0])
    test_prob = np.zeros([test.shape[0], self.n_class])
    test_pred = np.zeros(test.shape[0])

    # Without y, check_cv falls back to a plain (unstratified) KFold for indexing.
    cv = check_cv(self.cv, classifier=is_classifier(estimator))
    for estimator, (_, te_idx) in zip(estimators, cv.split(train)):
        train_prob[te_idx] = estimator.predict_proba(train.loc[te_idx])
        train_pred[te_idx] = estimator.predict(train.loc[te_idx])
    for estimator, (_, te_idx) in zip(estimators, cv.split(test)):
        test_prob[te_idx] = estimator.predict_proba(test.loc[te_idx])
        test_pred[te_idx] = estimator.predict(test.loc[te_idx])

    tmp_train = pd.DataFrame(train_prob)
    tmp_test = pd.DataFrame(test_prob)
    tmp_train["class_pred"] = np.array([train_pred]).T
    tmp_test["class_pred"] = np.array([test_pred]).T
    columns = ["{}_prob".format(i) for i in range(self.n_class)] + ["class_pred"]
    tmp_train.columns = columns
    tmp_test.columns = columns
    self.train = tmp_train
    self.test = tmp_test
def fit(self, X, y): X = X.replace([np.inf, -np.inf], np.nan) X = X.fillna(0) y_labels = self._get_labels(y) cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator)) self.estimators_ = [] for train, _ in cv.split(X, y_labels): X = np.array(X) self.estimators_.append( clone(self.estimator).fit(X[train], y_labels[train])) return self
def fit(self, X, y, **fit_params): cv = check_cv(self.cv, y, classifier=False) self.estimators_ = [] for train, valid in cv.split(X, y): self.estimators_.append( xgb.XGBRegressor(**self.xgb_params).fit( X[train], y[train], eval_set=[(X[valid], y[valid])], **self.fit_params ) ) return self
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', datasets=None): """ Evaluate a score by cross-validation """ if not isinstance(scoring, (list, tuple)): scoring = [scoring] X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) splits = list(cv.split(X, y, groups)) scorer = [check_scoring(estimator, scoring=s) for s in scoring] # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) """ if datasets is None: datasets = read_all_datasets() """ scores = parallel( delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, verbose, None, fit_params, to_evaluate=datasets) for train, test in splits) group_order = [] if hasattr(cv, 'groups'): group_order = [ np.array(cv.groups)[test].tolist()[0] for _, test in splits ] return np.squeeze(np.array(scores)), group_order #scores
def fit(self, X, y, **fit_xgb_params): cv = check_cv(self.cv, y, classifier=False) print(X.shape, " ", y.shape) self.estimators_ = [] self.scores = [] for train, valid in cv.split(X, y): model = xgb.XGBRegressor(**self.xgb_params).fit( X.iloc[train], y.iloc[train], eval_set=[(X.iloc[valid], y.iloc[valid])], **self.fit_xgb_params) self.estimators_.append(model) score = rmsple(y.iloc[valid], model.predict(X.iloc[valid])) self.scores.append(score) return self
def cross_val_transform(target_encoder, X, y=None, cv=5, classifier=False, n_jobs=None): cv = check_cv(cv, y, classifier=classifier) splits = list(cv.split(X, y)) transform_outputs = Parallel(n_jobs=n_jobs)( delayed(_fit_and_transform)(clone(target_encoder), X, y, train_idx, test_idx) for train_idx, test_idx in splits) output = np.zeros_like(X) for (_, test_idx), transform_output in zip(splits, transform_outputs): output[test_idx] = transform_output return output
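# Minimal sketch of using cross_val_transform (assumes the _fit_and_transform helper
# fits the encoder on the train indices and transforms the held-out indices, and that
# joblib's Parallel/delayed are imported). StandardScaler stands in for a target
# encoder just to show the out-of-fold mechanics:
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.RandomState(0).rand(20, 3)
y = np.arange(20) % 2
X_oof = cross_val_transform(StandardScaler(), X, y, cv=5, classifier=True)
print(X_oof.shape)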
def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                      verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                      method='predict', pickle_predictions=False,
                      **pickler_kwargs):
    """Please see sklearn for documentation.

    This has only been modified so binned regressors can return probabilities
    and predictions can be cached during computation.
    """
    X, y, groups = indexable(X, y, groups)
    pickler = CachingPickler(**pickler_kwargs) if pickle_predictions else None
    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    if method in ['decision_function', 'predict_proba',
                  'predict_log_proba'] and is_classifier(estimator):
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method, pickler)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    if pickle_predictions:
        predictions = [pickler.unpickle_data(pred_block_i)
                       for pred_block_i, _ in prediction_blocks]
    else:
        predictions = [pred_block_i for pred_block_i, _ in prediction_blocks]
    test_indices = np.concatenate([indices_i
                                   for _, indices_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    return predictions[inv_test_indices]
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, fit_params=None, verbose=0): X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) scorer = check_scoring(estimator, scoring=scoring) scores = [] for train, test in cv.split(X, y, groups): score = delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, \ verbose, None, fit_params) scores.append(score) result = delayed(concat_cv_scores)(scores) return result
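# Sketch of computing the lazily-built result returned by the variant above (assumes
# `delayed` is dask.delayed and that concat_cv_scores/_fit_and_score are defined):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=50, n_features=5, random_state=0)
lazy_scores = cross_val_score(LogisticRegression(max_iter=200), X, y, cv=3)
scores = lazy_scores.compute()  # triggers the actual per-fold fits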
def _plot(cls, estimator, X, y, train_sizes=None, cv=None, n_jobs=1, ax=None, cmap='tab10'): cv = check_cv(cv) plotter = cls()._create(estimator, X, y, train_sizes=train_sizes, cv=cv, n_jobs=n_jobs) plotter.plot(ax=ax, cmap=cmap) return plotter
def cross_val_predict_proba(estimator, X, y, groups = None, cv = None, n_jobs = 1, verbose = 0, fit_params = None, pre_dispatch = '2*n_jobs'): ''' Gets class probability predictions for test examples over cross validations runs. Adapted from mne.decoding.base.cross_val_multiscore(). See that func's documentation for details on inputs. ''' import time import numbers from mne.parallel import parallel_func from mne.fixes import is_classifier from sklearn.base import clone from sklearn.utils import indexable from sklearn.model_selection._split import check_cv # check arguments X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier = is_classifier(estimator)) cv_iter = list(cv.split(X, y, groups)) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. # Note: this parallelization is implemented using MNE Parallel parallel, p_func, n_jobs = parallel_func(_predict_proba, n_jobs, pre_dispatch = pre_dispatch) preds = parallel(p_func(clone(estimator), X, y, train, test, 0, None, fit_params) for train, test in cv_iter) # flatten over parallel output y_hat = np.concatenate([p[0] for p in preds], axis = 0) is_y_true = True try: y_true = np.concatenate([p[1] for p in preds], axis = 0) except: # learner was unsupervised is_y_true = False # return results if is_y_true: return y_hat, y_true else: return y_hat
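# Example call for cross_val_predict_proba above (assumes the mne imports inside the
# function resolve and that a _predict_proba helper is defined in this module):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=60, n_features=8, random_state=0)
y_hat, y_true = cross_val_predict_proba(LogisticRegression(max_iter=1000), X, y, cv=5)
print(y_hat.shape)  # (n_samples, n_classes) probabilities from held-out folds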
def _run_search(self, evaluate_candidates, X, y): rng = check_random_state(self.random_state) candidate_params = list(self._generate_candidate_params()) n_iterations = int(ceil(log2(len(candidate_params)))) print(n_iterations) n_samples_total = X.shape[0] cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) n_classes = len(np.unique(y)) if is_classifier(self.estimator) else 1 min_n_samples = cv.get_n_splits(X, y) * n_classes * 2 # max_iter = int(ceil(n_samples_total / (min_n_samples * n_candidates))) # n_iterations = min(n_iterations, max_iter) for iter_i in range(n_iterations): n_candidates = len(candidate_params) # randomly sample training samples n_samples_iter = floor(n_samples_total / (n_candidates * n_iterations)) if is_classifier(self.estimator): n_samples_iter = max(n_samples_iter, min_n_samples) print("n_samples_iter: {}".format(n_samples_iter)) indices = rng.choice(n_samples_total, n_samples_iter, replace=False) X_iter, y_iter = X[indices], y[indices] more_results= {'iter': [iter_i] * n_candidates, 'n_samples': [n_samples_iter] * n_candidates} out = evaluate_candidates(candidate_params, X_iter, y_iter, more_results=more_results) # Select the best half of the candidates for the next iteration # We need to filter out candidates from the previous iterations n_candidates_to_keep = ceil(n_candidates / 2) best_candidates_indices = np.argsort(out['mean_test_score'])[::-1] best_candidates_indices = [i for i in best_candidates_indices if out['iter'][i] == iter_i] best_candidates_indices = \ best_candidates_indices[:n_candidates_to_keep] candidate_params = [out['params'][i] for i in best_candidates_indices] assert len(candidate_params) == n_candidates_to_keep == 1
def fit(self, X, y):
    cv = check_cv(self.cv, y, classifier=True)
    self.estimators_ = []
    self.scores = []
    self.score = 0
    for train, valid in cv.split(X, y):
        score1 = 0
        test = len(y[valid])
        clf = SGDClassifier(**self.sgd_params).fit(X[train], y[train])
        for i in range(0, test):
            # predict expects a 2-D array, so keep the single row 2-D
            yt = clf.predict(X[valid][i:i + 1, :])[0]
            if yt == y[valid][i]:
                score1 += 1
        score1 = score1 / test
        print(score1)
        self.scores.append(score1)
        self.estimators_.append(clf)
    self.score = sum(self.scores) / len(self.scores)
    return self
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None, use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
        process. If it is an object then it is an object to be used as a
        cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or unbalance) the dataset target as needed
    groups : array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except ImportError:
            msg = "'use_dask' requires the optional dask and dask-ml dependencies."
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [cv_results['split{}_test_score'.format(i)]
                  for i in range(n_splits)]
        CV_score = dask.delayed(np.array)(scores)[:, 0]

        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                         X=features,
                                         y=target,
                                         scorer=scorer,
                                         train=train,
                                         test=test,
                                         verbose=0,
                                         parameters=None,
                                         fit_params=sample_weight_dict)
                          for train, test in cv_iter]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')
def fit(self, X, y, groups=None, sample_weight=None): """ Fit ensemble classifers and the meta-classifier. Parameters ---------- X : numpy array, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : numpy array, shape = [n_samples] Target values. groups : numpy array/None, shape = [n_samples] The group that each sample belongs to. This is used by specific folding strategies such as GroupKFold() sample_weight : array-like, shape = [n_samples], optional Sample weights passed as sample_weights to each regressor in the regressors list as well as the meta_regressor. Raises error if some regressor does not support sample_weight in the fit() method. Returns ------- self : object """ if self.use_clones: self.clfs_ = clone(self.classifiers) self.meta_clf_ = clone(self.meta_classifier) else: self.clfs_ = self.classifiers self.meta_clf_ = self.meta_classifier if self.verbose > 0: print("Fitting %d classifiers..." % (len(self.classifiers))) final_cv = check_cv(self.cv, y, classifier=self.stratify) if isinstance(self.cv, int): # Override shuffle parameter in case of self generated # cross-validation strategy final_cv.shuffle = self.shuffle final_cv.random_state = self.random_state # Input validation. X, y = check_X_y(X, y, accept_sparse=['csc', 'csr']) if sample_weight is None: fit_params = None else: fit_params = dict(sample_weight=sample_weight) meta_features = None for n, model in enumerate(self.clfs_): if self.verbose > 0: i = self.clfs_.index(model) + 1 print("Fitting classifier%d: %s (%d/%d)" % (i, _name_estimators((model,))[0][0], i, len(self.clfs_))) if self.verbose > 2: if hasattr(model, 'verbose'): model.set_params(verbose=self.verbose - 2) if self.verbose > 1: print(_name_estimators((model,))[0][1]) prediction = cross_val_predict( model, X, y, groups=groups, cv=final_cv, n_jobs=self.n_jobs, fit_params=fit_params, verbose=self.verbose, pre_dispatch=self.pre_dispatch, method='predict_proba' if self.use_probas else 'predict') if not self.use_probas: prediction = prediction[:, np.newaxis] elif self.drop_last_proba: prediction = prediction[:, :-1] if meta_features is None: meta_features = prediction else: meta_features = np.column_stack((meta_features, prediction)) if self.store_train_meta_features: self.train_meta_features_ = meta_features # Fit the base models correctly this time using ALL the training set for model in self.clfs_: if sample_weight is None: model.fit(X, y) else: model.fit(X, y, sample_weight=sample_weight) # Fit the secondary model if self.use_features_in_secondary: meta_features = self._stack_first_level_features( X, meta_features ) if sample_weight is None: self.meta_clf_.fit(meta_features, y) else: self.meta_clf_.fit(meta_features, y, sample_weight=sample_weight) return self
def fit(self, X, y, groups=None): """ Fit ensemble classifers and the meta-classifier. Parameters ---------- X : numpy array, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : numpy array, shape = [n_samples] Target values. groups : numpy array/None, shape = [n_samples] The group that each sample belongs to. This is used by specific folding strategies such as GroupKFold() Returns ------- self : object """ if self.use_clones: self.clfs_ = [clone(clf) for clf in self.classifiers] self.meta_clf_ = clone(self.meta_classifier) else: self.clfs_ = self.classifiers self.meta_clf_ = self.meta_classifier if self.verbose > 0: print("Fitting %d classifiers..." % (len(self.classifiers))) final_cv = check_cv(self.cv, y, classifier=self.stratify) if isinstance(self.cv, int): # Override shuffle parameter in case of self generated # cross-validation strategy final_cv.shuffle = self.shuffle skf = list(final_cv.split(X, y, groups)) all_model_predictions = np.array([]).reshape(len(y), 0) for model in self.clfs_: if self.verbose > 0: i = self.clfs_.index(model) + 1 print("Fitting classifier%d: %s (%d/%d)" % (i, _name_estimators((model,))[0][0], i, len(self.clfs_))) if self.verbose > 2: if hasattr(model, 'verbose'): model.set_params(verbose=self.verbose - 2) if self.verbose > 1: print(_name_estimators((model,))[0][1]) if not self.use_probas: single_model_prediction = np.array([]).reshape(0, 1) else: single_model_prediction = np.array([]).reshape(0, len(set(y))) for num, (train_index, test_index) in enumerate(skf): if self.verbose > 0: print("Training and fitting fold %d of %d..." % ((num + 1), final_cv.get_n_splits())) try: model.fit(X[train_index], y[train_index]) except TypeError as e: raise TypeError(str(e) + '\nPlease check that X and y' 'are NumPy arrays. If X and y are lists' ' of lists,\ntry passing them as' ' numpy.array(X)' ' and numpy.array(y).') except KeyError as e: raise KeyError(str(e) + '\nPlease check that X and y' ' are NumPy arrays. If X and y are pandas' ' DataFrames,\ntry passing them as' ' X.values' ' and y.values.') if not self.use_probas: prediction = model.predict(X[test_index]) prediction = prediction.reshape(prediction.shape[0], 1) else: prediction = model.predict_proba(X[test_index]) single_model_prediction = np.vstack([single_model_prediction. astype(prediction.dtype), prediction]) all_model_predictions = np.hstack([all_model_predictions. astype(single_model_prediction. dtype), single_model_prediction]) if self.store_train_meta_features: # Store the meta features in the order of the # original X,y arrays reodered_indices = np.array([]).astype(y.dtype) for train_index, test_index in skf: reodered_indices = np.concatenate((reodered_indices, test_index)) self.train_meta_features_ = all_model_predictions[np.argsort( reodered_indices)] # We have to shuffle the labels in the same order as we generated # predictions during CV (we kinda shuffled them when we did # Stratified CV). 
# We also do the same with the features (we will need this only IF # use_features_in_secondary is True) reordered_labels = np.array([]).astype(y.dtype) reordered_features = np.array([]).reshape((0, X.shape[1]))\ .astype(X.dtype) for train_index, test_index in skf: reordered_labels = np.concatenate((reordered_labels, y[test_index])) reordered_features = np.concatenate((reordered_features, X[test_index])) # Fit the base models correctly this time using ALL the training set for model in self.clfs_: model.fit(X, y) # Fit the secondary model if not self.use_features_in_secondary: self.meta_clf_.fit(all_model_predictions, reordered_labels) else: self.meta_clf_.fit(np.hstack((reordered_features, all_model_predictions)), reordered_labels) return self
def fit(self, X, y, groups=None): """ Fit ensemble regressors and the meta-regressor. Parameters ---------- X : numpy array, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : numpy array, shape = [n_samples] Target values. groups : numpy array/None, shape = [n_samples] The group that each sample belongs to. This is used by specific folding strategies such as GroupKFold() Returns ------- self : object """ if self.refit: self.regr_ = [clone(clf) for clf in self.regressors] self.meta_regr_ = clone(self.meta_regressor) else: self.regr_ = self.regressors self.meta_regr_ = self.meta_regressor kfold = check_cv(self.cv, y) if isinstance(self.cv, int): # Override shuffle parameter in case of self generated # cross-validation strategy kfold.shuffle = self.shuffle meta_features = np.zeros((X.shape[0], len(self.regressors))) # # The outer loop iterates over the base-regressors. Each regressor # is trained cv times and makes predictions, after which we train # the meta-regressor on their combined results. # for i, regr in enumerate(self.regressors): # # In the inner loop, each model is trained cv times on the # training-part of this fold of data; and the holdout-part of data # is used for predictions. This is repeated cv times, so in # the end we have predictions for each data point. # # Advantage of this complex approach is that data points we're # predicting have not been trained on by the algorithm, so it's # less susceptible to overfitting. # for train_idx, holdout_idx in kfold.split(X, y, groups): instance = clone(regr) instance.fit(X[train_idx], y[train_idx]) y_pred = instance.predict(X[holdout_idx]) meta_features[holdout_idx, i] = y_pred # save meta-features for training data if self.store_train_meta_features: self.train_meta_features_ = meta_features # Train meta-model on the out-of-fold predictions if self.use_features_in_secondary: self.meta_regr_.fit(np.hstack((X, meta_features)), y) else: self.meta_regr_.fit(meta_features, y) # Retrain base models on all data for regr in self.regr_: regr.fit(X, y) return self
def fit(self): LOG.info('Start fitting ...') gs_cv_params = {'n_jobs': self.n_jobs, 'cv': _cv_build(self.cv_inner), 'verbose': 0} zscore_cv_auc = [] zscore_cv_acc = [] split_id = 0 for dozs in [False, True]: LOG.info('Generate %sz-scored sample ...', '' if dozs else 'non ') X, y, groups = self._generate_sample(zscored=dozs) # The inner CV loop is a grid search on clf_params LOG.info('Creating ModelAndGridSearchCV') inner_cv = ModelAndGridSearchCV(self.param, **gs_cv_params) # Some sklearn's validations scoring = check_scoring(inner_cv, scoring=self._scorer) cv_outer = check_cv(_cv_build(self.cv_outer), y, classifier=is_classifier(inner_cv)) # Outer CV loop outer_cv_scores = [] outer_cv_acc = [] LOG.info('Starting nested cross-validation ...') for train, test in list(cv_outer.split(X, y, groups)): # Find the groups in the train set, in case inner CV is LOSO. fit_params = None if self.cv_inner.get('type') == 'loso': train_groups = [groups[i] for i in train] fit_params = {'groups': train_groups} result = nested_fit_and_score( clone(inner_cv), X, y, scoring, train, test, fit_params=fit_params, verbose=1) # Test group has no positive cases if result is None: continue score, clf = result test_group = list(set(groups[i] for i in test))[0] self._models.append({ # 'clf_type': clf_str, 'zscored': int(dozs), 'outer_split_id': split_id, 'left-out-sites': self.sites[test_group], 'best_model': clf.best_model_, 'best_params': clf.best_params_, 'best_score': clf.best_score_, 'best_index': clf.best_index_, 'cv_results': clf.cv_results_, 'cv_scores': score['test']['score'], 'cv_accuracy': score['test']['accuracy'], 'cv_params': clf.cv_results_['params'], 'cv_auc_means': clf.cv_results_['mean_test_score'], 'cv_splits': {'split%03d' % i: clf.cv_results_['split%d_test_score' % i] for i in list(range(clf.n_splits_))} }) # Store the outer loop scores if score['test']['score'] is not None: outer_cv_scores.append(score['test']['score']) outer_cv_acc.append(score['test']['accuracy']) split_id += 1 # LOG.info( # '[%s-%szs] Outer CV: roc_auc=%f, accuracy=%f, ' # 'Inner CV: best roc_auc=%f, params=%s. ', # clf.best_model_[0], 'n' if not dozs else '', # score['test']['score'] if score['test']['score'] is not None else -1.0, # score['test']['accuracy'], # clf.best_score_, clf.best_model_[1]) LOG.info('Outer CV loop finished, %s=%f (+/-%f), accuracy=%f (+/-%f)', self._scorer, np.mean(outer_cv_scores), 2 * np.std(outer_cv_scores), np.mean(outer_cv_acc), 2 * np.std(outer_cv_acc)) zscore_cv_auc.append(outer_cv_scores) zscore_cv_acc.append(outer_cv_acc) # Select best performing model best_inner_loops = [model['best_score'] for model in self._models] best_idx = np.argmax(best_inner_loops) self._best_model = self._models[best_idx] LOG.info('Inner CV [%d models compared] - best model %s-%szs, score=%f, params=%s', len(best_inner_loops) * len(self._models[0]['cv_params']), self._best_model['best_model'][0], 'n' if not self._best_model['zscored'] else '', self._best_model['best_score'], self._best_model['best_params']) # Write out evaluation result best_zs = 1 if self._best_model['zscored'] else 0 LOG.info('CV - estimated performance: %s=%f (+/-%f), accuracy=%f (+/-%f)', self._scorer, np.mean(zscore_cv_auc[best_zs]), 2 * np.std(zscore_cv_auc[best_zs]), np.mean(zscore_cv_acc[best_zs]), 2 * np.std(zscore_cv_acc[best_zs]), )
def fit(self, X, y=None, labels=None): #return self._fit( # X, y, labels, # parameter_iterable # parameter_iterable, \in Sized, it actually does len(parameter_iterable) in _fit #) # FIXME code duplication from BaseSearchCV._fit estimator = self.estimator cv = _split.check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y, labels = indexable(X, y, labels) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) n_splits = cv.get_n_splits(X, y, labels) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch # FIXME how to handle pre_dispatch # FIXME recursively getting new parameters to evaluate # parameter_iterable = ... # the magic # # # The evaluation (Parallel) stuff # out = Parallel( # n_jobs=self.n_jobs, verbose=self.verbose, # pre_dispatch=pre_dispatch # )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, # train, test, self.verbose, parameters, # self.fit_params, return_parameters=True, # error_score=self.error_score) # for parameters in parameter_iterable # for train, test in cv.split(X, y, labels)) # # n_fits on each (train, test) def cross_validation(raw_parameters): parameters = dict(zip( self.param_grid.keys(), raw_parameters )) # TODO more robust way of doing this print(parameters) return Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for train, test in cv.split(X, y, labels)) x = cartesian_product(*self.param_grid.values()) # FIXME implement as non-recursive def bo_(x_obs, y_obs, n_iter): if n_iter > 0: kernel = kernels.Matern() + kernels.WhiteKernel() gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=16) gp.fit(x_obs, 1-y_obs) a = a_EI(gp, x_obs=x_obs, y_obs=1-y_obs) argmax_f_x_ = x[np.argmax(a(x))] # heavy evaluation f_argmax_f_x_ = cross_validation(argmax_f_x_) y_ob = np.atleast_2d(mean_mean_validation_scores(f_argmax_f_x_)).T return f_argmax_f_x_ + bo_( x_obs=np.vstack((x_obs, argmax_f_x_)), y_obs=np.vstack((y_obs, y_ob)), n_iter=n_iter-1, ) else: return [] # FIXME (most informative) decision like Numerical Probabilistics stuff for integrations # sobol initilization? 
sampled_x_ind = np.random.choice( x.shape[0], size=self.n_initial_points, replace=False, ) print(sampled_x_ind) x_obs = x[sampled_x_ind] f_x_obs = list(map(cross_validation, x_obs)) y_obs = np.atleast_2d(list(map(mean_mean_validation_scores, f_x_obs))).T out = sum(f_x_obs, []) + bo_(x_obs, y_obs, n_iter=self.n_iter) n_fits = len(out) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_splits): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _ , parameters in \ out[grid_start:grid_start + n_splits]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_splits) scores.append((score, parameters)) grid_scores.append(_search._CVScoreTuple( parameters, score, np.array(all_scores))) self.grid_scores_ = grid_scores best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def _fit(self, X, y, groups, parameter_iterable): """Actual fitting, performing the search over parameters.""" X, y, groups = indexable(X, y, groups) cv = check_cv(self.cv, y, classifier=True) n_splits = cv.get_n_splits(X, y, groups) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) LOG.info("Fitting %d folds for each of %d candidates, totalling" " %d fits", n_splits, n_candidates, n_candidates * n_splits) pre_dispatch = self.pre_dispatch cv_iter = list(cv.split(X, y, groups)) out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )(delayed(_model_fit_and_score)( estimator, X, y, self.scoring, train, test, self.verbose, parameters, fit_params=self.fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, return_parameters=True, error_score=self.error_score) for estimator, parameters in parameter_iterable for train, test in cv_iter) # if one choose to see train score, "out" will contain train score info if self.return_train_score: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) else: (test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) candidate_params = parameters[::n_splits] n_candidates = len(candidate_params) results = dict() def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) if splits: for split_i in range(n_splits): results["split%d_%s" % (split_i, key_name)] = array[:, split_i] array_means = np.average(array, axis=1, weights=weights) results['mean_%s' % key_name] = array_means # Weighted std is not directly available in numpy array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights)) results['std_%s' % key_name] = array_stds if rank: results["rank_%s" % key_name] = np.asarray( rankdata(-array_means, method='min'), dtype=np.int32) # Computed the (weighted) mean and std for test scores alone # NOTE test_sample counts (weights) remain the same for all candidates test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) _store('test_score', test_scores, splits=True, rank=True, weights=test_sample_counts if self.iid else None) if self.return_train_score: _store('train_score', train_scores, splits=True) _store('fit_time', fit_time) _store('score_time', score_time) best_index = np.flatnonzero(results["rank_test_score"] == 1)[0] best_parameters = candidate_params[best_index][1] # Use one MaskedArray and mask all the places where the param is not # applicable for that candidate. Use defaultdict as each candidate may # not contain all the params param_results = defaultdict(partial(MaskedArray, np.empty(n_candidates,), mask=True, dtype=object)) for cand_i, params in enumerate(candidate_params): _, param_values = params for name, value in param_values.items(): # An all masked empty array gets created for the key # `"param_%s" % name` at the first occurence of `name`. 
# Setting the value at an index also unmasks that index param_results["param_%s" % name][cand_i] = value results.update(param_results) # Store a list of param dicts at the key 'params' results['params'] = candidate_params self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits self.best_model_ = candidate_params[best_index] if self.refit: # build best estimator and fit best_estimator = _clf_build(self.best_model_[0]) best_estimator.set_params(**best_parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self