def test_check_scoring():
    """Test all branches of check_scoring"""
    estimator = EstimatorWithoutFit()
    pattern = (r"estimator should a be an estimator implementing 'fit' method,"
               r" .* was passed")
    assert_raises_regexp(TypeError, pattern, check_scoring, estimator)

    estimator = EstimatorWithFitAndScore()
    estimator.fit([[1]], [1])
    scorer = check_scoring(estimator)
    assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)

    estimator = EstimatorWithFitAndPredict()
    estimator.fit([[1]], [1])
    pattern = (r"If no scoring is specified, the estimator passed should have"
               r" a 'score' method\. The estimator .* does not\.")
    assert_raises_regexp(TypeError, pattern, check_scoring, estimator)

    scorer = check_scoring(estimator, "accuracy")
    assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)

    estimator = EstimatorWithFit()
    pattern = (r"The estimator passed should have a 'score'"
               r" or a 'predict' method\. The estimator .* does not\.")
    assert_raises_regexp(TypeError, pattern, check_scoring, estimator,
                         "accuracy")

    estimator = EstimatorWithFit()
    scorer = check_scoring(estimator, allow_none=True)
    assert_true(scorer is None)
def fit_ipp(self, X, y, grid):
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
    if self.grid_parallel:
        scores, grid_scores = grid_cv_scores(self.estimator, X, y, grid,
                                             self.scoring, self.cv,
                                             self.profile, self.n_jobs,
                                             self.verbose)
    else:
        scores = []
        # grid = grid_search.ParameterGrid(self.param_grid)
        grid_scores = []
        for parameters in grid:
            self.estimator.set_params(**parameters)
            scores_cv = cross_val_score(self.estimator, X, y, self.scoring,
                                        self.cv, profile=self.profile)
            scores.append(np.array(scores_cv).mean())
            grid_scores.append(grid_search._CVScoreTuple(
                parameters, scores_cv.mean(), scores_cv))
    max_idx = np.array(scores).argmax()
    self.best_estimator_ = self.estimator.set_params(**list(grid)[max_idx])
    self.best_params_ = list(grid)[max_idx]
    self.scores_ = scores
    self.best_score_ = np.array(scores).max()
    self.grid_scores_ = grid_scores
    if self.refit:
        self.best_estimator_.fit(X, y)
    return self
def permutation_test_score(estimator, X, y, groups=None, cv=None,
                           n_permutations=100, n_jobs=1, random_state=0,
                           verbose=0, scoring=None):
    """Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of the original sklearn permutation test score function:
    the p-value is evaluated outside this function, so that the permutation
    scores can be reused elsewhere.

    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance. The Journal of Machine Learning Research (2010)
                   vol. 11
    """
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state),
            groups, cv, scorer)
        for _ in range(n_permutations))
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation."""
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in splits)

    group_order = []
    if hasattr(cv, 'groups'):
        group_order = [np.array(cv.groups)[test].tolist()[0]
                       for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
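# Hedged usage sketch for the multi-metric cross_val_score variant above.
# The classifier, data, and metric names below are illustrative assumptions,
# and the sketch presumes the accompanying custom _fit_and_score accepts a
# list of scorers (stock sklearn's does not).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
clf = LogisticRegression()
scores, group_order = cross_val_score(clf, X_demo, y_demo,
                                      scoring=['accuracy', 'roc_auc'], cv=5)
# `scores` holds one entry per fold (and per scorer); `group_order` stays
# empty here because a plain KFold splitter has no `groups` attribute.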
def plot_auc_curve(estimator, X_test, y_test):
    y_score = estimator.predict_proba(X_test)
    scorer = check_scoring(estimator, scoring=None)
    scorer(estimator, X_test, y_test)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    fpr[0], tpr[0], _ = roc_curve(y_test, y_score[:, 1])
    roc_auc[0] = auc(fpr[0], tpr[0])
    plt.figure()
    plt.plot(fpr[0], tpr[0], label='ROC curve (area = %0.2f)' % roc_auc[0])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
    return roc_auc[0]
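# Hedged usage sketch for plot_auc_curve: it assumes a fitted binary
# classifier that exposes predict_proba and a held-out test split. All names
# below are illustrative, not taken from the snippet above.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
clf = LogisticRegression().fit(X_tr, y_tr)
auc_value = plot_auc_curve(clf, X_te, y_te)  # draws the ROC curve, returns AUC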
def score(self, X, y): """Score each of the estimators on the tested dimensions. Parameters ---------- X : array, shape (n_samples, nd_features, n_slices) The input samples. For each data slice, the corresponding estimator scores the prediction, e.g.: ``[estimators[ii].score(X[..., ii], y) for ii in range(n_slices)]``. The feature dimension can be multidimensional e.g. ``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``. y : array, shape (n_samples,) | (n_samples, n_targets) The target values. Returns ------- score : array, shape (n_samples, n_estimators, n_slices) Score for each estimator / data slice couple. """ # noqa: E501 from sklearn.metrics.scorer import check_scoring self._check_Xy(X) # For predictions/transforms the parallelization is across the data and # not across the estimators to avoid memory load. parallel, p_func, n_jobs = parallel_func(_gl_score, self.n_jobs) n_jobs = min(n_jobs, X.shape[-1]) X_splits = np.array_split(X, n_jobs, axis=-1) scoring = check_scoring(self.base_estimator, self.scoring) y = _fix_auc(scoring, y) score = parallel(p_func(self.estimators_, scoring, x, y) for x in X_splits) score = np.concatenate(score, axis=1) return score
def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.

    grid = GridSearchCV(LinearSVC(), param_grid={"C": [0.1, 1]})
    scorer = check_scoring(grid, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having
    # a fit.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer())
    assert_array_equal(scores, 1)
def _score(self, X, y, scoring=None, clf=None):
    from sklearn.model_selection._validation import _score
    if scoring is None:
        scoring = self._scorer
    if clf is None:
        clf = self._estimator
    return _score(clf, X, y, check_scoring(clf, scoring=scoring))
def _scoring(self, net, X_test, y_test):
    """Resolve scoring and apply it to data. Use cached prediction
    instead of running inference again, if available."""
    scorer = check_scoring(net, self.scoring)
    scores = _score(
        estimator=net,
        X_test=X_test,
        y_test=y_test,
        scorer=scorer,
        is_multimetric=False,
    )
    return scores
def train_predict(descriptions_models, X_train, y_train, X_valid, y_valid,
                  scoring=None):
    """Run preliminary performance analyses of multiple machine learning models.

    Parameters
    ----------
    descriptions_models : Iterable of 2-tuples (str, object)
        Each 2-tuple element contains descriptive text and a model object.
        i.e. [('Model1 info', model1), ('Model2 info', model2), ...]
    X_train : pandas.DataFrame
        Training features data
    y_train : pandas.Series
        Training target data
    X_valid, y_valid : same as X_train, y_train, but used for validation
    scoring : str, callable or None, default=None
        See the `scoring` parameter description of
        sklearn.grid_search.GridSearchCV.

    Returns
    -------
    df_summary : pandas.DataFrame
        Performance summary of all the models
    """
    results = []
    for description, model in descriptions_models:
        scorer = check_scoring(model, scoring=scoring)
        result = {"description": description}

        # Train
        start = time.time()
        model.fit(X_train, y_train)
        result["time_train"] = time.time() - start

        # Predict train
        start = time.time()
        result["score_train"] = scorer(model, X_train, y_train)
        result["time_predict_train"] = time.time() - start

        # Predict validation
        start = time.time()
        result["score_valid"] = scorer(model, X_valid, y_valid)
        result["time_predict_valid"] = time.time() - start

        results.append(result)
    return pd.DataFrame(results)[
        ["description", "score_train", "score_valid",
         "time_train", "time_predict_train", "time_predict_valid"]
    ]
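# Hedged usage sketch for train_predict. The model list and the train/valid
# frames are illustrative assumptions, not part of the snippet above.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

models = [("Logistic regression", LogisticRegression()),
          ("Random forest", RandomForestClassifier(n_estimators=50))]
summary = train_predict(models, X_train, y_train, X_valid, y_valid,
                        scoring="accuracy")
print(summary)  # one row per model: scores plus fit/predict timings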
def retrain_estimator(estimator, X, y, n, scoring=None, fit_params=None,
                      split_params=None):
    scorer = check_scoring(estimator, scoring=scoring)
    split_params = split_params if split_params is not None else {}
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_params)  # FIXME: shuffles
    fit_params = fit_params if fit_params is not None else {}
    estimators = [clone(estimator).set_params(nn__random_state=i).fit(X_train, y_train, **fit_params)
                  for i in range(n)]
    scores = [scorer(e, X_test, y_test) for e in estimators]
    print(scores)
    return estimators[np.argmax(scores)]
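# Hedged usage sketch: `set_params(nn__random_state=...)` above implies the
# estimator is a Pipeline with a step named 'nn'. The pipeline and data names
# below are hypothetical.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

pipe = Pipeline([("scale", StandardScaler()),
                 ("nn", MLPClassifier(max_iter=500))])
best = retrain_estimator(pipe, X, y, n=5, scoring="accuracy",
                         split_params={"test_size": 0.25, "random_state": 0})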
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
        process. If it is an object then it is an object to be used as a
        cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target
        as needed
    groups : array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into
        train/test set
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                     X=features,
                                     y=target,
                                     scorer=scorer,
                                     train=train,
                                     test=test,
                                     verbose=0,
                                     parameters=None,
                                     fit_params=sample_weight_dict)
                      for train, test in cv_iter]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
def test_check_scoring():
    """Test all branches of check_scoring"""
    estimator = EstimatorWithoutFit()
    assert_raise_message(TypeError, "'fit' method", check_scoring, estimator)

    estimator = EstimatorWithFitAndScore()
    estimator.fit([[1]], [1])
    scorer = check_scoring(estimator)
    assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)

    estimator = EstimatorWithFitAndPredict()
    estimator.fit([[1]], [1])
    assert_raise_message(TypeError, "no scoring", check_scoring, estimator)

    scorer = check_scoring(estimator, "accuracy")
    assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)

    estimator = EstimatorWithFit()
    assert_raise_message(TypeError, "'score' or a 'predict'",
                         check_scoring, estimator, "accuracy")

    estimator = EstimatorWithFit()
    scorer = check_scoring(estimator, allow_none=True)
    assert_true(scorer is None)
def fit(self, X, y): """Fit the model to the training data.""" X, y = check_X_y(X, y, force_all_finite=False, multi_output=self.multi_output) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) if RANK == 0: if self.experiments_folder is not None: assert_path(self.experiments_folder) self._fit_master(X, y) else: self._fit_slave(X, y) return self
def fit(self, X, y=None, groups=None, **fit_params): """Run fit with all sets of parameters. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. groups : array-like, shape = [n_samples], optional Group labels for the samples used while splitting the dataset into train/test set. **fit_params Parameters passed to the ``fit`` method of the estimator """ estimator = self.estimator self.scorer_ = check_scoring(estimator, scoring=self.scoring) error_score = self.error_score if not (isinstance(error_score, numbers.Number) or error_score == 'raise'): raise ValueError("error_score must be the string 'raise' or a" " numeric value.") dsk, keys, n_splits = build_graph(estimator, self.cv, self.scorer_, list(self._get_param_iterator()), X, y, groups, fit_params, iid=self.iid, refit=self.refit, error_score=error_score, return_train_score=self.return_train_score, cache_cv=self.cache_cv) self.dask_graph_ = dsk self.n_splits_ = n_splits n_jobs = _normalize_n_jobs(self.n_jobs) scheduler = _normalize_scheduler(self.scheduler, n_jobs) out = scheduler(dsk, keys, num_workers=n_jobs) self.cv_results_ = results = out[0] self.best_index_ = np.flatnonzero(results["rank_test_score"] == 1)[0] if self.refit: self.best_estimator_ = out[1] return self
def fit(self, X, y): """Fit the model to the training data.""" X, y = check_X_y(X, y, force_all_finite=False, multi_output=self.multi_output) _check_param_grid(self.param_grid) # cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator)) cv = _check_cv(self.cv, y, classifier=is_classifier(self.estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) if comm_rank == 0: self._fit_master(X, y, cv) else: self._fit_slave() return self
def score(self, X, y): """Score each estimator on each task. The number of tasks in X should match the number of tasks/estimators given at fit time, i.e. we need ``X.shape[-1] == len(self.estimators_)``. Parameters ---------- X : array, shape (n_samples, nd_features, n_tasks) The input samples. For each data slice, the corresponding estimator scores the prediction, e.g.: ``[estimators[ii].score(X[..., ii], y) for ii in range(n_estimators)]``. The feature dimension can be multidimensional e.g. X.shape = (n_samples, n_features_1, n_features_2, n_tasks) y : array, shape (n_samples,) | (n_samples, n_targets) The target values. Returns ------- score : array, shape (n_samples, n_estimators) Score for each estimator/task. """ # noqa: E501 from sklearn.metrics.scorer import check_scoring self._check_Xy(X) if X.shape[-1] != len(self.estimators_): raise ValueError('The number of estimators does not match ' 'X.shape[-1]') scoring = check_scoring(self.base_estimator, self.scoring) y = _fix_auc(scoring, y) # For predictions/transforms the parallelization is across the data and # not across the estimators to avoid memory load. parallel, p_func, n_jobs = parallel_func(_sl_score, self.n_jobs) n_jobs = min(n_jobs, X.shape[-1]) X_splits = np.array_split(X, n_jobs, axis=-1) est_splits = np.array_split(self.estimators_, n_jobs) score = parallel(p_func(est, scoring, x, y) for (est, x) in zip(est_splits, X_splits)) score = np.concatenate(score, axis=0) return score
def _optimize_n_neighbors(self, X, y):
    print('Auto optimizing n_neighbors using ' + str(self.n_neighbor_candidates))
    X_train, X_validate, y_train, y_validate = self._get_split(X, y)
    estimator = copy.copy(self)
    estimator.auto_optimize_k = False
    estimator.fit(X_train, y_train)
    scorer = check_scoring(estimator, scoring=self.scoring)
    configs = []
    for n_neighbors in self.n_neighbor_candidates:
        estimator.n_neighbors = n_neighbors
        score = scorer(estimator, X_validate, y_validate)
        print('N_neighbors = ' + str(n_neighbors) + ' score: ' +
              str(self.scoring) + ' ' + str(score))
        configs.append((n_neighbors, score))
    configs = sorted(configs, key=lambda i: i[1], reverse=True)
    print('Configs in order of score: ')
    pprint.pprint(configs)
    self.n_neighbors = configs[0][0]
def get_cv_classifier(classifier, cv):
    if classifier["name"] == 'linear-ridge':
        c = RidgeClassifier()
    elif classifier["name"] == 'SVC':
        c = SVC()
    elif classifier["name"] == "l2-SVC":
        c = L2KernelClassifier()
    elif classifier["name"] == "fredholm":
        c = L2FredholmClassifier()
    elif classifier["name"] == "TSVM":
        c = SVMLight()
    elif classifier["name"] == "Lap-RLSC":
        c = LapRLSC()
    elif classifier["name"] == "fred_kernel_appr":
        c = FredholmKernelApprClassifier()
    else:
        raise NameError('Not existing classifier: ' + classifier["name"] + '.')
    return GridSearchCV(c, classifier["params_grid"], scoring=check_scoring(c),
                        fit_params={}, n_jobs=classifier["n_jobs"], cv=cv)
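# Hedged usage sketch: the `classifier` argument above is a dict with "name",
# "params_grid", and "n_jobs" keys. The concrete values below are illustrative
# assumptions, not taken from the snippet.
classifier_config = {
    "name": "SVC",
    "params_grid": {"C": [0.1, 1.0, 10.0], "gamma": ["scale", "auto"]},
    "n_jobs": 1,
}
search = get_cv_classifier(classifier_config, cv=5)
# search.fit(X, y) would then grid-search the SVC with 5-fold CV.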
def cross_val_score_filter_feature_selection(model, filter_function,
                                             filter_criteria, X, y,
                                             scoring=None, cv=None, n_jobs=1,
                                             verbose=0, fit_params=None,
                                             pre_dispatch='2*n_jobs'):
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(model))
    scorer = check_scoring(model, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(delayed(_fit_and_score)(clone(model),
                                              filter_function(X, y, train, filter_criteria),
                                              y, scorer, train, test, verbose,
                                              None, fit_params)
                      for train, test in cv)
    return np.array(scores)[:, 0]
def fit(self, X, Y):
    if not self.best_subset:
        self.fshape = np.shape(X)[1]
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        self.cv = check_cv(self.cv, X, Y,
                           classifier=is_classifier(self.estimator))
        self.best_subset = tuple()
        self.best_subset_score = 0
        self.scores_ = {self.best_subset: self.best_subset_score}
    X = np.array(X)
    Y = np.array(Y)
    try:
        self.get_best_subset(X, Y)
    except KeyboardInterrupt:
        pass
    self.estimator = self.estimator.fit(X[:, self.best_subset], Y)
    return self
def score(self, X, y): """Score each of the estimators on the tested dimensions. Parameters ---------- X : array, shape (n_samples, nd_features, n_slices) The input samples. For each data slice, the corresponding estimator scores the prediction, e.g.: ``[estimators[ii].score(X[..., ii], y) for ii in range(n_slices)]``. The feature dimension can be multidimensional e.g. ``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``. y : array, shape (n_samples,) | (n_samples, n_targets) The target values. Returns ------- score : array, shape (n_samples, n_estimators, n_slices) Score for each estimator / data slice couple. """ # noqa: E501 from sklearn.metrics.scorer import check_scoring self._check_Xy(X) # For predictions/transforms the parallelization is across the data and # not across the estimators to avoid memory load. mesg = 'Scoring %s' % (self.__class__.__name__,) parallel, p_func, n_jobs = parallel_func(_gl_score, self.n_jobs, verbose=False) n_jobs = min(n_jobs, X.shape[-1]) scoring = check_scoring(self.base_estimator, self.scoring) y = _fix_auc(scoring, y) with ProgressBar(X.shape[-1] * len(self.estimators_), verbose_bool='auto', mesg=mesg) as pb: score = parallel(p_func(self.estimators_, scoring, x, y, pb.subset(pb_idx)) for pb_idx, x in array_split_idx( X, n_jobs, axis=-1, n_per_split=len(self.estimators_))) score = np.concatenate(score, axis=1) return score
def fit(self, X, y): """Fit KNN model by choosing the best `n_neighbors`. Parameters ----------- X : scipy.sparse matrix, (n_samples, vocab_size) Data y : ndarray, shape (n_samples,) or (n_samples, n_targets) Target """ if self.n_neighbors_try is None: n_neighbors_try = range(1, 6) else: n_neighbors_try = self.n_neighbors_try X = check_array(X, accept_sparse='csr', copy=True) X = normalize(X, norm='l1', copy=False) cv = check_cv(self.cv, X, y) knn = KNeighborsClassifier(metric='precomputed', algorithm='brute') scorer = check_scoring(knn, scoring=self.scoring) scores = [] for train_ix, test_ix in cv: dist = self._pairwise_wmd(X[test_ix], X[train_ix]) knn.fit(X[train_ix], y[train_ix]) scores.append([ scorer(knn.set_params(n_neighbors=k), dist, y[test_ix]) for k in n_neighbors_try ]) scores = np.array(scores) self.cv_scores_ = scores best_k_ix = np.argmax(np.mean(scores, axis=0)) best_k = n_neighbors_try[best_k_ix] self.n_neighbors = self.n_neighbors_ = best_k return super(WordMoversKNNCV, self).fit(X, y)
def _fit(self, X, y, groups, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    base_estimator = clone(self.estimator)
    cv = check_cv(self.cv, y, classifier=is_classifier(base_estimator))
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)
    if self.verbose > 0 and isinstance(parameter_iterable, Sized):
        n_candidates = len(parameter_iterable)
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    cv_iter = list(cv.split(X, y, groups))

    # Original: joblib
    # out = Parallel(
    #     n_jobs=self.n_jobs, verbose=self.verbose,
    #     pre_dispatch=pre_dispatch
    # )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    #                           train, test, self.verbose, parameters,
    #                           fit_params=self.fit_params,
    #                           return_train_score=self.return_train_score,
    #                           return_n_test_samples=True,
    #                           return_times=True, return_parameters=True,
    #                           error_score=self.error_score)
    #   for parameters in parameter_iterable
    #   for train, test in cv_iter)

    name = ''.join(random.choice(string.ascii_uppercase + string.digits)
                   for _ in range(10))
    tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
    if not os.path.exists(tempfolder):
        os.makedirs(tempfolder)

    # Create the parameter files
    parameter_files = dict()
    for num, parameters in enumerate(parameter_iterable):
        parameters["Number"] = str(num)

        # Convert parameter set to json
        fname = ('settings_{}.json').format(str(num))
        sourcename = os.path.join(tempfolder, 'parameters', fname)
        if not os.path.exists(os.path.dirname(sourcename)):
            os.makedirs(os.path.dirname(sourcename))
        with open(sourcename, 'w') as fp:
            json.dump(parameters, fp, indent=4)

        parameter_files[str(num)] = ('vfs://tmp/{}/{}/{}/{}').format(
            'GS', name, 'parameters', fname)

    # Create test-train splits
    traintest_files = dict()
    # TODO: ugly nummering solution
    num = 0
    for train, test in cv_iter:
        source_labels = ['train', 'test']
        source_data = pd.Series([train, test],
                                index=source_labels,
                                name='Train-test data')
        fname = ('traintest_{}.hdf5').format(str(num))
        sourcename = os.path.join(tempfolder, 'traintest', fname)
        if not os.path.exists(os.path.dirname(sourcename)):
            os.makedirs(os.path.dirname(sourcename))
        traintest_files[str(num)] = ('vfs://tmp/{}/{}/{}/{}').format(
            'GS', name, 'traintest', fname)

        sourcelabel = ("Source Data Iteration {}").format(str(num))
        source_data.to_hdf(sourcename, sourcelabel)

        num += 1

    # Create the files containing the estimator and settings
    estimator_labels = ['base_estimator', 'X', 'y', 'scorer',
                        'verbose', 'fit_params', 'return_train_score',
                        'return_n_test_samples', 'return_times',
                        'return_parameters', 'error_score']
    estimator_data = pd.Series([base_estimator, X, y, self.scorer_,
                                self.verbose, self.fit_params,
                                self.return_train_score, True, True, True,
                                self.error_score],
                               index=estimator_labels,
                               name='estimator Data')
    fname = 'estimatordata.hdf5'
    estimatorname = os.path.join(tempfolder, fname)
    estimator_data.to_hdf(estimatorname, 'Estimator Data')

    estimatordata = ("vfs://tmp/{}/{}/{}").format('GS', name, fname)

    # Create the fastr network
    network = fastr.Network('GridSearch_' + name)
    estimator_data = network.create_source('HDF5', id_='estimator_source')
    traintest_data = network.create_source('HDF5', id_='traintest')
    parameter_data = network.create_source('JsonFile', id_='parameters')
    sink_output = network.create_sink('HDF5', id_='output')

    fitandscore = network.create_node('fitandscore', memory='2G',
                                      id_='fitandscore')
    fitandscore.inputs['estimatordata'].input_group = 'estimator'
    fitandscore.inputs['traintest'].input_group = 'traintest'
    fitandscore.inputs['parameters'].input_group = 'parameters'

    fitandscore.inputs['estimatordata'] = estimator_data.output
    fitandscore.inputs['traintest'] = traintest_data.output
    fitandscore.inputs['parameters'] = parameter_data.output
    sink_output.input = fitandscore.outputs['fittedestimator']

    source_data = {'estimator_source': estimatordata,
                   'traintest': traintest_files,
                   'parameters': parameter_files}
    sink_data = {'output': ("vfs://tmp/{}/{}/output_{{sample_id}}_{{cardinality}}{{ext}}").format('GS', name)}

    network.execute(source_data, sink_data,
                    tmpdir=os.path.join(tempfolder, 'GS', name, 'tmp'))

    # Read in the output data once finished
    # TODO: expanding fastr url is probably a nicer way
    sink_files = glob.glob(os.path.join(fastr.config.mounts['tmp'], 'GS',
                                        name) + '/output*.hdf5')
    save_data = list()
    feature_labels = list()
    for output in sink_files:
        data = pd.read_hdf(output)
        save_data.append(data['RET'])
        feature_labels.append(data['feature_labels'])

    # if one choose to see train score, "out" will contain train score info
    if self.return_train_score:
        (train_scores, test_scores, test_sample_counts,
         fit_time, score_time, parameters) = zip(*save_data)
    else:
        (test_scores, test_sample_counts,
         fit_time, score_time, parameters) = zip(*save_data)

    candidate_params = parameters[::n_splits]
    n_candidates = len(candidate_params)

    results = dict()

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """A small helper to store the scores/times to the cv_results_"""
        array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                          n_splits)
        if splits:
            for split_i in range(n_splits):
                results["split%d_%s" % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis])
                                        ** 2, axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

    # Computed the (weighted) mean and std for test scores alone
    # NOTE test_sample counts (weights) remain the same for all candidates
    test_sample_counts = np.array(test_sample_counts[:n_splits],
                                  dtype=np.int)

    _store('test_score', test_scores, splits=True, rank=True,
           weights=test_sample_counts if self.iid else None)
    if self.return_train_score:
        _store('train_score', train_scores, splits=True)
    _store('fit_time', fit_time)
    _store('score_time', score_time)

    best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
    best_parameters = candidate_params[best_index]

    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all masked empty array gets created for the key
            # `"param_%s" % name` at the first occurence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)

    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params
    results['feature_labels'] = feature_labels

    self.cv_results_ = results
    self.best_index_ = best_index
    self.n_splits_ = n_splits

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best_parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def fit(
    self,
    X,  # type: TwoDimArrayLikeType
    y=None,  # type: Optional[Union[OneDimArrayLikeType, TwoDimArrayLikeType]]
    groups=None,  # type: Optional[OneDimArrayLikeType]
    **fit_params  # type: Any
):
    # type: (...) -> 'OptunaSearchCV'
    """Run fit with all sets of parameters.

    Args:
        X:
            Training data.
        y:
            Target variable.
        groups:
            Group labels for the samples used while splitting the dataset
            into train/test set.
        **fit_params:
            Parameters passed to ``fit`` on the estimator.

    Returns:
        self:
            Return self.
    """
    self._check_params()

    random_state = check_random_state(self.random_state)
    max_samples = self.subsample
    n_samples = _num_samples(X)
    old_level = logger.getEffectiveLevel()

    if self.verbose > 1:
        logger.setLevel(DEBUG)
    elif self.verbose > 0:
        logger.setLevel(INFO)
    else:
        logger.setLevel(WARNING)

    self.sample_indices_ = np.arange(n_samples)

    if type(max_samples) is float:
        max_samples = int(max_samples * n_samples)

    if max_samples < n_samples:
        self.sample_indices_ = random_state.choice(self.sample_indices_,
                                                   max_samples,
                                                   replace=False)
        self.sample_indices_.sort()

    X_res = safe_indexing(X, self.sample_indices_)
    y_res = safe_indexing(y, self.sample_indices_)
    groups_res = safe_indexing(groups, self.sample_indices_)
    fit_params_res = fit_params

    if fit_params_res is not None:
        fit_params_res = {
            key: _index_param_value(X, value, self.sample_indices_)
            for key, value in fit_params.items()
        }

    classifier = is_classifier(self.estimator)
    cv = check_cv(self.cv, y_res, classifier)

    self.n_splits_ = cv.get_n_splits(X_res, y_res, groups=groups_res)
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    if self.study is None:
        seed = random_state.randint(0, np.iinfo('int32').max)
        sampler = samplers.TPESampler(seed=seed)
        self.study_ = study_module.create_study(direction='maximize',
                                                sampler=sampler)
    else:
        self.study_ = self.study

    objective = _Objective(self.estimator, self.param_distributions, X_res,
                           y_res, cv, self.enable_pruning, self.error_score,
                           fit_params_res, groups_res, self.max_iter,
                           self.return_train_score, self.scorer_)

    logger.info('Searching the best hyperparameters using {} '
                'samples...'.format(_num_samples(self.sample_indices_)))

    self.study_.optimize(objective, n_jobs=self.n_jobs,
                         n_trials=self.n_trials, timeout=self.timeout)

    logger.info('Finished hyperparameter search!')

    if self.refit:
        self._refit(X, y, **fit_params)

    logger.setLevel(old_level)

    return self
def _extendedFit(self, X, y, parameter_iterable):
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
            delayed(_extended_fit_and_score)(clone(base_estimator), X, y,
                                             self.scorer_, train, test,
                                             self.verbose, parameters,
                                             self.fit_params,
                                             return_parameters=True,
                                             error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    grid_extras = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        all_extras = []
        for this_score, this_n_test_samples, _, parameters, extra in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            all_extras.append(extra)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
        grid_extras.append(all_extras)
    # Store the computed scores
    self.grid_scores_ = grid_scores
    self.extras_ = grid_extras

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        print("Refitting best estimator")
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def fit(self, X, y=None, groups=None, **fit_params): """Run fit on the estimator with randomly drawn parameters. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples in the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. **fit_params : dict of string -> object Parameters passed to the ``fit`` method of the estimator """ estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) self._random_state = check_random_state(self.random_state) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) R = list(self.cost_parameter_max.values())[0] if self.cost_parameter_min is None: Rmin = 1 else: Rmin = list(self.cost_parameter_min.values())[0] if self.verbose > 0: n_candidates = hyperband_num_per_run(self.eta, R, Rmin) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch cv_iter = list(cv.split(X, y, groups)) out = [] smax = int(np.floor(np.log(R / Rmin) / np.log(self.eta))) B = (smax + 1.0) * R for s in range(smax, -1, -1): n = int(np.ceil(B / R * np.power(self.eta, s) / (s + 1.0))) r = int(R / np.power(self.eta, s)) T = list( ParameterSampler(self.param_distributions, n, random_state=self._random_state)) for i in range(0, s + 1): n_i = int(np.floor(n / np.power(self.eta, i))) r_i = int(r * np.power(self.eta, i)) _jobs = [] for parameters in T: _parameters = copy.deepcopy(parameters) _parameters.update( {list(self.cost_parameter_max.keys())[0]: r_i}) for train, test in cv_iter: _jobs.append( delayed(_fit_and_score)( clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, _parameters, fit_params=fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, return_parameters=True, error_score=self.error_score)) _out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)(_jobs) results, _ = self._process_outputs(_out, n_splits) num_to_keep = int(np.floor(n_i / self.eta)) sind = np.argsort(results["rank_test_score"]) msk = np.zeros(len(results['rank_test_score'])) msk[sind[0:num_to_keep]] = 1 msk = msk.astype(bool) T = [p for k, p in enumerate(results['params']) if msk[k]] out += _out results, best_index = self._process_outputs(out, n_splits) self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits self.multimetric_ = False if not hasattr(self, 'best_score_'): self.best_score_ = results['mean_test_score'][best_index] if not hasattr(self, 'best_params_'): self.best_params_ = results['params'][best_index] if self.refit: best_estimator = clone(self.estimator).set_params( **self.cv_results_['params'][self.best_index_]) if y is not None: best_estimator.fit(X, y, **fit_params) else: best_estimator.fit(X, **fit_params) self.best_estimator_ = best_estimator return self
def _fit(self, X, y): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch # setup SigOpt experiment and run optimization self._create_sigopt_exp() for jk in xrange(self.n_iter): suggestion = self.conn.experiments( self.experiment.id).suggestions().create() parameters = suggestion.assignments.to_json() # convert all unicode names and values to plain strings non_unicode_parameters = self._convert_unicode_dict(parameters) if self.verbose > 0: print "Evaluating params : ", non_unicode_parameters # do CV folds in parallel using joblib # returns scores on test set out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)( delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, non_unicode_parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for train, test in cv) # grab scores from results scores = [o[0] for o in out] self.conn.experiments(self.experiment.id).observations().create( suggestion=suggestion.id, value=numpy.mean(scores), value_stddev=numpy.std(scores)) # return best SigOpt observation so far best_obs = self.conn.experiments( self.experiment.id).fetch().progress.best_observation self.best_params_ = best_obs.assignments.to_json() # convert all unicode names and values to plain strings self.best_params_ = self._convert_unicode_dict(self.best_params_) self.best_score_ = best_obs.value if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **self.best_params_) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator
def cross_val_score_track_trials(estimator, X, y=None, scoring=accuracy_score,
                                 cv=5, args_to_scorer=None):
    """
    Use the given estimator to perform fit and predict for splits defined by
    'cv' and compute the given score on each of the splits.

    Parameters
    ----------

    estimator: A valid sklearn_wrapper estimator
    X, y: Valid data and target values that work with the estimator
    scoring: string or a scorer object created using
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer.
        A string from sklearn.metrics.SCORERS.keys() can be used or a scorer
        created from one of sklearn.metrics
        (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
        A completely custom scorer object can be created from a python
        function following the example at
        https://scikit-learn.org/stable/modules/model_evaluation.html
        The metric has to return a scalar value.
    cv: an integer or an object that has a split function as a generator
        yielding (train, test) splits as arrays of indices. An integer value
        is used as the number of folds in
        sklearn.model_selection.StratifiedKFold, default is 5. Note that any
        of the iterators from
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators
        can be used here.
    args_to_scorer: A dictionary of additional keyword arguments to pass to
        the scorer. Used for cases where the scorer has a signature such as
        ``scorer(estimator, X, y, **kwargs)``.

    Returns
    -------

    result: a tuple of the mean score, mean log loss, and mean execution time
        over the cross validation folds
    """
    if isinstance(cv, int):
        cv = StratifiedKFold(cv)
    if args_to_scorer is None:
        args_to_scorer = {}

    scorer = check_scoring(estimator, scoring=scoring)
    cv_results: List[float] = []
    log_loss_results = []
    time_results = []
    for train, test in cv.split(X, y):
        X_train, y_train = split_with_schemas(estimator, X, y, train)
        X_test, y_test = split_with_schemas(estimator, X, y, test, train)
        start = time.time()

        # Not calling sklearn.base.clone() here, because:
        #  (1) For Lale pipelines, clone() calls the pipeline constructor
        #      with edges=None, so the resulting topology is incorrect.
        #  (2) For Lale individual operators, the fit() method already
        #      clones the impl object, so cloning again is redundant.
        trained = estimator.fit(X_train, y_train)
        score_value = scorer(trained, X_test, y_test, **args_to_scorer)
        execution_time = time.time() - start
        # not all estimators have predict probability
        try:
            y_pred_proba = trained.predict_proba(X_test)
            logloss = log_loss(y_true=y_test, y_pred=y_pred_proba)
            log_loss_results.append(logloss)
        except BaseException:
            logger.debug("Warning, log loss cannot be computed")
        cv_results.append(score_value)
        time_results.append(execution_time)

    result = (np.array(cv_results).mean(),
              np.array(log_loss_results).mean(),
              np.array(time_results).mean())  # average the per-fold times (the original averaged only the last fold's `execution_time`)
    return result
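# Hedged usage sketch for cross_val_score_track_trials. The function is
# written for Lale-wrapped operators (split_with_schemas is a Lale helper),
# so a plain sklearn estimator is only an approximation here; the estimator
# and data names below are illustrative assumptions.
from sklearn.tree import DecisionTreeClassifier

mean_score, mean_log_loss, mean_time = cross_val_score_track_trials(
    DecisionTreeClassifier(), X, y, scoring="accuracy", cv=5)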
def _fit(self, X, y, groups, parameter_iterable):
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)
    if self.verbose > 0 and isinstance(parameter_iterable, Sized):
        n_candidates = len(parameter_iterable)
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)

    param_grid = [(parameters, train, test)
                  for parameters in parameter_iterable
                  for train, test in list(cv.split(X, y, groups))]
    # Because the original python code expects a certain order for the
    # elements, we need to respect it.
    indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
    par_param_grid = self.sc.parallelize(indexed_param_grid,
                                         len(indexed_param_grid))
    X_bc = self.sc.broadcast(X)
    y_bc = self.sc.broadcast(y)

    scorer = self.scorer_
    verbose = self.verbose
    error_score = self.error_score
    fit_params = self.fit_params
    return_train_score = self.return_train_score
    fas = _fit_and_score

    def fun(tup):
        (index, (parameters, train, test)) = tup
        local_estimator = clone(base_estimator)
        local_X = X_bc.value
        local_y = y_bc.value
        res = fas(local_estimator, local_X, local_y, scorer, train, test,
                  verbose, parameters, fit_params,
                  return_train_score=return_train_score,
                  return_n_test_samples=True, return_times=True,
                  return_parameters=True, error_score=error_score)
        return (index, res)

    indexed_out0 = dict(par_param_grid.map(fun).collect())
    out = [indexed_out0[idx] for idx in range(len(param_grid))]
    if return_train_score:
        (train_scores, test_scores, test_sample_counts, fit_time,
         score_time, parameters) = zip(*out)
    else:
        (test_scores, test_sample_counts, fit_time,
         score_time, parameters) = zip(*out)

    X_bc.unpersist()
    y_bc.unpersist()

    candidate_params = parameters[::n_splits]
    n_candidates = len(candidate_params)

    results = dict()

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """A small helper to store the scores/times to the cv_results_"""
        # When iterated first by splits, then by parameters
        array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                          n_splits)
        if splits:
            for split_i in range(n_splits):
                results["split%d_%s" % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis])
                                        ** 2, axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

    # Computed the (weighted) mean and std for test scores alone
    # NOTE test_sample counts (weights) remain the same for all candidates
    test_sample_counts = np.array(test_sample_counts[:n_splits],
                                  dtype=np.int)

    _store('test_score', test_scores, splits=True, rank=True,
           weights=test_sample_counts if self.iid else None)
    if self.return_train_score:
        _store('train_score', train_scores, splits=True)
    _store('fit_time', fit_time)
    _store('score_time', score_time)

    best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
    best_parameters = candidate_params[best_index]

    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all masked empty array gets created for the key
            # `"param_%s" % name` at the first occurence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)

    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params

    self.cv_results_ = results
    self.best_index_ = best_index
    self.n_splits_ = n_splits

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best_parameters)
        if y is not None:
            best_estimator.fit(X, y, **fit_params)
        else:
            best_estimator.fit(X, **fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, Z, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    cv = self.cv
    cv = _check_cv(cv, Z)

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch, backend="threading")(
            delayed(_fit_and_score)(clone(base_estimator), Z, self.scorer_,
                                    train, test, self.verbose, parameters,
                                    self.fit_params,
                                    return_parameters=True,
                                    error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        best_estimator.fit(Z, **self.fit_params)
        self.best_estimator_ = best_estimator

    return self
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None, use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
        process. If it is an object then it is an object to be used as a
        cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target
        as needed
    groups : array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into
        train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except ImportError:
            msg = "'use_dask' requires the optional dask and dask-ml dependencies."
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [cv_results['split{}_test_score'.format(i)]
                  for i in range(n_splits)]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                         X=features,
                                         y=target,
                                         scorer=scorer,
                                         train=train,
                                         test=test,
                                         verbose=0,
                                         parameters=None,
                                         fit_params=sample_weight_dict)
                          for train, test in cv_iter]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')
def fit(self, X, y=None, labels=None):
    # return self._fit(
    #     X, y, labels,
    #     parameter_iterable  # parameter_iterable, \in Sized, it actually does len(parameter_iterable) in _fit
    # )

    # FIXME code duplication from BaseSearchCV._fit
    estimator = self.estimator
    cv = _split.check_cv(self.cv, y, classifier=is_classifier(estimator))
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y, labels = indexable(X, y, labels)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    n_splits = cv.get_n_splits(X, y, labels)

    if self.verbose > 0 and isinstance(parameter_iterable, Sized):
        n_candidates = len(parameter_iterable)
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch
    # FIXME how to handle pre_dispatch

    # FIXME recursively getting new parameters to evaluate
    # parameter_iterable = ...  # the magic
    #
    # The evaluation (Parallel) stuff
    # out = Parallel(
    #     n_jobs=self.n_jobs, verbose=self.verbose,
    #     pre_dispatch=pre_dispatch
    # )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    #                           train, test, self.verbose, parameters,
    #                           self.fit_params, return_parameters=True,
    #                           error_score=self.error_score)
    #   for parameters in parameter_iterable
    #   for train, test in cv.split(X, y, labels))

    # n_fits on each (train, test)
    def cross_validation(raw_parameters):
        parameters = dict(zip(
            self.param_grid.keys(), raw_parameters
        ))  # TODO more robust way of doing this
        print(parameters)

        return Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                  train, test, self.verbose, parameters,
                                  self.fit_params, return_parameters=True,
                                  error_score=self.error_score)
          for train, test in cv.split(X, y, labels))

    x = cartesian_product(*self.param_grid.values())

    # FIXME implement as non-recursive
    def bo_(x_obs, y_obs, n_iter):
        if n_iter > 0:
            kernel = kernels.Matern() + kernels.WhiteKernel()
            gp = GaussianProcessRegressor(kernel=kernel,
                                          n_restarts_optimizer=16)
            gp.fit(x_obs, 1 - y_obs)

            a = a_EI(gp, x_obs=x_obs, y_obs=1 - y_obs)
            argmax_f_x_ = x[np.argmax(a(x))]

            # heavy evaluation
            f_argmax_f_x_ = cross_validation(argmax_f_x_)
            y_ob = np.atleast_2d(mean_mean_validation_scores(f_argmax_f_x_)).T

            return f_argmax_f_x_ + bo_(
                x_obs=np.vstack((x_obs, argmax_f_x_)),
                y_obs=np.vstack((y_obs, y_ob)),
                n_iter=n_iter - 1,
            )
        else:
            return []

    # FIXME (most informative) decision like Numerical Probabilistics stuff
    # for integrations
    # sobol initilization?
    sampled_x_ind = np.random.choice(
        x.shape[0],
        size=self.n_initial_points,
        replace=False,
    )
    print(sampled_x_ind)

    x_obs = x[sampled_x_ind]
    f_x_obs = list(map(cross_validation, x_obs))
    y_obs = np.atleast_2d(list(map(mean_mean_validation_scores, f_x_obs))).T

    out = sum(f_x_obs, []) + bo_(x_obs, y_obs, n_iter=self.n_iter)

    n_fits = len(out)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_splits):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_splits]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_splits)
        scores.append((score, parameters))
        grid_scores.append(_search._CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))

    self.grid_scores_ = grid_scores

    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    foldsForEstimator = {}
    cv = self.cv

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))

    from collections import Sized
    # Splits the data based on provided cross-validation splitting strategy.
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    # Change from original scikit code: adding a new argument,
    # foldsForEstimator, to the _fit_and_score function to track metadata
    # for each estimator, for each fold.
    # _fit_and_score fits the estimator and computes the score for a given
    # data-split, for given parameters.
    out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                   pre_dispatch=pre_dispatch)(
        delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                train, test, self.verbose, parameters,
                                self.fit_params, foldsForEstimator,
                                return_parameters=True,
                                error_score=self.error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    # Computes the scores for each of the folds, for all the possible
    # parameters, and stores them in grid_scores.
    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in out[
                grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(CVScoreTuple(parameters, score,
                                        np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    else:
        # If refit is false, best_estimator_ is unavailable and further
        # predictions can't be made on this instance.
        raise Warning(
            "Note: Refit has been set to false, which makes it impossible to "
            "make predictions using this GridSearchCV instance after "
            "fitting. Change refit to true to enable this")

    # Change from original scikit code:
    # Populate new field with necessary attributes for storing
    # cross-validation event
    self.grid_cv_event = [X, foldsForEstimator, 0, type_of_target(y),
                          self.best_estimator_, self.best_estimator_,
                          n_folds]

    return self
def get_scores(self, X, y, row_keys, scoring, collect_n=None): """ Gives scores for prediction on cross-validation. Parameters ---------- X : array-like The data to fit. Can be, for example a list, or an array at least 2d, or dictionary. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. row_keys : list of strings List of transformers names. ``Pipeliner`` takes transformers from ``named_steps`` using keys from ``row_keys`` and creates pipeline to transform. scoring : string, callable or None, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If None, the score method of the estimator is used. collect_n : list of strings List of keys from data dictionary you want to collect and create feature vectors. Returns ------- scores : array-like Scores calculated on cross-validation. """ columns = list(self.plan_table.columns)[-len(row_keys):] param_key = ''.join(row_keys) + str(scoring) steps = list() for row_key, column in zip(row_keys, columns): steps.append((row_key, self.named_steps[column][row_key])) steps[-1][1].set_params(**self.best_params[param_key]) if not collect_n: scores = cross_val_score(Pipeline(steps), X, y, scoring=scoring, cv=self.eval_cv, n_jobs=-1) else: init_random_state = self.eval_cv.random_state scores = list() for i in range(collect_n): fold_prediction = cross_val_predict(Pipeline(steps), X, y, cv=self.eval_cv, n_jobs=-1) metric = check_scoring(steps[-1][1], scoring=scoring).__dict__['_score_func'] scores.append(metric(y, fold_prediction)) self.eval_cv.random_state += 1 self.eval_cv.random_state = init_random_state return scores
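# Hedged usage sketch: how check_scoring turns a scoring string into a callable
# with signature scorer(estimator, X, y). Recent scikit-learn exposes it as
# sklearn.metrics.check_scoring; the snippets in this document import it from
# the older sklearn.metrics.scorer module.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring

X, y = make_classification(n_samples=100, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X, y)
scorer = check_scoring(clf, scoring="accuracy")
print(scorer(clf, X, y))   # equivalent to clf.score(X, y) for accuracy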
def fit(self, X, y=None, groups=None, **fit_params): """ Run fit with all sets of parameters. :param X: array-like, `shape = [n_samples, n_features]` Training vector, where n_samples is the number of samples and n_features is the number of features. :param y: array-like, `shape = [n_samples]` or `[n_samples, n_output]`, optional; Target relative to X for classification or regression; None for unsupervised learning. :param groups: array-like, with shape `(n_samples,)`, optional; Group labels for the samples used while splitting the dataset into train/test set. :param fit_params: dict of `string -> object`; Parameters passed to the fit method of the estimator :return: `self` """ from random import uniform from numpy import array, unique, sqrt from sklearn.base import clone, is_classifier from sklearn.metrics.scorer import check_scoring from sklearn.model_selection._search import check_cv from sklearn.model_selection._validation import _fit_and_score # from lightgbm.sklearn import LightGBMError radius_list = [] self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) target_classes = [] if y is not None: target_classes = unique(y) # if type(self.cv) is int: # cv = ShuffleSplit(n_splits=self.cv, test_size=.25) # else: cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) x0_ = [] self.bounds = list(self.bounds) for param in self.params_list: param_num_range = self.params[param] if param_num_range.VType != "hdreal": radius_list.append( (param_num_range.upper - param_num_range.lower) / 2.0) if param in self.init: if param_num_range.VType == "categorical": x0_.append( param_num_range.items.index(self.init[param])) else: x0_.append(self.init[param]) else: x0_.append( uniform(param_num_range.lower, param_num_range.upper)) self.bounds.append(param_num_range.bound_tuple) else: for i in range(param_num_range.n): radius_list.append( (param_num_range.bound_tuple[i][1] - param_num_range.bound_tuple[i][0]) / 2.0) if (param in self.init) and (i in self.init[param]): x0_.append(self.init[param][i]) else: x0_.append( uniform( param_num_range.bound_tuple[i][0], param_num_range.bound_tuple[i][1], )) self.bounds = self.bounds + list(param_num_range.bound_tuple) if self.radius is None: rds = 0.0 for r in radius_list: rds += r**2 self.radius = sqrt(rds) self.x0 = array(x0_) self.bounds = tuple(self.bounds) cv_dat = list(cv.split(X, y)) def obj(x): cand_params = {} _idx = 0 for _param in self.params_list: _param_num_range = self.params[_param] if _param_num_range.VType != "hdreal": if _param_num_range.VType == "integer": cand_params[_param] = int(round(x[_idx])) elif _param_num_range.VType == "categorical": cand_params[_param] = _param_num_range.items[int( round(x[_idx]))] else: cand_params[_param] = x[_idx] _idx += 1 else: _cls_dict = {} for i_ in range(_param_num_range.n): _cls_dict[target_classes[i_]] = x[_idx] _idx += 1 cand_params[_param] = _cls_dict #cl = clone(self.estimator) #cl.set_params(**cand_params) score = 0 n_test = 0 def parallel_fit_score(cl, cand_params, X, y, scorer, train, test, verbose, fit_params, error_score): cl.set_params(**cand_params) try: _score = _fit_and_score( estimator=cl, X=X, y=y, scorer=scorer, # train=train, test=test, verbose=verbose, # parameters=cand_params, fit_params=fit_params, # error_score=error_score, # )[0] return _score except ValueError: if self.verbose > 1: print("Model evaluation error") else: pass except: # LightGBMError: pass return None scores = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch)( 
delayed(parallel_fit_score)(clone(self.estimator), cand_params=cand_params, X=X, y=y, scorer=self.scorer_, train=train, test=test, verbose=self.verbose, fit_params=self.fit_params, error_score=self.error_score) for train, test in cv_dat) for sc in scores: if sc is not None: score += sc n_test += 1 score = score / float(max(n_test, 1)) return -score self.OPTIM = SurrogateSearch( obj, x0=self.x0, max_iter=self.max_iter, min_evals=self.min_evals, ineqs=self.ineqs, bounds=self.bounds, verbose=self.verbose, radius=self.radius, regressor=self.regressor, sampling=self.sampling, search_sphere=self.search_sphere, contraction=self.contraction, max_itr_no_prog=self.max_itr_no_prog, optimizer=self.optimizer, scipy_solver=self.scipy_solver, optimithon_dd_method=self.optimithon_dd_method, optimithon_difftool=self.optimithon_difftool, optimithon_t_method=self.optimithon_t_method, optimithon_ls_method=self.optimithon_ls_method, optimithon_ls_bt_method=self.optimithon_ls_bt_method, optimithon_br_func=self.optimithon_br_func, optimithon_penalty=self.optimithon_penalty, task_name=self.task_name, warm_start=self.warm_start, Continue=self.Continue, ) x, scr = self.OPTIM() best_params_ = {} idx = 0 for param in self.params_list: param_num_range = self.params[param] if param_num_range.VType != "hdreal": if param_num_range.VType == "integer": best_params_[param] = int(round(x[idx])) elif param_num_range.VType == "categorical": best_params_[param] = param_num_range.items[int( round(x[idx]))] else: best_params_[param] = x[idx] idx += 1 else: cls_dict = {} for i in range(param_num_range.n): cls_dict[target_classes[i]] = x[idx] idx += 1 best_params_[param] = cls_dict self.best_estimator_ = clone(self.estimator).set_params(**best_params_) self.best_estimator_score = scr self.best_score_ = scr return self
def fit(self): LOG.info('Start fitting ...') gs_cv_params = { 'n_jobs': self.n_jobs, 'cv': _cv_build(self.cv_inner), 'verbose': 0 } zscore_cv_auc = [] zscore_cv_acc = [] split_id = 0 for dozs in [False, True]: LOG.info('Generate %sz-scored sample ...', '' if dozs else 'non ') X, y, groups = self._generate_sample(zscored=dozs) # The inner CV loop is a grid search on clf_params LOG.info('Creating ModelAndGridSearchCV') inner_cv = ModelAndGridSearchCV(self.param, **gs_cv_params) # Some sklearn's validations scoring = check_scoring(inner_cv, scoring='roc_auc') cv_outer = check_cv(_cv_build(self.cv_outer), y, classifier=is_classifier(inner_cv)) # Outer CV loop outer_cv_scores = [] outer_cv_acc = [] LOG.info('Starting nested cross-validation ...') for train, test in list(cv_outer.split(X, y, groups)): # Find the groups in the train set, in case inner CV is LOSO. fit_params = None if self.cv_inner.get('type') == 'loso': train_groups = [groups[i] for i in train] fit_params = {'groups': train_groups} result = nested_fit_and_score(clone(inner_cv), X, y, scoring, train, test, fit_params=fit_params, verbose=1) # Test group has no positive cases if result is None: continue score, clf = result test_group = list(set(groups[i] for i in test))[0] self._models.append({ # 'clf_type': clf_str, 'zscored': int(dozs), 'outer_split_id': split_id, 'left-out-sites': self.sites[test_group], 'best_model': clf.best_model_, 'best_params': clf.best_params_, 'best_score': clf.best_score_, 'best_index': clf.best_index_, 'cv_results': clf.cv_results_, 'cv_scores': score['test']['roc_auc'], 'cv_accuracy': score['test']['accuracy'], 'cv_params': clf.cv_results_['params'], 'cv_auc_means': clf.cv_results_['mean_test_score'], 'cv_splits': { 'split%03d' % i: clf.cv_results_['split%d_test_score' % i] for i in list(range(clf.n_splits_)) } }) # Store the outer loop scores if score['test']['roc_auc'] is not None: outer_cv_scores.append(score['test']['roc_auc']) outer_cv_acc.append(score['test']['accuracy']) split_id += 1 # LOG.info( # '[%s-%szs] Outer CV: roc_auc=%f, accuracy=%f, ' # 'Inner CV: best roc_auc=%f, params=%s. ', # clf.best_model_[0], 'n' if not dozs else '', # score['test']['roc_auc'] if score['test']['roc_auc'] is not None else -1.0, # score['test']['accuracy'], # clf.best_score_, clf.best_model_[1]) LOG.info( 'Outer CV loop finished, roc_auc=%f (+/-%f), accuracy=%f (+/-%f)', np.mean(outer_cv_scores), 2 * np.std(outer_cv_scores), np.mean(outer_cv_acc), 2 * np.std(outer_cv_acc)) zscore_cv_auc.append(outer_cv_scores) zscore_cv_acc.append(outer_cv_acc) # Select best performing model best_inner_loops = [model['best_score'] for model in self._models] best_idx = np.argmax(best_inner_loops) self._best_model = self._models[best_idx] LOG.info( 'Inner CV [%d models compared] - best model %s-%szs, score=%f, params=%s', len(best_inner_loops) * len(self._models[0]['cv_params']), self._best_model['best_model'][0], 'n' if not self._best_model['zscored'] else '', self._best_model['best_score'], self._best_model['best_params']) # Write out evaluation result best_zs = 1 if self._best_model['zscored'] else 0 LOG.info( 'CV - estimated performance: roc_auc=%f (+/-%f), accuracy=%f (+/-%f)', np.mean(zscore_cv_auc[best_zs]), 2 * np.std(zscore_cv_auc[best_zs]), np.mean(zscore_cv_acc[best_zs]), 2 * np.std(zscore_cv_acc[best_zs]), )
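# A hedged, self-contained sketch of the nested-CV idea above using only the
# public scikit-learn API (dataset and parameter grid are illustrative): the
# inner loop tunes hyper-parameters, the outer loop estimates generalization
# performance of the tuned model.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.svm import SVC

X, y = load_breast_cancer(return_X_y=True)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
search = GridSearchCV(SVC(), {"C": [0.1, 1.0, 10.0]}, scoring="roc_auc", cv=inner_cv)
print(cross_val_score(search, X, y, cv=outer_cv, scoring="roc_auc"))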
def fit_and_score_estimator(estimator, parameters, cv, X, y=None, scoring=None, iid=True, n_jobs=1, verbose=1, pre_dispatch='2*n_jobs'): """Fit and score an estimator with cross-validation This function is basically a copy of sklearn's grid_search._BaseSearchCV._fit(), which is the core of the GridSearchCV fit() method. Unfortunately, that class does _not_ return the training set scores, which we want to save in the database, and because of the way it's written, you can't change it by subclassing or monkeypatching. This function uses some undocumented internal sklearn APIs (non-public). It was written against sklearn version 0.16.1. Prior Versions are likely to fail due to changes in the design of cross_validation module. Returns ------- out : dict, with keys 'mean_test_score' 'test_scores', 'train_scores' The scores on the training and test sets, as well as the mean test set score. """ scorer = check_scoring(estimator, scoring=scoring) n_samples = num_samples(X) X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr', allow_nans=True) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv=cv, y=y, classifier=is_classifier(estimator)) out = Parallel( n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch )( delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, verbose, parameters, fit_params=None) for train, test in cv.split(X, y)) assert len(out) == cv.n_splits train_scores, test_scores = [], [] n_train_samples, n_test_samples = [], [] for test_score, n_test, train_score, n_train, _ in out: train_scores.append(train_score) test_scores.append(test_score) n_test_samples.append(n_test) n_train_samples.append(n_train) train_scores, test_scores = map(list, check_arrays(train_scores, test_scores, warn_nans=True, replace_nans=True)) if iid: if verbose > 0 and is_msmbuilder_estimator(estimator): print('[CV] Using MSMBuilder API n_samples averaging') print('[CV] n_train_samples: %s' % str(n_train_samples)) print('[CV] n_test_samples: %s' % str(n_test_samples)) mean_test_score = np.average(test_scores, weights=n_test_samples) mean_train_score = np.average(train_scores, weights=n_train_samples) else: mean_test_score = np.average(test_scores) mean_train_score = np.average(train_scores) grid_scores = { 'mean_test_score': mean_test_score, 'test_scores': test_scores, 'mean_train_score': mean_train_score, 'train_scores': train_scores, 'n_test_samples': n_test_samples, 'n_train_samples': n_train_samples} return grid_scores
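# Hedged note: newer scikit-learn exposes train-set scores directly through
# cross_validate(return_train_score=True), which covers the main motivation of
# the helper above (written against sklearn 0.16.1). Dataset is illustrative.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = make_classification(n_samples=200, random_state=0)
res = cross_validate(LogisticRegression(max_iter=1000), X, y, cv=5,
                     return_train_score=True)
print(res["train_score"].mean(), res["test_score"].mean())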
def rerun_nested_for_scoring(nested: NestedCV, score: str, X, y=None, groups=None, how='max', n_jobs=1, verbose=0, pre_dispatch='2*n_jobs', return_estimators=False): """ Rerun a nested CV grid / random hyper param run but very efficiently by using the stored scoring data from a previous run Parameters ---------- nested : An already "scored" NestedCV score : A string of a score calculated during the scoring run of nested how : 'max' or 'min', optional, default='max' will look for the min or max of the score provided return_estimators : if true return a tuple with new estimators in addition to nested cross, optional, default=False Returns ------- nested with new values, (optional, new_estimators) """ sub_scores = [ extract_score_grid(searcher) for searcher in nested.estimators_ ] sub_scores_means = [sub_score[[c for c in sub_score.columns if 'test' in c and 'mean' in c]] \ for sub_score in sub_scores] def create_summary(mean_table): return pd.DataFrame({ 'maxidx': mean_table.idxmax(), 'max': mean_table.max(), 'min': mean_table.min(), 'minidx': mean_table.idxmin() }) sub_scores_summary = [ create_summary(mean_table) for mean_table in sub_scores_means ] row = "mean_{}_test".format(score) col = how + "idx" idxs = [summary.loc[row, col] for summary in sub_scores_summary] params = [ pd.DataFrame(estimator.cv_results_)['params'][idx] for idx, estimator in zip(idxs, nested.estimators_) ] nested.best_params_ = params nested.best_idxs_ = idxs new_estimators = [ clone(estimator.estimator).set_params(**param) for param, estimator in zip(params, nested.estimators_) ] #set the random state so can reproduce results for est in new_estimators: est.set_params(random_state=nested.random_state) if hasattr(nested.scoring, 'change_decision_score'): new_scoring = nested.scoring.change_decision_score(score) else: new_scoring = nested.scoring parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel( delayed(_fit_and_score_with_extra_data)( estimator, X, y, check_scoring(estimator, new_scoring), train, test, verbose, None, nested.fit_params, return_train_score=True, return_times=True, return_estimator=return_estimators) for (train, test), estimator in zip(nested.cv_iter_, new_estimators)) if return_estimators: (nested.train_score_datas_, nested.train_scores_, nested.test_score_datas_, nested.test_scores_, nested.fit_times_, nested.score_times_, new_estimators) = zip(*scores) return nested, new_estimators else: (nested.train_score_datas_, nested.train_scores_, nested.test_score_datas_, nested.test_scores_, nested.fit_times_, nested.score_times_) = zip(*scores) return nested
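# Hedged sketch of the underlying idea: the best parameter set for any recorded
# metric can be read back from a fitted search's cv_results_ without refitting.
# Dataset and grid are illustrative.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
gs = GridSearchCV(SVC(), {"C": [0.1, 1.0, 10.0]}, cv=3).fit(X, y)
results = pd.DataFrame(gs.cv_results_)
best_idx = results["mean_test_score"].idxmax()
print(results.loc[best_idx, "params"])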
def _fit(self, X, y, parameter_dict): self._cv_results = None # To indicate to the property the need to update self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator)) toolbox = base.Toolbox() name_values, gene_type, maxints = _get_param_types_maxint( parameter_dict) if self.gene_type is None: self.gene_type = gene_type if self.verbose: print("Types %s and maxint %s detected" % (self.gene_type, maxints)) toolbox.register("individual", _initIndividual, creator.Individual, maxints=maxints) toolbox.register("population", tools.initRepeat, list, toolbox.individual) # If n_jobs is an int, greater than 1 or less than 0 (indicating to use as # many jobs as possible) then we are going to create a default pool. # Windows users need to be warned of this feature as it only works properly # on linux. They need to encapsulate their pool in an if __name__ == "__main__" # wrapper so that pools are not recursively created when the module is reloaded in each map if isinstance(self.n_jobs, int): if self.n_jobs > 1 or self.n_jobs < 0: from multiprocessing import Pool # Only imports if needed if os.name == 'nt': # Checks if we are on Windows warnings.warn(( "Windows requires Pools to be declared from within " "an \'if __name__==\"__main__\":\' structure. In this " "case, n_jobs will accept map functions as well to " "facilitate custom parallelism. Please check to see " "that all code is working as expected.")) pool = Pool(self.n_jobs) toolbox.register("map", pool.map) # If it's not an int, we are going to pass it as the map directly else: try: toolbox.register("map", self.n_jobs) except Exception: raise TypeError( "n_jobs must be either an integer or map function. 
Received: {}" .format(type(self.n_jobs))) toolbox.register("evaluate", _evalFunction, name_values=name_values, X=X, y=y, scorer=self.scorer_, cv=cv, iid=self.iid, verbose=self.verbose, error_score=self.error_score, fit_params=self.fit_params, score_cache=self.score_cache) toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob, gene_type=self.gene_type) toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob, up=maxints) toolbox.register("select", tools.selTournament, tournsize=self.tournament_size) pop = toolbox.population(n=self.population_size) hof = tools.HallOfFame(1) # Stats stats = tools.Statistics(lambda ind: ind.fitness.values) stats.register("avg", np.nanmean) stats.register("min", np.nanmin) stats.register("max", np.nanmax) stats.register("std", np.nanstd) # History hist = tools.History() toolbox.decorate("mate", hist.decorator) toolbox.decorate("mutate", hist.decorator) hist.update(pop) if self.verbose: print('--- Evolve in {0} possible combinations ---'.format( np.prod(np.array(maxints) + 1))) pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=self.generations_number, stats=stats, halloffame=hof, verbose=self.verbose) # Save History self.all_history_.append(hist) self.all_logbooks_.append(logbook) current_best_score_ = hof[0].fitness.values[0] current_best_params_ = _individual_to_params(hof[0], name_values) if self.verbose: print("Best individual is: %s\nwith fitness: %s" % (current_best_params_, current_best_score_)) if current_best_score_ > self.best_mem_score_: self.best_mem_score_ = current_best_score_ self.best_mem_params_ = current_best_params_ # Check memoization, potentially unknown bug # assert str(hof[0]) in self.score_cache, "Best individual not stored in score_cache for cv_results_." # Close your pools if you made them if isinstance(self.n_jobs, int) and (self.n_jobs > 1 or self.n_jobs < 0): pool.close() pool.join() self.best_score_ = current_best_score_ self.best_params_ = current_best_params_
def _wrapped_cross_val_score(sklearn_pipeline, features, target, cv, scoring_function, sample_weight=None, groups=None, use_dask=False): """Fit estimator and compute scores for a given dataset split. Parameters ---------- sklearn_pipeline : pipeline object implementing 'fit' The object to use to fit the data. features : array-like of shape at least 2D The data to fit. target : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. cv: int or cross-validation generator If CV is a number, then it is the number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. If it is an object then it is an object to be used as a cross-validation generator. scoring_function : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. sample_weight : array-like, optional List of sample weights to balance (or un-balanace) the dataset target as needed groups: array-like {n_samples, }, optional Group labels for the samples used while splitting the dataset into train/test set use_dask : bool, default False Whether to use dask """ sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight) features, target, groups = indexable(features, target, groups) cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline)) cv_iter = list(cv.split(features, target, groups)) scorer = check_scoring(sklearn_pipeline, scoring=scoring_function) if use_dask: try: import dask_ml.model_selection # noqa import dask # noqa from dask.delayed import Delayed except ImportError: msg = "'use_dask' requires the optional dask and dask-ml depedencies." raise ImportError(msg) dsk, keys, n_splits = dask_ml.model_selection._search.build_graph( estimator=sklearn_pipeline, cv=cv, scorer=scorer, candidate_params=[{}], X=features, y=target, groups=groups, fit_params=sample_weight_dict, refit=False, error_score=float('-inf'), ) cv_results = Delayed(keys[0], dsk) scores = [ cv_results['split{}_test_score'.format(i)] for i in range(n_splits) ] CV_score = dask.delayed(np.array)(scores)[:, 0] return dask.delayed(np.nanmean)(CV_score) else: try: with warnings.catch_warnings(): warnings.simplefilter('ignore') scores = [ _fit_and_score(estimator=clone(sklearn_pipeline), X=features, y=target, scorer=scorer, train=train, test=test, verbose=0, parameters=None, fit_params=sample_weight_dict) for train, test in cv_iter ] CV_score = np.array(scores)[:, 0] return np.nanmean(CV_score) except TimeoutException: return "Timeout" except Exception as e: return -float('inf')
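# Hedged equivalent of the non-dask branch above using only the public API:
# score a pipeline across CV folds and average with nanmean, which is what the
# wrapper computes per candidate pipeline. Dataset and metric are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, random_state=0)
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
print(np.nanmean(cross_val_score(pipe, X, y, scoring="balanced_accuracy", cv=cv)))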
def fit(self, X, y, sample_weight=None): # fitting everything except weights original_weights = self.weights # have to set self.weights because VotingClassifier.fit checks them self.weights = np.ones(len(self.estimators)) # fit to the full data super(VotingClassifierCV, self).fit(X, y, sample_weight=sample_weight) estimators = self.estimators_ self.weights = original_weights # generate cross_validated predictions for each classifier cv = check_cv(self.cv) scoring = check_scoring(self, self.scoring) parallel = Parallel(n_jobs=self.n_jobs) method = 'predict_proba' if self.voting == 'soft' else 'predict' fit_params = {} if sample_weight is not None: fit_params['sample_weight'] = sample_weight verbose = False preds = [] for name, est in self.estimators: prediction_blocks = parallel( delayed(_fit_and_predict)(clone(est), X, y, train, test, verbose, fit_params, method) for train, test in cv.split(X, y)) preds.append([pred for pred, _ in prediction_blocks]) # prepare sample weights and targets for each fold test_targets = [] test_weights = [] for train, test in cv.split(y): test_targets.append(y[test]) if sample_weight: test_weights.append(sample_weight[test]) else: test_weights.append(None) # recreate list of possible weights weights_array = np.array(self.weights) if len(weights_array.shape) == 2: self.weigths_seq_ = self.weights else: if len(weights_array.shape) == 0: weight_wec = np.arange(self.weights) else: # assume it is 1d weight_wec = self.weights clf_len = len(self.estimators) self.weigths_seq_ = [ x for x in product(*[weight_wec] * clf_len) if sum(x) > 0 ] # score the classifier at different weights scores = [] for weights_vector in self.weigths_seq_: self.weights_ = weights_vector cv_scores = [] for fold, pred_vectors in enumerate(zip(*preds)): self.estimators_ = [ PredefinedClassifier(pred_vector) for pred_vector in pred_vectors ] test_y = test_targets[fold] test_x = None test_w = test_weights[fold] cv_scores.append(scoring(self, test_x, test_y, test_w)) scores.append(cv_scores) self.scores_ = np.array(scores) # choose the best weight self.weights_ = self.weigths_seq_[np.argmax( np.mean(self.scores_, axis=1))] self.estimators_ = estimators
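# Hedged sketch of the weight search above, without the cached-prediction
# optimization: enumerate candidate weight vectors and keep the one with the
# best mean CV score. This refits the ensemble for every candidate, so it is
# slower but simpler; estimators and grid are illustrative.
import numpy as np
from itertools import product
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
estimators = [("lr", LogisticRegression(max_iter=1000)),
              ("dt", DecisionTreeClassifier(random_state=0))]
weight_grid = [w for w in product(range(3), repeat=len(estimators)) if sum(w) > 0]
means = [cross_val_score(VotingClassifier(estimators, voting="soft", weights=w),
                         X, y, cv=3).mean() for w in weight_grid]
print(weight_grid[int(np.argmax(means))])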
def _scoring(self, net, X_test, y_test): """Resolve scoring and apply it to data. Use cached prediction instead of running inference again, if available.""" scorer = check_scoring(net, self.scoring_) return scorer(net, X_test, y_test)
def fit(self, X, y, Xd, yd): """Fit the gradient boosting model. Parameters ---------- X : array-like, shape=(n_samples, n_features) The input samples. Xd : array-like, shape=(n_design, n_features) The design samples. y : array-like, shape=(n_samples, output_dim=2) Target values. yd : array-like, shape=(n_samples, output_dim=2) Design target values. Returns ------- self : object """ #add# #if (not hasattr(self, 'Xd')) or (not hasattr(self, 'yd')): # raise NotImplementedError fit_start_time = time() acc_find_split_time = 0. # time spent finding the best splits acc_apply_split_time = 0. # time spent splitting nodes # time spent predicting X for gradient and hessians update acc_prediction_time = 0. # TODO: add support for mixed-typed (numerical + categorical) data # TODO: add support for missing data # TODO: add support for pre-binned data (pass-through)? # TODO: test input checking X, y = check_X_y( X, y, dtype=[np.float32, np.float64], ## Add by k## multi_output=True) y = self._encode_y(y) #add# rng = check_random_state(self.random_state) ## Add by k ## Xd_, yd_ = check_X_y( Xd, yd, dtype=[np.float32, np.float64], ## Add by k## multi_output=True) yd_ = self._encode_y(yd_) self.Xd = Xd_ # X design self.yd = yd_ # y design self.n_design = self.Xd.shape[0] self.output_dim = np.reshape(self.yd, (1, -1)).shape[1] assert (self.Xd.shape[1] == X.shape[1]) ## Add by k ## self._validate_parameters() tic = time() self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng) # modify X diff_X_Xd = np.abs(X[:, :, np.newaxis] - self.Xd[:, :, np.newaxis].T).reshape( X.shape[0], self.Xd.shape[0], X.shape[1], order='f') mean_X_Xd = .5 * ( X[:, :, np.newaxis] + self.Xd[:, :, np.newaxis].T).reshape( X.shape[0], self.Xd.shape[0], X.shape[1], order='f') Phi_X_Xd = np.concatenate((diff_X_Xd, mean_X_Xd), axis=2) #print(Phi_X_Xd.shape) if self.verbose: print(f"Binning {Phi_X_Xd.nbytes / 1e9:.3f} GB of data: ", end="", flush=True) X_binned = self.bin_mapper_.fit_transform(Phi_X_Xd) ## add by k ## ######## ## OK ## ######## toc = time() if self.verbose: duration = toc - tic troughput = X.nbytes / duration print(f"{duration:.3f} s ({troughput / 1e6:.3f} MB/s)") self.loss_ = self._get_loss() if self.validation_split is not None: # stratify for classification stratify = y if hasattr(self.loss_, 'predict_proba') else None if hasattr(self.loss_, 'predict_proba'): raise (NotImplementedError) X_binned_train, X_binned_val, y_train, y_val = train_test_split( X_binned, y, test_size=self.validation_split, stratify=stratify, random_state=rng) X_binned_train = np.asfortranarray(X_binned_train) X_binned_val = np.asfortranarray(X_binned_val) # Histogram computation is faster on feature-aligned data. else: X_binned_train, y_train = X_binned, y X_binned_val, y_val = None, None X_binned_train = np.asfortranarray(X_binned_train) # Subsample the training set for score-based monitoring. subsample_size = 10000 if X_binned_train.shape[0] < subsample_size: X_binned_small_train = np.asfortranarray(X_binned_train) y_small_train = y_train else: indices = rng.choice(np.arange(X_binned_train.shape[0]), subsample_size) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] self.X_binned_small_train = X_binned_small_train self.X_binned_val = X_binned_val if self.verbose: print("Fitting gradient boosted rounds:") #n_samples = X_binned_train.shape[0] * X_binned_train.shape[1] n_samples = X_binned_small_train.shape[0] # values predicted by the trees. 
Used as-is in regression, and # transformed into probas and / or classes for classification raw_predictions = np.zeros(shape=(n_samples, self.n_trees_per_iteration_), dtype=y_train.dtype) # gradients and hessians are 1D arrays of size # n_samples * n_trees_per_iteration # gradients and hessians have changed gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, n_trees_per_iteration=self.n_trees_per_iteration_) #print('raw_', raw_predictions) #print('gradient', gradients) # predictors_ is a matrix of TreePredictor objects with shape # (n_iter_, n_trees_per_iteration) self.predictors_ = predictors = [] self.train_scores_ = [] if self.validation_split is not None: self.validation_scores_ = [] scorer = check_scoring(self, self.scoring) gb_start_time = time() # TODO: compute training loss and use it for early stopping if no # validation data is provided? self.n_iter_ = 0 while True: should_stop = self._stopping_criterion(gb_start_time, scorer, X_binned_small_train, y_small_train, X_binned_val, y_val) if should_stop or self.n_iter_ == self.max_iter: break # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians(gradients, hessians, y_small_train, raw_predictions) #print('grad', gradients) predictors.append([]) # Build `n_trees_per_iteration` trees. for k, (gradients_at_k, hessians_at_k) in enumerate( zip(np.array_split(gradients, self.n_trees_per_iteration_), np.array_split(hessians, self.n_trees_per_iteration_))): # the xxxx_at_k arrays are **views** on the original arrays. # Note that for binary classif and regressions, # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the # whole array. #X_binned_small_train_, _, gradients_at_k_, _, indices_subsample_, _ = \ #train_test_split(X_binned_small_train, gradients_at_k, np.arange(len(X_binned_small_train)), \ #train_size=subsample_ratio, shuffle=False, random_state=0) #X_binned_small_train_ = np.asfortranarray(X_binned_small_train_) grower = TreeGrower( X_binned_small_train, gradients_at_k, hessians_at_k, yd, max_bins=self.max_bins, n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate) grower.grow() #print('I grew') acc_apply_split_time += grower.total_apply_split_time acc_find_split_time += grower.total_find_split_time predictor = grower.make_predictor( bin_thresholds=self.bin_mapper_.bin_thresholds_) predictors[-1].append(predictor) tic_pred = time() # prepare leaves_data so that _update_raw_predictions can be # @njitted leaves_data = [(l.value, l.sample_indices) for l in grower.finalized_leaves] _update_raw_predictions(leaves_data, self.yd, raw_predictions[:, k]) #print('raw_pred', raw_predictions) toc_pred = time() acc_prediction_time += toc_pred - tic_pred self.n_iter_ += 1 #self.learning_rate *= 1. # maybe to set self.learning_rate = 1. 
/ (self.n_iter_ + 1) #self.max_depth += 1 #self.max_depth = min(5, self.max_depth) #print('pred', raw_predictions) #print('n_iter', self.n_iter_) if self.verbose: duration = time() - fit_start_time n_total_leaves = sum( predictor.get_n_leaf_nodes() for predictors_at_ith_iteration in self.predictors_ for predictor in predictors_at_ith_iteration) n_predictors = sum( len(predictors_at_ith_iteration) for predictors_at_ith_iteration in self.predictors_) print(f"Fit {n_predictors} trees in {duration:.3f} s, " f"({n_total_leaves} total leaves)") print(f"{'Time spent finding best splits:':<32} " f"{acc_find_split_time:.3f}s") print(f"{'Time spent applying splits:':<32} " f"{acc_apply_split_time:.3f}s") print(f"{'Time spent predicting:':<32} " f"{acc_prediction_time:.3f}s") self.train_scores_ = np.asarray(self.train_scores_) if self.validation_split is not None: self.validation_scores_ = np.asarray(self.validation_scores_) return self
def _fit(self, X, y, groups, parameter_iterable): """ Actual fitting, performing the search over parameters. Taken from https://github.com/scikit-learn/scikit-learn/blob/0.18.X .../sklearn/model_selection/_search.py """ estimator = self.estimator cv = sklearn.model_selection._validation.check_cv( self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch cv_iter = list(cv.split(X, y, groups)) out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)( delayed(sklearn.model_selection._validation._fit_and_score)( clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, fit_params=self.fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, return_parameters=True, error_score=self.error_score) for parameters in parameter_iterable for train, test in cv_iter) # if one choose to see train score, "out" will contain train score info if self.return_train_score: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) else: (test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) candidate_params = parameters[::n_splits] n_candidates = len(candidate_params) results = dict() def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) if splits: for split_i in range(n_splits): results["split%d_%s" % (split_i, key_name)] = array[:, split_i] array_means = np.average(array, axis=1, weights=weights) results['mean_%s' % key_name] = array_means # Weighted std is not directly available in numpy array_stds = np.sqrt( np.average((array - array_means[:, np.newaxis])**2, axis=1, weights=weights)) results['std_%s' % key_name] = array_stds if rank: results["rank_%s" % key_name] = np.asarray(rankdata( -array_means, method='min'), dtype=np.int32) # Computed the (weighted) mean and std for test scores alone # NOTE test_sample counts (weights) remain the same for all candidates test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) _store('test_score', test_scores, splits=True, rank=True, weights=test_sample_counts if self.iid else None) if self.return_train_score: _store('train_score', train_scores, splits=True) _store('fit_time', fit_time) _store('score_time', score_time) best_index = np.flatnonzero(results["rank_test_score"] == 1)[0] best_parameters = candidate_params[best_index] # Use one MaskedArray and mask all the places where the param is not # applicable for that candidate. Use defaultdict as each candidate may # not contain all the params param_results = defaultdict( partial(MaskedArray, np.empty(n_candidates, ), mask=True, dtype=object)) for cand_i, params in enumerate(candidate_params): for name, value in params.items(): # An all masked empty array gets created for the key # `"param_%s" % name` at the first occurence of `name`. 
# Setting the value at an index also unmasks that index param_results["param_%s" % name][cand_i] = value results.update(param_results) # Store a list of param dicts at the key 'params' results['params'] = candidate_params self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best_parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
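# Hedged worked example of the _store helper above: weighted mean/std per
# candidate and min-method ranking, on hypothetical fold scores and fold sizes.
import numpy as np
from scipy.stats import rankdata

scores = np.array([[0.90, 0.85, 0.88],    # candidate 0, 3 splits
                   [0.92, 0.86, 0.91]])   # candidate 1
weights = np.array([40, 35, 25])          # test-fold sizes (used when iid=True)
means = np.average(scores, axis=1, weights=weights)
stds = np.sqrt(np.average((scores - means[:, np.newaxis]) ** 2,
                          axis=1, weights=weights))
ranks = rankdata(-means, method="min").astype(np.int32)
print(means, stds, ranks)                 # rank 1 marks the best candidate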
def _wrapped_cross_val_score(sklearn_pipeline, features, target, cv, scoring_function, sample_weight=None, groups=None, feature_selectors=None, fs_modifier=None): """Fit estimator and compute scores for a given dataset split. Parameters ---------- sklearn_pipeline : pipeline object implementing 'fit' The object to use to fit the data. features : array-like of shape at least 2D The data to fit. target : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. cv: int or cross-validation generator If CV is a number, then it is the number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. If it is an object then it is an object to be used as a cross-validation generator. scoring_function : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. sample_weight : array-like, optional List of sample weights to balance (or un-balanace) the dataset target as needed groups: array-like {n_samples, }, optional Group labels for the samples used while splitting the dataset into train/test set fs_modifier: float, optional (default: 0.0001) Modifier value how the number of features should reduce the score. Only used when feature_selection is not None """ sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight) features, target, groups = indexable(features, target, groups) cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline)) cv_iter = list(cv.split(features, target, groups)) scorer = check_scoring(sklearn_pipeline, scoring=scoring_function) try: with warnings.catch_warnings(): warnings.simplefilter('ignore') # START ADDITIONAL FS code # Split sklearn_pipeline steps to find out the number of features left after selection if (not fs_modifier is None): fs_pipeline = None # Check for every pipeline step, starting with the last for i in reversed(range(len(sklearn_pipeline.steps))): # Whether it is a feature selector if (sklearn_pipeline.steps[i][0]) in feature_selectors: fs_pipeline = Pipeline( steps=sklearn_pipeline.steps[:i + 1]) break # If part of it is a feature selection pipeline: if not fs_pipeline is None: n_features = fs_pipeline.fit_transform(features, target).shape[1] else: n_features = features.shape[1] # END ADDITIONAL FS code scores = [ _fit_and_score(estimator=clone(sklearn_pipeline), X=features, y=target, scorer=scorer, train=train, test=test, verbose=0, parameters=None, fit_params=sample_weight_dict) for train, test in cv_iter ] # START ADDITIONAL FS code # Alter the score by removing a set number of features if not fs_modifier is None: scores = [[ unlisted_score * (fs_modifier**n_features) for unlisted_score in score ] for score in scores] print( "The final score becomes %f, after multiplying it it by %.2f for %i features and modifier %.2f" % (np.average(np.asarray(scores)), fs_modifier** n_features, n_features, fs_modifier)) # END ADDITIONAL FS code CV_score = np.array(scores)[:, 0] return np.nanmean(CV_score) except TimeoutException: return "Timeout" except Exception as e: return -float('inf')
def _fit(self, X, y, groups, parameter_iterable): estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) if hasattr(cv, 'random_state'): if not cv.random_state: cv.random_state = randint(1000, 9999) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) param_grid = [(parameters, test_sequence_index) for parameters in parameter_iterable for test_sequence_index in range(n_splits)] # Because the original python code expects a certain order for the elements, we need to # respect it. indexed_param_grid = list(zip(range(len(param_grid)), param_grid)) par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid)) X_bc = self.sc.broadcast(X) y_bc = self.sc.broadcast(y) groups_bc = self.sc.broadcast(groups) scorer = self.scorer_ verbose = self.verbose error_score = self.error_score fit_params = self.fit_params return_train_score = self.return_train_score fas = _fit_and_score def fun(tup): (index, (parameters, test_sequence_index)) = tup local_estimator = clone(base_estimator) local_X = X_bc.value local_y = y_bc.value local_groups = groups_bc.value train, test = next( islice(cv.split(local_X, local_y, local_groups), test_sequence_index, test_sequence_index + 1)) res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose, parameters, fit_params, return_train_score=return_train_score, return_n_test_samples=True, return_times=True, return_parameters=True, error_score=error_score) return (index, res) indexed_out0 = dict(par_param_grid.map(fun).collect()) out = [indexed_out0[idx] for idx in range(len(param_grid))] if return_train_score: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) else: (test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) X_bc.unpersist() y_bc.unpersist() groups_bc.unpersist() candidate_params = parameters[::n_splits] n_candidates = len(candidate_params) results = dict() def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" # When iterated first by splits, then by parameters array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) if splits: for split_i in range(n_splits): results["split%d_%s" % (split_i, key_name)] = array[:, split_i] array_means = np.average(array, axis=1, weights=weights) results['mean_%s' % key_name] = array_means # Weighted std is not directly available in numpy array_stds = np.sqrt( np.average((array - array_means[:, np.newaxis])**2, axis=1, weights=weights)) results['std_%s' % key_name] = array_stds if rank: results["rank_%s" % key_name] = np.asarray(rankdata( -array_means, method='min'), dtype=np.int32) # Computed the (weighted) mean and std for test scores alone # NOTE test_sample counts (weights) remain the same for all candidates test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) _store('test_score', test_scores, splits=True, rank=True, weights=test_sample_counts if self.iid else None) if self.return_train_score: _store('train_score', train_scores, splits=True) _store('fit_time', fit_time) _store('score_time', score_time) best_index = 
np.flatnonzero(results["rank_test_score"] == 1)[0] best_parameters = candidate_params[best_index] # Use one MaskedArray and mask all the places where the param is not # applicable for that candidate. Use defaultdict as each candidate may # not contain all the params param_results = defaultdict( partial(MaskedArray, np.empty(n_candidates, ), mask=True, dtype=object)) for cand_i, params in enumerate(candidate_params): for name, value in params.items(): # An all masked empty array gets created for the key # `"param_%s" % name` at the first occurence of `name`. # Setting the value at an index also unmasks that index param_results["param_%s" % name][cand_i] = value results.update(param_results) # Store a list of param dicts at the key 'params' results['params'] = candidate_params self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best_parameters) if y is not None: best_estimator.fit(X, y, **fit_params) else: best_estimator.fit(X, **fit_params) self.best_estimator_ = best_estimator return self
def score(self, X, y=None, groups=None, n_jobs=1, verbose=0, pre_dispatch='2*n_jobs'): """ Will score the estimator and score according to self.cv """ X, y, groups = indexable(X, y, groups) if not isinstance( self.random_state, (numbers.Integral, np.integer)) and self.use_same_random_state: raise ValueError( "If use_same_randome_state, the random state passed in must be an Integer" ) def clone_estimator(): """Clone the estimator and put in the correct random state for the nested cross validation """ estimator = clone(self.estimator) if self.use_same_random_state and ( 'random_state' in estimator.get_params().keys()): estimator.set_params(random_state=self.random_state) return estimator cv = check_cv2(self.cv, y, classifier=is_classifier(self.estimator), random_state=self.random_state) self.cv_iter_ = list(cv.split(X, y, groups)) scorer = check_scoring(self.estimator, scoring=self.scoring) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel( delayed(_fit_and_score_with_extra_data)(clone_estimator(), X, y, scorer, train, test, verbose, None, self.fit_params, return_train_score=True, return_times=True, return_estimator=True) for train, test in self.cv_iter_) (self.train_score_datas_, self.train_scores_, self.test_score_datas_, self.test_scores_, self.fit_times_, self.score_times_, self.estimators_) = zip(*scores) if hasattr(self.estimators_[0], 'best_params_'): self.best_params_ = [ estimator.best_params_ for estimator in self.estimators_ ] else: print("WARN: NestedCV.best_params_ set to None") self.best_params_ = None if hasattr(self.estimators_[0], 'best_index_'): self.best_idxs_ = [ estimator.best_index_ for estimator in self.estimators_ ] else: print("WARN: NestedCV.best_idxs_ set to None") self.best_idxs_ = None return self.test_scores_
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) if slversion == 18: cv = check_cv(cv, y, classifier=is_classifier(estimator)) # cv is actually not the same generator else: cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) param_grid = [(parameters, train, test) for parameters in parameter_iterable for (train, test) in cv] # Because the original python code expects a certain order for the elements, we need to # respect it. indexed_param_grid = list(zip(range(len(param_grid)), param_grid)) par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid)) X_bc = self.sc.broadcast(X) y_bc = self.sc.broadcast(y) scorer = self.scorer_ verbose = self.verbose fit_params = self.fit_params error_score = self.error_score fas = _fit_and_score def fun(tup): (index, (parameters, train, test)) = tup local_estimator = clone(base_estimator) local_X = X_bc.value local_y = y_bc.value res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose, parameters, fit_params, return_parameters=True, error_score=error_score) return (index, res) indexed_out0 = dict(par_param_grid.map(fun).collect()) out = [indexed_out0[idx] for idx in range(len(param_grid))] X_bc.unpersist() y_bc.unpersist() # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple(parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params(**best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def _fit(self, Z, parameter_iterable): """Actual fitting, performing the search over parameters.""" self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) cv = self.cv cv = _check_cv(cv, Z) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch, backend="threading" )( delayed(_fit_and_score)(clone(base_estimator), Z, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for parameters in parameter_iterable for train, test in cv) # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) best_estimator.fit(Z, **self.fit_params) self.best_estimator_ = best_estimator return self
def cross_val_multiscore(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): """Evaluate a score by cross-validation. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like, shape (n_samples, n_dimensional_features,) The data to fit. Can be, for example a list, or an array at least 2d. y : array-like, shape (n_samples, n_targets,) The target variable to try to predict in the case of supervised learning. groups : array-like, with shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. scoring : string, callable | None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. cv : int, cross-validation generator | iterable Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross validation, - integer, to specify the number of folds in a ``(Stratified)KFold``, - An object to be used as a cross-validation generator. - An iterable yielding train, test splits. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`sklearn.model_selection.StratifiedKFold` is used. In all other cases, :class:`sklearn.model_selection.KFold` is used. n_jobs : integer, optional The number of CPUs to use to do the computation. -1 means 'all CPUs'. verbose : integer, optional The verbosity level. fit_params : dict, optional Parameters to pass to the fit method of the estimator. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' Returns ------- scores : array of float, shape (n_splits,) | shape (n_splits, n_scores) Array of scores of the estimator for each run of the cross validation. """ # This code is copied from sklearn from sklearn.base import clone from sklearn.utils import indexable from sklearn.metrics.scorer import check_scoring from sklearn.model_selection._split import check_cv X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) cv_iter = list(cv.split(X, y, groups)) scorer = check_scoring(estimator, scoring=scoring) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. # Note: this parallelization is implemented using MNE Parallel parallel, p_func, n_jobs = parallel_func(_fit_and_score, n_jobs, pre_dispatch=pre_dispatch) scores = parallel( p_func(clone(estimator), X, y, scorer, train, test, verbose, None, fit_params) for train, test in cv_iter) return np.array(scores)[:, 0, ...] # flatten over joblib output.
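# Hedged usage sketch: for a plain scikit-learn classifier this behaves like
# cross_val_score and returns one score per split; with estimators that emit
# per-time-point scores (e.g. a sliding estimator) the MNE version instead
# returns an array of shape (n_splits, n_scores). Dataset is illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

X, y = make_classification(n_samples=120, random_state=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=cv,
                         scoring="roc_auc")
print(scores.shape, scores.mean())        # (5,) and the mean AUC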
def __init__(self, parameters, estimator, X=None, y=None, model='GCP',
             score_format='cv', fit_params=None, scoring=None, cv=5,
             acquisition_function='UCB', corr_kernel='squared_exponential',
             n_clusters=1, n_clusters_max=5, cluster_evol='constant',
             GCP_mapWithNoise=False, GCP_useAllNoisyY=False,
             model_noise=None, n_iter=100, n_init=10, n_final_iter=5,
             n_candidates=500, nugget=1.e-10, detailed_res=1, verbose=1):
    self.parameters = parameters
    self.n_parameters = len(parameters)
    self.n_iter = n_iter
    self.n_init = n_init
    self.n_final_iter = n_final_iter
    self.n_candidates = n_candidates
    self.param_names = sorted(parameters.keys())
    self.param_isInt = np.array(
        [0 if (parameters[k][0] == 'float') else 1 for k in self.param_names])
    self.param_bounds = np.zeros((self.n_parameters, 2))
    self.verbose = verbose
    self.scoring = scoring
    self.estimator = estimator
    self.fit_params = fit_params if fit_params is not None else {}
    self.cv = cv
    self.X = X
    self.y = y
    self.model = model
    self.score_format = score_format  # 'cv' or 'avg'
    self.acquisition_function = acquisition_function
    self.corr_kernel = corr_kernel
    self.n_clusters = n_clusters
    self.n_clusters_max = n_clusters_max
    self.cluster_evol = cluster_evol
    self.GCP_mapWithNoise = GCP_mapWithNoise
    self.GCP_useAllNoisyY = GCP_useAllNoisyY
    self.model_noise = model_noise
    self.GCP_upperBound_coef = 1.96
    self.nugget = nugget
    self.detailed_res = detailed_res
    self.best_parameter_ = None
    self.tested_parameters_ = None
    self.cv_scores_ = None

    if cluster_evol != 'constant':
        self.GCP_args = [corr_kernel, 1, GCP_mapWithNoise, GCP_useAllNoisyY,
                         model_noise, nugget, self.GCP_upperBound_coef]
    else:
        self.GCP_args = [corr_kernel, n_clusters, GCP_mapWithNoise,
                         GCP_useAllNoisyY, model_noise, nugget,
                         self.GCP_upperBound_coef]

    if callable(estimator):
        self._callable_estimator = True
        if verbose:
            print('Estimator is a callable and not an sklearn Estimator')
    else:
        self._callable_estimator = False

    if not self._callable_estimator:
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    # init param_bounds
    for i in range(self.n_parameters):
        if parameters[self.param_names[i]][0] == 'cat':
            self.param_bounds[i, 0] = 0
            self.param_bounds[i, 1] = len(parameters[self.param_names[i]][1])
        else:
            self.param_bounds[i] = np.array(parameters[self.param_names[i]][1])
            if parameters[self.param_names[i]][0] == 'int':
                self.param_bounds[i, 1] += 1

    if self.verbose:
        print(self.parameters)
        print(self.param_names)
        print(self.param_isInt)
        print(self.param_bounds)
def _fit(self, X, y):
    X, y = check_X_y(X, y, "csr")
    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    if self.max_features is not None:
        if not isinstance(self.max_features, numbers.Integral):
            raise TypeError("'max_features' should be an integer between 1 and {} features."
                            " Got {!r} instead."
                            .format(n_features, self.max_features))
        elif self.max_features < 1 or self.max_features > n_features:
            raise ValueError("'max_features' should be between 1 and {} features."
                             " Got {} instead."
                             .format(n_features, self.max_features))
        max_features = self.max_features
    else:
        max_features = n_features

    estimator = clone(self.estimator)

    # Genetic Algorithm
    toolbox = base.Toolbox()

    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat,
                     creator.Individual, toolbox.attr_bool, n=n_features)
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", _evalFunction, gaobject=self,
                     estimator=estimator, X=X, y=y, cv=cv, scorer=scorer,
                     verbose=self.verbose, fit_params=self.fit_params,
                     max_features=max_features, caching=self.caching)
    toolbox.register("mate", tools.cxUniform,
                     indpb=self.crossover_independent_proba)
    toolbox.register("mutate", tools.mutFlipBit,
                     indpb=self.mutation_independent_proba)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning.")
    elif self.n_jobs > 1:
        pool = multiprocessing.Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)
    elif self.n_jobs < 0:
        pool = multiprocessing.Pool(
            processes=max(cpu_count() + 1 + self.n_jobs, 1))
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.n_population)
    hof = tools.HallOfFame(1, similar=np.array_equal)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    if self.verbose > 0:
        print("Selecting features with genetic algorithm.")

    _, log = algorithms.eaSimple(pop, toolbox, cxpb=self.crossover_proba,
                                 mutpb=self.mutation_proba,
                                 ngen=self.n_generations,
                                 stats=stats, halloffame=hof,
                                 verbose=self.verbose)
    if self.n_jobs != 1:
        pool.close()
        pool.join()

    # Set final attributes
    support_ = np.array(hof, dtype=bool)[0]  # np.bool is deprecated; use bool
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, support_], y)

    self.generation_scores_ = np.array(
        [score for score, _ in log.select("max")])
    self.n_features_ = support_.sum()
    self.support_ = support_

    return self
def _fit(self, X, y, parameter_iterable=None):
    if parameter_iterable is not None:
        raise NotImplementedError('The parameter_iterable argument is not supported.')

    # Actual fitting, performing the search over parameters.
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # setup SigOpt experiment and run optimization
    n_folds = len(cv)
    self._create_sigopt_exp(self.sigopt_connection, n_folds)

    # start tracking time to optimize estimator
    opt_start_time = time.time()
    for jk in range(0, self.n_iter, self.n_sug):
        # check for opt timeout, ensuring at least 1 observation
        # TODO : handling failure observations
        if (
            self.opt_timeout is not None and
            time.time() - opt_start_time > self.opt_timeout and
            jk >= 1
        ):
            # break out of loop and refit model with best params so far
            break

        suggestions = []
        jobs = []
        for _ in range(self.n_sug):
            for train, test in cv:
                suggestion = self.sigopt_connection.experiments(self.experiment.id).suggestions().create()
                parameters = self._convert_sigopt_api_to_sklearn_assignments(suggestion.assignments.to_json())
                suggestions.append(suggestion)
                jobs.append([parameters, train, test])

        if self.verbose > 0:
            print('Evaluating params : ', [job[0] for job in jobs])

        # do CV folds in parallel using joblib
        # returns scores on test set
        obs_timed_out = False
        try:
            par_kwargs = {'n_jobs': self.n_jobs, 'verbose': self.verbose,
                          'pre_dispatch': pre_dispatch}
            # add timeout kwarg if version of joblib supports it
            if 'timeout' in getfullargspec(Parallel.__init__).args:
                par_kwargs['timeout'] = self.cv_timeout
            out = Parallel(
                **par_kwargs
            )(
                delayed(_fit_and_score)(clone(base_estimator), X, y,
                                        self.scorer_, train, test,
                                        self.verbose, parameters,
                                        self.fit_params,
                                        return_parameters=True,
                                        error_score=self.error_score)
                for parameters, train, test in jobs)
        except TimeoutError:
            obs_timed_out = True

        if not obs_timed_out:
            # grab scores from results
            for sidx, suggestion in enumerate(suggestions):
                score = out[sidx][0]
                self.sigopt_connection.experiments(self.experiment.id).observations().create(
                    suggestion=suggestion.id,
                    value=score)
        else:
            # observation timed out so report a failure
            self.sigopt_connection.experiments(self.experiment.id).observations().create(
                suggestion=suggestion.id,
                failed=True)

    # return best SigOpt assignments so far
    best_assignments = self.sigopt_connection.experiments(self.experiment.id).best_assignments().fetch().data

    if not best_assignments:
        raise RuntimeError(
            'No valid observations found. '
            'Make sure opt_timeout and cv_timeout provide sufficient time for observations to be reported.')

    self.best_params_ = self._convert_sigopt_api_to_sklearn_assignments(
        best_assignments[0].assignments.to_json())
    self.best_score_ = best_assignments[0].value

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**self.best_params_)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator

    return self
def _fit(self, X, y, parameter_dict):
    self._cv_results = None  # To indicate to the property the need to update
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, est=clone(self.estimator),
                   fitness=creator.FitnessMax)

    toolbox = base.Toolbox()

    name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
    if self.gene_type is None:
        self.gene_type = gene_type

    if self.verbose:
        print("Types %s and maxint %s detected" % (self.gene_type, maxints))

    toolbox.register("individual", _initIndividual, creator.Individual,
                     maxints=maxints)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    if self.n_jobs > 1:
        pool = Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)

    toolbox.register("evaluate", _evalFunction,
                     name_values=name_values, X=X, y=y,
                     scorer=self.scorer_, cv=cv, iid=self.iid,
                     verbose=self.verbose, error_score=self.error_score,
                     fit_params=self.fit_params,
                     score_cache=self.score_cache)

    toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob,
                     gene_type=self.gene_type)

    toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob,
                     up=maxints)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    pop = toolbox.population(n=self.population_size)
    hof = tools.HallOfFame(1)

    # Stats
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.nanmean)
    stats.register("min", np.nanmin)
    stats.register("max", np.nanmax)

    # History
    hist = tools.History()
    toolbox.decorate("mate", hist.decorator)
    toolbox.decorate("mutate", hist.decorator)
    hist.update(pop)

    if self.verbose:
        print('--- Evolve in {0} possible combinations ---'.format(
            np.prod(np.array(maxints) + 1)))

    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                       ngen=self.generations_number,
                                       stats=stats, halloffame=hof,
                                       verbose=self.verbose)

    # Save History
    self.all_history_.append(hist)

    current_best_score_ = hof[0].fitness.values[0]
    current_best_params_ = _individual_to_params(hof[0], name_values)

    if self.verbose:
        print("Best individual is: %s\nwith fitness: %s" % (
            current_best_params_, current_best_score_))

    if current_best_score_ > self.best_mem_score_:
        self.best_mem_score_ = current_best_score_
        self.best_mem_params_ = current_best_params_

    # Check memoization, potentially unknown bug
    assert str(hof[0]) in self.score_cache, \
        "Best individual not stored in score_cache for cv_results_."

    if self.n_jobs > 1:
        pool.close()
        pool.join()
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    X, y = _indexable(X, y)

    # for debugging
    assert not isinstance(X, pd.DataFrame)
    assert not isinstance(y, pd.DataFrame)

    # begin sklearn code
    estimator = self.estimator
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
    # n_samples = _num_samples(X)  # don't need for now...
    cv = self.cv
    cv = _set_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # get groups, add it to kwargs
    X, y, groups = _get_groups(X, y)
    kwargs = {'groups': groups}

    # test_score, n_samples, _, parameters
    out = _do_fit(self.n_jobs, self.verbose, pre_dispatch, base_estimator,
                  X, y, self.scorer_, parameter_iterable, self.fit_params,
                  self.error_score, cv, **kwargs)

    # Out is a list of 4-tuples: (score, n_test_samples, time, parameters),
    # one per (candidate, fold) combination.
    n_fits = len(out)
    n_folds = _cv_len(cv, X, y)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def fit(self, X, y):
    """Fit the RFE model and automatically tune the number of selected
    features.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the total number of features.

    y : array-like, shape = [n_samples]
        Target values (integers for classification, real numbers for
        regression).
    """
    X, y = check_X_y(X, y, "csr")

    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]
    n_features_to_select = 1

    if 0.0 < self.step < 1.0:
        step = int(max(1, self.step * n_features))
    else:
        step = int(self.step)
    if step <= 0:
        raise ValueError("Step must be >0")

    rfe = RFE(estimator=self.estimator,
              n_features_to_select=n_features_to_select,
              step=self.step, verbose=self.verbose)

    # Determine the number of subsets of features by fitting across
    # the train folds and choosing the "features_to_select" parameter
    # that gives the least averaged error across all folds.

    # Note that joblib raises a non-picklable error for bound methods
    # even if n_jobs is set to 1 with the default multiprocessing
    # backend.
    # This branching is done to make sure that user code that sets
    # n_jobs to 1 and provides bound methods as scorers is not broken
    # with the addition of the n_jobs parameter in version 0.18.
    if self.n_jobs == 1:
        parallel, func = list, _rfe_single_fit
    else:
        parallel, func = Parallel(n_jobs=self.n_jobs), delayed(_rfe_single_fit)

    scores = parallel(
        func(rfe, self.estimator, X, y, train, test, scorer)
        for train, test in cv.split(X, y))

    scores = np.sum(scores, axis=0)
    n_features_to_select = max(
        n_features - (np.argmax(scores) * step),
        n_features_to_select)

    # Re-execute an elimination with best_k over the whole set
    rfe = RFE(estimator=self.estimator,
              n_features_to_select=n_features_to_select, step=self.step)

    rfe.fit(X, y)

    # Set final attributes
    self.support_ = rfe.support_
    self.n_features_ = rfe.n_features_
    self.ranking_ = rfe.ranking_
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(self.transform(X), y)

    # Fixing a normalization error: n is equal to get_n_splits(X, y) - 1
    # here, so the scores are normalized by get_n_splits(X, y)
    self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y)
    return self
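# Hedged usage sketch, assuming the fit() above belongs to an RFECV-style
# feature selector; the class name RFECVLike is hypothetical and used only
# for illustration, as is the demo data.
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=60, n_features=8,
                                     n_informative=4, random_state=0)
selector = RFECVLike(estimator=SVC(kernel="linear"), step=1, cv=3)
selector.fit(X_demo, y_demo)
print(selector.n_features_)   # number of features retained
print(selector.support_)      # boolean mask of selected features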
def _fit(self, X, y):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # setup SigOpt experiment and run optimization
    self._create_sigopt_exp()
    for jk in range(self.n_iter):
        suggestion = self.conn.experiments(self.experiment.id).suggestions().create()
        parameters = suggestion.assignments.to_json()
        # convert all unicode names and values to plain strings
        non_unicode_parameters = self._convert_unicode_dict(parameters)
        if self.verbose > 0:
            print("Evaluating params : ", non_unicode_parameters)

        # do CV folds in parallel using joblib
        # returns scores on test set
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(_fit_and_score)(clone(base_estimator), X, y,
                                    self.scorer_, train, test,
                                    self.verbose, non_unicode_parameters,
                                    self.fit_params,
                                    return_parameters=True,
                                    error_score=self.error_score)
            for train, test in cv)

        # grab scores from results
        scores = [o[0] for o in out]
        self.conn.experiments(self.experiment.id).observations().create(
            suggestion=suggestion.id,
            value=numpy.mean(scores),
            value_stddev=numpy.std(scores)
        )

    # return best SigOpt observation so far
    best_obs = self.conn.experiments(self.experiment.id).fetch().progress.best_observation
    self.best_params_ = best_obs.assignments.to_json()
    # convert all unicode names and values to plain strings
    self.best_params_ = self._convert_unicode_dict(self.best_params_)
    self.best_score_ = best_obs.value

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **self.best_params_)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
def _fit(self, X, y, parameter_dict):
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError("Target variable (y) has a different number "
                             "of samples (%i) than data (X: %i samples)"
                             % (len(y), n_samples))
    cv = self.cv

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        # * y.shape[1]
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, est=clone(self.estimator),
                       fitness=creator.FitnessMax)

    toolbox = base.Toolbox()

    name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
    if self.gene_type is None:
        self.gene_type = gene_type

    if self.verbose:
        print("Types %s and maxint %s detected" % (self.gene_type, maxints))

    toolbox.register("individual", _initIndividual, creator.Individual,
                     maxints=maxints)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register("evaluate", _evalFunction,
                     name_values=name_values, X=X, y=y,
                     scorer=self.scorer_, cv=cv, iid=self.iid,
                     verbose=self.verbose, error_score=self.error_score,
                     fit_params=self.fit_params)

    toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob,
                     gene_type=self.gene_type)

    toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob,
                     up=maxints)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs > 1:
        pool = Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.population_size)
    hof = tools.HallOfFame(1)

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    if self.verbose:
        msg_template = "--- Evolve in {0} possible combinations ---"
        print(msg_template.format(np.prod(np.array(maxints) + 1)))

    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                       ngen=self.generations_number,
                                       stats=stats, halloffame=hof,
                                       verbose=self.verbose)

    current_best_score_ = hof[0].fitness.values[0]
    current_best_params_ = _individual_to_params(hof[0], name_values)

    if self.verbose:
        print("Best individual is: %s\nwith fitness: %s"
              % (current_best_params_, current_best_score_))

    if current_best_score_ > self.best_score_:
        self.best_score_ = current_best_score_
        self.best_params_ = current_best_params_

    if self.n_jobs > 1:
        # release the worker pool once evolution has finished
        pool.close()
        pool.join()