def cross_val_score_fn(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
                       verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation.

    This overrides the cross_val_score method typically found in
    cross_validation.py. Changes are clearly marked in comments, but the
    main change is augmenting the function to store Fit and Metric Events
    for each fold.
    """
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    # Default scoring scheme is 'accuracy' unless provided by user.
    if scoring is None:
        scoring = 'accuracy'

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)

    # Change from original scikit code: adding a new argument, scoring, to the
    # _fit_and_score function to track scoring function and create
    # MetricEvents.
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params, scoring)
                      for train, test in cv)
    return np.array(scores)[:, 0]
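# The snippet above targets the pre-0.18 ``sklearn.cross_validation`` API, where
# ``check_cv(cv, X, y, classifier=...)`` returns an object that is itself iterable over
# (train, test) index pairs. A minimal sketch of the same fold loop against the modern
# ``sklearn.model_selection`` API (shown for comparison; it is an assumption for
# illustration, not part of the original code):
import numpy as np
from sklearn.base import clone, is_classifier
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring
from sklearn.model_selection import check_cv

X, y = load_iris(return_X_y=True)
estimator = LogisticRegression(max_iter=1000)

# In >=0.18, check_cv no longer takes X and returns a splitter, not an index iterable.
cv = check_cv(5, y, classifier=is_classifier(estimator))
scorer = check_scoring(estimator, scoring='accuracy')

scores = []
for train, test in cv.split(X, y):
    est = clone(estimator).fit(X[train], y[train])
    scores.append(scorer(est, X[test], y[test]))
print(np.mean(scores))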
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y = indexable(X, y) cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator)) base_estimator = clone(self.estimator) best = best_parameters(base_estimator, cv, X, y, parameter_iterable, self.scorer_, self.fit_params, self.iid) best = best.compute() self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if isinstance(base_estimator, Pipeline): base_estimator = base_estimator.to_sklearn().compute() if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = base_estimator.set_params(**best.parameters) if y is not None: self.best_estimator_ = best_estimator.fit(X, y, **self.fit_params) else: self.best_estimator_ = best_estimator.fit(X, **self.fit_params) return self
def evaluate_estimator(datafile, estimator, task, metric=None, logger=None): if metric and metric not in METRIC: raise ValueError("Invalid metric") def scorer(estimator, X, y): if task in REGRESSION_TASKS: y_pr = estimator.predict(X) elif task in CLASSIFICATION_TASKS: y_pr = estimator.predict_proba(X, batch_size=1000) else: raise NotImplementedError() score = _calculate_score(y, y_pr, task, metric) return score eval_s = time.time() data_pkl = joblib.load(datafile, 'r') resampling = data_pkl['resampling'] if resampling == 'holdout': X_tr = data_pkl["X"] y_tr = data_pkl["y"] X_val = data_pkl["valid_X"] y_val = data_pkl["valid_y"] estimator.fit(X_tr, y_tr) score = scorer(estimator, X_val, y_val) elif resampling == 'cv': X, y = data_pkl["X"], data_pkl["y"] cv = cross_validation.check_cv(None, X, y, classifier=(task in CLASSIFICATION_TASKS)) score = defaultdict(list) if metric is None else [] for train, test in cv: X_tr, X_val = X[train], X[test] y_tr, y_val = y[train], y[test] estimator.fit(X_tr, y_tr) score_ = scorer(estimator, X_val, y_val) if metric is None: for m in score_: score[m].append(score_[m]) else: score.append(score_) if metric is None: for m in score: score[m] = np.mean(score[m]) else: score = np.mean(score) estimator.fit(X, y) else: raise NotImplementedError() eval_e = time.time() if logger: logger.debug("Evaluation done, score: %s | %s sec\n%s" % (score, eval_e - eval_s, estimator)) return score
def fit(self, X, y): if self.n_neighbors_try is None: n_neighbors_try = range(1, 6) else: n_neighbors_try = self.n_neighbors_try X = check_array(X, accept_sparse='csr', copy=True) X = normalize(X, norm='l1', copy=False) cv = check_cv(self.cv, X, y) knn = KNeighborsClassifier(metric='precomputed', algorithm='brute') scorer = check_scoring(knn, scoring=self.scoring) scores = [] for train_ix, test_ix in cv: dist = self._pairwise_wmd(X[test_ix], X[train_ix]) knn.fit(X[train_ix], y[train_ix]) scores.append([ scorer(knn.set_params(n_neighbors=k), dist, y[test_ix]) for k in n_neighbors_try ]) scores = np.array(scores) self.cv_scores_ = scores best_k_ix = np.argmax(np.mean(scores, axis=0)) best_k = n_neighbors_try[best_k_ix] self.n_neighbors = self.n_neighbors_ = best_k return super(WordMoversKNNCV, self).fit(X, y)
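# The selection step above reduces the (n_folds, n_candidates) score matrix to a
# per-candidate mean over folds and keeps the argmax column. A toy illustration of just
# that step, with made-up numbers:
import numpy as np

n_neighbors_try = range(1, 6)
scores = np.array([[0.70, 0.74, 0.78, 0.76, 0.71],   # fold 1
                   [0.68, 0.75, 0.80, 0.74, 0.70]])  # fold 2
best_k_ix = np.argmax(np.mean(scores, axis=0))       # column index 2
best_k = list(n_neighbors_try)[best_k_ix]            # -> k = 3
print(best_k)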
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y = indexable(X, y) cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator)) base_estimator = clone(self.estimator) best = best_parameters(base_estimator, cv, X, y, parameter_iterable, self.scorer_, self.fit_params, self.iid) best = best.compute() self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if isinstance(base_estimator, Pipeline): base_estimator = base_estimator.to_sklearn().compute() if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = base_estimator.set_params(**best.parameters) if y is not None: self.best_estimator_ = best_estimator.fit( X, y, **self.fit_params) else: self.best_estimator_ = best_estimator.fit(X, **self.fit_params) return self
def Bootstrap_cv(estimator1, estimator2, X, y, score_func, cv=None, n_jobs=1, verbose=0, ratio=.5): X, y = cross_validation.check_arrays(X, y, sparse_format='csr') cv = cross_validation.check_cv(cv, X, y, classifier= cross_validation.is_classifier(estimator1)) if score_func is None: if not hasattr(estimator1, 'score') or \ not hasattr(estimator2, 'score'): raise TypeError( "If no score_func is specified, the estimator passed " "should have a 'score' method. The estimator %s " "does not." % estimator1) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. scores = \ cross_validation.Parallel( n_jobs=n_jobs, verbose=verbose)( cross_validation.delayed( dual_cross_val_score) (cross_validation.clone(estimator1), cross_validation.clone(estimator2), X, y, score_func, train, test, verbose, ratio) for train, test in cv) return np.array(scores)
def fit(self, epochs, y=None): from sklearn.cross_validation import check_cv, StratifiedKFold from mne.decoding.time_gen import _check_epochs_input X, y, self.gat.picks_ = _check_epochs_input(epochs, y, self.gat.picks) gat_list = list() cv = self.cv if isinstance(cv, (int, np.int)): cv = StratifiedKFold(y, cv) cv = check_cv(cv, X, y, classifier=True) # Construct meta epoch and fit gat with a single fold for ii, (train, test) in enumerate(cv): # meta trial epochs_ = make_meta_epochs(epochs[train], y[train], n_bin=self.n) # fit gat gat_ = deepcopy(self.gat) cv_one_fold = [(range(len(epochs_)), [])] gat_.cv = cv_one_fold gat_.fit(epochs_, epochs_.events[:, 2]) gat_list.append(gat_) # gather self.gat = gat_ self.gat.train_times_ = gat_.train_times_ self.gat.estimators_ = np.squeeze( [gat.estimators_ for gat in gat_list]).T.tolist() self.gat.cv_ = cv self.gat.y_train_ = y
def benchmark(clf, X, y, cv=None): X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) cv = check_cv(cv, X, y, classifier=is_classifier(clf)) # learning_curve_ = learning_curve(clf, X_all, y_all, cv=cv) train_times = [] test_times = [] confusion_matrices = [] confusion_matrix_indices = [] coefs = [] for train, test in cv: X_train, y_train = X[train], y[train] X_test, y_test = X[test], y[test] t0 = time() clf.fit(X_train, y_train) train_times.append(time()-t0) t0 = time() y_pred = clf.predict(X_test) test_times.append(time()-t0) confusion_matrices.append(confusion_matrix(y_test, y_pred)) confusion_matrix_indices.append(np.array([[test[pred] for pred in true] for true in confusion_matrix_instances(y_test, y_pred)])) coefs.append(clf.coef_) return dict( train_times = np.array(train_times), test_times = np.array(test_times), confusion_matrices = np.array(confusion_matrices), confusion_matrix_indices = np.array(confusion_matrix_indices), coefs = np.array(coefs) )
def cross_val_score(estimator, X, y=None, score_func=None, cv=None, n_jobs=-1, verbose=0, as_dvalues=False): """Evaluate a score by cross-validation. Replacement of :func:`sklearn.cross_validation.cross_val_score`, used to support computation of decision values. """ X, y = check_arrays(X, y, sparse_format='csr') cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if score_func is None: if not hasattr(estimator, 'score'): raise TypeError( "If no score_func is specified, the estimator passed " "should have a 'score' method. The estimator %s " "does not." % estimator) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. scores = Parallel(n_jobs=n_jobs, verbose=verbose)( delayed(_cross_val_score)(clone(estimator), X, y, score_func, train, test, verbose, as_dvalues) for train, test in cv) return np.array(scores)
def fit(self, X, y): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = indexable(X, y) param_grid = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(param_grid, Sized): n_candidates = len(param_grid) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) indexed_param_grid = list(zip(range(len(param_grid)), param_grid)) par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid)) X_bc = self.sc.broadcast(X) y_bc = self.sc.broadcast(y) scorer = self.scorer_ indexed_output = dict( par_param_grid.map( lambda i: local_fit(i[0], i[1], base_estimator, X_bc.value, y_bc.value, scorer, cv)).collect()) out = [indexed_output[idx] for idx in range(len(param_grid))] X_bc.unpersist() y_bc.unpersist() best = sorted(out, key=lambda x: x[0], reverse=True)[0] self.best_params_ = best[1] self.best_score_ = best[0] if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params(**best[1]) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
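# ``ParameterSampler`` is what turns ``param_distributions`` into the candidate list that
# gets indexed and parallelized above. A small standalone sketch of that sampling step,
# using the modern ``sklearn.model_selection`` import and arbitrary example distributions:
from scipy.stats import randint, uniform
from sklearn.model_selection import ParameterSampler

param_distributions = {'C': uniform(0.1, 10.0), 'max_depth': randint(2, 8)}
sampler = ParameterSampler(param_distributions, n_iter=4, random_state=0)
indexed_param_grid = list(zip(range(len(sampler)), sampler))
for idx, params in indexed_param_grid:
    print(idx, params)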
def _grid_search(self, train_X, train_y): if callable(self.inner_cv): inner_cv = self.inner_cv(train_X, train_y) else: inner_cv = check_cv(self.inner_cv, train_X, train_y, classifier=is_classifier(self.estimator)) master = MPIGridSearchCVMaster(self.param_grid, inner_cv, self.estimator, self.scorer_, self.fit_params) return master.run(train_X, train_y)
def add_del_cv(df, predictors, target, model, scoring='roc_auc', cv1=None, n_folds=8, n_jobs=-1, start=[], selmax=None, selmin=1, min_ratio=1e-7, max_steps=10, verbosity=0): """ Forward-Backward (ADD-DEL) selection using model. Parameters ---------- Returns ------- selected: list selected predictors Example ------- References ---------- """ def test_to_break(selected, selected_curr, to_break): if set(selected) == set(selected_curr): to_break += 1 else: to_break = 0 return to_break X, y, _ = df_xyf(df, predictors=predictors, target=target) cv1 = cross_validation.check_cv( cv1, X=X, y=y, classifier=is_classifier(model)) selected_curr = start to_break = 0 for i_step in xrange(max_steps): selected = forward_cv( df, predictors, target, model, scoring=scoring, cv1=cv1, n_folds=n_folds, n_jobs=n_jobs, start=selected_curr, selmax=selmax, min_ratio=min_ratio, verbosity=verbosity-1) to_break = test_to_break(selected, selected_curr, to_break) selected_curr = selected if verbosity > 0: print('forward:', ' '.join(selected_curr)) if to_break > 1: break selected = backward_cv( df, selected_curr, target, model, scoring=scoring, cv1=cv1, n_folds=n_folds, n_jobs=n_jobs, selmin=selmin, min_ratio=min_ratio, verbosity=verbosity-1) to_break = test_to_break(selected, selected_curr, to_break) selected_curr = selected if verbosity > 0: print('backward:', ' '.join(selected_curr)) if to_break > 0: break return selected_curr
def score(self, test_parameter):
    """
    The score function to call in order to evaluate the quality
    of the parameter test_parameter

    Parameters
    ----------
    test_parameter : dict, the parameter to test

    Returns
    -------
    score : the CV score, either the list of all cv results or
        the mean (depending on score_format)
    """
    if not self._callable_estimator:
        cv = check_cv(self.cv, self.X, self.y,
                      classifier=is_classifier(self.estimator))
        cv_score = [_fit_and_score(clone(self.estimator), self.X, self.y,
                                   self.scorer_, train, test, False,
                                   test_parameter, self.fit_params,
                                   return_parameters=True)
                    for train, test in cv]

        n_test_samples = 0
        mean_score = 0
        detailed_score = []
        for tmp_score, tmp_n_test_samples, _, _ in cv_score:
            detailed_score.append(tmp_score)
            tmp_score *= tmp_n_test_samples
            n_test_samples += tmp_n_test_samples
            mean_score += tmp_score
        mean_score /= float(n_test_samples)

        if self.score_format == 'avg':
            score = mean_score
        else:  # format == 'cv'
            score = detailed_score

    else:
        if self.score_format == 'avg':
            score = [self.estimator(test_parameter)]
        else:  # format == 'cv'
            score = self.estimator(test_parameter)

    return score
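# The 'avg' branch above is a test-size-weighted mean over folds,
# mean = sum(score_i * n_i) / sum(n_i). A worked example with made-up fold results:
cv_score = [(0.80, 40), (0.90, 60)]           # (fold score, n_test_samples)
n_test_samples = sum(n for _, n in cv_score)  # 100
mean_score = sum(s * n for s, n in cv_score) / float(n_test_samples)
print(mean_score)                             # 0.86, not the unweighted 0.85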
def dynamic_cross_val_predict(estimator, fv, esa_feature_list, unigram_feature_list,
                              dynamic_X, y=None, cv=None, verbose=0, fit_params=None):
    print "dynamic predict cross val with %s" % (esa_feature_list + unigram_feature_list)

    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)
    cv = cross_validation.check_cv(cv, X, y,
                                   classifier=cross_validation.is_classifier(estimator))

    preds_blocks = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X on every step
        for i in range(0, len(fv)):  # each i refers to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points to esa_vec
            for feature in esa_feature_list:
                # update the i-th feature dict with the ESA feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])
            for feature in unigram_feature_list:
                # update the i-th feature dict with the unigram feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        preds_blocks.append(cross_validation._fit_and_predict(
            cross_validation.clone(estimator), X, y, train, test,
            verbose, fit_params))
        cross_val_step += 1

    preds = [p for p, _ in preds_blocks]
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not cross_validation._check_is_partition(locs, cross_validation._num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')
    inv_locs = np.empty(len(locs), dtype=int)
    inv_locs[locs] = np.arange(len(locs))

    # Check for sparse predictions
    if sp.issparse(preds[0]):
        preds = sp.vstack(preds, format=preds[0].format)
    else:
        preds = np.concatenate(preds)
    return preds[inv_locs]
def fit(self, X, y): """Actual fitting, performing the search over parameters.""" parameter_iterable = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring, parameters, cv=cv) for parameters in parameter_iterable) best = sorted(out, reverse=True)[0] self.best_params_ = best[1] self.best_score_ = best[0] if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best[1]) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def fit(self, X, y=None, sample_weight=None, exposure=None): # For later parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch, max_nbytes=None) # Extract arguments fit_args = self._process_args(X=X, y=y, sample_weight=sample_weight, exposure=exposure) # Sort out cv parameters if self.cv == 1: cv = no_cv(X=X, y=y) else: if hasattr(self.cv, 'split'): cv_args = dict(X=X) if y is not None: cv_args['y'] = np.ravel(y) cv = self.cv.split(**cv_args) else: cv_args = dict(X=X) if y is not None: cv_args['y'] = shrinkd(1,np.asarray(y)) cv = check_cv(self.cv, classifier=is_classifier(self.estimator), **cv_args) # Do the cross validation fits # print(valmap(lambda x: x.shape, fit_args)) # print('num_folds = %d' % self.cv.get_n_splits(X=X)) cv_fits = parallel(delayed(_fit_and_predict)(clone(self.estimator), fit_args, train, test, self.verbose) for train, test in cv) # Combine predictions from cv fits prediction = np.empty_like(y) if y is not None else np.empty(shape=X.shape[0]) for fit in cv_fits: safe_assign_subset(prediction, fit[2], fit[1]) # Store cross validation models self.cv_estimators_ = [fit[0] for fit in cv_fits] self.cv_indices_ = [fit[2] for fit in cv_fits] self.cv_predictions_ = prediction # If a metric was provided, compute the score if self.metric is not None: metric_args = {} if 'sample_weight' in fit_args: metric_args['sample_weight'] = fit_args['sample_weight'] if 'exposure' in fit_args: metric_args['exposure'] = fit_args['exposure'] self.score_ = safer_call(self.metric, y, self.cv_predictions_, **metric_args) # Fit on entire data set self.estimator_ = clone(self.estimator) self.estimator_.fit(**fit_args) return self
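# The "combine predictions from cv fits" step above writes each fold's test-set
# predictions back into their original row positions. A minimal numpy sketch of that
# bookkeeping, using plain fancy indexing in place of ``safe_assign_subset`` (assumed to
# be a project-specific helper) and a doubled target as a stand-in for real predictions:
import numpy as np
from sklearn.model_selection import KFold

y = np.arange(10, dtype=float)
prediction = np.empty_like(y)
for train, test in KFold(n_splits=5).split(y):
    prediction[test] = y[test] * 2.0   # stand-in for the per-fold model's predictions
print(prediction)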
def test_searchlight(): # Create a toy dataset to run searchlight on # Initialize with 4x4x4 scans of random values on 30 frames rand = np.random.RandomState(0) frames = 30 data = rand.rand(5, 5, 5, frames) mask = np.ones((5, 5, 5), np.bool) mask_img = nibabel.Nifti1Image(mask.astype(np.int), np.eye(4)) # Create a condition array cond = np.arange(frames, dtype=int) > frames // 2 # Create an activation pixel. data[2, 2, 2, :] = 0 data[2, 2, 2][cond.astype(np.bool)] = 2 data_img = nibabel.Nifti1Image(data, np.eye(4)) # Define cross validation from sklearn.cross_validation import check_cv # avoid using KFold for compatibility with sklearn 0.10-0.13 cv = check_cv(4, cond) n_jobs = 1 # Run Searchlight with different radii # Small radius : only one pixel is selected sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=0.5, n_jobs=n_jobs, scoring='accuracy', cv=cv) sl.fit(data_img, cond) assert_equal(np.where(sl.scores_ == 1)[0].size, 1) assert_equal(sl.scores_[2, 2, 2], 1.) # Medium radius : little ball selected sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=1, n_jobs=n_jobs, scoring='accuracy', cv=cv) sl.fit(data_img, cond) assert_equal(np.where(sl.scores_ == 1)[0].size, 7) assert_equal(sl.scores_[2, 2, 2], 1.) assert_equal(sl.scores_[1, 2, 2], 1.) assert_equal(sl.scores_[2, 1, 2], 1.) assert_equal(sl.scores_[2, 2, 1], 1.) assert_equal(sl.scores_[3, 2, 2], 1.) assert_equal(sl.scores_[2, 3, 2], 1.) assert_equal(sl.scores_[2, 2, 3], 1.) # Big radius : big ball selected sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=2, n_jobs=n_jobs, scoring='accuracy', cv=cv) sl.fit(data_img, cond) assert_equal(np.where(sl.scores_ == 1)[0].size, 33) assert_equal(sl.scores_[2, 2, 2], 1.)
def test_check_cv_return_types(): X = np.ones((9, 2)) cv = cval.check_cv(3, X, classifier=False) assert_true(isinstance(cv, cval.KFold)) y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) cv = cval.check_cv(3, X, y_binary, classifier=True) assert_true(isinstance(cv, cval.StratifiedKFold)) y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) cv = cval.check_cv(3, X, y_multiclass, classifier=True) assert_true(isinstance(cv, cval.StratifiedKFold)) X = np.ones((5, 2)) y_multilabel = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [0, 1, 1], [1, 0, 0]] cv = cval.check_cv(3, X, y_multilabel, classifier=True) assert_true(isinstance(cv, cval.KFold)) y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) cv = cval.check_cv(3, X, y_multioutput, classifier=True) assert_true(isinstance(cv, cval.KFold))
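# The same return-type behaviour under the post-0.18 ``sklearn.model_selection`` API,
# where ``check_cv`` no longer receives X and returns splitter objects rather than
# pre-built index iterators (shown for comparison, not part of the original test):
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, check_cv

y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
assert isinstance(check_cv(3, y_binary, classifier=True), StratifiedKFold)
assert isinstance(check_cv(3, y_binary, classifier=False), KFold)

y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
assert isinstance(check_cv(3, y_multioutput, classifier=True), KFold)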
def test_searchlight(): # Create a toy dataset to run searchlight on # Initialize with 4x4x4 scans of random values on 30 frames rand = np.random.RandomState(0) frames = 30 data = rand.rand(5, 5, 5, frames) mask = np.ones((5, 5, 5), np.bool) mask_img = nibabel.Nifti1Image(mask.astype(np.int), np.eye(4)) # Create a condition array cond = np.arange(frames, dtype=int) > frames / 2 # Create an activation pixel. data[2, 2, 2, :] = 0 data[2, 2, 2][cond.astype(np.bool)] = 2 data_img = nibabel.Nifti1Image(data, np.eye(4)) # Define cross validation from sklearn.cross_validation import check_cv # avoid using KFold for compatibility with sklearn 0.10-0.13 cv = check_cv(4, cond) n_jobs = 1 # Run Searchlight with different radii # Small radius : only one pixel is selected sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=0.5, n_jobs=n_jobs, scoring='accuracy', cv=cv) sl.fit(data_img, cond) assert_equal(np.where(sl.scores_ == 1)[0].size, 1) assert_equal(sl.scores_[2, 2, 2], 1.) # Medium radius : little ball selected sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=1, n_jobs=n_jobs, scoring='accuracy', cv=cv) sl.fit(data_img, cond) assert_equal(np.where(sl.scores_ == 1)[0].size, 7) assert_equal(sl.scores_[2, 2, 2], 1.) assert_equal(sl.scores_[1, 2, 2], 1.) assert_equal(sl.scores_[2, 1, 2], 1.) assert_equal(sl.scores_[2, 2, 1], 1.) assert_equal(sl.scores_[3, 2, 2], 1.) assert_equal(sl.scores_[2, 3, 2], 1.) assert_equal(sl.scores_[2, 2, 3], 1.) # Big radius : big ball selected sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=2, n_jobs=n_jobs, scoring='accuracy', cv=cv) sl.fit(data_img, cond) assert_equal(np.where(sl.scores_ == 1)[0].size, 33) assert_equal(sl.scores_[2, 2, 2], 1.)
def _score_lambda_path(est, X, y, sample_weight, cv, scoring, classifier, n_jobs, verbose): """Score each model found by glmnet using cross validation. Parameters ---------- est : estimator The previously fitted estimator. X : array, shape (n_samples, n_features) Input features y : array, shape (n_samples,) Target values. sample_weight : array, shape (n_samples,) Weight of each row in X. n_folds : int Number of folds for cross validation, must be at least 3. scoring : string, callable or None Scoring method to apply to each model. n_jobs: int Maximum number of threads to use for scoring models. verbose : bool Emit logging data and warnings when True. Returns ------- scores : array, shape (n_lambda,) Scores for each value of lambda over all cv folds. """ scorer = check_scoring(est, scoring) cv = check_cv(cv, X, y, classifier) # We score the model for every value of lambda, for classification # models, this will be an intercept-only model, meaning it predicts # the same class regardless of the input. Obviously, this makes some of # the scikit-learn metrics unhappy, so we are silencing these warnings. # Also note, catch_warnings is not thread safe. with warnings.catch_warnings(): action = 'always' if verbose else 'ignore' warnings.simplefilter(action, UndefinedMetricWarning) scores = Parallel(n_jobs=n_jobs, verbose=verbose, backend='threading')( delayed(_fit_and_score)(est, scorer, X, y, sample_weight, est.lambda_path_, train_idx, test_idx) for (train_idx, test_idx) in cv) return scores
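# The ``catch_warnings`` block above is the standard pattern for locally silencing (or
# surfacing) ``UndefinedMetricWarning`` while scoring degenerate models. A standalone
# sketch of that pattern; ``sklearn.exceptions`` is the modern location of the warning
# class, which may differ from the import used in the original module:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import f1_score

verbose = False
with warnings.catch_warnings():
    warnings.simplefilter('always' if verbose else 'ignore', UndefinedMetricWarning)
    # F1 on a class the model never predicts would normally warn about an undefined
    # precision; here the warning is suppressed and the score is simply 0.0.
    print(f1_score([0, 0, 1, 1], [0, 0, 0, 0]))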
def test_check_cv_return_types(): X = np.ones((9, 2)) cv = cval.check_cv(3, X, classifier=False) assert_true(isinstance(cv, cval.KFold)) y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) cv = cval.check_cv(3, X, y_binary, classifier=True) assert_true(isinstance(cv, cval.StratifiedKFold)) y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) cv = cval.check_cv(3, X, y_multiclass, classifier=True) assert_true(isinstance(cv, cval.StratifiedKFold)) X = np.ones((5, 2)) y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] with warnings.catch_warnings(record=True): # deprecated sequence of sequence format cv = cval.check_cv(3, X, y_seq_of_seqs, classifier=True) assert_true(isinstance(cv, cval.KFold)) y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs) cv = cval.check_cv(3, X, y_indicator_matrix, classifier=True) assert_true(isinstance(cv, cval.KFold)) y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) cv = cval.check_cv(3, X, y_multioutput, classifier=True) assert_true(isinstance(cv, cval.KFold))
def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'): X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) # Ensure the estimator has implemented the passed decision function if not callable(getattr(estimator, method)): raise AttributeError('{} not implemented in estimator'.format(method)) if method in ['decision_function', 'predict_proba', 'predict_log_proba']: le = LabelEncoder() y = le.fit_transform(y) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) prediction_blocks = parallel( delayed(_my_fit_and_predict)(clone(estimator), X, y, train, test, verbose, fit_params, method) for train, test in cv.split(X, y, groups)) # Concatenate the predictions predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks] test_indices = np.concatenate( [indices_i for _, indices_i, _ in prediction_blocks]) scores = np.concatenate([score_i for _, _, score_i in prediction_blocks]) if not _check_is_permutation(test_indices, _num_samples(X)): raise ValueError('cross_val_predict only works for partitions') inv_test_indices = np.empty(len(test_indices), dtype=int) inv_test_indices[test_indices] = np.arange(len(test_indices)) # Check for sparse predictions if sp.issparse(predictions[0]): predictions = sp.vstack(predictions, format=predictions[0].format) else: predictions = np.concatenate(predictions) out_predictions = predictions[inv_test_indices] return out_predictions.reshape(y.shape), scores
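# The tail of the function reorders the concatenated per-fold predictions back into the
# original sample order by inverting the permutation of test indices. A small numpy
# illustration of that inversion:
import numpy as np

test_indices = np.array([2, 0, 4, 1, 3])          # order in which samples were predicted
fold_predictions = np.array([20, 0, 40, 10, 30])  # predictions in that same order
inv_test_indices = np.empty(len(test_indices), dtype=int)
inv_test_indices[test_indices] = np.arange(len(test_indices))
print(fold_predictions[inv_test_indices])         # -> [ 0 10 20 30 40], original order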
def cross_validate(self, k=10): """Performs a k-fold cross validation of our training data. Args: k: The number of folds for cross validation. """ self.scores = [] X, y = check_arrays(self.feature_vector, self.classification_vector, sparse_format='csr') cv = cross_validation.check_cv(k, self.feature_vector, self.classification_vector, classifier=True) for train, test in cv: self.classifier1.fit(self.feature_vector[train], self.classification_vector[train]) self.classifier2.fit(self.feature_vector[train], self.classification_vector[train]) self.classifier3.fit(self.feature_vector[train], self.classification_vector[train]) classification1 = self.classifier1.predict( self.feature_vector[test]) classification2 = self.classifier2.predict( self.feature_vector[test]) classification3 = self.classifier3.predict( self.feature_vector[test]) classification = [] for predictions in zip(classification1, classification2, classification3): neutral_count = predictions.count(0) positive_count = predictions.count(1) negative_count = predictions.count(-1) if (neutral_count == negative_count and negative_count == positive_count): classification.append(predictions[0]) elif (neutral_count > positive_count and neutral_count > negative_count): classification.append(0) elif (positive_count > neutral_count and positive_count > negative_count): classification.append(1) elif (negative_count > neutral_count and negative_count > positive_count): classification.append(-1) classification = numpy.array(classification) self.scores.append(self.score_func(y[test], classification))
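# The if/elif cascade above is a per-sample majority vote over three classifiers, with
# the first classifier's prediction used as the tie-breaker when all three disagree.
# A compact equivalent of just that voting rule (a sketch, not the class's actual code):
from collections import Counter

def majority_vote(predictions):
    counts = Counter(predictions)
    label, count = counts.most_common(1)[0]
    # all three disagree -> defer to the first classifier
    return predictions[0] if count == 1 else label

print(majority_vote((1, -1, 1)))   # -> 1
print(majority_vote((0, 1, -1)))   # -> 0 (three-way tie, first classifier wins)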
def fit(self, X, y=None, sample_weight=None, exposure=None): if self.cv == 1: cv = no_cv(X=X, y=y) else: if hasattr(self.cv, 'split'): cv = self.cv.split(X, y) else: cv = check_cv(self.cv, X=X, y=y, classifier=is_classifier(self.calibrator)) parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch) # Fit the estimator on each train set and predict with it on each test set fit_args = {'X': X} if y is not None: fit_args['y'] = y if self.est_weight and sample_weight is not None: fit_args['sample_weight'] = sample_weight if self.est_exposure and exposure is not None: fit_args['exposure'] = exposure # Do the cross validation fits cv_fits = parallel(delayed(_fit_and_predict)(clone(self.estimator), fit_args, train, test) for train, test in cv) # Combine predictions from cv fits prediction = np.empty_like(y) for fit in cv_fits: safe_assign_subset(prediction, fit[2], fit[1]) # fit_predict_results = parallel(delayed(_fit_and_predict)(estimator=clone(self.estimator), # train=train, test=test, **fit_args) for train, test in cv) # # # Combine the predictions # prediction = np.empty_like(y) # for _, pred, _, test in zip(fit_predict_results, cv): # prediction = np.concatenate([pred for _, pred in cv_fits], axis=0) # Fit the calibrator on the predictions cal_args = {'X': prediction[:, None] if len(prediction.shape) == 1 else prediction, 'y': y} if self.cal_weight and sample_weight is not None: cal_args['sample_weight'] = sample_weight if self.cal_exposure and exposure is not None: cal_args['exposure'] = exposure self.calibrator_ = clone(self.calibrator).fit(**cal_args) # Fit the estimator on the entire data set self.estimator_ = clone(self.estimator).fit(**fit_args) return self
def cross_validate(self, k=10): """Performs a k-fold cross validation of our training data. Args: k: The number of folds for cross validation. """ self.scores = [] X, y = check_arrays(self.feature_vector, self.classification_vector, sparse_format='csr') cv = cross_validation.check_cv( k, self.feature_vector, self.classification_vector, classifier=True) for train, test in cv: self.classifier1.fit(self.feature_vector[train], self.classification_vector[train]) self.classifier2.fit(self.feature_vector[train], self.classification_vector[train]) self.classifier3.fit(self.feature_vector[train], self.classification_vector[train]) classification1 = self.classifier1.predict( self.feature_vector[test]) classification2 = self.classifier2.predict( self.feature_vector[test]) classification3 = self.classifier3.predict( self.feature_vector[test]) classification = [] for predictions in zip(classification1, classification2, classification3): neutral_count = predictions.count(0) positive_count = predictions.count(1) negative_count = predictions.count(-1) if (neutral_count == negative_count and negative_count == positive_count): classification.append(predictions[0]) elif (neutral_count > positive_count and neutral_count > negative_count): classification.append(0) elif (positive_count > neutral_count and positive_count > negative_count): classification.append(1) elif (negative_count > neutral_count and negative_count > positive_count): classification.append(-1) classification = numpy.array(classification) self.scores.append(self.score_func(y[test], classification))
def fit(self, X, y):
    if master:
        LOG.info("comm_size:" + str(comm_size))

    X, y = check_X_y(X, y, force_all_finite=False,
                     multi_output=self.multi_output, accept_sparse='csr')
    _check_param_grid(self.param_grid)

    cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
    if master:
        LOG.info("cv length:" + str(len(cv)))

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    if master:
        self._fit_master(X, y, cv)
    else:
        self._fit_slave()

    return self
def score(self,test_parameter): """ The score function to call in order to evaluate the quality of the parameter test_parameter Parameters ---------- `tested_parameter` : dict, the parameter to test Returns ------- `score` : the CV score, either the list of all cv results or the mean (depending of score_format) """ if not self._callable_estimator: cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, train, test, False, test_parameter, self.fit_params, return_parameters=True) for train, test in cv ] n_test_samples = 0 mean_score = 0 detailed_score = [] for tmp_score, tmp_n_test_samples, _, _ in cv_score: detailed_score.append(tmp_score) tmp_score *= tmp_n_test_samples n_test_samples += tmp_n_test_samples mean_score += tmp_score mean_score /= float(n_test_samples) if(self.score_format == 'avg'): score = mean_score else: # format == 'cv' score = detailed_score else: if(self.score_format == 'avg'): score = [self.estimator(test_parameter)] else: # format == 'cv' score = self.estimator(test_parameter) return score
def fit(self, X, y): """Fit KNN model by choosing the best `n_neighbors`. Parameters ----------- X : scipy.sparse matrix, (n_samples, vocab_size) Data y : ndarray, shape (n_samples,) or (n_samples, n_targets) Target """ if self.n_neighbors is None: n_neighbors = 2 else: n_neighbors = self.n_neighbors X = check_array(X, accept_sparse='csr', copy=True) X = normalize(X, norm='l1', copy=False) cv = check_cv(self.cv, X, y) # knn = KNeighborsClassifier(metric='precomputed', algorithm='brute') knn = KNeighborsClassifier(n_jobs=self.n_jobs) scorer = check_scoring(knn, scoring=self.scoring) scores = [] cycle = 1 for train_ix, test_ix in cv: dist = self._pairwise_wmd(X[test_ix], X[train_ix]) knn.fit(X[train_ix], y[train_ix]) # scores.append([ # scorer(knn.set_params(n_neighbors=k), dist, y[test_ix]) # for k in n_neighbors_try # ]) scores.append( scorer(knn.set_params(n_neighbors=n_neighbors), dist, y[test_ix])) logger.info("%i/%i folds done!", cycle, self.cv) cycle += 1 scores = np.array(scores) self.cv_scores_ = scores # best_k_ix = np.argmax(np.mean(scores, axis=0)) # best_k = n_neighbors_try[best_k_ix] # self.n_neighbors = self.n_neighbors_ = best_k return super(WordMoversKNNCV, self).fit(X, y)
def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False,
                      predict_fun="predict"):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    pred = Parallel(n_jobs=n_jobs)(
        delayed(_cross_val_predict)(
            clone(estimator), X, y, train, test, predict_fun)
        for train, test in cv)
    pred = np.concatenate(pred)
    if cv.indices:
        index = np.concatenate([test for _, test in cv])
    else:
        index = np.concatenate([np.where(test)[0] for _, test in cv])
    ## pred[index] = pred doesn't work as expected
    pred[index] = pred.copy()
    if refit:
        return pred, clone(estimator).fit(X, y)
    else:
        return pred
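# ``cv.indices`` is checked above because old scikit-learn CV iterators could yield
# either integer index arrays or boolean membership masks, depending on their ``indices``
# flag. ``np.where(mask)[0]`` is the conversion used for the boolean case; a tiny
# illustration:
import numpy as np

test_mask = np.array([False, True, False, True, False])
test_idx = np.where(test_mask)[0]
print(test_idx)                        # -> [1 3]
print(np.arange(50, 55)[test_mask])    # boolean masks also work directly as indexers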
def dynamic_cross_val_score(estimator, fv, esa_feature_list, unigram_feature_list,
                            dynamic_X, y=None, scoring=None, cv=None, verbose=0,
                            fit_params=None):
    print "dynamic cross val with %s" % (esa_feature_list + unigram_feature_list)

    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)
    cv = cross_validation.check_cv(cv, X, y,
                                   classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)

    scores = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X on every step
        for i in range(0, len(fv)):  # each i refers to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points to esa_vec
            for feature in esa_feature_list:
                # update the i-th feature dict with the ESA feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])
            for feature in unigram_feature_list:
                # update the i-th feature dict with the unigram feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        scores.append(cross_validation._fit_and_score(
            cross_validation.clone(estimator), X, y, scorer, train, test,
            verbose, None, fit_params))
        cross_val_step += 1

    return np.array(scores)[:, 0]
def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'): X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) # Ensure the estimator has implemented the passed decision function if not callable(getattr(estimator, method)): raise AttributeError('{} not implemented in estimator' .format(method)) if method in ['decision_function', 'predict_proba', 'predict_log_proba']: le = LabelEncoder() y = le.fit_transform(y) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) prediction_blocks = parallel(delayed(_my_fit_and_predict)( clone(estimator), X, y, train, test, verbose, fit_params, method) for train, test in cv.split(X, y, groups)) # Concatenate the predictions predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks] test_indices = np.concatenate([indices_i for _, indices_i, _ in prediction_blocks]) scores = np.concatenate([score_i for _, _, score_i in prediction_blocks]) if not _check_is_permutation(test_indices, _num_samples(X)): raise ValueError('cross_val_predict only works for partitions') inv_test_indices = np.empty(len(test_indices), dtype=int) inv_test_indices[test_indices] = np.arange(len(test_indices)) # Check for sparse predictions if sp.issparse(predictions[0]): predictions = sp.vstack(predictions, format=predictions[0].format) else: predictions = np.concatenate(predictions) return predictions[inv_test_indices], scores
def fit(self, X, Y):
    if not self.best_subset:
        self.fshape = np.shape(X)[1]
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        self.cv = check_cv(self.cv, X, Y, classifier=is_classifier(self.estimator))
        self.best_subset = tuple()
        self.best_subset_score = 0
        self.scores_ = {self.best_subset: self.best_subset_score}

    X = np.array(X)
    Y = np.array(Y)
    try:
        self.get_best_subset(X, Y)
    except KeyboardInterrupt:
        pass

    self.estimator = self.estimator.fit(X[:, self.best_subset], Y)
    return self
def cross_val_score_filter_feature_selection(model, filter_function, filter_criteria, X, y,
                                              scoring=None, cv=None, n_jobs=1, verbose=0,
                                              fit_params=None, pre_dispatch='2*n_jobs'):
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(model))
    scorer = check_scoring(model, scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)

    scores = parallel(delayed(_fit_and_score)(clone(model),
                                              filter_function(X, y, train, filter_criteria),
                                              y, scorer, train, test, verbose, None,
                                              fit_params)
                      for train, test in cv)
    return np.array(scores)[:, 0]
def check_cv(cv, X=None, y=None, classifier=False): """Input checker utility for building a CV in a user friendly way. Parameters ---------- cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - An object to be used as a cross-validation generator. - An iterable yielding train/test splits. For integer/None inputs, if classifier is True, ``X`` and ``y`` aren't dask collections, and ``y`` is binary or multiclass, ``StratifiedKFold`` used. In all other cases, ``KFold`` is used. X : array-like The data the cross-val object will be applied on. y : array-like The target variable for a supervised learning problem. classifier : boolean optional Whether the task is a classification task. """ if is_dask_collection(X) or is_dask_collection(y): if cv is None: return KFold(n_folds=3) elif isinstance(cv, Integral): return KFold(n_folds=cv) elif not isinstance(cv, DaskBaseCV): raise TypeError("Unexpected cv type {0}".format(type(cv).__name__)) else: return cv if isinstance(cv, DaskBaseCV) and not isinstance(cv, _DaskCVWrapper): raise ValueError("Can't use dask cv object with non-dask X and y") cv = cross_validation.check_cv(cv, X=X, y=y, classifier=classifier) return _DaskCVWrapper(cv)
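# Usage sketch for the dispatcher above. With plain numpy inputs it defers to the legacy
# ``cross_validation.check_cv`` and wraps the result; ``KFold``, ``DaskBaseCV`` and
# ``_DaskCVWrapper`` are assumed to be the module's own dask-aware definitions, so this
# only illustrates the intended call pattern rather than the dask code path:
import numpy as np

X = np.random.rand(30, 4)
y = np.array([0, 1, 2] * 10)

cv = check_cv(3, X, y, classifier=True)   # numpy path -> wrapped StratifiedKFold
cv_default = check_cv(None, X, y)         # None -> default 3-fold KFold, also wrapped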
def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False, predict_fun="predict"): X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) pred = Parallel(n_jobs=n_jobs)(delayed(_cross_val_predict)(clone( estimator), X, y, train, test, predict_fun) for train, test in cv) pred = np.concatenate(pred) if cv.indices: index = np.concatenate([test for _, test in cv]) else: index = np.concatenate([np.where(test)[0] for _, test in cv]) ## pred[index] = pred doesn't work as expected pred[index] = pred.copy() if refit: return pred, clone(estimator).fit(X, y) else: return pred
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr') if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) y = np.asarray(y) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if not self.dataset_filenames: self.save_dataset_filename(X, y, cv) dataset_filenames = self.dataset_filenames client = Client() lb_view = client.load_balanced_view() if self.verbose > 0: print("Number of CPU core %d" % len(client.ids())) self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params) for dataset_filename in dataset_filenames], params) for params in parameter_iterable] if self.sync: self.wait() self.set_grid_scores() self.set_best_score_params() if self.refit: self.set_best_estimator(estimator) return self
def fit(self, X, y): """Fit KNN model by choosing the best `n_neighbors`. Parameters ----------- X : scipy.sparse matrix, (n_samples, vocab_size) Data y : ndarray, shape (n_samples,) or (n_samples, n_targets) Target """ if self.n_neighbors_try is None: n_neighbors_try = range(1, 6) else: n_neighbors_try = self.n_neighbors_try X = check_array(X, accept_sparse='csr', copy=True) X = normalize(X, norm='l1', copy=False) cv = check_cv(self.cv, X, y) knn = KNeighborsClassifier(metric='precomputed', algorithm='brute') scorer = check_scoring(knn, scoring=self.scoring) scores = [] for train_ix, test_ix in cv: dist = self._pairwise_wmd(X[test_ix], X[train_ix]) knn.fit(X[train_ix], y[train_ix]) scores.append([ scorer(knn.set_params(n_neighbors=k), dist, y[test_ix]) for k in n_neighbors_try ]) scores = np.array(scores) self.cv_scores_ = scores best_k_ix = np.argmax(np.mean(scores, axis=0)) best_k = n_neighbors_try[best_k_ix] self.n_neighbors = self.n_neighbors_ = best_k return super(WordMoversKNNCV, self).fit(X, y)
def fit(self, X, y=None, sample_weight=None): """Run fit with all sets of parameters Returns the best classifier Parameters ---------- X: array, [n_samples, n_features] Training vector, where n_samples in the number of samples and n_features is the number of features. y: array-like, shape = [n_samples], optional Target vector relative to X for classification; None for unsupervised learning. sample_weight : array-like, shape = [n_samples], optional Sample weights """ estimator = self.estimator cv = self.cv if hasattr(X, 'shape'): n_samples = X.shape[0] else: # support list of unstructured objects on which feature # extraction will be applied later in the tranformer chain n_samples = len(X) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) y = np.asarray(y) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) base_clf = clone(self.estimator) # first fit at each grid point using the maximum n_estimators param_grid = self.param_grid param_grid['n_estimators'] = [self.max_n_estimators] grid = ParameterGrid(param_grid) pre_dispatch = self.pre_dispatch clfs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)( delayed(fit_grid_point)( X, y, sample_weight, base_clf, clf_params, train, test, self.verbose, **self.fit_params) for clf_params in grid for train, test in cv) # now use the already fitted ensembles but trancate to N estimators for # N from 1 to n_estimators_max - 1 (inclusive) out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)( delayed(score_each_boost)( X, y, sample_weight, clf, clf_params, self.min_n_estimators, train, test, self.loss_func, self.score_func, self.verbose) for clf, clf_params, train, test in clfs) out = reduce(operator.add, [zip(*stage) for stage in out]) # out is now a list of triplet: score, estimator_params, n_test_samples n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1 n_grid_points = len(list(grid)) * n_estimators_points n_fits = len(out) n_folds = n_fits // n_grid_points scores = list() cv_scores = list() for block in range(0, n_fits, n_folds * n_estimators_points): for grid_start in range(block, block + n_estimators_points): n_test_samples = 0 score = 0 these_points = list() for this_score, clf_params, this_n_test_samples in \ out[grid_start: grid_start + n_folds * n_estimators_points: n_estimators_points]: these_points.append(this_score) if self.iid: this_score *= this_n_test_samples score += this_score n_test_samples += this_n_test_samples if self.iid: score /= float(n_test_samples) scores.append((score, clf_params)) cv_scores.append(these_points) cv_scores = np.asarray(cv_scores) # Note: we do not use max(out) to make ties deterministic even if # comparison on estimator instances is not deterministic best_score = -np.inf for score, params in scores: if score > best_score: best_score = score best_params = params if best_score is None: raise ValueError('Best score could not be found') self.best_score_ = best_score self.best_params_ = best_params if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_clf).set_params(**best_params) if sample_weight is not None: best_estimator.fit(X, y, sample_weight, **self.fit_params) else: best_estimator.fit(X, y, **self.fit_params) self.best_estimator_ = best_estimator # Store the computed scores # XXX: the name is too specific, it 
shouldn't have # 'grid' in it. Also, we should be retrieving/storing variance self.grid_scores_ = [ (clf_params, score, all_scores) for (score, clf_params), all_scores in zip(scores, cv_scores)] return self
def _fit(self, X, y, parameter_iterable=None): if parameter_iterable is not None: raise NotImplementedError('The parameter_iterable argument is not supported.') # Actual fitting, performing the search over parameters. estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch # setup SigOpt experiment and run optimization n_folds = len(cv) self._create_sigopt_exp(self.sigopt_connection, n_folds) # start tracking time to optimize estimator opt_start_time = time.time() for jk in range(0, self.n_iter, self.n_sug): # check for opt timeout, ensuring at least 1 observation # TODO : handling failure observations if ( self.opt_timeout is not None and time.time() - opt_start_time > self.opt_timeout and jk >= 1 ): # break out of loop and refit model with best params so far break suggestions = [] jobs = [] for _ in range(self.n_sug): for train, test in cv: suggestion = self.sigopt_connection.experiments(self.experiment.id).suggestions().create() parameters = self._convert_sigopt_api_to_sklearn_assignments(suggestion.assignments.to_json()) suggestions.append(suggestion) jobs.append([parameters, train, test]) if self.verbose > 0: print('Evaluating params : ', [job[0] for job in jobs]) # do CV folds in parallel using joblib # returns scores on test set obs_timed_out = False try: par_kwargs = {'n_jobs': self.n_jobs, 'verbose': self.verbose, 'pre_dispatch': pre_dispatch} # add timeout kwarg if version of joblib supports it if 'timeout' in getfullargspec(Parallel.__init__).args: par_kwargs['timeout'] = self.cv_timeout out = Parallel( **par_kwargs )( delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for parameters, train, test in jobs) except TimeoutError: obs_timed_out = True if not obs_timed_out: # grab scores from results for sidx, suggestion in enumerate(suggestions): score = out[sidx][0] self.sigopt_connection.experiments(self.experiment.id).observations().create( suggestion=suggestion.id, value=score) else: # obsevation timed out so report a failure self.sigopt_connection.experiments(self.experiment.id).observations().create( suggestion=suggestion.id, failed=True) # return best SigOpt assignments so far best_assignments = self.sigopt_connection.experiments(self.experiment.id).best_assignments().fetch().data if not best_assignments: raise RuntimeError( 'No valid observations found. ' 'Make sure opt_timeout and cv_timeout provide sufficient time for observations to be reported.') self.best_params_ = self._convert_sigopt_api_to_sklearn_assignments(best_assignments[0].assignments.to_json()) self.best_score_ = best_assignments[0].value if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params(**self.best_params_) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def time_generalization(epochs_list, clf=None, cv=5, scoring="roc_auc",
                        shuffle=True, random_state=None, n_jobs=1, verbose=None):
    """Fit decoder at each time instant and test at all others

    The function returns the cross-validation scores when the train set
    is from one time instant and the test from all others. The decoding
    will be done using all available data channels, but will only work if
    1 type of channel is available. For example epochs should contain only
    gradiometers.

    Parameters
    ----------
    epochs_list : list of Epochs
        The epochs in all the conditions.
    clf : object | None
        A object following scikit-learn estimator API (fit & predict).
        If None the classifier will be a linear SVM (C=1.) after
        feature standardization.
    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of fold (default 5).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects.
    scoring : {string, callable, None}, optional, default: "roc_auc"
        A string (see model evaluation documentation in scikit-learn) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    shuffle : bool
        If True, shuffle the epochs before splitting them in folds.
    random_state : None | int
        The random state used to shuffle the epochs. Ignored if
        shuffle is False.
    n_jobs : int
        Number of jobs to run in parallel. Each fold is fit in parallel.

    Returns
    -------
    scores : array, shape (n_times, n_times)
        The scores averaged across folds. scores[i, j] contains
        the generalization score when learning at time j and testing
        at time i. The diagonal is the cross-validation score
        at each time-independent instant.

    Notes
    -----
    The function implements the method used in:

    Jean-Remi King, Alexandre Gramfort, Aaron Schurger, Lionel Naccache
    and Stanislas Dehaene, "Two distinct dynamic modes subtend the
    detection of unexpected sounds", PLOS ONE, 2013
    """
    from sklearn.base import clone
    from sklearn.utils import check_random_state
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import check_cv

    if clf is None:
        scaler = StandardScaler()
        svc = SVC(C=1, kernel='linear')
        clf = Pipeline([('scaler', scaler), ('svc', svc)])

    info = epochs_list[0].info
    data_picks = pick_types(info, meg=True, eeg=True, exclude='bads')

    # Make arrays X and y such that :
    # X is 3d with X.shape[0] is the total number of epochs to classify
    # y is filled with integers coding for the class to predict
    # We must have X.shape[0] equal to y.shape[0]
    X = [e.get_data()[:, data_picks, :] for e in epochs_list]
    y = [k * np.ones(len(this_X)) for k, this_X in enumerate(X)]
    X = np.concatenate(X)
    y = np.concatenate(y)

    cv = check_cv(cv, X, y, classifier=True)

    ch_types = set([channel_type(info, idx) for idx in data_picks])
    logger.info('Running time generalization on %s epochs using %s.' %
                (len(X), ch_types.pop()))

    if shuffle:
        rng = check_random_state(random_state)
        order = np.argsort(rng.randn(len(X)))
        X = X[order]
        y = y[order]

    parallel, p_time_gen, _ = parallel_func(_time_gen_one_fold, n_jobs)
    scores = parallel(p_time_gen(clone(clf), X, y, train, test, scoring)
                      for train, test in cv)
    scores = np.mean(scores, axis=0)
    return scores
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr') self.scorer_ = _deprecate_loss_and_score_funcs(self.loss_func, self.score_func, self.scoring) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) y = np.asarray(y) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)(delayed(fit_grid_point)( X, y, base_estimator, parameters, train, test, self.scorer_, self.verbose, **{ 'sample_weight': balance_weights(y[train]) }) for parameters in parameter_iterable for train, test in cv) # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, parameters, this_n_test_samples in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append( _CVScoreTuple(parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, sample_weight=balance_weights(y), **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
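# ``balance_weights`` above is assumed to be the helper from the old
# ``sklearn.preprocessing`` module (since removed), or a local equivalent, which weights
# each sample inversely to its class frequency. A small numpy sketch of an equivalent
# computation, in the spirit of ``class_weight='balanced'``:
import numpy as np

def balanced_sample_weights(y):
    classes, counts = np.unique(y, return_counts=True)
    # weight each class so that every class contributes equally overall
    class_weight = {c: len(y) / (len(classes) * n) for c, n in zip(classes, counts)}
    return np.array([class_weight[label] for label in y])

print(balanced_sample_weights(np.array([0, 0, 0, 1])))  # -> [0.6667 0.6667 0.6667 2.]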
def fit(self, X, y=None, x_is_index=False): """ fit creates a task for every pair of folds and combination of hyperparameters in the grid it then distributes the tasks to ipyparallel view and waits for completion :param X: ndarray of data :param y: ndarray of target variables :param x_is_index: boolean variable to indicate that X is not the data itself, but the index of the data to be used on remote machines. Useful when sending the data by network is unfeasible """ if not self.loader: self.loader = lambda: (X, y) parameter_iterable = ParameterGrid(self.param_grid) """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) if x_is_index and self.loader is None: raise ValueError('no loader given') X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) train_test_parameters = ((train, test, apply_transforms(parameters, self.transforms)) \ for parameters in parameter_iterable for train, test in cv) length = len(parameter_iterable) * len(cv) if self.callback: self.callback(len(self.cacher), length) if x_is_index: X_to_pass = X y_to_pass = y if self.loader is None else None else: if self.loader is not None: X_to_pass = None y_to_pass = None else: X_to_pass = X y_to_pass = y # print('sequences') # sequences = [ # train_test_parameters, # [clone(base_estimator)] * length, # [X_to_pass] * length, # [y_to_pass] * length, # [self.verbose] * length, # [self.fit_params] * length, # [True] * length, # [self.scorer_] * length, # [x_is_index] * length, # ] f = partial(GridSearchCVParallel.my_fit_and_score, estimator=base_estimator, X=X_to_pass, y=y_to_pass, fit_params=self.fit_params, scorer=self.scorer_, x_is_index=x_is_index, loader=self.loader, fit_callback=self.fit_callback) iterable = itertools.ifilter(lambda (i, ttp): i not in self.cacher, enumerate(train_test_parameters)) results_by_params = defaultdict(list) for id, result in self.cacher.iteritems(): score, test_size, time, params = result results_by_params[frozenset(map(map_param, params.iteritems()))].append(score) try: for index, result in self.mapper(f, iterable): self.cacher[index] = result score, test_size, time, params = result results_by_params[frozenset(map( map_param, params.iteritems()))].append(score) if self.callback: best_scores = next( iter( sorted(itertools.ifilter( lambda scores: len(scores) == len(cv), results_by_params.values()), key=lambda scores: np.mean(scores), reverse=True)), [0]) self.callback(1, length, description='%.3f+-%.3f' % (np.mean(best_scores), np.std(best_scores))) except Exception as e: print(e) e_type, e_value, e_tb = sys.exc_info() traceback.print_tb(e_tb) # assert len(self.cacher) == length and (np.array(self.cacher.keys()) == np.arange(length)).all() # out = self.cacher.values() # # # Out is a list of triplet: score, estimator, n_test_samples # n_fits = len(out) # n_folds = len(cv) # # scores = list() # grid_scores = list() # for grid_start in range(0, n_fits, n_folds): # n_test_samples = 0 # score = 0 # 
all_scores = [] # for this_score, this_n_test_samples, _, parameters in \ # out[grid_start:grid_start + n_folds]: # all_scores.append(this_score) # if self.iid: # this_score *= this_n_test_samples # n_test_samples += this_n_test_samples # score += this_score # if self.iid: # score /= float(n_test_samples) # else: # score /= float(n_folds) # scores.append((score, parameters)) # # TODO: shall we also store the test_fold_sizes? # grid_scores.append(_CVScoreTuple( # parameters, # score, # np.array(all_scores))) grid_scores = [] for set_params, all_scores in results_by_params.iteritems(): grid_scores.append( _CVScoreTuple(dict(set_params), np.mean(all_scores), np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores print(len(grid_scores)) # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
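# Hedged sketch (not the original my_fit_and_score/map_param helpers, which are
# project-specific): the class above groups per-fold scores by parameter set,
# using a frozenset of (name, value) pairs as a hashable dictionary key so that
# cached and freshly computed folds land in the same bucket.
import numpy as np
from collections import defaultdict


def freeze_params(params):
    # assumes every parameter value is hashable, as with a typical ParameterGrid
    return frozenset(params.items())


fold_results = [
    (0.81, {'C': 1.0, 'kernel': 'linear'}),
    (0.79, {'C': 1.0, 'kernel': 'linear'}),
    (0.90, {'C': 10.0, 'kernel': 'rbf'}),
]
scores_by_params = defaultdict(list)
for score, params in fold_results:
    scores_by_params[freeze_params(params)].append(score)

for key, scores in scores_by_params.items():
    print('%s -> mean %.3f +- %.3f' % (dict(key), np.mean(scores), np.std(scores)))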
def fit(self, epochs, y=None): """ Train a classifier on each specified time slice. Note. This function sets the ``picks_``, ``ch_names``, ``cv_``, ``y_train``, ``train_times_`` and ``estimators_`` attributes. Parameters ---------- epochs : instance of Epochs The epochs. y : list or ndarray of int, shape (n_samples,) or None, optional To-be-fitted model values. If None, y = epochs.events[:, 2]. Returns ------- self : GeneralizationAcrossTime Returns fitted GeneralizationAcrossTime object. Notes ------ If X and y are not C-ordered and contiguous arrays of np.float64 and X is not a scipy.sparse.csr_matrix, X and/or y may be copied. If X is a dense array, then the other methods will not support sparse matrices as input. """ from sklearn.base import clone from sklearn.cross_validation import check_cv, StratifiedKFold # clean attributes for att in ['picks_', 'ch_names', 'y_train_', 'cv_', 'train_times_', 'estimators_', 'test_times_', 'y_pred_', 'y_true_', 'scores_', 'scorer_']: if hasattr(self, att): delattr(self, att) n_jobs = self.n_jobs # Extract data from MNE structure X, y, self.picks_ = _check_epochs_input(epochs, y, self.picks) self.ch_names = [epochs.ch_names[p] for p in self.picks_] cv = self.cv if isinstance(cv, (int, np.int)): cv = StratifiedKFold(y, cv) cv = check_cv(cv, X, y, classifier=True) self.cv_ = cv # update CV self.y_train_ = y # Cross validation scheme # XXX Cross validation should later be transformed into a make_cv, and # defined in __init__ self.train_times_ = copy.deepcopy(self.train_times) if 'slices' not in self.train_times_: self.train_times_ = _sliding_window(epochs.times, self.train_times) # Parallel across training time parallel, p_time_gen, n_jobs = parallel_func(_fit_slices, n_jobs) n_chunks = min(X.shape[2], n_jobs) splits = np.array_split(self.train_times_['slices'], n_chunks) def f(x): return np.unique(np.concatenate(x)) out = parallel(p_time_gen(clone(self.clf), X[..., f(train_slices_chunk)], y, train_slices_chunk, cv) for train_slices_chunk in splits) # Unpack estimators into time slices X folds list of lists. self.estimators_ = sum(out, list()) return self
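# Hedged sketch (assumption; this is not MNE's _sliding_window, whose exact
# signature is internal): fit() above trains one clone of self.clf per training
# time slice. A minimal way to build index windows over a time axis, given a
# window length and step expressed in samples:
import numpy as np


def time_slices(n_times, length=1, step=1):
    """Return a list of index arrays, one per training window."""
    starts = range(0, n_times - length + 1, step)
    return [np.arange(s, s + length) for s in starts]


# e.g. 10 time samples, windows of 3 samples every 2 samples:
# time_slices(10, length=3, step=2) -> [array([0, 1, 2]), array([2, 3, 4]), ...]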
def fit(self, X, y=None): parameter_iterable = ParameterGrid(self.param_grid) param_list, first = self._grid_to_simple_list(parameter_iterable) estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) base_estimator = clone(self.estimator) n_folds = len(cv) grid_scores = list() def func(x): parameters = self._list_to_grid_point(x, parameter_iterable) n_test_samples = 0 score = 0 all_scores = [] for train, test in cv: this_score, this_n_test_samples, _, parameters = \ _fit_and_score(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) grid_scores.append( _CVScoreTuple(parameters, score, np.array(all_scores))) #print 'In func:', x, score return score max_evals = 17 if getattr(self, 'max_evals', None) == None else self.max_evals l = COGP(func, maxEvaluations=max_evals, grid=param_list, minimize=False) out = l.learn() #print 'Out:', out self.grid_scores_ = grid_scores best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
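# Hedged sketch (assumption; COGP and the _grid_to_simple_list/_list_to_grid_point
# helpers are project-specific): stripped of the bookkeeping, the optimizer above
# only needs a callable that maps one parameter point to a mean cross-validation
# score, which can be written directly with scikit-learn. Import paths follow the
# modern sklearn.model_selection layout rather than the older
# sklearn.cross_validation one used in the code above.
import numpy as np
from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

iris = load_iris()
X, y = iris.data, iris.target
base_estimator = SVC()


def objective(parameters):
    """Mean CV score for a single point in the search space."""
    estimator = clone(base_estimator).set_params(**parameters)
    return np.mean(cross_val_score(estimator, X, y, cv=3))


print(objective({'C': 1.0, 'kernel': 'linear'}))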
def _fit(self, X, y, features_names=None, preload_features=None, ranking_th=0.005): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], multi_output=True) # Initialization n_features = X.shape[1] features = np.arange(n_features) cv = self.cv cv = check_cv(cv, y, classifier=is_classifier(self.estimator)) if sklearn.__version__ == '0.17': n_splits = cv.n_folds else: n_splits = cv.get_n_splits(X, y) if self.verbose > 0: print("Fitting {0} folds for each of iteration".format(n_splits)) if 0.0 < self.n_features_step < 1.0: step = int(max(1, self.n_features_step * n_features)) else: step = int(self.n_features_step) if step <= 0: raise ValueError("Step must be >0") if features_names is not None: features_names = np.array(features_names) else: if self.features_names is not None: features_names = self.features_names else: features_names = np.arange(n_features) # use indices tentative_support_ = np.zeros(n_features, dtype=np.bool) current_support_ = np.zeros(n_features, dtype=np.bool) self.scores_ = [] self.scores_confidences_ = [] self.features_per_it_ = [] if preload_features is not None: preload_features = np.unique(preload_features).astype('int') current_support_[preload_features] = True X_selected = X[:, features[current_support_]] y_hat, cv_scores = my_cross_val_predict(clone(self.estimator), X_selected, y, cv=cv) target = y - y_hat else: target = y.copy() score, confidence_interval = -np.inf, 0 proceed = np.sum(current_support_) < X.shape[1] while proceed: if self.verbose > 0: print('\nN-times variance of target: {}'.format( target.var() * target.shape[0])) # update values old_confidence_interval = confidence_interval old_score = score if self.scale: target = StandardScaler().fit_transform(target.reshape( -1, 1)).ravel() # target = MinMaxScaler().fit_transform(target.reshape( # -1,1)).ravel() if self.verbose > 0: print() print('Feature ranking') print() print("target shape: {}".format(target.shape)) print() # Rank the remaining features start_t = time.time() rank_estimator = clone(self.estimator) rank_estimator.fit(X, target) end_fit = time.time() - start_t # Get coefs start_t = time.time() if hasattr(rank_estimator, 'coef_'): coefs = rank_estimator.coef_ elif hasattr(rank_estimator, 'feature_importances_'): coefs = rank_estimator.feature_importances_ else: raise RuntimeError('The classifier does not expose ' '"coef_" or "feature_importances_" ' 'attributes') end_rank = time.time() - start_t # Get ranks by ordering in ascending way if coefs.ndim > 1: ranks = np.argsort(safe_sqr(coefs).sum(axis=0)) coefs = coefs.sum(axis=0) else: ranks = np.argsort(safe_sqr(coefs)) # for sparse case ranks is matrix ranks = np.ravel(ranks) if self.verbose > 0: ranked_f = features[ranks] if features_names is not None: ranked_n = features_names[ranks] else: ranked_n = ['-'] * n_features print('{:6}\t{:6}\t{:8}\t{}'.format('Rank', 'Index', 'Score', 'Feature Name')) for i in range(n_features): idx = n_features - i - 1 if (coefs[ranks[idx]] < ranking_th) and (i > 2): print(' ...') break print('#{:6}\t{:6}\t{:6f}\t{}'.format(str(i), str(ranked_f[idx]), coefs[ranks[idx]], ranked_n[idx])) print( "\n Fit done in {} s and rank done in {} s".format(end_fit, end_rank)) # if coefs[ranks][-1] < 1e-5: # if self.verbose > 0: # import warnings # warnings.warn('scores are too small to be used, please standardize inputs') # break # get the best features (ie, the latest one) # if the most ranked features is selected go on a select # other features accordingly to the ranking # threshold = step # step_features = 
features[ranks][-threshold:] ii = len(features_names) - 1 step_features = features[ranks][ii] while np.all(current_support_[step_features]) and ii > 0: ii -= 1 step_features = features[ranks][ii] if np.all(current_support_[step_features]): if self.verbose > 0: print("Selected features: {} {}".format( features_names[step_features], step_features)) # if features_names is not None: # print("Selected features: {} {}".format(features_names[ranks][-threshold:], step_features)) # else: # print("Selected features: {}".format(step_features)) print('Ended because selected features already selected') step_features = None break # update selected features tentative_support_[step_features] = True # get the selected features X_selected = X[:, features[tentative_support_]] start_t = time.time() # cross validates to obtain the scores y_hat, cv_scores = my_cross_val_predict(clone(self.estimator), X_selected, y, cv=cv) # y_hat = cross_val_predict(clone(self.estimator), X_selected, y, cv=cv) # compute new target target = y - y_hat # compute score and confidence interval # score = r2_score(y_true=y, y_pred=y_hat, multioutput='uniform_average') # np.mean(cv_scores) if self.verbose > 0: print('r2: {}'.format(np.mean(cv_scores, axis=0))) score = np.mean(cv_scores) if len(cv_scores.shape) > 1: cv_scores = np.mean(cv_scores, axis=1) m2 = np.mean(cv_scores * cv_scores) confidence_interval_or = np.sqrt( (m2 - score * score) / (n_splits - 1)) end_t = time.time() - start_t if self.verbose > 0: # if features_names is not None: print("Selected features: {} {}".format( features_names[step_features], step_features)) print("Total features: {} {}".format( features_names[tentative_support_], features[tentative_support_])) # else: # print("Selected features: {}".format(step_features)) # print("Total features: {}".format(features[tentative_support_])) print("R2= {} +- {}".format(score, confidence_interval_or)) print("\nCrossvalidation done in {} s".format(end_t)) confidence_interval = confidence_interval_or * self.significance # do not trust confidence interval completely # check terminal condition proceed = score - old_score > old_confidence_interval + confidence_interval if self.verbose > 0: print("PROCEED: {}\n\t{} - {} > {} + {}\n\t{} > {} )".format( proceed, score, old_score, old_confidence_interval, confidence_interval, score - old_score, old_confidence_interval + confidence_interval)) if proceed or np.sum(current_support_) == 0: # last feature set proved to be informative # we need to take into account of the new features (update current support) current_support_[step_features] = True self.features_per_it_.append(features_names[step_features]) self.scores_.append(score) self.scores_confidences_.append(confidence_interval) # all the features are selected, stop if np.sum(current_support_) == n_features: if self.verbose > 0: print("All the features has been selected.") proceed = False else: # last feature set proved to be not informative # keep old support and delete the current one (it is no more necessary) del tentative_support_ if self.verbose > 0: print('Last feature {} not added to the set'.format( features_names[step_features])) # Set final attributes self.estimator_ = clone(self.estimator) # self.estimator_.fit(Xns[:, current_support_], yns) self.estimator_.fit(X[:, current_support_], y) self.n_features_ = current_support_.sum() self.support_ = current_support_ # self.ranking_ = ranking_ return self
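# Hedged sketch (illustration of the stopping rule above, not part of the class):
# a new feature block is kept only while the CV-score improvement exceeds the
# combined confidence intervals of the previous and current estimates,
#     proceed = (score - old_score) > (old_ci + new_ci)
# with each ci computed as sqrt((E[s^2] - E[s]^2) / (n_splits - 1)) and scaled
# by the `significance` factor.
import numpy as np


def should_proceed(old_score, old_ci, cv_scores, n_splits, significance=1.0):
    cv_scores = np.asarray(cv_scores, dtype=float)
    score = float(np.mean(cv_scores))
    m2 = float(np.mean(cv_scores ** 2))
    ci = np.sqrt((m2 - score ** 2) / (n_splits - 1)) * significance
    return score - old_score > old_ci + ci, score, ci


# Jumping from 0.50 to ~0.70 with tight fold scores clearly proceeds:
# should_proceed(0.50, 0.02, [0.69, 0.70, 0.71], n_splits=3) -> (True, 0.70, ~0.006)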
def _fit(self, X, y, parameter_dict): self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator)) creator.create("FitnessMax", base.Fitness, weights=(1.0, )) creator.create("Individual", list, est=clone(self.estimator), fitness=creator.FitnessMax) toolbox = base.Toolbox() name_values, gene_type, maxints = _get_param_types_maxint( parameter_dict) if self.gene_type is None: self.gene_type = gene_type if self.verbose: print("Types %s and maxint %s detected" % (self.gene_type, maxints)) toolbox.register("individual", _initIndividual, creator.Individual, maxints=maxints) toolbox.register("population", tools.initRepeat, list, toolbox.individual) toolbox.register("evaluate", _evalFunction, name_values=name_values, X=X, y=y, scorer=self.scorer_, cv=cv, iid=self.iid, verbose=self.verbose, error_score=self.error_score, fit_params=self.fit_params) toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob, gene_type=self.gene_type) toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob, up=maxints) toolbox.register("select", tools.selTournament, tournsize=self.tournament_size) if self.n_jobs > 1: pool = Pool(processes=self.n_jobs) toolbox.register("map", pool.map) pop = toolbox.population(n=self.population_size) hof = tools.HallOfFame(1) stats = tools.Statistics(lambda ind: ind.fitness.values) stats.register("avg", np.mean) stats.register("min", np.min) stats.register("max", np.max) if self.verbose: print('--- Evolve in {0} possible combinations ---'.format( np.prod(np.array(maxints) + 1))) pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=self.generations_number, stats=stats, halloffame=hof, verbose=self.verbose) current_best_score_ = hof[0].fitness.values[0] current_best_params_ = _individual_to_params(hof[0], name_values) if self.verbose: print("Best individual is: %s\nwith fitness: %s" % (current_best_params_, current_best_score_)) if current_best_score_ > self.best_score_: self.best_score_ = current_best_score_ self.best_params_ = current_best_params_
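# Hedged sketch (assumption; this mirrors the idea behind _get_param_types_maxint
# and _individual_to_params rather than their exact implementations): each
# individual is a list of integer genes, one per hyperparameter, indexing into
# that parameter's list of candidate values.
param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
names = sorted(param_grid)                           # fixed gene order
maxints = [len(param_grid[n]) - 1 for n in names]    # upper bound per gene


def individual_to_params(individual):
    return {n: param_grid[n][g] for n, g in zip(names, individual)}


# The 2-gene individual [2, 1] decodes to {'C': 10.0, 'kernel': 'rbf'};
# mutation can then simply redraw a gene uniformly in [0, maxint].
print(individual_to_params([2, 1]))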
def time_generalization(epochs_list, clf=None, cv=5, scoring="roc_auc",
                        shuffle=True, random_state=None, n_jobs=1, verbose=None):
    """Fit decoder at each time instant and test at all others

    The function returns the cross-validation scores when the train set is
    from one time instant and the test from all others. The decoding will be
    done using all available data channels, but will only work if one type of
    channel is available. For example, epochs should contain only
    gradiometers.

    Parameters
    ----------
    epochs_list : list of Epochs
        The epochs in all the conditions.
    clf : object | None
        An object following scikit-learn estimator API (fit & predict).
        If None the classifier will be a linear SVM (C=1.) after
        feature standardization.
    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of folds (default 5).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects.
    scoring : {string, callable, None}, optional, default: "roc_auc"
        A string (see model evaluation documentation in scikit-learn) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    shuffle : bool
        If True, shuffle the epochs before splitting them in folds.
    random_state : None | int
        The random state used to shuffle the epochs. Ignored if
        shuffle is False.
    n_jobs : int
        Number of jobs to run in parallel. Each fold is fit in parallel.

    Returns
    -------
    scores : array, shape (n_times, n_times)
        The scores averaged across folds. scores[i, j] contains
        the generalization score when learning at time j and testing
        at time i. The diagonal is the cross-validation score
        at each time-independent instant.

    Notes
    -----
    The function implements the method used in:

    Jean-Remi King, Alexandre Gramfort, Aaron Schurger, Lionel Naccache
    and Stanislas Dehaene, "Two distinct dynamic modes subtend the
    detection of unexpected sounds", PLOS ONE, 2013
    """
    from sklearn.base import clone
    from sklearn.utils import check_random_state
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import check_cv

    if clf is None:
        scaler = StandardScaler()
        svc = SVC(C=1, kernel='linear')
        clf = Pipeline([('scaler', scaler), ('svc', svc)])

    info = epochs_list[0].info
    data_picks = pick_types(info, meg=True, eeg=True, exclude='bads')

    # Make arrays X and y such that :
    # X is 3d with X.shape[0] is the total number of epochs to classify
    # y is filled with integers coding for the class to predict
    # We must have X.shape[0] equal to y.shape[0]
    X = [e.get_data()[:, data_picks, :] for e in epochs_list]
    y = [k * np.ones(len(this_X)) for k, this_X in enumerate(X)]
    X = np.concatenate(X)
    y = np.concatenate(y)
    cv = check_cv(cv, X, y, classifier=True)

    ch_types = set([channel_type(info, idx) for idx in data_picks])
    logger.info('Running time generalization on %s epochs using %s.' %
                (len(X), ch_types.pop()))

    if shuffle:
        rng = check_random_state(random_state)
        order = np.argsort(rng.randn(len(X)))
        X = X[order]
        y = y[order]

    parallel, p_time_gen, _ = parallel_func(_time_gen_one_fold, n_jobs)
    scores = parallel(p_time_gen(clone(clf), X, y, train, test, scoring)
                      for train, test in cv)
    scores = np.mean(scores, axis=0)
    return scores
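# Hedged sketch (plotting illustration only; constructing real Epochs objects is
# out of scope here): the returned matrix has shape (n_times, n_times), with
# training time on one axis and testing time on the other, so it is usually
# rendered as an image.
import numpy as np
import matplotlib.pyplot as plt

scores = np.random.rand(50, 50)  # placeholder for time_generalization(...) output

plt.imshow(scores, origin='lower', cmap='RdBu_r', vmin=0., vmax=1.)
plt.xlabel('Training time (sample index)')   # scores[i, j]: j indexes training time
plt.ylabel('Testing time (sample index)')    # scores[i, j]: i indexes testing time
plt.colorbar(label='score')
plt.title('Temporal generalization matrix')
plt.show()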
def fit(self, X, y=None, x_is_index=False, X_name='X', y_name='y'): parameter_iterable = ParameterGrid(self.param_grid) """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) # out = Parallel( # n_jobs=self.n_jobs, verbose=self.verbose, # pre_dispatch=pre_dispatch # )( # delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, # train, test, self.verbose, parameters, # self.fit_params, return_parameters=True, # error_score=self.error_score) # for parameters in parameter_iterable # for train, test in cv) train_test_parameters = ((train, test, parameters) \ for parameters in parameter_iterable for train, test in cv) length = len(parameter_iterable) * len(cv) if x_is_index: X_to_pass = X y_to_pass = None else: X_to_pass = None y_to_pass = None self.view.block = False # print('sequences') # sequences = [ # train_test_parameters, # [clone(base_estimator)] * length, # [X_to_pass] * length, # [y_to_pass] * length, # [self.verbose] * length, # [self.fit_params] * length, # [True] * length, # [self.scorer_] * length, # [x_is_index] * length, # ] f = partial(my_fit_and_score, estimator=clone(base_estimator), X=X_to_pass, y=y_to_pass, verbose=self.verbose, fit_params=self.fit_params, return_parameters=True, scorer=None, x_is_index=x_is_index, names=(X_name, y_name)) # print('before map') # import cProfile # # pr = cProfile.Profile() # pr.enable() chunksize = 10 out = self.view.map(f, itertools.islice(train_test_parameters, 0, length), ordered=False, block=False, chunksize=chunksize) # length / len(self.view)) # pr.disable() # pr.print_stats('cumulative') print('map called') if self.callback is not None: old_progress = out.progress while not out.ready(): self.callback(out.progress * chunksize, length, out.elapsed) if old_progress == out.progress and out.progress > 0: for id, info in self.view.queue_status(verbose=True).iteritems(): # print(id, info) if isinstance(info, dict) and 'queue' in info and len(info['queue']) > 0: print(id, info['queue']) pass old_progress = out.progress sleep(10) print('map ready') out = out.get() # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? 
grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def fit(self, X, y): """Fit the learner Parameters ---------- X : list of Niimg-like objects See http://nilearn.github.io/manipulating_visualizing/manipulating_images.html#niimg Data on which model is to be fitted. If this is a list, the affine is considered the same for all. y : array or list of length n_samples The dependent variable (age, sex, QI, etc.). Notes ----- self : `SpaceNet` object Model selection is via cross-validation with bagging. """ # misc self.check_params() if self.memory is None or isinstance(self.memory, _basestring): self.memory_ = Memory(self.memory, verbose=max(0, self.verbose - 1)) else: self.memory_ = self.memory if self.verbose: tic = time.time() # nifti masking if isinstance(self.mask, NiftiMasker): self.masker_ = clone(self.mask) else: self.masker_ = NiftiMasker(mask_img=self.mask, target_affine=self.target_affine, target_shape=self.target_shape, standardize=self.standardize, low_pass=self.low_pass, high_pass=self.high_pass, mask_strategy='epi', t_r=self.t_r, memory=self.memory_) X = self.masker_.fit_transform(X) # misc self.Xmean_ = X.mean(axis=0) self.Xstd_ = X.std(axis=0) self.Xstd_[self.Xstd_ < 1e-8] = 1 self.mask_img_ = self.masker_.mask_img_ self.mask_ = self.mask_img_.get_data().astype(np.bool) n_samples, _ = X.shape y = np.array(y).copy() l1_ratios = self.l1_ratios if isinstance(l1_ratios, numbers.Number): l1_ratios = [l1_ratios] alphas = self.alphas if isinstance(alphas, numbers.Number): alphas = [alphas] if not self.loss is None: loss = self.loss elif self.is_classif: loss = "logistic" else: loss = "mse" # set backend solver if self.penalty.lower() == "graph-net": if not self.is_classif or loss == "mse": solver = _graph_net_squared_loss else: solver = _graph_net_logistic else: if not self.is_classif or loss == "mse": solver = partial(tvl1_solver, loss="mse") else: solver = partial(tvl1_solver, loss="logistic") # generate fold indices case1 = (None in [alphas, l1_ratios]) and self.n_alphas > 1 case2 = (not alphas is None) and min(len(l1_ratios), len(alphas)) > 1 if case1 or case2: self.cv_ = list(check_cv(self.cv, X=X, y=y, classifier=self.is_classif)) else: # no cross-validation needed, user supplied all params self.cv_ = [(np.arange(n_samples), [])] n_folds = len(self.cv_) # number of problems to solve if self.is_classif: y = self._binarize_y(y) else: y = y[:, np.newaxis] if self.is_classif and self.n_classes_ > 2: n_problems = self.n_classes_ else: n_problems = 1 # standardize y self.ymean_ = np.zeros(y.shape[0]) if n_problems == 1: y = y[:, 0] # scores & mean weights map over all folds self.cv_scores_ = [[] for i in range(n_problems)] w = np.zeros((n_problems, X.shape[1] + 1)) self.all_coef_ = np.ndarray((n_problems, n_folds, X.shape[1])) self.screening_percentile_ = _adjust_screening_percentile( self.screening_percentile, self.mask_img_, verbose=self.verbose) # main loop: loop on classes and folds solver_params = dict(tol=self.tol, max_iter=self.max_iter) self.best_model_params_ = [] self.alpha_grids_ = [] for (test_scores, best_w, best_alpha, best_l1_ratio, alphas, y_train_mean, (cls, fold)) in Parallel( n_jobs=self.n_jobs, verbose=2 * self.verbose)( delayed(self._cache(path_scores, func_memory_level=2))( solver, X, y[:, cls] if n_problems > 1 else y, self.mask_, alphas, l1_ratios, self.cv_[fold][0], self.cv_[fold][1], solver_params, n_alphas=self.n_alphas, eps=self.eps, is_classif=self.loss == "logistic", key=(cls, fold), debias=self.debias, verbose=self.verbose, screening_percentile=self.screening_percentile_, ) for cls in range(n_problems) 
for fold in range(n_folds)): self.best_model_params_.append((best_alpha, best_l1_ratio)) self.alpha_grids_.append(alphas) self.ymean_[cls] += y_train_mean self.all_coef_[cls, fold] = best_w[:-1] if len(np.atleast_1d(l1_ratios)) == 1: test_scores = test_scores[0] self.cv_scores_[cls].append(test_scores) w[cls] += best_w # misc self.cv_scores_ = np.array(self.cv_scores_) self.alpha_grids_ = np.array(self.alpha_grids_) self.ymean_ /= n_folds if not self.is_classif: self.all_coef_ = np.array(self.all_coef_) w = w[0] self.ymean_ = self.ymean_[0] # bagging: average best weights maps over folds w /= n_folds # set coefs and intercepts self._set_coef_and_intercept(w) # unmask weights map as a niimg self.coef_img_ = self.masker_.inverse_transform(self.coef_) # report time elapsed if self.verbose: duration = time.time() - tic print("Time Elapsed: %g seconds, %i minutes." % ( duration, duration / 60.)) return self
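# Hedged sketch (illustration of the bagging step above, not nilearn code): rather
# than refitting on the full dataset, the final coefficient map is the average of
# the best weight vector found on each cross-validation fold.
import numpy as np

fold_weights = np.array([
    [0.0, 1.2, -0.4],   # best w on fold 1
    [0.1, 0.8, -0.6],   # best w on fold 2
    [0.2, 1.0, -0.5],   # best w on fold 3
])
coef = fold_weights.mean(axis=0)   # same as summing into w and dividing by n_folds
print(coef)                        # -> [ 0.1  1.  -0.5]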
def _fit(self, X, y): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch # setup SigOpt experiment and run optimization self._create_sigopt_exp() for jk in xrange(self.n_iter): suggestion = self.conn.experiments(self.experiment.id).suggestions().create() parameters = suggestion.assignments.to_json() # convert all unicode names and values to plain strings non_unicode_parameters = self._convert_unicode_dict(parameters) if self.verbose > 0: print "Evaluating params : ",non_unicode_parameters # do CV folds in parallel using joblib # returns scores on test set out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, non_unicode_parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for train, test in cv) # grab scores from results scores = [o[0] for o in out] self.conn.experiments(self.experiment.id).observations().create( suggestion=suggestion.id, value=numpy.mean(scores), value_stddev=numpy.std(scores) ) # return best SigOpt observation so far best_obs = self.conn.experiments(self.experiment.id).fetch().progress.best_observation self.best_params_ = best_obs.assignments.to_json() # convert all unicode names and values to plain strings self.best_params_ = self._convert_unicode_dict(self.best_params_) self.best_score_ = best_obs.value if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **self.best_params_) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr') self.scorer_ = _deprecate_loss_and_score_funcs( self.loss_func, self.score_func, self.scoring) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) y = np.asarray(y) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch)( delayed(fit_grid_point_extended)( X, y, base_estimator, parameters, train, test, self.scorer_, self.verbose, **self.fit_params) for parameters in parameter_iterable for train, test in cv) # out = [] # for parameters in parameter_iterable: # fold = 1 # for train, test in cv: # print "Processing fold", fold, self.fit_params # out.append(fit_grid_point_extended(X, y, base_estimator, parameters, train, test, self.scorer_, self.verbose, **self.fit_params)) # fold += 1 # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_extras = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] all_extras = list() for this_score, parameters, this_n_test_samples, extra in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) all_extras.append(extra) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) grid_extras.append(all_extras) # Store the computed scores self.grid_scores_ = grid_scores self.extras_ = grid_extras # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
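# Hedged sketch (usage illustration, not from the original project): grid_scores_
# and extras_ are built in lockstep above, so extras_[i] holds the per-fold
# "extra" payloads returned by fit_grid_point_extended for candidate i. The
# payload contents below are made up for the example.
from collections import namedtuple

CVScore = namedtuple('CVScore',
                     ['parameters', 'mean_validation_score', 'cv_validation_scores'])

grid_scores_ = [CVScore({'C': 1.0}, 0.80, [0.79, 0.81]),
                CVScore({'C': 10.0}, 0.84, [0.83, 0.85])]
extras_ = [[{'n_support': 12}, {'n_support': 14}],   # per-fold extras, candidate 0
           [{'n_support': 9}, {'n_support': 11}]]    # per-fold extras, candidate 1

for cv_tuple, fold_extras in zip(grid_scores_, extras_):
    print(cv_tuple.parameters, cv_tuple.mean_validation_score, fold_extras)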
def fit(self, X, y=None, **params): """Run fit with all sets of parameters Returns the best classifier Parameters ---------- X: array, [n_samples, n_features] Training vector, where n_samples in the number of samples and n_features is the number of features. y: array-like, shape = [n_samples], optional Target vector relative to X for classification; None for unsupervised learning. """ import os import binascii import itertools self._set_params(**params) estimator = self.estimator cv = self.cv if hasattr(X, 'shape'): n_samples = X.shape[0] else: # support list of unstructured objects on which feature # extraction will be applied later in the tranformer chain n_samples = len(X) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) y = np.asarray(y) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) grid = IterGrid(self.param_grid) random.shuffle(list(grid)) base_clf = clone(self.estimator) pre_dispatch = self.pre_dispatch suffix = binascii.hexlify(os.urandom(10)) @parallel.util.interactive def push_data(X, y, suffix): data = dict(X=X, y=y) g = globals() g.update({'data_'+suffix : data}) push_ars = [] ids = self.view.targets or self.view.client.ids for id in ids: with self.view.temp_flags(targets=[id]): push_ars.append(self.view.apply_async(push_data, X, y, suffix)) self._push_results = push_ars self.view.follow = parallel.Dependency(push_ars, all=False) ars = [] rX = parallel.Reference('data_%s["X"]' % suffix) ry = parallel.Reference('data_%s["y"]' % suffix) for param_id, clf_params in enumerate(grid): for train,test in cv: ars.append(self.view.apply_async(fit_grid_point, rX, ry, base_clf, clf_params, train, test, self.loss_func, self.score_func, self.verbose, param_id=param_id, **self.fit_params) ) # clear folllow dep self.view.follow = None self._fit_results = ars return self
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) param_grid = [(parameters, train, test) for parameters in parameter_iterable for (train, test) in cv] # Because the original python code expects a certain order for the elements, we need to # respect it. indexed_param_grid = list(zip(range(len(param_grid)), param_grid)) par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid)) X_bc = self.sc.broadcast(X) y_bc = self.sc.broadcast(y) scorer = self.scorer_ verbose = self.verbose fit_params = self.fit_params error_score = self.error_score fas = _fit_and_score def fun(tup): (index, (parameters, train, test)) = tup local_estimator = clone(base_estimator) local_X = X_bc.value local_y = y_bc.value res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose, parameters, fit_params, return_parameters=True, error_score=error_score) return (index, res) indexed_out0 = dict(par_param_grid.map(fun).collect()) out = [indexed_out0[idx] for idx in range(len(param_grid))] X_bc.unpersist() y_bc.unpersist() # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
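# Hedged sketch (assumes an already-running SparkContext `sc`, exactly as the
# class above does): the Spark pattern used in _fit -- broadcast the data once,
# ship indexed tasks, and restore submission order from the returned indices.
import numpy as np

X = np.random.rand(100, 5)
X_bc = sc.broadcast(X)                      # one copy per executor, not per task

tasks = list(enumerate([0, 1, 2, 3]))       # (index, payload) pairs


def work(task):
    index, column = task
    local_X = X_bc.value                    # read the broadcast value on the worker
    return index, float(local_X[:, column].mean())


indexed = dict(sc.parallelize(tasks, len(tasks)).map(work).collect())
results = [indexed[i] for i in range(len(tasks))]   # back in submission order
X_bc.unpersist()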