def test_check_sample_weight():
    from sklearn.cluster.k_means_ import _check_sample_weight
    sample_weight = None
    checked_sample_weight = _check_sample_weight(X, sample_weight)
    assert_equal(_num_samples(X), _num_samples(checked_sample_weight))
    assert_almost_equal(checked_sample_weight.sum(), _num_samples(X))
    assert_equal(X.dtype, checked_sample_weight.dtype)

def _index_param_value(X, v, indices):
    """Private helper function for parameter value indexing."""
    if not _is_arraylike(v) or _num_samples(v) != _num_samples(X):
        # pass through: skip indexing
        return v
    if sp.issparse(v):
        v = v.tocsr()
    return safe_indexing(v, indices)

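# Usage sketch for _index_param_value (an illustration, not part of the
# original source): array-like fit parameters whose length matches X are
# sliced down to the training indices, while scalars and mismatched values
# pass through unchanged. Assumes the scikit-learn helpers _is_arraylike,
# _num_samples and safe_indexing are importable as used above; the arrays
# below are made up.
import numpy as np

X_demo = np.arange(20).reshape(10, 2)
train_idx = np.array([0, 2, 4, 6, 8])
sample_weight = np.linspace(0.1, 1.0, 10)   # same length as X -> indexed
n_epochs = 5                                # scalar -> passed through

print(_index_param_value(X_demo, sample_weight, train_idx))  # 5 selected weights
print(_index_param_value(X_demo, n_epochs, train_idx))       # 5, unchanged
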
def _fit_and_score(estimator, depthmaps, offset_points_projected,
                   direction_vectors, true_joints, scorer, train, test,
                   verbose, parameters, fit_params,
                   return_train_score=False, return_parameters=False,
                   error_score='raise'):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(depthmaps, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    depth_train, offsets_train, directions_train, truths_train = _safe_split(
        depthmaps, offset_points_projected, direction_vectors, true_joints, train)
    depth_test, offsets_test, directions_test, truths_test = _safe_split(
        depthmaps, offset_points_projected, direction_vectors, true_joints, test)

    try:
        estimator.fit(depth_train, offsets_train, directions_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        test_score = _score(estimator, depth_test, truths_test, scorer)
        if return_train_score:
            train_score = _score(estimator, depth_train, truths_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(depth_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    return ret

def test_retrieve_samples_from_non_standard_shape():
    class TestNonNumericShape:
        def __init__(self):
            self.shape = ("not numeric",)

        def __len__(self):
            return len([1, 2, 3])

    X = TestNonNumericShape()
    assert _num_samples(X) == len(X)

    # check that it gives a good error if there's no __len__
    class TestNoLenWeirdShape:
        def __init__(self):
            self.shape = ("not numeric",)

    with pytest.raises(TypeError, match="Expected sequence or array-like"):
        _num_samples(TestNoLenWeirdShape())

def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    :param X: array-like of shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    :param y: array-like of shape (n_samples,)
        Always ignored, exists for compatibility.
    :param groups: array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset
        into train/test set. Must not be None.
    :returns:
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
    """
    if groups is None:
        raise ValueError("The 'groups' parameter should not be None.")
    X, y, groups = indexable(X, y, groups)
    groups = check_array(groups, ensure_2d=False, dtype=None)
    unique_groups, groups = np.unique(groups, return_inverse=True)
    n_samples_per_group = np.bincount(groups)
    n_groups = len(unique_groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1

    if self.n_splits > n_groups:
        raise ValueError("Cannot have number of splits n_splits=%d greater"
                         " than the number of groups: %d."
                         % (self.n_splits, n_groups))

    indices = np.arange(n_samples)
    test_size = (n_groups // n_folds)
    test_starts = range(test_size + n_groups % n_folds, n_groups, test_size)

    for test_start in test_starts:
        # here we already have groups after inverse operation
        # and don't need to use unique_group
        if self.max_train_size:
            # find how many groups can be included without exceeding max_train_size
            sizes = n_samples_per_group[:test_start][::-1].cumsum()
            appropriate_indices = np.where(sizes <= self.max_train_size)[0]
            if appropriate_indices.size == 0:
                train_start = max(test_start - 1, 0)
            else:
                train_start = test_start - appropriate_indices.max() - 1
            yield (indices[(groups < test_start) & (groups >= train_start)],
                   indices[(groups >= test_start) &
                           (groups < test_start + test_size)])
        else:
            yield (indices[groups < test_start],
                   indices[(groups >= test_start) &
                           (groups < test_start + test_size)])

def _fit_and_score(estimator, Z, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in list(parameters.items())))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    fit_params = fit_params if fit_params is not None else {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train = Z[train]
    Z_test = Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(Z_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    return ret

def split(self, X, y=None, groups=None):
    n_samples = _num_samples(X)
    indices = np.arange(n_samples)
    # all events which are true are considered to be rare
    rare_event_indices = indices[np.sum(y, axis=tuple(range(1, y.ndim))) >= 0.999]
    for f, (train_idx, test_idx) in enumerate(super().split(X, y, groups)):
        yield (np.hstack([train_idx, rare_event_indices]),
               np.hstack([test_idx, rare_event_indices]))

def _check_chunk_size(reduced, chunk_size):
    """Checks chunk is a sequence of expected size or a tuple of same."""
    is_tuple = isinstance(reduced, tuple)
    if not is_tuple:
        reduced = (reduced,)
    if any(isinstance(r, tuple) or not hasattr(r, '__iter__')
           for r in reduced):
        raise TypeError('reduce_func returned %r. '
                        'Expected sequence(s) of length %d.'
                        % (reduced if is_tuple else reduced[0], chunk_size))
    if any(_num_samples(r) != chunk_size for r in reduced):
        # XXX: we use int(_num_samples...) because sometimes _num_samples
        # returns a long in Python 2, even for small numbers.
        actual_size = tuple(int(_num_samples(r)) for r in reduced)
        raise ValueError('reduce_func returned object of length %s. '
                         'Expected same length as input: %d.'
                         % (actual_size if is_tuple else actual_size[0],
                            chunk_size))

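# Hedged usage sketch for _check_chunk_size: it validates what a reduce_func
# returned for one chunk of pairwise_distances_chunked-style processing.
# Assumes _num_samples is in scope as above; the arrays here are illustrative.
import numpy as np

_check_chunk_size(np.arange(4), 4)      # one value per row of a 4-row chunk: OK

try:
    _check_chunk_size(np.arange(3), 4)  # wrong length for the chunk
except ValueError as exc:
    print(exc)                          # "reduce_func returned object of length 3. ..."
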
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)  # pylint: disable=unbalanced-tuple-unpacking
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    delay = self.delay

    if n_folds > n_samples:
        raise ValueError(
            f'Cannot have number of folds={n_folds} greater than the number of samples: {n_samples}.'
        )

    indices = np.arange(n_samples)
    split_size = n_samples // n_folds

    train_size = self.train_size or split_size * self.n_splits
    test_size = self.test_size or n_samples // n_folds
    full_test = test_size + delay

    if full_test + n_splits > n_samples:
        raise ValueError(
            f'test_size\\({test_size}\\) + delay\\({delay}\\) = {test_size + delay} + '
            f'n_splits={n_splits} \n'
            f' greater than the number of samples: {n_samples}. Cannot create fold logic.'
        )

    # Generate logic for splits.
    # Overwrite fold test_starts ranges if force_step_size is specified.
    if self.force_step_size:
        step_size = self.force_step_size
        final_fold_start = n_samples - (train_size + full_test)
        range_start = (final_fold_start % step_size) + train_size
        test_starts = range(range_start, n_samples, step_size)
    else:
        if not self.train_size:
            step_size = split_size
            range_start = (split_size - full_test) + split_size + (n_samples % n_folds)
        else:
            step_size = (n_samples - (train_size + full_test)) // n_folds
            final_fold_start = n_samples - (train_size + full_test)
            range_start = (final_fold_start - (step_size * (n_splits - 1))) + train_size
        test_starts = range(range_start, n_samples, step_size)

    # Generate data splits.
    for test_start in test_starts:
        idx_start = test_start - train_size if self.train_size is not None else 0
        # Ensure we always return a test set of the same size
        if indices[test_start:test_start + full_test].size < full_test:
            continue
        yield (indices[idx_start:test_start],
               indices[test_start + delay:test_start + full_test])

def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        The target variable for supervised learning problems.

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Returns
    -------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    if groups is not None:
        # find all indices that are at the beginning of a group
        groups_unique = np.unique(groups)
        possible_test_start = [
            np.where(i == groups)[0][0] for i in np.nditer(groups_unique)
        ]
        possible_test_start = np.asarray(possible_test_start)

    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    if n_folds > n_samples:
        raise ValueError(("Cannot have number of folds ={0} greater"
                          " than the number of samples: {1}.").format(
                              n_folds, n_samples))
    indices = np.arange(n_samples)
    test_size = (n_samples // n_folds)
    test_starts = range(test_size + n_samples % n_folds,
                        n_samples, test_size)

    if groups is not None:
        # find all possible starts that are closest to predefined test_starts
        test_starts = [
            possible_test_start[np.abs(possible_test_start - i).argmin()]
            for i in test_starts
        ]

    for test_start in test_starts:
        yield (indices[:test_start],
               indices[test_start:test_start + test_size])

def score_each_boost(estimator, parameters, min_n_estimators,
                     X, y, sample_weight,
                     score_func, train, test, verbose):
    """Run fit on one set of parameters.

    Returns the score and the instance of the classifier.
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.items()))
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        if i + 1 < min_n_estimators:
            continue
        score = score_func(y_test, y_pred, **test_score_params)
        all_scores.append(score)
        clf_para = copy(parameters)
        clf_para['n_estimators'] = i + 1
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early
    if len(all_scores) < estimator.n_estimators - min_n_estimators + 1:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        for i in range(len(all_scores),
                       estimator.n_estimators - min_n_estimators + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = i + 1
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return all_scores, all_clf_params, n_test_samples

def predict_proba(self, X):
    """Predict class probabilities for X.

    The predicted class probabilities of an input sample are computed as
    the mean predicted class probabilities of the trees in the forest.
    The class probability of a single tree is the fraction of samples of
    the same class in a leaf.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally, its dtype will be converted to
        ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes), or a list of n_outputs
        such arrays if n_outputs > 1.
        The class probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    if sklearn_check_version("1.0"):
        self._check_feature_names(X, reset=False)

    if hasattr(self, 'n_features_in_'):
        try:
            num_features = _daal_num_features(X)
        except TypeError:
            num_features = _num_samples(X)
        if num_features != self.n_features_in_:
            raise ValueError(
                (f'X has {num_features} features, '
                 f'but RandomForestClassifier is expecting '
                 f'{self.n_features_in_} features as input'))

    _patching_status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestClassifier.predict_proba")
    _dal_ready = _patching_status.and_conditions([
        (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
        (daal_check_version((2021, 'P', 400)),
         "oneDAL version is lower than 2021.4.")])
    if hasattr(self, 'n_outputs_'):
        _dal_ready = _patching_status.and_conditions([
            (self.n_outputs_ == 1,
             f"Number of outputs ({self.n_outputs_}) is not 1.")])
    _patching_status.write_log()

    if not _dal_ready:
        return super(RandomForestClassifier, self).predict_proba(X)

    X = check_array(X, dtype=[np.float64, np.float32])
    check_is_fitted(self)
    if sklearn_check_version('0.23'):
        self._check_n_features(X, reset=False)
    return _daal_predict_proba(self, X)

def test_retrieve_samples_from_non_standard_shape():
    class TestNonNumericShape:
        def __init__(self):
            self.shape = ("not numeric",)

        def __len__(self):
            return len([1, 2, 3])

    X = TestNonNumericShape()
    assert _num_samples(X) == len(X)

def _iter_test_indices(self, X, y=None, groups=None):
    if self.random_state is not None:
        random.seed(self.random_state)
    n_samples = _num_samples(X)
    for i in random.sample(range(n_samples), self.iters):
        yield np.array(i)

def predict(self, X):
    n_samples = _num_samples(X)
    maxima = np.empty(n_samples, dtype=float)
    maxima.fill(-np.inf)
    argmaxima = np.zeros(n_samples, dtype=int)
    for i, e in enumerate(self.estimators):
        pred = np.ravel(e.decision_function(X))
        np.maximum(maxima, pred, out=maxima)
        argmaxima[maxima == pred] = i
    return self.classes[np.array(argmaxima.T)]

def __check_validity(self, X, y=None, groups=None):
    if X is None:
        raise ValueError("The 'X' parameter should not be None.")
    n_samples = _num_samples(X)
    gap_before, gap_after = self.gap_before, self.gap_after
    if (0 >= n_samples - gap_after - self.p and
            gap_before >= n_samples - self.p):
        raise ValueError("Not enough training samples available.")
    if n_samples - gap_after - self.p <= gap_before + 1:
        warnings.warn(SINGLETON_WARNING, Warning)

def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    train_size = round((1 - self.test_size) * n_samples)
    train_index = np.arange(train_size - self.n_reduce)
    test_index = np.arange(train_size, n_samples)
    yield train_index, test_index

def split(self, X, y=None, labels=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, of length n_samples
        The target variable for supervised learning problems. ignored

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set. ignored

    Returns
    -------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    n = _num_samples(X)
    n_slices = self.n_folds + 1
    # loop from the first 2 folds to the total number of folds
    for i in range(2, n_slices + 1):
        # the split is the percentage at which to split the folds into train
        # and test. For example when i = 2 we are taking the first 2 folds out
        # of the total available. In this specific case we have to split the
        # two of them in half (train on the first, test on the second),
        # so split = 1/2 = 0.5 = 50%. When i = 3 we are taking the first 3
        # folds out of the total available, meaning that we have to split the
        # three of them in two at split = 2/3 = 0.66 = 66% (train on the
        # first 2 and test on the following)
        split = float(i - 1) / i
        # as we loop over the folds X and y are updated and increase in size.
        # This is the data that is going to be split and it increases in size
        # in the loop as we account for more folds. If k = 300, with i
        # starting from 2 the result is the following in the loop
        # i = 2
        # X = X_train[:(600)]
        # y = y_train[:(600)]
        #
        # i = 3
        # X = X_train[:(900)]
        # y = y_train[:(900)]
        # ....
        n_sub = int(np.floor(float(n * i) / n_slices))
        subset = range(0, n_sub)
        # X and y contain both the folds to train and the fold to test.
        # index is the integer telling us where to split, according to the
        # split percentage we have set above
        n_train = int(np.floor(n_sub * split))
        train_index = np.arange(0, n_train)
        test_index = np.arange(n_train, n_sub)
        yield train_index, test_index

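# A small trace of the fold boundaries produced by the growing-window split
# above, for n = 300 samples and n_folds = 2 (so n_slices = 3). This only
# mirrors the index arithmetic of split(); the sample counts are assumptions
# chosen for illustration.
import numpy as np

n, n_folds = 300, 2
n_slices = n_folds + 1
for i in range(2, n_slices + 1):
    split = float(i - 1) / i
    n_sub = int(np.floor(float(n * i) / n_slices))
    n_train = int(np.floor(n_sub * split))
    print("fold %d: train = [0, %d), test = [%d, %d)" % (i - 1, n_train, n_train, n_sub))
# fold 1: train = [0, 100), test = [100, 200)
# fold 2: train = [0, 200), test = [200, 300)
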
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = validation._num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    indices = np.arange(n_samples)
    test_size = (n_samples // n_folds)
    train_starts = range(0, n_samples - 2 * test_size, test_size)
    for train_start in train_starts:
        yield (indices[train_start:-test_size],
               indices[-test_size:])

def fit(self, X, y):
    """Actual fitting, performing the search over parameters."""
    parameter_iterable = ParameterSampler(self.param_distributions,
                                          self.n_iter,
                                          random_state=self.random_state)
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(
        delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                  parameters, cv=cv)
        for parameters in parameter_iterable)

    best = sorted(out, reverse=True)[0]
    self.best_params_ = best[1]
    self.best_score_ = best[0]

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best[1])
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self

def predict(self, X):
    check_is_fitted(self, 'estimators_')
    n_samples = _num_samples(X)
    maxima = np.empty(n_samples, dtype=float)
    maxima.fill(-np.inf)
    argmaxima = np.zeros(n_samples, dtype=int)
    for i, e in enumerate(self.estimators_):
        pred = _predict_binary(e, X)
        np.maximum(maxima, pred, out=maxima)
        argmaxima[maxima == pred] = i
    return self.classes_[np.array(argmaxima.T)]

def mse_variance(y, y_pred, **kwargs):
    """MSE variance estimator. **kwargs for compatibility."""
    m = _num_samples(y)
    mse_hat = mean_squared_error(y, y_pred)
    loss = (y - y_pred) ** 2
    mse_var = sum((mse_hat - loss) ** 2) / (m * (m - 1))
    return mse_var

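# Minimal usage sketch for mse_variance, assuming 1-D NumPy inputs and that
# _num_samples / mean_squared_error are imported as in the snippet above.
# The numbers are illustrative only.
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([3.0, 0.5, 2.0, 7.0])
y_hat = np.array([2.5, 0.0, 2.0, 8.0])

print(mean_squared_error(y_true, y_hat))  # point estimate of the MSE
print(mse_variance(y_true, y_hat))        # estimated sampling variance of that MSE
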
def _iter_test_indices(self, X, y=None, groups=None):
    self.__check_validity(X, y, groups)
    n_samples = _num_samples(X)
    gap_before, gap_after = self.gap_before, self.gap_after
    if n_samples - gap_after - self.p >= gap_before + 1:
        for i in range(n_samples - self.p + 1):
            yield np.arange(i, i + self.p)
    else:
        for i in range(n_samples - gap_after - self.p):
            yield np.arange(i, i + self.p)
        for i in range(gap_before + 1, n_samples - self.p + 1):
            yield np.arange(i, i + self.p)

def _compute_chunked_score_samples(iforest, tree_idx, X):
    n_samples = _num_samples(X)

    if iforest._max_features == X.shape[1]:
        subsample_features = False
    else:
        subsample_features = True

    chunk_n_rows = get_chunk_n_rows(row_bytes=16 * iforest._max_features,
                                    max_n_rows=n_samples)
    slices = gen_batches(n_samples, chunk_n_rows)

    scores = np.zeros(n_samples, order="f")

    for sl in slices:
        scores[sl] = _compute_score_samples_single_tree(
            iforest, tree_idx, X[sl], subsample_features)

    return scores

def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                         verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                         method='predict'):
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'.format(method))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(
        delayed(_my_fit_and_predict)(clone(estimator), X, y, train, test,
                                     verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate(
        [indices_i for _, indices_i, _ in prediction_blocks])
    scores = np.concatenate([score_i for _, _, score_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)

    out_predictions = predictions[inv_test_indices]
    return out_predictions.reshape(y.shape), scores

def _iter_indices(self, X, y=None, groups=None):
    n_samples = _num_samples(self.mapping)
    n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                              self.train_size)
    rng = check_random_state(self.random_state)
    for _ in range(self.n_splits):
        # random partition
        permutation = rng.permutation(n_samples)
        ind_test = []
        for it in permutation[:n_test]:
            ind_test.extend(self.mapping[it])
        ind_train = []
        for it in permutation[n_test:(n_test + n_train)]:
            ind_train.extend(self.mapping[it])
        yield ind_train, ind_test

def data_sample(n_folds=5, frac=0.2, X=None, y=None, groups=None, oob=True,
                random_state=0):
    """Split the dataset into several parts for model training.

    n_folds: if an int, draw n_folds bootstrap samples;
        if an object with a ``split`` method, call its ``split`` function
        and keep the validation part of each split.
    frac: fraction of samples to draw; only used when n_folds is an int;
        must be between 0 and 1.
    X: X data.
    y: y data.
    groups: required when cross-validating with a custom grouping,
        e.g. the LeaveOneGroupOut splitter.
    oob: whether to also return the out-of-bag indices.
    random_state: random seed.
    return: index_list, a list of n index arrays.
    """
    train_index_list = []
    oob_index_list = []
    num_samples = _num_samples(X)
    np.random.seed(random_state)
    if isinstance(n_folds, int):
        if frac is None:
            batch_size = round(num_samples / n_folds)
        elif frac >= 0 and frac <= 1:
            batch_size = round(num_samples * frac)
        else:
            raise ValueError(
                "expected frac to be a float between 0 and 1 but got {0}".
                format(frac))
        for i in range(n_folds):
            train_index = np.random.choice(num_samples, batch_size,
                                           replace=True)
            oob_index = [i for i in range(num_samples)
                         if i not in train_index]
            train_index_list.append(train_index)
            oob_index_list.append(oob_index)
    elif hasattr(n_folds, 'split'):
        for fold_n, (train_index, valid_index) in enumerate(
                n_folds.split(X, y, groups)):
            train_index_list.append(valid_index)
            oob_index_list.append(train_index)
    if oob:
        return train_index_list, oob_index_list
    else:
        return train_index_list

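# Hedged usage sketch for data_sample: the first call draws 5 bootstrap index
# sets covering 20% of the rows each (plus their out-of-bag complements); the
# second reuses the validation folds of an existing splitter. The data shapes
# below are assumptions for illustration.
import numpy as np
from sklearn.model_selection import KFold

X_demo = np.arange(100).reshape(50, 2)
y_demo = np.arange(50)

train_parts, oob_parts = data_sample(n_folds=5, frac=0.2, X=X_demo, y=y_demo)
train_parts, oob_parts = data_sample(n_folds=KFold(n_splits=5), X=X_demo, y=y_demo)
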
def _iter_test_indices(self, X, y=None, groups=None):
    n_samples = _num_samples(X)
    indices = np.arange(n_samples)
    if self.shuffle:
        check_random_state(self.random_state).shuffle(indices)

    n_splits = self.n_splits
    fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
    fold_sizes[:n_samples % n_splits] += 1
    current = 0
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        yield indices[start:stop]
        current = stop

def _iter_test_indices(self, X=None, y=None, groups=None):
    n_samples = _num_samples(X)
    _ks = _KennardStone()
    indices = _ks._get_indexes(X)

    n_splits = self.n_splits
    fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
    fold_sizes[:n_samples % n_splits] += 1
    current = 0
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        yield indices[start:stop]
        current = stop

def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups : array-like, with shape (n_samples,)
        Always ignored, exists for compatibility.

    Yields
    ------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    gap_size = self.gap_size
    test_size = self.test_size if self.test_size else n_samples // n_folds

    # Make sure we have enough samples for the given split parameters
    if n_folds > n_samples:
        raise ValueError(
            ("Cannot have number of folds ={0} greater"
             " than the number of samples: {1}.").format(n_folds, n_samples))
    if n_samples - gap_size - (test_size * n_splits) <= 0:
        raise ValueError(
            ("Too many splits ={0} for number of samples"
             " ={1} with test_size ={2} and gap_size ={3}."
             "").format(n_splits, n_samples, test_size, gap_size))

    indices = np.arange(n_samples)
    test_starts = range(n_samples - n_splits * test_size,
                        n_samples, test_size)

    for test_start in test_starts:
        train_end = test_start - gap_size
        if self.max_train_size and self.max_train_size < train_end:
            yield (indices[train_end - self.max_train_size:train_end],
                   indices[test_start:test_start + test_size])
        else:
            yield (indices[:train_end],
                   indices[test_start:test_start + test_size])

def transform(self, y):
    check_is_fitted(self, 'classes_')
    # y = column_or_1d(y, warn=True)
    if _num_samples(y) == 0:
        return np.array([])

    indices = np.isin(y, self.classes_)
    if not self.ignore_unknown and not np.all(indices):
        raise ValueError("y contains new labels: %s"
                         % str(np.setdiff1d(y, self.classes_)))

    y_transformed = np.searchsorted(self.classes_, y)
    y_transformed[~indices] = self.unknown_encoded_value
    return y_transformed

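# Illustration of the transform() above inside a LabelEncoder-like class.
# The class name TolerantLabelEncoder and its constructor arguments are
# hypothetical stand-ins for whatever the surrounding class actually provides
# (a fit() that sets classes_, plus ignore_unknown and unknown_encoded_value).
enc = TolerantLabelEncoder(ignore_unknown=True, unknown_encoded_value=-1)
enc.fit(["cat", "dog", "mouse"])
print(enc.transform(["dog", "cat", "lizard"]))  # -> array([ 1,  0, -1])
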
def predict(self, X):
    """Applying multiple estimators for prediction.

    Args:
        X (numpy.ndarray): input points
    """
    n_samples = _num_samples(X)
    maxima = np.empty(n_samples, dtype=float)
    maxima.fill(-np.inf)
    argmaxima = np.zeros(n_samples, dtype=int)
    for i, e in enumerate(self.estimators):
        pred = np.ravel(e.decision_function(X))
        np.maximum(maxima, pred, out=maxima)
        argmaxima[maxima == pred] = i
    return self.classes[np.array(argmaxima.T)]

def __init__(self, *arrays, batch_size, shuffle=False, to_dense=True):
    self.arrays = arrays
    if self.n_arrays == 0:
        raise ValueError('At least one data array is required')
    for array in arrays[1:]:
        if _num_samples(array) != self.n_samples:
            raise ValueError(
                'Input arrays must have the same number of elements')
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.to_dense = to_dense
    self.reset()

def _iter_indices(self, X, y=None, groups=None):
    _ks = _KennardStone()
    inds = _ks._get_indexes(X)

    n_samples = _num_samples(X)
    n_train, n_test = _validate_shuffle_split(
        n_samples, self.test_size, self.train_size,
        default_test_size=self._default_test_size)

    for _ in range(self.n_splits):
        ind_test = inds[:n_test]
        ind_train = inds[n_test:(n_test + n_train)]
        yield ind_train, ind_test

def _iter_test_indices(self, X, y, groups=None):
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    y = np.asarray(y)
    sorted_index = np.argsort(y)
    if self.shuffle:
        current = 0
        rng = check_random_state(self.random_state)
        for i in range(n_samples // int(n_splits)):
            start, stop = current, current + n_splits
            rng.shuffle(sorted_index[start:stop])
            current = stop
        rng.shuffle(sorted_index[current:])
    for i in range(n_splits):
        yield sorted_index[i:n_samples:n_splits]

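# Sketch of the sorted-stratification idea used above for a regression target:
# without shuffling, consecutive ranks are dealt across folds, so each fold
# spans the full range of y. The toy target below is an assumption for
# illustration only.
import numpy as np

y_toy = np.array([10, 2, 7, 1, 9, 4, 8, 3, 6, 5])
sorted_index = np.argsort(y_toy)
n_splits = 3
for i in range(n_splits):
    print(i, np.sort(y_toy[sorted_index[i::n_splits]]))
# 0 [ 1  4  7 10]
# 1 [2 5 8]
# 2 [3 6 9]
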
def check_holdout(holdout, X, y, classifier=True):
    is_sparse = sp.issparse(X)
    if holdout is None:
        holdout = 0.8
    if isinstance(holdout, numbers.Integral):
        if classifier:
            if type_of_target(y) in ['binary', 'multiclass']:
                holdout = StratifiedShuffleSplit(y, train_size=holdout)
            else:
                holdout = ShuffleSplit(_num_samples(y), train_size=holdout)
        else:
            if not is_sparse:
                n_samples = len(X)
            else:
                n_samples = X.shape[0]
            holdout = ShuffleSplit(n_samples, train_size=holdout)
    return holdout

def check_cv_coverage(cv, X, y, labels, expected_n_iter=None):
    n_samples = _num_samples(X)
    # Check that all the samples appear at least once in a test fold
    if expected_n_iter is not None:
        assert_equal(cv.get_n_splits(X, y, labels), expected_n_iter)
    else:
        expected_n_iter = cv.get_n_splits(X, y, labels)

    collected_test_samples = set()
    iterations = 0
    for train, test in cv.split(X, y, labels):
        check_valid_split(train, test, n_samples=n_samples)
        iterations += 1
        collected_test_samples.update(test)

    # Check that the accumulated test samples cover the whole dataset
    assert_equal(iterations, expected_n_iter)
    if n_samples is not None:
        assert_equal(collected_test_samples, set(range(n_samples)))

def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                         verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                         method='predict'):
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'.format(method))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(
        delayed(_my_fit_and_predict)(clone(estimator), X, y, train, test,
                                     verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate(
        [indices_i for _, indices_i, _ in prediction_blocks])
    scores = np.concatenate([score_i for _, _, score_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    return predictions[inv_test_indices], scores

def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if not self.dataset_filenames:
        self.save_dataset_filename(X, y, cv)

    dataset_filenames = self.dataset_filenames

    client = Client()
    lb_view = client.load_balanced_view()

    if self.verbose > 0:
        print("Number of CPU core %d" % len(client.ids()))

    self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params)
                    for dataset_filename in dataset_filenames], params)
                  for params in parameter_iterable]
    if self.sync:
        self.wait()
        self.set_grid_scores()
        self.set_best_score_params()

        if self.refit:
            self.set_best_estimator(estimator)
    return self

def predict(self, X):
    neighbors = self.nbrs.kneighbors(X, self.n_neighbors,
                                     return_distance=False)
    neighbors_set = get_neighbors_above_threshold(self._fit_y, neighbors[0],
                                                  self.threshold)

    check_is_fitted(self, 'estimators_')
    if (hasattr(self.estimators_[0], "decision_function") and
            is_classifier(self.estimators_[0])):
        thresh = 0
    else:
        thresh = .5

    n_samples = _num_samples(X)
    if self.label_binarizer_.y_type_ == "multiclass":
        maxima = np.empty(n_samples, dtype=float)
        maxima.fill(-np.inf)
        argmaxima = np.zeros(n_samples, dtype=int)
        for i, e in enumerate(self.estimators_):
            if i not in neighbors_set:
                continue
            pred = _predict_binary(e, X)
            np.maximum(maxima, pred, out=maxima)
            argmaxima[maxima == pred] = i
        return self.label_binarizer_.classes_[np.array(argmaxima.T)]
    else:
        indices = array.array('i')
        indptr = array.array('i', [0])
        for i, e in enumerate(self.estimators_):
            if i not in neighbors_set:
                continue
            indices.extend(np.where(_predict_binary(e, X) > thresh)[0])
            indptr.append(len(indices))
        data = np.ones(len(indices), dtype=int)
        indicator = sp.csc_matrix((data, indices, indptr),
                                  shape=(n_samples, len(self.estimators_)))
        return self.label_binarizer_.inverse_transform(indicator)

def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = estimator.score(X_test, y_test)

    scoring_time = time.time() - start_time

    ret = [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret

def _fit(self, X, y, parameter_dict):
    self._cv_results = None  # flag that cached CV results need refreshing
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    # check that the X and y lengths match
    if _num_samples(y) != n_samples:
        raise ValueError('Target [y] and data [X] lengths do not match')

    self.cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

    toolbox = base.Toolbox()

    # name_values = list of parameters, gene_type = [1: categorical; 2: numeric],
    # maxints = size(parameters)
    name_values, self.gene_type, maxints = _get_param_types_maxint(parameter_dict)
    if self.verbose:
        print("Types: %s, ranges: %s" % (self.gene_type, maxints))

    # register the individual-creation function
    toolbox.register("individual", _initIndividual, creator.Individual,
                     maxints=maxints)
    # register the population-creation function
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    # parallelism: create the worker pool
    if not isinstance(self.n_jobs, int):
        self.n_jobs = 1
    pool = Pool(self.n_jobs)
    toolbox.register("map", pool.map)

    # register the evaluation function
    toolbox.register("evaluate", _evalFunction,
                     name_values=name_values, X=X, y=y,
                     scorer=self.scorer_, cv=self.cv, uniform=self.uniform,
                     verbose=self.verbose, error_score=self.error_score,
                     fit_params=self.fit_params, score_cache=self.score_cache)
    # register the crossover function
    toolbox.register("mate", _cxIndividual,
                     prob_cruce=self.gene_crossover_prob,
                     gene_type=self.gene_type)
    # register the mutation function
    toolbox.register("mutate", _mutIndividual,
                     prob_mutacion=self.gene_mutation_prob, maxints=maxints)
    # register the selection function
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    # create the population
    pop = toolbox.population(n=self.population_size)
    # best individual seen so far
    hof = tools.HallOfFame(1)

    # stats
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.nanmean)
    stats.register("min", np.nanmin)
    stats.register("max", np.nanmax)
    stats.register("std", np.nanstd)

    # genealogy
    hist = tools.History()

    # decorate the variation operators
    toolbox.decorate("mate", hist.decorator)
    toolbox.decorate("mutate", hist.decorator)
    hist.update(pop)

    # possible combinations
    if self.verbose:
        print('--- Evolve in {0} possible combinations ---'.format(
            np.prod(np.array(maxints) + 1)))

    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                       ngen=self.generations_number,
                                       stats=stats, halloffame=hof,
                                       verbose=self.verbose)
    # pop, logbook = algorithms.eaGenerateUpdate(toolbox,
    #     ngen=self.generations_number, stats=stats,
    #     halloffame=hof, verbose=self.verbose)
    print(logbook)

    # Save history
    self.all_history_ = hist
    self.all_logbooks_ = logbook

    # best score and parameters
    current_best_score_ = hof[0].fitness.values[0]
    current_best_params_ = _individual_to_params(hof[0], name_values)
    if self.verbose:
        print("Best individual is: %s\nwith fitness: %s" % (
            current_best_params_, current_best_score_))

    if current_best_score_ > self.best_mem_score_:
        self.best_mem_score_ = current_best_score_
        self.best_mem_params_ = current_best_params_

    # end of the parallel section: close the pool
    pool.close()
    # pool.join()

    self.best_score_ = current_best_score_
    self.best_params_ = current_best_params_

def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

    self.scorer_ = _deprecate_loss_and_score_funcs(
        self.loss_func, self.score_func, self.scoring)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
            delayed(fit_grid_point_extended)(
                X, y, base_estimator, parameters, train, test,
                self.scorer_, self.verbose, **self.fit_params)
            for parameters in parameter_iterable
            for train, test in cv)

    # out = []
    # for parameters in parameter_iterable:
    #     fold = 1
    #     for train, test in cv:
    #         print "Processing fold", fold, self.fit_params
    #         out.append(fit_grid_point_extended(X, y, base_estimator,
    #                                            parameters, train, test,
    #                                            self.scorer_, self.verbose,
    #                                            **self.fit_params))
    #         fold += 1

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_extras = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        all_extras = list()
        for this_score, parameters, this_n_test_samples, extra in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            all_extras.append(extra)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
        grid_extras.append(all_extras)
    # Store the computed scores
    self.grid_scores_ = grid_scores
    self.extras_ = grid_extras

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self

def fit_grid_point_extended(X, y, base_estimator, parameters, train, test,
                            scorer, verbose, loss_func=None, extraOut="auto",
                            **fit_params):
    """Run fit on one set of parameters.

    Parameters
    ----------
    X : array-like, sparse matrix or list
        Input data.

    y : array-like or None
        Targets for input data.

    base_estimator : estimator object
        This estimator will be cloned and then fitted.

    parameters : dict
        Parameters to be set on base_estimator clone for this grid point.

    train : ndarray, dtype int or bool
        Boolean mask or indices for training set.

    test : ndarray, dtype int or bool
        Boolean mask or indices for test set.

    scorer : callable or None
        If provided must be a scorer callable object / function with
        signature ``scorer(estimator, X, y)``.

    verbose : int
        Verbosity level.

    **fit_params : kwargs
        Additional parameter passed to the fit function of the estimator.

    Returns
    -------
    score : float
        Score of this parameter setting on given training / test split.

    parameters : dict
        The parameters that have been evaluated.

    n_samples_test : int
        Number of test samples in this split.
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.items()))
        print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    clf = clone(base_estimator)
    clf.set_params(**parameters)

    if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        if getattr(base_estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
    else:
        if getattr(base_estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[safe_mask(X, train)]
            X_test = X[safe_mask(X, test)]

    if y is not None:
        y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
        clf.fit(X_train, y_train, **fit_params)

        if scorer is not None:
            this_score = scorer(clf, X_test, y_test)
        else:
            this_score = clf.score(X_test, y_test)
    else:
        clf.fit(X_train, **fit_params)
        if scorer is not None:
            this_score = scorer(clf, X_test)
        else:
            this_score = clf.score(X_test)

    if not isinstance(this_score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s)"
                         " instead." % (str(this_score), type(this_score)))

    if verbose > 2:
        msg += ", score=%f" % this_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() - start_time))
        print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    extraRVs = {}
    if extraOut is not None:
        if "estimator" in extraOut:
            extraRVs["estimator"] = clf
        if extraOut == "auto" or "predictions" in extraOut:
            predictions = clf.predict(X)
            predictionIndex = 0
            predictionByIndex = {}
            for exampleIndex in safe_mask(X, test):
                predictionByIndex[exampleIndex] = predictions[predictionIndex]
                predictionIndex += 1
            extraRVs["predictions"] = predictionByIndex
        if (extraOut == "auto" or "importances" in extraOut) and \
                hasattr(clf, "feature_importances_"):
            extraRVs["importances"] = clf.feature_importances_

    rvs = [this_score, parameters, _num_samples(X_test), extraRVs]
    return rvs

def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise'):
    """Fit estimator and compute scores for a given dataset split."""
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [_score(estimator, X_train, y_train, s)
                           for s in scorer]

    if verbose > 2:
        msg += ", score=".join(('%f' % ts for ts in test_score))
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret

def _fit(self, X, y):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # setup SigOpt experiment and run optimization
    self._create_sigopt_exp()
    for jk in range(self.n_iter):
        suggestion = self.conn.experiments(self.experiment.id).suggestions().create()
        parameters = suggestion.assignments.to_json()

        # convert all unicode names and values to plain strings
        non_unicode_parameters = self._convert_unicode_dict(parameters)

        if self.verbose > 0:
            print("Evaluating params :", non_unicode_parameters)

        # do CV folds in parallel using joblib
        # returns scores on test set
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(_fit_and_score)(clone(base_estimator), X, y,
                                    self.scorer_, train, test,
                                    self.verbose, non_unicode_parameters,
                                    self.fit_params, return_parameters=True,
                                    error_score=self.error_score)
            for train, test in cv)

        # grab scores from results
        scores = [o[0] for o in out]
        self.conn.experiments(self.experiment.id).observations().create(
            suggestion=suggestion.id,
            value=numpy.mean(scores),
            value_stddev=numpy.std(scores)
        )

    # return best SigOpt observation so far
    best_obs = self.conn.experiments(self.experiment.id).fetch().progress.best_observation
    self.best_params_ = best_obs.assignments.to_json()
    # convert all unicode names and values to plain strings
    self.best_params_ = self._convert_unicode_dict(self.best_params_)
    self.best_score_ = best_obs.value

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**self.best_params_)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator

def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, error_score='raise',
                            extraOut="auto"):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        if extraOut == "auto" or "predictions" in extraOut:
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            probabilityByIndex = {}
            for exampleIndex, prediction in zip(test, probabilities):
                probabilityByIndex[exampleIndex] = prediction
            extraRVs["probabilities"] = probabilityByIndex
        if (extraOut == "auto" or "importances" in extraOut) and \
                hasattr(estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)
    return ret

def check_sample_weight_invariance(name, metric, y1, y2):
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 10, size=len(y1))

    # check that unit weights gives the same score as no weight
    unweighted_score = metric(y1, y2, sample_weight=None)
    assert_allclose(
        unweighted_score,
        metric(y1, y2, sample_weight=np.ones(shape=len(y1))),
        err_msg="For %s sample_weight=None is not equivalent to "
                "sample_weight=ones" % name)

    # check that the weighted and unweighted scores are unequal
    weighted_score = metric(y1, y2, sample_weight=sample_weight)

    # use context manager to supply custom error message
    with assert_raises(AssertionError) as cm:
        assert_allclose(unweighted_score, weighted_score)
        cm.msg = ("Unweighted and weighted scores are unexpectedly almost "
                  "equal (%s) and (%s) for %s"
                  % (unweighted_score, weighted_score, name))

    # check that sample_weight can be a list
    weighted_score_list = metric(y1, y2,
                                 sample_weight=sample_weight.tolist())
    assert_allclose(
        weighted_score, weighted_score_list,
        err_msg=("Weighted scores for array and list "
                 "sample_weight input are not equal (%s != %s) for %s") % (
                     weighted_score, weighted_score_list, name))

    # check that integer weights is the same as repeated samples
    repeat_weighted_score = metric(
        np.repeat(y1, sample_weight, axis=0),
        np.repeat(y2, sample_weight, axis=0),
        sample_weight=None)
    assert_allclose(
        weighted_score, repeat_weighted_score,
        err_msg="Weighting %s is not equal to repeating samples" % name)

    # check that ignoring a fraction of the samples is equivalent to setting
    # the corresponding weights to zero
    sample_weight_subset = sample_weight[1::2]
    sample_weight_zeroed = np.copy(sample_weight)
    sample_weight_zeroed[::2] = 0
    y1_subset = y1[1::2]
    y2_subset = y2[1::2]
    weighted_score_subset = metric(y1_subset, y2_subset,
                                   sample_weight=sample_weight_subset)
    weighted_score_zeroed = metric(y1, y2,
                                   sample_weight=sample_weight_zeroed)
    assert_allclose(
        weighted_score_subset, weighted_score_zeroed,
        err_msg=("Zeroing weights does not give the same result as "
                 "removing the corresponding samples (%s != %s) for %s"
                 % (weighted_score_zeroed, weighted_score_subset, name)))

    if not name.startswith('unnormalized'):
        # check that the score is invariant under scaling of the weights by a
        # common factor
        for scaling in [2, 0.3]:
            assert_allclose(
                weighted_score,
                metric(y1, y2, sample_weight=sample_weight * scaling),
                err_msg="%s sample_weight is not invariant "
                        "under scaling" % name)

    # Check that if number of samples in y_true and sample_weight are not
    # equal, meaningful error is raised.
    error_message = ("Found input variables with inconsistent numbers of "
                     "samples: [{}, {}, {}]".format(
                         _num_samples(y1), _num_samples(y2),
                         _num_samples(sample_weight) * 2))
    assert_raise_message(ValueError, error_message, metric, y1, y2,
                         sample_weight=np.hstack([sample_weight,
                                                  sample_weight]))

def fit(self, X, y=None, x_is_index=False, X_name='X', y_name='y'):
    """Actual fitting, performing the search over parameters."""
    parameter_iterable = ParameterGrid(self.param_grid)

    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    # out = Parallel(
    #     n_jobs=self.n_jobs, verbose=self.verbose,
    #     pre_dispatch=pre_dispatch
    # )(
    #     delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    #                             train, test, self.verbose, parameters,
    #                             self.fit_params, return_parameters=True,
    #                             error_score=self.error_score)
    #     for parameters in parameter_iterable
    #     for train, test in cv)

    train_test_parameters = ((train, test, parameters)
                             for parameters in parameter_iterable
                             for train, test in cv)

    length = len(parameter_iterable) * len(cv)

    if x_is_index:
        X_to_pass = X
        y_to_pass = None
    else:
        X_to_pass = None
        y_to_pass = None

    self.view.block = False

    # print('sequences')
    # sequences = [
    #     train_test_parameters,
    #     [clone(base_estimator)] * length,
    #     [X_to_pass] * length,
    #     [y_to_pass] * length,
    #     [self.verbose] * length,
    #     [self.fit_params] * length,
    #     [True] * length,
    #     [self.scorer_] * length,
    #     [x_is_index] * length,
    # ]

    f = partial(my_fit_and_score, estimator=clone(base_estimator),
                X=X_to_pass, y=y_to_pass, verbose=self.verbose,
                fit_params=self.fit_params, return_parameters=True,
                scorer=None, x_is_index=x_is_index, names=(X_name, y_name))

    # print('before map')
    # import cProfile
    # pr = cProfile.Profile()
    # pr.enable()

    chunksize = 10
    out = self.view.map(f,
                        itertools.islice(train_test_parameters, 0, length),
                        ordered=False, block=False,
                        chunksize=chunksize)  # length / len(self.view)

    # pr.disable()
    # pr.print_stats('cumulative')
    print('map called')

    if self.callback is not None:
        old_progress = out.progress
        while not out.ready():
            self.callback(out.progress * chunksize, length, out.elapsed)
            if old_progress == out.progress and out.progress > 0:
                for id, info in self.view.queue_status(verbose=True).items():
                    # print(id, info)
                    if isinstance(info, dict) and 'queue' in info \
                            and len(info['queue']) > 0:
                        print(id, info['queue'])
            old_progress = out.progress
            sleep(10)

    print('map ready')
    out = out.get()

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters, score, np.array(all_scores)))

    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
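The per-candidate aggregation loop above weights each fold score by its test-set size when iid is set; the sketch below is a hypothetical, stand-alone version of that reduction (the helper name and the sample tuples are illustrative, not part of the module).

# Hypothetical sketch of the fold-aggregation step above: average fold scores
# per parameter candidate, weighting by test-set size when iid=True.
def aggregate_fold_scores(out, n_folds, iid=True):
    grid_scores = []
    for grid_start in range(0, len(out), n_folds):
        n_test_samples = 0
        score = 0.0
        all_scores = []
        for this_score, this_n_test, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test
                n_test_samples += this_n_test
            score += this_score
        score /= float(n_test_samples) if iid else float(n_folds)
        grid_scores.append((parameters, score, all_scores))
    return grid_scores

# Two candidates evaluated on two folds of unequal size (third slot stands in
# for the scoring time returned by the fit-and-score helper):
out = [(0.80, 60, None, {'C': 1}), (0.90, 40, None, {'C': 1}),
       (0.70, 60, None, {'C': 10}), (0.75, 40, None, {'C': 10})]
print(aggregate_fold_scores(out, n_folds=2))   # C=1 -> 0.84, C=10 -> 0.72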
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) param_grid = [(parameters, train, test) for parameters in parameter_iterable for (train, test) in cv] # Because the original python code expects a certain order for the elements, we need to # respect it. indexed_param_grid = list(zip(range(len(param_grid)), param_grid)) par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid)) X_bc = self.sc.broadcast(X) y_bc = self.sc.broadcast(y) scorer = self.scorer_ verbose = self.verbose fit_params = self.fit_params error_score = self.error_score fas = _fit_and_score def fun(tup): (index, (parameters, train, test)) = tup local_estimator = clone(base_estimator) local_X = X_bc.value local_y = y_bc.value res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose, parameters, fit_params, return_parameters=True, error_score=error_score) return (index, res) indexed_out0 = dict(par_param_grid.map(fun).collect()) out = [indexed_out0[idx] for idx in range(len(param_grid))] X_bc.unpersist() y_bc.unpersist() # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
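The Spark variant above keeps the order expected by the downstream loop by zipping each (parameters, train, test) task with an index before parallelizing and rebuilding a dict after collect(); the sketch below shows just that pattern, with a plain Python map standing in for the rdd.map(...).collect() round trip so it runs without a SparkContext (the task payloads are made up).

# Hedged sketch of the order-restoring pattern used above; a shuffled plain
# map stands in for an RDD map/collect, which may return results in any order.
import random

tasks = ['job-%d' % i for i in range(6)]
indexed_tasks = list(zip(range(len(tasks)), tasks))

def run(tup):
    index, task = tup
    return (index, task.upper())        # the "fit and score" work happens here

shuffled = random.sample(indexed_tasks, k=len(indexed_tasks))   # lose ordering
indexed_out = dict(map(run, shuffled))                          # like .collect()
out = [indexed_out[idx] for idx in range(len(tasks))]           # restore order
assert out == [t.upper() for t in tasks]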
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, error_score='raise', return_estimator=False, return_idx=False): """Fit estimator and compute scores for a given dataset split. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape at least 2D The data to fit. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. scorer : A single callable or dict mapping scorer name to the callable If it is a single callable, the return value for ``train_scores`` and ``test_scores`` is a single float. For a dict, it should be one mapping the scorer name to the scorer callable object / function. The callable object / fn should have signature ``scorer(estimator, X, y)``. train : array-like, shape (n_train_samples,) Indices of training samples. test : array-like, shape (n_test_samples,) Indices of test samples. verbose : integer The verbosity level. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : boolean, optional, default: False Compute and return score on training set. return_parameters : boolean, optional, default: False Return parameters that has been used for the estimator. return_n_test_samples : boolean, optional, default: False Whether to return the ``n_test_samples`` return_times : boolean, optional, default: False Whether to return the fit/score times. Returns ------- train_scores : dict of scorer name -> float, optional Score on training set (for all the scorers), returned only if `return_train_score` is `True`. test_scores : dict of scorer name -> float, optional Score on testing set (for all the scorers). n_test_samples : int Number of test samples. fit_time : float Time spent for fitting in seconds. score_time : float Time spent for scoring in seconds. parameters : dict or None, optional The parameters that have been evaluated. 
""" if verbose > 1: if parameters is None: msg = '' else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) test_scores = {} train_scores = {} if parameters is not None: estimator.set_params(**parameters) start_time = time.time() # do it for each patient X_train, y_train, X_test, y_test = _safe_split_multi( estimator, X, y, train, test) is_multimetric = not callable(scorer) n_scorers = len(scorer.keys()) if is_multimetric else 1 try: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: # Note fit time as time until error fit_time = time.time() - start_time score_time = 0.0 if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): if is_multimetric: test_scores = dict(zip(scorer.keys(), [error_score, ] * n_scorers)) if return_train_score: train_scores = dict(zip(scorer.keys(), [error_score, ] * n_scorers)) else: test_scores = error_score if return_train_score: train_scores = error_score warnings.warn("Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") else: fit_time = time.time() - start_time # _score will return dict if is_multimetric is True test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric) score_time = time.time() - start_time - fit_time if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer, is_multimetric) if verbose > 2: if is_multimetric: for scorer_name, score in test_scores.items(): msg += ", %s=%s" % (scorer_name, score) else: msg += ", score=%s" % test_scores if verbose > 1: total_time = score_time + fit_time end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_scores, test_scores] if return_train_score else [test_scores] if return_n_test_samples: ret.append(_num_samples(X_test)) if return_times: ret.extend([fit_time, score_time]) if return_parameters: ret.append(parameters) if return_estimator: ret.append(estimator) if return_idx: ret.extend([train, test]) return ret
def fit(self, X, y=None):
    # One-class estimation has no real target; fit against a dummy vector of
    # ones and return self to follow the scikit-learn estimator convention.
    super(OneClassSVM, self).fit(X, np.ones(_num_samples(X)))
    return self
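For context, a tiny sketch of the behaviour the wrapper above papers over, using the stock sklearn.svm.OneClassSVM (distinct from the wrapper class defined here); the data is illustrative.

# Illustrative only: the stock estimator is unsupervised, so any y passed to
# fit is ignored, which is why a dummy vector of ones is safe above.
import numpy as np
from sklearn.svm import OneClassSVM as SkOneClassSVM

X = np.random.RandomState(0).randn(100, 2)
ocsvm = SkOneClassSVM(gamma='auto').fit(X)
labels = ocsvm.predict(X)               # +1 for inliers, -1 for outliers
print(np.unique(labels))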
def _fit(self, X, y, labels, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y, labels = indexable(X, y, labels) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) n_splits = cv.get_n_splits(X, y, labels) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for parameters in parameter_iterable for train, test in cv.split(X, y, labels)) # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_splits): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_splits]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_splits) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
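The _fit above threads a labels array through cv.split(X, y, labels); the sketch below shows what such a group-aware splitter yields, using the stock GroupKFold with made-up data.

# Hedged sketch of group-aware splitting as consumed by the _fit above;
# GroupKFold keeps all samples of a group on the same side of each split.
import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(16).reshape(8, 2)
y = np.array([0, 0, 1, 1, 0, 1, 0, 1])
labels = np.array([0, 0, 1, 1, 2, 2, 3, 3])      # e.g. one label per subject

cv = GroupKFold(n_splits=4)
for train, test in cv.split(X, y, groups=labels):
    assert set(labels[train]).isdisjoint(labels[test])
    print(train, test)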
def fit(self, X, y=None, labels=None): #return self._fit( # X, y, labels, # parameter_iterable # parameter_iterable, \in Sized, it actually does len(parameter_iterable) in _fit #) # FIXME code duplication from BaseSearchCV._fit estimator = self.estimator cv = _split.check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y, labels = indexable(X, y, labels) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) n_splits = cv.get_n_splits(X, y, labels) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch # FIXME how to handle pre_dispatch # FIXME recursively getting new parameters to evaluate # parameter_iterable = ... # the magic # # # The evaluation (Parallel) stuff # out = Parallel( # n_jobs=self.n_jobs, verbose=self.verbose, # pre_dispatch=pre_dispatch # )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, # train, test, self.verbose, parameters, # self.fit_params, return_parameters=True, # error_score=self.error_score) # for parameters in parameter_iterable # for train, test in cv.split(X, y, labels)) # # n_fits on each (train, test) def cross_validation(raw_parameters): parameters = dict(zip( self.param_grid.keys(), raw_parameters )) # TODO more robust way of doing this print(parameters) return Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for train, test in cv.split(X, y, labels)) x = cartesian_product(*self.param_grid.values()) # FIXME implement as non-recursive def bo_(x_obs, y_obs, n_iter): if n_iter > 0: kernel = kernels.Matern() + kernels.WhiteKernel() gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=16) gp.fit(x_obs, 1-y_obs) a = a_EI(gp, x_obs=x_obs, y_obs=1-y_obs) argmax_f_x_ = x[np.argmax(a(x))] # heavy evaluation f_argmax_f_x_ = cross_validation(argmax_f_x_) y_ob = np.atleast_2d(mean_mean_validation_scores(f_argmax_f_x_)).T return f_argmax_f_x_ + bo_( x_obs=np.vstack((x_obs, argmax_f_x_)), y_obs=np.vstack((y_obs, y_ob)), n_iter=n_iter-1, ) else: return [] # FIXME (most informative) decision like Numerical Probabilistics stuff for integrations # sobol initilization? 
sampled_x_ind = np.random.choice( x.shape[0], size=self.n_initial_points, replace=False, ) print(sampled_x_ind) x_obs = x[sampled_x_ind] f_x_obs = list(map(cross_validation, x_obs)) y_obs = np.atleast_2d(list(map(mean_mean_validation_scores, f_x_obs))).T out = sum(f_x_obs, []) + bo_(x_obs, y_obs, n_iter=self.n_iter) n_fits = len(out) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_splits): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _ , parameters in \ out[grid_start:grid_start + n_splits]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_splits) scores.append((score, parameters)) grid_scores.append(_search._CVScoreTuple( parameters, score, np.array(all_scores))) self.grid_scores_ = grid_scores best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
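The fit above drives the search with a Gaussian-process surrogate and an expected-improvement acquisition; a_EI, cartesian_product and mean_mean_validation_scores are module helpers not shown here. The sketch below is an independent, self-contained version of that acquisition loop on a one-dimensional toy objective, not the module's implementation.

# Hedged sketch of GP + expected-improvement candidate selection in the
# spirit of bo_ above; the objective and candidate grid are made up.
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor, kernels

def expected_improvement(gp, x_candidates, y_best):
    # EI for minimisation of the surrogate target (here: 1 - CV score)
    mu, sigma = gp.predict(x_candidates, return_std=True)
    sigma = np.maximum(sigma, 1e-12)
    z = (y_best - mu) / sigma
    return (y_best - mu) * norm.cdf(z) + sigma * norm.pdf(z)

def objective(x):                        # stands in for 1 - mean CV score
    return (x - 0.3) ** 2

x_grid = np.linspace(0, 1, 101).reshape(-1, 1)
x_obs = np.array([[0.0], [0.5], [1.0]])
y_obs = objective(x_obs).ravel()

kernel = kernels.Matern() + kernels.WhiteKernel()
for _ in range(5):
    gp = GaussianProcessRegressor(kernel=kernel,
                                  n_restarts_optimizer=4).fit(x_obs, y_obs)
    ei = expected_improvement(gp, x_grid, y_obs.min())
    x_next = x_grid[np.argmax(ei)]       # most promising candidate
    x_obs = np.vstack([x_obs, x_next])
    y_obs = np.append(y_obs, objective(x_next)[0])

print(x_obs[np.argmin(y_obs)])           # should approach 0.3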
def _fit(self, depthmaps, offset_points_projected, direction_vectors, true_joints, parameter_iterable): cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(depthmaps) if _num_samples(offset_points_projected) != n_samples: raise ValueError('offset_points_projected has a different number ' 'of samples ({0}) than data (depthmaps: {1} samples)'.format(_num_samples(offset_points_projected), n_samples)) if _num_samples(direction_vectors) != n_samples: raise ValueError('direction_vectors has a different number ' 'of samples ({0}) than data (depthmaps: {1} samples)'.format(_num_samples(direction_vectors), n_samples)) cv = _check_cv(cv, n_samples) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( delayed(_fit_and_score)(clone(base_estimator), depthmaps, offset_points_projected, direction_vectors, true_joints, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for parameters in parameter_iterable for train, test in cv) # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) best_estimator.fit(depthmaps, offset_points_projected, direction_vectors, **self.fit_params) self.best_estimator_ = best_estimator return self
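The Parallel call above dispatches one job per (parameters, train/test fold) pair; the sketch below shows that enumeration with stock scikit-learn pieces (the grid and splitter are illustrative).

# Hedged sketch of the candidate-by-fold job enumeration dispatched above.
import numpy as np
from sklearn.model_selection import KFold, ParameterGrid

X = np.arange(30).reshape(15, 2)
param_grid = ParameterGrid({'n_estimators': [10, 50], 'max_depth': [3, 5]})
cv = list(KFold(n_splits=3).split(X))

jobs = [(parameters, train, test)
        for parameters in param_grid
        for train, test in cv]
print(len(jobs))          # len(param_grid) * len(cv) == 4 * 3 == 12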
def _fit(self, X, y, parameter_dict):
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, est=clone(self.estimator),
                   fitness=creator.FitnessMax)

    toolbox = base.Toolbox()

    name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
    if self.gene_type is None:
        self.gene_type = gene_type

    if self.verbose:
        print("Types %s and maxint %s detected" % (self.gene_type, maxints))

    toolbox.register("individual", _initIndividual, creator.Individual,
                     maxints=maxints)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", _evalFunction, searchobj=self,
                     name_values=name_values, X=X, y=y,
                     scorer=self.scorer_, cv=cv, iid=self.iid,
                     verbose=self.verbose, error_score=self.error_score,
                     fit_params=self.fit_params)
    toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob,
                     gene_type=self.gene_type)
    toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob,
                     up=maxints)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs > 1:
        pool = Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.population_size)
    hof = tools.HallOfFame(1)

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    if self.verbose:
        print('--- Evolve in {0} possible combinations ---'.format(
            np.prod(np.array(maxints) + 1)))

    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                       ngen=self.generations_number,
                                       stats=stats, halloffame=hof,
                                       verbose=self.verbose)

    current_best_score_ = hof[0].fitness.values[0]
    current_best_params_ = _individual_to_params(hof[0], name_values)

    if self.verbose:
        print("Best individual is: %s\nwith fitness: %s" % (
            current_best_params_, current_best_score_))
        print("Scoring evaluations: %d, Cache hits: %d, Total: %d" % (
            self.num_evaluations, self.num_cache_hits,
            self.num_evaluations + self.num_cache_hits))

    if current_best_score_ > self.best_score_:
        self.best_score_ = current_best_score_
        self.best_params_ = current_best_params_
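_individual_to_params and _get_param_types_maxint are module helpers not shown here; the sketch below is a hypothetical reconstruction of the integer-gene encoding they imply (an individual is a list of indices into each parameter's value list), purely for illustration.

# Hypothetical reconstruction of the gene encoding used by the DEAP search
# above; names and ordering are assumptions, not the module's implementation.
param_grid = {'max_depth': [3, 5, 10], 'min_samples_split': [2, 4]}

name_values = sorted(param_grid.items())             # fix a gene ordering
maxints = [len(values) - 1 for _, values in name_values]

def individual_to_params(individual, name_values):
    return {name: values[gene]
            for gene, (name, values) in zip(individual, name_values)}

print(maxints)                                        # [2, 1]
print(individual_to_params([1, 0], name_values))      # {'max_depth': 5, 'min_samples_split': 2}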
def predict(self, T):
    if self.check_X is not None:
        assert self.check_X(T)
    # np.int is a deprecated alias for the builtin int; index with int directly
    return self.classes_[np.zeros(_num_samples(T), dtype=int)]
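A short sketch of what a predict like the one above returns: the same (first) class for every row, which is the behaviour mock "checking" classifiers rely on in tests. The class name and data below are illustrative; only _num_samples comes from this module.

# Illustrative only: a constant predictor in the style of the method above.
import numpy as np

class _MockPredictor:
    def __init__(self, classes, check_X=None):
        self.classes_ = np.asarray(classes)
        self.check_X = check_X

    def predict(self, T):
        if self.check_X is not None:
            assert self.check_X(T)
        return self.classes_[np.zeros(_num_samples(T), dtype=int)]

clf = _MockPredictor(classes=['a', 'b'], check_X=lambda T: T.ndim == 2)
print(clf.predict(np.zeros((4, 3))))     # ['a' 'a' 'a' 'a']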