def test_auto_init(n_samples, n_features, n_classes, n_components):
    # Test that 'auto' chooses the init as expected with every configuration
    # of order of n_samples, n_features, n_classes and n_components.
    rng = np.random.RandomState(42)
    nca_base = NeighborhoodComponentsAnalysis(init='auto',
                                              n_components=n_components,
                                              max_iter=1, random_state=rng)
    if n_classes >= n_samples:
        pass
        # n_classes > n_samples is impossible, and n_classes == n_samples
        # throws an error from lda but is an absurd case
    else:
        X = rng.randn(n_samples, n_features)
        y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
        if n_components > n_features:
            # this would return a ValueError, which is already tested in
            # test_params_validation
            pass
        else:
            nca = clone(nca_base)
            nca.fit(X, y)
            if n_components <= min(n_classes - 1, n_features):
                nca_other = clone(nca_base).set_params(init='lda')
            elif n_components < min(n_features, n_samples):
                nca_other = clone(nca_base).set_params(init='pca')
            else:
                nca_other = clone(nca_base).set_params(init='identity')
            nca_other.fit(X, y)
            assert_array_almost_equal(nca.components_, nca_other.components_)
def calibrate_probs(labels, weights, probs, logistic=False, random_state=11, threshold=0.,
                    return_calibrator=False, symmetrize=False):
    """
    Calibrate output to probabilities using 2-folding to calibrate all data

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1
    :param logistic: bool, use logistic or isotonic regression
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    :return: calibrated probabilities
    """
    labels = (labels > threshold) * 1
    ind = numpy.arange(len(probs))
    ind_1, ind_2 = train_test_split(ind, random_state=random_state, train_size=0.5)

    calibrator = LogisticRegression(C=100) if logistic else IsotonicRegression(y_min=0, y_max=1,
                                                                               out_of_bounds='clip')
    est_calib_1, est_calib_2 = clone(calibrator), clone(calibrator)
    probs_1 = probs[ind_1]
    probs_2 = probs[ind_2]

    if logistic:
        probs_1 = numpy.clip(probs_1, 0.001, 0.999)
        probs_2 = numpy.clip(probs_2, 0.001, 0.999)
        probs_1 = logit(probs_1)[:, numpy.newaxis]
        probs_2 = logit(probs_2)[:, numpy.newaxis]
        if symmetrize:
            est_calib_1.fit(numpy.r_[probs_1, 1 - probs_1],
                            numpy.r_[labels[ind_1] > 0, labels[ind_1] <= 0])
            est_calib_2.fit(numpy.r_[probs_2, 1 - probs_2],
                            numpy.r_[labels[ind_2] > 0, labels[ind_2] <= 0])
        else:
            est_calib_1.fit(probs_1, labels[ind_1])
            est_calib_2.fit(probs_2, labels[ind_2])
    else:
        if symmetrize:
            est_calib_1.fit(numpy.r_[probs_1, 1 - probs_1],
                            numpy.r_[labels[ind_1] > 0, labels[ind_1] <= 0],
                            numpy.r_[weights[ind_1], weights[ind_1]])
            est_calib_2.fit(numpy.r_[probs_2, 1 - probs_2],
                            numpy.r_[labels[ind_2] > 0, labels[ind_2] <= 0],
                            numpy.r_[weights[ind_2], weights[ind_2]])
        else:
            est_calib_1.fit(probs_1, labels[ind_1], weights[ind_1])
            est_calib_2.fit(probs_2, labels[ind_2], weights[ind_2])

    calibrated_probs = numpy.zeros(len(probs))
    if logistic:
        calibrated_probs[ind_1] = est_calib_2.predict_proba(probs_1)[:, 1]
        calibrated_probs[ind_2] = est_calib_1.predict_proba(probs_2)[:, 1]
    else:
        calibrated_probs[ind_1] = est_calib_2.transform(probs_1)
        calibrated_probs[ind_2] = est_calib_1.transform(probs_2)
    if return_calibrator:
        return calibrated_probs, (est_calib_1, est_calib_2)
    else:
        return calibrated_probs
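# Hedged usage sketch for calibrate_probs (illustrative only; it assumes the
# module-level imports used above: numpy, train_test_split, IsotonicRegression,
# LogisticRegression, clone and logit). The arrays below are toy data, not from
# the original project.
labels = numpy.array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0])
weights = numpy.ones(len(labels))
probs = numpy.array([0.1, 0.4, 0.8, 0.7, 0.3, 0.9, 0.6, 0.2, 0.95, 0.05])
calibrated = calibrate_probs(labels, weights, probs, logistic=False)
print(calibrated.shape)  # (10,) -- one calibrated probability per sample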
def __init__(
    self,
    estimator=LinearSVC(),
    masker=NiftiMasker(),
    labelizer=LabelEncoder(),
    reporter=Reporter(),
    estimated_name="coef_",
):
    self.estimator = clone(estimator)
    self.masker = clone(masker)
    self.labelizer = clone(labelizer)
    self.reporter = reporter
    self.estimated_name = estimated_name
def classification_metrics(self, X, y, n_iter=10, test_size=0.25, random_state=0):
    """
    returns the roc auc of the classifier (binary only), and the portion of correct predictions via CV
    @param y: all non-zero will be set to 1
    @param n_iter, test_size: StratifiedShuffleSplit parameters
    @param random_state: random state used for StratifiedShuffleSplit
    @return: roc, accuracy, accuracy_zero, accuracy_one
    """
    roc = 0
    accuracy = 0
    accuracy_zero = 0   # portion of zeros correctly predicted
    accuracy_one = 0    # portion of ones correctly predicted
    y = np.array([0 if d == 0 else 1 for d in y])
    prePipe = clone(self.common_preprocessing_pipe)
    pipeToUse = clone(self.classifier_pipe)
    cvObj = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)

    for trainInds, testInds in cvObj:
        # all cv data
        trainX = X[trainInds]
        trainY = y[trainInds]
        testX = X[testInds]
        testY = y[testInds]

        trainX = prePipe.fit_transform(trainX)
        testX = prePipe.transform(testX)
        pipeToUse.fit(trainX, trainY)
        y_scores = pipeToUse.predict_proba(testX)
        y_pred = pipeToUse.predict(testX)

        temp = next((i for i in range(len(testY)) if y_pred[i] == 1), None)
        roc += roc_auc_score(testY, y_scores[:, 1])
        accuracy += sum(y_pred == testY) * 1.0 / len(testY)
        accuracy_zero += 1.0 * sum(np.logical_and(y_pred == testY, testY == 0)) / sum(testY == 0)
        accuracy_one += 1.0 * sum(np.logical_and(y_pred == testY, testY == 1)) / sum(testY == 1)

    roc /= n_iter
    accuracy_zero /= n_iter
    accuracy_one /= n_iter
    accuracy /= n_iter

    print ">>> The classifier has roc = %0.3f, zero-accuracy = %0.3f, " \
          "one-accuracy = %0.3f, overall accuracy = %0.3f." % (roc, accuracy_zero, accuracy_one, accuracy)

    return roc, accuracy, accuracy_zero, accuracy_one
def _fit_stage(self, X, y, rmTolerance):
    """
    fits one stage of gradient boosting
    @param X:
    @param y:
    @param rmTolerance: tolerance for 1D optimization
    @return: nothing
    """
    residuals = self.lossFunction.negative_gradient(y, self._currentPrediction)
    # stochastic boosting: train only on a portion of the data
    trainX, trainY, _, _ = splitTrainTest(X, residuals, 1 - self.subsample)

    if len(np.unique(trainY)) == 1:
        hm = MajorityPredictor().fit(trainY)
    else:
        cvObj = KFold(n=len(trainX), n_folds=self.cvNumFolds, indices=False, shuffle=True,
                      random_state=self.randomState)

        # find the h that best mimics the negative gradient
        if self.n_jobs > 1:     # parallel
            n_jobs = max(1, self.n_jobs / len(self.learners), self.cvNumFolds)
            # n_jobs = 1
            pool = MyPool(processes=self.n_jobs, initializer=gbjjInit,
                          initargs=(trainX, trainY, self.lossFunction, n_jobs, cvObj))
            temp = pool.map_async(gbjjInnerLoop, self.learners)
            temp.wait()
            h_res = temp.get()
            pool.close()
            pool.join()
        else:                   # single thread
            h_res = []
            for learner in self.learners:
                if self.verbosity >= 2:
                    print 'Fitting learner:', learner
                l = clone(learner)
                scores = jjcross_val_score(l, trainX, trainY, score_func=self.lossFunction,
                                           n_jobs=1, cv=cvObj)
                h_res.append(scores.mean())

        hm = clone(self.learners[np.argsort(h_res)[0]])

    if self.verbosity >= 1:
        print "The best classifier is", hm.__class__

    # find rm
    hm.fit(trainX, trainY)
    hmx = hm.predict(X)
    rm = minimize_scalar(lambda r: self.lossFunction(y, self._currentPrediction + r * hmx),
                         tol=rmTolerance).x

    # append estimator and weight
    self._estimators.append((hm, rm))
def fit(self, X, y, sample_weight=None):
    assert isinstance(self.base_estimators, dict), 'Estimators should be passed in a dictionary'
    assert len(X) == len(y), 'the lengths are different'
    assert sample_weight is None or len(sample_weight) == len(y), 'the lengths are different'
    if sample_weight is None:
        sample_weight = numpy.ones(len(y))
    assert self.feature_name in X.columns, 'there is no feature %s' % self.feature_name
    self.columns_order = X.columns

    column = numpy.array(X[self.feature_name])
    self.column_values = list(set(column))

    self.stayed_columns = dict()          # value -> list of columns
    self.common_features = dict()         # (value_from, value_to) -> list of columns
    self.classifiers = dict()             # (value_from, value_to, classifier_name) -> classifier
    self.final_classifiers = dict()       # (value, classifier_name) -> classifier
    rows_dict = dict()                    # (value) -> boolean list of rows
    self.final_columns_orders = dict()    # (value) -> list of features

    for value in self.column_values:
        rows = numpy.array(X[self.feature_name] == value)
        rows_dict[value] = rows
        x_part = X.loc[rows, :]
        cols = pandas.notnull(x_part).all()
        self.stayed_columns[value] = cols[cols == True].keys()

    for value_to, rows_to in rows_dict.items():
        columns_to = self.stayed_columns[value_to]
        new_features = pandas.DataFrame()
        for value_from, rows_from in rows_dict.items():
            if value_from == value_to:
                continue
            common_columns = list(set(self.stayed_columns[value_from]).union(set(self.stayed_columns[value_to])))
            common_columns.remove(self.feature_name)
            self.common_features[value_from, value_to] = common_columns
            for name, estimator in self.base_estimators.items():
                rows_from = rows_dict[value_from]
                new_classifier = sklearn.clone(estimator) \
                    .fit(X.loc[rows_from, common_columns], y[rows_from],
                         sample_weight=sample_weight[rows_from])
                self.classifiers[value_from, value_to, name] = new_classifier
                new_feature = new_classifier.predict_proba(X.loc[rows_to, common_columns])[:, 1]
                new_features[str(value_from) + "_" + name] = new_feature

        X_to_part = X.loc[rows_to, columns_to]
        new_features = new_features.set_index(X_to_part.index)
        X_to_part = pandas.concat([X_to_part, new_features], axis=1)
        final_classifier = sklearn.clone(self.final_estimator)
        final_classifier.fit(X_to_part, y[rows_to], sample_weight=sample_weight[rows_to])
        self.final_columns_orders[value_to] = X_to_part.columns
        self.final_classifiers[value_to] = final_classifier
    return self
def fit(self, original, target, original_weight=None, target_weight=None):
    """
    Prepare reweighting formula by training a sequence of trees.

    :param original: values from original distribution, array-like of shape [n_samples, n_features]
    :param target: values from target distribution, array-like of shape [n_samples, n_features]
    :param original_weight: weights for samples of original distribution
    :param target_weight: weights for samples of target distribution
    :return: self
    """
    original, original_weight = self._normalize_input(original, original_weight, normalize=False)
    target, target_weight = self._normalize_input(target, target_weight, normalize=False)

    folds_original = self._get_folds_column(len(original))
    folds_target = self._get_folds_column(len(target))
    for _ in range(self.n_folds):
        self.reweighters.append(clone(self.base_reweighter))

    original = numpy.array(original)
    target = numpy.array(target)

    for i in range(self.n_folds):
        self.reweighters[i].fit(original[folds_original != i, :], target[folds_target != i, :],
                                original_weight=original_weight[folds_original != i],
                                target_weight=target_weight[folds_target != i])
    self.train_length = len(original)
    return self
def train(self, num_examples, deltas=list(range(1, 6)), use_transformations=False,
          use_weights=True, verbosity=0):
    '''Train ensemble of classifiers using newly generated data for every
    member of the ensemble.
    '''
    time_start = time.time()
    self.classifiers = dict()
    for delta in deltas:
        self.classifiers[delta] = []  # list for ensemble of classifiers
        for i in range(self.n_estimators):
            # base classifier
            clf = clone(self.base_classifier)
            if use_weights:
                train_x, train_y, train_w = self.make_weighted_training_data(
                    create_examples(num_examples=num_examples, deltas=[delta]),
                    use_transformations=use_transformations)
                if verbosity > 1:
                    print('delta={0}, #{1}: training with {2} weighted examples'.format(delta, i, len(train_x)))
                # fit
                clf.fit(train_x, train_y, sample_weight=train_w)
            else:
                train_x, train_y = self.make_training_data(
                    create_examples(num_examples=num_examples, deltas=[delta]),
                    use_transformations=use_transformations)
                if verbosity > 1:
                    print('delta={0}, #{1}: training with {2} examples'.format(delta, i, len(train_x)))
                clf.fit(train_x, train_y)
            self.classifiers[delta].append(clf)
    time_end = time.time()
    if verbosity > 0:
        print('training completed in {0:.1f} seconds'.format(time_end - time_start))
def train(self, examples, use_transformations=False, use_weights=True):
    time_start = time.time()
    self.classifiers = dict()
    deltas = set([e.delta for e in examples])
    for delta in deltas:
        # create classifier with same params as base classifier
        clf = clone(self.base_classifier)
        if use_weights:
            # weighted training data for current delta
            (train_x, train_y, train_w) = self.make_weighted_training_data(
                [e for e in examples if e.delta == delta],
                use_transformations=use_transformations)
            print('delta={0}, training with {1} weighted examples'.format(delta, len(train_x)))
            # fit
            clf.fit(train_x, train_y, train_w)
        else:
            # training data for current delta
            (train_x, train_y) = self.make_training_data(
                [e for e in examples if e.delta == delta],
                use_transformations=use_transformations)
            print('delta={0}, training with {1} examples'.format(delta, len(train_x)))
            # fit
            clf.fit(train_x, train_y)
        # store
        self.classifiers[delta] = clf
    time_end = time.time()
    print('training completed in {0} seconds'.format(time_end - time_start))
def test_no_attributes_set_in_init(estimator, preprocessor):
    """Check setting during init. Adapted from scikit-learn."""
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    if hasattr(type(estimator).__init__, "deprecated_original"):
        return

    init_params = _get_args(type(estimator).__init__)
    parents_init_params = [param for params_parent in
                           (_get_args(parent) for parent in
                            type(estimator).__mro__)
                           for param in params_parent]

    # Test for no setting apart from parameters during init
    invalid_attr = (set(vars(estimator)) - set(init_params) -
                    set(parents_init_params))
    assert not invalid_attr, \
        ("Estimator %s should not set any attribute apart"
         " from parameters during init. Found attributes %s."
         % (type(estimator).__name__, sorted(invalid_attr)))
    # Ensure that each parameter is set in init
    invalid_attr = (set(init_params) - set(vars(estimator)) - set(["self"]))
    assert not invalid_attr, \
        ("Estimator %s should store all parameters"
         " as an attribute during init. Did not find "
         "attributes %s." % (type(estimator).__name__, sorted(invalid_attr)))
def test_various_scoring_on_tuples_learners(estimator, build_dataset,
                                            with_preprocessor):
    """Tests that scikit-learn's scoring returns something finite,
    for other scoring than default scoring. (List of scikit-learn's scores can
    be found in sklearn.metrics.scorer). For each type of output (predict,
    predict_proba, decision_function), we test a bunch of scores.
    We only test on pairs learners because quadruplets don't have a y argument.
    """
    input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)

    # scores that need a predict function: every tuples learner should have a
    # predict function (whether the pair is of positive samples or negative
    # samples)
    for scoring in ['accuracy', 'f1']:
        check_score_is_finite(scoring, estimator, input_data, labels)
    # scores that need a predict_proba:
    if hasattr(estimator, "predict_proba"):
        for scoring in ['neg_log_loss', 'brier_score']:
            check_score_is_finite(scoring, estimator, input_data, labels)
    # scores that need a decision_function: every tuples learner should have a
    # decision function (the metric between points)
    for scoring in ['roc_auc', 'average_precision', 'precision', 'recall']:
        check_score_is_finite(scoring, estimator, input_data, labels)
def best_pipelines_by_algo(grid, algo_tag='regressor__algorithm', bigger_is_better=True):
    results = []

    if bigger_is_better:
        best_mean = -np.inf

        def is_better(x, y):
            return x > y
    else:
        best_mean = np.inf

        def is_better(x, y):
            return x < y

    best_params_by_algo = {}
    for params, mean, std in grid.grid_scores_:
        # print(params)
        algo_type = type(params[algo_tag])
        if algo_type not in best_params_by_algo:
            best_params_by_algo[algo_type] = (mean, params)
        else:
            best_mean = best_params_by_algo[algo_type][0]
            if is_better(mean, best_mean):
                best_params_by_algo[algo_type] = (mean, params)

    for algo, (mean, params) in best_params_by_algo.iteritems():
        new_estimator = clone(grid.estimator)
        new_estimator.set_params(**params)
        results.append(new_estimator)

    return results
def test_get_metric_works_does_not_raise(estimator, build_dataset):
    """Tests that the metric returned by get_metric does not raise errors (or
    warnings) similarly to the distance functions in scipy.spatial.distance"""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    metric = model.get_metric()

    list_test_get_metric_doesnt_raise = [(X[0], X[1]),
                                         (X[0].tolist(), X[1].tolist()),
                                         (X[0][None], X[1][None])]
    for u, v in list_test_get_metric_doesnt_raise:
        with pytest.warns(None) as record:
            metric(u, v)
        assert len(record) == 0

    # Test that the scalar case works
    model.transformer_ = np.array([3.1])
    metric = model.get_metric()
    for u, v in [(5, 6.7), ([5], [6.7]), ([[5]], [[6.7]])]:
        with pytest.warns(None) as record:
            metric(u, v)
        assert len(record) == 0
def param_search(estimator, param_dict, n_iter=None, seed=None):
    """
    Builds a list of cloned copies of `estimator`, each set with parameters
    as specified by `param_dict`. `param_dict` can contain either lists of
    parameter values (grid search) or a scipy distribution function to be
    sampled from. If distributions, you must specify `n_iter`.

    Parameters
    ----------
    estimator: sklearn-like estimator
    param_dict: dict of parameter name: values, where values can be an
        iterable or a distribution function
    n_iter: number of draws to take from parameter distributions
    seed: random state used when sampling from parameter distributions
    """

    if n_iter is None:
        param_iter = ParameterGrid(param_dict)
    else:
        param_iter = ParameterSampler(param_dict, n_iter, random_state=seed)

    estimators = []
    for params in param_iter:
        new_estimator = sklearn.clone(estimator)
        new_estimator.set_params(**params)
        estimators.append(new_estimator)
    return estimators
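# Hedged usage sketch for param_search (illustrative only; assumes the
# module-level imports used above: sklearn, ParameterGrid, ParameterSampler).
# SVC and scipy's uniform distribution are just example choices.
from scipy.stats import uniform
from sklearn.svm import SVC

# Grid mode: one clone per combination of the listed values (3 x 2 = 6 clones).
grid_estimators = param_search(SVC(), {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1]})

# Sampling mode: draw 5 parameter settings from a distribution (requires n_iter).
sampled_estimators = param_search(SVC(), {'C': uniform(0.1, 10)}, n_iter=5, seed=0)

print(len(grid_estimators), len(sampled_estimators))  # 6 5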
def _fit_best_estimator(self, X, y, sample_weight=None):
    # Training classifier once again
    self.best_estimator_ = sklearn.clone(self.base_estimator).set_params(**self.generator.best_params_)
    if sample_weight is None:
        self.best_estimator_.fit(X, y)
    else:
        self.best_estimator_.fit(X, y, sample_weight=sample_weight)
def _fit(self, X, y, num_workers=1, verbose=False):
    t = time.time()
    param_grid = [{
        'C': [.001, .1, 10, 100],
    }]
    base_clf = sklearn.linear_model.LogisticRegression(
        fit_intercept=False, class_weight='auto', dual=False, penalty='l2')
    kfold = sklearn.cross_validation.StratifiedKFold(y, 3)
    param_iterator = sklearn.grid_search.ParameterGrid(param_grid)
    out = joblib.Parallel(n_jobs=num_workers, pre_dispatch=num_workers * 2)(
        joblib.delayed(fit_grid_point)(
            X[train], y[train], X[test], y[test], base_clf, clf_params)
        for clf_params in param_iterator for train, test in kfold
    )

    df = pandas.DataFrame(out, columns=['setting', 'score', 'entropy'])
    dfg = df.groupby('setting').mean()
    if verbose:
        print(dfg)
    dfg = dfg.sort(['score', 'entropy'], ascending=[0, 1])

    best_params = json.loads(dfg.index[0])
    best_score, best_entropy = dfg.ix[0].values
    print('Best at {}: {:.2f} | {:.2f} and took {:.2f} s'.format(
        best_params, best_score, best_entropy, time.time() - t))

    clf = sklearn.clone(base_clf)
    clf.set_params(**best_params)
    clf.fit(X, y)
    return clf, best_score, best_entropy
def make_per_customer_submission(dataset_name, estimator, x_transformation=identity,
                                 y_transformation=identity, include_variables=None):
    """BROKEN FOR NOW!"""
    with open(j(DATA_DIR, dataset_name, "per-customer-train.pickle"), "rb") as f:
        dv, train_customers, train_y, train_x, train_weights = pickle.load(f)

    choose_columnns = lambda x: x[:, np.array(
        get_feature_indexi(dv.get_feature_names(), include_variables), dtype="int")]
    x_all_transforms = lambda x: x_transformation(choose_columnns(x))

    model = clone(estimator)
    try:
        model.fit(x_all_transforms(train_x), y_transformation(train_y), sample_weight=train_weights)
    except TypeError:
        print("%s doesn't support `sample_weight`. Ignoring it." % str(model))
        model.fit(x_all_transforms(train_x), y_transformation(train_y))

    with open(j(DATA_DIR, dataset_name, "per-customer-test.pickle"), "rb") as f:
        _, test_customers, test_y, test_x, test_weights = pickle.load(f)

    with open(j("submissions", "%s.csv" % datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")), "w") as f:
        f.write("customer_ID,plan\n")
        for c, ps in zip(test_customers, model.predict(x_all_transforms(test_x))):
            f.write("%s,%s\n" % (c, "".join(str(pp) for pp in ps)))
def fit(self, X, feature):
    try:
        feature = int(feature)
    except Exception:
        self.logger("feature should be int")
        raise TypeError("feature should be int")

    X = X.view(np.ndarray)
    self.input_col_count = X.shape[1]
    self.feature = feature
    my_X = Misc.exclude_cols(X, self.feature)
    my_y = X[:, self.feature]

    y_mean = np.mean(my_y)
    y_std = np.std(my_y)
    # ref: http://www.sciencedirect.com/science/article/pii/S0893608004002102
    self._learner.C = max(abs(y_mean + 3 * y_std), abs(y_mean - 3 * y_std))

    cvs = cv.KFold(len(X), 10, shuffle=True)
    output_errors = np.empty(0)
    for train, test in cvs:
        tmp_l = sklearn.clone(self._learner)
        tmp_l.fit(my_X[train, :], X[train, self.feature])
        output_errors = np.hstack((output_errors, tmp_l.predict(my_X[test, :]) - X[test, self.feature]))

    self.error_std = np.std(output_errors)
    self.error_mean = np.mean(output_errors)

    self._learner.fit(my_X, X[:, self.feature])
    return self
def fit(self, X, y, sample_weight=None):
    label = self.uniform_label
    self.uniform_label = numpy.array([label]) if isinstance(label, numbers.Number) else numpy.array(label)

    sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
    assert numpy.all(numpy.in1d(y, [0, 1])), 'only two-class classification is supported by now'
    y = column_or_1d(y)
    y_signed = 2 * y - 1

    X = pandas.DataFrame(X)
    knn_indices = computeKnnIndicesOfSameClass(self.uniform_variables, X, y, self.n_neighbours)
    # for those events with non-uniform label we repeat the event's own index several times
    for label in [0, 1]:
        if label not in self.uniform_label:
            knn_indices[y == label, :] = numpy.arange(len(y))[y == label][:, numpy.newaxis]

    X = self.get_train_vars(X)
    cumulative_score = numpy.zeros(len(X))
    self.estimators = []

    for stage in range(self.n_estimators):
        classifier = sklearn.clone(self.base_estimator)
        classifier.fit(X, y, sample_weight=sample_weight)
        score = self.learning_rate * self.compute_score(classifier, X=X)
        cumulative_score += score
        sample_weight *= numpy.exp(- y_signed * numpy.take(score, knn_indices).mean(axis=1))
        sample_weight = self.normalize_weights(y=y, sample_weight=sample_weight)
        self.estimators.append(classifier)
def lambda_choice(penalty, lambdas, n_folds, K, y, n_iter=10000, verbose=0, n_jobs=-1):
    estimator = fista.Fista(penalty=penalty, n_iter=n_iter)
    infos = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_sub_info)(clone(estimator), K, y, K, y, lambda_)
        for lambda_ in lambdas)
    return infos
def fit_models(self, X, Y, bands=None):
    """ Fit timeseries models for `bands` within `Y` for a given `X`

    Args:
        X (np.ndarray): design matrix (number of observations x number of features)
        Y (np.ndarray): independent variable matrix (number of series x number of
            observations), with one column per observation in the X design matrix
        bands (iterable): Subset of bands of `Y` to fit. If None are provided,
            fit all bands in Y

    Returns:
        np.ndarray: fitted model objects

    """
    if bands is None:
        bands = np.arange(self.n_series)

    models = []
    for b in bands:
        y = Y.take(b, axis=0)

        model = sklearn.clone(self.lm).fit(X, y)  # TODO: no clone?

        # Add in RMSE calculation  # TODO: numba?
        model.rmse = ((y - model.predict(X)) ** 2).mean(axis=0) ** 0.5

        # Add intercept to intercept term of design matrix
        model.coef = model.coef_.copy()
        model.coef[0] += model.intercept_

        models.append(model)

    return np.array(models)
def run(self, directory="datasets/"):
    loader = ArffLoader("{}/{}.arff".format(directory, self.dataset))
    inputs, labels = loader.get_dataset()
    n_features = inputs.shape[1]
    if self.subset_size >= n_features:
        return None
    if self.normalize:
        preprocessing.normalize(inputs, copy=False)

    results = {
        "experiment": self,
        "scores": {scorer_name: numpy.zeros(self.n_runs) for scorer_name, _ in self.scorers},
        "score_times": {scorer_name: numpy.zeros(self.n_runs) for scorer_name, _ in self.scorers},
        "errors": {classifier_name: numpy.zeros(self.n_runs) for classifier_name, _ in self.classifiers},
        "classifier_times": {classifier_name: numpy.zeros(self.n_runs) for classifier_name, _ in self.classifiers}
    }
    for run in range(self.n_runs):
        numpy.random.seed(run)
        indices = numpy.random.choice(n_features, size=self.subset_size, replace=False)
        inputs_subset = inputs[:, indices].copy()
        for scorer_name, scorer in self.scorers:
            score, t = self._execute_score_run(run, scorer, inputs_subset, labels)
            results["scores"][scorer_name][run] = score
            results["score_times"][scorer_name][run] = t
        for classifier_name, classifier in self.classifiers:
            error, t = self._execute_classifier_run(run, sklearn.clone(classifier), inputs_subset, labels)
            results["errors"][classifier_name][run] = error
            results["classifier_times"][classifier_name][run] = t
    return results
def score(parms):
    e = sklearn.clone(learner)
    pp = dict(parms)
    pp.pop("id")
    try:
        e.set_params(**pp)
        e.fit(*trainSet, maxIters=maxIters)
        if visualParams is not None:
            imgPathBase = os.path.join(imageDestFolder, "{}".format(store.params["id"]))
            # Write some images!
            e.visualize(visualParams, path=imgPathBase + ".png")
            e.visualize(visualParams, path=imgPathBase + "_example.png", inputs=testSet[0][0])
        if scoreModel is None:
            return dict(score=e.score(*testSet))
        else:
            return scoreModel(e, testSet)
    except:
        sys.stderr.write("Error for {}:\n{}\n".format(parms, traceback.format_exc()))
        if e.UNSUPERVISED:
            score = 1.0
        else:
            score = -1.0
        e = None
def estimate_classifier(params_dict, base_estimator, X, y, folds, fold_checks,
                        score_function, sample_weight=None, label=1,
                        scorer_needs_x=False, catch_exceptions=True):
    """This function is needed to train a classifier with some parameters on the cluster."""
    try:
        k_folder = StratifiedKFold(y=y, n_folds=folds, shuffle=True)
        score = 0.
        for train_indices, test_indices in islice(k_folder, fold_checks):
            trainX, trainY = X.irow(train_indices), y[train_indices]
            testX, testY = X.irow(test_indices), y[test_indices]

            estimator = sklearn.clone(base_estimator).set_params(**params_dict)

            train_options = {}
            test_options = {}
            if sample_weight is not None:
                train_weights, test_weights = \
                    sample_weight[train_indices], sample_weight[test_indices]
                train_options['sample_weight'] = train_weights
                test_options['sample_weight'] = test_weights
            if scorer_needs_x:
                test_options['X'] = testX

            estimator.fit(trainX, trainY, **train_options)
            proba = estimator.predict_proba(testX)
            score += score_function(testY, proba[:, label], **test_options)

        return score / fold_checks
    except Exception as e:
        # If there was some exception on the node, it will be returned
        if catch_exceptions:
            return e
        else:
            raise
def train(self, n_experts, instances, labels):
    experts = []
    self.weigher_sampler.train(instances)
    for centroid in self.centroid_picker.pick(instances, labels, n_experts):
        expert = LocalExpert(sklearn.clone(self.base_estimator), self.weigher_sampler)
        expert.train(instances, labels, centroid)
        experts.append(expert)
    return experts
def __init__(self, test_indices=None, lm=sklearn.linear_model.Lasso(alpha=20), **kwargs):
    self.test_indices = np.asarray(test_indices)
    self.lm = sklearn.clone(lm)

    self.n_record = 0
    self.record = []
def single_learning_rate(mf, learning_rate, X_tr, X_te):
    mf = clone(mf)
    mf.set_params(learning_rate=learning_rate, verbose=5)
    cb = Callback(X_tr, X_te)
    mf.set_params(callback=cb)
    mf.fit(X_tr)
    return dict(time=cb.times, rmse=cb.rmse)
def __call__(self, base_estimator, params, X, y, sample_weight=None):
    cl = clone(base_estimator)
    cl.set_params(**params)
    if sample_weight is not None:
        cl.fit(X, y, sample_weight)
    else:
        cl.fit(X, y)
    return roc_auc_score(self.testY, cl.predict_proba(self.testX)[:, 1])
def custom(base_estimator, params, X, y, sample_weight=None):
    cl = clone(base_estimator)
    cl.set_params(**params)
    if sample_weight is not None:
        cl.fit(X, y, sample_weight)
    else:
        cl.fit(X, y)
    return roc_auc_score(labels, cl.predict_proba(test)[:, 1])
def fit_grid_point(X, y, X_val, y_val, base_clf, clf_params):
    clf = sklearn.clone(base_clf)
    clf.set_params(**clf_params)
    clf.fit(X, y)
    proba = clf.predict_proba(X_val)
    score = sklearn.metrics.accuracy_score(proba.argmax(1), y_val)
    entropy = (scipy.stats.distributions.entropy(proba.T) / np.log(proba.shape[1])).mean()
    return json.dumps(clf_params), score, entropy
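# Hedged usage sketch for fit_grid_point (illustrative only; assumes the
# module-level imports used above: sklearn, scipy, np, json). The dataset and
# estimator below are just example choices.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression as LR

X_all, y_all = make_classification(n_samples=200, random_state=0)
setting, score, entropy = fit_grid_point(X_all[:150], y_all[:150],
                                         X_all[150:], y_all[150:],
                                         LR(), {'C': 1.0})
print(setting, score, entropy)  # JSON-encoded params, accuracy, normalized entropy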
def test_get_metric_equivalent_to_explicit_mahalanobis(estimator,
                                                       build_dataset):
    """Tests that using the get_metric method of mahalanobis metric learners is
    equivalent to explicitly calling scipy's mahalanobis metric
    """
    rng = np.random.RandomState(42)
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y(estimator, input_data, labels))
    metric = model.get_metric()
    n_features = X.shape[1]
    a, b = (rng.randn(n_features), rng.randn(n_features))
    expected_dist = mahalanobis(a[None], b[None],
                                VI=model.get_mahalanobis_matrix())
    assert_allclose(metric(a, b), expected_dist, rtol=1e-13)
def test_score_pairs_dim(estimator, build_dataset):
    # scoring of 3D arrays should return 1D array (several tuples),
    # and scoring of 2D arrays (one tuple) should return an error (like
    # scikit-learn's error when scoring 1D arrays)
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    tuples = np.array(list(product(X, X)))
    assert model.score_pairs(tuples).shape == (tuples.shape[0],)
    context = make_context(estimator)
    msg = ("3D array of formed tuples expected{}. Found 2D array "
           "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n"
           .format(context, tuples[1]))
    with pytest.raises(ValueError) as raised_error:
        model.score_pairs(tuples[1])
    assert str(raised_error.value) == msg
def test_data_valid(model, data, target):
    is_log = 0
    if is_log == 1:
        target = np.log(target)
    test_model = clone(model)
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.33, random_state=1)
    test_model.fit(X_train, y_train)
    y_predict = test_model.predict(X_test)
    if is_log == 1:
        rmse(y_test, y_predict)
    else:
        rmse_log(y_test, y_predict)
def crossval_fm_scores(numbered_params, fm, vecs, labels, groups, n_params):
    """Calculates the mean avg precision score for a given classifier via CV.

    Used at training time to find the MAP for the given classifier and params
    via cross validation.
    """
    assert isinstance(fm, FusionModel)
    params_number, params = numbered_params
    logger.debug(
        '%s: param %d/%d, starting CV for:\nparams: %s',
        fm.name, params_number, n_params, params)
    clf = clone(fm.clf)
    clf.set_params(**params)
    n_splits = config.FUSION_PARAM_TUNING_CV_KFOLD
    splits = GroupKFold(n_splits).split(vecs, labels, groups=groups)
    map_ = parallel_map if fm.parallelize_cv else map
    single_split_func = partial(
        crossval_fm_score_single_split,
        fm=fm, clf=clf, params=params, params_number=params_number,
        n_params=n_params, n_splits=n_splits,
        vecs=vecs, labels=labels, groups=groups,
    )
    cv_results = map_(single_split_func, enumerate(splits, 1))
    training_times, all_avg_precs = zip(*cv_results)
    all_avg_precs = list(itertools.chain.from_iterable(all_avg_precs))
    t = sum(training_times, timedelta())
    # if params_number % 10 == 0:
    #     logger.info('%s: completed crossval of param %d', name, params_number)
    mean_avg_prec = np.mean(all_avg_precs)
    std_avg_prec = np.std(all_avg_precs)
    logger.info(
        '%s: param %d/%d:\n'
        'params: %s\n'
        'CV results: score (MAP): %.3f (std: %.3f), training time: %s',
        fm.name, params_number, n_params, params,
        mean_avg_prec, std_avg_prec, t
    )
    return mean_avg_prec, std_avg_prec
def fit(self, X, y=None):
    """
    Fit the model using X, y as training data. Will also learn the groups
    that exist within the dataset.

    :param X: array-like, shape=(n_columns, n_samples,) training data.
    :param y: array-like, shape=(n_samples,) training data.
    :return: Returns an instance of self.
    """
    X, y = self.__prepare_input_data(X, y)

    if self.shrinkage is not None:
        self.__set_shrinkage_function()

    self.group_colnames_ = [str(_) for _ in as_list(self.groups)]

    if self.value_columns is not None:
        self.value_colnames_ = [str(_) for _ in as_list(self.value_columns)]
    else:
        self.value_colnames_ = [_ for _ in X.columns if _ not in self.group_colnames_]
    self.__validate(X, y)

    # List of all hierarchical subsets of columns
    self.group_colnames_hierarchical_ = expanding_list(self.group_colnames_, list)

    self.fallback_ = None

    if self.shrinkage is None and self.use_global_model:
        subset_x = X[self.value_colnames_]
        self.fallback_ = clone(self.estimator).fit(subset_x, y)

    if self.shrinkage is not None:
        self.estimators_ = {}

        for level_colnames in self.group_colnames_hierarchical_:
            self.estimators_.update(
                self.__fit_grouped_estimator(X, y, self.value_colnames_, level_colnames)
            )
    else:
        self.estimators_ = self.__fit_grouped_estimator(X, y, self.value_colnames_, self.group_colnames_)

    self.groups_ = as_list(self.estimators_.keys())

    if self.shrinkage is not None:
        self.shrinkage_factors_ = self.__get_shrinkage_factor(X)

    return self
def test_validate_calibration_params_invalid_parameters_error_before__fit(
        estimator, build_dataset):
    """For all pairs metric learners (which currently all have a _fit method),
    make sure that calibration parameters are validated before fitting"""
    estimator = clone(estimator)
    input_data, labels, _, _ = build_dataset()

    def breaking_fun(**args):
        # a function that fails so that we will miss the calibration at the
        # end and therefore the right error message from validating params
        # should be thrown before
        raise RuntimeError('Game over.')

    estimator._fit = breaking_fun
    expected_msg = ('Strategy can either be "accuracy", "f_beta" or '
                    '"max_tpr" or "max_tnr". Got "weird" instead.')
    with pytest.raises(ValueError) as raised_error:
        estimator.fit(input_data, labels, calibration_params={'strategy': 'weird'})
    assert str(raised_error.value) == expected_msg
def test_pipeline_consistency(estimator, build_dataset, with_preprocessor):
    # Adapted from scikit learn
    # check that make_pipeline(est) gives same score as est
    # we do this test on all except quadruplets (since they don't have a y
    # in fit):
    if estimator.__class__.__name__ not in [e.__class__.__name__
                                            for (e, _) in quadruplets_learners]:
        input_data, y, preprocessor, _ = build_dataset(with_preprocessor)

        def make_random_state(estimator, in_pipeline):
            rs = {}
            name_estimator = estimator.__class__.__name__
            if name_estimator[-11:] == '_Supervised':
                name_param = 'random_state'
                if in_pipeline:
                    name_param = name_estimator.lower() + '__' + name_param
                rs[name_param] = check_random_state(0)
            return rs

        estimator = clone(estimator)
        estimator.set_params(preprocessor=preprocessor)
        pipeline = make_pipeline(estimator)
        estimator.fit(*remove_y_quadruplets(estimator, input_data, y),
                      **make_random_state(estimator, False))
        pipeline.fit(*remove_y_quadruplets(estimator, input_data, y),
                     **make_random_state(estimator, True))

        if hasattr(estimator, 'score'):
            result = estimator.score(
                *remove_y_quadruplets(estimator, input_data, y))
            result_pipe = pipeline.score(
                *remove_y_quadruplets(estimator, input_data, y))
            assert_allclose_dense_sparse(result, result_pipe)

        if hasattr(estimator, 'predict'):
            result = estimator.predict(input_data)
            result_pipe = pipeline.predict(input_data)
            assert_allclose_dense_sparse(result, result_pipe)

        if issubclass(estimator.__class__, TransformerMixin):
            if hasattr(estimator, 'transform'):
                result = estimator.transform(input_data)
                result_pipe = pipeline.transform(input_data)
                assert_allclose_dense_sparse(result, result_pipe)
def fit(self, X, y, sample_weight=None): """ Train the classifier, will train several base classifiers on overlapping subsets of training dataset. :param X: pandas.DataFrame of shape [n_samples, n_features] :param y: labels of events - array-like of shape [n_samples] :param sample_weight: weight of events, array-like of shape [n_samples] or None if all weights are equal """ if hasattr(self.base_estimator, 'features'): assert self.base_estimator.features is None, 'Base estimator must have None features! ' \ 'Use features parameter in Folding to fix it' X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=True) X = self._get_features(X) self._set_classes(y) folds_column = self._get_folds_column(len(X)) for _ in range(self.n_folds): self.estimators.append(clone(self.base_estimator)) if sample_weight is None: weights_iterator = (None for _ in range(self.n_folds)) else: weights_iterator = (sample_weight[folds_column != index] for index in range(self.n_folds)) result = utils.map_on_cluster(self.ipc_profile, train_estimator, range(len(self.estimators)), self.estimators, (X.iloc[folds_column != index, :].copy() for index in range(self.n_folds)), (y[folds_column != index] for index in range(self.n_folds)), weights_iterator) for status, data in result: if status == 'success': name, classifier, spent_time = data self.estimators[name] = classifier else: print('Problem while training on the node, report:\n', data) return self
def setup_dummy_YATSM(X, Y, dates, i_breaks):
    """ Setup a dummy YATSM model

    Args:
        X (np.ndarray): n x p features
        Y (np.ndarray): n_series x n independent data
        dates (np.ndarray): n dates
        i_breaks (iterable): indices of ``dates`` representing break dates
            (can be zero or nonzero, but len(i_breaks) is len(yatsm.record))

    Returns:
        YATSM model
    """
    n = dates.size
    yatsm = YATSM()
    yatsm.X, yatsm.Y, yatsm.dates = X, Y, dates
    yatsm.n_coef, yatsm.n_series = X.shape[1], Y.shape[0]
    yatsm.models = np.array(
        [sklearn.clone(yatsm.estimator) for i in range(yatsm.n_series)])
    yatsm.test_indices = np.arange(yatsm.n_series)

    n_models = len(i_breaks)
    yatsm.record = np.hstack([yatsm.record_template] * n_models)

    def populate_record(yatsm, i_rec, i_start, i_end, i_break):
        yatsm.record[i_rec]['start'] = yatsm.dates[i_start]
        yatsm.record[i_rec]['end'] = yatsm.dates[i_end]
        yatsm.record[i_rec]['break'] = (yatsm.dates[i_break] if i_break
                                        else i_break)
        yatsm.fit_models(X[i_start:i_end, :], Y[:, i_start:i_end])
        for i, m in enumerate(yatsm.models):
            yatsm.record[i_rec]['coef'][:, i] = m.coef
            yatsm.record[i_rec]['rmse'][i] = m.rmse
        return yatsm

    i_start = 0
    i_end = i_breaks[0] - 1 if i_breaks[0] else n - 1
    i_break = i_breaks[0]
    yatsm = populate_record(yatsm, 0, i_start, i_end, i_break)

    for idx, i_break in enumerate(i_breaks[1:]):
        i_start = i_breaks[idx] + 1
        i_end = i_break - 1 if i_break else n - 1
        yatsm = populate_record(yatsm, idx + 1, i_start, i_end, i_break)

    return yatsm
def __init__(self, test_indices=None,
             estimator={'object': sklearn.linear_model.Lasso(alpha=20),
                        'fit': {}},
             **kwargs):
    self.test_indices = np.asarray(test_indices)
    self.estimator = sklearn.clone(estimator['object'])
    self.estimator_fit = estimator.get('fit', {})

    self.models = []  # leave empty, fill in during `fit`

    self.n_record = 0
    self.record = []

    self.n_series, self.n_features = 0, 0

    self.px = kwargs.get('px', 0)
    self.py = kwargs.get('py', 0)
def _train(train_data: DataFrame, time_series_predictor: Any, clusterer: Clustering) -> dict:
    models = dict()

    train_data = clusterer.cluster_data(train_data)

    for cluster in range(clusterer.n_clusters):
        cluster_train_df = train_data[cluster]
        if not cluster_train_df.empty:
            time_series_predictor.fit(cluster_train_df)
            models[cluster] = time_series_predictor
            time_series_predictor = clone(time_series_predictor, safe=False)

    return {
        ModelType.CLUSTERER.value: clusterer,
        ModelType.TIME_SERIES_PREDICTOR.value: models
    }
def _setparams_clustering(method, masker, n_clusters, crop=False):
    """Setting the parameters of the clustering method

    method : sklearn clustering-like or Random Projections
    masker : NiftiMasker
    n_clusters : int
    crop : bool, only for slic
    """
    method = clone(method)
    if hasattr(method, 'n_clusters'):
        method.set_params(**{'masker': masker, 'n_clusters': n_clusters})
        if hasattr(method, 'crop'):
            method.set_params(crop=crop)
    else:
        method.set_params(n_components=n_clusters)
    return method
def fit(self, X, y, **kwargs):
    """Fit the estimator.

    If `prefit` is set to `True` then the base estimator is kept as is.
    Otherwise it is fitted from the provided arguments.
    """
    if self.estimator is None:
        raise ValueError(BASE_ESTIMATOR_NONE_ERROR_MESSAGE)

    if not self.prefit:
        self.estimator_ = clone(self.estimator).fit(X, y, **kwargs)
    else:
        try:
            check_is_fitted(self.estimator)
        except NotFittedError:
            warn(BASE_ESTIMATOR_NOT_FITTED_WARNING.format(type(self).__name__))
        self.estimator_ = self.estimator

    return self
def extra_trees(X, y, n_est):
    '''
    INPUT: Dataframe with features (X), target variable dataframe (y), number of estimators (parameter)
    OUTPUT: Score of ExtraTrees model
    '''
    # standardize features before splitting so that the scaled data is what
    # actually gets used for training and scoring
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    ext = ExtraTreesRegressor(n_estimators=n_est)
    # fit a clone of the template estimator and score it on the held-out data
    clf = clone(ext)
    clf.fit(X_train, y_train)
    scores = clf.score(X_test, y_test)
    return 'ExtraTrees Score: ' + str(scores), dict(zip(cols2, clf.feature_importances_))
def get_deep_copy(self):
    """ This creates an untrained copy of the model structure """
    if self.mtype == mtype.SKL:
        return clone(self.model)
    elif self.mtype == mtype.XG:
        if isinstance(self.model, XGBRegressor):
            return XGBRegressor(**self.model.get_params())
        if isinstance(self.model, XGBClassifier):
            return XGBClassifier(**self.model.get_params())
    elif self.mtype == mtype.LGBM:
        # TODO
        pass
    elif self.mtype == mtype.KERAS:
        # TODO Consider whether ref is to KerasNN wrapper or the model itself. Probably the latter.
        # return KerasNN(self.model.neuronList, self.model.dropout, self.model.epochs)
        pass
def randomForests(X, y, X_test):
    predictions = []
    n_estimators = 200
    models = [
        DecisionTreeClassifier(max_depth=None),
        RandomForestClassifier(n_estimators=n_estimators),
        ExtraTreesClassifier(n_estimators=n_estimators),
        AdaBoostClassifier(DecisionTreeClassifier(max_depth=None), n_estimators=n_estimators)
    ]
    for model in models:
        # fit a fresh clone so the template estimators in `models` stay untrained
        clf = clone(model)
        clf.fit(X, y)
        prediction = clf.predict(X_test)
        predictions.append(prediction)
    return predictions
def train_predict(self, X_train, y_train, X_predict):
    # ~~~~~~~~~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~
    # This function will train on the entire training dataset and will then
    # create predictions on the Kaggle test set.
    # This function is meant primarily for the purpose of submission.
    #
    # ~~~~~~~~~~~~~~~~ Parameters ~~~~~~~~~~~~~~~~~~~
    # Input:
    #   - X_train: Full train dataset to train on
    #   - y_train: Full y train to train on
    #   - X_predict: Full test dataset (without labels)
    # Output:
    #   - y_pred: predictions on X_predict
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    instance = clone(self.model)
    instance.fit(X_train, y_train)
    y_pred = instance.predict(X_predict)
    return y_pred
def __init__(self,
             n_trees=500,
             min_leaf_size=10, max_depth=10,
             subsample_ratio=0.7,
             bootstrap=False,
             lambda_reg=0.01,
             propensity_model=LogisticRegression(penalty='l1', solver='saga',
                                                 multi_class='auto'),  # saga solver supports l1
             model_Y=WeightedLassoCVWrapper(cv=3),
             propensity_model_final=None,
             model_Y_final=None,
             categories='auto',
             n_jobs=-1,
             random_state=None):
    # Copy and/or define models
    self.propensity_model = clone(propensity_model, safe=False)
    self.model_Y = clone(model_Y, safe=False)
    self.propensity_model_final = clone(propensity_model_final, safe=False)
    self.model_Y_final = clone(model_Y_final, safe=False)
    if self.propensity_model_final is None:
        self.propensity_model_final = clone(self.propensity_model, safe=False)
    if self.model_Y_final is None:
        self.model_Y_final = clone(self.model_Y, safe=False)

    # Nuisance estimators shall be defined during fitting because they need to know the number of distinct
    # treatments
    nuisance_estimator = None
    second_stage_nuisance_estimator = None
    # Define parameter estimators
    parameter_estimator = DiscreteTreatmentOrthoForest.parameter_estimator_func
    second_stage_parameter_estimator = \
        DiscreteTreatmentOrthoForest.second_stage_parameter_estimator_gen(lambda_reg)
    # Define moment and mean gradient estimator
    moment_and_mean_gradient_estimator = \
        DiscreteTreatmentOrthoForest.moment_and_mean_gradient_estimator_func
    if categories != 'auto':
        categories = [categories]  # OneHotEncoder expects a 2D array with features per column
    self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')

    super(DiscreteTreatmentOrthoForest, self).__init__(
        nuisance_estimator,
        second_stage_nuisance_estimator,
        parameter_estimator,
        second_stage_parameter_estimator,
        moment_and_mean_gradient_estimator,
        n_trees=n_trees,
        min_leaf_size=min_leaf_size,
        max_depth=max_depth,
        subsample_ratio=subsample_ratio,
        bootstrap=bootstrap,
        n_jobs=n_jobs,
        random_state=random_state)
def test_predict_monotonous(estimator, build_dataset, with_preprocessor):
    """Test that there is a threshold distance separating points labeled as
    similar and points labeled as dissimilar
    """
    input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)
    pairs_train, pairs_test, y_train, y_test = train_test_split(input_data,
                                                                labels)
    estimator.fit(pairs_train, y_train)
    distances = estimator.score_pairs(pairs_test)
    predictions = estimator.predict(pairs_test)
    min_dissimilar = np.min(distances[predictions == -1])
    max_similar = np.max(distances[predictions == 1])
    assert max_similar <= min_dissimilar
    separator = np.mean([min_dissimilar, max_similar])
    assert (predictions[distances > separator] == -1).all()
    assert (predictions[distances < separator] == 1).all()
def adapt_ubm(ubm, X, adapt_params='m', adapt_iter=10):
    # clone UBM (n_components, covariance type, etc...)
    gmm = sklearn.clone(ubm)

    # initialize with UBM precomputed weights, means and covariance matrices
    gmm.n_init = 1
    gmm.init_params = ''
    gmm.weights_ = ubm.weights_
    gmm.means_ = ubm.means_
    gmm.covars_ = ubm.covars_

    # adapt only some parameters
    gmm.params = adapt_params
    gmm.n_iter = adapt_iter

    gmm.fit(X)

    return gmm
def __init__(self, forecaster, refit_interval=0, refit_window_size=None, refit_window_lag=0):
    self.forecaster = forecaster
    self.forecaster_ = clone(forecaster)
    self.refit_interval = refit_interval
    self.refit_window_size = refit_window_size
    self.refit_window_lag = refit_window_lag

    super(UpdateRefitsEvery, self).__init__()

    self.clone_tags(forecaster)
    # fit must be executed to fit the wrapped estimator and remember the cutoff
    self.set_tags(fit_is_empty=False)
def learn_curve(model, train_x, train_label, cv=3, scoring='neg_log_loss'):
    model_c = clone(model)
    N, train_score, test_score = learning_curve(model_c, train_x, train_label,
                                                cv=cv,
                                                train_sizes=np.linspace(0.5, 1, 5),
                                                scoring=scoring)
    plt.figure(figsize=(7, 4))
    plt.title('{}'.format(type(model).__name__))
    plt.plot(N, np.mean(train_score, 1), color='blue', label='training score')
    plt.plot(N, np.mean(test_score, 1), color='red', label='validation score')
    plt.xlabel('training sample')
    plt.ylabel(scoring)
    plt.legend(loc=0)
    plt.show()
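# Hedged usage sketch for learn_curve (illustrative only; assumes the
# module-level imports used above: clone, learning_curve, np and plt).
# The dataset and estimator are example choices, not from the original project.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=500, random_state=0)
# Plots training vs. validation accuracy as the training set grows.
learn_curve(LogisticRegression(max_iter=1000), X_demo, y_demo, cv=3, scoring='accuracy')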
def _transform(self, X, y=None):
    """Transform X and return a transformed version.

    private _transform containing core logic, called from transform

    Parameters
    ----------
    X : Series or Panel of mtype X_inner_mtype
        if X_inner_mtype is list, _transform must support all types in it
        Data to be transformed
    y : Series or Panel of mtype y_inner_mtype, default=None
        Additional data, e.g., labels for transformation

    Returns
    -------
    transformed version of X
    """
    return clone(self.transformer).fit_transform(X=X, y=y)
def test_dont_overwrite_parameters(estimator, build_dataset, with_preprocessor):
    # Adapted from scikit-learn
    # check that fit method only changes or sets private attributes
    input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    dict_before_fit = estimator.__dict__.copy()

    estimator.fit(*remove_y_quadruplets(estimator, input_data, labels))
    dict_after_fit = estimator.__dict__

    public_keys_after_fit = [key for key in dict_after_fit.keys()
                             if is_public_parameter(key)]

    attrs_added_by_fit = [key for key in public_keys_after_fit
                          if key not in dict_before_fit.keys()]

    # check that fit doesn't add any public attribute
    assert not attrs_added_by_fit, (
        "Estimator adds public attribute(s) during"
        " the fit method."
        " Estimators are only allowed to add private attributes"
        " either started with _ or ended"
        " with _ but %s added" % ', '.join(attrs_added_by_fit))

    # check that fit doesn't change any public attribute
    attrs_changed_by_fit = [key for key in public_keys_after_fit
                            if (dict_before_fit[key]
                                is not dict_after_fit[key])]

    assert not attrs_changed_by_fit, (
        "Estimator changes public attribute(s) during"
        " the fit method. Estimators are only allowed"
        " to change attributes started"
        " or ended with _, but"
        " %s changed" % ', '.join(attrs_changed_by_fit))
def __init__(self, controls_model, treated_model,
             cate_controls_model=None, cate_treated_model=None,
             propensity_model=LogisticRegression(),
             propensity_func=None):
    self.controls_model = clone(controls_model, safe=False)
    self.treated_model = clone(treated_model, safe=False)
    self.cate_controls_model = clone(cate_controls_model, safe=False)
    self.cate_treated_model = clone(cate_treated_model, safe=False)
    if self.cate_controls_model is None:
        self.cate_controls_model = clone(self.controls_model, safe=False)
    if self.cate_treated_model is None:
        self.cate_treated_model = clone(self.treated_model, safe=False)
    self.propensity_func = clone(propensity_func, safe=False)
    self.propensity_model = clone(propensity_model, safe=False)
    self.has_propensity_func = self.propensity_func is not None
def test_cross_validation_manual_vs_scikit(estimator, build_dataset,
                                           with_preprocessor):
    """Tests that if we make a manual cross-validation, the result will be the
    same as scikit-learn's cross-validation (some code for generating the
    folds is taken from scikit-learn).
    """
    if any(hasattr(estimator, method) for method in ["predict", "score"]):
        input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
        estimator = clone(estimator)
        estimator.set_params(preprocessor=preprocessor)
        set_random_state(estimator)
        n_splits = 3
        kfold = KFold(shuffle=False, n_splits=n_splits)
        n_samples = input_data.shape[0]
        fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int)
        fold_sizes[:n_samples % n_splits] += 1
        current = 0
        scores, predictions = [], np.zeros(input_data.shape[0])
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            current = stop
            test_slice = slice(start, stop)
            train_mask = np.ones(input_data.shape[0], bool)
            train_mask[test_slice] = False
            y_train, y_test = labels[train_mask], labels[test_slice]
            estimator.fit(*remove_y_quadruplets(estimator,
                                                input_data[train_mask],
                                                y_train))
            if hasattr(estimator, "score"):
                scores.append(estimator.score(*remove_y_quadruplets(
                    estimator, input_data[test_slice], y_test)))
            if hasattr(estimator, "predict"):
                predictions[test_slice] = estimator.predict(
                    input_data[test_slice])
        if hasattr(estimator, "score"):
            assert all(scores == cross_val_score(
                estimator,
                *remove_y_quadruplets(estimator, input_data, labels),
                cv=kfold))
        if hasattr(estimator, "predict"):
            assert all(predictions == cross_val_predict(
                estimator,
                *remove_y_quadruplets(estimator, input_data, labels),
                cv=kfold))
def fit(self, X, y=None): """ Fit the model using X, y as training data. Will also learn the groups that exist within the dataset. :param X: array-like, shape=(n_columns, n_samples,) training data. :param y: array-like, shape=(n_samples,) training data. :return: Returns an instance of self. """ X_group, X_value = _split_groups_and_values(X, self.groups, min_value_cols=0, check_X=self.check_X, **self._check_kwargs) X_group = self.__add_shrinkage_column(X_group) if y is not None: y = check_array(y, ensure_2d=False) if self.shrinkage is not None: self.__set_shrinkage_function() # List of all hierarchical subsets of columns self.group_colnames_hierarchical_ = expanding_list( X_group.columns, list) self.fallback_ = None if self.shrinkage is None and self.use_global_model: self.fallback_ = clone(self.estimator).fit(X_value, y) if self.shrinkage is not None: self.estimators_ = self.__fit_shrinkage_groups(X_group, X_value, y) else: self.estimators_ = self.__fit_grouped_estimator( X_group, X_value, y) self.groups_ = as_list(self.estimators_.keys()) if self.shrinkage is not None: self.shrinkage_factors_ = self.__get_shrinkage_factor(X_group) return self
def stacking(model, train_data, train_target, test_data, test_target, n_fold):
    """
    :param model: model to stack
    :param train_data: training set (without the target feature to predict)
    :param train_target: target feature to predict
    :param test_data: test set
    :param test_target: test-set labels, used only for reporting accuracy
    :param n_fold: number of cross-validation folds
    :return: test-set predictions and out-of-fold training-set predictions
    """
    train_data = pd.DataFrame(train_data)
    train_target = pd.DataFrame(train_target)
    test_data = pd.DataFrame(test_data)

    skf = StratifiedKFold(n_splits=n_fold, shuffle=True)  # StratifiedKFold samples within each class

    train_pred = np.zeros((train_data.shape[0], 1), int)  # out-of-fold predictions on the training set
    # test-set predictions, one column appended per fold; start with zero columns
    # so that the per-fold average is not biased by an initial all-zero column
    test_pred = np.zeros((test_data.shape[0], 0), int)

    for skf_index, (train_index, val_index) in enumerate(skf.split(train_data, train_target)):
        print('Fold', skf_index + 1, 'of cross-validation starting...')
        # split the training data
        new_model = clone(model)
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[val_index]
        y_train, y_val = train_target.iloc[train_index], train_target.iloc[val_index]

        # fit the model
        y_train = np.ravel(y_train)  # flatten to a 1-d array
        new_model.fit(X=x_train, y=y_train)

        # evaluate on the validation fold
        accs = accuracy_score(y_val, new_model.predict(x_val))
        print('Fold', skf_index + 1, 'validation accuracy:', accs)

        # out-of-fold predictions for the training set
        val_pred = new_model.predict(x_val)
        for i in range(len(val_index)):
            train_pred[val_index[i]] = val_pred[i]

        # store the test-set predictions of this fold
        print('Fold', skf_index + 1, 'test accuracy:', accuracy_score(test_target, new_model.predict(test_data)))
        test_pred = np.column_stack((test_pred, new_model.predict(test_data)))  # append as a new column

    test_pred_mean = np.mean(test_pred, axis=1)                # average over folds (may be fractional)
    test_pred_mean = pd.DataFrame(test_pred_mean)
    test_pred_mean = test_pred_mean.apply(lambda x: round(x))  # round to integer labels
    test_set = np.ravel(test_pred_mean)
    return test_set.reshape(test_set.shape[0], 1), np.array(train_pred)
def sWeights_to_proba(x, sWeights, model, use_cross_val=False,
                      cv_params={"cv": 4, "n_jobs": 1}):
    if model.get_params()['loss_function'] != 'ConstrainedRegression':
        raise ValueError(
            "Smart training requires catboost with ConstrainedRegression loss")
    if use_cross_val:
        raw_predictions = sklearn.model_selection.cross_val_predict(
            model, x, sWeights, **cv_params)
    else:
        model_ = sklearn.clone(model)
        model_.fit(x, sWeights)
        raw_predictions = model_.predict(x)
    return expit(raw_predictions)
def test_accuracy_toy_example(estimator, build_dataset):
    """Test that the default scoring for triplets (accuracy) works on some
    toy example"""
    triplets, _, _, X = build_dataset(with_preprocessor=False)
    estimator = clone(estimator)
    set_random_state(estimator)
    estimator.fit(triplets)
    # We take the two first points and we build 4 regularly spaced points on
    # the line they define, so that it's easy to build triplets of different
    # similarities.
    X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4
    triplets_test = np.array(
        [[X_test[0], X_test[2], X_test[1]],
         [X_test[1], X_test[3], X_test[0]],
         [X_test[1], X_test[2], X_test[3]],
         [X_test[3], X_test[0], X_test[2]]])
    # we force the transformation to be identity so that we control what it does
    estimator.components_ = np.eye(X.shape[1])
    assert estimator.score(triplets_test) == 0.25