def test_is_classifier():
    svc = SVC()
    assert_true(is_classifier(svc))
    assert_true(is_classifier(GridSearchCV(svc, {'C': [0.1, 1]})))
    assert_true(is_classifier(Pipeline([('svc', svc)])))
    assert_true(is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))])))
def fit(self, X, y): if is_classifier(self): self.classes_, y = np.unique(y, return_inverse=True) self.num_classes_ = len(self.classes_) else: self.num_classes_ = -1 # Split data into train/val X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=self.holdout_split, random_state=self.random_state, stratify=y if is_classifier(self) else None, ) # Define attributes self.attributes_ = EBMUtils.gen_attributes(self.col_types, self.col_n_bins) # Build EBM allocation code if is_classifier(self): model_type = "classification" else: model_type = "regression" self.intercept_ = 0 self.attribute_sets_ = [] self.attribute_set_models_ = [] main_attr_indices = [[x] for x in range(len(self.attributes_))] main_attr_sets = EBMUtils.gen_attribute_sets(main_attr_indices) with closing( NativeEBM( self.attributes_, main_attr_sets, X_train, y_train, X_val, y_val, num_inner_bags=self.feature_step_n_inner_bags, num_classification_states=self.num_classes_, model_type=model_type, training_scores=None, validation_scores=None, ) ) as native_ebm: # Train main effects self._fit_main(native_ebm, main_attr_sets) # Build interaction terms self.inter_indices_ = self._build_interactions(native_ebm) self.staged_fit_interactions(X, y, self.inter_indices_) return self
def explain_local(self, X, y=None, name=None): # Produce feature value pairs for each instance. # Values are the model graph score per respective attribute set. if name is None: name = gen_name_from_class(self) X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types) instances = self.preprocessor_.transform(X) scores_gen = EBMUtils.scores_by_attrib_set( instances, self.attribute_sets_, self.attribute_set_models_ ) n_rows = instances.shape[0] data_dicts = [] for _ in range(n_rows): data_dict = { "type": "univariate", "names": [], "scores": [], "values": [], "extra": { "names": ["Intercept"], "scores": [self.intercept_], "values": [1], }, } data_dicts.append(data_dict) for set_idx, attribute_set, scores in scores_gen: for row_idx in range(n_rows): feature_name = self.feature_names[set_idx] data_dicts[row_idx]["names"].append(feature_name) data_dicts[row_idx]["scores"].append(scores[row_idx]) if attribute_set["n_attributes"] == 1: data_dicts[row_idx]["values"].append( X[row_idx, attribute_set["attributes"][0]] ) else: data_dicts[row_idx]["values"].append("") if is_classifier(self): scores = EBMUtils.classifier_predict_proba(instances, self)[:, 1] else: scores = EBMUtils.regressor_predict(instances, self) for row_idx in range(n_rows): data_dicts[row_idx]["perf"] = perf_dict(y, scores, row_idx) selector = gen_local_selector(instances, y, scores) internal_obj = {"overall": None, "specific": data_dicts} return EBMExplanation( "local", internal_obj, feature_names=self.feature_names, feature_types=self.feature_types, name=name, selector=selector, )
def score_fn(est, X, y, drop_indices):
    if is_classifier(est):
        prob = EBMUtils.classifier_predict_proba(X, est, drop_indices)
        return -1.0 * roc_auc_score(y, prob[:, 1])
    else:
        pred = EBMUtils.regressor_predict(X, est, drop_indices)
        return mean_squared_error(y, pred)
def _is_classifier(self):
    """Whether the underlying model is a classifier

    Return:
        (boolean) whether `self.model` is a classifier
    """
    return is_classifier(self.model) or hasattr(self.model, 'predict_proba')
def permutation_test_score(estimator, X, y, groups=None, cv=None, n_permutations=100, n_jobs=1, random_state=0, verbose=0, scoring=None): """ Evaluate the significance of a cross-validated score with permutations, as in test 1 of [Ojala2010]_. A modification of original sklearn's permutation test score function to evaluate p-value outside this function, so that the score can be reused from outside. .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier Performance. The Journal of Machine Learning Research (2010) vol. 11 """ X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) scorer = check_scoring(estimator, scoring=scoring) random_state = check_random_state(random_state) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( delayed(_permutation_test_score)( clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer) for _ in range(n_permutations)) permutation_scores = np.array(permutation_scores) return permutation_scores
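# Hedged usage sketch (not taken from any of the sources above): the modified
# permutation_test_score returns the raw permutation scores precisely so the
# p-value can be computed outside the function. Assuming that function is
# importable alongside scikit-learn, an Ojala & Garriga (2010) style p-value
# could be derived roughly as follows; the dataset and classifier are
# illustrative placeholders.
def _example_permutation_p_value():
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score as sk_cross_val_score

    X, y = make_classification(n_samples=200, random_state=0)
    clf = LogisticRegression(max_iter=1000)

    # Score on the true labels, then on label permutations.
    true_score = sk_cross_val_score(clf, X, y, cv=5).mean()
    perm_scores = permutation_test_score(clf, X, y, cv=5, n_permutations=100)

    # Empirical p-value: fraction of permuted scores at least as good as the
    # score obtained on the original labels.
    return (np.sum(perm_scores >= true_score) + 1.0) / (len(perm_scores) + 1.0)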
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): """ Evaluate a score by cross-validation """ if not isinstance(scoring, (list, tuple)): scoring = [scoring] X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) splits = list(cv.split(X, y, groups)) scorer = [check_scoring(estimator, scoring=s) for s in scoring] # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, verbose, None, fit_params) for train, test in splits) group_order = [] if hasattr(cv, 'groups'): group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits] return np.squeeze(np.array(scores)), group_order
def benchmark(clf, X, y, cv=None): X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) cv = check_cv(cv, X, y, classifier=is_classifier(clf)) # learning_curve_ = learning_curve(clf, X_all, y_all, cv=cv) train_times = [] test_times = [] confusion_matrices = [] confusion_matrix_indices = [] coefs = [] for train, test in cv: X_train, y_train = X[train], y[train] X_test, y_test = X[test], y[test] t0 = time() clf.fit(X_train, y_train) train_times.append(time()-t0) t0 = time() y_pred = clf.predict(X_test) test_times.append(time()-t0) confusion_matrices.append(confusion_matrix(y_test, y_pred)) confusion_matrix_indices.append(np.array([[test[pred] for pred in true] for true in confusion_matrix_instances(y_test, y_pred)])) coefs.append(clf.coef_) return dict( train_times = np.array(train_times), test_times = np.array(test_times), confusion_matrices = np.array(confusion_matrices), confusion_matrix_indices = np.array(confusion_matrix_indices), coefs = np.array(coefs) )
def build_graph(estimator, cv, scorer, candidate_params, X, y=None, groups=None, fit_params=None, iid=True, refit=True, error_score='raise', return_train_score=True, cache_cv=True): X, y, groups = to_indexable(X, y, groups) cv = check_cv(cv, y, is_classifier(estimator)) # "pairwise" estimators require a different graph for CV splitting is_pairwise = getattr(estimator, '_pairwise', False) dsk = {} X_name, y_name, groups_name = to_keys(dsk, X, y, groups) n_splits = compute_n_splits(cv, X, y, groups) if fit_params: # A mapping of {name: (name, graph-key)} param_values = to_indexable(*fit_params.values(), allow_scalars=True) fit_params = {k: (k, v) for (k, v) in zip(fit_params, to_keys(dsk, *param_values))} else: fit_params = {} fields, tokens, params = normalize_params(candidate_params) main_token = tokenize(normalize_estimator(estimator), fields, params, X_name, y_name, groups_name, fit_params, cv, error_score == 'raise', return_train_score) cv_name = 'cv-split-' + main_token dsk[cv_name] = (cv_split, cv, X_name, y_name, groups_name, is_pairwise, cache_cv) if iid: weights = 'cv-n-samples-' + main_token dsk[weights] = (cv_n_samples, cv_name) else: weights = None scores = do_fit_and_score(dsk, main_token, estimator, cv_name, fields, tokens, params, X_name, y_name, fit_params, n_splits, error_score, scorer, return_train_score) cv_results = 'cv-results-' + main_token candidate_params_name = 'cv-parameters-' + main_token dsk[candidate_params_name] = (decompress_params, fields, params) dsk[cv_results] = (create_cv_results, scores, candidate_params_name, n_splits, error_score, weights) keys = [cv_results] if refit: best_params = 'best-params-' + main_token dsk[best_params] = (get_best_params, candidate_params_name, cv_results) best_estimator = 'best-estimator-' + main_token if fit_params: fit_params = (dict, (zip, list(fit_params.keys()), list(pluck(1, fit_params.values())))) dsk[best_estimator] = (fit_best, clone(estimator), best_params, X_name, y_name, fit_params) keys.append(best_estimator) return dsk, keys, n_splits
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y = indexable(X, y) cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator)) base_estimator = clone(self.estimator) best = best_parameters(base_estimator, cv, X, y, parameter_iterable, self.scorer_, self.fit_params, self.iid) best = best.compute() self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if isinstance(base_estimator, Pipeline): base_estimator = base_estimator.to_sklearn().compute() if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = base_estimator.set_params(**best.parameters) if y is not None: self.best_estimator_ = best_estimator.fit(X, y, **self.fit_params) else: self.best_estimator_ = best_estimator.fit(X, **self.fit_params) return self
def add_del_cv(df, predictors, target, model, scoring='roc_auc', cv1=None, n_folds=8, n_jobs=-1, start=[], selmax=None, selmin=1, min_ratio=1e-7, max_steps=10, verbosity=0): """ Forward-Backward (ADD-DEL) selection using model. Parameters ---------- Returns ------- selected: list selected predictors Example ------- References ---------- """ def test_to_break(selected, selected_curr, to_break): if set(selected) == set(selected_curr): to_break += 1 else: to_break = 0 return to_break X, y, _ = df_xyf(df, predictors=predictors, target=target) cv1 = cross_validation.check_cv( cv1, X=X, y=y, classifier=is_classifier(model)) selected_curr = start to_break = 0 for i_step in xrange(max_steps): selected = forward_cv( df, predictors, target, model, scoring=scoring, cv1=cv1, n_folds=n_folds, n_jobs=n_jobs, start=selected_curr, selmax=selmax, min_ratio=min_ratio, verbosity=verbosity-1) to_break = test_to_break(selected, selected_curr, to_break) selected_curr = selected if verbosity > 0: print('forward:', ' '.join(selected_curr)) if to_break > 1: break selected = backward_cv( df, selected_curr, target, model, scoring=scoring, cv1=cv1, n_folds=n_folds, n_jobs=n_jobs, selmin=selmin, min_ratio=min_ratio, verbosity=verbosity-1) to_break = test_to_break(selected, selected_curr, to_break) selected_curr = selected if verbosity > 0: print('backward:', ' '.join(selected_curr)) if to_break > 0: break return selected_curr
def transform(self, X, y=None):
    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

    X_prob = np.zeros((X.shape[0], self.n_classes))
    X_pred = np.zeros(X.shape[0])

    for estimator, (_, test) in zip(self.estimators_, cv.split(X)):
        X_prob[test] = estimator.predict_proba(X[test])
        X_pred[test] = estimator.predict(X[test])

    return np.hstack([X_prob, np.array([X_pred]).T])
def fit(self, X, y):
    y_labels = self._get_labels(y)
    cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator))
    self.estimators_ = []

    for train, _ in cv.split(X, y_labels):
        self.estimators_.append(
            clone(self.estimator).fit(X[train], y_labels[train])
        )

    return self
def _set_cv(cv, estimator=None, X=None, y=None): """ Set the default cross-validation depending on whether clf is classifier or regressor. """ from sklearn.base import is_classifier # Detect whether classification or regression if estimator in ['classifier', 'regressor']: est_is_classifier = estimator == 'classifier' else: est_is_classifier = is_classifier(estimator) # Setup CV if check_version('sklearn', '0.18'): from sklearn import model_selection as models from sklearn.model_selection import (check_cv, StratifiedKFold, KFold) if isinstance(cv, (int, np.int)): XFold = StratifiedKFold if est_is_classifier else KFold cv = XFold(n_splits=cv) elif isinstance(cv, str): if not hasattr(models, cv): raise ValueError('Unknown cross-validation') cv = getattr(models, cv) cv = cv() cv = check_cv(cv=cv, y=y, classifier=est_is_classifier) else: from sklearn import cross_validation as models from sklearn.cross_validation import (check_cv, StratifiedKFold, KFold) if isinstance(cv, (int, np.int)): if est_is_classifier: cv = StratifiedKFold(y=y, n_folds=cv) else: cv = KFold(n=len(y), n_folds=cv) elif isinstance(cv, str): if not hasattr(models, cv): raise ValueError('Unknown cross-validation') cv = getattr(models, cv) if cv.__name__ not in ['KFold', 'LeaveOneOut']: raise NotImplementedError('CV cannot be defined with str for' ' sklearn < .017.') cv = cv(len(y)) cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier) # Extract train and test set to retrieve them at predict time if hasattr(cv, 'split'): cv_splits = [(train, test) for train, test in cv.split(X=np.zeros_like(y), y=y)] else: # XXX support sklearn.cross_validation cv cv_splits = [(train, test) for train, test in cv] if not np.all([len(train) for train, _ in cv_splits]): raise ValueError('Some folds do not have any train epochs.') return cv, cv_splits
def fit(self, X, y): """Actual fitting, performing the search over parameters.""" parameter_iterable = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring, parameters, cv=cv) for parameters in parameter_iterable) best = sorted(out, reverse=True)[0] self.best_params_ = best[1] self.best_score_ = best[0] if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best[1]) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def _wrapped_cross_val_score(sklearn_pipeline, features, target, cv, scoring_function, sample_weight=None, groups=None): """Fit estimator and compute scores for a given dataset split. Parameters ---------- sklearn_pipeline : pipeline object implementing 'fit' The object to use to fit the data. features : array-like of shape at least 2D The data to fit. target : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. cv: int or cross-validation generator If CV is a number, then it is the number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. If it is an object then it is an object to be used as a cross-validation generator. scoring_function : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. sample_weight : array-like, optional List of sample weights to balance (or un-balanace) the dataset target as needed groups: array-like {n_samples, }, optional Group labels for the samples used while splitting the dataset into train/test set """ sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight) features, target, groups = indexable(features, target, groups) cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline)) cv_iter = list(cv.split(features, target, groups)) scorer = check_scoring(sklearn_pipeline, scoring=scoring_function) try: with warnings.catch_warnings(): warnings.simplefilter('ignore') scores = [_fit_and_score(estimator=clone(sklearn_pipeline), X=features, y=target, scorer=scorer, train=train, test=test, verbose=0, parameters=None, fit_params=sample_weight_dict) for train, test in cv_iter] CV_score = np.array(scores)[:, 0] return np.nanmean(CV_score) except TimeoutException: return "Timeout" except Exception as e: return -float('inf')
def _grid_search(self, train_X, train_y): if callable(self.inner_cv): # inner_cv = self.inner_cv(train_X, train_y) inner_cv = self.inner_cv.split(train_X, train_y) else: # inner_cv = _check_cv(self.inner_cv, train_X, train_y, # classifier=is_classifier(self.estimator)) inner_cv = _check_cv(self.inner_cv, train_y, classifier=is_classifier( self.estimator)).split(train_X, train_y) master = MPIGridSearchCVMaster(self.param_grid, inner_cv, self.estimator, self.scorer_, self.fit_params) return master.run(train_X, train_y)
def _predict(X, estimators): """Aux function of GeneralizationAcrossTime Predict each classifier. If multiple classifiers are passed, average prediction across all classifiers to result in a single prediction per classifier. Parameters ---------- estimators : ndarray, shape (n_folds,) | shape (1,) Array of scikit-learn classifiers to predict data. X : ndarray, shape (n_epochs, n_features, n_times) To-be-predicted data Returns ------- y_pred : ndarray, shape (n_epochs, m_prediction_dimensions) Classifier's prediction for each trial. """ from scipy import stats from sklearn.base import is_classifier # Initialize results: n_epochs = X.shape[0] n_clf = len(estimators) # Compute prediction for each sub-estimator (i.e. per fold) # if independent, estimators = all folds for fold, clf in enumerate(estimators): _y_pred = clf.predict(X) # initialize predict_results array if fold == 0: predict_size = _y_pred.shape[1] if _y_pred.ndim > 1 else 1 y_pred = np.ones((n_epochs, predict_size, n_clf)) if predict_size == 1: y_pred[:, 0, fold] = _y_pred else: y_pred[:, :, fold] = _y_pred # Collapse y_pred across folds if necessary (i.e. if independent) if fold > 0: # XXX need API to identify how multiple predictions can be combined? if is_classifier(clf): y_pred, _ = stats.mode(y_pred, axis=2) else: y_pred = np.mean(y_pred, axis=2) # Format shape y_pred = y_pred.reshape((n_epochs, predict_size)) return y_pred
def fit(self, X, y): """Fit the model to the training data.""" X, y = check_X_y(X, y, force_all_finite=False, multi_output=self.multi_output) _check_param_grid(self.param_grid) # cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator)) cv = _check_cv(self.cv, y, classifier=is_classifier(self.estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) if comm_rank == 0: self._fit_master(X, y, cv) else: self._fit_slave() return self
def score(self,test_parameter): """ The score function to call in order to evaluate the quality of the parameter test_parameter Parameters ---------- `tested_parameter` : dict, the parameter to test Returns ------- `score` : the CV score, either the list of all cv results or the mean (depending of score_format) """ if not self._callable_estimator: cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, train, test, False, test_parameter, self.fit_params, return_parameters=True) for train, test in cv ] n_test_samples = 0 mean_score = 0 detailed_score = [] for tmp_score, tmp_n_test_samples, _, _ in cv_score: detailed_score.append(tmp_score) tmp_score *= tmp_n_test_samples n_test_samples += tmp_n_test_samples mean_score += tmp_score mean_score /= float(n_test_samples) if(self.score_format == 'avg'): score = mean_score else: # format == 'cv' score = detailed_score else: if(self.score_format == 'avg'): score = [self.estimator(test_parameter)] else: # format == 'cv' score = self.estimator(test_parameter) return score
def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False,
                      predict_fun="predict"):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    pred = Parallel(n_jobs=n_jobs)(
        delayed(_cross_val_predict)(
            clone(estimator), X, y, train, test, predict_fun)
        for train, test in cv)
    pred = np.concatenate(pred)

    if cv.indices:
        index = np.concatenate([test for _, test in cv])
    else:
        index = np.concatenate([np.where(test)[0] for _, test in cv])

    ## pred[index] = pred doesn't work as expected
    pred[index] = pred.copy()

    if refit:
        return pred, clone(estimator).fit(X, y)
    else:
        return pred
def test_precision(): rng_reg = RandomState(2) rng_clf = RandomState(8) for X, y, clf in zip( (rng_reg.random_sample((5, 2)), rng_clf.random_sample((1000, 4))), (rng_reg.random_sample((5, )), rng_clf.randint(2, size=(1000, ))), (DecisionTreeRegressor(criterion="friedman_mse", random_state=0, max_depth=1), DecisionTreeClassifier(max_depth=1, random_state=0))): clf.fit(X, y) for precision in (4, 3): dot_data = export_graphviz(clf, out_file=None, precision=precision, proportion=True) # With the current random state, the impurity and the threshold # will have the number of precision set in the export_graphviz # function. We will check the number of precision with a strict # equality. The value reported will have only 2 precision and # therefore, only a less equal comparison will be done. # check value for finding in finditer(r"value = \d+\.\d+", dot_data): assert_less_equal( len(search(r"\.\d+", finding.group()).group()), precision + 1) # check impurity if is_classifier(clf): pattern = r"gini = \d+\.\d+" else: pattern = r"friedman_mse = \d+\.\d+" # check impurity for finding in finditer(pattern, dot_data): assert_equal(len(search(r"\.\d+", finding.group()).group()), precision + 1) # check threshold for finding in finditer(r"<= \d+\.\d+", dot_data): assert_equal(len(search(r"\.\d+", finding.group()).group()), precision + 1)
def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'): X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) # Ensure the estimator has implemented the passed decision function if not callable(getattr(estimator, method)): raise AttributeError('{} not implemented in estimator' .format(method)) if method in ['decision_function', 'predict_proba', 'predict_log_proba']: le = LabelEncoder() y = le.fit_transform(y) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) prediction_blocks = parallel(delayed(_my_fit_and_predict)( clone(estimator), X, y, train, test, verbose, fit_params, method) for train, test in cv.split(X, y, groups)) # Concatenate the predictions predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks] test_indices = np.concatenate([indices_i for _, indices_i, _ in prediction_blocks]) scores = np.concatenate([score_i for _, _, score_i in prediction_blocks]) if not _check_is_permutation(test_indices, _num_samples(X)): raise ValueError('cross_val_predict only works for partitions') inv_test_indices = np.empty(len(test_indices), dtype=int) inv_test_indices[test_indices] = np.arange(len(test_indices)) # Check for sparse predictions if sp.issparse(predictions[0]): predictions = sp.vstack(predictions, format=predictions[0].format) else: predictions = np.concatenate(predictions) return predictions[inv_test_indices], scores
def _get_params(self):
    res = super(RGFEstimatorBase, self)._get_params()
    res.update(dict(max_leaf=self.max_leaf,
                    test_interval=self.test_interval,
                    algorithm=self.algorithm,
                    loss=self.loss,
                    reg_depth=self.reg_depth,
                    l2=self.l2,
                    sl2=self._sl2,
                    normalize=self.normalize,
                    min_samples_leaf=self._min_samples_leaf,
                    n_iter=self._n_iter,
                    n_tree_search=self.n_tree_search,
                    opt_interval=self.opt_interval,
                    learning_rate=self.learning_rate,
                    memory_policy=self.memory_policy,
                    verbose=self.verbose,
                    init_model=self.init_model,
                    is_classification=is_classifier(self)))
    return res
def fit(self,X,Y): if not self.best_subset: self.fshape = np.shape(X)[1] self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) self.cv = check_cv(self.cv, X, Y, classifier=is_classifier(self.estimator)) self.best_subset = tuple() self.best_subset_score = 0 self.scores_ = {self.best_subset:self.best_subset_score} X = np.array(X) Y = np.array(Y) try: self.get_best_subset(X,Y) except KeyboardInterrupt: pass self.estimator = self.estimator.fit(X[:,self.best_subset],Y) return self
def cross_val_score_filter_feature_selection(model, filter_function, filter_criteria, X, y,
                                             scoring=None, cv=None, n_jobs=1,
                                             verbose=0, fit_params=None,
                                             pre_dispatch='2*n_jobs'):
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(model))
    scorer = check_scoring(model, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    # The feature filter is applied with each fold's training indices before
    # fitting and scoring.
    scores = parallel(
        delayed(_fit_and_score)(clone(model),
                                filter_function(X, y, train, filter_criteria),
                                y, scorer, train, test, verbose, None, fit_params)
        for train, test in cv)

    return np.array(scores)[:, 0]
def explain_local(self, X, y=None, name=None): if name is None: name = gen_name_from_class(self) X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types) # Extract decision tree structure nodes, edges = self._graph_from_tree(self._model(), self.feature_names) decisions = [ self._model().decision_path(instance.reshape(1, -1)).nonzero()[1] + 1 for instance in X ] data_dicts = [ { "type": "tree", "features": self.feature_names, "nodes": nodes, "edges": edges, "decision": decision, } for decision in decisions ] internal_obj = {"overall": None, "specific": data_dicts} if is_classifier(self): scores = self.predict_proba(X)[:, 1] else: scores = self.predict(X) selector = gen_local_selector(X, y, scores) return TreeExplanation( "local", internal_obj, feature_names=self.feature_names, feature_types=self.feature_types, name=name, selector=selector, )
def _get_params(self): res = super(FastRGFEstimatorBase, self)._get_params() res.update(dict(max_depth=self.max_depth, max_leaf=self.max_leaf, tree_gain_ratio=self.tree_gain_ratio, min_samples_leaf=self._min_samples_leaf, loss=self._loss, l1=self.l1, l2=self.l2, opt_algorithm=self.opt_algorithm, n_estimators=self.n_estimators, learning_rate=self.learning_rate, max_bin=self._max_bin, data_l2=self.data_l2, min_child_weight=self.min_child_weight, sparse_max_features=self.sparse_max_features, sparse_min_occurences=self.sparse_min_occurences, n_jobs=self._n_jobs, verbose=self.verbose, is_classification=is_classifier(self), target=self._target)) return res
def predict(self, X): """Predict multi-class targets using underlying estimators. Parameters ---------- X : (sparse) array-like of shape (n_samples, n_features) Data. Returns ------- y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes) Predicted multi-class targets. """ check_is_fitted(self) n_samples = _num_samples(X) if self.label_binarizer_.y_type_ == "multiclass": maxima = np.empty(n_samples, dtype=float) maxima.fill(-np.inf) argmaxima = np.zeros(n_samples, dtype=int) for i, e in enumerate(self.estimators_): pred = _predict_binary(e, X) np.maximum(maxima, pred, out=maxima) argmaxima[maxima == pred] = i return self.classes_[argmaxima] else: if (hasattr(self.estimators_[0], "decision_function") and is_classifier(self.estimators_[0])): thresh = 0 else: thresh = .5 indices = array.array('i') indptr = array.array('i', [0]) for e in self.estimators_: indices.extend(np.where(_predict_binary(e, X) > thresh)[0]) indptr.append(len(indices)) data = np.ones(len(indices), dtype=int) indicator = sp.csc_matrix((data, indices, indptr), shape=(n_samples, len(self.estimators_))) return self.label_binarizer_.inverse_transform(indicator)
def cross_val_score_fn(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): """ Evaluate a score by cross-validation. This overrides the cross_val_score method typically found in cross_validation.py. Changes are clearly marked in comments, but the main change is augmenting the function to store Fit and Metric Events for each fold. """ X, y = indexable(X, y) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) scorer = check_scoring(estimator, scoring=scoring) # Default scoring scheme is 'accuracy' unless provided by user. if scoring is None: scoring = 'accuracy' # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) # Change from original scikit code: adding a new argument, scoring, to the # _fit_and_score function to track scoring function and create MetricEvents. scores = parallel( delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, verbose, None, fit_params, scoring) for train, test in cv) return np.array(scores)[:, 0]
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr') if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) y = np.asarray(y) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if not self.dataset_filenames: self.save_dataset_filename(X, y, cv) dataset_filenames = self.dataset_filenames client = Client() lb_view = client.load_balanced_view() if self.verbose > 0: print("Number of CPU core %d" % len(client.ids())) self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params) for dataset_filename in dataset_filenames], params) for params in parameter_iterable] if self.sync: self.wait() self.set_grid_scores() self.set_best_score_params() if self.refit: self.set_best_estimator(estimator) return self
def _get_scores_and_estimators( experiment: Experiment) -> Tuple[List[float], List[Any]]: if experiment.test_set is not None: assert experiment.cross_validator is None, "Cannot use a cross validator with train test split" dataset = pd.concat([experiment.dataset, experiment.test_set]) split = np.array([-1] * len(experiment.dataset) + [1] * len(experiment.test_set)) cross_validator = PredefinedSplit(split) else: dataset = experiment.dataset cross_validator = experiment.cross_validator X = dataset.drop(columns=[experiment.label_column]) y = dataset[experiment.label_column] if experiment.group_column is None: if experiment.average_scores_on_instances: groups = Series(range(len(X)), index=X.index) else: groups = None else: groups = X[experiment.group_column] X = X.drop(columns=[experiment.group_column]) cv = check_cv(cross_validator, y, classifier=is_classifier(experiment.predictor)) train_test = cv.split(X, y, groups) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=None, verbose=False, pre_dispatch='2*n_jobs') scores_and_estimators = parallel( delayed(_fit_and_predict)(clone(experiment.predictor), X, y, train, test, groups, experiment.scorer) for train, test in train_test) scores_lists, estimators = zip(*scores_and_estimators) scores = [score for score_list in scores_lists for score in score_list] return scores, estimators
def __init__(self, models):
    """Proxy class to build an ensemble of models with an API as one

    Parameters
    ----------
    models: array
        An array of models
    """
    self._models = models if len(models) else None
    if self._models is not None:
        if is_classifier(self._models[0]):
            check_type = is_classifier
            self._scoring_fun = accuracy_score
        elif is_regressor(self._models[0]):
            check_type = is_regressor
            self._scoring_fun = r2_score
        else:
            raise ValueError('Expected regressors or classifiers,'
                             ' got %s instead' % type(self._models[0]))
        for model in self._models:
            if not check_type(model):
                raise ValueError('Different types of models found, provide'
                                 ' either regressors or classifiers.')
def log_classifier(classifier, X_test, y_test, nrows=1000, run=None):
    assert is_classifier(classifier), "classifier should be sklearn classifier."

    run = tracking.get_or_create_run(run)
    run.log_inputs(**sanitize_dict(classifier.get_params()))

    _log_test_preds_proba(run, classifier, X_test, nrows=nrows)

    y_pred = classifier.predict(X_test)

    results = {}
    for metric_name, values in zip(
        ["precision", "recall", "fbeta_score", "support"],
        precision_recall_fscore_support(y_test, y_pred),
    ):
        for i, value in enumerate(values):
            results["{}_class_{}_test".format(metric_name, i)] = value
    results["accuracy"] = accuracy_score(y_test, y_pred)
    results["f1"] = f1_score(y_test, y_pred, average="weighted")
    run.log_metrics(**results)

    _log_test_predictions(run, y_test, y_pred=y_pred, nrows=nrows)
def _make_estimator(self, estimator_idx, estimator_name):
    """Make and configure a copy of the estimator."""
    # Set the non-overlapped random state
    if self.random_state is not None:
        random_state = (self.random_state +
                        10 * estimator_idx +
                        100 * self.layer_idx)
    else:
        random_state = None

    estimator = Estimator(
        name=estimator_name,
        criterion=self.criterion,
        n_trees=self.n_trees,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        backend=self.backend,
        n_jobs=self.n_jobs,
        random_state=random_state,
        is_classifier=is_classifier(self),
    )

    return estimator
def _make_estimator(self, append=True, random_state=None):
    """Make and configure a copy of the `base_estimator_` attribute.

    Warning: This method should be used to properly instantiate new
    sub-estimators.
    """
    estimator = clone(self.base_estimator_)
    estimator.set_params(
        **{p: getattr(self, p) for p in self.estimator_params})

    # Pass the inferred class information to avoid redundant finding.
    if is_classifier(estimator):
        estimator.classes_ = self.classes_
        estimator.n_classes_ = np.array(self.n_classes_, dtype=np.int32)

    if random_state is not None:
        _set_random_states(estimator, random_state)

    if append:
        self.estimators_.append(estimator)

    return estimator
def log_estimator_params(estimator, experiment=None): """Log estimator parameters. Log all estimator parameters as experiment properties. Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method. Tip: Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example. Args: estimator (:obj:`estimator`): | Scikit-learn estimator from which to log parameters. experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``): | Neptune ``Experiment`` object to control to which experiment you log the data. | If ``None``, log to currently active, and most recent experiment. Returns: ``None`` Examples: .. code:: python3 rfr = RandomForestRegressor() neptune.init('my_workspace/my_project') neptune.create_experiment() log_estimator_params(rfr) """ assert is_regressor(estimator) or is_classifier(estimator) or isinstance(estimator, KMeans),\ 'Estimator should be sklearn regressor, classifier or kmeans clusterer.' exp = _validate_experiment(experiment) for param, value in estimator.get_params().items(): exp.set_property(param, value)
def plot_confusion_matrix(estimator, X, y_true, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', values_format=None, cmap='viridis', ax=None): check_matplotlib_support("plot_confusion_matrix") if not is_classifier(estimator): raise ValueError("plot_confusion_matrix only supports classifiers") y_pred = estimator.predict(X) cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels, normalize=normalize) if display_labels is None: if labels is None: display_labels = estimator.classes_ else: display_labels = labels disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels) return disp.plot(include_values=include_values, cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, values_format=values_format)
def test_model_pipeline_same_dense_and_sparse(LinearModel, params):
    # Test that a linear model preceded by StandardScaler in the pipeline and
    # with normalize set to False gives the same y_pred and the same .coef_
    # given X sparse or dense

    model_dense = make_pipeline(
        StandardScaler(with_mean=False),
        LinearModel(normalize=False, **params)
    )

    model_sparse = make_pipeline(
        StandardScaler(with_mean=False),
        LinearModel(normalize=False, **params)
    )

    # prepare the data
    rng = np.random.RandomState(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.

    X_sparse = sparse.csr_matrix(X)
    y = rng.rand(n_samples)

    if is_classifier(model_dense):
        y = np.sign(y)

    model_dense.fit(X, y)
    model_sparse.fit(X_sparse, y)

    assert_allclose(model_sparse[1].coef_, model_dense[1].coef_)
    y_pred_dense = model_dense.predict(X)
    y_pred_sparse = model_sparse.predict(X_sparse)
    assert_allclose(y_pred_dense, y_pred_sparse)
    assert_allclose(model_dense[1].intercept_, model_sparse[1].intercept_)
def _fit(self, X, y, parameter_iterable): estimator = from_sklearn(self.estimator) self.scorer_ = check_scoring(estimator, scoring=self.scoring) cv = check_cv(self.cv, X, y, classifier=is_classifier(estimator)) n_folds = len(cv) X, y = check_X_y(X, y) tups = [] parameters = [] train_test_sets = list(cv.split(X, y)) for params in parameter_iterable: est = estimator.set_params(**params) for X_train, y_train, X_test, y_test in train_test_sets: fit = est.fit(X_train, y_train, **self.fit_params) tups.append(score_and_n(self.scorer_, fit, X_test, y_test)) parameters.append(params) # Compute results get = self.get or _globals['get'] or threaded.get scores, n_samples = zip(*compute(tups, get=get)[0]) # Extract grid_scores and best parameters grid_scores = get_grid_scores(scores, parameters, n_samples, n_folds, self.iid) best = get_best(grid_scores) # Update attributes self.grid_scores_ = grid_scores self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score # Refit if needed if self.refit: self.best_estimator_ = (estimator.set_params( **best.parameters).fit(X, y, **self.fit_params).compute(get=get)) return self
def predict(self, context: np.ndarray) -> np.ndarray: """Predict the mean reward function. Parameters ----------- context: array-like, shape (n_rounds_of_new_data, dim_context) Context vectors for new data. Returns ----------- estimated_rewards_by_reg_model: array-like, shape (n_rounds_of_new_data, n_actions, len_list) Estimated expected rewards for new data given each item and position by the regression model. """ n_rounds_of_new_data = context.shape[0] ones_n_rounds_arr = np.ones(n_rounds_of_new_data, int) estimated_rewards_by_reg_model = np.zeros( (n_rounds_of_new_data, self.n_actions, self.len_list) ) for action_ in np.arange(self.n_actions): for position_ in np.arange(self.len_list): X = self._pre_process_for_reg_model( context=context, action=action_ * ones_n_rounds_arr, action_context=self.action_context, ) estimated_rewards_ = ( self.base_model_list[position_].predict_proba(X)[:, 1] if is_classifier(self.base_model_list[position_]) else self.base_model_list[position_].predict(X) ) estimated_rewards_by_reg_model[ np.arange(n_rounds_of_new_data), action_ * ones_n_rounds_arr, position_ * ones_n_rounds_arr, ] = estimated_rewards_ return estimated_rewards_by_reg_model
def predict(self, x_test: np.ndarray, confidence: float): r"""Method that returns the prediction and the confidence interval This method returns the interval with a confidence level of `confidence` and the target predictions for `x_test`. The information returned for classification is different from the one returned for regression. In classification cases the tuple returned has two elements: a numpy.ndarray with a matrix of boolean values and a numpy.ndarray that contains the class predictions. On the other hand, in regression cases the tuple returned has 3 elements: a numpy.ndarray with the lower bound values, a numpy.ndarray with the predicted target values and a numpy.ndarray with the upper bound values. Parameters ---------- x_test: numpy.ndarray Array of data features used to predict the target values and the confidence interval confidence: float Float between 0 and 1 that represent the percentage of observations we want to be inside the predicted interval. Returns ------- prediction: Tuple[numpy.ndarray, numpy.ndarray] or Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray] Tuple containing the confidence interval and the target prediction Notes ----- The `x_test` data must have the same features as the data used for training and calibration, and they must be in the same order. The level of confidence has to be a fraction between 0 and 1. """ sig = 1 - confidence if is_classifier(self.model): return self.icp.predict(x_test, significance=sig), self.model.predict(x_test) elif is_regressor(self.model): return self.icp.predict(x_test, significance=sig)[:, 0], self.model.predict(x_test), self.icp.predict(x_test, significance=sig)[:, 1] elif type(self.model) == lgbm.basic.Booster: return self.icp.predict(x_test, significance=sig)[:, 0], self.model.predict(x_test), self.icp.predict(x_test, significance=sig)[:, 1]
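# Hedged usage sketch (not from the original source): the conformal `predict`
# above returns a 2-tuple for classifiers and a 3-tuple for regressors, so a
# caller has to branch on the model type. `wrapper` and `x_test` are
# hypothetical placeholders for an instance of the class above and new data.
def _example_conformal_predict(wrapper, x_test):
    import numpy as np
    from sklearn.base import is_classifier

    if is_classifier(wrapper.model):
        # Classification: boolean prediction-region matrix plus point predictions.
        region, y_pred = wrapper.predict(x_test, confidence=0.9)
        return region, y_pred

    # Regression: lower bound, point prediction, upper bound.
    lower, y_pred, upper = wrapper.predict(x_test, confidence=0.9)
    assert np.all(lower <= upper)
    return lower, y_pred, upper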
def __init__(self, estimator=GaussianNB(), cv=LeaveOneOut(), scoring=None,
             max_chunk=100, percentage=.1, verbose=False, n_jobs=1):
    if not (0. < percentage <= 1.):
        raise ValueError(
            'percentage must be > 0 and <= 1. Given {}'.format(percentage))
    if not (0. < n_jobs):
        raise ValueError(
            'n_jobs must be a positive integer. Given {}'.format(n_jobs))
    if not (max_chunk >= 0):
        raise ValueError(
            'max_chunk must be >= 0. Given: {}'.format(max_chunk))
    if not is_classifier(estimator):
        raise ValueError(
            'Estimator must be a sklearn-like classifier. Given {}'.format(
                estimator))

    scoring = check_scoring(estimator, scoring=scoring)

    self.estimator = estimator
    self.cv = cv
    self.scoring = scoring
    self.max_chunk = max_chunk
    self.percentage = percentage
    self.verbose = verbose
    self.n_jobs = n_jobs
def __init__(self, estimator, training_x, training_y, test_x, test_y, estimator_id, problem_type, main_metric): super().__init__() self._training_result = None self._test_result = None self._estimator = estimator self._estimator_id = estimator_id self._training_x = training_x self._training_y = training_y self._test_x = test_x self._test_y = test_y self._evaluated = False if problem_type is None: if is_regressor(estimator): problem_type = 'regression' elif is_classifier(estimator): problem_type = 'classification' else: raise ValueError('problem type is not provided!') if problem_type.lower().startswith('regres'): problem_type = 'regression' elif problem_type.lower().startswith('class'): problem_type = 'classification' if main_metric is None: if problem_type == 'regression': main_metric = 'smape' elif problem_type == 'classification': main_metric = 'f1_score' else: raise ValueError('main metric should be provided!') self._problem_type = problem_type self._main_metric = main_metric
def fit(self, train_X, train_y, verbose=1): train_X, train_y = check_X_y(train_X, train_y, accept_sparse=True) if is_classifier(self): self.classes_ = unique_labels(train_y) self._fitness_evaluator.fit(train_X, train_y) self.pareto.clear() self._population = self._toolbox.population(n=self.pop_size) log_context = None try: log_context = self.logger.listen() ea_run(self._population, self._toolbox, self.n_gen, self.pop_size, self.config, n_jobs=self.n_jobs, timeout=self.max_evo_seconds, verbose=verbose) finally: self.logger.close(log_context) if not len(self.pareto): # try to get individuals that were evaluated before time limit end evaluated_inds = [ind for ind in self._population if ind.fitness.valid] if not len(evaluated_inds): warnings.warn("The algorithm did not have enough time to evaluate first generation and was not fitted.") return self self.pareto.update(evaluated_inds) tree = self.pareto[0] self.fitted_workflow = self._toolbox.compile(tree) self.fitted_workflow.fit(train_X, train_y) self.is_fitted_ = True return self
def __init__(self, base_estimator, ratio=1, ensemble='mean', random_state=42):

    def get_ensemble(ensemble):
        if ensemble == 'mean':
            return np.mean
        if ensemble == 'max':
            return np.max
        if ensemble == 'min':
            return np.min
        else:
            raise ValueError("ensemble must be one of these options: "
                             "'mean', 'max', 'min' not " + ensemble)

    if is_classifier(base_estimator):
        self.base_estimator = base_estimator
    else:
        raise ValueError(
            "base_estimator must be a classifier not {}".format(base_estimator))

    self._estimator_type = 'classifier'
    self._ratio = ratio
    self.ensemble = get_ensemble(ensemble)
    self._random_state = random_state
    self.classes_ = None
    self._target = None
    self.list_of_df = None
    self.list_models = None
def cross_val_score_weighted(estimator, x_data, y_data=None, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan, sample_weights=None): """Expand :func:`sklearn.model_selection.cross_val_score`.""" scorer = check_scoring(estimator, scoring=scoring) (x_data, y_data, groups) = indexable(x_data, y_data, groups) cv = check_cv(cv, y_data, classifier=is_classifier(estimator)) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel( delayed(_fit_and_score_weighted)(clone(estimator), x_data, y_data, scorer, train, test, None, fit_params, error_score=error_score, sample_weights=sample_weights) for train, test in cv.split(x_data, y_data, groups)) return np.array(scores)
def __init__(self, X, y, criterion, min_samples_split, max_depth, n_val_sample, random_state): # make sure max_depth > 1 if max_depth < 2: raise ValueError("max depth must be > 1") # check the input arrays, and if it's classification validate the # target values in y X, y = check_X_y(X, y, accept_sparse=False, dtype=None, copy=True) if is_classifier(self): check_classification_targets(y) # hyper parameters so we can later inspect attributes of the model self.min_samples_split = min_samples_split self.max_depth = max_depth self.n_val_sample = n_val_sample self.random_state = random_state # create the splitting class random_state = check_random_state(random_state) self.splitter = RandomSplitter(random_state, criterion, n_val_sample) # grow the tree depth first self.tree = self._find_next_split(X, y, 0)
def _validate_estimators(self):
    if self.estimators is None or len(self.estimators) == 0:
        raise ValueError(
            "Invalid 'estimators' attribute, 'estimators' should be a list"
            " of (string, estimator) tuples.")
    names, estimators = zip(*self.estimators)
    # defined by MetaEstimatorMixin
    self._validate_names(names)

    has_estimator = any(est != 'drop' for est in estimators)
    if not has_estimator:
        raise ValueError(
            "All estimators are dropped. At least one is required "
            "to be an estimator.")

    is_estimator_type = (is_classifier if is_classifier(self)
                         else is_regressor)

    for est in estimators:
        if est != 'drop' and not is_estimator_type(est):
            raise ValueError("The estimator {} should be a {}.".format(
                est.__class__.__name__, is_estimator_type.__name__[3:]))

    return names, estimators
def show_models(self): # Check if fit had been called check_is_fitted(self) if is_classifier(self): sorted_list = [[ self.classes_[estimator.target], estimator ] for estimator in sorted(self.estimators_, key=lambda e: e.target) ] elif is_regressor(self): sorted_list = [[ estimator.target, estimator ] for estimator in sorted(self.estimators_, key=lambda e: e.target) ] else: msg = "Unknown type of model. Must be 'regressor' or 'classifier'" raise ValueError(msg) [ print(f"Target: {target}; Model: {estimator}") for target, estimator in sorted_list ] return pd.DataFrame(sorted_list, columns=["Target", "Model"])
def convert_pipeline(scope: Scope, operator: Operator, container: ModelComponentContainer): model = operator.raw_operator inputs = operator.inputs for step in model.steps: step_model = step[1] if is_classifier(step_model): scope.add_options(id(step_model), options={'zipmap': False}) container.add_options(id(step_model), options={'zipmap': False}) outputs = _parse_sklearn(scope, step_model, inputs, custom_parsers=None) inputs = outputs if len(outputs) != len(operator.outputs): raise RuntimeError("Mismatch between pipeline output %d and " "last step outputs %d." % (len(outputs), len(operator.outputs))) for fr, to in zip(outputs, operator.outputs): container.add_node( 'Identity', fr.full_name, to.full_name, name=scope.get_unique_operator_name("Id" + operator.onnx_name))
def test_BoxCoxTargetTransformer_target_transform():
    for ll in (0, 0.1, 0.5, 2):
        bb = BoxCoxTargetTransformer(Ridge(), ll=ll)

        assert not is_classifier(bb)
        assert is_regressor(bb)

        y = np.arange(-100, 100, step=0.1)

        my = bb.target_transform(y)
        ymy = bb.target_inverse_transform(my)
        mymy = bb.target_transform(ymy)

        # plt.subplot(211)
        # plt.plot(y, my)
        # plt.subplot(212)
        # plt.plot(my, ymy)

        assert not pd.Series(my).isnull().any()
        assert not pd.Series(ymy).isnull().any()
        assert np.max(np.abs(y - ymy)) <= 10 ** (-10)
        assert np.max(np.abs(my - mymy)) <= 10 ** (-10)
def cross_val_train_predict(estimator, x, y, predict_method: str = "predict", cv: int = 5): """ Return fit estimators and predictions of each (Stratified) fold. """ from sklearn.base import clone, is_classifier from sklearn.model_selection._split import check_cv from sklearn.utils.metaestimators import _safe_split import numpy as np splitter = check_cv(cv, y, classifier=is_classifier(estimator)) estimators = [] predictions = None for train, test in splitter.split(x, y): x_train, y_train = _safe_split(estimator, x, y, train) x_test, _ = _safe_split(estimator, x, y, test, train) fold_estimator = clone(estimator) fold_predict = getattr(fold_estimator, predict_method) fold_estimator.fit(x_train, y_train) estimators.append(fold_estimator) fold_prediction = fold_predict(x_test) if predictions is None: if fold_prediction.ndim == 2: predictions = np.empty(shape=(len(y), fold_prediction.shape[1])) else: predictions = np.empty(shape=(len(y), )) predictions[test] = fold_prediction return predictions, estimators
def predict_rulelist(X: pd.DataFrame, model):
    if not isinstance(X, pd.DataFrame):
        raise Exception('X needs to be a DataFrame')

    is_classification = is_classifier(model)
    rulelist = model._rulelist
    n_predictions = X.shape[0]
    n_targets = rulelist.default_rule_statistics.number_targets
    instances_covered = np.zeros(n_predictions, dtype=bool)
    predictions = np.empty((n_predictions, n_targets), dtype=object)

    for subgroup in rulelist.subgroups:
        instances_subgroup = ~instances_covered & \
            reduce(lambda x, y: x & y,
                   [item.activation_function(X).values for item in subgroup.pattern])
        if is_classification:
            predictions[instances_subgroup, :] = point_value_categorical(
                subgroup.statistics)
        else:
            predictions[instances_subgroup, :] = point_value_gaussian(
                subgroup.statistics)
        instances_covered |= instances_subgroup

    # default rule
    if is_classification:
        predictions[~instances_covered, :] = point_value_categorical(
            rulelist.default_rule_statistics)
    else:
        predictions[~instances_covered, :] = point_value_gaussian(
            rulelist.default_rule_statistics)

    if n_targets == 1:
        predictions = predictions.flatten()

    # if int values try to return ints
    try:
        predictions = predictions.astype(int)
    except ValueError:
        pass
    return predictions
def _cross_val_predict(pipeline, X, y=None, cv=None) \ -> Tuple[np.ndarray, np.ndarray, np.ndarray, List[FlexiblePipeline]]: X, y, groups = indexable(X, y, None) cv = check_cv(cv, y, classifier=is_classifier(pipeline)) cv.random_state = 42 prediction_blocks = [] probability_blocks = [] fitted_pipelines = [] for train, test in cv.split(X, y, groups): cloned_pipeline = clone(pipeline) probability_blocks.append( (_fit_and_predict(cloned_pipeline, X, y, train, test, 0, {}, 'predict_proba'), test) ) prediction_blocks.append(cloned_pipeline.predict(X)) fitted_pipelines.append(cloned_pipeline) # Concatenate the predictions probabilities = [prob_block_i for prob_block_i, _ in probability_blocks] predictions = [pred_block_i for pred_block_i in prediction_blocks] test_indices = np.concatenate([indices_i for _, indices_i in probability_blocks]) if not _check_is_permutation(test_indices, _num_samples(X)): raise ValueError('cross_val_predict only works for partitions') inv_test_indices = np.empty(len(test_indices), dtype=int) inv_test_indices[test_indices] = np.arange(len(test_indices)) probabilities = np.concatenate(probabilities) predictions = np.concatenate(predictions) if isinstance(predictions, list): return y, [p[inv_test_indices] for p in predictions], [p[inv_test_indices] for p in probabilities], fitted_pipelines else: return y, predictions[inv_test_indices], probabilities[inv_test_indices], fitted_pipelines
def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):

    random_state = check_random_state(self.random_state)

    if X.dtype != np.uint8:
        msg = "The dtype of `X` should be `np.uint8`, but got {} instead."
        raise RuntimeError(msg.format(X.dtype))

    if check_input:
        # Need to validate separately here.
        # We can't pass multi_ouput=True because that would allow y to be
        # csr.
        check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
        check_y_params = dict(ensure_2d=False, dtype=None)
        X, y = self._validate_data(X, y,
                                   validate_separately=(check_X_params,
                                                        check_y_params))

    # Determine output settings
    n_samples, self.n_features_ = X.shape

    y = np.atleast_1d(y)
    expanded_class_weight = None

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    # `classes_` and `n_classes_` were set by the forest.
    if not hasattr(self, "classes_") and is_classifier(self):
        check_classification_targets(y)
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        if self.class_weight is not None:
            y_original = np.copy(y)

        y_encoded = np.zeros(y.shape, dtype=np.int)
        for k in range(self.n_outputs_):
            classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                   return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
        y = y_encoded

        if self.class_weight is not None:
            expanded_class_weight = compute_sample_weight(
                self.class_weight, y_original)

        self.n_classes_ = np.array(self.n_classes_, dtype=np.int32)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                 else self.max_depth)

    if isinstance(self.min_samples_leaf, numbers.Integral):
        if not 1 <= self.min_samples_leaf:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = self.min_samples_leaf
    else:  # float
        if not 0.0 < self.min_samples_leaf <= 0.5:
            raise ValueError("min_samples_leaf must be at least 1 "
                             "or in (0, 0.5], got %s"
                             % self.min_samples_leaf)
        min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

    if isinstance(self.min_samples_split, numbers.Integral):
        if not 2 <= self.min_samples_split:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the integer %s"
                             % self.min_samples_split)
        min_samples_split = self.min_samples_split
    else:  # float
        if not 0.0 < self.min_samples_split <= 1.0:
            raise ValueError("min_samples_split must be an integer "
                             "greater than 1 or a float in (0.0, 1.0]; "
                             "got the float %s"
                             % self.min_samples_split)
        min_samples_split = int(ceil(self.min_samples_split * n_samples))
        min_samples_split = max(2, min_samples_split)

    min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

    if isinstance(self.max_features, str):
        if self.max_features in ["auto", "sqrt"]:
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError("Invalid value for max_features. "
                             "Allowed string values are 'auto', "
                             "'sqrt' or 'log2'.")
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    else:  # float
        if self.max_features > 0.0:
            max_features = max(1,
                               int(self.max_features * self.n_features_))
        else:
            max_features = 0

    self.max_features_ = max_features

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if not 0 <= self.min_weight_fraction_leaf <= 0.5:
        raise ValueError("min_weight_fraction_leaf must be in [0, 0.5]")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero. ")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Set min_weight_leaf from min_weight_fraction_leaf
    if sample_weight is None:
        min_weight_leaf = self.min_weight_fraction_leaf * n_samples
    else:
        min_weight_leaf = (self.min_weight_fraction_leaf *
                           np.sum(sample_weight))

    min_impurity_split = self.min_impurity_split
    if min_impurity_split is not None:
        warnings.warn(
            "The min_impurity_split parameter is deprecated. "
            "Its default value has changed from 1e-7 to 0 in "
            "version 0.23, and it will be removed in 0.25. "
            "Use the min_impurity_decrease parameter instead.",
            FutureWarning,
        )

        if min_impurity_split < 0.0:
            raise ValueError("min_impurity_split must be greater than "
                             "or equal to 0")
    else:
        min_impurity_split = 0

    if self.min_impurity_decrease < 0.0:
        raise ValueError("min_impurity_decrease must be greater than "
                         "or equal to 0")

    if self.presort != "deprecated":
        warnings.warn(
            "The parameter 'presort' is deprecated and has no "
            "effect. It will be removed in v0.24. You can "
            "suppress this warning by not passing any value "
            "to the 'presort' parameter.",
            FutureWarning,
        )

    # Build tree
    criterion = self.criterion
    if not isinstance(criterion, Criterion):
        if is_classifier(self):
            criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                     self.n_classes_)
        else:
            criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                     n_samples)

    SPLITTERS = DENSE_SPLITTERS

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](
            criterion,
            self.max_features_,
            min_samples_leaf,
            min_weight_leaf,
            random_state,
        )

    if is_classifier(self):
        self.tree_ = Tree(self.n_features_,
                          self.n_classes_,
                          self.n_outputs_)
    else:
        self.tree_ = Tree(
            self.n_features_,
            # TODO: the tree shouldn't need this in this case
            np.array([1] * self.n_outputs_, dtype=np.int32),
            self.n_outputs_,
        )

    builder = DepthFirstTreeBuilder(
        splitter,
        min_samples_split,
        min_samples_leaf,
        min_weight_leaf,
        max_depth,
        self.min_impurity_decrease,
        min_impurity_split,
    )

    builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)

    if self.n_outputs_ == 1 and is_classifier(self):
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    # Only return the essential data for using a tree for prediction
    feature = self.tree_.feature
    threshold = self.tree_.threshold
    children = np.vstack((self.tree_.children_left,
                          self.tree_.children_right)).T
    value = self.tree_.value

    return feature, threshold, children, value