Example 1
def test_is_classifier():
    svc = SVC()
    assert_true(is_classifier(svc))
    assert_true(is_classifier(GridSearchCV(svc, {'C': [0.1, 1]})))
    assert_true(is_classifier(Pipeline([('svc', svc)])))
    assert_true(is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))])))
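For reference, `is_classifier` simply inspects the estimator's `_estimator_type` attribute, which is why it works on unfitted estimators and on meta-estimators such as `GridSearchCV` and `Pipeline` that delegate the tag. A minimal sketch of that behaviour (assuming only scikit-learn is installed):

from sklearn.base import is_classifier
from sklearn.svm import SVC, SVR

svc, svr = SVC(), SVR()

# is_classifier reads the _estimator_type tag set by ClassifierMixin,
# so no fitting is required.
print(is_classifier(svc))                     # True
print(is_classifier(svr))                     # False
print(getattr(svc, "_estimator_type", None))  # "classifier"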
Example 2
    def fit(self, X, y):
        if is_classifier(self):
            self.classes_, y = np.unique(y, return_inverse=True)
            self.num_classes_ = len(self.classes_)
        else:
            self.num_classes_ = -1

        # Split data into train/val
        X_train, X_val, y_train, y_val = train_test_split(
            X,
            y,
            test_size=self.holdout_split,
            random_state=self.random_state,
            stratify=y if is_classifier(self) else None,
        )
        # Define attributes
        self.attributes_ = EBMUtils.gen_attributes(self.col_types, self.col_n_bins)
        # Build EBM allocation code
        if is_classifier(self):
            model_type = "classification"
        else:
            model_type = "regression"

        self.intercept_ = 0
        self.attribute_sets_ = []
        self.attribute_set_models_ = []

        main_attr_indices = [[x] for x in range(len(self.attributes_))]
        main_attr_sets = EBMUtils.gen_attribute_sets(main_attr_indices)
        with closing(
            NativeEBM(
                self.attributes_,
                main_attr_sets,
                X_train,
                y_train,
                X_val,
                y_val,
                num_inner_bags=self.feature_step_n_inner_bags,
                num_classification_states=self.num_classes_,
                model_type=model_type,
                training_scores=None,
                validation_scores=None,
            )
        ) as native_ebm:
            # Train main effects
            self._fit_main(native_ebm, main_attr_sets)

            # Build interaction terms
            self.inter_indices_ = self._build_interactions(native_ebm)

        self.staged_fit_interactions(X, y, self.inter_indices_)

        return self
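The `np.unique(..., return_inverse=True)` call at the top of this fit method both records the class labels and re-encodes `y` as integer indices in a single pass; a tiny illustration with made-up labels:

import numpy as np

y = np.array(["cat", "dog", "cat", "bird"])
classes_, y_encoded = np.unique(y, return_inverse=True)

print(classes_)       # ['bird' 'cat' 'dog']
print(y_encoded)      # [1 2 1 0]
print(len(classes_))  # 3 -> what the method stores as num_classes_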
Example 3
    def explain_local(self, X, y=None, name=None):
        # Produce feature value pairs for each instance.
        # Values are the model graph score per respective attribute set.
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)
        instances = self.preprocessor_.transform(X)
        scores_gen = EBMUtils.scores_by_attrib_set(
            instances, self.attribute_sets_, self.attribute_set_models_
        )

        n_rows = instances.shape[0]
        data_dicts = []
        for _ in range(n_rows):
            data_dict = {
                "type": "univariate",
                "names": [],
                "scores": [],
                "values": [],
                "extra": {
                    "names": ["Intercept"],
                    "scores": [self.intercept_],
                    "values": [1],
                },
            }
            data_dicts.append(data_dict)

        for set_idx, attribute_set, scores in scores_gen:
            for row_idx in range(n_rows):
                feature_name = self.feature_names[set_idx]
                data_dicts[row_idx]["names"].append(feature_name)
                data_dicts[row_idx]["scores"].append(scores[row_idx])
                if attribute_set["n_attributes"] == 1:
                    data_dicts[row_idx]["values"].append(
                        X[row_idx, attribute_set["attributes"][0]]
                    )
                else:
                    data_dicts[row_idx]["values"].append("")

        if is_classifier(self):
            scores = EBMUtils.classifier_predict_proba(instances, self)[:, 1]
        else:
            scores = EBMUtils.regressor_predict(instances, self)

        for row_idx in range(n_rows):
            data_dicts[row_idx]["perf"] = perf_dict(y, scores, row_idx)

        selector = gen_local_selector(instances, y, scores)

        internal_obj = {"overall": None, "specific": data_dicts}

        return EBMExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )
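The `is_classifier` branch above (positive-class probability for classifiers, raw prediction for regressors) is a pattern that recurs in several of the examples below. A stand-alone sketch of the same idea with plain scikit-learn estimators instead of the EBM helpers (the `local_scores` function is only illustrative):

import numpy as np
from sklearn.base import is_classifier
from sklearn.linear_model import LogisticRegression, LinearRegression

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y_clf = (X[:, 0] > 0).astype(int)
y_reg = X[:, 0] * 2.0

def local_scores(model, X, y):
    model.fit(X, y)
    if is_classifier(model):
        # probability of the positive class, one value per row
        return model.predict_proba(X)[:, 1]
    # regressors: the raw prediction plays the same role
    return model.predict(X)

print(local_scores(LogisticRegression(), X, y_clf)[:3])
print(local_scores(LinearRegression(), X, y_reg)[:3])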
Example 4
 def score_fn(est, X, y, drop_indices):
     if is_classifier(est):
         prob = EBMUtils.classifier_predict_proba(X, est, drop_indices)
         return -1.0 * roc_auc_score(y, prob[:, 1])
     else:
         pred = EBMUtils.regressor_predict(X, est, drop_indices)
         return mean_squared_error(y, pred)
Example 5
    def _is_classifier(self):
        """Whether the underlying model is a classifier

        Returns:
            (bool) whether `self.model` is a classifier
        """
        return is_classifier(self.model) or hasattr(self.model, 'predict_proba')
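Note the fallback on `predict_proba`: it catches wrapped or third-party models that behave like classifiers without inheriting from `ClassifierMixin`, which `is_classifier` alone would miss. A small illustration (the `ProbOnlyModel` class is hypothetical):

from sklearn.base import BaseEstimator, is_classifier

class ProbOnlyModel(BaseEstimator):
    """Hypothetical wrapper exposing predict_proba without ClassifierMixin."""
    def fit(self, X, y):
        return self
    def predict_proba(self, X):
        raise NotImplementedError

m = ProbOnlyModel()
print(is_classifier(m))             # False: no _estimator_type tag
print(hasattr(m, "predict_proba"))  # True: the fallback catches it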
Example 6
def permutation_test_score(estimator, X, y, groups=None, cv=None,
                           n_permutations=100, n_jobs=1, random_state=0,
                           verbose=0, scoring=None):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of scikit-learn's original ``permutation_test_score`` that
    returns the raw permutation scores, so the p-value can be evaluated outside
    this function and the scores can be reused elsewhere.

    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance.  The Journal of Machine Learning Research (2010)
                   vol. 11

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state),
            groups, cv, scorer)
        for _ in range(n_permutations))
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
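Since this variant only returns the permutation scores, the p-value has to be computed by the caller. A sketch of the standard empirical p-value from Ojala and Garriga (2010), with made-up numbers standing in for the observed score and the array returned above:

import numpy as np

observed_score = 0.83                                   # hypothetical CV score on unshuffled y
rng = np.random.RandomState(0)
permutation_scores = rng.uniform(0.40, 0.60, size=100)  # stand-in for the returned scores

# Empirical p-value: fraction of permutations at least as good as the observed
# score, with the +1 correction so the p-value is never exactly zero.
pvalue = (np.sum(permutation_scores >= observed_score) + 1.0) / (len(permutation_scores) + 1.0)
print(pvalue)   # 1/101 ~= 0.0099 here, since no permutation beats 0.83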
Example 7
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in splits)

    group_order = []
    if hasattr(cv, 'groups'):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
Example 8
def benchmark(clf, X, y, cv=None):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(clf))
    
    # learning_curve_ = learning_curve(clf, X_all, y_all, cv=cv)
    
    train_times = []
    test_times = []
    confusion_matrices = []
    confusion_matrix_indices = []
    coefs = []
    for train, test in cv:
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        
        t0 = time()
        clf.fit(X_train, y_train)
        train_times.append(time()-t0)
        
        t0 = time()
        y_pred = clf.predict(X_test)
        test_times.append(time()-t0)
    
        confusion_matrices.append(confusion_matrix(y_test, y_pred))
        confusion_matrix_indices.append(np.array([[test[pred] for pred in true] for true in confusion_matrix_instances(y_test, y_pred)]))
    
        coefs.append(clf.coef_)
    
    return dict(
        train_times = np.array(train_times),
        test_times = np.array(test_times),
        confusion_matrices = np.array(confusion_matrices),
        confusion_matrix_indices = np.array(confusion_matrix_indices),
        coefs = np.array(coefs)
    )
Example 9
def build_graph(estimator, cv, scorer, candidate_params, X, y=None,
                groups=None, fit_params=None, iid=True, refit=True,
                error_score='raise', return_train_score=True, cache_cv=True):

    X, y, groups = to_indexable(X, y, groups)
    cv = check_cv(cv, y, is_classifier(estimator))
    # "pairwise" estimators require a different graph for CV splitting
    is_pairwise = getattr(estimator, '_pairwise', False)

    dsk = {}
    X_name, y_name, groups_name = to_keys(dsk, X, y, groups)
    n_splits = compute_n_splits(cv, X, y, groups)

    if fit_params:
        # A mapping of {name: (name, graph-key)}
        param_values = to_indexable(*fit_params.values(), allow_scalars=True)
        fit_params = {k: (k, v) for (k, v) in
                      zip(fit_params, to_keys(dsk, *param_values))}
    else:
        fit_params = {}

    fields, tokens, params = normalize_params(candidate_params)
    main_token = tokenize(normalize_estimator(estimator), fields, params,
                          X_name, y_name, groups_name, fit_params, cv,
                          error_score == 'raise', return_train_score)

    cv_name = 'cv-split-' + main_token
    dsk[cv_name] = (cv_split, cv, X_name, y_name, groups_name,
                    is_pairwise, cache_cv)

    if iid:
        weights = 'cv-n-samples-' + main_token
        dsk[weights] = (cv_n_samples, cv_name)
    else:
        weights = None

    scores = do_fit_and_score(dsk, main_token, estimator, cv_name, fields,
                              tokens, params, X_name, y_name, fit_params,
                              n_splits, error_score, scorer,
                              return_train_score)

    cv_results = 'cv-results-' + main_token
    candidate_params_name = 'cv-parameters-' + main_token
    dsk[candidate_params_name] = (decompress_params, fields, params)
    dsk[cv_results] = (create_cv_results, scores, candidate_params_name,
                       n_splits, error_score, weights)
    keys = [cv_results]

    if refit:
        best_params = 'best-params-' + main_token
        dsk[best_params] = (get_best_params, candidate_params_name, cv_results)
        best_estimator = 'best-estimator-' + main_token
        if fit_params:
            fit_params = (dict, (zip, list(fit_params.keys()),
                                list(pluck(1, fit_params.values()))))
        dsk[best_estimator] = (fit_best, clone(estimator), best_params,
                               X_name, y_name, fit_params)
        keys.append(best_estimator)

    return dsk, keys, n_splits
Example 10
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        X, y = indexable(X, y)
        cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        base_estimator = clone(self.estimator)

        best = best_parameters(base_estimator, cv, X, y, parameter_iterable,
                               self.scorer_, self.fit_params, self.iid)
        best = best.compute()

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score


        if isinstance(base_estimator, Pipeline):
            base_estimator = base_estimator.to_sklearn().compute()

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = base_estimator.set_params(**best.parameters)
            if y is not None:
                self.best_estimator_ = best_estimator.fit(X, y, **self.fit_params)
            else:
                self.best_estimator_ = best_estimator.fit(X, **self.fit_params)
        return self
Example 11
def add_del_cv(df, predictors, target, model, scoring='roc_auc', cv1=None,
               n_folds=8, n_jobs=-1, start=[], selmax=None, selmin=1,
               min_ratio=1e-7, max_steps=10, verbosity=0):
    """ Forward-Backward (ADD-DEL) selection using model.

    Parameters
    ----------

    Returns
    -------
    selected: list
        selected predictors

    Example
    -------
    References
    ----------
    """
    def test_to_break(selected, selected_curr, to_break):
        if set(selected) == set(selected_curr):
            to_break += 1
        else:
            to_break = 0
        return to_break

    X, y, _ = df_xyf(df, predictors=predictors, target=target)
    cv1 = cross_validation.check_cv(
            cv1, X=X, y=y,
            classifier=is_classifier(model))

    selected_curr = start
    to_break = 0

    for i_step in xrange(max_steps):
        selected = forward_cv(
                        df, predictors, target, model, scoring=scoring,
                        cv1=cv1, n_folds=n_folds, n_jobs=n_jobs,
                        start=selected_curr, selmax=selmax,
                        min_ratio=min_ratio, verbosity=verbosity-1)
        to_break = test_to_break(selected, selected_curr, to_break)
        selected_curr = selected
        if verbosity > 0:
            print('forward:', ' '.join(selected_curr))
        if to_break > 1:
            break
        selected = backward_cv(
                        df, selected_curr, target, model, scoring=scoring,
                        cv1=cv1, n_folds=n_folds, n_jobs=n_jobs, selmin=selmin,
                        min_ratio=min_ratio, verbosity=verbosity-1)
        to_break = test_to_break(selected, selected_curr, to_break)
        selected_curr = selected
        if verbosity > 0:
            print('backward:', ' '.join(selected_curr))
        if to_break > 0:
            break

    return selected_curr
Example 12
 def transform(self, X, y=None):
     cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
     
     X_prob = np.zeros((X.shape[0], self.n_classes))
     X_pred = np.zeros(X.shape[0])
     
     for estimator, (_, test) in zip(self.estimators_, cv.split(X)):
         X_prob[test] = estimator.predict_proba(X[test])
         X_pred[test] = estimator.predict(X[test])
     return np.hstack([X_prob, np.array([X_pred]).T])
Example 13
 def fit(self, X, y):
     y_labels = self._get_labels(y)
     cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator))
     self.estimators_ = []
     
     for train, _ in cv.split(X, y_labels):
         self.estimators_.append(
             clone(self.estimator).fit(X[train], y_labels[train])
         )
     return self
Example 14
def _set_cv(cv, estimator=None, X=None, y=None):
    """ Set the default cross-validation depending on whether clf is classifier
        or regressor. """

    from sklearn.base import is_classifier

    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)
    # Setup CV
    if check_version('sklearn', '0.18'):
        from sklearn import model_selection as models
        from sklearn.model_selection import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, (int, np.int)):
            XFold = StratifiedKFold if est_is_classifier else KFold
            cv = XFold(n_splits=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            cv = cv()
        cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
    else:
        from sklearn import cross_validation as models
        from sklearn.cross_validation import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, (int, np.int)):
            if est_is_classifier:
                cv = StratifiedKFold(y=y, n_folds=cv)
            else:
                cv = KFold(n=len(y), n_folds=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                raise NotImplementedError('CV cannot be defined with str for'
                                          ' sklearn < .017.')
            cv = cv(len(y))
        cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test) for train, test in
                     cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]

    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')

    return cv, cv_splits
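On the modern (sklearn >= 0.18) branch, the helper essentially reduces to `check_cv` with the classifier flag; a minimal sketch of what that branch resolves to for an integer `cv`:

import numpy as np
from sklearn.base import is_classifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import check_cv

y = np.array([0, 1] * 10)
est_is_classifier = is_classifier(LogisticRegression())

cv = check_cv(cv=5, y=y, classifier=est_is_classifier)
print(type(cv).__name__)                     # StratifiedKFold for classifiers, KFold otherwise
cv_splits = list(cv.split(np.zeros_like(y), y))
print(len(cv_splits), len(cv_splits[0][0]))  # 5 folds, 16 training epochs per fold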
Example 15
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        parameter_iterable = ParameterSampler(self.param_distributions,
                                              self.n_iter,
                                              random_state=self.random_state)
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                      parameters, cv=cv)
            for parameters in parameter_iterable)

        best = sorted(out, reverse=True)[0]
        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
Example 16
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None, groups=None):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If cv is an integer, it is the number of folds used to evaluate each
        pipeline in k-fold cross-validation during the TPOT optimization
        process. If it is an object, it is used directly as a
        cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                    X=features,
                                    y=target,
                                    scorer=scorer,
                                    train=train,
                                    test=test,
                                    verbose=0,
                                    parameters=None,
                                    fit_params=sample_weight_dict)
                                for train, test in cv_iter]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
Example 17
    def _grid_search(self, train_X, train_y):
        if callable(self.inner_cv):
            # inner_cv = self.inner_cv(train_X, train_y)
            inner_cv = self.inner_cv.split(train_X, train_y)
        else:
            # inner_cv = _check_cv(self.inner_cv, train_X, train_y,
            #                      classifier=is_classifier(self.estimator))
            inner_cv = _check_cv(self.inner_cv, train_y,
                                 classifier=is_classifier(
                                    self.estimator)).split(train_X, train_y)

        master = MPIGridSearchCVMaster(self.param_grid, inner_cv,
                                       self.estimator, self.scorer_,
                                       self.fit_params)
        return master.run(train_X, train_y)
Example 18
def _predict(X, estimators):
    """Aux function of GeneralizationAcrossTime

    Predict with each classifier. If multiple classifiers are passed, combine
    the predictions across all classifiers to obtain a single prediction per
    trial.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_features, n_times)
        To-be-predicted data.
    estimators : ndarray, shape (n_folds,) | shape (1,)
        Array of scikit-learn classifiers to predict data.

    Returns
    -------
    y_pred : ndarray, shape (n_epochs, m_prediction_dimensions)
        Classifier's prediction for each trial.
    """
    from scipy import stats
    from sklearn.base import is_classifier
    # Initialize results:
    n_epochs = X.shape[0]
    n_clf = len(estimators)

    # Compute prediction for each sub-estimator (i.e. per fold)
    # if independent, estimators = all folds
    for fold, clf in enumerate(estimators):
        _y_pred = clf.predict(X)
        # initialize predict_results array
        if fold == 0:
            predict_size = _y_pred.shape[1] if _y_pred.ndim > 1 else 1
            y_pred = np.ones((n_epochs, predict_size, n_clf))
        if predict_size == 1:
            y_pred[:, 0, fold] = _y_pred
        else:
            y_pred[:, :, fold] = _y_pred

    # Collapse y_pred across folds if necessary (i.e. if independent)
    if fold > 0:
        # XXX need API to identify how multiple predictions can be combined?
        if is_classifier(clf):
            y_pred, _ = stats.mode(y_pred, axis=2)
        else:
            y_pred = np.mean(y_pred, axis=2)

    # Format shape
    y_pred = y_pred.reshape((n_epochs, predict_size))
    return y_pred
Example 19
    def fit(self, X, y):
        """Fit the model to the training data."""
        X, y = check_X_y(X, y, force_all_finite=False,
                         multi_output=self.multi_output)
        _check_param_grid(self.param_grid)

        # cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        cv = _check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if comm_rank == 0:
            self._fit_master(X, y, cv)
        else:
            self._fit_slave()

        return self
Example 20
	def score(self,test_parameter):
		"""
		The score function to call in order to evaluate the quality 
		of the parameter test_parameter

		Parameters
		----------
		`test_parameter` : dict, the parameter to test

		Returns
		-------
		`score` : the CV score, either the list of all cv results or
			the mean (depending on score_format)
		"""

		if not self._callable_estimator:
			cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator))
			cv_score = [_fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_,
							train, test, False, test_parameter,
							self.fit_params, return_parameters=True)
						for train, test in cv]

			n_test_samples = 0
			mean_score = 0
			detailed_score = []
			for tmp_score, tmp_n_test_samples, _, _ in cv_score:
				detailed_score.append(tmp_score)
				tmp_score *= tmp_n_test_samples
				n_test_samples += tmp_n_test_samples
				mean_score += tmp_score
			mean_score /= float(n_test_samples)

			if(self.score_format == 'avg'):
				score = mean_score
			else: # format == 'cv'
				score = detailed_score


		else:
			if(self.score_format == 'avg'):
				score = [self.estimator(test_parameter)]
			else: # format == 'cv'
				score = self.estimator(test_parameter)

		return score
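The averaging above weights each fold's score by its number of test samples before dividing by the total. A worked illustration with made-up fold results:

# (score, n_test_samples) per fold -- hypothetical values
cv_score = [(0.80, 25), (0.90, 25), (0.70, 50)]

n_test_samples = 0
mean_score = 0.0
for tmp_score, tmp_n_test_samples in cv_score:
    mean_score += tmp_score * tmp_n_test_samples
    n_test_samples += tmp_n_test_samples
mean_score /= float(n_test_samples)

print(mean_score)   # 0.775: pulled toward the larger third fold, not the plain mean 0.80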
Example 21
def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False, predict_fun="predict"):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    pred = Parallel(n_jobs=n_jobs)(
        delayed(_cross_val_predict)(
            clone(estimator), X, y, train, test, predict_fun)
        for train, test in cv)
    pred = np.concatenate(pred)
    if cv.indices:
        index = np.concatenate([test for _, test in cv])
    else:
        index = np.concatenate([np.where(test)[0] for _, test in cv])
    ## pred[index] = pred doesn't work as expected
    pred[index] = pred.copy()
    if refit:
        return pred, clone(estimator).fit(X,y)
    else:
        return pred
Example 22
def test_precision():

    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
            (rng_reg.random_sample((5, 2)),
             rng_clf.random_sample((1000, 4))),
            (rng_reg.random_sample((5, )),
             rng_clf.randint(2, size=(1000, ))),
            (DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                                   max_depth=1),
             DecisionTreeClassifier(max_depth=1, random_state=0))):

        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None, precision=precision,
                                       proportion=True)

            # With the current random state, the impurity and the threshold
            # will have the number of precision set in the export_graphviz
            # function. We will check the number of precision with a strict
            # equality. The value reported will have only 2 precision and
            # therefore, only a less equal comparison will be done.

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert_less_equal(
                    len(search(r"\.\d+", finding.group()).group()),
                    precision + 1)
            # check impurity
            if is_classifier(clf):
                pattern = r"gini = \d+\.\d+"
            else:
                pattern = r"friedman_mse = \d+\.\d+"

            # check impurity
            for finding in finditer(pattern, dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)
            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)
Example 23
File: ifs.py (Project: teopir/ifqi)
def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                         verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                         method='predict'):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'
                             .format(method))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_my_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
                                 for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate([indices_i
                                   for _, indices_i, _ in prediction_blocks])
    scores = np.concatenate([score_i for _, _, score_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    return predictions[inv_test_indices], scores
Example 24
 def _get_params(self):
     res = super(RGFEstimatorBase, self)._get_params()
     res.update(dict(max_leaf=self.max_leaf,
                     test_interval=self.test_interval,
                     algorithm=self.algorithm,
                     loss=self.loss,
                     reg_depth=self.reg_depth,
                     l2=self.l2,
                     sl2=self._sl2,
                     normalize=self.normalize,
                     min_samples_leaf=self._min_samples_leaf,
                     n_iter=self._n_iter,
                     n_tree_search=self.n_tree_search,
                     opt_interval=self.opt_interval,
                     learning_rate=self.learning_rate,
                     memory_policy=self.memory_policy,
                     verbose=self.verbose,
                     init_model=self.init_model,
                     is_classification=is_classifier(self)))
     return res
Example 25
	def fit(self,X,Y):
		if not self.best_subset:
			self.fshape = np.shape(X)[1]
			self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

			self.cv = check_cv(self.cv, X, Y, classifier=is_classifier(self.estimator))

			self.best_subset = tuple()
			self.best_subset_score = 0
			self.scores_ = {self.best_subset:self.best_subset_score}
			X = np.array(X)
			Y = np.array(Y)


			try:
				self.get_best_subset(X,Y)
			except KeyboardInterrupt:
				pass
		self.estimator = self.estimator.fit(X[:,self.best_subset],Y)
		return self
Example 26
def cross_val_score_filter_feature_selection(model,filter_function,filter_criteria, X, y, scoring=None, cv=None, n_jobs=1,
                    verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):

    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(model))
    scorer = check_scoring(model, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)

    #
    scores = parallel(delayed(_fit_and_score)(clone(model), filter_function(X,y,train,filter_criteria), y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in cv)

    return np.array(scores)[:, 0]
Example 27
    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        # Extract decision tree structure
        nodes, edges = self._graph_from_tree(self._model(), self.feature_names)

        decisions = [
            self._model().decision_path(instance.reshape(1, -1)).nonzero()[1] + 1
            for instance in X
        ]
        data_dicts = [
            {
                "type": "tree",
                "features": self.feature_names,
                "nodes": nodes,
                "edges": edges,
                "decision": decision,
            }
            for decision in decisions
        ]

        internal_obj = {"overall": None, "specific": data_dicts}

        if is_classifier(self):
            scores = self.predict_proba(X)[:, 1]
        else:
            scores = self.predict(X)

        selector = gen_local_selector(X, y, scores)

        return TreeExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )
Example 28
 def _get_params(self):
     res = super(FastRGFEstimatorBase, self)._get_params()
     res.update(dict(max_depth=self.max_depth,
                     max_leaf=self.max_leaf,
                     tree_gain_ratio=self.tree_gain_ratio,
                     min_samples_leaf=self._min_samples_leaf,
                     loss=self._loss,
                     l1=self.l1,
                     l2=self.l2,
                     opt_algorithm=self.opt_algorithm,
                     n_estimators=self.n_estimators,
                     learning_rate=self.learning_rate,
                     max_bin=self._max_bin,
                     data_l2=self.data_l2,
                     min_child_weight=self.min_child_weight,
                     sparse_max_features=self.sparse_max_features,
                     sparse_min_occurences=self.sparse_min_occurences,
                     n_jobs=self._n_jobs,
                     verbose=self.verbose,
                     is_classification=is_classifier(self),
                     target=self._target))
     return res
Example 29
    def predict(self, X):
        """Predict multi-class targets using underlying estimators.
        Parameters
        ----------
        X : (sparse) array-like of shape (n_samples, n_features)
            Data.
        Returns
        -------
        y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)
            Predicted multi-class targets.
        """
        check_is_fitted(self)

        n_samples = _num_samples(X)
        if self.label_binarizer_.y_type_ == "multiclass":
            maxima = np.empty(n_samples, dtype=float)
            maxima.fill(-np.inf)
            argmaxima = np.zeros(n_samples, dtype=int)
            for i, e in enumerate(self.estimators_):
                pred = _predict_binary(e, X)
                np.maximum(maxima, pred, out=maxima)
                argmaxima[maxima == pred] = i
            return self.classes_[argmaxima]
        else:
            if (hasattr(self.estimators_[0], "decision_function")
                    and is_classifier(self.estimators_[0])):
                thresh = 0
            else:
                thresh = .5
            indices = array.array('i')
            indptr = array.array('i', [0])
            for e in self.estimators_:
                indices.extend(np.where(_predict_binary(e, X) > thresh)[0])
                indptr.append(len(indices))
            data = np.ones(len(indices), dtype=int)
            indicator = sp.csc_matrix((data, indices, indptr),
                                      shape=(n_samples, len(self.estimators_)))
            return self.label_binarizer_.inverse_transform(indicator)
Example 30
def cross_val_score_fn(estimator,
                       X,
                       y=None,
                       scoring=None,
                       cv=None,
                       n_jobs=1,
                       verbose=0,
                       fit_params=None,
                       pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation.
    This overrides the cross_val_score method typically found in 
    cross_validation.py. Changes are clearly marked in comments, but 
    the main change is augmenting the function to store Fit and Metric Events
    for each fold.
    """
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    scorer = check_scoring(estimator, scoring=scoring)

    # Default scoring scheme is 'accuracy' unless provided by user.
    if scoring is None:
        scoring = 'accuracy'
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)

    # Change from original scikit code: adding a new argument, scoring, to the
    # _fit_and_score function to track scoring function and create MetricEvents.
    scores = parallel(
        delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test,
                                verbose, None, fit_params, scoring)
        for train, test in cv)
    return np.array(scores)[:, 0]
Example 31
    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        # Extract decision tree structure
        nodes, edges = self._graph_from_tree(self._model(), self.feature_names)

        decisions = [
            self._model().decision_path(instance.reshape(1, -1)).nonzero()[1] +
            1 for instance in X
        ]
        data_dicts = [{
            "type": "tree",
            "features": self.feature_names,
            "nodes": nodes,
            "edges": edges,
            "decision": decision,
        } for decision in decisions]

        internal_obj = {"overall": None, "specific": data_dicts}

        if is_classifier(self):
            scores = self.predict_proba(X)[:, 1]
        else:
            scores = self.predict(X)

        selector = gen_local_selector(X, y, scores)

        return TreeExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )
Example 32
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)

        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
        if not self.dataset_filenames:
            self.save_dataset_filename(X, y, cv)

        dataset_filenames = self.dataset_filenames

        client = Client()
        lb_view = client.load_balanced_view()

        if self.verbose > 0:
            print("Number of CPU core %d" % len(client.ids()))

        self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params)
                        for dataset_filename in dataset_filenames], params)
                            for params in parameter_iterable]
        if self.sync:
            self.wait()
            self.set_grid_scores()
            self.set_best_score_params()

            if self.refit:
                self.set_best_estimator(estimator)
        return self
Example 33
def _get_scores_and_estimators(
        experiment: Experiment) -> Tuple[List[float], List[Any]]:
    if experiment.test_set is not None:
        assert experiment.cross_validator is None, "Cannot use a cross validator with train test split"
        dataset = pd.concat([experiment.dataset, experiment.test_set])
        split = np.array([-1] * len(experiment.dataset) +
                         [1] * len(experiment.test_set))
        cross_validator = PredefinedSplit(split)
    else:
        dataset = experiment.dataset
        cross_validator = experiment.cross_validator

    X = dataset.drop(columns=[experiment.label_column])
    y = dataset[experiment.label_column]
    if experiment.group_column is None:
        if experiment.average_scores_on_instances:
            groups = Series(range(len(X)), index=X.index)
        else:
            groups = None
    else:
        groups = X[experiment.group_column]
        X = X.drop(columns=[experiment.group_column])

    cv = check_cv(cross_validator,
                  y,
                  classifier=is_classifier(experiment.predictor))
    train_test = cv.split(X, y, groups)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=None, verbose=False, pre_dispatch='2*n_jobs')
    scores_and_estimators = parallel(
        delayed(_fit_and_predict)(clone(experiment.predictor), X, y, train,
                                  test, groups, experiment.scorer)
        for train, test in train_test)
    scores_lists, estimators = zip(*scores_and_estimators)
    scores = [score for score_list in scores_lists for score in score_list]
    return scores, estimators
Example 34
 def _get_params(self):
     res = super(FastRGFEstimatorBase, self)._get_params()
     res.update(
         dict(max_depth=self.max_depth,
              max_leaf=self.max_leaf,
              tree_gain_ratio=self.tree_gain_ratio,
              min_samples_leaf=self._min_samples_leaf,
              loss=self._loss,
              l1=self.l1,
              l2=self.l2,
              opt_algorithm=self.opt_algorithm,
              n_estimators=self.n_estimators,
              learning_rate=self.learning_rate,
              max_bin=self._max_bin,
              data_l2=self.data_l2,
              min_child_weight=self.min_child_weight,
              sparse_max_features=self.sparse_max_features,
              sparse_min_occurences=self.sparse_min_occurences,
              n_jobs=self._n_jobs,
              verbose=self.verbose,
              is_classification=is_classifier(self),
              target=self._target))
     return res
Example 35
    def __init__(self, models):
        """Proxy class to build an ensemble of models with an API as one

        Parameters
        ----------
            models: array
                An array of models
        """
        self._models = models if len(models) else None
        if self._models is not None:
            if is_classifier(self._models[0]):
                check_type = is_classifier
                self._scoring_fun = accuracy_score
            elif is_regressor(self._models[0]):
                check_type = is_regressor
                self._scoring_fun = r2_score
            else:
                raise ValueError('Expected regressors or classifiers,'
                                 ' got %s instead' % type(self._models[0]))
            for model in self._models:
                if not check_type(model):
                    raise ValueError('Different types of models found, provide'
                                     ' either regressors or classifiers.')
Example 36
def log_classifier(classifier, X_test, y_test, nrows=1000, run=None):
    assert is_classifier(
        classifier), "classifier should be sklearn classifier."

    run = tracking.get_or_create_run(run)

    run.log_inputs(**sanitize_dict(classifier.get_params()))

    _log_test_preds_proba(run, classifier, X_test, nrows=nrows)

    y_pred = classifier.predict(X_test)

    results = {}
    for metric_name, values in zip(
        ["precision", "recall", "fbeta_score", "support"],
            precision_recall_fscore_support(y_test, y_pred),
    ):
        for i, value in enumerate(values):
            results["{}_class_{}_test".format(metric_name, i)] = value
    results["accuracy"] = accuracy_score(y_test, y_pred)
    results["f1"] = f1_score(y_pred, y_pred, average="weighted")
    run.log_metrics(**results)
    _log_test_predictions(run, y_test, y_pred=y_pred, nrows=nrows)
Example 37
    def _make_estimator(self, estimator_idx, estimator_name):
        """Make and configure a copy of the estimator."""
        # Set the non-overlapped random state
        if self.random_state is not None:
            random_state = (self.random_state + 10 * estimator_idx +
                            100 * self.layer_idx)
        else:
            random_state = None

        estimator = Estimator(
            name=estimator_name,
            criterion=self.criterion,
            n_trees=self.n_trees,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            backend=self.backend,
            n_jobs=self.n_jobs,
            random_state=random_state,
            is_classifier=is_classifier(self),
        )

        return estimator
Example 38
    def __init__(self, models):
        """Proxy class to build an ensemble of models with an API as one

        Parameters
        ----------
        models: array
            An array of models
        """
        self._models = models if len(models) else None
        if self._models is not None:
            if is_classifier(self._models[0]):
                check_type = is_classifier
                self._scoring_fun = accuracy_score
            elif is_regressor(self._models[0]):
                check_type = is_regressor
                self._scoring_fun = r2_score
            else:
                raise ValueError('Expected regressors or classifiers,'
                                 ' got %s instead' % type(self._models[0]))
            for model in self._models:
                if not check_type(model):
                    raise ValueError('Different types of models found, provide'
                                     ' either regressors or classifiers.')
Example 39
    def _make_estimator(self, append=True, random_state=None):
        """Make and configure a copy of the `base_estimator_` attribute.

        Warning: This method should be used to properly instantiate new
        sub-estimators.
        """
        estimator = clone(self.base_estimator_)
        estimator.set_params(
            **{p: getattr(self, p)
               for p in self.estimator_params})

        # Pass the inferred class information to avoid redundant recomputation.
        if is_classifier(estimator):
            estimator.classes_ = self.classes_
            estimator.n_classes_ = np.array(self.n_classes_, dtype=np.int32)

        if random_state is not None:
            _set_random_states(estimator, random_state)

        if append:
            self.estimators_.append(estimator)

        return estimator
Example 40
def log_estimator_params(estimator, experiment=None):
    """Log estimator parameters.

    Log all estimator parameters as experiment properties.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        estimator (:obj:`estimator`):
            | Scikit-learn estimator from which to log parameters.
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            rfr = RandomForestRegressor()

            neptune.init('my_workspace/my_project')
            neptune.create_experiment()

            log_estimator_params(rfr)
    """
    assert is_regressor(estimator) or is_classifier(estimator) or isinstance(estimator, KMeans),\
        'Estimator should be sklearn regressor, classifier or kmeans clusterer.'

    exp = _validate_experiment(experiment)

    for param, value in estimator.get_params().items():
        exp.set_property(param, value)
Example 41
def plot_confusion_matrix(estimator,
                          X,
                          y_true,
                          labels=None,
                          sample_weight=None,
                          normalize=None,
                          display_labels=None,
                          include_values=True,
                          xticks_rotation='horizontal',
                          values_format=None,
                          cmap='viridis',
                          ax=None):
    check_matplotlib_support("plot_confusion_matrix")

    if not is_classifier(estimator):
        raise ValueError("plot_confusion_matrix only supports classifiers")

    y_pred = estimator.predict(X)
    cm = confusion_matrix(y_true,
                          y_pred,
                          sample_weight=sample_weight,
                          labels=labels,
                          normalize=normalize)

    if display_labels is None:
        if labels is None:
            display_labels = estimator.classes_
        else:
            display_labels = labels

    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=display_labels)
    return disp.plot(include_values=include_values,
                     cmap=cmap,
                     ax=ax,
                     xticks_rotation=xticks_rotation,
                     values_format=values_format)
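A minimal usage sketch for the function above (assuming its own imports, matplotlib, and a small built-in dataset are available):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Passing a regressor here would trip the is_classifier guard above.
disp = plot_confusion_matrix(clf, X_test, y_test, normalize='true')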
Example 42
def test_model_pipeline_same_dense_and_sparse(LinearModel, params):
    # Test that a linear model preceded by StandardScaler in the pipeline and
    # with normalize set to False gives the same y_pred and the same .coef_
    # given X sparse or dense

    model_dense = make_pipeline(
        StandardScaler(with_mean=False),
        LinearModel(normalize=False, **params)
    )

    model_sparse = make_pipeline(
        StandardScaler(with_mean=False),
        LinearModel(normalize=False, **params)
    )

    # prepare the data
    rng = np.random.RandomState(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.

    X_sparse = sparse.csr_matrix(X)
    y = rng.rand(n_samples)

    if is_classifier(model_dense):
        y = np.sign(y)

    model_dense.fit(X, y)
    model_sparse.fit(X_sparse, y)

    assert_allclose(model_sparse[1].coef_, model_dense[1].coef_)
    y_pred_dense = model_dense.predict(X)
    y_pred_sparse = model_sparse.predict(X_sparse)
    assert_allclose(y_pred_dense, y_pred_sparse)

    assert_allclose(model_dense[1].intercept_, model_sparse[1].intercept_)
Example 43
    def _fit(self, X, y, parameter_iterable):
        estimator = from_sklearn(self.estimator)
        self.scorer_ = check_scoring(estimator, scoring=self.scoring)
        cv = check_cv(self.cv, X, y, classifier=is_classifier(estimator))
        n_folds = len(cv)
        X, y = check_X_y(X, y)

        tups = []
        parameters = []
        train_test_sets = list(cv.split(X, y))
        for params in parameter_iterable:
            est = estimator.set_params(**params)
            for train, test in train_test_sets:
                fit = est.fit(X[train], y[train], **self.fit_params)
                tups.append(score_and_n(self.scorer_, fit, X[test], y[test]))
                parameters.append(params)

        # Compute results
        get = self.get or _globals['get'] or threaded.get
        scores, n_samples = zip(*compute(tups, get=get)[0])

        # Extract grid_scores and best parameters
        grid_scores = get_grid_scores(scores, parameters, n_samples, n_folds,
                                      self.iid)
        best = get_best(grid_scores)

        # Update attributes
        self.grid_scores_ = grid_scores
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        # Refit if needed
        if self.refit:
            self.best_estimator_ = (estimator.set_params(
                **best.parameters).fit(X, y,
                                       **self.fit_params).compute(get=get))
        return self
Example 44
def cross_val_score(estimator,
                    X,
                    y=None,
                    groups=None,
                    scoring=None,
                    cv=None,
                    n_jobs=1,
                    verbose=0,
                    fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test,
                                verbose, None, fit_params)
        for train, test in splits)

    group_order = []
    if hasattr(cv, 'groups'):
        group_order = [
            np.array(cv.groups)[test].tolist()[0] for _, test in splits
        ]
    return np.squeeze(np.array(scores)), group_order
Example 45
    def predict(self, context: np.ndarray) -> np.ndarray:
        """Predict the mean reward function.

        Parameters
        -----------
        context: array-like, shape (n_rounds_of_new_data, dim_context)
            Context vectors for new data.

        Returns
        -----------
        estimated_rewards_by_reg_model: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
            Estimated expected rewards for new data given each item and position by the regression model.

        """
        n_rounds_of_new_data = context.shape[0]
        ones_n_rounds_arr = np.ones(n_rounds_of_new_data, int)
        estimated_rewards_by_reg_model = np.zeros(
            (n_rounds_of_new_data, self.n_actions, self.len_list)
        )
        for action_ in np.arange(self.n_actions):
            for position_ in np.arange(self.len_list):
                X = self._pre_process_for_reg_model(
                    context=context,
                    action=action_ * ones_n_rounds_arr,
                    action_context=self.action_context,
                )
                estimated_rewards_ = (
                    self.base_model_list[position_].predict_proba(X)[:, 1]
                    if is_classifier(self.base_model_list[position_])
                    else self.base_model_list[position_].predict(X)
                )
                estimated_rewards_by_reg_model[
                    np.arange(n_rounds_of_new_data),
                    action_ * ones_n_rounds_arr,
                    position_ * ones_n_rounds_arr,
                ] = estimated_rewards_
        return estimated_rewards_by_reg_model
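The inner expression is the usual `is_classifier` dispatch: classification base models contribute `predict_proba(X)[:, 1]` as the expected reward, regression base models contribute `predict(X)`. A small self-contained sketch of that dispatch, with hypothetical data and models:

import numpy as np
from sklearn.base import is_classifier
from sklearn.linear_model import LogisticRegression, Ridge

def expected_reward(model, X):
    # classifiers: probability that the (binary) reward equals 1; regressors: point prediction
    if is_classifier(model):
        return model.predict_proba(X)[:, 1]
    return model.predict(X)

rng = np.random.RandomState(0)
X = rng.normal(size=(20, 3))
clf = LogisticRegression().fit(X, (X[:, 0] > 0).astype(int))
reg = Ridge().fit(X, X[:, 0])
print(expected_reward(clf, X).shape, expected_reward(reg, X).shape)  # (20,) (20,)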
Esempio n. 46
0
    def predict(self, x_test: np.ndarray, confidence: float):
        r"""Method that returns the prediction and the confidence interval

        This method returns the interval at confidence level `confidence` together with the target
        predictions for `x_test`. The information returned differs between classification and
        regression. For classification the returned tuple has two elements: a numpy.ndarray with a
        boolean class-region matrix and a numpy.ndarray with the class predictions. For regression
        the returned tuple has three elements: a numpy.ndarray with the lower bound values, a
        numpy.ndarray with the predicted target values and a numpy.ndarray with the upper bound
        values.

        Parameters
        ----------
        x_test: numpy.ndarray
            Array of data features used to predict the target values and the confidence interval
        confidence: float
            Float between 0 and 1 that represents the proportion of observations expected to fall
            inside the predicted interval.

        Returns
        -------
        prediction: Tuple[numpy.ndarray, numpy.ndarray] or Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
            Tuple containing the confidence interval and the target prediction

        Notes
        -----
        The `x_test` data must have the same features as the data used for training and calibration,
        and they must be in the same order.
        The level of confidence has to be a fraction between 0 and 1.
        """
        sig = 1 - confidence
        if is_classifier(self.model):
            return self.icp.predict(x_test, significance=sig), self.model.predict(x_test)
        elif is_regressor(self.model) or isinstance(self.model, lgbm.basic.Booster):
            interval = self.icp.predict(x_test, significance=sig)
            return interval[:, 0], self.model.predict(x_test), interval[:, 1]
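In the regression branch the lower/point/upper triple comes from an inductive conformal predictor (`self.icp`). A minimal split-conformal sketch of where such an interval comes from, using the standard recipe rather than this project's classes:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.normal(size=(400, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.3, size=400)

# split into a proper training set and a calibration set
X_fit, X_cal, y_fit, y_cal = train_test_split(X, y, test_size=0.5, random_state=0)
model = Ridge().fit(X_fit, y_fit)

confidence = 0.9
residuals = np.abs(y_cal - model.predict(X_cal))
q = np.quantile(residuals, confidence)  # calibration quantile (finite-sample correction omitted)

x_test = rng.normal(size=(5, 3))
point = model.predict(x_test)
lower, upper = point - q, point + q     # the same lower/point/upper triple returned above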
Esempio n. 47
0
    def __init__(self,
                 estimator=GaussianNB(),
                 cv=LeaveOneOut(),
                 scoring=None,
                 max_chunk=100,
                 percentage=.1,
                 verbose=False,
                 n_jobs=1):

        if not (0. < percentage <= 1.):
            raise ValueError(
                'percentage must be > 0 and <= 1. Given {}'.format(percentage))

        if not (0. < n_jobs):
            raise ValueError(
                'n_jobs must be a positive integer. Given {}'.format(n_jobs))

        if not (max_chunk >= 0):
            raise ValueError(
                'max_chunk must be >= 0. Given: {}'.format(max_chunk))

        if not is_classifier(estimator):
            raise ValueError(
                'Estimator must be a sklearn-like classifier. Given {}'.format(
                    estimator))

        scoring = check_scoring(estimator, scoring=scoring)

        self.estimator = estimator
        self.cv = cv
        self.scoring = scoring

        self.max_chunk = max_chunk
        self.percentage = percentage
        self.verbose = verbose
        self.n_jobs = n_jobs
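`check_scoring` is what resolves the `scoring` argument (a string or `None`) into a callable scorer bound to the estimator; a quick sketch of what ends up stored in `self.scoring` (names assumed):

from sklearn.metrics import check_scoring
from sklearn.naive_bayes import GaussianNB

scorer = check_scoring(GaussianNB(), scoring="accuracy")
print(scorer)  # a callable scorer, roughly make_scorer(accuracy_score)
# scorer(fitted_estimator, X_test, y_test) would then return the accuracy on that split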
Esempio n. 48
0
    def __init__(self, estimator, training_x, training_y, test_x, test_y,
                 estimator_id, problem_type, main_metric):
        super().__init__()
        self._training_result = None
        self._test_result = None
        self._estimator = estimator
        self._estimator_id = estimator_id
        self._training_x = training_x
        self._training_y = training_y
        self._test_x = test_x
        self._test_y = test_y
        self._evaluated = False

        if problem_type is None:
            if is_regressor(estimator):
                problem_type = 'regression'
            elif is_classifier(estimator):
                problem_type = 'classification'
            else:
                raise ValueError('problem type is not provided!')

        if problem_type.lower().startswith('regres'):
            problem_type = 'regression'
        elif problem_type.lower().startswith('class'):
            problem_type = 'classification'

        if main_metric is None:
            if problem_type == 'regression':
                main_metric = 'smape'
            elif problem_type == 'classification':
                main_metric = 'f1_score'
            else:
                raise ValueError('main metric should be provided!')

        self._problem_type = problem_type
        self._main_metric = main_metric
Esempio n. 49
0
    def fit(self, train_X, train_y, verbose=1):
        train_X, train_y = check_X_y(train_X, train_y, accept_sparse=True)

        if is_classifier(self):
            self.classes_ = unique_labels(train_y)

        self._fitness_evaluator.fit(train_X, train_y)
        self.pareto.clear()

        self._population = self._toolbox.population(n=self.pop_size)

        log_context = None
        try:
            log_context = self.logger.listen()

            ea_run(self._population, self._toolbox, self.n_gen, self.pop_size, self.config,
                   n_jobs=self.n_jobs, timeout=self.max_evo_seconds, verbose=verbose)
        finally:
            self.logger.close(log_context)

        if not len(self.pareto):
            # try to get individuals that were evaluated before time limit end
            evaluated_inds = [ind for ind in self._population if ind.fitness.valid]

            if not len(evaluated_inds):
                warnings.warn("The algorithm did not have enough time to evaluate first generation and was not fitted.")
                return self

            self.pareto.update(evaluated_inds)

        tree = self.pareto[0]
        self.fitted_workflow = self._toolbox.compile(tree)
        self.fitted_workflow.fit(train_X, train_y)

        self.is_fitted_ = True
        return self
Esempio n. 50
0
    def __init__(self, base_estimator, ratio=1, ensemble='mean', random_state=42):

        def get_ensemble(ensemble):
            if ensemble == 'mean':
                return np.mean
            if ensemble == 'max':
                return np.max
            if ensemble == 'min':
                return np.min
            raise ValueError("ensemble must be one of these options: "
                             "'mean', 'max', 'min', not {}".format(ensemble))

        if is_classifier(base_estimator):
            self.base_estimator = base_estimator
        else:
            raise ValueError(
                "base_estimator must be a classifier, not {}".format(base_estimator))
        self._estimator_type = 'classifier'
        self._ratio = ratio
        self.ensemble = get_ensemble(ensemble)
        self._random_state = random_state
        self.classes_ = None
        self._target = None
        self.list_of_df = None
        self.list_models = None
Esempio n. 51
0
def cross_val_score_weighted(estimator,
                             x_data,
                             y_data=None,
                             groups=None,
                             scoring=None,
                             cv=None,
                             n_jobs=None,
                             verbose=0,
                             fit_params=None,
                             pre_dispatch='2*n_jobs',
                             error_score=np.nan,
                             sample_weights=None):
    """Expand :func:`sklearn.model_selection.cross_val_score`."""
    scorer = check_scoring(estimator, scoring=scoring)
    (x_data, y_data, groups) = indexable(x_data, y_data, groups)

    cv = check_cv(cv, y_data, classifier=is_classifier(estimator))

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score_weighted)(clone(estimator),
                                         x_data,
                                         y_data,
                                         scorer,
                                         train,
                                         test,
                                         None,
                                         fit_params,
                                         error_score=error_score,
                                         sample_weights=sample_weights)
        for train, test in cv.split(x_data, y_data, groups))
    return np.array(scores)
Esempio n. 52
0
    def __init__(self, X, y, criterion, min_samples_split, max_depth,
                 n_val_sample, random_state):
        # make sure max_depth > 1
        if max_depth < 2:
            raise ValueError("max depth must be > 1")

        # check the input arrays, and if it's classification validate the
        # target values in y
        X, y = check_X_y(X, y, accept_sparse=False, dtype=None, copy=True)
        if is_classifier(self):
            check_classification_targets(y)

        # hyper parameters so we can later inspect attributes of the model
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_val_sample = n_val_sample
        self.random_state = random_state

        # create the splitting class
        random_state = check_random_state(random_state)
        self.splitter = RandomSplitter(random_state, criterion, n_val_sample)

        # grow the tree depth first
        self.tree = self._find_next_split(X, y, 0)
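`check_classification_targets` is the guard that rejects continuous targets before any splitting work starts. A short, self-contained sketch of its behaviour:

import numpy as np
from sklearn.utils.multiclass import check_classification_targets

check_classification_targets(np.array([0, 1, 1, 0]))          # passes silently
try:
    check_classification_targets(np.array([0.1, 0.7, 0.3]))   # continuous target
except ValueError as exc:
    print(exc)  # something like "Unknown label type: continuous"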
Esempio n. 53
0
    def _validate_estimators(self):
        if self.estimators is None or len(self.estimators) == 0:
            raise ValueError(
                "Invalid 'estimators' attribute, 'estimators' should be a list"
                " of (string, estimator) tuples.")
        names, estimators = zip(*self.estimators)
        # defined by MetaEstimatorMixin
        self._validate_names(names)

        has_estimator = any(est != 'drop' for est in estimators)
        if not has_estimator:
            raise ValueError(
                "All estimators are dropped. At least one is required "
                "to be an estimator.")

        is_estimator_type = (is_classifier
                             if is_classifier(self) else is_regressor)

        for est in estimators:
            if est != 'drop' and not is_estimator_type(est):
                raise ValueError("The estimator {} should be a {}.".format(
                    est.__class__.__name__, is_estimator_type.__name__[3:]))

        return names, estimators
Esempio n. 54
0
    def show_models(self):

        # Check if fit had been called
        check_is_fitted(self)

        if is_classifier(self):
            sorted_list = [[
                self.classes_[estimator.target], estimator
            ] for estimator in sorted(self.estimators_, key=lambda e: e.target)
                           ]
        elif is_regressor(self):
            sorted_list = [[
                estimator.target, estimator
            ] for estimator in sorted(self.estimators_, key=lambda e: e.target)
                           ]
        else:
            msg = "Unknown type of model. Must be 'regressor' or 'classifier'"
            raise ValueError(msg)

        [
            print(f"Target: {target}; Model: {estimator}")
            for target, estimator in sorted_list
        ]
        return pd.DataFrame(sorted_list, columns=["Target", "Model"])
Esempio n. 55
0
def convert_pipeline(scope: Scope, operator: Operator,
                     container: ModelComponentContainer):
    model = operator.raw_operator
    inputs = operator.inputs
    for step in model.steps:
        step_model = step[1]
        if is_classifier(step_model):
            scope.add_options(id(step_model), options={'zipmap': False})
            container.add_options(id(step_model), options={'zipmap': False})
        outputs = _parse_sklearn(scope,
                                 step_model,
                                 inputs,
                                 custom_parsers=None)
        inputs = outputs
    if len(outputs) != len(operator.outputs):
        raise RuntimeError("Mismatch between pipeline output %d and "
                           "last step outputs %d." %
                           (len(outputs), len(operator.outputs)))
    for fr, to in zip(outputs, operator.outputs):
        container.add_node(
            'Identity',
            fr.full_name,
            to.full_name,
            name=scope.get_unique_operator_name("Id" + operator.onnx_name))
Esempio n. 56
0
def test_BoxCoxTargetTransformer_target_transform():

    for ll in (0, 0.1, 0.5, 2):

        bb = BoxCoxTargetTransformer(Ridge(), ll=ll)

        assert not is_classifier(bb)
        assert is_regressor(bb)

        y = np.arange(-100, 100, step=0.1)

        my = bb.target_transform(y)
        ymy = bb.target_inverse_transform(my)
        mymy = bb.target_transform(ymy)

        #        plt.subplot(211)
        #        plt.plot(y,my)
        #        plt.subplot(212)
        #        plt.plot(my,ymy)

        assert not pd.Series(my).isnull().any()
        assert not pd.Series(ymy).isnull().any()
        assert np.max(np.abs(y - ymy)) <= 10**(-10)
        assert np.max(np.abs(my - mymy)) <= 10**(-10)
Esempio n. 57
0
def cross_val_train_predict(estimator,
                            x,
                            y,
                            predict_method: str = "predict",
                            cv: int = 5):
    """ Return fit estimators and predictions of each (Stratified) fold. """
    from sklearn.base import clone, is_classifier
    from sklearn.model_selection._split import check_cv
    from sklearn.utils.metaestimators import _safe_split
    import numpy as np

    splitter = check_cv(cv, y, classifier=is_classifier(estimator))

    estimators = []
    predictions = None
    for train, test in splitter.split(x, y):
        x_train, y_train = _safe_split(estimator, x, y, train)
        x_test, _ = _safe_split(estimator, x, y, test, train)

        fold_estimator = clone(estimator)
        fold_predict = getattr(fold_estimator, predict_method)

        fold_estimator.fit(x_train, y_train)
        estimators.append(fold_estimator)
        fold_prediction = fold_predict(x_test)

        if predictions is None:
            if fold_prediction.ndim == 2:
                predictions = np.empty(shape=(len(y),
                                              fold_prediction.shape[1]))
            else:
                predictions = np.empty(shape=(len(y), ))

        predictions[test] = fold_prediction

    return predictions, estimators
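When only the out-of-fold predictions are needed (and not the per-fold estimators), the public API covers the same ground; a brief sketch with assumed data:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = load_iris(return_X_y=True)
proba = cross_val_predict(
    LogisticRegression(max_iter=1000), X, y,
    cv=5, method="predict_proba",
)
print(proba.shape)  # (150, 3): one out-of-fold probability row per sample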
Esempio n. 58
0
def predict_rulelist(X: pd.DataFrame, model):
    if not isinstance(X, pd.DataFrame):
        raise TypeError('X needs to be a DataFrame')
    is_classification = is_classifier(model)
    rulelist = model._rulelist
    n_predictions = X.shape[0]
    n_targets = rulelist.default_rule_statistics.number_targets
    instances_covered = np.zeros(n_predictions, dtype=bool)
    predictions = np.empty((n_predictions, n_targets), dtype=object)
    for subgroup in rulelist.subgroups:
        instances_subgroup = ~instances_covered &\
                             reduce(lambda x,y: x & y, [item.activation_function(X).values for item in subgroup.pattern])
        if is_classification:
            predictions[instances_subgroup, :] = point_value_categorical(
                subgroup.statistics)
        else:
            predictions[instances_subgroup, :] = point_value_gaussian(
                subgroup.statistics)
        instances_covered |= instances_subgroup

    # default rule
    if is_classification:
        predictions[~instances_covered, :] = point_value_categorical(
            rulelist.default_rule_statistics)
    else:
        predictions[~instances_covered, :] = point_value_gaussian(
            rulelist.default_rule_statistics)

    if n_targets == 1:
        predictions = predictions.flatten()

    # if int values try to return ints
    try:
        predictions = predictions.astype(int)
    except ValueError:
        pass
    return predictions
Esempio n. 59
0
    def _cross_val_predict(pipeline, X, y=None, cv=None) \
            -> Tuple[np.ndarray, np.ndarray, np.ndarray, List[FlexiblePipeline]]:
        X, y, groups = indexable(X, y, None)
        cv = check_cv(cv, y, classifier=is_classifier(pipeline))
        cv.random_state = 42

        prediction_blocks = []
        probability_blocks = []
        fitted_pipelines = []
        for train, test in cv.split(X, y, groups):
            cloned_pipeline = clone(pipeline)
            probability_blocks.append(
                (_fit_and_predict(cloned_pipeline, X, y, train, test, 0, {}, 'predict_proba'), test)
            )
            prediction_blocks.append(cloned_pipeline.predict(X))
            fitted_pipelines.append(cloned_pipeline)

        # Concatenate the predictions
        probabilities = [prob_block_i for prob_block_i, _ in probability_blocks]
        predictions = [pred_block_i for pred_block_i in prediction_blocks]
        test_indices = np.concatenate([indices_i for _, indices_i in probability_blocks])

        if not _check_is_permutation(test_indices, _num_samples(X)):
            raise ValueError('cross_val_predict only works for partitions')

        inv_test_indices = np.empty(len(test_indices), dtype=int)
        inv_test_indices[test_indices] = np.arange(len(test_indices))

        probabilities = np.concatenate(probabilities)
        predictions = np.concatenate(predictions)

        if isinstance(predictions, list):
            return y, [p[inv_test_indices] for p in predictions], [p[inv_test_indices] for p in
                                                                   probabilities], fitted_pipelines
        else:
            return y, predictions[inv_test_indices], probabilities[inv_test_indices], fitted_pipelines
Esempio n. 60
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            check_input=True,
            X_idx_sorted=None):

        random_state = check_random_state(self.random_state)

        if X.dtype != np.uint8:
            msg = "The dtype of `X` should be `np.uint8`, but got {} instead."
            raise RuntimeError(msg.format(X.dtype))

        if check_input:
            # Need to validate separately here.
            # We can't pass multi_output=True because that would allow y to be
            # csr.
            check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
            check_y_params = dict(ensure_2d=False, dtype=None)
            X, y = self._validate_data(X,
                                       y,
                                       validate_separately=(check_X_params,
                                                            check_y_params))

        # Determine output settings
        n_samples, self.n_features_ = X.shape

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1:
            # reshape is necessary to preserve data contiguity; indexing with
            # [:, np.newaxis] would not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        # `classes_` and `n_classes_` were set by the forest.
        if not hasattr(self, "classes_") and is_classifier(self):
            check_classification_targets(y)
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            if self.class_weight is not None:
                y_original = np.copy(y)

            y_encoded = np.zeros(y.shape, dtype=int)
            for k in range(self.n_outputs_):
                classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                       return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])
            y = y_encoded

            if self.class_weight is not None:
                expanded_class_weight = compute_sample_weight(
                    self.class_weight, y_original)

            self.n_classes_ = np.array(self.n_classes_, dtype=np.int32)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (np.iinfo(np.int32).max
                     if self.max_depth is None else self.max_depth)

        if isinstance(self.min_samples_leaf, numbers.Integral):
            if not 1 <= self.min_samples_leaf:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = self.min_samples_leaf
        else:  # float
            if not 0.0 < self.min_samples_leaf <= 0.5:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

        if isinstance(self.min_samples_split, numbers.Integral):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s" % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0.0 < self.min_samples_split <= 1.0:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s" % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)

        min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

        if isinstance(self.max_features, str):
            if self.max_features in ["auto", "sqrt"]:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError("Invalid value for max_features. "
                                 "Allowed string values are 'auto', "
                                 "'sqrt' or 'log2'.")
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, numbers.Integral):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1,
                                   int(self.max_features * self.n_features_))
            else:
                max_features = 0

        self.max_features_ = max_features

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Set min_weight_leaf from min_weight_fraction_leaf
        if sample_weight is None:
            min_weight_leaf = self.min_weight_fraction_leaf * n_samples
        else:
            min_weight_leaf = self.min_weight_fraction_leaf * np.sum(
                sample_weight)

        min_impurity_split = self.min_impurity_split
        if min_impurity_split is not None:
            warnings.warn(
                "The min_impurity_split parameter is deprecated. "
                "Its default value has changed from 1e-7 to 0 in "
                "version 0.23, and it will be removed in 0.25. "
                "Use the min_impurity_decrease parameter instead.",
                FutureWarning,
            )

            if min_impurity_split < 0.0:
                raise ValueError("min_impurity_split must be greater than "
                                 "or equal to 0")
        else:
            min_impurity_split = 0

        if self.min_impurity_decrease < 0.0:
            raise ValueError("min_impurity_decrease must be greater than "
                             "or equal to 0")

        if self.presort != "deprecated":
            warnings.warn(
                "The parameter 'presort' is deprecated and has no "
                "effect. It will be removed in v0.24. You can "
                "suppress this warning by not passing any value "
                "to the 'presort' parameter.",
                FutureWarning,
            )

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classifier(self):
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                         n_samples)

        SPLITTERS = DENSE_SPLITTERS

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](
                criterion,
                self.max_features_,
                min_samples_leaf,
                min_weight_leaf,
                random_state,
            )

        if is_classifier(self):
            self.tree_ = Tree(self.n_features_, self.n_classes_,
                              self.n_outputs_)
        else:
            self.tree_ = Tree(
                self.n_features_,
                # TODO: tree shouldn't need this in this case
                np.array([1] * self.n_outputs_, dtype=np.int32),
                self.n_outputs_,
            )

        builder = DepthFirstTreeBuilder(
            splitter,
            min_samples_split,
            min_samples_leaf,
            min_weight_leaf,
            max_depth,
            self.min_impurity_decrease,
            min_impurity_split,
        )

        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)

        if self.n_outputs_ == 1 and is_classifier(self):
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        # Only return the essential data for using a tree for prediction
        feature = self.tree_.feature
        threshold = self.tree_.threshold
        children = np.vstack(
            (self.tree_.children_left, self.tree_.children_right)).T
        value = self.tree_.value

        return feature, threshold, children, value
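The classification branch above maps each output's labels to integer codes 0..n_classes-1 with np.unique(return_inverse=True) before handing them to the tree builder; a tiny sketch of that encoding step in isolation:

import numpy as np

y = np.array([["cat"], ["dog"], ["cat"], ["bird"]])            # one output column
classes_k, y_codes = np.unique(y[:, 0], return_inverse=True)
print(classes_k)  # ['bird' 'cat' 'dog']
print(y_codes)    # [1 2 1 0] -- the integer targets the tree is actually built on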