def test_auc_score_non_binary_class():
    # Test that roc_auc_score function returns an error when trying
    # to compute AUC for non-binary class values.
    rng = check_random_state(404)
    y_pred = rng.rand(10)
    # y_true contains only one class value
    y_true = np.zeros(10, dtype="int")
    assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred)
    y_true = np.ones(10, dtype="int")
    assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred)
    y_true = -np.ones(10, dtype="int")
    assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred)
    # y_true contains three different class values
    y_true = rng.randint(0, 3, size=10)
    assert_raise_message(ValueError, "multiclass format is not supported", roc_auc_score, y_true, y_pred)

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        rng = check_random_state(404)
        y_pred = rng.rand(10)
        # y_true contains only one class value
        y_true = np.zeros(10, dtype="int")
        assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred)
        y_true = np.ones(10, dtype="int")
        assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred)
        y_true = -np.ones(10, dtype="int")
        assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred)

        # y_true contains three different class values
        y_true = rng.randint(0, 3, size=10)
        assert_raise_message(ValueError, "multiclass format is not supported", roc_auc_score, y_true, y_pred)
Example #2
def test_sample_weight_invariance(n_samples=50):
    random_state = check_random_state(0)

    # binary
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(n_samples, ))
    y_pred = random_state.randint(0, 2, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples,))
    for name in ALL_METRICS:
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_BINARY):
            continue
        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield _named_check(check_sample_weight_invariance, name), name,\
                  metric, y_true, y_score
        else:
            yield _named_check(check_sample_weight_invariance, name), name,\
                  metric, y_true, y_pred

    # multiclass
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 5, size=(n_samples, ))
    y_pred = random_state.randint(0, 5, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples, 5))
    for name in ALL_METRICS:
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_BINARY_MULTICLASS):
            continue
        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield _named_check(check_sample_weight_invariance, name), name,\
                  metric, y_true, y_score
        else:
            yield _named_check(check_sample_weight_invariance, name), name,\
                  metric, y_true, y_pred

    # multilabel indicator
    _, ya = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=0, n_samples=100,
                                           allow_unlabeled=False)
    _, yb = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=1, n_samples=100,
                                           allow_unlabeled=False)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    y_score = random_state.randint(1, 4, size=y_true.shape)

    for name in (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                 MULTIOUTPUT_METRICS):
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue

        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield (_named_check(check_sample_weight_invariance, name), name,
                   metric, y_true, y_score)
        else:
            yield (_named_check(check_sample_weight_invariance, name), name,
                   metric, y_true, y_pred)
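The check_sample_weight_invariance helper itself is not shown in this listing; below is a minimal self-contained sketch of the property it verifies, using accuracy_score as a stand-in metric.

# Unit sample weights should give the same score as passing no weights.
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import check_random_state

rng = check_random_state(0)
y_true = rng.randint(0, 2, size=50)
y_pred = rng.randint(0, 2, size=50)
unweighted = accuracy_score(y_true, y_pred)
weighted = accuracy_score(y_true, y_pred, sample_weight=np.ones(50))
assert np.isclose(unweighted, weighted)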
Example #3
    def __init__(self, configuration, random_state=None):
        self.configuration = configuration

        if random_state is None:
            self.random_state = check_random_state(1)
        else:
            self.random_state = check_random_state(random_state)
Example #4
def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight():
    """Test radius neighbors in multi-output regression (uniform weight)"""

    rng = check_random_state(0)
    n_features = 5
    n_samples = 40
    n_output = 4

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_output)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for algorithm, weights in product(ALGORITHMS, [None, 'uniform']):

        rnn = neighbors.RadiusNeighborsRegressor(weights=weights,
                                                 algorithm=algorithm)
        rnn.fit(X_train, y_train)

        neigh_idx = rnn.radius_neighbors(X_test, return_distance=False)
        y_pred_idx = np.array([np.mean(y_train[idx], axis=0)
                               for idx in neigh_idx])

        y_pred_idx = np.array(y_pred_idx)
        y_pred = rnn.predict(X_test)

        assert_equal(y_pred_idx.shape, y_test.shape)
        assert_equal(y_pred.shape, y_test.shape)
        assert_array_almost_equal(y_pred, y_pred_idx)
Example #5
def endless_permutations(N, random_state=None):
    """
    Generate an endless sequence of random integers from permutations of the
    set [0, ..., N).

    If we call this N times, we will sweep through the entire set without
    replacement, on the (N+1)th call a new permutation will be created, etc.

    Parameters
    ----------
    N: int
        the length of the set
    random_state: int or RandomState, optional
        random seed

    Yields
    ------
    int:
        a random int from the set [0, ..., N)
    """
    generator = check_random_state(random_state)
    while True:
        batch_inds = generator.permutation(N)
        for b in batch_inds:
            yield b
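A usage sketch (assuming numpy and check_random_state are imported as in the surrounding snippets): drawing N values sweeps the whole set exactly once before a fresh permutation starts.

from itertools import islice

N = 5
gen = endless_permutations(N, random_state=42)
first_sweep = list(islice(gen, N))
assert sorted(first_sweep) == list(range(N))
second_sweep = list(islice(gen, N))
assert sorted(second_sweep) == list(range(N))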
Example #6
    def fit(self, x, y):
        random_state = check_random_state(self.random_state)

        self.a = np.min(y)
        self.b = np.max(y)

        self.w = self._find_best_w(x, y, random_state)
        x_r = np.dot(self.w, x.T).T
        x1 = x_r[y == self.a]
        x2 = x_r[y == self.b]

        kA = self.base_objective.gamma * 1.06
        h1 = kA * len(x1) ** (-1.0 / 5) * np.std(x1)  # Silverman's rule of thumb
        h2 = kA * len(x2) ** (-1.0 / 5) * np.std(x2)  # Silverman's rule of thumb

        self.kde_a = KernelDensity(kernel='gaussian', bandwidth=h1).fit(x1.reshape(-1, 1))
        self.kde_b = KernelDensity(kernel='gaussian', bandwidth=h2).fit(x2.reshape(-1, 1))

        self.min_v = min(x_r)
        self.max_v = max(x_r)

        self.min_c = self._density_classification(self.min_v)
        self.max_c = self._density_classification(self.max_v)

        ytr = self.predict(x, True)

        last = ytr[0]
        self.k = 0
        for i in range(1, len(ytr)):
            if ytr[i] != last:
                self.k += 1
            last = ytr[i]
 def setUp(self):
     iris = datasets.load_iris()
     rng = check_random_state(0)
     perm = rng.permutation(iris.target.size)
     iris.data = iris.data[perm]
     iris.target = iris.target[perm]
     self.iris = iris
Example #8
def test_RadiusNeighborsClassifier_multioutput():
    """Test k-NN classifier on multioutput data"""
    rng = check_random_state(0)
    n_features = 2
    n_samples = 40
    n_output = 3

    X = rng.rand(n_samples, n_features)
    y = rng.randint(0, 3, (n_samples, n_output))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    weights = [None, 'uniform', 'distance', _weight_func]

    for algorithm, weights in product(ALGORITHMS, weights):
        # Stack single output prediction
        y_pred_so = []
        for o in range(n_output):
            rnn = neighbors.RadiusNeighborsClassifier(weights=weights,
                                                      algorithm=algorithm)
            rnn.fit(X_train, y_train[:, o])
            y_pred_so.append(rnn.predict(X_test))

        y_pred_so = np.vstack(y_pred_so).T
        assert_equal(y_pred_so.shape, y_test.shape)

        # Multioutput prediction
        rnn_mo = neighbors.RadiusNeighborsClassifier(weights=weights,
                                                     algorithm=algorithm)
        rnn_mo.fit(X_train, y_train)
        y_pred_mo = rnn_mo.predict(X_test)

        assert_equal(y_pred_mo.shape, y_test.shape)
        assert_array_almost_equal(y_pred_mo, y_pred_so)
Example #9
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)

    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)
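A minimal sketch of the public API being cross-checked above: a ranking that puts every relevant label first gives an LRAP of exactly 1.0.

import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 0], [0, 1, 1]])
y_score = np.array([[0.9, 0.2, 0.1], [0.1, 0.8, 0.7]])
print(label_ranking_average_precision_score(y_true, y_score))  # 1.0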
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(20, ))
    y_pred = random_state.randint(0, 2, size=(20, ))

    # We shouldn't forget any metrics
    assert_equal(set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS,
                                              THRESHOLDED_METRICS,
                                              METRIC_UNDEFINED_MULTICLASS),
                 set(ALL_METRICS))

    assert_equal(
        set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)),
        set([]))

    # Symmetric metric
    for name in SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        assert_almost_equal(metric(y_true, y_pred),
                            metric(y_pred, y_true),
                            err_msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name in NOT_SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        assert_true(np.any(metric(y_true, y_pred) != metric(y_pred, y_true)),
                    msg="%s seems to be symmetric" % name)
Example #11
    def _fit(self, gn):
        from sklearn.utils.validation import check_random_state
        from sklearn.utils.extmath import randomized_svd

        # apply scaling
        gn = self.scaler_.fit(gn).transform(gn)

        # transpose for svd
        # TODO eliminate need for transposition
        x = gn.T
        n_samples, n_features = x.shape

        # intermediates
        random_state = check_random_state(self.random_state)
        n_components = self.n_components
        n_samples, n_features = x.shape

        # singular value decomposition
        u, s, v = randomized_svd(x, n_components,
                                 n_iter=self.iterated_power,
                                 random_state=random_state)

        # calculate explained variance
        self.explained_variance_ = exp_var = (s ** 2) / n_samples
        full_var = np.var(x, axis=0).sum()
        self.explained_variance_ratio_ = exp_var / full_var

        # store components
        self.components_ = v

        return u, s, v
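A small sketch (independent of the class above) showing that randomized_svd approximates the leading singular values of a full SVD, which is what the explained-variance computation relies on.

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
x = rng.randn(30, 8)
u, s, v = randomized_svd(x, n_components=3, n_iter=5, random_state=0)
s_full = np.linalg.svd(x, compute_uv=False)
assert np.allclose(s, s_full[:3], rtol=1e-2)  # top singular values agree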
	def test_iris(self):
		"""Check consistency on dataset iris."""

		# also load the iris dataset
		# and randomly permute it
		iris = datasets.load_iris()
		rng = check_random_state(0)
		perm = rng.permutation(iris.target.size)
		iris.data = iris.data[perm]
		iris.target = iris.target[perm]

		

		clf = CFClassifier("")
		clf.fit(iris.data, iris.target)

		self.assertTrue(os.path.isfile(clf.forest))

		preds = clf.predict(iris.data)


		predicted_ratio = float(np.sum(preds==iris.target))/float(len(iris.target))
		print(predicted_ratio)

		self.assertGreaterEqual(predicted_ratio, .97) 

		probs = clf.predict_proba(iris.data)


		bin_idx=iris.target!=2

		roc_auc = roc_auc_score(iris.target[bin_idx], probs[bin_idx,1])

		self.assertGreaterEqual(roc_auc, .97) 
Example #13
    def fit(self, x, y):

        if len(set(y)) > 2:
            raise NotImplementedError('Currently MELM supports only binary datasets')

        self.base_objective = DCS_kd(gamma=self.gamma, k=self.k, 
                                     covariance_estimator=self.covariance_estimator)

        if self.classifier == 'KDE':
            self.clf = KDE(gamma=self.gamma)
        elif self.classifier == 'SVM':
            self.clf = SVM()
        elif self.classifier == 'KNN':
            self.clf = KNN()
        else:
            raise NotImplementedError('%s classifier is not implemented' % self.classifier)

        random_state = check_random_state(self.random_state)

        self.a = min(y)
        self.b = max(y)

        self.classes_ = np.array([self.a, self.b])

        self.w = self._find_best_w(x, y, random_state)

        self.clf.fit(self.transform(x), y)
Example #14
 def predict(self, X, Y_possible):
     if self.method == "random":
         rng = check_random_state(self.random_state)
     else:
         rng = None
     return [self._predict_interval(possible_intervals, rng)
             for possible_intervals in Y_possible]
Example #15
 def _predict_interval(self, possible_intervals, rng=None):
     if self.method == "center":
         return possible_intervals[len(possible_intervals) // 2]
     elif self.method == "random":
         if rng is None:
             rng = check_random_state(self.random_state)
         return possible_intervals[rng.randint(len(possible_intervals))]
def test_symmetry():
    # Test the symmetry of score and loss functions
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(20, ))
    y_pred = random_state.randint(0, 2, size=(20, ))

    # We shouldn't forget any metrics
    assert_equal(SYMMETRIC_METRICS.union(
        NOT_SYMMETRIC_METRICS, set(THRESHOLDED_METRICS),
        METRIC_UNDEFINED_BINARY_MULTICLASS),
        set(ALL_METRICS))

    assert_equal(
        SYMMETRIC_METRICS.intersection(NOT_SYMMETRIC_METRICS),
        set([]))

    # Symmetric metric
    for name in SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_pred), metric(y_pred, y_true),
                        err_msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name in NOT_SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]

        # use context manager to supply custom error message
        with assert_raises(AssertionError) as cm:
            assert_array_equal(metric(y_true, y_pred), metric(y_pred, y_true))
            cm.msg = ("%s seems to be symmetric" % name)
 def __init__(self, shuffle_factor=0.05, not_shuffled_columns=None, random_state=None):
     self.shuffle_factor = shuffle_factor
     self.random_state = check_random_state(random_state)
     if not_shuffled_columns is None:
         self.not_shuffled_columns = []
     else:
         self.not_shuffled_columns = not_shuffled_columns
def test_thresholded_invariance_string_vs_numbers_labels(name):
    # Ensure that thresholded metrics with string labels are invariant
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20, ))
    y2 = random_state.randint(0, 2, size=(20, ))

    y1_str = np.array(["eggs", "spam"])[y1]

    pos_label_str = "spam"

    with ignore_warnings():
        metric = THRESHOLDED_METRICS[name]
        if name not in METRIC_UNDEFINED_BINARY:
            # Ugly, but handle case with a pos_label and label
            metric_str = metric
            if name in METRICS_WITH_POS_LABEL:
                metric_str = partial(metric_str, pos_label=pos_label_str)

            measure_with_number = metric(y1, y2)
            measure_with_str = metric_str(y1_str, y2)
            assert_array_equal(measure_with_number, measure_with_str,
                               err_msg="{0} failed string vs number "
                                       "invariance test".format(name))

            measure_with_strobj = metric_str(y1_str.astype('O'), y2)
            assert_array_equal(measure_with_number, measure_with_strobj,
                               err_msg="{0} failed string object vs number "
                                       "invariance test".format(name))
        else:
            # TODO: these metrics don't support string labels yet
            assert_raises(ValueError, metric, y1_str, y2)
            assert_raises(ValueError, metric, y1_str.astype('O'), y2)
def test_sample_order_invariance_multilabel_and_multioutput():
    random_state = check_random_state(0)

    # Generate some data
    y_true = random_state.randint(0, 2, size=(20, 25))
    y_pred = random_state.randint(0, 2, size=(20, 25))
    y_score = random_state.normal(size=y_true.shape)

    y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle(y_true,
                                                              y_pred,
                                                              y_score,
                                                              random_state=0)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_pred),
                        metric(y_true_shuffle, y_pred_shuffle),
                        err_msg="%s is not sample order invariant" % name)

    for name in THRESHOLDED_MULTILABEL_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_score),
                        metric(y_true_shuffle, y_score_shuffle),
                        err_msg="%s is not sample order invariant" % name)

    for name in MULTIOUTPUT_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_score),
                        metric(y_true_shuffle, y_score_shuffle),
                        err_msg="%s is not sample order invariant" % name)
        assert_allclose(metric(y_true, y_pred),
                        metric(y_true_shuffle, y_pred_shuffle),
                        err_msg="%s is not sample order invariant" % name)
def check_importances(name, criterion, X, y):
    ForestEstimator = FOREST_ESTIMATORS[name]

    est = ForestEstimator(n_estimators=20, criterion=criterion, random_state=0)
    est.fit(X, y)
    importances = est.feature_importances_
    n_important = np.sum(importances > 0.1)
    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    # Check with parallel
    importances = est.feature_importances_
    est.set_params(n_jobs=2)
    importances_parallel = est.feature_importances_
    assert_array_almost_equal(importances, importances_parallel)

    # Check with sample weights
    sample_weight = check_random_state(0).randint(1, 10, len(X))
    est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion)
    est.fit(X, y, sample_weight=sample_weight)
    importances = est.feature_importances_
    assert_true(np.all(importances >= 0.0))

    for scale in [0.5, 10, 100]:
        est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion)
        est.fit(X, y, sample_weight=scale * sample_weight)
        importances_bis = est.feature_importances_
        assert_less(np.abs(importances - importances_bis).mean(), 0.001)
def test_binary_clf_curve():
    rng = check_random_state(404)
    y_true = rng.randint(0, 3, size=10)
    y_pred = rng.rand(10)
    msg = "multiclass format is not supported"
    assert_raise_message(ValueError, msg, precision_recall_curve,
                         y_true, y_pred)
def check_importances(X, y, name, criterion):
    ForestEstimator = FOREST_ESTIMATORS[name]

    est = ForestEstimator(n_estimators=20, criterion=criterion,
                          random_state=0)
    est.fit(X, y)
    importances = est.feature_importances_
    n_important = np.sum(importances > 0.1)
    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    # XXX: Remove this test in 0.19 after transform support to estimators
    # is removed.
    X_new = assert_warns(
        DeprecationWarning, est.transform, X, threshold="mean")
    assert_less(0, X_new.shape[1])
    assert_less(X_new.shape[1], X.shape[1])

    # Check with parallel
    importances = est.feature_importances_
    est.set_params(n_jobs=2)
    importances_parallel = est.feature_importances_
    assert_array_almost_equal(importances, importances_parallel)

    # Check with sample weights
    sample_weight = check_random_state(0).randint(1, 10, len(X))
    est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion)
    est.fit(X, y, sample_weight=sample_weight)
    importances = est.feature_importances_
    assert_true(np.all(importances >= 0.0))

    for scale in [0.5, 10, 100]:
        est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion)
        est.fit(X, y, sample_weight=scale * sample_weight)
        importances_bis = est.feature_importances_
        assert_less(np.abs(importances - importances_bis).mean(), 0.001)
Example #23
def check_explicit_sparse_zeros(tree, max_depth=3,
                                n_features=10):
    TreeEstimator = ALL_TREES[tree]

    # Set n_samples equal to n_features to ease the simultaneous
    # construction of a CSR and a CSC matrix
    n_samples = n_features
    samples = np.arange(n_samples)

    # Generate X, y
    random_state = check_random_state(0)
    indices = []
    data = []
    offset = 0
    indptr = [offset]
    for i in range(n_features):
        n_nonzero_i = random_state.binomial(n_samples, 0.5)
        indices_i = random_state.permutation(samples)[:n_nonzero_i]
        indices.append(indices_i)
        data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1
        data.append(data_i)
        offset += n_nonzero_i
        indptr.append(offset)

    indices = np.concatenate(indices)
    data = np.array(np.concatenate(data), dtype=np.float32)
    X_sparse = csc_matrix((data, indices, indptr),
                          shape=(n_samples, n_features))
    X = X_sparse.toarray()
    X_sparse_test = csr_matrix((data, indices, indptr),
                               shape=(n_samples, n_features))
    X_test = X_sparse_test.toarray()
    y = random_state.randint(0, 3, size=(n_samples, ))

    # Ensure that X_sparse_test owns its data, indices and indptr array
    X_sparse_test = X_sparse_test.copy()

    # Ensure that we have explicit zeros
    assert_greater((X_sparse.data == 0.).sum(), 0)
    assert_greater((X_sparse_test.data == 0.).sum(), 0)

    # Perform the comparison
    d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
    s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)

    assert_tree_equal(d.tree_, s.tree_,
                      "{0} with dense and sparse format gave different "
                      "trees".format(tree))

    Xs = (X_test, X_sparse_test)
    for X1, X2 in product(Xs, Xs):
        assert_array_almost_equal(s.tree_.apply(X1), d.tree_.apply(X2))
        assert_array_almost_equal(s.apply(X1), d.apply(X2))
        assert_array_almost_equal(s.apply(X1), s.tree_.apply(X1))
        assert_array_almost_equal(s.predict(X1), d.predict(X2))

        if tree in CLF_TREES:
            assert_array_almost_equal(s.predict_proba(X1),
                                      d.predict_proba(X2))
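A tiny sketch of what an "explicit zero" means in this test: a 0.0 that is physically stored in the sparse matrix's data array rather than left implicit.

import numpy as np
from scipy.sparse import csc_matrix

data = np.array([1.0, 0.0, 2.0], dtype=np.float32)  # the 0.0 is stored explicitly
indices = np.array([0, 2, 1])
indptr = np.array([0, 2, 3])
X_sparse = csc_matrix((data, indices, indptr), shape=(3, 2))
assert (X_sparse.data == 0.0).sum() == 1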
 def setUp(self):
     iris = datasets.load_iris()
     rng = check_random_state(0)
     iris.data = iris.data
     iris.target = iris.target
     self.iris = iris
     for csv_file in glob.glob("*.csv"):
         os.remove(csv_file)
Example #25
 def _get_folds_column(self, length):
     if self._random_number is None:
         self._random_number = check_random_state(self.random_state).randint(0, 100000)
     folds_column = numpy.zeros(length)
     for fold_number, (_, folds_indices) in enumerate(
             KFold(length, self.n_folds, shuffle=True, random_state=self._random_number)):
         folds_column[folds_indices] = fold_number
     return folds_column
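The snippet above uses the older KFold(length, n_folds) signature; below is a minimal sketch of the same folds-column idea with the modern sklearn.model_selection.KFold API.

import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils import check_random_state

length, n_folds = 10, 3
seed = check_random_state(0).randint(0, 100000)
folds_column = np.zeros(length)
splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
for fold_number, (_, fold_indices) in enumerate(splitter.split(np.arange(length))):
    folds_column[fold_indices] = fold_number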
def testSubset(test):
    n_faces = 5

    rng = check_random_state(4)
    face_ids = rng.randint(test.shape[0], size=(n_faces, ))
    test = test[face_ids, :]

    return test, n_faces
def test_regression_sample_weight_invariance(name):
    n_samples = 50
    random_state = check_random_state(0)
    # regression
    y_true = random_state.random_sample(size=(n_samples,))
    y_pred = random_state.random_sample(size=(n_samples,))
    metric = ALL_METRICS[name]
    check_sample_weight_invariance(name, metric, y_true, y_pred)
Example #28
def test_only_constant_features():
    random_state = check_random_state(0)
    X = np.zeros((10, 20))
    y = random_state.randint(0, 2, (10, ))
    for name, TreeEstimator in ALL_TREES.items():
        est = TreeEstimator(random_state=0)
        est.fit(X, y)
        assert_equal(est.tree_.max_depth, 0)
Example #29
 def _check_params(self):
     if self.loss is None:
         self.loss = AdaLossFunction()
     # Losses from sklearn are not allowed
     assert isinstance(self.loss, AbstractLossFunction), \
         'LossFunction should be derived from AbstractLossFunction'
     assert self.n_estimators > 0, 'n_estimators should be positive'
     self.random_state = check_random_state(self.random_state)
     assert 0 < self.subsample <= 1.0, 'subsample should be in the interval (0, 1]'
Example #30
def pairwise_transform(X, Y, limit=1.0, random_state=None):
    """Form comparable pairs with interval-annotated entries.

    Parameters
    ----------

    X: array-like, shape (n_samples x n_features)
        The feature representation of the instances.

    Y: array_like, shape (n_samples x 2)
        The lower and upper bounds of the interval of each instance.

    limit: float,
        Ratio (between 0 and 1) of how many pairs to form with each input
        sample.  Use this to reduce computing time and memory usage,
        at the cost of approximation error.
        If, for a given sample, there are 100 samples before and 100
        samples after, and limit=0.1, then 10 + 10 transformed pairs will
        be created.

    """
    X = check_array(X, accept_sparse='csr')
    Y = check_array(Y, accept_sparse=None)
    rng = check_random_state(random_state)
    if Y.shape[1] != 2:
        raise ValueError("Y must have two columns, represeting the lower "
                         "and upper bound of the interval for each entry.")

    #n_samples = X.shape[0]
    #idx = np.arange(n_samples)
    chunks = []
    #chunk_idx = []
    for k, (x, (y_min, y_max)) in enumerate(zip(X, Y)):
        X_rest, Y_rest = X[1 + k:], Y[1 + k:]
        #idx_rest = idx[1 + k:]
        before = Y_rest[:, 1] < y_min
        after = Y_rest[:, 0] > y_max
        n_before = int(np.sum(before) * limit)
        n_after = int(np.sum(after) * limit)
        if n_before:
            before = np.where(before)[0]
            before = rng.choice(before, n_before, replace=False)
            X_bef = X_rest[before].copy()
            chunks.append(_safe_sparse_add_row(X_bef, -x))
            #chunk_idx.append(np.array([(i, k) for i in idx_rest[before]]))
        if n_after:
            after = np.where(after)[0]
            after = rng.choice(after, n_after, replace=False)
            X_aft = X_rest[after].copy()
            chunks.append(-(_safe_sparse_add_row(X_aft, -x)))
            #chunk_idx.append(np.array([(k, i) for i in idx_rest[after]]))

    if len(chunks):
        return sp.vstack(chunks) if sp.issparse(X) else np.vstack(chunks)
        # , np.row_stack(chunk_idx)
    else:
        raise ValueError("Empty slice: no pairs can be formed.")
Example #31
def test_recursion_decision_tree_vs_forest_and_gbdt(seed):
    # Make sure that the recursion method gives the same results on a
    # DecisionTreeRegressor and a GradientBoostingRegressor or a
    # RandomForestRegressor with 1 tree and equivalent parameters.

    rng = np.random.RandomState(seed)

    # Purely random dataset to avoid correlated features
    n_samples = 1000
    n_features = 5
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples) * 10

    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()

    # set max_depth not too high to avoid splits with same gain but different
    # features
    max_depth = 5

    tree_seed = 0
    forest = RandomForestRegressor(n_estimators=1, max_features=None,
                                   bootstrap=False, max_depth=max_depth,
                                   random_state=tree_seed)
    # The forest will use ensemble.base._set_random_states to set the
    # random_state of the tree sub-estimator. We simulate this here to have
    # equivalent estimators.
    equiv_random_state = check_random_state(tree_seed).randint(
        np.iinfo(np.int32).max)
    gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1,
                                     criterion='mse', max_depth=max_depth,
                                     random_state=equiv_random_state)
    tree = DecisionTreeRegressor(max_depth=max_depth,
                                 random_state=equiv_random_state)

    forest.fit(X, y)
    gbdt.fit(X, y)
    tree.fit(X, y)

    # sanity check: if the trees aren't the same, the PD values won't be equal
    try:
        assert_is_subtree(tree.tree_, gbdt[0, 0].tree_)
        assert_is_subtree(tree.tree_, forest[0].tree_)
    except AssertionError:
        # For some reason the trees aren't exactly equal on 32bits, so the PDs
        # cannot be equal either. See
        # https://github.com/scikit-learn/scikit-learn/issues/8853
        assert _IS_32BIT, "this should only fail on 32 bit platforms"
        return

    grid = rng.randn(50).reshape(-1, 1)
    for f in range(n_features):
        features = np.array([f], dtype=np.int32)

        pdp_forest = _partial_dependence_recursion(forest, grid, features)
        pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features)
        pdp_tree = _partial_dependence_recursion(tree, grid, features)

        np.testing.assert_allclose(pdp_gbdt, pdp_tree)
        np.testing.assert_allclose(pdp_forest, pdp_tree)
                    norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm)
            f = scalable_frobenius_norm_discrepancy(X, U, s, V)
            all_frobenius[label].append(f / X_fro_norm)

    if len(all_time) == 0:
        raise ValueError("No tests ran. Aborting.")

    if enable_spectral_norm:
        title = "normalized spectral norm diff vs running time"
        scatter_time_vs_s(all_time, all_spectral, datasets, title)
    title = "normalized Frobenius norm diff vs running time"
    scatter_time_vs_s(all_time, all_frobenius, datasets, title)


if __name__ == "__main__":
    random_state = check_random_state(1234)

    power_iter = np.arange(0, 6)
    n_comps = 50

    for dataset_name in datasets:
        X = get_data(dataset_name)
        if X is None:
            continue
        print(" >>>>>> Benching sklearn and fbpca on %s %d x %d" %
              (dataset_name, X.shape[0], X.shape[1]))
        bench_a(
            X,
            dataset_name,
            power_iter,
            n_oversamples=2,
Example #33
                     hidden_layer_sizes)

# X, y = mlp_estimator._validate_input(X, y, incremental)
n_samples, n_features = X.shape

# Ensure y is 2D
# TODO: make sure the array is 2-D, i.e. the input y should be np.array([[1, 2, 3]]), which is a 1-row, 3-column array
# if y.ndim == 1:
#     y = y.reshape((-1, 1))

mlp_estimator.n_outputs_ = y.shape[1]

layer_units = ([n_features] + hidden_layer_sizes + [mlp_estimator.n_outputs_])

# check random state
mlp_estimator._random_state = check_random_state(mlp_estimator.random_state)

incremental = False
if not hasattr(mlp_estimator, 'coefs_') or (not mlp_estimator.warm_start
                                            and not incremental):
    # First time training the model
    mlp_estimator._initialize(y, layer_units)

# lbfgs does not support mini-batches
if mlp_estimator.solver == 'lbfgs':
    batch_size = n_samples
elif mlp_estimator.batch_size == 'auto':
    batch_size = min(200, n_samples)
else:
    if mlp_estimator.batch_size < 1 or mlp_estimator.batch_size > n_samples:
        warnings.warn("Got `batch_size` less than 1 or larger than "
Example #34
def check_explicit_sparse_zeros(tree, max_depth=3, n_features=10):
    TreeEstimator = ALL_TREES[tree]

    # Set n_samples equal to n_features to ease the simultaneous
    # construction of a CSR and a CSC matrix
    n_samples = n_features
    samples = np.arange(n_samples)

    # Generate X, y
    random_state = check_random_state(0)
    indices = []
    data = []
    offset = 0
    indptr = [offset]
    for i in range(n_features):
        n_nonzero_i = random_state.binomial(n_samples, 0.5)
        indices_i = random_state.permutation(samples)[:n_nonzero_i]
        indices.append(indices_i)
        data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1
        data.append(data_i)
        offset += n_nonzero_i
        indptr.append(offset)

    indices = np.concatenate(indices)
    data = np.array(np.concatenate(data), dtype=np.float32)
    X_sparse = csc_matrix((data, indices, indptr),
                          shape=(n_samples, n_features))
    X = X_sparse.toarray()
    X_sparse_test = csr_matrix((data, indices, indptr),
                               shape=(n_samples, n_features))
    X_test = X_sparse_test.toarray()
    y = random_state.randint(0, 3, size=(n_samples, ))

    # Ensure that X_sparse_test owns its data, indices and indptr array
    X_sparse_test = X_sparse_test.copy()

    # Ensure that we have explicit zeros
    assert_greater((X_sparse.data == 0.).sum(), 0)
    assert_greater((X_sparse_test.data == 0.).sum(), 0)

    # Perform the comparison
    d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
    s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)

    assert_tree_equal(
        d.tree_, s.tree_, "{0} with dense and sparse format gave different "
        "trees".format(tree))

    Xs = (X_test, X_sparse_test)
    for X1, X2 in product(Xs, Xs):
        assert_array_almost_equal(s.tree_.apply(X1), d.tree_.apply(X2))
        assert_array_almost_equal(s.apply(X1), d.apply(X2))
        assert_array_almost_equal(s.apply(X1), s.tree_.apply(X1))

        assert_array_almost_equal(
            s.tree_.decision_path(X1).toarray(),
            d.tree_.decision_path(X2).toarray())
        assert_array_almost_equal(
            s.decision_path(X1).toarray(),
            d.decision_path(X2).toarray())
        assert_array_almost_equal(
            s.decision_path(X1).toarray(),
            s.tree_.decision_path(X1).toarray())

        assert_array_almost_equal(s.predict(X1), d.predict(X2))

        if tree in CLF_TREES:
            assert_array_almost_equal(s.predict_proba(X1), d.predict_proba(X2))
def test_print_overloading_estimator():
    """Check that printing a fitted estimator results in 'pretty' output"""

    random_state = check_random_state(415)
    X = np.reshape(random_state.uniform(size=50), (5, 10))
    y = random_state.uniform(size=5)

    # Check the regressor
    est = SymbolicRegressor(generations=2, random_state=0)

    # Unfitted
    orig_stdout = sys.stdout
    try:
        out = StringIO()
        sys.stdout = out
        print(est)
        output_unfitted = out.getvalue().strip()
    finally:
        sys.stdout = orig_stdout

    # Fitted
    est.fit(X, y)
    orig_stdout = sys.stdout
    try:
        out = StringIO()
        sys.stdout = out
        print(est)
        output_fitted = out.getvalue().strip()
    finally:
        sys.stdout = orig_stdout

    orig_stdout = sys.stdout
    try:
        out = StringIO()
        sys.stdout = out
        print(est._program)
        output_program = out.getvalue().strip()
    finally:
        sys.stdout = orig_stdout

    assert_true(output_unfitted != output_fitted)
    assert_true(output_unfitted == est.__repr__())
    assert_true(output_fitted == output_program)

    # Check the transformer
    est = SymbolicTransformer(generations=2, random_state=0)

    # Unfitted
    orig_stdout = sys.stdout
    try:
        out = StringIO()
        sys.stdout = out
        print(est)
        output_unfitted = out.getvalue().strip()
    finally:
        sys.stdout = orig_stdout

    # Fitted
    est.fit(X, y)
    orig_stdout = sys.stdout
    try:
        out = StringIO()
        sys.stdout = out
        print(est)
        output_fitted = out.getvalue().strip()
    finally:
        sys.stdout = orig_stdout

    orig_stdout = sys.stdout
    try:
        out = StringIO()
        sys.stdout = out
        output = str([gp.__str__() for gp in est])
        print(output.replace("',", ",\n").replace("'", ""))
        output_program = out.getvalue().strip()
    finally:
        sys.stdout = orig_stdout

    assert_true(output_unfitted != output_fitted)
    assert_true(output_unfitted == est.__repr__())
    assert_true(output_fitted == output_program)
Example #36
 def seed(self, seed=None):
     """Same as parent method but passing a RandomState instance is allowed.
     """
     self.np_random = check_random_state(seed)
     return [seed]
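A minimal sketch of check_random_state itself, which nearly every snippet on this page relies on: None returns the global RandomState singleton, an int seeds a new RandomState, and an existing RandomState instance is passed through unchanged.

import numpy as np
from sklearn.utils import check_random_state

assert isinstance(check_random_state(None), np.random.RandomState)
assert check_random_state(3).randint(10) == check_random_state(3).randint(10)
rng = np.random.RandomState(0)
assert check_random_state(rng) is rng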
Example #37
    def _fit(self, X):
        solvers = {"full", "auto", "tsqr", "randomized"}
        solver = self.svd_solver

        if solver not in solvers:
            raise ValueError("Invalid solver '{}'. Must be one of {}".format(
                solver, solvers))

        # Handle n_components==None
        if self.n_components is None:
            # TODO: handle nan shapes
            n_components = min(X.shape)
        elif 0 < self.n_components < 1:
            raise NotImplementedError("Fractional 'n_components' is not "
                                      "currently supported")
        else:
            n_components = self.n_components

        n_samples, n_features = X.shape

        if solver == "auto":
            # Small problem, just call full PCA
            if max(X.shape) <= 500:
                solver = "full"
            elif n_components >= 1 and n_components < 0.8 * min(X.shape):
                solver = "randomized"
            # This is also the case of n_components in (0,1)
            else:
                solver = "full"

        if solver == "randomized":
            lower_limit = 1
        else:
            lower_limit = 0

        if not (min(n_samples, n_features) >= n_components >= lower_limit):
            msg = ("n_components={} must be between {} and "
                   "min(n_samples, n_features)={} with "
                   "svd_solver='{}'".format(n_components, lower_limit,
                                            min(n_samples, n_features),
                                            solver))
            raise ValueError(msg)

        if sp.issparse(X):
            raise TypeError("Cannot fit PCA on sparse 'X'")

        self.mean_ = X.mean(0)
        X -= self.mean_

        if solver in {"full", "tsqr"}:
            U, S, V = da.linalg.svd(X)
        else:
            # randomized
            random_state = check_random_state(self.random_state)
            seed = random_state.randint(np.iinfo("int32").max)
            n_power_iter = self.iterated_power
            U, S, V = da.linalg.svd_compressed(X,
                                               n_components,
                                               n_power_iter=n_power_iter,
                                               seed=seed)
        U, V = svd_flip(U, V)

        explained_variance = (S**2) / (n_samples - 1)
        components, singular_values = V, S

        if solver == "randomized":
            # total_var = X.var(ddof=1, axis=0)[:n_components].sum()
            total_var = X.var(ddof=1, axis=0).sum()
        else:
            total_var = explained_variance.sum()
        explained_variance_ratio = explained_variance / total_var

        # Postprocess the number of components required
        # TODO: n_components = 'mle'
        # Punting on fractional n_components for now
        # if 0 < n_components < 1.0:
        #     # number of components for which the cumulated explained
        #     # variance percentage is superior to the desired threshold
        #     ratio_cumsum = stable_cumsum(explained_variance_ratio)
        #     n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            if solver == "randomized":
                noise_variance = (total_var.sum() -
                                  explained_variance.sum()) / (min(
                                      n_features, n_samples) - n_components)

                pass
            else:
                noise_variance = explained_variance[n_components:].mean()
        else:
            noise_variance = 0.0

        (
            self.n_samples_,
            self.n_features_,
            self.n_components_,
            self.components_,
            self.explained_variance_,
            self.explained_variance_ratio_,
            self.singular_values_,
            self.noise_variance_,
            self.singular_values_,
        ) = compute(
            n_samples,
            n_features,
            n_components,
            components,
            explained_variance,
            explained_variance_ratio,
            singular_values,
            noise_variance,
            singular_values,
        )

        if solver != "randomized":
            self.components_ = self.components_[:n_components]
            self.explained_variance_ = self.explained_variance_[:n_components]
            self.explained_variance_ratio_ = self.explained_variance_ratio_[:
                                                                            n_components]
            self.singular_values_ = self.singular_values_[:n_components]

        return U, S, V
Example #38
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]

# Larger classification sample used for testing feature importances
X_large, y_large = datasets.make_classification(n_samples=500,
                                                n_features=10,
                                                n_informative=3,
                                                n_redundant=0,
                                                n_repeated=0,
                                                shuffle=False,
                                                random_state=0)

# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# also load the boston dataset
# and randomly permute it
boston = datasets.load_boston()
perm = rng.permutation(boston.target.size)
boston.data = boston.data[perm]
boston.target = boston.target[perm]

# also make a hastie_10_2 dataset
hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
hastie_X = hastie_X.astype(np.float32)
Example #39
#
# License: BSD 3 clause

import numpy as np
from sklearn.datasets import load_boston, load_breast_cancer
from sklearn.metrics import mean_absolute_error
from sklearn.utils.testing import assert_equal, assert_raises
from sklearn.utils.validation import check_random_state

from gplearn.genetic import SymbolicRegressor, SymbolicClassifier
from gplearn.genetic import SymbolicTransformer
from gplearn.fitness import make_fitness, _mean_square_error

# load the breast cancer dataset and randomly permute it
cancer = load_breast_cancer()
perm = check_random_state(0).permutation(cancer.target.size)
cancer.data = cancer.data[perm]
cancer.target = cancer.target[perm]

# load the boston dataset and randomly permute it
boston = load_boston()
perm = check_random_state(0).permutation(boston.target.size)
boston.data = boston.data[perm]
boston.target = boston.target[perm]


def test_validate_fitness():
    """Check that valid fitness measures are accepted & invalid raise error"""

    # Check arg count checks
    _ = make_fitness(function=_mean_square_error, greater_is_better=True)
Example #40
# variable called 'model'. Don't actually train or do anything else
# with it yet:
#
# .. your code here ..

from sklearn import linear_model
model = linear_model.LinearRegression()

#
# INFO: There are 50 takes of each clip. You want to pull out just one
# of them, randomly, and that one will NOT be used in the training of
# your model. In other words, the one file we'll be testing / scoring
# on will be an unseen sample, independent to the rest of your
# training set:
from sklearn.utils.validation import check_random_state
rng = check_random_state(7)  # Leave this alone until you've submitted your lab
random_idx = rng.randint(zero.shape[0])
test = zero[random_idx]
train = np.delete(zero, [random_idx], axis=0)

#
# TODO: Print out the shape of train, and the shape of test
# train will be shaped: [n_samples, n_audio_samples], where
# n_audio_samples are the 'features' of the audio file
# test will be shaped [n_audio_features], since it is a single
# sample (audio file, e.g. observation).
#
# .. your code here ..

print("train.shape = \n", train.shape)
print("test.shape = \n", test.shape)
Example #41
def _initialize_metric_mahalanobis(input,
                                   init='identity',
                                   random_state=None,
                                   return_inverse=False,
                                   strict_pd=False,
                                   matrix_name='matrix'):
    """Returns a PSD matrix that can be used as a prior or an initialization
  for the Mahalanobis distance

  Parameters
  ----------
  input : array-like
    The input samples (can be tuples or regular samples).

  init : string or numpy array, optional (default='identity')
    Specification for the matrix to initialize. Possible options are
    'identity', 'covariance', 'random', and a numpy array of shape
    (n_features, n_features).

    'identity'
      An identity matrix of shape (n_features, n_features).

    'covariance'
      The (pseudo-)inverse covariance matrix (raises an error if the
      covariance matrix is not definite and `strict_pd == True`)

    'random'
      A random positive definite (PD) matrix of shape
      `(n_features, n_features)`, generated using
      `sklearn.datasets.make_spd_matrix`.

    numpy array
      A PSD matrix (or strictly PD if strict_pd==True) of
      shape (n_features, n_features), that will be used as such to
      initialize the metric, or set the prior.

  random_state : int or `numpy.RandomState` or None, optional (default=None)
    A pseudo random number generator object or a seed for it if int. If
    ``init='random'``, ``random_state`` is used to set the random Mahalanobis
    matrix. If ``init='pca'``, ``random_state`` is passed as an
    argument to PCA when initializing the matrix.

  return_inverse : bool, optional (default=False)
    Whether to return the inverse of the specified matrix. This
    can be sometimes useful. It will return the pseudo-inverse (which is the
    same as the inverse if the matrix is definite (i.e. invertible)). If
    `strict_pd == True` and the matrix is not definite, it will return an
    error.

  strict_pd : bool, optional (default=False)
    Whether to enforce that the provided matrix is definite (in addition to
    being PSD).

  matrix_name : str, optional (default='matrix')
    The name of the matrix used (example: 'init', 'prior'). Will be used in
    error messages.

  Returns
  -------
  M, or (M, M_inv) : `numpy.ndarray`
    The initial matrix to use M, and its inverse if `return_inverse=True`.
  """
    n_features = input.shape[-1]
    if isinstance(init, np.ndarray):
        # we copy the array, so that if we update the metric, we don't want to
        # update the init
        init = check_array(init, copy=True)

        # Assert that init.shape[1] = n_features
        if init.shape != (n_features, ) * 2:
            raise ValueError('The input dimensionality {} of the given '
                             'mahalanobis matrix `{}` must match the '
                             'dimensionality of the given inputs ({}).'.format(
                                 init.shape, matrix_name, n_features))

        # Assert that the matrix is symmetric
        if not np.allclose(init, init.T):
            raise ValueError("`{}` is not symmetric.".format(matrix_name))

    elif init not in ['identity', 'covariance', 'random']:
        raise ValueError(
            "`{}` must be 'identity', 'covariance', 'random' "
            "or a numpy array of shape (n_features, n_features).".format(
                matrix_name))

    random_state = check_random_state(random_state)
    M = init
    if isinstance(M, np.ndarray):
        w, V = eigh(M, check_finite=False)
        init_is_definite = _check_sdp_from_eigen(w)
        if strict_pd and not init_is_definite:
            raise LinAlgError(
                "You should provide a strictly positive definite "
                "matrix as `{}`. This one is not definite. Try another"
                " {}, or an algorithm that does not "
                "require the {} to be strictly positive definite.".format(
                    *((matrix_name, ) * 3)))
        elif return_inverse and not init_is_definite:
            warnings.warn('The initialization matrix is not invertible: '
                          'using the pseudo-inverse instead.')
        if return_inverse:
            M_inv = _pseudo_inverse_from_eig(w, V)
            return M, M_inv
        else:
            return M
    elif init == 'identity':
        M = np.eye(n_features, n_features)
        if return_inverse:
            M_inv = M.copy()
            return M, M_inv
        else:
            return M
    elif init == 'covariance':
        if input.ndim == 3:
            # if the input are tuples, we need to form an X by deduplication
            X = np.unique(np.vstack(input), axis=0)
        else:
            X = input
        # atleast2d is necessary to deal with scalar covariance matrices
        M_inv = np.atleast_2d(np.cov(X, rowvar=False))
        w, V = eigh(M_inv, check_finite=False)
        cov_is_definite = _check_sdp_from_eigen(w)
        if strict_pd and not cov_is_definite:
            raise LinAlgError(
                "Unable to get a true inverse of the covariance "
                "matrix since it is not definite. Try another "
                "`{}`, or an algorithm that does not "
                "require the `{}` to be strictly positive definite.".format(
                    *((matrix_name, ) * 2)))
        elif not cov_is_definite:
            warnings.warn(
                'The covariance matrix is not invertible: '
                'using the pseudo-inverse instead.'
                'To make the covariance matrix invertible'
                ' you can remove any linearly dependent features and/or '
                'reduce the dimensionality of your input, '
                'for instance using `sklearn.decomposition.PCA` as a '
                'preprocessing step.')
        M = _pseudo_inverse_from_eig(w, V)
        if return_inverse:
            return M, M_inv
        else:
            return M
    elif init == 'random':
        # we need to create a random symmetric matrix
        M = make_spd_matrix(n_features, random_state=random_state)
        if return_inverse:
            # we use pinvh even if we know the matrix is definite, just because
            # we need the returned matrix to be symmetric (and sometimes
            # np.linalg.inv returns not symmetric inverses of symmetric matrices)
            # TODO: there might be a more efficient method to do so
            M_inv = pinvh(M)
            return M, M_inv
        else:
            return M
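A self-contained sketch of the three string options documented above, using public helpers (the actual function also handles strict_pd checks and pseudo-inverses via its own eigendecomposition helper, which is not reproduced here).

import numpy as np
from scipy.linalg import pinvh
from sklearn.datasets import make_spd_matrix

X = np.random.RandomState(0).randn(100, 4)
n_features = X.shape[1]

M_identity = np.eye(n_features)                               # init='identity'
M_covariance = pinvh(np.atleast_2d(np.cov(X, rowvar=False)))  # init='covariance'
M_random = make_spd_matrix(n_features, random_state=0)        # init='random'
assert np.allclose(M_random, M_random.T)  # the random matrix is symmetric PD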
Example #42
def test_symbolic_regressor():
    """Check that SymbolicRegressor example works"""

    rng = check_random_state(0)
    X_train = rng.uniform(-1, 1, 100).reshape(50, 2)
    y_train = X_train[:, 0]**2 - X_train[:, 1]**2 + X_train[:, 1] - 1
    X_test = rng.uniform(-1, 1, 100).reshape(50, 2)
    y_test = X_test[:, 0]**2 - X_test[:, 1]**2 + X_test[:, 1] - 1

    est_gp = SymbolicRegressor(population_size=5000,
                               generations=20,
                               stopping_criteria=0.01,
                               p_crossover=0.7,
                               p_subtree_mutation=0.1,
                               p_hoist_mutation=0.05,
                               p_point_mutation=0.1,
                               max_samples=0.9,
                               parsimony_coefficient=0.01,
                               random_state=0)
    est_gp.fit(X_train, y_train)

    assert_equal(len(est_gp._programs), 7)
    expected = 'sub(add(-0.999, X1), mul(sub(X1, X0), add(X0, X1)))'
    assert_equal(est_gp.__str__(), expected)
    assert_almost_equal(est_gp.score(X_test, y_test), 0.99999, decimal=5)
    dot_data = est_gp._program.export_graphviz()
    expected = ('digraph program {\nnode [style=filled]\n0 [label="sub", '
                'fillcolor="#136ed4"] ;\n1 [label="add", fillcolor="#136ed4"] '
                ';\n2 [label="-0.999", fillcolor="#60a6f6"] ;\n3 [label="X1", '
                'fillcolor="#60a6f6"] ;\n1 -> 3 ;\n1 -> 2 ;\n4 [label="mul", '
                'fillcolor="#136ed4"] ;\n5 [label="sub", fillcolor="#136ed4"] '
                ';\n6 [label="X1", fillcolor="#60a6f6"] ;\n7 [label="X0", '
                'fillcolor="#60a6f6"] ;\n5 -> 7 ;\n5 -> 6 ;\n8 [label="add", '
                'fillcolor="#136ed4"] ;\n9 [label="X0", fillcolor="#60a6f6"] '
                ';\n10 [label="X1", fillcolor="#60a6f6"] ;\n8 -> 10 ;\n8 -> 9 '
                ';\n4 -> 8 ;\n4 -> 5 ;\n0 -> 4 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
    assert_equal(
        est_gp._program.parents, {
            'method': 'Crossover',
            'parent_idx': 1555,
            'parent_nodes': range(1, 4),
            'donor_idx': 78,
            'donor_nodes': []
        })
    idx = est_gp._program.parents['donor_idx']
    fade_nodes = est_gp._program.parents['donor_nodes']
    assert_equal(est_gp._programs[-2][idx].__str__(), 'add(-0.999, X1)')
    assert_almost_equal(est_gp._programs[-2][idx].fitness_, 0.351803319075)
    dot_data = est_gp._programs[-2][idx].export_graphviz(fade_nodes=fade_nodes)
    expected = ('digraph program {\nnode [style=filled]\n0 [label="add", '
                'fillcolor="#136ed4"] ;\n1 [label="-0.999", '
                'fillcolor="#60a6f6"] ;\n2 [label="X1", fillcolor="#60a6f6"] '
                ';\n0 -> 2 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
    idx = est_gp._program.parents['parent_idx']
    fade_nodes = est_gp._program.parents['parent_nodes']
    assert_equal(est_gp._programs[-2][idx].__str__(),
                 'sub(sub(X1, 0.939), mul(sub(X1, X0), add(X0, X1)))')
    assert_almost_equal(est_gp._programs[-2][idx].fitness_, 0.17080204042)
    dot_data = est_gp._programs[-2][idx].export_graphviz(fade_nodes=fade_nodes)
    expected = ('digraph program {\nnode [style=filled]\n0 [label="sub", '
                'fillcolor="#136ed4"] ;\n1 [label="sub", fillcolor="#cecece"] '
                ';\n2 [label="X1", fillcolor="#cecece"] ;\n3 [label="0.939", '
                'fillcolor="#cecece"] ;\n1 -> 3 ;\n1 -> 2 ;\n4 [label="mul", '
                'fillcolor="#136ed4"] ;\n5 [label="sub", fillcolor="#136ed4"] '
                ';\n6 [label="X1", fillcolor="#60a6f6"] ;\n7 [label="X0", '
                'fillcolor="#60a6f6"] ;\n5 -> 7 ;\n5 -> 6 ;\n8 [label="add", '
                'fillcolor="#136ed4"] ;\n9 [label="X0", fillcolor="#60a6f6"] '
                ';\n10 [label="X1", fillcolor="#60a6f6"] ;\n8 -> 10 ;\n8 -> 9 '
                ';\n4 -> 8 ;\n4 -> 5 ;\n0 -> 4 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
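
# --- Hedged usage sketch (not part of the original test) ---
# The dot source produced by export_graphviz() above can be rendered to an
# image. This assumes the optional `graphviz` Python package and the Graphviz
# system binaries are installed; neither is required by the test itself.
import graphviz  # assumption: third-party package, not used elsewhere here

graph = graphviz.Source(dot_data)  # `dot_data` as exported above
graph.render('program_tree', format='png', cleanup=True)  # writes program_tree.png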
Exemple #43
0
    def _validate_train_parms(self, train_set, train_lab, classes=None):
        random_state = validation.check_random_state(self.random_state)
        train_set, train_lab = validation.check_X_y(train_set, train_lab)

        if self.initial_fit:
            if classes:
                self.classes_ = np.asarray(classes)
                self.protos_initialized = np.zeros(self.classes_.size)
            else:
                self.classes_ = unique_labels(train_lab)
                self.protos_initialized = np.zeros(self.classes_.size)

        nb_classes = len(self.classes_)
        nb_samples, nb_features = train_set.shape  # nb_samples unused

        # set prototypes per class
        if isinstance(self.prototypes_per_class, int):
            if self.prototypes_per_class <= 0:
                raise ValueError("prototypes_per_class must be a positive int")
            # nb_ppc = number of protos per class
            nb_ppc = np.ones([nb_classes],
                             dtype='int') * self.prototypes_per_class
        else:
            nb_ppc = validation.column_or_1d(
                validation.check_array(self.prototypes_per_class,
                                       ensure_2d=False,
                                       dtype='int'))
            if nb_ppc.min() <= 0:
                raise ValueError(
                    "values in prototypes_per_class must be positive")
            if nb_ppc.size != nb_classes:
                raise ValueError("length of prototypes per class"
                                 " does not fit the number of classes"
                                 "classes=%d"
                                 "length=%d" % (nb_classes, nb_ppc.size))

        # initialize prototypes
        if self.initial_prototypes is None:
            if self.initial_fit:
                self.w_ = np.empty([np.sum(nb_ppc), nb_features],
                                   dtype=np.double)
                self.c_w_ = np.empty([nb_ppc.sum()], dtype=self.classes_.dtype)
            pos = 0
            for actClass in range(len(self.classes_)):
                nb_prot = nb_ppc[actClass]  # nb_ppc: prototypes per class
                if (self.protos_initialized[actClass] == 0 and
                        self.classes_[actClass] in unique_labels(train_lab)):
                    mean = np.mean(
                        train_set[train_lab == self.classes_[actClass], :], 0)
                    self.w_[pos:pos + nb_prot] = mean + (
                        random_state.rand(nb_prot, nb_features) * 2 - 1)
                    if math.isnan(self.w_[pos, 0]):
                        print('null: ', actClass)
                        self.protos_initialized[actClass] = 0
                    else:
                        self.protos_initialized[actClass] = 1

                    self.c_w_[pos:pos + nb_prot] = self.classes_[actClass]
                pos += nb_prot
        else:
            x = validation.check_array(self.initial_prototypes)
            self.w_ = x[:, :-1]
            self.c_w_ = x[:, -1]
            if self.w_.shape != (np.sum(nb_ppc), nb_features):
                raise ValueError("the initial prototypes have wrong shape\n"
                                 "found=(%d,%d)\n"
                                 "expected=(%d,%d)" %
                                 (self.w_.shape[0], self.w_.shape[1],
                                  nb_ppc.sum(), nb_features))
            if set(self.c_w_) != set(self.classes_):
                raise ValueError(
                    "prototype labels and test data classes do not match\n"
                    "classes={}\n"
                    "prototype labels={}\n".format(self.classes_, self.c_w_))
        if self.initial_fit:
            self.initial_fit = False

        return train_set, train_lab, random_state
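
# --- Illustrative sketch (not part of the original class) ---
# A minimal, numpy-only illustration of the prototype initialisation above:
# each prototype starts at its class mean plus uniform noise drawn from
# [-1, 1). All variable names below are hypothetical.
import numpy as np

rng = np.random.RandomState(0)
class_samples = rng.rand(20, 2)                    # toy samples of one class
nb_prot, nb_features = 3, class_samples.shape[1]
mean = class_samples.mean(axis=0)
prototypes = mean + (rng.rand(nb_prot, nb_features) * 2 - 1)
print(prototypes.shape)                            # (3, 2)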
def func_consensus(data, n_boot=1000, ci=95, seed=None):
    """
    Calculates thresholded group consensus functional connectivity graph

    This function concatenates all time series in `data` and computes a group
    correlation matrix based on this extended time series. It then draws
    `n_boot` bootstrapped samples of length `T` from the concatenated time
    series and estimates confidence intervals for all correlations.
    Correlations whose sign is consistent across bootstraps are retained;
    inconsistent correlations are set to zero.

    If `n_boot` is set to 0 or None, a simple group-averaged functional
    connectivity matrix is estimated instead.

    Parameters
    ----------
    data : (N, T, S) array_like (or a list of S arrays, each shaped as (N, T))
        Pre-processed functional time series, where `N` is the number of nodes,
        `T` is the number of volumes in the time series, and `S` is the number
        of subjects.
    n_boot : int, optional
        Number of bootstraps for which to generate correlation. Default: 1000
    ci : (0, 100) float, optional
        Confidence interval for which to assess the reliability of correlations
        with bootstraps. Default: 95
    seed : int, optional
        Random seed. Default: None

    Returns
    -------
    consensus : (N, N) numpy.ndarray
        Thresholded, group-level correlation matrix

    References
    ----------
    Mišić, B., Betzel, R. F., Nematzadeh, A., Goni, J., Griffa, A., Hagmann,
    P., Flammini, A., Ahn, Y.-Y., & Sporns, O. (2015). Cooperative and
    competitive spreading dynamics on the human connectome. Neuron, 86(6),
    1518-1529.
    """

    # check inputs
    rs = check_random_state(seed)
    if ci > 100 or ci < 0:
        raise ValueError("`ci` must be between 0 and 100.")

    # group-average functional connectivity matrix desired instead of bootstrap
    if n_boot == 0 or n_boot is None:
        if isinstance(data, list):
            corrs = [np.corrcoef(sub) for sub in data]
        else:
            corrs = [
                np.corrcoef(data[..., sub]) for sub in range(data.shape[-1])
            ]
        return np.mean(corrs, axis=0)

    if isinstance(data, list):
        collapsed_data = np.hstack(data)
        nsample = int(collapsed_data.shape[-1] / len(data))
    else:
        collapsed_data = data.reshape((len(data), -1), order='F')
        nsample = data.shape[1]

    consensus = np.corrcoef(collapsed_data)

    # only keep the upper triangle for the bootstraps to save on memory usage
    triu_inds = np.triu_indices_from(consensus, k=1)
    bootstrapped_corrmat = np.zeros((len(triu_inds[0]), n_boot))

    # generate `n_boot` bootstrap correlation matrices by sampling `t` time
    # points from the concatenated time series
    for boot in range(n_boot):
        inds = rs.randint(collapsed_data.shape[-1], size=nsample)
        bootstrapped_corrmat[..., boot] = \
            np.corrcoef(collapsed_data[:, inds])[triu_inds]

    # extract the CIs from the bootstrapped correlation matrices
    # we don't need the input anymore so overwrite it
    bootstrapped_ci = np.percentile(bootstrapped_corrmat, [100 - ci, ci],
                                    axis=-1,
                                    overwrite_input=True)

    # remove unreliable (i.e., CI zero-crossing) correlations
    # if the signs of the bootstrapped confidence intervals are different
    # (i.e., their signs sum to 0), then we want to remove them
    # so, take the logical not of the CI (CI = 0 ---> True) and create a mask
    # then, set all connections from the consensus array inside the mask to 0
    remove_inds = np.logical_not(np.sign(bootstrapped_ci).sum(axis=0))
    mask = np.zeros_like(consensus, dtype=bool)
    mask[triu_inds] = remove_inds
    consensus[mask + mask.T] = 0

    return consensus
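
# --- Hedged usage sketch (not part of the original function) ---
# Minimal call of func_consensus on random data shaped (N nodes, T volumes,
# S subjects), assuming the module-level imports used above (numpy,
# check_random_state) are available. With pure noise most correlations are
# unreliable, so the thresholded matrix is expected to be mostly zero.
import numpy as np

data = np.random.RandomState(0).rand(10, 50, 4)    # N=10, T=50, S=4
consensus = func_consensus(data, n_boot=100, ci=95, seed=0)
print(consensus.shape)                             # (10, 10)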
Exemple #45
0
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and spectral features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)
        X = X.squeeze(1)

        n_instances, self.series_length = X.shape

        rng = check_random_state(self.random_state)

        self.estimators_ = []
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.intervals = np.zeros((self.n_estimators, 2), dtype=int)
        self.intervals[0][0] = 0
        self.intervals[0][1] = self.series_length
        for i in range(1, self.n_estimators):
            self.intervals[i][0] = rng.randint(self.series_length -
                                               self.min_interval)
            self.intervals[i][1] = rng.randint(
                self.intervals[i][0] + self.min_interval, self.series_length)
        # Check lag against global properties
        self.acf_lag_ = self.acf_lag
        if self.acf_lag > self.series_length - self.acf_min_values:
            self.acf_lag_ = self.series_length - self.acf_min_values
        if self.acf_lag < 0:
            self.acf_lag_ = 1
        self.lags = np.zeros(self.n_estimators, dtype=int)
        for i in range(0, self.n_estimators):
            temp_lag = self.acf_lag_
            if (temp_lag > self.intervals[i][1] - self.intervals[i][0] -
                    self.acf_min_values):
                temp_lag = (self.intervals[i][1] - self.intervals[i][0] -
                            self.acf_min_values)
            if temp_lag < 0:
                temp_lag = 1
            self.lags[i] = int(temp_lag)
            acf_x = np.empty(shape=(n_instances, self.lags[i]))
            ps_len = (self.intervals[i][1] - self.intervals[i][0]) / 2
            ps_x = np.empty(shape=(n_instances, int(ps_len)))
            for j in range(0, n_instances):
                acf_x[j] = acf(X[j, self.intervals[i][0]:self.intervals[i][1]],
                               temp_lag)
                ps_x[j] = ps(X[j, self.intervals[i][0]:self.intervals[i][1]])
            transformed_x = np.concatenate((acf_x, ps_x), axis=1)
            #            transformed_x=acf_x
            tree = clone(self.base_estimator)
            # set random state, but not the same, so that estimators vary
            tree.set_params(
                **{"random_state": rng.randint(np.iinfo(np.int32).max)})
            tree.fit(transformed_x, y)
            self.estimators_.append(tree)

        self._is_fitted = True
        return self
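
# --- Illustrative sketch (not part of the original estimator) ---
# A numpy-only illustration of how the random intervals above are drawn: the
# start leaves at least `min_interval` points to the right, and the end lies
# at least `min_interval` past the start. Parameter values are hypothetical.
import numpy as np

rng = np.random.RandomState(0)
series_length, min_interval, n_estimators = 100, 16, 5
intervals = np.zeros((n_estimators, 2), dtype=int)
intervals[0] = (0, series_length)                  # first interval: whole series
for i in range(1, n_estimators):
    intervals[i][0] = rng.randint(series_length - min_interval)
    intervals[i][1] = rng.randint(intervals[i][0] + min_interval, series_length)
print(intervals)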
def test_format_invariance_with_1d_vectors(name):
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20, ))
    y2 = random_state.randint(0, 2, size=(20, ))

    y1_list = list(y1)
    y2_list = list(y2)

    y1_1d, y2_1d = np.array(y1), np.array(y2)
    assert_array_equal(y1_1d.ndim, 1)
    assert_array_equal(y2_1d.ndim, 1)
    y1_column = np.reshape(y1_1d, (-1, 1))
    y2_column = np.reshape(y2_1d, (-1, 1))
    y1_row = np.reshape(y1_1d, (1, -1))
    y2_row = np.reshape(y2_1d, (1, -1))

    with ignore_warnings():
        metric = ALL_METRICS[name]

        measure = metric(y1, y2)

        assert_allclose(metric(y1_list, y2_list),
                        measure,
                        err_msg="%s is not representation invariant with list"
                        "" % name)

        assert_allclose(metric(y1_1d, y2_1d),
                        measure,
                        err_msg="%s is not representation invariant with "
                        "np-array-1d" % name)

        assert_allclose(metric(y1_column, y2_column),
                        measure,
                        err_msg="%s is not representation invariant with "
                        "np-array-column" % name)

        # Mix format support
        assert_allclose(metric(y1_1d, y2_list),
                        measure,
                        err_msg="%s is not representation invariant with mix "
                        "np-array-1d and list" % name)

        assert_allclose(metric(y1_list, y2_1d),
                        measure,
                        err_msg="%s is not representation invariant with mix "
                        "np-array-1d and list" % name)

        assert_allclose(metric(y1_1d, y2_column),
                        measure,
                        err_msg="%s is not representation invariant with mix "
                        "np-array-1d and np-array-column" % name)

        assert_allclose(metric(y1_column, y2_1d),
                        measure,
                        err_msg="%s is not representation invariant with mix "
                        "np-array-1d and np-array-column" % name)

        assert_allclose(metric(y1_list, y2_column),
                        measure,
                        err_msg="%s is not representation invariant with mix "
                        "list and np-array-column" % name)

        assert_allclose(metric(y1_column, y2_list),
                        measure,
                        err_msg="%s is not representation invariant with mix "
                        "list and np-array-column" % name)

        # These mix representations aren't allowed
        assert_raises(ValueError, metric, y1_1d, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_1d)
        assert_raises(ValueError, metric, y1_list, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_list)
        assert_raises(ValueError, metric, y1_column, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_column)

        # NB: We do not test for y1_row, y2_row as these may be
        # interpreted as multilabel or multioutput data.
        if (name not in (MULTIOUTPUT_METRICS | THRESHOLDED_MULTILABEL_METRICS
                         | MULTILABELS_METRICS)):
            assert_raises(ValueError, metric, y1_row, y2_row)
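
# --- Illustrative sketch (not part of the original test) ---
# The invariance above, shown concretely for one metric: accuracy is the same
# for list, 1-d array and column-vector inputs, while mixing a 1-d array with
# a row vector is rejected.
import numpy as np
from sklearn.metrics import accuracy_score

y1 = np.array([0, 1, 1, 0])
y2 = np.array([0, 1, 0, 0])
acc = accuracy_score(y1, y2)
assert accuracy_score(list(y1), list(y2)) == acc
assert accuracy_score(y1.reshape(-1, 1), y2.reshape(-1, 1)) == acc
try:
    accuracy_score(y1, y2.reshape(1, -1))          # 1-d mixed with row vector
except ValueError:
    pass                                           # mixed representations raise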
Exemple #47
0
def test_invariance_string_vs_numbers_labels():
    # Ensure that classification metrics with string labels are invariant
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20, ))
    y2 = random_state.randint(0, 2, size=(20, ))

    y1_str = np.array(["eggs", "spam"])[y1]
    y2_str = np.array(["eggs", "spam"])[y2]

    pos_label_str = "spam"
    labels_str = ["eggs", "spam"]

    for name, metric in CLASSIFICATION_METRICS.items():
        if name in METRIC_UNDEFINED_BINARY_MULTICLASS:
            continue

        measure_with_number = metric(y1, y2)

        # Ugly, but handle case with a pos_label and label
        metric_str = metric
        if name in METRICS_WITH_POS_LABEL:
            metric_str = partial(metric_str, pos_label=pos_label_str)

        measure_with_str = metric_str(y1_str, y2_str)

        assert_array_equal(measure_with_number,
                           measure_with_str,
                           err_msg="{0} failed string vs number invariance "
                           "test".format(name))

        measure_with_strobj = metric_str(y1_str.astype('O'),
                                         y2_str.astype('O'))
        assert_array_equal(measure_with_number,
                           measure_with_strobj,
                           err_msg="{0} failed string object vs number "
                           "invariance test".format(name))

        if name in METRICS_WITH_LABELS:
            metric_str = partial(metric_str, labels=labels_str)
            measure_with_str = metric_str(y1_str, y2_str)
            assert_array_equal(measure_with_number,
                               measure_with_str,
                               err_msg="{0} failed string vs number  "
                               "invariance test".format(name))

            measure_with_strobj = metric_str(y1_str.astype('O'),
                                             y2_str.astype('O'))
            assert_array_equal(measure_with_number,
                               measure_with_strobj,
                               err_msg="{0} failed string vs number  "
                               "invariance test".format(name))

    for name, metric in THRESHOLDED_METRICS.items():
        if name not in METRIC_UNDEFINED_BINARY:
            # Ugly, but handle case with a pos_label and label
            metric_str = metric
            if name in METRICS_WITH_POS_LABEL:
                metric_str = partial(metric_str, pos_label=pos_label_str)

            measure_with_number = metric(y1, y2)
            measure_with_str = metric_str(y1_str, y2)
            assert_array_equal(measure_with_number,
                               measure_with_str,
                               err_msg="{0} failed string vs number "
                               "invariance test".format(name))

            measure_with_strobj = metric(y1_str.astype('O'), y2)
            assert_array_equal(measure_with_number,
                               measure_with_strobj,
                               err_msg="{0} failed string object vs number "
                               "invariance test".format(name))
        else:
            # TODO: those metrics don't support string labels yet
            assert_raises(ValueError, metric, y1_str, y2)
            assert_raises(ValueError, metric, y1_str.astype('O'), y2)
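
# --- Illustrative sketch (not part of the original test) ---
# The string-vs-number invariance checked above, shown for a single metric:
# accuracy is identical whether labels are integers or strings.
import numpy as np
from sklearn.metrics import accuracy_score

y1 = np.array([0, 1, 1, 0])
y2 = np.array([0, 1, 0, 0])
labels = np.array(["eggs", "spam"])
assert accuracy_score(y1, y2) == accuracy_score(labels[y1], labels[y2])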
Exemple #48
0
    def select(self, competences):
        """Select the most competent classifier for the classification of the
        query sample given the competence level estimates. Four selection
        schemes are available.

        Best : The base classifier with the highest competence level is
        selected. In cases where more than one base classifier achieves the
        same competence level, the one with the lowest index is selected. This
        method is the standard for the LCA, OLA, MLA techniques.

        Diff : Select the base classifier that is significantly better than the
        others in the pool (when the difference between its competence level
        and the competence level of the other base classifiers is higher than a
        predefined threshold). If no base classifier is significantly better,
        a base classifier is selected randomly among the members with
        equivalent competence levels.

        Random : Selects a random base classifier among all base classifiers
        that achieved the same competence level.

        ALL : all base classifiers with the max competence level estimates are
        selected (note that in this case the
        DCS technique becomes a DES technique).

        Parameters
        ----------
        competences : array of shape = [n_samples, n_classifiers]
            Competence level estimated for each base classifier and test
            example.

        Returns
        -------
        selected_classifiers : array of shape [n_samples]
            Indices of the selected base classifier for each sample. If the
            selection_method is set to 'all', a boolean matrix is returned,
            containing True for the selected base classifiers and False otherwise.

        """
        if competences.ndim < 2:
            competences = competences.reshape(1, -1)

        selected_classifiers = []
        best_index = np.argmax(competences, axis=1)

        if self.selection_method == 'best':
            # Select the classifier with highest competence level
            selected_classifiers = best_index

        elif self.selection_method == 'diff':
            # Select a base classifier if its competence level is
            # significantly better than the rest. If there is no such
            # classifier, randomly select a base model.
            #
            # The best classifier will always have diff < diff_thresh. If it
            # is superior to all others, it will be the only member selected.
            # Otherwise, a random classifier from this list is selected.

            rng = check_random_state(self.random_state)
            best_competence = competences[np.arange(competences.shape[0]),
                                          best_index]
            # best_competence = np.max(competences)
            diff = best_competence.reshape(-1, 1) - competences
            # TODO: Improve this part of the code
            selected_classifiers = np.zeros(diff.shape[0], dtype=int)
            for row in range(diff.shape[0]):
                diff_list = list(diff[row, :])
                indices = [
                    idx for idx, _ in enumerate(diff_list)
                    if diff_list[idx] < self.diff_thresh
                ]

                if len(indices) == 0:
                    indices = range(self.n_classifiers_)

                selected_classifiers[row] = rng.choice(indices)

        elif self.selection_method == 'random':
            # TODO: Improve this part of the code
            rng = check_random_state(self.random_state)
            selected_classifiers = np.zeros(competences.shape[0], dtype=int)
            best_competence = competences[np.arange(competences.shape[0]),
                                          best_index]
            for row in range(competences.shape[0]):
                competence_list = list(competences[row, :])

                # Select a random classifier among all with same competence
                # level
                indices = [
                    idx for idx, _ in enumerate(competence_list)
                    if competence_list[idx] == best_competence[row]
                ]

                selected_classifiers[row] = rng.choice(indices)

        elif self.selection_method == 'all':
            # select all base classifiers with max competence estimates.
            max_value = np.max(competences, axis=1)
            selected_classifiers = (competences == max_value.reshape(
                competences.shape[0], -1))

        return selected_classifiers
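
# --- Illustrative sketch (not part of the original class) ---
# A tiny numpy-only walk-through of two of the selection schemes documented
# above, applied to a hand-written competences matrix (2 samples,
# 3 classifiers).
import numpy as np

competences = np.array([[0.2, 0.9, 0.9],
                        [0.5, 0.4, 0.1]])

# 'best': index of the highest competence per sample (ties -> lowest index)
best = np.argmax(competences, axis=1)              # array([1, 0])

# 'all': boolean mask of every classifier reaching the per-sample maximum
max_value = np.max(competences, axis=1)
all_best = competences == max_value.reshape(-1, 1)
print(best)       # [1 0]
print(all_best)   # row 0 selects classifiers 1 and 2, row 1 selects 0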
Exemple #49
0
def test_format_invariance_with_1d_vectors():
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20, ))
    y2 = random_state.randint(0, 2, size=(20, ))

    y1_list = list(y1)
    y2_list = list(y2)

    y1_1d, y2_1d = np.array(y1), np.array(y2)
    assert_equal(y1_1d.ndim, 1)
    assert_equal(y2_1d.ndim, 1)
    y1_column = np.reshape(y1_1d, (-1, 1))
    y2_column = np.reshape(y2_1d, (-1, 1))
    y1_row = np.reshape(y1_1d, (1, -1))
    y2_row = np.reshape(y2_1d, (1, -1))

    for name, metric in ALL_METRICS.items():
        if name in METRIC_UNDEFINED_BINARY_MULTICLASS:
            continue

        measure = metric(y1, y2)

        assert_almost_equal(metric(y1_list, y2_list),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with list" % name)

        assert_almost_equal(metric(y1_1d, y2_1d),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with np-array-1d" % name)

        assert_almost_equal(metric(y1_column, y2_column),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with np-array-column" % name)

        # Mix format support
        assert_almost_equal(metric(y1_1d, y2_list),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with mix np-array-1d and list" % name)

        assert_almost_equal(metric(y1_list, y2_1d),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with mix np-array-1d and list" % name)

        assert_almost_equal(metric(y1_1d, y2_column),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with mix np-array-1d and np-array-column" % name)

        assert_almost_equal(metric(y1_column, y2_1d),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with mix np-array-1d and np-array-column" % name)

        assert_almost_equal(metric(y1_list, y2_column),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with mix list and np-array-column" % name)

        assert_almost_equal(metric(y1_column, y2_list),
                            measure,
                            err_msg="%s is not representation invariant "
                            "with mix list and np-array-column" % name)

        # These mix representations aren't allowed
        assert_raises(ValueError, metric, y1_1d, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_1d)
        assert_raises(ValueError, metric, y1_list, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_list)
        assert_raises(ValueError, metric, y1_column, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_column)

        # NB: We do not test for y1_row, y2_row as these may be
        # interpreted as multilabel or multioutput data.
        if (name not in (MULTIOUTPUT_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                         MULTILABELS_METRICS)):
            assert_raises(ValueError, metric, y1_row, y2_row)
    def fit(self, X, y, check_input=True):
        """Build a similarity forest regressor from the training set (X, y).
                Parameters
                ----------
                X : array-like of any type, as long as suitable similarity function is provided
                    The training input samples.
                y : array-like, shape = [n_samples]
                    The training outputs.

                Returns
                -------
                self : object
        """

        # Check input
        if check_input:
            # Check that X and y have correct shape
            X, y = check_X_y(X, y)

            # Input validation, check it to be a non-empty 2D array containing only finite values
            X = check_array(X)

            # Check if provided similarity function applies to input
            X = self._validate_X_predict(X, check_input)

            if self.criterion in ('theil', 'atkinson'):
                if not np.all(y >= 0):
                    raise ValueError(
                        'When using the Theil or Atkinson index, '
                        'y must contain only non-negative values')

        y = np.atleast_1d(y)

        self.base_estimator_ = SimilarityTreeRegressor

        # Check input
        random_state = check_random_state(self.random_state)

        if not isinstance(self.n_directions, int):
            raise ValueError('n_directions parameter must be an int')

        # Default similarity functions: dot product or rbf kernel
        if self.sim_function == 'dot':
            self.sim_function = dot_product
        elif self.sim_function == 'rbf':
            self.sim_function = rbf

        self.oob_score_ = 0.0

        self.X_ = X
        self.y_ = y
        self.random_state_ = random_state

        self.estimators_ = []
        for i in range(self.n_estimators):
            tree = SimilarityTreeRegressor(
                n_directions=self.n_directions,
                sim_function=self.sim_function,
                random_state=self.random_state,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                discriminative_sampling=self.discriminative_sampling,
                criterion=self.criterion,
                gamma=self.gamma)

            self.estimators_.append(tree)

        pool = Pool(processes=4)
        self.estimators_ = pool.map(self.fit_tree_, self.estimators_)
        pool.close()
        pool.join()

        if self.oob_score:
            self.oob_score_ /= self.n_estimators

        assert len(self.estimators_) == self.n_estimators
        self.is_fitted_ = True

        return self
Exemple #51
0
    def __init__(self,
                 input_shape=(100, 28 * 28),
                 random_state=None,
                 batch_size=100,
                 num_layers=4,
                 num_units_per_layer=(10, 10, 10),
                 dropout_per_layer=(0.5, 0.5, 0.5),
                 std_per_layer=(0.005, 0.005, 0.005),
                 num_output_units=2,
                 dropout_output=0.5,
                 learning_rate=0.01,
                 lambda2=1e-4,
                 momentum=0.9,
                 beta1=0.9,
                 beta2=0.9,
                 rho=0.95,
                 solver='adam',
                 num_epochs=2,
                 lr_policy='fixed',
                 gamma=0.01,
                 power=1.0,
                 epoch_step=1,
                 activation_per_layer=('relu', ) * 3,
                 weight_init_per_layer=('henormal', ) * 3,
                 leakiness_per_layer=(1. / 3., ) * 3,
                 tanh_alpha_per_layer=(2. / 3., ) * 3,
                 tanh_beta_per_layer=(1.7159, ) * 3,
                 is_sparse=False,
                 is_binary=False,
                 is_regression=False,
                 is_multilabel=False):

        self.random_state = random_state
        self.batch_size = batch_size
        self.input_shape = input_shape
        self.num_layers = num_layers
        self.num_units_per_layer = num_units_per_layer
        self.dropout_per_layer = np.asarray(dropout_per_layer,
                                            dtype=theano.config.floatX)
        self.num_output_units = num_output_units
        self.dropout_output = T.cast(dropout_output,
                                     dtype=theano.config.floatX)
        self.activation_per_layer = activation_per_layer
        self.weight_init_per_layer = weight_init_per_layer
        self.std_per_layer = np.asarray(std_per_layer,
                                        dtype=theano.config.floatX)
        self.leakiness_per_layer = np.asarray(leakiness_per_layer,
                                              dtype=theano.config.floatX)
        self.tanh_alpha_per_layer = np.asarray(tanh_alpha_per_layer,
                                               dtype=theano.config.floatX)
        self.tanh_beta_per_layer = np.asarray(tanh_beta_per_layer,
                                              dtype=theano.config.floatX)
        self.momentum = T.cast(momentum, dtype=theano.config.floatX)

        self.init_learning_rate = np.asarray(learning_rate,
                                             dtype=theano.config.floatX)
        self.learning_rate = np.asarray(learning_rate,
                                        dtype=theano.config.floatX)
        self.lambda2 = T.cast(lambda2, dtype=theano.config.floatX)
        self.beta1 = T.cast(beta1, dtype=theano.config.floatX)
        self.beta2 = T.cast(beta2, dtype=theano.config.floatX)
        self.rho = T.cast(rho, dtype=theano.config.floatX)
        self.num_epochs = num_epochs
        self.lr_policy = lr_policy
        self.gamma = np.asarray(gamma, dtype=theano.config.floatX)
        self.power = np.asarray(power, dtype=theano.config.floatX)
        self.epoch_step = np.asarray(epoch_step, dtype=theano.config.floatX)
        self.is_binary = is_binary
        self.is_regression = is_regression
        self.is_multilabel = is_multilabel
        self.is_sparse = is_sparse
        self.solver = solver

        if is_sparse:
            #input_var = S.csr_matrix('inputs', dtype=theano.config.floatX)
            input_var = T.matrix('inputs')
        else:
            input_var = T.matrix('inputs')

        if self.is_binary or self.is_multilabel or self.is_regression:
            target_var = T.matrix('targets')
        else:
            target_var = T.ivector('targets')

        if DEBUG:
            if self.is_binary:
                print("... using binary loss")
            if self.is_multilabel:
                print("... using multilabel prediction")
            if self.is_regression:
                print("... using regression loss")
            print("... building network!")
            print("Input shape:", input_shape)
            print("... with number of epochs:")
            print(num_epochs)

        # Added for reproducibility
        seed = check_random_state(self.random_state)
        lasagne.random.set_rng(seed)

        self.network = lasagne.layers.InputLayer(shape=input_shape,
                                                 input_var=input_var)

        # Define each layer
        for i in range(num_layers - 1):
            init_weight = self._choose_weight_init(i)
            activation_function = self._choose_activation(i)
            self.network = lasagne.layers.DenseLayer(
                lasagne.layers.dropout(self.network,
                                       p=self.dropout_per_layer[i]),
                num_units=self.num_units_per_layer[i],
                W=init_weight,
                b=lasagne.init.Constant(val=0.0),
                nonlinearity=activation_function)

        # Define output layer and nonlinearity of last layer
        if self.is_regression:
            output_activation = lasagne.nonlinearities.linear
        elif self.is_binary or self.is_multilabel:
            output_activation = lasagne.nonlinearities.sigmoid
        else:
            output_activation = lasagne.nonlinearities.softmax

        self.network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(self.network, p=self.dropout_output),
            num_units=self.num_output_units,
            W=lasagne.init.GlorotNormal(),
            b=lasagne.init.Constant(),
            nonlinearity=output_activation)

        prediction = lasagne.layers.get_output(self.network)

        if self.is_regression:
            loss_function = lasagne.objectives.squared_error
        elif self.is_binary or self.is_multilabel:
            loss_function = lasagne.objectives.binary_crossentropy
        else:
            loss_function = lasagne.objectives.categorical_crossentropy

        loss = loss_function(prediction, target_var)

        # Aggregate loss mean function with l2
        # Regularization on all layers' params
        if self.is_binary or self.is_multilabel:
            #loss = T.sum(loss, dtype=theano.config.floatX)
            loss = T.mean(loss, dtype=theano.config.floatX)
        else:
            loss = T.mean(loss, dtype=theano.config.floatX)
        l2_penalty = self.lambda2 * lasagne.regularization.regularize_network_params(
            self.network, lasagne.regularization.l2)
        loss += l2_penalty
        params = lasagne.layers.get_all_params(self.network, trainable=True)

        # Create the symbolic scalar lr for loss & updates function
        lr_scalar = T.scalar('lr', dtype=theano.config.floatX)

        if solver == "nesterov":
            updates = lasagne.updates.nesterov_momentum(
                loss, params, learning_rate=lr_scalar, momentum=self.momentum)
        elif solver == "adam":
            updates = lasagne.updates.adam(loss,
                                           params,
                                           learning_rate=lr_scalar,
                                           beta1=self.beta1,
                                           beta2=self.beta2)
        elif solver == "adadelta":
            updates = lasagne.updates.adadelta(loss,
                                               params,
                                               learning_rate=lr_scalar,
                                               rho=self.rho)
        elif solver == "adagrad":
            updates = lasagne.updates.adagrad(loss,
                                              params,
                                              learning_rate=lr_scalar)
        elif solver == "sgd":
            updates = lasagne.updates.sgd(loss,
                                          params,
                                          learning_rate=lr_scalar)
        elif solver == "momentum":
            updates = lasagne.updates.momentum(loss,
                                               params,
                                               learning_rate=lr_scalar,
                                               momentum=self.momentum)
        elif solver == "smorm3s":
            updates = smorm3s(loss, params, learning_rate=lr_scalar)
        else:
            updates = lasagne.updates.sgd(loss,
                                          params,
                                          learning_rate=lr_scalar)

        # Validation was removed, as auto-sklearn handles that. If this net
        # is to be used independently, validation accuracy has to be included.
        if DEBUG:
            print("... compiling theano functions")
        self.train_fn = theano.function([input_var, target_var, lr_scalar],
                                        loss,
                                        updates=updates,
                                        allow_input_downcast=True,
                                        profile=False,
                                        on_unused_input='warn',
                                        name='train_fn')
        if DEBUG:
            print('... compiling update function')
        self.update_function = self._policy_function()
    def _validate_train_parms(self, train_set, train_lab, classes=None):
        random_state = validation.check_random_state(self.random_state)
        train_set, train_lab = validation.check_X_y(train_set,
                                                    train_lab.ravel())

        if self.initial_fit:
            if classes:
                self.classes_ = np.asarray(classes)
                self.protos_initialized = np.zeros(self.classes_.size)
            else:
                self.classes_ = unique_labels(train_lab)
                self.protos_initialized = np.zeros(self.classes_.size)

            # Validate that labels have correct format
            for i in range(len(self.classes_)):
                if i not in self.classes_:
                    raise ValueError('Labels have to be ascending int, '
                                     'starting at 0, got {}'
                                     .format(self.classes_))

        nb_classes = len(self.classes_)
        nb_features = train_set.shape[1]

        # set prototypes per class
        if isinstance(self.prototypes_per_class, int):
            # ppc is an int so we can give the same number of prototypes to
            # all classes
            if self.prototypes_per_class <= 0:
                raise ValueError('prototypes_per_class must be a positive int')
            # nb_ppc = number of protos per class
            nb_ppc = np.ones([nb_classes],
                             dtype='int') * self.prototypes_per_class
        elif isinstance(self.prototypes_per_class, list):
            # it's a list containing the individual number of protos per class
            # - not fully supported yet
            nb_ppc = validation.column_or_1d(
                validation.check_array(self.prototypes_per_class,
                                       ensure_2d=False, dtype='int'))
            if nb_ppc.min() <= 0:
                raise ValueError(
                    'values in prototypes_per_class must be positive')
            if nb_ppc.size != nb_classes:
                raise ValueError(
                    'length of prototypes_per_class'
                    ' does not fit the number of classes\n'
                    'classes=%d\n'
                    'length=%d' % (nb_classes, nb_ppc.size))
        else:
            raise ValueError('Invalid data type for prototypes_per_class, '
                             'must be int or list of int')

        # initialize prototypes
        if self.initial_prototypes is None:
            if self.initial_fit:
                self.w_ = np.empty([np.sum(nb_ppc), nb_features],
                                   dtype=np.double)
                self.c_w_ = np.empty([nb_ppc.sum()], dtype=self.classes_.dtype)
            pos = 0
            for actClassIdx in range(len(self.classes_)):
                actClass = self.classes_[actClassIdx]
                nb_prot = nb_ppc[actClassIdx]  # nb_ppc: prototypes per class
                if (self.protos_initialized[actClassIdx] == 0 and
                        actClass in unique_labels(train_lab)):
                    mean = np.mean(
                        train_set[train_lab == actClass, :], 0)

                    if self.prototypes_per_class == 1:
                        # If only one prototype we init it to mean
                        self.w_[pos:pos + nb_prot] = mean
                    else:
                        # else we add some random noise to distribute them
                        self.w_[pos:pos + nb_prot] = mean + (
                            random_state.rand(nb_prot, nb_features) * 2 - 1)

                    if math.isnan(self.w_[pos, 0]):
                        raise ValueError('Prototype on position {} for class '
                                         '{} is NaN.'
                                         .format(pos, actClass))
                    else:
                        self.protos_initialized[actClassIdx] = 1

                    self.c_w_[pos:pos + nb_prot] = actClass
                pos += nb_prot
        else:
            x = validation.check_array(self.initial_prototypes)
            self.w_ = x[:, :-1]
            self.c_w_ = x[:, -1]
            if self.w_.shape != (np.sum(nb_ppc), nb_features):
                raise ValueError("the initial prototypes have wrong shape\n"
                                 "found=(%d,%d)\n"
                                 "expected=(%d,%d)" % (
                                     self.w_.shape[0], self.w_.shape[1],
                                     nb_ppc.sum(), nb_features))
            if set(self.c_w_) != set(self.classes_):
                raise ValueError(
                    "prototype labels and test data classes do not match\n"
                    "classes={}\n"
                    "prototype labels={}\n".format(self.classes_, self.c_w_))
        if self.initial_fit:
            if self.gradient_descent == 'adadelta':
                self.squared_mean_gradient = np.zeros_like(self.w_)
                self.squared_mean_step = np.zeros_like(self.w_)
            self.initial_fit = False

        return train_set, train_lab
Exemple #53
0
def make_forecasting_problem(n_timepoints=50, random_state=None):
    rng = check_random_state(random_state)
    return pd.Series(rng.random(size=n_timepoints),
                     index=pd.Int64Index(np.arange(n_timepoints)))
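
# --- Hedged usage sketch (not part of the original helper) ---
# Minimal call producing a univariate series of uniform noise with an integer
# index, assuming the module-level imports (pandas, numpy, check_random_state)
# are available.
y = make_forecasting_problem(n_timepoints=20, random_state=1)
print(y.head())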
    def fit(self, X, y, check_input=True):
        """Build a similarity tree regressor from the training set (X, y).
               Parameters
               ----------
               X : array-like of any type, as long as suitable similarity function is provided
                   The training input samples.
               y : array-like, shape = [n_samples]
                   The training outputs.

               Returns
               -------
               self : object
        """

        # Check input
        if check_input:
            # Check that X and y have correct shape
            X, y = check_X_y(X, y)

            # Input validation, check it to be a non-empty 2D array containing only finite values
            X = check_array(X)

            # Check if provided similarity function applies to input
            X = self._validate_X_predict(X, check_input)

            if self.criterion in ('theil', 'atkinson'):
                if not np.all(y >= 0):
                    raise ValueError(
                        'When using the Theil or Atkinson index, '
                        'y must contain only non-negative values')

        # Check parameters
        random_state = check_random_state(self.random_state)

        if not isinstance(self.n_directions, int):
            raise ValueError('n_directions parameter must be an int')

        self._lhs = None
        self._rhs = None
        self._p = None
        self._q = None
        self._similarities = []
        self._split_point = -np.inf
        self._value = None
        self._is_leaf = False
        self.is_fitted_ = False
        self._impurity = None

        # Use the object's id as this node's identifier
        self._node_id = id(self)

        # Value of prediction
        self._value = np.mean(y)

        # Current node's impurity
        if self.criterion == 'variance':
            self._impurity = np.var(y)
        elif self.criterion == 'theil':
            self._impurity = theil(y)
        elif self.criterion == 'atkinson':
            self._impurity = atkinson(y)
        else:
            raise ValueError('Unknown split criterion')

        if y.size == 1:
            self._is_leaf = True
            self.is_fitted_ = True
            return self

        if self._is_pure(y):
            self._is_leaf = True
            self.is_fitted_ = True
            return self

        if self.max_depth is not None:
            if self.depth == self.max_depth:
                self._is_leaf = True
                self.is_fitted_ = True
                return self

        if len(y) <= self.min_samples_split:
            self._is_leaf = True
            self.is_fitted_ = True
            return self

        # Sample n_direction discriminative directions and find the best one
        best_impurity = np.inf
        best_split_point = None
        best_p = None
        best_q = None
        similarities = []
        for i, j in self._sample_directions(random_state, y,
                                            self.n_directions):

            impurity, split_point, curr_similarities = find_split(
                X,
                y,
                X[i],
                X[j],
                self.criterion,
                self.sim_function,
                gamma=self.gamma)

            if impurity < best_impurity:
                best_impurity = impurity
                best_p = X[i]
                best_q = X[j]
                best_split_point = split_point
                similarities = curr_similarities

        if best_split_point is None:
            self.is_fitted_ = True
            self._is_leaf = True
            return self

        # if split improves impurity
        if self._impurity - best_impurity > 0.0:
            self._split_point = best_split_point
            self._p = best_p
            self._q = best_q
            self._similarities = np.array(similarities, dtype=np.float32)

            e = 1e-9
            # Left- and right-hand side partitioning
            lhs_idxs = np.nonzero(
                self._similarities - self._split_point < e)[0]
            rhs_idxs = np.nonzero(
                self._similarities - self._split_point > -e)[0]

            if len(lhs_idxs) > 0 and len(rhs_idxs) > 0:
                params = self.get_params()
                params['depth'] += 1
                self._lhs = SimilarityTreeRegressor(**params).fit(
                    X[lhs_idxs], y[lhs_idxs], check_input=False)

                self._rhs = SimilarityTreeRegressor(**params).fit(
                    X[rhs_idxs], y[rhs_idxs], check_input=False)
            else:
                raise ValueError(
                    "Left- and right-hand-side indexes haven't been found, "
                    "even though the split had been found")

        # Split doesn't improve impurity, stop growing a tree
        else:
            self.is_fitted_ = True
            self._is_leaf = True
            return self

        return self
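
# --- Illustrative sketch (not part of the original tree) ---
# A numpy-only illustration of the left/right partitioning above: samples
# whose similarity is (within a small tolerance) below the split point go
# left, the rest go right; a sample exactly at the split point lands on both
# sides.
import numpy as np

similarities = np.array([0.1, 0.4, 0.4, 0.9])
split_point = 0.4
e = 1e-9
lhs_idxs = np.nonzero(similarities - split_point < e)[0]    # [0, 1, 2]
rhs_idxs = np.nonzero(similarities - split_point > -e)[0]   # [1, 2, 3]
print(lhs_idxs, rhs_idxs)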
Exemple #55
0
    def fit(self, X, y):
        """Prepare the DS model by setting the KNN algorithm and
        pre-processing the information required to apply the DS
        methods

        Parameters
        ----------
        X : array of shape = [n_samples, n_features]
            The input data.

        y : array of shape = [n_samples]
            class labels of each example in X.

        Returns
        -------
        self
        """
        self.random_state_ = check_random_state(self.random_state)

        # Check that the lengths of X and y are consistent.
        X, y = check_X_y(X, y)

        # Check if the pool of classifiers is None.
        # If yes, use a BaggingClassifier for the pool.
        if self.pool_classifiers is None:
            if len(X) < 2:
                raise ValueError('More than one sample is needed '
                                 'if the pool of classifiers is not informed.')

            # Split the dataset into training (for the base classifier) and
            # DSEL (for DS)
            X_train, X_dsel, y_train, y_dsel = train_test_split(
                X, y, test_size=self.DSEL_perc,
                random_state=self.random_state_)

            self.pool_classifiers_ = BaggingClassifier(
                random_state=self.random_state_)
            self.pool_classifiers_.fit(X_train, y_train)

        else:
            self._check_base_classifier_fitted()
            self.pool_classifiers_ = self.pool_classifiers
            X_dsel = X
            y_dsel = y

        self.n_classifiers_ = len(self.pool_classifiers_)

        # check if the input parameters are correct. Raise an error if the
        # generated_pool is not fitted or k < 1
        self._validate_parameters()

        # Check label encoder on the pool of classifiers
        self.check_label_encoder()

        self._setup_label_encoder(y)
        y_dsel = self.enc_.transform(y_dsel)
        self._set_dsel(X_dsel, y_dsel)

        # validate the value of k
        self._validate_k()
        self._set_region_of_competence_algorithm()
        self._fit_region_competence(X_dsel, y_dsel)

        # validate the IH
        if self.with_IH:
            self._validate_ih()
        return self
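
# --- Hedged usage sketch (not part of the original class) ---
# What the fit above does when no pool of classifiers is supplied, shown
# standalone with scikit-learn only: the data are split into a training part
# for a BaggingClassifier and a DSEL part for the dynamic selection step.
# The 0.5 test_size below stands in for the estimator's DSEL_perc parameter.
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = rng.randint(0, 2, size=100)
X_train, X_dsel, y_train, y_dsel = train_test_split(X, y, test_size=0.5,
                                                    random_state=0)
pool = BaggingClassifier(random_state=0).fit(X_train, y_train)
print(len(pool), X_dsel.shape)        # pool size and DSEL shape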
Exemple #56
0
def _initialize_components(n_components,
                           input,
                           y=None,
                           init='auto',
                           verbose=False,
                           random_state=None,
                           has_classes=True):
    """Returns the initial transformation to be used depending on the arguments.

  Parameters
  ----------
  n_components : int
    The number of components to take. (Note: it should have been checked
    before, meaning it should not be None and it should be a value in
    [1, X.shape[1]])

  input : array-like
    The input samples (can be tuples or regular samples).

  y : array-like or None
    The input labels (or not if there are no labels).

  init : string or numpy array, optional (default='auto')
    Initialization of the linear transformation. Possible options are
    'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape
    (n_features_a, n_features_b).

    'auto'
      Depending on ``n_components``, the most reasonable initialization
      will be chosen. If ``n_components <= n_classes`` we use 'lda' (see
      the description of 'lda' init), as it uses labels information. If
      not, but ``n_components < min(n_features, n_samples)``, we use 'pca',
      as it projects data onto meaningful directions (those of higher
      variance). Otherwise, we just use 'identity'.

    'pca'
      ``n_components`` principal components of the inputs passed
      to :meth:`fit` will be used to initialize the transformation.
      (See `sklearn.decomposition.PCA`)

    'lda'
      ``min(n_components, n_classes)`` most discriminative
      components of the inputs passed to :meth:`fit` will be used to
      initialize the transformation. (If ``n_components > n_classes``,
      the rest of the components will be zero.) (See
      `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`).
      This initialization is possible only if `has_classes == True`.

    'identity'
      The identity matrix. If ``n_components`` is strictly smaller than the
      dimensionality of the inputs passed to :meth:`fit`, the identity
      matrix will be truncated to the first ``n_components`` rows.

    'random'
      The initial transformation will be a random array of shape
      `(n_components, n_features)`. Each value is sampled from the
      standard normal distribution.

    numpy array
      n_features_b must match the dimensionality of the inputs passed to
      :meth:`fit` and n_features_a must be less than or equal to that.
      If ``n_components`` is not None, n_features_a must match it.

  verbose : bool
    Whether to print the details of the initialization or not.

  random_state : int or `numpy.RandomState` or None, optional (default=None)
    A pseudo random number generator object or a seed for it if int. If
    ``init='random'``, ``random_state`` is used to initialize the random
    transformation. If ``init='pca'``, ``random_state`` is passed as an
    argument to PCA when initializing the transformation.

  has_classes : bool (default=True)
    Whether the labels are in fact classes. If true, this will allow to use
    the 'lda' initialization.

  Returns
  -------
  init_components : `numpy.ndarray`
    The initial transformation to use.
  """
    # if we are doing a regression we cannot use lda:
    n_features = input.shape[-1]
    authorized_inits = ['auto', 'pca', 'identity', 'random']
    if has_classes:
        authorized_inits.append('lda')

    if isinstance(init, np.ndarray):
        # we copy the array so that if we later update the metric we do not
        # also update the init
        init = check_array(init, copy=True)

        # Assert that init.shape[1] = X.shape[1]
        if init.shape[1] != n_features:
            raise ValueError(
                'The input dimensionality ({}) of the given '
                'linear transformation `init` must match the '
                'dimensionality of the given inputs `X` ({}).'.format(
                    init.shape[1], n_features))

        # Assert that init.shape[0] <= init.shape[1]
        if init.shape[0] > init.shape[1]:
            raise ValueError(
                'The output dimensionality ({}) of the given '
                'linear transformation `init` cannot be '
                'greater than its input dimensionality ({}).'.format(
                    init.shape[0], init.shape[1]))

        # Assert that self.n_components = init.shape[0]
        if n_components != init.shape[0]:
            raise ValueError('The preferred dimensionality of the '
                             'projected space `n_components` ({}) does'
                             ' not match the output dimensionality of '
                             'the given linear transformation '
                             '`init` ({})!'.format(n_components,
                                                   init.shape[0]))
    elif init not in authorized_inits:
        raise ValueError(
            "`init` must be '{}' "
            "or a numpy array of shape (n_components, n_features).".format(
                "', '".join(authorized_inits)))

    random_state = check_random_state(random_state)
    if isinstance(init, np.ndarray):
        return init
    n_samples = input.shape[0]
    if init == 'auto':
        if has_classes:
            n_classes = len(np.unique(y))
        else:
            n_classes = -1
        init = _auto_select_init(has_classes, n_features, n_samples,
                                 n_components, n_classes)
    if init == 'identity':
        return np.eye(n_components, input.shape[-1])
    elif init == 'random':
        return random_state.randn(n_components, input.shape[-1])
    elif init in {'pca', 'lda'}:
        init_time = time.time()
        if init == 'pca':
            pca = PCA(n_components=n_components, random_state=random_state)
            if verbose:
                print('Finding principal components... ')
                sys.stdout.flush()
            pca.fit(input)
            transformation = pca.components_
        elif init == 'lda':
            lda = LinearDiscriminantAnalysis(n_components=n_components)
            if verbose:
                print('Finding most discriminative components... ')
                sys.stdout.flush()
            lda.fit(input, y)
            transformation = lda.scalings_.T[:n_components]
        if verbose:
            print('done in {:5.2f}s'.format(time.time() - init_time))
        return transformation
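
# Illustrative only (not part of the snippet above): a minimal sketch of the
# shapes the initialization branches return. Both 'identity' and 'random'
# yield an (n_components, n_features) matrix, which is also what the shape
# checks on a user-supplied `init` array enforce. The names below are made up.
import numpy as np
from sklearn.utils import check_random_state

n_features, n_components = 5, 2
rng = check_random_state(42)

identity_init = np.eye(n_components, n_features)    # 'identity' branch
random_init = rng.randn(n_components, n_features)    # 'random' branch
assert identity_init.shape == random_init.shape == (n_components, n_features)
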
def test_sample_weight_invariance(n_samples=50):
    random_state = check_random_state(0)
    # regression
    y_true = random_state.random_sample(size=(n_samples, ))
    y_pred = random_state.random_sample(size=(n_samples, ))
    for name in ALL_METRICS:
        if name not in REGRESSION_METRICS:
            continue
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue
        metric = ALL_METRICS[name]
        yield _named_check(check_sample_weight_invariance, name), name,\
            metric, y_true, y_pred

    # binary
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(n_samples, ))
    y_pred = random_state.randint(0, 2, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples, ))
    for name in ALL_METRICS:
        if name in REGRESSION_METRICS:
            continue
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT
                or name in METRIC_UNDEFINED_BINARY):
            continue
        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield _named_check(check_sample_weight_invariance, name), name,\
                  metric, y_true, y_score
        else:
            yield _named_check(check_sample_weight_invariance, name), name,\
                  metric, y_true, y_pred

    # multiclass
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 5, size=(n_samples, ))
    y_pred = random_state.randint(0, 5, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples, 5))
    for name in ALL_METRICS:
        if name in REGRESSION_METRICS:
            continue
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT
                or name in METRIC_UNDEFINED_BINARY_MULTICLASS):
            continue
        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield _named_check(check_sample_weight_invariance, name), name,\
                  metric, y_true, y_score
        else:
            yield _named_check(check_sample_weight_invariance, name), name,\
                  metric, y_true, y_pred

    # multilabel indicator
    _, ya = make_multilabel_classification(n_features=1,
                                           n_classes=20,
                                           random_state=0,
                                           n_samples=100,
                                           allow_unlabeled=False)
    _, yb = make_multilabel_classification(n_features=1,
                                           n_classes=20,
                                           random_state=1,
                                           n_samples=100,
                                           allow_unlabeled=False)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    y_score = random_state.randint(1, 4, size=y_true.shape)

    for name in (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                 MULTIOUTPUT_METRICS):
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue

        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield (_named_check(check_sample_weight_invariance,
                                name), name, metric, y_true, y_score)
        else:
            yield (_named_check(check_sample_weight_invariance,
                                name), name, metric, y_true, y_pred)
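
# The invariance exercised above can be summarised in a self-contained sketch
# (accuracy_score is used only as a convenient stand-in for any metric that
# accepts sample_weight): unit weights must not change the score, and integer
# weights must behave like repeating the corresponding samples.
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import check_random_state

rng = check_random_state(0)
y_true = rng.randint(0, 2, size=50)
y_pred = rng.randint(0, 2, size=50)

# Unit weights leave the score unchanged.
assert np.isclose(accuracy_score(y_true, y_pred),
                  accuracy_score(y_true, y_pred, sample_weight=np.ones(50)))

# Integer weights act like sample repetition.
w = rng.randint(1, 4, size=50)
assert np.isclose(accuracy_score(np.repeat(y_true, w), np.repeat(y_pred, w)),
                  accuracy_score(y_true, y_pred, sample_weight=w))
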
    def fit(self, X, y, check_input=True):
        """Build a similarity tree classifier from the training set (X, y).
               Parameters
               ----------
               X : array-like of any type, as long as suitable similarity function is provided
                   The training input samples.
               y : array-like, shape = [n_samples]
                   The labels.

               Returns
               -------
               self : object
        """
        # Check input
        if check_input:
            # Check that X and y have correct shape
            X, y = check_X_y(X, y)

            # Input validation, check it to be a non-empty 2D array containing only finite values
            X = check_array(X)

            # Check if provided similarity function applies to input
            X = self._validate_X_predict(X, check_input)

        y = np.atleast_1d(y)
        is_classification = is_classifier(self)

        if is_classification:
            check_classification_targets(y)
            y = np.copy(y)

        if self.classes is None:
            self.classes_ = unique_labels(y)
        else:
            self.classes_ = self.classes
        self.n_classes_ = len(self.classes_)

        # Check parameters
        if self.random_state is not None:
            random_state = check_random_state(self.random_state)
        else:
            random_state = np.random.RandomState()

        if not isinstance(self.n_directions, int):
            raise ValueError('n_directions parameter must be an int')

        self._lhs = None
        self._rhs = None
        self._p = None
        self._q = None
        self._similarities = []
        self._split_point = -np.inf
        self._value = None
        self._is_leaf = False
        self.is_fitted_ = False
        self.n_ = len(y)

        # Node id: the object's identity is used as a unique identifier, since
        # nodes are not stored in a shared registry here.
        self._node_id = id(self)

        # Value of prediction: per-class probability estimates
        probs = np.ones(shape=self.n_classes_)
        for i, c in enumerate(self.classes_):
            count = np.where(y == c)[0].size
            # A tiny epsilon keeps every class probability strictly positive
            probs[i] = count / len(y) + 0.000000001

        self._value = probs
        self._class_prediction = self.classes_[np.argmax(self._value)]

        if not 1.0 - 0.00001 <= self._value.sum() <= 1.0 + 0.00001:
            raise ValueError('Wrong node class probability values.')

        # Return leaf node value
        if self._is_pure(y):
            self._is_leaf = True
            return self

        if len(y) == 1:
            self._is_leaf = True
            return self

        if self.max_depth is not None:
            if self.depth == self.max_depth:
                self._is_leaf = True
                return self

        # Sample n_direction discriminative split directions and find the best one
        best_impurity = 1.0
        best_split_point = -np.inf
        best_p = None
        best_q = None
        similarities = []
        for i, j in self._sample_directions(random_state, y,
                                            self.n_directions):

            impurity, split_point, curr_similarities = find_split(
                X, y, X[i], X[j], 'gini', self.sim_function, gamma=self.gamma)
            if impurity < best_impurity:
                best_impurity = impurity
                best_split_point = split_point
                best_p = X[i]
                best_q = X[j]
                similarities = curr_similarities

        if best_impurity < 1.0:
            self._split_point = best_split_point
            self._p = best_p
            self._q = best_q
            self._similarities = np.array(similarities)
            self._impurity = best_impurity

            # Left- and right-hand side partitioning
            lhs_idxs = np.nonzero(self._similarities <= self._split_point)[0]
            rhs_idxs = np.nonzero(self._similarities > self._split_point)[0]

            if len(lhs_idxs) > 0 and len(rhs_idxs) > 0:
                params = self.get_params()
                params['depth'] += 1
                params['classes'] = self.classes_

                self._lhs = SimilarityTreeClassifier(**params).fit(
                    X[lhs_idxs], y[lhs_idxs], check_input=False)
                self._rhs = SimilarityTreeClassifier(**params).fit(
                    X[rhs_idxs], y[rhs_idxs], check_input=False)

            else:
                self._is_leaf = True
                return self

        self.is_fitted_ = True
        return self
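
# Standalone sketch (not part of the class above) of the partition step used
# in fit: once per-sample similarity values and a split point are known,
# samples are routed to the left or right child; if either side is empty the
# node becomes a leaf. The numbers below are made up.
import numpy as np

similarities = np.array([0.1, 0.7, 0.4, 0.9])
split_point = 0.5
lhs_idxs = np.nonzero(similarities <= split_point)[0]   # array([0, 2])
rhs_idxs = np.nonzero(similarities > split_point)[0]    # array([1, 3])
assert len(lhs_idxs) > 0 and len(rhs_idxs) > 0
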
Exemple #59
0
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. TSF has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.
        unequal : bool
            Flag to adjust the fitting to account for unequal length series

        Returns
        -------
        self : object
        """
        # Ideally the fallback below would only be used for unequal-length
        # series; for now a try/except distinguishes the two cases.
        try:
            X, y = check_X_y(
                X,
                y,
                enforce_univariate=not self.capabilities["multivariate"],
                coerce_to_numpy=True,
            )
            X = X.squeeze(1)
            # Number of instances, length of each series
            n_instances, self.series_length = X.shape

            rng = check_random_state(self.random_state)

            self.n_classes = np.unique(y).shape[0]

            self.classes_ = class_distribution(np.asarray(y).reshape(-1,
                                                                     1))[0][0]
            self.n_intervals = int(math.sqrt(self.series_length))
            if self.n_intervals == 0:
                self.n_intervals = 1
            if self.series_length < self.min_interval:
                self.min_interval = self.series_length

            self.intervals_ = [
                _get_intervals(self.n_intervals, self.min_interval,
                               self.series_length, rng)
                for _ in range(self.n_estimators)
            ]
            self.unequal = False

        # TODO: change this to handle a more specific error
        except ValueError:
            self.unequal = True
            n_instances = X.shape[0]
            rng = check_random_state(self.random_state)
            self.n_classes = np.unique(y).shape[0]
            self.classes_ = class_distribution(np.asarray(y).reshape(-1,
                                                                     1))[0][0]

            self.intervals_ = []
            for i in range(self.n_estimators):
                series_length = X.iloc[i % X.size][0].size
                n_intervals = int(math.sqrt(series_length))
                if n_intervals == 0:
                    n_intervals = 1
                if series_length < self.min_interval:
                    self.min_interval = series_length
                self.intervals_.append(
                    _get_intervals(n_intervals, self.min_interval,
                                   series_length, rng))

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(
                _fit_estimator)(X, y, self.base_estimator, self.intervals_[i],
                                self.random_state, self.unequal)
            for i in range(self.n_estimators))
        self._is_fitted = True
        return self
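
# _get_intervals is not shown in this snippet. A hypothetical sketch of what
# such an interval sampler might look like (the actual implementation may
# differ) is given below; it draws n_intervals random (start, end) pairs,
# each at least min_interval long and fully contained in the series.
import numpy as np

def sample_intervals(n_intervals, min_interval, series_length, rng):
    intervals = np.empty((n_intervals, 2), dtype=int)
    for k in range(n_intervals):
        start = rng.randint(series_length - min_interval + 1)
        length = rng.randint(min_interval, series_length - start + 1)
        intervals[k] = (start, start + length)
    return intervals
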
    def fit(self, X, y, check_input=True):
        """Build a forest of trees from the training set (X, y)
            Parameters
            ----------
            X : array-like matrix of shape = [n_samples, n_features]
                The training data samples.
            y : array-like matrix of shape = [n_samples,]
                The training data labels.
            Returns
            -------
            self : object.
        """
        # Check input
        if check_input:
            # Check that X and y have correct shape
            X, y = check_X_y(X, y)

            # Input validation, check it to be a non-empty 2D array containing only finite values
            X = check_array(X)

            # Check if provided similarity function applies to input
            X = self._validate_X_predict(X, check_input)

        y = np.atleast_1d(y)
        is_classification = is_classifier(self)

        if is_classification:
            check_classification_targets(y)
            y = np.copy(y)

        y = np.array(y)
        self.classes_ = unique_labels(y)
        self.n_classes_ = self.classes_.shape[0]
        self.base_estimator_ = SimilarityTreeClassifier

        # Check parameters
        if self.random_state is not None:
            random_state = check_random_state(self.random_state)
        else:
            random_state = np.random.RandomState()

        if not isinstance(self.n_directions, int):
            raise ValueError('n_directions parameter must be an int')

        # Default similarity functions: dot product or rbf kernel
        if self.sim_function == 'dot':
            self.sim_function = dot_product
        elif self.sim_function == 'rbf':
            self.sim_function = rbf

        self.oob_score_ = 0.0

        self.estimators_ = []
        for i in range(self.n_estimators):

            if self.bootstrap:
                all_idxs = range(len(y))
                idxs = random_state.choice(all_idxs, len(y), replace=True)
                tree = SimilarityTreeClassifier(classes=self.classes_,
                                                n_directions=self.n_directions,
                                                sim_function=self.sim_function,
                                                random_state=self.random_state,
                                                gamma=self.gamma)
                tree.fit(X[idxs], y[idxs], check_input=False)

                self.estimators_.append(tree)

                if self.oob_score:
                    idxs_oob = np.setdiff1d(np.array(range(y.size)), idxs)
                    self.oob_score_ += tree.score(X[idxs_oob], y[idxs_oob])

            else:
                tree = SimilarityTreeClassifier(classes=self.classes_,
                                                n_directions=self.n_directions,
                                                sim_function=self.sim_function,
                                                random_state=self.random_state,
                                                max_depth=self.max_depth,
                                                gamma=self.gamma)
                tree.fit(X, y, check_input=False)

                self.estimators_.append(tree)

        if self.oob_score:
            self.oob_score_ /= self.n_estimators

        assert len(self.estimators_) == self.n_estimators
        self.is_fitted_ = True
        return self
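
# Standalone sketch (not part of the class above) of the bootstrap and
# out-of-bag bookkeeping used in fit: each tree sees a with-replacement
# sample, the out-of-bag set is everything not drawn, and the per-tree OOB
# scores are averaged over n_estimators.
import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(0)
n_samples = 10
idxs = rng.choice(range(n_samples), n_samples, replace=True)  # bootstrap sample
idxs_oob = np.setdiff1d(np.arange(n_samples), idxs)           # out-of-bag rows
# tree.fit(X[idxs], y[idxs]); oob_score_ += tree.score(X[idxs_oob], y[idxs_oob])
# ...repeated for every tree, then oob_score_ /= n_estimators
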