Beispiel #1
0
def test_sample_weight_invariance(n_samples=50):
    """Yield sample-weight invariance checks for every weighted metric.

    Generator-style test.  Each yielded item is the tuple
    ``(check, name, metric, y_true, y_other)`` where ``y_other`` is the
    score array for names in ``THRESHOLDED_METRICS`` and the prediction
    array otherwise.  Binary, multiclass and multilabel-indicator targets
    are covered, in that order.

    Parameters
    ----------
    n_samples : int, default 50
        Number of samples drawn for the binary and multiclass cases.
    """
    # binary
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(n_samples, ))
    y_pred = random_state.randint(0, 2, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples,))
    for name in ALL_METRICS:
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_BINARY):
            continue
        metric = ALL_METRICS[name]
        # Thresholded metrics consume scores, all others consume labels.
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)

    # multiclass
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 5, size=(n_samples, ))
    y_pred = random_state.randint(0, 5, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples, 5))
    for name in ALL_METRICS:
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_BINARY_MULTICLASS):
            continue
        metric = ALL_METRICS[name]
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)

    # multilabel indicator
    _, ya = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=0, n_samples=100,
                                           allow_unlabeled=False)
    _, yb = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=1, n_samples=100,
                                           allow_unlabeled=False)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    # Keeps drawing from the generator seeded in the multiclass section.
    y_score = random_state.randint(1, 4, size=y_true.shape)

    for name in (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                 MULTIOUTPUT_METRICS):
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue

        metric = ALL_METRICS[name]
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)
def test_normalize_option_multilabel_classification():
    """Check the ``normalize`` option of metrics on multilabel targets.

    For every name in ``METRICS_WITH_NORMALIZE_OPTION`` the un-normalized
    value divided by ``n_samples`` must match the normalized value, and the
    normalized value must be strictly positive.
    """
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100

    # for both random_state 0 and 1, y_true and y_pred has at least one
    # unlabelled entry
    shared_kwargs = dict(n_features=1, n_classes=n_classes,
                         allow_unlabeled=True, n_samples=n_samples)
    _, y_true = make_multilabel_classification(random_state=0,
                                               **shared_kwargs)
    _, y_pred = make_multilabel_classification(random_state=1,
                                               **shared_kwargs)

    # To make sure at least one empty label is present
    # NOTE(review): if these targets are ndarrays, adding a row of zeros
    # broadcasts and leaves the values unchanged -- confirm the intent.
    y_true += [0]*n_classes
    y_pred += [0]*n_classes

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metric = ALL_METRICS[name]
        normalized = metric(y_true, y_pred, normalize=True)
        assert_greater(normalized, 0,
                       msg="We failed to test correctly the normalize option")
        raw_total = metric(y_true, y_pred, normalize=False)
        assert_almost_equal(raw_total / n_samples, normalized,
                            err_msg="Failed with %s" % name)
def test_multilabel_classification_report():
    """Compare ``classification_report`` output with a stored text report.

    Targets and predictions are two multilabel datasets generated with
    seeds 0 and 1; the report string must match exactly.
    """
    n_classes = 4
    n_samples = 50

    generator_kwargs = dict(n_features=1, n_samples=n_samples,
                            n_classes=n_classes)
    _, y_true = make_multilabel_classification(random_state=0,
                                               **generator_kwargs)
    _, y_pred = make_multilabel_classification(random_state=1,
                                               **generator_kwargs)

    expected_report = """\
             precision    recall  f1-score   support

          0       0.50      0.67      0.57        24
          1       0.51      0.74      0.61        27
          2       0.29      0.08      0.12        26
          3       0.52      0.56      0.54        27

avg / total       0.45      0.51      0.46       104
"""

    assert_equal(classification_report(y_true, y_pred), expected_report)
Beispiel #4
0
def test_normalize_option_multilabel_classification():
    """Check the ``normalize`` option for two multilabel representations.

    Each metric in ``METRICS_WITH_NORMALIZE_OPTION`` is evaluated first on
    the generated label collections and then on their ``LabelBinarizer``
    transform; in both cases the un-normalized value divided by the sample
    count must match the normalized value.
    """
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100
    generator_kwargs = dict(n_features=1, n_classes=n_classes,
                            n_samples=n_samples)
    _, y_true = make_multilabel_classification(random_state=0,
                                               **generator_kwargs)
    _, y_pred = make_multilabel_classification(random_state=1,
                                               **generator_kwargs)

    # Be sure to have at least one empty label
    y_true += ([],)
    y_pred += ([],)
    n_samples += 1

    lb = LabelBinarizer().fit([range(n_classes)])
    representations = (
        # List of list of labels
        (y_true, y_pred),
        # Indicator matrix format
        (lb.transform(y_true), lb.transform(y_pred)),
    )

    for name, metrics in METRICS_WITH_NORMALIZE_OPTION.items():
        for targets, predictions in representations:
            measure = metrics(targets, predictions, normalize=True)
            assert_greater(
                measure, 0,
                msg="We failed to test correctly the normalize option")
            raw_total = metrics(targets, predictions, normalize=False)
            assert_almost_equal(raw_total / n_samples, measure,
                                err_msg="Failed with %s" % name)
Beispiel #5
0
def test_multilabel_representation_invariance():
    """Check multilabel metrics agree on dense and sparse COO inputs.

    Every metric named in ``MULTILABELS_METRICS`` is evaluated on the
    generated targets and on their ``sp.coo_matrix`` wrappers; both values
    must be almost equal.
    """
    # Generate some data
    n_classes = 4
    n_samples = 50

    generator_kwargs = dict(n_features=1, n_classes=n_classes,
                            n_samples=n_samples, allow_unlabeled=True)
    _, y1 = make_multilabel_classification(random_state=0,
                                           **generator_kwargs)
    _, y2 = make_multilabel_classification(random_state=1,
                                           **generator_kwargs)

    # To make sure at least one empty label is present
    # NOTE(review): on ndarrays this adds a broadcast row of zeros and so
    # leaves the values unchanged -- confirm the intent.
    y1 += [0] * n_classes
    y2 += [0] * n_classes

    y1_sparse_indicator = sp.coo_matrix(y1)
    y2_sparse_indicator = sp.coo_matrix(y2)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = "tmp"
            metric.__name__ = name

        dense_value = metric(y1, y2)
        sparse_value = metric(y1_sparse_indicator, y2_sparse_indicator)

        # Check representation invariance
        assert_almost_equal(
            sparse_value,
            dense_value,
            err_msg="%s failed representation invariance  "
                    "between dense and sparse indicator "
                    "formats." % name,
        )
def benchmark(metrics=tuple(v for k, v in sorted(METRICS.items())),
              formats=tuple(v for k, v in sorted(FORMATS.items())),
              samples=1000, classes=4, density=.2,
              n_times=5):
    """Time metric calls over a grid of multilabel inputs.

    Parameters
    ----------
    metrics : array-like of callables (1d or 0d)
        The metric functions to time.

    formats : array-like of callables (1d or 0d)
        Converters applied to the generated indicator targets before they
        are handed to the metrics.

    samples : array-like of ints (1d or 0d)
        The number of samples to generate as input.

    classes : array-like of ints (1d or 0d)
        The number of classes in the input.

    density : array-like of floats (1d or 0d)
        The density of positive labels in the input; ``density * classes``
        is passed as ``n_labels`` to the data generator.

    n_times : int
        Number of calls timed per measurement (``number`` of ``timeit``).

    Returns
    -------
    array of floats shaped like (metrics, formats, samples, classes, density)
        Time in seconds.
    """
    metrics = np.atleast_1d(metrics)
    formats = np.atleast_1d(formats)
    samples = np.atleast_1d(samples)
    classes = np.atleast_1d(classes)
    density = np.atleast_1d(density)

    result_shape = (len(metrics), len(formats), len(samples), len(classes),
                    len(density))
    out = np.zeros(result_shape, dtype=float)

    grid = itertools.product(samples, classes, density)
    for flat_idx, (s, c, d) in enumerate(grid):
        generator_kwargs = dict(n_samples=s, n_features=1, n_classes=c,
                                n_labels=d * c, return_indicator=True)
        _, y_true = make_multilabel_classification(random_state=42,
                                                   **generator_kwargs)
        _, y_pred = make_multilabel_classification(random_state=84,
                                                   **generator_kwargs)
        for fmt_idx, convert in enumerate(formats):
            converted_true = convert(y_true)
            converted_pred = convert(y_pred)
            for metric_idx, metric in enumerate(metrics):
                timed_call = partial(metric, converted_true, converted_pred)
                elapsed = timeit(timed_call, number=n_times)

                # The trailing (samples, classes, density) axes are filled
                # in the flattened order produced by the product above.
                out[metric_idx, fmt_idx].flat[flat_idx] = elapsed
    return out
def test_ovr_fit_predict_sparse():
    """Check OneVsRestClassifier on targets wrapped in scipy sparse formats.

    For each sparse format the predictions must be sparse and equal to the
    predictions obtained from the unwrapped targets; ``predict_proba`` and
    ``decision_function`` thresholds must agree with ``predict``.
    """
    sparse_formats = [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix,
                      sp.dok_matrix, sp.lil_matrix]
    for to_sparse in sparse_formats:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3,
            length=50, allow_unlabeled=True, random_state=0
        )

        X_train, X_test = X[:80], X[80:]
        Y_train = Y[:80]

        dense_clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        dense_pred = dense_clf.predict(X_test)

        sparse_clf = OneVsRestClassifier(base_clf).fit(X_train,
                                                       to_sparse(Y_train))
        sparse_pred = sparse_clf.predict(X_test)

        assert_true(dense_clf.multilabel_)
        assert_true(sp.issparse(sparse_pred))
        assert_array_equal(sparse_pred.toarray(), dense_pred)

        # Test predict_proba
        probabilities = sparse_clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        assert_array_equal(probabilities > 0.5, sparse_pred.toarray())

        # Test decision_function
        sparse_clf = OneVsRestClassifier(svm.SVC()).fit(X_train,
                                                        to_sparse(Y_train))
        decisions = sparse_clf.decision_function(X_test)
        thresholded = (decisions > 0).astype(int)
        assert_array_equal(thresholded, sparse_clf.predict(X_test).toarray())
Beispiel #8
0
def test_ovr_multilabel_predict_proba():
    """Check ``predict_proba`` of OneVsRestClassifier on multilabel data.

    Runs once with and once without unlabeled samples.  Wrapped estimators
    lacking ``predict_proba`` must raise ``AttributeError``; for the
    MultinomialNB base, thresholding the probabilities at 0.5 must
    reproduce ``predict``.
    """
    base_clf = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=allow_unlabeled,
            return_indicator=True,
            random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # decision function only estimator. Fails in current implementation.
        regressor_based = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert_raises(AttributeError, regressor_based.predict_proba, X_test)

        # Estimator with predict_proba disabled, depending on parameters.
        no_proba = OneVsRestClassifier(svm.SVC(probability=False))
        no_proba.fit(X_train, Y_train)
        assert_raises(AttributeError, no_proba.predict_proba, X_test)

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        assert_array_equal(Y_proba > .5, Y_pred)
Beispiel #9
0
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    """Compare the LRAP score with the ``_my_lrap`` reference.

    ``label_ranking_average_precision_score`` and ``_my_lrap`` must agree
    on a sparse random score matrix (which contains ties) and on uniformly
    drawn scores.

    NOTE(review): the ``lrap_score`` argument is never used in this body;
    confirm whether callers rely on that.
    """
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    def _assert_implementations_agree(scores):
        # Same evaluation order as before: library score, then reference.
        library_value = label_ranking_average_precision_score(y_true, scores)
        reference_value = _my_lrap(y_true, scores)
        assert_almost_equal(library_value, reference_value)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)
    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    _assert_implementations_agree(y_score)

    # Uniform score
    rng = check_random_state(random_state)
    _assert_implementations_agree(rng.uniform(size=(n_samples, n_classes)))
 def get_multilabel(self):
     """Return a seeded ``make_multilabel_classification`` dataset.

     The generator is called with 100 samples, 10 features, 5 classes,
     ``n_labels=5``, ``return_indicator=True`` and ``random_state=1``; its
     return value is passed through unchanged.
     """
     generator_kwargs = dict(n_samples=100,
                             n_features=10,
                             n_classes=5,
                             n_labels=5,
                             return_indicator=True,
                             random_state=1)
     return make_multilabel_classification(**generator_kwargs)
Beispiel #11
0
 def testMultiClassification(self):
     """TODO(ilblackdragon): Implement multi-output classification.

     Currently only seeds ``random`` and generates a 5-class multilabel
     dataset; nothing is asserted.
     """
     random.seed(42)
     n_classes = 5
     generated = datasets.make_multilabel_classification(
         n_classes=n_classes, random_state=42)
     X, y = generated
    def get_codes(self):
        """Generate label sets and their binarized codes, then dump both.

        Stores the generated labels on ``self.classifier_labels`` and the
        ``LabelBinarizer`` output on ``self.classifier_error_codes``,
        prints both, and writes them to the file ``ecoc_classifiers`` in
        the current working directory: every label row, one newline, then
        every code row.  Each row is written as ``[a,b,...,]``.

        The file is written inside a ``with`` block so the handle is closed
        even if a write fails.
        """
        def _format_row(row):
            # The trailing comma before "]" is part of the existing format.
            return '[' + ''.join(str(label) + ',' for label in row) + ']'

        _, Y = make_multilabel_classification(n_samples=15, n_labels=8,
                                              n_classes=8, random_state=0)
        self.classifier_labels = Y
        self.classifier_error_codes = LabelBinarizer().fit_transform(Y)
        # Parenthesised single-argument print behaves the same as the old
        # print statement and also parses on Python 3.
        print(self.classifier_labels)
        print(self.classifier_error_codes)

        with open('ecoc_classifiers', 'w') as f:
            for row in self.classifier_labels:
                f.write(_format_row(row))
            # A single newline separates the labels from the codes.
            f.write('\n')

            for row in self.classifier_error_codes:
                f.write(_format_row(row))
        return
def test_sparse_input():
    """Yield a sparse-input check per forest estimator and sparse format.

    Each yielded tuple is ``(check_sparse_input, name, X, X_sparse, y)``
    with ``X_sparse`` built by CSR, CSC and COO constructors in turn.
    """
    X, y = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50)

    sparse_formats = (csr_matrix, csc_matrix, coo_matrix)
    for name in FOREST_ESTIMATORS:
        for sparse_matrix in sparse_formats:
            yield check_sparse_input, name, X, sparse_matrix(X), y
Beispiel #14
0
def test_random_hasher_sparse_data():
    """Check RandomTreesEmbedding output matches for dense and CSC input."""
    X, y = datasets.make_multilabel_classification(return_indicator=True,
                                                   random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    # Dense fit first, then the CSC-wrapped copy, on the same hasher object.
    from_dense = hasher.fit_transform(X)
    from_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(from_sparse.toarray(), from_dense.toarray())
def test_output_transformer():
    """Check how forests propagate ``random_state`` to output transformers.

    With an unseeded ``GaussianRandomProjection`` every sub-estimator must
    end up with a distinct transformer ``random_state``; with a
    ``FixedStateTransformer`` wrapper all of them must equal 0.
    """
    X, y = datasets.make_multilabel_classification(return_indicator=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    def _fit_and_collect_states(ForestEstimator, transformer):
        # Fit, verify the prediction shape, and return the estimator along
        # with the transformer random_state of every sub-estimator.
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)
        states = [sub.output_transformer_.random_state
                  for sub in est.estimators_]
        return est, states

    # Check that random_state are different
    transformer = GaussianRandomProjection(n_components=5, random_state=None)
    for ForestEstimator in FOREST_ESTIMATORS.values():
        est, states = _fit_and_collect_states(ForestEstimator, transformer)
        assert_equal(len(set(states)), est.n_estimators)

    # Check that random_state are equals
    transformer = FixedStateTransformer(GaussianRandomProjection(
        n_components=5), random_seed=0)
    for ForestEstimator in FOREST_ESTIMATORS.values():
        est, states = _fit_and_collect_states(ForestEstimator, transformer)
        assert_equal(len(set(states)), 1)
        assert_equal(states[0], 0)
def test_grid_search_with_multioutput_data():
    """Check search objects report correct scores on multi-output targets.

    For both a tree regressor and a tree classifier, every per-fold score
    stored in ``grid_scores_`` must match the score obtained by refitting
    the estimator by hand on the same fold.
    """
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(y.shape[0], random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    def _check_search(search, est):
        # Fit the search, then recompute every fold score manually.
        search.fit(X, y)
        for parameters, _, cv_validation_scores in search.grid_scores_:
            est.set_params(**parameters)

            for fold, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[fold])

    # Test with grid search cv
    for est in estimators:
        _check_search(GridSearchCV(est, est_parameters, cv=cv), est)

    # Test with a randomized search
    for est in estimators:
        _check_search(
            RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3), est)
def test_multilabel_classification():
    """Test that multi-label classification works as expected.

    Fits an ``ELMClassifier`` on generated indicator targets and requires a
    score above 0.95 on the same data it was fitted on.
    """
    # test fit method
    features, targets = make_multilabel_classification(
        n_samples=50, random_state=0, return_indicator=True)
    classifier = ELMClassifier(weight_scale=100)
    classifier.fit(features, targets)
    training_score = classifier.score(features, targets)
    assert_greater(training_score, 0.95)
    def assertClassifierWorksWithSparsity(self, classifier, sparsity_indicator='sparse'):
        """Fit ``classifier`` on generated data and check the output shape.

        ``sparsity_indicator`` is forwarded as ``return_indicator`` to the
        data generator, and ``sparse`` is enabled only when it equals
        ``'sparse'``.  The prediction for the held-out split must have the
        same shape as the held-out targets.
        """
        wants_sparse = (sparsity_indicator == 'sparse')
        X, y = make_multilabel_classification(
            sparse=wants_sparse, return_indicator=sparsity_indicator)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)

        classifier.fit(X_train, y_train)
        predicted = classifier.predict(X_test)

        self.assertEqual(predicted.shape, y_test.shape)
def test_ovr_multilabel_decision_function():
    """Check positive decision values coincide with predicted labels."""
    X, Y = datasets.make_multilabel_classification(
        n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50,
        allow_unlabeled=True, random_state=0
    )
    X_train, X_test = X[:80], X[80:]
    Y_train = Y[:80]
    clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    thresholded = (clf.decision_function(X_test) > 0).astype(int)
    assert_array_equal(thresholded, clf.predict(X_test))
def test_sparse_input(EstimatorClass, sparse_matrix):
    """Run ``check_sparse_input`` for one estimator class and sparse format.

    NOTE(review): the generator outputs are unpacked as ``(y, X)``, i.e.
    swapped relative to the usual ``(X, y)`` order, and only the first
    column of ``y`` is kept.  This looks intentional -- confirm.
    """
    generated = datasets.make_multilabel_classification(random_state=0,
                                                        n_samples=50,
                                                        n_features=1,
                                                        n_classes=20)
    y, X = generated
    y = y[:, 0]

    X_sparse = sparse_matrix(X)
    check_sparse_input(EstimatorClass, X, X_sparse, y)
def test_make_multilabel_classification_return_indicator():
    """Check shapes and column sums with ``return_indicator=True``.

    Run once with and once without unlabeled samples; every column sum of
    ``Y`` must exceed the paired ``min_length`` (0 and 1 respectively).
    """
    cases = zip((True, False), (0, 1))
    for allow_unlabeled, min_length in cases:
        X, Y = make_multilabel_classification(
            n_samples=25, n_features=20, n_classes=3, random_state=0,
            return_indicator=True, allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (25, 20), "X shape mismatch")
        assert_equal(Y.shape, (25, 3), "Y shape mismatch")
        column_sums = np.sum(Y, axis=0)
        assert_true(np.all(column_sums > min_length))
Beispiel #22
0
def test_multilabel_sample_weight_invariance(name):
    """Run the sample-weight invariance check for one multilabel metric.

    Parameters
    ----------
    name : str
        Key into ``ALL_METRICS``.  Names in ``THRESHOLDED_METRICS`` are
        checked against integer scores, all others against predictions.
    """
    # multilabel indicator
    random_state = check_random_state(0)
    generator_kwargs = dict(n_features=1, n_classes=20, n_samples=100,
                            allow_unlabeled=False)
    _, ya = make_multilabel_classification(random_state=0,
                                           **generator_kwargs)
    _, yb = make_multilabel_classification(random_state=1,
                                           **generator_kwargs)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    y_score = random_state.randint(1, 4, size=y_true.shape)

    metric = ALL_METRICS[name]
    y_other = y_score if name in THRESHOLDED_METRICS else y_pred
    check_sample_weight_invariance(name, metric, y_true, y_other)
def test_make_multilabel_classification_return_indicator_sparse():
    """Check shapes and sparsity with ``return_indicator='sparse'``."""
    # min_length is unpacked for symmetry with the dense test but is unused.
    cases = zip((True, False), (0, 1))
    for allow_unlabeled, min_length in cases:
        X, Y = make_multilabel_classification(
            n_samples=25, n_features=20, n_classes=3, random_state=0,
            return_indicator='sparse', allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (25, 20), "X shape mismatch")
        assert_equal(Y.shape, (25, 3), "Y shape mismatch")
        assert_true(sp.issparse(Y))
Beispiel #24
0
def test_scorer_sample_weight():
    """Check scorers honour ``sample_weight`` or fail with a clear error.

    For every entry of ``SCORERS`` the weighted score must differ from the
    unweighted one and must match the score computed with the zero-weight
    samples removed.  A ``TypeError`` is accepted only if its message
    mentions ``sample_weight``.
    """
    # Test that scorers support sample_weight or raise sensible errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    (X_train, X_test, y_train, y_test,
     y_ml_train, y_ml_test) = train_test_split(X, y, y_ml, random_state=0)

    # The first ten test samples get zero weight.
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy="median")
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)

    # Later groups override earlier ones for names present in several.
    estimator = dict.fromkeys(REGRESSION_SCORERS, sensible_regr)
    estimator.update(dict.fromkeys(CLF_SCORERS, sensible_clf))
    estimator.update(dict.fromkeys(MULTILABEL_ONLY_SCORERS, sensible_ml_clf))

    for name, scorer in SCORERS.items():
        target = y_ml_test if name in MULTILABEL_ONLY_SCORERS else y_test
        try:
            weighted = scorer(estimator[name], X_test, target,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert_not_equal(
                weighted,
                unweighted,
                msg="scorer {0} behaves identically when "
                "called with sample weights: {1} vs "
                "{2}".format(name, weighted, unweighted),
            )
            assert_almost_equal(
                weighted,
                ignored,
                err_msg="scorer {0} behaves differently when "
                "ignoring samples and setting sample_weight to"
                " 0: {1} vs {2}".format(name, weighted, ignored),
            )

        except TypeError as e:
            complaint = str(e)
            assert_true(
                "sample_weight" in complaint,
                "scorer {0} raises unhelpful exception when called "
                "with sample weights: {1}".format(name, complaint),
            )
 def test_class_type(self):
     """
     Test class must be either binary or multiclass type

     A random forest fitted on multilabel targets must make the visualizer
     raise ``YellowbrickValueError`` during construction or scoring.
     """
     features, targets = make_multilabel_classification()
     forest = RandomForestClassifier()
     forest.fit(features, targets)
     with self.assertRaises(YellowbrickValueError):
         visualizer = ClassPredictionError(forest)
         visualizer.score(features, targets)
Beispiel #26
0
def test_output_transformer():
    """Check trees with an output transformer keep the target shape.

    Every estimator in ``ALL_TREES`` is fitted with a shared
    ``GaussianRandomProjection`` output transformer; predictions on the
    held-out split must have the same shape as the held-out targets.
    """
    X, y = datasets.make_multilabel_classification(return_indicator=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    transformer = GaussianRandomProjection(n_components=10)
    for TreeEstimator in ALL_TREES.values():
        est = TreeEstimator(random_state=0, output_transformer=transformer)
        est.fit(X_train, y_train)
        predicted = est.predict(X_test)
        assert_equal(predicted.shape, y_test.shape)
def test():
    """Fit a one-vs-rest LinearSVC on generated data and print its ROC AUC.

    Targets are binarized and their zeros recoded to -1 before fitting; the
    score is computed on the flattened training targets and predictions.
    """
    X, Y_list = make_multilabel_classification()
    Y = LabelBinarizer().fit_transform(Y_list)
    Y[Y == 0] = -1
    clf = OneVsRestClassifier(LinearSVC())
    # Alternative estimator kept for reference:
    # clf = MultilabelLR(L0=1, λ1=0.1, λ2=0.1, γ=0.1, μ=0.1)
    clf.fit(X, Y)
    Y_hat = clf.predict(X)

    auc = roc_auc_score(Y.flat, Y_hat.flat)
    print(auc)
def test_make_multilabel_classification():
    """Check the label collections from ``make_multilabel_classification``.

    Run once with and once without unlabeled samples: the smallest label
    set must have ``min_length`` entries and the largest at most 3; when
    unlabeled samples are disallowed the largest label value must be 2.
    """
    cases = zip((True, False), (0, 1))
    for allow_unlabeled, min_length in cases:
        X, Y = make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=3, random_state=0,
            allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (100, 20), "X shape mismatch")
        if not allow_unlabeled:
            largest_label = max([max(y) for y in Y])
            assert_equal(largest_label, 2)
        label_counts = [len(y) for y in Y]
        assert_equal(min(label_counts), min_length)
        assert_true(max(label_counts) <= 3)
def setup_module():
    """Create memory-mapped test data and the estimators built on it.

    Sets the module globals ``TEMP_FOLDER``, ``X_mm``, ``y_mm``,
    ``y_ml_mm`` and ``ESTIMATORS``.  The data is dumped with joblib into a
    fresh temporary folder and loaded back with ``mmap_mode='r'``.
    """
    # Create some memory mapped data
    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
    TEMP_FOLDER = tempfile.mkdtemp(prefix='sklearn_test_score_objects_')
    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
    _, y_ml = make_multilabel_classification(random_state=0,
                                             n_samples=X.shape[0])
    dump_path = os.path.join(TEMP_FOLDER, 'test_data.pkl')
    joblib.dump((X, y, y_ml), dump_path)
    X_mm, y_mm, y_ml_mm = joblib.load(dump_path, mmap_mode='r')
    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)
def test_make_multilabel_classification_return_indicator():
    """Check generator shapes, column sums and returned distributions.

    After the shape checks, the generator is called again with
    ``return_distributions=True``; data must be identical and the two
    extra outputs must have the expected shapes and sum to one.
    """
    base_kwargs = dict(n_samples=25, n_features=20, n_classes=3,
                       random_state=0)
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(
            allow_unlabeled=allow_unlabeled, **base_kwargs)
        assert_equal(X.shape, (25, 20), "X shape mismatch")
        assert_equal(Y.shape, (25, 3), "Y shape mismatch")
        column_sums = np.sum(Y, axis=0)
        assert_true(np.all(column_sums > min_length))

    # Also test return_distributions and return_indicator with True
    # X, Y and allow_unlabeled are the values left by the last iteration.
    with_distributions = make_multilabel_classification(
        allow_unlabeled=allow_unlabeled, return_distributions=True,
        **base_kwargs)
    X2, Y2, p_c, p_w_c = with_distributions

    assert_array_equal(X, X2)
    assert_array_equal(Y, Y2)
    assert_equal(p_c.shape, (3,))
    assert_almost_equal(p_c.sum(), 1)
    assert_equal(p_w_c.shape, (20, 3))
    assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
Beispiel #31
0
def test_ovr_fit_predict_sparse():
    """Check OneVsRestClassifier on targets wrapped in scipy sparse formats.

    For each sparse format the predictions must be sparse and equal to the
    predictions obtained from the unwrapped targets; ``predict_proba`` and
    ``decision_function`` thresholds must agree with ``predict``.
    """
    sparse_constructors = (sp.csr_matrix, sp.csc_matrix, sp.coo_matrix,
                           sp.dok_matrix, sp.lil_matrix)
    n_train = 80
    for as_sparse in sparse_constructors:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3,
            length=50, allow_unlabeled=True, random_state=0)

        X_train, Y_train = X[:n_train], Y[:n_train]
        X_test = X[n_train:]

        reference = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        reference_pred = reference.predict(X_test)

        clf_sprs = OneVsRestClassifier(base_clf).fit(X_train,
                                                     as_sparse(Y_train))
        sparse_pred = clf_sprs.predict(X_test)

        assert_true(reference.multilabel_)
        assert_true(sp.issparse(sparse_pred))
        assert_array_equal(sparse_pred.toarray(), reference_pred)

        # Test predict_proba
        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        above_half = clf_sprs.predict_proba(X_test) > .5
        assert_array_equal(above_half, sparse_pred.toarray())

        # Test decision_function
        clf_sprs = OneVsRestClassifier(svm.SVC()).fit(X_train,
                                                      as_sparse(Y_train))
        positive = (clf_sprs.decision_function(X_test) > 0).astype(int)
        assert_array_equal(positive, clf_sprs.predict(X_test).toarray())
Beispiel #32
0
def test_ovr_multilabel_predict_proba():
    """Check ``predict_proba`` availability and values for one-vs-rest.

    Runs once with and once without unlabeled samples.  ``predict_proba``
    must be absent when the wrapped estimator does not provide it, appear
    after fitting a grid search that enables it, and for the MultinomialNB
    base its 0.5 threshold must reproduce ``predict``.
    """
    base_clf = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3,
            length=50, allow_unlabeled=allow_unlabeled, random_state=0)
        X_train, X_test = X[:80], X[80:]
        Y_train = Y[:80]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # Decision function only estimator.
        regressor_based = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert not hasattr(regressor_based, 'predict_proba')

        # Estimator with predict_proba disabled, depending on parameters.
        no_proba = OneVsRestClassifier(svm.SVC(probability=False))
        assert not hasattr(no_proba, 'predict_proba')
        no_proba.fit(X_train, Y_train)
        assert not hasattr(no_proba, 'predict_proba')
        assert hasattr(no_proba, 'decision_function')

        # Estimator which can get predict_proba enabled after fitting
        search = GridSearchCV(svm.SVC(probability=False),
                              param_grid={'probability': [True]})
        proba_after_fit = OneVsRestClassifier(search)
        assert not hasattr(proba_after_fit, 'predict_proba')
        proba_after_fit.fit(X_train, Y_train)
        assert hasattr(proba_after_fit, 'predict_proba')

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        assert_array_equal(Y_proba > .5, Y_pred)
Beispiel #33
0
def test_grid_search_with_multioutput_data():
    """Check grid and randomized search scores on a multi-output target.

    For every parameter setting reported in ``grid_scores_`` the estimator
    is refitted by hand on each fold, and its score must match the per-fold
    validation score recorded by the search.
    """
    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    param_space = {"max_depth": [1, 2, 3, 4]}
    folds = KFold(y.shape[0], random_state=0)

    candidates = [
        DecisionTreeRegressor(random_state=0),
        DecisionTreeClassifier(random_state=0)
    ]

    def check_fold_scores(est, search):
        # Fit the search, then recompute every fold score with ``est``
        # configured to each reported parameter setting.
        search.fit(X, y)
        for params, _, fold_scores in search.grid_scores_:
            est.set_params(**params)
            for fold_no, (train, test) in enumerate(folds):
                est.fit(X[train], y[train])
                recomputed = est.score(X[test], y[test])
                assert_almost_equal(recomputed, fold_scores[fold_no])

    # Exhaustive grid search over all estimators first ...
    for est in candidates:
        check_fold_scores(est, GridSearchCV(est, param_space, cv=folds))

    # ... then a randomized search drawing 3 settings per estimator.
    for est in candidates:
        check_fold_scores(
            est,
            RandomizedSearchCV(est, param_space, cv=folds, n_iter=3))
Beispiel #34
0
def test_multioutput():
    """Check the lazy bagging classifier and regressor on a multi-output target.

    Both models must score 1.0 on their own training data; for the
    classifier, ``predict_log_proba`` must be the log of ``predict_proba``.
    """
    X, y = make_multilabel_classification(n_samples=100,
                                          n_labels=1,
                                          n_classes=5,
                                          random_state=0,
                                          return_indicator=True)
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=0)

    classifier = LazyBaggingClassifier(random_state=0,
                                       n_estimators=10,
                                       bootstrap=False)
    classifier.fit(X_train, y_train)
    assert_almost_equal(classifier.score(X_train, y_train), 1.)

    # The two outputs are iterated in lockstep and compared entry by entry.
    probas = classifier.predict_proba(X_test)
    log_probas = classifier.predict_log_proba(X_test)
    for proba, log_proba in zip(probas, log_probas):
        assert_array_almost_equal(proba, np.exp(log_proba))

    regressor = LazyBaggingRegressor(random_state=0,
                                     n_estimators=10,
                                     bootstrap=False)
    regressor.fit(X_train, y_train)
    assert_almost_equal(regressor.score(X_train, y_train), 1.)
Beispiel #35
0
def multilLabel():
    """Demonstrate one-vs-rest SVMs on a multiclass and a multilabel problem.

    Prints the true labels, the predicted labels and the accuracy for the
    iris data, then the same plus the Hamming loss for a generated
    multilabel dataset. Returns nothing.
    """
    # Raw multilabel targets: one list of class ids per sample.
    raw_labels = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]
    # Pre-process the labels into an N x K matrix (N samples, K classes).
    # The binarized matrix is only built here; nothing below reads it.
    binarizer = MultiLabelBinarizer()
    binarized = binarizer.fit_transform(raw_labels)

    # Multiclass learning: targets look like [0, 0, 1, 1, 2, 2].
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        test_size=0.2,
                                                        random_state=0)
    base_svm = SVC(kernel="linear", random_state=1)
    # One-vs-all: usable for multiclass as well as multilabel problems;
    # in fit(X, y), y.shape is [samples] or [samples, classes].
    ovr_model = OneVsRestClassifier(base_svm).fit(X_train, y_train)
    # One-vs-one (multiclass only; fit(X, y) requires y.shape == [samples]):
    # ovr_model = OneVsOneClassifier(base_svm).fit(X_train, y_train)
    y_pred = ovr_model.predict(X_test)
    print("True Labels:   ", y_test)
    print("Predict Labels:", y_pred)
    print("Accuracy: ", accuracy_score(y_test, y_pred))

    # Multilabel classification on generated data.
    ml_X, ml_y = make_multilabel_classification()
    print("多标签多分类训练标签:\n", ml_y[:5])
    ml_X_train, ml_X_test, ml_y_train, ml_y_test = train_test_split(
        ml_X, ml_y, test_size=0.1)
    # One-vs-all again, this time fitted on the indicator matrix.
    ml_model = OneVsRestClassifier(SVC(kernel="linear"))
    ml_model.fit(ml_X_train, ml_y_train)
    ml_pred = ml_model.predict(ml_X_test)
    print("True Labels:  \n", ml_y_test)
    print("Predict Labels:\n", ml_pred)

    print("Hamming_loss: ", hamming_loss(ml_y_test, ml_pred))
    print("Accuracy:     ", accuracy_score(ml_y_test, ml_pred))
Beispiel #36
0
    def test_evenly_distributes_unlabelled(self):
        """Each of the 5 folds must hold one unlabelled row in validation.

        Five rows have all their labels cleared; every split is then
        expected to place 4 of them in the training part and 1 in the
        validation part.
        """
        splitter = IterativeStratifiedKFold(n_splits=5,
                                            shuffle=False,
                                            random_state=0)
        X, y = make_multilabel_classification(100,
                                              20,
                                              n_labels=5,
                                              random_state=0,
                                              allow_unlabeled=False)

        # Clear every label on the first five rows so they are unlabelled.
        y[[0, 1, 2, 3, 4], :] = 0

        def count_unlabelled(rows):
            # Number of the given rows whose label sum is zero.
            return np.where(np.sum(y[rows, :], axis=1) == 0)[0].shape[0]

        for train_idx, valid_idx in list(splitter.split(X, y)):
            n_train = count_unlabelled(train_idx)
            n_valid = count_unlabelled(valid_idx)
            self.assertEqual(n_train, 4)
            self.assertEqual(n_valid, 1)
Beispiel #37
0
 def generate_classification(self,
                             num_classes,
                             num_features,
                             num_samples,
                             test_split=0.1,
                             seed=0):
     """Generate a single-label classification task and store it on ``self``.

     A multilabel indicator matrix is generated and collapsed to one class
     index per sample with ``argmax``. The result is stored in ``self.X`` /
     ``self.Y`` and then split via ``self._split_data(test_split, seed)``.

     Arguments:
         num_classes -- passed to the generator as ``n_classes``
         num_features -- passed to the generator as ``n_features``
         num_samples -- passed to the generator as ``n_samples``

     Keyword Arguments:
         test_split -- forwarded to ``_split_data`` (default: 0.1)
         seed -- used as ``random_state`` and forwarded to ``_split_data``
             (default: 0)
     """
     #X, Y = make_classification(n_samples=800, n_features=num_feats, n_classes=num_classes, n_informative=4)
     features, indicator = make_multilabel_classification(
         n_samples=num_samples,
         n_features=num_features,
         n_classes=num_classes,
         n_labels=0.01,
         length=50,
         allow_unlabeled=False,
         sparse=False,
         return_indicator='dense',
         return_distributions=False,
         random_state=seed,
     )
     # Column index of the first maximum in each indicator row.
     class_ids = np.argmax(indicator, axis=1)
     self.categorical_features = [False] * num_features
     self.problem_type = ProblemType.FeatureClassification
     self.X, self.Y = features, class_ids
     self._split_data(test_split, seed)
Beispiel #38
0
def dataset(request):
    """Build a train/test split with ``n_targets`` label indices per sample.

    ``request.param`` is read as a mapping with the keys ``n_samples``,
    ``n_features``, ``n_classes`` and ``n_targets`` (presumably this is a
    parametrized pytest fixture -- confirm against the decorator).

    About 20% more samples than requested are generated; rows with fewer
    than ``n_targets`` positive labels are dropped, and for the remaining
    rows a random subset of ``n_targets`` positive label indices is kept.

    Returns whatever ``train_test_split(X, y, test_size=0.33)`` returns.
    """
    params = request.param
    n_targets = params['n_targets']

    X, y = make_multilabel_classification(
        n_samples=int(params['n_samples'] * 1.2),
        n_features=params['n_features'],
        n_classes=params['n_classes'],
        n_labels=params['n_classes'],
        length=n_targets)

    kept_rows = []
    kept_targets = []
    for row, indicator in enumerate(y):
        positives = np.argwhere(indicator == 1)[:, 0]
        if len(positives) >= n_targets:
            kept_rows.append(row)
            # Shuffle in place so the kept subset of labels is random.
            np.random.shuffle(positives)
            kept_targets.append(positives[:n_targets])
        # Checked on every row, whether or not the row was kept.
        if len(kept_rows) >= params['n_samples']:
            break

    X = X[kept_rows]
    y = np.array(kept_targets)

    return train_test_split(X, y, test_size=0.33)
Beispiel #39
0
def test_check_classifiers_multilabel_output_format_predict():
    """Check that malformed ``predict`` outputs are rejected by the checker.

    A mock classifier returns a canned response; each malformed response
    must make ``check_classifiers_multilabel_output_format_predict`` raise
    an ``AssertionError`` with the matching message.
    """
    n_samples, test_size, n_outputs = 100, 25, 5
    _, y = make_multilabel_classification(
        n_samples=n_samples,
        n_features=2,
        n_classes=n_outputs,
        n_labels=3,
        length=50,
        allow_unlabeled=True,
        random_state=0,
    )
    y_test = y[-test_size:]

    class MultiLabelClassifierPredict(_BaseMultiLabelClassifierMock):
        def predict(self, X):
            return self.response_output

    # (canned predict output, regex the AssertionError message must match)
    failing_cases = [
        # 1. inconsistent array type
        (
            y_test.tolist(),
            r"MultiLabelClassifierPredict.predict is expected to output a "
            r"NumPy array. Got <class 'list'> instead.",
        ),
        # 2. inconsistent shape
        (
            y_test[:, :-1],
            r"MultiLabelClassifierPredict.predict outputs a NumPy array of "
            r"shape \(25, 4\) instead of \(25, 5\).",
        ),
        # 3. inconsistent dtype
        (
            y_test.astype(np.float64),
            r"MultiLabelClassifierPredict.predict does not output the same "
            r"dtype than the targets.",
        ),
    ]

    for response_output, err_msg in failing_cases:
        clf = MultiLabelClassifierPredict(response_output=response_output)
        with raises(AssertionError, match=err_msg):
            check_classifiers_multilabel_output_format_predict(
                clf.__class__.__name__, clf)
Beispiel #40
0
def test_predict_proba_multilabel():
    """Check ``predict_proba`` / ``predict_log_proba`` on multilabel targets.

    For multilabel data the per-row probabilities should not be normalised
    to sum to 1 (no softmax), thresholding at 0.5 must recover the training
    labels, and the log-probabilities must equal the log of the
    probabilities.
    """
    X, Y = make_multilabel_classification(n_samples=50,
                                          random_state=0,
                                          return_indicator=True)
    n_samples, n_classes = Y.shape

    mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30, random_state=0)
    mlp.fit(X, Y)

    proba = mlp.predict_proba(X)
    assert_equal(proba.shape, (n_samples, n_classes))
    assert_array_equal(proba > 0.5, Y)

    log_proba = mlp.predict_log_proba(X)

    # Squared norm of (row sum - 1): clearly positive when rows do not sum
    # to one.
    row_excess = proba.sum(1) - 1
    assert_greater(row_excess.dot(row_excess), 1e-10)

    # The most likely class per row agrees between the two outputs.
    assert_array_equal(proba.argmax(axis=1), log_proba.argmax(axis=1))
    assert_array_equal(log_proba, np.log(proba))
Beispiel #41
0
def test_ovr_multilabel_dataset():
    """Check one-vs-rest micro precision/recall on generated multilabel data.

    Two datasets are generated (with and without unlabeled samples); for
    each one the micro-averaged precision and recall on the held-out rows
    must match the recorded reference values to 2 decimals.
    """
    nb = MultinomialNB(alpha=1)
    # (allow_unlabeled, expected micro precision, expected micro recall)
    scenarios = [
        (True, 0.51, 0.51),
        (False, 0.66, 0.80),
    ]
    for allow_unlabeled, expected_precision, expected_recall in scenarios:
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=2,
            length=50,
            allow_unlabeled=allow_unlabeled,
            random_state=0,
        )
        # First 80 rows for training, the rest for evaluation.
        X_train, X_test = X[:80], X[80:]
        Y_train, Y_test = Y[:80], Y[80:]

        ovr = OneVsRestClassifier(nb).fit(X_train, Y_train)
        predicted = ovr.predict(X_test)

        assert_true(ovr.multilabel_)

        micro_precision = precision_score(Y_test, predicted, average="micro")
        assert_almost_equal(micro_precision, expected_precision, decimal=2)

        micro_recall = recall_score(Y_test, predicted, average="micro")
        assert_almost_equal(micro_recall, expected_recall, decimal=2)
def test_sparse_input(EstimatorClass, sparse_matrix):
    """Check that dense and sparse training inputs give matching models.

    Two identically configured estimators are fitted, one on the dense
    matrix and one on its ``sparse_matrix`` conversion. Their outputs must
    agree, whichever representation is used at prediction time.
    """
    # NOTE: the generator's two return values are unpacked as (y, X), i.e.
    # the first one provides the target (its column 0) and the second one
    # the feature matrix.
    y, X = datasets.make_multilabel_classification(
        random_state=0, n_samples=50, n_features=1, n_classes=20
    )
    y = y[:, 0]
    X_sparse = sparse_matrix(X)

    estimator_params = dict(
        n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7
    )
    dense = EstimatorClass(**estimator_params).fit(X, y)
    sparse = EstimatorClass(**estimator_params).fit(X_sparse, y)

    # Models trained on either representation behave the same.
    assert_array_almost_equal(sparse.apply(X), dense.apply(X))
    assert_array_almost_equal(sparse.predict(X), dense.predict(X))
    assert_array_almost_equal(sparse.feature_importances_, dense.feature_importances_)

    # Predicting from sparse input matches predicting from dense input.
    assert_array_almost_equal(sparse.predict(X_sparse), dense.predict(X))
    assert_array_almost_equal(dense.predict(X_sparse), sparse.predict(X))

    # The remaining checks only apply to the classifier family.
    if not issubclass(EstimatorClass, GradientBoostingClassifier):
        return

    assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X))
    assert_array_almost_equal(
        sparse.predict_log_proba(X), dense.predict_log_proba(X)
    )

    assert_array_almost_equal(
        sparse.decision_function(X_sparse), sparse.decision_function(X)
    )
    assert_array_almost_equal(
        dense.decision_function(X_sparse), sparse.decision_function(X)
    )

    staged_pairs = zip(
        sparse.staged_decision_function(X_sparse),
        sparse.staged_decision_function(X),
    )
    for from_sparse, from_dense in staged_pairs:
        assert_array_almost_equal(from_sparse, from_dense)
Beispiel #43
0
    def test_actually_works_on_proper_params(self):
        """Exercise the clusterer over the full grid of constructor options.

        For every combination of options the clusterer is constructed, its
        attributes are checked against the values passed in, and
        ``fit_predict`` on a sparse multilabel problem must return a NumPy
        array.
        """
        # Function-scope import keeps this block self-contained.
        import itertools

        X, y = make_multilabel_classification(sparse=True,
                                              return_indicator='sparse')
        assert sp.issparse(y)

        # itertools.product varies the right-most option fastest, which is
        # the same visiting order as nesting the loops in this order.
        option_grid = itertools.product(
            [True, False],            # allow_overlap
            [True, False],            # weighted
            [True, False],            # include_self_edges
            [True, False, None],      # use_degree_corr
            ['mean_field', 'bethe'],  # model_selection_criterium
            [True, False],            # verbose
        )

        for (allow_overlap, weighted, include_self_edges, use_degree_corr,
             model_selection_criterium, verbose) in option_grid:
            clusterer = GraphToolCooccurenceClusterer(
                weighted=weighted,
                allow_overlap=allow_overlap,
                include_self_edges=include_self_edges,
                n_iters=2,
                n_init_iters=2,
                use_degree_corr=use_degree_corr,
                model_selection_criterium=model_selection_criterium,
                verbose=verbose)

            # Constructor arguments must be stored on the instance.
            self.assertEqual(clusterer.allow_overlap, allow_overlap)
            self.assertEqual(clusterer.is_weighted, weighted)
            self.assertEqual(clusterer.include_self_edges,
                             include_self_edges)
            self.assertEqual(clusterer.n_iters, 2)
            self.assertEqual(clusterer.n_init_iters, 2)
            self.assertEqual(clusterer.model_selection_criterium,
                             model_selection_criterium)
            self.assertEqual(clusterer.verbose, verbose)

            partition = clusterer.fit_predict(X, y)
            self.assertIsInstance(partition, np.ndarray)
Beispiel #44
0
    def test_evenly_distributes_label_with_multilabel(self):
        """Two rare labels must be spread evenly across 3 folds.

        Labels ``d_idx`` and ``c_idx`` are each rewritten to have exactly
        5 positive rows. In every fold, with and without shuffling, each of
        them must have 3 or 4 positives in the training part and 1 or 2 in
        the validation part.
        """
        X, y = make_multilabel_classification(100,
                                              20,
                                              n_labels=18,
                                              random_state=0,
                                              allow_unlabeled=False,
                                              n_classes=18)
        d_idx = 0
        c_idx = 1

        # Reset both label columns, then mark 5 positive rows in each
        # (rows 12 and 4 carry both labels).
        y[:, d_idx] = 0
        y[:, c_idx] = 0
        y[[0, 10, 20, 12, 4], d_idx] = 1
        y[[1, 11, 21, 12, 4], c_idx] = 1

        # First with shuffling, then without.
        for shuffle in (True, False):
            splitter = IterativeStratifiedKFold(n_splits=3,
                                                shuffle=shuffle,
                                                random_state=42)
            folds = list(splitter.split(X, y))
            for label in (d_idx, c_idx):
                for train, valid in folds:
                    self.assertIn(y[train].sum(axis=0)[label], (4, 3))
                    self.assertIn(y[valid].sum(axis=0)[label], (1, 2))
Beispiel #45
0
    def __configure(self):
        """ __configure

        Generates a multilabel classification problem with scikit-learn's
        make_multilabel_classification, using the settings stored on the
        instance, and keeps it in memory as ``self.X`` / ``self.y``.

        Also fills in ``target_names``, ``feature_names`` and
        ``target_values`` (the unique values found in each target column).

        """
        self.X, self.y = make_multilabel_classification(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_classes=self.n_targets,
            n_labels=self.n_labels,
            random_state=self.random_state)

        self.target_names = [
            "target_{}".format(idx) for idx in range(self.n_targets)
        ]
        self.feature_names = [
            "att_num_{}".format(idx) for idx in range(self.n_num_features)
        ]

        if self.n_targets == 1:
            # Single target: one flat list of the unique values in y.
            self.target_values = np.unique(self.y).tolist()
        else:
            # Several targets: one list of unique values per column of y.
            self.target_values = [
                np.unique(self.y[:, idx]).tolist()
                for idx in range(self.n_targets)
            ]
Beispiel #46
0
    def setUp(self):
        """Create binary, multiclass and multilabel problems for the tests.

        Each raw problem is kept as ``self.<kind>_problem_instance``; its
        features/targets, after conversion by the matching
        ``helpers.*_to_regression`` function, are stored as
        ``Xb``/``yb``, ``Xmc``/``ymc`` and ``Xml``/``yml``.
        """
        # Binary problem.
        self.binary_problem_instance = make_classification(
            n_classes=2, n_samples=100, n_features=10,
            n_informative=8, n_redundant=0, random_state=0, shuffle=False)
        features, target = self.binary_problem_instance
        self.Xb, self.yb = helpers.binary_to_regression(features, target)

        # Multiclass problem with 4 classes.
        self.multiClass_problem_instance = make_classification(
            n_classes=4, n_samples=100, n_features=10,
            n_informative=8, n_redundant=0, random_state=0, shuffle=False)
        features, target = self.multiClass_problem_instance
        self.Xmc, self.ymc = helpers.multiClass_to_regression(
            features, target, 4)

        # Multilabel problem with 5 classes (no fixed random_state here).
        self.multiLabel_problem_instance = make_multilabel_classification(
            n_classes=5, n_labels=2, n_samples=100, n_features=10)
        features, target = self.multiLabel_problem_instance
        self.Xml, self.yml = helpers.multiLabel_to_regression(
            features, target, 5)
Beispiel #47
0
def test_sparse():
    """ Validate running LinearExplainer on scipy sparse data

    The expected value plus the per-sample sum of SHAP values, passed
    through the logistic function, must reproduce the model's positive
    class probability to within 1e-6.
    """
    import sklearn.linear_model
    from sklearn.datasets import make_multilabel_classification
    from scipy.special import expit

    np.random.seed(0)
    n_features = 20
    X, y = make_multilabel_classification(n_samples=100,
                                          sparse=True,
                                          n_features=n_features,
                                          n_classes=1,
                                          n_labels=2)

    # Fit the linear model that is going to be explained.
    model = sklearn.linear_model.LogisticRegression()
    model.fit(X, y)

    # Explain its predictions on the training data with SHAP values.
    explainer = shap.LinearExplainer(model, X)
    shap_values = explainer.shap_values(X)

    reconstructed = expit(explainer.expected_value + shap_values.sum(1))
    positive_proba = model.predict_proba(X)[:, 1]
    largest_gap = np.max(np.abs(reconstructed - positive_proba))
    assert largest_gap < 1e-6
Beispiel #48
0
def get_data():
    """Generate a small two-feature dataset and group its rows by class.

    Returns
    -------
    tuple
        ``(x, groups)``: ``x`` is the generated feature matrix and
        ``groups`` is a length-2 array whose first entry holds the rows of
        ``x`` labelled 0 and whose second entry holds the rows labelled 1.
        When both groups have the same number of rows this is a regular
        stacked array; otherwise it is an object array of two arrays.
    """
    x, y = make_multilabel_classification(n_samples=20, n_features=2,
                                          n_labels=1, n_classes=1,
                                          random_state=2)
    # # Create the spreadsheet
    # wb = workbook.Workbook()
    # # Handle of the active sheet
    # wa = wb.active
    # for i in range(len(x)):
    #     # print(list(x[i])+list(y[i]))
    #     wa.append(list(x[i])+list(y[i]))
    # wb.save('data.xlsx')
    # # read_excel_xlsx('data.xlsx')

    # x: feature values, y: class labels. Split the rows by class.
    # Boolean masks are used instead of index lists: an empty index list
    # becomes a float array, which cannot be used for indexing.
    labels = np.ravel(y)
    c1 = x[labels == 0]  # rows of class 0
    c2 = x[labels == 1]  # rows of class 1

    if len(c1) == len(c2):
        groups = np.array([c1, c2])
    else:
        # Groups of different length cannot be stacked into a regular
        # array; recent NumPy raises on such ragged input, so the
        # two-element object array is built explicitly.
        groups = np.empty(2, dtype=object)
        groups[0] = c1
        groups[1] = c2
    return x, groups
Beispiel #49
0
def test_multilabel_classification():
    """Check that MLPClassifier handles multilabel targets.

    Covers ``fit`` with the lbfgs solver, repeated ``partial_fit`` with the
    sgd solver, and a smoke test with early stopping enabled.
    """
    X, y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)

    # Settings shared by the fit and the partial_fit models.
    shared_params = dict(hidden_layer_sizes=50, alpha=1e-5, max_iter=150,
                         random_state=0, activation='logistic',
                         learning_rate_init=0.2)

    # fit
    fitted = MLPClassifier(solver='lbfgs', **shared_params)
    fitted.fit(X, y)
    assert fitted.score(X, y) > 0.97

    # partial_fit, repeated 100 times on the full data
    incremental = MLPClassifier(solver='sgd', **shared_params)
    for _ in range(100):
        incremental.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
    assert incremental.score(X, y) > 0.9

    # Make sure early stopping still works now that splitting is stratified
    # by default (it is disabled for multilabel classification).
    early_stopped = MLPClassifier(early_stopping=True)
    early_stopped.fit(X, y).predict(X)
Beispiel #50
0
 def generate_classification(self, num_classes, num_features, num_samples, test_split=0.1, seed=0):
     """Generate a classification task

     A multilabel indicator matrix is generated and reduced to a single
     class index per sample with ``argmax``; the data is stored on the
     instance and handed to ``self._split_data``.

     Arguments:
         num_classes {int} -- Number of classes
         num_features {int} -- Number of features
         num_samples {int} -- Number of samples

     Keyword Arguments:
         test_split {float} -- Size of test split (default: {0.1})
         seed {int} -- A random seed (default: {0})
     """
     #X, Y = make_classification(n_samples=800, n_features=num_feats, n_classes=num_classes, n_informative=4)
     generator_options = dict(
         n_samples=num_samples,
         n_features=num_features,
         n_classes=num_classes,
         n_labels=0.01,
         length=50,
         allow_unlabeled=False,
         sparse=False,
         return_indicator='dense',
         return_distributions=False,
         random_state=seed,
     )
     inputs, indicator = make_multilabel_classification(**generator_options)
     # Column index of the first maximum in each indicator row.
     targets = np.argmax(indicator, axis=1)
     self.categorical_features = [False] * num_features
     self.problem_type = ProblemType.FeatureClassification
     self.X, self.Y = inputs, targets
     self._split_data(test_split, seed)
Beispiel #51
0
    def test_shuffle_shuffles_splits(self):
        """Shuffled folds must differ from every non-shuffled fold.

        Each train/validation index list produced with ``shuffle=True`` is
        compared against each one produced with ``shuffle=False``; no pair
        may be equal.
        """
        X, y = make_multilabel_classification(100,
                                              20,
                                              n_labels=5,
                                              random_state=0,
                                              allow_unlabeled=False)

        def make_folds(shuffle):
            # Materialise the 3 (train, valid) index pairs for one setting.
            splitter = IterativeStratifiedKFold(n_splits=3,
                                                shuffle=shuffle,
                                                random_state=42)
            return list(splitter.split(X, y))

        shuffled_folds = make_folds(True)
        plain_folds = make_folds(False)

        for train_shuff, valid_shuff in shuffled_folds:
            for train_plain, valid_plain in plain_folds:
                self.assertNotEqual(list(train_shuff), list(train_plain))
                self.assertNotEqual(list(valid_shuff), list(valid_plain))
Beispiel #52
0
def test_pca_fit(datatype, input_type, name, use_handle):
    """Compare the fitted attributes of cuPCA with those of skPCA.

    ``name`` selects the dataset: ``'blobs'`` (skipped), ``'digits'``, or
    anything else for a generated multilabel problem. Both PCA
    implementations are fitted with 2 components and four fitted
    attributes are compared with a tolerance of 1e-3.
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # NOTE(review): presumably never reached because the skip call
        # above aborts the test -- confirm before relying on this branch.
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    sk_model = skPCA(n_components=2)
    sk_model.fit(X)

    handle, stream = get_handle(use_handle)
    cu_model = cuPCA(n_components=2, handle=handle)
    cu_model.fit(X)
    cu_model.handle.sync()

    compared_attrs = (
        'singular_values_',
        'components_',
        'explained_variance_',
        'explained_variance_ratio_',
    )
    for attr in compared_attrs:
        # components_ is the only attribute compared with with_sign=False.
        with_sign = attr not in ['components_']
        print(attr)
        print(getattr(cu_model, attr))
        print(getattr(sk_model, attr))
        cuml_res = getattr(cu_model, attr)
        # NOTE(review): this calls as_matrix() on values whose exact type
        # is np.ndarray -- verify that this is the intended type check.
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(sk_model, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
Beispiel #53
0
    def __configure(self, n_samples, n_features, n_targets, n_labels):
        """ __configure

        Generates a multilabel classification problem with scikit-learn's
        make_multilabel_classification, keeps it in memory as ``self.X`` /
        ``self.y``, and records the sizes and column headers on the
        instance.

        Parameters
        ----------
        n_samples: int
            Passed to the generator as ``n_samples``.

        n_features: int
            Passed to the generator as ``n_features``.

        n_targets: int
            Passed to the generator as ``n_classes``.

        n_labels: int
            Passed to the generator as ``n_labels``.

        """
        self.X, self.y = make_multilabel_classification(
            n_samples=n_samples,
            n_features=n_features,
            n_classes=n_targets,
            n_labels=n_labels,
            random_state=self.random_state)

        # Book-keeping of the requested sizes.
        self.num_samples = n_samples
        self.num_features = n_features
        self.num_target_tasks = n_targets
        self.num_labels = n_labels
        self.num_numerical_attributes = n_features

        # NOTE(review): the class header length follows n_labels while the
        # data is generated with n_classes=n_targets -- confirm these are
        # meant to match.
        self.class_header = [
            "label_{}".format(idx) for idx in range(self.num_labels)
        ]
        self.attributes_header = [
            "att_num_{}".format(idx)
            for idx in range(self.num_numerical_attributes)
        ]
Beispiel #54
0
 def setup_mlc_dataset(self):
     """Return a ``Dataset`` wrapping a generated 5-feature multilabel problem."""
     features, labels = datasets.make_multilabel_classification(
         n_features=5, random_state=1126)
     return Dataset(features, labels)
Beispiel #55
0
def check_classifier_on_multilabel_or_multioutput_targets(name, Estimator):
    """Check that fitting on a multilabel target raises the expected error.

    ``Estimator`` is instantiated with no arguments; ``fit`` on a generated
    multilabel problem must raise ``ValueError`` with the "not supported"
    message. ``name`` is not used here.
    """
    estimator = Estimator()
    X, y = make_multilabel_classification(n_samples=30)
    expected_message = "Multilabel and multioutput targets are not supported."
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(X, y)
Beispiel #56
0
def test_sparse_input(name, sparse_matrix):
    """Run ``check_sparse_input`` on a generated 50-sample multilabel problem."""
    X, y = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50)
    # The same features, converted with the container under test.
    X_sparse = sparse_matrix(X)
    check_sparse_input(name, X, X_sparse, y)
Beispiel #57
0
def test_random_hasher_sparse_data():
    """Check RandomTreesEmbedding gives equal output for dense and CSC input."""
    X, _ = datasets.make_multilabel_classification(random_state=0)
    embedder = RandomTreesEmbedding(n_estimators=30, random_state=1)
    # The same embedder is refitted on each representation of the data.
    from_dense = embedder.fit_transform(X)
    from_sparse = embedder.fit_transform(csc_matrix(X))
    assert_array_equal(from_sparse.toarray(), from_dense.toarray())
Beispiel #58
0
#
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
#
#
# X holds the news articles, but I want to see what format it is in
#

# In[1]:

# Parallelizing using Pool.apply()

import multiprocessing as mp

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification
X, _ = make_multilabel_classification(random_state=0)

print(X)
print(X.shape)

lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(X)
lda.transform(X[-2:])

# OK, it looks like X is a numpy array holding 100 rows of 20 columns each: one row vector per news article, containing the count of how many times each word appears (out of a total of 20 words). This is what is called a "Bag Of Words".
#
# In our case we have to take all the news articles, remove the "stopword"-type words (words such as "de", "la", "un", ...) and build a list of all the unique words that appear across the whole set of articles (which will surely be more than 20, but anyway).

# ## Read and clean the csv

# In[1]:
Beispiel #59
0
def dump_multilabel_classification(
        model,
        suffix="",
        folder=None,
        allow_failure=None,
        verbose=False,
        label_string=False,
        first_class=0,
        comparable_outputs=None,
        target_opset=None):
    """
    Trains and dumps a model for a multilabel classification problem.
    The function trains the model twice, first on a small hand-written
    dataset and then on a dataset produced by
    ``make_multilabel_classification``, and calls
    :func:`dump_data_and_model` after each conversion.

    Every created filename will follow the pattern:
    ``<folder>/<prefix><task><classifier-name><suffix>.<data|expected|model|onnx>.<pkl|onnx>``.

    :param model: estimator to train; ``model.fit`` is called on it twice,
        so it ends up fitted on the second (generated) dataset
    :param suffix: text appended to the basename of the dumped files
    :param folder: forwarded to :func:`dump_data_and_model`
    :param allow_failure: forwarded to :func:`dump_data_and_model`
    :param verbose: if true, prints progress messages and is also forwarded
        to :func:`dump_data_and_model`
    :param label_string: if true, the hand-written dataset uses string
        labels (``"l0"``, ``"l1"``, ``"l2"``) instead of integers
    :param first_class: offset added to every integer label of the
        hand-written dataset (ignored when *label_string* is true)
    :param comparable_outputs: forwarded to :func:`dump_data_and_model`
    :param target_opset: forwarded to ``convert_model`` for both conversions
    """
    # Step 1: tiny hand-written dataset with two features and three labels.
    X = [[0, 1], [1, 1], [2, 0], [0.5, 0.5], [1.1, 1.1], [2.1, 0.1]]
    X = numpy.array(X, dtype=numpy.float32)
    if label_string:
        y = [["l0"], ["l1"], ["l2"], ["l0", "l1"], ["l1"], ["l2"]]
    else:
        y = [[0 + first_class], [1 + first_class], [2 + first_class],
             [0 + first_class, 1 + first_class],
             [1 + first_class], [2 + first_class]]
    # Turn the lists of labels into a binary indicator matrix.
    y = MultiLabelBinarizer().fit_transform(y)
    model.fit(X, y)
    if verbose:
        print("[make_multilabel_classification] model '{}'".format(
            model.__class__.__name__))
    model_onnx, prefix = convert_model(
        model, "multi-class classifier",
        [("input", FloatTensorType([None, 2]))],
        target_opset=target_opset)
    if verbose:
        print("[make_multilabel_classification] model was converted")
    dump_data_and_model(
        X.astype(numpy.float32),
        model,
        model_onnx,
        folder=folder,
        allow_failure=allow_failure,
        basename=prefix + "Mcl" + model.__class__.__name__ + suffix,
        verbose=verbose,
        comparable_outputs=comparable_outputs,
    )

    # Step 2: generated dataset, reduced to its first two features so the
    # declared ONNX input shape ([None, 2]) still applies.
    X, y = make_multilabel_classification(40, n_features=4, random_state=42,
                                          n_classes=3)
    X = X[:, :2]
    model.fit(X, y)
    if verbose:
        print("[make_multilabel_classification] model '{}'".format(
            model.__class__.__name__))
    # Pass target_opset here as well, so the requested opset is honoured for
    # both conversions and not only for the first one.
    model_onnx, prefix = convert_model(
        model, "multi-class classifier",
        [("input", FloatTensorType([None, 2]))],
        target_opset=target_opset)
    if verbose:
        print("[make_multilabel_classification] model was converted")
    # Only the first ten rows are dumped as test data for this step.
    dump_data_and_model(
        X[:10].astype(numpy.float32),
        model,
        model_onnx,
        folder=folder,
        allow_failure=allow_failure,
        basename=prefix + "RndMla" + model.__class__.__name__ + suffix,
        verbose=verbose,
        comparable_outputs=comparable_outputs,
    )
Beispiel #60
0
# Ask the user for a file path (the prompt text is Russian for
# "Enter the file path: "). `filename` is not used in the visible part of
# this script.
filename = input("Введите путь файла: ")
# Define the color maps for plots
color_map = plt.cm.get_cmap('RdYlBu')
color_map_discrete = matplotlib.colors.LinearSegmentedColormap.from_list("", ["red","cyan","magenta","blue"])
# NOTE(review): the figure is created with a 1x4 grid of axes, but the
# subplot codes below (131, 132, 133) address a 1x3 grid -- confirm which
# layout is intended.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(18, 7))
# Three-digit subplot codes 131, 132, 133, one per n_labels setting.
plt_ind_list = np.arange(3) + 131

# Per-setting feature and label arrays collected by the loop below.
# `dataset_sparse` is not filled in the visible part of this script.
dataset_x = []
dataset_y = []
dataset_sparse = []
# Values passed as `n_labels` to the dataset generator.
labels = [1,2,4]
for label, plt_ind in zip(labels, plt_ind_list):
    # `dt` and `rand_state` are defined outside this excerpt; `dt` is
    # presumably sklearn.datasets -- TODO confirm.
    x, y = dt.make_multilabel_classification(n_samples=1000,
                                             n_features=4,
                                             n_labels=label,
                                             n_classes=5,
                                             random_state=rand_state)
    # Multiplying by a list of five ones leaves y unchanged, so `target` is
    # simply the row-wise sum of y (presumably the number of active labels
    # per sample, assuming y is a 0/1 indicator matrix).
    target = np.sum(y * [1,1,1,1,1], axis=1)
    dataset_x.append(x)
    dataset_y.append(y)
    plt.subplot(plt_ind)
    # Scatter the first two features, colored by `target`, with the color
    # range spanning the min and max of `target` for this setting.
    my_scatter_plot = plt.scatter(x[:, 0],
                                  x[:, 1],
                                  c=target,
                                  vmin=min(target),
                                  vmax=max(target),
                                  cmap=color_map)
    plt.title('n_labels: ' + str(label))
# Join the per-setting arrays into one feature array and one label array.
n_ds_x = np.concatenate(dataset_x)
n_ds_y = np.concatenate(dataset_y)