def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
def test_thresholded_scorers():
    """Test scorers that take thresholds."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = SCORERS['log_loss'](clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test)
def test_fit_uuu():
    n_samples1 = 10000
    n_features = 5
    centers1 = np.array([[10, 5, 1, -5, -10],
                        [-10, -5, -1, 5, 10]])
    cluster_std1 = np.array([[1.0, 2.0, 3.0, 4.0, 5.0],
                            [5.0, 4.0, 3.0, 2.0, 1.0]])

    X1, y1 = make_blobs(n_features=n_features,
                      n_samples=n_samples1,
                      centers=centers1,
                      cluster_std=cluster_std1)

    n_samples2 = 5000
    centers2 = np.array([[10, 5, 1, -5, -10]])
    cluster_std2 = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]])
    X2, y2 = make_blobs(n_features=n_features,
                      n_samples=n_samples2,
                      centers=centers2,
                      cluster_std=cluster_std2)
    X = np.vstack((X1, X2))

    model = mixture.PGMM(covariance_type='UUU', n_components=2, n_pc=3)
    model.fit(X)
    assert_array_almost_equal(np.sum(model.means_, 0), np.sum(centers1, 0), decimal=0)
    assert_array_almost_equal(np.sort(model.weights_), np.array([0.333, 0.666]), decimal=1)
    assert_equal(model.means_.shape, np.array([2, n_features]))
    assert_equal(model.weights_.shape, np.array([2]))
    assert_equal(model.noise_.shape, np.array([2, n_features]))
    assert_equal(model.principal_subspace_.shape, np.array([2, n_features, 3]))
    assert_equal(model.covars_.shape, np.array([2, n_features, n_features]))
    logging.info('TestFitUUU: OK')
Example #5
def test_check_is_fitted():
    # Check that ValueError is raised when a non-estimator instance is passed
    assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_")
    assert_raises(TypeError, check_is_fitted, "SVR", "support_")

    ard = ARDRegression()
    svr = SVR()

    try:
        assert_raises(NotFittedError, check_is_fitted, ard, "coef_")
        assert_raises(NotFittedError, check_is_fitted, svr, "support_")
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError
    try:
        check_is_fitted(ard, "coef_", "Random message %(name)s, %(name)s")
    except ValueError as e:
        assert_equal(str(e), "Random message ARDRegression, ARDRegression")

    try:
        check_is_fitted(svr, "support_", "Another message %(name)s, %(name)s")
    except AttributeError as e:
        assert_equal(str(e), "Another message SVR, SVR")

    ard.fit(*make_blobs())
    svr.fit(*make_blobs())

    assert_equal(None, check_is_fitted(ard, "coef_"))
    assert_equal(None, check_is_fitted(svr, "support_"))
    def show_dbscan():
        """
        simulate 1 month of normal hourly room percentage data followed by an anomalous percentage
        the normal data is bimodal following most peoples activity patterns in which there is routinely a weekday
        percentage and a weekend percentage. 1 day in which the person spends a large amount of time in the bathroom
        is simulated
        """

        # simulate normal hourly data
        weekday = ([0.05, 0.95], 0.05) #bath, bed
        weekend = ([0.3, 0.7], 0.1)
        roomperwd, truelabelswd = make_blobs(n_samples=23, centers=weekday[0],
                                             cluster_std=weekday[1], random_state=0)
        roomperwe, truelabelswe = make_blobs(n_samples=8, centers=weekend[0],
                                             cluster_std=weekend[1], random_state=0)

        # combine modes
        roompers = np.vstack((roomperwd, roomperwe))

        # make positive and sum to one to simulate valid distribution
        for i in range(roompers.shape[0]):
            for j in range(roompers.shape[1]):
                if roompers[i, j] < 0:
                    roompers[i, j] = 0
        roompersnorm = normalize(roompers, norm='l1')

        # simulate anomaly on most recent day where don't leave bedroom
        roompersnorm[-1, :] = np.array([0.8, 0.2])

        # detect outliers
        roompersdetector = HourlyRoomPercentageAnomalyDetection(roompersnorm, eps=0.3, min_samples=3)
        labels = roompersdetector.scale_and_proximity_cluster(eps=0.3, min_samples=3)

        # plot results
        plt.figure()
        seenflag1 = False; seenflag2 = False; seenflag3 = False;
        for i, label in enumerate(labels):
            if label == 0:
                if seenflag1:
                    plt.plot(roompersnorm[i][0], roompersnorm[i][1], 'ro')
                else:
                    plt.plot(roompersnorm[i][0], roompersnorm[i][1], 'ro', label='Cluster 1')
                    seenflag1 = True
            elif label == 1:
                if seenflag2:
                    plt.plot(roompersnorm[i][0], roompersnorm[i][1], 'kx')
                else:
                    plt.plot(roompersnorm[i][0], roompersnorm[i][1], 'kx', label='Cluster 2')
                    seenflag2 = True
            elif label == -1:
                if seenflag3:
                    plt.plot(roompersnorm[i][0], roompersnorm[i][1], 'b^')
                else:
                    plt.plot(roompersnorm[i][0], roompersnorm[i][1], 'b^', label='Outlier')
                    seenflag3 = True
        plt.legend(loc='lower left')
        plt.axis([0, 1, 0, 1])
        plt.show()
def test_calibration_multiclass():
    """Test calibration for multiclass """
    # test multi-class setting with classifier that implements
    # only decision function
    clf = LinearSVC()
    X, y_idx = make_blobs(n_samples=100, n_features=2, random_state=42,
                          centers=3, cluster_std=3.0)

    # Use categorical labels to check that CalibratedClassifierCV supports
    # them correctly
    target_names = np.array(['a', 'b', 'c'])
    y = target_names[y_idx]

    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf.fit(X_train, y_train)
    for method in ['isotonic', 'sigmoid']:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=2)
        cal_clf.fit(X_train, y_train)
        probas = cal_clf.predict_proba(X_test)
        assert_array_almost_equal(np.sum(probas, axis=1), np.ones(len(X_test)))

        # Check that log-loss of calibrated classifier is smaller than
        # log-loss of naively turned OvR decision function to probabilities
        # via softmax
        def softmax(y_pred):
            e = np.exp(-y_pred)
            return e / e.sum(axis=1).reshape(-1, 1)

        uncalibrated_log_loss = \
            log_loss(y_test, softmax(clf.decision_function(X_test)))
        calibrated_log_loss = log_loss(y_test, probas)
        assert_greater_equal(uncalibrated_log_loss, calibrated_log_loss)

    # Test that calibration of a multiclass classifier decreases log-loss
    # for RandomForestClassifier
    X, y = make_blobs(n_samples=100, n_features=2, random_state=42,
                      cluster_std=3.0)
    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf.fit(X_train, y_train)
    clf_probs = clf.predict_proba(X_test)
    loss = log_loss(y_test, clf_probs)

    for method in ['isotonic', 'sigmoid']:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=3)
        cal_clf.fit(X_train, y_train)
        cal_clf_probs = cal_clf.predict_proba(X_test)
        cal_loss = log_loss(y_test, cal_clf_probs)
        assert_greater(loss, cal_loss)
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def check_clustering(name, Alg):
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant with
    # respect to class imbalance if the number of samples is identical.
    # The test uses a balanced two class dataset with 100 datapoints.
    # It creates three versions, one where class 1 is duplicated
    # resulting in 150 points of class 1 and 50 of class 0,
    # one where there are 50 points in class 1 and 150 in class 0,
    # and one where there are 100 points of each class (this one is balanced
    # again).
    # With balancing class weights, all three should give the same model.
    X, y = make_blobs(centers=2, random_state=0)
    # create dataset where class 1 is duplicated twice
    X_1 = np.vstack([X] + [X[y == 1]] * 2)
    y_1 = np.hstack([y] + [y[y == 1]] * 2)
    # create dataset where class 0 is duplicated twice
    X_0 = np.vstack([X] + [X[y == 0]] * 2)
    y_0 = np.hstack([y] + [y[y == 0]] * 2)
    # duplicate everything
    X_ = np.vstack([X] * 2)
    y_ = np.hstack([y] * 2)
    # results should be identical
    logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)
def check_pipeline_consistency(name, Estimator):
    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only depending
        # on scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        msg = name + ' is non deterministic on 32bit Python'
        raise SkipTest(msg)

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    y = multioutput_estimator_convert_y_2d(name, y)
    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)
    funcs = ["score", "fit_transform"]
    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func_pipeline = getattr(pipeline, func_name)
            result = func(X, y)
            result_pipe = func_pipeline(X, y)
            assert_array_almost_equal(result, result_pipe)
def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if name == 'MiniBatchDictLearning' or name == 'MiniBatchSparsePCA':
        # FIXME
        # for MiniBatchDictLearning and MiniBatchSparsePCA
        estimator.batch_size = 1

    set_fast_parameters(estimator)

    set_random_state(estimator)

    params = estimator.get_params()
    estimator.fit(X, y)
    new_params = estimator.get_params()
    for k, v in params.items():
        assert_false(np.any(new_params[k] != v),
                     "Estimator %s changes its parameter %s"
                     " from %s to %s during fit."
                     % (name, k, v, new_params[k]))
def separable_demo():
    """ Generate a linearly-separable dataset D, train a linear SVM on
    D, then output the resulting decision boundary on a figure.
    """
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=200, n_features=2, 
                      centers=((0,0), (4, 4)),
                      cluster_std=1.0)
    plot_data(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    svc = svm.SVC(class_weight='auto')
    param_grid = {'kernel': ['linear'],
                  'C': [1e0, 1e1, 1e2, 1e3, 1e4]}
    strat_2fold = StratifiedKFold(y_train, k=2)
    print "    Parameters to be chosen through cross validation:"
    for name, vals in param_grid.iteritems():
        if name != 'kernel':
            print "        {0}: {1}".format(name, vals)
    clf = GridSearchCV(svc, param_grid, n_jobs=1, cv=strat_2fold)
    clf.fit(X_train, y_train)
    print "== Best Parameters:", clf.best_params_
    y_pred = clf.predict(X_test)
    acc = len(np.where(y_pred == y_test)[0]) / float(len(y_pred))
    print "== Accuracy:", acc
    print classification_report(y_test, y_pred)
    plot_svm(clf.best_estimator_, X, y, X_test, y_test, 
             title="SVM Decision Boundary, Linear Kernel ({0} accuracy, C={1})".format(acc, clf.best_params_['C']))
def check_class_weight_classifiers(name, Classifier):
    if name == "NuSVC":
        # the sparse version has a parameter that doesn't do anything
        raise SkipTest
    if name.endswith("NB"):
        # NaiveBayes classifiers have a somewhat different interface.
        # FIXME SOON!
        raise SkipTest

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                            random_state=0)
        n_centers = len(np.unique(y_train))

        if n_centers == 2:
            class_weight = {0: 1000, 1: 0.0001}
        else:
            class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

        with warnings.catch_warnings(record=True):
            classifier = Classifier(class_weight=class_weight)
        if hasattr(classifier, "n_iter"):
            classifier.set_params(n_iter=100)
        if hasattr(classifier, "min_weight_fraction_leaf"):
            classifier.set_params(min_weight_fraction_leaf=0.01)

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        assert_greater(np.mean(y_pred == 0), 0.89)
Example #15
def test_transformers_data_not_an_array():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1

    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time
        this_X = NotAnArray(X.copy())
        this_y = NotAnArray(np.asarray(y))
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # And these want multivariate output
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            continue
        yield check_transformer, name, Transformer, this_X, this_y
def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter="classifier")

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers if "class_weight" in c[1]().get_params().keys()]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue
            if n_centers == 2:
                class_weight = {0: 1000, 1: 0.0001}
            else:
                class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

            with warnings.catch_warnings(record=True):
                classifier = Classifier(class_weight=class_weight)
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)
            assert_greater(np.mean(y_pred == 0), 0.9)
Example #17
def test_decision_function_shape_two_class():
    for n_classes in [2, 3]:
        X, y = make_blobs(centers=n_classes, random_state=0)
        for estimator in [svm.SVC, svm.NuSVC]:
            clf = OneVsRestClassifier(estimator(
                decision_function_shape="ovr")).fit(X, y)
            assert_equal(len(clf.predict(X)), len(y))
Example #18
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    classes = np.unique(y)
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78,
                       "accuracy of %s not greater than 0.78" % str(Clf))
        assert_array_equal(
            clf.classes_, classes,
            "Unexpected classes_ attribute for %r" % clf)
Example #19
def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    set_fast_parameters(estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert_equal(joblib.hash(new_value), joblib.hash(original_value),
                     "Estimator %s should not change or mutate "
                     "the parameter %s from %s to %s during fit."
                     % (name, param_name, original_value, new_value))
Example #20
def test_sag_pobj_matches_logistic_regression():
    """tests if the sag pobj matches log reg"""
    n_samples = 100
    alpha = 1.0
    max_iter = 20
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                      cluster_std=0.1)

    clf1 = LogisticRegression(solver='sag', fit_intercept=False, tol=.0000001,
                              C=1. / alpha / n_samples, max_iter=max_iter,
                              random_state=10)
    clf2 = clone(clf1)
    clf3 = LogisticRegression(fit_intercept=False, tol=.0000001,
                              C=1. / alpha / n_samples, max_iter=max_iter,
                              random_state=10)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    clf3.fit(X, y)

    pobj1 = get_pobj(clf1.coef_, alpha, X, y, log_loss)
    pobj2 = get_pobj(clf2.coef_, alpha, X, y, log_loss)
    pobj3 = get_pobj(clf3.coef_, alpha, X, y, log_loss)

    assert_array_almost_equal(pobj1, pobj2, decimal=4)
    assert_array_almost_equal(pobj2, pobj3, decimal=4)
    assert_array_almost_equal(pobj3, pobj1, decimal=4)
Example #21
def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def test_grid_search_correct_score_results():
    # test that correct scores are used
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        result_keys = list(results.keys())
        expected_keys = (("mean_test_score", "rank_test_score") +
                         tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits)))
        assert_true(all(in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            cv_scores = np.array(
                list(grid_search.cv_results_['split%d_test_score'
                                             % s][candidate_i]
                     for s in range(n_splits)))
            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])
def plot_scaling():
    X, y = make_blobs(n_samples=50, centers=2, random_state=4, cluster_std=1)
    X += 3

    plt.figure(figsize=(15, 8))
    main_ax = plt.subplot2grid((2, 4), (0, 0), rowspan=2, colspan=2)

    main_ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm2, s=60)
    maxx = np.abs(X[:, 0]).max()
    maxy = np.abs(X[:, 1]).max()

    main_ax.set_xlim(-maxx + 1, maxx + 1)
    main_ax.set_ylim(-maxy + 1, maxy + 1)
    main_ax.set_title("Original Data")
    other_axes = [plt.subplot2grid((2, 4), (i, j)) for j in range(2, 4) for i in range(2)]

    for ax, scaler in zip(other_axes, [StandardScaler(), RobustScaler(),
                                       MinMaxScaler(), Normalizer(norm='l2')]):
        X_ = scaler.fit_transform(X)
        ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap=cm2, s=60)
        ax.set_xlim(-2, 2)
        ax.set_ylim(-2, 2)
        ax.set_title(type(scaler).__name__)

    other_axes.append(main_ax)

    for ax in other_axes:
        ax.spines['left'].set_position('center')
        ax.spines['right'].set_color('none')
        ax.spines['bottom'].set_position('center')
        ax.spines['top'].set_color('none')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')
Example #24
def test_verbose_boolean():
    # checks that the verbose output is the same for the
    # flag values 1 and True
    # simple 3 cluster dataset
    X, y = make_blobs(random_state=1)
    for Model in [DPGMM, VBGMM]:
        dpgmm_bool = Model(n_components=10, random_state=1, alpha=20,
                           n_iter=50, verbose=True)
        dpgmm_int = Model(n_components=10, random_state=1, alpha=20,
                          n_iter=50, verbose=1)

        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            # generate output with the boolean flag
            dpgmm_bool.fit(X)
            verbose_output = sys.stdout
            verbose_output.seek(0)
            bool_output = verbose_output.readline()
            # generate output with the int flag
            dpgmm_int.fit(X)
            verbose_output = sys.stdout
            verbose_output.seek(0)
            int_output = verbose_output.readline()
            assert_equal(bool_output, int_output)
        finally:
            sys.stdout = old_stdout
Example #25
def test_decision_function_shape():
    # check that decision_function_shape='ovr' gives
    # correct shape and is consistent with predict

    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovr').fit(iris.data, iris.target)
    dec = clf.decision_function(iris.data)
    assert_equal(dec.shape, (len(iris.data), 3))
    assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1))

    # with five classes:
    X, y = make_blobs(n_samples=80, centers=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovr').fit(X_train, y_train)
    dec = clf.decision_function(X_test)
    assert_equal(dec.shape, (len(X_test), 5))
    assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1))

    # check shape with decision_function_shape='ovo'
    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovo').fit(X_train, y_train)
    dec = clf.decision_function(X_train)
    assert_equal(dec.shape, (len(X_train), 10))

    # check deprecation warning
    clf = svm.SVC(kernel='linear', C=0.1).fit(X_train, y_train)
    msg = "change the shape of the decision function"
    dec = assert_warns_message(ChangedBehaviorWarning, msg,
                               clf.decision_function, X_train)
    assert_equal(dec.shape, (len(X_train), 10))
Example #26
def single_calc(n_sample):
    #n_sample = 1000
    n_feature = 2
    cluster_std = 0.5
    center = 2

    #for n_sample in [10, 50, 100, 500, 1000, 5000, 10000]:

    pts, labels = datasets.make_blobs(n_samples=n_sample, n_features=n_feature, cluster_std=cluster_std, centers=center)
    start = timer()
    tri = Tri(pts)
    end = timer()
    tri_time = end - start
    print(tri_time)

    #tri_res = compare_labels(labels, tri.labels)
    start = timer()
    auto = Autoclust(pts)
    end = timer()
    auto_time = end - start
    print(auto_time)
    #auto_res = compare_labels(labels, auto.labels)
    res_dict = {'tri': tri_time, 'auto': auto_time, 'samples': n_sample}

    with open('times', 'a') as f:
        print(res_dict, file=f)
Example #27
def test_vbgmm_no_modify_alpha():
    alpha = 2.
    n_components = 3
    X, y = make_blobs(random_state=1)
    vbgmm = VBGMM(n_components=n_components, alpha=alpha, n_iter=1)
    assert_equal(vbgmm.alpha, alpha)
    assert_equal(vbgmm.fit(X).alpha_, float(alpha) / n_components)
def test_grid_search_iid():
    # test the iid parameter
    # noise-free simple 2d-data
    X, y = make_blobs(
        centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0, cluster_std=0.1, shuffle=False, n_samples=80
    )
    # split dataset into two folds that are not iid
    # first one contains data of all 4 blobs, second only from two.
    mask = np.ones(X.shape[0], dtype=np.bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    svm = SVC(kernel="linear")
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]
    # once with iid=True (default)
    grid_search = GridSearchCV(svm, param_grid={"C": [1, 10]}, cv=cv)
    grid_search.fit(X, y)
    first = grid_search.grid_scores_[0]
    assert_equal(first.parameters["C"], 1)
    assert_array_almost_equal(first.cv_validation_scores, [1, 1.0 / 3.0])
    # for first split, 1/4 of dataset is in test, for second 3/4.
    # take weighted average
    assert_almost_equal(first.mean_validation_score, 1 * 1.0 / 4.0 + 1.0 / 3.0 * 3.0 / 4.0)

    # once with iid=False
    grid_search = GridSearchCV(svm, param_grid={"C": [1, 10]}, cv=cv, iid=False)
    grid_search.fit(X, y)
    first = grid_search.grid_scores_[0]
    assert_equal(first.parameters["C"], 1)
    # scores are the same as above
    assert_array_almost_equal(first.cv_validation_scores, [1, 1.0 / 3.0])
    # averaged score is just mean of scores
    assert_almost_equal(first.mean_validation_score, np.mean(first.cv_validation_scores))
def check_transformer_general(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, Transformer, X, y)
    _check_transformer(name, Transformer, X.tolist(), y.tolist())
def test_search_cv_results_rank_tie_breaking():
    X, y = make_blobs(n_samples=50, random_state=42)

    # The two C values are close enough to give similar models
    # which would result in a tie of their mean cv-scores
    param_grid = {'C': [1, 1.001, 0.001]}

    grid_search = GridSearchCV(SVC(), param_grid=param_grid)
    random_search = RandomizedSearchCV(SVC(), n_iter=3,
                                       param_distributions=param_grid)

    for search in (grid_search, random_search):
        search.fit(X, y)
        results = search.cv_results_
        # Check tie breaking strategy -
        # Check that there is a tie in the mean scores between
        # candidates 1 and 2 alone
        assert_almost_equal(results['mean_test_score'][0],
                            results['mean_test_score'][1])
        try:
            assert_almost_equal(results['mean_test_score'][1],
                                results['mean_test_score'][2])
        except AssertionError:
            pass
        # 'min' rank should be assigned to the tied candidates
        assert_almost_equal(search.cv_results_['rank_test_score'], [1, 1, 3])
Example #31
#output1 = np.empty(len(x_input))
#output2 = np.empty(len(x_input))
#for i in range(len(x_input)):
#    a = A[1,1]
#    b = (A[1,0] + A[0,1])*x[i] + alpha[1]
#    c = A[0,0] * x[i]**2 + alpha[0] * x[i] + alpha0
#    output1[i], output2[i] = solve_quadratic_eq(a,b,c)
#
#plt.figure()
#plt.scatter(train["Height"], train["Weight"], c=y)
#plt.plot(x,output1, label = "first sol")
#plt.plot(x,output2, label = "second sol")
#plt.legend()

##########################################
X, y = make_blobs(300, 2, centers=2, random_state=100)

X[y == 0] = X[y == 0] + 2
X[y == 0] = np.dot(np.array([[0.5, 0.1], [1.2, 1.5]]), X[y == 0].T).T
X = X - np.mean(X, axis=0)
class1 = X[y == 0]
class2 = X[y == 1]
prior1 = len(class1) / len(X)
prior2 = len(class2) / len(X)
cov1 = np.cov(class1.T)
cov2 = np.cov(class2.T)
#cov1[1,0] = cov1[0,1] = cov2[0,1] = cov2[1,0] = 0
mean1 = np.mean(np.array(class1), axis=0)
mean2 = np.mean(np.array(class2), axis=0)

A = 0.5 * (np.linalg.inv(cov1) - np.linalg.inv(cov2))
Example #32
                           **kwargs)


if __name__ == '__main__':
    n_samples = 1000
    random_state = 170
    transformation = [[0.6, -0.6], [-0.4, 0.8]]

    models = [
        {
            'name':
            'Far Blobs',
            'X':
            datasets.make_blobs(n_samples=n_samples,
                                centers=25,
                                random_state=0,
                                center_box=(-10000, 10000),
                                cluster_std=50)[0],
        },
        {
            'name':
            'Noisy Circles',
            'X':
            datasets.make_circles(n_samples=n_samples, factor=.5,
                                  noise=.05)[0],
        },
        {
            'name': 'Noisy Moons',
            'X': datasets.make_moons(n_samples=n_samples, noise=.05)[0],
        },
        {





# # Grouping objects by similarity using k-means

# ## K-means clustering using scikit-learn




X, y = make_blobs(n_samples=150,
                  n_features=2, 
                  centers=3,
                  cluster_std=0.5, 
                  shuffle=True, 
                  random_state=0)





plt.scatter(X[:, 0], X[:, 1], 
            c='white', marker='o', edgecolor='black', s=50)
plt.grid()
plt.tight_layout()
#plt.savefig('images/11_01.png', dpi=300)
plt.title("some random data")
plt.show()
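# Hedged sketch (assumption, not part of the original notebook cell): the heading above
# promises k-means clustering with scikit-learn, but only the data generation is shown.
# Using the standard sklearn.cluster.KMeans API, the three blobs can be clustered and
# plotted together with their centroids:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=0)
y_km = km.fit_predict(X)

for label, marker in zip(range(3), ['o', 's', 'v']):
    plt.scatter(X[y_km == label, 0], X[y_km == label, 1],
                s=50, marker=marker, edgecolor='black',
                label='cluster %d' % (label + 1))
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            s=250, marker='*', c='red', edgecolor='black', label='centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.title("k-means clustering of the blobs")
plt.show()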
Example #34
plt.show()

# Random data for a classification model
# X1 holds the sample features and Y1 the class labels: 400 samples, 2 features each,
# 3 classes, no redundant features, one cluster per class
X1, Y1 = make_classification(n_samples=400,
                             n_features=2,
                             n_redundant=0,
                             n_clusters_per_class=1,
                             n_classes=3)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.show()

# Generate isotropic Gaussian blobs for clustering
# X holds the sample features and y the cluster labels: 1000 samples, 2 features each,
# 3 clusters centered at [-1, -1], [1, 1] and [2, 2] with cluster standard deviations 0.4, 0.5 and 0.2
X, y = make_blobs(n_samples=1000,
                  n_features=2,
                  centers=[[-1, -1], [1, 1], [2, 2]],
                  cluster_std=[0.4, 0.5, 0.2])
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
plt.show()

# Grouped Gaussian-quantile data
# Generate a 2-D normal distribution and split the data into 3 groups by quantile:
# 1000 samples, feature means 1 and 2, covariance 2
X1, Y1 = make_gaussian_quantiles(n_samples=1000,
                                 n_features=2,
                                 n_classes=3,
                                 mean=[1, 2],
                                 cov=2)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.show()
Example #35
def mejora_semiboost(n=20,
                     clf=SVC(probability=True),
                     n_features=5,
                     n_samples=1000,
                     ratio_unsampled=0.5,
                     data_simulation='make_classification',
                     similarity_kernel='rbf'):

    ROC_semiboost = list()
    ROC_clf = list()

    for i in range(n):
        ''' SIMULATE SEMI SUPERVISED DATASET '''
        if data_simulation == 'make_classification':
            X, y = make_classification(n_features=n_features,
                                       n_samples=n_samples,
                                       n_redundant=0,
                                       n_clusters_per_class=1)

        elif data_simulation == 'make_blobs':
            X, y = make_blobs(n_features=n_features,
                              centers=2,
                              n_samples=n_samples)

        elif data_simulation == 'make_gaussian_quantiles':
            X, y = make_gaussian_quantiles(n_features=n_features,
                                           n_classes=2,
                                           n_samples=n_samples)

        elif data_simulation == 'make_moons':
            X, y = make_moons(n_samples=n_samples)

        elif data_simulation == 'make_circles':
            X, y = make_circles(n_samples=n_samples)

        else:
            print('Unknown data simulation method')

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)

        labels = np.copy(y_train)
        labels[labels == 0] = -1

        # create some unlabeled data
        random_unlabeled_points = np.random.rand(
            len(y_train)) < ratio_unsampled
        labels[random_unlabeled_points] = 0
        y_train = labels
        ''' SEMIBOOST SKLEARN STYLE '''
        model = SemiBoost.SemiBoostClassifier(base_model=clf)
        model.fit(X_train,
                  y_train,
                  n_neighbors=3,
                  n_jobs=10,
                  max_models=15,
                  similarity_kernel='rbf',
                  verbose=False)

        ROC_semiboost.append(roc_auc_score(y_test, model.predict(X_test)))
        ''' BASE CLASSIFIER '''
        model = clf
        XX = X_train[~random_unlabeled_points, ]
        yy = y_train[~random_unlabeled_points]
        model.fit(XX, yy)
        ROC_clf.append(roc_auc_score(y_test, model.predict(X_test)))

    return (np.mean(np.array(ROC_semiboost) - np.array(ROC_clf)),
            np.std(np.array(ROC_semiboost) - np.array(ROC_clf)))
Example #36
def simulate_normal_clusters(N, ndim, centers=4, center_box=(-8, 8), **kwds):
    return make_blobs(N, ndim, centers=centers, center_box=center_box, **kwds)
Example #37
# K Means Outlier Detection On Make_Blobs DataSet

# Generate a single blob of 100 points
# Identify the five points that are furthest from the centroid
from sklearn.datasets import make_blobs
X, labels = make_blobs(100, centers=1)

# The k-means model should have a single center in most cases
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=1)
kmeans.fit(X)

# Visualize the blobs with a scatter plot to see the centroid
import matplotlib.pyplot as plt
f, ax = plt.subplots(figsize=(8, 5))
ax.set_title("Blob")
ax.scatter(X[:, 0], X[:, 1], label='Points')
ax.scatter(kmeans.cluster_centers_[:, 0],
           kmeans.cluster_centers_[:, 1],
           label='Centroid')
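# Hedged sketch (assumption, not shown in the original snippet): the goal stated above is
# to identify the five points that are furthest from the centroid. With a single cluster,
# ranking points by their Euclidean distance to kmeans.cluster_centers_[0] does this.
import numpy as np
distances = np.linalg.norm(X - kmeans.cluster_centers_[0], axis=1)
furthest_idx = np.argsort(distances)[-5:]   # indices of the 5 most distant points
ax.scatter(X[furthest_idx, 0], X[furthest_idx, 1],
           label='Furthest 5', facecolors='none', edgecolors='g', s=120)
ax.legend()
plt.show()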
import matplotlib.pyplot as plt

def plot_data(X, y, figsize=None):
    if not figsize:
        figsize = (8, 6)
    plt.figure(figsize=figsize)
    plt.plot(X[y==0, 0], X[y==0, 1], 'or', alpha=0.5, label=0)
    plt.plot(X[y==1, 0], X[y==1, 1], 'ob', alpha=0.5, label=1)
    plt.xlim((min(X[:, 0])-0.1, max(X[:, 0])+0.1))
    plt.ylim((min(X[:, 1])-0.1, max(X[:, 1])+0.1))
    plt.legend()
    
X, y = make_blobs(n_samples=500,
                  n_features=2,
                  centers=4,
                  cluster_std=1,
                  center_box=(-10.0, 10.0),
                  shuffle=True,
                  random_state=1) 
plot_data(X, y)

kmeans_model = cluster.KMeans(n_clusters=2, random_state=1)
kmeans_model.fit(X)
kmeans_model.cluster_centers_
kmeans_model.labels_

#metrics when target labels are not known
silhouette_avg = metrics.silhouette_score(X,kmeans_model.labels_,metric='euclidean')
print(silhouette_avg)
silhouette_samples = metrics.silhouette_samples(X,kmeans_model.labels_,metric='euclidean')
print(silhouette_samples)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')

    markers = ['o', 'd', '^', 'x', '1', '2', '3', 's']
    colors = ['r', 'b', 'g', 'c', 'm', 'k', 'y', '#cccfff']

    for i in range(nb_samples):
        ax.scatter(X[i, 0], X[i, 1], marker=markers[Y[i]], color=colors[Y[i]])

    plt.show()


if __name__ == '__main__':
    # Create the dataset
    X, Y = make_blobs(n_samples=nb_samples,
                      n_features=2,
                      centers=8,
                      cluster_std=2.0)

    # Show the dataset
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))

    ax.grid()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')

    ax.scatter(X[:, 0], X[:, 1], marker='o', color='b')
    plt.show()

    # Complete linkage
    print('Complete linkage')
    ac = AgglomerativeClustering(n_clusters=8, linkage='complete')
Example #40
        rowSums = np.sum(self.affMat, axis=1)
        dmax = np.max(rowSums)
        D = np.diag(rowSums)
        L = (self.affMat + dmax * np.eye(D.shape[0]) - D) / dmax

        values, vectors = np.linalg.eig(L)
        assert np.all(np.isreal(values))

        bigEigInd = np.argsort(-values)
        return vectors[:, bigEigInd[:self.n_clusters]]

    def apply_constraints(self):
        self.affMat[self.ML[:, 0], self.ML[:, 1]] = 1
        self.affMat[self.CL[:, 0], self.CL[:, 1]] = 0


if __name__ == '__main__':
    Nclusters, N, Nconstraints = (3, 100, 40)
    data, labels = ds.make_blobs(n_samples=N, n_features=2, centers=Nclusters)

    constraintMat = ConstrainedClustering.make_constraints(
        labels,
        data=data,
        method='mmffqs',
        Nconstraints=Nconstraints,
        errRate=0)

    plt.figure()
    ConstrainedClustering.plot_constraints(data, constraintMat)
    plt.show()
Example #41
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from scipy.cluster.hierarchy import dendrogram, ward

X, y = make_blobs(n_samples=12, random_state=0)

linkage_array = ward(X)
dendrogram(linkage_array)

ax = plt.gca()
bounds = ax.get_xbound()
ax.plot(bounds, [7.25, 7.25], '--', c='k')
ax.plot(bounds, [4, 4], '--', c='k')
ax.text(bounds[1], 7.25, ' two clusters', va='center')
ax.text(bounds[1], 4, ' three clusters', va='center')
ax.set_xlabel('Sample index')
ax.set_ylabel('Cluster distance')
plt.show()
import matplotlib.pyplot as plt

from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples,
                                      factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)

clustering_names = [
    'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
    'SpectralClustering', 'Ward', 'AgglomerativeClustering', 'DBSCAN', 'Birch'
]

plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02,
                    right=.98,
                    bottom=.001,
                    top=.96,
Example #43
if __name__ == '__main__':
    from sklearn.datasets import make_blobs
    import matplotlib.pyplot as plt
    import numpy as np
    import argparse
    # argparse.ArgumentParser(prog=None, usage=None, description=None, epilog=None, parents=[], formatter_class=argparse.HelpFormatter, prefix_chars='-', fromfile_prefix_chars=None, argument_default=None, conflict_handler='error', add_help=True, allow_abbrev=True, exit_on_error=True)
    parser = argparse.ArgumentParser(description='')
    # parser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest])
    parser.add_argument('-a', '--arg1')
    args = parser.parse_args()

    # #############################################################################
    # Generate sample data
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                                random_state=0)
    # #############################################################################
    n_clusters, labels, core_samples = DBscan(X)
    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[core_samples] = True

    # Plot result
    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]
        class_member_mask = (labels == k)
        xy = X[class_member_mask & core_samples_mask]
'''
@Author: Runsen
@WeChat official account: 润森笔记
@Blog: https://blog.csdn.net/weixin_44510615
@Date: 2020/5/2
'''

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, silhouette_score, calinski_harabasz_score
x, y = datasets.make_blobs(400, n_features=2, centers=4, random_state=0)
model = KMeans(n_clusters=4)
model.fit(x)
y_pred = model.predict(x)
print(" 调整兰德系数: " + str(adjusted_rand_score(y, y_pred)))
print(" 轮廓系数: " + str(silhouette_score(x, y_pred)))
print(" CH分数: " + str(calinski_harabasz_score(x, y_pred)))
neptune.stop()

## Explore Results

# Scikit-learn KMeans clustering

## Step 1: Create KMeans object and example data

parameters = {'n_init': 11, 'max_iter': 270}

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

km = KMeans(**parameters)

X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

## Step 2: Initialize Neptune

import neptune

neptune.init('shared/sklearn-integration', api_token='ANONYMOUS')

## Step 3: Create an Experiment

neptune.create_experiment(params=parameters,
                          name='clustering-example',
                          tags=['KMeans', 'clustering'])

## Step 4: Log KMeans clustering summary
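# Hedged sketch (assumption, not part of the original snippet): with the legacy
# neptune-client API used above, Step 4 could be finished by fitting the model and
# logging a couple of clustering metrics with neptune.log_metric before closing the run.
from sklearn.metrics import silhouette_score

km.fit(X)
neptune.log_metric('inertia', km.inertia_)
neptune.log_metric('silhouette', silhouette_score(X, km.labels_))

neptune.stop()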
from main import mglearn, train_test_split, plt, np

from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, ward

X, y = make_blobs(random_state=0, n_samples=12)

linkage_array = ward(X)
dendrogram(linkage_array)

ax = plt.gca()
bounds = ax.get_xbound()
ax.plot(bounds, [7.25, 7.25], '--', c='k')
ax.plot(bounds, [4, 4], '--', c='k')

ax.text(bounds[1], 7.25, 'two clusters', va='center', fontdict={"size": 15})
ax.text(bounds[1], 4, 'three clusters', va='center', fontdict={"size": 15})

# agg = AgglomerativeClustering(n_clusters=3)
# assignment = agg.fit_predict(X)

# mglearn.discrete_scatter(X[:, 0], X[:, 1], assignment)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

plt.show()
Example #47

#    plt.show()

RS = 11

name = 'wq_random_project'
target = 'quality'
train = pd.read_csv(f'wine_train.csv')
test = pd.read_csv(f'wine_test.csv')
full = pd.concat([train, test])
y = np.array(train.loc[:, target])
X = np.array(train.drop(target, axis=1))

name = 'gb_random_project'
X, y = make_blobs(centers=6, n_features=2, n_samples=1000, random_state=11)

n_pairs = 100
np.random.seed(RS)
sample_idxs = np.random.choice(range(X.shape[0]),
                               size=2 * n_pairs,
                               replace=False)
x_vals = np.array(range(1, X.shape[1] + 1))
y_vals = []
for n_components in x_vals:
    print(X.shape[1], '->', n_components)
    np.random.seed(RS)
    best_mean = np.inf
    for rs in np.random.choice(range(1000), size=5, replace=False):
        transformer = GaussianRandomProjection(random_state=rs,
                                               n_components=n_components)
Example #48
ap.add_argument("-a",
                "--alpha",
                type=float,
                default=0.01,
                help="learning rate")
ap.add_argument("-b",
                "--batch-size",
                type=int,
                default=32,
                help="size of SGD mini-batches")
args = vars(ap.parse_args())

# generate a 2-class classification problem with 1000 data points, where each data point is a 2D feature vector
(X, y) = make_blobs(n_samples=1000,
                    n_features=2,
                    centers=2,
                    cluster_std=1.5,
                    random_state=1)
y = y.reshape((y.shape[0], 1))

# insert a column of 1's as the last entry in the feature matrix -- this little trick allows us to treat
# the bias as a trainable parameter within the weight matrix
X = np.c_[X, np.ones((X.shape[0]))]

# partition the data into training and testing splits using 50% of
# the data for training and the remaining 50% for testing
(trainX, testX, trainY, testY) = train_test_split(X,
                                                  y,
                                                  test_size=0.5,
                                                  random_state=42)
Example #49
        self.means = np.zeros([self.kk, self.mm])

        for n in range(self.nn):

            self.means[self.assign[n]] += self.data[n]

        for k in range(self.kk):

            self.means[k] /= self.count[k]


if __name__ == "__main__":

    features, target = make_blobs(n_samples=1000,
                                  n_features=2,
                                  centers=3,
                                  cluster_std=0.5,
                                  shuffle=True,
                                  random_state=1)

    #plt.scatter(features[:, 0], features[:, 1], c = target)

    #plt.show()

    centers = np.array([[2.0, 2.0], [-1.0, -5.0], [-5.0, -1.0]])

    model = Kmeans(data=features, means=centers)

    plt.scatter(features[:, 0], features[:, 1], c=target)
    plt.show()

    plt.scatter(features[:, 0], features[:, 1], c=model.assign)
    plt.show()


import random

import numpy as np
from sklearn.datasets import make_blobs, make_circles, make_moons

# rotate_2d, rescale_data and plot_data are helper functions defined elsewhere
# in the original source (not shown here).


def rand_bool():
    # return True or False with equal probability
    return bool(random.getrandbits(1))


# A cluster generator creates a cluster in the range [-1, 1]. It returns a tuple (points, cluster_indices).
cluster_generators = []
cluster_generators.append(
    (2, lambda samples: make_circles(n_samples=samples,
                                     noise=random.uniform(0, 0.08),
                                     factor=random.uniform(0.1, 0.4))))

circles = make_circles(n_samples=15,
                       noise=random.uniform(0, 0.08),
                       factor=random.uniform(0.1, 0.4))

for i in range(10):
    data = make_blobs(50, centers=5)
    data = make_moons(n_samples=101, noise=random.uniform(0, 0.08))
    data_points = data[0]

    if rand_bool():
        data_points = -data_points
    data_points = rotate_2d(data_points, random.uniform(0, np.pi * 2))

    data_points = rescale_data(data_points, [0, 1], [0, 1])

    data = (data_points, data[1])

    plot_data(data)

# for i in range(10):
#     plot_data(cluster_generators[0](40))
# import Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
# import plotting tools
import matplotlib.pyplot as plt
# import the dataset generator
from sklearn.datasets import make_blobs
# import the train/test split utility
from sklearn.model_selection import train_test_split
import numpy as np

# generate a dataset with 500 samples and 5 centers
X, y = make_blobs(n_samples=500, centers=5, random_state=8)
# split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

# fit a Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)

print('\nResults:')
print('Training set score: {:.3f}'.format(gnb.score(X_train, y_train)))
print('Test set score: {:.3f}'.format(gnb.score(X_test, y_test)))

# set the plotting limits for the x and y axes
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
# color the background by the predicted class
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
z = gnb.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap=plt.cm.Spectral)
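# Hedged completion sketch: the original snippet is truncated here. A natural
# final step is to overlay the data points on the decision regions and display.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral, edgecolors='k')
plt.show()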
Example #52
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
from sklearn.svm import SVC
from sklearn.datasets import (make_blobs, make_circles, make_classification,
                              make_gaussian_quantiles, make_moons)
from sklearn.model_selection import train_test_split
# plot_decision_regions is assumed here to come from mlxtend; SemiBoost is the
# local module that provides SemiBoostClassifier.
from mlxtend.plotting import plot_decision_regions
import SemiBoost


def plot_classification(clf=SVC(probability=True),
                        n_features=2,
                        n_samples=1000,
                        ratio_unsampled=0.99,
                        data_simulation='make_classification'):
    ''' SIMULATE SEMI SUPERVISED DATASET '''
    if data_simulation == 'make_classification':
        X, y = make_classification(n_features=n_features,
                                   n_samples=n_samples,
                                   n_redundant=0,
                                   n_clusters_per_class=1)

    elif data_simulation == 'make_blobs':
        X, y = make_blobs(n_features=n_features,
                          centers=2,
                          n_samples=n_samples)

    elif data_simulation == 'make_gaussian_quantiles':
        X, y = make_gaussian_quantiles(n_features=n_features,
                                       n_classes=2,
                                       n_samples=n_samples)

    elif data_simulation == 'make_moons':
        X, y = make_moons(n_samples=n_samples)

    elif data_simulation == 'make_circles':
        X, y = make_circles(n_samples=n_samples)

    else:
        raise ValueError('Unknown data simulation method: %s' % data_simulation)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    labels = np.copy(y_train)
    labels[labels == 0] = -1

    # create some unlabeled data
    random_unlabeled_points = np.random.rand(len(y_train)) < ratio_unsampled
    labels[random_unlabeled_points] = 0
    y_train = labels
    ''' SEMIBOOST SKLEARN STYLE '''
    model = SemiBoost.SemiBoostClassifier(base_model=clf)
    model.fit(X_train,
              y_train,
              n_jobs=10,
              max_models=10,
              similarity_kernel='rbf',
              verbose=False)
    ''' Plot '''
    gs = gridspec.GridSpec(1, 2)
    fig = plt.figure(figsize=(10, 8))

    ax = plt.subplot(gs[0, 0])
    fig = plot_decision_regions(X=X_test, y=y_test, clf=model, legend=2)
    plt.title('SemiBoost')
    ''' BASE CLASSIFIER '''
    basemodel = clf
    XX = X_train[~random_unlabeled_points, ]
    yy = y_train[~random_unlabeled_points]

    basemodel.fit(XX, yy)
    ''' Plot '''
    ax = plt.subplot(gs[0, 1])
    fig = plot_decision_regions(X=X_test, y=y_test, clf=basemodel, legend=2)
    plt.title('BaseModel')

    plt.show()
Example #53
    def __init__(self,
                 model,
                 population_size,
                 n_blobs,
                 n_features,
                 home_district_in_position,
                 iseed=None):

        self.model = model
        self.roulette_distribution = {}
        self.feature_vector = {}
        self.vector_to_human = {}
        self.vector_to_home = {}
        self.vector_to_classroom = {}
        self.vector_to_office = {}
        self.vector_to_restaurant = {}
        self.unit_info_map = self.unit_info_map()
        n_vec = population_size
        blobs, assignments = make_blobs(
            n_samples=n_vec,
            n_features=n_features,
            centers=n_blobs,
            cluster_std=0.1,  #1.0
            center_box=(-10.0, 10.0),
            shuffle=False,
            random_state=iseed)
        self.n_blobs = n_blobs
        self.home_district_in_position = home_district_in_position
        self.blob_dict = {}
        for vec, assignment in zip(blobs, assignments):
            if assignment not in self.blob_dict:
                self.blob_dict[assignment] = []
            self.blob_dict[assignment].append(vec)
        self.vectors = blobs
        #self.vectors = KeyedVectors(n_features)
        #numlist = range(n_vec)
        #self.vectors.add(numlist,blobs[:])
        #for i in range(n_vec):
        #self.vectors.add_vector(i, blobs[i,:])
        #vectors.add_vector(str(i), blobs[i,:])
        #print (numlist)
        #print(blobs)
        #print (self.vectors)
        for i in range(n_vec):
            #vector1 = self.vectors.get_vector(i)
            vector1 = self.vectors[i]
            tuple_vec1 = tuple(vector1)
            similarities = KeyedVectors.cosine_similarities(
                vector1, self.vectors)
            #print (distances)
            #distances = self.vectors.cosine_similarities(vector1,self.vectors)
            #self.roulette_distribution[tuple_vec1] = {}
            # build a roulette-wheel distribution over the other vectors,
            # proportional to min-shifted cosine similarity to this vector
            temp = {}
            sum_similarities = (similarities - similarities.min()).sum()
            for j in range(n_vec):
                if i != j:
                    vector2 = self.vectors[j]
                    tuple_vec2 = tuple(vector2)
                    temp[tuple_vec2] = (similarities[j] -
                                        similarities.min()) / sum_similarities

            self.roulette_distribution[tuple_vec1] = dict(
                sorted(temp.items(), key=lambda item: -item[1]))
Example #54
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_blobs

# GenerativeSampler is a local helper class imported elsewhere in the
# original source (not a scikit-learn object).

np.random.seed(123)
X, y = make_blobs(n_samples=1000, n_features=10, centers=5, cluster_std=3)

RFC = RandomForestClassifier(n_estimators=80, oob_score=True)
RFC.fit(X, y)
print("oob_score_", RFC.oob_score_)

_x0 = np.random.randn(10)
sample_gen = GenerativeSampler(model=RFC,
                               target_class=0,
                               class_err_prob=1 - RFC.oob_score_,
                               use_empirical=False)
test = sample_gen.run_chain(n=10, x0=_x0)

# Test that class_err_prob self populates correctly
sample_gen = GenerativeSampler(model=RFC,
                               X=X,
                               y=y,
                               target_class=0,
                               use_empirical=False)
#assert sample_gen.class_err_prob == 0
print(
    "calculated class_err_prob", sample_gen.class_err_prob
)  # For RFC this will always be 0 because it's calculated against the training data.
test = sample_gen.run_chain(n=10, x0=_x0)

# test that x0 self populates correctly
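# Hedged sketch of the truncated step above: calling run_chain without an
# explicit x0 should exercise the self-populating start point (assumption;
# GenerativeSampler is a local class whose full behavior is not shown here).
test = sample_gen.run_chain(n=10)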
#!/usr/bin/python
# -*- coding: utf-8 -*-
#[email protected]
"""
层次聚类
自低向上,初始中,每个点作为一类。
"""
print(__doc__)

from sklearn.datasets import make_moons, make_circles, make_blobs
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import matplotlib.pyplot as plt

centers = [[0, 1], [-1, -1], [1, -1]]
X, y = make_blobs(n_samples=1500, random_state=170)
trs = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X = np.dot(X, trs)
"""
层次聚类
===============
参数:
    n_clusters:一个整数,指定分类簇的数量
    linkage:一个字符串,用于指定链接算法
        ‘ward’:单链接single-linkage,采用dmindmin
        ‘complete’:全链接complete-linkage算法,采用dmaxdmax
        ‘average’:均连接average-linkage算法,采用davgdavg
    affinity:一个字符串或者可调用对象,用于计算距离
"""

clt = AgglomerativeClustering(linkage="ward")
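# Hedged continuation sketch: the original snippet stops after constructing the
# estimator. The natural next step is to fit it and color the points by the
# predicted cluster (n_clusters defaults to 2 here).
labels = clt.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=10)
plt.show()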
Example #56
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs

sns.set()  # for plot styling

features, labels = make_blobs(n_samples=300,
                              centers=4,
                              cluster_std=0.60,
                              random_state=0)

plt.scatter(features[:, 0], features[:, 1], s=50)
plt.show()
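# Hedged continuation sketch: the snippet stops after plotting the raw blobs.
# A typical next step is to cluster them with KMeans and color by label.
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, random_state=0).fit(features)
plt.scatter(features[:, 0], features[:, 1], c=kmeans.labels_, s=50)
plt.show()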
# scatter plot of blobs dataset
from sklearn.datasets import make_blobs
from matplotlib import pyplot
from numpy import where
# generate 2d classification dataset
X, y = make_blobs(n_samples=500,
                  centers=3,
                  n_features=2,
                  cluster_std=2,
                  random_state=2)
# scatter plot for each class value
for class_value in range(3):
    # select indices of points with the class label
    row_ix = where(y == class_value)
    # scatter plot for points with a different color
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show plot
pyplot.show()
import numpy as np

from sklearn.datasets import make_blobs
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split


n_samples = 50000
n_bins = 3  # use 3 bins for calibration_curve as we have 3 clusters here

# Generate 3 blobs with 2 classes where the second blob contains
# half positive samples and half negative samples. Probability in this
# blob is therefore 0.5.
centers = [(-5, -5), (0, 0), (5, 5)]
X, y = make_blobs(n_samples=n_samples, centers=centers, shuffle=False,
                  random_state=42)

y[:n_samples // 2] = 0
y[n_samples // 2:] = 1
sample_weight = np.random.RandomState(42).rand(y.shape[0])

# split train, test for calibration
X_train, X_test, y_train, y_test, sw_train, sw_test = \
    train_test_split(X, y, sample_weight, test_size=0.9, random_state=42)

# Gaussian Naive-Bayes with no calibration
clf = GaussianNB()
clf.fit(X_train, y_train)  # GaussianNB itself does not support sample-weights
prob_pos_clf = clf.predict_proba(X_test)[:, 1]

# Gaussian Naive-Bayes with isotonic calibration
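# Hedged completion sketch: the snippet is truncated here. In the upstream
# scikit-learn calibration example this step wraps the classifier in
# CalibratedClassifierCV with method='isotonic' and compares Brier scores.
clf_isotonic = CalibratedClassifierCV(clf, cv=2, method='isotonic')
clf_isotonic.fit(X_train, y_train, sample_weight=sw_train)
prob_pos_isotonic = clf_isotonic.predict_proba(X_test)[:, 1]

print("Brier score (no calibration): %1.3f"
      % brier_score_loss(y_test, prob_pos_clf, sample_weight=sw_test))
print("Brier score (isotonic calibration): %1.3f"
      % brier_score_loss(y_test, prob_pos_isotonic, sample_weight=sw_test))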
Example #59
import numpy as np
import matplotlib.colors
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


def expand(a, b):
    d = (b - a) * 0.1
    return a - d, b + d


if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N,
                            n_features=2,
                            centers=centers,
                            cluster_std=[0.5, 0.25, 0.7, 0.5],
                            random_state=0)
    data = StandardScaler().fit_transform(data)
    # parameters for dataset 1: (epsilon, min_samples)
    params = ((0.2, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))

    # 数据2
    # t = np.arange(0, 2*np.pi, 0.1)
    # data1 = np.vstack((np.cos(t), np.sin(t))).T
    # data2 = np.vstack((2*np.cos(t), 2*np.sin(t))).T
    # data3 = np.vstack((3*np.cos(t), 3*np.sin(t))).T
    # data = np.vstack((data1, data2, data3))
    # # parameters for dataset 2: (epsilon, min_samples)
    # params = ((0.5, 3), (0.5, 5), (0.5, 10), (1., 3), (1., 10), (1., 20))
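    # Hedged continuation sketch: the original snippet is truncated here.
    # A plausible next step is to run DBSCAN once per (epsilon, min_samples)
    # pair defined above and plot each clustering side by side.
    plt.figure(figsize=(12, 8))
    for i, (eps, min_samples) in enumerate(params, start=1):
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)
        plt.subplot(2, 3, i)
        plt.scatter(data[:, 0], data[:, 1], c=labels, s=8)
        plt.title('eps=%.1f, min_samples=%d' % (eps, min_samples))
    plt.show()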
Example #60
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

X, y = make_blobs(n_samples=125, centers=2, cluster_std=0.60, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap="winter")
# plt.show()

model = SVC(kernel='linear')
history = model.fit(X_train, y_train)

# ax = plt.gca()
# xlim = ax.get_xlim()
# ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap="winter", marker='s')
# w = model.coef_[0]
# a = -w[0] / w[1]
# xx = np.linspace(xlim[0], xlim[1])
# yy = a * xx - (model.intercept_[0] / w[1])
# plt.plot(xx, yy)
plt.show()
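# Hedged usage sketch: confusion_matrix is imported above but never used in the
# shown code; a natural final step is to evaluate the fitted SVC on the
# held-out test split.
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))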