def createCluster():
    # TODO: for the example to work properly with the new multi-class support,
    # this method should be changed to create multi-class data. I'm not doing
    # that here since I don't need it; the simplest fix is to use your own
    # multi-class data.
    X1, y1 = make_blobs(n_samples=50, centers=1, n_features=2,
                        random_state=0, center_box=(-5.0, 5.0))
    X2, y2 = make_blobs(n_samples=200, centers=1, n_features=2,
                        random_state=0, center_box=(-4.0, 6.0))
    X = np.concatenate((X1, X2), axis=0)
    y = np.concatenate((y1, [1] * len(y2)), axis=0)
    return X.tolist(), y.tolist()
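# A minimal sketch of the multi-class variant the TODO above asks for. It
# assumes numpy is imported as np and make_blobs is available; the helper
# name createMultiClassCluster is hypothetical, not from the original code.
def createMultiClassCluster(n_classes=3, samples_per_class=100):
    Xs, ys = [], []
    for label in range(n_classes):
        # one blob per class; a different random_state per class gives each
        # class its own randomly placed center
        Xi, _ = make_blobs(n_samples=samples_per_class, centers=1,
                           n_features=2, random_state=label)
        Xs.append(Xi)
        ys.append([label] * samples_per_class)
    X = np.concatenate(Xs, axis=0)
    y = np.concatenate(ys, axis=0)
    return X.tolist(), y.tolist()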
def knn_evaluation():
    print('KNeighborsClassifier')
    np.random.seed(123)
    dataset, true_labels = make_blobs(n_samples=10000, n_features=2)
    color = ['r-', 'b-']
    methods = [True, False]
    # iterate over the index so the flag, the plot color and the legend label
    # stay in sync (the original indexed methods[b] with a boolean, which
    # inverted the flag)
    for b, bootstrap in enumerate(methods):
        print('bootstrapping = %s' % bootstrap)
        misclassification_rates = []
        min_rate = np.inf
        min_k = 0
        for i in range(1, 51):
            neigh = KNeighborsClassifier(n_neighbors=i)
            scores = validation(neigh, dataset, true_labels, bootstrap)
            misclassifications = 1 - scores
            misclassification_rates.append(np.average(misclassifications))
            if min_rate > misclassification_rates[i - 1]:
                min_rate = misclassification_rates[i - 1]
                min_k = i
        print('minimum rate = %s' % min_rate)
        print('best k = %s' % min_k)
        label = 'bootstrap' if bootstrap else 'cross-validation'
        pyplot.plot(range(1, 51), misclassification_rates, color[b],
                    label=label)
    pyplot.title('Misclassification rates of KNeighborsClassifier')
    pyplot.xlabel('Values of k')
    pyplot.ylabel('Misclassification rates')
    pyplot.legend(loc='upper right')
    pyplot.show()
def test_grid_search_iid():
    # test the iid parameter on noise-free, simple 2d data
    X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]],
                      random_state=0, cluster_std=0.1, shuffle=False,
                      n_samples=80)
    # split the dataset into two folds that are not iid:
    # the first fold contains data from all 4 blobs, the second from only two
    mask = np.ones(X.shape[0], dtype=bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    svm = SVC(kernel='linear')
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]
    # once with iid=True (the default)
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
    grid_search.fit(X, y)
    _, average_score, scores = grid_search.cv_scores_[0]
    assert_array_almost_equal(scores, [1, 1. / 3.])
    # for the first split, 1/4 of the dataset is in the test set, for the
    # second 3/4, so take the weighted average
    assert_almost_equal(average_score, 1 * 1. / 4. + 1. / 3. * 3. / 4.)
    # once with iid=False
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv,
                               iid=False)
    grid_search.fit(X, y)
    _, average_score, scores = grid_search.cv_scores_[0]
    # scores are the same as above
    assert_array_almost_equal(scores, [1, 1. / 3.])
    # the averaged score is just the mean of the scores
    assert_almost_equal(average_score, np.mean(scores))
def test_bin_seeds():
    # Test the bin seeding technique used in the mean shift algorithm.
    # Data is just 6 points in the plane
    X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
                  [2., 1.], [2.1, 1.1], [0., 0.]])
    # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.), (0., 0.)}
    test_bins = get_bin_seeds(X, 1, 1)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0
    # With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.)}
    test_bins = get_bin_seeds(X, 1, 2)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0
    # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found;
    # we bail out and use the whole data here
    with warnings.catch_warnings(record=True):
        test_bins = get_bin_seeds(X, 0.01, 1)
    assert_array_almost_equal(test_bins, X)
    # tight clusters around [0, 0] and [1, 1], only get two bins
    X, _ = make_blobs(n_samples=100, n_features=2, centers=[[0, 0], [1, 1]],
                      cluster_std=0.1, random_state=0)
    test_bins = get_bin_seeds(X, 1)
    assert_array_equal(test_bins, [[0, 0], [1, 1]])
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS clustering labels differ from DBSCAN's by at most 5%
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)
    # calculate OPTICS with a DBSCAN extraction at the given epsilon
    op = OPTICS(min_samples=min_samples).fit(X)
    core_optics, labels_optics = op.extract_dbscan(eps)
    # calculate DBSCAN labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    contingency = contingency_matrix(db.labels_, labels_optics)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree
    # verify that the core labels match
    assert_array_equal(core_optics, db.core_sample_indices_)
    non_core_count = len(labels_optics) - len(core_optics)
    percent_mismatch = np.round((disagree - 1) / non_core_count, 2)
    # verify that the label mismatch is <= 5% of labels
    assert percent_mismatch <= 0.05
def exercise_2a():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2,
                      random_state=0)
    # plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    # plt.show()
    kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test, y_test))
            iterator += 1
        accuracy_lst[k - 1, 0] = accuracy_current.mean()
        # accuracy_lst[k - 1, 1] = accuracy_current.std()  # 95% confidence interval
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def exercise_1():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2,
                      random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False,
                                random_state=None)
    # kf = cross_validation.ShuffleSplit(1000, n_iter=25, test_size=0.1,
    #                                    train_size=0.9, random_state=None)
    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1, 50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append(zero_one_loss(y_test, clf.predict(X_test)))
            # error.append(1. - clf.score(X_test, y_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
        error_total[k - 1, 0] = np.array(error).mean()
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def exercise_2b():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2,
                      random_state=0)
    # note: only the first 100 samples are split here (ShuffleSplit's first
    # argument is the number of samples in the old API)
    kf = ShuffleSplit(100, train_size=0.9, test_size=0.1, random_state=0)
    # kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test, y_test))
            iterator += 1
            print(mean_squared_error(y_test, clf.predict(X_test)))
        accuracy_lst[k - 1, 0] = accuracy_current.mean()
        accuracy_lst[k - 1, 1] = accuracy_current.var()  # *2 for a 95% confidence interval
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 1], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K')
    plt.ylabel('Variance')
    plt.show()
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0,
                      cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200,
                        fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])  # expects a 2d array of samples
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives the same result as fit_predict
    # There's a very small chance of failure with elkan on an unstructured
    # dataset because the predict method uses fast euclidean distance
    # computations which may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    # test_predict_equal_labels. This test has the added effect of
    # testing idempotence of the fitting procedure which appears to
    # be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)
        assert_array_equal(labels_1, labels_2)
def get_toy_classification_data(n_samples=100, centers=3, n_features=2,
                                type_data="blobs"):
    # generate a 2d classification dataset
    if type_data == "blobs":
        X, y = make_blobs(n_samples=n_samples, centers=centers,
                          n_features=n_features)
    elif type_data == "moons":
        X, y = make_moons(n_samples=n_samples, noise=0.1)
    elif type_data == "circles":
        X, y = make_circles(n_samples=n_samples, noise=0.05)

    # scatter plot, dots colored by class value
    # df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
    # colors = {0: 'red', 1: 'blue', 2: 'green'}
    # fig, ax = pyplot.subplots()
    # grouped = df.groupby('label')
    # for key, group in grouped:
    #     group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
    # pyplot.show()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=None)
    classes = np.unique(y_train)

    if 0:  # disabled: sklearn's OneHotEncoder, kept for reference
        enc = OneHotEncoder().fit(classes.reshape(-1, 1))
        y_train = enc.transform(y_train.reshape(-1, 1))
        print(y_test)
        y_test = enc.transform(y_test.reshape(-1, 1))
        print(y_test)

    y_train = one_hot_encode(y_train, classes)
    y_test = one_hot_encode(y_test, classes)

    return X_train, y_train, X_test, y_test, classes
def tree_evaluation():
    print('DecisionTreeClassifier')
    np.random.seed(123)
    dataset, true_labels = make_blobs(n_samples=10000, n_features=2)
    color = ['r-', 'b-']
    methods = [True, False]
    # iterate over the index so the flag, the plot color and the legend label
    # stay in sync (the original indexed methods[b] with a boolean, which
    # inverted the flag)
    for b, bootstrap in enumerate(methods):
        print('bootstrapping = %s' % bootstrap)
        misclassification_rates = []
        min_rate = np.inf
        min_k = 0
        for i in range(2, 16):
            tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
            scores = validation(tree_classifier, dataset, true_labels,
                                bootstrap)
            misclassifications = 1 - scores
            misclassification_rates.append(np.average(misclassifications))
            if min_rate > misclassification_rates[i - 2]:
                min_rate = misclassification_rates[i - 2]
                min_k = i
        print('minimum rate = %s' % min_rate)
        print('best depth = %s' % min_k)
        label = 'bootstrap' if bootstrap else 'cross-validation'
        pyplot.plot(range(2, 16), misclassification_rates, color[b],
                    label=label)
    pyplot.title('Misclassification rates of DecisionTreeClassifier')
    pyplot.xlabel('Values of max_depth')
    pyplot.ylabel('Misclassification rates')
    pyplot.legend(loc='upper left')
    pyplot.show()
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, mode="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers), random_state=0, mode="amg")
def generate_anisotropically_clusters(number_of_samples, number_of_clusters,
                                      n_features=2, variances=None,
                                      filename=""):
    """
    :param number_of_samples: The total number of points, divided equally among clusters.
    :param number_of_clusters: The number of clusters to generate.
    :param n_features: The number of features for each sample.
    :param variances: The standard deviations of the clusters.
    :param filename: The file in which to store the results.
    :return: The generated samples and their cluster labels.
    """
    if variances is None:
        variances = [0.5 for _ in range(number_of_clusters)]
    if filename == "":
        filename = "./Data/anisotropically_" + str(number_of_samples) \
                   + "_features_" + str(n_features) \
                   + "_cluster_" + str(number_of_clusters) + ".csv"
    random_state = 170
    X, y = make_blobs(n_samples=number_of_samples, centers=number_of_clusters,
                      n_features=n_features, random_state=random_state,
                      cluster_std=variances)
    # apply a random linear transformation to make the blobs anisotropic
    transformation = np.array([[random() if i == j else uniform(-1, 1)
                                for j in range(n_features)]
                               for i in range(n_features)])
    X = np.dot(X, transformation)
    features = ["features_" + str(i + 1) for i in range(n_features)]
    df = pd.DataFrame()
    for i, feature in enumerate(features):
        df[feature] = X[:, i]
    df["class"] = y
    df.to_csv(filename, index=False)
    return X, y
def test_soft():
    X, Y = make_blobs(n_samples=10, centers=2, n_features=2, random_state=1)
    # relabel class 0 as -1 for the SVM
    for i in range(0, len(Y)):
        if Y[i] == 0:
            Y[i] = -1.0

    X1, y1, X2, y2 = gen_lin_separable_data()
    # X1, y1, X2, y2 = gen_lin_separable_overlap_data()
    X_train, y_train = split_train(X1, y1, X2, y2)
    # X_test, y_test = split_test(X1, y1, X2, y2)

    clf = SVM(C=0.1)
    # clf.fit(X_train, y_train)
    clf.fit(X, Y)

    # y_predict = clf.predict(X_test)
    # correct = np.sum(y_predict == y_test)
    # print("%d out of %d predictions correct" % (correct, len(y_predict)))

    plot_contour(X_train[y_train == 1], X_train[y_train == -1], clf)
def test_fitted_model(self):
    # non-centered, sparse centers to check the fit
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_samples = 100
    n_clusters, n_features = centers.shape
    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    cbook = CoodeBook(n_words=3)
    cbook = cbook.fit(X)  # TODO: is the reassignment needed, or is cbook.fit(X) enough?

    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = cbook.get_dictionary()
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = cbook.predict(X)
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(cbook.cluster_core.inertia_, 0.0)

    # check that the descriptor looks like the homogeneous PDF used
    # to create the original samples
    cbook_hist = cbook.get_BoF_descriptor(X)
    expected_value = float(1) / cbook.n_words
    for bin_value in cbook_hist[0]:
        assert_less(round(bin_value - expected_value, 3), 0.01)
def main():
    import matplotlib.pyplot as plt
    from sklearn.datasets.samples_generator import make_blobs

    n_centers = 3
    X, y = make_blobs(n_samples=1000, centers=n_centers, n_features=2,
                      cluster_std=0.7, random_state=0)

    # Run this K-Means implementation
    import kmeans
    t0 = time.time()
    y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val_seq[-1]))
    print("Time taken (this implementation): {}".format(t1 - t0))

    # Run scikit-learn's K-Means
    from sklearn.cluster import k_means
    t0 = time.time()
    centers, y_pred, obj_val = k_means(X, n_centers, random_state=0)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val))
    print("Time taken (Scikit, 1 job): {}".format(t1 - t0))

    # Plot the change in objective value over iterations
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(obj_val_seq, 'b-', marker='*')
    fig.suptitle("Change in K-means objective value across iterations")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Objective value")
    fig.show()

    # Plot the data
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    fig = plt.figure(figsize=plt.figaspect(0.5))  # twice as wide to accommodate both plots
    ax = fig.add_subplot(121)
    ax.set_title("Data with true labels and final centers")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y == k, 0], X[y == k, 1], color + '.')

    # This is valid because we always use the same random seed.
    initial_centers = kmeans.init_centers(X, n_centers, 2)
    # Plot the initial centers
    for x in initial_centers:
        ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8)
    # Plot the final centers
    for x in centers:
        ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8)

    # Plot the assignments
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    ax = fig.add_subplot(122)
    ax.set_title("Data with final assignments")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y_pred == k, 0], X[y_pred == k, 1], color + '.')

    fig.tight_layout()
    fig.gca()
    fig.show()
def plot_sgd_classifier(num_samples, clt_std):
    # generate the data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)

    # fit the data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)

    # plot the data and the decision boundary
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)

    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)

    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function([[x1, x2]])  # expects a 2d array of samples
        Z[i, j] = conf_score[0]

    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']

    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')

    ax.contour(X_, Y_, Z, colors=colors, levels=levels,
               linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=y)
def iplot(N_points=100, n_clusters=2):
    X, y = make_blobs(n_samples=N_points, centers=n_clusters,
                      random_state=0, cluster_std=0.60)

    def _kmeans_step(k=n_clusters, frame=0):
        rng = np.random.RandomState(2)
        labels = np.zeros(X.shape[0])
        centers = X[rng.randint(N_points, size=k), :]
        nsteps = frame // 3

        for i in range(nsteps + 1):
            old_centers = centers
            if i < nsteps or frame % 3 > 0:
                dist = euclidean_distances(X, centers)
                labels = dist.argmin(1)
            if i < nsteps or frame % 3 > 1:
                centers = np.array([X[labels == j].mean(0)
                                    for j in range(k)])
                nans = np.isnan(centers)
                centers[nans] = old_centers[nans]

        # plot the data and the cluster centers
        fig = plt.figure(figsize=(8, 6))
        plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='rainbow')
        plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='s',
                    c="white", s=200)
        plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='s',
                    c=np.arange(k), s=50, cmap='rainbow')

        # plot the new centers if this is the third frame
        if frame % 3 == 2:
            for i in range(k):
                plt.annotate('', centers[i], old_centers[i],
                             arrowprops=dict(arrowstyle='->', linewidth=1))
            plt.scatter(centers[:, 0], centers[:, 1], marker='s', c="white",
                        s=200, cmap='rainbow')
            plt.scatter(centers[:, 0], centers[:, 1], marker='s',
                        c=np.arange(k), s=50, cmap='rainbow')

        plt.xlim(-4, 4)
        plt.ylim(-2, 10)

        if frame % 3 == 1:
            plt.text(3.8, 9.5, "1. Reassign labels",
                     ha='right', va='top', size=14)
        elif frame % 3 == 2:
            plt.text(3.8, 9.5, "2. Compute centroids",
                     ha='right', va='top', size=14)

    frame_range = [0, 20]
    k_range = [2, n_clusters + 2]
    return interact(_kmeans_step, k=k_range, frame=frame_range)
def test_spectral_unknown_mode():
    # Test that SpectralClustering fails with an unknown mode set.
    centers = np.array([[0.0, 0.0, 0.0],
                        [10.0, 10.0, 10.0],
                        [20.0, 20.0, 20.0]])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1.0, random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, mode="<unknown>")
def test_minibatch_sensible_reassign_partial_fit():
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1.0, random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42,
                                 init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
def test_bad_reachability():
    msg = "All reachability values are inf. Set a larger max_eps."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)
    clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10)
    assert_raise_message(ValueError, msg, clust.fit, X)
def plot_kmeans_interactive():
    from IPython.html.widgets import interact
    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.datasets.samples_generator import make_blobs

    X, y = make_blobs(n_samples=300, centers=4, random_state=0,
                      cluster_std=0.60)

    def _kmeans_step(frame, n_clusters):
        rng = np.random.RandomState(2)
        labels = np.zeros(X.shape[0])
        centers = rng.randn(n_clusters, 2)
        nsteps = frame // 3

        for i in range(nsteps + 1):
            old_centers = centers
            if i < nsteps or frame % 3 > 0:
                dist = euclidean_distances(X, centers)
                labels = dist.argmin(1)
            if i < nsteps or frame % 3 > 1:
                centers = np.array([X[labels == j].mean(0)
                                    for j in range(n_clusters)])
                nans = np.isnan(centers)
                centers[nans] = old_centers[nans]

        # plot the data and the cluster centers
        plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='rainbow')
        plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o',
                    c=np.arange(n_clusters), s=200, cmap='rainbow')
        plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o',
                    c='black', s=50)

        # plot the new centers if this is the third frame
        if frame % 3 == 2:
            for i in range(n_clusters):
                plt.annotate('', centers[i], old_centers[i],
                             arrowprops=dict(arrowstyle='->', linewidth=1))
            plt.scatter(centers[:, 0], centers[:, 1], marker='o',
                        c=np.arange(n_clusters), s=200, cmap='rainbow')
            plt.scatter(centers[:, 0], centers[:, 1], marker='o',
                        c='black', s=50)

        plt.xlim(-4, 4)
        plt.ylim(-2, 10)

        if frame % 3 == 1:
            plt.text(3.8, 9.5, "1. Reassign points to nearest centroid",
                     ha='right', va='top', size=14)
        elif frame % 3 == 2:
            plt.text(3.8, 9.5, "2. Update centroids to cluster means",
                     ha='right', va='top', size=14)

    return interact(_kmeans_step, frame=[0, 50], n_clusters=[3, 5])
def test_spectral_clustering_sparse():
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    S = rbf_kernel(X, gamma=1)
    S = np.maximum(S - 1e-4, 0)
    S = sparse.coo_matrix(S)

    labels = SpectralClustering(random_state=0, n_clusters=2,
                                affinity="precomputed").fit(S).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
def test_unsupervised_grid_search():
    # test grid search with an unsupervised estimator
    X, y = make_blobs(random_state=0)
    km = KMeans(random_state=0)
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                               score_func=adjusted_rand_score)
    grid_search.fit(X)
    # the largest number of clusters should score best
    assert_equal(grid_search.best_params_["n_clusters"], 4)
def test_bad_reachability():
    msg = "All reachability values are inf. Set a larger max_eps."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    with pytest.warns(UserWarning, match=msg):
        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
        clust.fit(X)
def init_sample():
    # centers of the generated test data
    centers = [[1, 1], [-1, -1], [1, -1]]
    # generate the data
    Xn, labels_true = make_blobs(n_samples=150, centers=centers,
                                 cluster_std=0.5, random_state=0)
    # length of the data, i.e. the number of data points
    dataLen = len(Xn)
    return Xn, dataLen
def from_blobs(cls):
    X, y = make_blobs(
        n_samples=1000,
        centers=5,
        cluster_std=1,
        n_features=5,
    )
    return cls(X, y)
def test_affinities():
    X, y = make_blobs(n_samples=40, random_state=1,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.4)
    # nearest-neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors",
                            random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets.samples_generator import make_blobs

# cluster standard deviations to compare
cluster_stds = [0.01, 0.1, 0.5, 1, 10, 1000]
fig, ax = plt.subplots(6, figsize=(10, 20))
for i in range(len(cluster_stds)):
    X, y = make_blobs(n_samples=150, centers=3, random_state=0,
                      cluster_std=cluster_stds[i])
    ax[i].scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plt.show()
#!/usr/bin/env python3
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs

# #############################################################################
# Generate sample data
centers = [[0.4, 0.4], [-0.4, -0.4], [0.4, -0.4]]
X, labels_true = make_blobs(n_samples=30, centers=centers, cluster_std=0.1,
                            random_state=0)

# #############################################################################
# Plot the initial, unclustered data
import matplotlib.pyplot as plt
from itertools import cycle
import tikzplotlib

plt.close('all')
plt.figure(1)
plt.clf()
for p in X:
    plt.plot(p[0], p[1], 'k.')
plt.title('Unclustered dataset')
tikzplotlib.save("figures/ap_unclust.tex")

# #############################################################################
# Compute Affinity Propagation
af = AffinityPropagation(max_iter=1000, damping=0.5).fit(X)
def main():
    epochs = 100
    alpha = 0.01
    batch_size = 32

    (X, y) = make_blobs(n_samples=10000, n_features=2, centers=2,
                        cluster_std=2.5, random_state=95)

    # insert a column of 1's as the first entry in the feature
    # vector -- this is a little trick that allows us to treat
    # the bias as a trainable parameter *within* the weight matrix
    # rather than an entirely separate variable
    X = np.c_[np.ones((X.shape[0])), X]

    # initialize our weight matrix so it has the same number of
    # columns as our input features
    print("[INFO] starting training...")
    W = np.random.uniform(size=(X.shape[1],))

    # initialize a list to store the loss value for each epoch
    lossHistory = []

    # loop over the desired number of epochs
    for epoch in np.arange(0, epochs):
        # initialize the total loss for the epoch
        epochLoss = []

        # loop over our data in batches
        for (batchX, batchY) in next_batch(X, y, batch_size):
            # take the dot product between our current batch of features
            # and the weight matrix `W`, then pass this value through the
            # sigmoid activation function
            preds = sigmoid_activation(batchX.dot(W))

            # now that we have our predictions, determine our `error`,
            # which is the difference between the predictions and the
            # true values
            error = preds - batchY

            # given our `error`, compute the total loss value on the
            # batch as the sum of squared loss
            loss = np.sum(error ** 2)
            epochLoss.append(loss)

            # the gradient update is the dot product between the
            # transpose of our current batch and the error on the batch
            gradient = batchX.T.dot(error) / batchX.shape[0]

            # use the gradient computed on the current batch to take
            # a "step" in the correct direction
            W += -alpha * gradient

        # update our loss history list by taking the average loss
        # across all batches
        lossHistory.append(np.average(epochLoss))

    # compute the line of best fit by setting the sigmoid function
    # to 0 and solving for X2 in terms of X1
    Y = (-W[0] - (W[1] * X)) / W[2]

    # plot the original data along with our line of best fit
    plt.figure()
    plt.scatter(X[:, 1], X[:, 2], marker="o", c=y)
    plt.plot(X, Y, "r-")

    # construct a figure that plots the loss over time
    fig = plt.figure()
    plt.plot(np.arange(0, epochs), lossHistory)
    fig.suptitle("Training Loss")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss")
    plt.show()
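# The snippet above relies on two helpers that are not shown. A minimal
# sketch of what they might look like, assuming plain numpy mini-batching
# and a standard logistic sigmoid (the names come from the calls above,
# but these bodies are assumptions, not the original code):
def sigmoid_activation(x):
    # standard logistic function, squashes values into (0, 1)
    return 1.0 / (1 + np.exp(-x))

def next_batch(X, y, batch_size):
    # yield successive mini-batches of (features, labels)
    for i in np.arange(0, X.shape[0], batch_size):
        yield (X[i:i + batch_size], y[i:i + batch_size])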
from sklearn import metrics
"""
Author: limlin
Contact: [email protected]
Datetime: 2020/12/4 9:29
Software: PyCharm
Profile: https://www.cnblogs.com/hellojiaojiao/p/10758408.html
"""
"""
Generate some sample data: X holds the sample features, y the cluster labels.
500 samples with 2 features each, drawn from 3 clusters centered at [2, 3],
[3, 0] and [1, 1], with standard deviations [0.4, 0.5, 0.2] respectively.
"""
X, y = make_blobs(n_samples=500, n_features=2,
                  centers=[[2, 3], [3, 0], [1, 1]],
                  cluster_std=[0.4, 0.5, 0.2], random_state=9)
"""
First plot the distribution of the generated samples.
"""
plt.scatter(X[:, 0], X[:, 1], marker='o')
plt.show()
"""
Now look at the clustering quality for different values of k.
"""
score_all = []
list1 = range(2, 6)  # k cannot be 0 or 1
for i in range(2, 6):
    y_pred = KMeans(n_clusters=i, random_state=9).fit_predict(X)
    # the original snippet is truncated here; presumably a clustering score
    # is collected, e.g. score_all.append(metrics.calinski_harabasz_score(X, y_pred))
def estimateMeanConvergence(noMoveClick, moveClick, surpriseMe, noOfResets,
                            alpha, noOfSamples):
    # declare the variables below as global since they are reassigned in
    # this function
    global noMoves
    global moves
    global surprises
    global i
    global resetCount
    global my_centers
    global agentR
    global agentB
    global max_iter
    global gmm
    global ks_df_OLD
    global X_old
    global Y_old

    # indicator for whether the KS statistic needs to be run
    runKS = 0
    # indicator for whether the same distribution is selected
    sameDist = 0

    # reinitialize variables to their initial values if the reset button is pressed
    if noOfResets > resetCount:
        resetCount = noOfResets
        noMoves = 0
        moves = 0
        surprises = 0
        i = 0
        random.seed(1)
        max_iter = 100
        agentR = (-25, -25)
        agentB = (25, -25)
        my_centers = ((-5, -5), (5, 5))
        gmm = None
        ks_df_OLD = None
        X_old = None
        Y_old = None

    # detect which button was pressed and generate data accordingly
    if noMoveClick > noMoves:
        move = 0
        sameDist = 1
        noMoves = noMoveClick
    elif moveClick > moves:
        move = 1
        moves = moveClick
    elif surpriseMe > surprises:
        move = random.randint(0, 1)
        runKS = 1
        surprises = surpriseMe
    else:
        print("Button press not detected. Assuming same distribution")
        move = 0
        sameDist = 1

    print("iteration: %d" % i)

    # assign the centers of the distributions
    my_centers = ((my_centers[0][0] + 1 * move, my_centers[0][1] + 4 * move),
                  (my_centers[1][0] - 3 * move, my_centers[1][1] - 1 * move))

    # draw samples
    X, y_true = make_blobs(n_samples=int(noOfSamples), centers=my_centers,
                           cluster_std=1.5, random_state=i)

    # stack observations if the new data comes from the same distribution
    # (in order to incorporate more data into the estimate of the mean);
    # otherwise, only use the new data.
    # note: assumes independence of x and y
    if i != 0:
        # fit a GMM using the previous means as the initial means
        gmm = GaussianMixture(n_components=2, means_init=gmm.means_,
                              max_iter=max_iter).fit(X)

        # extract the predicted labels
        labels = gmm.predict(X)
        ks_df = pd.DataFrame(np.column_stack((X, labels, y_true)))
        ks_df.columns = ['x1', 'x2', 'labels', 'true_labels']

        # if "surprise me" is selected and the estimated underlying
        # distributions have not changed, stack the data.
        # note: assumes independence of x and y (i.e. uses univariate KS tests)
        if runKS == 1:
            print("Random Distribution:%d" % move)
            print(ks_2samp(ks_df[ks_df['labels'] == 1]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x1'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 1]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x2'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 0]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x1'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 0]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x2'])[1])
            if ks_2samp(ks_df[ks_df['labels'] == 1]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x1'])[1] > .99 and \
               ks_2samp(ks_df[ks_df['labels'] == 1]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x2'])[1] > .99 and \
               ks_2samp(ks_df[ks_df['labels'] == 0]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x1'])[1] > .99 and \
               ks_2samp(ks_df[ks_df['labels'] == 0]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x2'])[1] > .99:
                # stack observations
                print("Same Distribution: KS")
                X = np.vstack((X_old, X))
                y_true = np.concatenate((Y_old, y_true), axis=0)

        # if the same distribution is selected, stack the data
        if sameDist == 1:
            # stack observations
            print("Same Distribution: Button")
            print(ks_2samp(ks_df[ks_df['labels'] == 1]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x1'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 1]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x2'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 0]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x1'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 0]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x2'])[1])
            X = np.vstack((X_old, X))
            y_true = np.concatenate((Y_old, y_true), axis=0)

    # fit the GMM
    gmm = GaussianMixture(n_components=2, init_params='kmeans',
                          max_iter=max_iter).fit(X)

    # extract the predicted labels
    labels = gmm.predict(X)

    # create the KS dataframe for the next iteration
    ks_df_OLD = pd.DataFrame(np.column_stack((X, labels, y_true)))
    ks_df_OLD.columns = ['x1', 'x2', 'labels', 'true_labels']

    # update the target means for the agents to seek
    B_mean_estimate, R_mean_estimate = tuple(gmm.means_[0]), tuple(gmm.means_[1])

    # move the agents towards their respective estimated means:
    # distances from the agents to the respective means
    distanceR = (sum((np.array(R_mean_estimate) - np.array(agentR)) ** 2)) ** (1 / 2)  # unweighted, no log prob
    distanceB = (sum((np.array(B_mean_estimate) - np.array(agentB)) ** 2)) ** (1 / 2)  # unweighted, no log prob

    # calculate the angles
    angle_degreeR = math.degrees(
        math.atan2(R_mean_estimate[1] - agentR[1],
                   R_mean_estimate[0] - agentR[0]))
    angle_degreeB = math.degrees(
        math.atan2(B_mean_estimate[1] - agentB[1],
                   B_mean_estimate[0] - agentB[0]))

    # scale factor (if set to 1, the agent moves all the way to the mean of
    # its distribution); alpha replaces the old hard-coded 0.5 as the
    # learning rate
    scaleR = alpha
    scaleB = alpha

    # set the new agent location (red)
    agentR = (agentR[0] + scaleR * distanceR * math.cos(angle_degreeR * math.pi / 180),
              agentR[1] + scaleR * distanceR * math.sin(angle_degreeR * math.pi / 180))

    # set the new agent location (blue)
    agentB = (agentB[0] + scaleB * distanceB * math.cos(angle_degreeB * math.pi / 180),
              agentB[1] + scaleB * distanceB * math.sin(angle_degreeB * math.pi / 180))

    # copy the data for the next iteration
    X_old = np.copy(X)
    Y_old = np.copy(y_true)

    # increment the iteration counter
    i += 1
        # tail of the kmeans loop: assign points to the nearest centers,
        # then recompute the centers from the resulting clusters
        clusters = cercanos(puntos, cent)
        cent = centros(clusters)
    return cent


# N = 100
# x = np.random.rand(N)
# y = np.random.rand(N)
# data = [[x, y] for x, y in zip(x, y)]

if __name__ == "__main__":
    X, y_true = make_blobs(n_samples=300, centers=4,
                           cluster_std=0.60, random_state=0)
    plt.scatter(X[:, 0], X[:, 1], s=50)

    # K-Means via our own implementation
    cent = kmeans(X, k=4)
    plt.scatter([c[0] for c in cent], [c[1] for c in cent],
                c='black', s=200, alpha=0.5)
    plt.show()

    # scikit-learn's KMeans
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=4)
    kmeans.fit(X)
    y_kmeans = kmeans.predict(X)
    plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')
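# The fragment above calls two helpers that are not included: cercanos
# (assign each point to its nearest center) and centros (recompute each
# cluster's center). A minimal sketch of what they might look like, assuming
# numpy is imported as np; these bodies are guesses, not the original code:
def cercanos(puntos, cent):
    # group points by the index of their nearest center (squared distance)
    clusters = [[] for _ in cent]
    for p in puntos:
        idx = min(range(len(cent)),
                  key=lambda j: sum((pi - ci) ** 2
                                    for pi, ci in zip(p, cent[j])))
        clusters[idx].append(p)
    return clusters

def centros(clusters):
    # the new center of each non-empty cluster is the mean of its points
    return [np.mean(c, axis=0) for c in clusters if len(c) > 0]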
import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.datasets.samples_generator import make_blobs

# #############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)

# #############################################################################
# Compute clustering with KMeans

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0

# #############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
                      n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets.samples_generator import make_blobs
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import style

style.use("ggplot")

centers = [[1, 1, 1], [5, 5, 5], [3, 10, 10]]

X, _ = make_blobs(n_samples=100, centers=centers, cluster_std=1.5)

ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
print(cluster_centers)

numClusters = len(np.unique(labels))
print("Number of estimated clusters: ", numClusters)

colors = 10 * ['r', 'g', 'b', 'c', 'k', 'y', 'm']
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
           cluster_centers[:, 2], marker="x", color='k', s=150,
           linewidths=5, zorder=10)
    dataset['Silhouette'], dataset['Calinski Harabasz'], dataset[
        'Davies Bouldin'] = other_validations(X, y, verbose=report)
    outliers = np.sum(y == -1) / np.sum(mass)
    if SK:
        draw_symbol(k, dataset, clusters, mm, kinship, kdens, volr, outliers)
    return y, dataset, clusters


if __name__ == '__main__':
    from sklearn.datasets.samples_generator import make_blobs
    X, y_real = make_blobs(n_samples=1500, centers=7, n_features=2,
                           random_state=0, cluster_std=0.6)

    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
    scaler.fit(X)
    X = scaler.transform(X)

    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
    y = kmeans.predict(X)

    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98,
'''
>>> from sklearn.datasets.samples_generator import make_blobs
>>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
...                   random_state=0)
>>> print(X.shape)
(10, 2)
>>> y
array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
'''
from sklearn.datasets.samples_generator import make_blobs

X, y = make_blobs(n_samples=10, centers=3, n_features=3, random_state=0)
print(X)
print(X.shape)
print(y)
# scatter plot of a blobs dataset
from sklearn.datasets.samples_generator import make_blobs
from matplotlib import pyplot
from numpy import where

# generate a 2d classification dataset
X, y = make_blobs(n_samples=1000, centers=3, n_features=2, cluster_std=2,
                  random_state=2)
# scatter plot for each class value
for class_value in range(3):
    # select the indices of points with this class label
    row_ix = where(y == class_value)
    # scatter plot these points with a different color
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
pyplot.show()
# In[1]:

import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
import numpy as np
from random import randint
import math
from sklearn.utils import shuffle

# In[2]:

# original data
(X, y) = make_blobs(n_samples=500, n_features=20, centers=2,
                    cluster_std=3.1, random_state=95)

# In[3]:

# zero out some randomly chosen features in each sample
for i in range(len(X)):
    for j in range(len(X[0]) - 8):
        rand_no = randint(0, len(X[0]) - 1)
        X[i][rand_no] = 0

# In[4]:

X = np.c_[np.ones((X.shape[0])), X]  # for absorbing the bias
from sklearn.cluster.k_means_ import _labels_inertia
from sklearn.cluster.k_means_ import _mini_batch_step
from sklearn.datasets.samples_generator import make_blobs
from io import StringIO
from sklearn.metrics.cluster import homogeneity_score

# non-centered, sparse centers to check the fit
centers = np.array([
    [0.0, 5.0, 0.0, 0.0, 0.0],
    [1.0, 1.0, 4.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 5.0, 1.0],
])
n_samples = 100
n_clusters, n_features = centers.shape
X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                            cluster_std=1., random_state=42)
X_csr = sp.csr_matrix(X)


@pytest.mark.parametrize("representation, algo",
                         [('dense', 'full'),
                          ('dense', 'elkan'),
                          ('sparse', 'full')])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_kmeans_results(representation, algo, dtype):
    # checks that kmeans works as intended
    array_constr = {'dense': np.array,
                    'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
import random

centers = random.randrange(2, 5)

X, y = make_blobs(n_samples=20, centers=centers, n_features=2)

## X = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6],
##               [9, 11], [8, 2], [10, 2], [9, 3]])

# plt.scatter(X[:, 0], X[:, 1], s=150)
# plt.show()


class Mean_Shift:
    def __init__(self, radius=None, radius_norm_step=100):
        self.radius = radius
        self.radius_norm_step = radius_norm_step

    def fit(self, data):
        if self.radius is None:
                         for centroid in range(len(self.centroids))]
            classification = distances.index(min(distances))
            self.classifications[classification].append(feature_set)

    def predict(self, data):
        distances = [np.linalg.norm(data - self.centroids[centroid])
                     for centroid in range(len(self.centroids))]
        classification = distances.index(min(distances))
        return classification


for _ in range(10):
    X, y = make_blobs(n_samples=100, centers=3, n_features=2)
    # X = np.array([[1, 2], [1.5, 1.8], [5, 8], [1, 0.6], [8, 8],
    #               [9, 11], [8, 2], [10, 2], [9, 3]])

    colors = 100 * ["g", "r", "c", "b", "k"]

    # plt.scatter(X[:, 0], X[:, 1], color="b", s=150, linewidths=5, marker="o")
    # plt.show()

    clf = MeanShift()
    clf.fit(X)

    centroids = clf.centroids

    for classification in clf.classifications:
        color = colors[classification]
        for feature_set in clf.classifications[classification]:
            plt.scatter(feature_set[0], feature_set[1], marker='o',
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs
import pandas as pd
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

X, y = make_blobs(n_samples=5000,
                  centers=[[4, 4], [-2, -1], [2, -3], [1, 1]],
                  cluster_std=0.9)
plt.scatter(X[:, 0], X[:, 1], marker='.')

k_means = KMeans(init="k-means++", n_clusters=4, n_init=12)
k_means.fit(X)
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
print(k_means_cluster_centers)

# Initialize the plot with the specified dimensions.
fig = plt.figure(figsize=(6, 4))

# Colors uses a color map, which will produce an array of colors based on
# the number of labels there are. We use set(k_means_labels) to get the
# unique labels.
colors = plt.cm.Spectral(np.linspace(0, 1, len(set(k_means_labels))))

# Create a plot
ax = fig.add_subplot(1, 1, 1)
ap = argparse.ArgumentParser()
ap.add_argument("-e", "--epochs", type=float, default=60,
                help="# of epochs")
ap.add_argument("-a", "--alpha", type=float, default=0.8,
                help="learning rate")
ap.add_argument("-b", "--batch-size", type=int, default=32,
                help="size of SGD mini-batches")
args = vars(ap.parse_args())

# samples standing in for the first 100 rows of the iris data
(X, Theta) = make_blobs(n_samples=100, n_features=2, centers=2,
                        cluster_std=2.5, random_state=95)
X = np.c_[np.ones((X.shape[0])), X]

print("[INFO] starting training...")
W = np.random.uniform(size=(X.shape[1],))

lossHistory = []

for epoch in np.arange(0, args["epochs"]):
    epochLoss = []

    for (batchX, batchY) in h(X, Theta, args["epochs"]):
        preds = sigmoid_activation(batchX.dot(W))
        error = preds - batchY
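# The loop above depends on two helpers that are not shown: a logistic
# sigmoid_activation (see the sketch after the earlier SGD snippet) and h,
# which, from how it is called, appears to be a mini-batch generator. A
# minimal sketch under that assumption (the body of h is a guess, not the
# original code; note that the call above passes args["epochs"] where a
# batch size seems intended, which may be a bug in the source):
def h(X, y, batch_size):
    # yield successive mini-batches of (features, targets)
    batch_size = int(batch_size)
    for start in range(0, X.shape[0], batch_size):
        yield X[start:start + batch_size], y[start:start + batch_size]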
'''
@Author: Runsen
@WeChat official account: 润森笔记
@Blog: https://blog.csdn.net/weixin_44510615
@Date: 2020/4/27
'''
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets.samples_generator import make_blobs

sns.set()

X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')

xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
for m, b in [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]:
    plt.plot(xfit, m * xfit + b, '-k')
plt.xlim(-1, 3.5)
plt.show()

xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
    yfit = m * xfit + b
    plt.plot(xfit, yfit, '-k')
    plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none',
                     color='#AAAAAA', alpha=0.4)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 31 16:04:05 2019

@author: xsxsz
"""
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets.samples_generator import make_blobs
from sklearn.decomposition import PCA

x, y = make_blobs(n_samples=10000, n_features=3,
                  centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                  cluster_std=[0.2, 0.1, 0.2, 0.2], random_state=9)
fig = plt.figure()
ax = Axes3D(fig, rect=[0, 0, 1, 1], elev=30, azim=30)
# plot on the 3d axes (plt.scatter would treat the third positional
# argument as marker sizes, not z coordinates)
ax.scatter(x[:, 0], x[:, 1], x[:, 2], marker='o', color='g')
pca1 = PCA(n_components=3)
pca1.fit(x)
print(pca1.explained_variance_)
print('------------')
print(pca1.explained_variance_ratio_)
print('------------')
pca2 = PCA(n_components=2)
pca2.fit(x)
print(pca2.explained_variance_)
print('------------')
print(pca2.explained_variance_ratio_)
print('------------')
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets.samples_generator import make_blobs
import numpy as np

(X, Y) = make_blobs(n_samples=5, n_features=2, centers=2, random_state=50)

plt.figure()
plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y)
plt.axis([-5, 10, -12, -1])
plt.show()

positiveX = []
negativeX = []
for i, v in enumerate(Y):
    if v == 0:
        negativeX.append(X[i])
    else:
        positiveX.append(X[i])

# our data dictionary
data_dict = {-1: np.array(negativeX), 1: np.array(positiveX)}
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 14:25:31 2017

@author: UlionTse
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import Birch, KMeans, MiniBatchKMeans, DBSCAN, \
    SpectralClustering, ward_tree, AgglomerativeClustering, MeanShift, \
    AffinityPropagation
from sklearn import metrics

# data ready
X, y = make_blobs(n_samples=1000, n_features=2,
                  centers=[[0, 0], [0, 2], [2, 0], [1, 1], [2, 2]],
                  cluster_std=[0.2, 0.2, 0.2, 0.4, 0.2], random_state=9)
plt.scatter(X[:, 0], X[:, 1], marker='o')
plt.show()

# available estimators, kept for reference:
# attr_1 = KMeans(n_clusters=8)
# attr_2 = MiniBatchKMeans(n_clusters=8)
# attr_3 = Birch(n_clusters=3, threshold=0.5, branching_factor=50)
# attr_4 = ward_tree(X, n_clusters=None)
# attr_5 = AgglomerativeClustering(n_clusters=2, linkage='ward')
# attr_6 = DBSCAN(eps=0.5, leaf_size=30)
# attr_7 = MeanShift(bandwidth=None, seeds=None, min_bin_freq=1)
# attr_8 = SpectralClustering(n_clusters=8, n_init=10, gamma=1.0)
# attr_9 = AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15, affinity='euclidean')
                 markerfacecolor=col, markeredgecolor='k', markersize=3)
    # add a title showing the estimated number of clusters
    plt.title('Estimated number of clusters: %d' % nclusters)
    plt.show()


def jiangzao(labels):
    # number of clusters in the labels, ignoring noise if present
    clusters = len(set(labels)) - (1 if -1 in labels else 0)
    return clusters


def standar_scaler(points):
    p = StandardScaler().fit_transform(points)
    return p


if __name__ == "__main__":
    """
    test the DBScan class
    """
    centers = [[1, 1], [-1, -1], [-1, 1], [1, -1]]
    point, labelsTrue = make_blobs(n_samples=2000, centers=centers,
                                   cluster_std=0.4, random_state=0)
    point = standar_scaler(point)
    db = DBScan(point, labelsTrue)
    db.draw()
def kMeansClustering(self):
    abbreviations_of_companies = list(name_abbreviation_mWIG40_dict.values())
    movements = Parameters(abbreviations_of_companies)
    daily_movement = movements.daily_movement()

    # transform the dataframe into an array and transpose the matrix
    df_array = daily_movement.to_numpy().T
    # NaNs have an impact on the result, but not a significant one on a
    # large scale; zero means no change in the price of the item on a
    # given day
    df_array[np.isnan(df_array)] = 0

    style.use("seaborn-pastel")
    # make_blobs() is used to generate sample points around c centers
    # (randomly chosen)
    X, y = make_blobs(n_samples=400, centers=10, cluster_std=1, n_features=2)
    plt.scatter(X[:, 0], X[:, 1], s=5, color='r')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
    # clear the figure
    plt.clf()

    # elbow method for the optimal k
    sum_of_squared_distances = []
    K = range(1, 16)
    for k in K:
        km = KMeans(n_clusters=k)
        km = km.fit(df_array)
        sum_of_squared_distances.append(km.inertia_)
    plt.plot(K, sum_of_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()
    plt.clf()

    normalizer = Normalizer()
    kmeans = KMeans(n_clusters=13, max_iter=1500)
    pipeline = make_pipeline(normalizer, kmeans)
    pipeline.fit(df_array)
    labels = pipeline.predict(df_array)

    x = list(name_abbreviation_mWIG40_dict.values())
    y = []
    for i in x:
        y.append(sectors_mWIG40_dict[i])
    print(y)

    df = pd.DataFrame({
        'Labels': labels,
        'Companies': daily_movement.columns,
        'Economic sector': y
    }).sort_values(by=['Labels', 'Economic sector'], axis=0)
    return print(df)
def plot_kmeans_interactive(min_clusters=1, max_clusters=6):
    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.datasets.samples_generator import make_blobs

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        X, y = make_blobs(n_samples=300, centers=4, random_state=0,
                          cluster_std=0.60)

        def _kmeans_step(frame=0, n_clusters=4):
            rng = np.random.RandomState(2)
            labels = np.zeros(X.shape[0])
            centers = rng.randn(n_clusters, 2)
            nsteps = frame // 3

            for i in range(nsteps + 1):
                old_centers = centers
                if i < nsteps or frame % 3 > 0:
                    dist = euclidean_distances(X, centers)
                    labels = dist.argmin(1)
                if i < nsteps or frame % 3 > 1:
                    centers = np.array(
                        [X[labels == j].mean(0) for j in range(n_clusters)])
                    nans = np.isnan(centers)
                    centers[nans] = old_centers[nans]

            # plot the data and the cluster centers
            plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='rainbow',
                        vmin=0, vmax=n_clusters - 1)
            plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o',
                        c=np.arange(n_clusters), s=200, cmap='rainbow')
            plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o',
                        c='black', s=50)

            # plot the new centers if this is the third frame
            if frame % 3 == 2:
                for i in range(n_clusters):
                    plt.annotate('', centers[i], old_centers[i],
                                 arrowprops=dict(arrowstyle='->',
                                                 linewidth=1))
                plt.scatter(centers[:, 0], centers[:, 1], marker='o',
                            c=np.arange(n_clusters), s=200, cmap='rainbow')
                plt.scatter(centers[:, 0], centers[:, 1], marker='o',
                            c='black', s=50)

            plt.xlim(-4, 4)
            plt.ylim(-2, 10)

            if frame % 3 == 1:
                plt.text(3.8, 9.5, "1. Reassign points to nearest centroid",
                         ha='right', va='top', size=14)
            elif frame % 3 == 2:
                plt.text(3.8, 9.5, "2. Update centroids to cluster means",
                         ha='right', va='top', size=14)

    return interact(_kmeans_step, frame=[0, 50],
                    n_clusters=[min_clusters, max_clusters])
## https://machinelearningmastery.com/generate-test-datasets-python-scikit-learn/
## http://scikit-learn.org/stable/auto_examples/datasets/plot_random_dataset.html
from sklearn.datasets.samples_generator import make_blobs
from matplotlib import pyplot
from pandas import DataFrame
import numpy

# generate a 2d classification dataset
X, labels = make_blobs(n_samples=1000, n_features=2, centers=3)
colors = {0: 'red', 1: 'blue', 2: 'green'}
colored_labels = numpy.vectorize(lambda l: colors[l])(labels)
x_coordinates = X[:, 0]
y_coordinates = X[:, 1]
pyplot.scatter(x_coordinates, y_coordinates, marker='o', c=colored_labels,
               s=25, edgecolor='k')

df = DataFrame(dict(x=x_coordinates, y=y_coordinates, label=labels))
df.to_csv("data.csv")

!hdfs dfs -put -f data.csv
# import libraries
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets.samples_generator import make_blobs

# create data
centers = [[3, 3, 3], [4, 5, 5], [3, 10, 10]]
X, _ = make_blobs(n_samples=700, centers=centers, cluster_std=0.5)

# create the model
MSh = MeanShift()

# train the model
MSh.fit(X)
labels = MSh.labels_
cluster_centers = MSh.cluster_centers_
print(cluster_centers)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
from tqdm import tqdm
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import SpectralClustering, AffinityPropagation

import utils

plt.ion()
plt.show()

nb_clusters = 7
X, y_true = make_blobs(n_samples=300, centers=nb_clusters,
                       cluster_std=.80, random_state=0)
plt.title(f'Ground truth simulated data : {nb_clusters} clusters')
plt.scatter(X[:, 0], X[:, 1], s=50, c=y_true)

print(utils.internalValidation(X, y_true))
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
import random

ctq = random.randrange(2, 5)
X, y = make_blobs(n_samples=50, centers=5, n_features=2)
# X = np.array([[1, 2], [1.5, 1.6], [5, 8], [1, 0.6], [8, 8],
#               [9, 11], [8, 2], [9, 3], [10, 3]])
colors = 10 * ['g', 'r', 'c', 'k', 'b']


class MeanShift:
    def __init__(self, radius=None, rad_norm_step=100):
        self.radius = radius
        self.rad_norm_step = rad_norm_step

    def fit(self, data):
        if self.radius is None:
            # estimate a starting radius from the magnitude of the data:
            # the norm of the centroid of all points, divided into
            # rad_norm_step steps
            all_data_ctd = np.average(abs(data), axis=0)
            all_data_norm = np.linalg.norm(abs(all_data_ctd))
            self.radius = all_data_norm / self.rad_norm_step

        centroids = {}
        for i in range(len(data)):
            centroids[i] = data[i]

        while True:
            n_centroids = []
            for i in centroids:
                wts = [i for i in range(self.rad_norm_step)][::-1]
                in_bdwth = []
def load_dataset(which_dataset, N=-1, D=-1, norm_mean=False, norm_len=False,
                 num_queries=10, Ntrain=-1, D_multiple_of=-1):
    true_nn = None

    # randomly generated datasets
    if which_dataset == Random.UNIFORM:
        X_test = np.random.rand(N, D)
        X_train = np.random.rand(Ntrain, D) if Ntrain > 0 else X_test
        Q = np.random.rand(num_queries, D)
    elif which_dataset == Random.GAUSS:
        X_test = np.random.randn(N, D)
        X_train = np.random.randn(Ntrain, D) if Ntrain > 0 else X_test
        Q = np.random.randn(num_queries, D)
    elif which_dataset == Random.WALK:
        X_test = np.random.randn(N, D)
        X_test = np.cumsum(X_test, axis=1)
        X_train = np.copy(X_test)
        if Ntrain > 0:
            X_train = np.random.randn(Ntrain, D)
            X_train = np.cumsum(X_train)
        Q = np.random.randn(num_queries, D)
        Q = np.cumsum(Q, axis=-1)
    elif which_dataset == Random.BLOBS:
        # centers is D x D, and centers[i, j] = (i + j)
        centers = np.arange(D)
        centers = np.sum(np.meshgrid(centers, centers), axis=0)
        X_test, _ = make_blobs(n_samples=N, centers=centers)
        X_train = np.copy(X_test)
        if Ntrain > 0:
            X_train, _ = make_blobs(n_samples=Ntrain, centers=centers)
        Q, true_nn = make_blobs(n_samples=num_queries, centers=centers)

    # datasets that are just one block of a "real" dataset
    elif isinstance(which_dataset, str):
        X_test = load_file(which_dataset)
        X_test, Q = extract_random_rows(X_test, how_many=num_queries)
        X_train = np.copy(X_test)
        true_nn = _ground_truth_for_dataset(which_dataset)

    # "real" datasets with predefined train, test, queries, truth
    elif which_dataset in ALL_REAL_DATASETS:
        X_train, Q, X_test, true_nn = _load_complete_dataset(
            which_dataset, num_queries=num_queries)
    else:
        raise ValueError("unrecognized dataset {}".format(which_dataset))

    N = X_test.shape[0] if N < 1 else N
    D = X_test.shape[1] if D < 1 else D
    X_test, X_train = np.copy(X_test)[:N, :D], X_train[:N, :D]
    Q = Q[:, :D] if len(Q.shape) > 1 else Q[:D]

    train_is_test = X_train.base is X_test or X_test.base is X_train
    train_is_test = train_is_test or np.array_equal(X_train[:100],
                                                    X_test[:100])
    if train_is_test:
        print("WARNING: Training data is also the test data!")

    if norm_mean:
        means = np.mean(X_train, axis=0)
        X_train -= means
        X_test -= means
        Q -= means
    if norm_len:
        X_test /= np.linalg.norm(X_test, axis=1, keepdims=True)
        X_train /= np.linalg.norm(X_train, axis=1, keepdims=True)
        Q /= np.linalg.norm(Q, axis=-1, keepdims=True)

    # TODO: don't convert datasets that are originally uint8s to floats
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    Q = Q.astype(np.float32)

    if D_multiple_of > 1:
        X_train = ensure_num_cols_multiple_of(X_train, D_multiple_of)
        X_test = ensure_num_cols_multiple_of(X_test, D_multiple_of)
        Q = ensure_num_cols_multiple_of(Q, D_multiple_of)

    return X_train, Q, X_test, true_nn
""" import numpy as np from sklearn.utils.testing import assert_equal, assert_array_equal from sklearn.cluster.affinity_propagation_ import AffinityPropagation from sklearn.cluster.affinity_propagation_ import affinity_propagation from sklearn.datasets.samples_generator import make_blobs from sklearn.metrics import euclidean_distances n_clusters = 3 centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 X, _ = make_blobs(n_samples=60, n_features=2, centers=centers, cluster_std=0.4, shuffle=True, random_state=0) def test_affinity_propagation(): """Affinity Propagation algorithm """ # Compute similarities S = -euclidean_distances(X, squared=True) preference = np.median(S) * 10 # Compute Affinity Propagation cluster_centers_indices, labels = affinity_propagation( S, preference=preference) n_clusters_ = len(cluster_centers_indices)