def createCluster():
    # TODO: for the example to work properly with the new multi-class support,
    # this method should be changed to create multi-class data. I'm not doing
    # that here since I don't need it; the simplest fix is to use your own
    # multi-class data.
    X1, y1 = make_blobs(n_samples=50, centers=1, n_features=2,
                        random_state=0, center_box=(-5.0, 5.0))
    X2, y2 = make_blobs(n_samples=200, centers=1, n_features=2,
                        random_state=0, center_box=(-4.0, 6.0))
    X = np.concatenate((X1, X2), axis=0)
    y = np.concatenate((y1, [1] * len(y2)), axis=0)
    return X.tolist(), y.tolist()
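# A minimal sketch of the multi-class variant the TODO above asks for. It
# assumes numpy is imported as np and make_blobs is available; the helper
# name createMultiClassCluster is hypothetical, not from the original code.
def createMultiClassCluster(n_classes=3, samples_per_class=100):
    Xs, ys = [], []
    for label in range(n_classes):
        # one blob per class; a different random_state per class gives each
        # class its own randomly placed center
        Xi, _ = make_blobs(n_samples=samples_per_class, centers=1,
                           n_features=2, random_state=label)
        Xs.append(Xi)
        ys.append([label] * samples_per_class)
    X = np.concatenate(Xs, axis=0)
    y = np.concatenate(ys, axis=0)
    return X.tolist(), y.tolist()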
def knn_evaluation():
    print('KNeighborsClassifier')
    np.random.seed(123)
    dataset, true_labels = make_blobs(n_samples=10000, n_features=2)
    color = ['r-', 'b-']
    methods = [True, False]
    # iterate over the index so the flag, the plot color and the legend label
    # stay in sync (the original indexed methods[b] with a boolean, which
    # inverted the flag)
    for b, bootstrap in enumerate(methods):
        print('bootstrapping = %s' % bootstrap)
        misclassification_rates = []
        min_rate = np.inf
        min_k = 0
        for i in range(1, 51):
            neigh = KNeighborsClassifier(n_neighbors=i)
            scores = validation(neigh, dataset, true_labels, bootstrap)
            misclassifications = 1 - scores
            misclassification_rates.append(np.average(misclassifications))
            if min_rate > misclassification_rates[i - 1]:
                min_rate = misclassification_rates[i - 1]
                min_k = i
        print('minimum rate = %s' % min_rate)
        print('best k = %s' % min_k)
        label = 'bootstrap' if bootstrap else 'cross-validation'
        pyplot.plot(range(1, 51), misclassification_rates, color[b],
                    label=label)
    pyplot.title('Misclassification rates of KNeighborsClassifier')
    pyplot.xlabel('Values of k')
    pyplot.ylabel('Misclassification rates')
    pyplot.legend(loc='upper right')
    pyplot.show()
def test_grid_search_iid():
    # test the iid parameter on noise-free, simple 2d data
    X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]],
                      random_state=0, cluster_std=0.1, shuffle=False,
                      n_samples=80)
    # split the dataset into two folds that are not iid:
    # the first fold contains data from all 4 blobs, the second from only two
    mask = np.ones(X.shape[0], dtype=bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    svm = SVC(kernel='linear')
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]
    # once with iid=True (the default)
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
    grid_search.fit(X, y)
    _, average_score, scores = grid_search.cv_scores_[0]
    assert_array_almost_equal(scores, [1, 1. / 3.])
    # for the first split, 1/4 of the dataset is in the test set, for the
    # second 3/4, so take the weighted average
    assert_almost_equal(average_score, 1 * 1. / 4. + 1. / 3. * 3. / 4.)
    # once with iid=False
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv,
                               iid=False)
    grid_search.fit(X, y)
    _, average_score, scores = grid_search.cv_scores_[0]
    # scores are the same as above
    assert_array_almost_equal(scores, [1, 1. / 3.])
    # the averaged score is just the mean of the scores
    assert_almost_equal(average_score, np.mean(scores))
def test_bin_seeds():
    # Test the bin seeding technique used in the mean shift algorithm.
    # Data is just 6 points in the plane
    X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
                  [2., 1.], [2.1, 1.1], [0., 0.]])
    # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.), (0., 0.)}
    test_bins = get_bin_seeds(X, 1, 1)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0
    # With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.)}
    test_bins = get_bin_seeds(X, 1, 2)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0
    # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found;
    # we bail out and use the whole data here
    with warnings.catch_warnings(record=True):
        test_bins = get_bin_seeds(X, 0.01, 1)
    assert_array_almost_equal(test_bins, X)
    # tight clusters around [0, 0] and [1, 1], only get two bins
    X, _ = make_blobs(n_samples=100, n_features=2, centers=[[0, 0], [1, 1]],
                      cluster_std=0.1, random_state=0)
    test_bins = get_bin_seeds(X, 1)
    assert_array_equal(test_bins, [[0, 0], [1, 1]])
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS clustering labels differ from DBSCAN's by at most 5%
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)
    # calculate OPTICS with a DBSCAN extraction at the given epsilon
    op = OPTICS(min_samples=min_samples).fit(X)
    core_optics, labels_optics = op.extract_dbscan(eps)
    # calculate DBSCAN labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    contingency = contingency_matrix(db.labels_, labels_optics)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree
    # verify that the core labels match
    assert_array_equal(core_optics, db.core_sample_indices_)
    non_core_count = len(labels_optics) - len(core_optics)
    percent_mismatch = np.round((disagree - 1) / non_core_count, 2)
    # verify that the label mismatch is <= 5% of labels
    assert percent_mismatch <= 0.05
def exercise_2a():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2,
                      random_state=0)
    # plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    # plt.show()
    kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test, y_test))
            iterator += 1
        accuracy_lst[k - 1, 0] = accuracy_current.mean()
        # accuracy_lst[k - 1, 1] = accuracy_current.std()  # 95% confidence interval
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def exercise_1():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2,
                      random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False,
                                random_state=None)
    # kf = cross_validation.ShuffleSplit(1000, n_iter=25, test_size=0.1,
    #                                    train_size=0.9, random_state=None)
    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1, 50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append(zero_one_loss(y_test, clf.predict(X_test)))
            # error.append(1. - clf.score(X_test, y_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
        error_total[k - 1, 0] = np.array(error).mean()
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def exercise_2b():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2,
                      random_state=0)
    # note: only the first 100 samples are split here (ShuffleSplit's first
    # argument is the number of samples in the old API)
    kf = ShuffleSplit(100, train_size=0.9, test_size=0.1, random_state=0)
    # kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test, y_test))
            iterator += 1
            print(mean_squared_error(y_test, clf.predict(X_test)))
        accuracy_lst[k - 1, 0] = accuracy_current.mean()
        accuracy_lst[k - 1, 1] = accuracy_current.var()  # *2 for a 95% confidence interval
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 1], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K')
    plt.ylabel('Variance')
    plt.show()
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0,
                      cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200,
                        fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])  # expects a 2d array of samples
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives the same result as fit_predict
    # There's a very small chance of failure with elkan on an unstructured
    # dataset because the predict method uses fast euclidean distance
    # computations which may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    # test_predict_equal_labels. This test has the added effect of
    # testing idempotence of the fitting procedure which appears to
    # be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)
        assert_array_equal(labels_1, labels_2)
def get_toy_classification_data(n_samples=100, centers=3, n_features=2,
                                type_data="blobs"):
    # generate a 2d classification dataset
    if type_data == "blobs":
        X, y = make_blobs(n_samples=n_samples, centers=centers,
                          n_features=n_features)
    elif type_data == "moons":
        X, y = make_moons(n_samples=n_samples, noise=0.1)
    elif type_data == "circles":
        X, y = make_circles(n_samples=n_samples, noise=0.05)

    # scatter plot, dots colored by class value
    # df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
    # colors = {0: 'red', 1: 'blue', 2: 'green'}
    # fig, ax = pyplot.subplots()
    # grouped = df.groupby('label')
    # for key, group in grouped:
    #     group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
    # pyplot.show()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=None)
    classes = np.unique(y_train)

    if 0:  # disabled: sklearn's OneHotEncoder, kept for reference
        enc = OneHotEncoder().fit(classes.reshape(-1, 1))
        y_train = enc.transform(y_train.reshape(-1, 1))
        print(y_test)
        y_test = enc.transform(y_test.reshape(-1, 1))
        print(y_test)

    y_train = one_hot_encode(y_train, classes)
    y_test = one_hot_encode(y_test, classes)

    return X_train, y_train, X_test, y_test, classes
def tree_evaluation():
    print('DecisionTreeClassifier')
    np.random.seed(123)
    dataset, true_labels = make_blobs(n_samples=10000, n_features=2)
    color = ['r-', 'b-']
    methods = [True, False]
    # iterate over the index so the flag, the plot color and the legend label
    # stay in sync (the original indexed methods[b] with a boolean, which
    # inverted the flag)
    for b, bootstrap in enumerate(methods):
        print('bootstrapping = %s' % bootstrap)
        misclassification_rates = []
        min_rate = np.inf
        min_k = 0
        for i in range(2, 16):
            tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
            scores = validation(tree_classifier, dataset, true_labels,
                                bootstrap)
            misclassifications = 1 - scores
            misclassification_rates.append(np.average(misclassifications))
            if min_rate > misclassification_rates[i - 2]:
                min_rate = misclassification_rates[i - 2]
                min_k = i
        print('minimum rate = %s' % min_rate)
        print('best depth = %s' % min_k)
        label = 'bootstrap' if bootstrap else 'cross-validation'
        pyplot.plot(range(2, 16), misclassification_rates, color[b],
                    label=label)
    pyplot.title('Misclassification rates of DecisionTreeClassifier')
    pyplot.xlabel('Values of max_depth')
    pyplot.ylabel('Misclassification rates')
    pyplot.legend(loc='upper left')
    pyplot.show()
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, mode="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers), random_state=0, mode="amg")
def generate_anisotropically_clusters(number_of_samples, number_of_clusters,
                                      n_features=2, variances=None,
                                      filename=""):
    """
    :param number_of_samples: The total number of points, divided equally among clusters.
    :param number_of_clusters: The number of clusters to generate.
    :param n_features: The number of features for each sample.
    :param variances: The standard deviations of the clusters.
    :param filename: The file in which to store the results.
    :return: The generated samples and their cluster labels.
    """
    if variances is None:
        variances = [0.5 for _ in range(number_of_clusters)]
    if filename == "":
        filename = "./Data/anisotropically_" + str(number_of_samples) \
                   + "_features_" + str(n_features) \
                   + "_cluster_" + str(number_of_clusters) + ".csv"
    random_state = 170
    X, y = make_blobs(n_samples=number_of_samples, centers=number_of_clusters,
                      n_features=n_features, random_state=random_state,
                      cluster_std=variances)
    # apply a random linear transformation to make the blobs anisotropic
    transformation = np.array([[random() if i == j else uniform(-1, 1)
                                for j in range(n_features)]
                               for i in range(n_features)])
    X = np.dot(X, transformation)
    features = ["features_" + str(i + 1) for i in range(n_features)]
    df = pd.DataFrame()
    for i, feature in enumerate(features):
        df[feature] = X[:, i]
    df["class"] = y
    df.to_csv(filename, index=False)
    return X, y
def test_soft():
    X, Y = make_blobs(n_samples=10, centers=2, n_features=2, random_state=1)
    # relabel class 0 as -1 for the SVM
    for i in range(0, len(Y)):
        if Y[i] == 0:
            Y[i] = -1.0

    X1, y1, X2, y2 = gen_lin_separable_data()
    # X1, y1, X2, y2 = gen_lin_separable_overlap_data()
    X_train, y_train = split_train(X1, y1, X2, y2)
    # X_test, y_test = split_test(X1, y1, X2, y2)

    clf = SVM(C=0.1)
    # clf.fit(X_train, y_train)
    clf.fit(X, Y)

    # y_predict = clf.predict(X_test)
    # correct = np.sum(y_predict == y_test)
    # print("%d out of %d predictions correct" % (correct, len(y_predict)))

    plot_contour(X_train[y_train == 1], X_train[y_train == -1], clf)
def test_fitted_model(self):
    # non-centered, sparse centers to check the fit
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_samples = 100
    n_clusters, n_features = centers.shape
    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    cbook = CoodeBook(n_words=3)
    cbook = cbook.fit(X)  # TODO: is the reassignment needed, or is cbook.fit(X) enough?

    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = cbook.get_dictionary()
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = cbook.predict(X)
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(cbook.cluster_core.inertia_, 0.0)

    # check that the descriptor looks like the homogeneous PDF used
    # to create the original samples
    cbook_hist = cbook.get_BoF_descriptor(X)
    expected_value = float(1) / cbook.n_words
    for bin_value in cbook_hist[0]:
        assert_less(round(bin_value - expected_value, 3), 0.01)
def main():
    import matplotlib.pyplot as plt
    from sklearn.datasets.samples_generator import make_blobs

    n_centers = 3
    X, y = make_blobs(n_samples=1000, centers=n_centers, n_features=2,
                      cluster_std=0.7, random_state=0)

    # Run this K-Means implementation
    import kmeans
    t0 = time.time()
    y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val_seq[-1]))
    print("Time taken (this implementation): {}".format(t1 - t0))

    # Run scikit-learn's K-Means
    from sklearn.cluster import k_means
    t0 = time.time()
    centers, y_pred, obj_val = k_means(X, n_centers, random_state=0)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val))
    print("Time taken (Scikit, 1 job): {}".format(t1 - t0))

    # Plot the change in objective value over iterations
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(obj_val_seq, 'b-', marker='*')
    fig.suptitle("Change in K-means objective value across iterations")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Objective value")
    fig.show()

    # Plot the data
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    fig = plt.figure(figsize=plt.figaspect(0.5))  # twice as wide to accommodate both plots
    ax = fig.add_subplot(121)
    ax.set_title("Data with true labels and final centers")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y == k, 0], X[y == k, 1], color + '.')

    # This is valid because we always use the same random seed.
    initial_centers = kmeans.init_centers(X, n_centers, 2)
    # Plot the initial centers
    for x in initial_centers:
        ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8)
    # Plot the final centers
    for x in centers:
        ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8)

    # Plot the assignments
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    ax = fig.add_subplot(122)
    ax.set_title("Data with final assignments")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y_pred == k, 0], X[y_pred == k, 1], color + '.')

    fig.tight_layout()
    fig.gca()
    fig.show()
def plot_sgd_classifier(num_samples, clt_std):
    # generate the data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)

    # fit the data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)

    # plot the data and the decision boundary
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)

    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)

    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function([[x1, x2]])  # expects a 2d array of samples
        Z[i, j] = conf_score[0]

    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']

    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')

    ax.contour(X_, Y_, Z, colors=colors, levels=levels,
               linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=y)
def iplot(N_points=100, n_clusters=2):
    X, y = make_blobs(n_samples=N_points, centers=n_clusters,
                      random_state=0, cluster_std=0.60)

    def _kmeans_step(k=n_clusters, frame=0):
        rng = np.random.RandomState(2)
        labels = np.zeros(X.shape[0])
        centers = X[rng.randint(N_points, size=k), :]
        nsteps = frame // 3

        for i in range(nsteps + 1):
            old_centers = centers
            if i < nsteps or frame % 3 > 0:
                dist = euclidean_distances(X, centers)
                labels = dist.argmin(1)
            if i < nsteps or frame % 3 > 1:
                centers = np.array([X[labels == j].mean(0)
                                    for j in range(k)])
                nans = np.isnan(centers)
                centers[nans] = old_centers[nans]

        # plot the data and the cluster centers
        fig = plt.figure(figsize=(8, 6))
        plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='rainbow')
        plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='s',
                    c="white", s=200)
        plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='s',
                    c=np.arange(k), s=50, cmap='rainbow')

        # plot the new centers if this is the third frame
        if frame % 3 == 2:
            for i in range(k):
                plt.annotate('', centers[i], old_centers[i],
                             arrowprops=dict(arrowstyle='->', linewidth=1))
            plt.scatter(centers[:, 0], centers[:, 1], marker='s', c="white",
                        s=200, cmap='rainbow')
            plt.scatter(centers[:, 0], centers[:, 1], marker='s',
                        c=np.arange(k), s=50, cmap='rainbow')

        plt.xlim(-4, 4)
        plt.ylim(-2, 10)

        if frame % 3 == 1:
            plt.text(3.8, 9.5, "1. Reassign labels",
                     ha='right', va='top', size=14)
        elif frame % 3 == 2:
            plt.text(3.8, 9.5, "2. Compute centroids",
                     ha='right', va='top', size=14)

    frame_range = [0, 20]
    k_range = [2, n_clusters + 2]
    return interact(_kmeans_step, k=k_range, frame=frame_range)
def test_spectral_unknown_mode():
    # Test that SpectralClustering fails with an unknown mode set.
    centers = np.array([[0.0, 0.0, 0.0],
                        [10.0, 10.0, 10.0],
                        [20.0, 20.0, 20.0]])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1.0, random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, mode="<unknown>")
def test_minibatch_sensible_reassign_partial_fit():
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1.0, random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42,
                                 init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
def test_bad_reachability():
    msg = "All reachability values are inf. Set a larger max_eps."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)
    clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10)
    assert_raise_message(ValueError, msg, clust.fit, X)
def plot_kmeans_interactive():
    from IPython.html.widgets import interact
    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.datasets.samples_generator import make_blobs

    X, y = make_blobs(n_samples=300, centers=4, random_state=0,
                      cluster_std=0.60)

    def _kmeans_step(frame, n_clusters):
        rng = np.random.RandomState(2)
        labels = np.zeros(X.shape[0])
        centers = rng.randn(n_clusters, 2)
        nsteps = frame // 3

        for i in range(nsteps + 1):
            old_centers = centers
            if i < nsteps or frame % 3 > 0:
                dist = euclidean_distances(X, centers)
                labels = dist.argmin(1)
            if i < nsteps or frame % 3 > 1:
                centers = np.array([X[labels == j].mean(0)
                                    for j in range(n_clusters)])
                nans = np.isnan(centers)
                centers[nans] = old_centers[nans]

        # plot the data and the cluster centers
        plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='rainbow')
        plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o',
                    c=np.arange(n_clusters), s=200, cmap='rainbow')
        plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o',
                    c='black', s=50)

        # plot the new centers if this is the third frame
        if frame % 3 == 2:
            for i in range(n_clusters):
                plt.annotate('', centers[i], old_centers[i],
                             arrowprops=dict(arrowstyle='->', linewidth=1))
            plt.scatter(centers[:, 0], centers[:, 1], marker='o',
                        c=np.arange(n_clusters), s=200, cmap='rainbow')
            plt.scatter(centers[:, 0], centers[:, 1], marker='o',
                        c='black', s=50)

        plt.xlim(-4, 4)
        plt.ylim(-2, 10)

        if frame % 3 == 1:
            plt.text(3.8, 9.5, "1. Reassign points to nearest centroid",
                     ha='right', va='top', size=14)
        elif frame % 3 == 2:
            plt.text(3.8, 9.5, "2. Update centroids to cluster means",
                     ha='right', va='top', size=14)

    return interact(_kmeans_step, frame=[0, 50], n_clusters=[3, 5])
def test_spectral_clustering_sparse():
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    S = rbf_kernel(X, gamma=1)
    S = np.maximum(S - 1e-4, 0)
    S = sparse.coo_matrix(S)

    labels = SpectralClustering(random_state=0, n_clusters=2,
                                affinity="precomputed").fit(S).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
def test_unsupervised_grid_search():
    # test grid search with an unsupervised estimator
    X, y = make_blobs(random_state=0)
    km = KMeans(random_state=0)
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                               score_func=adjusted_rand_score)
    grid_search.fit(X)
    # the largest number of clusters should score best
    assert_equal(grid_search.best_params_["n_clusters"], 4)
def test_bad_reachability():
    msg = "All reachability values are inf. Set a larger max_eps."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    with pytest.warns(UserWarning, match=msg):
        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
        clust.fit(X)
def init_sample():
    # centers of the generated test data
    centers = [[1, 1], [-1, -1], [1, -1]]
    # generate the data
    Xn, labels_true = make_blobs(n_samples=150, centers=centers,
                                 cluster_std=0.5, random_state=0)
    # length of the data, i.e. the number of data points
    dataLen = len(Xn)
    return Xn, dataLen
def from_blobs(cls):
    X, y = make_blobs(
        n_samples=1000,
        centers=5,
        cluster_std=1,
        n_features=5,
    )
    return cls(X, y)
def test_affinities():
    X, y = make_blobs(n_samples=40, random_state=1,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.4)
    # nearest-neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors",
                            random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets.samples_generator import make_blobs

# cluster standard deviations to compare
cluster_stds = [0.01, 0.1, 0.5, 1, 10, 1000]
fig, ax = plt.subplots(6, figsize=(10, 20))
for i in range(len(cluster_stds)):
    X, y = make_blobs(n_samples=150, centers=3, random_state=0,
                      cluster_std=cluster_stds[i])
    ax[i].scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plt.show()
#!/usr/bin/env python3
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs

# #############################################################################
# Generate sample data
centers = [[0.4, 0.4], [-0.4, -0.4], [0.4, -0.4]]
X, labels_true = make_blobs(n_samples=30, centers=centers, cluster_std=0.1,
                            random_state=0)

# #############################################################################
# Plot the initial, unclustered data
import matplotlib.pyplot as plt
from itertools import cycle
import tikzplotlib

plt.close('all')
plt.figure(1)
plt.clf()
for p in X:
    plt.plot(p[0], p[1], 'k.')
plt.title('Unclustered dataset')
tikzplotlib.save("figures/ap_unclust.tex")

# #############################################################################
# Compute Affinity Propagation
af = AffinityPropagation(max_iter=1000, damping=0.5).fit(X)
def main():
    epochs = 100
    alpha = 0.01
    batch_size = 32

    (X, y) = make_blobs(n_samples=10000, n_features=2, centers=2,
                        cluster_std=2.5, random_state=95)

    # insert a column of 1's as the first entry in the feature
    # vector -- this is a little trick that allows us to treat
    # the bias as a trainable parameter *within* the weight matrix
    # rather than an entirely separate variable
    X = np.c_[np.ones((X.shape[0])), X]

    # initialize our weight matrix so it has the same number of
    # columns as our input features
    print("[INFO] starting training...")
    W = np.random.uniform(size=(X.shape[1],))

    # initialize a list to store the loss value for each epoch
    lossHistory = []

    # loop over the desired number of epochs
    for epoch in np.arange(0, epochs):
        # initialize the total loss for the epoch
        epochLoss = []

        # loop over our data in batches
        for (batchX, batchY) in next_batch(X, y, batch_size):
            # take the dot product between our current batch of features
            # and the weight matrix `W`, then pass this value through the
            # sigmoid activation function
            preds = sigmoid_activation(batchX.dot(W))

            # now that we have our predictions, determine our `error`,
            # which is the difference between the predictions and the
            # true values
            error = preds - batchY

            # given our `error`, compute the total loss value on the
            # batch as the sum of squared loss
            loss = np.sum(error ** 2)
            epochLoss.append(loss)

            # the gradient update is the dot product between the
            # transpose of our current batch and the error on the batch
            gradient = batchX.T.dot(error) / batchX.shape[0]

            # use the gradient computed on the current batch to take
            # a "step" in the correct direction
            W += -alpha * gradient

        # update our loss history list by taking the average loss
        # across all batches
        lossHistory.append(np.average(epochLoss))

    # compute the line of best fit by setting the sigmoid function
    # to 0 and solving for X2 in terms of X1
    Y = (-W[0] - (W[1] * X)) / W[2]

    # plot the original data along with our line of best fit
    plt.figure()
    plt.scatter(X[:, 1], X[:, 2], marker="o", c=y)
    plt.plot(X, Y, "r-")

    # construct a figure that plots the loss over time
    fig = plt.figure()
    plt.plot(np.arange(0, epochs), lossHistory)
    fig.suptitle("Training Loss")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss")
    plt.show()
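# The snippet above relies on two helpers that are not shown. A minimal
# sketch of what they might look like, assuming plain numpy mini-batching
# and a standard logistic sigmoid (the names come from the calls above,
# but these bodies are assumptions, not the original code):
def sigmoid_activation(x):
    # standard logistic function, squashes values into (0, 1)
    return 1.0 / (1 + np.exp(-x))

def next_batch(X, y, batch_size):
    # yield successive mini-batches of (features, labels)
    for i in np.arange(0, X.shape[0], batch_size):
        yield (X[i:i + batch_size], y[i:i + batch_size])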
from sklearn import metrics
"""
Author: limlin
Contact: [email protected]
Datetime: 2020/12/4 9:29
Software: PyCharm
Profile: https://www.cnblogs.com/hellojiaojiao/p/10758408.html
"""
"""
Generate some sample data: X holds the sample features, y the cluster labels.
500 samples with 2 features each, drawn from 3 clusters centered at [2, 3],
[3, 0] and [1, 1], with standard deviations [0.4, 0.5, 0.2] respectively.
"""
X, y = make_blobs(n_samples=500, n_features=2,
                  centers=[[2, 3], [3, 0], [1, 1]],
                  cluster_std=[0.4, 0.5, 0.2], random_state=9)
"""
First plot the distribution of the generated samples.
"""
plt.scatter(X[:, 0], X[:, 1], marker='o')
plt.show()
"""
Now look at the clustering quality for different values of k.
"""
score_all = []
list1 = range(2, 6)  # k cannot be 0 or 1
for i in range(2, 6):
    y_pred = KMeans(n_clusters=i, random_state=9).fit_predict(X)
    # the original snippet is truncated here; presumably a clustering score
    # is collected, e.g. score_all.append(metrics.calinski_harabasz_score(X, y_pred))
def estimateMeanConvergence(noMoveClick, moveClick, surpriseMe, noOfResets,
                            alpha, noOfSamples):
    # declare the variables below as global since they are reassigned in
    # this function
    global noMoves
    global moves
    global surprises
    global i
    global resetCount
    global my_centers
    global agentR
    global agentB
    global max_iter
    global gmm
    global ks_df_OLD
    global X_old
    global Y_old

    # indicator for whether the KS statistic needs to be run
    runKS = 0
    # indicator for whether the same distribution is selected
    sameDist = 0

    # reinitialize variables to their initial values if the reset button is pressed
    if noOfResets > resetCount:
        resetCount = noOfResets
        noMoves = 0
        moves = 0
        surprises = 0
        i = 0
        random.seed(1)
        max_iter = 100
        agentR = (-25, -25)
        agentB = (25, -25)
        my_centers = ((-5, -5), (5, 5))
        gmm = None
        ks_df_OLD = None
        X_old = None
        Y_old = None

    # detect which button was pressed and generate data accordingly
    if noMoveClick > noMoves:
        move = 0
        sameDist = 1
        noMoves = noMoveClick
    elif moveClick > moves:
        move = 1
        moves = moveClick
    elif surpriseMe > surprises:
        move = random.randint(0, 1)
        runKS = 1
        surprises = surpriseMe
    else:
        print("Button press not detected. Assuming same distribution")
        move = 0
        sameDist = 1

    print("iteration: %d" % i)

    # assign the centers of the distributions
    my_centers = ((my_centers[0][0] + 1 * move, my_centers[0][1] + 4 * move),
                  (my_centers[1][0] - 3 * move, my_centers[1][1] - 1 * move))

    # draw samples
    X, y_true = make_blobs(n_samples=int(noOfSamples), centers=my_centers,
                           cluster_std=1.5, random_state=i)

    # stack observations if the new data comes from the same distribution
    # (in order to incorporate more data into the estimate of the mean);
    # otherwise, only use the new data.
    # note: assumes independence of x and y
    if i != 0:
        # fit a GMM using the previous means as the initial means
        gmm = GaussianMixture(n_components=2, means_init=gmm.means_,
                              max_iter=max_iter).fit(X)

        # extract the predicted labels
        labels = gmm.predict(X)
        ks_df = pd.DataFrame(np.column_stack((X, labels, y_true)))
        ks_df.columns = ['x1', 'x2', 'labels', 'true_labels']

        # if "surprise me" is selected and the estimated underlying
        # distributions have not changed, stack the data.
        # note: assumes independence of x and y (i.e. uses univariate KS tests)
        if runKS == 1:
            print("Random Distribution:%d" % move)
            print(ks_2samp(ks_df[ks_df['labels'] == 1]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x1'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 1]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x2'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 0]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x1'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 0]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x2'])[1])
            if ks_2samp(ks_df[ks_df['labels'] == 1]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x1'])[1] > .99 and \
               ks_2samp(ks_df[ks_df['labels'] == 1]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x2'])[1] > .99 and \
               ks_2samp(ks_df[ks_df['labels'] == 0]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x1'])[1] > .99 and \
               ks_2samp(ks_df[ks_df['labels'] == 0]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x2'])[1] > .99:
                # stack observations
                print("Same Distribution: KS")
                X = np.vstack((X_old, X))
                y_true = np.concatenate((Y_old, y_true), axis=0)

        # if the same distribution is selected, stack the data
        if sameDist == 1:
            # stack observations
            print("Same Distribution: Button")
            print(ks_2samp(ks_df[ks_df['labels'] == 1]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x1'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 1]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 1]['x2'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 0]['x1'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x1'])[1],
                  ks_2samp(ks_df[ks_df['labels'] == 0]['x2'], ks_df_OLD[ks_df_OLD['labels'] == 0]['x2'])[1])
            X = np.vstack((X_old, X))
            y_true = np.concatenate((Y_old, y_true), axis=0)

    # fit the GMM
    gmm = GaussianMixture(n_components=2, init_params='kmeans',
                          max_iter=max_iter).fit(X)

    # extract the predicted labels
    labels = gmm.predict(X)

    # create the KS dataframe for the next iteration
    ks_df_OLD = pd.DataFrame(np.column_stack((X, labels, y_true)))
    ks_df_OLD.columns = ['x1', 'x2', 'labels', 'true_labels']

    # update the target means for the agents to seek
    B_mean_estimate, R_mean_estimate = tuple(gmm.means_[0]), tuple(gmm.means_[1])

    # move the agents towards their respective estimated means:
    # distances from the agents to the respective means
    distanceR = (sum((np.array(R_mean_estimate) - np.array(agentR)) ** 2)) ** (1 / 2)  # unweighted, no log prob
    distanceB = (sum((np.array(B_mean_estimate) - np.array(agentB)) ** 2)) ** (1 / 2)  # unweighted, no log prob

    # calculate the angles
    angle_degreeR = math.degrees(
        math.atan2(R_mean_estimate[1] - agentR[1],
                   R_mean_estimate[0] - agentR[0]))
    angle_degreeB = math.degrees(
        math.atan2(B_mean_estimate[1] - agentB[1],
                   B_mean_estimate[0] - agentB[0]))

    # scale factor (if set to 1, the agent moves all the way to the mean of
    # its distribution); alpha replaces the old hard-coded 0.5 as the
    # learning rate
    scaleR = alpha
    scaleB = alpha

    # set the new agent location (red)
    agentR = (agentR[0] + scaleR * distanceR * math.cos(angle_degreeR * math.pi / 180),
              agentR[1] + scaleR * distanceR * math.sin(angle_degreeR * math.pi / 180))

    # set the new agent location (blue)
    agentB = (agentB[0] + scaleB * distanceB * math.cos(angle_degreeB * math.pi / 180),
              agentB[1] + scaleB * distanceB * math.sin(angle_degreeB * math.pi / 180))

    # copy the data for the next iteration
    X_old = np.copy(X)
    Y_old = np.copy(y_true)

    # increment the iteration counter
    i += 1
        # tail of the kmeans loop: assign points to the nearest centers,
        # then recompute the centers from the resulting clusters
        clusters = cercanos(puntos, cent)
        cent = centros(clusters)
    return cent


# N = 100
# x = np.random.rand(N)
# y = np.random.rand(N)
# data = [[x, y] for x, y in zip(x, y)]

if __name__ == "__main__":
    X, y_true = make_blobs(n_samples=300, centers=4,
                           cluster_std=0.60, random_state=0)
    plt.scatter(X[:, 0], X[:, 1], s=50)

    # K-Means via our own implementation
    cent = kmeans(X, k=4)
    plt.scatter([c[0] for c in cent], [c[1] for c in cent],
                c='black', s=200, alpha=0.5)
    plt.show()

    # scikit-learn's KMeans
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=4)
    kmeans.fit(X)
    y_kmeans = kmeans.predict(X)
    plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')
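# The fragment above calls two helpers that are not included: cercanos
# (assign each point to its nearest center) and centros (recompute each
# cluster's center). A minimal sketch of what they might look like, assuming
# numpy is imported as np; these bodies are guesses, not the original code:
def cercanos(puntos, cent):
    # group points by the index of their nearest center (squared distance)
    clusters = [[] for _ in cent]
    for p in puntos:
        idx = min(range(len(cent)),
                  key=lambda j: sum((pi - ci) ** 2
                                    for pi, ci in zip(p, cent[j])))
        clusters[idx].append(p)
    return clusters

def centros(clusters):
    # the new center of each non-empty cluster is the mean of its points
    return [np.mean(c, axis=0) for c in clusters if len(c) > 0]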
import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.datasets.samples_generator import make_blobs

# #############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)

# #############################################################################
# Compute clustering with KMeans

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0

# #############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
                      n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets.samples_generator import make_blobs
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import style

style.use("ggplot")

centers = [[1, 1, 1], [5, 5, 5], [3, 10, 10]]

X, _ = make_blobs(n_samples=100, centers=centers, cluster_std=1.5)

ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
print(cluster_centers)

numClusters = len(np.unique(labels))
print("Number of estimated clusters: ", numClusters)

colors = 10 * ['r', 'g', 'b', 'c', 'k', 'y', 'm']
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
           cluster_centers[:, 2], marker="x", color='k', s=150,
           linewidths=5, zorder=10)
    dataset['Silhouette'], dataset['Calinski Harabasz'], dataset[
        'Davies Bouldin'] = other_validations(X, y, verbose=report)
    outliers = np.sum(y == -1) / np.sum(mass)
    if SK:
        draw_symbol(k, dataset, clusters, mm, kinship, kdens, volr, outliers)
    return y, dataset, clusters


if __name__ == '__main__':
    from sklearn.datasets.samples_generator import make_blobs
    X, y_real = make_blobs(n_samples=1500, centers=7, n_features=2,
                           random_state=0, cluster_std=0.6)

    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
    scaler.fit(X)
    X = scaler.transform(X)

    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
    y = kmeans.predict(X)

    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98,
'''
>>> from sklearn.datasets.samples_generator import make_blobs
>>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
...                   random_state=0)
>>> print(X.shape)
(10, 2)
>>> y
array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
'''
from sklearn.datasets.samples_generator import make_blobs

X, y = make_blobs(n_samples=10, centers=3, n_features=3, random_state=0)
print(X)
print(X.shape)
print(y)
# scatter plot of a blobs dataset
from sklearn.datasets.samples_generator import make_blobs
from matplotlib import pyplot
from numpy import where

# generate a 2d classification dataset
X, y = make_blobs(n_samples=1000, centers=3, n_features=2, cluster_std=2,
                  random_state=2)
# scatter plot for each class value
for class_value in range(3):
    # select the indices of points with this class label
    row_ix = where(y == class_value)
    # scatter plot these points with a different color
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
pyplot.show()
# In[1]:

import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
import numpy as np
from random import randint
import math
from sklearn.utils import shuffle

# In[2]:

# original data
(X, y) = make_blobs(n_samples=500, n_features=20, centers=2,
                    cluster_std=3.1, random_state=95)

# In[3]:

# zero out some randomly chosen features in each sample
for i in range(len(X)):
    for j in range(len(X[0]) - 8):
        rand_no = randint(0, len(X[0]) - 1)
        X[i][rand_no] = 0

# In[4]:

X = np.c_[np.ones((X.shape[0])), X]  # for absorbing the bias
from sklearn.cluster.k_means_ import _labels_inertia
from sklearn.cluster.k_means_ import _mini_batch_step
from sklearn.datasets.samples_generator import make_blobs
from io import StringIO
from sklearn.metrics.cluster import homogeneity_score

# non-centered, sparse centers to check the fit
centers = np.array([
    [0.0, 5.0, 0.0, 0.0, 0.0],
    [1.0, 1.0, 4.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 5.0, 1.0],
])
n_samples = 100
n_clusters, n_features = centers.shape
X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                            cluster_std=1., random_state=42)
X_csr = sp.csr_matrix(X)


@pytest.mark.parametrize("representation, algo",
                         [('dense', 'full'),
                          ('dense', 'elkan'),
                          ('sparse', 'full')])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_kmeans_results(representation, algo, dtype):
    # checks that kmeans works as intended
    array_constr = {'dense': np.array,
                    'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
import random

centers = random.randrange(2, 5)

X, y = make_blobs(n_samples=20, centers=centers, n_features=2)

## X = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6],
##               [9, 11], [8, 2], [10, 2], [9, 3]])

# plt.scatter(X[:, 0], X[:, 1], s=150)
# plt.show()


class Mean_Shift:
    def __init__(self, radius=None, radius_norm_step=100):
        self.radius = radius
        self.radius_norm_step = radius_norm_step

    def fit(self, data):
        if self.radius is None:
                         for centroid in range(len(self.centroids))]
            classification = distances.index(min(distances))
            self.classifications[classification].append(feature_set)

    def predict(self, data):
        distances = [np.linalg.norm(data - self.centroids[centroid])
                     for centroid in range(len(self.centroids))]
        classification = distances.index(min(distances))
        return classification


for _ in range(10):
    X, y = make_blobs(n_samples=100, centers=3, n_features=2)
    # X = np.array([[1, 2], [1.5, 1.8], [5, 8], [1, 0.6], [8, 8],
    #               [9, 11], [8, 2], [10, 2], [9, 3]])

    colors = 100 * ["g", "r", "c", "b", "k"]

    # plt.scatter(X[:, 0], X[:, 1], color="b", s=150, linewidths=5, marker="o")
    # plt.show()

    clf = MeanShift()
    clf.fit(X)

    centroids = clf.centroids

    for classification in clf.classifications:
        color = colors[classification]
        for feature_set in clf.classifications[classification]:
            plt.scatter(feature_set[0], feature_set[1], marker='o',
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs
import pandas as pd
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

X, y = make_blobs(n_samples=5000,
                  centers=[[4, 4], [-2, -1], [2, -3], [1, 1]],
                  cluster_std=0.9)
plt.scatter(X[:, 0], X[:, 1], marker='.')

k_means = KMeans(init="k-means++", n_clusters=4, n_init=12)
k_means.fit(X)
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
print(k_means_cluster_centers)

# Initialize the plot with the specified dimensions.
fig = plt.figure(figsize=(6, 4))

# Colors uses a color map, which will produce an array of colors based on
# the number of labels there are. We use set(k_means_labels) to get the
# unique labels.
colors = plt.cm.Spectral(np.linspace(0, 1, len(set(k_means_labels))))

# Create a plot
ax = fig.add_subplot(1, 1, 1)
ap = argparse.ArgumentParser()
ap.add_argument("-e", "--epochs", type=float, default=60,
                help="# of epochs")
ap.add_argument("-a", "--alpha", type=float, default=0.8,
                help="learning rate")
ap.add_argument("-b", "--batch-size", type=int, default=32,
                help="size of SGD mini-batches")
args = vars(ap.parse_args())

# samples standing in for the first 100 rows of the iris data
(X, Theta) = make_blobs(n_samples=100, n_features=2, centers=2,
                        cluster_std=2.5, random_state=95)
X = np.c_[np.ones((X.shape[0])), X]

print("[INFO] starting training...")
W = np.random.uniform(size=(X.shape[1],))

lossHistory = []

for epoch in np.arange(0, args["epochs"]):
    epochLoss = []

    for (batchX, batchY) in h(X, Theta, args["epochs"]):
        preds = sigmoid_activation(batchX.dot(W))
        error = preds - batchY
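# The loop above depends on two helpers that are not shown: a logistic
# sigmoid_activation (see the sketch after the earlier SGD snippet) and h,
# which, from how it is called, appears to be a mini-batch generator. A
# minimal sketch under that assumption (the body of h is a guess, not the
# original code; note that the call above passes args["epochs"] where a
# batch size seems intended, which may be a bug in the source):
def h(X, y, batch_size):
    # yield successive mini-batches of (features, targets)
    batch_size = int(batch_size)
    for start in range(0, X.shape[0], batch_size):
        yield X[start:start + batch_size], y[start:start + batch_size]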
'''
@Author: Runsen
@WeChat official account: 润森笔记
@Blog: https://blog.csdn.net/weixin_44510615
@Date: 2020/4/27
'''
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets.samples_generator import make_blobs

sns.set()

X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')

xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
for m, b in [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]:
    plt.plot(xfit, m * xfit + b, '-k')
plt.xlim(-1, 3.5)
plt.show()

xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
    yfit = m * xfit + b
    plt.plot(xfit, yfit, '-k')
    plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none',
                     color='#AAAAAA', alpha=0.4)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 31 16:04:05 2019

@author: xsxsz
"""
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets.samples_generator import make_blobs
from sklearn.decomposition import PCA

x, y = make_blobs(n_samples=10000, n_features=3,
                  centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                  cluster_std=[0.2, 0.1, 0.2, 0.2], random_state=9)
fig = plt.figure()
ax = Axes3D(fig, rect=[0, 0, 1, 1], elev=30, azim=30)
# plot on the 3d axes (plt.scatter would treat the third positional
# argument as marker sizes, not z coordinates)
ax.scatter(x[:, 0], x[:, 1], x[:, 2], marker='o', color='g')
pca1 = PCA(n_components=3)
pca1.fit(x)
print(pca1.explained_variance_)
print('------------')
print(pca1.explained_variance_ratio_)
print('------------')
pca2 = PCA(n_components=2)
pca2.fit(x)
print(pca2.explained_variance_)
print('------------')
print(pca2.explained_variance_ratio_)
print('------------')
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets.samples_generator import make_blobs
import numpy as np

(X, Y) = make_blobs(n_samples=5, n_features=2, centers=2, random_state=50)

plt.figure()
plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y)
plt.axis([-5, 10, -12, -1])
plt.show()

positiveX = []
negativeX = []
for i, v in enumerate(Y):
    if v == 0:
        negativeX.append(X[i])
    else:
        positiveX.append(X[i])

# our data dictionary
data_dict = {-1: np.array(negativeX), 1: np.array(positiveX)}
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 14:25:31 2017

@author: UlionTse
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import Birch, KMeans, MiniBatchKMeans, DBSCAN, \
    SpectralClustering, ward_tree, AgglomerativeClustering, MeanShift, \
    AffinityPropagation
from sklearn import metrics

# data ready
X, y = make_blobs(n_samples=1000, n_features=2,
                  centers=[[0, 0], [0, 2], [2, 0], [1, 1], [2, 2]],
                  cluster_std=[0.2, 0.2, 0.2, 0.4, 0.2], random_state=9)
plt.scatter(X[:, 0], X[:, 1], marker='o')
plt.show()

# available estimators, kept for reference:
# attr_1 = KMeans(n_clusters=8)
# attr_2 = MiniBatchKMeans(n_clusters=8)
# attr_3 = Birch(n_clusters=3, threshold=0.5, branching_factor=50)
# attr_4 = ward_tree(X, n_clusters=None)
# attr_5 = AgglomerativeClustering(n_clusters=2, linkage='ward')
# attr_6 = DBSCAN(eps=0.5, leaf_size=30)
# attr_7 = MeanShift(bandwidth=None, seeds=None, min_bin_freq=1)
# attr_8 = SpectralClustering(n_clusters=8, n_init=10, gamma=1.0)
# attr_9 = AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15, affinity='euclidean')
                 markerfacecolor=col, markeredgecolor='k', markersize=3)
    # add a title showing the estimated number of clusters
    plt.title('Estimated number of clusters: %d' % nclusters)
    plt.show()


def jiangzao(labels):
    # number of clusters in the labels, ignoring noise if present
    clusters = len(set(labels)) - (1 if -1 in labels else 0)
    return clusters


def standar_scaler(points):
    p = StandardScaler().fit_transform(points)
    return p


if __name__ == "__main__":
    """
    test the DBScan class
    """
    centers = [[1, 1], [-1, -1], [-1, 1], [1, -1]]
    point, labelsTrue = make_blobs(n_samples=2000, centers=centers,
                                   cluster_std=0.4, random_state=0)
    point = standar_scaler(point)
    db = DBScan(point, labelsTrue)
    db.draw()
def kMeansClustering(self):
    abbreviations_of_companies = list(name_abbreviation_mWIG40_dict.values())
    movements = Parameters(abbreviations_of_companies)
    daily_movement = movements.daily_movement()

    # transform the dataframe into an array and transpose the matrix
    df_array = daily_movement.to_numpy().T
    # NaNs have an impact on the result, but not a significant one on a
    # large scale; zero means no change in the price of the item on a
    # given day
    df_array[np.isnan(df_array)] = 0

    style.use("seaborn-pastel")
    # make_blobs() is used to generate sample points around c centers
    # (randomly chosen)
    X, y = make_blobs(n_samples=400, centers=10, cluster_std=1, n_features=2)
    plt.scatter(X[:, 0], X[:, 1], s=5, color='r')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
    # clear the figure
    plt.clf()

    # elbow method for the optimal k
    sum_of_squared_distances = []
    K = range(1, 16)
    for k in K:
        km = KMeans(n_clusters=k)
        km = km.fit(df_array)
        sum_of_squared_distances.append(km.inertia_)
    plt.plot(K, sum_of_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()
    plt.clf()

    normalizer = Normalizer()
    kmeans = KMeans(n_clusters=13, max_iter=1500)
    pipeline = make_pipeline(normalizer, kmeans)
    pipeline.fit(df_array)
    labels = pipeline.predict(df_array)

    x = list(name_abbreviation_mWIG40_dict.values())
    y = []
    for i in x:
        y.append(sectors_mWIG40_dict[i])
    print(y)

    df = pd.DataFrame({
        'Labels': labels,
        'Companies': daily_movement.columns,
        'Economic sector': y
    }).sort_values(by=['Labels', 'Economic sector'], axis=0)
    return print(df)
def plot_kmeans_interactive(min_clusters=1, max_clusters=6):
    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.datasets.samples_generator import make_blobs

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        X, y = make_blobs(n_samples=300, centers=4, random_state=0,
                          cluster_std=0.60)

        def _kmeans_step(frame=0, n_clusters=4):
            rng = np.random.RandomState(2)
            labels = np.zeros(X.shape[0])
            centers = rng.randn(n_clusters, 2)
            nsteps = frame // 3

            for i in range(nsteps + 1):
                old_centers = centers
                if i < nsteps or frame % 3 > 0:
                    dist = euclidean_distances(X, centers)
                    labels = dist.argmin(1)
                if i < nsteps or frame % 3 > 1:
                    centers = np.array(
                        [X[labels == j].mean(0) for j in range(n_clusters)])
                    nans = np.isnan(centers)
                    centers[nans] = old_centers[nans]

            # plot the data and the cluster centers
            plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='rainbow',
                        vmin=0, vmax=n_clusters - 1)
            plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o',
                        c=np.arange(n_clusters), s=200, cmap='rainbow')
            plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o',
                        c='black', s=50)

            # plot the new centers if this is the third frame
            if frame % 3 == 2:
                for i in range(n_clusters):
                    plt.annotate('', centers[i], old_centers[i],
                                 arrowprops=dict(arrowstyle='->',
                                                 linewidth=1))
                plt.scatter(centers[:, 0], centers[:, 1], marker='o',
                            c=np.arange(n_clusters), s=200, cmap='rainbow')
                plt.scatter(centers[:, 0], centers[:, 1], marker='o',
                            c='black', s=50)

            plt.xlim(-4, 4)
            plt.ylim(-2, 10)

            if frame % 3 == 1:
                plt.text(3.8, 9.5, "1. Reassign points to nearest centroid",
                         ha='right', va='top', size=14)
            elif frame % 3 == 2:
                plt.text(3.8, 9.5, "2. Update centroids to cluster means",
                         ha='right', va='top', size=14)

    return interact(_kmeans_step, frame=[0, 50],
                    n_clusters=[min_clusters, max_clusters])
## https://machinelearningmastery.com/generate-test-datasets-python-scikit-learn/
## http://scikit-learn.org/stable/auto_examples/datasets/plot_random_dataset.html
from sklearn.datasets.samples_generator import make_blobs
from matplotlib import pyplot
from pandas import DataFrame
import numpy

# generate a 2d classification dataset
X, labels = make_blobs(n_samples=1000, n_features=2, centers=3)
colors = {0: 'red', 1: 'blue', 2: 'green'}
colored_labels = numpy.vectorize(lambda l: colors[l])(labels)
x_coordinates = X[:, 0]
y_coordinates = X[:, 1]
pyplot.scatter(x_coordinates, y_coordinates, marker='o', c=colored_labels,
               s=25, edgecolor='k')

df = DataFrame(dict(x=x_coordinates, y=y_coordinates, label=labels))
df.to_csv("data.csv")

!hdfs dfs -put -f data.csv
# import libraries
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets.samples_generator import make_blobs

# create data
centers = [[3, 3, 3], [4, 5, 5], [3, 10, 10]]
X, _ = make_blobs(n_samples=700, centers=centers, cluster_std=0.5)

# create the model
MSh = MeanShift()

# train the model
MSh.fit(X)
labels = MSh.labels_
cluster_centers = MSh.cluster_centers_
print(cluster_centers)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
from tqdm import tqdm
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import SpectralClustering, AffinityPropagation

import utils

plt.ion()
plt.show()

nb_clusters = 7
X, y_true = make_blobs(n_samples=300, centers=nb_clusters,
                       cluster_std=.80, random_state=0)
plt.title(f'Ground truth simulated data : {nb_clusters} clusters')
plt.scatter(X[:, 0], X[:, 1], s=50, c=y_true)

print(utils.internalValidation(X, y_true))
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
import random

ctq = random.randrange(2, 5)
X, y = make_blobs(n_samples=50, centers=5, n_features=2)
# X = np.array([[1, 2], [1.5, 1.6], [5, 8], [1, 0.6], [8, 8],
#               [9, 11], [8, 2], [9, 3], [10, 3]])
colors = 10 * ['g', 'r', 'c', 'k', 'b']


class MeanShift:
    def __init__(self, radius=None, rad_norm_step=100):
        self.radius = radius
        self.rad_norm_step = rad_norm_step

    def fit(self, data):
        if self.radius is None:
            # estimate a starting radius from the magnitude of the data:
            # the norm of the centroid of all points, divided into
            # rad_norm_step steps
            all_data_ctd = np.average(abs(data), axis=0)
            all_data_norm = np.linalg.norm(abs(all_data_ctd))
            self.radius = all_data_norm / self.rad_norm_step

        centroids = {}
        for i in range(len(data)):
            centroids[i] = data[i]

        while True:
            n_centroids = []
            for i in centroids:
                wts = [i for i in range(self.rad_norm_step)][::-1]
                in_bdwth = []
def load_dataset(which_dataset, N=-1, D=-1, norm_mean=False, norm_len=False,
                 num_queries=10, Ntrain=-1, D_multiple_of=-1):
    true_nn = None

    # randomly generated datasets
    if which_dataset == Random.UNIFORM:
        X_test = np.random.rand(N, D)
        X_train = np.random.rand(Ntrain, D) if Ntrain > 0 else X_test
        Q = np.random.rand(num_queries, D)
    elif which_dataset == Random.GAUSS:
        X_test = np.random.randn(N, D)
        X_train = np.random.randn(Ntrain, D) if Ntrain > 0 else X_test
        Q = np.random.randn(num_queries, D)
    elif which_dataset == Random.WALK:
        X_test = np.random.randn(N, D)
        X_test = np.cumsum(X_test, axis=1)
        X_train = np.copy(X_test)
        if Ntrain > 0:
            X_train = np.random.randn(Ntrain, D)
            X_train = np.cumsum(X_train)
        Q = np.random.randn(num_queries, D)
        Q = np.cumsum(Q, axis=-1)
    elif which_dataset == Random.BLOBS:
        # centers is D x D, and centers[i, j] = (i + j)
        centers = np.arange(D)
        centers = np.sum(np.meshgrid(centers, centers), axis=0)
        X_test, _ = make_blobs(n_samples=N, centers=centers)
        X_train = np.copy(X_test)
        if Ntrain > 0:
            X_train, _ = make_blobs(n_samples=Ntrain, centers=centers)
        Q, true_nn = make_blobs(n_samples=num_queries, centers=centers)

    # datasets that are just one block of a "real" dataset
    elif isinstance(which_dataset, str):
        X_test = load_file(which_dataset)
        X_test, Q = extract_random_rows(X_test, how_many=num_queries)
        X_train = np.copy(X_test)
        true_nn = _ground_truth_for_dataset(which_dataset)

    # "real" datasets with predefined train, test, queries, truth
    elif which_dataset in ALL_REAL_DATASETS:
        X_train, Q, X_test, true_nn = _load_complete_dataset(
            which_dataset, num_queries=num_queries)
    else:
        raise ValueError("unrecognized dataset {}".format(which_dataset))

    N = X_test.shape[0] if N < 1 else N
    D = X_test.shape[1] if D < 1 else D
    X_test, X_train = np.copy(X_test)[:N, :D], X_train[:N, :D]
    Q = Q[:, :D] if len(Q.shape) > 1 else Q[:D]

    train_is_test = X_train.base is X_test or X_test.base is X_train
    train_is_test = train_is_test or np.array_equal(X_train[:100],
                                                    X_test[:100])
    if train_is_test:
        print("WARNING: Training data is also the test data!")

    if norm_mean:
        means = np.mean(X_train, axis=0)
        X_train -= means
        X_test -= means
        Q -= means
    if norm_len:
        X_test /= np.linalg.norm(X_test, axis=1, keepdims=True)
        X_train /= np.linalg.norm(X_train, axis=1, keepdims=True)
        Q /= np.linalg.norm(Q, axis=-1, keepdims=True)

    # TODO: don't convert datasets that are originally uint8s to floats
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    Q = Q.astype(np.float32)

    if D_multiple_of > 1:
        X_train = ensure_num_cols_multiple_of(X_train, D_multiple_of)
        X_test = ensure_num_cols_multiple_of(X_test, D_multiple_of)
        Q = ensure_num_cols_multiple_of(Q, D_multiple_of)

    return X_train, Q, X_test, true_nn
""" import numpy as np from sklearn.utils.testing import assert_equal, assert_array_equal from sklearn.cluster.affinity_propagation_ import AffinityPropagation from sklearn.cluster.affinity_propagation_ import affinity_propagation from sklearn.datasets.samples_generator import make_blobs from sklearn.metrics import euclidean_distances n_clusters = 3 centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 X, _ = make_blobs(n_samples=60, n_features=2, centers=centers, cluster_std=0.4, shuffle=True, random_state=0) def test_affinity_propagation(): """Affinity Propagation algorithm """ # Compute similarities S = -euclidean_distances(X, squared=True) preference = np.median(S) * 10 # Compute Affinity Propagation cluster_centers_indices, labels = affinity_propagation( S, preference=preference) n_clusters_ = len(cluster_centers_indices)