def main():
    no_of_samples = 400
    data = []
    data.append(datasets.make_moons(n_samples=no_of_samples, noise=0.05)[0])
    data.append(datasets.make_circles(n_samples=no_of_samples, factor=0.5, noise=0.05)[0])

    # number of clusters we expect
    K = 2
    for X in data:
        # from dataset, create adjacency, degree, and Laplacian matrix
        adjacency = gaussianDistance(X, sigma=0.1)
        degree = degreeMatrix(adjacency)
        L = diag(degree) - adjacency

        # symmetrically normalize the Laplacian: L_sym = D^-1/2 L D^-1/2
        deg_05 = diag(degree ** -0.5)
        L = deg_05.dot(L).dot(deg_05)

        # use eig to obtain eigenvalues and eigenvectors
        eigenvalues, eigenvectors = linalg.eig(L)

        # Sort the eigenvalues ascending; the first K (near-)zero eigenvalues
        # correspond to the connected components
        idx = eigenvalues.argsort()
        eigenvalues.sort()
        evecs = eigenvectors[:, idx]
        eigenvectors = evecs[:, 0:K]
        print(eigenvalues[0:K])

        color_array = ['b', 'r', 'g', 'y']
        fig = pyplot.figure(figsize=(15, 5))
        fig.canvas.set_window_title('Difference between K-means and Spectral Clusterings')

        # First perform plain K-means on the original dataset and plot it out
        centroids, labels = scipy.cluster.vq.kmeans2(X, K)
        clustered = c_[X, labels]
        ax = fig.add_subplot(131)
        ax.set_title('K means clustering')
        for k in range(K):
            ax.scatter(clustered[clustered[:, 2] == k, 0],
                       clustered[clustered[:, 2] == k, 1],
                       c=color_array[k], marker='o')

        # Then we perform spectral clustering, i.e. K-means on the eigenvectors
        centroids, labels = scipy.cluster.vq.kmeans2(eigenvectors, K)
        clustered = c_[X, labels]
        ax = fig.add_subplot(132)
        ax.set_title('Spectral clustering')
        for k in range(K):
            ax.scatter(clustered[clustered[:, 2] == k, 0],
                       clustered[clustered[:, 2] == k, 1],
                       c=color_array[k], marker='o')

        # Plot out the eigenvectors too
        clustered = c_[eigenvectors, labels]
        ax = fig.add_subplot(133)
        ax.set_title('K-eigenvectors')
        for k in range(K):
            ax.scatter(clustered[clustered[:, 2] == k, 0],
                       clustered[clustered[:, 2] == k, 1],
                       c=color_array[k], marker='o')
        pyplot.show()
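# The snippet above relies on two helpers, gaussianDistance and degreeMatrix,
# that are not shown. A minimal sketch of what they would need to compute,
# assuming a dense Gaussian (RBF) affinity over all pairs of points -- the
# names and signatures are taken from the calls above, the bodies are an
# illustration only:
import numpy as np

def gaussianDistance(X, sigma=0.1):
    # pairwise affinities w_ij = exp(-||x_i - x_j||^2 / (2 * sigma^2))
    sq_dists = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
    return np.exp(-sq_dists / (2.0 * sigma ** 2))

def degreeMatrix(adjacency):
    # the degree of node i is the sum of its affinities; returned as a
    # vector, since the caller wraps it in diag(...)
    return adjacency.sum(axis=1)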
def train_data(self, num_data=2000, stddev=0.10):
    """Generate the moon/linear data."""
    if self.dtype == "moon":
        feat_vec, labels = datasets.make_moons(num_data, noise=stddev)
    elif self.dtype == "linear":
        feat_vec, labels = make_blobs(n_samples=num_data, n_features=2,
                                      centers=2, cluster_std=1.7)
    else:
        feat_vec, labels = datasets.make_moons(num_data, noise=stddev)

    ##
    ## we need to have these in numpy matrix format
    ##
    feats_vecs = np.matrix(feat_vec).astype(np.float32)
    labels = np.array(labels).astype(dtype=np.uint8)

    # Convert the int numpy array into a one-hot matrix.
    labels_onehot = (np.arange(self.num_classes) == labels[:, None]).astype(np.float32)

    ##
    ## create train and test set
    ##
    train_set_size = int(self.dsplit * num_data)
    self.feats_vecs = feats_vecs[:train_set_size, :]
    self.tfeats_vecs = feats_vecs[train_set_size:, :]
    self.labels_onehot = labels_onehot[:train_set_size]
    self.tlabels_onehot = labels_onehot[train_set_size:]

    # Return a pair of the feature matrix and the one-hot label matrix.
    return self.feats_vecs, self.labels_onehot
def plot_tree_progressive():
    fig, axes = plt.subplots(4, 2, figsize=(15, 25),
                             subplot_kw={'xticks': (), 'yticks': ()})
    X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

    for i, max_depth in enumerate([1, 2, 9]):
        tree = plot_tree(X, y, max_depth=max_depth, ax=axes[i + 1, 0])
        axes[i + 1, 1].imshow(tree_image(tree))
        axes[i + 1, 1].set_axis_off()
    axes[0, 1].set_visible(False)
    for ax in axes[:, 0]:
        ax.scatter(X[:, 0], X[:, 1], c=np.array(['r', 'b'])[y], s=60)


X, y = make_moons(noise=0.3, random_state=0)
def _download():
    train_x, train_t = make_moons(n_samples=10000, shuffle=True, noise=0.2,
                                  random_state=1234)
    test_x, test_t = make_moons(n_samples=10000, shuffle=True, noise=0.2,
                                random_state=1234)
    valid_x, valid_t = make_moons(n_samples=10000, shuffle=True, noise=0.2,
                                  random_state=1234)

    train_x += np.abs(train_x.min())
    test_x += np.abs(test_x.min())
    valid_x += np.abs(valid_x.min())

    train_set = (train_x, train_t)
    test_set = (test_x, test_t)
    valid_set = (valid_x, valid_t)
    return train_set, test_set, valid_set
def make_trans_moons(theta=40, nb=100, noise=.05):
    from math import cos, sin, pi
    X, y = make_moons(nb, noise=noise, random_state=1)
    Xt, yt = make_moons(nb, noise=noise, random_state=2)

    # center both domains, then scale
    trans = -np.mean(X, axis=0)
    X = 2 * (X + trans)
    Xt = 2 * (Xt + trans)

    # rotate the target domain by theta degrees
    theta = -theta * pi / 180
    rotation = np.array([[cos(theta), sin(theta)],
                         [-sin(theta), cos(theta)]])
    Xt = np.dot(Xt, rotation.T)
    return X, y, Xt, yt
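# A quick way to eyeball the source/target shift this produces -- a usage
# sketch, not part of the original source:
import matplotlib.pyplot as plt
X, y, Xt, yt = make_trans_moons(theta=40, nb=100)
plt.scatter(X[:, 0], X[:, 1], c=y, marker='o', label='source')
plt.scatter(Xt[:, 0], Xt[:, 1], c=yt, marker='x', label='target (rotated)')
plt.legend()
plt.show()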
def generate_noisy_data():
    blobs, _ = datasets.make_blobs(n_samples=200,
                                   centers=[(-0.75, 2.25), (1.0, 2.0)],
                                   cluster_std=0.25)
    moons, _ = datasets.make_moons(n_samples=200, noise=0.05)
    noise = np.random.uniform(-1.0, 3.0, (50, 2))
    return np.vstack([blobs, moons, noise])
def test_make_moons():
    X, y = make_moons(3, shuffle=False)
    for x, label in zip(X, y):
        # the outer moon lies on a unit circle centered at (0, 0),
        # the inner moon on a unit circle centered at (1, 0.5)
        center = [0.0, 0.0] if label == 0 else [1.0, 0.5]
        dist_sqr = ((x - center) ** 2).sum()
        assert_almost_equal(dist_sqr, 1.0,
                            err_msg="Point is not on expected unit circle")
def main():
    X, y = datasets.make_moons(n_samples=200, shuffle=True, noise=0.1,
                               random_state=None)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    for i in range(8):
        clf = RandomForestClassifier(n_estimators=2 ** i)
        clf.fit(X, y)
        plot_surface(clf, X, y)
def plot_adaboost():
    X, y = make_moons(noise=0.3, random_state=0)

    # Create and fit an AdaBoosted decision tree
    est = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME.R",
                             n_estimators=200)

    sample_weight = np.empty(X.shape[0], dtype=float)
    sample_weight[:] = 1. / X.shape[0]

    est._validate_estimator()
    est.estimators_ = []
    est.estimator_weights_ = np.zeros(4, dtype=float)
    est.estimator_errors_ = np.ones(4, dtype=float)

    plot_step = 0.02

    # Plot the decision boundaries
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)

    # RGB triples scaled to [0, 1] (a list comprehension so the result is a
    # reusable sequence under Python 3, where map() returns an iterator)
    c = lambda a, b, c: [x / 254.0 for x in (a, b, c)]
    colors = [c(215, 25, 28),
              c(253, 174, 97),
              c(255, 255, 191),
              c(171, 217, 233),
              c(44, 123, 182)]

    for i, ax in enumerate(axes):
        sample_weight, estimator_weight, estimator_error = \
            est._boost(i, X, y, sample_weight)
        est.estimator_weights_[i] = estimator_weight
        est.estimator_errors_[i] = estimator_error
        sample_weight /= np.sum(sample_weight)

        Z = est.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z,
                    cmap=matplotlib.colors.ListedColormap([colors[1], colors[-2]]),
                    alpha=1.0)
        ax.axis("tight")

        # Plot the training points
        ax.scatter(X[:, 0], X[:, 1],
                   c=np.array([colors[0], colors[-1]])[y],
                   s=20 + (200 * sample_weight) ** 2,
                   cmap=plt.cm.Paired)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xlabel('$x_0$')
        if i == 0:
            ax.set_ylabel('$x_1$')

    plt.tight_layout()
    plt.show()
def test_run():
    data, label = make_moons(n_samples=NSAMPLES, noise=0.4)
    scores, confusions, predictions, test_proba = \
        poly(data, label, n_folds=2, verbose=1,
             feature_selection=False, save=False, project_name='test1')

    data, label = make_classification(n_samples=NSAMPLES, n_features=20,
                                      n_informative=5, n_redundant=2,
                                      n_repeated=0, n_classes=3,
                                      n_clusters_per_class=2, weights=None,
                                      flip_y=0.01, class_sep=1.0,
                                      hypercube=True, shift=0.0, scale=1.0,
                                      shuffle=True, random_state=None)
    scores, confusions, predictions, test_proba = \
        poly(data, label, n_folds=3, verbose=1,
             feature_selection=False, save=False, project_name='test2')
    scores, confusions, predictions, test_proba = \
        poly(data, label, n_folds=3, verbose=1,
             exclude=['Multilayer Perceptron'],
             feature_selection=True, project_name='test3')
    scores, confusions, predictions, test_proba = \
        poly(data, label, n_folds=3, verbose=1,
             exclude=['Multilayer Perceptron', 'Voting'],
             feature_selection=True, project_name='test3')
    plot(scores)
def single_run(te):
    print(te)
    data, label = make_moons(n_samples=1000, noise=0.05, shuffle=True,
                             random_state=int(time.time()))
    data, validation_data, label, validation_label = \
        train_test_split(data, label, train_size=.30)

    # separate the data set into buckets
    total_data = list(group_list(data, 1))
    total_label = list(group_list(label, 1))

    # the two separate site sets
    for s in range(10, 150, 10):
        print(s)
        nets = []
        nn_groups_data = []
        nn_groups_label = []
        number_of_nets = s
        for x in range(number_of_nets):
            nets.append(nnDif.nn_build(1, [2, 6, 6, 1], eta=eta, nonlin=nonlin))
        iters = 20000
        # give each net an equal, aligned slice of the data and the labels
        for j in range(number_of_nets):
            x = total_data[int(float(j) / number_of_nets * len(total_data)):
                           int(float(j + 1) / number_of_nets * len(total_data))]
            nn_groups_data.append(x)
            nn_groups_label.append(
                total_label[int(float(j) / number_of_nets * len(total_label)):
                            int(float(j + 1) / number_of_nets * len(total_label))])
        start = time.time()
        visitbatches(nets, nn_groups_data, nn_groups_label, [], it=iters)
        print(time.time() - start)
        one = accuracy(nets[0], validation_data, validation_label, thr=0.5)
        nn1Acc[te][s // 10] += one
def generate_biclass_data(data_type, random_state):
    """Generate biclass data to classify.

    arg : data_type (str) possible type of data
          choose any in ["lin_sep", "moons", "circles", "overlap"]
          'lin_sep'  : bi-class, linearly separable data
          'moons'    : bi-class, non linearly separable data
          'circles'  : bi-class, non linearly separable data
          'overlap'  : bi-class, non linearly separable data with class overlap
          random_state (int) seed for numpy.random
    """
    # Set seed for reproducible results
    np.random.seed(random_state)

    # Case 1 : linearly separable data
    if data_type == "lin_sep":
        mean1 = np.array([0, 2])
        mean2 = np.array([2, 0])
        cov = np.array([[0.8, 0.6], [0.6, 0.8]])
        X1 = np.random.multivariate_normal(mean1, cov, 100)
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 100)
        y2 = np.ones(len(X2)) * -1
        X = np.vstack((X1, X2))
        y = np.hstack((y1, y2))

    # Case 2 : non-linearly separable data
    elif data_type == "moons":
        X, y = make_moons(n_samples=200, noise=0.2)
    elif data_type == "circles":
        X, y = make_circles(n_samples=200, noise=0.2, factor=0.5)

    # Case 3 : data with overlap between classes
    elif data_type == "overlap":
        mean1 = np.array([0, 2])
        mean2 = np.array([2, 0])
        cov = np.array([[1.5, 1.0], [1.0, 1.5]])
        X1 = np.random.multivariate_normal(mean1, cov, 100)
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 100)
        y2 = np.ones(len(X2)) * -1
        X = np.vstack((X1, X2))
        y = np.hstack((y1, y2))

    assert X.shape[0] == y.shape[0]

    # Format target to: -1 / +1
    targets = set(y.tolist())
    t1 = min(targets)
    t2 = max(targets)
    l1 = np.where(y < t2)
    l2 = np.where(y > t1)
    y[l1] = -1
    y[l2] = 1
    return X, y
def loadDatasets(linearly_separable):
    datasets = [
        make_moons(noise=0.3, random_state=0),
        make_circles(noise=0.2, factor=0.5, random_state=1),
        linearly_separable,
    ]
    return datasets
def test_1():
    # load the sklearn toy data and plot it
    np.random.seed(0)
    X, y = datasets.make_moons(200, noise=0.20)
    print(X)
    mpp.scatter(X[:, 0], X[:, 1], s=40, c=y)
    # mpp.plot(X[:, 0], X[:, 1])
    mpp.show()
    return X, y
def make_datasets():
    """
    :return: a list of (X, y) toy datasets: moons, circles, linearly separable
    """
    return [make_moons(n_samples=200, noise=0.3, random_state=0),
            make_circles(n_samples=200, noise=0.2, factor=0.5, random_state=1),
            make_linearly_separable()]
def test():
    np.random.seed(0)
    train_x, train_y = datasets.make_moons(5000, noise=.20)
    train_y = np.eye(2)[train_y]  # one-hot encode the labels
    example_count = len(train_x)
    nn = TheanoNN(train_x.shape[1], 1000, train_y.shape[1],
                  np.float32(0.01), np.float32(0.01), train_x, train_y)
    nn.train()
def make_noisy_problem(n_samples_train=30, label_noise_rate=0.1,
                       input_noise=0.15, n_samples_test=3000, seed=0):
    rng = np.random.RandomState(seed)
    scaler = StandardScaler()
    X_train, y_train = make_moons(n_samples=n_samples_train, shuffle=True,
                                  noise=input_noise, random_state=rng)
    X_test, y_test = make_moons(n_samples=n_samples_test, shuffle=True,
                                noise=input_noise, random_state=rng)

    if label_noise_rate > 0:
        # flip a random subset of the training labels
        rnd_levels = rng.uniform(low=0., high=1., size=n_samples_train)
        noise_mask = rnd_levels <= label_noise_rate
        y_train[noise_mask] = rng.randint(low=0, high=2, size=noise_mask.sum())

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return (X_train, y_train), (X_test, y_test)
def build_datasets(n_samples=100):
    X, y = make_classification(n_samples=n_samples, n_features=2,
                               n_redundant=0, n_informative=2,
                               n_clusters_per_class=1)
    X += 2 * np.random.uniform(size=X.shape)
    linearly_separable = (X, y)
    names = ['moons', 'circles', 'linear', 'xor']
    datasets = [make_moons(n_samples=n_samples, noise=0.3),
                make_circles(n_samples=n_samples, noise=0.2, factor=0.5),
                linearly_separable,
                xor_scale_invariant(n_samples=n_samples)]
    return (names, datasets)
def setup_method(self, method):
    base = GradientBoostingClassifier(n_estimators=2)
    self.clf = CascadedBooster(base_clf=base)
    n_samples = 500
    np.random.seed(42)
    X, Y = make_moons(n_samples=n_samples, noise=.05)
    self.X = X
    self.Y = Y
    self.clf.fit(X, Y)
def get_dataset(dataset, n_samples):
    if dataset == "Noisy Circles":
        return datasets.make_circles(n_samples=n_samples, factor=0.5,
                                     noise=0.05)
    elif dataset == "Noisy Moons":
        return datasets.make_moons(n_samples=n_samples, noise=0.05)
    elif dataset == "Blobs":
        return datasets.make_blobs(n_samples=n_samples, random_state=8)
    elif dataset == "No Structure":
        return np.random.rand(n_samples, 2), None
def makeSimpleDatasets(n_samples=1500):
    # from sklearn example
    np.random.seed(0)
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                          noise=.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None
    return [noisy_circles, noisy_moons, blobs, no_structure]
def get_dataset(dataset, n_samples):
    # Generate the new data:
    if dataset == 'Noisy Circles':
        X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    elif dataset == 'Noisy Moons':
        X, y = datasets.make_moons(n_samples=n_samples, noise=.05)
    elif dataset == 'Blobs':
        X, y = datasets.make_blobs(n_samples=n_samples, random_state=8)
    else:
        X, y = np.random.rand(n_samples, 2), None
    return X, y
def main():
    # Load the dataset
    X, y = datasets.make_moons(n_samples=300, noise=0.08, shuffle=False)

    # Cluster the data using DBSCAN
    clf = DBSCAN(eps=0.17, min_samples=5)
    y_pred = clf.predict(X)

    # Project the data onto the 2 primary principal components
    p = Plot()
    p.plot_in_2d(X, y_pred, title="DBSCAN")
    p.plot_in_2d(X, y, title="Actual Clustering")
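# Note: the DBSCAN above exposes a predict() method, so it is presumably a
# from-scratch implementation; scikit-learn's DBSCAN has no predict() and is
# used via fit_predict() instead. An equivalent sketch with sklearn (label -1
# marks noise points):
from sklearn import datasets
from sklearn.cluster import DBSCAN as SkDBSCAN

X_demo, _ = datasets.make_moons(n_samples=300, noise=0.08, shuffle=False)
y_demo = SkDBSCAN(eps=0.17, min_samples=5).fit_predict(X_demo)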
def plot_tree_progressive():
    fig, axes = plt.subplots(4, 2, figsize=(15, 25),
                             subplot_kw={'xticks': (), 'yticks': ()})
    X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

    for i, max_depth in enumerate([1, 2, 9]):
        tree = plot_tree(X, y, max_depth=max_depth, ax=axes[i + 1, 0])
        axes[i + 1, 1].imshow(tree_image(tree))
        axes[i + 1, 1].set_axis_off()
    axes[0, 1].set_visible(False)
    for ax in axes[:, 0]:
        discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
        ax.legend(loc="best")
def generate_moon_sample():
    # X holds the 2-D data points forming the two moon shapes;
    # y holds the label of each point
    X, y = datasets.make_moons(n_samples=5000, noise=.05)
    moon_dict = {}
    count = 0
    for x in X:
        x1 = float(x[0])
        x2 = float(x[1])
        moon_dict[str(count)] = (x1, x2)
        count = count + 1
    return moon_dict, X, y
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1)
def load_dataset(dname, num_samples):
    if 'circles' in dname.lower():
        noisy_circles = datasets.make_circles(n_samples=num_samples,
                                              factor=.5, noise=.05)
        return noisy_circles
    elif 'moons' in dname.lower():
        noisy_moons = datasets.make_moons(n_samples=num_samples, noise=.05)
        return noisy_moons
    elif 'blobs' in dname.lower():
        blobs = datasets.make_blobs(n_samples=num_samples, random_state=8)
        return blobs
    else:
        no_structure = np.random.rand(num_samples, 2), None
        return no_structure
def main():
    X, y = make_moons(n_samples=100, random_state=123)
    while True:
        print(options)
        opt = int(input('------>'))
        if opt == 1:
            show_moons(X, y)
            return
        elif opt == 2:
            scikit_std_pca(X, y)
            return
        elif opt == 3:
            kernel_pca_unfold(X, y)
            return
        else:
            print("Wrong choice\n")
            continue
def test_as_classifier():
    X, y = make_moons(n_samples=100, random_state=1)
    y = 2 * y - 1  # use -1/+1 labels

    clf = as_classifier(DecisionTreeRegressor())
    clf.fit(X, y)
    probas = clf.predict_proba(X)
    predictions = clf.predict(X)
    assert_array_equal(probas.shape, (len(X), 2))
    assert_array_equal(predictions, y)

    y[-1] = 2
    clf = as_classifier(DecisionTreeRegressor())
    assert_raises(ValueError, clf.fit, X, y)
def test():
    k = 2
    X, y_true = make_moons(n_samples=500, random_state=0, noise=0.01)
    Y = spectral_embedding.transform(X, k, n_neighbors=7, sigma=0.1)

    # normalize the rows of the embedding
    n = np.linalg.norm(Y, axis=1)
    n = n.reshape(-1, 1)
    Y = Y / n

    # Apply K-Means to cluster Y
    y_pred, _, _ = kmeans.kmeans(Y, k)

    fig = plt.figure()
    ax = fig.add_subplot(121)
    ax.scatter(np.arange(len(Y)), Y[:, 0])
    ax.set_title("Eigenvector 1")
    ax = fig.add_subplot(122)
    ax.scatter(np.arange(len(Y)), Y[:, 1])
    ax.set_title("Eigenvector 2")

    # Plot the data
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X[y_true == 0, 0], X[y_true == 0, 1], c='b', alpha=0.5,
               label="Class 1")
    ax.scatter(X[y_true == 1, 0], X[y_true == 1, 1], c='g', alpha=0.5,
               label="Class 2")
    ax.set_title("Original data")
    ax.legend()

    # Plot the predictions
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X[y_pred == 0, 0], X[y_pred == 0, 1], c='r', alpha=0.5,
               label="Class 1")
    ax.scatter(X[y_pred == 1, 0], X[y_pred == 1, 1], c='y', alpha=0.5,
               label="Class 2")
    ax.set_title("Result of clustering")
    ax.legend()

    # Plot the transformed data
    fig = plt.figure()
    ax = fig.add_subplot(111)
    idx_class0 = np.argwhere(y_true == 0)
    idx_class1 = np.argwhere(y_true == 1)
    ax.scatter(Y[idx_class0, 0], Y[idx_class0, 1], c='b', alpha=0.5,
               label="Class 1")
    ax.scatter(Y[idx_class1, 0], Y[idx_class1, 1], c='g', alpha=0.5,
               label="Class 2")
    ax.set_title("Original data after spectral embedding")
    ax.legend()

    print("Number in class 0: {}".format(np.sum(y_pred == 0)))
    print("Number in class 1: {}".format(np.sum(y_pred == 1)))
    plt.show()
#!/usr/bin python
# -*- encoding: utf-8 -*-
'''
@Author : Celeste Young
@File   : 生成数据3圆形半月.py
@Time   : 2021/2/15 21:31
@Tips   :
'''
from sklearn.datasets import make_circles
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt

fig = plt.figure(1)

x1, y1 = make_circles(n_samples=1000, factor=0.5, noise=0.1)
plt.subplot(121)
plt.title('make_circles function example')
plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1)

plt.subplot(122)
x1, y1 = make_moons(n_samples=1000, noise=0.1)
plt.title('make_moons function example')
plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1)
plt.show()
from sklearn.datasets import make_blobs, make_moons
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN

# X, y = make_blobs(n_samples=100, centers=3)
X, y = make_moons(n_samples=100, shuffle=True, noise=None, random_state=42)
plt.scatter(X[:, 0], X[:, 1])

model = DBSCAN(eps=0.5, min_samples=5).fit(X)
# model = KMeans(n_clusters=3).fit(X)
labels = model.labels_
print(labels)

plt.scatter(X[:, 0], X[:, 1], c=labels)
# DBSCAN has no cluster centers; only plot them when the model provides them
# (e.g. KMeans)
if hasattr(model, 'cluster_centers_'):
    centroids = model.cluster_centers_
    print(centroids)
    plt.plot(centroids[:, 0], centroids[:, 1], 'r')
import matplotlib.pyplot as plt
import mglearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)

forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)

fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title("Tree {}".format(i))
    mglearn.plots.plot_tree_partition(X, y, tree, ax=ax)

mglearn.plots.plot_2d_separator(forest, X, fill=True, ax=axes[-1, -1],
                                alpha=.4)
axes[-1, -1].set_title("Random Forest")
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.show()
def args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--noise', type=float, default=0.2)
    parser.add_argument('--random_state', type=int, default=0)
    parser.add_argument('--n_samples', type=int, default=1000)
    parser.add_argument('--save_dir', type=str, default='../dataset')
    parser.add_argument('--name', type=str, default='custom_two_moon')
    return parser.parse_args()


if __name__ == "__main__":
    flags = args()
    x, y = make_moons(noise=flags.noise, random_state=flags.random_state,
                      n_samples=flags.n_samples)
    tvx, test_x, tvy, test_y = train_test_split(x, y, test_size=0.2)
    train_x, val_x, train_y, val_y = train_test_split(tvx, tvy, test_size=0.25)
    print('train: ', len(train_y), 'val: ', len(val_y), 'test: ', len(test_y))

    zeros = x[np.where(y == 0), :][0]
    ones = x[np.where(y == 1), :][0]
    plt.scatter(zeros[:, 0], zeros[:, 1])
    plt.scatter(ones[:, 0], ones[:, 1])
    plt.show()

    if not os.path.isdir(os.path.join(flags.save_dir, flags.name)):
        os.makedirs(os.path.join(flags.save_dir, flags.name))
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.datasets import make_moons

(X, y) = make_moons(200, noise=0.2)
rbf_kernel_svm_clf = Pipeline([("scaler", StandardScaler()),
                               ("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001))])
rbf_kernel_svm_clf.fit(X, y)

import numpy as np
import matplotlib.pyplot as plt

xx, yy = np.meshgrid(np.arange(-2, 3, 0.01), np.arange(-1, 2, 0.01))
y_new = rbf_kernel_svm_clf.predict(np.c_[xx.ravel(), yy.ravel()])
plt.contourf(xx, yy, y_new.reshape(xx.shape), cmap="PuBu")
plt.scatter(X[:, 0], X[:, 1], marker="o", c=y)

"""
A large gamma makes the decision curve narrow and irregular, while a small
gamma makes it wide and smooth. So gamma acts like a regularization
hyperparameter: if the model overfits, reduce it; if it underfits, increase
it (similar to the C hyperparameter).
"""
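# To see the effect described above, one can refit the same pipeline over a
# small grid of gamma and C values -- a sketch reusing X, y, xx, yy from
# above (the grid values are illustrative, not from the original source):
for i, (gamma, C) in enumerate([(0.1, 1000), (5, 1000), (0.1, 0.001), (5, 0.001)]):
    clf = Pipeline([("scaler", StandardScaler()),
                    ("svm_clf", SVC(kernel="rbf", gamma=gamma, C=C))])
    clf.fit(X, y)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax = plt.subplot(2, 2, i + 1)
    ax.contourf(xx, yy, Z, cmap="PuBu")
    ax.scatter(X[:, 0], X[:, 1], marker="o", c=y)
    ax.set_title("gamma=%s, C=%s" % (gamma, C))
plt.tight_layout()
plt.show()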
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

if __name__ == '__main__':
    import sys, os
    # visualization module
    from datavyz import ge

    X, y = make_moons(n_samples=400, noise=0.30, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Single Decision Tree Classifier
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)

    # Bagging Classifier
    bag_clf = BaggingClassifier(
        DecisionTreeClassifier(),
        n_estimators=500,
        max_samples=0.5,
        bootstrap=True)
    bag_clf.fit(X_train, y_train)

    fig, AX = ge.figure(axes=(1, 2))
    for ax in AX:
        ge.scatter(
            X=[X_train[:, 0][y_train == 1], X_train[:, 0][y_train == 0]],
def generate_nonlinear_synthetic_data_classification3(n_samples, noise=0.1):
    return make_moons(n_samples=n_samples, noise=noise, random_state=100)
# Create a random forest from scratch, based on an exercise in chapter 6 of the excellent book [Hands on Machine Learning with Scikit-learn and Tensorflow](http://shop.oreilly.com/product/0636920052289.do).
#
# ## Train and fine tune a Decision Tree for the moons dataset
#
# > a. Generate a moons dataset using `make_moons(n_samples=10000, noise=0.4)`
#
# Reading the [documentation](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html) for the `make_moons` function tells us the following things:
#
# - It is for making play datasets for clustering and classification
# - The `n_samples` parameter controls the number of datapoints that it will return
# - `noise` is random noise added to the dataset
# - It is going to return two things: an array `X` containing the samples and an array `y` containing their class

from sklearn.datasets import make_moons

moons_X, moons_y = make_moons(n_samples=10000, noise=0.4)

# I like to take a look at the dataset before getting stuck in.
# I could try printing it out, or I can use the plotting functions in matplotlib.
# Plotting is nicer, let's try that.

get_ipython().magic('matplotlib inline')
from matplotlib import pyplot as plt

figure = plt.figure(figsize=(10, 10))
plt.scatter(x=moons_X[:, 0], y=moons_X[:, 1], c=moons_y, alpha=0.5)

# Okay, that makes sense.
# There are two classes there, which I'm going to try separating with the decision tree.
# There is a bit of overlap between the classes, which is going to make things more difficult for the classifier.
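# The exercise heading above asks to train and fine-tune a decision tree on
# this dataset; a minimal sketch of that step, assuming the usual train/test
# split and a small grid over max_leaf_nodes (the grid values are my choice,
# not the book's):
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(
    moons_X, moons_y, test_size=0.2, random_state=42)
grid = GridSearchCV(DecisionTreeClassifier(random_state=42),
                    param_grid={'max_leaf_nodes': [4, 8, 16, 32, 64]},
                    cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.score(X_test, y_test))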
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

np.random.seed(0)

# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)
import numpy as np
import tensorflow as tf
from sklearn.datasets import make_moons

# dataset
num_attributes = 2
num_classes = 2
examples, classes = make_moons(256)
examples += np.random.normal(scale=5e-2, size=np.shape(examples))

# hyperparameters
num_must = 12
num_cannot = 12
neighborhood_size = 64
auxiliary_weight = 1e-2
gamma = 1  # parameter for RBF
regularizer = tf.contrib.layers.l2_regularizer(1e-2)
batch_size = len(examples)  # examples
min_batch_must = 0  # pairs of examples
min_batch_cannot = 0  # pairs of examples
learning_rate = 1e-4
num_episodes = 4096

class model:
    input = tf.placeholder(tf.float32, shape=(None, num_attributes))
    layers = [input]
    layers.append(
        tf.layers.dense(inputs=layers[-1], units=16,
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from nn_utils2 import full_forward_propagation, train, get_accuracy_value
# https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795

# DATA-------------------------------------------------
N_SAMPLES = 1000
TEST_SIZE = 0.1
X, y = make_moons(n_samples=N_SAMPLES, noise=0.2, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE,
                                                    random_state=42)

# LINEAR REGRESSION-------------------------------------
# x_batch = np.linspace(0, 2, 2000)
# y_batch = 1.5 * x_batch + np.random.randn(*x_batch.shape) * 0.2 + 0.5
# x_batch.resize((2000, 1))
# y_batch.resize((2000, 1))
# X_train, X_test, y_train, y_test = train_test_split(x_batch, y_batch, test_size=0.1, random_state=42)

# HYPERPARAMETERS--------------------------------------
N_nn = 5
kernel_a = 0.001
alpha = 10
beta = 0
        s = [[i[0] * i[1][0], i[0] * i[1][1]] for i in zip(t, x)]
        # np.sum(x, 0) sums down the columns; np.sum(x, 1) sums across the rows
        gradient_w = np.sum(s, 0) / row * self.eta
        gradient_b = np.sum(t, 0) / row * self.eta
        self.w -= gradient_w
        self.b -= gradient_b
        ypts = (self.w[0] * xpts + self.b) / (-self.w[1])
        if itr % 100 == 0:
            plt.figure()
            for i in range(250):
                plt.plot(x[i, 0], x[i, 1], col[y[i]] + 'o')
            plt.ylim([-1.5, 1.5])
            plt.plot(xpts, ypts, 'g*', lw=2)
            plt.title('eta = %s, Iteration = %s\n' % (str(eta), str(itr)))
            plt.savefig('p_N%s_it%s' % (str(row), str(itr)), dpi=200,
                        bbox_inches='tight')
            # plt.plot(5.50113924e-01, -9.35132373e-01, 'b*', lw=3)
        itr += 1
    plt.show()


if __name__ == '__main__':
    import matplotlib.pyplot as plt
    x, y = make_moons(250, noise=0.25)
    col = {0: 'r', 1: 'b'}
    lr = LR()
    print(x)
    print(y)
    lr.logistic_regression(x, y, eta=1.2)
from sklearn.cluster import KMeans
from sklearn.datasets import make_moons
from matplotlib import pyplot as plt
from pandas import DataFrame
from matplotlib.colors import ListedColormap

# generate 2d classification dataset
X, y = make_moons(n_samples=1000, noise=0.1, random_state=42)

# scatter plot, dots colored by class value
df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
colors = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key,
               color=colors[key])

k = 2  # running kmeans clustering with two clusters
kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
# this will contain the labels for our predicted clusters (either 0 or 1)
labels = kmeans.labels_
# the centers of the calculated clusters
clusters = kmeans.cluster_centers_
# printing our cluster centers - there will be 2 of them
print(clusters)
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
feature_set, labels = datasets.make_moons(100, noise=0.10)
plt.figure(figsize=(10, 7))
plt.scatter(feature_set[:, 0], feature_set[:, 1], c=labels, cmap=plt.cm.winter)
labels = labels.reshape(100, 1)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_der(x):
    return sigmoid(x) * (1 - sigmoid(x))

wh = np.random.rand(len(feature_set[0]), 4)
wo = np.random.rand(4, 1)
lr = 0.5

for epoch in range(200000):
    # feedforward
    zh = np.dot(feature_set, wh)
    ah = sigmoid(zh)
    zo = np.dot(ah, wo)
    ao = sigmoid(zo)
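# The snippet above is cut off after the feedforward pass; a hedged sketch of
# the matching backprop step for this 2-layer sigmoid net, assuming a squared
# error loss (the original's loss is not shown), to be placed inside the
# training loop:
error_out = ao - labels                  # dL/dao for L = 0.5*(ao - y)^2
dzo = error_out * sigmoid_der(zo)        # delta at the output layer
dwo = ah.T.dot(dzo)                      # gradient w.r.t. wo
dzh = dzo.dot(wo.T) * sigmoid_der(zh)    # delta backpropagated to the hidden layer
dwh = feature_set.T.dot(dzh)             # gradient w.r.t. wh
wo -= lr * dwo
wh -= lr * dwh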
""" ML for Practical Hackers This is just a simple tutorial on SVM algorithm. """ import numpy as np import sklearn import matplotlib.pyplot as plt from utils import plot_decisions_boundary from sklearn.svm import SVC from sklearn.datasets import make_moons from sklearn.model_selection import GridSearchCV, train_test_split X, Y = make_moons(n_samples=600, noise=0.25) plt.scatter(X[:,0], X[:,1], c=Y) plt.show() X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2) #Setting parameters for hypothesis space search grid_linear = { 'C' : np.logspace(-2,2,10), 'kernel' : ['linear'] } #Creating GridSearchCV object for searching the optimal values #I separated the linear and kernel case for visualization purposes. clf_linear = GridSearchCV(SVC(), param_grid=grid_linear, cv=10) clf_linear.fit(X_train, y_train) linear_error = 100 * np.sum(clf_linear.best_estimator_.predict(X_test) != y_test) / len(y_test)
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
#                            random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
# X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)
    layer.append(layers.Dense(layer[-1], out_dim))
    return layer


def discriminator(X):
    layer = [layers.Dense(X, 32)]
    layer.append(layers.Activation(layer[-1], T.leaky_relu))
    layer.append(layers.Dense(layer[-1], 32))
    layer.append(layers.Activation(layer[-1], T.leaky_relu))
    layer.append(layers.Dense(layer[-1], 2))
    return layer


BS = 100
lr = 0.001

DATA, _ = datasets.make_moons(1000)

X = T.Placeholder([BS, 2], "float32")
Z = T.Placeholder([BS, 2], "float32")

G_sample = generator(Z, 2)
logits = discriminator(T.concatenate([G_sample[-1], X]))
labels = T.concatenate([T.zeros(BS, dtype="int32"), T.ones(BS, dtype="int32")])

disc_loss = losses.sparse_crossentropy_logits(labels, logits[-1]).mean()
gen_loss = losses.sparse_crossentropy_logits(1 - labels[:BS], logits[-1][:BS]).mean()

masks = T.concatenate([G_sample[1] > 0, G_sample[3] > 0], 1)
A = T.stack(
    [
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import time

# prepare the data
np.random.seed(0)
train_X, train_y = datasets.make_moons(300, noise=0.20)
train_X = train_X.astype(np.float32)
train_y = train_y.astype(np.int32)
num_example = len(train_X)

# network size parameters
nn_input_dim = 2    # number of input neurons (the moons data is 2-D)
nn_output_dim = 2   # number of output neurons
nn_hdim = 100       # number of hidden neurons

# gradient descent parameters
epsilon = 0.01      # learning rate
reg_lambda = 0.01   # regularization strength

# shared variables
w1 = theano.shared(np.random.randn(nn_input_dim, nn_hdim), name="W1")
b1 = theano.shared(np.zeros(nn_hdim), name="b1")
w2 = theano.shared(np.random.randn(nn_hdim, nn_output_dim), name="W2")
b2 = theano.shared(np.zeros(nn_output_dim), name="b2")

# feedforward
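# The snippet is cut off at the feedforward comment; a minimal sketch of the
# forward pass this setup implies (tanh hidden layer, softmax output with a
# regularized cross-entropy loss -- an assumption matching the usual layout
# of this kind of tutorial, not the original's missing code):
X = T.matrix('X')
y = T.lvector('y')
z1 = X.dot(w1) + b1
a1 = T.tanh(z1)
z2 = a1.dot(w2) + b2
y_hat = T.nnet.softmax(z2)
loss = T.nnet.categorical_crossentropy(y_hat, y).mean() \
    + reg_lambda / 2 * ((w1 ** 2).sum() + (w2 ** 2).sum())
prediction = T.argmax(y_hat, axis=1)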
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.model_selection import GridSearchCV

np.random.seed(2017)  # Set random seed so results are repeatable

n = [100, 500, 1000, 5000]  # number of training points

plt.figure(figsize=(8, 8))
for idx, size in enumerate(n):
    # generate a simple 2D dataset (keyword arguments, since shuffle and
    # noise are keyword-only in recent sklearn versions)
    X, y = datasets.make_moons(size, shuffle=True, noise=0.3)

    # perform a grid search on n_neighbors:
    # using the entire dataset, we set a 10-fold stratified cross-validation
    # to compute the test score, i.e. we divide the dataset into 10 folds
    # while preserving the percentage of samples for each class, train on 9
    # folds and test on the last one. We then select the value of k for which
    # this test score was the highest (quantitative approach).

    # define the range of values to try for k
    n_neighbors = np.arange(1, 21)
    param_grid = {"n_neighbors": n_neighbors}
    print('Performing grid search using the following parameters & ranges: \n',
          param_grid)

    # instantiate the GridSearch object & fit it on the dataset
    # (might take some time for higher values of n)
    grid_search = GridSearchCV(
    # print('lambdas')
    # print(lambdas)
    return mat_sq_dists, one_n, K, alphas, lambdas


from sklearn.datasets import make_moons
from sklearn.decomposition import PCA

cloud1 = np.loadtxt('cloud1.txt')
t = np.reshape(cloud1, (-1, 2))
print(t)
cloud2 = np.loadtxt('cloud2.txt')
z = np.reshape(cloud2, (-1, 2))
print(z)

X1, y = make_moons(n_samples=100, random_state=123)
X = np.vstack((t, z))
print(X)

mat_sq_dists, one_n, K, alphas, lambdas = stepwise_kpca(X, gamma=15,
                                                        n_components=100)
np.savetxt('kerneldata.txt', alphas[:, [0, 1]], fmt='%.5f')

n = alphas * lambdas
m = lambdas * alphas
c = n - m  # elementwise products commute, so this is identically zero
np.savetxt('eigvectorssorted.txt', alphas, fmt='%.5f')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets

# In[]:
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.show()

# In[]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(pd.value_counts(y_train, sort=True))

# In[]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
print(log_clf.score(X_test, y_test))

# In[]:
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(X_train, y_train)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from tqdm import tqdm

def sig(X):
    return 1.0 / (1.0 + np.exp(-X))

def der_sig(X):
    return np.multiply(sig(X), (1 - sig(X)))

reg_lambda = 0  # .001
epsilon = 0.0003
num_examples = 3000
n2, n3, n4 = 12, 10, 2

X, y = datasets.make_moons(num_examples, noise=0.1)
n1 = X.shape[1]

W1 = np.random.rand(n1, n2) / np.sqrt(n1)
W2 = np.random.rand(n2, n3) / np.sqrt(n2)
W3 = np.random.rand(n3, n4) / np.sqrt(n3)
b1 = np.random.rand(1, n2) * 0.
b2 = np.random.rand(1, n3) * 0.
b3 = np.random.rand(1, n4) * 0.

def calculate_loss():
    z1 = X.dot(W1) + b1
    a1 = sig(z1)
    z2 = a1.dot(W2) + b2
    a2 = sig(z2)
names = ["Decision Tree"] classifiers = [ DecisionTreeClassifier(max_depth=5), ] X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) rng = np.random.RandomState(2) X += 2 * rng.uniform(size=X.shape) linearly_separable = (X, y) datasets = [ make_moons(noise=0.3, random_state=0), make_circles(noise=0.2, factor=0.5, random_state=1), linearly_separable ] figure = plt.figure(figsize=(27, 9)) i = 1 # iterate over datasets for ds_cnt, ds in enumerate(datasets): # preprocess dataset, split into training and test part #X, y = ds X = usedData y = usedValue print(X, y) X = StandardScaler().fit_transform(X) X_train, X_test, y_train, y_test = \
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

X, y = datasets.make_moons(noise=0.25, random_state=666)

from sklearn.tree import DecisionTreeClassifier

# Hyperparameter 1: max_depth, the depth of the tree; smaller values are
# less prone to overfitting
dt_clf1 = DecisionTreeClassifier(max_depth=2)
dt_clf1.fit(X, y)

from Utils.PlotDecisionBoundary import plot_decision_boundary

plot_decision_boundary(dt_clf1, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.title("max_depth=2")
plt.show()

# Hyperparameter 2: min_samples_split, a stopping condition -- a node with
# fewer than min_samples_split samples is not split further; larger values
# are less prone to overfitting
dt_clf2 = DecisionTreeClassifier(min_samples_split=10)
dt_clf2.fit(X, y)

from Utils.PlotDecisionBoundary import plot_decision_boundary

plot_decision_boundary(dt_clf2, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.title("min_samples_split=10")
plt.show()

# Hyperparameter 3: min_samples_leaf, another stopping condition -- every
# leaf node must keep at least min_samples_leaf samples; larger values are
# less prone to overfitting
# coding: utf-8

# In[24]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Sklearn **make_moons dataset** at this [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html#sklearn.datasets.make_moons)

# In[25]:
X, y = datasets.make_moons(n_samples=10000, noise=0.4)

# In[26]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# In[27]:
from sklearn.ensemble import RandomForestClassifier

# In[28]:
from sklearn.ensemble import VotingClassifier

# In[29]:
from sklearn.linear_model import LogisticRegression

# In[30]:
from sklearn.datasets import make_moons
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

X, y = make_moons(n_samples=200, noise=0.05, random_state=0)

n_cluster = 10
kmeans = KMeans(n_clusters=n_cluster)
kmeans.fit(X)
y = kmeans.predict(X)

plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            marker='^', s=100, linewidth=2, edgecolors='k')
plt.scatter(X[:, 0], X[:, 1], c=y, marker='o', s=13)
plt.show()
def main(argv=None):
    from cleverhans_tutorials import check_installation
    check_installation(__file__)

    if not os.path.exists(CONFIG.SAVE_PATH):
        os.makedirs(CONFIG.SAVE_PATH)
    save_path_data = CONFIG.SAVE_PATH + 'data/'
    if not os.path.exists(save_path_data):
        os.makedirs(save_path_data)
    model_path = CONFIG.SAVE_PATH + '../all/' + CONFIG.DATASET + '/'
    if not os.path.exists(model_path):
        os.makedirs(model_path)
        os.makedirs(model_path + 'data/')

    nb_epochs = FLAGS.nb_epochs
    batch_size = FLAGS.batch_size
    learning_rate = FLAGS.learning_rate
    nb_filters = FLAGS.nb_filters
    len_x = int(CONFIG.NUM_TEST / 2)

    start = time.time()

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set seeds to improve reproducibility
    if CONFIG.DATASET == 'mnist' or CONFIG.DATASET == 'cifar10':
        tf.set_random_seed(1234)
        np.random.seed(1234)
        rd.seed(1234)
    elif CONFIG.DATASET == 'moon' or CONFIG.DATASET == 'dims':
        tf.set_random_seed(13)
        np.random.seed(1234)
        rd.seed(0)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=True)
    tf_config.gpu_options.per_process_gpu_memory_fraction = 0.2
    sess = tf.Session(config=tf_config)

    if CONFIG.DATASET == 'mnist':
        # Get MNIST data
        mnist = MNIST(train_start=0, train_end=CONFIG.NUM_TRAIN,
                      test_start=0, test_end=CONFIG.NUM_TEST)
        x_train, y_train = mnist.get_set('train')
        x_test, y_test = mnist.get_set('test')
    elif CONFIG.DATASET == 'cifar10':
        # Get CIFAR10 data
        data = CIFAR10(train_start=0, train_end=CONFIG.NUM_TRAIN,
                       test_start=0, test_end=CONFIG.NUM_TEST)
        dataset_size = data.x_train.shape[0]
        dataset_train = data.to_tensorflow()[0]
        dataset_train = dataset_train.map(
            lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
        dataset_train = dataset_train.batch(batch_size)
        dataset_train = dataset_train.prefetch(16)
        x_train, y_train = data.get_set('train')
        x_test, y_test = data.get_set('test')
    elif CONFIG.DATASET == 'moon':
        # Create a two moon example
        X, y = make_moons(n_samples=(CONFIG.NUM_TRAIN + CONFIG.NUM_TEST),
                          noise=0.2, random_state=0)
        X = StandardScaler().fit_transform(X)
        x_train1, x_test1, y_train1, y_test1 = train_test_split(
            X, y,
            test_size=(CONFIG.NUM_TEST / (CONFIG.NUM_TRAIN + CONFIG.NUM_TEST)),
            random_state=0)
        x_train, y_train, x_test, y_test = normalize_reshape_inputs_2d(
            model_path, x_train1, y_train1, x_test1, y_test1)
    elif CONFIG.DATASET == 'dims':
        X, y = make_moons(n_samples=(CONFIG.NUM_TRAIN + CONFIG.NUM_TEST),
                          noise=0.2, random_state=0)
        X = StandardScaler().fit_transform(X)
        x_train1, x_test1, y_train1, y_test1 = train_test_split(
            X, y,
            test_size=(CONFIG.NUM_TEST / (CONFIG.NUM_TRAIN + CONFIG.NUM_TEST)),
            random_state=0)
        x_train2, y_train, x_test2, y_test = normalize_reshape_inputs_2d(
            model_path, x_train1, y_train1, x_test1, y_test1)
        x_train, x_test = add_noise_and_QR(x_train2, x_test2, CONFIG.NUM_DIMS)
        np.save(os.path.join(save_path_data, 'x_test'), x_test)
        np.save(os.path.join(save_path_data, 'y_test'), y_test)

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train the model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': 1}
    rng = np.random.RandomState([2017, 8, 30])

    with open(CONFIG.SAVE_PATH + 'acc_param.txt', 'a') as fi:

        def do_eval(adv_x, preds, x_set, y_set, report_key):
            acc, pred_np, adv_x_np = model_eval(sess, x, y, preds, adv_x,
                                                nb_classes, x_set, y_set,
                                                args=eval_params)
            setattr(report, report_key, acc)
            if report_key:
                print('Accuracy on %s examples: %0.4f' % (report_key, acc),
                      file=fi)
            return pred_np, adv_x_np

        if CONFIG.DATASET == 'mnist':
            trained_model_path = model_path + 'data/trained_model'
            model = ModelBasicCNN('model1', nb_classes, nb_filters)
        elif CONFIG.DATASET == 'cifar10':
            trained_model_path = model_path + 'data/trained_model'
            model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                          input_shape=[32, 32, 3])
        elif CONFIG.DATASET == 'moon':
            trained_model_path = model_path + 'data/trained_model'
            model = ModelMLP('model1', nb_classes)
        elif CONFIG.DATASET == 'dims':
            trained_model_path = save_path_data + 'trained_model'
            model = ModelMLP_dyn('model1', nb_classes, CONFIG.NUM_DIMS)

        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=0.1)

        def evaluate():
            _, _ = do_eval(x, preds, x_test, y_test, 'test during train')

        if os.path.isfile(trained_model_path + '.index'):
            tf_model_load(sess, trained_model_path)
        else:
            if CONFIG.DATASET == 'mnist':
                train(sess, loss, x_train, y_train, evaluate=evaluate,
                      args=train_params, rng=rng, var_list=model.get_params())
            elif CONFIG.DATASET == 'cifar10':
                train(sess, loss, None, None, dataset_train=dataset_train,
                      dataset_size=dataset_size, evaluate=evaluate,
                      args=train_params, rng=rng, var_list=model.get_params())
            elif CONFIG.DATASET == 'moon':
                train_2d(sess, loss, x, y, x_train, y_train, save=False,
                         evaluate=evaluate, args=train_params, rng=rng,
                         var_list=model.get_params())
            elif CONFIG.DATASET == 'dims':
                train_2d(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
                         args=train_params, rng=rng,
                         var_list=model.get_params())
            saver = tf.train.Saver()
            saver.save(sess, trained_model_path)

        # Evaluate the accuracy on test examples
        if os.path.isfile(save_path_data + 'logits_zero_attacked.npy'):
            logits_0 = np.load(save_path_data + 'logits_zero_attacked.npy')
        else:
            _, _ = do_eval(x, preds, x_train, y_train, 'train')
            logits_0, _ = do_eval(x, preds, x_test, y_test, 'test')
            np.save(os.path.join(save_path_data, 'logits_zero_attacked'),
                    logits_0)

        if CONFIG.DATASET == 'moon':
            num_grid_points = 5000
            if os.path.isfile(model_path + 'data/images_mesh'
                              + str(num_grid_points) + '.npy'):
                x_mesh = np.load(model_path + 'data/images_mesh'
                                 + str(num_grid_points) + '.npy')
                logits_mesh = np.load(model_path + 'data/logits_mesh'
                                      + str(num_grid_points) + '.npy')
            else:
                xx, yy = np.meshgrid(np.linspace(0, 1, num_grid_points),
                                     np.linspace(0, 1, num_grid_points))
                x_mesh1 = np.stack([np.ravel(xx), np.ravel(yy)]).T
                y_mesh1 = np.ones((x_mesh1.shape[0]), dtype='int64')
                x_mesh, y_mesh, _, _ = normalize_reshape_inputs_2d(
                    model_path, x_mesh1, y_mesh1)
                logits_mesh, _ = do_eval(x, preds, x_mesh, y_mesh, 'mesh')
                x_mesh = np.squeeze(x_mesh)
                np.save(os.path.join(model_path,
                                     'data/images_mesh' + str(num_grid_points)),
                        x_mesh)
                np.save(os.path.join(model_path,
                                     'data/logits_mesh' + str(num_grid_points)),
                        logits_mesh)

        points_x = x_test[:len_x]
        points_y = y_test[:len_x]
        points_x_bar = x_test[len_x:]
        points_y_bar = y_test[len_x:]

        # Initialize the CW attack object and graph
        cw = CarliniWagnerL2(model, sess=sess)

        # first attack
        attack_params = {
            'learning_rate': CONFIG.CW_LEARNING_RATE,
            'max_iterations': CONFIG.CW_MAX_ITERATIONS
        }

        if CONFIG.DATASET == 'moon':
            out_a = compute_polytopes_a(x_mesh, logits_mesh, model_path)
            attack_params['const_a_min'] = out_a
            attack_params['const_a_max'] = 100

        adv_x = cw.generate(x, **attack_params)

        if os.path.isfile(save_path_data + 'images_once_attacked.npy'):
            adv_img_1 = np.load(save_path_data + 'images_once_attacked.npy')
            logits_1 = np.load(save_path_data + 'logits_once_attacked.npy')
        else:
            # Evaluate the accuracy on adversarial examples
            preds_adv = model.get_logits(adv_x)
            logits_1, adv_img_1 = do_eval(adv_x, preds_adv, points_x_bar,
                                          points_y_bar, 'test once attacked')
            np.save(os.path.join(save_path_data, 'images_once_attacked'),
                    adv_img_1)
            np.save(os.path.join(save_path_data, 'logits_once_attacked'),
                    logits_1)

        # counter attack
        attack_params['max_iterations'] = 1024

        if CONFIG.DATASET == 'moon':
            out_alpha2 = compute_epsilons_balls_alpha(x_mesh,
                                                      np.squeeze(x_test),
                                                      np.squeeze(adv_img_1),
                                                      model_path,
                                                      CONFIG.SAVE_PATH)
            attack_params['learning_rate'] = out_alpha2
            attack_params['const_a_min'] = -1
            attack_params['max_iterations'] = 2048
            plot_data(np.squeeze(adv_img_1), logits_1,
                      CONFIG.SAVE_PATH + 'data_pred1.png', x_mesh, logits_mesh)

        adv_adv_x = cw.generate(x, **attack_params)

        x_k = np.concatenate((points_x, adv_img_1), axis=0)
        y_k = np.concatenate((points_y, logits_1), axis=0)

        if os.path.isfile(save_path_data + 'images_twice_attacked.npy'):
            adv_img_2 = np.load(save_path_data + 'images_twice_attacked.npy')
            logits_2 = np.load(save_path_data + 'logits_twice_attacked.npy')
        else:
            # Evaluate the accuracy on adversarial examples
            preds_adv_adv = model.get_logits(adv_adv_x)
            logits_2, adv_img_2 = do_eval(adv_adv_x, preds_adv_adv, x_k, y_k,
                                          'test twice attacked')
            np.save(os.path.join(save_path_data, 'images_twice_attacked'),
                    adv_img_2)
            np.save(os.path.join(save_path_data, 'logits_twice_attacked'),
                    logits_2)

        if CONFIG.DATASET == 'moon':
            plot_data(np.squeeze(adv_img_2[:len_x]), logits_2[:len_x],
                      CONFIG.SAVE_PATH + 'data_pred2.png', x_mesh, logits_mesh)
            plot_data(np.squeeze(adv_img_2[len_x:]), logits_2[len_x:],
                      CONFIG.SAVE_PATH + 'data_pred12.png', x_mesh, logits_mesh)
            test_balls(np.squeeze(x_k), np.squeeze(adv_img_2), logits_0,
                       logits_1, logits_2, CONFIG.SAVE_PATH)

        compute_returnees(logits_0[len_x:], logits_1, logits_2[len_x:],
                          logits_0[:len_x], logits_2[:len_x], CONFIG.SAVE_PATH)

        if x_test.shape[-1] > 1:
            num_axis = (1, 2, 3)
        else:
            num_axis = (1, 2)

        D_p = np.squeeze(np.sqrt(np.sum(np.square(points_x - adv_img_2[:len_x]),
                                        axis=num_axis)))
        D_p_p = np.squeeze(np.sqrt(np.sum(np.square(adv_img_1
                                                    - adv_img_2[len_x:]),
                                          axis=num_axis)))
        D_p_mod, D_p_p_mod = modify_D(D_p, D_p_p, logits_0[len_x:], logits_1,
                                      logits_2[len_x:], logits_0[:len_x],
                                      logits_2[:len_x])

        if D_p_mod != [] and D_p_p_mod != []:
            plot_violins(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)
            threshold_evaluation(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)
            _ = compute_auroc(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)

        plot_results_models(len_x, CONFIG.DATASET, CONFIG.SAVE_PATH)

    print('Time needed:', time.time() - start)
    return report
    return centers, labels


centers, labels = find_clusters(X, 4)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis');

center, labels = find_clusters(X, 4, rseed=0)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')

labels = KMeans(6, random_state=0).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis');

# DBSCAN, mean-shift, affinity propagation...
from sklearn.datasets import make_moons
X, y = make_moons(200, noise=.05, random_state=0)

labels = KMeans(2, random_state=0).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')

from sklearn.cluster import SpectralClustering
model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                           assign_labels='kmeans')
labels = model.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')

from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape
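# Only the tail of find_clusters survives above; a sketch of the usual
# expectation-maximization loop its call sites imply (a reconstruction under
# that assumption, not the original body):
import numpy as np
from sklearn.metrics import pairwise_distances_argmin

def find_clusters(X, n_clusters, rseed=2):
    # 1. randomly choose initial centers
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]
    while True:
        # 2a. assign labels based on the closest center
        labels = pairwise_distances_argmin(X, centers)
        # 2b. find new centers as the means of the assigned points
        new_centers = np.array([X[labels == j].mean(0)
                                for j in range(n_clusters)])
        # 2c. stop once the centers no longer move
        if np.all(centers == new_centers):
            break
        centers = new_centers
    return centers, labels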
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn import datasets
import numpy as np

x, y = datasets.make_moons(n_samples=2000, noise=0.05)
x1 = x[:, 0]
x2 = x[:, 1]

plt.title("This is the dataset we want to classify with DBSCAN:\n")
plt.scatter(x1, x2, s=5, color='purple')
plt.show()

dbscan = DBSCAN(eps=0.1)
dbscan.fit(x)
y_pred = dbscan.labels_.astype(int)  # np.int is deprecated; use the builtin

colors = np.array(['#ff0345', '#70ff09'])
plt.title("These are the clusters with DBSCAN:\n")
plt.scatter(x1, x2, s=5, color=colors[y_pred])
plt.show()

kmeans = KMeans(n_clusters=2)
kmeans.fit(x)
y_pred = kmeans.labels_.astype(int)
colors = np.array(['#ff0345', '#70ff09'])
def main():
    # moons_X: data, moon_y: labels
    moons_X, moon_y = make_moons(n_samples=2000)
    addNoise(moons_X, moon_y)