from numpy import diag, linalg, c_
from matplotlib import pyplot
from sklearn import datasets
import scipy.cluster.vq

def main():
	no_of_samples = 400
	
	data = []
	data.append( datasets.make_moons(n_samples=no_of_samples, noise=0.05)[0] )
	data.append( datasets.make_circles(n_samples=no_of_samples, factor=0.5, noise=0.05)[0] )
	
	# number of clusters we expect
	K = 2

	for X in data:	
		# from dataset, create adjacency, degree, and laplacian matrix
		adjacency 	= gaussianDistance( X, sigma=0.1 )
		degree 		= degreeMatrix( adjacency )
		L 			= diag(degree) - adjacency

		# perform whitening on the Laplacian matrix
		deg_05 	= diag( degree  ** -0.5 )
		L 		= deg_05.dot( L ).dot( deg_05 )

		# use eig to obtain eigenvalues and eigenvectors
		eigenvalues, eigenvectors = linalg.eig( L )

		# Sort the eigenvalues ascending, the first K zero eigenvalues represent the connected components
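		# (for a graph with c connected components, the Laplacian has eigenvalue 0
		# with multiplicity c, so the K smallest eigenvectors expose the clusters)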
		idx = eigenvalues.argsort()
		eigenvalues.sort()
		evecs = eigenvectors[:, idx]
		eigenvectors = evecs[:, 0:K]
		print( eigenvalues[0:K] )

		color_array = ['b', 'r', 'g', 'y']

		fig = pyplot.figure( figsize=(15, 5) )
		fig.canvas.set_window_title( 'Difference between K-means and Spectral Clusterings' )

		# First perform the normal K-means on the original dataset and plot it out
		centroids, labels = scipy.cluster.vq.kmeans2( X, K )
		clustered = c_[X, labels]
		ax = fig.add_subplot( 131 )
		ax.set_title('K means clustering')
		for k in range( 0, K ):
			ax.scatter( clustered[clustered[:, 2]==k, 0], clustered[clustered[:, 2]==k, 1], c=color_array[k], marker='o')

		# Then we perform spectral clustering, i.e. K-means on eigenvectors
		centroids, labels = scipy.cluster.vq.kmeans2( eigenvectors, K )
		clustered = c_[X, labels]
		ax = fig.add_subplot( 132 )
		ax.set_title('Spectral clustering')
		for k in range( 0, K ):
			ax.scatter( clustered[clustered[:, 2]==k, 0], clustered[clustered[:, 2]==k, 1], c=color_array[k], marker='o')

		# Plot out the eigenvectors too
		clustered = c_[eigenvectors, labels]
		ax = fig.add_subplot(133)
		ax.set_title('K-eigenvectors')
		for k in range( 0, K ):
			ax.scatter( clustered[clustered[:, 2]==k, 0], clustered[clustered[:, 2]==k, 1], c=color_array[k], marker='o')

		pyplot.show()
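The snippet above calls two helpers that are not shown. A minimal sketch of what
they presumably do, given the call sites (a Gaussian RBF similarity matrix and a
per-node degree vector; the names and signatures are assumptions):

import numpy as np

def gaussianDistance(X, sigma=0.1):
	# pairwise squared distances turned into an RBF similarity matrix
	sq = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
	return np.exp(-sq / (2.0 * sigma ** 2))

def degreeMatrix(adjacency):
	# degree of each node = row sum of the adjacency matrix
	return np.sum(adjacency, axis=1)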
Example #2
    def train_data(self, num_data=2000, stddev=0.10):
        """
        generate the moon/linear data
        """
        if self.dtype == "moon":
            feat_vec, labels = datasets.make_moons(num_data, noise=stddev)
        elif self.dtype == "linear":
            feat_vec, labels = make_blobs(n_samples=num_data, n_features=2, 
                                          centers=2, cluster_std=1.7)
        else:
            feat_vec, labels = datasets.make_moons(num_data, noise=stddev)

        ##
        ## we need these as float32 numpy arrays
        ##
        feats_vecs = np.asarray(feat_vec, dtype=np.float32)
        labels = np.array(labels).astype(dtype=np.uint8)

        # Convert the int numpy array into a one-hot matrix.
        labels_onehot = (np.arange(self.num_classes) == labels[:, None]).astype(np.float32)
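        # broadcasting trick: comparing a (num_classes,) range against an (N, 1)
        # column yields an (N, num_classes) boolean grid; e.g. labels=[0, 1] with
        # num_classes=2 becomes [[1, 0], [0, 1]] after the cast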

        ##
        ## create train and test set
        ##
        train_set_size = int(self.dsplit * num_data)

        self.feats_vecs = feats_vecs[:train_set_size,:]
        self.tfeats_vecs = feats_vecs[train_set_size:,:] 
        self.labels_onehot = labels_onehot[:train_set_size]
        self.tlabels_onehot = labels_onehot[train_set_size:]

        # Return a pair of the feature matrix and the one-hot label matrix.
        return self.feats_vecs, self.labels_onehot
def plot_tree_progressive():
    fig, axes = plt.subplots(4, 2, figsize=(15, 25), subplot_kw={'xticks': (), 'yticks': ()})
    X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

    for i, max_depth in enumerate([1, 2, 9]):
        tree = plot_tree(X, y, max_depth=max_depth, ax=axes[i + 1, 0])
        axes[i + 1, 1].imshow(tree_image(tree))
        axes[i + 1, 1].set_axis_off()
    axes[0, 1].set_visible(False)
    for ax in axes[:, 0]:
        ax.scatter(X[:, 0], X[:, 1], c=np.array(['r', 'b'])[y], s=60)
def _download():
    train_x, train_t = make_moons(n_samples=10000, shuffle=True, noise=0.2, random_state=1234)
    test_x, test_t = make_moons(n_samples=10000, shuffle=True, noise=0.2, random_state=1234)
    valid_x, valid_t = make_moons(n_samples=10000, shuffle=True, noise=0.2, random_state=1234)

    train_x += np.abs(train_x.min())
    test_x += np.abs(test_x.min())
    valid_x += np.abs(valid_x.min())

    train_set = (train_x, train_t)
    test_set = (test_x, test_t)
    valid_set = (valid_x, valid_t)

    return train_set, test_set, valid_set
def make_trans_moons(theta=40, nb=100, noise=.05):
    from math import cos, sin, pi
    
    X, y = make_moons(nb, noise=noise, random_state=1) 
    Xt, yt = make_moons(nb, noise=noise, random_state=2)
    
    trans = -np.mean(X, axis=0) 
    X  = 2*(X+trans)
    Xt = 2*(Xt+trans)
    
    theta = -theta*pi/180
    rotation = np.array( [  [cos(theta), sin(theta)], [-sin(theta), cos(theta)] ] )
    Xt = np.dot(Xt, rotation.T)
    
    return X, y, Xt, yt
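For instance, to draw a source set and a target set rotated by 40 degrees
(assuming matplotlib.pyplot is imported as plt):

X, y, Xt, yt = make_trans_moons(theta=40, nb=300)
plt.scatter(X[:, 0], X[:, 1], c=y, marker='o')     # source domain
plt.scatter(Xt[:, 0], Xt[:, 1], c=yt, marker='x')  # rotated target domain
plt.show()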
Example #6
def generate_noisy_data():
    blobs, _ = datasets.make_blobs(n_samples=200,
                                    centers=[(-0.75,2.25), (1.0, 2.0)],
                                    cluster_std=0.25)
    moons, _ = datasets.make_moons(n_samples=200, noise=0.05)
    noise = np.random.uniform(-1.0, 3.0, (50, 2))
    return np.vstack([blobs, moons, noise])
def test_make_moons():
    X, y = make_moons(3, shuffle=False)
    for x, label in zip(X, y):
        center = [0.0, 0.0] if label == 0 else [1.0, 0.5]
        dist_sqr = ((x - center) ** 2).sum()
        assert_almost_equal(dist_sqr, 1.0,
                            err_msg="Point is not on expected unit circle")
def main():
    X, y = datasets.make_moons(n_samples=200, shuffle=True, noise=0.1, random_state=None)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    for i in range(8):
        clf = RandomForestClassifier(n_estimators = 2**i)   
        clf.fit(X,y)
        plot_surface(clf, X, y)
Example #9
def plot_adaboost():
    X, y = make_moons(noise=0.3, random_state=0)

    # Create and fit an AdaBoosted decision tree
    est = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME.R",
                             n_estimators=200)

    sample_weight = np.empty(X.shape[0], dtype=np.float64)
    sample_weight[:] = 1. / X.shape[0]

    est._validate_estimator()
    est.estimators_ = []
    est.estimator_weights_ = np.zeros(4, dtype=np.float64)
    est.estimator_errors_ = np.ones(4, dtype=np.float64)

    plot_step = 0.02

    # Plot the decision boundaries
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)
    colors = ['#d7191c', '#fdae61', '#ffffbf', '#abd9e9', '#2c7bb6']
    c = lambda r, g, b: [v / 255.0 for v in (r, g, b)]  # keep channels in [0, 1]
    colors = [c(215, 25, 28),
              c(253, 174, 97),
              c(255, 255, 191),
              c(171, 217, 233),
              c(44, 123, 182),
              ]

    for i, ax in enumerate(axes):
        sample_weight, estimator_weight, estimator_error = est._boost(i, X, y, sample_weight)
        est.estimator_weights_[i] = estimator_weight
        est.estimator_errors_[i] = estimator_error
        sample_weight /= np.sum(sample_weight)

        Z = est.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z,
                    cmap=matplotlib.colors.ListedColormap([colors[1], colors[-2]]),
                    alpha=1.0)
        ax.axis("tight")

        # Plot the training points
        ax.scatter(X[:, 0], X[:, 1],
                   c=np.array([colors[0], colors[-1]])[y],
                   s=20 + (200 * sample_weight) ** 2, cmap=plt.cm.Paired)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xlabel('$x_0$')

        if i == 0:
            ax.set_ylabel('$x_1$')

    plt.tight_layout()
    plt.show()
Example #10
def test_run():
    data, label = make_moons(n_samples=NSAMPLES, noise=0.4)
    scores, confusions, predictions, test_proba = \
        poly(data, label, n_folds=2, verbose=1, feature_selection=False,
             save=False, project_name='test1')
    data, label = make_classification(n_samples=NSAMPLES, n_features=20,
                                      n_informative=5, n_redundant=2,
                                      n_repeated=0, n_classes=3,
                                      n_clusters_per_class=2, weights=None,
                                      flip_y=0.01, class_sep=1.0,
                                      hypercube=True, shift=0.0,
                                      scale=1.0, shuffle=True,
                                      random_state=None)
    scores, confusions, predictions, test_proba = \
        poly(data, label, n_folds=3, verbose=1, feature_selection=False,
             save=False, project_name='test2')

    scores, confusions, predictions, test_proba = \
        poly(data, label, n_folds=3, verbose=1,
             exclude=['Multilayer Perceptron'], feature_selection=True,
             project_name='test3')
    scores, confusions, predictions, test_proba = \
        poly(data, label, n_folds=3, verbose=1,
             exclude=['Multilayer Perceptron',
                      'Voting'],
             feature_selection=True,
             project_name='test3')
    plot(scores)
def single_run(te):
    print(te)
    data, label = make_moons(n_samples=1000, noise=0.05, shuffle=True,
                             random_state=int(time.time()))

    # separate the data set into buckets
    data, validation_data, label, validation_label = train_test_split(
        data, label, train_size=.30)

    total_data = list(group_list(data, 1))
    total_label = list(group_list(label, 1))

    # The two separate site sets
    for s in range(10, 150, 10):
        print(s)
        nets = []
        nn_groups_data = []
        nn_groups_label = []
        number_of_nets = s
        for x in range(number_of_nets):
            nets.append(nnDif.nn_build(1, [2, 6, 6, 1], eta=eta, nonlin=nonlin))
        iters = 20000
        for j in range(number_of_nets):
            # give each net its contiguous share of the data and labels
            lo = int(float(j) / number_of_nets * len(total_data))
            hi = int(float(j + 1) / number_of_nets * len(total_data))
            nn_groups_data.append(total_data[lo:hi])
            nn_groups_label.append(total_label[lo:hi])

        start = time.time()
        visitbatches(nets, nn_groups_data, nn_groups_label, [], it=iters)
        print(time.time() - start)
        one = accuracy(nets[0], validation_data, validation_label, thr=0.5)

        nn1Acc[te][s // 10] += one
Example #12
def generate_biclass_data(data_type, random_state):
    """ Generate biclass data to classify

    arg : data_type (str) possible type of data
            choose any in ["lin_sep", "moons", "circles", "overlap"]
            'lin_sep' : Bi-class, linearly separable data
            'moons', 'circles' : Bi-class, non-linearly separable data
            'overlap' : Bi-class, non-linearly separable data with class overlap

        random_state (int) seed for numpy.random
    """

    # Set seed for reproducible results
    np.random.seed(random_state)

    # Case 1 : linearly separable data
    if data_type == "lin_sep":
        mean1 = np.array([0, 2])
        mean2 = np.array([2, 0])
        cov = np.array([[0.8, 0.6], [0.6, 0.8]])
        X1 = np.random.multivariate_normal(mean1, cov, 100)
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 100)
        y2 = np.ones(len(X2)) * -1
        X = np.vstack((X1, X2))
        y = np.hstack((y1, y2))

    # Case 2 : non-linearly separable data
    elif data_type == "moons":
        X, y = make_moons(n_samples=200, noise=0.2)

    elif data_type == "circles":
        X, y = make_circles(n_samples=200, noise=0.2, factor=0.5)

    # Case 3 : data with overlap between classes
    elif data_type == "overlap":
        mean1 = np.array([0, 2])
        mean2 = np.array([2, 0])
        cov = np.array([[1.5, 1.0], [1.0, 1.5]])
        X1 = np.random.multivariate_normal(mean1, cov, 100)
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 100)
        y2 = np.ones(len(X2)) * -1
        X = np.vstack((X1, X2))
        y = np.hstack((y1, y2))

    assert(X.shape[0] == y.shape[0])

    # Format target to: -1 / +1
    targets = set(y.tolist())
    t1 = min(targets)
    t2 = max(targets)
    l1 = np.where(y < t2)
    l2 = np.where(y > t1)
    y[l1] = -1
    y[l2] = 1

    return X, y
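A typical call, e.g. for the overlapping case with a fixed seed:

X, y = generate_biclass_data("overlap", random_state=42)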
Example #13
def loadDatasets(linearly_separable):

    datasets = [\
                make_moons(noise=0.3, random_state=0), \
                make_circles(noise=0.2, factor=0.5, random_state=1), \
                linearly_separable \
               ]

    return datasets
Example #14
def test_1():
    # Load the dataset from sklearn and plot it
    np.random.seed(0)
    X, y = datasets.make_moons(200, noise=0.20)
    print(X)
    mpp.scatter(X[:,0], X[:,1], s=40, c=y)
    #mpp.plot(X[:,0], X[:,1])
    mpp.show()
    return X, y
Example #15
def make_datasets():
    """

    :return:
    """

    return [make_moons(n_samples=200, noise=0.3, random_state=0),
            make_circles(n_samples=200, noise=0.2, factor=0.5, random_state=1),
            make_linearly_separable()]
def test():
    np.random.seed(0)

    train_x, train_y = datasets.make_moons(5000, noise=.20)
    train_y = np.eye(2)[train_y]

    example_count = len(train_x)

    nn = TheanoNN(train_x.shape[1], 1000, train_y.shape[1], np.float32(0.01),
                  np.float32(0.01), train_x, train_y)
    nn.train()
def make_noisy_problem(n_samples_train=30, label_noise_rate=0.1, input_noise=0.15,
                       n_samples_test=3000, seed=0):
    rng = np.random.RandomState(seed)
    scaler = StandardScaler()

    X_train, y_train = make_moons(n_samples=n_samples_train, shuffle=True,
                                  noise=input_noise, random_state=rng)
    X_test, y_test = make_moons(n_samples=n_samples_test, shuffle=True,
                                noise=input_noise, random_state=rng)
    
    if label_noise_rate > 0:
        rnd_levels = rng.uniform(low=0., high=1., size=n_samples_train)
        noise_mask = rnd_levels <= label_noise_rate
        y_train[noise_mask] = rng.randint(low=0, high=2, size=noise_mask.sum())
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return (X_train, y_train), (X_test, y_test)
def build_datasets(n_samples=100):
    X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1)
    X += 2 * np.random.uniform(size=X.shape)
    linearly_separable = (X, y)

    names = ['moons', 'circles', 'linear', 'xor']
    datasets = [make_moons(n_samples=n_samples, noise=0.3),
                make_circles(n_samples=n_samples, noise=0.2, factor=0.5),
                linearly_separable,
                xor_scale_invariant(n_samples=n_samples)]
    return (names, datasets)
Example #19
    def setup_method(self, method):
        base = GradientBoostingClassifier(n_estimators=2)
        self.clf = CascadedBooster(base_clf=base)

        n_samples = 500
        np.random.seed(42)
        X, Y = make_moons(n_samples=n_samples, noise=.05)

        self.X = X
        self.Y = Y
        self.clf.fit(X, Y)
Example #20
def get_dataset(dataset, n_samples):
    if dataset == "Noisy Circles":
        return datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)

    elif dataset == "Noisy Moons":
        return datasets.make_moons(n_samples=n_samples, noise=0.05)

    elif dataset == "Blobs":
        return datasets.make_blobs(n_samples=n_samples, random_state=8)

    elif dataset == "No Structure":
        return np.random.rand(n_samples, 2), None
Example #21
def makeSimpleDatasets(n_samples=1500): # from sklearn example
    np.random.seed(0)
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                          noise=.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    return [noisy_circles, noisy_moons, blobs, no_structure]
Example #22
def get_dataset(dataset, n_samples):
    # Generate the new data:
    if dataset=='Noisy Circles':
        X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    elif dataset=='Noisy Moons':
        X, y = datasets.make_moons(n_samples=n_samples, noise=.05)
    elif dataset=='Blobs':
        X, y = datasets.make_blobs(n_samples=n_samples, random_state=8)
    else:
        X, y = np.random.rand(n_samples, 2), None

    return X, y
Example #23
def main():
    # Load the dataset
    X, y = datasets.make_moons(n_samples=300, noise=0.08, shuffle=False)

    # Cluster the data using DBSCAN
    clf = DBSCAN(eps=0.17, min_samples=5)
    y_pred = clf.predict(X)

    # Project the data onto the 2 primary principal components
    p = Plot()
    p.plot_in_2d(X, y_pred, title="DBSCAN")
    p.plot_in_2d(X, y, title="Actual Clustering")
def plot_tree_progressive():
    fig, axes = plt.subplots(4, 2, figsize=(15, 25), subplot_kw={'xticks': (), 'yticks': ()})
    X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

    for i, max_depth in enumerate([1, 2, 9]):
        tree = plot_tree(X, y, max_depth=max_depth, ax=axes[i + 1, 0])
        axes[i + 1, 1].imshow(tree_image(tree))
        axes[i + 1, 1].set_axis_off()
    axes[0, 1].set_visible(False)
    for ax in axes[:, 0]:
        discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
        ax.legend(loc="best")
Example #25
def generate_moon_sample():

    # X is the array of 2-D data points making up the two moon shapes
    # y is the array of labels for those points
    X, y = datasets.make_moons(n_samples=5000, noise=.05)
    moon_dict = {}
    for count, x in enumerate(X):
        moon_dict[str(count)] = (float(x[0]), float(x[1]))

    return moon_dict, X, y
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1)
def load_dataset( dname, num_samples ):
    if 'circles' in dname.lower():
        noisy_circles = datasets.make_circles(n_samples=num_samples, factor=.5, noise=.05)
        return noisy_circles
    elif 'moons' in dname.lower():
        noisy_moons = datasets.make_moons(n_samples=num_samples, noise=.05)
        return noisy_moons
    elif 'blobs' in dname.lower():
        blobs = datasets.make_blobs(n_samples=num_samples, random_state=8)
        return blobs
    else:
        no_structure = np.random.rand(num_samples, 2), None
        return no_structure
def main():
	X, y = make_moons(n_samples=100, random_state=123)
	while True:
		print(options)
		opt = int(input('------>'))
		if opt == 1:
			show_moons(X, y)
			return 
		elif opt == 2:
			scikit_std_pca(X, y)
			return 
		elif opt == 3:
			kernel_pca_unfold(X, y)
			return 
		else:
			print("Wrong choice\n")
Example #29
def test_as_classifier():
    X, y = make_moons(n_samples=100, random_state=1)
    y = 2 * y - 1  # use -1/+1 labels

    clf = as_classifier(DecisionTreeRegressor())
    clf.fit(X, y)
    probas = clf.predict_proba(X)
    predictions = clf.predict(X)

    assert_array_equal(probas.shape, (len(X), 2))
    assert_array_equal(predictions, y)

    y[-1] = 2
    clf = as_classifier(DecisionTreeRegressor())
    assert_raises(ValueError, clf.fit, X, y)
def test():
    k = 2
    X, y_true = make_moons(n_samples=500, random_state=0, noise=0.01)
    Y = spectral_embedding.transform(X, k, n_neighbors=7, sigma=0.1)
    n = np.linalg.norm(Y, axis=1)
    n = n.reshape(-1, 1)
    Y = Y / n
    # Apply K-Means to cluster Y
    y_pred, _, _ = kmeans.kmeans(Y, k)

    fig = plt.figure()
    ax = fig.add_subplot(121)
    ax.scatter(np.arange(len(Y)), Y[:, 0])
    ax.set_title("Eigenvector 1")
    ax = fig.add_subplot(122)
    ax.scatter(np.arange(len(Y)), Y[:, 1])
    ax.set_title("Eigenvector 2")

    # Plot the data
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X[y_true==0, 0], X[y_true==0, 1], c='b', alpha=0.5, label="Class 1")
    ax.scatter(X[y_true==1, 0], X[y_true==1, 1], c='g', alpha=0.5, label="Class 2")
    ax.set_title("Original data")
    ax.legend()

    # Plot the predictions
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X[y_pred==0, 0], X[y_pred==0, 1], c='r', alpha=0.5, label="Class 1")
    ax.scatter(X[y_pred==1, 0], X[y_pred==1, 1], c='y', alpha=0.5, label="Class 2")
    ax.set_title("Result of clustering")
    ax.legend()

    # Plot the transformed data
    fig = plt.figure()
    ax = fig.add_subplot(111)
    idx_class0 = np.argwhere(y_true==0)
    idx_class1 = np.argwhere(y_true==1)
    ax.scatter(Y[idx_class0, 0], Y[idx_class0, 1], c='b', alpha=0.5, label="Class 1")
    ax.scatter(Y[idx_class1, 0], Y[idx_class1, 1], c='g', alpha=0.5, label="Class 2")
    ax.set_title("Original data after spectral embedding")
    ax.legend()

    print("Number in class 0: {}".format(np.sum(y_pred==0)))
    print("Number in class 1: {}".format(np.sum(y_pred==1)))

    plt.show()
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author  :   Celeste Young
@File    :   生成数据3圆形半月.py    
@Time    :   2021/2/15 21:31  
@Tips    :   
'''

from sklearn.datasets import make_circles
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt

fig = plt.figure(1)
x1, y1 = make_circles(n_samples=1000, factor=0.5, noise=0.1)
plt.subplot(121)
plt.title('make_circles function example')
plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1)

plt.subplot(122)
x1, y1 = make_moons(n_samples=1000, noise=0.1)
plt.title('make_moons function example')
plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1)
plt.show()
Example #32
from sklearn.datasets import make_blobs, make_moons
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans, DBSCAN

# X, y = make_blobs(n_samples=100, centers=3)
X, y = make_moons(n_samples=100, shuffle=True, noise=None, random_state=42)
plt.scatter(X[:, 0], X[:, 1])
kmeans = DBSCAN(eps=0.5, min_samples=5).fit(X)
# kmeans = KMeans(n_clusters=3).fit(X)

labels = kmeans.labels_
# DBSCAN has no cluster_centers_; use the per-cluster means instead (-1 is noise)
centroids = np.array([X[labels == k].mean(axis=0) for k in set(labels) if k != -1])
print(labels)
print(centroids)
plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.plot(centroids[:, 0], centroids[:, 1], 'r^')
import matplotlib.pyplot as plt
import mglearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)

fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title("Tree {}".format(i))
    mglearn.plots.plot_tree_partition(X, y, tree, ax=ax)

mglearn.plots.plot_2d_separator(forest,
                                X,
                                fill=True,
                                ax=axes[-1, -1],
                                alpha=.4)
axes[-1, -1].set_title("Random Forest")
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

plt.show()
Example #34

def args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--noise', type=float, default=0.2)
    parser.add_argument('--random_state', type=int, default=0)
    parser.add_argument('--n_samples', type=int, default=1000)
    parser.add_argument('--save_dir', type=str, default='../dataset')
    parser.add_argument('--name', type=str, default='custom_two_moon')
    return parser.parse_args()


if __name__ == "__main__":
    flags = args()
    x, y = make_moons(noise=flags.noise,
                      random_state=flags.random_state,
                      n_samples=flags.n_samples)
    tvx, test_x, tvy, test_y = train_test_split(x, y, test_size=0.2)
    train_x, val_x, train_y, val_y = train_test_split(tvx, tvy, test_size=0.25)

    print('train: ', len(train_y), 'val: ', len(val_y), 'test: ', len(test_y))

    zeros = x[np.where(y == 0), :][0]
    ones = x[np.where(y == 1), :][0]
    plt.scatter(zeros[:, 0], zeros[:, 1])
    plt.scatter(ones[:, 0], ones[:, 1])
    plt.show()

    if os.path.isdir(os.path.join(flags.save_dir, flags.name)) is False:
        os.makedirs(os.path.join(flags.save_dir, flags.name))
Example #35
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.datasets import make_moons

(X, y) = make_moons(200, noise=0.2)

rbf_kernel_svm_clf = Pipeline([("scaler", StandardScaler()),
                               ("svm_clf", SVC(kernel="rbf",
                                               gamma=5,
                                               C=0.001))])
rbf_kernel_svm_clf.fit(X, y)

import numpy as np
import matplotlib.pyplot as plt
xx, yy = np.meshgrid(np.arange(-2, 3, 0.01), np.arange(-1, 2, 0.01))
y_new = rbf_kernel_svm_clf.predict(np.c_[xx.ravel(), yy.ravel()])
plt.contourf(xx, yy, y_new.reshape(xx.shape), cmap="PuBu")
plt.scatter(X[:, 0], X[:, 1], marker="o", c=y)
"""
决策边界范围很小,如果γγ比较大,会使得决策线变窄,变得不规则。相反,小的γγ使决策线变宽,边平滑。所以γγ就像一个正则化参数:如果你的模型过拟合,可以适当减少它,如果它欠拟合,可以增加它(类似于C超参数)。"""
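To see that effect directly, one can refit the pipeline with a small and a
large gamma and compare the two decision boundaries; a minimal sketch reusing
X, y, xx, and yy from above (C is fixed at 1000 here just to make the gamma
effect visible):

for i, gamma in enumerate((0.1, 5)):
    clf = Pipeline([("scaler", StandardScaler()),
                    ("svm_clf", SVC(kernel="rbf", gamma=gamma, C=1000))])
    clf.fit(X, y)
    z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.subplot(1, 2, i + 1)
    plt.contourf(xx, yy, z, cmap="PuBu")
    plt.scatter(X[:, 0], X[:, 1], marker="o", c=y)
    plt.title("gamma=%s" % gamma)
plt.show()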
Example #36
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

if __name__ == '__main__':

    import sys, os

    # visualization module
    from datavyz import ge

    X, y = make_moons(n_samples=400, noise=0.30, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Single Decision Tree Classifier
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)

    # Bagging Classifier
    bag_clf = BaggingClassifier(\
        DecisionTreeClassifier(),
        n_estimators=500, max_samples=0.5, bootstrap=True)
    bag_clf.fit(X_train, y_train)

    fig, AX = ge.figure(axes=(1, 2))
    for ax in AX:
        ge.scatter(
            X=[X_train[:, 0][y_train == 1], X_train[:, 0][y_train == 0]],
def generate_nonlinear_synthetic_data_classification3(n_samples, noise=0.1):
    return make_moons(n_samples=n_samples, noise=noise, random_state=100)
Example #38
# Create a random forest from scratch, based on an exercise in chapter 6 of the excellent book [Hands on Machine Learning with Scikit-learn and Tensorflow](http://shop.oreilly.com/product/0636920052289.do).
#
# ## Train and fine tune a Decision Tree for the moons dataset
#
# > a. Generate a moons dataset using `make_moons(n_samples=10000, noise=0.4)`
#
# Reading the [documentation](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html) for the `make_moons` function tells us the following things:
#
#  - It is for making toy datasets for clustering and classification
#  - The `n_samples` parameter suggested controls the number of datapoints that it will return
#  - `noise` is random noise added to the dataset
#  - It is going to return two things: an array `X` containing the samples and an array `y` containing their class

from sklearn.datasets import make_moons

moons_X, moons_y = make_moons(n_samples=10000, noise=0.4)

# I like to take a look at the dataset before getting stuck in.
# I could try printing it out, or I can use the plotting functions in matplotlib.
# Plotting is nicer, let's try that.

get_ipython().magic('matplotlib inline')
from matplotlib import pyplot as plt

figure = plt.figure(figsize=(10, 10))

plt.scatter(x=moons_X[:, 0], y=moons_X[:, 1], c=moons_y, alpha=0.5)

# Okay, that makes sense.
# There are two classes there, which I'm going to try separating with the decision tree.
# There is a bit of overlap between the classes, which is going to make things more difficult for the classifier.
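The exercise then asks to fine-tune the tree; a minimal sketch of that step
(grid-searching max_leaf_nodes on a train/test split is one reasonable choice,
not necessarily the book's exact solution):

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(
    moons_X, moons_y, test_size=0.2, random_state=42)
search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      {"max_leaf_nodes": list(range(2, 50))}, cv=3)
search.fit(X_train, y_train)
print(search.best_params_, search.score(X_test, y_test))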
Example #39
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

np.random.seed(0)

# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples,
                                      factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)
Example #40
import numpy as np
import tensorflow as tf
from sklearn.datasets import make_moons

# dataset
num_attributes = 2
num_classes = 2

examples, classes = make_moons(256)
examples += np.random.normal(scale=5e-2, size=np.shape(examples))

# hyperparameters
num_must = 12
num_cannot = 12
neighborhood_size = 64
auxiliary_weight = 1e-2
gamma = 1  # parameter for RBF
regularizer = tf.contrib.layers.l2_regularizer(1e-2)
batch_size = len(examples)  # examples
min_batch_must = 0  # pairs of examples
min_batch_cannot = 0  # pairs of examples
learning_rate = 1e-4
num_episodes = 4096


class model:
    input = tf.placeholder(tf.float32, shape=(None, num_attributes))
    layers = [input]
    layers.append(
        tf.layers.dense(inputs=layers[-1],
                        units=16,
Example #41
import numpy as np

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

from nn_utils2 import full_forward_propagation, train, get_accuracy_value

#https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795

#DATA-------------------------------------------------
N_SAMPLES = 1000
TEST_SIZE = 0.1
X, y = make_moons(n_samples=N_SAMPLES, noise=0.2, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=TEST_SIZE,
                                                    random_state=42)

#LINEAR  REGRESSION-------------------------------------
#x_batch = np.linspace(0, 2, 2000)
#y_batch = 1.5 * x_batch + np.random.randn(*x_batch.shape) * 0.2 + 0.5

#x_batch.resize((2000,1))
#y_batch.resize((2000,1))
#X_train, X_test, y_train, y_test = train_test_split(x_batch, y_batch, test_size=0.1, random_state=42)

#HYPERPARAMETERS--------------------------------------
N_nn = 5
kernel_a = 0.001
alpha = 10
beta = 0
            s = [[i[0] * i[1][0], i[0] * i[1][1]] for i in zip(t, x)]
            # np.sum(x, 0) sums down each column; np.sum(x, 1) sums across each row
            gradient_w = np.sum(s, 0) / row * self.eta
            gradient_b = np.sum(t, 0) / row * self.eta
            self.w -= gradient_w
            self.b -= gradient_b
            ypts = (self.w[0] * xpts + self.b) / (-self.w[1])
            if itr % 100 == 0:
                plt.figure()
                for i in range(250):
                    plt.plot(x[i, 0], x[i, 1], col[y[i]] + 'o')
                plt.ylim([-1.5, 1.5])
                plt.plot(xpts, ypts, 'g*', lw=2)
                plt.title('eta = %s, Iteration = %s\n' % (str(eta), str(itr)))
                plt.savefig('p_N%s_it%s' % (str(row), str(itr)),
                            dpi=200,
                            bbox_inches='tight')
                # plt.plot(5.50113924e-01, -9.35132373e-01, 'b*', lw=3)
            itr += 1
        plt.show()


if __name__ == '__main__':
    import matplotlib.pyplot as plt
    x, y = make_moons(250, noise=0.25)
    col = {0: 'r', 1: 'b'}
    lr = LR()
    print(x)
    print(y)
    lr.logistic_regression(x, y, eta=1.2)
Example #43
from sklearn.cluster import KMeans
from sklearn.datasets import make_moons
from matplotlib import pyplot as plt
from pandas import DataFrame
from matplotlib.colors import ListedColormap

# generate 2d classification dataset
X, y = make_moons(n_samples=1000, noise=0.1, random_state=42)
# scatter plot, dots colored by class value
df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
colors = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax,
               kind='scatter',
               x='x',
               y='y',
               label=key,
               color=colors[key])

k = 2
# run k-means clustering with two clusters
kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
# this will contain the labels for our predicted clusters (either 0 or 1)
labels = kmeans.labels_
# the centers of the calculated clusters
clusters = kmeans.cluster_centers_
# printing our cluster centers - there will be 2 of them.
print(clusters)
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
feature_set, labels = datasets.make_moons(100, noise=0.10)
plt.figure(figsize=(10, 7))
plt.scatter(feature_set[:, 0], feature_set[:, 1], c=labels, cmap=plt.cm.winter)

labels = labels.reshape(100, 1)


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_der(x):
    return sigmoid(x) * (1 - sigmoid(x))


wh = np.random.rand(len(feature_set[0]), 4)
wo = np.random.rand(4, 1)
lr = 0.5

for epoch in range(200000):
    # feedforward
    zh = np.dot(feature_set, wh)
    ah = sigmoid(zh)
    zo = np.dot(ah, wo)
    ao = sigmoid(zo)
Example #45
"""
ML for Practical Hackers

This is just a simple tutorial on SVM algorithm.
"""
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from utils import plot_decisions_boundary

from sklearn.svm import SVC
from sklearn.datasets import make_moons
from sklearn.model_selection import GridSearchCV, train_test_split

X, Y = make_moons(n_samples=600, noise=0.25)
plt.scatter(X[:,0], X[:,1], c=Y)
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

#Setting parameters for hypothesis space search
grid_linear = {
    'C'         : np.logspace(-2,2,10),
    'kernel'    : ['linear']

}
#Creating GridSearchCV object for searching the optimal values
#I separated the linear and kernel case for visualization purposes.
clf_linear = GridSearchCV(SVC(), param_grid=grid_linear, cv=10)
clf_linear.fit(X_train, y_train)
linear_error = 100 * np.sum(clf_linear.best_estimator_.predict(X_test) != y_test) / len(y_test)
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1, max_iter=1000),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis()]

    # X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
    #                         random_state=1, n_clusters_per_class=1)
    
    
    rng = np.random.RandomState(2)
    # X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)

    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                linearly_separable
                ]

    figure = plt.figure(figsize=(27, 9))
    i = 1
    # iterate over datasets
    for ds_cnt, ds in enumerate(datasets):
        # preprocess dataset, split into training and test part
        X, y = ds

        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=.4, random_state=42)
Example #47
    layer.append(layers.Dense(layer[-1], out_dim))
    return layer


def discriminator(X):
    layer = [layers.Dense(X, 32)]
    layer.append(layers.Activation(layer[-1], T.leaky_relu))
    layer.append(layers.Dense(layer[-1], 32))
    layer.append(layers.Activation(layer[-1], T.leaky_relu))
    layer.append(layers.Dense(layer[-1], 2))
    return layer


BS = 100
lr = 0.001
DATA, _ = datasets.make_moons(1000)

X = T.Placeholder([BS, 2], "float32")
Z = T.Placeholder([BS, 2], "float32")

G_sample = generator(Z, 2)
logits = discriminator(T.concatenate([G_sample[-1], X]))
labels = T.concatenate([T.zeros(BS, dtype="int32"), T.ones(BS, dtype="int32")])

disc_loss = losses.sparse_crossentropy_logits(labels, logits[-1]).mean()
gen_loss = losses.sparse_crossentropy_logits(1 - labels[:BS],
                                             logits[-1][:BS]).mean()
masks = T.concatenate([G_sample[1] > 0, G_sample[3] > 0], 1)

A = T.stack(
    [
Example #48
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import time
# define the data types

np.random.seed(0)
train_X, train_y = datasets.make_moons(300, noise=0.20)
train_X = train_X.astype(np.float32)
train_y = train_y.astype(np.int32)
num_example = len(train_X)

# set the parameters
nn_input_dim = 2  # number of input neurons (the make_moons data is 2-D)
nn_output_dim = 2  # number of output neurons
nn_hdim = 100
# gradient descent parameters
epsilon = 0.01  # learning rate
reg_lambda = 0.01  # regularization strength

# set up the shared variables

w1 = theano.shared(np.random.randn(nn_input_dim, nn_hdim), name="W1")
b1 = theano.shared(np.zeros(nn_hdim), name="b1")
w2 = theano.shared(np.random.randn(nn_hdim, nn_output_dim), name="W2")
b2 = theano.shared(np.zeros(nn_output_dim), name="b2")

# feedforward pass
Example #49
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.model_selection import GridSearchCV

np.random.seed(2017)  # Set random seed so results are repeatable

n = [100, 500, 1000, 5000]  # number of training points

plt.figure(figsize=(8, 8))

for idx, size in enumerate(n):

    # generate a simple 2D dataset
    X, y = datasets.make_moons(n_samples=size, shuffle=True, noise=0.3)

    # perform a grid search on n_neighbors :
    # using the entire dataset, we set a 10-fold stratified cross-validation to compute the test score :
    # i.e we divide the dataset into 10 folds while preserving the percentage of samples for each class, train on 9
    # folds and test on the last one. We then select the value of k for which this test score was the highest.
    # (quantitative approach).

    # define the range of values to try for k
    n_neighbors = np.arange(1, 21)
    param_grid = {"n_neighbors": n_neighbors}
    print('Performing grid search using the following parameters & ranges: \n',
          param_grid)

    # instantiate the GridSearch object & fit it on the dataset (might take some time for higher values of n)
    grid_search = GridSearchCV(
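The call is cut off in this excerpt; given the comments above, the completed
search would presumably look something like this (a sketch, assuming a
KNeighborsClassifier; GridSearchCV with cv=10 on a classifier uses stratified
folds by default):

    grid_search = GridSearchCV(neighbors.KNeighborsClassifier(),
                               param_grid=param_grid, cv=10)
    grid_search.fit(X, y)
    print('Best k for n=%d:' % size, grid_search.best_params_['n_neighbors'])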
    #print('lambdas')
    #print(lambdas)
    return mat_sq_dists,one_n,K,alphas, lambdas


from sklearn.datasets import make_moons
from sklearn.decomposition import PCA

cloud1=np.loadtxt('cloud1.txt')
t = np.reshape(cloud1,(-1,2))
print(t)

cloud2=np.loadtxt('cloud2.txt')
z=np.reshape(cloud2,(-1,2))
print(z)
X1, y = make_moons(n_samples=100, random_state=123)
X=np.vstack((t,z))
print(X)



mat_sq_dists,one_n,K,alphas, lambdas = stepwise_kpca(X, gamma=15, n_components=100)

np.savetxt('kerneldata.txt',alphas[:,[0,1]], fmt='%.5f')

n=alphas*lambdas
m=lambdas*alphas
c=n-m


np.savetxt('eigvectorssorted.txt',alphas,fmt='%.5f')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets

# In[]:
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.show()

# In[]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(pd.value_counts(y_train, sort=True))

# In[]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
print(log_clf.score(X_test, y_test))

# In[]:
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(X_train, y_train)
Example #52
import matplotlib.pyplot as plt
from tqdm import tqdm

def sig(X):
    return 1.0/(1.0+np.exp(-X))


def der_sig(X):
    return np.multiply(sig(X), (1-sig(X)))


reg_lambda = 0#.001
epsilon = 0.0003
num_examples = 3000
n2, n3, n4 = 12, 10, 2
X, y = datasets.make_moons(num_examples, noise=0.1)

n1 = X.shape[1]
W1 = np.random.rand(n1, n2) / np.sqrt(n1)
W2 = np.random.rand(n2, n3) / np.sqrt(n2)
W3 = np.random.rand(n3, n4) / np.sqrt(n3)
b1 = np.random.rand(1, n2) * 0.
b2 = np.random.rand(1, n3) * 0.
b3 = np.random.rand(1, n4) * 0.


def calculate_loss():
    z1 = X.dot(W1) + b1
    a1 = sig(z1)
    z2 = a1.dot(W2) + b2
    a2 = sig(z2)
names = ["Decision Tree"]
classifiers = [
    DecisionTreeClassifier(max_depth=5),
]

X, y = make_classification(n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1), linearly_separable
]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    #X, y = ds

    X = usedData
    y = usedValue
    print(X, y)
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
Example #54
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets

X, y = datasets.make_moons(noise=0.25, random_state=666)

from sklearn.tree import DecisionTreeClassifier
# Hyperparameter 1: max_depth, the depth of the tree; smaller values are less prone to overfitting
dt_clf1 = DecisionTreeClassifier(max_depth=2)
dt_clf1.fit(X, y)

from Utils.PlotDecisionBoundary import plot_decision_boundary
plot_decision_boundary(dt_clf1, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.title("max_depth=2")
plt.show()

# Hyperparameter 2: min_samples_split, a stopping condition: a node with fewer than min_samples_split samples is not split further; larger values are less prone to overfitting
dt_clf2 = DecisionTreeClassifier(min_samples_split=10)
dt_clf2.fit(X, y)

from Utils.PlotDecisionBoundary import plot_decision_boundary
plot_decision_boundary(dt_clf2, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.title("min_samples_split=10")
plt.show()

# Hyperparameter 3: min_samples_leaf, a stopping condition: each leaf must keep at least min_samples_leaf samples; larger values are less prone to overfitting
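The excerpt stops at the comment; following the pattern of the two blocks
above, the third block would presumably read (the value 6 is just an
illustrative choice):

dt_clf3 = DecisionTreeClassifier(min_samples_leaf=6)
dt_clf3.fit(X, y)

plot_decision_boundary(dt_clf3, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.title("min_samples_leaf=6")
plt.show()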
# coding: utf-8

# In[24]:

from sklearn import datasets
from sklearn.model_selection import train_test_split

# Sklearn **make_moons dataset** at this [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html#sklearn.datasets.make_moons)

# In[25]:

X, y = datasets.make_moons(n_samples=10000, noise=0.4)

# In[26]:

X_train, X_test, y_train, y_test = train_test_split(X, y)

# In[27]:

from sklearn.ensemble import RandomForestClassifier

# In[28]:

from sklearn.ensemble import VotingClassifier

# In[29]:

from sklearn.linear_model import LogisticRegression

# In[30]:
Example #56
from sklearn.datasets import make_moons
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
n_cluster = 10
kmeans = KMeans(n_clusters=n_cluster)
kmeans.fit(X)
y = kmeans.predict(X)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1],
            marker='^',s=100,linewidth=2,edgecolors='k')
plt.scatter(X[:,0],X[:,1],c=y,marker='o',s=13)
plt.show()
Example #57
def main(argv=None):
  
  from cleverhans_tutorials import check_installation
  check_installation(__file__)
  
  if not os.path.exists( CONFIG.SAVE_PATH ):
    os.makedirs( CONFIG.SAVE_PATH )
  save_path_data = CONFIG.SAVE_PATH + 'data/'
  if not os.path.exists( save_path_data ):
    os.makedirs( save_path_data )
  model_path = CONFIG.SAVE_PATH + '../all/' +  CONFIG.DATASET + '/'
  if not os.path.exists( model_path ):
    os.makedirs( model_path )
    os.makedirs( model_path + 'data/' )
  
  nb_epochs = FLAGS.nb_epochs
  batch_size = FLAGS.batch_size
  learning_rate = FLAGS.learning_rate
  nb_filters = FLAGS.nb_filters
  len_x = int(CONFIG.NUM_TEST/2)
  
  start = time.time()

  # Object used to keep track of (and return) key accuracies
  report = AccuracyReport()

  # Set seeds to improve reproducibility
  if CONFIG.DATASET == 'mnist' or CONFIG.DATASET == 'cifar10':
    tf.set_random_seed(1234)
    np.random.seed(1234)
    rd.seed(1234)
  elif CONFIG.DATASET == 'moon' or CONFIG.DATASET == 'dims':
    tf.set_random_seed(13)
    np.random.seed(1234)
    rd.seed(0)          
  
  # Set logging level to see debug information
  set_log_level(logging.DEBUG)

  # Create TF session
  tf_config = tf.ConfigProto(allow_soft_placement=True,log_device_placement=True)
  tf_config.gpu_options.per_process_gpu_memory_fraction = 0.2 
  sess = tf.Session(config=tf_config)   
  
  if CONFIG.DATASET == 'mnist':
    # Get MNIST data
    mnist = MNIST(train_start=0, train_end=CONFIG.NUM_TRAIN,
                  test_start=0, test_end=CONFIG.NUM_TEST)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')
  elif CONFIG.DATASET == 'cifar10':
    # Get CIFAR10 data
    data = CIFAR10(train_start=0, train_end=CONFIG.NUM_TRAIN,
                  test_start=0, test_end=CONFIG.NUM_TEST)
    dataset_size = data.x_train.shape[0]
    dataset_train = data.to_tensorflow()[0]
    dataset_train = dataset_train.map(
      lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
    dataset_train = dataset_train.batch(batch_size)
    dataset_train = dataset_train.prefetch(16)
    x_train, y_train = data.get_set('train')
    x_test, y_test = data.get_set('test')                             
  elif CONFIG.DATASET == 'moon':
    # Create a two moon example
    X, y = make_moons(n_samples=(CONFIG.NUM_TRAIN+CONFIG.NUM_TEST), noise=0.2,
                      random_state=0)
    X = StandardScaler().fit_transform(X)
    x_train1, x_test1, y_train1, y_test1 = train_test_split(X, y,
                                            test_size=(CONFIG.NUM_TEST/(CONFIG.NUM_TRAIN
                                            +CONFIG.NUM_TEST)), random_state=0)                          
    x_train, y_train, x_test, y_test = normalize_reshape_inputs_2d(model_path, x_train1,
                                                                   y_train1, x_test1,
                                                                   y_test1)
  elif CONFIG.DATASET == 'dims':
    X, y = make_moons(n_samples=(CONFIG.NUM_TRAIN+CONFIG.NUM_TEST), noise=0.2,
                      random_state=0)
    X = StandardScaler().fit_transform(X)
    x_train1, x_test1, y_train1, y_test1 = train_test_split(X, y,
                                            test_size=(CONFIG.NUM_TEST/(CONFIG.NUM_TRAIN
                                            +CONFIG.NUM_TEST)), random_state=0)                          
    x_train2, y_train, x_test2, y_test = normalize_reshape_inputs_2d(model_path, x_train1,
                                                                     y_train1,x_test1,
                                                                     y_test1)
    x_train, x_test = add_noise_and_QR(x_train2, x_test2, CONFIG.NUM_DIMS)

  np.save(os.path.join(save_path_data, 'x_test'), x_test)
  np.save(os.path.join(save_path_data, 'y_test'), y_test)

  # Use Image Parameters
  img_rows, img_cols, nchannels = x_train.shape[1:4]
  nb_classes = y_train.shape[1]

  # Define input TF placeholder
  x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                        nchannels))
  y = tf.placeholder(tf.float32, shape=(None, nb_classes))

  # Train a model
  train_params = {
      'nb_epochs': nb_epochs,
      'batch_size': batch_size,
      'learning_rate': learning_rate
  }
  eval_params = {'batch_size': 1}
  rng = np.random.RandomState([2017, 8, 30])
  
  with open(CONFIG.SAVE_PATH + 'acc_param.txt', 'a') as fi:

    def do_eval(adv_x, preds, x_set, y_set, report_key):
      acc, pred_np, adv_x_np = model_eval(sess, x, y, preds, adv_x, nb_classes, x_set,
                                          y_set, args=eval_params)
      setattr(report, report_key, acc)
      if report_key:
        print('Accuracy on %s examples: %0.4f' % (report_key, acc), file=fi)
      return pred_np, adv_x_np
    
    if CONFIG.DATASET == 'mnist':
      trained_model_path = model_path + 'data/trained_model'
      model = ModelBasicCNN('model1', nb_classes, nb_filters)
    elif CONFIG.DATASET == 'cifar10':
      trained_model_path = model_path + 'data/trained_model'
      model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                    input_shape=[32, 32, 3])
    elif CONFIG.DATASET == 'moon':
      trained_model_path = model_path + 'data/trained_model'
      model = ModelMLP('model1', nb_classes)
    elif CONFIG.DATASET == 'dims':
      trained_model_path = save_path_data + 'trained_model'
      model = ModelMLP_dyn('model1', nb_classes, CONFIG.NUM_DIMS)
      
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    
    def evaluate():
      _, _ = do_eval(x, preds, x_test, y_test, 'test during train')
    
    if os.path.isfile( trained_model_path + '.index' ):
      tf_model_load(sess, trained_model_path)
    else:
      if CONFIG.DATASET == 'mnist':
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())
      elif CONFIG.DATASET == 'cifar10':
        train(sess, loss, None, None,
              dataset_train=dataset_train, dataset_size=dataset_size,
              evaluate=evaluate, args=train_params, rng=rng,
              var_list=model.get_params())
      elif CONFIG.DATASET == 'moon':
        train_2d(sess, loss, x, y, x_train, y_train, save=False, evaluate=evaluate,
                args=train_params, rng=rng, var_list=model.get_params())
      elif CONFIG.DATASET == 'dims':
        train_2d(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
                args=train_params, rng=rng, var_list=model.get_params())
      saver = tf.train.Saver()
      saver.save(sess, trained_model_path)
    
    # Evaluate the accuracy on test examples
    if os.path.isfile( save_path_data + 'logits_zero_attacked.npy' ):
      logits_0 = np.load(save_path_data + 'logits_zero_attacked.npy')
    else:
      _, _ = do_eval(x, preds, x_train, y_train, 'train')
      logits_0, _ = do_eval(x, preds, x_test, y_test, 'test')
      np.save(os.path.join(save_path_data, 'logits_zero_attacked'), logits_0) 
    
    if CONFIG.DATASET == 'moon':
      num_grid_points = 5000
      if os.path.isfile( model_path + 'data/images_mesh' + str(num_grid_points) + '.npy' ):
        x_mesh = np.load(model_path + 'data/images_mesh' + str(num_grid_points) + '.npy')
        logits_mesh = np.load(model_path + 'data/logits_mesh' + str(num_grid_points) + '.npy')
      else:
        xx, yy = np.meshgrid(np.linspace(0, 1, num_grid_points), np.linspace(0, 1, num_grid_points)) 
        x_mesh1 = np.stack([np.ravel(xx), np.ravel(yy)]).T
        y_mesh1 = np.ones((x_mesh1.shape[0]),dtype='int64')
        x_mesh, y_mesh, _, _ = normalize_reshape_inputs_2d(model_path, x_mesh1, y_mesh1)
        logits_mesh, _ = do_eval(x, preds, x_mesh, y_mesh, 'mesh')
        x_mesh = np.squeeze(x_mesh)
        np.save(os.path.join(model_path, 'data/images_mesh'+str(num_grid_points)), x_mesh)
        np.save(os.path.join(model_path, 'data/logits_mesh'+str(num_grid_points)), logits_mesh)
        
    points_x = x_test[:len_x]
    points_y = y_test[:len_x]
    points_x_bar = x_test[len_x:]
    points_y_bar = y_test[len_x:] 
     
    # Initialize the CW attack object and graph
    cw = CarliniWagnerL2(model, sess=sess) 
    
    # first attack
    attack_params = {
        'learning_rate': CONFIG.CW_LEARNING_RATE,
        'max_iterations': CONFIG.CW_MAX_ITERATIONS
      }
    
    if CONFIG.DATASET == 'moon':
     
      out_a = compute_polytopes_a(x_mesh, logits_mesh, model_path)
      attack_params['const_a_min'] = out_a
      attack_params['const_a_max'] = 100
    
    adv_x = cw.generate(x, **attack_params) 
      
    if os.path.isfile( save_path_data + 'images_once_attacked.npy' ):
      adv_img_1 = np.load(save_path_data + 'images_once_attacked.npy')
      logits_1 = np.load(save_path_data + 'logits_once_attacked.npy')
    else:
      #Evaluate the accuracy on adversarial examples
      preds_adv = model.get_logits(adv_x)
      logits_1, adv_img_1 = do_eval(adv_x, preds_adv, points_x_bar, points_y_bar,
                                    'test once attacked')
      np.save(os.path.join(save_path_data, 'images_once_attacked'), adv_img_1)
      np.save(os.path.join(save_path_data, 'logits_once_attacked'), logits_1)
      
    # counter attack 
    attack_params['max_iterations'] = 1024
      
    if CONFIG.DATASET == 'moon':  
      
      out_alpha2 = compute_epsilons_balls_alpha(x_mesh, np.squeeze(x_test),
                                                np.squeeze(adv_img_1), model_path,
                                                CONFIG.SAVE_PATH)
      attack_params['learning_rate'] = out_alpha2
      attack_params['const_a_min'] = -1
      attack_params['max_iterations'] = 2048
      
      plot_data(np.squeeze(adv_img_1), logits_1, CONFIG.SAVE_PATH+'data_pred1.png', x_mesh,
                logits_mesh)
      
    adv_adv_x = cw.generate(x, **attack_params) 
      
    x_k = np.concatenate((points_x, adv_img_1), axis=0)
    y_k = np.concatenate((points_y, logits_1), axis=0)
    
    if os.path.isfile( save_path_data + 'images_twice_attacked.npy' ):
      adv_img_2 = np.load(save_path_data + 'images_twice_attacked.npy')
      logits_2 = np.load(save_path_data + 'logits_twice_attacked.npy')
    else:
      # Evaluate the accuracy on adversarial examples
      preds_adv_adv = model.get_logits(adv_adv_x)
      logits_2, adv_img_2 = do_eval(adv_adv_x, preds_adv_adv, x_k, y_k,
                                    'test twice attacked')   
      
      np.save(os.path.join(save_path_data, 'images_twice_attacked'), adv_img_2)
      np.save(os.path.join(save_path_data, 'logits_twice_attacked'), logits_2)
    
    if CONFIG.DATASET == 'moon':  
      plot_data(np.squeeze(adv_img_2[:len_x]), logits_2[:len_x],
                CONFIG.SAVE_PATH+'data_pred2.png', x_mesh, logits_mesh)
      plot_data(np.squeeze(adv_img_2[len_x:]), logits_2[len_x:],
                CONFIG.SAVE_PATH+'data_pred12.png', x_mesh, logits_mesh)
      test_balls(np.squeeze(x_k), np.squeeze(adv_img_2), logits_0, logits_1, logits_2,
                 CONFIG.SAVE_PATH)
 
  compute_returnees(logits_0[len_x:], logits_1, logits_2[len_x:], logits_0[:len_x],
                    logits_2[:len_x], CONFIG.SAVE_PATH) 
  
  if x_test.shape[-1] > 1:
    num_axis=(1,2,3)
  else:
    num_axis=(1,2)
    
  D_p = np.squeeze(np.sqrt(np.sum(np.square(points_x-adv_img_2[:len_x]), axis=num_axis)))
  D_p_p = np.squeeze(np.sqrt(np.sum(np.square(adv_img_1-adv_img_2[len_x:]),
                                    axis=num_axis)))
  D_p_mod, D_p_p_mod = modify_D(D_p, D_p_p, logits_0[len_x:], logits_1, logits_2[len_x:],
                                logits_0[:len_x], logits_2[:len_x])
      
  if D_p_mod != [] and D_p_p_mod != []:
    plot_violins(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)
    threshold_evaluation(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)
    _ = compute_auroc(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)
      
  plot_results_models(len_x, CONFIG.DATASET, CONFIG.SAVE_PATH)
  
  print('Time needed:', time.time()-start)

  return report
Example #58
        
    return centers, labels

centers, labels = find_clusters(X, 4)
plt.scatter(X[:,0], X[:,1], c=labels, s=50, cmap='viridis');

center, labels = find_clusters(X, 4, rseed=0)
plt.scatter(X[:,0], X[:,1], c=labels, s=50, cmap='viridis')

labels = KMeans(6, random_state=0).fit_predict(X)
plt.scatter(X[:,0], X[:,1], c=labels, s=50, cmap='viridis');

# DBSCAN, mean-shift, affinity propagation...

from sklearn.datasets import make_moons
X, y = make_moons(200, noise=.05, random_state=0)

labels = KMeans(2, random_state=0).fit_predict(X)
plt.scatter(X[:,0], X[:,1], c=labels, s=50, cmap='viridis')

from sklearn.cluster import SpectralClustering
model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                           assign_labels='kmeans')

labels = model.fit_predict(X)
plt.scatter(X[:,0], X[:,1], c=labels, s=50, cmap='viridis')

from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn import datasets
import numpy as np

x, y = datasets.make_moons(n_samples=2000, noise=0.05)

x1 = x[:, 0]
x2 = x[:, 1]

plt.title("This is the dataset we want to classify with DBSCAN:\n")
plt.scatter(x1, x2, s=5, color='purple')
plt.show()

dbscan = DBSCAN(eps=0.1)
dbscan.fit(x)
y_pred = dbscan.labels_.astype(int)

colors = np.array(['#ff0345', '#70ff09'])

plt.title("These are the clusters with DBSCAN:\n")
plt.scatter(x1, x2, s=5, color=colors[y_pred])
plt.show()

kmeans = KMeans(n_clusters=2)
kmeans.fit(x)
y_pred = kmeans.labels_.astype(int)

colors = np.array(['#ff0345', '#70ff09'])
Example #60
def main():
	# moons_X: Data, moon_y: Labels
	moons_X, moon_y = make_moons(n_samples = 2000)
	addNoise(moons_X, moon_y)