from numpy import diag, linalg, c_
import scipy.cluster.vq
from matplotlib import pyplot
from sklearn import datasets

def main():
	no_of_samples = 400
	
	data = []
	data.append( datasets.make_moons(n_samples=no_of_samples, noise=0.05)[0] )
	data.append( datasets.make_circles(n_samples=no_of_samples, factor=0.5, noise=0.05)[0] )
	
	# number of clusters we expect
	K = 2

	for X in data:	
		# from dataset, create adjacency, degree, and laplacian matrix
		adjacency 	= gaussianDistance( X, sigma=0.1 )
		degree 		= degreeMatrix( adjacency )
		L 			= diag(degree) - adjacency

		# symmetrically normalize the Laplacian: L_sym = D^(-1/2) L D^(-1/2)
		deg_05 	= diag( degree  ** -0.5 )
		L 		= deg_05.dot( L ).dot( deg_05 )

		# use eig to obtain eigenvalues and eigenvectors
		eigenvalues, eigenvectors = linalg.eig( L )

		# Sort the eigenvalues in ascending order; the first K (near-)zero eigenvalues correspond to the connected components
		idx = eigenvalues.argsort()
		eigenvalues.sort()
		evecs = eigenvectors[:, idx]
		eigenvectors = evecs[:, 0:K]
		print(eigenvalues[0:K])

		color_array = ['b', 'r', 'g', 'y']

		fig = pyplot.figure( figsize=(15, 5) )
		fig.canvas.manager.set_window_title( 'Difference between K-means and Spectral Clustering' )

		# First perform plain K-means on the original dataset and plot it
		centroids, labels = scipy.cluster.vq.kmeans2( X, K )
		labeled = c_[X, labels]
		ax = fig.add_subplot( 131 )
		ax.set_title('K-means clustering')
		for k in range( 0, K ):
			ax.scatter( labeled[labeled[:, 2]==k, 0], labeled[labeled[:, 2]==k, 1], c=color_array[k], marker='o')

		# Then perform spectral clustering, i.e. K-means on the eigenvectors
		centroids, labels = scipy.cluster.vq.kmeans2( eigenvectors, K )
		labeled = c_[X, labels]
		ax = fig.add_subplot( 132 )
		ax.set_title('Spectral clustering')
		for k in range( 0, K ):
			ax.scatter( labeled[labeled[:, 2]==k, 0], labeled[labeled[:, 2]==k, 1], c=color_array[k], marker='o')

		# Plot the eigenvectors too
		labeled = c_[eigenvectors, labels]
		ax = fig.add_subplot(133)
		ax.set_title('K-eigenvectors')
		for k in range( 0, K ):
			ax.scatter( labeled[labeled[:, 2]==k, 0], labeled[labeled[:, 2]==k, 1], c=color_array[k], marker='o')

		pyplot.show()
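The spectral-clustering example above calls two helpers, gaussianDistance and degreeMatrix, that are not shown. A minimal sketch of what they might look like, assuming a Gaussian (RBF) affinity matrix and a degree vector (the code above wraps the result in diag(), so a 1-D vector is expected):

import numpy as np

def gaussianDistance(X, sigma=0.1):
    # Sketch of the unshown helper: pairwise Gaussian (RBF) affinities
    # A[i, j] = exp(-||x_i - x_j||^2 / (2 * sigma^2))
    sq_dists = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
    return np.exp(-sq_dists / (2.0 * sigma ** 2))

def degreeMatrix(adjacency):
    # Sketch of the unshown helper: degree of every node as a 1-D vector
    # (row sums of the affinity matrix)
    return adjacency.sum(axis=1)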
Example #2
0
    def test_sparse(self):
        np.random.seed(10)
        thresh = 1.1

        # Do dense filtration with threshold
        data = (
            datasets.make_circles(n_samples=100)[0]
            + 5 * datasets.make_circles(n_samples=100)[0]
        )
        rips0 = Rips(thresh=thresh, maxdim=1)
        dgms0 = rips0.fit_transform(data)

        # Convert to sparse matrix first based on threshold,
        # then do full filtration
        rips1 = Rips(maxdim=1)
        D = makeSparseDM(data, thresh)
        dgms1 = rips1.fit_transform(D, distance_matrix=True)

        # The same number of edges should have been added
        assert rips0.num_edges_ == rips1.num_edges_

        I10 = dgms0[1]
        I11 = dgms1[1]
        idx = np.argsort(I10[:, 0])
        I10 = I10[idx, :]
        idx = np.argsort(I11[:, 0])
        I11 = I11[idx, :]
        assert np.allclose(I10, I11)
Example #3
0
    def test_sparse(self):
        np.random.seed(10)
        thresh = 1.1

        # Do dense filtration with threshold
        data = (
            datasets.make_circles(n_samples=100)[0]
            + 5 * datasets.make_circles(n_samples=100)[0]
        )
        res0 = ripser(data, thresh=thresh)

        # Convert to sparse matrix first based on threshold,
        # then do full filtration
        D = makeSparseDM(data, thresh)
        res1 = ripser(D, distance_matrix=True)

        # The same number of edges should have been added
        assert res0["num_edges"] == res1["num_edges"]

        dgms0 = res0["dgms"]
        dgms1 = res1["dgms"]
        I10 = dgms0[1]
        I11 = dgms1[1]
        idx = np.argsort(I10[:, 0])
        I10 = I10[idx, :]
        idx = np.argsort(I11[:, 0])
        I11 = I11[idx, :]
        assert np.allclose(I10, I11)
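Both test_sparse variants rely on a makeSparseDM helper that is not included above. A plausible sketch, assuming it thresholds the dense Euclidean distance matrix and returns a scipy sparse matrix (which ripser/Rips accept together with distance_matrix=True); the real helper may differ:

import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

def makeSparseDM(X, thresh):
    # Sketch of the unshown helper: dense pairwise distances, keeping only
    # the entries at or below the threshold
    D = pairwise_distances(X, metric="euclidean")
    I, J = np.nonzero(D <= thresh)
    V = D[I, J]
    return sparse.coo_matrix((V, (I, J)), shape=D.shape).tocsr()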
Example #4
0
def generate_data():
    def kernel(x1, x2):
        return np.array([x1, x2, 2 * x1 ** 2 + 2 * x2 ** 2])

    X, Y = make_circles(500, noise=0.12, factor=0.01)

    A = X[np.where(Y == 0)]
    B = X[np.where(Y == 1)]

    X0_orig = A[:, 0]
    Y0_orig = A[:, 1]

    X1_orig = B[:, 0]
    Y1_orig = B[:, 1]

    A = np.array([kernel(x, y) for x, y in zip(np.ravel(X0_orig), np.ravel(Y0_orig))])

    X0 = A[:, 0]
    Y0 = A[:, 1]
    Z0 = A[:, 2]

    A = np.array([kernel(x, y) for x, y in zip(np.ravel(X1_orig), np.ravel(Y1_orig))])
    X1 = A[:, 0]
    Y1 = A[:, 1]
    Z1 = A[:, 2]

    return X0, X1, Y0, Y1, Z0, Z1
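A small usage sketch (hypothetical, not part of the original) that plots the lifted features returned by generate_data; in the kernel space z = 2*x1^2 + 2*x2^2 the two circles become separable by a horizontal plane:

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the 3d projection)

X0, X1, Y0, Y1, Z0, Z1 = generate_data()
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(X0, Y0, Z0, label="class 0")
ax.scatter(X1, Y1, Z1, label="class 1")
ax.legend()
plt.show()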
Example #5
0
def withCircleData():
    np.random.seed(0)
    X, Y = make_circles(n_samples=400, noise=.05, factor=.3)
    #plotData(X, Y, 'original-circle.png')
    #testPCA(X, Y, ncomp=2, dataset='circles')
    #myKPCA(X, Y, kernel_type='gauss', c=1, deg=2, ncomp=2, dataset='circles')
    myKPCA(X, Y, kernel_type='poly', c=1, deg=10, ncomp=2, dataset='circles')
def main():
    args = sys.argv[1:]
    
    dataset_path = None
    if args and '-save' in args:
        try: dataset_path = args[args.index('-save') + 1]
        except IndexError: dataset_path = 'dataset.p'
        
    # Generate the dataset
    print "...Generating Dataset..."
    X1, Y1 = make_circles(n_samples=800, noise=0.07, factor=0.4)
    frac0 = len(np.where(Y1 == 0)[0]) / float(len(Y1))
    frac1 = len(np.where(Y1 == 1)[0]) / float(len(Y1))
    
    print "Percentage of '0' labels:", frac0
    print "Percentage of '1' labels:", frac1

    # (Optionally) save the dataset to DATASET_PATH
    if dataset_path:
        print "...Saving dataset to {0}...".format(dataset_path)
        pickle.dump((X1, Y1, frac0, frac1), open(dataset_path, 'wb'))

    # Plot the dataset
    print "...Showing dataset in new window..."
    pl.figure(figsize=(10, 8))
    pl.subplots_adjust(bottom=.05, top=.9, left=.05, right=.95)

    pl.subplot(111)
    pl.title("Our Dataset: N=200, '0': {0} '1': {1} ".format(frac0, frac1), fontsize="large")

    pl.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)

    pl.show()
    
    print "...Done."
Example #7
0
def main():
    n=500
    x,y=datasets.make_circles(n_samples=n,factor=.5,noise=.05)
    label=CCPMCV().fit(x).label
    print "ARI:",adjusted_rand_score(y,label)
    figure(1)
    scatter(x[:,0],x[:,1],c=label,s=50)
    show()
Example #8
0
def generate_circles(n, y_val):
    """
    Generates a dataset where points are shaped into two circles,
    and labels them with y_val.
    """
    X,y = make_circles(n, noise=0.1)

    return (X, [y_val] * len(X))
Example #9
0
def generate_circles():

    X1, Y1 = make_circles(n_samples=500, noise=0.07, factor=0.4)
    plt.figure(figsize=(5, 5))
    plt.scatter(X1[:, 0], X1[:, 1], c=Y1)
    plt.grid(b=True, which="major", linestyle="-", alpha=0.1, color="black")
    plt.title("Can this be solved linearly?", size=16)
    plt.show()
Example #10
0
def generate_biclass_data(data_type, random_state):
    """ Generate biclass data to classify

    arg : data_type (str) possible type of data
            choose any in ["lin_sep", "moons", "circles", "overlap"]
            'lin_sep' : Bi-class, linearly separable data
            'moons' : Bi-class, non-linearly separable data (two interleaving half-moons)
            'circles' : Bi-class, non-linearly separable data (two concentric circles)
            'overlap' : Bi-class, non-linearly separable data with class overlap

        random_state (int) seed for numpy.random
    """

    # Set seed for reproducible results
    np.random.seed(random_state)

    # Case 1 : linearly separable data
    if data_type == "lin_sep":
        mean1 = np.array([0, 2])
        mean2 = np.array([2, 0])
        cov = np.array([[0.8, 0.6], [0.6, 0.8]])
        X1 = np.random.multivariate_normal(mean1, cov, 100)
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 100)
        y2 = np.ones(len(X2)) * -1
        X = np.vstack((X1, X2))
        y = np.hstack((y1, y2))

    # Case 2 : non-linearly separable data
    elif data_type == "moons":
        X, y = make_moons(n_samples=200, noise=0.2)

    elif data_type == "circles":
        X, y = make_circles(n_samples=200, noise=0.2, factor=0.5)

    # Case 3 : data with overlap between classes
    elif data_type == "overlap":
        mean1 = np.array([0, 2])
        mean2 = np.array([2, 0])
        cov = np.array([[1.5, 1.0], [1.0, 1.5]])
        X1 = np.random.multivariate_normal(mean1, cov, 100)
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 100)
        y2 = np.ones(len(X2)) * -1
        X = np.vstack((X1, X2))
        y = np.hstack((y1, y2))

    assert(X.shape[0] == y.shape[0])

    # Format target to: -1 / +1
    targets = set(y.tolist())
    t1 = min(targets)
    t2 = max(targets)
    l1 = np.where(y < t2)
    l2 = np.where(y > t1)
    y[l1] = -1
    y[l2] = 1

    return X, y
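A quick usage sketch (hypothetical) showing the -1/+1 label convention the function returns:

import matplotlib.pyplot as plt

X, y = generate_biclass_data("circles", random_state=0)
plt.scatter(X[y == 1, 0], X[y == 1, 1], marker="o", label="+1")
plt.scatter(X[y == -1, 0], X[y == -1, 1], marker="^", label="-1")
plt.legend()
plt.show()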
def loadDatasets(linearly_separable):

    datasets = [\
                make_moons(noise=0.3, random_state=0), \
                make_circles(noise=0.2, factor=0.5, random_state=1), \
                linearly_separable \
               ]

    return datasets
Example #12
0
def make_datasets():
    """

    :return:
    """

    return [make_moons(n_samples=200, noise=0.3, random_state=0),
            make_circles(n_samples=200, noise=0.2, factor=0.5, random_state=1),
            make_linearly_separable()]
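make_linearly_separable is not defined in this snippet; a plausible sketch, modeled on the jittered make_classification dataset used by the classifier-comparison examples elsewhere on this page (the original may differ):

import numpy as np
from sklearn.datasets import make_classification

def make_linearly_separable(n_samples=200, random_state=1):
    # Sketch: two informative features, no redundant ones, one cluster per class,
    # plus a little uniform jitter, as in the classifier-comparison examples
    X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0,
                               n_informative=2, random_state=random_state,
                               n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    return X, y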
def test_gridsearch_pipeline():
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model.
    X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
    kpca = KernelPCA(kernel="rbf", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron())])
    param_grid = dict(kernel_pca__gamma=2.0 ** np.arange(-2, 2))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    grid_search.fit(X, y)
    assert_equal(grid_search.best_score_, 1)
Example #14
0
def test_random_trees_dense_type():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning a dense array.

    # Create the RTE with sparse=False
    hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # Assert that type is ndarray, not scipy.sparse.csr.csr_matrix
    assert_equal(type(X_transformed), np.ndarray)
def build_datasets(n_samples=100):
    X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1)
    X += 2 * np.random.uniform(size=X.shape)
    linearly_separable = (X, y)

    names = ['moons', 'circles', 'linear', 'xor']
    datasets = [make_moons(n_samples=n_samples, noise=0.3),
                make_circles(n_samples=n_samples, noise=0.2, factor=0.5),
                linearly_separable,
                xor_scale_invariant(n_samples=n_samples)]
    return (names, datasets)
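xor_scale_invariant is not shown either; a minimal illustrative sketch of an XOR-style generator under that name (the original implementation may differ):

import numpy as np

def xor_scale_invariant(n_samples=100):
    # Sketch: points uniform in [-1, 1]^2, labeled by the XOR of the coordinate
    # signs; the decision structure looks the same at every scale, hence the name
    X = np.random.uniform(-1.0, 1.0, size=(n_samples, 2))
    y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int)
    return X, y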
def test_gridsearch_pipeline_precomputed():
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model using a precomputed kernel.
    X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
    kpca = KernelPCA(kernel="precomputed", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron())])
    param_grid = dict(Perceptron__n_iter=np.arange(1, 5))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    X_kernel = rbf_kernel(X, gamma=2.0)
    grid_search.fit(X_kernel, y)
    assert_equal(grid_search.best_score_, 1)
def get_dataset(dataset, n_samples):
    # Generate the new data:
    if dataset=='Noisy Circles':
        X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    elif dataset=='Noisy Moons':
        X, y = datasets.make_moons(n_samples=n_samples, noise=.05)
    elif dataset=='Blobs':
        X, y = datasets.make_blobs(n_samples=n_samples, random_state=8)
    else:
        X, y = np.random.rand(n_samples, 2), None

    return X, y
Example #18
0
def get_dataset(dataset, n_samples):
    if dataset == "Noisy Circles":
        return datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)

    elif dataset == "Noisy Moons":
        return datasets.make_moons(n_samples=n_samples, noise=0.05)

    elif dataset == "Blobs":
        return datasets.make_blobs(n_samples=n_samples, random_state=8)

    elif dataset == "No Structure":
        return np.random.rand(n_samples, 2), None
def gen_test_data() :
    ################################################################
    # using sklearn                                                #
    ################################################################
    N = 500
    #features,labels = ds.make_classification(n_samples = N,n_features = 2,n_informative = 2,n_redundant = 0,n_clusters_per_class = 1,class_sep = 2,shift = 2.2)
    features,labels = ds.make_circles(n_samples = N)
    #features,labels = ds.make_moons(n_samples = N)
    labels[labels == 0] = -1
    features = auto_np.array(features) * 4.0
    labels = auto_np.array(labels).reshape(features.shape[0],1)
    return features,labels
Example #20
0
def makeSimpleDatasets(n_samples=1500): # from sklearn example
    np.random.seed(0)
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                          noise=.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    return [noisy_circles, noisy_moons, blobs, no_structure]
def labelPropagationTest01():
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.semi_supervised import label_propagation
	from sklearn.datasets import make_circles

	#Generate Data
	n_samples = 200
	X,  y = make_circles(n_samples = n_samples, shuffle = False)
	outer, inner = 0, 1
	labels = -np.ones(n_samples)

	# Set the labels here: only two seed points get an initial value, every other label is unknown
	labels[0] = outer
	labels[-1] = inner

	#print X.shape  # X is a 200 x 2 matrix
	#print labels  # the first and last labels are 0 and 1, all the rest are -1
	#print y  # the first 100 are 0, the last 100 are 1

	#Learn with LabelSpreading
	# The inputs here are X and labels; y is not passed in,
	# but these labels are exactly what we want to predict
	label_spread = label_propagation.LabelSpreading(kernel = "knn", alpha = 1.0)
	label_spread.fit(X, labels)

	#Plot output labels
	output_labels = label_spread.transduction_
	plt.figure(figsize=(8.5, 4))
	plt.subplot(1,2,1)
	plot_outer_labeled, = plt.plot(X[labels == outer, 0], X[labels == outer, 1], 'rs')
	plot_unlabeled, = plt.plot(X[labels == -1, 0], X[labels == -1, 1], 'g.')
	plot_inner_labeled, = plt.plot(X[labels == inner, 0], X[labels == inner, 1], 'bs')

	plt.legend((plot_outer_labeled, plot_inner_labeled, plot_unlabeled), ("Outer Labeled", "Inner Labeled", "Unlabeled"), "upper left", numpoints = 1, shadow = False)
	plt.title("Raw data (2 classes = red and blue)")


	# Note the plotting technique used here
	plt.subplot(1,2,2)
	output_label_array = np.asarray(output_labels)
	outer_numbers = np.where(output_label_array == outer)[0]
	inner_numbers = np.where(output_label_array == inner)[0]
	plot_outer, = plt.plot(X[outer_numbers, 0], X[outer_numbers, 1], 'rs')
	plot_inner, = plt.plot(X[inner_numbers, 0], X[inner_numbers, 1], 'bs')

	plt.legend((plot_outer, plot_inner), ('Outer Learned', 'Inner Learned'), 'upper left', numpoints = 1, shadow = False)

	plt.title("Labels Learned with Label Spreading (KNN)")
	plt.subplots_adjust(left = 0.07, bottom = 0.07, right = 0.93, top = 0.92)

	plt.show()
Example #22
0
def test_random_trees_dense_equal():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning the same array for both argument values.

    # Create the RTEs
    hasher_dense = RandomTreesEmbedding(n_estimators=10, sparse_output=False, random_state=0)
    hasher_sparse = RandomTreesEmbedding(n_estimators=10, sparse_output=True, random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed_dense = hasher_dense.fit_transform(X)
    X_transformed_sparse = hasher_sparse.fit_transform(X)

    # Assert that dense and sparse hashers have same array.
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense)
def load_dataset( dname, num_samples ):
    if 'circles' in dname.lower():
        noisy_circles = datasets.make_circles(n_samples=num_samples, factor=.5, noise=.05)
        return noisy_circles
    elif 'moons' in dname.lower():
        noisy_moons = datasets.make_moons(n_samples=num_samples, noise=.05)
        return noisy_moons
    elif 'blobs' in dname.lower():
        blobs = datasets.make_blobs(n_samples=num_samples, random_state=8)
        return blobs
    else:
        no_structure = np.random.rand(num_samples, 2), None
        return no_structure
def gen_test_data() :
    ################################################################
    # using sklearn                                                #
    ################################################################
    N = 500
    #features,labels = ds.make_classification(n_samples = N,n_features = 2,n_informative = 2,n_redundant = 0,n_clusters_per_class = 1,class_sep = 2,shift = 2.2)
    features,labels = ds.make_circles(n_samples = N)
    #features,labels = ds.make_moons(n_samples = N)
    labels_tmp = np.zeros((features.shape[0],2))
    features = features * 4.0
    labels_tmp[labels == 1,0] = 1
    labels_tmp[labels == 0,1] = 1
    labels = labels_tmp
    return features,labels
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1)
Example #26
0
def generate_circle_sample():

    #X[0] is all the data points(2D), including one big circle, one small circle
    #X[1] is all the labels of the data
    X,y = datasets.make_circles(n_samples=5000, factor=.5, noise=.05)

    circle_dict = {}
    count = 0
    for x in X:
        x1=float(x[0])
        x2=float(x[1])
        circle_dict[str(count)] = (x1,x2)
        count = count + 1

    return circle_dict, X, y
def main():
	X, y = make_circles(n_samples=1000, random_state=123, noise=0.1, factor=0.2)
	while True:
		print(options)
		opt = int(input('------>'))
		if opt == 1:
			show_moons(X, y)
			return 
		elif opt == 2:
			scikit_std_pca(X, y)
			return 
		elif opt == 3:
			kernel_pca_unfold(X, y)
			return 
		else:	print("Wrong choice\n"); continue
Example #28
0
def get_circles():
    """Load synthetic concentric circles data from sklearn, transform into
    dataframe & return."""
    circles_data, circles_labels = make_circles(n_samples=1000, noise=0.1)
    circles_data = pd.DataFrame(circles_data)
    circles_labels = pd.DataFrame(circles_labels)

    circles_combined = pd.concat([circles_data, circles_labels], axis=1)
    circles_combined.columns = ['x', 'y', 'label']

    # draw scatter plot of concentric circles
    g = (ggplot(circles_combined, aes('x', 'y', shape='label'))
        + geom_point(size=40))
    # print g

    return circles_combined
def DBSCAN_cirles(n_samples=10000,factor=.4,noise=.1,eps=.1,minPts=30):
    '''
    Plots classic example for DBSCAN clustering on datasets consisting of two circular clusters
    Args:
        n_samples: number of total observations
        factor: scaling between inner and outer circle, see sklearn.datasets.make_circles doc
        noise: standard deviation of noise, see sklearn.datasets.make_circles doc
        eps: DBSCAN epsilon parameter
        minPts: DBSCAN minPts parameter
    '''
    circle = make_circles(n_samples=n_samples, factor=factor, noise=noise)
    circle = circle[0]
    circlescan = DBSCAN.DBSCAN(circle,eps,minPts)
    #reassign noise for plotting
    labels = circlescan.cluster_labels
    for p, i in enumerate(labels):
        if i == 'noise':
            labels[p] = circlescan._n_clusters + 1
    plt.scatter(circle[:, 0], circle[:, 1], c=circlescan.cluster_labels)
def test_nested_circles():
    # Test the linear separability of the first 2D KPCA transform
    X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)

    # 2D nested circles are not linearly separable
    train_score = Perceptron().fit(X, y).score(X, y)
    assert_less(train_score, 0.8)

    # Project the circles data into the first 2 components of a RBF Kernel
    # PCA model.
    # Note that the gamma value is data dependent. If this test breaks
    # and the gamma value has to be updated, the Kernel PCA example will
    # have to be updated too.
    kpca = KernelPCA(kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.0)
    X_kpca = kpca.fit_transform(X)

    # The data is perfectly linearly separable in that space
    train_score = Perceptron().fit(X_kpca, y).score(X_kpca, y)
    assert_equal(train_score, 1.0)
Example #31
0
def getCircles(size):
    X, y = datasets.make_circles(size, factor=.5, noise=.05)
    return X, [[i] for i in y]
Example #32
0
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise
from sklearn.datasets import make_blobs, make_circles
from mpl_toolkits.mplot3d import Axes3D

# draw blobs data
# X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
# draw circles data
X, y = make_circles(100, factor=.1, noise=.1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='winter')

# calculate the rbf (gaussian) kernel between X and (0, 0)
K = pairwise.rbf_kernel(X, np.array([[0, 0]]))
# K = pairwise.polynomial_kernel(X, np.array([[0.5, 0.5]]))
fig = plt.figure()

ax = Axes3D(fig)
ax.scatter(X[:, 0], X[:, 1], K[:], c=y, cmap='winter')

plt.show()
# -*- coding: utf-8 -*-

# Load libraries
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles

# Generate data that is not linearly separable
features, _ = make_circles(n_samples=1000,
                           random_state=1,
                           noise=0.1,
                           factor=0.1)

# Apply RBF (radial basis function) kernel PCA
kpca = KernelPCA(kernel="rbf", gamma=15, n_components=1)
features_kpca = kpca.fit_transform(features)

print("もとの特徴量数:", features.shape[1])
print("削減後の特徴量数:", features_kpca.shape[1])
Example #34
0
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
from P146 import rbf_kernel_pca
from matplotlib.ticker import FormatStrFormatter
from sklearn.datasets import make_circles

X, y = make_circles(n_samples=1000, random_state=123, noise=0.1, factor=0.2)

scikit_pca = PCA(n_components=2)
X_spca = scikit_pca.fit_transform(X)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 3))

ax[0].scatter(X_spca[y == 0, 0],
              X_spca[y == 0, 1],
              color='red',
              marker='^',
              alpha=0.5)
ax[0].scatter(X_spca[y == 1, 0],
              X_spca[y == 1, 1],
              color='blue',
              marker='o',
              alpha=0.5)
ax[1].scatter(X_spca[y == 0, 0],
              np.zeros((500, 1)) + 0.02,
              color='red',
              marker='^',
              alpha=0.5)
ax[1].scatter(X_spca[y == 1, 0],
              np.zeros((500, 1)) - 0.02,
Example #35
0
sc.output_options(parser)

from scrape import write_dict

args = sc.parse_args(parser)

#results = np.loadtxt(open("test_1_centroids.csv","rb"),delimiter=",",skiprows=1)

#print (results)

# bcml mode feature_class version
#  0    1        2           3
# mode { baseline, update }

np.random.seed(0)

X, y = make_circles(n_samples=args.n_samples,
                    factor=args.factor,
                    noise=args.noise,
                    shuffle=args.shuffle)

datasets.dump_svmlight_file(X,
                            y,
                            args.output_file,
                            zero_based=args.zero_based,
                            query_id=args.query_id,
                            multilabel=args.multilabel,
                            comment=args.comment)

write_dict({'feature_file': args.output_file})
propagate correctly around the circle.
"""
print(__doc__)

# Authors: Clay Woolam <*****@*****.**>
#          Andreas Mueller <*****@*****.**>
# Licence: BSD

import numpy as np
import matplotlib.pyplot as plt
from sklearn.semi_supervised import label_propagation
from sklearn.datasets import make_circles

# generate ring with inner box
n_samples = 200
X, y = make_circles(n_samples=n_samples, shuffle=False)
outer, inner = 0, 1
labels = -np.ones(n_samples)
labels[0] = outer
labels[-1] = inner

###############################################################################
# Learn with LabelSpreading
label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=1.0)
label_spread.fit(X, labels)

###############################################################################
# Plot output labels
output_labels = label_spread.transduction_
plt.figure(figsize=(8.5, 4))
plt.subplot(1, 2, 1)
Example #37
0
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]

X, y = make_classification(n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1), linearly_separable
]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
Example #38
0
!pip install scipy
!pip install matplotlib
!pip install scikit-learn

import numpy as np
import scipy as sc
import matplotlib.pyplot as plt

from sklearn.datasets import make_circles

# CREATE THE DATASET

n = 500
p = 2

X, Y = make_circles(n_samples=n, factor=0.5, noise=0.05)

Y = Y[:, np.newaxis]

plt.scatter(X[Y[:, 0] == 0, 0], X[Y[:, 0] == 0, 1], c="skyblue")
plt.scatter(X[Y[:, 0] == 1, 0], X[Y[:, 0] == 1, 1], c="salmon")
plt.axis("equal")
plt.show()

# NEURAL NETWORK LAYER CLASS

class neural_layer():
 
  def __init__(self, n_conn, n_neur, act_f):
    
    self.act_f = act_f
"""
1. Deep Neural Network using PyTorch
2. Using non-linear boundaries to separate the data
"""

import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import numpy as np
from sklearn import datasets

number_of_points = 500
centers = [[-0.5, 0.5], [0.5, -0.5]]
x, y = datasets.make_circles(n_samples=number_of_points,
                             random_state=123,
                             noise=0.1,
                             factor=0.2)
x_data = torch.Tensor(x)
y_data = torch.Tensor(y.reshape(500, 1))


def scatter_plot():
    plt.scatter(x[y == 0, 0], x[y == 0, 1])
    plt.scatter(x[y == 1, 0], x[y == 1, 1])


class Model(nn.Module):  #constructing a model using Linear class
    def __init__(self, input_size, H1, output_size):
        super().__init__()
        self.linear = nn.Linear(input_size, H1)
        self.linear2 = nn.Linear(H1, output_size)
Example #40
0
excitatory_ratio = 0.5
n_iters = 100
mu = 0.1
gamma = 0.5

config_str = "layers_{} excite_{} iters_{} mu_{} gamma_{}".format(
    str(layer_sizes), excitatory_ratio, n_iters, mu, gamma)
np.random.seed(0)

# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples,
                                      factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

# Anisotropicly distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
Example #41
0
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from sklearn import datasets

n_pts = 500
centers = [[-0.5, 0.5], [0.5, -0.5]]
X, Y = datasets.make_circles(n_samples=n_pts,
                             random_state=123,
                             noise=0.1,
                             factor=0.2)
x_data = torch.Tensor(X)
y_data = torch.Tensor(Y.reshape(500, 1))


def scattering():
    plt.scatter(X[Y == 0, 0], X[Y == 0, 1])
    plt.scatter(X[Y == 1, 0], X[Y == 1, 1])
    plt.show()


class Model(nn.Module):
    def __init__(self, input, output, hidden1):
        super().__init__()
        self.linear = nn.Linear(
            input,
            hidden1)  #now we have hidden layer, so we need to include it
        self.linear2 = nn.Linear(hidden1, output)

    def forward(self, x):
Example #42
0
                        y_tr,
                        n_out,
                        epochs,
                        num_models=10,
                        train_kwargs={},
                        **kwargs):
    M = ClassifierEnsemble(num_models, X_tr.shape[1], n_out, F.cross_entropy,
                           **kwargs)
    M.train(X_tr, y_tr, epochs, **train_kwargs)

    p_tr = M.predict(X_tr, posterior=True)

    return p_tr


X, y = datasets.make_circles(1000, factor=0.5, noise=0.05)
alpha = 0.2
ypu, ind = makePU(y, alpha, balanced=True)
ind_tr, ind_te = train_test_split(ind, test_size=0.2)

# save and read to make label unchanged
# y = y.reshape((-1,1))
# ypu = ypu.reshape((-1,1))
# ind = ind.reshape((-1,1))
# data = pd.DataFrame(np.concatenate([X, y, ypu], axis=1), columns = ['X.x', 'X.y', 'y', 'ypu'])
# data.to_csv('data.txt', float_format='%.3f')
# np.savetxt('ind_tr.txt', ind_tr, delimiter=',', fmt='% 4d')
# np.savetxt('ind_te.txt', ind_te, delimiter=',', fmt='% 4d')

# data = pd.read_csv('data.txt', index_col=0, sep=',')
# X = np.array(data.iloc[:,:2])
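makePU is not defined in the fragment above; a rough sketch of one plausible positive-unlabeled relabeling helper, consistent with the call ypu, ind = makePU(y, alpha, balanced=True). This is an assumption; the original helper (and the meaning of its balanced flag) may work differently:

import numpy as np

def makePU(y, alpha, balanced=True):
    # Rough sketch (the real helper may differ): keep a random fraction `alpha`
    # of the positive examples labeled (1) and mark every remaining sample as
    # unlabeled (0).  Also return the sample indices, which the caller splits
    # into train / test index sets.
    y = np.asarray(y)
    positives = np.flatnonzero(y == 1)
    n_keep = max(1, int(alpha * len(positives)))
    kept = np.random.choice(positives, size=n_keep, replace=False)
    ypu = np.zeros_like(y)
    ypu[kept] = 1
    return ypu, np.arange(len(y))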
# https://mp.weixin.qq.com/s/UltBigoduH76vs_pmLUOVQ
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
np.random.seed(123)

# generate data set
X, y = make_circles(n_samples=1000, factor=0.5, noise=.1)
fig = plt.figure(figsize=(8,6))
plt.scatter(X[:,0], X[:,1], c=y)
plt.xlim([-1.5, 1.5])
plt.ylim([-1.5, 1.5])
plt.title("Dataset")
plt.xlabel("First feature")
plt.ylabel("Second feature")
plt.show()


# split train and test data
# reshape targets to get column vector with shape (n_samples, 1)
y_true = y[:, np.newaxis]
# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y_true)
print(f'Shape X_train: {X_train.shape}')
print(f'Shape y_train: {y_train.shape}')
print(f'Shape X_test: {X_test.shape}')
print(f'Shape y_test: {y_test.shape}')

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm  # from sklearn.svm import SVC also works
from sklearn.datasets import make_circles, make_moons, make_blobs,make_classification

# Create 4 datasets
n_samples = 100
 
datasets = [
    make_moons(n_samples=n_samples, noise=0.2, random_state=0),
    make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
    make_blobs(n_samples=n_samples, centers=2, random_state=5),  # blob-style clustered dataset
    make_classification(n_samples=n_samples,n_features = 2,n_informative=2,n_redundant=0, random_state=5)
                # n_features: number of features, n_informative: informative features, n_redundant: uninformative features
    ]
 
Kernel = ["linear","poly","rbf","sigmoid"]

# Specify the subplot grid
nrows=len(datasets)
ncols=len(Kernel) + 1
 
fig, axes = plt.subplots(nrows, ncols,figsize=(20,16))

# Start plotting
# Outer loop: iterate over the different datasets
for ds_cnt, (X,Y) in enumerate(datasets):
    
    # Put the raw data distribution in the first column of the figure
    ax = axes[ds_cnt, 0]
            return None, kmc.labels_


if __name__ == '__main__':
    from sklearn import datasets
    from sklearn.metrics import adjusted_mutual_info_score
    from kemlglearn.datasets import make_blobs
    import matplotlib.pyplot as plt


    # data = datasets.load_iris()['data']
    # labels = datasets.load_iris()['target']

    # data, labels = make_blobs(n_samples=[100, 200], n_features=2, centers=[[1,1], [0,0]], random_state=2, cluster_std=[0.2, 0.4])
    data, labels = datasets.make_circles(n_samples=400, noise=0.1, random_state=4, factor=0.3)

    km = KMeans(n_clusters=2)

    cons = SimpleConsensusClustering(n_clusters=2, n_clusters_base=20, n_components=50, ncb_rand=False)

    lkm = km.fit_predict(data)
    cons.fit(data)
    lcons = cons.labels_

    print(adjusted_mutual_info_score(lkm, labels))
    print(adjusted_mutual_info_score(lcons, labels))

    fig = plt.figure()

    # ax = fig.gca(projection='3d')
Example #46
0

X, y = make_classification(
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=1,
    n_clusters_per_class=1)

rng = np.random.RandomState(2)
X += 2*rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    linearly_separable
]

# Iterate through the dataset
figure = plt.figure(figsize=(27, 9))
i = 1
for ds_cnt, ds in enumerate(datasets):
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.4, random_state=42)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max()+.5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max()+.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
# -*- coding: utf-8 -*-
#[email protected]
"""
Dictionary learning
"""
print(__doc__)


import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from sklearn.decomposition import DictionaryLearning
mpl.style.use('fivethirtyeight')
from sklearn.datasets import make_circles

np.random.seed(0)

X, y = make_circles(n_samples=400, factor=.3, noise=.05)

pca = DictionaryLearning(n_components=2)
X_pca = pca.fit_transform(X)

fig = plt.figure()
ax = fig.add_subplot(211)
ax.scatter(X[:, 0], X[:, 1], c=y)
ax.axis("equal")
ax = fig.add_subplot(212)
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y)

ax.axis("equal")
plt.show()
    plt.figure(figsize=(12, 8))
    # plot the contour
    plt.contourf(aa, bb, Z, cmap='bwr', alpha=0.2)
    # plot the moons of data
    plot_data(plt, X, y)

    return plt


# Generate two concentric rings of data.  Labels will be either 0 or 1.
# X is a [number of samples, 2] sized array. X[sample] contains the x,y position of the sample in the space
# ex: X[1] = [1.342, -2.3], X[2] = [-4.342, 2.12]
# y is a [number of samples] sized array. y[sample] contains the class index (ie. 0 or 1)
# ex: y[1] = 0 , y[2] = 1
X, y = make_circles(n_samples=1000, factor=.6, noise=0.1, random_state=42)
#pl = plot_data(plt, X, y)
#pl.show()

# Split the data into Training and Test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

# Create the keras model
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
Example #49
0
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import cluster

x1, y1 = datasets.make_circles(n_samples=2000, factor=0.5, noise=0.05)
x2, y2 = datasets.make_blobs(n_samples=1000,
                             centers=[[1.2, 1.2]],
                             cluster_std=[[0.1]])

xData = np.concatenate((x1, x2))
# plt.scatter(xData[:, 0], xData[:, 1])
# plt.show()

kMeansModel = cluster.KMeans(n_clusters=3)
kMeansModel.fit(xData)
kmPredict = kMeansModel.labels_
plt.scatter(xData[:, 0], xData[:, 1], c=kmPredict)
plt.show()

dbModel = cluster.DBSCAN(eps=0.2, min_samples=50)
dbModel.fit(xData)
dbPredict = dbModel.labels_
plt.scatter(xData[:, 0], xData[:, 1], c=dbPredict)
plt.show()
# mlp for the circles problem with cross-entropy loss
from utils import disable_tensorflow_gpu
from sklearn.datasets import make_circles
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from matplotlib import pyplot
# generate 2d classification dataset
X, y = make_circles(n_samples=1000, noise=0.1, random_state=1)
# split into train and test
n_train = 500
trainX, testX = X[:n_train, :], X[n_train:, :]
trainy, testy = y[:n_train], y[n_train:]
# define model
model = Sequential()
model.add(
    Dense(50, input_dim=2, activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(1, activation='sigmoid'))
opt = SGD(lr=0.01, momentum=0.9)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
# fit model
history = model.fit(trainX,
                    trainy,
                    validation_data=(testX, testy),
                    epochs=200,
                    verbose=0)
# evaluate the model
_, train_acc = model.evaluate(trainX, trainy, verbose=0)
_, test_acc = model.evaluate(testX, testy, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
# plot loss during training
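The snippet is cut off after this comment; a typical continuation, assuming the standard Keras History object (older Keras versions store accuracy under 'acc'/'val_acc' instead of 'accuracy'/'val_accuracy'), might look like:

pyplot.subplot(211)
pyplot.title('Cross-Entropy Loss')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
# plot accuracy during training
pyplot.subplot(212)
pyplot.title('Accuracy')
pyplot.plot(history.history['accuracy'], label='train')
pyplot.plot(history.history['val_accuracy'], label='test')
pyplot.legend()
pyplot.show()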
Example #51
0
@ Filename:       DBSCAN_TEST.py
@ Author:         Ryuk
@ Create Date:    2019-05-20   
@ Update Date:    2019-05-20 
@ Description:    Implement DBSCAN_TEST
"""

from Cluster import KMeans as kmeans
from Cluster import DBSCAN as dbscan
from sklearn.cluster import DBSCAN

import time
import matplotlib.pyplot as plt
from sklearn import datasets

X1, y1=datasets.make_circles(n_samples=5000, factor=.6, noise=.05)
trainData = X1[0:1000]
time_start1 = time.time()
clf1 = kmeans(k=4, cluster_type="KMeans")
pred1 = clf1.train(trainData)
time_end1 = time.time()
print("Runtime of KMeans:", time_end1-time_start1)

time_start2 = time.time()
clf2 = dbscan()
pred = clf2.train(trainData)
time_end2 = time.time()
print("Runtime of DBSCAN:", time_end2-time_start2)

time_start3 = time.time()
clf3 = DBSCAN(eps=0.1, min_samples=10)
Example #52
0
# -*- coding: utf-8 -*-
"""
Created on Tue Mar  3 16:13:19 2020

@author: chaos
"""

import sys
sys.path.append('../..')

import numpy as np
from sklearn.datasets import make_circles
import matrixslow as ms

X, y = make_circles(600, noise=0.1, factor=0.2)
y = y * 2 - 1

# Feature dimensionality
dimension = 20

# Construct noise features
X = np.concatenate([X, np.random.normal(0.0, 0.5, (600, dimension - 2))],
                   axis=1)

# Latent vector dimensionality
k = 2

# First-order (linear) term
x1 = ms.core.Variable(dim=(dimension, 1), init=False, trainable=False)

# Label
        row_idx = np.where(y_pred == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1])
    plt.title('Clusters')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()


"""## Get the dataset"""

n = 1000
from sklearn.datasets import make_moons, make_blobs, make_circles, make_s_curve
X_moons, y_moons = make_moons(n_samples=n, noise=0.1)
X_blobs, y_blobs = make_blobs(n_samples=n, n_features=2)
X_circles, y_circles = make_circles(n_samples=n, noise=0.1, factor=0.4)
X_scurve, y_scurve = make_s_curve(n_samples=n, noise=0.1)
X_random = np.random.random([n, 2])
y_random = np.random.randint(0, 3, size=[n])
transformation = [[0.80834549, -0.83667341], [-0.20887718, 0.85253229]]
X_aniso = np.dot(X_blobs, transformation)
y_aniso = y_blobs

plot_dataset(X_moons)

visual(2, X_moons, y_moons)

plot_dataset(X_blobs)

visual(3, X_blobs, y_blobs)
    def demo(self):
        h = .02
        names = [
            "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
            "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
            "Naive Bayes", "QDA"
        ]
        classifiers = [
            KNeighborsClassifier(3),
            SVC(kernel="linear", C=0.025),
            SVC(gamma=2, C=1),
            GaussianProcessClassifier(1.0 * RBF(1.0)),
            DecisionTreeClassifier(max_depth=5),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=10,
                                   max_features=1),
            MLPClassifier(alpha=1),
            AdaBoostClassifier(),
            GaussianNB(),
            QuadraticDiscriminantAnalysis()
        ]
        X, y = make_classification(n_features=2,
                                   n_redundant=0,
                                   n_informative=2,
                                   random_state=1,
                                   n_clusters_per_class=1)
        rng = np.random.RandomState(2)
        X += 2 * rng.uniform(size=X.shape)
        linearly_separable = (X, y)
        datasets = [
            make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
        ]
        figure = plt.figure(figsize=(27, 9))
        i = 1
        # iterate over datasets
        for ds_cnt, ds in enumerate(datasets):
            # preprocess dataset, split into training and test part
            X, y = ds
            X = StandardScaler().fit_transform(X)
            X_train, X_test, y_train, y_test = \
                train_test_split(X, y, test_size=.4, random_state=42)
            x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
            y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                                 np.arange(y_min, y_max, h))

            # just plot the dataset first
            cm = plt.cm.RdBu
            cm_bright = ListedColormap(['#FF0000', '#0000FF'])
            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
            if ds_cnt == 0:
                ax.set_title("Input data")
            # Plot the training points
            ax.scatter(X_test[:, 0],
                       X_test[:, 1],
                       c=y_test,
                       cmap=cm_bright,
                       alpha=0.6,
                       edgecolors='k')
            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            i += 1

            # iterate over classifiers
            for name, clf in zip(names, classifiers):
                ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
                clf.fit(X_train, y_train)
                score = clf.score(X_test, y_test)

                # Plot the decision boundary. For that, we will assign a color to each
                # point in the mesh [x_min, x_max]x[y_min, y_max].
                if hasattr(clf, "decision_function"):
                    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
                else:
                    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

                # Put the result into a color plot
                Z = Z.reshape(xx.shape)
                ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

                # Plot the training points
                ax.scatter(X_train[:, 0],
                           X_train[:, 1],
                           c=y_train,
                           cmap=cm_bright,
                           edgecolors='k')
                # Plot the testing points
                ax.scatter(X_test[:, 0],
                           X_test[:, 1],
                           c=y_test,
                           cmap=cm_bright,
                           edgecolors='k',
                           alpha=0.6)

                ax.set_xlim(xx.min(), xx.max())
                ax.set_ylim(yy.min(), yy.max())
                ax.set_xticks(())
                ax.set_yticks(())
                if ds_cnt == 0:
                    ax.set_title(name)
                ax.text(xx.max() - .3,
                        yy.min() + .3, ('%.2f' % score).lstrip('0'),
                        size=15,
                        horizontalalignment='right')
                i += 1

        plt.show()
        return self
Example #55
0
    test_normal_size = int((1 - anom_perc) * n)
    test_anom_size = n - test_normal_size
    test_data = np.vstack(
        [
            np.random.multivariate_normal(mean=mu[0, :], cov=cov[0], size=test_normal_size),
            np.random.multivariate_normal(mean=mu[1, :], cov=cov[1], size=test_anom_size),
        ]
    )
    test_labels = np.hstack([np.repeat(0, test_normal_size), np.repeat(1, test_anom_size)])

    return train_normal_data, test_data, train_normal_labels, test_labels


if __name__ == "__main__":
    circle_X, circle_y = make_circles(10000, noise=0.01)
    circle_X = circle_X[circle_y == 1, :]
    circle_y = circle_y[circle_y == 1]

    pickle.dump((circle_X, circle_y), open("../data/circle_train_normal.pickle", "wb"))
    test_grid = np.array(
        np.meshgrid(
            np.linspace(circle_X[:, 0].min(), circle_X[:, 0].max(), 100),
            np.linspace(circle_X[:, 1].min(), circle_X[:, 1].max(), 100),
        )
    ).T.reshape(-1, 2)
    pickle.dump((test_grid, test_grid), open("../data/circle_test_normal.pickle", "wb"))

    scaler = MinMaxScaler()
    circle_X = scaler.fit_transform(circle_X)
    test_grid = np.array(np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))).T.reshape(
Example #56
0
def plot_classifiers():
    """
    Plot classifiers on synthetic datasets, taken from
    http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
    A comparison of several classifiers in scikit-learn on synthetic datasets. The point of this
    example is to illustrate the nature of decision boundaries of different classifiers. This
    should be taken with a grain of salt, as the intuition conveyed by these examples does not
    necessarily carry over to real datasets.
    Particularly in high-dimensional spaces, data can more easily be separated linearly, and the
    simplicity of classifiers such as naive Bayes and linear SVMs might lead to better
    generalization than is achieved by other classifiers.
    The plots show training points in solid colors and testing points semi-transparent. The lower
    right shows the classification accuracy on the test set.
    """
    h = .02  # step size in the mesh

    names = [
        "DummyClassifier", "Nearest Neighbors", "Decision Tree", "Naive Bayes",
        "Linear SVM", "RBF SVM", "Random Forest"
    ]
    classifiers = [
        DummyClassifier(strategy="prior"),
        KNeighborsClassifier(3),
        DecisionTreeClassifier(max_depth=5),
        GaussianNB(),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    ]

    X, y = make_classification(n_features=2,
                               n_redundant=0,
                               n_informative=2,
                               random_state=1,
                               n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)

    datasets = [
        make_moons(noise=0.3, random_state=0),
        make_circles(noise=0.2, factor=0.5, random_state=1), linearly_separable
    ]
    ds_names = [
        "Dataset moons", "Dataset circles", "Dataset linearly_separable"
    ]

    figure = plt.figure(figsize=(27, 9))
    i = 1
    # iterate over datasets
    for ds_name, ds in zip(ds_names, datasets):
        # preprocess dataset, split into training and test part
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        # just plot the dataset first
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        ax.set_title(ds_name)
        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0],
                   X_test[:, 1],
                   c=y_test,
                   cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        i += 1

        # iterate over classifiers
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, x_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            # Plot also the training points
            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
            # and testing points
            ax.scatter(X_test[:, 0],
                       X_test[:, 1],
                       c=y_test,
                       cmap=cm_bright,
                       alpha=0.6)

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            ax.set_title(name)
            ax.text(xx.max() - .3,
                    yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=15,
                    horizontalalignment='right')
            i += 1

    figure.subplots_adjust(left=.02, right=.98)
    plt.suptitle("Comparison of Classifiers in synthetic datasets",
                 fontsize=18)
    plt.show()