import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def Kmeans_cluster_analysis(x, y, n_clusters):
    # Stack the two coordinate arrays into an (n_samples, 2) matrix
    X = np.hstack((x.reshape((x.shape[0], 1)), y.reshape((y.shape[0], 1))))
    X = StandardScaler().fit_transform(X)
    km = KMeans(n_clusters)
    km.fit(X)
    labels = km.labels_
    cluster_centers = km.cluster_centers_

    labels_unique = set(labels)  # equivalently: np.unique(labels)
    n_clusters_ = len(labels_unique)

    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmyk' * 7  # one color code per cluster, cycling
    #colors = plt.cm.Spectral(np.linspace(0, 1, len(labels_unique)))
    for i in range(len(labels_unique)):
        my_members = labels == i
        cluster_center = cluster_centers[i]
        plt.scatter(X[my_members, 0], X[my_members, 1], s=90, c=colors[i], alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1], marker='+', s=280, c=colors[i])
    # Pad the axis limits by 3% of the data range
    tolx = (X[:, 0].max() - X[:, 0].min()) * 0.03
    toly = (X[:, 1].max() - X[:, 1].min()) * 0.03
    plt.xlim(X[:, 0].min() - tolx, X[:, 0].max() + tolx)
    plt.ylim(X[:, 1].min() - toly, X[:, 1].max() + toly)
    plt.show()
    return labels
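
# Usage sketch (an addition, not from the original source): synthetic blobs
# via sklearn's make_blobs stand in for real x/y data.
from sklearn.datasets import make_blobs

_xy, _ = make_blobs(n_samples=300, centers=3, random_state=0)
_labels = Kmeans_cluster_analysis(_xy[:, 0], _xy[:, 1], n_clusters=3)
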
def update_clusters():
    num_reviews = Review.objects.count()
    # Only recluster every `update_step` reviews (integer division)
    update_step = ((num_reviews // 100) + 1) * 5
    if num_reviews % update_step == 0:
        # Create a sparse matrix from user reviews
        all_usernames = [u.username for u in User.objects.only("username")]
        all_wine_ids = set(r.wine.id for r in Review.objects.only("wine"))
        num_users = len(all_usernames)
        # m is often used to denote a matrix
        ratings_m = dok_matrix((num_users, max(all_wine_ids) + 1), dtype=np.float32)
        for i in range(num_users):
            # each user corresponds to a row, in the order of all_usernames
            user_reviews = Review.objects.filter(user_name=all_usernames[i])
            for user_review in user_reviews:
                ratings_m[i, user_review.wine.id] = user_review.rating

        # Perform kmeans clustering
        k = int(num_users / 10) + 2
        kmeans = KMeans(n_clusters=k)
        clustering = kmeans.fit(ratings_m.tocsr())

        # Update clusters
        Cluster.objects.all().delete()
        new_clusters = {i: Cluster(name=i) for i in range(k)}
        for cluster in new_clusters.values():  # clusters need to be saved before referring to users
            cluster.save()
        for i, cluster_label in enumerate(clustering.labels_):
            new_clusters[cluster_label].users.add(User.objects.get(username=all_usernames[i]))
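
# Standalone sketch of the sparse-matrix pattern above, with hypothetical
# in-memory data instead of the Django models (User, Review, Cluster):
# one row per user, one column per item id, DOK for construction, CSR for fitting.
from scipy.sparse import dok_matrix
from sklearn.cluster import KMeans
import numpy as np

_ratings = {('alice', 1): 5.0, ('alice', 3): 3.0, ('bob', 1): 4.0, ('bob', 2): 2.0}
_users = sorted({u for u, _ in _ratings})
_m = dok_matrix((len(_users), max(i for _, i in _ratings) + 1), dtype=np.float32)
for (_u, _item), _r in _ratings.items():
    _m[_users.index(_u), _item] = _r
_labels = KMeans(n_clusters=2).fit_predict(_m.tocsr())
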
def csv_parser(fileName):
    data = open(fileName, 'r').readlines()
    outfile = fileName[:-4] + '_kmeans.csv'
    fhout = open(outfile, 'w')
    header = data[0].strip() + ',Label' + '\n'
    fhout.write(header)

    # Collect the two feature columns (fields 7 and 8) to cluster on
    vaf = []
    for line in data[1:]:
        flds = line.split(',')
        vaf.append([float(flds[7]), float(flds[8])])

    print(vaf[:5])

    vaf_np = np.array(vaf)
    print(len(vaf_np))
    print(vaf_np[:5])

    kmeansModel = KMeans(n_clusters=6, init='k-means++', n_init=100, max_iter=3000)

    labels = kmeansModel.fit_predict(vaf_np)

##    clustDist = kmeansModel.transform(vaf_np)
    print(labels[:30])

    # Append each row's cluster label and write it out
    for j in range(1, len(data)):
        outline = data[j].strip() + ',' + str(labels[j-1]) + '\n'
        fhout.write(outline)
    fhout.close()
    def fit(self, X):
        """Estimate the number of clusters by minimizing a penalized
        distortion criterion over k = 1 .. self._maxc.

        :param X: data matrix, shape (n_samples, n_features)
        :return: None; the selected k is stored in self._M
        """
        lcl = range(1, self._maxc + 1)

        # compute the within-cluster distortion for each candidate k
        ldistorsion = []
        for i in range(1, self._maxc + 1):
            cluster = KMeans(n_clusters=i, n_jobs=-1)
            cluster.fit(X)
            ldistorsion.append(within_scatter_matrix_score(X, cluster.labels_))

        print(X.shape[1])
        print(ldistorsion)

        # Penalize each distortion by k**(2/d), d = data dimensionality
        PCF = []
        for x, y in zip(ldistorsion, lcl):
            print(x, y, np.power(y, 2.0 / X.shape[1]))
            PCF.append(x * np.power(y, 2.0 / X.shape[1]))

        print(PCF)

        # np.argmin returns a 0-based index; map it back to a cluster count
        self._M = lcl[int(np.argmin(PCF))]
        print(self._M)
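
# Self-contained sketch of the same selection rule (an assumption: plain
# KMeans inertia stands in for within_scatter_matrix_score): penalize each
# distortion by k**(2/d) and pick the k that minimizes the product.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

_X, _ = make_blobs(n_samples=200, centers=4, random_state=0)
_d = _X.shape[1]
_pcf = [KMeans(n_clusters=k, random_state=0).fit(_X).inertia_ * np.power(k, 2.0 / _d)
        for k in range(1, 11)]
_best_k = int(np.argmin(_pcf)) + 1  # +1 because k starts at 1
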
    def treeGenerator(self, rootLabel, points, names):
        # rootLabel is the label of the root node
        # points is a list of feature vectors
        # names[i] is the name of the image the i-th feature vector came from
        if len(points) < self.threshold:
            # Too few points to split further: mark this node as a leaf
            self.adjancency[rootLabel] = []
            if rootLabel not in self.leafLabels:
                self.leafLabels.append(rootLabel)
            return
        else:
            localModel = KMeans(n_clusters=self.branches, n_jobs=4)
            localModel.fit(points)
            adj = []
            localTree = {}
            for center in localModel.cluster_centers_:
                self.treeMap[self.nodes] = center
                self.nodeImages[self.nodes] = []  # maps each node to the images it contains
                localTree[tuple(center)] = self.nodes
                adj.append(self.nodes)
                self.nodes = self.nodes + 1
            self.adjancency[rootLabel] = adj
            localClusterPoints = [[] for _ in range(self.branches)]
            localClusterImgNames = [[] for _ in range(self.branches)]
            # Route each feature vector and its image name to its cluster
            for i in range(len(points)):
                label = localModel.labels_[i]
                localClusterPoints[label].append(points[i])
                localClusterImgNames[label].append(names[i])
                node = localTree[tuple(localModel.cluster_centers_[label])]
                if names[i] not in self.nodeImages[node]:
                    self.nodeImages[node].append(names[i])
            # Recurse into each child cluster
            for i in range(self.branches):
                thisClusterCenter = tuple(localModel.cluster_centers_[i])
                self.treeGenerator(localTree[thisClusterCenter], localClusterPoints[i], localClusterImgNames[i])
def kmeans_cluster(G, graph_name, num_clusters):
    subgraphs = []
    # TODO: find a way to determine the number of clusters automatically
    write_directory = os.path.join(Constants.KMEANS_PATH, graph_name)
    if not os.path.exists(write_directory):
        os.makedirs(write_directory)
    nodeList = list(G.nodes())
    matrix_data = nx.to_numpy_matrix(G, nodelist=nodeList)
    kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
    kmeans.fit(matrix_data)
    label = kmeans.labels_
    clusters = {}

    # Group node ids by their cluster label
    for nodeIndex, nodeLabel in enumerate(label):
        if nodeLabel not in clusters:
            clusters[nodeLabel] = []
        clusters[nodeLabel].append(nodeList[nodeIndex])

    # Write each cluster out as its own subgraph
    for clusterIndex, subGraphNodes in enumerate(clusters.keys()):
        subgraph = G.subgraph(clusters[subGraphNodes])
        subgraphs.append(subgraph)
        nx.write_gexf(subgraph, os.path.join(write_directory, graph_name + str(clusterIndex) + Constants.GEXF_FORMAT))
    return num_clusters
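
# Hedged standalone sketch of the core step above (karate club graph used as
# stand-in data; nx.to_numpy_array is the modern spelling of to_numpy_matrix):
# cluster a graph's nodes on the rows of its adjacency matrix.
import networkx as nx
from sklearn.cluster import KMeans

_G = nx.karate_club_graph()
_nodes = list(_G.nodes())
_A = nx.to_numpy_array(_G, nodelist=_nodes)
_labels = KMeans(init='k-means++', n_clusters=3, n_init=10).fit(_A).labels_
_groups = {}
for _node, _lab in zip(_nodes, _labels):
    _groups.setdefault(_lab, []).append(_node)
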
def make_tsne_plot(model, rel_wds, plot_lims, title):

    dim = 30
    X, keys = make_data_matrix(model)

    # First run PCA down to `dim` components so that
    # tSNE has fewer dimensions to work with
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=dim)
    X = sklearn_pca.fit_transform(X_std)

    # Downsample: keep the relevant words plus a random subset
    k = 5000
    sample = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                np.random.choice(len(keys), k - 10, replace=False),
             ))
    X = X[sample, :]
    keys = [keys[i] for i in sample]

    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels, title, keys, plot_lims)
def run_kmeans(gene_folder, n_clusters):
	pars, fitness = load_all_generations_as_DataFrame(gene_folder)
	kmeans = KMeans(n_clusters=n_clusters)
	kmeans.fit(pars)
	# Per-cluster mean and std of the fitness metric (list comprehensions so
	# the results are materialized, not lazy map objects)
	means = [fitness[kmeans.labels_ == c].mean()['longest_interval_within_margin'] for c in range(n_clusters)]
	stds = [fitness[kmeans.labels_ == c].std()['longest_interval_within_margin'] for c in range(n_clusters)]
	return kmeans, means, stds
def create_fiveline(image):
    edges = cv2.Canny(image, 50, 150, apertureSize=3)

    ys = list()
    minLineLength = 1
    maxLineGap = 10

    # Pass the length/gap limits as keyword arguments; positionally they
    # would land on HoughLinesP's `lines` and `minLineLength` parameters
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 70,
                            minLineLength=minLineLength, maxLineGap=maxLineGap)

    for line in lines:
        for x1, y1, x2, y2 in line:
            cv2.line(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            if abs(y1 - y2) < 4:  # keep only (near-)horizontal lines
                innerlist = list()
                innerlist.append((y1 + y2) / 2)
                ys.append(innerlist)

    cv2.imwrite('images/houghlines.jpg', image)
    display_image(image)

    # Cluster the horizontal line heights into the five staff lines
    kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
    kmeans.fit(np.asarray(ys))

    fiveline = list()
    for innerlist in kmeans.cluster_centers_:
        fiveline.append(innerlist[0])

    fiveline.sort()
    print("K-MEANS centers")
    print(fiveline)
    return fiveline
    def partition_FOV_KMeans(self, tradeoff_weight=.5, fx=.25, fy=.25, n_clusters=4, max_iter=500):
        """
        Partition the FOV in clusters that are grouping pixels close in space and in mutual correlation

        Parameters
        ------------------------------
        tradeoff_weight: between 0 and 1, weights the contributions of distance and correlation in the overall metric
        fx, fy: downsampling factors to apply to the movie
        n_clusters, max_iter: KMeans algorithm parameters

        Outputs
        -------------------------------
        fovs: 2D array encoding the partitions of the FOV
        mcoef: matrix of pairwise correlation coefficients
        distanceMatrix: matrix of pixel distances
        """

        _, h1, w1 = self.shape
        self.resize(fx, fy)
        T, h, w = self.shape
        Y = np.reshape(self, (T, h * w))
        mcoef = np.corrcoef(Y.T)
        idxA, idxB = np.meshgrid(list(range(w)), list(range(h)))
        coordmat = np.vstack((idxA.flatten(), idxB.flatten()))
        distanceMatrix = euclidean_distances(coordmat.T)
        distanceMatrix = old_div(distanceMatrix, np.max(distanceMatrix))
        estim = KMeans(n_clusters=n_clusters, max_iter=max_iter)
        kk = estim.fit(tradeoff_weight * mcoef - (1 - tradeoff_weight) * distanceMatrix)
        labs = kk.labels_
        fovs = np.reshape(labs, (h, w))
        # dsize=(w1, h1) fixes the output size, so fx/fy scale factors are not needed
        fovs = cv2.resize(np.uint8(fovs), (w1, h1), interpolation=cv2.INTER_NEAREST)
        return np.uint8(fovs), mcoef, distanceMatrix
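
# Toy sketch of the tradeoff metric above, outside the movie class (random
# data stands in for the movie): blend pixel correlations with normalized
# pixel distances and cluster the rows of the blended matrix.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

_T, _h, _w = 50, 8, 8
_Y = np.random.rand(_T, _h * _w)            # stand-in movie, T frames of h*w pixels
_mcoef = np.corrcoef(_Y.T)                  # (h*w, h*w) pairwise pixel correlations
_ii, _jj = np.meshgrid(range(_w), range(_h))
_D = euclidean_distances(np.vstack((_ii.flatten(), _jj.flatten())).T)
_D /= _D.max()
_labels = KMeans(n_clusters=4, max_iter=500).fit(0.5 * _mcoef - 0.5 * _D).labels_
_fov = _labels.reshape(_h, _w)              # 2D partition of the field of view
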
def perform_cluster_analysis(dataset):

    filename = 'elbow_plot.dat'

    # Reuse cached results when available, otherwise sweep over k
    if os.path.exists(cpath + filename):
        data = joblib.load(cpath + filename)
        K = data[0]
        meandistortions = data[1]
    else:
        X = dataset
        print('X Shape: ', X.shape)

        #K = range(1, 50, 5)
        K = [1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
        #K = [1, 2, 5, 10, 50, 100]
        meandistortions = []
        cluster_centers = []
        for k in K:
            print(k)
            kmeans = KMeans(n_clusters=k, n_jobs=3)
            kmeans.fit(X)
            # Inertia is the within-cluster sum of squared distances; the
            # alternative below averages distances to the nearest center:
            #meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/X.shape[0])
            meandistortions.append(kmeans.inertia_)
            cluster_centers.append(kmeans.cluster_centers_)
        data = [K, meandistortions]
        joblib.dump(data, cpath + filename, compress=8)

    plot_name = "elbow_plot.png"
    title = 'Selecting k with the Elbow Method'
    xlabel = 'Number of Clusters (k)'
    ylabel = 'Average Distortion'
    xyplot(K, meandistortions, 0, 0, 0, 0, title, xlabel, ylabel, staticpath + plot_name, line=1, y_log=0)
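
# xyplot is project-specific; a plain-matplotlib rendering of the same elbow
# curve looks like this (stand-in K/distortion values, an assumption):
import matplotlib.pyplot as plt

_K = [1, 2, 5, 10, 50, 100]
_meandistortions = [900.0, 520.0, 230.0, 130.0, 45.0, 28.0]
plt.plot(_K, _meandistortions, 'bx-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Average Distortion')
plt.title('Selecting k with the Elbow Method')
plt.savefig('elbow_plot.png')
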
def reduce_colors(image, n_clusters):

	image = img_as_float(image)
	height = len(image)
	width = len(image[0])
	image = image.reshape((height * width, 3))

	kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=241)
	classes = kmeans.fit_predict(image)

	# Per-cluster mean and median colors
	means, medians = [], []
	for cl in range(n_clusters):
		means.append(np.mean(image[classes == cl], axis=0))
		medians.append(np.median(image[classes == cl], axis=0))

	image_mean = image.copy().astype(float)
	image_median = image.copy().astype(float)
	for cl in range(n_clusters):
		image_mean[classes == cl] = means[cl]
		image_median[classes == cl] = medians[cl]

	logging.info('Clusters: %s, PSNR(mean): %s, PSNR(median): %s' % (n_clusters, PSNR(image, image_mean), PSNR(image, image_median)))

	image_mean = image_mean.reshape(height, width, 3)

	# BytesIO (not StringIO) because plt.imsave writes binary PNG data
	string_image = BytesIO()
	plt.imsave(string_image, image_mean)

	return string_image
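
# Hedged usage sketch: assumes a local 3-channel image file plus the external
# PSNR helper that reduce_colors logs with; the returned BytesIO holds PNG bytes.
from skimage.io import imread

_img = imread('input.jpg')                  # hypothetical input path
_buf = reduce_colors(_img, n_clusters=8)
with open('reduced.png', 'wb') as _fh:
    _fh.write(_buf.getvalue())
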
def makecluster():

	n_clusters = 6
	model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)

	# Build a 16-point grid: x-coordinates in blocks of 4, y-coordinates cycling
	data = np.zeros((16, 2))
	data[0:4, :] = 2
	data[4:8, :] = 1
	data[8:12, :] = -1
	data[12:16, :] = -2
	data[(0, 4, 8, 12), 1] = 2
	data[(1, 5, 9, 13), 1] = 1
	data[(2, 6, 10, 14), 1] = -1
	data[(3, 7, 11, 15), 1] = -2

	model.fit(data)
	print(data)
	print(model.labels_)
def findColor(frame):
    t = time()
    # dim = np.array(frame.size)/2
    # frame.thumbnail(dim, Image.ANTIALIAS)
    # print("Thumbnail in %0.3f seconds." % (time() - t))
    # t = time()
    points = imresize(np.array(frame, dtype=np.float64), 0.3)
    w, h, d = points.shape
    data = np.reshape(points, (w * h, d))
    # Fit on a random third of the pixels, then predict on all of them
    sample = shuffle(data, random_state=0)[:len(data) // 3]
    print("Reshape and shuffle in %0.3f seconds." % (time() - t))
    t = time()
    kmeans = KMeans(n_clusters=k_colors, n_jobs=jobs).fit(sample)
    labels = kmeans.predict(data)
    print("Fit and predict in %0.3f seconds." % (time() - t))
    t = time()
    colors = [list(map(int, color)) for color in kmeans.cluster_centers_]
    # hsvs = np.array([rgb_to_hsv(*values) for values in colors])
    # frequent = np.argmax(hsvs[:,1])
    # frequent = colors[frequent]
    print("Found in %0.3f seconds." % (time() - t))
    # Count how many pixels fall into each cluster
    frequents = defaultdict(int)
    for l in labels:
        frequents[l] += 1
    frequents = sorted(frequents.items(), key=lambda x: x[1], reverse=True)
    frequents = [colors[i[0]] for i in frequents[:3]]
    # print("Counted in %0.3f seconds." % (time() - t))
    # print("Top 3 colors [RGB]: ", frequents[:3])
    return frequents[2] if len(frequents) == 3 else frequents[0]
def match_line_cluster(gdf1, gdf2):
    """
    Try to match two layers of linestrings with KMeans cluster analysis based
    on a triplet of descriptive attributes:
    (centroid coords., rounded length, approximate bearing)

    Parameters
    ----------
    gdf1: GeoDataFrame
        The reference dataset.
    gdf2: GeoDataFrame
        The collection of LineStrings to match.

    Returns
    -------
    matching_table: pandas.Series
        A table (index-based on *gdf1*) containing the id of the matching
        feature found in *gdf2*.
    """
    param1, param2 = list(map(mparams, [gdf1, gdf2]))
    k_means = KMeans(init='k-means++', n_clusters=len(gdf1),
                     n_init=10, max_iter=1000)
    k_means.fit(np.array((param1 + param2)))
    # First len(gdf1) labels belong to gdf1, the rest to gdf2
    df1 = pd.Series(k_means.labels_[:len(gdf1)])
    df2 = pd.Series(k_means.labels_[len(gdf1):])
#    gdf1['fid_layer2'] = \
#        df1.apply(lambda x: df2.where(gdf2['key'] == x).notnull().nonzero()[0][0])
    return pd.DataFrame(
        index=list(range(len(gdf1))),
        data=df1.apply(
            lambda x: df2.where(df2 == x).notnull().nonzero())
        )
def iris_h2o_vs_sciKmeans(ip, port):
  # Connect to a pre-existing cluster
  h2o.init(ip, port)  # connect to localhost:54321

  iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
  iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
  iris_sci = iris_sci[:, 0:4]

  # Shared starting centroids so both implementations are comparable
  s = [[4.9, 3.0, 1.4, 0.2],
       [5.6, 2.5, 3.9, 1.1],
       [6.5, 3.0, 5.2, 2.0]]

  start = h2o.H2OFrame(s)
  start_key = start.send_frame()

  h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

  sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
  sci_km.fit(iris_sci)

  # Log.info("Cluster centers from H2O:")
  print("Cluster centers from H2O:")
  h2o_centers = h2o_km.centers()
  print(h2o_centers)

  # Log.info("Cluster centers from scikit:")
  print("Cluster centers from scikit:")
  sci_centers = sci_km.cluster_centers_.tolist()
  print(sci_centers)

  for hcenter, scenter in zip(h2o_centers, sci_centers):
    for hpoint, spoint in zip(hcenter, scenter):
      assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def showClustering(data):
    kmeans = KMeans()
    kmeans.fit(data)
    labels = kmeans.labels_
    uniqueLabels = numpy.unique(labels)
    nCluster = len(uniqueLabels)
    centers = kmeans.cluster_centers_
    import matplotlib.pyplot as plt
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    plt.figure(1)
    plt.clf()
    for center in centers:
        print(center)
    # Plot each cluster's members and its center in the same color
    for k, col in zip(range(nCluster), colors):
        members = labels == k
        print("plotting %dth cluster" % k)
        print("label type", labels, type(labels))
        print("members are:", members, type(members))
        print("data[members,0]", data[members, 0], type(data[members, 0]))
        center = centers[k]
        plt.plot(data[members, 0], data[members, 1], col + '.')
        plt.plot(center[0], center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    plt.title("clusters")
    plt.show()
def main():
	songIds = open("songIDsofFirst100Users.txt", "r")
	try:
		for line in songIds:
			songIDsToCluster.append(int(line))
	finally:
		songIds.close()
	print(len(songIDsToCluster))

	f = sio.loadmat('/home/dmitriy/workspace/MLFinalProject/MatlabFiles/finalVectors.mat')

	full = np.nan_to_num(np.matrix(f['finalVectors']))
	# fullSplit = np.array_split(full, 360)

	# print("Done Reading")
	# mtx = fullSplit[0]
	# print(len(mtx))

	# Select the songs of interest and scale each column into [-1, 1]
	mtx = full[songIDsToCluster]
	mtx /= np.max(np.abs(mtx), axis=0)
	for clusters in range(25, 50):
		errors = 0
		num_clusters = clusters
		ClusteringKmeans = KMeans(n_clusters=num_clusters)
		ClusteringKmeans.fit(mtx)
		result = ClusteringKmeans.labels_
		#silhouette = metrics.silhouette_score(mtx,result,metric='euclidean')
		#plot(mtx,result)
		writeSongIDandClusterToFile(result, clusters)
		print("Clusters:", clusters, "Retest Error:", errors)
def kmeans_clustering(matrix, N):
    km = KMeans(n_clusters=N, n_jobs=-1)
    clusters = km.fit_predict(matrix)
    # res[c] collects the row indices assigned to cluster c
    res = [[] for _ in range(N)]
    for i, c in enumerate(clusters):
        res[c].append(i)
    return res
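
# Usage sketch: the return value maps each of the N clusters to the row
# indices of `matrix` assigned to it (random data as a stand-in).
import numpy as np

_mat = np.random.rand(100, 5)
_groups = kmeans_clustering(_mat, N=4)
for _c, _rows in enumerate(_groups):
    print('cluster %d: %d rows' % (_c, len(_rows)))
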
class AdvancedModel:

    clusters = []

    # price class regression
    price_reg = LinearRegression()

    def fit(self, X_train, y_train, n_clusters=4):
        y_train_mat = np.array(y_train).reshape((-1, 1))

        # 1. determine clusters (use the n_clusters argument, not a literal)
        self.km = KMeans(n_clusters=n_clusters)
        self.km.fit(y_train_mat)
        clusters = self.km.cluster_centers_
        cluster_indices = self.km.predict(y_train_mat)
        print(clusters)

        # 2. fit naive bayes
        #self.nb.fit(X_train, ...)

        # 3. train regression model
        #price_reg.fit

    def predict(self, X):
        pass

    def get_weights(self):
        return np.append(self.price_reg.coef_, [self.price_reg.intercept_])

    def set_weights(self, w):
        self.price_reg.coef_ = w[:-1]
        self.price_reg.intercept_ = w[-1]
def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """
    Given a word2vec model and a clustering method (choice of "kmeans" or
    "spectral"), make a plot of all word-vectors in the model.
    """
    X, keys = make_data_matrix(model)

    for i, key in enumerate(keys):
        X[i,] = model[key]

    if cluster == "kmeans":
        k_means = KMeans(n_clusters=8)
        labels = k_means.fit_predict(X)

    elif cluster == "spectral":
        sp_clust = SpectralClustering()
        labels = sp_clust.fit_predict(X)

    # Project to 2D with PCA for plotting
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)

    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels, title, keys, plot_lims)

    return sklearn_pca.explained_variance_ratio_
def cluster(dat):
	kmean=KMeans(init='k-means++', n_clusters=numclusters, n_init=10)
	y=kmean.fit_predict(dat)
	partition=[[] for i in range(numclusters)]
	for i in range(len(dat)):
		partition[y[i]].append(dat[i])
	return [partition,kmean]
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives same result as fit_predict
    # There's a very small chance of failure with elkan on unstructured dataset
    # because predict method uses fast euclidean distances computation which
    # may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    #     test_predict_equal_labels.  This test has the added effect of
    #     testing idempotence of the fitting procedure which appears to
    #     be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)

        assert_array_equal(labels_1, labels_2)
    def pca_k_means(self):
        if not self.pca_reduced:
            self.pc_analysis()
        kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)
        kmeans.fit(self.pca_reduced)  # KMeans is unsupervised; no target needed
        h = .02  # mesh step size
        x_min, x_max = self.pca_reduced[:, 0].min() - 1, self.pca_reduced[:, 0].max() + 1
        y_min, y_max = self.pca_reduced[:, 1].min() - 1, self.pca_reduced[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        # Predict the cluster of each mesh point to draw the decision regions
        Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.figure(1)
        plt.clf()
        plt.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                   cmap=plt.cm.Paired, aspect='auto', origin='lower')
        plt.plot(self.pca_reduced[:, 0], self.pca_reduced[:, 1], 'k.', markersize=2)
        centroids = kmeans.cluster_centers_
        labels = self.pca_labels = kmeans.labels_
        inertia = kmeans.inertia_
        plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10)
        plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
                  'Centroids are marked with white cross')
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.xticks(())
        plt.yticks(())
        return {'plt': plt, 'centroids': centroids, 'labels': labels, 'inertia': inertia}
    def initialize_hypers(self, W):
        mu_0 = W.mean(axis=(0,1))
        sigma_0 = np.diag(W.var(axis=(0,1)))

        # Set the global cov
        nu_0 = self._cov_model.nu_0
        self._cov_model.sigma_0 = sigma_0 * (nu_0 - self.B - 1)

        # Set the mean
        for c1 in range(self.C):
            for c2 in range(self.C):
                self._gaussians[c1][c2].mu_0 = mu_0
                self._gaussians[c1][c2].sigma = self._cov_model.sigma_0
                self._gaussians[c1][c2].resample()

        if self.special_case_self_conns:
            W_self = W[np.arange(self.N), np.arange(self.N)]
            self._self_gaussian.mu_0 = W_self.mean(axis=0)
            self._self_gaussian.sigma_0 = np.diag(W_self.var(axis=0))
            self._self_gaussian.resample()

        # Cluster the neurons based on their rows and columns
        from sklearn.cluster import KMeans
        features = np.hstack((W[:,:,0], W[:,:,0].T))
        km = KMeans(n_clusters=self.C)
        km.fit(features)
        self.c = km.labels_.astype(int)

        print("Initial c: ", self.c)
def Corpus_K_Means(TestSample, num_topic):
    Theta = TestSample.Theta
    ThetaPredict = np.zeros(Theta.shape)

    W = TestSample.Word
    W = np.array(W, dtype='double')

    # Use the KMeans centroids as the estimated topic matrix
    estimators = KMeans(n_clusters=num_topic, n_init=5)
    estimators.fit(W)
    BetaPredict = estimators.cluster_centers_

    # Solve a QP per column to project onto the topic simplex
    Q = 2 * BetaPredict.dot(BetaPredict.transpose())
    Q = matrix(Q)
    P = W.dot(BetaPredict.transpose())
    G = -np.eye(num_topic)
    G = matrix(G)
    h = np.zeros([num_topic, 1])
    h = matrix(h)
    A = np.ones([1, num_topic])
    A = matrix(A)
    b = matrix(1.0)

    solvers.options['show_progress'] = False

    for i in range(num_topic):
        p = matrix(P[[i], :].transpose())
        sol = solvers.qp(Q, p, G, h, A, b)
        ThetaPredict[:, [i]] = np.array(sol['x'])

    Err = ThetaPredict - Theta

    return np.square(np.linalg.norm(Err))
def re_classify_dict():
    dict_file = open("_dictionary.pickle", "rb")
    sc_list = cPickle.load(dict_file)
    sc_list = np.concatenate(sc_list)

    # Split the dictionary into high- and low-resolution halves
    Dh_dict = sc_list[:, 144:]
    Dl_dict = sc_list[:, :144]

    k_means = KMeans(n_clusters=15)
    k_means = k_means.fit(Dl_dict)
    y_predict = k_means.predict(Dl_dict)

    num = []
    y_tmp = np.asarray(y_predict, dtype=int) * 0 + 1
    for i in range(len(np.unique(y_predict))):
        num.append(np.sum(y_tmp[y_predict == i]))
    rand = np.asarray(num).argsort()  # class indices sorted by patch count, ascending

    classified_hdict = []
    for i in rand:
        predict_temp = y_predict == i
        classified_hdict.append(Dh_dict[predict_temp])
        print(len(classified_hdict[-1]))

    for i in range(9):
        x = i % 3
        y = i // 3
        # run one coefficient-coding test
        patch_show(classified_hdict[i+5][:100], [0.05 + x * 0.31, 0.05 + y * 0.31, 0.3, 0.3], i)

    plt.show()
def kmeans(content_list):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize,
                                       lowercase=False)
    '''
    tokenizer: the tokenizer function to use
    lowercase: whether to lowercase all text before tokenizing; since this
    is Chinese text, it is best left False
    '''
    tfidf_matrix = tfidf_vectorizer.fit_transform(content_list)
    num_clusters = 20
    km_cluster = KMeans(n_clusters=num_clusters, max_iter=300, n_init=8,
                        init='k-means++', n_jobs=8)
    '''
    n_clusters: the value of K
    max_iter: maximum number of iterations for a single initialization
    n_init: how many times the initial centroids are re-chosen
    init: the algorithm used to pick the initial centroids
    n_jobs: number of processes; -1 means saturate all CPUs
    Note that the computation for a single initialization always runs in a
    single process; the parallelism is only across different initializations.
    E.g. with n_init=10 and n_jobs=40 on a server whose 20 CPUs allow
    40 processes, only 10 processes will actually be started.
    '''
    # Return the cluster index assigned to each document
    result = km_cluster.fit_predict(tfidf_matrix)
    print("Predicting result: ", result)
    return result
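
# Hedged usage sketch: jieba_tokenize is the project's own tokenizer (not
# shown above); any list of at least num_clusters Chinese strings would do.
_docs = [u'我喜欢机器学习', u'机器学习很有趣', u'今天天气不错'] * 10
_result = kmeans(_docs)
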
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    # Compare H2O and scikit-learn K-Means for k = 2 .. 6
    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print("scikit centers")
        print(km_sci.cluster_centers_)
    def add_kmeans_col(self, max_iter=1000, n_init=10, n=4):
        '''Add a new k_means cluster column to X data'''
        logging.info('Adding kmeans %d clusters to X' % (n))
        km = KMeans(n_clusters=n, max_iter=max_iter, n_init=n_init)
        km.fit(self.X[:, 1:])  # XXX: This might not be kosher as it affects all of X
        self.models['km-col'] = km
        # Append the predicted cluster ids as a new column of X
        self.X = np.hstack((self.X, km.predict(self.X[:, 1:]).reshape(-1, 1)))