def calculateNumberOfIdealClusters(maxAmount, corpus):
	print "Initializing silhouette analysis"
	range_n_clusters = range(2, maxAmount) # max amount of clusters equal to amount of jobs

	silhouette_high = 0;
	silhouette_high_n_clusters = 2;

	for n_clusters in range_n_clusters:
		# Initialize the clusterer with n_clusters value
		cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", affinity="euclidean")
		cluster_labels = cluster.fit_predict(corpus)

		# The silhouette_score gives the average value for all the samples.
		# This gives a perspective into the density and separation of the formed clusters
		silhouette_avg = silhouette_score(corpus, cluster_labels)

		print "For n_clusters = %d, the average silhouette_score is: %.5f" % (n_clusters, silhouette_avg)

		if (silhouette_avg > silhouette_high):
		    silhouette_high = silhouette_avg
		    silhouette_high_n_clusters = n_clusters

		# Compute the silhouette scores for each sample
		sample_silhouette_values = silhouette_samples(corpus, cluster_labels)

	print ("Highest score = %f for n_clusters = %d" % (silhouette_high, silhouette_high_n_clusters))
	return silhouette_high_n_clusters
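
A minimal usage sketch for this helper, assuming the scikit-learn imports it relies on and a made-up feature matrix (the data, and the affinity keyword, which newer scikit-learn versions rename to metric, are assumptions, not part of the original snippet):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples

corpus = np.random.RandomState(0).rand(20, 5)  # stand-in for vectorized documents
best_k = calculateNumberOfIdealClusters(10, corpus)
print("Suggested number of clusters:", best_k)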
Code example #2
def programmer_3():

    standardizedfile = "data/standardized.xls"
    k = 3
    data = pd.read_excel(standardizedfile, index_col=u"基站编号")

    # hierarchical clustering
    model = AgglomerativeClustering(n_clusters=k, linkage="ward")
    model.fit(data)

    # attach each record's cluster label to the original data
    r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1)
    r.columns = list(data.columns) + [u"聚类类别"]

    # plot the clusters, drawing each cluster with a different line style
    style = ["ro-", "go-", "bo-"]
    xlabels = [u"工作日人均停留时间", u"凌晨人均停留时间", u"周末人均停留时间", u"日均人流量"]
    pic_output = "tmp/type_"

    for i in range(k):
        plt.figure()
        tmp = r[r[u"聚类类别"] == i].iloc[:, :4]
        for j in range(len(tmp)):
            plt.plot(range(1, 5), tmp.iloc[j], style[i])

        plt.xticks(range(1, 5), xlabels, rotation=20)

        plt.title(u"商圈类别%s" % (i + 1))
        # adjust the bottom margin
        plt.subplots_adjust(bottom=0.15)
        plt.savefig(u"%s%s.png" % (pic_output, i + 1))
Code example #3
def clustering_tweets_hc(labeled_tweets, num_cluster):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(labeled_tweets).toarray()
    # print(tweet_vec)
    n_clusters = num_cluster

    from sklearn.neighbors import kneighbors_graph

    knn_graph = kneighbors_graph(tweet_vec, 1, include_self=False)
    # print(knn_graph)

    connectivity = knn_graph
    from sklearn.cluster import AgglomerativeClustering

    model = AgglomerativeClustering(linkage='ward', connectivity=connectivity, n_clusters=n_clusters)
    model.fit(tweet_vec)
    c = model.labels_
    # print(c,len(c))

    clustered_tweets = []
    for i in range(0, num_cluster):
        similar_indices = (c == i).nonzero()[0]
        sent = ''
        for sid in similar_indices:
            sent = labeled_tweets[sid] + ' ' + sent
        clustered_tweets.append(sent)
    return clustered_tweets
Code example #4
File: cluster.py Project: Sandy4321/nba-analysis-2
def cluster_agg(cluster_data):
    clstr = AgglomerativeClustering(n_clusters=11, linkage='ward')
    clstr.fit(cluster_data)

    df['tier'] = clstr.labels_
    results = df[['Player', 'tier']]
    return results
Code example #5
def test_agglomerative_clustering_with_distance_threshold(linkage):
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering with distance_threshold.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)  # np.bool was removed from NumPy
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    # test when distance threshold is set to 10
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn, linkage=linkage)
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))
        # test if the clusters produced match the point in the linkage tree
        # where the distance exceeds the threshold
        tree_builder = _TREE_BUILDERS[linkage]
        children, n_components, n_leaves, parent, distances = \
            tree_builder(X, connectivity=conn, n_clusters=None,
                         return_distance=True)
        num_clusters_at_threshold = np.count_nonzero(
            distances >= distance_threshold) + 1
        # test number of clusters produced
        assert num_clusters_at_threshold == num_clusters_produced
        # test clusters produced
        clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
                                        children=children,
                                        n_leaves=n_leaves)
        assert np.array_equiv(clusters_produced,
                              clusters_at_threshold)
Code example #6
File: process.py Project: meganbarnes/HRC
def buckshot(k, mat):
    size = int((k*mat.shape[0])**.5)
    print(size)
    samp = np.zeros((size, mat.shape[1]))
    inds = np.random.randint(0, mat.shape[0], size)
    print(inds)
    
    for i in range(size):
        samp[i] = mat[inds[i]]
        
    #agglomerative clustering on sample
    hier = AgglomerativeClustering(n_clusters=k, linkage='average', affinity='euclidean', compute_full_tree=True)
    flat = hier.fit_predict(samp)
    
    centroids = []
    #find centroids
    for j in range(k):
        i_s = [i for i, l in enumerate(flat) if l == j]
        print(len(i_s))
        points = [samp[m] for m in i_s]
        points = np.array(points)
        cent = np.mean(points, axis=0)
        centroids.append(cent)
    
    return centroids
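
The buckshot heuristic typically seeds k-means with these centroids; a hypothetical follow-up (the KMeans call and the random data are assumptions, not part of the original snippet):

import numpy as np
from sklearn.cluster import KMeans

mat = np.random.RandomState(0).rand(200, 8)
centroids = np.array(buckshot(5, mat))
km = KMeans(n_clusters=5, init=centroids, n_init=1).fit(mat)  # seed with buckshot centroids
print(km.labels_[:10])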
Code example #7
    def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):

        BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
        END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING

        data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

        labels = None
        if clusterType == 'kmeans':
            kmeans = KMeans(n_clusters=N_CLUSTERS)
            kmeans.fit(data)
            labels = kmeans.labels_
        elif clusterType == 'affinity_propagation':
            ap = AffinityPropagation(damping=0.75)
            ap.fit(data)
            labels = ap.labels_
            N_CLUSTERS = np.max(labels)+1
        elif clusterType == 'DBSCAN':
            dbscan = DBSCAN()
            dbscan.fit(data)
            labels = dbscan.labels_
            N_CLUSTERS = np.max(labels)+1
            print('N_CLUSTERS=' + str(N_CLUSTERS))
        elif clusterType == 'AgglomerativeClustering':
            ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
            ac.fit(data)
            labels = ac.labels_
        else:
            print('ERROR: clusterType: ' + clusterType + ' is not recognized')

        return (labels, N_CLUSTERS)
Code example #8
def __generate_dummy_data():
    from sklearn.cluster import AgglomerativeClustering
    import itertools
    X = np.array([[
         -5.27453240e-01,  -6.14130238e-01,  -1.63611427e+00,
         -9.26556498e-01,   7.82296885e-01,  -1.06286220e+00,
         -1.24368729e+00,  -1.16151964e+00,  -2.25816923e-01,
         -3.32354552e-02],
       [ -2.01273137e-01,   5.25758359e-01,   1.37940072e+00,
         -7.63256657e-01,  -1.27275323e+00,  -1.31618084e+00,
         -7.00167331e-01,   2.21410669e+00,   9.15456567e-01,
          7.93076923e-01],
       [  1.53249104e-01,  -5.48642411e-01,  -1.06559060e+00,
         -3.05253203e-01,  -1.93393495e+00,   1.39827978e-01,
          1.73359830e-01,   2.85576854e-02,  -1.19427027e+00,
          1.04395610e+00],
       [  1.00595172e+02,   1.01661346e+02,   1.00115635e+02,
          9.86884249e+01,   9.86506406e+01,   1.02214982e+02,
          1.01144087e+02,   1.00642778e+02,   1.01635339e+02,
          9.88981171e+01],
       [  1.01506262e+02,   1.00525318e+02,   9.93021764e+01,
          9.92514163e+01,   1.01199015e+02,   1.01771241e+02,
          1.00464097e+02,   9.97482396e+01,   9.96888274e+01,
          9.88297336e+01]])
    model = AgglomerativeClustering(linkage="average", affinity="cosine")
    model.fit(X)
    ii = itertools.count(X.shape[0])
    DEBUG(str([{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]))
    return model, model.labels_
Code example #9
 def sp_connectivity(self, X, connectivity, n_clusters, n):
     model = AgglomerativeClustering(linkage="ward",
                                     connectivity=connectivity,
                                     n_clusters=n_clusters)
     # fit_predict fits the model and returns one cluster label per sample
     y = model.fit_predict(X, None)
     return y
Code example #10
 def knn_connectivity(self, X):
     knn_graph = kneighbors_graph(X, 30, include_self=False)

     for connectivity in (None, knn_graph):
         n_clusters = 4
         plt.figure(figsize=(10, 4))
         for index, linkage in enumerate(('average', 'complete', 'ward')):
             plt.subplot(1, 3, index + 1)
             model = AgglomerativeClustering(linkage=linkage,
                                             connectivity=connectivity,
                                             n_clusters=n_clusters)
             t0 = time.time()
             model.fit(X)
             elapsed_time = time.time() - t0
             # plt.cm.spectral was removed from matplotlib; nipy_spectral replaces it
             plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                         cmap=plt.cm.nipy_spectral)
             plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                       fontdict=dict(verticalalignment='top'))
             plt.axis('equal')
             plt.axis('off')

             plt.subplots_adjust(bottom=0, top=.89, wspace=0,
                                 left=0, right=1)
             plt.suptitle('n_cluster=%i, connectivity=%r' %
                          (n_clusters, connectivity is not None), size=17)

     plt.show()
Code example #11
File: pretraining.py Project: PonteIneptique/pandora
    def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
        # collect embeddings for mfi:
        X = np.asarray([self.w2v_model[w] for w in self.mfi \
                            if w in self.w2v_model], dtype='float32')
        # dimension reduction:
        tsne = TSNE(n_components=2)
        coor = tsne.fit_transform(X) # unsparsify

        plt.clf()
        sns.set_style('dark')
        plt.rcParams['axes.linewidth'] = 0.4  # the sns.plt alias was removed from seaborn
        fig, ax1 = plt.subplots()

        labels = self.mfi
        # first plot slices:
        x1, x2 = coor[:,0], coor[:,1]
        ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
        # clustering on top (add some colouring):
        clustering = AgglomerativeClustering(linkage='ward',
                            affinity='euclidean', n_clusters=nb_clusters)
        clustering.fit(coor)
        # add names:
        for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
            ax1.text(x, y, name, ha='center', va="center",
                     color=plt.cm.nipy_spectral(cluster_label / 10.),
                     fontdict={'family': 'Arial', 'size': 8})
        # control aesthetics:
        ax1.set_xlabel('')
        ax1.set_ylabel('')
        ax1.set_xticklabels([])
        ax1.set_xticks([])
        ax1.set_yticklabels([])
        ax1.set_yticks([])
        plt.savefig(outputfile, bbox_inches=0)
Code example #12
File: gitproject.py Project: wvanamstel/project
    def clustering_approach(self):
        '''
        Cluster user data using various clustering algos
        IN: self.df_full and self.labels
        OUT: results to stdout
        '''
        print('Fitting clustering model')
        X = self.df_full.values
        y = self.labels

        # scale data
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

        # KMeans
        km_clf = KMeans(n_clusters=2, n_jobs=6)
        km_clf.fit(X)

        # swap labels as super-users are in cluster 0 (messy!!)
        temp = y.apply(lambda x: 0 if x == 1 else 1)
        print('\nKMeans clustering: ')
        self.analyse_preds(temp, km_clf.labels_)

        # Agglomerative clustering
        print('\nAgglomerative clustering approach: ')
        ac_clf = AgglomerativeClustering()
        ac_labels = ac_clf.fit_predict(X)
        self.analyse_preds(y, ac_labels)

        return None
Code example #13
 def Create_Ext_Agg_cluster(self,stem,stop,processing,remS): 
      
     Allrow_dicts=data_pkg.FileHandling.read_csv(self.ExtStringCSv)
     Allstrings=list()
     #Allstrings=[rowdict_str["Text_original"] for rowdict_str in Allrow_dicts]
     for row_dict in Allrow_dicts:
         if self.POS =="ALL_EXT":
             Stringrow=row_dict["Text_original"]+row_dict["Adj_Extended"]+row_dict["Noun_Extended"] +row_dict["Verb_Extended"]
             Allstrings.append(Stringrow)
         else:
             Stringrow=row_dict["Adj"]+row_dict["Adj_Extended"]+row_dict["Noun"]+row_dict["Noun_Extended"]#+row_dict["Verb"]#+row_dict["Verb_Extended"]
             Allstrings.append(Stringrow)
              
     Allstrings_process=[preprocess_text.preprocess(string_text, stem,stop) for string_text in Allstrings]  
      
     if remS:
         Allstrings_process=[preprocess_text.removeS(text) for text in Allstrings_process]            
     vectorizer = CountVectorizer()    
     term_doc=vectorizer.fit_transform(Allstrings_process)
     #-------------------------- feature_names=vectorizer.get_feature_names()
     #--z---------------------------------------------- Array=term_doc.toarray
      
     if self.affinity=='euclidean':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,affinity='euclidean')
     if self.affinity=='cosine':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,linkage='average',affinity='cosine')
     Res_Labels=Agg_cluster.fit_predict(term_doc.toarray())
     self.cluster_tup_list=self.tuple_Ext_cluster_doc(Res_Labels,Allstrings,Allrow_dicts)
     #term_doc_lsa = lsa.fit_transform(term_doc)
     print(type(term_doc))
     self.metric=metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
     print(Res_Labels)
     print("n_samples: %d, n_features: %d" % term_doc.shape) 
Code example #14
def clustering(data, params):

    # parse parameters; exec() cannot bind new local names inside a
    # Python 3 function, so unpack the expected keys explicitly
    n_clusters = params['n_clusters']
    affinity = params['affinity']
    linkage = params['linkage']

    # apply Agglomerative Clustering to reduced data

    clusters = AgglomerativeClustering(n_clusters=n_clusters,
                                       affinity=affinity, linkage=linkage)
    clusters.fit(data)

    # Agglomerative Clustering does not give centers of clusters
    # so lets try the mean of each cluster

    cluster_centers = []
    for i in range(n_clusters):
        mask = (clusters.labels_ == i)
        cluster_centers.append(mean(data[mask], axis=0))
    cluster_centers = array(cluster_centers)

    return [cluster_centers, clusters.labels_]
Code example #15
File: anomalyDetection.py Project: JWeel/Sjoemel
def agglom(data, n_clusters):
    knn_graph = kneighbors_graph(data, 30, include_self=False)
    
    cluster = AgglomerativeClustering(n_clusters=n_clusters, connectivity=knn_graph, linkage='ward') # use ward / average / complete for different results
    # fit_predict fits the model and returns the labels, so a separate fit is redundant
    return cluster.fit_predict(data)
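
A small, hypothetical call for this helper (the random data is an assumption; agglom still needs kneighbors_graph and AgglomerativeClustering in scope):

import numpy as np

data = np.random.RandomState(0).rand(100, 2)
labels = agglom(data, n_clusters=3)
print(np.bincount(labels))  # size of each of the 3 clusters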
Code example #16
File: cluster.py Project: ahnqirage/avenir
def train_agglomerative():
	print "starting agglomerative clustering..."
	model = AgglomerativeClustering(n_clusters=num_clusters, affinity=aggl_affinity,  
	linkage=aggl_linkage)
	model.fit(X)
	labels = model.labels_	
	print labels
Code example #17
File: hcm.py Project: harrylclc/ist557
def eval_dist(linkage='ward'):
    a_score = []
    idx = []
    d = [[] for i in range(3)]
    for k in range(2, 50 + 1):
        print('k={}'.format(k))
        est = AgglomerativeClustering(n_clusters=k, linkage=linkage)
        est.fit(x)
        ari_v = metrics.adjusted_rand_score(y, est.labels_)
        ds = calc_distance(k, est.labels_)
        for i in range(3):
            d[i].append(ds[i])
        print(ari_v)
        a_score.append(ari_v)
        idx.append(k)
    fig, axes = plt.subplots(nrows=1, ncols=2)
    axes[0].plot(idx, a_score)
#     plt.xlim(0, 220)
    axes[0].set_ylim(ymin=0)
    axes[0].set_ylabel('ARI')
    axes[0].set_xlabel('# of clusters')
#     plt.savefig('figs/hc_ari.png')
#     plt.show()
#     plt.close()
    labels = ['Minimum', 'Maximum', 'Average']
#     for i in xrange(3):
#         axes[1].plot(idx, d[i], label=labels[i])
    axes[1].plot(idx, d[1], label=labels[1])
    axes[1].legend()
    axes[1].set_ylabel('distance')
    axes[1].set_xlabel('# of clusters')
#     plt.savefig('figs/hc_distance.png')
    plt.show()
Code example #18
File: clusterings.py Project: thran/experiments2.0
def hierarchical(similarity, concepts=2, euclid=False):
    if euclid:
        model = AgglomerativeClustering(n_clusters=concepts)
        return model.fit_predict(similarity)
    else:
        model = AgglomerativeClustering(n_clusters=concepts, affinity='precomputed', linkage='complete')
        return model.fit_predict(1 - similarity)
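
A minimal sketch of calling this with a precomputed similarity matrix (the cosine_similarity input is an assumption, not part of the original snippet):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

docs = np.random.RandomState(0).rand(6, 4)
sim = cosine_similarity(docs)           # similarities in [0, 1] for nonnegative data
print(hierarchical(sim, concepts=2))    # clusters on the 1 - similarity distances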
Code example #19
File: w2v_greedy.py Project: tuywen/bishe_code
def Word2VecReduction(senlist, w2vec, ratio):
  slen = len(senlist)
  word_matrix = []
  word2label = {}
  idx2word = {}
  useword = set([])
  cnt = 0
  for i in range(0, slen):
    for word in senlist[i].word_used:
      if word not in useword: #and word in w2vec:
        idx2word[cnt] = word
        cnt += 1
        useword.add(word)
        word_matrix.append(w2vec[word])
  wlen = len(useword)
  print "use words:", wlen
  
  nclusters = max(int(0.9*wlen), 100)
  print(nclusters)
  AgloCluster = AgglomerativeClustering(n_clusters=nclusters,linkage="average", affinity='cosine')
  AgloCluster.fit(word_matrix)
  AgloCluster_labels = AgloCluster.labels_
  
  for i in range(0, wlen):
    word2label[idx2word[i]] = AgloCluster_labels[i]

  for i in range(0, slen):
    senlist[i].sen_words = [ str(word2label[w]) for w in senlist[i].word_used]
    senlist[i].word_dict = {}
    #print senlist[i].sen_words
  return
Code example #20
def test_connectivity_propagation():
    # Check that connectivity in the ward tree is propagated correctly during
    # merging.
    X = np.array(
        [
            (0.014, 0.120),
            (0.014, 0.099),
            (0.014, 0.097),
            (0.017, 0.153),
            (0.017, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.152),
            (0.018, 0.149),
            (0.018, 0.144),
        ]
    )
    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(n_clusters=4, connectivity=connectivity, linkage="ward")
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
Code example #21
def wardHierarchical(img):
    print("Compute structured hierarchical clustering...")
    st = time.time()
    n_clusters = 15  # number of regions

    # downscale first, so the connectivity graph, the feature matrix and
    # the reshaped labels all refer to the same (resized) image
    face = sp.misc.imresize(img, 0.10) / 255.
    connectivity = grid_to_graph(*face.shape)
    ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                   connectivity=connectivity)
    X = np.reshape(face, (-1, 1))
    ward.fit(X)
    label = np.reshape(ward.labels_, face.shape)
    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)


    plt.figure(figsize=(5, 5))
    plt.imshow(face, cmap=plt.cm.gray)
    for l in range(n_clusters):
        plt.contour(label == l, contours=1,
                    colors=[plt.cm.nipy_spectral(l / float(n_clusters)), ])
    plt.xticks(())
    plt.yticks(())
    plt.show()
Code example #22
File: Word_Classes.py Project: jonathandunn/c2xg
	def agglomerative_clusters(self, word_vectors):
	
		#Pre-calculate BallTree object
		starting = time.time()
		Ball_Tree = BallTree(word_vectors, leaf_size = 200, metric = "minkowski")
		print("BallTree object in " + str(time.time() - starting))
		
		#Pre-calculate k_neighbors graph
		starting = time.time()
		connectivity_graph = kneighbors_graph(Ball_Tree, 
						n_neighbors = 1, 
						mode = "connectivity", 
						metric = "minkowski", 
						p = 2, 
						include_self = False, 
						n_jobs = workers
						)
		print("Pre-compute connectivity graph in " + str(time.time() - starting))

		#Agglomerative clustering
		starting = time.time()
		Agl = AgglomerativeClustering(n_clusters = 100, 
										affinity = "minkowski", 
										connectivity = connectivity_graph, 
										compute_full_tree = True, 
										linkage = "average"
										)
		
		Agl.fit(word_vectors)
		print("Agglomerative clustering in " + str(time.time() - starting))
		
		clusters = Agl.labels_
		
		return clusters
Code example #23
def agglomClus(distmat,k,sendData=False):
    '''
    For all the TPD matrices captured by pairwise distmat, uses sklearn to hierarchically cluster
    if meth=agglomerative, bottom up
    k number of clusters
    '''
    from scipy.cluster.hierarchy import dendrogram
    from sklearn.datasets import load_iris
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import pairwise_distances
    from matplotlib import pyplot as plt
    import itertools
    
    #put the calculated (generalized Manhattan) inter-matrix distances into array of floats
    diMat = []
    dists = csv.reader(open(distmat, 'r',newline='\n'))
    for row in dists:
        diMat.append(row)
    disArr = np.array(diMat)#pairwise dist mat as strings
    diArr = disArr.astype(float)#now as floats
    #distMat_cond = squareform(diArr)#turns redundant, square into condensed, triangular
    
    #set and fit the agglomerative clustering model
    mclus = AgglomerativeClustering(n_clusters = k, affinity='precomputed',linkage='complete')
    clusfit = mclus.fit(diArr)
    labels = clusfit.labels_
    #print(labels)
    
    #From PCA-based data, pull in the string names of chords in order
    chdnames = csv.reader(open('n10_PCA/562TPDmatrixSim kmed 200_n10PCA.csv', 'r',newline='\n'))
    #these for some other topN
    #chdnames = csv.reader(open('7470TPDmatrixSim kmed 50_n10PCA.csv', 'r',newline='\n'))
    #chdnames = csv.reader(open('2510TPDmatrixSim kmed 500_n10PCA.csv', 'r',newline='\n'))
    chdnamesit = []
    for row in chdnames:
        chdnamesit.append(row)
    chdnameslst = []
    for i,chd in enumerate(chdnamesit):
        if i<2: continue
        chdnameslst.append(chd[0])
    #print(chdnameslst)
    
    #output agglomerative mergings as csv if sendData==True
    if sendData:
        ii = itertools.count(diArr.shape[0])
        nodelst = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in clusfit.children_]
        csvName = 'agglom_testing.csv'
        file = open(csvName, 'w',newline='\n')
        lw = csv.writer(file)
        for row in nodelst:
            vals = []
            for key,value in row.items():
                vals.append(value)
            lw.writerow(vals)
    
    #plot a dendrogram of the agglomerative hierarchical clustering
    plt.title('Hierarchical Clustering Dendrogram')
    plot_dendrogram(clusfit,labels=chdnameslst,show_leaf_counts=True,leaf_font_size=8,leaf_rotation=45)#labels=clusfit.labels_
    plt.show()
Code example #24
def clusterWithSimMatrix(simMatrix, num):
  clustering = AgglomerativeClustering(n_clusters=num,
                                       affinity='precomputed',
                                       linkage='complete')
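  # note: with affinity='precomputed', scikit-learn treats the input as a
  # *distance* matrix; if simMatrix holds similarities, pass e.g. 1 - simMatrix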
  #clustering = MiniBatchKMeans(n_clusters=num, init='k-means++', n_init=1,
  #				 init_size=1000, batch_size=1000, verbose=opts.verbose)
  clustering.fit(simMatrix)
  return clustering
Code example #25
 def agglomerative_clustering(self, samples):
     affinityArg = self.metric
     if self.metric == "gaussian":
         affinityArg = similairty_metrics.gaussianSimGraph
         
     ac = AgglomerativeClustering(linkage = self.linkage, n_clusters=self.num_clusters, affinity = affinityArg)
     ac.fit(samples)
     return ac.labels_
Code example #26
File: analysis.py Project: vladislive/transferparser
 def pca_ward_tree(self):
     if not self.pca_reduced:
         self.pc_analysis()
     reduced_red = manifold.SpectralEmbedding(n_components=2).fit_transform(self.pca_reduced)
     clustering = AgglomerativeClustering(linkage='ward', n_clusters=3)
     clustering.fit(self.pca_reduced)
     self._plot_ward_tree(reduced_red, self.pca_reduced, self.player_value, clustering.labels_)
     return plt
Code example #27
File: catTPD.py Project: andrewdjones/YJaMP_Analysis
def agglomClusCat(distmat,k,crit):
    '''
    For all the TPD matrices captured by pairwise distmat, uses sklearn to hierarchically cluster
    k is number of clusters
    crit is criterion for fcluster ('distance' best option)
    '''
    import sklearn
    from scipy.cluster.hierarchy import dendrogram
    from scipy.cluster.hierarchy import fcluster
    from sklearn.datasets import load_iris
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import pairwise_distances
    from matplotlib import pyplot as plt
    import itertools
    
    #put the calculated (generalized Manhattan) inter-matrix distances into array of floats
    diMat = []
    dists = csv.reader(open(distmat, 'r',newline='\n'))
    for row in dists:
        diMat.append(row)
    disArr = np.array(diMat)#pairwise dist mat as strings
    diArr = disArr.astype(float)#now as floats
    #distMat_cond = squareform(diArr)#turns redundant, square into condensed, triangular
    
    #set and fit the agglomerative clustering model
    mclus = AgglomerativeClustering(n_clusters = k, affinity='precomputed',linkage='complete')
    clusfit = mclus.fit(diArr)
    labels = clusfit.labels_
    #print(labels)
    
    #From PCA-based data, pull in the string names of chords in order
    chdnames = csv.reader(open('n10_PCA/562TPDmatrixSim kmed 200_n10PCA.csv', 'r',newline='\n'))
    #these for some other topN
    #chdnames = csv.reader(open('7470TPDmatrixSim kmed 50_n10PCA.csv', 'r',newline='\n'))
    #chdnames = csv.reader(open('2510TPDmatrixSim kmed 500_n10PCA.csv', 'r',newline='\n'))
    chdnamesit = []
    for row in chdnames:
        chdnamesit.append(row)
    chdnameslst = []
    for i,chd in enumerate(chdnamesit):
        if i<2: continue
        chdnameslst.append(chd[0])
    #print(chdnameslst)
    
    #now make a dendrogram and/or flat clustering assignments
    #plot_dendrogram(clusfit,labels=chdnameslst,show_leaf_counts=True,leaf_font_size=8,leaf_rotation=45)#labels=clusfit.labels_
    clusters = fcluster(plot_dendrogram(clusfit,labels=chdnameslst,show_leaf_counts=True,leaf_font_size=8,leaf_rotation=45),k,criterion=crit)
    assigns = []
    for i in range(200):
        assigns.append([clusters[i],chdnameslst[i]])
    sassigns = sorted(assigns,key=operator.itemgetter(0))
    
    #send out the leaf cluster membership data
    csvName = 'truncDend_memb_test.csv'
    file = open(csvName, 'w',newline='\n')
    lw = csv.writer(file)
    for row in sassigns:
        lw.writerow(row)
Code example #28
def get_topics(X_lsi, text_names, nk=1):
    ag = AgglomerativeClustering(n_clusters=nk, affinity='cosine', linkage='average')
    topics = ag.fit_predict(X_lsi)
    paper_to_topic = defaultdict(int)
    topic_to_papers = defaultdict(list)
    for paper,topic in zip(text_names,topics):
        paper_to_topic[paper] = topic
        topic_to_papers[topic].append(paper)
    return (paper_to_topic, topic_to_papers)
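
A hypothetical call (random vectors stand in for the LSI document matrix; the function also assumes defaultdict and AgglomerativeClustering are imported):

import numpy as np

X_lsi = np.random.RandomState(0).rand(8, 5)
names = ['paper%d' % i for i in range(8)]
paper_to_topic, topic_to_papers = get_topics(X_lsi, names, nk=2)
print(dict(topic_to_papers))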
Code example #29
 def CreateCluster(self):
     # np.load accepts a filename directly, so no separate file handle is needed
     SimArray = np.load(self.DistanceFile)

     print(SimArray)
     AggClusterDistObj = AgglomerativeClustering(n_clusters=self.num_cluster, linkage='average', affinity=self.affinity)
     Res_Labels = AggClusterDistObj.fit_predict(SimArray)
     print(Res_Labels)
Code example #30
File: __init__.py Project: matt-leach/flask-cluster
def hierarchical(X, num_clusters):
    """
    Hierarchical Clustering on X for response y
    Returns array of cluster groups
    """
    model = AgglomerativeClustering(n_clusters=num_clusters)
    cleanX = preprocessing.scale(X.to_numpy())  # as_matrix() was removed from pandas
    model.fit(cleanX)
    return model.labels_
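
A minimal sketch of calling this with a small DataFrame (the data is an assumption; preprocessing and AgglomerativeClustering must be imported as in the original module):

import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.RandomState(0).rand(12, 3), columns=list('abc'))
print(hierarchical(X, num_clusters=3))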
Code example #31
def cluster(X,n_clusters):
    from sklearn.cluster import AgglomerativeClustering
    clustering = AgglomerativeClustering(n_clusters=n_clusters,linkage="complete")
    pred = clustering.fit_predict(X)
    return pred
Code example #32
 def run_algorithm_for_k(k, linkage):
     cluster = AgglomerativeClustering(n_clusters=k, affinity='euclidean', linkage=linkage)
     cluster.fit(x)
     score = (metrics.silhouette_score(x, cluster.labels_, metric='euclidean'))
     silhouette_scores.append(score)
("The festival was generally well received by locals, and businesses in the area would typically put up signs welcoming festival-goers to their town.", "Music"),
("As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. ", "Music"),
("Cornerstone held its final festival in 2012 and no longer operates.", "Music"),

("Beginning in 1908, the Truman Pioneer Stud Farm in Bushnell was home to one of the largest horse shows in the Midwest.", "Horse show"),
("The show was well known for imported European horses.", "Horse show"),
("The Bushnell Horse Show features some of the best Belgian and Percheron hitches in the country. Teams have come from many different states and Canada to compete.", "Horse show"),
]

sentences = [row[0] for row in corpus]

corpus_embeddings = embedder.encode(sentences)
num_clusters = len(set([row[1] for row in corpus]))

#Sklearn clustering
km = AgglomerativeClustering(n_clusters=num_clusters)
km.fit(corpus_embeddings)

cluster_assignment = km.labels_


clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    for row in cluster:
        print("(Gold label: {}) - {}".format(row[1], row[0]))
    print("")
Code example #34
plt.figure(figsize=(10, 7))
plt.title("Average Text Dendograms")
dend = shc.dendrogram(shc.linkage( upper_dists , method='average'))
# plt.show()

method= 'average'
from sklearn import metrics
points_average = [] 
labels_average= []
values_in_range = []
n_clusters_average = []
the_range = np.arange( 0.05,1,0.05 )
for x in the_range:
    values_in_range.append( x )
    cluster_topics = AgglomerativeClustering(n_clusters=None, affinity='precomputed', linkage=method, distance_threshold=x)
    model_topics = cluster_topics.fit(sim_matrix)    
    labels_average.append(  model_topics.labels_ )    
    n_clusters_average.append( model_topics.n_clusters_ )    
    score = metrics.silhouette_score(sim_matrix, model_topics.labels_ , metric='precomputed')
    print(x,score)
    points_average.append( [ x, score ] )
    df_notifications['cluster_'+str(x)]=model_topics.labels_

df_notifications.to_csv('data_silhoutte_scores.csv')

plt.plot( [x[0] for x in points_average],[x[1] for x in points_average] )
plt.title( 'average method / title+body hierarchical model silhouette score' )
plt.xlabel( 'cut off threshold' )
plt.ylabel('Silhouette Score')
plt.show()
Code example #35
type(df_norm)

z = linkage(df_norm, method="complete",metric="euclidean")

plt.figure(figsize=(15, 5));plt.title('Hierarchical Clustering Dendrogram');plt.xlabel('Index');plt.ylabel('Distance')
sch.dendrogram(
    z,
    leaf_rotation=0.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()

# Now applying AgglomerativeClustering choosing 3 as clusters from the dendrogram
from sklearn.cluster import AgglomerativeClustering
h_complete = AgglomerativeClustering(n_clusters=3, linkage='complete', affinity="euclidean").fit(df_norm)

cluster_labels=pd.Series(h_complete.labels_)

frames['clust']=cluster_labels # add the cluster membership as a new column
frames

frames = frames.iloc[:,[17,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]]
frames

# getting the aggregate median of each cluster
frames.iloc[:,2:].groupby(frames.clust).median()
## Cluster 2 is more suitable

# creating a csv file 
frames.to_csv("PCA.csv",encoding="utf-8")
Code example #36
import numpy as np
import pandas as pd
from scipy import ndimage
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
from matplotlib import pyplot as plt
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs  # the samples_generator module was removed from sklearn

X1, y1 = make_blobs(n_samples=50,
                    centers=[[4, 4], [-2, -1], [1, 1], [10, 4]],
                    cluster_std=0.9)
plt.scatter(X1[:, 0], X1[:, 1], marker='o')

agglom = AgglomerativeClustering(n_clusters=4, linkage='average')
agglom.fit(X1, y1)

plt.figure(figsize=(6, 4))

x_min, x_max = np.min(X1, axis=0), np.max(X1, axis=0)

X1 = (X1 - x_min) / (x_max - x_min)

for i in range(X1.shape[0]):

    plt.text(X1[i, 0],
             X1[i, 1],
             str(y1[i]),
             color=plt.cm.nipy_spectral(agglom.labels_[i] / 10.),
             fontdict={
Code example #37
import unittest
import numpy as np
from reval.best_nclust_cv import FindBestClustCV, _confint
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering
import math

# Modify to test other functions and parameters
RNDLABELS_ITER = 10
CLASSIFIER = KNeighborsClassifier(n_neighbors=5)
CLUSTERING = AgglomerativeClustering()
NCLUST_RANGE = [2, 4]
NFOLD = 2


class TestBestNclusterCV(unittest.TestCase):

    @classmethod
    def setUp(cls):
        cls.s = CLASSIFIER
        cls.c = CLUSTERING
        cls.nrand = RNDLABELS_ITER
        cls.nfold = NFOLD
        cls.nclust_range = NCLUST_RANGE
        cls.findbest = FindBestClustCV(cls.nfold, cls.nclust_range, cls.s, cls.c, cls.nrand)

    def test_best_nclust(self):
        data = np.array([[0] * 20,
                         [1] * 20] * 20)
        strat_vect = np.array([0, 1] * 20)
        metrics, best_nclust, _ = self.findbest.best_nclust(data,
Code example #38
    text_file = open("data/0208_3_" + mode + "Linkage_score.txt", "a+")
    text_file.write("\n************" +
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
                    "************\n")
    print("\n************" +
          datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "************\n")
    text_file.close()

    ### Number of clusters
    for n_clusters in range(2, 31):

        ## linkage{"ward","complete","average","single"}, optional (default="ward")
        model = AgglomerativeClustering(n_clusters=n_clusters,
                                        affinity='euclidean',
                                        linkage=mode)
        predict = pd.DataFrame(model.fit_predict(df))
        predict.columns = ['predict']

        # concatenate labels to df as a new column
        r = pd.concat([df, predict], axis=1)

        #print(r.sample(10))
        # clusters
        silhouette_avg = silhouette_score(df.values, predict.values.ravel())
        DBI_avg = davies_bouldin_score(df.values, predict.values.ravel())
        text_file = open("data/0208_3_" + mode + "Linkage_score.txt", "a+")
        text_file.write("\n\nn_clusters =" + str(n_clusters) +
                        "The average silhouette_score is :" +
                        str(silhouette_avg))
Code example #39
    def decision_function(self, X):
        return self.classifier_.decision_function(X)


def plot_scatter(X, color, alpha=0.5):
    return plt.scatter(X[:, 0], X[:, 1], c=color, alpha=alpha, edgecolor='k')


# Generate some training data from clustering
X, y = make_blobs(n_samples=N_SAMPLES,
                  cluster_std=[1.0, 1.0, 0.5],
                  centers=[(-5, -5), (0, 0), (5, 5)],
                  random_state=RANDOM_STATE)

# Train a clustering algorithm on the training data and get the cluster labels
clusterer = AgglomerativeClustering(n_clusters=3)
cluster_labels = clusterer.fit_predict(X)

plt.figure(figsize=(12, 4))

plt.subplot(131)
plot_scatter(X, cluster_labels)
plt.title("Ward Linkage")

# Generate new samples and plot them along with the original dataset
X_new, y_new = make_blobs(n_samples=10,
                          centers=[(-7, -1), (-2, 4), (3, 6)],
                          random_state=RANDOM_STATE)

plt.subplot(132)
plot_scatter(X, cluster_labels)
Code example #40
    b = b + 7

# Del NaN and 1
ListKey = [
    y for x, y in zip(ListVal, ListKey)
    if not (math.isnan(x[0]) or (x[0] == 1 and x[1] == 1))
]
ListVal = [
    x for x in ListVal if not (math.isnan(x[0]) or (x[0] == 1 and x[1] == 1))
]

DictF = {x: y for x, y in zip(ListKey, ListVal)}
print('Processed {} descriptors'.format(len(DictF)))

for i in range(2, 7):
    agg = AC(n_clusters=i, linkage='ward')
    assignment = agg.fit_predict(ListVal)
    result = Counter(assignment)
    clustElem = {}
    for ind, val in enumerate(assignment):
        if val + 1 not in clustElem.keys():
            clustElem[val + 1] = [ListKey[ind]]
        else:
            clustElem[val + 1].append(ListKey[ind])
    clustMedian = {i[0]: i[1][len(i[1]) // 2] for i in clustElem.items()}
    print('========== {} level =========='.format(i - 1))
    print('{} clusters'.format(i))
    cE = list(clustElem.items())
    cE.sort()
    for j in cE:
        print('Number of elements in {0} cluster: {1}'.format(j[0], len(j[1])))
Code example #41
File: wine2_ HW2.py Project: runru1030/PA
     y_true= data[:, 11]
     y_pred= classifier.predict(data[:, 0:11])
     
     
     print("4.k-NN classifier: \n", recall_score(y_true, y_pred, average=None))
 
 elif ans==4:
     
     sel=input("Select the algorithm ((h)ierarchicalor (k)-means): ")
     if sel=='h':
         from sklearn.cluster import AgglomerativeClustering
         data = np.genfromtxt("./winequality-red.csv", dtype= np.float32, delimiter = ";", skip_header= 1)
         X=data[:, 0:11]
         
         cluster=int(input("Input the number of clusters: "))
         model = AgglomerativeClustering(n_clusters= cluster)
         model.fit(X)
         first=int(input("Input the number of the first wine: "))
         second=int(input("Input the number of the second wine: "))
         
 
         if model.labels_[first]== model.labels_[second]:
             print("Result : %d and %d are in the same cluster"%(first,second))
         else:
             print("Result : %d and %d are in the different cluster"%(first,second))
     if sel=='k':
         
         from sklearn.cluster import KMeans
         data = np.genfromtxt("./winequality-red.csv", dtype= np.float32, delimiter = ";", skip_header= 1)
         X=data[:, 0:11]
     
Code example #42
File: 2.14.1.py Project: Ahmed2020538/Data_Science
#Import Libraries
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
#----------------------------------------------------

#Applying the AggClusteringModel

'''
sklearn.cluster.AgglomerativeClustering(n_clusters=2, affinity='euclidean', memory=None, connectivity=None,
                                        compute_full_tree='auto', linkage='ward', pooling_func='deprecated')
'''

AggClusteringModel = AgglomerativeClustering(n_clusters=5,affinity='euclidean',# it can be l1,l2,manhattan,cosine,precomputed
                                             linkage='ward')# it can be complete,average,single

y_pred_train = AggClusteringModel.fit_predict(X_train)
y_pred_test = AggClusteringModel.fit_predict(X_test)

#draw the Hierarchical graph for Training set
dendrogram = sch.dendrogram(sch.linkage(X_train[: 30,:], method = 'ward'))# it can be complete,average,single
plt.title('Training Set')
plt.xlabel('X Values')
plt.ylabel('Distances')
plt.show()

#draw the Hierarchical graph for Test set
dendrogram = sch.dendrogram(sch.linkage(X_test[: 30,:], method = 'ward'))# it can be complete,average,single
plt.title('Test Set')
plt.xlabel('X Value')
plt.ylabel('Distances')
Code example #43
cluster_y  = [i[1] for i in kmeans.cluster_centers_]

#plt.plot(x,y,'.',alpha=0.15)

#plt.plot(cluster_x,cluster_y,'o')
cluster_plot(15,y_km)
sns.kdeplot(x,y,cmap='Blues',shade=True,shade_lowest=False,bw=2,alpha=0.6)
plt.show()
plt.close()

#%%
# import hierarchical clustering libraries
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

# create dendrogram
dendrogram = sch.dendrogram(sch.linkage(points, method='ward'))

# create clusters
hc = AgglomerativeClustering(n_clusters=15, affinity = 'euclidean',
                             linkage = 'ward')

# save clusters for chart
y_hc = hc.fit_predict(points)

#%%
cluster_plot(6,y_hc)
sns.kdeplot(x,y,cmap='Blues',shade=False,bw=2,alpha=0.5)
plt.show()
plt.close()
#as the dataset grows, hierarchical clustering does not work better than k-means.
#it is not suitable for large datasets.

Code example #44
#libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#2.1 load the data
data = pd.read_csv('../data/customer.csv')
X = data.iloc[:, 3:].values

#hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
agc = AgglomerativeClustering(n_clusters=4,
                              affinity='euclidean',
                              linkage='ward')
y_prediction = agc.fit_predict(X)
print(y_prediction)
#n_clusters: how many clusters to form
#affinity: which distance measure to use
#linkage: how the distance between clusters is measured
#ward linkage requires the euclidean metric
#fit builds the model; fit_predict builds it and also returns the predicted labels

#plot the resulting clustering
plt.scatter(X[y_prediction == 0, 0], X[y_prediction == 0, 1], s=100, c='red')
plt.scatter(X[y_prediction == 1, 0], X[y_prediction == 1, 1], s=100, c='blue')
plt.scatter(X[y_prediction == 2, 0], X[y_prediction == 2, 1], s=100, c='green')
plt.scatter(X[y_prediction == 3, 0],
            X[y_prediction == 3, 1],
Code example #45
def Cluster(A):
    if (len(A) > 1):
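        # cluster_number is assumed to be defined at module scope; for
        # len(A) <= 1 the function falls through and returns None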
        hc = AgglomerativeClustering(n_clusters=cluster_number,
                                     affinity='euclidean',
                                     linkage='ward')
        return hc.fit_predict(A)
Code example #46
location_of_images="../../../images/"

sys.path.append(os.path.join(os.path.dirname(__file__), "../functions/"))


data = np.load('cluster_mask.npy')

data_new = data[..., 10:15]

X = np.reshape(data_new, (-1, 1))

connectivity = grid_to_graph(n_x= data_new.shape[0], n_y = data_new.shape[1], n_z = data_new.shape[2])

st = time.time()
n_clusters = 7 # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters,
        linkage='ward', connectivity=connectivity).fit(X)
label = np.reshape(ward.labels_, data_new.shape)

label_mean = np.zeros(n_clusters)
center = list()

#FIND THE AVERAGE T-VALUE PER CLUSTER
for j in range(n_clusters):
    mask = label==j
    index = np.where(mask)
    center.append((np.mean(index[0]),np.mean(index[1]),np.mean(index[2])))
    label_mean[j] =np.mean(data_new[mask])
   
#PRINT THE PLOTS
for i in range(data_new.shape[-1]):
    plt.figure()
Code example #47
def main():
    if len(sys.argv) == 2:
        superclass = sys.argv[1]
    else:
        print('Param error')
        exit()

    cluster = 'KMeans'
    cluster = 'AC'

    classNum = {'Animals': 10, 'Fruits': 10}

    features_path = 'features_%s.pickle' % (superclass)
    features_path_cluster = 'features_%s_cluster.pickle' % (superclass)

    fread = open(features_path, 'rb')
    fsave = open(features_path_cluster, 'wb')

    data_all = pickle.load(fread)
    features_all = data_all['features_all']
    labels_all = data_all['labels_all']
    images_all = data_all['images_all']

    test_labels_idxes = np.where(np.array(labels_all) == 'test')[0]
    test_features = list(np.array(features_all)[test_labels_idxes])
    test_images = list(np.array(images_all)[test_labels_idxes])

    if cluster == 'KMeans':
        clf = KMeans(n_clusters=classNum[superclass], max_iter=300)
        s = clf.fit(test_features)
    else:
        clf = AgglomerativeClustering(n_clusters=classNum[superclass],
                                      linkage='complete')
        cluster_labels = clf.fit_predict(test_features)
        ac_cluster_centers = {}
        for i in range(classNum[superclass]):
            cluster_labels_idxes = np.where(np.array(cluster_labels) == i)[0]
            sub_test_features = np.array(test_features)[cluster_labels_idxes]
            ac_cluster_centers[i] = list(np.mean(sub_test_features, axis=0))

    idx = 0
    images_cluster = {}
    for image in test_images:
        if cluster == 'KMeans':
            cluster_idx = clf.predict([test_features[idx]])
            cluster_idx = cluster_idx[0]
            feature_cluster = clf.cluster_centers_[cluster_idx]
        else:
            cluster_idx = cluster_labels[idx]
            feature_cluster = ac_cluster_centers[cluster_idx]
        features_all[test_labels_idxes[idx]] = feature_cluster
        if str(cluster_idx) not in images_cluster.keys():
            images_cluster[str(cluster_idx)] = []
        images_cluster[str(cluster_idx)].append(image)
        idx += 1

    fimages = open('images_cluster.json', 'w')
    fimages.write(json.dumps(images_cluster))
    fimages.close()

    feval = open('images_cluster_eval.json', 'w')
    feval.write(json.dumps(eval_cluster_result(superclass, images_cluster)))
    feval.close()

    data_all = {
        'features_all': features_all,
        'labels_all': labels_all,
        'images_all': images_all
    }

    pickle.dump(data_all, fsave)

    fread.close()
    fsave.close()
Code example #48
              data=data)

bench_AffinityPropagation(AffinityPropagation(convergence_iter=20),
                          name="AP",
                          data=data)

bench_MeanShift(MeanShift(), name="MeanShift", data=data)

# bench_SpectralClustering(SpectralClustering(),name="MeanShift", data=data)

bench_SpectralClustering(SpectralClustering(n_clusters=n_digits),
                         name="Spectral",
                         data=data)

bench_AgglomerativeClustering(AgglomerativeClustering(n_clusters=n_digits,
                                                      linkage='ward',
                                                      connectivity=None),
                              name="Ward-hier",
                              data=data)

bench_AgglomerativeClustering(AgglomerativeClustering(n_clusters=n_digits,
                                                      linkage='complete',
                                                      connectivity=None),
                              name="Agglomerative",
                              data=data)

bench_DBSCAN(DBSCAN(eps=5, min_samples=3), name="DBSCAN", data=data)

bench_GaussianMixture(mixture.GaussianMixture(n_components=n_digits,
                                              covariance_type='full'),
                      name="GaussMix",
Code example #49
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt  # needed for the plotting calls below
import numpy as np
import os
from sklearn.cluster import AgglomerativeClustering

path = os.getcwd() + '/shopping_data.csv'
customer_data = pd.read_csv(path)

data = customer_data.iloc[:, 3:5].values

n_clusters = 5
linkage_list = ['single', 'average', 'complete', 'ward']
for l in linkage_list:
    clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                        affinity='euclidean',
                                        linkage=l)
    cluster_labels = clusterer.fit(data)

    plt.figure()
    colors = cm.nipy_spectral(
        cluster_labels.labels_.astype(float) / n_clusters)
    plt.scatter(data[:, 0],
                data[:, 1],
                marker='.',
                s=70,
                lw=0,
                alpha=0.7,
                c=colors,
                edgecolors='k')
    plt.title(l)
Code example #50
File: sunny_hc.py Project: sunilkurahatti/sunpy
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib qt

#importing data sets
ds=pd.read_csv('Mall_Customers.csv')
X=ds.iloc[:,[3,4]].values

#plotting the dendrogram
import scipy.cluster.hierarchy as sch
dendogram=sch.dendrogram(sch.linkage(X, method='ward'))
plt.title('Dendrogram')
plt.xlabel('salary')
plt.ylabel('customerscore')

#fitting the hierarchical clustering model
from sklearn.cluster import AgglomerativeClustering
hc=AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=5)
y_hc=hc.fit_predict(X)

#plotting graph
plt.scatter(X[y_hc==0,0],X[y_hc==0,1],color='red',label='Cautious',s=100)
plt.scatter(X[y_hc==1,0],X[y_hc==1,1],color='green',label='Standard',s=100)
plt.scatter(X[y_hc==2,0],X[y_hc==2,1],color='blue',label='target',s=100)
plt.scatter(X[y_hc==3,0],X[y_hc==3,1],color='black',label='Careless',s=100)
plt.scatter(X[y_hc==4,0],X[y_hc==4,1],color='magenta',label='Sensible',s=100)
plt.legend()
plt.show()
Code example #51
File: clustering.py Project: otakumesi/CaBE
 def clustering(self, affinity):
     return AgglomerativeClustering(distance_threshold=self.threshold,
                                    n_clusters=None,
                                    affinity=affinity,
                                    linkage=self.linkage)
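
For reference, the estimator this factory returns derives the number of clusters from the threshold; a standalone sketch (the threshold, linkage and data are assumptions):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

model = AgglomerativeClustering(distance_threshold=1.0, n_clusters=None,
                                affinity='euclidean', linkage='average')
labels = model.fit_predict(np.random.RandomState(0).rand(30, 2))
print(model.n_clusters_, 'clusters found')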
Code example #52
File: model.py Project: Moloospa/TCN
            self.direction_count) / self.instance_count * 100

    def to_string(self):
        print('feature: ', self.feature, '\ninstance_count: ',
              self.instance_count, '\ndirection_count: ', self.direction_count,
              '\nreliability: ', self.reliability, ' %')


datafile = 'test.csv'
m = Model(datafile)
model = m.model
direction = m.direction
print(model.shape)
n = 10
print('just before clustering')
clusters = AgglomerativeClustering(linkage='average').fit(model[n:, :])

patterns = []

for x in range(model.shape[0]):
    if direction[x] == 1: direction_count = np.array([1, 0, 0])
    elif direction[x] == -1: direction_count = np.array([0, 1, 0])
    else: direction_count = np.array([0, 0, 1])

    patterns.append(Pattern(model[x], 1, direction_count))

for x in clusters.children_:
    feature = np.average((patterns[x[0]].feature, patterns[x[1]].feature),
                         axis=0)
    instance_count = patterns[x[0]].instance_count + patterns[
        x[1]].instance_count
Code example #53
    import datetime
    time_stamps = [datetime.datetime(2020,7,18,5,1,3,23),
                   datetime.datetime(2020,7,19,5,1,3,222),
                   datetime.datetime(2020,7,21,0,0,0,0),
                   datetime.datetime(2020,7,21,0,0,0,0),
                   datetime.datetime(2020,7,21,3,2,1,110),
                   datetime.datetime(2020,7,23,0,0,0,0),
                   datetime.datetime(2020,7,24,0,0,0,0),
                   datetime.datetime(2020,7,25,0,0,0,0)]

    # y = incremental_average(coords)
    # print(y)

    from sklearn.cluster import AgglomerativeClustering
    # clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed', linkage='average', distance_threshold=1.0)
    clustering = AgglomerativeClustering(n_clusters=None, linkage='average', distance_threshold=1.0)
    # print(sim_map)
    # print(dist_mat)
    y_pred = clustering.fit(coords[:4]).labels_
    print('orig batch:', y_pred)

    clustering = AgglomerativeClustering(n_clusters=None, linkage='average', distance_threshold=1.0)
    # print(sim_map)
    # print(dist_mat)
    y_pred = clustering.fit(coords[4:]).labels_
    print('second batch:', y_pred)

    clustering = AgglomerativeClustering(n_clusters=None, linkage='average', distance_threshold=1.0)
    # print(sim_map)
    # print(dist_mat)
    y_pred = clustering.fit(coords).labels_
Code example #54
# Plotting the "Raw Data"
plt.subplot(1, 2, 1)
plt.title("Raw Data")
plot()

# Plotting the Assigned Points using the Hierarchical Clustering (with Euclidean distance)
plt.subplot(1, 2, 2)
plt.grid()
plt.xlim(0, max(coordinates[i][0] for i in coordinates.keys()) + 0.05)
plt.xticks(np.arange(0, max(coordinates[i][0] for i in coordinates.keys()) + 0.20, 0.10))
plt.xlabel("X-Axis")
plt.ylim(0, max(coordinates[i][1] for i in coordinates.keys()) + 0.05)
plt.yticks(np.arange(0, max(coordinates[i][1] for i in coordinates.keys()) + 0.20, 0.10))
plt.ylabel("Y-Axis")
plt.title("Assigned Points")
clusters = AgglomerativeClustering(n_clusters=n_cluster, affinity='euclidean', linkage='ward')
clusters.fit_predict(coo_array)
print(clusters.labels_)
plt.scatter(coo_array[:, 0], coo_array[:, 1], c=clusters.labels_, cmap='rainbow', edgecolors="black")

# Plotting the Dendrogram for showing the order and distances of merges during the hierarchical clustering.
plt.figure()
linked = linkage(coo_array, 'ward')
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title("Dendrogram: Ward Method")
plt.xlabel("Points")
plt.ylabel("Euclidean Distance")

plt.show()

# I was not sure which method to use between "ward" and "complete".
Code example #55
def clusterer(clusterer_id):
    clusterers = Clustering.query.all()
    algo = AlgoTypes.query.filter_by(algotype_id=3).first()

    ## Describe dataset
    dataset = pd.read_csv(
        '/home/ubuntu/workspace/static/datasets/Mall_Customers.csv')
    X = dataset.iloc[:, [3, 4]].values
    dataset_head = dataset.head(10)
    stats_data = dataset.iloc[:, 2:5]
    describe = stats_data.describe()
    rows = len(dataset.index)
    columns = len(dataset.columns)
    pred = 'Choose Algorithm'

    choice = clusterer_id

    if choice == '2':

        ## Using the dendrogram to find the optimal number of clusters
        plt.gcf().clear()
        dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))
        img_dendrogram = BytesIO()
        sns.set_style("darkgrid", {"axes.facecolor": ".9"})
        plt.title("Dendrogram")
        plt.xlabel('Customers')
        plt.ylabel('Euclidean distances')
        plt.savefig(img_dendrogram, format='png')
        img_dendrogram.seek(0)
        plot_determine = base64.b64encode(img_dendrogram.getvalue())
        ## Result = 5

        ## Fitting Hierarchical Clustering to the dataset (optimal clusters = 5)
        hc = AgglomerativeClustering(n_clusters=5,
                                     affinity='euclidean',
                                     linkage='ward')
        y_hc = hc.fit_predict(X)

        ## Visualising the clusters
        img = BytesIO()
        plt.gcf().clear()
        plt.scatter(
            X[y_hc == 0, 0],
            X[y_hc == 0,
              1],  ## specify that we want first cluster + first column vs second column for 'y'
            s=100,
            c='red',
            label='Savers')  ## size for datapoints/color
        plt.scatter(X[y_hc == 1, 0],
                    X[y_hc == 1, 1],
                    s=100,
                    c='blue',
                    label='Average')
        plt.scatter(X[y_hc == 2, 0],
                    X[y_hc == 2, 1],
                    s=100,
                    c='green',
                    label='Target Group')
        plt.scatter(X[y_hc == 3, 0],
                    X[y_hc == 3, 1],
                    s=100,
                    c='orange',
                    label='Overspenders')
        plt.scatter(X[y_hc == 4, 0],
                    X[y_hc == 4, 1],
                    s=100,
                    c='magenta',
                    label='Careful')
        sns.set_style("darkgrid", {"axes.facecolor": ".9"})
        plt.title('Suggested Clusters')
        plt.xlabel('Annual income (k$)')
        plt.ylabel('Spending Score (1-100)', fontsize=12)
        plt.ylim(ymin=0)
        plt.legend(fontsize=9)
        plt.savefig(img, format='png')
        img.seek(0)
        plot_url = base64.b64encode(img.getvalue())

    if choice == '1':

        ## Using the elbow method to find the optimal number of clusters
        wcss = []  ## initialize the list
        for i in range(1, 11):
            kmeans = KMeans(
                n_clusters=i,  ## from 1 to 10
                init='k-means++',  ## k-means++ avoids the random initialization trap
                max_iter=300,  ## 300 is the default
                n_init=10,  ## run with 10 different initial centroid seeds
                random_state=0)
            kmeans.fit(X)
            wcss.append(kmeans.inertia_)  ## within-cluster sum of squares
        ## Result = 5

        ## Visualising Elbow Method
        plt.gcf().clear()
        img_elbow = BytesIO()
        plt.plot(range(1, 11), wcss)
        plt.title('The Elbow Method')
        plt.xlabel('Number of clusters')
        plt.ylabel('WCSS')
        sns.set_style("darkgrid", {"axes.facecolor": ".9"})
        plt.savefig(img_elbow, format='png')
        img_elbow.seek(0)
        plot_determine = base64.b64encode(img_elbow.getvalue())

        ## Applying k-means to the mall dataset - from the plot we can see that optimum is 5 clusters.
        kmeans = KMeans(
            n_clusters=5,
            init='k-means++',  ## k-means++ avoids the random initialization trap
            max_iter=300,  ## 300 is the default
            n_init=10,  ## run with 10 different initial centroid seeds
            random_state=0)
        y_kmeans = kmeans.fit_predict(X)  ## fit_predict returns a cluster label for each observation

        ## Visualising the clusters
        img = BytesIO()
        plt.gcf().clear()
        sns.set_style("darkgrid", {"axes.facecolor": ".9"})
        plt.scatter(
            X[y_kmeans == 0, 0],  ## rows of cluster 0: first column for x
            X[y_kmeans == 0, 1],  ## second column for y
            s=100,  ## marker size
            c='red',
            label='Savers')
        plt.scatter(X[y_kmeans == 1, 0],
                    X[y_kmeans == 1, 1],
                    s=100,
                    c='blue',
                    label='Average')
        plt.scatter(X[y_kmeans == 2, 0],
                    X[y_kmeans == 2, 1],
                    s=100,
                    c='green',
                    label='Target Group')
        plt.scatter(X[y_kmeans == 3, 0],
                    X[y_kmeans == 3, 1],
                    s=100,
                    c='orange',
                    label='Overspenders')
        plt.scatter(X[y_kmeans == 4, 0],
                    X[y_kmeans == 4, 1],
                    s=100,
                    c='magenta',
                    label='Careful')
        plt.scatter(
            kmeans.cluster_centers_[:, 0],
            kmeans.cluster_centers_[:, 1],  ## cluster centers coordinates
            s=200,
            c='black',
            label='Centroids')
        plt.title('Suggested Clusters')
        plt.xlabel('Annual income (k$)')
        plt.ylabel('Spending Score (1-100)', fontsize=12)
        plt.ylim(ymin=0)
        plt.legend(fontsize=9)
        plt.savefig(img, format='png')
        img.seek(0)
        plot_url = base64.b64encode(img.getvalue())

    return render_template('clustering.html',
                           data=dataset_head.to_html(),
                           describe=describe.to_html(),
                           plot_determine=plot_determine,
                           plot_url=plot_url,
                           rows=rows,
                           columns=columns,
                           clusterers=clusterers,
                           algo=algo,
                           user=current_user.username)
コード例 #56
0
methods = [  # variable name assumed; the snippet begins mid-list
    'optics_xi', 'optics_dbscan', 'dbscan', 'agglomerative_clustering',
    'affinity_propagation', 'spectral_clustering'
]
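# The snippet is truncated: distanceMatrix is assumed to be a square, symmetric
# matrix of pairwise distances computed earlier. A hedged sketch of one way to
# build such a matrix (the feature data here is placeholder only, not from the
# original):
import numpy as np
from sklearn.metrics import pairwise_distances

features = np.random.RandomState(0).rand(50, 4)  # illustrative data
distanceMatrix = pairwise_distances(features, metric='cosine')
distanceMatrix /= distanceMatrix.max()  # scale to [0, 1] so 1 - distanceMatrix is a valid similarity below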
clusterID_xi = OPTICS(metric='precomputed',
                      max_eps=0.16,
                      xi=0.05,
                      algorithm='brute',
                      min_samples=3).fit_predict(distanceMatrix)
clusterID_op = OPTICS(metric='precomputed',
                      max_eps=0.16,
                      cluster_method='dbscan',
                      min_samples=7).fit_predict(distanceMatrix)
clusterID_db = DBSCAN(metric='precomputed',
                      eps=0.1).fit_predict(distanceMatrix)
clusterID_ag = AgglomerativeClustering(
    affinity='precomputed', linkage='average',
    n_clusters=2).fit_predict(distanceMatrix)
clusterID_af = AffinityPropagation(affinity='precomputed',
                                   damping=0.7).fit_predict(1 - distanceMatrix)  # expects similarities, hence 1 - distance
clusterID_sp = SpectralClustering(affinity='precomputed',
                                  n_clusters=2).fit_predict(1 - distanceMatrix)  # likewise expects an affinity (similarity) matrix
clusterIDs = [
    clusterID_xi, clusterID_op, clusterID_db, clusterID_ag, clusterID_af,
    clusterID_sp
]

# Evaluation
for clusterID in clusterIDs:
    try:
        print(
            metrics.silhouette_score(distanceMatrix,
                                     clusterID,
                                     metric='precomputed'))  # call completed here; the original snippet was truncated
    except ValueError:
        # silhouette is undefined when a labelling yields fewer than 2 clusters
        print("silhouette undefined for this labelling")
コード例 #57
0
#Importing Dataset
data = pd.read_csv('Mall_Customers.csv')
x = data.iloc[:, [3, 4]].values

#Dendrogram Graph (To find the optimal number of clusters)
import scipy.cluster.hierarchy as sch
dendrograms = sch.dendrogram(sch.linkage(x, method='ward'))
plt.title('Dendrogram Model')
plt.xlabel('Pts')
plt.ylabel('Euclidean Distance')
plt.show()

#Applying hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
algo = AgglomerativeClustering(n_clusters=5,
                               affinity='euclidean',
                               linkage='ward')
y_hc = algo.fit_predict(x)

#Visualising the HC
plt.scatter(x[y_hc == 0, 0], x[y_hc == 0, 1], s=30, c='red', label='Cluster 1')
plt.scatter(x[y_hc == 1, 0],
            x[y_hc == 1, 1],
            s=30,
            c='blue',
            label='Cluster 2')
plt.scatter(x[y_hc == 2, 0],
            x[y_hc == 2, 1],
            s=30,
            c='green',
            label='Cluster 3')
plt.scatter(x[y_hc == 3, 0],
            x[y_hc == 3, 1],
            s=30,
            c='orange',
            label='Cluster 4')  #clusters 4-5 and the legend assumed, completing the truncated five-cluster plot
plt.scatter(x[y_hc == 4, 0],
            x[y_hc == 4, 1],
            s=30,
            c='magenta',
            label='Cluster 5')
plt.legend()
plt.show()
コード例 #58
0
kmeans = KMeans(n_clusters=10, random_state=123)
# Fit
fit = kmeans.fit(X_scaled)
# Print inertia: sum of squared distances to closest cluster center
print("Sum of squared distances for 10 clusters is", kmeans.inertia_)

############## Hierarchical agglomerative clustering
#Make dendrograms -- tree diagrams that connect datapoints and clusters by the distance at which they merge.
#Cutting the dendrogram with a horizontal line at a chosen height selects out the groups (see the fcluster sketch after the plot).

# Create dendrogram
dendrogram = sch.dendrogram(sch.linkage(X_scaled, method='ward'))
#The y axis is the distance (dissimilarity) at which individual data points or clusters are merged
plt.show()
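# A hedged sketch of "cutting" the dendrogram programmatically rather than by eye,
# assuming the X_scaled from above; the threshold t=10 is illustrative and should
# be read off the plotted dendrogram:
from scipy.cluster.hierarchy import fcluster
Z = sch.linkage(X_scaled, method='ward')
labels_at_cut = fcluster(Z, t=10, criterion='distance')
print("Clusters when cutting at height 10:", len(set(labels_at_cut)))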
# Create clusters and fit
hc = AgglomerativeClustering(affinity='euclidean', linkage='ward')  # n_clusters defaults to 2
hc.fit(X_scaled)

# Print number of clusters
print(hc.n_clusters_)

############## Determining K
#In general, two methods for determining K: the silhouette method and the elbow method.
#The silhouette method uses the silhouette coefficient s = (b - a) / max(a, b), where a is the mean distance
#from an observation to all others in its own cluster and b is the mean distance to all points in the
#next-nearest cluster. Values near 1 are good (the observation sits well inside its cluster); near -1 is bad.

#Elbow method - plot the sum of squared distances from each observation to its closest centroid against the
#number of clusters. The "elbow point" of the curve marks the optimal k (see the sketch below).
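# A minimal elbow-method sketch, assuming the X_scaled and KMeans used above;
# the range of k is illustrative:
inertias = [KMeans(n_clusters=k, random_state=123).fit(X_scaled).inertia_
            for k in range(1, 11)]
plt.plot(range(1, 11), inertias, marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters k')
plt.ylabel('Sum of squared distances (inertia)')
plt.show()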

# Silhouette method
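# A minimal sketch of the silhouette method, again assuming the X_scaled from above:
from sklearn.metrics import silhouette_score

for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=123).fit_predict(X_scaled)
    print("k = %d, mean silhouette = %.4f" % (k, silhouette_score(X_scaled, labels)))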
コード例 #59
0
File: seg.py Project: viterid/IBIO4490
def segmentByClustering(rgbImage, colorSpace, clusteringMethod, numberOfClusters):
    
    import numpy as np
    #determine if xy is required
    space = colorSpace.split("+")
    leng = len(space)
    w, h = rgbImage.shape[:2]  #note: shape returns (rows, cols)
    #generate coordinate matrices with xmat[i, j] == i and ymat[i, j] == j
    if leng == 2:
        xmat = np.repeat(np.arange(w), h).reshape(w, h)
        xmat = np.uint8(xmat)  #uint8 assumes image dimensions < 256
        ymat = np.tile(np.arange(h), (w, 1))  #tile so every row runs 0..h-1
        ymat = np.uint8(ymat)
        colorSpace = space[0]
    #change image to the specified color space
    def RGB(rgbImage):
        return rgbImage
    def HSV(rgbImage):
        import cv2
        return cv2.cvtColor(rgbImage, cv2.COLOR_BGR2HSV)
    def LAB(rgbImage):
        import cv2  #cv2 (not skimage) is what the conversion below uses
        return cv2.cvtColor(rgbImage, cv2.COLOR_BGR2LAB)
    #Switch for color space
    S_color = {
        "rgb": RGB,
        "lab": LAB,
        "hsv": HSV
    }
    func = S_color.get(colorSpace)

    newImage = func(rgbImage)
    
    #apply XY matrix if needed
    if leng == 2:
        temp = np.ndarray(shape=(w, h, 5))
        temp[:, :, 0] = newImage[:, :, 0]
        temp[:, :, 1] = newImage[:, :, 1]
        temp[:, :, 2] = newImage[:, :, 2]
        temp[:, :, 3] = xmat
        temp[:, :, 4] = ymat
        newImage = temp

    indx = 0
    size = newImage.shape

    #one row per pixel: 5 features with xy, 3 without
    if leng == 2:
        repMat = np.zeros((size[0] * size[1], 5))
    else:
        repMat = np.zeros((size[0] * size[1], 3))

    for i in range(size[0]):
        for j in range(size[1]):
            if leng == 2:
                i1 = newImage[i, j, 3] / 255.0  #normalize coordinates to [0, 1]
                j1 = newImage[i, j, 4] / 255.0
                repMat[indx] = [newImage[i, j, 0], newImage[i, j, 1], newImage[i, j, 2], i1, j1]
            else:
                repMat[indx] = [newImage[i, j, 0], newImage[i, j, 1], newImage[i, j, 2]]
            indx = indx + 1  #advance the row index in both branches
        
    k = numberOfClusters

    if clusteringMethod == 'kmeans':
        kmeans = KMeans(n_clusters=k).fit(repMat)
        labels = kmeans.labels_
        seg = np.reshape(labels, (size[0], size[1]))

    elif clusteringMethod == 'gmm':
        gmm = mixture.GaussianMixture(n_components=k).fit(repMat)  #fit ignores any y argument
        labels = gmm.predict(repMat)
        seg = np.reshape(labels, (size[0], size[1]))

    elif clusteringMethod == 'hierarchical':
        from sklearn.cluster import AgglomerativeClustering

        cluster = AgglomerativeClustering(n_clusters=k, affinity='euclidean', linkage='ward').fit(repMat)
        labels = cluster.labels_
        seg = np.reshape(labels, (size[0], size[1]))
        
    elif clusteringMethod == 'watershed':
        #the original left this branch as a placeholder ("a = 2");
        #watershed_segmentation is a hypothetical helper sketched after this function
        seg = watershed_segmentation(newImage, k)

    return seg
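# A hedged sketch of the hypothetical watershed_segmentation helper called above,
# assuming scikit-image is available; the gradient and marker choices are
# illustrative, not from the original:
def watershed_segmentation(newImage, k):
    import numpy as np
    from skimage.filters import sobel
    from skimage.feature import peak_local_max
    from skimage.segmentation import watershed
    #flood the gradient magnitude of the first channel,
    #seeding the k deepest local minima as markers
    gradient = sobel(newImage[:, :, 0].astype(float))
    coords = peak_local_max(-gradient, num_peaks=k)
    markers = np.zeros(gradient.shape, dtype=int)
    for m, (pi, pj) in enumerate(coords, start=1):
        markers[pi, pj] = m
    return watershed(gradient, markers) - 1  #labels in 0..k-1, matching seg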
コード例 #60
0
view2D.show(sm, col_sz=4, what='codebook')  # which_dim="all", denormalize=True
plt.show()

view2D  = View2D(20,20,"", text_size=9)
view2D.show(sm, col_sz=2, what='codebook')  # which_dim="all", denormalize=True
plt.show()

vhts  = BmuHitsView(12,12,"Hits Map",text_size=7)
vhts.show(sm, anotate=True, onlyzeros=False, labelsize=10, cmap="autumn", logaritmic=False)  # 'anotate'/'logaritmic' are SOMPY's own parameter spellings

## Hierarchical Clustering ##
som_cluster = final_clusters.groupby("Labels").mean()
dend = shc.dendrogram(shc.linkage(som_cluster, method='ward'))
plt.title("Dendogram with SOM nodes", size=12)

som_cluster["h_cluster"] = AgglomerativeClustering(n_clusters=3).fit_predict(som_cluster)
# Calculate centroids of clusters and inverse scaling for interpretation
h_cluster = som_cluster.groupby("h_cluster").mean()
h_cluster = pd.DataFrame(scaler.inverse_transform(X=h_cluster), columns = customer_related_num)
# Assign customer to cluster generated by hierarchical clustering
final_clusters["h_cluster"] = [som_cluster.loc[label,"h_cluster"] for label in final_clusters["Labels"].values]
# Silhouette graph
create_silgraph(df_cust_norm, final_clusters["h_cluster"])
plt.title("Silhouette graph customer clusters", size=12)
silhouette_avg = silhouette_score(df_cust_norm, final_clusters["h_cluster"])
print("The average silhouette_score is:", silhouette_avg)
df["c_cluster"] = final_clusters["h_cluster"]

#################################################################
################## Decision Tree classifier #####################
# Find most important features