def test_connectivity_popagation():
    """
    Smoke-test Ward clustering with a k-nearest-neighbours connectivity graph.

    The sample contains many exactly duplicated rows.  There is no assertion:
    the test passes as long as ``fit`` completes without raising.
    """
    from sklearn.neighbors import NearestNeighbors

    rows = [(0.014, 0.120), (0.014, 0.099), (0.014, 0.097)]
    rows += [(0.017, 0.153)] * 2
    rows += [(0.018, 0.153)] * 7
    rows += [(0.018, 0.152), (0.018, 0.149), (0.018, 0.144)]
    X = np.array(rows)

    knn = NearestNeighbors(n_neighbors=10)
    knn.fit(X)
    graph = knn.kneighbors_graph(X)

    # A connectivity update that is not propagated through the merges shows
    # up here as an IndexError raised by fit().
    Ward(n_clusters=4, connectivity=graph).fit(X)
Esempio n. 2
0
def hieclu(data_matrix, k):
    """Cluster ``data_matrix`` into ``k`` groups with Ward clustering.

    Prints a progress message and returns whatever ``Ward.fit_predict``
    returns for ``data_matrix``.
    """
    # A parenthesised single-argument print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print('using hierarchical clustering......')
    # fit_predict() fits the model itself, so a separate fit() call before
    # it would only build the same tree twice.
    return Ward(n_clusters=k).fit_predict(data_matrix)
Esempio n. 3
0
def hierarchicalClustering(x, k):
    """Cluster ``x`` with Ward and compute the mean vector of each cluster.

    ``x`` is indexed as ``x[sample][dimension]``.  Returns a tuple
    ``(centroids, labels)`` where ``centroids`` is a list of ``k`` lists
    (one coordinate list per cluster id) and ``labels`` is the output of
    ``Ward.fit_predict``.
    """
    labels = Ward(n_clusters=k).fit_predict(np.asarray(x))

    # One zero-filled accumulator row per cluster.
    centroids = [[0] * len(x[0]) for _ in range(k)]
    # Number of samples assigned to each cluster.
    ctrs = np.zeros(k)

    # Add every sample to the accumulator of the cluster it belongs to.
    for idx in range(len(x)):
        cluster_id = labels[idx]
        acc = centroids[cluster_id]
        for dim in range(len(acc)):
            acc[dim] += x[idx][dim]
        ctrs[cluster_id] += 1

    # Turn the per-cluster sums into means.
    for cluster_id, acc in enumerate(centroids):
        for dim in range(len(acc)):
            acc[dim] = acc[dim] / ctrs[cluster_id]

    return (centroids, labels)
Esempio n. 4
0
    def agglomerate(self, nodes, edges, clusters):
        """Merge existing clusters into six groups with Ward clustering.

        Builds a cluster-by-cluster matrix whose entry ``[a, b]`` counts how
        many neighbours of the members of cluster ``a`` belong to cluster
        ``b``, clusters the rows of that matrix with ``Ward(n_clusters=6)``
        and returns one merged label per entry of ``nodes``.

        Parameters:
            nodes: sequence of hashable node ids.
            edges: iterable of pairs; ``edge[0]`` is the source node and
                ``edge[1]`` its neighbour.
            clusters: ``clusters[i]`` is the cluster id of ``nodes[i]``.  The
                ids are used directly as matrix indices, so they are assumed
                to be the integers ``0 .. number_of_clusters - 1``
                (TODO confirm against callers).

        Returns:
            list with the merged cluster label of every node in ``nodes``.

        A length mismatch between ``nodes`` and ``clusters`` is only
        reported on stdout; processing continues.
        """
        if len(nodes) != len(clusters):
            print("#nodes(%d) != #clusters(%d)" % (len(nodes), len(clusters)))

        # source node -> list of its neighbours
        neighbors = {}
        for edge in edges:
            neighbors.setdefault(edge[0], []).append(edge[1])

        node_clusters = {}  # node: its cluster id
        communities = {}  # cluster id: all neighbors for its members
        for i, node in enumerate(nodes):
            cluster = clusters[i]
            # A node that never appears as an edge source has no entry in
            # ``neighbors``; treat it as having no neighbours instead of
            # failing with KeyError.  extend() copies the items, so the
            # per-node lists are never aliased into ``communities``.
            communities.setdefault(cluster, []).extend(neighbors.get(node, ()))
            node_clusters[node] = cluster

        n_communities = len(communities)
        affinity_matrix = sp.zeros([n_communities, n_communities])
        for comm, comm_neighbors in communities.items():
            degree = Counter(node_clusters[member] for member in comm_neighbors)
            for key, count in degree.items():
                affinity_matrix[comm, key] = count

        ward = Ward(n_clusters=6)
        predicts = ward.fit_predict(affinity_matrix)

        return [predicts[node_clusters[node]] for node in nodes]
Esempio n. 5
0
    def constraint(self, nodes, edges, lables):
        """Run connectivity-constrained Ward clustering on ``self.A``.

        Nodes are grouped by label, the size of every group is printed, and
        a node-by-node connectivity matrix is built that links all members
        of the group labelled ``36`` with each other.  That matrix is passed
        to ``Ward`` as ``connectivity`` and the predicted labels for
        ``self.A`` are printed.

        Parameters:
            nodes: list of node ids (``list.index`` is used on it).
            edges: accepted but not used.
            lables: ``lables[i]`` is the label of ``nodes[i]``.

        Returns None; results are only printed.  Raises KeyError when no
        node carries the label ``36``.
        """
        if len(nodes) != len(lables):
            print("#nodes(%d) != #clusters(%d)" % (len(nodes), len(lables)))

        n_nodes = len(nodes)

        # label -> nodes carrying that label
        circles = {}
        for i in range(n_nodes):
            circles.setdefault(lables[i], []).append(nodes[i])

        # Iterating the dict yields its keys on Python 2 and Python 3 alike;
        # dict.iterkeys() exists on Python 2 only.
        for key in circles:
            print(key, len(circles[key]))

        c = 36  # hard-coded label of the group used as guidance
        # Resolve every member to its (first) position in ``nodes`` once,
        # rather than calling nodes.index() again inside the inner loop.
        member_rows = [nodes.index(member) for member in circles[c]]

        guidance_matrix = sp.zeros([n_nodes, n_nodes])
        for i in member_rows:
            for j in member_rows:
                guidance_matrix[i, j] = 1.0

        guidance_matrix = sparse.lil_matrix(guidance_matrix)

        print(guidance_matrix)
        ward = Ward(n_clusters=6, n_components=2, connectivity=guidance_matrix)
        predicts = ward.fit_predict(self.A)

        print(predicts)
Esempio n. 6
0
    def buildFromImageCollectionWard(self,
                                     pathTxtFile,
                                     pathDirImages,
                                     fileImageExtension,
                                     vocabularySize,
                                     maxNumImages=sys.maxsize):
        """Fit a Ward model on the SURF descriptors of an image collection.

        Parameters:
            pathTxtFile: file passed to ``self.readImageIdsFromTxtFile`` to
                obtain the image ids.
            pathDirImages: directory handed to the SURF extractor.
            fileImageExtension: image file extension handed to the extractor.
            vocabularySize: number of clusters for the Ward model (the
                original comment suggests values such as 4096).
            maxNumImages: upper bound on the number of images used; when
                more ids are available a random sample of this size is
                taken.  The default is ``sys.maxsize`` because
                ``sys.maxint`` does not exist on Python 3.

        The fitted model is stored in ``self.ward``; nothing is returned.
        """
        imageIds = self.readImageIdsFromTxtFile(pathTxtFile)

        # Cap the collection at maxNumImages with a random subset.
        if len(imageIds) > maxNumImages:
            imageIds = random.sample(imageIds, maxNumImages)

        # Extract the SURF descriptors of the selected images.
        surfExtractor = SurfExtractor(True)
        surfExtractor.processCollectionFilesImage(imageIds, pathDirImages,
                                                  fileImageExtension)

        # Stack the per-image descriptors into a single array.
        descriptors = surfExtractor.getDescriptors()
        arr_descriptor = np.vstack(tuple(descriptors))

        # NOTE: a MiniBatchKMeans vocabulary was a previous alternative here.
        self.ward = Ward(n_clusters=vocabularySize)
        self.ward.fit(arr_descriptor)
def compute_clusters(dataset, features_vector):
    """
    Fit the clustering model selected by ``cmd_options.clust_method``.

    The number of clusters is the number of distinct values in
    ``dataset.target``.  For ``"hclust"`` a ``Ward`` model is fitted on
    ``features_vector.toarray()``; for ``"kmeans"`` a ``KMeans`` model is
    fitted on ``features_vector`` directly.

    Returns the fitted estimator, or None for any other method name.
    """
    true_k = np.unique(dataset.target).shape[0]
    method = cmd_options.clust_method

    # Single-argument print() calls give the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Performing clustering with method  %s" % method.upper())
    print("")

    if method == "hclust":
        ward = Ward(n_clusters=true_k)
        ward.fit(features_vector.toarray())
        return ward

    if method == "kmeans":
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=1000,
                    verbose=1)
        km.fit(features_vector)
        return km

    # Unknown method: make the None result explicit.
    return None
Esempio n. 8
0
    def constraint(self, nodes, edges, lables):
        """Cluster ``self.A`` with Ward under a label-derived connectivity.

        Groups ``nodes`` by their entry in ``lables``, prints each group's
        size, links every pair of nodes in the group labelled ``36`` in an
        ``N x N`` matrix, and uses that matrix as the ``connectivity`` of a
        six-cluster ``Ward`` model.  The connectivity matrix and the
        predicted labels are printed.

        Parameters:
            nodes: list of node ids (``list.index`` is used on it).
            edges: accepted but not used.
            lables: ``lables[i]`` is the label of ``nodes[i]``.

        Returns None.  Raises KeyError when label ``36`` does not occur.
        """
        if len(nodes) != len(lables):
            print("#nodes(%d) != #clusters(%d)" % (len(nodes), len(lables)))

        N = len(nodes)

        # label -> nodes carrying that label
        circles = {}
        for i in range(N):
            circles.setdefault(lables[i], []).append(nodes[i])

        # dict.iterkeys() is Python 2 only; plain iteration over the dict
        # yields the same keys on both major versions.
        for key in circles:
            print(key, len(circles[key]))

        c = 36  # hard-coded label of the guidance group
        # Look up each member's (first) position once instead of repeating
        # nodes.index() for every pair.
        positions = [nodes.index(member) for member in circles[c]]

        guidance_matrix = sp.zeros([N, N])
        for i in positions:
            for j in positions:
                guidance_matrix[i, j] = 1.0

        guidance_matrix = sparse.lil_matrix(guidance_matrix)

        print(guidance_matrix)
        ward = Ward(n_clusters=6, n_components=2, connectivity=guidance_matrix)
        predicts = ward.fit_predict(self.A)

        print(predicts)
Esempio n. 9
0
def hierarchicalClustering(x, k):
    """Run Ward clustering on ``x`` and average the samples of each cluster.

    ``x`` is accessed as ``x[sample][dimension]``.  The result is the tuple
    ``(centroids, labels)``: ``centroids`` is a list holding one coordinate
    list per cluster id in ``range(k)``, ``labels`` is what
    ``Ward.fit_predict`` returned.
    """
    labels = Ward(n_clusters=k).fit_predict(np.asarray(x))

    # Per-cluster running sums, one list of zeros per cluster.
    centroids = [[0] * len(x[0]) for _ in range(k)]
    # Per-cluster sample counts.
    ctrs = np.zeros(k)

    for sample_idx in range(len(x)):
        owner = labels[sample_idx]
        sums = centroids[owner]
        for dim in range(len(sums)):
            sums[dim] += x[sample_idx][dim]
        ctrs[owner] += 1

    # Divide each sum by the size of its cluster to obtain the mean.
    for owner, sums in enumerate(centroids):
        for dim in range(len(sums)):
            sums[dim] = sums[dim] / ctrs[owner]

    return (centroids, labels)
Esempio n. 10
0
 def __hieclu(self):
     """Cluster ``self.data_matrix`` into ``self.k`` groups with Ward.

     Prints a progress message and returns the result of
     ``Ward.fit_predict``.
     """
     # Parenthesised single-argument print works as a Python 2 statement
     # and as a Python 3 function call with identical output.
     print('using hierarchical clustering......')
     # fit_predict() fits the model itself; a preceding fit() would only
     # repeat the same work.
     return Ward(n_clusters=self.k).fit_predict(self.data_matrix)
Esempio n. 11
0
    def agglomerate(self, nodes, edges, clusters):
        """Group the given clusters into six super-clusters with Ward.

        An ``N x N`` matrix is filled (``N`` = number of distinct cluster
        ids) where entry ``[a, b]`` is the number of neighbours of cluster
        ``a``'s members that belong to cluster ``b``.  ``Ward(n_clusters=6)``
        is fitted on its rows and every node receives the label predicted
        for its cluster.

        Parameters:
            nodes: sequence of hashable node ids.
            edges: iterable of pairs; ``edge[1]`` is recorded as a
                neighbour of ``edge[0]``.
            clusters: ``clusters[i]`` is the cluster id of ``nodes[i]``;
                ids are used as matrix indices and are therefore assumed to
                be ``0 .. N - 1`` (TODO confirm against callers).

        Returns:
            list of predicted labels, one per entry of ``nodes``.

        A length mismatch between ``nodes`` and ``clusters`` is printed but
        not treated as an error.
        """
        if len(nodes) != len(clusters):
            print("#nodes(%d) != #clusters(%d)" % (len(nodes), len(clusters)))

        # source node -> list of its neighbours
        neighbors = {}
        for edge in edges:
            neighbors.setdefault(edge[0], []).append(edge[1])

        node_clusters = {}  # node: its cluster id
        communities = {}    # cluster id: all neighbors for its members
        for i, node in enumerate(nodes):
            cluster = clusters[i]
            # Nodes without any outgoing edge are missing from
            # ``neighbors``; fall back to "no neighbours" rather than
            # raising KeyError.  extend() copies, so ``communities`` never
            # shares a list object with ``neighbors``.
            communities.setdefault(cluster, []).extend(neighbors.get(node, ()))
            node_clusters[node] = cluster

        N = len(communities)
        affinity_matrix = sp.zeros([N, N])
        for comm, comm_neighbors in communities.items():
            degree = Counter(node_clusters[member] for member in comm_neighbors)
            for key, count in degree.items():
                affinity_matrix[comm, key] = count

        ward = Ward(n_clusters=6)
        predicts = ward.fit_predict(affinity_matrix)

        return [predicts[node_clusters[node]] for node in nodes]
Esempio n. 12
0
def hieclu(data_matrix, k):
    """Run Ward clustering with ``k`` clusters on ``data_matrix``.

    Prints a progress message and returns the output of
    ``Ward.fit_predict``.
    """
    # print(...) with one argument behaves identically on Python 2 and 3.
    print('using hierarchical clustering......')
    # No separate fit(): fit_predict() already fits the estimator, so the
    # extra call only doubled the clustering cost.
    ac = Ward(n_clusters=k)
    return ac.fit_predict(data_matrix)
Esempio n. 13
0
def test_connectivity_popagation():
    """
    Fit Ward with a 10-nearest-neighbours connectivity graph on data that
    contains many duplicated points; the test succeeds if fit() returns.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array(
        [(.014, y) for y in (.120, .099, .097)]
        + [(.017, .153)] * 2
        + [(.018, .153)] * 7
        + [(.018, y) for y in (.152, .149, .144)]
    )

    neighbours = NearestNeighbors(n_neighbors=10, warn_on_equidistant=False)
    neighbours.fit(X)
    connectivity = neighbours.kneighbors_graph(X)

    # fit() raises IndexError when connectivity changes are not propagated
    # correctly while clusters are merged.
    model = Ward(n_clusters=4, connectivity=connectivity)
    model.fit(X)
	def __hieclu(self):
		"""Cluster ``self.data_matrix`` into ``self.k`` groups with Ward.

		Prints a progress message and returns the result of
		``Ward.fit_predict``.
		"""
		# Single-argument print(...) is valid and equivalent on Python 2 and 3.
		print('using hierarchical clustering......')
		# fit_predict() performs the fit; calling fit() first was redundant.
		return Ward(n_clusters=self.k).fit_predict(self.data_matrix)
Esempio n. 15
0
def test_connectivity_popagation():
    """
    Ward must fit without error when given a k-neighbours connectivity graph
    built from a sample with many duplicated points.
    """
    from sklearn.neighbors import kneighbors_graph

    leading = [(.014, .120), (.014, .099), (.014, .097)]
    duplicated = [(.017, .153)] * 2 + [(.018, .153)] * 7
    trailing = [(.018, .152), (.018, .149), (.018, .144)]
    X = np.array(leading + duplicated + trailing)

    graph = kneighbors_graph(X, 10)

    # Incorrect propagation of connectivity updates during merging makes
    # fit() crash with an IndexError.
    Ward(n_clusters=4, connectivity=graph).fit(X)
Esempio n. 16
0
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    # ``np.bool`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin ``bool`` selects the same dtype on every version.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(100, 50)
    # Grid connectivity for a 10 x 10 grid: one node per row of X.
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
Esempio n. 17
0
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    # Seeds NumPy's global random generator so that X is reproducible.
    np.random.seed(0)
    # The ``np.bool`` alias no longer exists in current NumPy releases; the
    # builtin ``bool`` is the same dtype and works everywhere.
    mask = np.ones([10, 10], dtype=bool)
    X = np.random.randn(100, 50)
    # Grid connectivity for a 10 x 10 grid: one node per row of X.
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
Esempio n. 18
0
def test_connectivity_fixing_non_lil():
    """
    Non-regression test: Ward must cope with a connectivity matrix that does
    not support item assignment and has more than one component.
    """
    # Two dummy samples.
    points = np.array([[0, 0], [1, 1]])
    # The mask keeps only the two diagonal cells, giving several components
    # and thereby forcing the connectivity to be fixed.
    keep = np.array([[True, False], [False, True]])
    graph = grid_to_graph(n_x=2, n_y=2, mask=keep)
    Ward(connectivity=graph).fit(points)
Esempio n. 19
0
def test_connectivity_fixing_non_lil():
    """
    Guard against a regression where a connectivity matrix without item
    assignment, made of several components, broke Ward.
    """
    samples = np.array([[0, 0], [1, 1]])

    # A diagonal-only mask yields a graph with several components, which
    # forces the connectivity-fixing code path.
    diagonal_mask = np.array([[True, False], [False, True]])
    connectivity = grid_to_graph(n_x=2, n_y=2, mask=diagonal_mask)

    estimator = Ward(connectivity=connectivity)
    estimator.fit(samples)
Esempio n. 20
0
def cluster_ward(classif_data, vect_data):
    """Cluster the training vectors into 10 groups with Ward.

    Prints the string ``Ward`` followed by the Euclidean silhouette score of
    the clustering and returns the predicted labels.

    Parameters:
        classif_data: kept for interface compatibility; not used.
        vect_data: mapping whose ``"train_vect"`` entry holds the training
            vectors.
    """
    # Only the training vectors are needed; arrays previously built from the
    # topic labels and the test vectors were never read.
    np_arr_train = np.array(vect_data["train_vect"])

    labels = Ward(n_clusters=10).fit_predict(np_arr_train)

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("Ward")
    sil_score = metrics.silhouette_score(np_arr_train, labels, metric='euclidean')
    print(sil_score)

    return labels
def get_km_segments(x, image, sps, n_segments=25):
    """Cluster colour/position features with connectivity-constrained Ward.

    ``x`` is a 2- or 3-item sequence starting with ``(feats, edges)``; a
    third item is ignored.  ``edges`` must expose ``.shape`` and ``.T`` and
    lists index pairs (presumably superpixel adjacency -- confirm against
    callers).  Features are the output of ``get_colors(image, sps)`` stacked
    with half of ``get_centers(sps)``.

    Returns the labels from ``Ward.fit_predict``.
    """
    if len(x) != 2:
        feats, edges, _ = x
    else:
        feats, edges = x

    colors_ = get_colors(image, sps)
    centers = get_centers(sps)

    # One graph node per entry of ``feats``, one unit-weight entry per edge.
    n_spixel = len(feats)
    weights = np.ones(edges.shape[0])
    directed = sparse.coo_matrix((weights, edges.T), shape=(n_spixel, n_spixel))
    # Adding the transpose makes the connectivity symmetric.
    symmetric = directed + directed.T

    color_feats = np.hstack([colors_, centers * 0.5])
    model = Ward(n_clusters=n_segments, connectivity=symmetric)
    return model.fit_predict(color_feats)
Esempio n. 22
0
def spectral_cluster(data, n_clusters, method='sl'):
    """Spectral clustering: embed with Laplacian eigenvectors, cluster with Ward.

    ``method`` picks the Laplacian helper: ``'NJW'`` and ``'self-tuning'``
    sort the eigenvalues in descending order, any other value uses
    ``get_lap_matrix_sl`` with ascending order.  The first ``n_clusters``
    eigenvectors are normalised with ``normal_eigen``, clustered with
    ``Ward`` and the result is passed to ``mat_plot_cluster_sample``.
    Nothing is returned.
    """
    # Build the Laplacian matrix and decide the eigenvalue ordering.
    if method == 'NJW':
        lap_matrix = get_lap_matrix_njw(data, 0.1)
        descending = True
    elif method == 'self-tuning':
        lap_matrix = get_lap_matrix_self_tuning(data)
        descending = True
    else:
        lap_matrix = get_lap_matrix_sl(data, 0.1)
        descending = False

    eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
    order = eigenvalues.argsort()
    if descending:
        order = order[::-1]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    # Keep the first n_clusters eigenvectors.
    x_matrix = eigenvectors[:, 0:n_clusters]
    # Normalise the eigenvector matrix.
    y_matrix = normal_eigen(x_matrix)

    # Disabled alternative: the hand-written k_means function.
    """
    k_dist_dic, k_centers_dic, cluster_group = kmeans.k_means(y_matrix, n_clusters)
    mat_plot_cluster_sample(data, cluster_group, method)
    """
    # Disabled alternatives: the hand-written bi_k_means function and
    # sklearn's KMeans.
    """center_list, cluster_assign = bikmeans.exe_bi_k_means(y_matrix, n_clusters)
    labels = cluster_assign[:, 0]
    mat_plot_cluster_sample(data, labels. method)

    # 调用sklearn中的KMeans函数,效果比自己写的强了好多
    k_means = KMeans(n_clusters)
    k_means.fit(y_matrix)
    #k_centers = k_means.cluster_centers_
    #mat_plot_cluster_sample(data, k_means.labels_, method)
    """
    # Cluster with sklearn's hierarchical (Ward) clustering.
    hie_cluster = Ward(n_clusters)
    hie_cluster.fit(y_matrix)
    mat_plot_cluster_sample(data, hie_cluster.labels_, method)
Esempio n. 23
0
def get_km_segments(x, image, sps, n_segments=25):
    """Segment by Ward clustering on colour and centre features.

    ``x`` unpacks to ``(feats, edges)`` or ``(feats, edges, <ignored>)``.
    ``len(feats)`` fixes the number of graph nodes and ``edges`` lists the
    connected index pairs (presumably superpixel neighbours -- verify with
    the caller).  Returns the labels predicted by ``Ward``.
    """
    if len(x) != 2:
        feats, edges, _ = x
    else:
        feats, edges = x

    colors_ = get_colors(image, sps)
    centers = get_centers(sps)

    node_count = len(feats)
    edge_weights = np.ones(edges.shape[0])
    graph = sparse.coo_matrix((edge_weights, edges.T),
                              shape=(node_count, node_count))
    # graph + graph.T symmetrises the edge list.
    connectivity = graph + graph.T

    # Colour features side by side with the centres scaled by one half.
    color_feats = np.hstack([colors_, centers * .5])
    return Ward(n_clusters=n_segments,
                connectivity=connectivity).fit_predict(color_feats)
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    """Cluster with Ward and print a set of clustering quality scores.

    Parameters:
        D: distance matrix; ``1 - D`` is what the Ward model is fitted on,
            and ``D`` itself is used for the silhouette score with
            ``metric='precomputed'``.
        y_true: ground-truth labels for the external scores.
        n_clusters: number of clusters requested from Ward.
        eps, min_samples: left over from a DBSCAN variant; not used.

    Prints the number of clusters, homogeneity, completeness, V-measure,
    adjusted Rand index, adjusted mutual information and silhouette
    coefficient.  Returns None.
    """
    labels_true = y_true

    # Transform the distance matrix into a similarity matrix.
    S = 1 - D

    labels = Ward(n_clusters=n_clusters).fit(S).labels_

    # Number of clusters in labels, ignoring a noise label (-1) if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Each message is formatted first and printed as one argument, so the
    # output is the same under Python 2 and Python 3.
    print('Number of clusters: %d' % n_clusters_)
    print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(
        labels_true, labels))
    print('V-meassure: %0.3f' % metrics.v_measure_score(labels_true, labels))
    print('Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(
        labels_true, labels))
    print('Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(
        labels_true, labels))
    print('Silhouette Coefficient: %0.3f' % metrics.silhouette_score(
        D, labels, metric='precomputed'))
Esempio n. 25
0
def spect_clust_segmentation(lena, regions=20):
    """Segment an image with grid-constrained Ward clustering and plot it.

    ``lena`` is flattened to one feature per element, clustered into
    ``regions`` clusters under a grid connectivity built from
    ``lena.shape``, and displayed in grey scale with one contour per
    cluster.  Timing and size information is printed.  Returns None.
    """
    pixels = np.reshape(lena, (-1, 1))
    grid = grid_to_graph(*lena.shape)

    print("Compute structured hierarchical clustering...")
    started = time.time()

    model = Ward(n_clusters=regions, connectivity=grid)
    model.fit(pixels)
    # Bring the flat label vector back to the image layout.
    label = np.reshape(model.labels_, lena.shape)

    print("Elapsed time: ", time.time() - started)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)

    plt.imshow(lena, cmap=plt.cm.gray)
    for region in range(regions):
        # One colour per cluster, taken from the spectral colour map.
        outline_color = plt.cm.spectral(region / float(regions))
        plt.contour(label == region, contours=1, colors=[outline_color])
    plt.show()
Esempio n. 26
0
def test_linkage_misc():
    """Miscellaneous checks on linkage handling, warnings and affinities."""
    X = np.ones((5, 5))

    # Unknown linkage names and a wrongly sized connectivity are rejected.
    bad_estimator = AgglomerativeClustering(linkage='foobar')
    assert_raises(ValueError, bad_estimator.fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    wrong_shape = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, X, connectivity=wrong_shape)

    # Smoke test FeatureAgglomeration.
    FeatureAgglomeration().fit(X)

    # Ward(copy=True) is expected to emit two warnings: one because Ward is
    # deprecated and one because of the copy argument.
    with warnings.catch_warnings(record=True) as ward_warnings:
        warnings.simplefilter("always", UserWarning)
        Ward(copy=True).fit(X)
    assert_equal(len(ward_warnings), 2)

    # ward_tree(copy=True) is expected to emit a single warning, for the
    # copy argument.
    with warnings.catch_warnings(record=True) as tree_warnings:
        warnings.simplefilter("always", UserWarning)
        ward_tree(X, copy=True)
    assert_equal(len(tree_warnings), 1)

    # A precomputed cosine distance matrix must give the same children as
    # letting linkage_tree compute the cosine affinity itself.
    from_distances = linkage_tree(cosine_distances(X), affinity="precomputed")
    from_features = linkage_tree(X, affinity="cosine")
    assert_array_equal(from_distances[0], from_features[0])
Esempio n. 27
0
def test_linkage_misc():
    """Miscellaneous checks on linkage handling, deprecation and affinities."""
    X = np.random.RandomState(42).normal(size=(5, 5))

    # Unknown linkage names and a wrongly sized connectivity are rejected.
    bad_estimator = AgglomerativeClustering(linkage='foo')
    assert_raises(ValueError, bad_estimator.fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    wrong_shape = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, X, connectivity=wrong_shape)

    # Smoke test FeatureAgglomeration.
    FeatureAgglomeration().fit(X)

    # Using the deprecated Ward class must emit exactly one warning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(caught), 1)

    # Hierarchical clustering on a precomputed distance matrix matches the
    # named "cosine" affinity.
    from_distances = linkage_tree(cosine_distances(X), affinity="precomputed")
    from_cosine = linkage_tree(X, affinity="cosine")
    assert_array_equal(from_distances[0], from_cosine[0])

    # Hierarchical clustering with a callable affinity matches the named
    # "manhattan" affinity.
    from_callable = linkage_tree(X, affinity=manhattan_distances)
    from_manhattan = linkage_tree(X, affinity="manhattan")
    assert_array_equal(from_callable[0], from_manhattan[0])
Esempio n. 28
0
	def cluster_tiestrength_kmeans(self,vertices=None, nclusters=2, cluster_prop='tsk'):
		"""Label vertices by Ward clustering of their Dice similarities.

		``vertices`` defaults to ``self.gs``.  The rows returned by
		``self.similarity_dice(vertices)`` are clustered into ``nclusters``
		groups and each vertex gets its label stored under ``cluster_prop``.
		Despite the name, the clustering is done with ``Ward``.
		"""
		if vertices is None:
			vertices = self.gs
		# similarity_dice returns a list of lists of floats.
		similarities = self.similarity_dice(vertices)
		model = Ward(nclusters).fit(similarities)
		for position, vertex in enumerate(vertices):
			vertex[cluster_prop] = model.labels_[position]
Esempio n. 29
0
    def _run_interface(self, runtime):
        """Cluster the matrix stored in ``in_File`` and save the labels as NIfTI.

        The algorithm is chosen by ``self.inputs.cluster_type``
        (``'spectral'``, ``'hiercluster'``, ``'kmeans'`` or ``'dbscan'``).
        Labels are shifted by one before saving and written to
        ``<base>_<n_clusters>_<cluster_type>_<hemi>.nii`` in the current
        directory.  Returns ``runtime`` unchanged.
        """
        # Load the data and drop singleton dimensions.
        corrmatrix = np.squeeze(nb.load(self.inputs.in_File).get_data())
        cluster_type = self.inputs.cluster_type

        if cluster_type == 'spectral':
            # Spectral clustering uses non-negative values: threshold at 0.
            positivecorrs = np.where(corrmatrix > 0, corrmatrix, 0)
            # Spectral clustering expects dtype=double values.
            newmatrix = np.asarray(positivecorrs, dtype=np.double)
            labels = spectral(newmatrix,
                              n_clusters=self.inputs.n_clusters,
                              eigen_solver='arpack',
                              assign_labels='discretize')
        elif cluster_type == 'hiercluster':
            model = Ward(n_clusters=self.inputs.n_clusters)
            labels = model.fit_predict(corrmatrix)
        elif cluster_type == 'kmeans':
            model = km(n_clusters=self.inputs.n_clusters)
            labels = model.fit_predict(corrmatrix)
        elif cluster_type == 'dbscan':
            labels = DBSCAN(eps=self.inputs.epsilon).fit_predict(corrmatrix)

        # +1 because cluster labels start at 0.
        new_img = nb.Nifti1Image(labels + 1, None)
        _, base, _ = split_filename(self.inputs.in_File)
        out_name = '_'.join([base,
                             str(self.inputs.n_clusters),
                             self.inputs.cluster_type,
                             self.inputs.hemi]) + '.nii'
        nb.save(new_img, os.path.abspath(out_name))

        return runtime
Esempio n. 30
0
    def doCoClustering(self,
                       leftClustCount,
                       rightClustCount,
                       clustPropName='coclust'):
        """Co-cluster a bipartite graph: left side first, then right side.

        The left vertices are clustered with Ward on their Dice similarity
        matrix.  For every right vertex, the edges to each left cluster are
        counted and divided by the vertex's degree; the right vertices are
        then clustered with Ward on the pairwise cosine distances between
        these profiles.

        Parameters:
            leftClustCount: number of clusters for the left vertices.
            rightClustCount: number of clusters for the right vertices.
            clustPropName: attribute under which each right vertex's
                cluster label is stored.

        Returns None; only the right vertices are annotated.
        """
        vsleft = self.left()
        simleft = np.matrix(self.similarity_dice(vsleft))
        clustleft = Ward(n_clusters=leftClustCount).fit(simleft).labels_

        vsright = self.right()
        # vertex index -> (isOnRightSide, index in left/right list)
        full2bipart = [(None, -1)] * self.vcount()
        for i, v in enumerate(vsleft):
            full2bipart[v.index] = (False, i)
        for i, v in enumerate(vsright):
            full2bipart[v.index] = (True, i)

        # Row per right vertex, column per left cluster: edge counts.
        m_rclust = np.zeros(shape=(len(vsright), leftClustCount))
        for e in self.es:
            srcOnRight, src = full2bipart[e.source]
            _, dst = full2bipart[e.target]
            if srcOnRight:
                vright, clust = src, clustleft[dst]
            else:
                vright, clust = dst, clustleft[src]
            m_rclust[vright, clust] += 1

        # Snapshot the non-zero cells together with their values before
        # writing back.  Previously ``val`` was read outside the
        # comprehension that bound it: a NameError on Python 3, and on
        # Python 2 the stale value of the last enumerated cell.
        nonzero_cells = [(cell, val)
                         for cell, val in np.ndenumerate(m_rclust)
                         if val]
        for (row, col), val in nonzero_cells:
            m_rclust[row, col] = float(val) / vsright[row].degree()

        simRight = cdist(m_rclust, m_rclust, 'cosine')
        clustright = Ward(n_clusters=rightClustCount).fit(simRight).labels_

        for i, c in enumerate(clustright):
            vsright[i][clustPropName] = c
Esempio n. 31
0
def max_diff_dist_idx(dist_mat, min_dist, max_dist):
    """Pick an exemplar row of ``dist_mat`` and the indices grouped with it.

    Rows are visited in order.  For each row the distances to all other
    nodes are taken; if the smallest of them exceeds ``max_dist`` the
    function returns ``(i, i)`` for that row immediately.  Otherwise the
    distances are split into two groups with Ward clustering and the gap
    between the two group maxima is measured.  The row with the largest gap
    becomes the exemplar.

    ``min_dist`` is accepted but not used.

    Returns:
        ``(exemplar_idx, max_cluster_idx)``.  On the early exit both items
        are the row index.  Otherwise ``max_cluster_idx`` is a list holding
        the members of the exemplar row's group with the smaller maximum,
        plus the exemplar itself.
    """
    num_nodes = dist_mat.shape[0]
    best_gap = -1
    best_row = 0
    best_labels = []

    for row, dist_vals in enumerate(dist_mat):
        # Every index except the row's own (excludes the self-distance).
        others = np.r_[0:row, row + 1:num_nodes]
        row_dists = dist_vals[others]

        if np.min(row_dists) > max_dist:
            return row, row

        # Hierarchical binary clustering of the row's distances.
        label = Ward(n_clusters=2).fit(row_dists[:, None]).labels_

        # Each group is represented by its maximum distance.
        group_max = np.zeros(2)
        group_max[0] = np.max(row_dists[label == 0])
        group_max[1] = np.max(row_dists[label == 1])

        signed_gap = group_max[0] - group_max[1]
        gap = abs(signed_gap)
        if best_gap < gap:
            best_others = others
            best_row = row
            best_gap = gap
            best_labels = label
            best_signed_gap = signed_gap

    # Keep the group whose maximum is the smaller of the two.
    keep_label = 1 if best_signed_gap > 0 else 0
    picked = set([]) | set(np.nonzero(best_labels == keep_label)[0])
    max_cluster_idx = list(
        set(best_others[list(picked)]) | set([best_row]))

    return best_row, max_cluster_idx
Esempio n. 32
0
File: cluster.py Progetto: zaycev/n7
 def ward(self, X, n_clusters, plot=True):
     """Cluster ``X`` with Ward and optionally scatter-plot the result.

     Parameters:
         X: data passed to ``Ward.fit`` and, when plotting, to
             ``RandomizedPCA(n_components=2)``.
         n_clusters: number of clusters.
         plot: when true, every sample is drawn in its 2-D projection,
             coloured by cluster label, and the figure is shown.

     Returns the fitted model's ``labels_``.  Existing figures are closed
     and figure 1 is reset regardless of ``plot``.
     """
     model = Ward(n_clusters=n_clusters, copy=False, compute_full_tree=True, memory="cache")
     model.fit(X)
     labels = model.labels_

     pl.close('all')
     pl.figure(1)
     pl.clf()

     if plot:
         colors = "rbgcmybgrcmybgrcmybgrcm" * 10
         X2d = RandomizedPCA(n_components=2).fit_transform(X)
         # enumerate() replaces xrange(), which does not exist on Python 3.
         for i, point in enumerate(X2d):
             # Wrap around the palette so that large label values cannot
             # index past its end.
             color = colors[labels[i] % len(colors)]
             pl.plot(point[0], point[1], "o", markerfacecolor=color, markeredgecolor=color, alpha=0.035)
         pl.show()

     return labels
Esempio n. 33
0
 def cluster_ward(self, calpha=True):
     '''
     Cluster the positively predicted residues using the Ward method.

     The coordinates of ``self.positive_surface_residues.ca`` are clustered
     into ``self.WARD_N_CLUSTERS`` groups under a 5-nearest-neighbours
     connectivity graph.

     Returns a dict mapping rank (0 = largest cluster) to the array of
     residue numbers in that cluster.  An empty dict is returned, after
     printing the pdb id and the coordinate shape, when fewer than 4
     coordinates are available.

     Raises NotImplementedError when ``calpha`` is false: only the C-alpha
     selection is implemented.
     '''
     if not calpha:
         # This path used to fail with an UnboundLocalError on
         # ``data_atoms``; fail with an explicit message instead.
         raise NotImplementedError("cluster_ward only supports calpha=True")

     data_atoms = self.positive_surface_residues.ca
     # Fetch the coordinates once and reuse them below.
     coords = data_atoms.getCoords()
     if coords.shape[0] < 4:
         # Formatted as one string so the output matches on Python 2 and 3.
         print("%s %s" % (self.pdbid, coords.shape))
         return {}

     connectivity = kneighbors_graph(coords, 5)
     ward = Ward(n_clusters=self.WARD_N_CLUSTERS, connectivity=connectivity)
     ward.fit(coords)

     resnums = data_atoms.getResnums()
     reslabels = ward.labels_
     # Largest cluster first.
     clusters = sorted([resnums[reslabels == i] for i in set(reslabels)], key=len, reverse=True)
     return dict(enumerate(clusters))
Esempio n. 34
0
def hac_derived_ordering(
    bags_file,
    num_clusters_multiplier=0.4
):
    """Derive hierarchy sets from Ward clustering and compare to ground truth.

    The document-term matrix from ``doc_term_mat_from_bags(bags_file)`` is
    clustered into ``int(num_clusters_multiplier * len(transcripts))``
    groups.  For transcript ``i`` (1-based ids) a set is built holding its
    own id and the ids of all earlier transcripts in the same cluster.

    Returns the result of ``compare_hierarchies`` on the list of sets.
    """
    # Messages are formatted into one string so that the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print('*HAC DERIVED ORDERING* %s' % (num_clusters_multiplier,))
    print('Starting Hierarchical Agglomerative Clustering analysis...')

    data, words, transcripts = doc_term_mat_from_bags(bags_file)
    n_clusters = int(num_clusters_multiplier * len(transcripts))
    # fit_predict() fits the model; a separate fit() beforehand is redundant.
    clust = Ward(n_clusters=n_clusters).fit_predict(data)

    hier_sets = []
    # cluster label -> ids of the transcripts seen so far in that cluster;
    # avoids rescanning all earlier transcripts for every transcript.
    seen_by_cluster = {}
    for i in range(len(transcripts)):
        earlier = seen_by_cluster.setdefault(clust[i], [])
        hier_sets.append(set([i + 1] + earlier))
        earlier.append(i + 1)
    return compare_hierarchies(hier_sets)
Esempio n. 35
0
def cluster_w_else(network, similarity_matrix, number_of_communities=20):
    """Group the keys of ``network`` into communities via Ward clustering.

    Parameters:
        network: mapping whose keys are the node ids; key ``i`` (in
            iteration order) is assumed to correspond to row ``i`` of
            ``similarity_matrix`` (TODO confirm against callers).
        similarity_matrix: data the Ward model is fitted on.
        number_of_communities: number of clusters.

    Returns an OrderedDict mapping each community index in
    ``range(number_of_communities)`` to the list of node ids assigned to it.
    """
    raw_communities = Ward(
        n_clusters=number_of_communities).fit(similarity_matrix).labels_
    # Materialise the keys once: dict views cannot be indexed on Python 3,
    # and on Python 2 ``network.keys()[i]`` rebuilt the list on every pass.
    node_ids = list(network.keys())
    communities = OrderedDict([(x, []) for x in range(number_of_communities)])
    for i in range(len(network)):
        community_idx = raw_communities[i]
        # -1 would mark noise for estimators that produce it; skip it.
        if community_idx != -1:
            communities[community_idx].append(node_ids[i])
    return communities
Esempio n. 36
0
    def cluster_hierarchically(self, raw_data, num_clusters, cmtrx=None):
        """Cluster ``raw_data`` with connectivity-constrained Ward.

        Parameters:
            raw_data: samples to cluster; ``raw_data.shape[0]`` is used
                when the connectivity matrix has to be generated.
            num_clusters: number of clusters.
            cmtrx: connectivity matrix; when None it is built with
                ``self.generate_connectivity_matrix``.

        Returns the fitted model's ``labels_``, or None (after printing a
        warning) when the name ``Ward`` is not defined.
        """
        if cmtrx is None:
            cmtrx = self.generate_connectivity_matrix(raw_data.shape[0])
        # Only the name lookup is guarded: a NameError raised while fitting
        # is a real bug and must not be reported as "Ward disabled".
        try:
            ward_cls = Ward
        except NameError:
            print('WARNING: sklearn Ward clustering disabled.')
            return None
        ward_clusters = ward_cls(n_clusters=num_clusters,
                                 connectivity=cmtrx).fit(raw_data)
        return ward_clusters.labels_
Esempio n. 37
0
def identify_communities(number_of_communities, similarity_matrix, node_ids):
    """Assign ``node_ids`` to communities found by Ward clustering.

    A ``Ward`` model with ``number_of_communities`` clusters is fitted on
    ``similarity_matrix``; ``node_ids[i]`` receives the label at position
    ``i``.  Returns an OrderedDict mapping community index to the list of
    its node ids, with one entry per distinct label (a ``-1`` label is not
    counted and its nodes are skipped).
    """
    model = Ward(n_clusters=number_of_communities)
    raw_communities = model.fit(similarity_matrix).labels_

    # Count distinct labels, leaving out -1 when it occurs.
    noise_offset = 1 if -1 in raw_communities else 0
    num_communities = len(set(raw_communities)) - noise_offset

    communities = OrderedDict((idx, []) for idx in range(num_communities))
    for position in range(len(node_ids)):
        label = raw_communities[position]
        if label == -1:
            continue
        communities[label].append(node_ids[position])
    return communities
Esempio n. 38
0
def main():
    """Run the clustering tutorial.

    Parses the command line, generates data, evaluates the selected quality
    criterion for Ward clusterings with 1 .. ``args.k`` clusters, and shows a
    two-panel figure: the clustered data and the criterion curve.

    Raises
    ------
    ValueError
        If ``args.criterion`` is not 'squared', 'diameter' or 'silhouette'
        (raised on the first evaluated clustering).
    """
    print("## Welcome to the clustering tutorial ##")
    args = parse_args()
    x, _ = generate_data(args.n)

    ks = numpy.arange(1, args.k + 1)
    crs = numpy.zeros(args.k)
    col = 'k'

    print("Computing %s clustering quality criterion" % args.criterion)
    # enumerate() replaces the Python-2-only xrange(); ks has args.k entries.
    for j, n_clusters in enumerate(ks):
        ward = Ward(n_clusters=n_clusters).fit(x)
        labels = ward.labels_

        if args.criterion == 'squared':
            crs[j] = squared_criterion(x, labels)
            col = 'r'
        elif args.criterion == 'diameter':
            crs[j] = diameter_criterion(x, labels)
            col = 'g'
        elif args.criterion == 'silhouette':
            crs[j] = silhouette_criterion(x, labels)
            col = 'b'
        else:
            # The message used to run the two parts together without a
            # separator ("Wrong criterionfoo").
            raise ValueError("Wrong criterion: %s" % args.criterion)

    pylab.figure(figsize=(12, 6))

    # NOTE(review): args.n is also the argument given to generate_data();
    # confirm it is really meant as the number of clusters to display.
    ward = Ward(n_clusters=args.n).fit(x)
    labels = ward.labels_

    pylab.subplot(1, 2, 1)
    plot_data(x, labels)

    pylab.subplot(1, 2, 2)
    plot_criterion(ks, crs, col)

    pylab.show()
Esempio n. 39
0
def compute_clusters(dataset,features_vector):
    """
    Fit the clustering method selected by ``cmd_options.clust_method``.

    The number of clusters is the number of distinct values in
    ``dataset.target``.

    Parameters
    ----------
    dataset : object exposing a ``target`` attribute.
    features_vector : feature matrix. For "hclust" it must provide
        ``toarray()`` because Ward is fitted on the dense array.

    Returns
    -------
    The fitted ``Ward`` estimator for "hclust", the fitted ``KMeans``
    estimator for "kmeans", and ``None`` for any other method name.
    """
    labels = dataset.target
    true_k = np.unique(labels).shape[0]
    method = cmd_options.clust_method

    # Run clustering method.
    # Single-argument print() calls behave the same on Python 2 and 3; the
    # double space reproduces the output of the former print statement.
    print("Performing clustering with method  %s" % method.upper())
    print("")

    if method == "hclust":
        result = features_vector.toarray()
        ward = Ward(n_clusters=true_k)
        ward.fit(result)
        return ward

    if method == "kmeans":
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, verbose=1)
        km.fit(features_vector)
        return km

    # Unknown method: keep the historical behaviour of returning None.
    return None
Esempio n. 40
0
def do_experiments(dataset):
    """Compare ITM, Ward and KMeans on ``dataset`` and print their scores.

    For every clusterer the function prints ARI, AMI and NMI against the
    ground-truth labels, the ``tree_information`` objective and the time
    spent in ``fit``. The objective of the ground-truth labelling is printed
    last.

    Parameters
    ----------
    dataset : object exposing ``data``, ``target`` and ``DESCR``. The first
        line of ``DESCR`` is used as the dataset name.
    """
    X, y = dataset.data, dataset.target
    dataset_name = dataset.DESCR.split('\n')[0]
    if dataset_name.startswith("Iris"):
        # iris has duplicate data points. That messes up our
        # MeanNN implementation.
        from scipy.spatial.distance import pdist, squareform
        dist = squareform(pdist(X))
        # Subtracting 1 lets zero distances (-1) be told apart from the
        # zeros that tril() writes above the diagonal.
        doubles = np.unique(np.where(np.tril(dist - 1, -1) == -1)[0])
        # The ``np.bool`` alias was removed from NumPy; the builtin is
        # equivalent.
        mask = np.ones(X.shape[0], dtype=bool)
        mask[doubles] = False
        X = X[mask]
        y = y[mask]

    n_clusters = len(np.unique(y))
    print("\n\nDataset %s samples: %d, features: %d, clusters: %d" %
          (dataset_name, X.shape[0], X.shape[1], n_clusters))
    print("=" * 70)

    classes = [
        ITM(n_clusters=n_clusters),
        ITM(n_clusters=n_clusters, infer_dimensionality=True),
        Ward(n_clusters=n_clusters),
        KMeans(n_clusters=n_clusters)
    ]
    names = ["ITM", "ITM ID", "Ward", "KMeans"]
    for clusterer, method in zip(classes, names):
        start = time()
        clusterer.fit(X)
        # Stop the clock before scoring so the reported time covers the
        # clustering itself and not the metric computations below.
        runtime = time() - start
        y_pred = clusterer.labels_

        ari = adjusted_rand_score(y, y_pred)
        ami = adjusted_mutual_info_score(y, y_pred)
        nmi = normalized_mutual_info_score(y, y_pred)
        objective = tree_information(X, y_pred)

        print("%-15s ARI: %.3f, AMI: %.3f, NMI: %.3f objective: %.3f time:"
              "%.2f" % (method, ari, ami, nmi, objective, runtime))

    i_gt = tree_information(X, y)
    print("GT objective: %.3f" % i_gt)
Esempio n. 41
0
def test_linkage_misc():
    # Assorted checks on the linkage code paths
    data = np.ones((5, 5))

    # An unknown linkage name is rejected by the estimator and the function
    bad_estimator = AgglomerativeClustering(linkage='foobar')
    assert_raises(ValueError, bad_estimator.fit, data)
    assert_raises(ValueError, linkage_tree, data, linkage='foobar')

    # A connectivity matrix whose shape does not match the data is rejected
    mismatched = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, data, connectivity=mismatched)

    # FeatureAgglomeration smoke test
    FeatureAgglomeration().fit(data)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", UserWarning)
        # Passing the copy argument triggers a warning
        Ward(copy=True).fit(data)
    # Two warnings are expected: Ward itself is deprecated, and so is the
    # copy argument
    assert_equal(len(caught), 2)
Esempio n. 42
0
def cluster(dump_path, file_name, n_clusters=200):
    """Cluster the samples stored in ``file_name`` with Ward and dump labels.

    The file is read with ``np.loadtxt``, the samples are standardised with
    ``scale`` and clustered without connectivity constraints. Two files are
    written, both named by plain concatenation with ``dump_path``:

    * ``dump_path + "ward_labels.list"``: one integer label per line;
    * ``dump_path + "_num_clusters_ward.info"``: the value of ``n_clusters``.

    Parameters
    ----------
    dump_path : str
        Prefix prepended to the two output file names.
    file_name : str
        Text file readable by ``np.loadtxt``.
    n_clusters : int, optional
        Number of clusters requested from Ward.

    Returns
    -------
    float
        Always ``0.0``; the clustering-quality evaluation is disabled.
    """
    # Obtain data from file. unpack=True transposes on load, so transpose
    # back to get one row per sample.
    data = np.loadtxt(file_name, unpack=True)
    X = scale(np.transpose(data))

    ###############################################################################
    # Compute clustering
    print("Compute unstructured hierarchical clustering...")
    st = time.time()
    ward = Ward(n_clusters=n_clusters).fit(X)
    label = ward.labels_
    print("Elapsed time: ", time.time() - st)
    print("Number of points: ", label.size)

    # Context managers close the files even if a write fails.
    with open(dump_path + "ward_labels.list", 'w') as fp:
        fp.writelines("%d\n" % i for i in label)

    with open(dump_path + "_num_clusters_ward.info", 'w') as fp:
        fp.write("%d" % n_clusters)

    # The former ``ward.cluster_centers_`` lookup was dropped: Ward does not
    # expose that attribute, so it raised AttributeError before the function
    # could return, and its value was never used.

    score = 0.0
    # print "evaluating performance..."
    # score = metrics.silhouette_score(X, label, metric='euclidean', sample_size=20000)
    # print "evaluation done."

    return score
def clusterRT_ward(values) :
    """Split ``values`` into runs of consecutive equal Ward labels.

    The values are sorted, clustered into two Ward clusters as
    one-dimensional samples, and then walked in sorted order; a new group is
    started every time the label changes.

    Parameters
    ----------
    values : sequence of numbers

    Returns
    -------
    list of lists
        Groups of sorted values. ``[]`` for empty input and a single group
        for a single value (two clusters cannot be formed from one sample).
    """
    if len(values) == 0:
        return []

    ordered = sorted(values)
    if len(ordered) == 1:
        return [ordered]

    #connectivity = kneighbors_graph(np.asarray(v), n_neighbors=3)
    samples = np.asarray([[val] for val in ordered])
    labels = Ward(n_clusters=2).fit(samples).labels_

    # labels[i] belongs to ordered[i]. The groups must therefore be built
    # from the sorted values; indexing the caller's (possibly unsorted)
    # ``values`` paired labels with the wrong elements.
    cl_output = [[ordered[0]]]
    for prev_label, label, val in zip(labels, labels[1:], ordered[1:]):
        if label != prev_label:
            cl_output.append([])
        cl_output[-1].append(val)

    return cl_output
Esempio n. 44
0
# Generate data
# Widen the dtype before summing: image readers typically return uint8, and
# adding four uint8 pixels silently wraps around at 256.
lena = misc.imread('dyfoc.png').astype(np.int64)
# Downsample by summing every 2x2 block of pixels (4x fewer pixels)
lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
# One sample per pixel, with the pixel value as its single feature
X = np.reshape(lena, (-1, 1))

###############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*lena.shape)

###############################################################################
# Compute clustering
print("Compute structured hierarchical clustering...")
st = time.time()
n_clusters = 15  # number of regions
ward = Ward(n_clusters=n_clusters, connectivity=connectivity).fit(X)
label = np.reshape(ward.labels_, lena.shape)
print("Elapsed time: ", time.time() - st)
print("Number of pixels: ", label.size)
print("Number of clusters: ", np.unique(label).size)

###############################################################################
# Plot the results on an image
pl.figure(figsize=(5, 5))
pl.imshow(lena, cmap=pl.cm.gray)
# Draw the outline of each region in its own colour.
# NOTE(review): the ``contours`` keyword and ``cm.spectral`` look specific to
# older matplotlib releases; confirm against the installed version.
for l in range(n_clusters):
    pl.contour(label == l, contours=1,
               colors=[pl.cm.spectral(l / float(n_clusters)), ])
pl.xticks(())
pl.yticks(())
pl.show()
Esempio n. 45
0
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    import shutil

    rnd = np.random.RandomState(0)
    # The ``np.bool`` alias was removed from NumPy; the builtin is equivalent.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    # test caching
    tempdir = mkdtemp()
    try:
        clustering = Ward(n_clusters=10, connectivity=connectivity,
                          memory=tempdir)
        clustering.fit(X)
        labels = clustering.labels_
        assert_true(np.size(np.unique(labels)) == 10)
    finally:
        # Do not leave the cache directory behind after the test run.
        shutil.rmtree(tempdir, ignore_errors=True)
    # Turn caching off now
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    # Check that we obtain the same solution with early-stopping of the
    # tree building
    clustering.compute_full_tree = False
    clustering.fit(X)
    np.testing.assert_array_equal(clustering.labels_, labels)
    clustering.connectivity = None
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
    # Check that we raise a TypeError on dense matrices
    clustering = Ward(n_clusters=10,
                      connectivity=connectivity.todense())
    assert_raises(TypeError, clustering.fit, X)
    # A connectivity matrix of the wrong size must raise a ValueError
    clustering = Ward(n_clusters=10,
                      connectivity=sparse.lil_matrix(
                          connectivity.todense()[:10, :10]))
    assert_raises(ValueError, clustering.fit, X)
Esempio n. 46
0
    def encode(self, interm_rep, neighborhood_size = 26,
               clust_ratio=10,
               encoding='geometrical',
               similarity_measure='pearson',
               threshold=0.3, n_jobs=1, **kwds):
        """
        Parameters
        ----------
        interm_rep: IntermRep
            IntermRep object containing the arr_xyz and arr_voxel matrixes.
        neighborhood_size: int
            Number of neighbors each voxel will be connected to.
        clust_ratio: int
            The number of clusters will be equal to n/clust_ratio, where n is
            the number of voxels.
        encoding: string
            Type of encoding. 'geometrical' and 'functional' are allowed.
        similarity_measure: string
            Similarity measure used to compare the representative value of each
            parcel (cluster). 'pearson' or the measures available in scikit-learn
            are allowed.
        threshold: float
            Threshold applied to the similarity values in order to define the
            edges in the graph.

        Returns
        -------
        g: Graph
            Networkx graph representing the graph encoding of the data.
        """

        #computing the connectivity matrix, each voxel is connected to
        #"neighborhood_size" neighbors.
        #
        conn = kneighbors_graph(interm_rep.arr_xyz, n_neighbors=neighborhood_size)
#        conn_n = kneighbors_graph(interm_rep.arr_xyz, n_neighbors=neighborhood_size)
#        conn_r = radius_neighbors_graph(interm_rep.arr_xyz, radius=10)
#        conn = conn_n * conn_r

        #Hierarchical clustering algorithm. The number of clusters is defined
        #accoring to the parameter "clust_ratio".
        ward = Ward(n_clusters=len(interm_rep.arr_xyz)/clust_ratio, connectivity=conn)
        #ward = Ward(n_clusters=60, connectivity=conn)

        #Type of encoding: geometrical (only xyz data is used) or
        # functional (voxel time series is used).
        if encoding=='geometrical':
            ward.fit(interm_rep.arr_xyz)
        elif encoding=='functional':
            ward.fit(interm_rep.arr_voxels)

        labels = ward.labels_

        #Plotting the voxels with the cluster labels.
        #pp.plot_clustering_intermediate_representation(interm_rep, labels*10)


        #Computing the unique cluster indentifiers
        l_unique = np.unique(labels)

        mean_voxels = np.zeros((len(l_unique), interm_rep.arr_voxels.shape[1]))
        mean_xyz = np.zeros((len(l_unique), interm_rep.arr_xyz.shape[1]))

        cont = 0
        for i in l_unique:
            #Taking the possitions corresponding to the same cluster.
            pos = np.where(labels == i)[0]
            #Taking data from these possitions and computing the mean time serie
            m_voxel = interm_rep.arr_voxels[pos].mean(0)
            #Taking the xyz from these positions and computing the mean value
            m_xyz = interm_rep.arr_xyz[pos].mean(0)

            mean_voxels[cont] = m_voxel
            mean_xyz[cont] = m_xyz

            cont += 1


        #plotting the voxels time series for each cluster
        #pp.plot_interm_representation_time_series(ir.IntermRep(mean_voxels, mean_xyz))

        #The new intermediate representation is given by mean_voxels and
        # mean_xyz.

        #Computing similarity matrix and applying the threshold
        adj_mat = np.zeros((len(mean_voxels), len(mean_voxels)),
                           dtype = np.byte)
        for j in range(len(mean_voxels) - 1):
            for k in range(j + 1, len(mean_voxels)):
                if similarity_measure == 'pearson':
                    aux = st.pearsonr(mean_voxels[j], mean_voxels[k])[0]
                else:
                    aux = skpw.pairwise_kernel(mean_voxels[j], mean_voxels[k],
                                               metric = similarity_measure,
                                               n_jobs = n_jobs)
                if aux >= threshold:
                    adj_mat[j,k] = 1
                    adj_mat[k,j] = 1


#        #Weighted encoding (for graph kernels that work with weighted graphs)
#        #------------------------------------
#        adj_mat = np.zeros((len(mean_voxels), len(mean_voxels)),
#                           dtype = np.float)
#        for j in range(len(mean_voxels) - 1):
#            for k in range(j + 1, len(mean_voxels)):
#                if similarity_measure == 'pearson':
#                    aux = st.pearsonr(mean_voxels[j], mean_voxels[k])[0]
#                else:
#                    aux = skpw.pairwise_kernel(mean_voxels[j], mean_voxels[k],
#                                               metric = similarity_measure,
#                                               n_jobs = n_jobs)
##                if aux >= threshold:
##                    adj_mat[j,k] = aux
##                    adj_mat[k,j] = aux
#                adj_mat[j,k] = adj_mat[k,j] = aux
#        adj_mat = (adj_mat - np.mean(adj_mat))/np.std(adj_mat)
#        adj_mat = (adj_mat - np.min(adj_mat))/(np.max(adj_mat) - np.min(adj_mat))
#        adj_mat = np.where(adj_mat>=threshold, 1, 0)
#        #------------------------------------


        #Building the graph from the adjacency matrix
        g = nx.from_numpy_matrix(adj_mat)

        #Spliting the node degrees into some categories and using them as node labels.
#        num_lab = 5
        deg = g.degree()
#        for k in deg:
#            deg[k]/= num_lab
        nx.set_node_attributes(g, 'node_label', deg)

        ############
        #Storing the mean time-series of each parcell as a node attribute
        ts_att = {}
        mv = mean_voxels.tolist()
        for pos in range(len(mv)):
            ts_att[pos] = mv[pos]
        nx.set_node_attributes(g, 'time_series', ts_att)



        #Saving the graphs for CLFR subject (the one for which I have the structural data)
#        if interm_rep.subj_name == 'CLFR':
#            nx.write_gexf(g, 'graph_gephi_format.gexf')
#            np.savetxt('CLFR_clusters_xyz.txt', mean_xyz, fmt='%1d', delimiter=' ')
#            edges = np.array(np.where(adj_mat==1)).T
#            np.savetxt('CLFR_clusters_timeseries_cond%s.txt' %(interm_rep.cls), edges, fmt='%1d', delimiter=' ')


        #Plot Graphs
        #pp.plot_graph(mean_xyz, g)

        return g
Esempio n. 47
0
# Scores of the k-means clustering against the reference labels
print("Homogeneity k-means: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness k-means: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure k-means: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Silhouette Coefficient k-means: %0.3f" % metrics.silhouette_score(clustering, km.labels_, sample_size = 8000))

# DBSCAN with its default parameters
db = DBSCAN()
db.fit(clustering)
# Single-argument print() calls behave the same on Python 2 and Python 3,
# which keeps this script consistent with the print() calls around them.
print('DBSCAN clusters created..')

print("Homogeneity DBSCAN: %0.3f" % metrics.homogeneity_score(labels, db.labels_))
print("Completeness DBSCAN: %0.3f" % metrics.completeness_score(labels, db.labels_))
print("V-measure DBSCAN: %0.3f" % metrics.v_measure_score(labels, db.labels_))
print("Silhouette Coefficient DBSCAN: %0.3f" % metrics.silhouette_score(clustering, db.labels_, sample_size = 5000))

# Hierarchical (Ward) clustering; no connectivity constraint is passed
ward = Ward(n_clusters = 9)
ward.fit(clustering)
print('Hierarchical clusters created..')

print("Homogeneity hierarchical: %0.3f" % metrics.homogeneity_score(labels, ward.labels_))
print("Completeness hierarchical: %0.3f" % metrics.completeness_score(labels, ward.labels_))
print("V-measure hierarchical: %0.3f" % metrics.v_measure_score(labels, ward.labels_))
print("Silhouette Coefficient hierarchical: %0.3f" % metrics.silhouette_score(clustering, ward.labels_, sample_size = 5000))




Esempio n. 48
0
 def hierarchical(self, n_clusters):
     """Return what ``Ward.fit_predict`` yields for ``self.A`` as an array.

     ``n_clusters`` is the number of clusters requested from Ward.
     """
     model = Ward(n_clusters=n_clusters)
     samples = sp.array(self.A)
     return model.fit_predict(samples)
Esempio n. 49
0
def ward(X, n_clust):
    """Fit a Ward estimator with ``n_clust`` clusters on ``X`` and return it."""
    model = Ward(n_clusters=n_clust)
    model.fit(X)
    return model
Esempio n. 50
0
"""
Benchmark scikit-learn's Ward implementation against SciPy's
"""

import time

import numpy as np
from scipy.cluster import hierarchy
import pylab as pl

from sklearn.cluster import Ward

ward = Ward(n_clusters=3)

# Sample and feature counts on a log scale. They are used as array
# dimensions below, so they must be integers: logspace() returns floats,
# which NumPy no longer accepts as a shape.
n_samples = np.logspace(.5, 3, 9).astype(int)
n_features = np.logspace(1, 3.5, 7).astype(int)
N_samples, N_features = np.meshgrid(n_samples,
                                    n_features)
# Timing grids: rows follow n_features, columns follow n_samples
scikits_time = np.zeros(N_samples.shape)
scipy_time = np.zeros(N_samples.shape)

for i, n in enumerate(n_samples):
    for j, p in enumerate(n_features):
        X = np.random.normal(size=(n, p))
        # scikit-learn implementation
        t0 = time.time()
        ward.fit(X)
        scikits_time[j, i] = time.time() - t0
        # SciPy implementation on the same data
        t0 = time.time()
        hierarchy.ward(X)
        scipy_time[j, i] = time.time() - t0
Esempio n. 51
0
    if i != 'Combined Queries' and i != 'Report ID' and i != 'Object Name' and i != 'Report Name' and i != 'Operands':
        print i
        train = pd.concat([train, pd.get_dummies(raw_train[i])], axis=1)
        
# Sum the columns of ``train`` within each 'Report ID' group, then drop the
# 'Has Combined Queries' column (the second positional argument of drop()
# is the axis; 1 selects columns).
freq = train.groupby('Report ID').sum()
freq = freq.drop('Has Combined Queries', 1)


# Train Model #############################

# Both models below are asked for the same number of clusters.
num_cluster = 12

kmean = KMeans(n_clusters=num_cluster, max_iter=400, verbose = 0, n_jobs = 2, n_init=20, tol=1e-6)
model_kmean = kmean.fit(freq)
        
# Ward clustering without a connectivity constraint
ward = Ward(n_clusters=num_cluster)
model_ward = ward.fit(freq)


# k-nearest-neighbours graph (4 neighbours) over the same data. In the lines
# visible here it is only consumed by the commented-out Ward variant below.
from sklearn.neighbors import kneighbors_graph
connectivity = kneighbors_graph(freq, n_neighbors=4)

#ward = Ward(n_clusters=num_cluster, connectivity = connectivity)
#model_ward = ward.fit(freq)

# Visualization #####################################################

import mpl_toolkits.mplot3d.axes3d as p3
import pylab as pl
from sklearn.datasets.samples_generator import make_friedman3