コード例 #1
0
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.

    The test also covers fitting with the ``memory`` argument set to a
    temporary directory, fitting with ``compute_full_tree`` disabled,
    fitting without connectivity, and the errors raised for a dense or
    wrongly shaped connectivity matrix.
    """
    import shutil

    rnd = np.random.RandomState(0)
    # Builtin ``bool`` instead of the ``np.bool`` alias, which was deprecated
    # in NumPy 1.20 and removed in 1.24.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    # test caching
    cache_dir = mkdtemp()
    try:
        clustering = Ward(n_clusters=10, connectivity=connectivity,
                          memory=cache_dir)
        clustering.fit(X)
        labels = clustering.labels_
    finally:
        # Do not leave the temporary cache directory behind.
        shutil.rmtree(cache_dir, ignore_errors=True)
    assert_true(np.size(np.unique(labels)) == 10)
    # Turn caching off now
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    # Check that we obtain the same solution with early-stopping of the
    # tree building
    clustering.compute_full_tree = False
    clustering.fit(X)
    np.testing.assert_array_equal(clustering.labels_, labels)
    clustering.connectivity = None
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
    # Check that we raise a TypeError on dense matrices
    clustering = Ward(n_clusters=10,
                      connectivity=connectivity.todense())
    assert_raises(TypeError, clustering.fit, X)
    # A connectivity matrix of the wrong shape must raise a ValueError
    clustering = Ward(n_clusters=10,
                      connectivity=sparse.lil_matrix(
                          connectivity.todense()[:10, :10]))
    assert_raises(ValueError, clustering.fit, X)
コード例 #2
0
def hierarchicalClustering(x, k):
    """Cluster ``x`` into ``k`` groups with Ward and average each group.

    Returns a ``(centroids, labels)`` tuple. ``centroids`` is a list of ``k``
    lists, one mean vector per cluster label; ``labels`` is the result of
    ``Ward.fit_predict`` on ``np.asarray(x)``.
    """
    labels = Ward(n_clusters=k).fit_predict(np.asarray(x))

    # One zero-filled accumulator row per cluster (a list of lists).
    centroids = [[0 for _ in range(len(x[0]))] for _ in range(k)]

    # Number of samples assigned to each cluster.
    counts = np.zeros(k)

    # Accumulate the per-cluster coordinate sums and sample counts.
    for idx in range(len(x)):
        cluster = labels[idx]
        sums = centroids[cluster]
        for dim in range(len(sums)):
            sums[dim] += x[idx][dim]
        counts[cluster] += 1

    # Divide every sum by its cluster size to obtain the mean vectors.
    for cluster, sums in enumerate(centroids):
        for dim in range(len(sums)):
            sums[dim] = sums[dim] / counts[cluster]

    return (centroids, labels)
コード例 #3
0
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    """Cluster with Ward and print clustering quality scores.

    Parameters
    ----------
    D : distance matrix. It is turned into ``S = 1 - D`` for clustering and
        passed unchanged to ``metrics.silhouette_score`` with
        ``metric='precomputed'``.
    y_true : reference labels the predicted labels are compared against.
    n_clusters : number of clusters requested from ``Ward``.
    eps, min_samples : unused. They belong to an earlier DBSCAN-based
        variant and are kept so that existing callers keep working.

    Nothing is returned; all scores are printed.
    """
    labels_true = y_true

    # Transform the distance matrix into a similarity matrix.
    S = 1 - D

    # Ward replaces the DBSCAN call this function originally used.
    # NOTE(review): Ward receives S as its input data; confirm that
    # clustering the rows of the similarity matrix is what is intended.
    db = Ward(n_clusters=n_clusters).fit(S)
    labels = db.labels_

    # Number of clusters in labels, ignoring the noise label (-1) if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print('Number of clusters: %d' % n_clusters_)
    print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(
        labels_true, labels))
    print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, labels))
    print('Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(
        labels_true, labels))
    print('Adjusted Mutual Information: %0.3f' %
          metrics.adjusted_mutual_info_score(labels_true, labels))
    print('Silhouette Coefficient: %0.3f' % metrics.silhouette_score(
        D, labels, metric='precomputed'))
コード例 #4
0
def spect_clust_segmentation(lena, regions=20):
    """Segment a 2-D image with connectivity-constrained Ward clustering.

    Every pixel becomes a one-feature sample, ``grid_to_graph`` supplies the
    connectivity for the image shape, and the image is displayed with one
    contour per cluster label.

    Parameters
    ----------
    lena : 2-D array (its shape is unpacked into ``grid_to_graph``).
    regions : number of clusters requested from ``Ward``.
    """
    X = np.reshape(lena, (-1, 1))

    connectivity = grid_to_graph(*lena.shape)

    print("Compute structured hierarchical clustering...")

    st = time.time()

    n_clusters = regions
    ward = Ward(n_clusters=n_clusters, connectivity=connectivity).fit(X)
    label = np.reshape(ward.labels_, lena.shape)

    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)

    # ``spectral`` was renamed ``nipy_spectral`` and later removed from
    # Matplotlib; prefer the new name and fall back to the old one.
    colormap = getattr(plt.cm, "nipy_spectral", None)
    if colormap is None:
        colormap = plt.cm.spectral

    plt.imshow(lena, cmap=plt.cm.gray)
    for cluster_id in range(n_clusters):
        # The former ``contours=1`` keyword is not a contour() option, so it
        # is no longer passed.
        plt.contour(label == cluster_id,
                    colors=[
                        colormap(cluster_id / float(n_clusters)),
                    ])
    plt.show()
コード例 #5
0
def test_linkage_misc():
    # Miscellaneous checks on the linkage helpers
    data = np.ones((5, 5))

    # Invalid linkage names and badly shaped connectivity must be rejected
    bad_linkage_fit = AgglomerativeClustering(linkage='foobar').fit
    assert_raises(ValueError, bad_linkage_fit, data)
    assert_raises(ValueError, linkage_tree, data, linkage='foobar')
    bad_connectivity = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, data,
                  connectivity=bad_connectivity)

    # FeatureAgglomeration only needs to fit without raising
    FeatureAgglomeration().fit(data)

    # Ward(copy=True) is expected to emit two warnings: one because the Ward
    # class is deprecated and one because of the copy argument
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", UserWarning)
        Ward(copy=True).fit(data)
    assert_equal(len(caught), 2)

    # ward_tree(copy=True) is expected to emit a single warning, for the
    # copy argument
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", UserWarning)
        ward_tree(data, copy=True)
    assert_equal(len(caught), 1)

    # Hierarchical clustering on a precomputed distance matrix must give the
    # same children as the equivalent named affinity
    distances = cosine_distances(data)
    precomputed = linkage_tree(distances, affinity="precomputed")
    reference = linkage_tree(data, affinity="cosine")
    assert_array_equal(precomputed[0], reference[0])
コード例 #6
0
def compute_clusters(dataset, features_vector):
    """
    Apply the clustering method selected by ``cmd_options.clust_method``.

    The number of clusters is the number of distinct values in
    ``dataset.target``.

    Returns the fitted ``Ward`` model for ``"hclust"``, the fitted ``KMeans``
    model for ``"kmeans"``, and ``None`` for any other method name.
    """

    labels = dataset.target
    true_k = np.unique(labels).shape[0]

    method = cmd_options.clust_method

    # Run clustering method.
    # Single-argument print(...) gives the same output under Python 2 and 3;
    # the double space reproduces the original comma-separated print.
    print("Performing clustering with method  %s" % method.upper())
    print("")

    if method == "hclust":
        # Ward is fitted on the dense form of the feature matrix.
        result = features_vector.toarray()
        ward = Ward(n_clusters=true_k)
        ward.fit(result)

        return ward

    if method == "kmeans":
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=1000,
                    verbose=1)
        km.fit(features_vector)

        return km

    # Unknown method: nothing is fitted.
    return None
コード例 #7
0
	def cluster_tiestrength_kmeans(self,vertices=None, nclusters=2, cluster_prop='tsk'):
		"""Cluster vertices by Dice similarity and store each label on its vertex.

		``vertices`` defaults to ``self.gs``. Despite the method name, the
		clustering is done with ``Ward``; every vertex receives its label
		under the key ``cluster_prop``.
		"""
		targets = self.gs if vertices is None else vertices
		# similarity_dice yields a list of lists of float similarities
		similarity = self.similarity_dice(targets)
		model = Ward(nclusters).fit(similarity)
		for position, vertex in enumerate(targets):
			vertex[cluster_prop] = model.labels_[position]
コード例 #8
0
    def buildFromImageCollectionWard(self,
                                     pathTxtFile,
                                     pathDirImages,
                                     fileImageExtension,
                                     vocabularySize,
                                     maxNumImages=getattr(
                                         sys, 'maxint', sys.maxsize)):
        """Fit a Ward clustering on SURF descriptors of an image collection.

        Parameters
        ----------
        pathTxtFile : text file the image ids are read from.
        pathDirImages : directory containing the images.
        fileImageExtension : extension of the image files.
        vocabularySize : number of clusters requested (could be 4096).
        maxNumImages : upper bound on the number of images used; when more
            ids are available a random sample of this size is taken. The
            default is ``sys.maxint`` where it exists (Python 2) and
            ``sys.maxsize`` otherwise, i.e. effectively no limit.

        The fitted model is stored in ``self.ward``; nothing is returned.
        """
        # Read the image IDs
        imageIds = self.readImageIdsFromTxtFile(pathTxtFile)

        # If there are more images than the considered ones...
        if len(imageIds) > maxNumImages:
            imageIds = random.sample(imageIds, maxNumImages)

        # Extract the SURF descriptors from the collection of images
        surfExtractor = SurfExtractor(True)
        surfExtractor.processCollectionFilesImage(imageIds, pathDirImages,
                                                  fileImageExtension)

        # Stack the descriptors into a single numpy array
        descriptors = surfExtractor.getDescriptors()
        arr_descriptor = np.vstack(tuple(descriptors))

        # Ward is used here in place of an earlier MiniBatchKMeans variant.
        self.ward = Ward(n_clusters=vocabularySize)

        self.ward.fit(arr_descriptor)
コード例 #9
0
    def _run_interface(self, runtime):
        """Cluster the matrix stored in ``inputs.in_File`` and save the labels.

        The clustering algorithm is chosen by ``inputs.cluster_type``
        (``'spectral'``, ``'hiercluster'``, ``'kmeans'`` or ``'dbscan'``).
        The labels, shifted by one, are written to a ``.nii`` file in the
        current directory whose name combines the input base name, the
        number of clusters, the cluster type and the hemisphere.

        Returns ``runtime`` unchanged.

        Raises
        ------
        ValueError
            If ``inputs.cluster_type`` is none of the supported values
            (previously this surfaced as an unbound ``labels`` variable).
        """
        #load data
        data = nb.load(self.inputs.in_File).get_data()
        corrmatrix = np.squeeze(data)
        cluster_type = self.inputs.cluster_type
        if cluster_type == 'spectral':
            positivecorrs = np.where(
                corrmatrix > 0, corrmatrix,
                0)  #threshold at 0 (spectral uses non-negative values)
            newmatrix = np.asarray(
                positivecorrs,
                dtype=np.double)  #spectral expects dtype=double values
            labels = spectral(newmatrix,
                              n_clusters=self.inputs.n_clusters,
                              eigen_solver='arpack',
                              assign_labels='discretize')
        elif cluster_type == 'hiercluster':
            labels = Ward(
                n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
        elif cluster_type == 'kmeans':
            labels = km(
                n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
        elif cluster_type == 'dbscan':
            labels = DBSCAN(eps=self.inputs.epsilon).fit_predict(corrmatrix)
        else:
            raise ValueError(
                "Unsupported cluster_type: %r" % (cluster_type,))

        new_img = nb.Nifti1Image(labels + 1,
                                 None)  #+1 because cluster labels start at 0
        _, base, _ = split_filename(self.inputs.in_File)
        nb.save(
            new_img,
            os.path.abspath(base + '_' + str(self.inputs.n_clusters) + '_' +
                            self.inputs.cluster_type + '_' + self.inputs.hemi +
                            '.nii'))

        return runtime
コード例 #10
0
def test_connectivity_popagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.

    The data set contains many duplicated points and the connectivity is a
    10-nearest-neighbours graph.
    """
    from sklearn.neighbors import NearestNeighbors

    points = (
        [(.014, .120), (.014, .099), (.014, .097)]
        + [(.017, .153)] * 2
        + [(.018, .153)] * 7
        + [(.018, .152), (.018, .149), (.018, .144)]
    )
    X = np.array(points)

    neighbors = NearestNeighbors(n_neighbors=10, warn_on_equidistant=False)
    connectivity = neighbors.fit(X).kneighbors_graph(X)

    model = Ward(n_clusters=4, connectivity=connectivity)
    # fit crashes with an IndexError when changes are not propagated
    # correctly
    model.fit(X)
コード例 #11
0
 def __hieclu(self):
     """Cluster ``self.data_matrix`` into ``self.k`` groups with Ward.

     Returns the labels produced by ``fit_predict``. The model is fitted
     once; the former separate ``fit`` call repeated the same work.
     """
     #use Hierarchical clustering
     print('using hierarchical clustering......')
     ac = Ward(n_clusters=self.k)
     return ac.fit_predict(self.data_matrix)
コード例 #12
0
def test_linkage_misc():
    # Miscellaneous checks on the linkage helpers
    rng = np.random.RandomState(42)
    data = rng.normal(size=(5, 5))

    # Invalid linkage names and badly shaped connectivity must be rejected
    bad_linkage_fit = AgglomerativeClustering(linkage='foo').fit
    assert_raises(ValueError, bad_linkage_fit, data)
    assert_raises(ValueError, linkage_tree, data, linkage='foo')
    bad_connectivity = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, data,
                  connectivity=bad_connectivity)

    # FeatureAgglomeration only needs to fit without raising
    FeatureAgglomeration().fit(data)

    # Using the Ward class must emit exactly one deprecation warning
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(data)
    assert_equal(len(caught), 1)

    # A precomputed distance matrix must give the same children as the
    # equivalent named affinity
    distances = cosine_distances(data)
    precomputed = linkage_tree(distances, affinity="precomputed")
    cosine_reference = linkage_tree(data, affinity="cosine")
    assert_array_equal(precomputed[0], cosine_reference[0])

    # A callable affinity must give the same children as the equivalent
    # named affinity
    from_callable = linkage_tree(data, affinity=manhattan_distances)
    manhattan_reference = linkage_tree(data, affinity="manhattan")
    assert_array_equal(from_callable[0], manhattan_reference[0])
コード例 #13
0
    def constraint(self, nodes, edges, lables, circle=36):
        """Run Ward on ``self.A`` with connectivity built from one circle.

        Nodes are grouped by label; every pair of nodes belonging to the
        label ``circle`` is marked as connected in an N x N matrix, which is
        passed to ``Ward`` as the connectivity. The predictions are printed.

        Parameters
        ----------
        nodes : list of node identifiers.
        edges : unused.
        lables : label of each node, aligned with ``nodes``.
        circle : label whose members are connected to each other. Defaults
            to 36, the value that used to be hard-coded.

        Raises
        ------
        KeyError
            If no node carries the label ``circle``.
        """
        if len(nodes) != len(lables):
            print("#nodes(%d) != #clusters(%d)" % (len(nodes), len(lables)))

        N = len(nodes)
        circles = {}

        guidance_matrix = sp.zeros([N, N])
        for i in range(len(nodes)):
            circles.setdefault(lables[i], []).append(nodes[i])

        # Iterating the dict directly works on Python 2 and 3
        # (``iterkeys`` exists only on Python 2).
        for key in circles:
            print(key, len(circles[key]))

        # Look each member's position up once instead of once per pair.
        member_positions = [nodes.index(node) for node in circles[circle]]
        for i in member_positions:
            for j in member_positions:
                guidance_matrix[i, j] = 1.0

        guidance_matrix = sparse.lil_matrix(guidance_matrix)

        print(guidance_matrix)
        ward = Ward(n_clusters=6, n_components=2, connectivity=guidance_matrix)
        predicts = ward.fit_predict(self.A)

        print(predicts)
コード例 #14
0
    def agglomerate(self, nodes, edges, clusters):
        """Merge existing clusters into 6 groups with Ward.

        For every cluster, the cluster ids of all neighbours of its members
        are counted; these counts form one row of an N x N matrix (N being
        the number of distinct clusters) that is clustered with ``Ward``.

        Parameters
        ----------
        nodes : list of node identifiers.
        edges : iterable of pairs; ``edge[1]`` is recorded as a neighbour of
            ``edge[0]`` (one direction only).
        clusters : current cluster id of each node, aligned with ``nodes``.
            The ids are used as row/column indices of the matrix.

        Returns
        -------
        list
            The merged group of each node, in the order of ``nodes``.
        """
        if len(nodes) != len(clusters):
            print("#nodes(%d) != #clusters(%d)" % (len(nodes), len(clusters)))

        neighbors = {}
        for edge in edges:
            neighbors.setdefault(edge[0], []).append(edge[1])

        node_clusters = {}  # node: its cluster id
        communities = {}  # cluster id: all neighbors for its members
        for i in range(len(nodes)):
            node, cluster = nodes[i], clusters[i]
            # A fresh list per community avoids aliasing (and later mutating)
            # the per-node neighbour lists; nodes that never appear as
            # edge[0] simply contribute no neighbours.
            communities.setdefault(cluster, []).extend(
                neighbors.get(node, ()))
            node_clusters[node] = cluster

        N = len(communities)
        affinity_matrix = sp.zeros([N, N])
        for comm in communities:
            members = [node_clusters[node] for node in communities[comm]]
            for key, count in Counter(members).items():
                affinity_matrix[comm, key] = count

        ward = Ward(n_clusters=6)
        predicts = ward.fit_predict(affinity_matrix)

        return [predicts[node_clusters[node]] for node in nodes]
コード例 #15
0
def hieclu(data_matrix, k):
    """Cluster ``data_matrix`` into ``k`` groups with Ward.

    Returns the labels produced by ``fit_predict``. The model is fitted
    once; the former separate ``fit`` call repeated the same work.
    """
    #use Hierarchical clustering
    print('using hierarchical clustering......')
    ac = Ward(n_clusters=k)
    return ac.fit_predict(data_matrix)
コード例 #16
0
def test_connectivity_popagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.

    The data set contains many duplicated points and the connectivity is a
    10-nearest-neighbours graph.
    """
    from sklearn.neighbors import kneighbors_graph

    points = (
        [(.014, .120), (.014, .099), (.014, .097)]
        + [(.017, .153)] * 2
        + [(.018, .153)] * 7
        + [(.018, .152), (.018, .149), (.018, .144)]
    )
    X = np.array(points)

    neighbor_graph = kneighbors_graph(X, 10)
    model = Ward(n_clusters=4, connectivity=neighbor_graph)
    # fit crashes with an IndexError when changes are not propagated
    # correctly
    model.fit(X)
コード例 #17
0
ファイル: bipart.py プロジェクト: pdphuong/soclust
    def doCoClustering(self,
                       leftClustCount,
                       rightClustCount,
                       clustPropName='coclust'):
        """Co-cluster the two vertex sides and label the right-side vertices.

        The left side is clustered with ``Ward`` on its Dice similarity
        matrix. Each right vertex is then described by how many of its edges
        reach each left cluster, normalised by the vertex degree. The right
        vertices are clustered with ``Ward`` on the pairwise cosine
        distances of those rows, and each right vertex stores its label
        under ``clustPropName``.

        Parameters
        ----------
        leftClustCount : number of clusters for the left side.
        rightClustCount : number of clusters for the right side.
        clustPropName : vertex attribute that receives the right-side label.
        """
        vsleft = self.left()
        simleft = np.matrix(self.similarity_dice(vsleft))
        clustleft = Ward(n_clusters=leftClustCount).fit(simleft).labels_

        vsright = self.right()
        # full2bipart[vertex index] = (isOnRightSide, index in left/right list)
        full2bipart = [(None, -1)] * self.vcount()
        for i, v in enumerate(vsleft):
            full2bipart[v.index] = (False, i)
        for i, v in enumerate(vsright):
            full2bipart[v.index] = (True, i)

        sizeright = len(vsright)
        # m_rclust[r, c] = number of edges between right vertex r and
        # left cluster c
        m_rclust = np.zeros(shape=(sizeright, leftClustCount))
        for e in self.es:
            (srcOnRight, src) = full2bipart[e.source]
            (_, dst) = full2bipart[e.target]
            if srcOnRight:
                vright = src
                clust = clustleft[dst]
            else:
                vright = dst
                clust = clustleft[src]

            m_rclust[vright, clust] += 1

        # Normalise each non-empty row by the degree of its vertex. The
        # previous loop divided a stale ``val`` (left over from a list
        # comprehension) instead of the cell's own count.
        for row in range(sizeright):
            counts = m_rclust[row]
            if counts.any():
                m_rclust[row] = counts / float(vsright[row].degree())

        # NOTE(review): cdist(..., 'cosine') returns distances, and all-zero
        # rows yield NaN there - confirm every right vertex has an edge.
        simRight = cdist(m_rclust, m_rclust, 'cosine')
        clustright = Ward(n_clusters=rightClustCount).fit(simRight).labels_

        for i, c in enumerate(clustright):
            vsright[i][clustPropName] = c
コード例 #18
0
ファイル: pack_cluster.py プロジェクト: jbkoh/DDEA
def max_diff_dist_idx(dist_mat, min_dist, max_dist):
    """Pick an exemplar row and the indices clustered with it.

    For every row of ``dist_mat`` the distances to the other nodes are split
    into two groups with ``Ward``. The row whose two groups have the largest
    gap between their maximum distances becomes the exemplar, and the
    members of the group with the smaller maximum are returned with it.

    If some row's smallest distance to the others exceeds ``max_dist``, that
    row index is returned immediately as both values (an int, not a list).
    ``min_dist`` is not used.

    Returns ``(exemplar_idx, max_cluster_idx)``.
    """
    num_nodes = dist_mat.shape[0]
    max_diff = -1
    max_diff_row = 0
    max_diff_label = []
    for row, row_dists in enumerate(dist_mat):
        # Indices of every node except the row's own
        others = np.concatenate(
            (np.arange(0, row, 1), np.arange(row + 1, num_nodes, 1)))
        other_dists = row_dists[others]
        if np.min(other_dists) > max_dist:
            # This row is farther than max_dist from every other node
            return row, row

        # Hierarchical binary clustering of the distances
        label = Ward(n_clusters=2).fit(other_dists[:, None]).labels_

        # Each group is represented by its largest distance
        group_max = np.zeros(2)
        for group in (0, 1):
            group_max[group] = np.max(other_dists[label == group])

        cent_diff = group_max[0] - group_max[1]
        gap = abs(cent_diff)
        if max_diff < gap:
            max_idx_set = others
            max_diff_row = row
            max_diff = gap
            max_diff_label = label
            max_cent_diff = cent_diff

    # Keep the group with the smaller maximum distance
    near_group = 1 if max_cent_diff > 0 else 0
    cur_cent_idx = set([]) | set(np.nonzero(max_diff_label == near_group)[0])
    max_cluster_idx = list(
        set(max_idx_set[list(cur_cent_idx)]) | set([max_diff_row]))
    exemplar_idx = max_diff_row

    return exemplar_idx, max_cluster_idx
コード例 #19
0
ファイル: cluster_w_mcl.py プロジェクト: mrcrabby/smarttypes
def cluster_w_else(network, similarity_matrix, number_of_communities=20):
    """Group the keys of ``network`` into communities found by Ward.

    Parameters
    ----------
    network : mapping whose keys are the node ids; its i-th key is assumed
        to correspond to the i-th row of ``similarity_matrix``.
    similarity_matrix : data passed to ``Ward.fit``.
    number_of_communities : number of clusters requested.

    Returns
    -------
    OrderedDict
        Community index (``0 .. number_of_communities - 1``) mapped to the
        list of node ids assigned to it.
    """
    raw_communities = Ward(
        n_clusters=number_of_communities).fit(similarity_matrix).labels_
    communities = OrderedDict([(x, []) for x in range(number_of_communities)])
    # Materialise the keys once: indexing ``network.keys()`` inside the loop
    # rebuilt the list on every iteration and is not possible on Python 3.
    node_ids = list(network.keys())
    for i in range(len(network)):
        community_idx = raw_communities[i]
        # -1 is skipped as a noise label (kept from an earlier variant)
        if community_idx != -1:
            communities[community_idx].append(node_ids[i])
    return communities
コード例 #20
0
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    np.random.seed(0)
    # Builtin ``bool`` instead of the ``np.bool`` alias, which was deprecated
    # in NumPy 1.20 and removed in 1.24.
    mask = np.ones([10, 10], dtype=bool)
    X = np.random.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
コード例 #21
0
    def cluster_hierarchically(self, raw_data, num_clusters, cmtrx=None):
        """Cluster ``raw_data`` with connectivity-constrained Ward.

        Parameters
        ----------
        raw_data : data passed to ``Ward.fit``; its ``shape[0]`` sizes the
            default connectivity matrix.
        num_clusters : number of clusters requested.
        cmtrx : connectivity matrix; when ``None`` one is built with
            ``self.generate_connectivity_matrix``.

        Returns
        -------
        The fitted ``labels_``, or ``None`` (after printing a warning) when
        the name ``Ward`` is not defined.
        """
        if cmtrx is None:
            cmtrx = self.generate_connectivity_matrix(raw_data.shape[0])
        # Only the lookup of the Ward name is guarded, so a NameError raised
        # while fitting is no longer mistaken for a missing import.
        try:
            ward_cls = Ward
        except NameError:
            print('WARNING: sklearn Ward clustering disabled.')
            return None
        ward_clusters = ward_cls(n_clusters=num_clusters,
                                 connectivity=cmtrx).fit(raw_data)
        return ward_clusters.labels_
コード例 #22
0
def test_connectivity_fixing_non_lil():
    """
    Non-regression test: fitting must work (and warn) when the connectivity
    has more than one component and does not support item assignment.
    """
    # Two dummy samples
    samples = np.array([[0, 0], [1, 1]])
    # A mask with several components forces the connectivity to be fixed
    component_mask = np.array([[True, False], [False, True]])
    graph = grid_to_graph(n_x=2, n_y=2, mask=component_mask)
    model = Ward(connectivity=graph)
    assert_warns(UserWarning, model.fit, samples)
コード例 #23
0
def identify_communities(number_of_communities, similarity_matrix, node_ids):
    """Group ``node_ids`` by the Ward cluster label of their matrix row.

    Returns an ``OrderedDict`` mapping each community index to the list of
    node ids assigned to it. The number of communities is derived from the
    distinct labels found, not counting a ``-1`` noise label.
    """
    model = Ward(n_clusters=number_of_communities)
    raw_communities = model.fit(similarity_matrix).labels_

    # Distinct labels, minus one if the noise label -1 occurs
    noise_offset = 1 if -1 in raw_communities else 0
    num_communities = len(set(raw_communities)) - noise_offset

    communities = OrderedDict(
        [(index, []) for index in range(num_communities)])
    for position in range(len(node_ids)):
        community_idx = raw_communities[position]
        if community_idx == -1:
            # Noise entries are not assigned to any community
            continue
        communities[community_idx].append(node_ids[position])
    return communities
コード例 #24
0
def main():
    """Plot a Ward clustering of generated data next to a quality curve.

    The criterion named by ``args.criterion`` (``'squared'``, ``'diameter'``
    or ``'silhouette'``) is evaluated for 1 to ``args.k`` clusters. The left
    subplot shows the clustering obtained with ``args.n`` clusters, the
    right one the criterion values.

    Raises
    ------
    ValueError
        If ``args.criterion`` is not one of the supported names.
    """
    # Single-argument print(...) and range() behave the same on Python 2
    # and Python 3.
    print("## Welcome to the clustering tutorial ##")
    args = parse_args()
    x, tc = generate_data(args.n)

    ks = numpy.arange(1, args.k + 1)
    crs = numpy.zeros(args.k)
    col = 'k'

    print("Computing %s clustering quality criterion" % args.criterion)
    for j in range(args.k):
        ward = Ward(n_clusters=ks[j]).fit(x)
        labels = ward.labels_

        if args.criterion == 'squared':
            crs[j] = squared_criterion(x, labels)
            col = 'r'
        elif args.criterion == 'diameter':
            crs[j] = diameter_criterion(x, labels)
            col = 'g'
        elif args.criterion == 'silhouette':
            crs[j] = silhouette_criterion(x, labels)
            col = 'b'
        else:
            # The message used to run the name straight into the text.
            raise ValueError("Wrong criterion: %s" % args.criterion)

    pylab.figure(figsize=(12, 6))

    ward = Ward(n_clusters=args.n).fit(x)
    labels = ward.labels_

    pylab.subplot(1, 2, 1)
    plot_data(x, labels)

    pylab.subplot(1, 2, 2)
    plot_criterion(ks, crs, col)

    pylab.show()
コード例 #25
0
def get_km_segments(x, image, sps, n_segments=25):
    """Merge superpixels into ``n_segments`` groups with structured Ward.

    ``x`` is a 2- or 3-tuple whose first two items are the superpixel
    features and the edge array. The edges, made symmetric, form the
    connectivity graph; the clustered features are the superpixel colours
    stacked with the half-weighted superpixel centres.

    Returns the labels from ``Ward.fit_predict``.
    """
    if len(x) != 2:
        feats, edges, _ = x
    else:
        feats, edges = x

    superpixel_colors = get_colors(image, sps)
    superpixel_centers = get_centers(sps)

    n_spixel = len(feats)
    edge_weights = np.ones(edges.shape[0])
    adjacency = sparse.coo_matrix((edge_weights, edges.T),
                                  shape=(n_spixel, n_spixel))
    # Symmetrise the graph before using it as connectivity
    model = Ward(n_clusters=n_segments, connectivity=adjacency + adjacency.T)

    color_feats = np.hstack([superpixel_colors, superpixel_centers * .5])
    return model.fit_predict(color_feats)
コード例 #26
0
def test_linkage_misc():
    # Miscellaneous checks on the linkage helpers
    data = np.ones((5, 5))

    # Invalid linkage names and badly shaped connectivity must be rejected
    bad_linkage_fit = AgglomerativeClustering(linkage='foobar').fit
    assert_raises(ValueError, bad_linkage_fit, data)
    assert_raises(ValueError, linkage_tree, data, linkage='foobar')
    bad_connectivity = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, data,
                  connectivity=bad_connectivity)

    # FeatureAgglomeration only needs to fit without raising
    FeatureAgglomeration().fit(data)

    # Ward(copy=True) is expected to emit two warnings: one because the Ward
    # class is deprecated and one because of the copy argument
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", UserWarning)
        Ward(copy=True).fit(data)
    assert_equal(len(caught), 2)
コード例 #27
0
def do_experiments(dataset):
    """Fit several clusterers on ``dataset`` and print their scores.

    ITM (with and without dimensionality inference), Ward and KMeans are
    each fitted with as many clusters as there are distinct targets. For
    each one the ARI, AMI, NMI, the ``tree_information`` objective and the
    elapsed time are printed, followed by the objective of the ground
    truth labelling.

    NOTE(review): the printed time covers the fit and the score
    computations, since the clock is read after the metrics.
    """
    X, y = dataset.data, dataset.target
    dataset_name = dataset.DESCR.split('\n')[0]
    if dataset_name.startswith("Iris"):
        # iris has duplicate data points. That messes up our
        # MeanNN implementation.
        from scipy.spatial.distance import pdist, squareform
        dist = squareform(pdist(X))
        # Rows having a zero distance to an earlier row (strict lower
        # triangle) are duplicates.
        doubles = np.unique(np.where(np.tril(dist - 1, -1) == -1)[0])
        # Builtin ``bool`` instead of the ``np.bool`` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        mask = np.ones(X.shape[0], dtype=bool)
        mask[doubles] = False
        X = X[mask]
        y = y[mask]

    n_clusters = len(np.unique(y))
    print("\n\nDataset %s samples: %d, features: %d, clusters: %d" %
          (dataset_name, X.shape[0], X.shape[1], n_clusters))
    print("=" * 70)

    classes = [
        ITM(n_clusters=n_clusters),
        ITM(n_clusters=n_clusters, infer_dimensionality=True),
        Ward(n_clusters=n_clusters),
        KMeans(n_clusters=n_clusters)
    ]
    names = ["ITM", "ITM ID", "Ward", "KMeans"]
    for clusterer, method in zip(classes, names):
        start = time()
        clusterer.fit(X)
        y_pred = clusterer.labels_

        ari = adjusted_rand_score(y, y_pred)
        ami = adjusted_mutual_info_score(y, y_pred)
        nmi = normalized_mutual_info_score(y, y_pred)
        objective = tree_information(X, y_pred)

        runtime = time() - start

        print("%-15s ARI: %.3f, AMI: %.3f, NMI: %.3f objective: %.3f time:"
              "%.2f" % (method, ari, ami, nmi, objective, runtime))

    i_gt = tree_information(X, y)
    print("GT objective: %.3f" % i_gt)
コード例 #28
0
def hac_derived_ordering(
    bags_file,
    num_clusters_multiplier=0.4
):  #uses HAC analysis to output hierarchies and evaluate results with ground truth
    """Derive hierarchy sets from a Ward clustering and evaluate them.

    The documents loaded from ``bags_file`` are clustered into
    ``int(num_clusters_multiplier * number_of_transcripts)`` groups. For
    transcript ``i`` (1-based) the resulting set holds ``i`` itself plus
    every earlier transcript in the same cluster.

    Returns the result of ``compare_hierarchies`` on the list of sets.
    """
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('*HAC DERIVED ORDERING* %s' % (num_clusters_multiplier,))
    print('Starting Hierarchical Agglomerative Clustering analysis...')
    data, words, transcripts = doc_term_mat_from_bags(bags_file)
    model = Ward(n_clusters=int(num_clusters_multiplier * len(transcripts)))
    # fit_predict fits the model itself; the former extra fit() call
    # repeated the same work.
    clust = model.fit_predict(data)

    hier_sets = []
    # Cluster label -> 1-based ids of the transcripts seen so far, so each
    # transcript is not compared against every earlier one.
    earlier_by_cluster = {}
    for i in range(len(transcripts)):
        earlier = earlier_by_cluster.setdefault(clust[i], [])
        hier_sets.append(set([i + 1] + earlier))
        earlier.append(i + 1)
    return compare_hierarchies(hier_sets)
コード例 #29
0
ファイル: ward.py プロジェクト: vyzuer/CliskSmart
def cluster(dump_path, file_name, n_clusters=200):
    """Run unstructured Ward clustering on a feature file and dump the labels.

    Parameters
    ----------
    dump_path : prefix prepended (by plain string concatenation) to the
        output file names ``ward_labels.list`` and
        ``_num_clusters_ward.info``.
    file_name : text file readable by ``np.loadtxt``; the data is scaled
        with ``scale`` before clustering.
    n_clusters : number of clusters requested.

    Returns
    -------
    float
        Always ``0.0``; the silhouette evaluation is currently disabled.
    """
    # Obtain data from file: unpack=True followed by a transpose restores
    # the on-disk layout.
    data = np.loadtxt(file_name, unpack=True)
    X = np.transpose(data)
    X = scale(X)

    ###############################################################################
    # Compute clustering
    print("Compute unstructured hierarchical clustering...")
    st = time.time()
    ward = Ward(n_clusters=n_clusters).fit(X)
    label = ward.labels_
    print("Elapsed time: ", time.time() - st)
    print("Number of points: ", label.size)

    # Context managers close the files even if a write fails.
    label_file = dump_path + "ward_labels.list"
    with open(label_file, 'w') as fp:
        for i in label:
            fp.write("%d\n" % i)

    num_cluster_file = dump_path + "_num_clusters_ward.info"
    with open(num_cluster_file, 'w') as fp:
        fp.write("%d" % n_clusters)

    # The former ``ward.cluster_centers_`` lookup was removed: hierarchical
    # clustering exposes no such attribute, so the function failed here
    # after writing its files, and the value was never used.

    score = 0.0
    return score
コード例 #30
0
def clusterRT_ward(values) :
    """Split ``values`` into runs of equal Ward label (two clusters).

    The values are sorted, clustered as one-dimensional points, and then
    grouped into consecutive runs sharing the same label.

    Returns a list of lists of values in ascending order: ``[]`` for empty
    input, and a single group for a single value (which cannot be split
    into two clusters).
    """
    if len(values) == 0 : return []

    ordered = sorted(values)
    if len(ordered) < 2 : return [ordered]

    ward = Ward(n_clusters=2).fit(np.asarray([[val] for val in ordered]))
    labels = ward.labels_

    curr_l = -2  # sentinel that matches no real label
    cl_output = []
    curr_cluster = []
    for i,l in enumerate(labels) :
        if l != curr_l :
            if len(curr_cluster) > 0 : cl_output.append(curr_cluster)
            curr_l = l
            curr_cluster = []
        # labels[i] belongs to the i-th *sorted* value; indexing the
        # unsorted input here paired labels with the wrong values.
        curr_cluster.append(ordered[i])
    cl_output.append(curr_cluster)

    return cl_output