Ejemplo n.º 1
0
    def time_subcluster(self, locs):
        """Approximate a full pairwise cost matrix for ``locs`` from a small Mapzen query.

        The points are Ward-clustered into at most 50 groups (the Mapzen
        matrix limit noted by the original author), the Mapzen matrix is
        requested for the cluster centroids only, and every point-to-point
        haversine distance is then scaled by the normalised
        Mapzen/haversine ratio of the two clusters the points belong to.

        Parameters:
            locs: 2-D numpy array with one row per location. Column 0 is used
                as longitude and column 1 as latitude when building the
                Mapzen request.

        Returns:
            Square numpy array with one row/column per row of ``locs``.
        """
        # Getting subclusters at Mapzen's limit
        cluster_linkage = linkage(locs, method='ward')
        clusters = fcluster(cluster_linkage, 50, criterion='maxclust')

        # 'maxclust' is an upper bound: with few input points there can be
        # fewer than 50 clusters.  Enumerate the labels actually present so no
        # centroid is computed over an empty selection (which would be NaN).
        # ``cl`` is the 0-based centroid index of every location.
        labels, cl = np.unique(clusters, return_inverse=True)
        cluster_means = np.array([
            np.mean(locs[clusters == label], axis=0) for label in labels
        ])

        mapzen_locs = [{'lat': p[1], 'lon': p[0]} for p in cluster_means]
        # NOTE(review): assumes mapzen_matrix returns a square matrix ordered
        # like mapzen_locs -- confirm against its implementation.
        mapzen_matrix = self.mapzen_matrix(mapzen_locs)

        # Get a matching distance matrix of lat/lon distance, get ratios
        # NOTE(review): points are passed to haversine as (lon, lat); confirm
        # that this is the argument order the haversine helper expects.
        cluster_km_dist = squareform(pdist(cluster_means,
                                           (lambda u, v: haversine(u, v))))

        # The diagonal of cluster_km_dist is zero, so the division produces
        # NaN/inf there; nan_to_num replaces those with finite numbers.
        dist_ratio_matrix = np.nan_to_num(np.divide(mapzen_matrix,
                                                    cluster_km_dist))
        # Divide items by mean to normalize a bit
        dist_ratio_matrix = np.nan_to_num(np.divide(dist_ratio_matrix,
                                                    dist_ratio_matrix.mean()))

        locs_km_dist = squareform(pdist(locs, (lambda u, v: haversine(u, v))))

        # Scale entry (i, j) by the ratio between the clusters of i and j.
        # np.ix_ builds the full (cluster of i, cluster of j) lookup at once
        # instead of visiting every cell in a Python-level loop.
        return locs_km_dist * dist_ratio_matrix[np.ix_(cl, cl)]
Ejemplo n.º 2
0
 def _train(self, trainset):
     """Cross-validate the flat classifier, then train a tree classifier.

     The confusion matrix gathered during cross-validation is turned into
     a distance matrix, clustered with ``hcluster.linkage``, and the
     resulting tree is used to build and train ``self._tree_clf``.
     """
     self._dataset = trainset
     self.ulabels = trainset.uniquelabels

     # Cross-validation of the plain classifier, keeping the confusion state.
     transfer_error = TransferError(self._clf)
     self.cvterr = CrossValidatedTransferError(
         transfer_error, self._splitter, enable_states=["confusion"])
     self.cvterr(self._dataset)

     # Confusion -> distance: frequently confused labels are "close".
     confusion = self.cvterr.confusion.matrix
     inverted = confusion.max() - confusion
     # A distance matrix has to be symmetric ...
     dist = (inverted + inverted.T) / 2
     # ... and must have a zero diagonal (distance to self).
     dist -= np.diag(np.diag(dist))

     # Linkage works on the condensed form of the distance matrix.
     condensed = hcluster.squareform(dist)
     self.linkage = hcluster.linkage(condensed)

     # Tree structure and the matching tree classifier.
     self.tree = hcluster.to_tree(self.linkage)
     built = self.build_tree_classifier_from_linkage_tree(self.tree)
     self._tree_clf = built[0]
     self._tree_clf.train(trainset)
Ejemplo n.º 3
0
def DBSCAN(Dataset, Epsilon, MinumumPoints, DistanceMethod='euclidean'):
    """Label the rows of ``Dataset`` with density-based cluster numbers.

    Dataset is an m x n matrix: m items of dimension n.  A point starts a
    new cluster when at least ``MinumumPoints`` points lie strictly closer
    than ``Epsilon`` to it; growing the cluster is delegated to
    ``ExpandClsuter``.

    Returns an array of length m holding the cluster number written for
    each point (numbers start at 1; entries never written stay 0).
    """
    point_count, _ = Dataset.shape
    visited = numpy.zeros(point_count, 'int')
    # Point types: -1 noise/outlier, 0 border, 1 core
    point_type = numpy.zeros(point_count)
    all_clusters = []
    current_cluster = []
    labels = numpy.zeros(point_count)
    next_label = 1
    distances = hcluster.squareform(hcluster.pdist(Dataset, DistanceMethod))

    for idx in xrange(point_count):
        if visited[idx] != 0:
            continue
        visited[idx] = 1

        neighbours = numpy.where(distances[idx] < Epsilon)[0]
        if len(neighbours) < MinumumPoints:
            point_type[idx] = -1
            continue

        # Reuse the same list object for every cluster: empty it in place.
        del current_cluster[:]
        current_cluster.append(idx)
        labels[idx] = next_label

        neighbours = set2List(neighbours)
        ExpandClsuter(Dataset[idx], neighbours, current_cluster,
                      MinumumPoints, Epsilon, visited, distances,
                      labels, next_label)
        current_cluster.append(neighbours[:])
        all_clusters.append(current_cluster[:])
        next_label = next_label + 1

    return labels
Ejemplo n.º 4
0
def optics(x, k, distMethod = 'euclidean'):
    """Compute an OPTICS-style ordering of the points in ``x``.

    Parameters:
        x: array of points, one per row.  A 1-D array is treated as m
           one-dimensional points.
        k: index into each point's sorted distance row used as its core
           distance (index 0 is the point itself).
        distMethod: metric name passed to ``H.pdist``.

    Returns:
        (RD, CD, order): reachability distance per point, core distance per
        point, and the list of point indices in visiting order.

    Raises:
        Whatever ``H.pdist`` / ``H.squareform`` raise; the error is reported
        and re-raised because nothing below can run without the matrix.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        # pdist needs a 2-D array: one row per point.
        x = x.reshape(-1, 1)
    m = x.shape[0]

    try:
        D = H.squareform(H.pdist(x, distMethod))
    except Exception:
        print("squareform or pdist error")
        raise

    # Core distance: the k-th smallest entry of each distance row.
    # np.sort returns a sorted copy, so D itself keeps its original order.
    CD = np.sort(D, axis=1)[:, k]
    RD = np.ones(m)*1E10

    order = []
    seeds = np.arange(m, dtype=int)

    ind = 0
    while len(seeds) != 1:
        ob = seeds[ind]
        seeds = seeds[seeds != ob]

        order.append(ob)
        # Reachability of every remaining seed from ob:
        # max(core distance of ob, distance ob -> seed).
        mm = np.maximum(CD[ob], D[ob][seeds])
        ii = np.where(RD[seeds] > mm)[0]
        RD[seeds[ii]] = mm[ii]
        # Continue with the seed that is currently cheapest to reach.
        ind = np.argmin(RD[seeds])

    order.append(seeds[0])
    RD[0] = 0 #we set this point to 0 as it does not get overwritten
    return RD, CD, order
Ejemplo n.º 5
0
    def pdist(self, X):
        """Yield the square pairwise-distance matrix of ``X``.

        This is a generator producing a single item.  Distances use
        ``self.metric``; when ``self.plot`` is truthy the matrix is also
        displayed with pylab before it is yielded.
        """
        import hcluster
        import pylab

        condensed = hcluster.pdist(array(X), metric=self.metric)
        square = hcluster.squareform(condensed)

        if self.plot:
            pylab.imshow(square)
            pylab.show()

        yield square
Ejemplo n.º 6
0
    def pdist(self, X):
        """Yield the square pairwise-distance matrix of ``X``.

        This is a generator producing a single item.  Distances use
        ``self.metric``; when ``self.plot`` is truthy the matrix is also
        displayed with pylab before it is yielded.
        """
        import hcluster
        import pylab

        condensed = hcluster.pdist(array(X), metric=self.metric)
        square = hcluster.squareform(condensed)

        if self.plot:
            pylab.imshow(square)
            pylab.show()

        yield square
Ejemplo n.º 7
0
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them in to a
    list records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). The hierarchy is
                 cut at distance ``1 - threshold``, so raising the number
                 merges fewer records (stricter clusters) and lowering
                 it merges more.
    max_components -- passed through to ``connected_components``.

    Returns the values of the internal clustering dict: one
    ``(record ids, scores)`` pair of tuples per cluster.
    '''
    threshold = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:

            (i_to_id, condensed_distances) = condensedDistance(sub_graph)

            # Build the square matrix BEFORE clustering: linkage is called
            # with preserve_input=False and may use condensed_distances as
            # scratch space, so reading it afterwards is not safe.
            distances = hcluster.squareform(condensed_distances)

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage,
                                          threshold,
                                          criterion='distance')

            clusters = {}

            for (i, sub_cluster_id) in enumerate(partition):
                clusters.setdefault(cluster_id + sub_cluster_id, []).append(i)

            # The loop variable must not be called cluster_id: that would
            # overwrite the running id offset used for the next sub graph.
            for sub_id, items in clusters.items():
                if len(items) > 1:
                    scores = confidences(items, distances)
                    clustering[sub_id] = \
                        (tuple(i_to_id[item] for item in items), tuple(scores))

            cluster_id += max(partition) + 1
        else:
            # A component made of a single scored pair: both records share
            # the pair's score.
            ids, score = sub_graph[0]
            clustering[cluster_id] = (tuple(ids), tuple([score]*2))
            cluster_id += 1

    return clustering.values()
Ejemplo n.º 8
0
Archivo: sdp.py Proyecto: stober/lspi
def MVU_slack(datafile, dim = 3):
    """Compute an MVU embedding of the points pickled in ``datafile``.

    Only the first pickled object (the matrix of points) is read; anything
    stored after it in the file is ignored.  The problem is written to
    ``../ds/sdp.dat``, solved by running the external ``csdp`` program, and
    the solution is read back from ``../ds/sdp.sol``.

    Parameters:
        datafile: path of the pickle file holding the points.
        dim: number of embedding coordinates to return.

    Returns:
        (results, pts, res): list of ``dim`` coordinate vectors, the
        mean-centred points, and the neighbourhood graph (returned so the
        caller can plot it).
    """
    # NOTE: pickle can execute arbitrary code while loading; only use
    # trusted data files.
    with open(datafile) as fp:
        pts = pickle.load(fp)
    size = len(pts)

    # mean center coordinates
    pts = pts - np.mean(pts, axis=0)

    # TODO: move graph cluster algorithm to own file - write in C?

    # compute the distance matrix and cluster
    Y = hc.squareform(hc.pdist(pts,'euclidean'))
    res = cluster_graph(Y, fnc = 'k', size = 8)
    rows, cols = np.nonzero(res & (Y != 0)) # indices of nearest neighbors

    # generate data to write problem in SPDA format
    # TODO: add slack variable block
    # keep each neighbour pair once (i <= j)
    indx = [(i, j) for (i, j) in zip(rows, cols) if i <= j]

    nconstraints = len(indx) + 1
    nblocks = 2
    # leading 0.0 entry followed by one squared distance per pair
    c = [0.0] + [Y[i,j]**2 for (i, j) in indx]

    write_spda_file_slack("../ds/sdp.dat", nconstraints, nblocks, size, c, indx, .01)

    # TODO: add some error checking
    os.system("csdp ../ds/sdp.dat ../ds/sdp.sol")

    _, _, X = read_sol_file_slack("../ds/sdp.sol", size)

    # spectral decomposition of the dual solution (X)
    u,s,v = la.svd(X)

    results = [np.sqrt(s[i]) * u[:,i] for i in range(dim)]

    # returns the neighborhood graph for proper plotting
    return results, pts, res
def optics(x, k, distMethod = 'euclidean'):
    # NOTE(review): this listing of the function stops after the distance
    # matrix is built; nothing is returned from the part shown here.
    if len(x.shape)>1:
        m,n = x.shape
    else:
        m = x.shape[0]
        # was ``n == 1``: a comparison that raised NameError on 1-D input
        n = 1

    try:
        D = H.squareform(H.pdist(x, distMethod))
        distOK = True
    except Exception as ex:
        print(ex)
        print("squareform or pdist error")
        distOK = False
Ejemplo n.º 10
0
def MVU_slack(datafile, dim=3):
    """Compute an MVU embedding of the points pickled in ``datafile``.

    Only the first pickled object (the matrix of points) is read; anything
    stored after it in the file is ignored.  The problem is written to
    ``../ds/sdp.dat``, solved by running the external ``csdp`` program, and
    the solution is read back from ``../ds/sdp.sol``.

    Parameters:
        datafile: path of the pickle file holding the points.
        dim: number of embedding coordinates to return.

    Returns:
        (results, pts, res): list of ``dim`` coordinate vectors, the
        mean-centred points, and the neighbourhood graph (returned so the
        caller can plot it).
    """
    # NOTE: pickle can execute arbitrary code while loading; only use
    # trusted data files.
    with open(datafile) as fp:
        pts = pickle.load(fp)
    size = len(pts)

    # mean center coordinates
    pts = pts - np.mean(pts, axis=0)

    # TODO: move graph cluster algorithm to own file - write in C?

    # compute the distance matrix and cluster
    Y = hc.squareform(hc.pdist(pts, 'euclidean'))
    res = cluster_graph(Y, fnc='k', size=8)
    rows, cols = np.nonzero(res & (Y != 0))  # indices of nearest neighbors

    # generate data to write problem in SPDA format
    # TODO: add slack variable block
    # keep each neighbour pair once (i <= j)
    indx = [(i, j) for (i, j) in zip(rows, cols) if i <= j]

    nconstraints = len(indx) + 1
    nblocks = 2
    # leading 0.0 entry followed by one squared distance per pair
    c = [0.0] + [Y[i, j]**2 for (i, j) in indx]

    write_spda_file_slack("../ds/sdp.dat", nconstraints, nblocks, size, c,
                          indx, .01)

    # TODO: add some error checking
    os.system("csdp ../ds/sdp.dat ../ds/sdp.sol")

    _, _, X = read_sol_file_slack("../ds/sdp.sol", size)

    # spectral decomposition of the dual solution (X)
    u, s, v = la.svd(X)

    results = [np.sqrt(s[i]) * u[:, i] for i in range(dim)]

    # returns the neighborhood graph for proper plotting
    return results, pts, res
Ejemplo n.º 11
0
 def _train(self, dataset):
     """Cross-validate the flat classifier, then train a tree classifier.

     The confusion matrix gathered during cross-validation is symmetrised,
     inverted into a distance matrix and clustered with
     ``hcluster.linkage``; the resulting tree is used to build and train
     ``self._tree_clf``.
     """
     self._dataset = dataset
     self.ulabels = self._dataset.uniquelabels

     # Cross-validation of the plain classifier, keeping the confusion state.
     transfer_error = TransferError(self._clf)
     self.cvterr = CrossValidatedTransferError(
         transfer_error, self._splitter, enable_states=["confusion"])
     self.cvterr(self._dataset)

     # Confusion -> distance.  A distance matrix has to be symmetric ...
     confusion = self.cvterr.confusion.matrix
     symmetric = (confusion + confusion.T) / 2
     # ... frequently confused labels should end up close together ...
     dist = symmetric.max() - symmetric
     # ... and the distance of a label to itself is zero.
     dist -= np.diag(np.diag(dist))

     # Linkage works on the condensed form of the distance matrix.
     condensed = hcluster.squareform(dist)
     self.linkage = hcluster.linkage(condensed)

     # Tree structure and the matching tree classifier.
     self.tree = hcluster.to_tree(self.linkage)
     built = self.build_tree_classifier_from_linkage_tree(self.tree)
     self._tree_clf = built[0]
     self._tree_clf.train(self._dataset)
Ejemplo n.º 12
0
    def cluster(self):
        """Cluster strokes.

        Loads the previously calculated distance matrix from
        ``self.DTW_DATA``, clusters it hierarchically with
        ``self.CLUSTERING_METHOD``, cuts the hierarchy at 1.15 using
        fcluster's default criterion, and pickles the resulting cluster
        dict to ``self.CLUSTER_DATA`` (creating ``self.CLUSTER_ROOT`` first
        if it does not exist).
        """
        matrix = numpy.load(self.DTW_DATA)
        # linkage expects the condensed form of the square matrix
        Y = hcluster.squareform(matrix)
        Z = hcluster.linkage(Y, method=self.CLUSTERING_METHOD)
        T = hcluster.fcluster(Z, 1.15)
        clusters = self.get_cluster_dict_from_array(T)

        if self.verbose:
            self.print_clusters(clusters)

        if not os.path.exists(self.CLUSTER_ROOT):
            os.makedirs(self.CLUSTER_ROOT)

        # Close the file deterministically so the pickle is flushed to disk
        # before anything else tries to read it.
        with open(self.CLUSTER_DATA, "w") as out:
            pickle.dump(clusters, out)
# Demo of hcluster's pdist / squareform conversions on tiny inputs.
RULE = "\n ___________________________________________\n"

print(RULE)
import hcluster as H
# a condensed vector of 10 distances expands to a square matrix
x = [1,2,3,4,5,6,7,8,9,10]
print(H.squareform(x))
print(RULE)
# four one-dimensional points
column = [[1],[3],[5],[2]]
print(H.pdist(column))
print(RULE)
print(H.squareform(H.pdist(column)))

print(RULE)
print(column)
# two two-dimensional points
pair = [[1,3],[5,2]]
print(pair)
print(H.pdist(pair))

print("\n___________________________________________\n")

# three four-dimensional points
quads = [[1,2,3,4], [3,4,5,6], [5,6,7,8]]
print(H.squareform(H.pdist(quads)))


Ejemplo n.º 14
0
    y_min, y_max = X[:,1].min()-1, X[:,1].max()+1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    axes.set_cmap(pylab.cm.Paired)
    pylab.axes(axes)
    pylab.contourf(xx, yy, Z)
    pylab.axis('off')

    # Plot also the training points
    pylab.scatter(X[:,0], X[:,1], c=Y)

# First window: decision surface of the classifier over the data.
plotWindow = plot(show_toolbar=True)
fig, axes = plotWindow.get_figure_and_axes(0)
plot_decision_surface(axes, svm, data, labels)
plotWindow.show()

# Build a symmetric matrix from d_matrix: stack it with its transpose and
# keep the element-wise maximum of the two.
cdm = np.array([d_matrix, d_matrix.T])
cdm = np.max(cdm, axis=0)
# NOTE(review): the 0.5 offset presumably re-centres the values; confirm
# the value range of d_matrix.
cdm = cdm - 0.5
# Zero the diagonal (distance of an item to itself).
cdm[np.identity(cdm.shape[0], dtype=bool)] = 0.0
# Second window: dendrogram of the average-linkage clustering.
plotWindow = plot(show_toolbar=True)
fig, axes = plotWindow.get_figure_and_axes(0)
import hcluster
# linkage is given the condensed form of the square matrix
cdm = hcluster.squareform(cdm)
Z = hcluster.linkage(cdm, 'average')
print_engine.draw_dendrogram(axes, Z, labels=names)
plotWindow.show()
Ejemplo n.º 15
0
def optics_alg(x, k, distMethod='cosine'):  #was euclidean
    """Compute an OPTICS-style ordering of the points in ``x``.

    Parameters:
        x: array of points, one per row.  A 1-D array is treated as m
           one-dimensional points.
        k: index into each point's sorted distance row used as its core
           distance (index 0 is the point itself).
        distMethod: metric name passed to scipy's ``pdist``.

    Returns:
        (RD, CD, order): reachability distance per point, core distance per
        point, and the list of point indices in visiting order.

    Raises:
        Whatever ``pdist`` / ``squareform`` raise; the error is reported and
        re-raised because nothing below can run without the matrix.

    The elapsed wall-clock time is printed before returning.
    """
    import time
    import numpy as N
    # squareform comes from the same scipy module already used for pdist
    from scipy.spatial.distance import pdist, squareform

    # time.clock() no longer exists in current Python versions
    tic = time.time()

    x = N.asarray(x)
    if x.ndim == 1:
        # pdist needs a 2-D array: one row per point.
        x = x.reshape(-1, 1)
    m = x.shape[0]

    try:
        D = squareform(pdist(x, distMethod))
    except Exception:
        print("squareform or pdist error")
        raise

    # Core distance: the k-th smallest entry of each distance row.
    # N.sort returns a sorted copy, so D itself keeps its original order.
    CD = N.sort(D, axis=1)[:, k]
    RD = N.ones(m) * 1E10

    order = []
    seeds = N.arange(m, dtype=int)

    ind = 0
    while len(seeds) != 1:
        ob = seeds[ind]
        seeds = seeds[seeds != ob]

        order.append(ob)
        # Reachability of every remaining seed from ob:
        # max(core distance of ob, distance ob -> seed).
        mm = N.maximum(CD[ob], D[ob][seeds])
        ii = N.where(RD[seeds] > mm)[0]
        RD[seeds[ii]] = mm[ii]
        # Continue with the seed that is currently cheapest to reach.
        ind = N.argmin(RD[seeds])
    toc = time.time()
    res = toc - tic
    print("Compute time is:  %s" % res)
    order.append(seeds[0])
    RD[0] = 0  # we set this point to 0 as it does not get overwritten
    return RD, CD, order
Ejemplo n.º 16
0
 def dendrogram(self):
     """Store the complete-linkage clustering of ``self.matrix`` in ``self.linkage``."""
     # linkage is given the condensed form of the stored matrix
     condensed = hcluster.squareform(self.matrix)
     self.linkage = hcluster.linkage(condensed, method="complete")
Ejemplo n.º 17
0
def cluster(M, method='complete'):
    """Return the hcluster linkage of matrix ``M`` using ``method``."""
    # linkage is given the squareform conversion of M
    condensed = hcluster.squareform(M)
    return hcluster.linkage(condensed, method=method)
Ejemplo n.º 18
0
    [0, 0, 0, 0, 0, 0, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 1, 1, 0],
    [0, 0, 0, 1, 1, 1, 1, 1, 0],
    [0, 0, 0, 1, 1, 1, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 0, 0, 0]])

# Condensed pairwise distances between the rows of data, then the
# hierarchical clustering built from them.
y = pdist(data, metric=metric)
Z = linkage(y, method=method, metric=metric)
dendrogram(Z)
# Turn every linkage row into a plain tuple: integer node ids and size,
# and a merge distance clamped so it is never negative.
Z = [(int(l), int(r), max(0., s), int(n)) for (l, r, s, n) in Z] # cleaning

leaves = list(leaves_list(Z))
count = len(leaves)
# NOTE(review): presumably the id of the last merge node, with leaves
# numbered 0..count-1 and merges numbered from count upwards -- confirm.
root = len(Z)+count-1

# Square distance matrix; it must have one row per leaf.
X = squareform(y)
assert len(X) == count


from utils import memoise


# bar-joseph optimal ordering ################################################

from barjoseph import optimal

leaves = optimal(root, **{
    "S":        lambda i, j: X[i][j],
    "left":     lambda i: None if i < count else Z[i-count][0],
    "right":    lambda i: None if i < count else Z[i-count][1],
    "is_leaf":  lambda i: i < count,
Ejemplo n.º 19
0
def maxclust_dists(dists, k, method = 'complete'):
    """Return flat cluster labels for ``dists`` using the 'maxclust' criterion with ``t=k``.

    ``dists`` is converted with ``hcluster.squareform`` and clustered
    hierarchically with ``method`` before the hierarchy is flattened.
    """
    hierarchy = hcluster.linkage(hcluster.squareform(dists), method = method)
    return hcluster.fcluster(hierarchy, t = k, criterion = 'maxclust')
Ejemplo n.º 20
0
def dbscan(x,k,Eps = None, distMethod = 'euclidean'):
    '''
    Calculate the density based clustering of an array

    Parameters:
    x -- array of points, one per row
    k -- a point seeds/extends a cluster when at least k+1 points
         (itself included) lie within Eps of it
    Eps -- neighbourhood radius; computed with epsilon(x, k) when None
    distMethod -- metric name passed to H.pdist

    Returns (cClass, tType, Eps, True) on success:
    cClass -- cluster number per point (numbers start at 1), -1 for points
              left without a cluster
    tType -- 1 for points with at least k+1 points within Eps, 0 for other
             points absorbed into a cluster, -1 for points left without one
    Any error is printed and reported as (None, None, None, False).
    '''
    try:
        m = x.shape[0]
        if Eps is None:
            Eps = epsilon(x,k)

        # squareform makes a large m x m matrix; if the input is too large
        # not enough memory exists.  There is no slower pairwise fallback,
        # so report the problem and fail.
        try:
            dist = H.squareform(H.pdist(x, distMethod))
        except MemoryError:
            print("The Input Array is too big and a squareform cannot be computed")
            raise

        touched = N.zeros(m)
        no = 1  # number given to the next cluster

        tType = N.zeros(m)
        cClass = N.zeros(m)

        for i in range(m):
            if touched[i] != 0:
                continue

            # Row i of dist is addressed with the integer loop index; the
            # float-valued index used previously is rejected by current
            # numpy versions.
            ind = set2List(N.where(dist[i] <= Eps))[0]

            if len(ind) > 1 and len(ind) < (k+1):
                tType[i] = 0
                cClass[i] = 0

            if len(ind) == 1:
                # only the point itself is within Eps
                tType[i] = -1
                cClass[i] = -1
                touched[i] = 1

            if len(ind) >= k+1:
                tType[i] = 1
                cClass[ind] = no

                # ind grows while it is being iterated: newly reached points
                # are appended and therefore visited by this same loop.
                for l in ind:
                    touched[l] = 1
                    i1 = set2List(N.where(dist[l] <= Eps))[0]
                    if len(i1) > 1:
                        cClass[i1] = no
                        if len(i1) >= k+1:
                            tType[l] = 1
                        else:
                            tType[l] = 0

                        for j in i1:
                            if touched[j] == 0:
                                touched[j] = 1
                                ind.append(j)
                                cClass[j] = no

                no += 1

        # everything that never received a cluster number is marked -1
        i1 = set2List(N.where(cClass == 0))[0]
        cClass[i1] = -1
        tType[i1] = -1
        return cClass, tType, Eps, True
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        errorMsg = "An error occured with the DBSCAN Algorithm\n"
        errorMsg += "Sorry: %s\n\n%s\n" % (exc_type, exc_value)
        print(errorMsg)

        return None,None,None,False
def dbscan(x,k,Eps = None, distMethod = 'euclidean'):
    '''
    Calculate the density based clustering of an array

    Parameters:
    x -- array of points, one per row
    k -- a point seeds/extends a cluster when at least k+1 points
         (itself included) lie within Eps of it
    Eps -- neighbourhood radius; computed with epsilon(x, k) when None
    distMethod -- metric name passed to H.pdist

    Returns (cClass, tType, Eps, True) on success:
    cClass -- cluster number per point (numbers start at 1), -1 for points
              left without a cluster
    tType -- 1 for points with at least k+1 points within Eps, 0 for other
             points absorbed into a cluster, -1 for points left without one
    Any error is printed and reported as (None, None, None, False).
    '''
    try:
        m = x.shape[0]
        if Eps is None:
            Eps = epsilon(x,k)
        dist = H.squareform(H.pdist(x, distMethod))

        touched = N.zeros(m)
        no = 1  # number given to the next cluster

        tType = N.zeros(m)
        cClass = N.zeros(m)

        for i in range(m):
            if touched[i] != 0:
                continue

            # Row i of dist is addressed with the integer loop index; the
            # float-valued index used previously is rejected by current
            # numpy versions.
            ind = set2List(N.where(dist[i] <= Eps))[0]

            if len(ind) > 1 and len(ind) < (k+1):
                tType[i] = 0
                cClass[i] = 0

            if len(ind) == 1:
                # only the point itself is within Eps
                tType[i] = -1
                cClass[i] = -1
                touched[i] = 1

            if len(ind) >= k+1:
                tType[i] = 1
                cClass[ind] = no

                # ind grows while it is being iterated: newly reached points
                # are appended and therefore visited by this same loop.
                for l in ind:
                    touched[l] = 1
                    i1 = set2List(N.where(dist[l] <= Eps))[0]
                    if len(i1) > 1:
                        cClass[i1] = no
                        if len(i1) >= k+1:
                            tType[l] = 1
                        else:
                            tType[l] = 0

                        for j in i1:
                            if touched[j] == 0:
                                touched[j] = 1
                                ind.append(j)
                                cClass[j] = no

                no += 1

        # everything that never received a cluster number is marked -1
        i1 = set2List(N.where(cClass == 0))[0]
        cClass[i1] = -1
        tType[i1] = -1
        return cClass, tType, Eps, True
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        # the newline keeps "Algorithm" and "Sorry" from running together
        errorMsg = "An error occured with the DBSCAN Algorithm\n"
        errorMsg += "Sorry: %s\n\n%s\n" % (exc_type, exc_value)
        print(errorMsg)

        return None,None,None,False
Ejemplo n.º 22
0
        sym_matrix[i][j]=sym_matrix[j][i]=dendropy.treecalc.symmetric_difference(trees[i],trees[j])  
        euc_matrix[i][j]=euc_matrix[j][i]=dendropy.treecalc.euclidean_distance(trees[i],trees[j])
# Normalise if specified: each matrix is divided by its maximum value.
# NOTE(review): no minimum is subtracted, so the values only span [0, 1]
# when the matrix minimum is 0 -- confirm that this is intended.
if normalise:
    rf_matrix = rf_matrix / np.max(rf_matrix)
    sym_matrix = sym_matrix / np.max(sym_matrix)
    euc_matrix = euc_matrix / np.max(euc_matrix)

linkages = ['single','complete','average','weighted','ward']
matrices = [rf_matrix, sym_matrix, euc_matrix]
matrix_names = ['rf','sym', 'euc']

# One dendrogram PDF per (linkage method, distance matrix) combination.
for x in range(len(linkages)):
    for y in range(len(matrices)):
        filename = "{0}{1}_{2}_{3}.pdf".format(INPUT_DIR,save_prefix,linkages[x],matrix_names[y])
        # Prefer the condensed form of the matrix; if squareform (or the
        # linkage on it) fails for any reason, the matrix is handed to
        # linkage unchanged.
        # NOTE(review): the bare except hides the reason for the failure.
        try: 
            Y = squareform(matrices[y])
            link = linkage(Y, linkages[x])
        except:
            Y = matrices[y]
            link = linkage(Y, linkages[x])
        # Cut height: a fraction of the distance stored in the last linkage row.
        cut = (link[-1][2])*cut_proportion
        T = fcluster(link,cut,criterion="distance")
        # Leaf label: the tree file name without directory and extension,
        # followed by "_" and the leaf's flat cluster number.
        dendrogram( link, color_threshold=cut, leaf_font_size=font_size, leaf_rotation=90,leaf_label_func=lambda leaf: tree_files[leaf][1+tree_files[leaf].rindex('/'):tree_files[leaf].rindex('.')]+"_"+str(T[leaf]),count_sort=True)
        title("{0} linkage of {1} matrix".format(linkages[x],matrix_names[y]))
        # dashed line marking the cut height
        axhline(cut,color='grey',ls='dashed')
        xlabel('Gene')
        ylabel('Distance')
        savefig(filename,format='pdf',dpi=1600)
        # clear the figure before the next combination is drawn
        clf()       
Ejemplo n.º 23
0
def cluster(M, method='complete'):
    """Return the hcluster linkage of matrix ``M`` using ``method``."""
    # linkage is given the squareform conversion of M
    condensed = hcluster.squareform(M)
    return hcluster.linkage(condensed, method=method)
Ejemplo n.º 24
0
    # for f in range(0, len(features)):
    #   if features[f]['property'] in t:
    #     if features[f]['type'] == 'numeric':
    #       pass
    #     elif features[f]['type'] == 'discrete':
    #       if t[features[f]['property']]['value'] == features[f]['value']:
    #         resource_features[snum][f] = float(1.0)


print "Found %d distinct resources" % len(resources)

# NOTE(review): presumably drops the reference to the fetched rows and then
# pauses before the memory-heavy distance computation -- confirm why the
# 10 second sleep is needed.
rows = None
time.sleep(10)

print "Computing distances"
# Square matrix of pairwise distances between the resource feature vectors.
distances = squareform(pdist(resource_features))
# Indices that sort each row of the distance matrix (argsort works along
# the last axis by default).
sorted_distance_args = numpy.argsort(distances)

print "Writing arff with id %s" % source_id
fout = open("%s.arff" % source_id, "w")
# Header comment lines ("%" starts a comment in the written file).
fout.write("%% Similar resources generated by %s\n" % __file__)
fout.write("%% Date: %s\n" % datetime.datetime.today().isoformat())
fout.write("%% Source dataset: %s\n" % opts.dataset)
for query_orig in queries:
    fout.write("%% Query: %s\n" % query_orig)
fout.write("%% Found %d distinct resources\n" % len(resources))
if opts.weights:
    fout.write("%% Weights: %s\n" % ", ".join(opts.weights))

# NOTE(review): "@DATA" is written without a trailing newline -- confirm
# that the rows written afterwards start with one.
fout.write("@DATA")
for r in resources:
Ejemplo n.º 25
0
# Linkage method used for the hierarchical clustering below.
method = 'single'

# Binary example data: 6 rows of 9 values.
data = np.matrix([[1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1],
                  [0, 0, 0, 0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0],
                  [0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 0, 0, 0]])

# Condensed pairwise distances between the rows of data, then the
# hierarchical clustering built from them.
y = pdist(data, metric=metric)
Z = linkage(y, method=method, metric=metric)
dendrogram(Z)
# Turn every linkage row into a plain tuple: integer node ids and size,
# and a merge distance clamped so it is never negative.
Z = [(int(l), int(r), max(0., s), int(n)) for (l, r, s, n) in Z]  # cleaning

leaves = list(leaves_list(Z))
count = len(leaves)
# NOTE(review): presumably the id of the last merge node, with leaves
# numbered 0..count-1 and merges numbered from count upwards -- confirm.
root = len(Z) + count - 1

# Square distance matrix; it must have one row per leaf.
X = squareform(y)
assert len(X) == count

from utils import memoise

# bar-joseph optimal ordering ################################################

from barjoseph import optimal

leaves = optimal(
    root, **{
        "S": lambda i, j: X[i][j],
        "left": lambda i: None if i < count else Z[i - count][0],
        "right": lambda i: None if i < count else Z[i - count][1],
        "is_leaf": lambda i: i < count,
        "is_empty": lambda v: v is None,
Ejemplo n.º 26
0
import hcluster
import matplotlib.pyplot as plt
import pickle
import urllib

url = "http://examples.obspy.org/dissimilarities.pkl"
# NOTE: unpickling data fetched over plain HTTP can execute arbitrary code;
# only do this with a source you trust.
dissimilarity = pickle.load(urllib.urlopen(url))

# Left panel: similarity (1 - dissimilarity) as an image.
plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest")

# linkage is given the condensed form of the square matrix
dissimilarity = hcluster.squareform(dissimilarity)
# One cut-off shared by the flat clustering and the dendrogram colouring.
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

# Right panel: dendrogram coloured at the same cut-off.
plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
Ejemplo n.º 27
0
# Compute and plot first dendrogram.
fig = plt.figure(figsize=(8, 8))
# add_axes takes [left, bottom, width, height]
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
# Single-linkage clustering of the distances in data_dist.
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right',
                labels=data.dtype.names)  # adding/removing the axes
ax1.set_xticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Z2 = dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])

#Compute and plot the heatmap
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
# Leaf orders of the two dendrograms, used to reorder the matrix so its
# rows and columns line up with them.
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = squareform(data_dist)
D = D[idx1, :]
D = D[:, idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Plot colorbar.
axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
plt.colorbar(im, cax=axcolor)

fig.savefig('../../results/heatmap.png')
Ejemplo n.º 28
0
# Compute and plot first dendrogram.
fig = plt.figure(figsize=(8,8))
# add_axes takes [left, bottom, width, height]
ax1 = fig.add_axes([0.05,0.1,0.2,0.6])
# Single-linkage clustering of the distances in data_dist.
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right',labels=data.dtype.names) # adding/removing the axes
ax1.set_xticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
Z2 = dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])

#Compute and plot the heatmap
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
# Leaf orders of the two dendrograms, used to reorder the matrix so its
# rows and columns line up with them.
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = squareform(data_dist)
D = D[idx1,:]
D = D[:,idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Plot colorbar.
axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
plt.colorbar(im, cax=axcolor)

fig.savefig('../../results/heatmap.png')