Beispiel #1
0
    def time_subcluster(self, locs):
        """Return a pairwise matrix for ``locs`` scaled by sampled routing ratios.

        The points are Ward-clustered into at most 50 groups, the group
        centroids are sent to ``self.mapzen_matrix`` and the resulting matrix
        is compared to the haversine distances between the same centroids.
        Every point-to-point haversine distance is then multiplied by the
        (mean-normalised) ratio of the two clusters the points belong to.

        :param locs: 2-D numpy array of coordinates; column 0 is used as
            ``lon`` and column 1 as ``lat`` when querying Mapzen.
        :returns: square numpy array with one row/column per entry of ``locs``.
        """
        # Getting subclusters at Mapzen's limit
        max_clusters = 50
        cluster_linkage = linkage(locs, method='ward')
        clusters = fcluster(cluster_linkage, max_clusters,
                            criterion='maxclust')

        # 'maxclust' is an upper bound: with few points fewer clusters come
        # back, and averaging an empty cluster would produce NaN centroids.
        n_clusters = int(clusters.max())
        cluster_means = np.array([
            np.mean(locs[clusters == label], axis=0)
            for label in range(1, n_clusters + 1)
        ])

        mapzen_locs = [{'lat': p[1], 'lon': p[0]} for p in cluster_means]
        mapzen_matrix = self.mapzen_matrix(mapzen_locs)

        # Cluster labels used for mapping back together
        # Subtracting one to use 0 index
        cl = clusters - 1

        # Get a matching distance matrix of lat/lon distance, get ratios
        # NOTE(review): haversine receives rows in (lon, lat) column order
        # here -- confirm that matches the helper's expected argument order.
        cluster_km_dist = squareform(pdist(cluster_means,
                                           (lambda u, v: haversine(u, v))))

        # The diagonal is 0/0; nan_to_num turns those entries into 0.
        dist_ratio_matrix = np.nan_to_num(np.divide(mapzen_matrix,
                                                    cluster_km_dist))
        # Divide items by mean to normalize a bit
        dist_ratio_matrix = np.nan_to_num(np.divide(dist_ratio_matrix,
                                                    dist_ratio_matrix.mean()))

        locs_km_dist = squareform(pdist(locs, (lambda u, v: haversine(u, v))))

        # Scale entry (i, j) by the ratio of the clusters of i and j; the
        # broadcasted index pair picks dist_ratio_matrix[cl[i], cl[j]] for
        # every cell at once instead of visiting them in a Python loop.
        locs_km_dist *= dist_ratio_matrix[cl[:, np.newaxis], cl]

        return locs_km_dist
def gethclinks(exparray, method):
    """Return the merge pairs of a hierarchical clustering of ``exparray``.

    Pairwise distances are computed with ``hcluster.pdist`` using ``method``
    as the metric, then linked with ``hcluster.linkage`` defaults.  Only the
    first two columns of every linkage row are kept, cast to ``int``.
    """
    distances = hcluster.pdist(exparray, method)
    merges = hcluster.linkage(distances)
    return [[int(row[0]), int(row[1])] for row in merges]
def c_dists(Y,use_swt=True,level_weights=False):
    """Cosine distances between the wavelet coefficients of Y's columns.

    Each column of ``Y`` is decomposed with the 'sym2' wavelet, either by the
    stationary transform (``use_swt``) or by ``wavedec``.  Without
    ``level_weights`` the coefficients of a column are concatenated and the
    condensed ``pdist`` result is returned; with ``level_weights`` a square
    matrix of per-level cosine distances (all weights 1.0) is returned.

    Raises ``NameError`` when ``level_weights`` is combined with ``use_swt``.
    """
    wavelet = pywt.Wavelet('sym2')
    if use_swt:
        L = pywt.swt_max_level(Y.shape[0])
        raw = [pywt.swt(Y[:, col], wavelet, level=L)
               for col in range(Y.shape[1])]
        # every SWT level is a pair of arrays; join both halves into one list
        C = [[list(reshape(pair[0], -1)) + list(reshape(pair[1], -1))
              for pair in per_series]
             for per_series in raw]
    else:
        L = pywt.dwt_max_level(Y.shape[0], wavelet)
        C = [pywt.wavedec(Y[:, col], wavelet, level=L)
             for col in range(Y.shape[1])]

    if not level_weights:
        flattened = [[value for level in series for value in level]
                     for series in C]
        return abs(pdist(flattened, 'cosine'))

    if use_swt:
        raise NameError('No level weights with SWT')
    Wc = [1.] * L
    total_weight = sum(Wc)
    count = len(C)
    D = zeros((count, count))
    for i in range(count):
        for j in range(i + 1, count):
            weighted = [distance.cosine(C[i][x], C[j][x]) * Wc[x]
                        for x in range(L)]
            D[i, j] = D[j, i] = sum(weighted) / total_weight
    return D
Beispiel #4
0
def dendro(X,metric='cosine',combine='average',showdendro=True,leaf_label_func=identity,**kw):
    """Cluster the rows of ``X`` and return the linkage matrix.

    ``metric`` is handed to ``pdist`` and ``combine`` to ``linkage``.  When
    ``showdendro`` is true the dendrogram is drawn (extra keyword arguments
    are forwarded to ``dendrogram``) and ``show()`` is called.
    """
    Z = linkage(pdist(X, metric), combine)
    if not showdendro:
        return Z
    dendrogram(Z, leaf_label_func=leaf_label_func, **kw)
    show()
    return Z
Beispiel #5
0
def main():
    """Print a greeting and draw a dendrogram of random 10x100 data.

    The first five rows are doubled before the distances are computed.
    """
    print("hola")
    samples = rand(10, 100)
    samples[:5, :] *= 2
    dendrogram(linkage(pdist(samples)))
Beispiel #6
0
def dendro(X, metric="cosine", combine="average", showdendro=True, leaf_label_func=identity, **kw):
    """Build the linkage matrix for the rows of ``X`` and optionally plot it.

    ``metric`` goes to ``pdist``, ``combine`` to ``linkage``; remaining
    keyword arguments are passed to ``dendrogram`` when ``showdendro`` is set.
    The linkage matrix is returned in either case.
    """
    condensed = pdist(X, metric)
    Z = linkage(condensed, combine)
    if not showdendro:
        return Z
    dendrogram(Z, leaf_label_func=leaf_label_func, **kw)
    show()
    return Z
Beispiel #7
0
def test_pdist(repeat, runs, data):
    """Time ``hcluster.pdist(data)`` ``repeat * runs`` times.

    Both the wall-clock time (``time.time``) and the processor clock are
    sampled around every call.  A one-line summary is printed.

    :param repeat: number of outer repetitions.
    :param runs: number of timed calls per repetition.
    :param data: 2-D array (``data.shape`` is reported as objects x features).
    :returns: ``(mean_time, std_time, mean_clock, std_clock)``.
    """
    np.random.seed(int(time.time()))

    # time.clock() was removed in Python 3.8; only fall back when it is
    # missing so existing interpreters keep measuring exactly as before.
    cpu_clock = getattr(time, 'clock', None) or time.process_time

    clocks = np.empty((repeat, runs))
    times = np.empty((repeat, runs))

    for i in xrange(repeat):
        for j in xrange(runs):
            t1 = time.time()
            c1 = cpu_clock()
            dist_m = hcluster.pdist(data)
            c2 = cpu_clock()
            t2 = time.time()
            clocks[i, j] = c2 - c1
            times[i, j] = t2 - t1
            # release the result before the next timed call
            del dist_m

    mean_clock = np.mean(clocks)
    std_clock = np.std(clocks)
    mean_time = np.mean(times)
    std_time = np.std(times)

    print('%d objects, %d features: clocks=%f +- %f, times=%f +- %f' % (data.shape[0], data.shape[1], mean_clock, std_clock, mean_time, std_time))

    return mean_time, std_time, mean_clock, std_clock
Beispiel #8
0
def test():
    """Demo: find the two sub-clusters joined where 'C' and 'B' first meet.

    Fifteen random 2-D points (one per letter A..O) are linked; the paths of
    the two chosen letters through the linkage are intersected and the
    members of both children of their first common merge are printed.
    """
    word_list = list('ABCDEFGHIJKLMNO')
    cons_words = ['C', 'B']
    n_words = len(word_list)

    X = rand(15, 2)
    print(X)
    Y = pdist(X)
    print(Y)
    Z = linkage(Y)
    dendrogram(Z)

    index1 = word_list.index(cons_words[0])
    assert index1 >= 0
    path1 = findPath(Z, index1, n_words)
    index2 = word_list.index(cons_words[1])
    assert index2 >= 0
    path2 = findPath(Z, index2, n_words)

    print(Z)
    print(path1)
    print(path2)

    # the smallest shared node id is the first merge containing both words
    first = min(set(path1) & set(path2))
    assert(first >= n_words)
    merge_row = Z[first - n_words]
    merge1 = findCluster(Z, merge_row[0], word_list)
    merge2 = findCluster(Z, merge_row[1], word_list)

    print(merge1)
    print(merge2)
def ClusteringWithC_Index(Data,NumberOfClusters,NumberofIterationsForCindex,DistanceBetweenAllPairNodesSorted,DistanceMethod='euclidean'):
    """Run k-means repeatedly and keep the centroids with the lowest C-index.

    For each of ``NumberofIterationsForCindex`` runs of ``kmeans2`` the sum
    of within-cluster pairwise distances (Scl) and the number of such pairs
    (N) are computed; the index is ``(Scl - Smin) / (Smax - Smin)`` where
    Smin / Smax are the sums of the first / last N entries of
    ``DistanceBetweenAllPairNodesSorted``.

    :returns: ``(centroids, c_index)`` of the best run; ``([], sys.maxint)``
        when no run improved on the initial value.
    """
    sorted_dists = DistanceBetweenAllPairNodesSorted
    best_centers = []
    best_c = sys.maxint

    for _ in xrange(NumberofIterationsForCindex):
        centroid, labels = kmeans2(Data, NumberOfClusters, iter=500, thresh=1e-05, minit='random', missing='warn')
        within_sum = 0
        pair_count = 0
        for i in xrange(NumberOfClusters):
            members = Data[numpy.where(labels == i)]
            within_sum = within_sum + numpy.sum(hcluster.pdist(members, DistanceMethod))
            pair_count = pair_count + Combination(len(members), 2)
        s_min = numpy.sum(sorted_dists[0:pair_count:1])
        s_max = numpy.sum(sorted_dists[len(sorted_dists) - pair_count::1])
        c_index = (within_sum - s_min) / (s_max - s_min)
        if c_index < best_c:
            best_c = c_index
            best_centers = centroid[:]
    return best_centers, best_c
def c_dists(Y, use_swt=True, level_weights=False):
    """Cosine distances between the wavelet coefficients of Y's columns.

    Columns of ``Y`` are decomposed with the 'sym2' wavelet (stationary
    transform when ``use_swt``, ``wavedec`` otherwise).  By default all
    coefficients of a column are concatenated and the condensed ``pdist``
    output is returned; with ``level_weights`` a square matrix of per-level
    cosine distances, averaged with unit weights, is returned instead.

    Raises ``NameError`` when ``level_weights`` is combined with ``use_swt``.
    """
    wavelet = pywt.Wavelet('sym2')
    if use_swt:
        L = pywt.swt_max_level(Y.shape[0])
        transformed = [pywt.swt(Y[:, col], wavelet, level=L)
                       for col in range(Y.shape[1])]
        C = []
        for per_series in transformed:
            # each SWT level is a pair of arrays; merge them into one list
            C.append([list(reshape(pair[0], -1)) + list(reshape(pair[1], -1))
                      for pair in per_series])
    else:
        L = pywt.dwt_max_level(Y.shape[0], wavelet)
        C = [pywt.wavedec(Y[:, col], wavelet, level=L)
             for col in range(Y.shape[1])]

    if not level_weights:
        flattened = [[value for level in series for value in level]
                     for series in C]
        return abs(pdist(flattened, 'cosine'))

    if use_swt:
        raise NameError('No level weights with SWT')
    Wc = [1.] * L
    weight_total = sum(Wc)
    size = len(C)
    D = zeros((size, size))
    for i in range(size):
        for j in range(i + 1, size):
            per_level = [distance.cosine(C[i][x], C[j][x]) * Wc[x]
                         for x in range(L)]
            D[i, j] = D[j, i] = sum(per_level) / weight_total
    return D
    def run(self):
        """Print a greeting, compute distances of random features, plot them.

        The features come from ``self.getRandomFeatures()``; the condensed
        distance vector's length is printed and the vector is handed to
        ``self.drawDendrogram``.
        """
        print('hello world')
        dist = hcluster.pdist(self.getRandomFeatures())
        print(len(dist))

        self.drawDendrogram(dist)
Beispiel #12
0
def DBSCAN(Dataset, Epsilon,MinumumPoints,DistanceMethod = 'euclidean'):
    """Assign a cluster number to every row of ``Dataset``.

    ``Dataset`` is an m x n matrix (m items, n dimensions).  Each unvisited
    point whose ``Epsilon`` neighbourhood holds at least ``MinumumPoints``
    entries starts a new cluster that is grown by ``ExpandClsuter``.

    :returns: array of length m with the cluster number per point
        (0 where no number was assigned).
    """
    m, n = Dataset.shape
    Visited = numpy.zeros(m, 'int')
    # point type: -1 noise/outlier, 0 border, 1 core
    Type = numpy.zeros(m)
    ClustersList = []
    Cluster = []
    PointClusterNumber = numpy.zeros(m)
    cluster_number = 1
    DistanceMatrix = hcluster.squareform(hcluster.pdist(Dataset, DistanceMethod))

    for i in xrange(m):
        if Visited[i] != 0:
            continue
        Visited[i] = 1
        neighbours = numpy.where(DistanceMatrix[i] < Epsilon)[0]
        if len(neighbours) < MinumumPoints:
            Type[i] = -1
            continue

        # reuse the same list object, emptied in place
        del Cluster[:]
        Cluster.append(i)
        PointClusterNumber[i] = cluster_number

        neighbours = set2List(neighbours)
        ExpandClsuter(Dataset[i], neighbours, Cluster, MinumumPoints, Epsilon, Visited, DistanceMatrix, PointClusterNumber, cluster_number)
        Cluster.append(neighbours[:])
        ClustersList.append(Cluster[:])
        cluster_number = cluster_number + 1

    return PointClusterNumber
def gethclinks(exparray, method):
    """Cluster ``exparray`` hierarchically and return the merged index pairs.

    ``method`` is the metric passed to ``hcluster.pdist``.  The result has
    one ``[left, right]`` pair of ints per row of the linkage matrix.
    """
    linkage_rows = hcluster.linkage(hcluster.pdist(exparray, method))
    return [[int(left), int(right)]
            for left, right in ((row[0], row[1]) for row in linkage_rows)]
def plotSampleDistanceDendrogram(ds):
    """Draw a complete-linkage dendrogram over all samples of a dataset.

    :Parameter:
      ds: Dataset
        The source dataset; ``ds.samples`` is clustered and ``ds.labels``
        (translated through ``ds.labels_map``) label the leaves.
    """
    # invert the literal->numeric label map so leaves can show literal labels
    lmap = dict((num, literal) for literal, num in ds.labels_map.iteritems())

    # pairwise sample distances with pdist's default metric, then clustering
    link = clust.linkage(clust.pdist(ds.samples), 'complete')
    leaf_names = [lmap[l] for l in ds.labels]

    # NOTE: per the original author this does not work with etch's version
    # of matplotlib (verified for matplotlib 0.98)
    clust.dendrogram(
        link,
        colorthreshold=0,
        labels=leaf_names,
        # draw every link in black
        link_color_func=lambda x: 'black',
        distance_sort=False)

    # rotate the leaf labels so they stay readable
    P.setp(P.gca().get_xticklabels(), rotation=90, fontsize=9)
Beispiel #15
0
def cluster():
    """Print the pairwise distances of the vectors in the clustering data.

    Reads ``./data/clustering-data.json`` (a sequence of objects with a
    ``'vector'`` entry), computes ``pdist`` over those vectors and prints
    the condensed result.  Returns nothing.
    """
    # the context manager closes the file even if parsing fails
    with open("./data/clustering-data.json") as data_file:
        data = json.load(data_file)

    vectors = [ufo['vector'] for ufo in data]

    distances = pdist(vectors)

    print(distances)
def time_series_clusters(Y, ct=0.5, return_clusters=False):
    """Cluster the columns of ``Y`` by absolute correlation distance.

    With ``return_clusters`` the single-linkage clustering is cut at
    distance ``ct`` and passed through ``cluster_sets``; otherwise the
    distances are handed to ``plot_clusters`` and ``None`` is returned.
    """
    D = abs(pdist(transpose(Y), 'correlation'))
    if not return_clusters:
        plot_clusters(D, ct)
        return
    L = linkage(D, method='single', metric='cosine')
    return cluster_sets(fcluster(L, ct, criterion='distance'))
def time_series_clusters(Y,ct=0.5,return_clusters=False):
    """Group the columns of ``Y`` using absolute correlation distances.

    When ``return_clusters`` is false the distances are only plotted via
    ``plot_clusters`` (result ``None``).  Otherwise a single-linkage tree is
    cut at distance ``ct`` and the flat labels go through ``cluster_sets``.
    """
    correlation_dist = pdist(transpose(Y), 'correlation')
    D = abs(correlation_dist)
    if return_clusters:
        tree = linkage(D, method='single', metric='cosine')
        flat = fcluster(tree, ct, criterion='distance')
        return cluster_sets(flat)
    plot_clusters(D, ct)
    return None
Beispiel #18
0
def cluster_elut(mat):
    """Show ``mat`` with its rows reordered by hierarchical clustering.

    The dendrogram is only drawn to obtain the leaf order; the figure is
    cleared before the reordered matrix is displayed with ``imshow``.
    """
    import hcluster
    links = hcluster.linkage(hcluster.pdist(mat))
    figure()
    leaf_order = hcluster.dendrogram(links)['leaves']
    clf()
    imshow(mat[leaf_order, :])
Beispiel #19
0
def optics(x, k, distMethod = 'euclidean'):
    """Order the points of ``x`` by reachability.

    :param x: numpy array of points, one per row; a 1-D array is treated as
        m one-dimensional points.
    :param k: index into each point's sorted distance row used as its core
        distance (index 0 is the point itself).
    :param distMethod: metric name forwarded to ``H.pdist``.
    :returns: ``(RD, CD, order)`` -- reachability distances, core distances
        and the visiting order of the point indices.
    :raises: whatever ``H.pdist`` / ``H.squareform`` raise (after printing a
        short notice); ``IndexError`` when ``k`` exceeds the row length.
    """
    if len(x.shape) == 1:
        # The original wrote ``n == 1`` here, a comparison of an unbound name
        # that raised NameError; pdist needs a 2-D array, so give each scalar
        # its own row instead.
        x = x.reshape(-1, 1)
    m = x.shape[0]

    try:
        D = H.squareform(H.pdist(x, distMethod))
    except Exception:
        # Without the distance matrix nothing below can run, so report and
        # propagate the real error rather than failing later on ``D``.
        print("squareform or pdist error")
        raise

    # core distance: the k-th smallest entry of every distance row
    CD = np.sort(D, axis=1)[:, k]
    RD = np.ones(m)*1E10

    order = []
    seeds = np.arange(m, dtype=int)

    ind = 0
    while len(seeds) != 1:
        ob = seeds[ind]
        seeds = seeds[seeds != ob]

        order.append(ob)
        # reachability from ob = max(core distance of ob, actual distance)
        reach = np.maximum(CD[ob], D[ob][seeds])
        closer = np.where(RD[seeds] > reach)[0]
        RD[seeds[closer]] = reach[closer]
        ind = np.argmin(RD[seeds])

    order.append(seeds[0])
    RD[0] = 0 #we set this point to 0 as it does not get overwritten
    return RD, CD, order
Beispiel #20
0
    def pdist(self, X):
        """Generator yielding the square distance matrix of ``X`` once.

        Distances use ``self.metric``; when ``self.plot`` is truthy the
        matrix is displayed with pylab before it is yielded.
        """
        import hcluster
        import pylab

        condensed = hcluster.pdist(array(X), metric=self.metric)
        square = hcluster.squareform(condensed)

        if self.plot:
            pylab.imshow(square)
            pylab.show()

        yield square
Beispiel #21
0
def t_dendrogram(X, nclusters):
    """Draw and show the dendrogram of the rows of ``X``.

    :param X: observation matrix handed to ``pdist``.
    :param nclusters: accepted for interface compatibility; not used.
    """
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram

    dendrogram(linkage(pdist(X)))
    show()
Beispiel #22
0
def t_dendrogram(X, nclusters):
    """Plot the hierarchical clustering of the rows of ``X``.

    :param X: observation matrix handed to ``pdist``.
    :param nclusters: kept so existing callers still work; it is ignored.
    """
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram

    Z = linkage(pdist(X))
    dendrogram(Z)
    show()
Beispiel #23
0
    def pdist(self, X):
        """Yield (once) the square pairwise-distance matrix of ``X``.

        ``self.metric`` selects the metric; if ``self.plot`` is set the
        matrix is shown with pylab first.
        """
        import hcluster
        import pylab

        matrix = hcluster.squareform(
            hcluster.pdist(array(X), metric=self.metric))

        if self.plot:
            pylab.imshow(matrix)
            pylab.show()

        yield matrix
Beispiel #24
0
def get_clustering_as_tree(vectors, ward = True, clustering_distance='euclidean', clustering_method = 'complete', progress = progress):
    """Cluster ``vectors`` hierarchically and return the result as a tree.

    With ``ward`` set, ``hcluster.ward`` is applied to the vectors directly.
    Otherwise distances are computed with ``clustering_distance`` and linked
    with ``clustering_method``.  Each step is reported through
    ``progress.update``.
    """
    if not ward:
        progress.update('Computing distance matrix using "%s" distance' % clustering_distance)
        distance_matrix = hcluster.pdist(vectors, clustering_distance)
        progress.update('Clustering data with "%s" linkage' % clustering_method)
        clustering_result = hcluster.linkage(distance_matrix, method = clustering_method)
    else:
        progress.update('Clustering data with Ward linkage and euclidean distances')
        clustering_result = hcluster.ward(vectors)

    progress.update('Returning results')
    tree = hcluster.to_tree(clustering_result)
    return tree
def generate_dendrogram(root):
    """Print distances and linkage of random data and draw its dendrogram.

    A random 10x100 matrix is generated (first five rows doubled).

    :param root: accepted but not used by this function.
    """
    from hcluster import pdist, linkage, dendrogram
    from numpy.random import rand

    X = rand(10, 100)
    X[0:5, :] *= 2
    Y = pdist(X)
    Z = linkage(Y)
    print(Y)
    print(Z)
    dendrogram(Z)
def do_clusters(cluster_coords,Labels=None,link_method='single',d=0.2):
    """Group rows of ``cluster_coords`` whose cosine distance is below ``d``.

    :param cluster_coords: 2-D array-like, one observation per row.
    :param Labels: optional sequence (list, tuple or numpy array) with one
        label per row; when missing or empty, row indices are used instead.
    :param link_method: linkage method passed to ``linkage``.
    :param d: distance threshold at which the tree is cut.
    :returns: ``defaultdict(list)`` mapping flat cluster id -> members.
    """
    # SEEMS THERE MAY SOMETIME BE VERY SMALL NEGATIVE DISTANCES ie -2*10**-16
    D = abs(pdist(cluster_coords, 'cosine'))
    L = linkage(D, method=link_method, metric='cosine')
    # The original also passed 'cosine' positionally, which landed in
    # fcluster's ``depth`` parameter; it is not a metric argument.
    F = fcluster(L, d, criterion='distance')

    # ``if Labels:`` raises on numpy arrays; test presence and size instead.
    use_labels = Labels is not None and len(Labels) > 0
    C = defaultdict(list)
    for i, cluster_id in enumerate(F):
        C[cluster_id].append(Labels[i] if use_labels else i)
    return C
Beispiel #27
0
def MVU_slack(datafile, dim=3):
    """Compute an MVU embedding of the pickled point matrix in ``datafile``.

    The points are mean-centred, a neighbourhood graph is built with
    ``cluster_graph``, the problem is written to ``../ds/sdp.dat`` and
    solved by the external ``csdp`` program, and the solution is read back
    from ``../ds/sdp.sol``.

    :param datafile: path of a pickle file whose first object is the points.
    :param dim: number of embedding dimensions to return.
    :returns: ``(results, pts, res)`` -- list of ``dim`` coordinate vectors,
        the centred points, and the neighbourhood graph (for plotting).
    """
    # NOTE(security): pickle.load executes arbitrary code from the file;
    # only use this with trusted data files.
    with open(datafile) as fp:
        pts = pickle.load(fp)
    size = len(pts)

    # mean center coordinates
    pts = pts - np.mean(pts, axis=0)

    # TODO: move graph cluster algorithm to own file - write in C?

    # compute the distance matrix and cluster
    Y = hc.squareform(hc.pdist(pts, 'euclidean'))
    res = cluster_graph(Y, fnc='k', size=8)
    x, y = np.nonzero(res & (Y != 0))  # indices of nearest neighbors

    # generate data to write problem in SPDA format
    # TODO: add slack variable block
    # keep every neighbour pair once (upper triangle only)
    indx = [(i, j) for (i, j) in zip(x, y) if i <= j]

    num_constraints = len(indx) + 1
    nblocks = 2
    c = [0.0] + [Y[i, j]**2 for (i, j) in indx]

    write_spda_file_slack("../ds/sdp.dat", num_constraints, nblocks, size,
                          c, indx, .01)

    # TODO: add some error checking
    os.system("csdp ../ds/sdp.dat ../ds/sdp.sol")

    _y, _Z, X = read_sol_file_slack("../ds/sdp.sol", size)

    # spectral decomposition of the dual solution (X)
    u, s, v = la.svd(X)

    results = [np.sqrt(s[i]) * u[:, i] for i in range(dim)]

    # returns the neighborhood graph for proper plotting
    return results, pts, res
def optics(x, k, distMethod = 'euclidean'):
    """Compute the square distance matrix of ``x`` (setup part of OPTICS).

    Only the setup is present here: the shape is read, the distance matrix
    is attempted and a failure is reported on stdout.  Nothing is returned.
    NOTE(review): the locals below are unused in this snippet -- presumably
    the remainder of the function was cut off.
    """
    if len(x.shape)>1:
        m,n = x.shape
    else:
        m = x.shape[0]
        # was ``n == 1``: a comparison of an unbound name (NameError)
        n = 1

    try:
        D = H.squareform(H.pdist(x, distMethod))
        distOK = True
    except Exception as ex:
        print(ex)
        print("squareform or pdist error")
        distOK = False
def printSummary(updatedtfidfMatrix, queriedSentences):
    """Order sentences by their clustering and return them as one string.

    The rows of ``updatedtfidfMatrix`` are clustered by cosine distance and
    the dendrogram is shown.  The linkage rows are then walked to build an
    output order: leaf ids become sentences, cluster ids become
    ``<br></br>`` separators.  The text is written to ``foo.txt``, read
    back, printed and returned.

    :param updatedtfidfMatrix: one tf-idf vector per sentence.
    :param queriedSentences: the sentences, in the same order as the rows.
    :returns: the generated summary text.
    """
    print("\n")
    a = pdist(updatedtfidfMatrix, 'cosine')
    print(a)
    b = linkage(a)
    dendrogram(b)
    show()
    print(b)

    # ids up to last_leaf are original sentences, larger ids are merges
    last_leaf = len(queriedSentences) - 1
    sumOrder = []
    for row in b:
        x = int(row[0])
        y = int(row[1])

        if x <= last_leaf:
            sumOrder.append(x)
        if y <= last_leaf:
            sumOrder.append(y)
        if x <= last_leaf and y > last_leaf:
            sumOrder.append(y)
        if x > last_leaf and y > last_leaf:
            sumOrder.append(x)

    queriedSentences = [sentence.capitalize() for sentence in queriedSentences]

    # context managers close the file even when a write fails
    with open("foo.txt", "w") as f:
        for num in sumOrder:
            if num > last_leaf:
                f.write('<br></br>')
            else:
                f.write(queriedSentences[num])
                f.write('.')
                f.write(' ')

    with open("foo.txt", "r") as myfile:
        data = myfile.read()

    print(data)
    return data
Beispiel #30
0
def MVU_slack(datafile, dim = 3):
    """Take a pickled matrix of points and return its MVU embedding.

    Steps: mean-centre the points, build the neighbourhood graph with
    ``cluster_graph``, write the semidefinite program to ``../ds/sdp.dat``,
    run the external ``csdp`` solver and decompose the solution read from
    ``../ds/sdp.sol``.

    :param datafile: path of a pickle file whose first object is the points.
    :param dim: number of embedding dimensions to return.
    :returns: ``(results, pts, res)`` -- ``dim`` coordinate vectors, the
        centred points and the neighbourhood graph (for plotting).
    """
    # NOTE(security): unpickling runs arbitrary code contained in the file;
    # only pass trusted paths.
    with open(datafile) as fp:
        pts = pickle.load(fp)
    size = len(pts)

    # mean center coordinates
    centroid = np.mean(pts, axis=0)
    pts = pts - centroid

    # TODO: move graph cluster algorithm to own file - write in C?

    # compute the distance matrix and cluster
    Y = hc.squareform(hc.pdist(pts,'euclidean'))
    res = cluster_graph(Y, fnc = 'k', size = 8)
    x,y = np.nonzero(res & (Y != 0)) # indices of nearest neighbors

    # generate data to write problem in SPDA format
    # TODO: add slack variable block
    # each neighbour pair is kept once (i <= j)
    indx = [(i,j) for (i,j) in zip(x,y) if i <= j]

    n_constraints = len(indx) + 1
    nblocks = 2
    c = [0.0]
    c.extend(Y[i,j]**2 for (i,j) in indx)

    write_spda_file_slack("../ds/sdp.dat", n_constraints, nblocks, size, c, indx, .01)

    # TODO: add some error checking
    os.system("csdp ../ds/sdp.dat ../ds/sdp.sol")

    _y,_Z,X = read_sol_file_slack("../ds/sdp.sol", size)

    # spectral decomposition of the dual solution (X)
    u,s,v = la.svd(X)

    results = [np.sqrt(s[i]) * u[:,i] for i in range(dim)]

    # returns the neighborhood graph for proper plotting
    return results, pts, res
def plot_cluster_tree(cluster_coords,Labels=None,link_method='single',color_thresh=.25,fontsize=8):
    """Show a left-oriented cosine-distance dendrogram of ``cluster_coords``.

    ``Labels`` (when truthy) label the leaves, ``link_method`` is the linkage
    method and ``color_thresh`` the dendrogram colour threshold.  The font
    size is applied through ``pylab.rcParams`` before the figure is shown.
    """
    # SEEMS THERE MAY SOMETIME BE VERY SMALL NEGATIVE DISTANCES ie -2*10**-16
    D = abs(pdist(cluster_coords, 'cosine'))
    L = linkage(D, method=link_method, metric='cosine')

    dendro_kwargs = {'orientation': 'left', 'color_threshold': color_thresh}
    if Labels:
        dendro_kwargs['labels'] = Labels
    dendrogram(L, **dendro_kwargs)

    pylab.title('HMP Buccal Mucosa - Latent Strain Analysis')
    pylab.xlabel('Cosine Distance')
    pylab.ylabel('Strain with the Most Alignments to Each Cluster')
    pylab.rcParams.update({'font.size': fontsize})
    pylab.show()
Beispiel #32
0
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    """Save a heat map of how often columns share a partition.

    For every latent state in ``X_L_list`` two columns count as dependent
    when their ``column_partition`` assignments are equal; the counts are
    averaged over the states, the columns are reordered by hierarchical
    clustering and the matrix is written to ``filename``.

    :param X_L_list: latent states; each provides
        ``['column_partition']['assignments']``.
    :param X_D_list: not used by this function.
    :param M_c: metadata providing ``['idx_to_name'][str(column_index)]``.
    :param filename: path handed to ``pylab.savefig``.
    :param tablename: name interpolated into the figure title.
    """
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = numpy.array(
        [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])
    # extract unordered z_matrix
    num_latent_states = len(X_L_list)
    z_matrix = numpy.zeros((num_cols, num_cols))
    for X_L in X_L_list:
        assignments = numpy.asarray(
            X_L['column_partition']['assignments'])[:num_cols]
        # outer equality: cell (i, j) is True when i and j share a partition
        z_matrix += (assignments[:, None] == assignments[None, :])
    z_matrix /= float(num_latent_states)
    # hierarchically cluster z_matrix
    Y = hcluster.pdist(z_matrix)
    Z = hcluster.linkage(Y)
    # Ask for the leaf order directly instead of drawing a throwaway figure
    # and parsing its tick labels back into integers.
    reorder_indices = hcluster.dendrogram(Z, no_plot=True)['leaves']
    # REORDER!
    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]
    # actually create figure
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered,
                 interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()
    axes = pylab.gca()
    if num_cols < 14:
        axes.set_yticks(range(num_cols))
        axes.set_yticklabels(column_names_reordered, size='x-small')
        axes.set_xticks(range(num_cols))
        axes.set_xticklabels(column_names_reordered,
                             rotation=90,
                             size='x-small')
    else:
        # too many columns for every label: alternate between the two axes
        axes.set_yticks(range(num_cols)[::2])
        axes.set_yticklabels(column_names_reordered[::2],
                             size='x-small')
        axes.set_xticks(range(num_cols)[1::2])
        axes.set_xticklabels(column_names_reordered[1::2],
                             rotation=90,
                             size='small')
    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
Beispiel #33
0
def OnLeftDClick(self, event):
    """Handle a left double click by plotting the model's dendrogram.

    The DEVS model is fetched with ``self.getDEVSModel()``; its ``vectors``
    are run through ``pdist`` / ``linkage`` and the dendrogram is shown.
    If no model is available an information box is displayed instead.
    """
    devs = self.getDEVSModel()
    if not devs:
        wx.MessageBox(_("No DEVS model is instanciated.\nGo back to the simulation!"), _("Info"), wx.OK|wx.ICON_INFORMATION)
        return
    dendrogram(linkage(pdist(devs.vectors)))
    show()
    def cluster_path_times(self, path_times,display):
        """Cluster recordings by duration and time of day, printing details.

        Each recording becomes ``[seconds (with microseconds), minute of
        day]``.  The points, the linkage matrix and the result of
        ``self.calculate_variances`` are printed; the dendrogram is shown
        only when ``display`` is truthy.
        """
        X = [[rec.time.seconds+rec.time.microseconds/10**6.,
              rec.date.hour*60+rec.date.minute]
             for rec in path_times.recordings]
        print(X)
        Z = linkage(pdist(X))
        dendrogram(Z)
        for i, point in enumerate(X):
            print('{0}, {1}'.format(i, point))
        print(Z)
        print(self.calculate_variances(X, Z))
        if display:
            show()
    def hierarchical(self,lst,fulldataset):
        """Average-linkage clustering of the samples, saved as a PDF figure.

        The dendrogram is written to 'Average Hierarchical Clustering.pdf'
        with leaf labels coloured red ('cancer...') or blue ('normal...').
        Afterwards ``self.labels`` (1 for 'cancer', 0 otherwise) and
        ``self.score`` (silhouette score) are set.

        lst: passed to ``self.classLabel`` to obtain the sample class names.
        fulldataset: transposed with ``zip(*...)`` before clustering, so its
            columns are presumably the samples -- TODO confirm.
        """
        # Map every numbered sample label to a colour by its class prefix
        label_color={}
        for i in self.numbering(self.classLabel(lst)):
            r=('r')
            b=('b')
            if i[0:6]=='cancer':
                label_color[i]=r
                #print label_colors
            elif i[0:6]=='normal' :
                label_color[i]=b
                #print label_colors
            else:
                # labels with any other prefix get no colour entry
                continue
        tg=zip(*fulldataset)
        Y = pdist(tg)
        # average linkage is applied
        Z = linkage(Y,method='average')
        sch.set_link_color_palette(['black'])
        a=sch.dendrogram(Z,leaf_font_size=6,labels=self.newlist)


        # colour the tick labels of the plotted dendrogram
        ax = plt.gca()
        xlbls = ax.get_xmajorticklabels()

        for lbl in xlbls:
            # NOTE(review): raises KeyError for a leaf label that was skipped
            # by the colouring loop above
            lbl.set_color(label_color[lbl.get_text()])
        plt.title("Average Hierarchical Clustering Algorithm")
        plt.savefig('Average Hierarchical Clustering.pdf',dpi=500)
        #plt.show()
        plt.close()

        self.labels=array([])
        c=array([1])
        n=array([0])

        # Silhouette test:
        # sample classes are converted into 1 ('cancer') or 0 for validation
        for i in self.classLabel(lst):
            if i=='cancer':
                self.labels=np.concatenate([self.labels,c])
            else:
                self.labels=np.concatenate([self.labels,n])

        # NOTE(review): the second argument of np.delete is an index, so this
        # removes the element at position 0 or 1 (the last label's value),
        # not the last element -- confirm the intent.
        self.labels=np.delete(self.labels,self.labels[-1])
        # NOTE(review): the score is computed on the linkage matrix Z rather
        # than on the samples -- confirm this is intended.
        self.score=metrics.silhouette_score(Z, self.labels, metric='euclidean')
def oldClusteringWithC_Index(Data,NumberOfClusters,NumberofIterationsForCindex,DistanceBetweenAllPairNodesSorted,DistanceMethod='euclidean'):
    """Fit KMeans repeatedly and keep the centres with the lowest C-index.

    Every iteration fits a fresh ``cluster.KMeans`` with random
    initialisation, sums the within-cluster pairwise distances (Scl) and
    counts those pairs (N).  The index is ``(Scl - Smin) / (Smax - Smin)``,
    Smin / Smax being the sums of the first / last N entries of
    ``DistanceBetweenAllPairNodesSorted``.

    :returns: ``(cluster_centers, c_index)`` of the best iteration;
        ``([], sys.maxint)`` when none improved on the initial value.
    """
    sorted_dists = DistanceBetweenAllPairNodesSorted
    best_centers = []
    best_c = sys.maxint

    for _ in xrange(NumberofIterationsForCindex):
        # init : {'k-means++', 'random', 'points','matrix'}
        # http://scikit-learn.sourceforge.net/modules/generated/scikits.learn.cluster.KMeans.html#scikits.learn.cluster.KMeans
        classifier = cluster.KMeans(k=NumberOfClusters, init='random', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True)
        classifier.fit(Data)

        within_sum = 0
        pair_count = 0
        for i in xrange(NumberOfClusters):
            members = Data[numpy.where(classifier.labels_ == i)]
            within_sum = within_sum + numpy.sum(hcluster.pdist(members, DistanceMethod))
            pair_count = pair_count + Combination(len(members), 2)

        s_min = numpy.sum(sorted_dists[0:pair_count:1])
        s_max = numpy.sum(sorted_dists[len(sorted_dists) - pair_count::1])
        c_index = (within_sum - s_min) / (s_max - s_min)
        if c_index < best_c:
            best_c = c_index
            best_centers = classifier.cluster_centers_[:]

    return best_centers, best_c
Beispiel #37
0
def cluster_ids(gids, unnorm_eluts, sp, gt=None, dist='cosine', do_plot=True,
        norm_rows=True, bigarr=None, **kwargs):
    """Return ``gids`` reordered by hierarchical clustering of their rows.

    :param gids: indexable sequence of ids, one per row of the array.
    :param unnorm_eluts, sp, norm_rows: forwarded to ``single_array`` when
        ``bigarr`` is not supplied.
    :param gt: optional object with an ``id2name`` mapping used for the tick
        labels; when ``None`` the raw ids are used as labels.
    :param dist: metric passed to ``hcluster.pdist``.
    :param do_plot: draw the dendrogram and the reordered matrix.
    :param bigarr: precomputed array to cluster instead of building one.
    :param kwargs: extra arguments for ``hcluster.dendrogram``.
    :returns: list of ids in dendrogram leaf order.
    """
    import plotting as pl
    import hcluster
    arr = (bigarr if bigarr is not None else single_array(gids, unnorm_eluts,
        sp, norm_rows=norm_rows))
    ymat = hcluster.pdist(arr, metric=dist)
    zmat = hcluster.linkage(ymat)
    zmat = np.clip(zmat, 0, 10**8)
    if do_plot:
        pl.figure()
    # ``not do_plot`` stays consistent with the ``if do_plot`` tests for any
    # truthy value (``bool(1 - do_plot)`` did not for e.g. do_plot=2)
    order = hcluster.dendrogram(zmat, no_plot=not do_plot,
            **kwargs)['leaves']
    if do_plot:
        ax = pl.gca()
        if gt is not None:
            tick_names = [gt.id2name[gids[ind]] for ind in order]
        else:
            # gt defaults to None; fall back to the ids instead of failing
            tick_names = [gids[ind] for ind in order]
        ax.axes.set_xticklabels(tick_names)
        pl.figure()
        pl.imshow(arr[order,:])
    return list(np.array(list(gids))[order])
Beispiel #38
0
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    """Plot and save the column co-assignment ("z") matrix.

    Cell (i, j) is the fraction of latent states in ``X_L_list`` in which
    columns i and j have the same ``column_partition`` assignment.  Columns
    are reordered by hierarchical clustering before plotting, and the
    figure is saved to ``filename``.

    :param X_L_list: latent states; each provides
        ``['column_partition']['assignments']``.
    :param X_D_list: not used by this function.
    :param M_c: metadata providing ``['idx_to_name'][str(column_index)]``.
    :param filename: path handed to ``pylab.savefig``.
    :param tablename: name interpolated into the figure title.
    """
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)]
    column_names = numpy.array(column_names)
    # extract unordered z_matrix
    num_latent_states = len(X_L_list)
    z_matrix = numpy.zeros((num_cols, num_cols))
    for X_L in X_L_list:
        assignments = numpy.asarray(
            X_L['column_partition']['assignments'])[:num_cols]
        # broadcasting compares every column pair at once
        same_partition = assignments[:, None] == assignments[None, :]
        z_matrix += same_partition
    z_matrix /= float(num_latent_states)
    # hierarchically cluster z_matrix
    Y = hcluster.pdist(z_matrix)
    Z = hcluster.linkage(Y)
    # The leaf order comes straight from the dendrogram result; no temporary
    # figure has to be drawn and parsed for its tick labels.
    reorder_indices = hcluster.dendrogram(Z, no_plot=True)['leaves']
    # REORDER!
    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]
    # actually create figure
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered, interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()
    axes = pylab.gca()
    if num_cols < 14:
        axes.set_yticks(range(num_cols))
        axes.set_yticklabels(column_names_reordered, size='x-small')
        axes.set_xticks(range(num_cols))
        axes.set_xticklabels(column_names_reordered, rotation=90, size='x-small')
    else:
        # label every other column, alternating between the two axes
        axes.set_yticks(range(num_cols)[::2])
        axes.set_yticklabels(column_names_reordered[::2], size='x-small')
        axes.set_xticks(range(num_cols)[1::2])
        axes.set_xticklabels(column_names_reordered[1::2],
                             rotation=90, size='small')
    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
def DrawDendrogram(feature_vector, obj_names, motion_name):
    """Show an average-linkage dendrogram of ``feature_vector``.

    Leaves are labelled with ``obj_names``; the figure title is
    ``motion_name`` followed by the linkage method name.  The dictionary
    returned by ``hierarchy.dendrogram`` is returned.
    """
    linkage_list = ['single', 'average', 'complete']
    chosen_method = linkage_list[1]

    Z = linkage(pdist(feature_vector), chosen_method)
    dendro_options = dict(
        color_threshold=1.5,
        labels=obj_names,
        orientation='left',
        show_leaf_counts=True,
        leaf_font_size=10,
    )
    render = hierarchy.dendrogram(Z, **dendro_options)

    plt.title(motion_name+'_'+chosen_method)
    plt.show()

    return render
Beispiel #40
0
def get_clustering_as_tree(vectors,
                           ward=True,
                           clustering_distance='euclidean',
                           clustering_method='complete',
                           progress=progress):
    """Return the hierarchical clustering of ``vectors`` as a tree.

    ``ward=True`` uses ``hcluster.ward`` on the raw vectors; otherwise a
    distance matrix is built with ``clustering_distance`` and linked using
    ``clustering_method``.  Progress messages go to ``progress.update``.
    """
    if not ward:
        progress.update('Computing distance matrix using "%s" distance' %
                        clustering_distance)
        distance_matrix = hcluster.pdist(vectors, clustering_distance)
        progress.update('Clustering data with "%s" linkage' %
                        clustering_method)
        clustering_result = hcluster.linkage(distance_matrix,
                                             method=clustering_method)
    else:
        progress.update(
            'Clustering data with Ward linkage and euclidean distances')
        clustering_result = hcluster.ward(vectors)

    progress.update('Returning results')
    tree = hcluster.to_tree(clustering_result)
    return tree
def dendrogramBuild(tfidfMatrix, queriedSentences, degree):
    """Drop near-duplicate sentences until the closest pair is >= ``degree``.

    While the first merge of the cosine-distance linkage is closer than
    ``degree``, one sentence of that pair is removed: the second one if the
    first has the larger ``tfidf.magnitude``, otherwise the first.  Both
    input lists are modified in place.

    :param tfidfMatrix: list of tf-idf vectors, one per sentence.
    :param queriedSentences: list of sentences, parallel to the matrix.
    :param degree: distance threshold below which a pair is a duplicate.
    :returns: ``(tfidfMatrix, queriedSentences)`` after pruning.
    """
    # A loop replaces the original recursion; it also stops once fewer than
    # two rows remain, where linkage has nothing to merge.
    while len(tfidfMatrix) >= 2:
        a = pdist(tfidfMatrix, 'cosine')
        print(a)
        b = linkage(a)
        print(b)

        first = int(b[0][0])
        second = int(b[0][1])
        if not b[0][2] < degree:
            break

        mag1 = tfidf.magnitude(tfidfMatrix[first])
        mag2 = tfidf.magnitude(tfidfMatrix[second])
        drop = second if mag1 > mag2 else first
        print(drop)
        tfidfMatrix.pop(drop)
        queriedSentences.pop(drop)

    return (tfidfMatrix, queriedSentences)
Beispiel #42
0
def hierarchical_clusters( log, show_plot=None ):
    """Plot an average-linkage dendrogram of the log's Parikh vectors.

    Each unique case of ``log`` is turned into a vector of activity counts
    (shifted so that its smallest entry is zero), duplicate vectors are
    dropped, and the remaining ones are clustered on euclidean distance.
    The dendrogram is drawn and shown; nothing is returned and
    ``show_plot`` is not consulted.
    """
    def get_parikh(case, alphabet):
        counts = zeros(len(alphabet), dtype=int)
        for activity in case:
            counts[alphabet[activity]] += 1
        # canonical representation: subtract the smallest count
        return counts - min(counts)

    # Column index for every activity, in alphabet iteration order.
    actsind = {act: position
               for position, act in enumerate(log.get_alphabet())}

    uniq_cases = log.get_uniq_cases()
    data = zeros((len(uniq_cases), len(actsind)), dtype=int)

    # Rows grouped by their textual Parikh vector.
    parikhdict = {}
    for row, case in enumerate(uniq_cases.keys()):
        data[row] = get_parikh(case, actsind)
        signature = ','.join(str(count) for count in data[row])
        parikhdict.setdefault(signature, []).append(row)

    data_uniq = DataFrame(data).drop_duplicates()
    Z = linkage(pdist(data_uniq, metric='euclidean'), method='average')
    dendrogram(Z)
    show()
# Load the pickled results stored under result_path/<data_label>/data.pickle.
# NOTE(review): the file handle is never closed explicitly and pickle.load
# executes whatever the file contains — only use with trusted files.
data = pickle.load(open(path.join(result_path,
                                  p['data_label'], 'data.pickle')))

# One three-row figure per entry of the loaded mapping.
for key, val in data.iteritems():
# Debugging alternative that restricts the loop to a single entry:
# for bla in [1]:
#     key, val = 'eagle', data['eagle']
    

    fig = plt.figure()
    # onpick is defined outside this snippet; it receives the pick events
    # of the points plotted below.
    fig.canvas.mpl_connect('pick_event', onpick)
    plt.subplot(3, 1, 1)
    plt.title(key)

    # Project the vectors onto the first two columns of U, then draw the
    # dendrogram of the projected points in the top panel.
    proj = np.dot(val['U'][:, 0:2].T, val['vecs'])
    Y = pdist(proj.T)
    Z = linkage(Y)
    dendrogram(Z)
    ax = plt.subplot(3, 1, 2)


    # Middle panel: one pickable dot per projected point, labelled with
    # its column index.
    for i in range(proj.shape[1]):
        # Colour is passed as a float formatted to a string; presumably a
        # grey level where higher ratings are darker (assumes ratings lie
        # in 0..100 — TODO confirm).
        col = (1 - (val['ratings'][i] / 100.0)) * 0.7
        pt, = ax.plot(proj[0, i], proj[1, i],
                       '.',
                       color=('%f' % col),
                       picker=3)
        ax.text(proj[0, i], proj[1, i], i)
        # Attach the item's key to the artist, presumably for onpick.
        pt.name = val['keys'][i]

    plt.subplot(3, 1, 3)
Beispiel #44
0
from matplotlib.pyplot import show

from hcluster import pdist, linkage, dendrogram
import numpy
from numpy.random import rand

# Ten random observations with 100 features each, values in [0, 1).
X = rand(10, 100)
# Double the first five observations so they differ from the last five.
X[0:5, :] *= 2
# Condensed pairwise distances (default metric), default linkage, then plot.
Y = pdist(X)
Z = linkage(Y)
dendrogram(Z)

show()
Beispiel #45
0
def _joint_distance(x, y, z, frame_a, joint_a, frame_b, joint_b):
    """Return the Euclidean distance between two (frame, joint) positions."""
    points = [[x[frame_a, joint_a], y[frame_a, joint_a], z[frame_a, joint_a]],
              [x[frame_b, joint_b], y[frame_b, joint_b], z[frame_b, joint_b]]]
    # pdist of two points yields a one-element array; unwrap it so callers
    # store a real scalar instead of relying on array-to-scalar coercion.
    return mat.pdist(points)[0]


def _triangle_angle(x, y, z, frame, end_a, vertex, end_b):
    """Return the angle at ``vertex`` of the triangle end_a-vertex-end_b."""
    side_a = _joint_distance(x, y, z, frame, end_a, frame, vertex)
    side_b = _joint_distance(x, y, z, frame, end_b, frame, vertex)
    opposite = _joint_distance(x, y, z, frame, end_a, frame, end_b)
    # Law of cosines; coincident joints give a zero denominator and hence
    # NaN, exactly as before.
    return np.arccos((side_a**2 + side_b**2 - opposite**2) /
                     (2 * side_a * side_b))


def _frame_deltas(coord, previous, n_joints):
    """Return ``coord[f] - coord[previous[f]]`` for every frame, as float64."""
    current = coord[:len(previous), :n_joints]
    deltas = np.zeros((len(previous), n_joints))
    deltas[...] = current - current[previous]
    return deltas


def _frame_speeds(x, y, z, previous, n_joints, frame_rate):
    """Return per-joint displacement to the previous frame over ``frame_rate``."""
    speeds = np.zeros((len(previous), n_joints))
    for frame, before in enumerate(previous):
        for joint in range(n_joints):
            speeds[frame, joint] = _joint_distance(
                x, y, z, frame, joint, before, joint) / frame_rate
    return speeds


def feature_extraction_torso_camera(input_torso, input_camera):
    """Extract per-frame skeleton features from torso- and camera-relative data.

    Both inputs are 2-D arrays with one row per frame; the x, y and z
    coordinates of joint ``j`` are read from columns ``6*j``, ``6*j + 1``
    and ``6*j + 2``. Fifteen joints are used. Only the first ``m`` frames
    are processed, where ``m`` is the smaller of the two row counts (the
    previous code compared ``.size``, which only picks the shorter recording
    when both inputs have the same number of columns).

    For frame 0 the "previous" frame is frame 0 itself, so every
    difference-based feature is zero there (NaN if the input is NaN).

    Returns:
        A list of ten arrays, in this order:
        0. result of ``apply_log_vect`` on the covariance of the pairwise
           joint distances, one entry per frame, concatenated on axis 0;
        1. distance of every joint to joint 3, shape (m, 15);
        2. absolute joint speed, torso data, shape (m, 15);
        3. per-axis velocity ``[vx | vy | vz]``, torso data, shape (m, 45);
        4. per-axis displacement ``[dx | dy | dz]``, torso data, (m, 45);
        5. six limb angles in radians, shape (m, 6);
        6. frame-to-frame difference of those angles, shape (m, 6);
        7. per-axis displacement, camera data, shape (m, 45);
        8. per-axis velocity, camera data, shape (m, 45);
        9. absolute joint speed, camera data, shape (m, 15).
        With zero frames, items 0 and 6 are ``np.array([[]])`` and item 5
        is ``np.array([])``, as before.
    """
    numero_juntas = 15  # number of joints
    frame_rate = 1 / 30.0  # seconds between consecutive frames

    x = input_torso[:, 0::6]
    y = input_torso[:, 1::6]
    z = input_torso[:, 2::6]

    # Guarantees that the number of frames is the same for torso and camera
    # features.
    m = min(input_torso.shape[0], input_camera.shape[0])

    # Index of the frame each frame is compared with (frame 0 -> itself).
    previous = np.maximum(np.arange(m) - 1, 0)

    ## Log-Cov of distances between every pair of joints (torso data)
    distancias = np.zeros((numero_juntas, numero_juntas))
    log_cov_rows = []
    for frame in range(m):
        for i in range(numero_juntas):
            for j in range(numero_juntas):
                distancias[i, j] = _joint_distance(x, y, z, frame, i, frame, j)

        # Drop the all-zero diagonal: 15x15 -> 14x15.
        distancias_final = (np.tril(distancias)[1:, :] +
                            np.triu(distancias)[0:-1, :])
        log_cov_rows.append(apply_log_vect(np.cov(distancias_final.T)))

    # Collect first, concatenate once (no quadratic re-copying).
    distancias_total = (np.concatenate(log_cov_rows)
                        if log_cov_rows else np.array([[]]))

    ## Distances between every joint and the torso
    # NOTE(review): joint index 3 is the "torso" here, but the angle section
    # below calls the same index (4 - 1) the left shoulder — confirm the
    # joint layout.
    distancias_ao_torso = np.zeros((m, numero_juntas))
    for frame in range(m):
        for i in range(numero_juntas):
            distancias_ao_torso[frame, i] = _joint_distance(
                x, y, z, frame, i, frame, 3)

    ## Absolute velocities
    velocidades_total = _frame_speeds(x, y, z, previous, numero_juntas,
                                      frame_rate)

    ## Velocities and directions for each dimension {x,y,z}
    dx = _frame_deltas(x, previous, numero_juntas)
    dy = _frame_deltas(y, previous, numero_juntas)
    dz = _frame_deltas(z, previous, numero_juntas)
    velocidade_xyz = np.c_[dx / frame_rate, dy / frame_rate, dz / frame_rate]
    direcao_xyz = np.c_[dx, dy, dz]

    ## Angles of the triangles formed by {shoulders, elbows, hands},
    ## {shoulders, hips, knees} and {hips, knees, feet}.
    ## Each entry is (end joint, vertex joint, other end joint), 0-based.
    angle_joints = (
        (3, 4, 11),   # left shoulder, left elbow, left hand
        (5, 6, 12),   # right shoulder, right elbow, right hand
        (3, 7, 8),    # left shoulder, left hip, left knee
        (5, 9, 10),   # right shoulder, right hip, right knee
        (13, 8, 7),   # left foot, left knee, left hip
        (14, 10, 9),  # right foot, right knee, right hip
    )
    angle_rows = [[_triangle_angle(x, y, z, frame, *joints)
                   for joints in angle_joints] for frame in range(m)]
    angulos = np.array(angle_rows) if angle_rows else np.array([])

    ## Angular Difference
    variacao_angulos = (angulos - angulos[previous]
                        if m else np.array([[]]))

    ## Variation of all joints relative to the camera in {x,y,z}
    x_camera = input_camera[:, 0::6]
    y_camera = input_camera[:, 1::6]
    z_camera = input_camera[:, 2::6]
    dx_camera = _frame_deltas(x_camera, previous, numero_juntas)
    dy_camera = _frame_deltas(y_camera, previous, numero_juntas)
    dz_camera = _frame_deltas(z_camera, previous, numero_juntas)
    variacao_xyz_camera = np.c_[dx_camera, dy_camera, dz_camera]
    velocidade_xyz_camera = np.c_[dx_camera / frame_rate,
                                  dy_camera / frame_rate,
                                  dz_camera / frame_rate]

    # Absolute velocities relative to the camera
    velocidades_total_camera = _frame_speeds(x_camera, y_camera, z_camera,
                                             previous, numero_juntas,
                                             frame_rate)

    return [
        distancias_total, distancias_ao_torso, velocidades_total,
        velocidade_xyz, direcao_xyz, angulos, variacao_angulos,
        variacao_xyz_camera, velocidade_xyz_camera, velocidades_total_camera
    ]
Beispiel #46
0
import numpy as np
import matplotlib.pyplot as plt
from hcluster import pdist, linkage, dendrogram, squareform  # same as import them from scipy

# Read columns 1..29 of the tab-separated expression table as a structured
# array whose field names come from the header row.
data = np.genfromtxt("../../data/ExpRawData-E-TABM-84-A-AFFY-44.tab",
                     names=True,
                     usecols=tuple(range(1, 30)),
                     dtype=float,
                     delimiter="\t")

# Reinterpret the structured array as a plain 2-D float array. The builtin
# ``float`` is used because the ``np.float`` alias no longer exists in
# current NumPy releases; both name the same type.
data_array = data.view((float, len(data.dtype.names)))
# Keep rows 1..999 and transpose so that each row is one column of the file.
# NOTE(review): row 0 is skipped here — confirm that this is intended.
data_array = data_array[1:1000].transpose()

data_dist = pdist(data_array)  # computing the distance

data_link = linkage(data_dist)  # computing the linkage

# just plot the dendrogram.
dendrogram(data_link, labels=data.dtype.names)
plt.savefig('../../results/dendrogram.png')

# or plot the heatmap too!

# Compute and plot first dendrogram.
fig = plt.figure(figsize=(8, 8))
# add_axes takes [left, bottom, width, height] in figure coordinates
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right',
                labels=data.dtype.names)  # adding/removing the axes
ax1.set_xticks([])
Beispiel #47
0
from hcluster import pdist, linkage, leaves_list, squareform, dendrogram
import numpy as np
import matplotlib as mp

# Distance metric and linkage method used for the clustering below.
metric = 'euclidean'
method = 'single'

# Six binary observations (rows) over nine features (columns).
data = np.matrix([[1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1],
                  [0, 0, 0, 0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0],
                  [0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 0, 0, 0]])

# Condensed pairwise distances, their linkage, and the dendrogram plot.
y = pdist(data, metric=metric)
Z = linkage(y, method=method, metric=metric)
dendrogram(Z)
# Turn every linkage row into a plain tuple with integer node ids and
# count, clamping negative merge distances to zero.
Z = [(int(l), int(r), max(0., s), int(n)) for (l, r, s, n) in Z]  # cleaning

leaves = list(leaves_list(Z))
count = len(leaves)
# Presumably the id of the last merged node, assuming leaves are numbered
# 0..count-1 and merged nodes continue from there — TODO confirm.
root = len(Z) + count - 1

# Square distance matrix; it must have one row per leaf.
X = squareform(y)
assert len(X) == count

from utils import memoise

# bar-joseph optimal ordering ################################################

from barjoseph import optimal

leaves = optimal(
    root, **{
Beispiel #48
0
def dbscan(x,k,Eps = None, distMethod = 'euclidean'):
    '''
    Calculate the density based clustering of an array

    x is the array of observations (one row per point), k the neighbour
    count used both by epsilon() and as the core-point threshold (a point
    needs at least k+1 points, itself included, within Eps), Eps the
    neighbourhood radius (computed with epsilon(x, k) when None) and
    distMethod the metric name handed to H.pdist.

    Returns (cClass, tType, Eps, True) on success, where cClass holds the
    cluster number of each point (-1 for unclustered points) and tType is
    1 for core points, 0 for other cluster members and -1 for unclustered
    points. Any error is printed and reported as (None, None, None, False).
    '''
    try:
        m = x.shape[0]
        if Eps is None:
            Eps = epsilon(x,k)

        # squareform makes a large matrix and if the arrays
        # input are too large not enough memory exists
        try:
            dist = H.squareform(H.pdist(x, distMethod))
        except Exception:
            # The slow fallback that avoided the full matrix was never
            # enabled, so this is reported as a failure. The previous code
            # raised a plain string here, which is not a valid exception.
            print("The Input Array is too big and a squareform cannot be computed")
            raise MemoryError(
                "The Input Array is too big and a squareform cannot be computed")

        touched = N.zeros(m)
        no = 1  # number of the cluster currently being grown

        tType = N.zeros(m)
        cClass = N.zeros(m)

        for i in range(m):
            if touched[i] != 0:
                continue

            # Row i of the distance matrix is indexed directly; the old
            # code looked the row up through a float id column.
            ind = set2List(N.where(dist[i] <= Eps))[0]
            neighbours = len(ind)

            if neighbours > 1 and neighbours < (k+1):
                tType[i] = 0
                cClass[i] = 0

            if neighbours == 1:
                tType[i] = -1
                cClass[i] = -1
                touched[i] = 1

            if neighbours >= k+1:
                tType[i] = 1
                cClass[ind] = N.ones(len(ind))*no

                # ind grows while it is iterated: every newly reached
                # point is appended and expanded in turn.
                for l in ind:
                    touched[l] = 1
                    i1 = set2List(N.where(dist[l] <= Eps))[0]
                    if len(i1) > 1:
                        cClass[i1] = no
                        if len(i1) >= k+1:
                            tType[l] = 1
                        else:
                            tType[l] = 0

                        for j in i1:
                            if touched[j] == 0:
                                touched[j] = 1
                                ind.append(j)
                                cClass[j] = no

                no += 1

        # Whatever is still labelled 0 never joined a cluster.
        i1 = set2List(N.where(cClass == 0))[0]
        cClass[i1] = -1
        tType[i1] = -1
        return cClass, tType, Eps, True
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        errorMsg ="An error occured with the DBSCAN Algorithm\n"
        errorMsg += "Sorry: %s\n\n%s\n"%(exc_type, exc_value)
        print(errorMsg)

        return None,None,None,False
Beispiel #49
0
def Xi_activity_similarity(X, Y):
    """Return a dissimilarity between two equally long 0/1 vectors.

    The value is the number of positions where ``X`` and ``Y`` differ
    divided by ``sum(X) + sum(Y)``. Two all-zero vectors are identical,
    so their distance is 0.0 instead of a division by zero.

    NOTE(review): the original comment said this corresponds to the
    "binary" distance of R's ``dist``; this denominator counts positions
    that are set in both vectors twice, so the two may differ — confirm
    which one is intended.
    """
    num_different = sum(x != y for x, y in zip(X, Y))
    possibly_different = sum(X) + sum(Y)
    if possibly_different == 0:
        return 0.0
    # float() forces true division; under Python 2 the integer operands
    # otherwise truncate every distance to 0 or 1.
    return float(num_different) / possibly_different

# Numeric code for each allelic call: 'XI' and 'bi' count as 1,
# 'nd' and 'xa' as 0.
stateD = {'XI':1, 'bi':1, 'nd':0, 'xa':0}

if '__main__' == __name__:
	
	# load table
	linefeed = dr_tools.splitlines('chrX_clones_allelic_calls.txt')
	# header row: the first cell is dropped, the rest are the sample labels
	sample_labels = next(linefeed)[1:]
	character_matrixT = []
	for cells in linefeed:
		# values in cells are nd, XI, xa, bi, except first column which is gene symbol
		# The gene symbol is excluded from the check; including it made the
		# condition true for every row, so all-'nd' genes were never dropped.
		if any(c!='nd' for c in cells[1:]):
			character_matrixT.append([stateD[c] for c in cells[1:]])
	
	# make clusters: transpose so that each row is one sample
	character_matrix = numpy.array(character_matrixT).transpose()
	#hcdists = hcluster.pdist(character_matrix, metric='cityblock')
	hcdists = hcluster.pdist(character_matrix, metric=Xi_activity_similarity)
	hclinks = hcluster.linkage(hcdists, method='complete')
	draw_order = hcluster.leaves_list(hclinks)
	
	# draw tree
	scipyhcluster.dendrogram(hclinks, labels=sample_labels, leaf_rotation=90)
	pylab.subplots_adjust(bottom=0.3)
	pylab.savefig('tree_Xiexpr.pdf')
def feature_extraction(entrada):
    """Extract distance and windowed velocity features from skeleton frames.

    ``entrada`` is a 2-D array with one row per frame; the x, y and z
    coordinates of joint ``j`` are read from columns ``6*j``, ``6*j + 1``
    and ``6*j + 2``. Fourteen joints are used.

    The window length, array shapes, indices and repeat counts are now
    integers; the former float values (``window = 10.0``) are rejected by
    current NumPy. The arithmetic itself is unchanged.

    Returns:
        A list ``[distancias_total, velocidades_total, velocidade_xyz,
        direcao_xyz]``:
        * ``distancias_total``: one row per frame holding
          ``get_triu_cov`` of the pairwise joint distances;
        * ``velocidades_total``: per-joint speed of each complete
          10-frame window, repeated for the 10 frames of that window;
        * ``velocidade_xyz`` / ``direcao_xyz``: per-axis velocity and
          displacement ``[x | y | z]``, repeated the same way.
        An empty 1-D array is returned for an item with no frames or no
        complete window.

    NOTE(review): frames beyond the last complete window are not covered
    by the three windowed outputs. The original had a branch meant to pad
    the last window, but its condition could never be true inside the
    loop; it was removed here without changing the results.
    """
    frame_rate = 1.0 / 30.0  # time elapsed between frames, in seconds
    window = 10              # frame window for the temporal features
    n_joints = 14

    x = entrada[:, 0::6]
    y = entrada[:, 1::6]
    z = entrada[:, 2::6]
    m = np.shape(entrada)[0]
    n_windows = m // window

    def joint_distance(frame_a, joint_a, frame_b, joint_b):
        # pdist of two points yields a one-element array; unwrap it.
        return mat.pdist(
            [[x[frame_a, joint_a], y[frame_a, joint_a], z[frame_a, joint_a]],
             [x[frame_b, joint_b], y[frame_b, joint_b], z[frame_b, joint_b]]]
        )[0]

    ## distances
    distancias = np.zeros((n_joints, n_joints))
    cov_rows = []
    for frame in range(m):
        for i in range(n_joints):
            for j in range(n_joints):
                distancias[i, j] = joint_distance(frame, i, frame, j)

        # remove the diagonal of zeros: 14x14 -> 13x14
        distancias_final = (np.tril(distancias)[1:, :] +
                            np.triu(distancias)[0:-1, :])
        cov_rows.append(np.array([get_triu_cov(distancias_final.T)]))

    # Collect first, concatenate once (no quadratic re-copying).
    distancias_total = np.concatenate(cov_rows) if cov_rows else np.array([])

    # absolute velocities
    velocidades = np.zeros((n_windows, n_joints))
    for w in range(n_windows):
        actual = w * window
        # NOTE(review): this is simply ``w``, whereas the per-axis loop
        # below uses ``w * window - 9``; kept as written — confirm which
        # reference frame is intended.
        anterior = w * (window - 9)
        for i in range(n_joints):
            velocidades[w, i] = (joint_distance(actual, i, anterior, i) /
                                 (frame_rate * window))

    # Repeat each window's row for the 10 frames it covers.
    velocidades_total = (np.repeat(velocidades, window, axis=0)
                         if n_windows else np.array([]))

    # velocities and directions along each axis
    dx = np.zeros((n_windows, n_joints))
    dy = np.zeros((n_windows, n_joints))
    dz = np.zeros((n_windows, n_joints))
    for w in range(n_windows):
        actual = w * window
        # NOTE(review): negative for the first window (-9), so it wraps
        # around to the ninth frame from the end; kept as written.
        anterior = w * window - 9
        dx[w] = x[actual, :n_joints] - x[anterior, :n_joints]
        dy[w] = y[actual, :n_joints] - y[anterior, :n_joints]
        dz[w] = z[actual, :n_joints] - z[anterior, :n_joints]

    vx = dx / (frame_rate * window)
    vy = dy / (frame_rate * window)
    vz = dz / (frame_rate * window)

    if n_windows:
        velocidade_xyz = np.repeat(np.c_[vx, vy, vz], window, axis=0)
        direcao_xyz = np.repeat(np.c_[dx, dy, dz], window, axis=0)
    else:
        velocidade_xyz = np.array([])
        direcao_xyz = np.array([])

    return [distancias_total, velocidades_total, velocidade_xyz, direcao_xyz]
Beispiel #51
0
    def _do_gen_matrix(self,
                       col_function_name,
                       X_L_list,
                       X_D_list,
                       M_c,
                       T,
                       tablename='',
                       filename=None,
                       col=None,
                       confidence=None,
                       limit=None,
                       submatrix=False):
        """Build the pairwise column matrix for one column function.

        ``col_function_name`` selects the method applied to every column
        pair: 'mutual information', 'dependence probability',
        'correlation' or 'view_similarity'.

        Without ``col`` the matrix is reordered by hierarchical clustering
        of its rows. With ``col`` the columns are ranked by decreasing
        value against that column, optionally keeping only values of at
        least ``confidence`` and at most ``limit`` entries; unless
        ``submatrix`` is set, only that ranking is returned.

        Returns:
            ``{'data': [values], 'columns': names}`` when ``col`` is given
            and ``submatrix`` is false; otherwise a dict with the keys
            ``matrix``, ``column_names``, ``title``, ``filename`` and
            ``message``. The matrix is also plotted through
            ``utils.plot_matrix`` when ``filename`` is given.

        Raises:
            ValueError: for an unknown ``col_function_name`` (still an
                ``Exception``, as raised before).
        """
        function_attrs = {
            'mutual information': '_mutual_information',
            'dependence probability': '_dependence_probability',
            'correlation': '_correlation',
            'view_similarity': '_view_similarity',
        }
        if col_function_name not in function_attrs:
            raise ValueError('Invalid column function')
        col_function = getattr(self, function_attrs[col_function_name])

        num_cols = len(X_L_list[0]['column_partition']['assignments'])
        column_names = numpy.array(
            [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])

        # extract unordered z_matrix
        z_matrix = numpy.zeros((num_cols, num_cols))
        for i in range(num_cols):
            for j in range(num_cols):
                z_matrix[i][j] = col_function(i, j, X_L_list, X_D_list, M_c, T)

        if col:
            z_column = list(z_matrix[M_c['name_to_idx'][col]])
            # (value, column index) pairs, largest value first. sorted()
            # and the list comprehension below also work where zip() and
            # filter() return iterators instead of lists.
            data_tuples = sorted(zip(z_column, range(num_cols)), reverse=True)
            if confidence:
                threshold = float(confidence)
                data_tuples = [tup for tup in data_tuples
                               if tup[0] >= threshold]
            if limit and limit != float("inf"):
                data_tuples = data_tuples[:int(limit)]
            data = [tuple(d[0] for d in data_tuples)]
            columns = [d[1] for d in data_tuples]
            column_names_reordered = column_names[columns]
            if not submatrix:
                return {'data': data, 'columns': column_names_reordered}
            z_matrix_reordered = z_matrix[columns, :][:, columns]
        else:
            # hierarchically cluster z_matrix
            import hcluster
            Y = hcluster.pdist(z_matrix)
            Z = hcluster.linkage(Y)
            pylab.figure()
            hcluster.dendrogram(Z)
            # The leaf order is read back from the tick labels of the
            # dendrogram that was just drawn.
            reorder_indices = [int(label.get_text())
                               for label in pylab.gca().get_xticklabels()]
            pylab.close()
            # REORDER!
            z_matrix_reordered = z_matrix[:,
                                          reorder_indices][reorder_indices, :]
            column_names_reordered = column_names[reorder_indices]

        title = 'Pairwise column %s for %s' % (col_function_name, tablename)
        if filename:
            utils.plot_matrix(z_matrix_reordered, column_names_reordered,
                              title, filename)

        return dict(matrix=z_matrix_reordered,
                    column_names=column_names_reordered,
                    title=title,
                    filename=filename,
                    message="Created " + title)
    cosine_similarity(tfidf_matrix,my_ref_len)

    my_sentences = sentence.extract_sentence("test3.txt")

    #for sentence in my_sentences:    
	#print "Sentence ------>", sentence

    for sentence in my_sentences:
	tfisf_sum = 0
	for word in sentence:
	    for word_score in tfidf_scores:
		if word == word_score:
		   tfisf_sum = tfisf_sum + tfidf_scores[word_score]
	tfidf_sentence_scores[sentence] = tfisf_sum

    #for sentence in tfidf_sentence_scores:
	#tfisf_scores[sentence] = float(tfidf_sentence_scores[sentence]/tfidf_sum(tfidf_scores))

    Y = pdist(tfidf_matrix)
    Z = linkage(Y)
    dendrogram(Z)
 
    show()


    


	
Beispiel #53
0
    N = len(uniq_cases)
    M = len(actsind)
    data = zeros((N, M), dtype=int)
    i = 0
    parikhdict = {}
    for case in uniq_cases.keys():
        data[i] = get_parikh(case, actsind)
        str_i = ','.join(map(str, data[i]))
        if str_i not in parikhdict:
            parikhdict[str_i] = [i]
        else:
            parikhdict[str_i].append(i)
        i = i + 1
    df = DataFrame(data)
    data_uniq = df.drop_duplicates()
    Y = pdist(data_uniq, metric='euclidean')
    Z = linkage(Y, method='average')
    dendrogram(Z)
    show()


def similarity_clusters(log, show_plot=None):
    """Translates traces to Parikh vectors and computes in the vector space
       a K-means clustering."""
    def get_parikh(case, alphabet):
        v = zeros(len(alphabet), dtype=int)
        for act in case:
            v[alphabet[act]] = v[alphabet[act]] + 1
        return v

    actsind = {}
    elif o.method == 'numsameCAST':
        m = castoverlap_numgenes
    elif o.method == 'numsamemono_norm':
        m = monoallelic_numgenes_norm
    elif o.method == 'numsamemono100_norm':
        m = monoallelic_numgenes_norm_100
    elif o.method == 'numsameC57_norm':
        m = c57overlap_numgenes_norm
    elif o.method == 'numsameCAST_norm':
        m = castoverlap_numgenes_norm
    else:
        m = o.method

    # make clusters
    exparray = character_matrix
    hcdists = hcluster.pdist(exparray, metric=m)
    hclinks = hcluster.linkage(hcdists, method=o.linkage)
    draw_order = hcluster.leaves_list(hclinks)

    # draw tree
    scipyhcluster.dendrogram(hclinks, labels=samplenames, leaf_rotation=90)
    pylab.subplots_adjust(bottom=0.3)
    pylab.ylabel('%s (linkage=%s)' % (o.method, o.linkage))
    if o.method in ('numsamemono', 'numsameC57', 'numsameCAST',
                    'numsamemono_norm', 'numsameC57_norm', 'numsameCAST_norm'):
        pylab.yticks([1.0, 0.8, 0.6, 0.4, 0.2, 0.0],
                     [0, 100, 200, 300, 400, 500])
    elif o.method in ('numsamemono100', 'numsamemono100_norm'):
        pylab.yticks([1.0, 0.8, 0.6, 0.4, 0.2, 0.0], [0, 20, 40, 60, 80, 100])
    pylab.savefig(o.fig)