Example #1
1
def random_distribution(n):

    #make up some data
    data = np.random.normal(scale=n, size=(n, n))
    data[0:n // 2, 0:n // 2] += 75
    data[n // 2:, n // 2:] = np.random.poisson(lam=n, size=data[n // 2:, n // 2:].shape)
    #cluster the rows
    row_dist = ssd.squareform(ssd.pdist(data))
    row_Z = sch.linkage(row_dist)
    row_idxing = sch.leaves_list(row_Z)

    row_labels = ['bar{}'.format(i) for i in range(n)]

    #cluster the columns
    col_dist = ssd.squareform(ssd.pdist(data.T))
    col_Z = sch.linkage(col_dist)
    col_idxing = sch.leaves_list(col_Z)
    #make the dendrogram

    col_labels = ['foo{}'.format(i) for i in range(n)]

    data = data[:,col_idxing][row_idxing,:]

    heatmap = pdh.DendroHeatMap(heat_map_data=data,left_dendrogram=row_Z, top_dendrogram=col_Z, heatmap_colors=("#ffeda0", "#feb24c", "#f03b20"), window_size="auto", color_legend_displayed=False, label_color="#777777")
    heatmap.row_labels = row_labels
    heatmap.col_labels = col_labels
    heatmap.title = 'An example heatmap'
    heatmap.show()#heatmap.save("example.png")
    def test_rsa_relatedness(self):
        ref_mat = loadmat('rsa_ref/debug_rsa_relatedness.mat')
        rdm_stack_all = ref_mat['rdm_stack_all']
        cand_rdm_stack_all = ref_mat['cand_rdm_stack_all']
        index_matrix_array = ref_mat['index_matrix_array']
        p_value_array = ref_mat['p_value_array']
        # print(rdm_stack_all.shape, cand_rdm_stack_all.shape, index_matrix_array.shape, p_value_array.shape)

        for i_case in range(p_value_array.shape[-1]):
            ref_rdms = rdm_stack_all[:, :, :, i_case]

            if i_case % 2 != 0:
                ref_rdms = ref_rdms[:, :, :1]  # check singular case.

            ref_rdms = np.array([squareform(ref_rdms[:, :, x]) for x in range(ref_rdms.shape[2])])
            cand_rdms = cand_rdm_stack_all[:, :, :, i_case]
            cand_rdms = np.array([squareform(cand_rdms[:, :, x]) for x in range(cand_rdms.shape[2])])
            # compute similarity.
            similarity_matrix_ref = rdm_similarity_batch(ref_rdms, cand_rdms, computation_method='spearmanr').mean(
                axis=1)
            p_val_this = rdm_relatedness_test(mean_ref_rdm=ref_rdms.mean(axis=0), model_rdms=cand_rdms,
                                              similarity_ref=similarity_matrix_ref,
                                              n=100, perm_idx_list=index_matrix_array[:, :, i_case].T - 1)
            p_val_ref = p_value_array[:, i_case]
            assert p_val_this.shape == p_val_ref.shape
            # print(p_val_this - p_val_ref)
            # print(abs(p_val_this - p_val_ref).max())
            self.assertTrue(np.allclose(p_val_this, p_val_ref))
def compute_distance():
	'''
	Computes distances between congress members for a particular category and writes out the results
	in a text file. Web App reads these text files to show graphs. 
	'''

	category_map = {1: 'Health Care', 2: 'National Security', 3:'Economy', 4:'Environment', 5:'Domestic Issues' }
	vm = Voting_Matrix('114')

	for j in xrange(1,6):
		votes, member_to_row =  vm.generate_matrix(category = [j])
		y = pdist(votes, 'cosine')
		y_dist = squareform(y)
		normed_distances = np.zeros((len(y_dist), len(y_dist)))
		for i in xrange(len(y_dist)):
			min_value = min(y_dist[i,:])
			max_value = max(y_dist[i,:])
			normed_distances[i,:] = (y_dist[i,:]-min_value) / (max_value-min_value)

		np.savetxt("data/%s114Distance.csv" %category_map[j], normed_distances, delimiter=",", fmt='%5.5f')

	votes, member_to_row =  vm.generate_matrix(category = [1,2,3,4,5])
	y = pdist(votes, 'cosine')
	y_dist = squareform(y)
	normed_distances = np.zeros((len(y_dist), len(y_dist)))
	for i in xrange(len(y_dist)):
		min_value = min(y_dist[i,:])
		max_value = max(y_dist[i,:])
		normed_distances[i,:] = (y_dist[i,:]-min_value) / (max_value-min_value)
	np.savetxt("data/All Categories114Distance.csv" , normed_distances, delimiter=",", fmt='%5.5f')

	df = pd.read_csv('../DataCollectionInsertion/Members/114Members.csv')
	row_nums = np.array([member_to_row[str(df.iloc[i]['person__id'])] for i in xrange(len(df))])
	df['row_nums'] = row_nums
	df.to_csv('../DataCollectionInsertion/Members/114Members.csv', sep=',')
def calculate_cophenetic_correlation(connmat):
    Y = 1 - connmat
    Z = linkage(squareform(Y),method='average')
    c,d= cophenet(Z,squareform(Y))
    #print c
    #print d
    return (c,d)
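A minimal usage sketch for calculate_cophenetic_correlation, assuming a small symmetric consensus/connectivity matrix with ones on the diagonal (so that 1 - connmat is a valid distance matrix for squareform):

import numpy as np
from scipy.cluster.hierarchy import linkage, cophenet
from scipy.spatial.distance import squareform

# toy consensus matrix: fraction of clustering runs in which samples co-cluster
connmat = np.array([[1.0, 0.9, 0.1],
                    [0.9, 1.0, 0.2],
                    [0.1, 0.2, 1.0]])

c, d = calculate_cophenetic_correlation(connmat)
print(c)  # cophenetic correlation coefficient
print(d)  # condensed cophenetic distances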
Example #5
0
File: rti.py Project: npatwari/rti
def initRTI(nodeLocs, delta_p, sigmax2, delta, excessPathLen):

    # Set up pixel locations as a grid.
    personLL        = nodeLocs.min(axis=0)
    personUR        = nodeLocs.max(axis=0)
    pixelCoords, xVals, yVals = calcGridPixelCoords(personLL, personUR, delta_p)
    pixels          = pixelCoords.shape[0]
    #plt.figure(3)
    #plotLocs(pixelCoords)
    

    # Find distances between pixels and transceivers
    DistPixels  = dist.squareform(dist.pdist(pixelCoords))
    DistPixelAndNode = dist.cdist(pixelCoords, nodeLocs)
    DistNodes   = dist.squareform(dist.pdist(nodeLocs))

    # Find the (inverse of) the Covariance matrix between pixels
    CovPixelsInv       = linalg.inv(sigmax2*np.exp(-DistPixels/delta))

    # Calculate weight matrix for each link.
    nodes = len(nodeLocs)
    links = nodes*(nodes-1)
    W = np.zeros((links, pixels))
    for ln in range(links):
        txNum, rxNum  = txRxForLinkNum(ln, nodes)
        ePL           = DistPixelAndNode[:,txNum] + DistPixelAndNode[:,rxNum] - DistNodes[txNum,rxNum]  
        inEllipseInd  = np.argwhere(ePL < excessPathLen)
        pixelsIn      = len(inEllipseInd)
        if pixelsIn > 0:
            W[ln, inEllipseInd] = 1.0 / float(pixelsIn)

    # Compute the projection matrix
    inversion       = np.dot(linalg.inv(np.dot(W.T, W) + CovPixelsInv), W.T)

    return (inversion, xVals, yVals)
def mds_author_term(fname1='corr_2d_mds_authors_by_terms.png', fname2='corr_2d_mds_terms_by_authors.png'):
    bib_data = get_bib_data()
    mat, authors, term_list, authors_cnt = get_author_by_term_mat(bib_data, tfreq=5, afreq=10)
    adist = dist.squareform(dist.pdist(mat, 'correlation'))
    coords,_ = mds(adist, dim=2)
    
    fig = plt.figure()
    fig.clf()
    plt.xlim(-15, 20)
    plt.ylim(-15, 20)
    for label, x, y in zip(authors, coords[:,0], coords[:,1]):
        plt.annotate(label, xy=(x*20,y*20))
    plt.axis('off')
    plt.savefig(fname1)
    
    
    mat = mat.T
    tdist = dist.squareform(dist.pdist(mat, 'correlation'))
    coords, _ = mds(tdist, dim=2)
    #fig = plt.figure()
    fig.clf();
    plt.xlim(-80,100)
    plt.ylim(-100,100)
    for label, x, y in zip(term_list, coords[:,0], coords[:,1]):
        plt.annotate(label, xy=(x*500,y*500))
    plt.axis('off')
    plt.savefig(fname2)
Example #7
0
def dCorr(x, y):
    """Returns the distance-correlation between x and y"""
    n = len(x)
    assert n == len(y), "Vectors must be of the same length"
    def dCov2(xM, yM):
        """Returns the distance-covariance squared of x and y, given the pairwise
        distance matrices xM and yM."""
        return (1.0 / n**2) * np.sum(xM * yM) #sum of all entries in component-wise product
        

    A = distance.squareform(distance.pdist(np.array(x).reshape(n, -1)))
    B = distance.squareform(distance.pdist(np.array(y).reshape(n, -1)))

    # Double-center along both axes; the row/column means must come from the
    # original matrices, so avoid the two-step in-place subtraction:
    A = A - A.mean(axis=0)[None, :] - A.mean(axis=1)[:, None] + A.mean()
    B = B - B.mean(axis=0)[None, :] - B.mean(axis=1)[:, None] + B.mean()

    #Calculate distance covariances
    dcov  = np.sqrt(dCov2(A, B))
    dvarx = np.sqrt(dCov2(A, A))
    dvary = np.sqrt(dCov2(B, B))
    toR = dcov / np.sqrt(dvarx * dvary)
    if np.isnan(toR):
        return 0.0
    else:
        return toR
def distcorr(X, Y):
    """ Compute the distance correlation function
    
    >>> a = [1,2,3,4,5]
    >>> b = np.array([1,2,9,4,4])
    >>> distcorr(a, b)
    0.762676242417
    """
    X,Y = zip(*[v for i,v in enumerate(zip(X,Y)) if not np.any(np.isnan(v))])
    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    
    dcov2_xy = (A * B).sum()/float(n * n)
    dcov2_xx = (A * A).sum()/float(n * n)
    dcov2_yy = (B * B).sum()/float(n * n)
    dcor = np.sqrt(dcov2_xy)/np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
    return dcor
Example #9
0
def getDistances(x, attr, var, cidx, didx, cheader):
    """ This creates the distance array for only discrete or continuous data 
        with no missing data """
    from scipy.spatial.distance import pdist, squareform
    #--------------------------------------------------------------------------
    def pre_normalize(x):
        idx = 0
        for i in cheader:
            cmin = attr[i][2]
            diff = attr[i][3]
            x[:,idx] -= cmin
            x[:,idx] /= diff
            idx += 1

        return x
    #--------------------------------------------------------------------------
    dtype = var['dataType']
    numattr = var['NumAttributes']

    if(dtype == 'discrete'):
        return squareform(pdist(x,metric='hamming'))

    if(dtype == 'mixed'):
        d_dist = squareform(pdist(x[:,didx],metric='hamming'))
        xc = pre_normalize(x[:,cidx])
        c_dist = squareform(pdist(xc,metric='cityblock'))
        return np.add(d_dist, c_dist) / numattr

    else: #(dtype == 'continuous'):
        return squareform(pdist(pre_normalize(x),metric='cityblock'))
    def __init__(self, rng, matches_vec, batch_size,
            sample_diff_every_epoch=True, n_same_pairs=None):
        """
        If `n_same_pairs` is given, this number of same pairs is sampled,
        otherwise all same pairs are used.
        """
        self.rng = rng
        self.matches_vec = matches_vec
        self.batch_size = batch_size

        if n_same_pairs is None:
            # Use all pairs
            I, J = np.where(np.triu(distance.squareform(matches_vec)))  # indices of same pairs
        else:
            # Sample same pairs
            n_pairs = min(n_same_pairs, len(np.where(matches_vec == True)[0]))
            same_sample = self.rng.choice(
                np.where(matches_vec == True)[0], size=n_pairs, replace=False
                )
            same_vec = np.zeros(self.matches_vec.shape[0], dtype=np.bool)
            same_vec[same_sample] = True
            I, J = np.where(np.triu(distance.squareform(same_vec)))

        self.x1_same_indices = []
        self.x2_same_indices = []
        for i, j in zip(I, J):
            self.x1_same_indices.append(i)
            self.x2_same_indices.append(j)

        if not sample_diff_every_epoch:
            self.x1_diff_indices, self.x2_diff_indices = self._sample_diff_pairs()
        self.sample_diff_every_epoch = sample_diff_every_epoch
def test_grad_grad(x):
        # note: relies on module-level x_g, n_channels_in, filter_sz and target
        r_r = 7; q = 0
        x_in = copy.deepcopy(x)
        in_dims = n_channels_in*(filter_sz**2)
        x = copy.deepcopy(x_g)
        x[r_r,q] = x_in

        N = x.shape[1]
        grad_s = np.zeros((in_dims, in_dims, N))
        x_mean = np.mean(x,axis=1)
        x_no_mean = x - x_mean[:,np.newaxis]
        corrs = (1-pdist(x,'correlation'))
        corr_mat = squareform(corrs); target_mat = squareform(target)
        loss = np.std(corrs) #np.sqrt(np.sum((corrs - corrs.mean())**2))

        d_sum_n = np.mean(x_no_mean, axis=1)
        d2_sum_sqrt = np.sqrt(np.sum(x_no_mean**2, axis=1))
        d2_sum_sqrt2 = d2_sum_sqrt**2
        d_minus_sum_n = x_no_mean - d_sum_n[:,np.newaxis]
        d_minus_sum_n_div = d_minus_sum_n/d2_sum_sqrt[:,np.newaxis]
        d_dot_dT = np.dot(x_no_mean, x_no_mean.T)

        for i in np.arange(in_dims):
            for j in np.arange(in_dims):
                if i != j:
                    grad_s[i,j] = (d_minus_sum_n[j]*d2_sum_sqrt[i] - d_dot_dT[i,j]*d_minus_sum_n_div[i])/(d2_sum_sqrt[j]*d2_sum_sqrt2[i])
        grad_s_mean = grad_s.sum(1)/len(corrs) # in_dims by N

        grad = np.sum((grad_s - grad_s_mean)*(corr_mat[r_r] - corrs.mean())[:,np.newaxis],axis=1)/(loss*(N**2))

        return grad[r_r,q]
Example #12
0
def loglike(x, A):
    P = x.dot(x.T)
    P = squareform(P - np.diag(np.diag(P)))

    B = squareform(A)

    # The original return statement is cut off after "np."; a Bernoulli
    # log-likelihood over the condensed edge entries is one plausible
    # completion (assumption):
    return np.sum(B * np.log(P) + (1 - B) * np.log(1 - P))
Example #13
0
def test_PDist():
    targets = np.tile(xrange(3),2)
    chunks = np.repeat(np.array((0,1)),3)
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    data_c = data - np.mean(data,0)
    # DSM matrix elements should come out as samples of one feature
    # to be in line with what e.g. a classifier returns -- facilitates
    # collection in a searchlight ...
    euc = pdist(data, 'euclidean')[None].T
    pear = pdist(data, 'correlation')[None].T
    city = pdist(data, 'cityblock')[None].T
    center_sq = squareform(pdist(data_c,'correlation'))

    # Now center each chunk separately
    dsm1 = PDist()
    dsm2 = PDist(pairwise_metric='euclidean')
    dsm3 = PDist(pairwise_metric='cityblock')
    dsm4 = PDist(center_data=True,square=True)
    assert_array_almost_equal(dsm1(ds).samples,pear)
    assert_array_almost_equal(dsm2(ds).samples,euc)
    dsm_res = dsm3(ds)
    assert_array_almost_equal(dsm_res.samples,city)
    # length corresponds to a single triangular matrix
    assert_equal(len(dsm_res.sa.pairs), len(ds) * (len(ds) - 1) / 2)
    # generated label pairs actually reflect the vector form produced by
    # squareform()
    dsm_res_square = squareform(dsm_res.samples.T[0])
    for i, p in enumerate(dsm_res.sa.pairs):
        assert_equal(dsm_res_square[p[0], p[1]], dsm_res.samples[i, 0])
    dsm_res = dsm4(ds)
    assert_array_almost_equal(dsm_res.samples,center_sq)
    # sample attributes are carried over
    assert_almost_equal(ds.sa.targets, dsm_res.sa.targets)
def main():
        # fetch distance matrix from specified input file
        distMatFile = sys.argv[1]
        nameList,Dij_sq,N=fetchDistMat(distMatFile)

        # in scipy most routines operate on 'condensed'
        # distance matrices, i.e. the upper triangle stored
        # as a flat vector; the function squareform in the
        # scipy.spatial.distance submodule can be used to
        # switch between full square and condensed matrices
        Dij_cd = ssd.squareform(Dij_sq)
        # hierarchical clustering where the distance between
        # two coordinates is the distance of the cluster
        # averages
        # cluster Result = 'top down view' of the hierarchical
        # clustering
        clusterResult = sch.linkage(Dij_cd, method='average')
        # returns cophenetic distances
        # corr = cophenetic correlation
        # Cij_cd = condensed cophenetic distance matrix
        corr,Cij_cd   = sch.cophenet(clusterResult,Dij_cd)
        Cij_sq = ssd.squareform(Cij_cd)

        # print dendrogram on top of the cophenetic distance
        # matrix to the standard output stream
        droPyt_distMat_dendrogram_sciPy(Cij_sq,clusterResult,N)
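The comments above describe the condensed vs. square conventions; a standalone sketch of that round trip (independent of the project-specific fetchDistMat and plotting helper):

import numpy as np
import scipy.spatial.distance as ssd
import scipy.cluster.hierarchy as sch

pts = np.random.rand(6, 2)
Dij_cd = ssd.pdist(pts)             # condensed: n*(n-1)/2 pairwise distances
Dij_sq = ssd.squareform(Dij_cd)     # full square, symmetric, zero diagonal
assert np.allclose(ssd.squareform(Dij_sq), Dij_cd)   # squareform inverts itself

Z = sch.linkage(Dij_cd, method='average')
corr, Cij_cd = sch.cophenet(Z, Dij_cd)   # cophenetic correlation and distances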
Example #15
0
def vi_pairwise_matrix(segs, split=False):
    """Compute the pairwise VI distances within a set of segmentations.

    If 'split' is set to True, two matrices are returned, one for each 
    direction of the conditional entropy.

    0-labeled pixels are ignored.

    Parameters
    ----------
    segs : iterable of np.ndarray of int
        A list or iterable of segmentations. All arrays must have the same
        shape.
    split : bool, optional
        Should the split VI be returned, or just the VI itself (default)?

    Returns
    -------
    vi_sq : np.ndarray of float, shape (len(segs), len(segs))
        The distances between segmentations. If `split==False`, this is a
        symmetric square matrix of distances. Otherwise, the lower triangle
        of the output matrix is the false split distance, while the upper
        triangle is the false merge distance.
    """
    d = np.array([s.ravel() for s in segs])
    if split:
        def dmerge(x, y): return split_vi(x, y)[0]
        def dsplit(x, y): return split_vi(x, y)[1]
        merges, splits = [squareform(pdist(d, df)) for df in [dmerge, dsplit]]
        out = merges
        tri = np.tril(np.ones(splits.shape), -1).astype(bool)
        out[tri] = splits[tri]
    else:
        out = squareform(pdist(d, vi))
    return out
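vi_pairwise_matrix relies on pdist accepting an arbitrary Python callable as the metric; a minimal generic sketch of that pattern, using a made-up label-disagreement metric instead of gala's vi/split_vi:

import numpy as np
from scipy.spatial.distance import pdist, squareform

segs = [np.random.randint(1, 4, size=(4, 4)) for _ in range(3)]
d = np.array([s.ravel() for s in segs])

def label_disagreement(x, y):
    # any callable taking two 1-D arrays and returning a scalar can serve as a metric
    return np.mean(x != y)

out = squareform(pdist(d, label_disagreement))   # (len(segs), len(segs)) symmetric matrix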
Example #16
0
    def kcca(self, X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0):
        n, p = X.shape
        n, q = Y.shape
        
        Kx = DIST.squareform(DIST.pdist(X, kernel_x))
        Ky = DIST.squareform(DIST.pdist(Y, kernel_y))
        J = np.eye(n) - np.ones((n, n)) / n
        M = np.dot(np.dot(Kx.T, J), Ky) / n
        L = np.dot(np.dot(Kx.T, J), Kx) / n + eta * Kx
        N = np.dot(np.dot(Ky.T, J), Ky) / n + eta * Ky


        sqx = SLA.sqrtm(SLA.inv(L))
        sqy = SLA.sqrtm(SLA.inv(N))
        
        a = np.dot(np.dot(sqx, M), sqy.T)
        A, s, Bh = SLA.svd(a, full_matrices=False)
        B = Bh.T
        
        # U = np.dot(np.dot(A.T, sqx), X).T
        # V = np.dot(np.dot(B.T, sqy), Y).T
        print s.shape
        print A.shape
        print B.shape
        return s, A, B
def distcorr(X, Y, flip=True):
    """ Compute the distance correlation function
    
    >>> a = [1,2,3,4,5]
    >>> b = np.array([1,2,9,4,4])
    >>> distcorr(a, b)
    0.762676242417
    
    Taken from: https://gist.github.com/satra/aa3d19a12b74e9ab7941
    """
    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    
    dcov2_xy = (A * B).sum()/float(n * n)
    dcov2_xx = (A * A).sum()/float(n * n)
    dcov2_yy = (B * B).sum()/float(n * n)
    dcor = np.sqrt(dcov2_xy)/np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
    if flip == True:
        dcor = 1-dcor
    return dcor
Example #18
0
def kcca(X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0):
    '''
    Kernel canonical correlation analysis (kernel CCA)
    http://staff.aist.go.jp/s.akaho/papers/ibis00.pdf
    '''
    n, p = X.shape
    n, q = Y.shape

    Kx = DIST.squareform(DIST.pdist(X, kernel_x))
    Ky = DIST.squareform(DIST.pdist(Y, kernel_y))
    J = np.eye(n) - np.ones((n, n)) / n
    M = np.dot(np.dot(Kx.T, J), Ky) / n
    L = np.dot(np.dot(Kx.T, J), Kx) / n + eta * Kx
    N = np.dot(np.dot(Ky.T, J), Ky) / n + eta * Ky

    sqx = LA.sqrtm(LA.inv(L))
    sqy = LA.sqrtm(LA.inv(N))

    a = np.dot(np.dot(sqx, M), sqy.T)
    A, s, Bh = LA.svd(a, full_matrices=False)
    B = Bh.T

    # U = np.dot(np.dot(A.T, sqx), X).T
    # V = np.dot(np.dot(B.T, sqy), Y).T

    return s, A, B
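The gaussian_kernel used as the default kernel is not shown in this example; a usage sketch with an assumed RBF kernel written as a pdist-compatible callable (two 1-D vectors in, scalar out):

import numpy as np
import scipy.spatial.distance as DIST
import scipy.linalg as LA

def gaussian_kernel(u, v, sigma=1.0):
    # assumed form of the kernel; not part of the original example
    return np.exp(-np.sum((u - v) ** 2) / (2 * sigma ** 2))

X = np.random.randn(20, 3)
Y = np.random.randn(20, 4)
s, A, B = kcca(X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0)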
Example #19
0
File: gp.py Project: davidar/gpo
def K_SE(xs, ys=None, l=1, deriv=False, wrt='l'):
    l = asarray(l)
    sig = 1 #l[0]
    #l = l[1:]
    xs = ascolumn(xs)
    if ys is None:
        d = squareform(pdist(xs/l, 'sqeuclidean'))
    else:
        ys = ascolumn(ys)
        d = cdist(xs/l, ys/l, 'sqeuclidean')
    cov = exp(-d/2)
    if not deriv: return sig * cov

    grads = []
    if wrt == 'l':
        #grads.append(cov) # grad of sig
        for i in xrange(shape(xs)[1]):
            if ys is None:
                grad = sig * cov * squareform(pdist(ascolumn(xs[:,i]), 'sqeuclidean'))
            else:
                grad = sig * cov * cdist(ascolumn(xs[:,i]), ascolumn(ys[:,i]), 'sqeuclidean')
            grad /= l[i] ** 3
            grads.append(grad)
        return sig * cov, grads
    elif wrt == 'y':
        if shape(xs)[0] != 1: print '*** x not a row vector ***'
        jac = sig * cov * ((ys - xs) / l**2).T
        return sig * cov, jac
def kernelMatrixLaplacian(x, firstVar=None, grid=None, par=[1., 3], diff=False, diff2 = False, constant_plane=False, precomp = None):
    sig = par[0]
    ord=par[1]
    if precomp == None:
        precomp = kernelMatrixLaplacianPrecompute(x, firstVar, grid, par)

    u = precomp[0]
    expu = precomp[1]

    if firstVar == None and grid==None:
        if diff==False and diff2==False:
            K = dfun.squareform(lapPol(u,ord) *expu)
            np.fill_diagonal(K, 1)
        elif diff2==False:
            K = dfun.squareform(-lapPolDiff(u, ord) * expu/(2*sig*sig))
            np.fill_diagonal(K, -1./((2*ord-1)*2*sig*sig))
        else:
            K = dfun.squareform(lapPolDiff2(u, ord) *expu /(4*sig**4))
            np.fill_diagonal(K, 1./((35)*4*sig**4))
    else:
        if diff==False and diff2==False:
            K = lapPol(u,ord) * expu
        elif diff2==False:
            K = -lapPolDiff(u, ord) * expu/(2*sig*sig)
        else:
            K = lapPolDiff2(u, ord) *expu/(4*sig**4)

    if constant_plane:
        uu = dfun.pdist(x[:, x.shape[1]-1:])/sig  # keep the slice 2-D for pdist
        K2 = dfun.squareform(lapPol(uu,ord)*np.exp(-uu))
        np.fill_diagonal(K2, 1)
        return K,K2,precomp
    else:
        return K,precomp
Example #21
0
def covMatrix(X, Y, theta, symmetric = True, kernel = lambda u, theta: theta[0]*theta[0]*np.exp(-0.5*u*u/(theta[1]*theta[1])), \
        dist_f=None):
    if len(np.array(X).shape) == 1:
        _X = np.array([X]).T
    else:
        _X = np.array(X)
        
    if len(np.array(Y).shape) == 1:
        _Y = np.array([Y]).T
    else:
        _Y = np.array(Y)
        
    if dist_f == None:
        if symmetric:
            cM = pdist(_X)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y)
            M = kernel(cM, theta)
            return M
    else:
        if symmetric:
            cM = pdist(_X, dist_f)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y, dist_f)
            M = kernel(cM, theta)
            return M
    return
    def getDistMatrixes(cls, distDict, distMeasure, linkageCriterion):
        """
        Find and return the correlation matrix, linkage matrix and distance matrix for the distance/correlation
        measure given with distMeasure parameter.
        """
        from scipy.spatial.distance import squareform
        from numpy import ones, fill_diagonal
        from scipy.cluster.hierarchy import linkage

        if distMeasure == cls.CORR_PEARSON or distMeasure == cls.SIM_MCCONNAUGHEY:
            '''As these measures generate values between -1 and 1, need special handling'''

            # Cluster distances, i.e. convert correlation into distance between 0 and 1
            triangularCorrMatrix = distDict[distMeasure]
            triangularDistMatrix = ones(len(triangularCorrMatrix)) - [(x + 1) / 2 for x in triangularCorrMatrix]
            linkageMatrix = linkage(cls.removeNanDistances(triangularDistMatrix), linkageCriterion)

            # Make correlation matrix square
            correlationMatrix = squareform(triangularCorrMatrix)
            fill_diagonal(correlationMatrix, 1)
        else:

            # Cluster distances
            triangularDistMatrix = distDict[distMeasure]
            linkageMatrix = linkage(cls.removeNanDistances(triangularDistMatrix), linkageCriterion)

            # Convert triangular distances into square correlation matrix
            squareDistMatrix = squareform(triangularDistMatrix)
            squareSize = len(squareDistMatrix)
            correlationMatrix = ones((squareSize, squareSize)) - squareDistMatrix

        return correlationMatrix, linkageMatrix, triangularDistMatrix
Example #23
0
def correlate_all(M):
  """Return all-pairs Pearson's correlation as a squareform matrix. 
  Best on numpy.array(dtype=float)
  TODO: this can be more efficient

  Args:
    M: numpy.array row matrix
  Returns:
    condensed (squareform) upper-triangle matrix of all-pairs correlations, in row-order index.

  RUNTIME on random.rand(500,200)
    21.2 ms (roughly a 200x improvement over the naive formula)
  RUNTIME on random.rand(15000,250)
  """
  m = np.size(M, 0) # number of rows (variables)
  n = np.size(M, 1) # number of columns (power)

  sums = np.sum(M,1).reshape(m,1)
  stds = np.std(M,1).reshape(m,1) # divided by n

  # TODO: does making this cumulative matter?
  Dot = squareform(np.dot(M, M.T), checks=False)
  SumProd = squareform(np.dot(sums, sums.T), checks=False)
  StdProd = squareform(np.dot(stds, stds.T), checks=False)

  CorrMatrix = (Dot - (SumProd/n)) / (StdProd*n)
  
  # Correlation Matrix
  return CorrMatrix
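A quick sanity check of correlate_all against numpy's reference implementation; squareform with checks=False drops the unit diagonal of corrcoef so the two condensed vectors are directly comparable:

import numpy as np
from scipy.spatial.distance import squareform

M = np.random.rand(50, 20)
fast = correlate_all(M)
ref = squareform(np.corrcoef(M), checks=False)
assert np.allclose(fast, ref)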
Example #24
0
def slRSA_m_1Ss(ds, model, omit, partial_dsm = None, radius=3, cmetric='pearson'):
    '''one subject

    Executes slRSA on single subjects and returns tuple of arrays of 1-p's [0], and fisher Z transformed r's [1]

    ds: pymvpa dsets for 1 subj
    model: model DSM to be correlated with neural DSMs per searchlight center
    partial_dsm: model DSM to be partialled out of model-neural DSM correlation
    omit: list of targets omitted from pymvpa datasets
    radius: sl radius, default 3
    cmetric: default pearson, other option 'spearman'
    '''        

    if __debug__:
        debug.active += ["SLC"]

    for om in omit:
        ds = ds[ds.sa.targets != om] # cut out omits
        print('Target |%s| omitted from analysis' % (om))
    ds = mean_group_sample(['targets'])(ds) #make UT ds
    print('Mean group sample computed at size:',ds.shape,'...with UT:',ds.UT)

    print('Beginning slRSA analysis...')
    if partial_dsm == None: tdcm = rsa.TargetDissimilarityCorrelationMeasure(squareform(model), comparison_metric=cmetric)
    elif partial_dsm != None: tdcm = rsa.TargetDissimilarityCorrelationMeasure(squareform(model), comparison_metric=cmetric, partial_dsm = squareform(partial_dsm))
    sl = sphere_searchlight(tdcm,radius=radius)
    slmap = sl(ds)
    if partial_dsm == None:
        print('slRSA complete with map of shape:',slmap.shape,'...p max/min:',slmap.samples[0].max(),slmap.samples[0].min(),'...r max/min',slmap.samples[1].max(),slmap.samples[1].min())
        return 1-slmap.samples[1],np.arctanh(slmap.samples[0])
    else:
        print('slRSA complete with map of shape:',slmap.shape,'...r max/min:',slmap.samples[0].max(),slmap.samples[0].min())
        return 1-slmap.samples[1],np.arctanh(slmap.samples[0])
def similarities(obj):
    """
    Optional: similarities of entities.
    """
    phi = coo_matrix(np.load(str(obj.directory / 'phi.npy')))
    theta = coo_matrix(np.load(str(obj.directory / 'theta.npy')))

    with CsvWriter(obj.directory, DocumentSimilarity) as out:
        distances = squareform(pdist(theta.T, 'cosine'))
        out << (dict(a_id=i,
                     b_id=sim_i,
                     similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:31]  # first 30 similar docs
                if sim_i != i)

    with CsvWriter(obj.directory, TopicSimilarity) as out:
        distances = squareform(pdist(phi.T, 'cosine'))
        out << (dict(a_id=topic_id(1, i),
                     b_id=topic_id(1, sim_i),
                     similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:]
                if sim_i != i)

    with CsvWriter(obj.directory, TermSimilarity) as out:
        distances = squareform(pdist(phi, 'cosine'))
        out << (dict(a_modality_id=1, a_id=i,
                     b_modality_id=1, b_id=sim_i,
                     similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:21]  # first 20 similar terms
                if sim_i != i)
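The same pdist/squareform/argsort pattern is applied three times above (documents, topics, terms); the core idea on toy data, as a standalone sketch:

import numpy as np
from scipy.spatial.distance import pdist, squareform

vectors = np.random.rand(5, 16)                    # e.g. theta.T: one row per document
distances = squareform(pdist(vectors, 'cosine'))   # square cosine-distance matrix

for i, row in enumerate(distances):
    for sim_i in row.argsort()[:3]:                # nearest neighbours by distance
        if sim_i != i:                             # skip self (distance 0)
            print(i, sim_i, 1 - row[sim_i])        # similarity = 1 - cosine distance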
def bootstrap_correlations(df, cor, bootstraps=100, procs=1):
    """"""
    # take absolute value of all values in cor for calculating two-sided p-value
    abs_cor = np.abs(squareform(cor, checks=False))
    # create an empty array of significant value counts in same shape as abs_cor
    n_sig = np.zeros(abs_cor.shape)

    if procs == 1:
        for i in xrange(bootstraps):
            n_sig += bootstrapped_correlation(i, df, abs_cor)
    else:
        import multiprocessing
        pool = multiprocessing.Pool(procs)
        print "Number of processors used: " + str(procs)

        # make partial function for use in multiprocessing
        pfun = partial(bootstrapped_correlation, cor=abs_cor, df=df)
        # run multiprocessing
        multi_results = pool.map(pfun, xrange(bootstraps))
        pool.close()
        pool.join()

        # find number of significant results across all bootstraps
        n_sig = np.sum(multi_results, axis=0)

    # get p_values out
    p_val_square = squareform(1. * n_sig / bootstraps, checks=False)
    p_vals = []
    for i in xrange(p_val_square.shape[0]):
        for j in xrange(i + 1, p_val_square.shape[0]):
            p_vals.append(p_val_square[i, j])
    return p_vals
Example #27
0
def _compute_AB(x, y, index):
    xa = np.atleast_2d(x)
    ya = np.atleast_2d(y)       

    if xa.ndim > 2 or ya.ndim > 2:
        raise ValueError("x and y must be 1d or 2d array_like objects")

    if xa.shape[0] == 1:
        xa = xa.T

    if ya.shape[0] == 1: 
        ya = ya.T

    if xa.shape[0] != ya.shape[0]:
        raise ValueError("x and y must have the same sample sizes")
       
    if index <= 0 or index > 2:
        raise ValueError("index must be in (0, 2]")

    # compute A
    a_kl = squareform(pdist(xa, 'euclidean')**index)
    a_k = np.mean(a_kl, axis=1).reshape(-1, 1)
    a_l = a_k.T
    a = np.mean(a_kl)
    A = a_kl - a_k - a_l + a

    # compute B
    b_kl = squareform(pdist(ya, 'euclidean')**index)
    b_k = np.mean(b_kl, axis=1).reshape(-1, 1)
    b_l = b_k.T
    b = np.mean(b_kl)
    B = b_kl - b_k - b_l + b

    return A, B
Example #28
0
    def test_pdist(self):
        for metric, argdict in self.scipy_metrics.iteritems():
            keys = argdict.keys()
            for vals in itertools.product(*argdict.values()):
                kwargs = dict(zip(keys, vals))
                D_true = pdist(self.X1, metric, **kwargs)
                Dsq_true = squareform(D_true)
                dm = DistanceMetric(metric, **kwargs)
                for X in self.X1, self.spX1:
                    yield self.check_pdist, metric, X, dm, Dsq_true, True

                for X in self.X1, self.spX1:
                    yield self.check_pdist, metric, X, dm, D_true, False

        for rmetric, (metric, func) in self.reduced_metrics.iteritems():
            argdict = self.scipy_metrics[metric]
            keys = argdict.keys()
            for vals in itertools.product(*argdict.values()):
                kwargs = dict(zip(keys, vals))
                D_true = func(pdist(self.X1, metric, **kwargs),
                              **kwargs)
                Dsq_true = squareform(D_true)
                dm = DistanceMetric(rmetric, **kwargs)
                for X in self.X1, self.spX1:
                    yield self.check_pdist, rmetric, X, dm, Dsq_true, True

                for X in self.X1, self.spX1:
                    yield self.check_pdist, rmetric, X, dm, D_true, False
Example #29
0
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) / 2, max_K)
            clustering = {}
            db_index = []
            X = df.ix[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.ix[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) / 2, max_K)
            clustering = {}
            db_index = []
            X = df.ix[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k, linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.ix[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.ix[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.ix[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
        gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
        k_optimal = list(gaps).index(max(gaps))+1
        clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
        return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('error: only db and gap statistics are supported')
def plot_clustering_similarity(results, plot_dir=None, verbose=False, ext='png'):  
    HCA = results.HCA
    # get all clustering solutions
    clusterings = HCA.results.items()
    # plot cluster agreement across embedding spaces
    names = [k for k,v in clusterings]
    cluster_similarity = np.zeros((len(clusterings), len(clusterings)))
    cluster_similarity = pd.DataFrame(cluster_similarity, 
                                     index=names,
                                     columns=names)
    
    distance_similarity = np.zeros((len(clusterings), len(clusterings)))
    distance_similarity = pd.DataFrame(distance_similarity, 
                                     index=names,
                                     columns=names)
    for clustering1, clustering2 in combinations(clusterings, 2):
        name1 = clustering1[0].split('-')[-1]
        name2 = clustering2[0].split('-')[-1]
        # record similarity of distance_df
        dist_corr = np.corrcoef(squareform(clustering1[1]['distance_df']),
                                squareform(clustering2[1]['distance_df']))[1,0]
        distance_similarity.loc[name1, name2] = dist_corr
        distance_similarity.loc[name2, name1] = dist_corr
        # record similarity of clustering of dendrogram
        clusters1 = clustering1[1]['labels']
        clusters2 = clustering2[1]['labels']
        rand_score = adjusted_rand_score(clusters1, clusters2)
        MI_score = adjusted_mutual_info_score(clusters1, clusters2)
        cluster_similarity.loc[name1, name2] = rand_score
        cluster_similarity.loc[name2, name1] = MI_score
    
    with sns.plotting_context(context='notebook', font_scale=1.4):
        clust_fig = plt.figure(figsize = (12,12))
        sns.heatmap(cluster_similarity, square=True)
        plt.title('Cluster Similarity: TRIL: Adjusted MI, TRIU: Adjusted Rand',
                  y=1.02)
        
        dist_fig = plt.figure(figsize = (12,12))
        sns.heatmap(distance_similarity, square=True)
        plt.title('Distance Similarity, metric: %s' % HCA.dist_metric,
                  y=1.02)
        
    if plot_dir is not None:
        save_figure(clust_fig, path.join(plot_dir, 
                                   'cluster_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        save_figure(dist_fig, path.join(plot_dir, 
                                   'distance_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        plt.close(clust_fig)
        plt.close(dist_fig)
    
    if verbose:
        # assess relationship between two measurements
        rand_scores = cluster_similarity.values[np.triu_indices_from(cluster_similarity, k=1)]
        MI_scores = cluster_similarity.T.values[np.triu_indices_from(cluster_similarity, k=1)]
        score_consistency = np.corrcoef(rand_scores, MI_scores)[0,1]
        print('Correlation between measures of cluster consistency: %.2f' \
              % score_consistency)
Example #31
0
def hierarchical_consensus_cluster(
    df,
    k,
    distance__column_x_column=None,
    distance_function="euclidean",
    n_clustering=10,
    random_seed=20121020,
    linkage_method="ward",
    plot_df=True,
    directory_path=None,
):

    if distance__column_x_column is None:

        print("Computing distance with {} ...".format(distance_function))

        distance__column_x_column = DataFrame(
            squareform(pdist(df.values.T, distance_function)),
            index=df.columns,
            columns=df.columns,
        )

    print("HCC with K={} ...".format(k))

    clustering_x_column = full(
        (n_clustering, distance__column_x_column.shape[1]), nan)

    n_per_print = max(1, n_clustering // 10)

    seed(random_seed)

    for clustering in range(n_clustering):

        if clustering % n_per_print == 0:

            print("\t(K={}) {}/{} ...".format(k, clustering + 1, n_clustering))

        random_columns_with_repeat = randint(
            0,
            high=distance__column_x_column.shape[0],
            size=distance__column_x_column.shape[0],
        )

        clustering_x_column[clustering, random_columns_with_repeat] = fcluster(
            linkage(
                squareform(distance__column_x_column.iloc[
                    random_columns_with_repeat, random_columns_with_repeat]),
                method=linkage_method,
            ),
            k,
            criterion="maxclust",
        )

    column_cluster, column_cluster__ccc = _cluster_clustering_x_element_and_compute_ccc(
        clustering_x_column, k, linkage_method)

    if directory_path is not None:

        cluster_x_column = make_membership_df_from_categorical_series(
            Series(column_cluster, index=df.columns))

        cluster_x_column.index = Index(
            ("C{}".format(cluster) for cluster in cluster_x_column.index),
            name="Cluster",
        )

        cluster_x_column.to_csv(
            "{}/cluster_x_column.tsv".format(directory_path), sep="\t")

    if plot_df:

        print("Plotting df ...")

        file_name = "cluster.html"

        if directory_path is None:

            html_file_path = None

        else:

            html_file_path = "{}/{}".format(directory_path, file_name)

        plot_heat_map(
            df,
            normalization_axis=0,
            normalization_method="-0-",
            column_annotation=column_cluster,
            title="HCC K={} Column Cluster".format(k),
            xaxis_title=df.columns.name,
            yaxis_title=df.index.name,
            html_file_path=html_file_path,
        )

    return column_cluster, column_cluster__ccc
Example #32
0
    def average(self):

        self.unsaved_changes = True

        if hasattr(self, 'aActor'):
            self.ren.RemoveActor(self.aActor)

        self.ui.statLabel.setText("Averaging, applying grid . . .")
        QtWidgets.QApplication.processEvents()

        #temporarily shift all data such that it appears in the first cartesian quadrant
        tT = np.amin(self.rO, axis=0)
        self.rO, self.fO, self.rp, self.flp = self.rO - tT, self.fO - tT, self.rp - tT, self.flp - tT

        #use max to get a 'window' for assessing grid spacing
        RefMax = np.amax(self.rO, axis=0)
        RefMin = np.amin(self.rO, axis=0)
        windowVerts = np.matrix([[0.25 * RefMin[0], 0.25 * RefMin[1]],
                                 [0.25 * RefMin[0], 0.25 * (RefMax[1])],
                                 [0.25 * (RefMax[1]), 0.25 * (RefMax[1])],
                                 [0.25 * (RefMax[0]), 0.25 * (RefMin[1])]])

        p = path.Path(windowVerts)
        inWindow = p.contains_points(
            self.rp[:, :2])  #first 2 columns of RefPoints is x and y

        windowed = self.rp[inWindow, :2]

        #populate grid size if attribute doesn't exist
        if not hasattr(self, 'gsize'):
            gs = squareform(pdist(windowed, 'euclidean'))
            self.gsize = np.mean(np.sort(gs)[:, 1])
            self.ui.gridInd.setValue(self.gsize)
        else:
            self.gsize = self.ui.gridInd.value()

        #grid the reference based on gsize, bumping out the grid by 10% in either direction
        grid_x, grid_y = np.meshgrid(
            np.linspace(1.1 * RefMin[0], 1.1 * RefMax[0],
                        int((1.1 * RefMax[0] - 1.1 * RefMin[0]) / self.gsize)),
            np.linspace(1.1 * RefMin[1], 1.1 * RefMax[1],
                        int((1.1 * RefMax[1] - 1.1 * RefMin[1]) / self.gsize)),
            indexing='xy')

        #apply the grid to the reference data
        grid_Ref = griddata(self.rp[:, :2],
                            self.rp[:, -1], (grid_x, grid_y),
                            method='linear')

        #apply the grid to the aligned data
        grid_Align = griddata(self.flp[:, :2],
                              self.flp[:, -1], (grid_x, grid_y),
                              method='linear')

        self.ui.statLabel.setText("Averaging using grid . . .")
        QtWidgets.QApplication.processEvents()

        #average z values
        grid_Avg = (grid_Ref + grid_Align) / 2

        #make sure that there isn't anything averaged outside the floating outline
        p = path.Path(self.rO[:, :2])
        inTest = np.hstack((np.ravel(grid_x.T)[np.newaxis].T,
                            np.ravel(grid_y.T)[np.newaxis].T))
        inOutline = p.contains_points(inTest)

        #averaged points
        self.ap = np.hstack((inTest[inOutline,:], \
           np.ravel(grid_Avg.T)[np.newaxis].T[inOutline]))

        #move everything back to original location
        self.rO, self.fO, self.rp, self.flp, self.ap = \
        self.rO+tT, self.fO+tT, self.rp+tT, self.flp+tT, self.ap+tT

        self.ui.statLabel.setText("Rendering . . .")
        QtWidgets.QApplication.processEvents()

        #show it
        color = (int(0.2784 * 255), int(0.6745 * 255), int(0.6941 * 255))
        _, self.aActor, _, = gen_point_cloud(self.ap, color, self.PointSize)
        self.ren.AddActor(self.aActor)

        s, nl, axs = self.get_scale()

        self.aActor.SetScale(s)
        self.aActor.Modified()

        #update
        self.ui.vtkWidget.update()
        self.ui.vtkWidget.setFocus()
        self.ui.statLabel.setText("Averaging complete.")
        self.averaged = True
        self.ui.averageButton.setStyleSheet(
            "background-color :rgb(77, 209, 97);")
Example #33
0
def proclus(X, k=2, l=3, minDeviation=0.1, A=30, B=3, niters=30, seed=1234):
    """ Run PROCLUS on a database to obtain a set of clusters and 
		dimensions associated with each one.

		Parameters:
		----------
		- X: 	   		the data set
		- k: 	   		the desired number of clusters
		- l:	   		average number of dimensions per cluster
		- minDeviation: for selection of bad medoids
		- A: 	   		constant for initial set of medoids
		- B: 	   		a smaller constant than A for the final set of medoids
		- niters:  		maximum number of iterations for the second phase
		- seed:    		seed for the RNG
	"""
    np.random.seed(seed)

    N, d = X.shape

    if B > A:
        raise Exception("B has to be smaller than A.")

    if l < 2:
        raise Exception("l must be >=2.")

    ###############################
    # 1.) Initialization phase
    ###############################

    # first find a superset of the set of k medoids by random sampling
    idxs = np.arange(N)
    np.random.shuffle(idxs)
    S = idxs[0:(A * k)]
    M = greedy(X, S, B * k)

    ###############################
    # 2.) Iterative phase
    ###############################

    BestObjective = np.inf

    # choose a random set of k medoids from M:
    Mcurr = np.random.permutation(M)[0:k]  # M current
    Mbest = None  # Best set of medoids found

    D = squareform(pdist(X))  # precompute the euclidean distance matrix

    it = 0  # iteration counter
    L = []  # locality sets of the medoids, i.e., points within delta_i of m_i.
    Dis = []  # important dimensions for each cluster
    assigns = []  # cluster membership assignments

    while True:
        it += 1
        L = []

        for i in range(len(Mcurr)):
            mi = Mcurr[i]
            # compute delta_i, the distance to the nearest medoid of m_i:
            di = D[mi, np.setdiff1d(Mcurr, mi)].min()
            # compute L_i, points in sphere centered at m_i with radius d_i
            L.append(np.where(D[mi] <= di)[0])

        # find dimensions:
        Dis = findDimensions(X, k, l, L, Mcurr)

        # form the clusters:
        assigns = assignPoints(X, Mcurr, Dis)

        # evaluate the clusters:
        ObjectiveFunction = evaluateClusters(X, assigns, Dis, Mcurr)

        badM = []  # bad medoids

        Mold = Mcurr.copy()

        if ObjectiveFunction < BestObjective:
            BestObjective = ObjectiveFunction
            Mbest = Mcurr.copy()
            # compute the bad medoids in Mbest:
            badM = computeBadMedoids(X, assigns, Dis, Mcurr, minDeviation)
            print("bad medoids:")
            print(badM)

        if len(badM) > 0:
            # replace the bad medoids with random points from M:
            print("old mcurr:")
            print(Mcurr)
            Mavail = np.setdiff1d(M, Mbest)
            newSel = np.random.choice(Mavail, size=len(badM), replace=False)
            Mcurr = np.setdiff1d(Mbest, badM)
            Mcurr = np.union1d(Mcurr, newSel)
            print("new mcurr:")
            print(Mcurr)

        print("finished iter: %d" % it)

        if np.allclose(Mold, Mcurr) or it >= niters:
            break

    print("finished iterative phase...")

    ###############################
    # 3.) Refinement phase
    ###############################

    # compute a new L based on assignments:
    L = []
    for i in range(len(Mcurr)):
        mi = Mcurr[i]
        L.append(np.where(assigns == mi)[0])

    Dis = findDimensions(X, k, l, L, Mcurr)
    assigns = assignPoints(X, Mcurr, Dis)

    # handle outliers:

    # smallest Manhattan segmental distance of m_i to all (k-1)
    # other medoids with respect to D_i:
    deltais = np.zeros(k)
    for i in range(k):
        minDist = np.inf
        for j in range(k):
            if j != i:
                dist = manhattanSegmentalDist(X[Mcurr[i]], X[Mcurr[j]], Dis[i])
                if dist < minDist:
                    minDist = dist
        deltais[i] = minDist

    # mark as outliers the points that are not within delta_i of any m_i:
    for i in range(len(assigns)):
        clustered = False
        for j in range(k):
            d = manhattanSegmentalDist(X[Mcurr[j]], X[i], Dis[j])
            if d <= deltais[j]:
                clustered = True
                break
        if not clustered:
            #print "marked an outlier"
            assigns[i] = -1

    return (Mcurr, Dis, assigns)
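proclus references helpers that are not shown here (greedy, findDimensions, assignPoints, evaluateClusters, computeBadMedoids, manhattanSegmentalDist); assuming those are defined in the same module, a usage sketch would be:

import numpy as np

X = np.random.rand(200, 10)                  # 200 points in 10 dimensions
medoids, dims, assigns = proclus(X, k=4, l=3, seed=1234)
# medoids: indices of the k cluster medoids
# dims:    list of relevant dimensions per cluster
# assigns: per-point medoid assignment, -1 marks outliers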
Example #34
0
random.seed(seed)
np.random.seed(seed)
idx = np.random.permutation(data.index)

# Calculate indexes for numpy array
cluster_index = int(len(col_names))
distance_cluster_index = cluster_index + 1

###################################################################
# LOCAL
# #################################################################

# Start timing
start_time = time.perf_counter()

D = squareform(pdist(data))
max_distance, [I_row,
               I_col] = np.nanmax(D), np.unravel_index(np.argmax(D), D.shape)
n_restrictions = (((len(restrictions.index)**2) -
                   (restrictions.isin([0]).sum().sum())) / 2) - data.shape[0]
# print(max_distance)
lambda_value = (max_distance / n_restrictions) * lambda_var
# Generate neighbourhood
possible_changes = []
for i in range(len(data.index)):
    for w in range(k):
        possible_changes.append((i, w))
np.random.shuffle(possible_changes)

# Generate initial solution
data['cluster'] = np.random.randint(0, k, data.shape[0])
Example #35
0
from scipy.sparse import csr_matrix
from scipy.spatial.distance import pdist, squareform
from scipy.spatial.qhull import QhullError
from sklearn.exceptions import NotFittedError

from gtda.homology import VietorisRipsPersistence, SparseRipsPersistence, \
    WeakAlphaPersistence, EuclideanCechPersistence, FlagserPersistence

pio.renderers.default = 'plotly_mimetype'

X_pc = np.array([[[2., 2.47942554], [2.47942554, 2.84147098],
                  [2.98935825, 2.79848711], [2.79848711, 2.41211849],
                  [2.41211849, 1.92484888]]])
X_pc_list = list(X_pc)

X_dist = np.array([squareform(pdist(x)) for x in X_pc])
X_dist_list = list(X_dist)

X_pc_sparse = [csr_matrix(x) for x in X_pc]
X_dist_sparse = [csr_matrix(x) for x in X_dist]

X_dist_disconnected = np.array([[[0, np.inf], [np.inf, 0]]])

# 8-point sampling of a noisy circle
X_circle = np.array([[[1.00399159, -0.00797583], [0.70821787, 0.68571714],
                      [-0.73369765, -0.71298056], [0.01110395, -1.03739883],
                      [-0.64968271, 0.7011624], [0.03895963, 0.94494511],
                      [0.76291108, -0.68774373], [-1.01932365, -0.05793851]]])


def test_vrp_params():
Example #36
0
    np.save(abs_corr_array_path, abs_corr_array.data)
else:
    abs_corr_array = np.load(abs_corr_array_path)
    print abs_corr_array.shape
# -- calculate linkage and clusters

if IS_CALCULATE_LINKAGE:
    # -- load the correlation matrix
    abs_corr_array = np.load(abs_corr_array_path)

    # -- transform the correlation matrix into distance measure
    abs_corr_dist_arr = np.around(1 - abs_corr_array, 7)

    # -- transform the correlation matrix into condensed distance matrix
    dist_corr = spdst.squareform(abs_corr_dist_arr)

    # -- force calculation of linkage
    is_force_calc_link_arr = True

else:
    # -- skip calculation and load linkage from link_arr_path
    is_force_calc_link_arr = False
    abs_corr_dist_arr = None

# -- cluster of indices in abs_corr_dist_arr array
cluster_lst, cluster_size_lst = fap.compute_clusters_from_dist(
    abs_corr_dist_arr=abs_corr_dist_arr,
    link_arr_path=link_arr_path,
    is_force_calc_link_arr=is_force_calc_link_arr)
def find_max_distance(A):
    """
    Returns the maximum pairwise distance among the points in A.
    Each point is represented by an x,y coordinate.
    """
    return nanmax(squareform(pdist(A)))
def calculate_distance(matrix, metric):
    distance_matrix = pdist(matrix, metric=metric)
    distance_matrix = squareform(distance_matrix)
    return distance_matrix
Example #39
0
 def update_plot(self, x, y_target=None, n_bold=3, show_forward=True):
     plt.gcf().clear()
     x = self.unflatten_coeffs(np.array(x))
     points = self.trace_fourier_curves(x)
     for i in range(len(points)):
         plt.plot(points[i, :, 0],
                  points[i, :, 1],
                  c=(0, 0, 0, min(1, 10 / len(points))))
         if i >= len(points) - n_bold:
             plt.plot(points[i, :, 0], points[i, :, 1], c=(0, 0, 0))
             if show_forward:
                 if y_target is not None:
                     aspect_ratio, circularity, angle = y_target
                     # Visualize circularity
                     star = np.array(
                         (4, 4
                          )) + .5 * star_with_given_circularity(circularity)
                     plt.plot(star[:, 0],
                              star[:, 1],
                              c=(0, 0, 0, .25),
                              lw=1)
                     # Visualize aspect ratio and angle
                     rect = np.array(
                         (4, 2.5)) + .4 * rect_with_given_aspect_and_angle(
                             aspect_ratio, angle)
                     plt.plot(rect[:, 0],
                              rect[:, 1],
                              c=(0, 0, 0, .25),
                              lw=1)
                 # Find largest diameter of the shape
                 d = squareform(pdist(points[i]))
                 max_idx = np.unravel_index(d.argmax(), d.shape)
                 p0, p1 = points[i, max_idx[0]], points[i, max_idx[1]]
                 angle = np.arctan2((p1 - p0)[1], (p1 - p0)[0])
                 max_diameter = d[max_idx]
                 # Plot
                 d0, d1 = points[i, max_idx[0]], points[i, max_idx[1]]
                 plt.plot([d0[0], d1[0]], [d0[1], d1[1]],
                          c=(0, 1, 0),
                          ls='-',
                          lw=1)
                 plt.scatter([d0[0], d1[0]], [d0[1], d1[1]],
                             c=[(0, 1, 0)],
                             s=3,
                             zorder=10)
                 if y_target is not None:
                     # Find largest width orthogonal to diameter
                     c, s = np.cos(angle), np.sin(angle)
                     rotation = np.matrix([[c, s], [-s, c]])
                     p_rotated = np.dot(rotation, points[i].T).T
                     min_diameter = np.max(p_rotated[:, 1]) - np.min(
                         p_rotated[:, 1])
                     # Aspect ratio & circularity
                     aspect_ratio = min_diameter / max_diameter
                     shape = geo.Polygon(points[i])
                     circularity = 4 * np.pi * shape.area / shape.length**2
                     # Visualize circularity
                     star = np.array(
                         (4, 4
                          )) + .5 * star_with_given_circularity(circularity)
                     plt.plot(star[:, 0],
                              star[:, 1],
                              c=(0, 1, 0, .5),
                              ls='-',
                              lw=1)
                     # Visualize aspect ratio and angle
                     rect = np.array(
                         (4, 2.5)) + .4 * rect_with_given_aspect_and_angle(
                             aspect_ratio, angle)
                     plt.plot(rect[:, 0],
                              rect[:, 1],
                              c=(0, 1, 0, .5),
                              ls='-',
                              lw=1)
     plt.axis('equal')
     plt.axis([
         min(-5, points[:, :, 0].min() - 1),
         max(5, points[:, :, 0].max() + 1),
         min(-5, points[:, :, 1].min() - 1),
         max(5, points[:, :, 1].max() + 1)
     ])
def speakerDiarization(fileName,
                       numOfSpeakers,
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35,
                       PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results 1 for plottingy
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    print fileName
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [
        Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerAll"))
    [
        Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerFemaleMale"))

    [MidTermFeatures,
     ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs,
                                                 mtSize * Fs, mtStep * Fs,
                                                 round(Fs * stWin),
                                                 round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN,
     STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN,
         STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T
                )  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(
                    min(silBs)
                )  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0],
                                       "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')
    # check whether an annotation (ground-truth) file exists
    if os.path.isfile(gtFile):  # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels,
                                           mtStep)  # convert to flags

    #     if PLOT:
    # fig = plt.figure()
    #     if numOfSpeakers>0:
    #         ax1 = fig.add_subplot(111)
    #     else:
    #         ax1 = fig.add_subplot(211)
    #     ax1.set_yticks(numpy.array(range(len(classNames))))
    #     ax1.axis((0, Duration, -1, len(classNames)))
    #     ax1.set_yticklabels(classNames)
    #     ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

    if os.path.isfile(gtFile):
        # if PLOT:
        #     ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(
            cls, flagsGT)
        print "{0:.1f}\t{1:.1f}".format(100 * purityClusterMean,
                                        100 * puritySpeakerMean)
        # if PLOT:
        #     plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
    # if PLOT:
    #     plt.xlabel("time (seconds)")
    #     #print sRange, silAll
    #     if numOfSpeakers<=0:
    #         plt.subplot(212)
    #         plt.plot(sRange, silAll)
    #         plt.xlabel("number of clusters");
    #         plt.ylabel("average clustering's sillouette");
    #     plt.show()
    return cls
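A hedged usage sketch (the WAV path is hypothetical; assumes pyAudioAnalysis and its bundled kNN speaker models under data/ are available):
# Hypothetical call: numOfSpeakers <= 0 makes the function search 2..9 clusters
# and pick the count with the best average silhouette.
cls = speakerDiarization("meeting.wav", numOfSpeakers=0,
                         mtSize=2.0, mtStep=0.2, stWin=0.05,
                         LDAdim=35, PLOT=False)
# cls[i] is the speaker label of the i-th mid-term window (windows are mtStep seconds apart)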
Example #41
0
def rbf(X, sigma=0.5):
    # Gaussian (RBF) affinity matrix from pairwise Euclidean distances
    pairwise_dists = squareform(pdist(X, 'euclidean'))
    A = np.exp(-pairwise_dists**2 / (2. * sigma**2))
    return A
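A short, hedged usage sketch (random data; sigma kept at its default):
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Hypothetical usage: RBF affinity matrix for 10 random 3-D points.
X = np.random.rand(10, 3)
A = rbf(X, sigma=0.5)
print(A.shape)                        # (10, 10)
print(np.allclose(np.diag(A), 1.0))   # zero self-distance -> affinity 1 on the diagonal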
Example #42
0
    for c_num in range(len(coords)):
        if c_num not in assigned:
            cluster_info[c_num] = {
                'members': [c_num],
                'centroid': coords[c_num],  # singleton cluster: the centroid is the point itself
                'int_time': 1.0
            }

    return cluster_info


coords, targets = load_targets('test_targets.dat')
labellist = targets['target']

seps = calc_separation(coords)
dist = ssd.squareform(seps)

linked = linkage(dist, method='complete', optimal_ordering=True)

all_clusters = extract_levels(linked, labellist)

cluster_info = do_clustering(coords, all_clusters, seps)

fig = plt.figure()
ax = fig.add_subplot(111)

for c_num in cluster_info.keys():
    cluster_data = cluster_info[c_num]
    indices = cluster_data['members']

    num_coords = len(indices)
Example #43
0
print(type(soap_water))

# Average output
average_soap = SOAP(
    species=species,
    rcut=rcut,
    nmax=nmax,
    lmax=lmax,
    average=True,
    sparse=False
)

soap_water = average_soap.create(water)
print("average soap water", soap_water.shape)

methanol = molecule('CH3OH')
soap_methanol = average_soap.create(methanol)
print("average soap methanol", soap_methanol.shape)

h2o2 = molecule('H2O2')
soap_peroxide = average_soap.create(h2o2)

# Distance
from scipy.spatial.distance import pdist, squareform
import numpy as np

molecules = np.vstack([soap_water, soap_methanol, soap_peroxide])
distance = squareform(pdist(molecules))
print("distance matrix: water - methanol - H2O2")
print(distance)
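A hedged follow-up (not part of the dscribe example): with averaged SOAP vectors, a normalized dot product gives a cosine similarity that complements the distance matrix above.
# Assumes `molecules` and numpy (np) from the snippet above are in scope.
norms = np.linalg.norm(molecules, axis=1, keepdims=True)
normalized = molecules / norms
similarity = normalized @ normalized.T
print("cosine similarity: water - methanol - H2O2")
print(similarity)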
Example #44
0
def martin98(locations, E_incident, permittivity, location_sizes, wavelength,
             step_size):
    """
    Basic implementation of the algorithm as proposed in [Olivier J. F. Martin
    and Nicolas B. Piller, Electromagnetic scattering in polarizable 
    back-grounds].

    Parameters
    ----------
    locations : numpy array
        Array containing the locations where the E field must be evaluated.
    E_incident : numpy array
        Array containing the value of the incident E field at each location.
    permittivity : numpy array
        Array containing the permittivity at each location.
    location_sizes : numpy array
        Array containing the size of the samples at each location.
    wavelength : float
        Wavelength of the incident wave.
    step_size : float
        Minimal distance between samples.

    Returns
    -------
    E_r : numpy array
        Scattered E field at each location.

    """
    # Find number of locations
    nloc = np.shape(locations)[0]
    # Relative permittivity of background
    epsilon_B = 1
    # Wave number
    #k_0 = 2*np.pi*frequency/speed_of_light
    k_0 = 2 * np.pi / wavelength
    k_rho = k_0 * np.sqrt(epsilon_B)
    # Calculate distance between all points in the plane
    varrho = pdist(locations, 'euclidean')
    # Calculate G matrix
    G_condensed = 1j / 4 * hankel1(0, k_rho * varrho)
    # Convert condensed G matrix to square form
    G = squareform(G_condensed)
    # Volume of each location
    V_mesh = np.square(location_sizes * step_size)
    # Self contribution to the electric field
    R_eff = np.sqrt(V_mesh / np.pi)  #Effective radius
    beta = 1  # No coupling between TE and TM polarizations
    gamma = R_eff / k_rho * hankel1(
        1, k_rho * R_eff) + 2j / (np.pi * np.square(k_rho))
    M = 1j * np.pi / 2 * beta * gamma
    # Set diagonal of G matrix to the self-contribution term M / V_mesh
    np.fill_diagonal(G, M / V_mesh)

    # Difference between background permittivity and permittivity at a specific
    # location
    Delta_epsilon = permittivity - epsilon_B

    # Total E field (vector)
    E_r = np.linalg.inv(
        np.identity(nloc) -
        k_0**2 * G @ np.diag(Delta_epsilon * V_mesh)) @ E_incident
    return E_r
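A hedged usage sketch with made-up inputs (a 10x10 grid of samples, a unit-amplitude plane wave, uniform relative permittivity 1.5); only the argument shapes matter here, and the module-level imports martin98 relies on (pdist, squareform, hankel1, numpy) are assumed in scope:
import numpy as np

# Hypothetical inputs; grid spacing 0.1 and wavelength 1.0 are arbitrary.
xv, yv = np.meshgrid(np.arange(10) * 0.1, np.arange(10) * 0.1)
locations = np.c_[xv.ravel(), yv.ravel()]
E_incident = np.exp(1j * 2 * np.pi * locations[:, 0])   # plane wave along x
permittivity = 1.5 * np.ones(locations.shape[0])
location_sizes = np.ones(locations.shape[0])
E_out = martin98(locations, E_incident, permittivity, location_sizes,
                 wavelength=1.0, step_size=0.1)
print(E_out.shape)  # one complex field value per location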
Example #45
0
def query(oracle, query, trn_type=1, smooth=False, weight=0.5):
    """

    :param oracle:
    :param query:
    :param trn_type:
    :param smooth:
    :param weight:
    :return:
    """
    """ Return the closest path in target oracle given a query sequence
    
    Args:
        oracle: an oracle object already learned, the target. 
        query: the query sequence in a matrix form such that 
             the ith row is the feature at the ith time point
        method: 
        trn_type:
        smooth:(off-line only)
        weight:
    
    """
    N = len(query)
    K = oracle.num_clusters()
    P = [[0] * K for _i in range(N)]
    if smooth:
        D = dist.pdist(oracle.f_array[1:], 'sqeuclidean')
        D = dist.squareform(D, checks=False)
        map_k_outer = partial(_query_k,
                              oracle=oracle,
                              query=query,
                              smooth=smooth,
                              D=D,
                              weight=weight)
    else:
        map_k_outer = partial(_query_k, oracle=oracle, query=query)

    map_query = partial(_query_init, oracle=oracle, query=query[0])
    P[0], C = zip(*map(map_query, oracle.rsfx[0][:]))
    P[0] = list(P[0])
    C = np.array(C)

    if trn_type == 1:
        trn = _create_trn_self
    elif trn_type == 2:
        trn = _create_trn_sfx_rsfx
    else:
        trn = _create_trn

    argmin = np.argmin
    distance_cache = np.zeros(oracle.n_states)
    for i in range(1, N):  # iterate over the rest of the query
        state_cache = []
        dist_cache = distance_cache

        map_k_inner = partial(map_k_outer,
                              i=i,
                              P=P,
                              trn=trn,
                              state_cache=state_cache,
                              dist_cache=dist_cache)
        P[i], _c = zip(*map(map_k_inner, range(K)))
        P[i] = list(P[i])
        C += np.array(_c)

    i_hat = argmin(C)
    P = map(list, zip(*P))
    return P, C, i_hat
import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial import distance

np.set_printoptions(precision=1,
                    suppress=True)  # limit printed decimals to one place

os.chdir('datos')
# Simple read of the data, already cleaned
df = pd.read_csv("tirosL.csv")
# the data are already clean

#df=df.sample(10000)

X = df.head(1000)

# Convert the condensed distance vector to a square matrix
md = distance.squareform(distance.pdist(X, 'euclidean'))
print(md)

Z = linkage(X, 'complete')
plt.figure(figsize=(12, 5))
dendrogram(Z,
           truncate_mode='lastp',
           p=5,
           show_leaf_counts=True,
           leaf_font_size=14.)
#dendrogram(Z, leaf_font_size=14)
plt.show()
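A hedged continuation (not in the original script): flat cluster labels can be extracted from the same linkage with fcluster.
from scipy.cluster.hierarchy import fcluster

# Hypothetical follow-up: cut the tree into 5 flat clusters and count their sizes.
labels = fcluster(Z, t=5, criterion='maxclust')
print(np.bincount(labels)[1:])  # cluster sizes; clusters are numbered from 1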
                      facecolor=box_color,
                      edgecolor=box_color,
                      linewidth=basewidth,
                      clip_on=False))
        loading_axes[task_i].hlines(i + .4,
                                    -2,
                                    -.5,
                                    color=box_color,
                                    clip_on=False,
                                    linewidth=basewidth,
                                    linestyle=':')

# ****************************************************************************
# Distance Matrices
# ****************************************************************************
participant_distances = squareform(abs_pdist(data.T))
participant_distances = results['task'].HCA.results['data']['clustered_df']
loading_distances = results['task'].HCA.results['EFA5_oblimin']['clustered_df']
sns.heatmap(participant_distances,
            ax=participant_distance,
            cmap=ListedColormap(sns.color_palette('gray', n_colors=100)),
            xticklabels=False,
            yticklabels=False,
            square=True,
            cbar=False,
            linewidth=0)
sns.heatmap(loading_distances,
            ax=loading_distance,
            xticklabels=False,
            yticklabels=False,
            square=True)
Example #48
0
#print cls.inertia_
#labels = cls.labels_

###############################################################################
## Visualize the results on PCA-reduced data
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
reduced_data = tsne.fit_transform(X)
plt.scatter(reduced_data[:, 0], reduced_data[:, 1])
plt.savefig(cur_file_dir + 'result/' + 'user_dr_tsne.png')
# plt.show()
plt.cla()
plt.clf()
plt.close()
# Compute DBSCAN
#D = distance.squareform(distance.pdist(X)) # high-dimensional data
D = distance.squareform(distance.pdist(reduced_data))  # low-dimensional data
D = np.sort(D, axis=0)
minPts = 10
nearest = D[1:(minPts + 1), :]
nearest = nearest.reshape(1, nearest.size)
sort_nearest = np.sort(nearest)
plt.plot(range(len(sort_nearest[0, :])),
         sort_nearest[0, :],
         linewidth=1.0,
         marker='x')
#plt.axis([-2, len(sort_nearest[0,:])+1000, -2, max(sort_nearest[0,:])+2])
plt.savefig(cur_file_dir + 'result/' + 'nearest.png')
plt.cla()
plt.clf()
plt.close()
#db = DBSCAN(eps=0.90, min_samples=minPts).fit(X) # high-dimensional data
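A hedged continuation: after reading an elbow off the sorted k-distance curve saved above, DBSCAN can be fit on the t-SNE-reduced data (the eps below is a placeholder, not a tuned value).
from sklearn.cluster import DBSCAN

# Hypothetical eps taken from the k-distance ("elbow") plot above.
db = DBSCAN(eps=0.90, min_samples=minPts).fit(reduced_data)
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("estimated clusters:", n_clusters, "| noise points:", int((labels == -1).sum()))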
Example #49
0
def _krige(X, y, coords, variogram_function, variogram_model_parameters,
           coordinates_type):
    """Sets up and solves the ordinary kriging system for the given
    coordinate pair. This function is only used for the statistics calculations.

    Parameters
    ----------
    X: ndarray
        float array [n_samples, n_dim], the input array of coordinates
    y: ndarray
        float array [n_samples], the input array of measurement values
    coords: ndarray
        float array [1, n_dim], point at which to evaluate the kriging system
    variogram_function: callable
        function that will be called to evaluate variogram model
    variogram_model_parameters: list
        user-specified parameters for variogram model
    coordinates_type: str
        type of coordinates in X array, can be 'euclidean' for standard
        rectangular coordinates or 'geographic' if the coordinates are lat/lon

    Returns
    -------
    zinterp: float
        kriging estimate at the specified point
    sigmasq: float
        mean square error of the kriging estimate
    """

    zero_index = None
    zero_value = False

    # calculate distance between points... need a square distance matrix
    # of inter-measurement-point distances and a vector of distances between
    # measurement points (X) and the kriging point (coords)
    if coordinates_type == 'euclidean':
        d = squareform(pdist(X, metric='euclidean'))
        bd = np.squeeze(cdist(X, coords[None, :], metric='euclidean'))

    # geographic coordinate distances still calculated in the old way...
    # assume X[:, 0] ('x') => lon, X[:, 1] ('y') => lat
    # also assume problem is 2D; check done earlier in initializing variogram
    elif coordinates_type == 'geographic':
        x1, x2 = np.meshgrid(X[:, 0], X[:, 0], sparse=True)
        y1, y2 = np.meshgrid(X[:, 1], X[:, 1], sparse=True)
        d = great_circle_distance(x1, y1, x2, y2)
        bd = great_circle_distance(X[:, 0], X[:, 1],
                                   coords[0] * np.ones(X.shape[0]),
                                   coords[1] * np.ones(X.shape[0]))

    # this check is done when initializing variogram, but kept here anyways...
    else:
        raise ValueError("Specified coordinate type '%s' "
                         "is not supported." % coordinates_type)

    # check if kriging point overlaps with measurement point
    if np.any(np.absolute(bd) <= 1e-10):
        zero_value = True
        zero_index = np.where(bd <= 1e-10)[0][0]

    # set up kriging matrix
    n = X.shape[0]
    a = np.zeros((n + 1, n + 1))
    a[:n, :n] = -variogram_function(variogram_model_parameters, d)
    np.fill_diagonal(a, 0.0)
    a[n, :] = 1.0
    a[:, n] = 1.0
    a[n, n] = 0.0

    # set up RHS
    b = np.zeros((n + 1, 1))
    b[:n, 0] = -variogram_function(variogram_model_parameters, bd)
    if zero_value:
        b[zero_index, 0] = 0.0
    b[n, 0] = 1.0

    # solve
    res = np.linalg.solve(a, b)
    zinterp = np.sum(res[:n, 0] * y)
    sigmasq = np.sum(res[:, 0] * -b[:, 0])

    return zinterp, sigmasq
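A hedged usage sketch for _krige (the linear variogram below is illustrative, not one of PyKrige's fitted models; the module-level imports the function relies on are assumed in scope):
import numpy as np

def linear_variogram(params, d):
    # gamma(h) = slope * h + nugget -- toy variogram for illustration only
    slope, nugget = params
    return slope * d + nugget

# Hypothetical data: five samples in the unit square, estimate at (0.5, 0.5).
X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.3, 0.7]])
y = np.array([1.0, 2.0, 1.5, 2.5, 1.8])
zinterp, sigmasq = _krige(X, y, np.array([0.5, 0.5]),
                          linear_variogram, [1.0, 0.0], 'euclidean')
print(zinterp, sigmasq)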
Example #50
0
 def _calc_max_dist(self):
     # Simplest possible max distance measure
     return distance.squareform(distance.pdist(self.points)).max()
Example #51
0
df = pd.DataFrame({'Max/Min topics': column1,'Nights': column2,'Number of topics': column3, 'Topics': column4})
print(df) # show the data frame

writer = pd.ExcelWriter('table of max and min number topics.xlsx', engine='xlsxwriter') # create an excel writer for the data frame
workbook = writer.book # define the excel workbook
df.to_excel(writer, 'Sheet1') # place the data frame on the first sheet of the excel file
worksheet = writer.sheets['Sheet1'] # define the worksheet
worksheet.set_column('B:Q',35) # set the column width for columns B up to Q, so we can see all the text in the cells
writer.save()


###########################################
# Hierarchical clustering with topic model
###########################################

dm = squareform(pdist(X, 'cosine'))  # 'cosine' is the metric used for the pairwise distances between documents
# we use the cosine distance because it works well for topic clustering

# creating a linkage matrix
linkage_object = linkage(dm, method='ward', metric='euclidean')
print(linkage_object) # linkage_object[i] will tell us which clusters were merged in the i-th pass

# calculate a full dendrogram
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(linkage_object,leaf_rotation=90.,leaf_font_size=8.,)
plt.show()

# we create a truncated dendrogram, which only shows the last p=15 out of our 989 merges.
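A hedged sketch of the truncated dendrogram announced in the comment above (figure size and label settings are arbitrary):
# Hypothetical continuation: show only the last p=15 merges of linkage_object.
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('cluster size')
plt.ylabel('distance')
dendrogram(linkage_object, truncate_mode='lastp', p=15,
           show_leaf_counts=True, leaf_rotation=90., leaf_font_size=8.)
plt.show()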
Example #52
0
def daal_pairwise_distances(X,
                            Y=None,
                            metric="euclidean",
                            n_jobs=None,
                            force_all_finite=True,
                            **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix
      inputs.
      ['nan_euclidean'] but it does not yet support sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.
        - 'allow-nan': accept only np.nan values in array. Values cannot
          be infinite.

        .. versionadded:: 0.22

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix, in
        order to limit memory usage.
    paired_distances : Computes the distances between corresponding
                       elements of two arrays
    """
    if (metric not in _VALID_METRICS and not callable(metric)
            and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X,
                                     Y,
                                     precomputed=True,
                                     force_all_finite=force_all_finite)
        whom = ("`pairwise_distances`. Precomputed distance "
                " need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    elif ((metric == 'cosine') and (Y is None) and (not issparse(X))
          and X.dtype == np.float64):
        return _daal4py_cosine_distance_dense(X)
    elif ((metric == 'correlation') and (Y is None) and (not issparse(X))
          and X.dtype == np.float64):
        return _daal4py_correlation_distance_dense(X)
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable,
                       metric=metric,
                       force_all_finite=force_all_finite,
                       **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if (dtype == bool
                and (X.dtype != bool or (Y is not None and Y.dtype != bool))):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X,
                                     Y,
                                     dtype=dtype,
                                     force_all_finite=force_all_finite)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
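A hedged usage sketch (random data; the function mirrors sklearn's pairwise_distances interface, so a plain call is enough to exercise it):
import numpy as np

# Hypothetical usage: cosine distances between 5 random 8-dimensional rows.
X = np.random.rand(5, 8)
D = daal_pairwise_distances(X, metric='cosine')
print(D.shape)                                   # (5, 5)
print(np.allclose(np.diag(D), 0.0, atol=1e-8))   # ~zero self-distance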
Example #53
0
def omeClust(data,
             metadata=config.metadata,
             resolution=config.resolution,
             output_dir=config.output_dir,
             estimated_number_of_clusters=config.estimated_number_of_clusters,
             linkage_method=config.linkage_method,
             plot=config.plot,
             size_to_plot=None,
             enrichment_method="nmi"):
    # read  input files
    data = pd.read_table(data, index_col=0, header=0)
    # print(data.shape)
    #print(data.index)
    #print(data.columns)

    if metadata is not None:
        metadata = pd.read_table(metadata, index_col=0, header=0)
        #   print(data.index)
        #print(metadata.index)
        ind = metadata.index.intersection(data.index)
        #diff = set(metadata.index).difference(set(data.index))
        #print (diff)
        #print(len(ind), data.shape[1], ind)
        if len(ind) != data.shape[0]:
            print(
                "The data and metadata have different numbers of rows; the number of common rows is: ",
                len(ind))
            print("The number of missing metadata rows is: ",
                  data.shape[0] - len(ind))
            # print("Metadata will not be used!!! ")
            # metadata = None
            # else:
            diff_rows = data.index.difference(metadata.index)
            #print (diff_rows)
            empty_section_metadata = pd.DataFrame(index=diff_rows,
                                                  columns=metadata.columns)
            metadata = pd.concat([metadata, empty_section_metadata])
        metadata = metadata.loc[data.index, :]
        #print (data, metadata)
        #data = data.loc[ind]
        #data = data.loc[ind, :]

    config.output_dir = output_dir
    check_requirements()
    data_flag = True

    if all(a == b for (a, b) in zip(data.columns, data.index)):
        df_distance = data
        data_flag = False
    else:
        df_distance = pd.DataFrame(squareform(
            pdist(data, metric=distance.pDistance)),
                                   index=data.index,
                                   columns=data.index)
    df_distance = df_distance[df_distance.values.sum(axis=1) != 0]
    df_distance = df_distance[df_distance.values.sum(axis=0) != 0]
    df_distance.to_csv(output_dir + '/adist.txt', sep='\t')
    # df_distance = stats.scale_data(df_distance, scale = 'log')
    # viz.tsne_ord(df_distance, cluster_members = data.columns)
    clusters = main_run(
        distance_matrix=df_distance,
        number_of_estimated_clusters=estimated_number_of_clusters,
        linkage_method=linkage_method,
        output_dir=output_dir,
        do_plot=plot,
        resolution=resolution)
    omeClust_enrichment_scores, sorted_keys = None, None
    shapeby = None
    if metadata is not None:
        omeClust_enrichment_scores, sorted_keys = utilities.omeClust_enrichment_score(
            clusters, metadata, method=enrichment_method)
        if len(sorted_keys) > 3:
            shapeby = sorted_keys[3]
            print(shapeby, " is the most influential metadata in clusters")
    else:
        omeClust_enrichment_scores, sorted_keys = utilities.omeClust_enrichment_score(
            clusters, metadata, method=enrichment_method)
    #print (omeClust_enrichment_scores, sorted_keys)
    dataprocess.write_output(clusters, output_dir, df_distance,
                             omeClust_enrichment_scores, sorted_keys)
    feature2cluster = dataprocess.feature2cluster(clusters, df_distance)
    feature2cluster_map = pd.DataFrame.from_dict(feature2cluster,
                                                 orient='index',
                                                 columns=['Cluster'])
    feature2cluster_map = feature2cluster_map.loc[data.index, :]
    feature2cluster_map.to_csv(output_dir + '/feature_cluster_label.txt',
                               sep='\t')
    if plot:
        if size_to_plot is None:
            size_to_plot = config.size_to_plot
        try:
            viz.pcoa_ord(df_distance,
                         cluster_members=dataprocess.cluster2dict(clusters),
                         size_tobe_colored=size_to_plot,
                         metadata=metadata,
                         shapeby=shapeby)
        except:
            pass
        try:
            viz.tsne_ord(df_distance,
                         cluster_members=dataprocess.cluster2dict(clusters),
                         size_tobe_colored=size_to_plot,
                         metadata=metadata,
                         shapeby=shapeby)
        except:
            pass
        try:
            viz.pca_ord(df_distance,
                        cluster_members=dataprocess.cluster2dict(clusters),
                        size_tobe_colored=size_to_plot,
                        metadata=metadata,
                        shapeby=shapeby)
        except:
            pass
        try:
            viz.mds_ord(df_distance,
                        cluster_members=dataprocess.cluster2dict(clusters),
                        size_tobe_colored=size_to_plot,
                        metadata=metadata,
                        shapeby=shapeby)
        except:
            pass
    # draw network
        max_dist = max(omeClust_enrichment_scores['branch_condensed_distance'])
        min_weight = df_distance.max().max() - max_dist
        viz.network_plot(D=df_distance,
                         partition=dataprocess.feature2cluster(clusters,
                                                               D=df_distance),
                         min_weight=min_weight)
    return feature2cluster_map
axs = axs.ravel()
colors = cm.rainbow(np.linspace(0,1,T))
for i in range(T):

    axs[i].scatter(x_latent_time[i,:,0],x_latent_time[i,:,1],color=colors[i],alpha=0.3)
    axs[i].set_title(str(i))
plt.show()

#**** plot some individual paths (by initial state?)
xdata_time = np.concatenate((dataset['state'],dataset['control']),axis=1)
xdata_time = xdata_time[:N,:,:]
xdata_time = np.transpose(xdata_time,(2,0,1)) #obs x knots x dim
# get  distance metric of initial conditions
xdata_x0 = np.squeeze(xdata_time[1,:,0:4]) # drop control
from scipy.spatial.distance import pdist,squareform
y = squareform(pdist(xdata_x0,'euclidean'))

# now get examples from one point, and plot sorted by distance to example point
import random
ind = random.randint(0, N - 1)  # randint is inclusive on both ends; keep the index in range
dist = y[ind,:]
idx = np.argsort(dist)

# plot vae examples
plt.scatter(x_latent[:,0],x_latent[:,1])
plt.xlabel('VAE 1')
plt.ylabel('VAE 2')

numex = 10
colors = cm.rainbow(np.linspace(0,1,numex))
for i in range(numex):
Example #55
0
'''

import numpy as np

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

import matplotlib.pyplot as plt


mat = np.array([[0.0, 2.0, 6.0, 10.0, 9.0],
                [2.0, 0.0, 5.0, 9.0,  8.0], 
                [6.0, 5.0, 0.0, 4.0,  5.0],
                [10.0, 9.0, 4.0, 0.0, 3.0],
                [9.0, 8.0, 5.0, 3.0,  0.0]])
dists = squareform(mat)
linkage_matrix = linkage(dists, "single")
dendrogram(linkage_matrix, labels=["0", "1", "2","3", "4"])
plt.title("test")
plt.show()

# How to calculate distance_matrix
from scipy.spatial import distance_matrix
p = dataset.iloc[:5,[2,4]].values

distance_matrix(p,p)

d = np.dot(p,p.T)
norm = (p**2).sum(0, keepdims=True)
d / norm
d / norm / norm.T
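A hedged cross-check (assuming p is the 5x2 array built above): SciPy's three routes to a Euclidean distance matrix agree.
import numpy as np
from scipy.spatial import distance_matrix
from scipy.spatial.distance import pdist, squareform, cdist

print(np.allclose(distance_matrix(p, p), squareform(pdist(p))))  # True
print(np.allclose(distance_matrix(p, p), cdist(p, p)))           # True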
def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix
      inputs.
      Also, ['masked_euclidean'] but it does not yet support sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this function,
        but returns a generator of chunks of the distance matrix, in order to
        limit memory usage.
    paired_distances : Computes the distances between corresponding
                       elements of two arrays
    """
    if (metric not in _VALID_METRICS and not callable(metric)
            and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric in _MASKED_METRICS or callable(metric):
        missing_values = kwds.get("missing_values") if kwds.get(
            "missing_values") is not None else np.nan

        if np.all(_get_mask(X.data if issparse(X) else X, missing_values)):
            raise ValueError(
                "One or more samples(s) only have missing values.")

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        return X
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        if n_jobs == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
Example #57
0
def CosineScore(M):

    cos_M = squareform(pdist(M, 'cosine'))
    alpha_cos = softmax(cos_M, axis=0)

    return np.sum(alpha_cos, axis=1)
Example #58
0
plt.show()

H = linkage(dataset1, 'complete')
plt.figure(figsize=(10, 10))
dendro = dendrogram(H, leaf_font_size=30)
plt.title('Dendrogram on microstructural dataset using complete linkage')
plt.show()

H = linkage(dataset1, 'single', metric='correlation')
plt.figure(figsize=(10, 10))
dendro = dendrogram(H, leaf_font_size=30)
plt.title('Dendrogram on microstructural dataset using single linkage')
plt.show()

# Distance matrix
dm = squareform(pdist(dataset1))
# For euclidean
h = sns.clustermap(dm, metric='euclidean')
plt.show()
# For jaccard
h = sns.clustermap(dm, metric='jaccard')
plt.show()
# For correlation
h = sns.clustermap(dm, metric='correlation')
plt.show()
# For single
h = sns.clustermap(dm, method='single')
plt.show()

# Gaussian mixture
g_m = GaussianMixture(n_components=72).fit(x)
Example #59
0
def manhattenScore(M):
    man_M = squareform(pdist(M, 'cityblock'))
    alpha_man = softmax(man_M, axis=0)

    return np.sum(alpha_man, axis=1)
def find_correlation_clusters(corr, corr_thresh):
    dissimilarity = 1.0 - corr
    hierarchy = linkage(squareform(dissimilarity), method='single')
    diss_thresh = 1.0 - corr_thresh
    labels = fcluster(hierarchy, diss_thresh, criterion='distance')
    return labels
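A hedged usage sketch (the DataFrame and threshold are made up; the correlation matrix is symmetrized and its diagonal pinned to 1 because squareform expects 1 - corr to be an exact distance matrix, and the scipy imports used by the function are assumed in scope):
import numpy as np
import pandas as pd

# Hypothetical data: 'a' and 'b' are highly correlated, 'c' is independent noise.
rng = np.random.default_rng(0)
base = rng.normal(size=200)
df = pd.DataFrame({'a': base,
                   'b': base + 0.05 * rng.normal(size=200),
                   'c': rng.normal(size=200)})
corr = df.corr().values
corr = (corr + corr.T) / 2.0      # enforce exact symmetry
np.fill_diagonal(corr, 1.0)       # so 1 - corr has an exactly zero diagonal
labels = find_correlation_clusters(corr, corr_thresh=0.8)
print(dict(zip(df.columns, labels)))  # 'a' and 'b' end up in the same cluster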