Beispiel #1
0
def _CalcMutualNearestNeighbors(hull_points, all_points):
    """Return the hull points that have no mutual nearest neighbor.

    For every point of ``all_points`` that is also in ``hull_points`` the
    indices of its three closest other points are collected (distances come
    from ``distance.pdist`` with its default metric).  A hull point counts as
    having a mutual neighbor when one of those three neighbors is itself a
    hull point whose own three-nearest set contains the hull point's index.

    Args:
        hull_points: container of points; only used for ``in`` tests.
        all_points: iterable of hashable points (coordinate sequences).

    Returns:
        The set of hull points for which no mutual neighbor was found.
    """
    # Materialise once so indices into the distance matrix stay aligned
    # with the points even when all_points is a one-shot iterable.
    all_points_list = list(all_points)
    square_ds = distance.squareform(distance.pdist(all_points_list))

    nearest_neighbors = {}
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        # Sorting (distance, index) pairs breaks distance ties by index.
        ranked = sorted((d, j) for j, d in enumerate(square_ds[i]) if j != i)
        nearest_neighbors[point] = {j for _, j in ranked[:3]}

    no_mutual = set()
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        # Non-hull neighbors have no entry and therefore can never be mutual.
        has_mutual = any(
            i in nearest_neighbors.get(all_points_list[j], ())
            for j in nearest_neighbors[point]
        )
        if not has_mutual:
            no_mutual.add(point)

    return no_mutual
def spectral_partition(W, q, method='complete', metric='cosine'):
    """Partition via a spectral embedding followed by hierarchical clustering.

    ``Kmatrix(W)`` is decomposed into ``q`` components: an eigen-decomposition
    when ``W`` is square, otherwise a singular value decomposition whose left
    and (transposed) right vectors are stacked row-wise.  The component with
    the largest eigen/singular value is dropped, the real parts of the rest
    are clustered with ``linkage`` and the tree is cut with
    ``fcluster(..., criterion='maxclust')``.

    Args:
        W: 2-D array-like exposing ``.shape``.
        q: number of components requested and the ``maxclust`` value.
        method: linkage method handed to ``linkage``.
        metric: distance metric handed to ``pdist`` and ``linkage``.

    Returns:
        dict with the single key ``'spectral'`` holding the ``fcluster``
        labels shifted down by one.
    """
    n_rows, n_cols = W.shape
    K = Kmatrix(W)

    if n_rows != n_cols:
        # Rectangular input: singular vectors of both sides form the embedding.
        try:
            left, values, right = linalg.svds(K, q)
        except AttributeError:
            left, values, right = linalg.svd(K, q)
        vectors = np.concatenate((left, right.T), axis=0)
    else:
        try:
            values, vectors = linalg.eigen(K, q)
        except TypeError:
            values, vectors = linalg.eigs(K, q)

    # Discard the column that belongs to the largest value.
    embedding = np.real(np.delete(vectors, values.argmax(), axis=1))

    condensed = distance.pdist(embedding, metric=metric)
    # Zero out negative distances while leaving everything else untouched.
    condensed = (condensed >= 0) * condensed

    tree = linkage(condensed, method=method, metric=metric)
    labels = fcluster(tree, q, criterion='maxclust')
    labels -= 1

    return {'spectral': labels}
Beispiel #3
0
def _CalcMutualNearestNeighbors(hull_points, all_points):
    """Return the hull points that have no mutual nearest neighbor.

    Each point of ``all_points`` that is also in ``hull_points`` gets the
    index set of its three closest other points (distances come from
    ``distance.pdist`` with its default metric).  A hull point has a mutual
    neighbor when one of those neighbors is itself a hull point whose own
    three-nearest set contains the hull point's index.

    Args:
        hull_points: container of points; only used for ``in`` tests.
        all_points: iterable of hashable points (coordinate sequences).

    Returns:
        The set of hull points for which no mutual neighbor was found.
    """
    # A single list keeps matrix indices and points aligned, including for
    # one-shot iterables.
    all_points_list = list(all_points)
    square_ds = distance.squareform(distance.pdist(all_points_list))

    nearest_neighbors = {}
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        # (distance, index) pairs: ties in distance are resolved by index.
        ranked = sorted((d, j) for j, d in enumerate(square_ds[i]) if j != i)
        nearest_neighbors[point] = {j for _, j in ranked[:3]}

    no_mutual = set()
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        # Neighbors outside the hull have no entry, so they never match.
        has_mutual = any(
            i in nearest_neighbors.get(all_points_list[j], ())
            for j in nearest_neighbors[point]
        )
        if not has_mutual:
            no_mutual.add(point)

    return no_mutual
Beispiel #4
0
def StdDist(points):
    """Returns the standard deviation of the pairwise distances.

    Args:
        points: an Nx2 matrix of points.

    Returns:
        The (population) standard deviation of all pairwise distances
        produced by ``distance.pdist`` with its default metric.
    """
    ds = distance.pdist(points)
    # The previous implementation returned the mean, contradicting both the
    # function name and the docstring.
    return ds.std()
Beispiel #5
0
def StdDist(points):
    """Returns the standard deviation of the pairwise distances.

    Args:
        points: an Nx2 matrix of points.

    Returns:
        The (population) standard deviation of all pairwise distances
        produced by ``distance.pdist`` with its default metric.
    """
    ds = distance.pdist(points)
    # Returning the mean here contradicted the function name and docstring.
    return ds.std()
def plt_cluster(source_path, result_path):
    """Draw and save the dendrogram of an average-linkage clustering.

    Args:
        source_path: path of a CSV file that has a ``neighbor`` column,
            which is used as the index.
        result_path: prefix for the output file; it is concatenated
            directly with ``'plot_dendrogram.png'``, so it has to end with
            a path separator when it names a directory.
    """
    # Close the file handle deterministically once the CSV is parsed.
    with open(source_path, 'rb') as source_file:
        data = pd.read_csv(source_file, index_col='neighbor')

    # Condensed matrix of pairwise Euclidean distances between the rows.
    disMat = distance.pdist(data, metric='euclidean')
    # Hierarchical clustering with average linkage.
    Z = linkage(disMat, method='average')
    # Render the hierarchy as a dendrogram on the current figure and save it.
    dendrogram(Z)
    plt.savefig(result_path + 'plot_dendrogram.png')
Beispiel #7
0
def main(assets, start_date, end_date, plot_original=False):
    """Draw the minimum spanning tree and a dendrogram for a set of assets.

    Parameters
    ----------
    assets: list
        list of assets
    start_date: string
        start date of asset prices
    end_date: string
        end date of asset prices
    plot_original: bool
        when true, the full graph is drawn before the MST
    """

    def new_axes(width, height):
        # Fresh single-axes figure with the requested size in inches.
        figure, axes = plt.subplots(1, 1)
        figure.set_size_inches(width, height)
        return axes

    prices = asset_prices(assets, start_date, end_date)
    close_prices = prices["Close"]
    # log(x) - log(y) == log(x / y); the first row has no predecessor.
    price_ratio = close_prices / close_prices.shift(1)
    log_returns = np.log(price_ratio)[1:]
    distances = np.sqrt(2 * (1 - log_returns.corr()))

    # Graph over the distance matrix.
    graph = Graph(distances)
    edges = graph.weighted_edges()
    nodes = graph.vertices()

    # Optional plot of every node and edge.
    if plot_original:
        draw_graph(
            nodes,
            edges,
            new_axes(14.5, 10.5),
            graph_layout=nx.spring_layout,
            title="Original Network Graph",
        )

    # Minimum spanning tree.
    mst = graph.kruskal_mst()
    draw_graph(
        nodes,
        mst,
        new_axes(14.5, 10.5),
        graph_layout=nx.spring_layout,
        title="MST Network Graph",
    )

    # Dendrogram of a complete-linkage clustering.
    ax = new_axes(20.5, 8.5)
    condensed = distance.pdist(distances.values)
    dendrogram(linkage(condensed, method="complete"), labels=distances.columns)
    ax.set_title("Dendogram of MST", fontsize=24)
    plt.show()
Beispiel #8
0
def linkage_matrix_rep(sim_matrix):
    """Pick the linkage method with the highest cophenetic correlation.

    The ``average``, ``single``, ``complete`` and ``weighted`` methods are
    tried in that order; the first one reaching the best correlation wins.

    Args:
        sim_matrix: 2-D array-like.  Both ``linkage`` and ``pdist`` receive
            it unchanged, i.e. its rows are treated as observation vectors.

    Returns:
        Tuple ``(c_final, method_final, final_linkage, cd_final)``: the best
        cophenetic correlation, the method name, its linkage matrix and the
        condensed cophenetic distances.  When no method scores above 0.0
        (for example a NaN correlation) the result is
        ``(0.0, '', linkage(sim_matrix), <its cophenetic distances>)``.
    """
    methods = ['average', 'single', 'complete', 'weighted']
    # Loop-invariant: the original pairwise distances are the same for
    # every linkage method.
    pairwise = distance.pdist(sim_matrix)

    c_final = 0.0
    method_final = ''
    final_linkage = None
    cd_final = None
    for method in methods:
        linkage_matrix = linkage(sim_matrix, method=method)
        c, coph_dists = cophenet(linkage_matrix, pairwise)
        if c > c_final:
            c_final = c
            final_linkage = linkage_matrix
            method_final = method
            cd_final = coph_dists

    if final_linkage is None:
        # No method beat the initial score: fall back to the default linkage
        # and give it matching cophenetic distances, so the return statement
        # no longer hits an unbound variable.
        final_linkage = linkage(sim_matrix)
        cd_final = cophenet(final_linkage)

    return c_final, method_final, final_linkage, cd_final
Beispiel #9
0
def _CalcDensities(hull_points, all_points):
    """Count, for every hull point, the points within one std of distance.

    The threshold is the standard deviation of all pairwise distances
    (``distance.pdist`` with its default metric).  The count for a point
    includes the point itself, because its distance to itself is zero.

    Args:
        hull_points: container of points; only used for ``in`` tests.
        all_points: iterable of hashable points (coordinate sequences).

    Returns:
        Tuple ``(densities, std_d)`` where ``densities`` is a list of
        ``(count, point)`` pairs sorted in descending order and ``std_d`` is
        the distance threshold that was used.
    """
    # Materialise once so the matrix rows line up with the enumeration even
    # when all_points can only be iterated a single time.
    all_points_list = list(all_points)
    ds = distance.pdist(all_points_list)
    std_d = ds.std()

    square_ds = distance.squareform(ds)
    densities = {}
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        densities[point] = int((square_ds[i] <= std_d).sum())

    # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
    tmp_densities = sorted(((d, pt) for pt, d in densities.items()),
                           reverse=True)
    return tmp_densities, std_d
Beispiel #10
0
def _CalcDensities(hull_points, all_points):
    """Count, for every hull point, the points within one std of distance.

    The threshold is the standard deviation of all pairwise distances
    (``distance.pdist`` with its default metric).  The count for a point
    includes the point itself, because its distance to itself is zero.

    Args:
        hull_points: container of points; only used for ``in`` tests.
        all_points: iterable of hashable points (coordinate sequences).

    Returns:
        Tuple ``(densities, std_d)`` where ``densities`` is a list of
        ``(count, point)`` pairs sorted in descending order and ``std_d`` is
        the distance threshold that was used.
    """
    # One list keeps matrix rows and enumeration aligned, also for one-shot
    # iterables.
    all_points_list = list(all_points)
    ds = distance.pdist(all_points_list)
    std_d = ds.std()

    square_ds = distance.squareform(ds)
    densities = {}
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        densities[point] = int((square_ds[i] <= std_d).sum())

    # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
    tmp_densities = sorted(((d, pt) for pt, d in densities.items()),
                           reverse=True)
    return tmp_densities, std_d
def cluster(source_path, result_path):
    """Cluster the rows of a CSV file hierarchically and export the result.

    Writes a dendrogram image, one CSV file per cluster and one line plot
    per cluster below ``result_path``.

    Args:
        source_path: path of a CSV file that has an ``address`` column,
            which is used as the index.
        result_path: output prefix; it is concatenated directly with the
            file and directory names, so it has to end with a path
            separator when it names a directory.
    """
    # Close the file handle deterministically once the CSV is parsed.
    with open(source_path, 'rb') as source_file:
        data = pd.read_csv(source_file, index_col='address')

    # Condensed matrix of pairwise Euclidean distances between the rows.
    disMat = distance.pdist(data, metric='euclidean')
    Z = linkage(disMat, method='average')  # hierarchical clustering
    dendrogram(Z)  # draw the hierarchy as a dendrogram and save it
    plt.savefig(result_path + 'plot_dendrogram.png')
    # Cut the tree into at most 6 clusters; the local is named ``labels`` so
    # it no longer shadows this function's own name.
    labels = fcluster(Z, t=6, criterion='maxclust')
    k = len(np.unique(labels))  # number of clusters actually produced

    # Output directories for the per-cluster images and CSV files.
    if not os.path.exists(result_path + 'imgs/'):
        os.makedirs(result_path + 'imgs/')
    if not os.path.exists(result_path + 'csv/'):
        os.makedirs(result_path + 'csv/')

    # Original data with the cluster label appended as the last column.
    r = pd.concat([data, pd.Series(labels, index=data.index)],
                  axis=1)
    r.columns = list(data.columns) + [u'聚类类别']  # rename the header

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

    # One line style per cluster; six entries match the maxclust limit above.
    style = ['ro-', 'go-', 'bo-', 'co-', 'mo-', 'yo-']
    xlabels = [u'工作日工作时段', u'工作日通勤时段', u'周末日间', u'凌晨']
    pic_output = result_path + 'imgs/type_'  # file name prefix of the plots

    for i in range(1, k + 1):  # one figure per cluster
        plt.figure()
        # First four columns of the rows that belong to cluster i.
        tmp = r[r[u'聚类类别'] == i].iloc[:, :4]
        tmp.to_csv(result_path + 'csv/类别%s.csv' % (i))  # one CSV per cluster

        for j in range(len(tmp)):  # one line per row
            plt.plot(range(1, 5), tmp.iloc[j], style[i - 1])
            plt.xticks(range(1, 5), xlabels, rotation=20)  # axis tick labels
        plt.title(u'门洞类别%s' % (i))  # cluster numbers start at 1
        plt.subplots_adjust(bottom=0.15)  # leave room for the rotated labels
        plt.savefig(u'%s%s.png' % (pic_output, i))  # save the figure
Beispiel #12
0
def calc_distance_matrix(gene_informative,
                         ignore_indels=True,
                         metric='jaccard'):
    """
    Calculate a pairwise distance matrix from a
    pileup of reads across informative sites

    Args:
        gene_informative: 2-D array (reads x sites) exposing ``.shape``.
        ignore_indels: accepted for backward compatibility; it currently has
            no effect on the result (see the note in the body).
        metric: metric name handed to ``distance.pdist``.

    Returns:
        The condensed distance matrix returned by ``distance.pdist``.
    """
    n_reads, _n_sites = gene_informative.shape
    if n_reads > 1000:
        print("Greater than 1000 reads!")
        print("... consider downsampling")

    # NOTE(review): the previous code built a float copy with -1 replaced by
    # NaN when ignore_indels was set, but never passed that copy to pdist, so
    # the flag never changed the output.  The unused copy was dropped to
    # avoid the wasted allocation; results are unchanged.  If indel sites are
    # really meant to be excluded, that needs an explicit masking strategy --
    # confirm the intended behaviour before changing the distances.
    return distance.pdist(gene_informative, metric=metric)
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, distance
from matplotlib import pyplot as plt
import scipy
import numpy as np

# Draw 10 random samples with 4 dimensions each.
points = np.random.randn(10, 4)
# Condensed matrix of pairwise Euclidean distances between the samples.
dist_mat = distance.pdist(points, 'euclidean')
print(dist_mat)

# Hierarchical clustering with Ward's method; the result records the
# sequence of merges of the cluster tree.
Z = linkage(dist_mat, method='ward')
print(Z)

# Cut the tree at distance threshold 2; a different threshold merges
# different clusters.  The result holds one cluster label per sample.
f = fcluster(
    Z, 2, criterion='distance')
print(f)

# Draw the hierarchy as a dendrogram.
den = dendrogram(Z)
plt.show()
Beispiel #14
0
    d      = d_sr[i]
    l1     = d * n2 / (n1 + n2)
    l2     = d * n1 / (n1 + n2)

    # the first branching
    if i == 2*n - 2:
      pos_df.loc[c1] = (l1, 0)
      pos_df.loc[c2] = (-l2, 0)
    elif d == 0:
      pos_df.loc[c1] = pos_df.loc[i]
      pos_df.loc[c2] = pos_df.loc[i]
    else:
      pos_s  = pos_df.loc[sis_sr[i]]  # sister node
      pos_i  = pos_df.loc[i]
      L      = np.linalg.norm(pos_s - pos_i)
      th     = get_actual_theta(n1, n2, l1, l2, L)
      phi    = np.angle(np.complex(*(pos_i - pos_s)))
      psi    = phi + th - np.pi
      pos_df.loc[c1] = pos_i + [l1 * np.cos(psi), l1 * np.sin(psi)]
      pos_df.loc[c2] = pos_i - [l2 * np.cos(psi), l2 * np.sin(psi)]

      
  return pos_df.iloc[:n]

# Small demo: embed the hierarchy of five random 3-D points.
if __name__ == '__main__':
  from scipy.cluster.hierarchy import distance, linkage
  X = np.random.randn(5,3)                     # data matrix: 5 observations x 3 features
  Y = distance.pdist(X, metric='euclidean')    # condensed pairwise distance matrix
  Z = linkage(Y, method='average')             # linkage matrix (average linkage)
  pos_df = branching_embedding(Z)  
def getImpCat():
    """Cluster the selected documents hierarchically and dump the tree.

    The vectors of the ids returned by ``getImpIds(imp_ids_num)`` are
    clustered with Ward linkage on their pairwise Euclidean distances.
    One record per tree node is then written to ``tree_path`` through
    ``writeJson``.  Records are keyed by the node's linkage index (not by
    the document id) and contain:

    * ``child_num``: fourth column of the node's linkage row; 0 for leaves
    * ``parent``: linkage index of the parent node, ``None`` for the root
    * ``childs``: linkage indices of the two merged nodes; empty for leaves
    """
    ids = getImpIds(imp_ids_num)  # alternative id source: getAllIds()
    vecs = [docvecs[_id] for _id in ids]

    print(len(vecs), len(vecs[0]))
    disMat = distance.pdist(vecs, 'euclidean')

    # Ward clustering on the pre-computed distances (other methods such as
    # 'average' are possible as well).
    print('开始计算')
    linkage_matrix = linkage(disMat, method='ward',
                             optimal_ordering=True)

    def getTree(linkage_matrix):
        """Build ``{linkage index: TreeNode}`` from a linkage matrix."""

        class TreeNode(object):
            def __init__(self, _id):
                self.id = _id
                self.parent = None
                # A list keeps the (first, second) merge order; the former
                # set of identity-hashed nodes made the order arbitrary.
                self.childs = []
                self.child_num = 0

        id2node = {}

        def get_node(_id):
            # Create nodes lazily so each id maps to exactly one object.
            node = id2node.get(_id)
            if node is None:
                node = id2node[_id] = TreeNode(_id)
            return node

        rows = linkage_matrix.tolist()
        # A linkage matrix has one row per merge, i.e. (observations - 1)
        # rows, and the cluster formed by row i gets index observations + i.
        # Deriving the count from the matrix keeps parent ids correct even
        # when fewer than imp_ids_num ids were returned.
        leaf_count = len(rows) + 1
        for step, row in enumerate(rows):
            node1 = get_node(int(row[0]))
            node2 = get_node(int(row[1]))
            parent_node = get_node(leaf_count + step)
            node1.parent = parent_node
            node2.parent = parent_node
            parent_node.childs.extend((node1, node2))
            parent_node.child_num = int(row[3])
        return id2node

    tree = getTree(linkage_matrix)
    result = {}
    for _id, item in tree.items():
        result[_id] = {
            'child_num': item.child_num,
            'parent': None if (item.parent is None) else item.parent.id,
            'childs': [child.id for child in item.childs],
        }
    writeJson(tree_path, result)
Beispiel #16
0
# Cosine distance between documents: 1 - cosine similarity of the TF-IDF rows.
doc_sim =1-cosine_similarity(tfidf_matrix)
print (doc_sim)



# Hierarchical clustering with the centroid method.
# NOTE(review): doc_sim is a 2-D array; SciPy's linkage() and pdist() treat
# a 2-D input as observation vectors, not as a precomputed distance matrix.
# Confirm this is intended (a condensed matrix would be needed otherwise).

linkage_matrix = linkage(doc_sim,method='centroid')
# Alternative cut by distance threshold (disabled):
#assignments = fcluster(linkage_matrix,1,criterion='distance')


#assignments = fcluster(,4,'distance')

print(linkage_matrix)
# Cophenetic correlation between the tree and the pairwise row distances.
c, coph_dists = cophenet(linkage_matrix, distance.pdist(doc_sim))
print (c)
# Cut the tree into at most 4 clusters.
assignments =fcluster(linkage_matrix, 4, 'maxclust')

# One row per document with its cluster label, written tab-separated.
cluster_doc = pd.DataFrame({'doc':doc_name , 'cluster':assignments})
print(cluster_doc)

cluster_doc.to_csv('doc_cluster.csv',sep='\t')

fig, ax = plt.subplots(figsize=(15, 20)) # set size
# NOTE(review): this rebinds ax to dendrogram()'s return value, so the Axes
# created above is no longer reachable through ax -- confirm nothing below
# relies on it.
ax = dendrogram(linkage_matrix, orientation="left", labels=doc_name);

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off