Example #1
    def computeLinkage( self, printDendogram = False ):
        # generate two clusters: a with 100 points, b with 50:
        #np.random.seed(4711)  # for repeatability of this tutorial
        #a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[100,])
        #b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[50,])
        #X = np.concatenate((a, b),)
        self.X = array(list(self.buildingAverages.values()))  # list() so this works with Python 3 dict views
        #print X  # 150 samples with 2 dimensions
        #plt.scatter(X[:,0], X[:,1])
        #plt.show()

        # generate the linkage matrix
        self.Z = linkage(self.X, 'ward')

        c, coph_dists = cophenet(self.Z, pdist(self.X))

        if (printDendogram):
            # calculate full dendrogram
            plt.figure(figsize=(25, 10))

            plt.title('Hierarchical Clustering Dendrogram (truncated)')
            plt.xlabel('Dendrogram of Dartmouth campus building clusters')
            plt.ylabel('distance')
            dendrogram(
                self.Z,
                #truncate_mode='lastp',  # show only the last p merged clusters
                #p=20,  # show only the last p merged clusters
                show_leaf_counts=True,  # otherwise numbers in brackets are counts
                leaf_rotation=90.,
                leaf_font_size=12.,
                show_contracted=True,  # to get a distribution impression in truncated branches
            )
            plt.show()

        return self.Z
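A standalone sketch (not part of the class above) of the same linkage step on synthetic points, assuming only NumPy and SciPy; fcluster cuts the returned matrix into flat clusters:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

rng = np.random.default_rng(0)
points = np.vstack([rng.normal(0, 1, size=(20, 2)),
                    rng.normal(8, 1, size=(20, 2))])
Z = linkage(points, 'ward')
flat_labels = fcluster(Z, t=2, criterion='maxclust')  # cut into two flat clusters
print(flat_labels)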
Example #2
def make_good_heatmap(D):
    data_dist = 1. - D
    np.fill_diagonal(data_dist, 0.)
    data_dist = squareform(data_dist)

    # Compute and plot first dendrogram.
    fig = plt.figure()
    # x-y width height
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Y = linkage(data_dist, method='complete')
    Z1 = dendrogram(Y, orientation='right',  color_threshold=.7)
    ax1.set_xticks([])
    ax1.set_yticks([])

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Z2 = dendrogram(Y, color_threshold=.7)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Compute and plot the heatmap
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = D[idx1,:]
    D = D[:,idx2]
    im = axmatrix.matshow(D, aspect='auto', origin='lower')
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # Plot colorbar.
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    plt.colorbar(im, cax=axcolor)
    plt.show()
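A hedged usage sketch for make_good_heatmap, assuming the module-level imports the excerpt relies on (numpy as np, matplotlib.pyplot as plt, squareform, linkage, dendrogram); D should be a symmetric similarity matrix in [0, 1] with ones on the diagonal:

import numpy as np

rng = np.random.default_rng(1)
samples = rng.normal(size=(10, 4))
similarity = (np.corrcoef(samples) + 1) / 2  # rescale correlations to [0, 1]
make_good_heatmap(similarity)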
Example #3
def draw_intensity(a, cmap=GREEN_CMAP, metric='euclidean', method='average', sort_x=True, sort_y=True):
    main_axes = plt.gca()
    divider = make_axes_locatable(main_axes)

    if sort_x is True:
        plt.sca(divider.append_axes("top", 0.5, pad=0))
        xlinkage = linkage(pdist(a.T, metric=metric), method=method, metric=metric)
        xdendro = dendrogram(xlinkage, orientation='top', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        a = a[[a.columns[i] for i in xdendro['leaves']]]

    if sort_y is True:
        plt.sca(divider.append_axes("left", 1.0, pad=0))
        ylinkage = linkage(pdist(a, metric=metric), method=method, metric=metric)
        ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        a = a.loc[[a.index[i] for i in ydendro['leaves']]]  # .ix was removed from pandas; use .loc

    plt.sca(main_axes)
    plt.imshow(a, aspect='auto', interpolation='none',
               cmap=cmap, vmin=0.0, vmax=1.0)
    plt.colorbar(pad=0.15)
    plt.gca().yaxis.tick_right()
    plt.xticks(range(a.shape[1]), a.columns, rotation=90, size='small')
    plt.yticks(range(a.shape[0]), a.index, size='x-small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()

    plt.show()
Example #4
def save_mat(c2map, filepath):
	mat = c2map['mat']
	fig = pylab.figure(figsize=(8,8))
	
	# Compute and plot first dendrogram.
	ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
	Y = sch.linkage(mat, method='centroid')
	Z1 = sch.dendrogram(Y, orientation='right')
	ax1.set_xticks([])
	ax1.set_yticks([])

	# Compute and plot second dendrogram.
	ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
	Y = sch.linkage(mat, method='single')
	Z2 = sch.dendrogram(Y)
	ax2.set_xticks([])
	ax2.set_yticks([])

	# Plot distance matrix.
	axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
	idx1 = Z1['leaves']
	idx2 = Z2['leaves']
	mat = mat[idx1,:]
	mat = mat[:,idx2]
	im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
	axmatrix.set_xticks([])
	axmatrix.set_yticks([])

	# Plot colorbar.
	axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
	pylab.colorbar(im, cax=axcolor)

	fig.savefig(filepath)
Example #5
def plot_dist_matrix(matrix, fasta_names, heatmap_out, dendrogram_out):
    """Cluster the distance matrix hierarchically and plot using seaborn.
    Average linkage method is used."""
    # Load required modules for plotting
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    from scipy.cluster.hierarchy import dendrogram, linkage

    # Create a labeled DataFrame from the distance matrix
    pdm = pd.DataFrame(matrix, index=fasta_names, columns=fasta_names)

    # Plot heatmap
    figsizex = max(10, len(fasta_names) / 4)
    clustergrid = sns.clustermap(pdm, metric='euclidean', method='average',
            figsize=(figsizex, figsizex))
    clustergrid.savefig(heatmap_out)

    # Plot dendrogram
    sns.set_style('white')
    figsizey = max(10, len(fasta_names) / 8)
    f, ax = plt.subplots(figsize=(figsizex, figsizey))
    link = linkage(pdm, metric='euclidean', method='average')
    dendrogram(link, labels=pdm.index, ax=ax)
    no_spine = {'left': True, 'bottom': True, 'right': True, 'top': True}
    sns.despine(**no_spine)
    plt.xticks(rotation=90)
    f.tight_layout()
    plt.savefig(dendrogram_out)
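A hypothetical call with a toy symmetric distance matrix; the contig names and output file names are invented:

import numpy as np

toy_distances = np.array([[0.0, 0.2, 0.9],
                          [0.2, 0.0, 0.8],
                          [0.9, 0.8, 0.0]])
plot_dist_matrix(toy_distances, ['contig_a', 'contig_b', 'contig_c'],
                 'heatmap.png', 'dendrogram.png')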
Example #6
def make_dendrogram_w(LinkageMatrix, GraphFolder, 
                    Method, Metric, CorrCoeff, Labels, Colors,
                    DisplayLevels):
    import matplotlib
    if not os.path.exists(GraphFolder):
        os.makedirs(GraphFolder)
    plt.figure(figsize=(12,24))
    plt.title("Plays clustered by topic probabilities", fontsize=14)
    #plt.ylabel("Parameters: "+Method+" method, "+Metric+" metric. CorrCoeff: "+str(CorrCoeff)+".")
    plt.xlabel("Distance\n(Parameters: "+Method+" / "+Metric+")", fontsize=12)
    matplotlib.rcParams['lines.linewidth'] = 1.2
    dendrogram(
        LinkageMatrix,
        p = DisplayLevels,
        truncate_mode="level",
        color_threshold = 30,
        show_leaf_counts = True,
        no_labels = False,
        orientation="left",
        labels = Labels, 
        leaf_rotation = 0,  # rotates the x axis labels
        leaf_font_size = 4,  # font size for the x axis labels
        )
    #plt.show()
    plt.savefig(GraphFolder+"dendrogram_"+Method+"-"+Metric+"-"+str(DisplayLevels)+".png", dpi=300, figsize=(12,18), bbox_inches="tight")
    plt.close()
Example #7
def buildEmbeddingsTree(indexMap, embeddings, comparator=None):
    embeddingsCount = len(embeddings)
    embeddingIndices = numpy.arange(0, embeddingsCount)
    xy = [xy for xy in itertools.product(embeddingIndices, embeddingIndices)]

    comparator = lambda a, b: vectors.euclideanDistance(a, b) + 1 / (2 + 2*vectors.cosineSimilarity(a, b))

    function = lambda xy: comparator(embeddings[xy[0]], embeddings[xy[1]]) if xy[0] != xy[1] else 0
    comparisons = list(map(function, xy))  # materialize; the values are reused for max() and reshape()
    maxComparison = max(comparisons)
    comparisons = numpy.reshape(comparisons, (embeddingsCount, embeddingsCount)) / maxComparison
    comparisons = ssd.squareform(comparisons)
    links = linkage(comparisons)

    fig, ax = plt.subplots()
    fig.subplots_adjust(right=0.8)

    names = map(lambda nameIndexPair: nameIndexPair[0].split('/')[-1], indexMap.items())
    names = sorted(names)
    dendrogram(
        links,
        leaf_rotation=90.,
        leaf_font_size=8.,
        orientation='right',
        labels=names,
        show_contracted=True,
        show_leaf_counts=True)

    plt.show()
Example #8
def plot_corr_dendrogram(
    corr, cluster_method='weighted', **dendrogram_kwargs):
    """
    Plot a correlation matrix as a dendrogram (on the current axes).

    Parameters
    ----------
    corr : numpy ndarray or pandas DataFrame
    cluster_method : String
        Method to use to amalgamate clusters.
        Either 'single', 'complete', 'average', or 'weighted'.
        See scipy.cluster.hierarchy.linkage for details.
    dendrogram_kwargs : Additional kwargs
        Pass to the call of scipy.cluster.hierarchy.dendrogram()
    """
    # Convert to a DataFrame in all cases.
    if not isinstance(corr, pd.DataFrame):
        names = range(len(corr))
    else:
        names = corr.index.tolist()
        corr = corr.values

    dist = (1 - corr) / 2.
    Z = linkage(squareform(dist), method=cluster_method)

    dendrogram(Z, labels=names, **dendrogram_kwargs)
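A hedged example call, assuming the pandas/SciPy names the excerpt leaves to module-level imports (pd, squareform, linkage, dendrogram) plus matplotlib for display; the leaf labels come from the DataFrame index:

import pandas as pd
import matplotlib.pyplot as plt

toy_corr = pd.DataFrame([[1.0, 0.8, 0.1],
                         [0.8, 1.0, 0.0],
                         [0.1, 0.0, 1.0]],
                        index=['a', 'b', 'c'], columns=['a', 'b', 'c'])
plot_corr_dendrogram(toy_corr, cluster_method='average')
plt.show()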
Example #9
def city_comparison(file_array):
    dphi = 40
    step = 1.0/dphi
    cities_fingerprint = np.zeros([len(file_array),2,dphi],'double')
    indx = 0
    for file in file_array:
        [alph1,alph2] = fingerprint(file, dphi, indx, plot=False)
        # print alph1,alph2
        cities_fingerprint[indx,0,:] = alph1
        cities_fingerprint[indx,1,:] = alph2
        indx+=1
    distance_matrix = np.zeros([len(file_array),len(file_array)],'double')
    for i in range(len(file_array)):
        city_i = cities_fingerprint[i,:,:]
        d_cities = np.square(cities_fingerprint - city_i)*0.01
        d_cities = np.sum(d_cities,2)
        D_cities = np.sum(np.square(d_cities),1)
        distance_matrix[i,:] = D_cities
    distance_condensed = squareform(distance_matrix, checks=False)
    linkage_matrix = linkage(distance_condensed)
    g = plt.figure(indx)
    dendrogram(linkage_matrix, orientation='right')
    plt.title('Dendrogram\nHierarchical Clustering of %d Cities'%len(file_array))
    plt.show()
    return True
Example #10
def HierarchicalCluster(A):
    #see http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python
    Corr = np.corrcoef(A.T)
    fig = plt.figure(figsize=(8,8))
    ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
    Y = hrc.linkage(Corr, method='centroid')
    Z1 = hrc.dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
    Y = hrc.linkage(Corr, method='centroid')
    Z2 = hrc.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    Corr = Corr[idx1, :]
    Corr = Corr[:, idx2]
    im = axmatrix.matshow(Corr, aspect='auto', origin='lower')

    axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
    pylab.colorbar(im, cax=axcolor)
    fig.show()
    fig.savefig('dendrogram.png')
Example #11
    def check_dendrogram_plot(self, orientation):
        # Tests dendrogram plotting.
        Z = linkage(hierarchy_test_data.ytdist, 'single')
        expected = {'color_list': ['g', 'b', 'b', 'b', 'b'],
                    'dcoord': [[0.0, 138.0, 138.0, 0.0],
                               [0.0, 219.0, 219.0, 0.0],
                               [0.0, 255.0, 255.0, 219.0],
                               [0.0, 268.0, 268.0, 255.0],
                               [138.0, 295.0, 295.0, 268.0]],
                    'icoord': [[5.0, 5.0, 15.0, 15.0],
                               [45.0, 45.0, 55.0, 55.0],
                               [35.0, 35.0, 50.0, 50.0],
                               [25.0, 25.0, 42.5, 42.5],
                               [10.0, 10.0, 33.75, 33.75]],
                    'ivl': ['2', '5', '1', '0', '3', '4'],
                    'leaves': [2, 5, 1, 0, 3, 4]}

        fig = plt.figure()
        ax = fig.add_subplot(111)

        # test that dendrogram accepts ax keyword
        R1 = dendrogram(Z, ax=ax, orientation=orientation)
        plt.close()
        assert_equal(R1, expected)

        # test plotting to gca (will import pylab)
        R2 = dendrogram(Z, orientation=orientation)
        plt.close()
        assert_equal(R2, expected)
Example #12
def cengci(data):
    X = data
    distMatrix = pdist(X)
    Z = linkage(X, 'ward')
    c, coph_dists = cophenet(Z, pdist(X))
    print(c)
    dendrogram(Z)
Example #13
def clustering(X, labels, algo='hcluster', n_clusters=5, figname='cluster_result.png'):
    """ Clustering data.
        Params:
            X: ndarray of n x d size (n samples, d features)
            labels: labels of samples, for visualizing result.
            algo: specify clustering algorithms, e.g., "hcluster", "kmeans"
            n_clusters: #.of.cluster in case of kmeans
            figname: file name to save figure
    """
    assert algo in ['hcluster', 'kmeans'], "Invalid algorithm!"
    
    if algo == 'hcluster':
        linkage_mat = hcluster(X, metric='correlation', method='average')
        fig = plt.figure(figsize=(30,20), dpi=100)
        fig.clf()
        hier.dendrogram(linkage_mat, labels=labels, leaf_rotation=90, leaf_font_size=20)
        plt.savefig(figname)
    else:
        labels = np.asarray(labels)
        result = kmeans(X, n_clusters=n_clusters)
        for cid in range(n_clusters):
            print('Cluster %d:' % (cid + 1))
            for a in labels[result == cid]:
                print(a)
            print('-' * 30)
Example #14
    def plot_dendrogram(self, method='complete', metric='euclidean'):
        """ Plot dendrogram
        Parameters
        ------------
        method: str
            method to use for scipy.cluster.hierarchy.linkage.  Default
            is 'complete'

        metric: str
            metric to use for scipy.cluster.hierarchy.linkage.  Default
            is 'euclidean'

        Returns
        ------------
            Dendrogram
        """
        import scipy.cluster.hierarchy as sch

        # Get par names
        pars = self.df.index.values

        D = np.abs(self.array)
        Y = sch.linkage(D, method=method, metric=metric)
        plt.figure()
        sch.dendrogram(Y, labels=pars)
        plt.tight_layout()
Example #15
def labeledDendrogram(dmat, labels, method='complete', cmap=None):
    """Perform hierarchical clustering on df columns and plot square heatmap of pairwise distances"""
    """TODO: add tick labels, with sparsity option"""

    Z = sch.linkage(dmat, method=method)
    den = sch.dendrogram(Z, color_threshold=np.inf, no_plot=True)

    figh = plt.gcf()
    figh.clf()

    denAX = figh.add_axes([0.32, 0.05, 0.6, 0.9])
    cbAX =  figh.add_axes([0.25, 0.05, 0.05, 0.9])

    plt.sca(denAX)
    denD = sch.dendrogram(Z, color_threshold=np.inf, orientation='left')
    ind = denD['leaves']
    clean_axis(denAX)
    
    cbSE, lookup = mapColors2Labels(labels, cmap=cmap, returnLookup=True)
    axi = cbAX.imshow([[x] for x in cbSE.iloc[ind].values],
                      interpolation='nearest',
                      aspect='auto',
                      origin='lower')
    clean_axis(cbAX)

    colorLegend(list(lookup.values()), list(lookup.keys()), axh=denAX)
Example #16
def dendrogram_pdf(args, dm, leafLabels):
        from scipy.cluster.hierarchy import linkage, dendrogram
        #from hcluster import squareform, linkage, dendrogram
        #from numpy import array
        #import pylab
        import matplotlib
        matplotlib.use('PDF')   # pdf
        import matplotlib.pyplot as plt
        #condensed_dm = distance.squareform( dm )
        #plt.figure(figsize=(100,10))
        leafNodes = len(leafLabels)
        fig = plt.figure(figsize=(14,(leafNodes*0.25)), dpi=100)
        #fig = plt.figure(figsize=(14,100), dpi=10)
        #fig.set_size_inches(14,(leafNodes*0.2))
        #ax = fig.add_subplot(111)
        #plt.tight_layout()
        #ax.set_title('Dendrogram: '+args.metric.capitalize())
        # padding:
        #plt.subplots_adjust(bottom=0.25)
        #plt.subplots_adjust(top=0.05)
        plt.subplots_adjust(left=0.01)
        plt.subplots_adjust(right=0.65)
        plt.subplots_adjust(top=0.7)
        plt.subplots_adjust(bottom=0.25)
        #leafLabels = [ '\n'.join(l.split('--')) for l in leafLabels ]


        linkage_matrix = linkage(dm,  method="average" )
        dendrogram(linkage_matrix,  color_threshold=1,  leaf_font_size=6,  orientation='right', labels=leafLabels)
        image_file = os.path.join(args.basedir, 'tmp',args.prefix+'_dendrogram.pdf')

        plt.savefig(image_file)
Example #17
def Hierarchical_cluster_part(csvFile):
    df=pd.read_csv(csvFile)
    data = df.values  # DataFrame.as_matrix() was removed from pandas; .values is equivalent
    data=data[:,1:]
    # generate the linkage matrix
    Z = linkage(data, 'ward')
    c, coph_dists = cophenet(Z, pdist(data))
    print(c)
     ## Plotting a Dendrogram
    # calculate full dendrogram
    plt.figure(figsize=(140, 60))
    plt.title('Hierarchical Clustering Dendrogram(part)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=2.,  # font size for the x axis labels
    )
   # fancy_dendrogram(
       # Z,
       # truncate_mode='lastp',  # show only the last p merged clusters
        #p=18,  # show only the last p merged clusters
       # leaf_rotation=90.,  # rotates the x axis labels
      #  leaf_font_size=8.,  # font size for the x axis labels
     #   show_leaf_counts=True, # numbers in brackets are counts
    #    show_contracted=True,  # to get a distribution impression in truncated branches
   #     max_d = 6000  # max_d as in max_distance
  #  )
    plt.savefig('/Users/CeciliaLee/Dropbox/Intren/HKIA/2/Dendrogram_Tree(part).png')
    plt.show()  
    
    return c, Z   
Example #18
def dendrogram(data, vectorizer, method="ward", color_threshold=1, size=10, filename=None):
    '"median","centroid","weighted","single","ward","complete","average"'
    if hasattr(data, '__iter__'):
        iterable = data
    else:
        raise Exception('ERROR: Input must be iterable')
    import itertools
    iterable_1, iterable_2 = itertools.tee(iterable)
    # get labels
    labels = []
    for graph in iterable_2:
        label = graph.graph.get('id', None)
        if label:
            labels.append(label)
    # transform input into sparse vectors
    X = vectorizer.transform(iterable_1)

    # labels
    if not labels:
        labels = [str(i) for i in range(X.shape[0])]

    # embed high dimensional sparse vectors in 2D
    from sklearn import metrics
    from scipy.cluster.hierarchy import linkage, dendrogram
    D = metrics.pairwise.pairwise_distances(X)
    Z = linkage(D, method=method)
    plt.figure(figsize=(size, size))
    dendrogram(Z, color_threshold=color_threshold, labels=labels, orientation='right')
    if filename is not None:
        plt.savefig(filename)
    else:
        plt.show()
Example #19
    def hcluster(self):
        """

        .. plot::
            :include-source:
            :width: 50%

            from cno import XCNOGraph, cnodata
            c = XCNOGraph(cnodata("PKN-ToyPB.sif"), cnodata("MD-ToyPB.csv"))
            c.hcluster()

        .. warning:: experimental
        """
        from scipy.cluster import hierarchy
        from scipy.spatial import distance
        path_length=nx.all_pairs_shortest_path_length(self.to_undirected())
        n = len(self.nodes())
        distances=np.zeros((n,n))
        nodes = list(self.nodes())
        for u, p in dict(path_length).items():
            for v, d in p.items():
                distances[nodes.index(u)-1][nodes.index(v)-1] = d
        sd = distance.squareform(distances)
        hier = hierarchy.average(sd)
        pylab.clf();
        hierarchy.dendrogram(hier)

        pylab.xticks(pylab.xticks()[0], nodes)
Example #20
def paint_clustering(results, clusters, num, chrom, tad_names):
    dendros = []
    axes = []
    prev = 0
    xlim = [-100, 100]
    tmp = []
    for i, result in enumerate(results):
        if axes:
            axes[-1].set_xticklabels([], visible=False)
        clust = linkage(result, method='ward')
        tmp = dendrogram(clust, orientation='right', no_plot=True)['leaves']
        dendros += reversed(list([clusters[i][n] for n in tmp]))
        axes.append(plt.subplot2grid((num, 9),(prev, 0), rowspan=len(result),
                                     colspan=4))
        dendrogram(clust, orientation='right',
                   labels=[tad_names[c] for c in clusters[i]])
        if xlim[0] < axes[-1].get_xlim()[0]:
            xlim[0] = axes[-1].get_xlim()[0]
        if xlim[1] > axes[-1].get_xlim()[1]:
            xlim[1] = axes[-1].get_xlim()[1]
        prev += len(result)
    for ax in axes:
        ax.set_xlim(left=xlim[0], right=xlim[1])
    axes = []
    for i, j in enumerate(dendros):
        axes.append(plt.subplot2grid((num, 9),(i, 4)))#gs1[i]))
        chrom.visualize('exp1',
                        tad=chrom.get_experiment('exp1').tads[tad_names[j]],
                        axe=axes[-1], show=False)
        axes[-1].set_axis_off()
    ax4 = plt.subplot2grid((num, 9),(0, 5), rowspan=num, colspan=4)
    chrom.visualize('exp1', paint_tads=True, axe=ax4)
    plt.draw()
Example #21
def _draw_dendrogram(axes, Z, labels=None):
    """Draw the given linkage information as a dendrogram on the given Axes
    object. Change the drawing parameters so that the dendrogram will blend
    nicely into the figure showing multiple dendrograms.
    
    Arguments:
    axes -- matplotlib.axes.Axes object where to draw the plot
    Z -- numpy.ndarray in the format as specified in the
        scipy.cluster.hierarchy.linkage's docstring
    
    Keyword arguments:
    labels --  list or tuple (optional) where i-th value is the text to put
        under the i-th leaf node
    
    """
    # set current axes instance
    plt.sca(axes)
    # draw the dendrogram
    dendrogram(Z, labels=labels, orientation="left")
    # remove x-axis labels
    axes.set_xticks(())
    # remove the black border around axes
    for spine in axes.spines.values():  # itervalues() is Python 2 only
        spine.set_visible(False)
    # decrease the font size of y tick labels
    for ytl in axes.get_yticklabels():
        ytl.set_fontsize("small")
Example #22
    def plot_data(self):
        """
        plots a dendrogram of the hierarchical clustering
        uncomment the matplotlib import if you call this function
        :return: None
        """
        matrix = self.pre_cluster()

        #original matrix empty
        if matrix is None:
            return
        Z = self.get_cluster_matrix(matrix)
        article_titles = self.matrix_creator.get_article_titles()

        # calculate full dendrogram
        plt.figure(figsize=(10, 10))
        plt.title('Hierarchical Clustering Dendrogram')
        plt.xlabel('distance')
        plt.axhline(y=2.3, c='k')
        dendrogram(
            Z,
            p=5,  # show only the last p merged clusters
            orientation="right",
            labels=article_titles,
            show_leaf_counts=False,  # otherwise numbers in brackets are counts
            #leaf_rotation=90.,  # rotates the x axis labels
            leaf_font_size=9.,  # font size for the x axis labels
        )
        plt.show()
Example #23
def hier_cluster_and_display(dist_matrix, leaf_labels, colorthresh, to_cluster = 'all', m = 'complete', 
			imgsize = 25, fontsize=16):
	'''
		clusters domains using hierarchical clustering and displays dendrogram.
		arguments:
			dist_matrix : distance matrix between domains
			leaf_labels: list of domain names
			colorthresh: threshold to color dendrogram nodes
			to_cluster (list of ints, optional, default='all'):
				if 'all', clusters all domains
				else clusters only domains corresponding to indices in list
			m (default='complete'): method used in hierarchical clustering.
				'single' and 'average' also work; as in scipy.
			imgsize (default=25): size of image (imgsize,imgsize) of dendrogram to produce.
			fontsize (default=16): font size of dendrogram leaf labels.
		returns:
			result as outputted by scipy's hierarchical clustering.
	'''
	if to_cluster == 'all':
		cluster_indices = range(dist_matrix.shape[0])
	else:
		cluster_indices = to_cluster
	plt.figure(figsize=(imgsize,imgsize))
	result = hier_cluster(dist_matrix,cluster_indices,m)
	dendrogram(result,orientation='left',
		labels=leaf_labels[cluster_indices], color_threshold=colorthresh, leaf_font_size=fontsize)

	return result
Example #24
def plot_dendrogram(model, **kwargs):
    '''
    taken from online example in sklearn fork
    turns hierarchical model into dendrogram
    '''
    from scipy.cluster.hierarchy import dendrogram
    from sklearn.datasets import load_iris
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import pairwise_distances
    from matplotlib import pyplot as plt
    # Children of hierarchical clustering
    children = model.children_

    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
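A hedged usage along the lines the function's own imports suggest (iris data plus AgglomerativeClustering); numpy as np is assumed at module level, as in the rest of the excerpt:

import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import load_iris

iris = load_iris()
model = AgglomerativeClustering(n_clusters=3).fit(iris.data)
plot_dendrogram(model, labels=model.labels_)
plt.show()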
Example #25
def ben_gen():

    for num_data, hier_num, grey_option in itertools.product(
        np.arange(len(data)), np.arange(len(desired_hier)), [0, 1]
    ):

        hierarchy_structure = all_clustering_data[num_data][grey_option][hier_num]
        plt.figure()
        dendrogram(hierarchy_structure, color_threshold=1.6)
        # plt.ylim(0,5)
        plt.title(data_names[num_data] + grey_output + names_distances[hier_num + 1])

        yield "okay"
        plt.close()

        plt.figure()
        dendrogram(hierarchy_structure, color_threshold=1.6)
        plt.ylim(0, ylimit)
        plt.title(data_names[num_data] + grey_output + names_distances[hier_num + 1])

        yield "okay"
        plt.close()
        # predict_average = fcluster(hierarchy_structure,1.6,criterion='distance')

    return  # PEP 479: raising StopIteration inside a generator is an error on Python 3.7+
Example #26
def create_dendrogram(cds, clusters=None, filename=None):    
    
    
    num_subj = cds.shape[0]
    num_voxels = cds.shape[1]
    
    if clusters is None:
        clusters = cds.a.event_bounds
        
    num_scenes = len(clusters)
    ds_list = np.zeros((num_subj, num_voxels, num_scenes-1))
    prev_cutoff = 0
    ds_tup = ()
    
    # average correlations for each scene
    for i in range(num_scenes - 1):
        ds_list[:,:,i] = np.mean(cds.samples[:,:,clusters[i]:clusters[i+1]], axis=2)
       
    Z = hierarchy.linkage(np.mean(ds_list, axis=0).T, metric='correlation')
        
    fig = plt.figure(figsize=(14,8))
    hierarchy.dendrogram(Z)
    plt.show()
    if filename is not None:
        fig.savefig(filename)
Example #27
 def hierarchical_clustering(self, data = 'open_shut'):
     '''
     Cluster the clusters in the cluster list based on the method.
     '''
     feature_list = []
     
     for cluster in self.cluster_list:
         if data == 'open_shut':
             feature_list.append([np.log(cluster._get_mean_open()), 
                                  np.log(cluster._get_mean_shut())])
         elif data == 'popen':
             feature_list.append([cluster.popen,])
         elif data == 'amp':
             feature_list.append([cluster.mean_amp,])
     
     Z = linkage(feature_list, 'ward')
     plt.figure(figsize=(25, 10))
     plt.title('Hierarchical Clustering Dendrogram')
     plt.xlabel('sample index')
     plt.ylabel('distance')
     dendrogram(
         Z,
         leaf_rotation=90.,  # rotates the x axis labels
         leaf_font_size=8.,  # font size for the x axis labels
     )
     plt.show()
Example #28
    def show(self,
             distance_metric='euclidean',
             linkage_method='ward'):
        '''Display function for hierarchical clustering
        '''

        #: Hierarchical clustering with the specified method
        cluster = hierarchy.linkage(self.hofstede_data,
                                    method=linkage_method,
                                    metric=distance_metric)

        #: Build the dendrogram
        hierarchy.dendrogram(cluster,
                             orientation='left',
                             color_threshold=150,
                             labels=numpy.array(self.data_name),
                             leaf_font_size=18)
        
        #: Trick to color the Japan label red
        ax = plt.gca()
        xlbls = ax.get_ymajorticklabels()
        for lbl in xlbls:
            if lbl.get_text() == self.JAPAN_NAME:
                lbl.set_color("r")

        self.cluster = cluster
        plt.show()
Example #29
def cal_idf_overlap():
    list_subj = utils.list_subject

    ls_distance_final = []
    ls_distance_row = []
    #print len(list_att)
    stop_words = get_stop_words('en')
    tmp_corpus = []
    for i in range(len(list_subj)):
        item = str(list_subj[i]).split(" ")
        for token in item:
            if token in stop_words:
                pass
            else:
                tmp_corpus.append(token)
    #print "corpus", corpus

    length = len(list_subj)
    for i in range(0, length):
        if i == 500 or i == 1000 or i == 1500:
            print(i)
        for j in range(0, length):
            print(i, j)
            idf_instance = IDF.IDF(str(list_subj[i]),str(list_subj[j]), tmp_corpus)
            distance = idf_instance.cal_overlap()
            ls_distance_row.append(distance)
        ls_distance_final.append(ls_distance_row)
        ls_distance_row = []

    myarray = np.asarray(ls_distance_final)
    print(myarray)
    Z = linkage(myarray, "ward")
    thefile = open('/Users/Aaron/test.txt', 'w')
    for item in Z:
        thefile.write("%s\n" % item)

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
         Z,
         leaf_rotation=90.,  # rotates the x axis labels
         leaf_font_size=8.,  # font size for the x axis labels
     )
    plt.show()

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        truncate_mode='lastp',  # show only the last p merged clusters
        p=30,  # show only the last p merged clusters
        show_leaf_counts=True,  # otherwise numbers in brackets are counts
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,  # to get a distribution impression in truncated branches
    )
    plt.show()
Example #30
File: eda.py  Project: ANB2/rosetta
def plot_corr_dendrogram(
    corr, cluster_method='weighted', **dendrogram_kwargs):
    """
    Plot a correlation matrix as a dendrogram (on the current axes).
    Uses scipy.cluster.hierarchy.linkage
    to compute clusters based on distance between samples.  
    
    Since correlation is passed in, this correlation must be converted to a
    distance (using distance_fun).  The default distance_fun makes highly
    correlated points have low distance, and vice versa.

    Parameters
    ----------
    corr : numpy ndarray or pandas DataFrame
        corr[i, j] is the correlation (should be between -1 and 1) of samples
        i and j.
    cluster_method : String
        Method to use to amalgamate clusters.
        Either 'single', 'complete', 'average', or 'weighted'.
        See scipy.cluster.hierarchy.linkage for details.
    dendrogram_kwargs : Additional kwargs
        Pass to the call of scipy.cluster.hierarchy.dendrogram()
    """
    # Convert to a DataFrame in all cases.
    if not isinstance(corr, pd.DataFrame):
        names = range(len(corr))
    else:
        names = corr.index.tolist()
        corr = corr.values

    dist = (1 - corr) / 2.
    Z = linkage(squareform(dist), method=cluster_method)

    dendrogram(Z, labels=names, **dendrogram_kwargs)
Example #31
#plotting the results into line graph
plt.plot(range(1, 11), wcss)
plt.title("Elbow method")
plt.xlabel("No of clusters")
plt.ylabel("WCSS")
plt.show()
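The wcss values plotted above are presumably computed beforehand with something like the following hedged sketch; X is the feature matrix and the KMeans settings mirror the classifier created below:

from sklearn.cluster import KMeans

wcss = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10)
    km.fit(X)
    wcss.append(km.inertia_)  # within-cluster sum of squares for k clusters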

# ## Using a dendrogram to find the optimal number of clusters.

# ## Hierarchical clustering

# In[12]:

import scipy.cluster.hierarchy as sch

dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))
plt.title("Dendrogram")
plt.xlabel("Species")
plt.ylabel("Euclidean Distance")
plt.show()

# The optimum number of clusters is the one after which WCSS remains almost constant. From the two graphs above, the optimal number of clusters is 3.

# ## creating kmeans classifier

# In[13]:

kmeans = KMeans(n_clusters=3,
                init='k-means++',
                max_iter=300,
                n_init=10,
Example #32
'''
Hierarchical clustering of the grain data
In the video, you learned that the SciPy linkage() function performs hierarchical clustering on an array of samples. Use the linkage() function to obtain a hierarchical clustering of the grain samples, and use dendrogram() to visualize the result. A sample of the grain measurements is provided in the array samples, while the variety of each grain sample is given by the list varieties.
INSTRUCTIONS
100XP
Import:
linkage and dendrogram from scipy.cluster.hierarchy.
matplotlib.pyplot as plt.
Perform hierarchical clustering on samples using the linkage() function with the method='complete' keyword argument. Assign the result to mergings.
Plot a dendrogram using the dendrogram() function on mergings. Specify the keyword arguments labels=varieties, leaf_rotation=90, and leaf_font_size=6.
'''
# Perform the necessary imports
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Calculate the linkage: mergings
mergings = linkage(samples, method='complete')

# Plot the dendrogram, using varieties as labels
dendrogram(
    mergings,
    labels=varieties,
    leaf_rotation=90,
    leaf_font_size=6,
)
plt.show()
Example #33
    np.savez('/scratch/PI/mcovert/dvanva/sequencing/smFISH/' + str(t) +
             "_dynamics_distance_matrix_kshape.npz",
             distance_matrix=distance_matrix)
    dynamics_load = np.load('/scratch/PI/mcovert/dvanva/sequencing/smFISH/' +
                            str(t) + "_dynamics_distance_matrix_kshape.npz")
    distance_matrix = dynamics_load['distance_matrix']
    Y = sch.linkage(distance_matrix, method='ward')
    ind_dynamics = sch.fcluster(Y, 0.5 * np.amax(Y[:, 2]), 'distance') - 1
    """
	Plot dendrogram
	"""

    fig = plt.figure()
    ax_dendro = fig.add_axes([0.09, 0.1, 0.2, 0.8], frame_on=False)
    Z = sch.dendrogram(Y,
                       orientation='right',
                       color_threshold=0.5 * np.amax(Y[:, 2]))

    ax_dendro.set_xticks([])
    ax_dendro.set_yticks([])
    """
	Plot heatmap
	"""

    ax_heatmap = fig.add_axes([0.3, 0.1, 0.6, 0.8])
    index = Z['leaves']
    dynamics_ordered = dynamics_matrix[index, :]
    im = ax_heatmap.matshow(dynamics_ordered,
                            aspect='auto',
                            origin='lower',
                            cmap=plt.get_cmap('Reds'),
Example #34
    def linkageComparison(file, num_comps, linkList):
        '''
        Compares 2-4 linkage functions on a given set of data. 
        
        linkageComparison requires a file, number of comparisons, and a list of linkage functions. 

        Input:

        file - include full file path, use the tkinter filedialog functionality for ease of obtaining file path

        num_of_comps - an integer equal to the length of the link list.
            
        linkList - list of linkage functions that you would like to have compared. 

        Output:

        linkageComparison saves a .png file of the output to the current working directory. 
        '''

        #set recursion limit above the common max for our data.
        sys.setrecursionlimit(10**8)
        #Log that user called linkage comparison function
        logging.info(': User called the Linkage Comparison function.')
        #check that the file is appropriate for our data set
        metab_data = GB.fileCheck(file)

        if metab_data is None:
            #Logs error and returns function to ensure soft exit.
            logging.error(': Error loading in excel file check log file!')
            return

        #read in column data
        data = GB.readInColumns(metab_data)

        #Standardize the data before clustering the results
        logging.info('Standardizing the data.')
        for i in range(metab_data.shape[0]):
            data[i, :] = GB.standardize(data[i, :])
        del (i)
        if num_comps == 2:
            #Create the linkage matrix
            linkageOne = linkage(data, linkList[0])
            distMeasure = pdist(data)
            distMeasure = squareform(distMeasure)
            linkageTwo = linkage(data, linkList[1])

            #Create the appropriate plt figure to allow for the comparison of linkage functions
            fig, axes = plt.subplots(1, 2, figsize=(8, 8))

            #create the dendrograms
            dend1 = dendrogram(linkageOne,
                               ax=axes[0],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            dend2 = dendrogram(linkageTwo,
                               ax=axes[1],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            del (linkageOne, linkageTwo, num_comps)

        elif num_comps == 3:
            #Create the linkage matrix
            linkageOne = linkage(data, linkList[0])
            linkageTwo = linkage(data, linkList[1])
            linkageThree = linkage(data, linkList[2])

            #Create the appropriate plt figure to allow for the comparison of linkage functions
            fig, axes = plt.subplots(1, 3, figsize=(8, 8))

            #create the dendrograms
            dend1 = dendrogram(linkageOne,
                               ax=axes[0],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            dend2 = dendrogram(linkageTwo,
                               ax=axes[1],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            dend3 = dendrogram(linkageThree,
                               ax=axes[2],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            del (linkageOne, linkageTwo, linkageThree, num_comps)

        elif num_comps == 4:
            #Create the linkage matrix
            linkageOne = linkage(data, linkList[0])
            linkageTwo = linkage(data, linkList[1])
            linkageThree = linkage(data, linkList[2])
            linkageFour = linkage(data, linkList[3])

            #Create the appropriate figure to allow for the comparison of linkage functions
            fig, axes = plt.subplots(2, 2, figsize=(8, 8))
            plt.title('Linkage Comparison')

            #create the dendrograms
            dend1 = dendrogram(linkageOne,
                               ax=axes[0, 0],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            dend2 = dendrogram(linkageTwo,
                               ax=axes[0, 1],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            dend3 = dendrogram(linkageThree,
                               ax=axes[1, 0],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            dend4 = dendrogram(linkageFour,
                               ax=axes[1, 1],
                               above_threshold_color='y',
                               orientation='left',
                               no_labels=True)
            del (linkageOne, linkageTwo, linkageThree, linkageFour, num_comps)

        linkPre = 'LinkageComparison'
        linkSuf = '.png'
        sep = '_'
        firstCheck = linkPre + sep
        for i in range(len(linkList)):
            #create the first file check
            firstCheck += linkList[i] + sep

        firstCheck += '01' + linkSuf

        chkBuffer = glob.glob("*.png")
        count = 1
        if firstCheck in chkBuffer:
            checkVal = False
            firstCheck = firstCheck.strip(linkSuf)
            firstCheck = firstCheck.strip('01')
            while checkVal == False:
                count += 1
                #search the "buffer" for ensemble cluster
                if count < 10:
                    #determine if the file has already been made
                    curFileCheck = firstCheck + '0' + str(count) + linkSuf
                    if curFileCheck not in chkBuffer:
                        checkVal = True
                        linkFile = curFileCheck

                else:
                    curFileCheck = firstCheck + str(count) + linkSuf
                    if curFileCheck not in chkBuffer:
                        checkVal = True
                        linkFile = curFileCheck
            plt.savefig(linkFile)
        else:
            linkFile = firstCheck
            plt.savefig(linkFile)
        plt.show()

        #log the completion of the linkage comparison
        logging.info(
            ': Successfully completed the comparison of the linkage functions!')
        return
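A hedged call sketch, following the docstring's suggestion of picking the Excel file with tkinter's file dialog; whether linkageComparison is exposed as a plain function or as a method of a class is not visible in the excerpt:

from tkinter import filedialog

excel_file = filedialog.askopenfilename()
linkageComparison(excel_file, 2, ['ward', 'complete'])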
Example #35
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

from knock60 import load_model
from knock67 import collect_target_vecs

if __name__ == '__main__':
    countries = []
    with open('./countries.txt') as fp:
        for line in fp:
            country = line.strip()
            countries.append(country)

    vecs, target_countries = collect_target_vecs(countries)

    plt.figure(figsize=(32.0, 24.0))
    link = linkage(vecs, method='ward')
    dendrogram(link,
               labels=target_countries,
               leaf_rotation=90,
               leaf_font_size=10)
    plt.savefig('ward.png')  # save before show(); the figure is cleared once it has been shown
    plt.show()
Example #36
    for i in range(len(df)):
        ax.text(df.loc[i, 'x'], df.loc[i, 'y'], df.loc[i, 'title'], size=8)  # .ix was removed from pandas

    plt.show()  # show the plot

    from scipy.cluster.hierarchy import ward, dendrogram

    linkage_matrix = ward(
        dist
    )  # define the linkage_matrix using ward clustering pre-computed distances

    import joblib  # sklearn.externals.joblib was removed from scikit-learn; use joblib directly
    joblib.dump(dist, 'title_dist.pkl')
    joblib.dump(titles, 'titles.pkl')
    joblib.dump(urls, 'urls.pkl')

    fig, ax = plt.subplots(figsize=(30, 60))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=titles)

    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False)

    plt.tight_layout()  # show plot with tight layout

    # uncomment below to save figure
    plt.savefig(basename(args.fileName) +
                '.png')  # save figure as ward_clusters
Example #37
# to determine the number of clusters directly
from scipy.cluster.hierarchy import fcluster
k = 5
clusters = fcluster(Z, k, criterion='maxclust')

fig = pylab.figure(figsize=(18, 50))


def llf(id):
    return '[%s %s %s]' % (pdf['manufact'][id], pdf['model'][id],
                           int(float(pdf['type'][id])))


dendro = hierarchy.dendrogram(Z,
                              leaf_label_func=llf,
                              leaf_rotation=0,
                              leaf_font_size=12,
                              orientation='right')
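A self-contained sketch of the leaf_label_func idea on toy data; the names are invented (the original labels come from the pdf DataFrame of cars):

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy

toy = np.random.rand(5, 3)
Z_toy = hierarchy.linkage(toy, method='complete')
toy_names = ['car_%d' % i for i in range(5)]
hierarchy.dendrogram(Z_toy, leaf_label_func=lambda leaf_id: toy_names[leaf_id],
                     leaf_font_size=12, orientation='right')
plt.show()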

# Clustering using sci-kit learn

#dist_matrix = distance_matrix(feature_mtx,feature_mtx)
agglom = AgglomerativeClustering(n_clusters=6, linkage='complete')
agglom.fit(feature_mtx)
pdf['cluster_'] = agglom.labels_

# to save as csv file
# pdf.to_csv("D:/VIT/rtt.csv")

import matplotlib.cm as cm
n_clusters = max(agglom.labels_) + 1
Example #38
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

dataset = pd.read_csv('dataset.csv')
X = dataset.iloc[:, [0, 3]].values

dendrogram = sch.dendrogram(sch.linkage(X, method="ward"))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()

hc = AgglomerativeClustering(n_clusters=5,
                             affinity='euclidean',
                             linkage='ward')
y_hc = hc.fit_predict(X)

plt.scatter(X[y_hc == 0, 0],
            X[y_hc == 0, 1],
            s=100,
            c='red',
            label='Cluster 1')
plt.scatter(X[y_hc == 1, 0],
            X[y_hc == 1, 1],
            s=100,
            c='blue',
            label='Cluster 2')
plt.scatter(X[y_hc == 2, 0],
Example #39
assignment = agg.fit_predict(X)

mglearn.discrete_scatter(X[:,0], X[:,1], assignment)
plt.legend(['cluster0', 'cluster1','cluster2'], loc='best')
plt.xlabel('feature 0')
plt.ylabel('feature 1')
# %%
%matplotlib inline
agg1 = AgglomerativeClustering(n_clusters=3).fit_predict(X)
agg2 = AgglomerativeClustering(n_clusters=5).fit_predict(X)
fig = plt.figure(figsize=(8,5))
fig.add_subplot(1,2,1)
mglearn.discrete_scatter(X[:,0], X[:,1], agg1)
plt.legend(['cluster0', 'cluster1','cluster2'], loc='best')
fig.add_subplot(1,2,2)
mglearn.discrete_scatter(X[:,0], X[:,1], agg2)
plt.legend(['cluster0', 'cluster1','cluster2', 'cluster3', 'cluster4'], loc='best')
# %%
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.datasets import make_blobs
import pandas as pd
X, y = make_blobs()
df = pd.DataFrame(dict(x=X[:,0], y=X[:,1], label=y))
df.columns
# %%
linkage_array = ward(df[['x', 'y']])
linkage_array
dendrogram(linkage_array)

# %%

Example #40
    def __init__(self, files, file_format, method=None):

        self.files = files
        self.files_opened = []
        for f in self.files:
            self.files_opened.append(OpenFile(f))
        self.docLabels = []
        self.db_server = db_handler()
        for doc in self.files_opened:
            self.docLabels.append(doc.location)

        # create a list data that stores the content of all text files in order of their names in docLabels
        data = []
        if file_format == "docx" or file_format == "pptx":
            for doc in self.files_opened:
                #data.append(open(doc, encoding='latin-1').read())
                db = db_ds
                data.append(doc.text)
        elif file_format == "xlsx":
            for i, doc in enumerate(self.files_opened):
                #data.append(open(doc, encoding='latin-1').read())
                db = db_xs
                try:
                    data.append(json.dumps(doc.tables, skipkeys=True))
                except:
                    print("error parsing document {}".format(
                        self.docLabels[i]))
                    data.append("")

        data = nlp_clean(data)
        if method == "fuzzywuzzy":
            for i, f1 in enumerate(data):
                for f2 in data[i + 1:]:
                    # print(self.docLabels[i],self.docLabels[i+1])
                    x = fuzz.ratio(f1, f2)
                    y = fuzz.partial_ratio(f1, f2)
                    print(
                        "overall similarity ration: {} %\npartial similarity ration: {}"
                        .format(x, y))
                    db_data = {
                        'dok_id': {
                            'dok_1': self.docLabels[i],
                            'dok_2': self.docLabels[i + 1]
                        },
                        'kullanici': user_default,
                        'overall similarity ratio': x,
                        'partial similarity ratio': y
                    }
                    self.db_server.save(db,
                                        db_data,
                                        doc_id=self.docLabels[i] + "_" +
                                        self.docLabels[i + 1])

        elif method == "inference":
            #res = self.db_server.query(db_gensim,["_attachments"],query_key="_id", query_value=file_format)

            #model_loc ="{}gensim_models/docx/models/doc2vec_{}.model".format(server_default,file_format)
            model_loc = "models/doc2vec_{}.model".format(file_format)
            # loading the model
            d2v_model = gensim.models.doc2vec.Doc2Vec.load(model_loc)
            # d2v_model.init_sims(replace=False)

            # infer_vector is non-deterministic; i.e. the resulting vector is different each time, but it should be similar enough with a good model
            infervec = d2v_model.infer_vector(data[0],
                                              alpha=0.025,
                                              min_alpha=0.025,
                                              steps=300)
            similar_doc = d2v_model.docvecs.most_similar([infervec])
            most_similar = similar_doc[0][0]
            print(type(most_similar))
            print("most similar: {}".format(most_similar))

            #db_res = self.db_server.query(db_dc,["_id","docs"])
            db_res = self.db_server.query(db_dc, ["docs", "clusters"],
                                          query_key="_id",
                                          query_value=file_format)
            print(db_res)
            db_res_a = []
            db_res_b = []
            for row in db_res:
                # db_res_a.append(row)
                for a in row.key[0]:
                    db_res_a.append(a)
                for b in row.key[1]:
                    db_res_b.append(b)
            # print(db_res_a)
            # print(db_res_b)
            most_similar_class = db_res_b[db_res_a.index(most_similar)]
            print("most likely class: {}".format(most_similar_class))
            print("other documents in same category")
            for i in range(len(db_res_b)):
                if db_res_b[i] == most_similar_class:
                    print(db_res_a[i])

        else:
            # iterator returned over all documents
            it = LabeledLineSentence(data, self.docLabels)
            model = gensim.models.Doc2Vec(vector_size=300,
                                          min_count=0,
                                          alpha=0.025,
                                          min_alpha=0.025)
            model.build_vocab(it)
            # training of model
            for epoch in range(100):
                #print ('iteration '+str(epoch+1))
                model.train(it, total_examples=model.corpus_count, epochs=3)
                model.alpha -= 0.002
                model.min_alpha = model.alpha

            model.save('models/doc2vec_{}.model'.format(file_format))

            db_g = db_gensim
            db_data = {"time": "time", "path": dataset_path}
            self.db_server.save(
                db_g,
                db_data,
                doc_id=file_format,
                attachment='models/doc2vec_{}.model'.format(file_format))

            print("model saved")

            # loading the model
            d2v_model = gensim.models.doc2vec.Doc2Vec.load(
                'models/doc2vec_{}.model'.format(file_format))

            # start testing
            X = []
            # printing the vector of documents in docLabels
            for i, _ in enumerate(self.docLabels):
                docvec = d2v_model.docvecs[i]
                # print(docvec)
                X.append(docvec)
            X = np.array(X)
            #docvec = d2v_model.docvecs[0]
            #print (docvec)
            #docvec = d2v_model.docvecs[1]
            #print (docvec)

            # to get most similar document with similarity scores using document-index
            #similar_doc = d2v_model.docvecs.most_similar(0)
            # print(similar_doc)

            # for doc in similar_doc:
            #    db_data = {'dok_id' : {'dok_1' : self.docLabels[0],'dok_2' : doc[0]}, 'kullanici': user_default, 'benzerlik orani': str(doc[1])}
            #    self.db_server.save(db, db_data)
            #similar_doc = d2v_model.docvecs.most_similar(1)
            # print(similar_doc)

            # printing the vector of the file using its name
            # docvec = d2v_model.docvecs['shakespeare-hamlet.txt'] #if string tag used in training
            # print(docvec)
            # to get most similar document with similarity scores using document- name
            #sims = d2v_model.docvecs.most_similar('shakespeare-hamlet.txt')
            # print(sims)

            # #############################################################################
            # Compute Affinity

            af = AffinityPropagation(preference=-50).fit(X)
            cluster_centers_indices = af.cluster_centers_indices_
            labels = af.labels_
            #labels2 = []
            # for i, lb in enumerate(labels):
            #    labels2.append(self.files[i].split('/')[-1])
            #print("labels: {}".format(labels))
            #print("labels2: {}".format(labels2))
            n_clusters_ = len(cluster_centers_indices)
            print("number of clusters: {}".format(n_clusters_))
            dic = {i: np.where(labels == i)[0] for i in range(n_clusters_)}
            dic2 = {}
            # print(dic)

            for key, value in dic.items():
                print("cluster {}:".format(key))
                for e in value:
                    print("{} : {}".format(e, self.files[e].split('/')[-1]))
                    dic2[self.docLabels[e]] = key

            print(dic2)

            # print('Estimated number of clusters: %d' % n_clusters_)
            # print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
            # print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
            # print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
            # print("Adjusted Rand Index: %0.3f"
            #     % metrics.adjusted_rand_score(labels_true, labels))
            # print("Adjusted Mutual Information: %0.3f"
            #     % metrics.adjusted_mutual_info_score(labels_true, labels))
            print("Silhouette Coefficient: %0.3f" %
                  metrics.silhouette_score(X, labels, metric='sqeuclidean'))

            # #############################################################################
            # Plot result
            import matplotlib.pyplot as plt
            from mpl_toolkits.mplot3d import Axes3D
            from itertools import cycle

            plt.close('all')
            plt.figure(figsize=(25, 10))
            plt.clf()

            colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')

            # reduce dimensions
            # pca = PCA(n_components=2)
            # reduced = pca.fit_transform(X)
            # X = reduced

            for k, col in zip(range(n_clusters_), colors):
                class_members = labels == k
                cluster_center = X[cluster_centers_indices[k]]
                plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
                plt.plot(cluster_center[0],
                         cluster_center[1],
                         'o',
                         markerfacecolor=col,
                         markeredgecolor='k',
                         markersize=5)
                for x in X[class_members]:
                    plt.plot([cluster_center[0], x[0]],
                             [cluster_center[1], x[1]], col)

            plt.title(
                'Clustering with Affinity Propagation | Estimated number of clusters: %d'
                % n_clusters_)
            plt.savefig('models/{}_affinity_clusters.png'.format(file_format),
                        dpi=300)
            plt.show()

            #db = db_dc
            db_data = dic2
            db_data["docs"] = self.docLabels
            db_data["clusters"] = labels.tolist()
            self.db_server.save(
                db_dc,
                db_data,
                doc_id=file_format,
                attachment='models/{}_affinity_clusters.png'.format(
                    file_format))

            # #########################
            # hierarchical

            linkage_matrix = []
            #linkage_matrix.append(linkage(X, method='single', metric='euclidean'))
            linkage_matrix.append(
                linkage(X, method='average', metric='euclidean'))
            #linkage_matrix.append(linkage(X, method='complete', metric='euclidean'))
            #linkage_matrix.append(linkage(X, method='ward', metric='euclidean'))

            #linkage_matrix.append(linkage(X, method='single', metric='seuclidean'))
            # linkage_matrix.append(linkage(X, method='average', metric='seuclidean'))
            #linkage_matrix.append(linkage(X, method='complete', metric='seuclidean'))

            for n, l in enumerate(linkage_matrix):
                # calculate full dendrogram
                plt.figure(figsize=(25, 10))
                plt.title('Hierarchical Clustering Dendrogram')
                plt.ylabel('word')
                plt.xlabel('distance')

                dendrogram(
                    l,
                    leaf_rotation=0.,  # rotates the x axis labels
                    leaf_font_size=16.,  # font size for the x axis labels
                    orientation='left',
                    leaf_label_func=lambda v: str(self.files[v].split('/')[-1])
                )
                # plt.savefig('clusters_{}.png'.format(n), dpi=200) #save figure as ward_clusters
                plt.savefig(
                    'models/{}_hierarchical_clusters.png'.format(file_format),
                    dpi=300)
                plt.show()

                db_data = {}
                self.db_server.save(
                    db_dc,
                    db_data,
                    doc_id=file_format,
                    attachment='models/{}_hierarchical_clusters.png'.format(
                        file_format))
the method='single' keyword argument. Assign the result to mergings.
Plot a dendrogram of the hierarchical clustering, using the list country_names 
as the labels. In addition, specify the leaf_rotation=90, and leaf_font_size=6 
keyword arguments as you have done earlier.
'''
#Done by DataCamp
import pandas as pd
from numpy import genfromtxt

country_names = pd.read_csv(
    'E:/DataCamp/Unsupervised-learning-in-python/data/eurovision-2016.csv')

country_names = country_names.iloc[:, 0].unique().tolist()

samples = genfromtxt(
    'E:/DataCamp/Unsupervised-learning-in-python/data/eurovision_votes.csv',
    delimiter=',')

#End done by DataCamp

# Perform the necessary imports
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

# Calculate the linkage: mergings
mergings = linkage(samples, method='single')

# Plot the dendrogram
dendrogram(mergings, labels=country_names, leaf_rotation=90, leaf_font_size=6)
plt.show()
Ejemplo n.º 42
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.preprocessing as sk_preprocessing
import sklearn.cluster as sk_clustering
import scipy.cluster.hierarchy as sp_clustering_hr

clients = pd.read_csv("data/customer_online_closing_store.csv")
clients["return_rate"] = clients["items_returned"] / clients["items_purchased"]
clients["average_price"] = clients["total_spent"] / clients["items_purchased"]

X = clients[["average_price", "return_rate", "overall_rating"]]
print(X)

min_max_scaler = sk_preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)

print(X)

plt.title("Customer dendrogram")

linkage_method = "ward"  # single complete average ward
dendrogram = sp_clustering_hr.dendrogram(sp_clustering_hr.linkage(X, method=linkage_method))

agglomerative_model = sk_clustering.AgglomerativeClustering(n_clusters=4, linkage=linkage_method)
agglomerative_model.fit(X)

clients["class"] = agglomerative_model.labels_

print(clients[["average_price", "return_rate", "overall_rating", "class"]])

client_pivot_table = clients.pivot_table(index="class",
                                         values=["average_price", "return_rate", "overall_rating", "customer_id"],
                                         aggfunc={"average_price": np.mean, "return_rate": np.mean,
                                                  "overall_rating": np.mean, "customer_id": len})
print(client_pivot_table)

plt.show()
for Method in Methods:
    Z = linkage(X, method=Method, metric=Metric)

    # Compute and display clusters by thresholding the dendrogram
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)

    plt.figure(3 + 2*i, figsize=(12, 9))
    plt.title('Hierarchical clustering using {} method'.format(Method))
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    clusterplot(X, cls, y=y)

    # Display dendrogram
    plt.figure(4 + 2*i, figsize=(15, 8))
    plt.title('Hierarchical clustering using {} method'.format(Method))
    dendrogram(Z, truncate_mode='lastp', p=max_display_levels)
    plt.show()

    # Evaluate hierarchical method
    Rand_hier[i], Jaccard_hier[i], NMI_hier[i] = clusterval(y, cls)
    i += 1


print('###################################################')
print('#            MODELS QUALITY EVALUATION            #')
print('###################################################')
Jaccard = {'gmm': Jaccard_gmm}
NMI = {'gmm': NMI_gmm}
Rand = {'gmm': Rand_gmm}
for i in range(len(Methods)):
    Jaccard[Methods[i]] = Jaccard_hier[i]
Ejemplo n.º 44
0
    feature_vectors.append(vec)

# print(feature_vectors)
import numpy
mat = numpy.empty((n, n))
for i in range(0, n):
    for j in range(0, n):
        mat[i][j] = nltk.cluster.util.cosine_distance(
            feature_vectors[i],
            feature_vectors[j])  # calculating the cosine distance

# hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
t = 0.8
# condense the square distance matrix so linkage treats it as precomputed distances
Z = linkage(squareform(mat), 'single')
d = dendrogram(Z, color_threshold=t)
from matplotlib import pyplot
print(Z)
import pylab
pylab.savefig("dendo.png", dpi=800)


## extract data
def extract_clusters(Z, threshold, n):
    clusters = {}
    ct = n
    for row in Z:
        if row[2] < threshold:
            n1 = int(row[0])
            n2 = int(row[1])
Ejemplo n.º 45
0
import pandas as pd
import plotly.graph_objs as go
import os
import scipy.cluster.hierarchy as shc

values = pd.read_csv('Intensity.values.csv')
values.index = values['Metabolite']
del values['Metabolite']

values_t = values.T
values_t.index.name = 'Sample'


#plt.figure(figsize=(10, 7))  
#plt.title("Customer Dendograms")  

dend_metabolite = shc.dendrogram(shc.linkage(values, method='ward'),labels=values.index)  
dend_metabolite_order = dend_metabolite['ivl']


dend_sample = shc.dendrogram(shc.linkage(values_t, method='ward'),labels=values_t.index)  
dend_sample_order = dend_sample['ivl']

df = values[dend_sample_order]
df = df.reindex(dend_metabolite_order)



values_t = values.T
values_t.index.name = 'Sample'

#plt.figure(figsize=(10, 7))  
Ejemplo n.º 46
0
#!/usr/bin/env python3

from matplotlib import pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram

from imagecluster import calc as ic
from imagecluster import io as icio

images = icio.read_images('pics/', size=(224,224))
model = ic.get_model()
fingerprints = ic.fingerprints(images, model)
clusters,extra = ic.cluster(fingerprints, sim=0.5, extra_out=True)

# linkage matrix Z
fig,ax = plt.subplots()
dendrogram(extra['Z'], ax=ax)

# Adjust yaxis labels (values from Z[:,2]) to our definition of the `sim`
# parameter.
ymin, ymax = ax.yaxis.get_data_interval()
tlocs = np.linspace(ymin, ymax, 5)
ax.yaxis.set_ticks(tlocs)
tlabels = np.linspace(1, 0, len(tlocs))
ax.yaxis.set_ticklabels(tlabels)
ax.set_xlabel("image index")
ax.set_ylabel("sim")

fig.savefig('dendrogram.png')
plt.show()
Ejemplo n.º 47
0
def heatmap_vec(x, y, vec):

    # Compute and plot first dendrogram.

    vec = np.array(vec)
    mn = np.mean(vec, axis=1)

    mat = []
    ylb = []
    for i in range(len(mn)):
        if mn[i] > 0.:
            mat.append(vec[i])
            ylb.append(y[i])

    mat = np.array(mat)
    norm, corr, dist = analyse.all_corr(mat.T)

    del corr
    fig = plt.figure(figsize=(8, 8))
    ax1 = fig.add_axes([0.09, 0.1, 0.25, 0.6])
    print('fastcluster...')
    z = fastcluster.linkage(dist, method='complete')
    del dist
    #print 'dendogramming...'
    Z1 = sch.dendrogram(z, orientation='right')
    ticks = ax1.get_xticks()
    ticks = np.array(ticks)
    ticks /= 2.
    ticks = ['%.1f' % a for a in ticks]
    ax1.set_xticklabels(ticks)
    idx1 = Z1['leaves']
    yy = []
    for i in idx1:
        yy.append(ylb[int(i)])
    ax1.set_yticks(range(len(yy)))

    if len(yy) < 20:
        ax1.set_yticklabels(yy, fontsize=12)
    elif len(yy) < 50:
        ax1.set_yticklabels(yy, fontsize=6)
    elif len(yy) < 150:
        ax1.set_yticklabels(yy, fontsize=4)
    elif len(yy) < 250:
        ax1.set_yticklabels(yy, fontsize=3)
    elif len(yy) < 500:
        ax1.set_yticklabels(yy, fontsize=2)
    elif len(yy) < 1500:
        ax1.set_yticklabels(yy, fontsize=1)
    else:
        ax1.set_yticklabels(yy, fontsize=.2)


    # Plot distance matrix.
    axmatrix = fig.add_axes([0.4, 0.1, 0.5, 0.6])
    D = norm[idx1, :]
    D = D[::-1, :]

    im = axmatrix.matshow(D,
                          aspect='auto',
                          origin='lower',
                          cmap='RdYlBu',
                          alpha=0.8,
                          vmin=0)

    plt.xticks(np.arange(len(x)), x)
    plt.xticks(rotation=90)
    mytemplate(D)
    plt.xticks(fontsize=6)
    axmatrix.set_yticks([])

    #print x
    # Plot colorbar.
    axcolor = fig.add_axes([0.91, 0.3, 0.01, 0.4])
    plt.colorbar(im, cax=axcolor)

    return yy
Ejemplo n.º 48
0
            motif_dict[i] = 1
    motif_list = list(motif_dict.keys())
    #### Matrix for levenstein_distances
    cols = len(motif_list)
    rows = cols
    distance_matrix = np.zeros((rows, cols))
    for i in range(rows):
        distance_matrix[i][i] = 0
        for ii in range(i + 1, cols):
            distance_matrix[i][ii] = \
                iterative_levenshtein(motif_list[i],motif_list[ii],costs =cost)
            ################# flip matrix
            distance_matrix[ii][i] = distance_matrix[i][ii]
    return distance_matrix


dist_matrix = levenstein_distances(motif, (2, 2, 2))
print(dist_matrix)

#dist_matrix_unroll = [item for sublist in dist_matrix for item in sublist]

from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

# condense the square distance matrix so linkage treats it as precomputed distances
Z = hierarchy.linkage(squareform(dist_matrix))
plt.xlabel('Selected Motifs')
plt.ylabel('Levenshtein Edit Distances (No. of Edits)')
plt.title('Motif Candidate Cluster')
hierarchy.dendrogram(Z, leaf_rotation=10, leaf_font_size=7, labels=motif)
plt.show()
Ejemplo n.º 49
0
# LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_x['fueltype'] = le.fit_transform(train_x['fueltype'])
train_x['aspiration'] = le.fit_transform(train_x['aspiration'])
train_x['doornumber'] = le.fit_transform(train_x['doornumber'])
train_x['carbody'] = le.fit_transform(train_x['carbody'])
train_x['drivewheel'] = le.fit_transform(train_x['drivewheel'])
train_x['enginelocation'] = le.fit_transform(train_x['enginelocation'])
train_x['cylindernumber'] = le.fit_transform(train_x['cylindernumber'])
train_x['fuelsystem'] = le.fit_transform(train_x['fuelsystem'])


# Normalize features to the [0, 1] range
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
train_x = min_max_scaler.fit_transform(train_x)


# Cluster analysis
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as plt
model = AgglomerativeClustering(linkage='ward', n_clusters=10)
y = model.fit_predict(train_x)
print(y)
linkage_matrix = ward(train_x)
dendrogram(linkage_matrix)
plt.show()


Ejemplo n.º 50
0
topics_matrix = lda.show_topics(formatted=False)
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
import matplotlib.pyplot as plt
dist = 1 - cosine_similarity(lda.get_topics())

# =============================================================================
# MDS()
# mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
#
# pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
#
# xs, ys = pos[:, 0], pos[:, 1]
#
#
# plt.scatter(x = xs, y = ys)
# plt.show()
# =============================================================================

linkage_matrix = ward(dist)
plt.clf()
plt.figure(figsize=(40, 20))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('person Id')
plt.ylabel('Distance')
dendrogram(linkage_matrix, leaf_rotation=90., leaf_font_size=12.)
plt.savefig('ward_dendrogram.png', dpi=200)
plt.show()

# print(linkage_matrix)
# print(lda.get_topics())
Ejemplo n.º 51
0
    def create_heatmap(matrix,
                       main_title=None,
                       output_filename=None,
                       pd_data_1=None,
                       pd_data_2=None,
                       data_hist_1=None,
                       data_hist_2=None,
                       graphs=None,
                       ordered=False,
                       fitness_slice=500):
        matrixdf = pd.DataFrame(matrix)
        font = {'family': 'normal', 'weight': 'normal', 'size': 8}
        last_data_1 = 0.0
        matplotlib.rc('font', **font)
        # look at raw data
        #axi = plt.imshow(matrixdf,interpolation='nearest')
        #ax = axi.get_axes()

        #plt.clean_axis(ax)

        # row clusters
        if ordered:
            row_pairwise_dists = squareform(pdist(matrixdf))
            row_clusters = linkage(row_pairwise_dists, method='complete')
            row_dendogram = dendrogram(row_clusters,
                                       no_plot=True,
                                       count_sort='ascending')

        # calculate pairwise distances for columns
        if ordered:
            col_pairwise_dists = squareform(pdist(matrixdf.T))
            col_clusters = linkage(col_pairwise_dists, method='complete')
            col_dendogram = dendrogram(col_clusters,
                                       no_plot=True,
                                       count_sort='ascending')

        # plot the results
        fig = plt.figure(figsize=(12.5, 10))
        #plot_gridspec = gridspec.GridSpec(3,2, wspace=0.05,
        #  hspace=0.05, width_ratios=[0.25,1],height_ratios=[0.25,1,0.25])
        plot_gridspec = gridspec.GridSpec(
            5, 5, width_ratios=[0.15, 0.15, 0.2, 0.2, 0.2])

        ### col dendrogram ####
        #col_denAX = fig.add_subplot(plot_gridspec[0,1])
        if pd_data_1 is not None:
            title = ''
            if type(pd_data_1) == tuple:  # not so pythonic
                title = pd_data_1[0]
                pd_data_1 = pd_data_1[1]
            last_data_1 = pd_data_1['y'][len(pd_data_1) - 1]

            #ax3 = fig.add_subplot(plot_gridspec[0,1])
            ax1 = plt.subplot(plot_gridspec[0, 2:])
            slice_base = max(0, len(pd_data_1) - fitness_slice)
            plt.plot(pd_data_1['x'], pd_data_1['y'], linestyle='-')
            plt.xlim(slice_base, len(pd_data_1))
            plt.title(title)

    #     else:
    #         col_denAX = fig.add_subplot(plot_gridspec[0,1])
    #create an empty graph

    ### row dendrogram ###
    ## TODO: fix that please:
        if ordered:
            pass
            #row_denAX = fig.add_subplot(plot_gridspec[1,0])
            #row_denD = dendrogram(row_clusters, orientation='right', count_sort='ascending')
            #row_denAX.get_xaxis().set_ticks([]) # removes ticks

            #slice_base = max(0, max(pd_data_1['x']) - fitness_slice)
            #plt.plot(pd_data_1['x'], pd_data_1['y'], linestyle='-')
            #plt.xlim(slice_base, len(pd_data_1))

        if graphs is not None:
            gs_index = 0
            for title_graph in graphs:
                title, graph, graph_histogram = title_graph
                ax3 = plt.subplot(plot_gridspec[gs_index, 0])
                graph = graph.to_undirected()
                # we don't care about the weight because we already are filtering here
                nx.draw(graph,
                        node_size=2,
                        width=0.4,
                        with_labels=False,
                        pos=nx.spring_layout(graph, weight=None))
                plt.title(title)
                ax3 = plt.subplot(plot_gridspec[gs_index, 1])
                # let's add the histogram, but remove all 1 values
                graph_histogram_without_one = []
                for v in graph_histogram:
                    if v != 1:
                        graph_histogram_without_one.append(v)
                print(str(graph_histogram_without_one))
                #                 print str(graph_histogram)
                if not graph_histogram_without_one:
                    continue
                binwidth = 1
                min_bin = numpy.min(graph_histogram_without_one)
                max_bin = numpy.max(graph_histogram_without_one)
                bins = range(min_bin, max_bin + binwidth, binwidth)
                ax3.hist(graph_histogram_without_one,
                         bins=bins,
                         facecolor='red',
                         alpha=0.45)
                plt.xticks(numpy.unique(graph_histogram_without_one))
                plt.tick_params(axis='both', which='major', labelsize=5)
                plt.tick_params(axis='both', which='minor', labelsize=5)
                #                plt.xticks(range(numpy.min(graph_histogram_without_one),
                #                           numpy.max(graph_histogram_without_one),
                #                           (numpy.min(graph_histogram_without_one) +  numpy.max(graph_histogram_without_one))/5))
                #plt.xlim(1, numpy.max(graph_histogram))
                if gs_index == 0:
                    plt.title("Components size\nhistogram")
                gs_index += 1

        ### heatmap ###
        heatmap_subplot = fig.add_subplot(plot_gridspec[1:4, 2:])

        if ordered:
            # .iloc replaces the removed DataFrame.ix for positional indexing
            axi = heatmap_subplot.imshow(matrixdf.iloc[row_dendogram['leaves'],
                                                       col_dendogram['leaves']],
                                         interpolation='nearest',
                                         aspect='auto',
                                         origin='lower')
        else:
            axi = heatmap_subplot.imshow(matrixdf,
                                         interpolation='nearest',
                                         aspect='auto',
                                         origin='lower')
        # removes ticks
        heatmap_subplot.get_xaxis().set_ticks([])
        heatmap_subplot.get_yaxis().set_ticks([])
        axcolor = fig.add_axes([0.91, 0.27, 0.02, 0.45])
        plt.colorbar(axi, cax=axcolor)
        #fig.tight_layout()

        if pd_data_2 is not None:
            title = ''
            if type(pd_data_2) == tuple:  # not so pythonic
                title = pd_data_2[0]
                pd_data_2 = pd_data_2[1]
            ax3 = fig.add_subplot(plot_gridspec[4, 2])
            plt.plot(pd_data_2['x'], pd_data_2['y'], linestyle='-', marker='.')
            plt.xlim(min(pd_data_2['x']), max(pd_data_2['x']))
            #plt.ylim(0, 1.1)
            plt.title(title)

        if data_hist_1 is not None:
            title = ''
            if type(data_hist_1) == tuple:  # not so pythonic
                title = data_hist_1[0]
                data_hist_1 = data_hist_1[1]
            #binwidth = 1
            ax3 = fig.add_subplot(plot_gridspec[4, 3])
            #min_bin = numpy.min(data_hist_1)
            #max_bin = numpy.max(data_hist_1)
            #bins = range(min_bin,max_bin+binwidth,binwidth)
            ax3.hist(data_hist_1, facecolor='blue', alpha=0.45)
            #plt.xticks(numpy.unique(data_hist_1))
            plt.tick_params(axis='both', which='major', labelsize=5)
            plt.tick_params(axis='both', which='minor', labelsize=5)
            plt.title(title)

        if data_hist_2 is not None:
            title = ''
            if type(data_hist_2) == tuple:  # not so pythonic
                title = data_hist_2[0]
                data_hist_2 = data_hist_2[1]
            if data_hist_2:
                ax3 = fig.add_subplot(plot_gridspec[4, 4])
                #bins = range(min_bin,max_bin+binwidth,binwidth)
                ax3.hist(data_hist_2, facecolor='blue', alpha=0.45)
                #plt.xticks(numpy.unique(data_hist_1))
                plt.tick_params(axis='both', which='major', labelsize=5)
                plt.tick_params(axis='both', which='minor', labelsize=5)
                plt.title(title)

        if main_title:
            if pd_data_1 is not None:
                main_title = main_title + '\n(' + str(
                    last_data_1).strip() + ')'
            plt.suptitle(main_title)
        if output_filename:
            plt.savefig(output_filename)
            #plt.clf()
            plt.close()
        else:
            plt.show()
Ejemplo n.º 52
0
# In[117]:

import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

market = pd.read_csv('C:/Users/USER/Desktop/test/DirectMarketing.csv')
market = market.dropna()

scaler_s = StandardScaler()
market = market[['Salary', 'Children', 'Location', 'AmountSpent']]
used_data_dummy = pd.get_dummies(market, drop_first=True)
data_features_s = pd.DataFrame(scaler_s.fit_transform(used_data_dummy))

clustering = linkage(data_features_s, 'ward')
dn = dendrogram(clustering)

# #### (b) Increase the number of clusters from 2 to 10, compute the silhouette score for each, and plot the number of clusters against the silhouette score. Judging by the silhouette score, what is the most appropriate number of clusters? (7 points)

# In[114]:

si = []
K = range(2, 11)
for k in K:
    cl = fcluster(clustering, k, criterion='maxclust')
    si.append(silhouette_score(data_features_s, cl, metric='euclidean'))
plt.plot(K, si, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette Score')
plt.show()
Ejemplo n.º 53
0
def cluster_distance_mat(dist_mat, names, figsize=(8, 8)):
    """

    Parameters
    ----------
    dist_mat : np.array
        Distance matrix array.
    names : list_like
        Names of ticks for distance matrix
    figsize : tuple
        Size of figure, passed to matplotlib

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure with the dendrogram and the reordered distance-matrix heatmap.

    """
    # Compute and plot first dendrogram.
    fig = plt.figure(figsize=figsize)

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    # condense the square distance matrix so linkage uses it as precomputed
    # distances rather than as raw observations
    from scipy.spatial.distance import squareform
    Y = sch.linkage(squareform(dist_mat, checks=False), method='average')
    Z2 = sch.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Plot distance matrix.
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])

    # reorder matrix
    idx1 = Z2['leaves']
    dist_mat = dist_mat[idx1, :]
    dist_mat = dist_mat[:, idx1]
    names = names[idx1]

    # create figure
    im = axmatrix.matshow(dist_mat,
                          aspect='auto',
                          origin='lower',
                          cmap=plt.cm.Reds,
                          vmin=0,
                          vmax=1)

    # add xtick labels
    axmatrix.set_xticks(range(len(names)))
    axmatrix.set_xticklabels(names, minor=False)
    axmatrix.xaxis.set_label_position('bottom')
    axmatrix.xaxis.tick_bottom()
    plt.xticks(rotation=90, fontsize=8)

    # add ytick labels
    axmatrix.set_yticks(range(len(names)))
    axmatrix.set_yticklabels(names, minor=False)
    axmatrix.yaxis.set_label_position('left')
    axmatrix.yaxis.tick_left()
    plt.yticks(rotation=0, fontsize=8)

    # add colorbar
    axcolor = fig.add_axes([0.94, 0.1, 0.02, 0.6])
    plt.colorbar(im, cax=axcolor)

    return fig
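
# A minimal usage sketch for cluster_distance_mat with synthetic data; the
# random points, the cosine metric, and the output filename below are
# assumptions made only for illustration.
import numpy as np
from scipy.spatial.distance import pdist, squareform

_rng = np.random.default_rng(0)
_points = _rng.random((10, 6))                              # 10 made-up samples
example_dist = squareform(pdist(_points, metric='cosine'))  # square, zero-diagonal
example_names = np.array(['item_{}'.format(i) for i in range(10)])

example_fig = cluster_distance_mat(example_dist, example_names)
example_fig.savefig('distance_matrix_heatmap.png', dpi=150)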
Ejemplo n.º 54
0
    def plot_heatmap(matrix=None,
                     matrixdf=None,
                     main_title=None,
                     output_filename=None,
                     titles=None,
                     ordered=False,
                     font_size=9,
                     font_family='normal',
                     font_weight='normal',
                     figsize=None,
                     tight_layout=None,
                     titles_x=None,
                     titles_y=None,
                     values_on=False,
                     values_on_text=None,
                     vmin=-1.0,
                     vmax=1.0,
                     grid=False,
                     x_label=None,
                     y_label=None,
                     set_yticks=None,
                     set_xticks=None,
                     subplot_adjust=None,
                     colorbar_on=True,
                     **kargs):
        assert (matrix is not None
                or matrixdf is not None), "Give me matrix or matrixdf!"
        if matrix is not None:
            matrixdf = pd.DataFrame(matrix)
        # font = {'family': font_family,
        #         'weight': font_weight,
        #         'size': font_size}
        # matplotlib.rc('font', **font)
        # look at raw data
        #axi = plt.imshow(matrixdf,interpolation='nearest')
        #ax = axi.get_axes()

        #plt.clean_axis(ax)

        # row clusters
        if ordered:
            row_pairwise_dists = squareform(pdist(matrixdf))
            row_clusters = linkage(row_pairwise_dists, method='complete')
            row_dendogram = dendrogram(row_clusters,
                                       no_plot=True,
                                       count_sort='ascending')

        # calculate pairwise distances for columns
        if ordered:
            col_pairwise_dists = squareform(pdist(matrixdf.T))
            col_clusters = linkage(col_pairwise_dists, method='complete')
            col_dendogram = dendrogram(col_clusters,
                                       no_plot=True,
                                       count_sort='ascending')

        # plot the results
        if figsize is not None:
            fig = plt.figure(figsize=figsize)
        else:
            fig = plt.figure()
        #plot_gridspec = gridspec.GridSpec(3,2, wspace=0.05,
        #  hspace=0.05, width_ratios=[0.25,1],height_ratios=[0.25,1,0.25])
        # plot_gridspec = gridspec.GridSpec(5, 5, width_ratios=[0.15, 0.15, 0.2, 0.2, 0.2])
        ### heatmap ###
        heatmap_subplot = fig.add_subplot(111)
        if titles and not titles_x:
            titles_x = titles
        if titles and not titles_y:
            titles_y = titles
        if ordered:
            # .iloc replaces the removed DataFrame.ix for positional indexing
            axi = heatmap_subplot.matshow(matrixdf.iloc[row_dendogram['leaves'],
                                                        col_dendogram['leaves']],
                                          interpolation='nearest',
                                          aspect='auto',
                                          origin='lower',
                                          vmin=vmin,
                                          vmax=vmax,
                                          **kargs)
            if titles_x:
                heatmap_subplot.set_xticklabels(
                    [titles_x[i] for i in col_dendogram['leaves']],
                    rotation=90)
            if titles_y:
                heatmap_subplot.set_yticklabels(
                    [titles_y[i] for i in row_dendogram['leaves']])
        else:
            axi = heatmap_subplot.matshow(matrixdf,
                                          interpolation='nearest',
                                          aspect='auto',
                                          origin='lower',
                                          vmin=vmin,
                                          vmax=vmax,
                                          **kargs)
            if titles_x:
                heatmap_subplot.set_xticklabels(titles_x, rotation=0)
                heatmap_subplot.tick_params(labelbottom=True, labeltop=False)
            if titles_y:
                heatmap_subplot.set_yticklabels(titles_y)
        if set_xticks:
            heatmap_subplot.set_xticks(set_xticks)
        else:
            pass
            # heatmap_subplot.set_xticks(range(len(matrixdf.columns)))
        if set_yticks:
            heatmap_subplot.set_yticks(set_yticks)
        else:
            pass
            # heatmap_subplot.set_yticks(range(len(matrixdf)))
        if colorbar_on:
            plt.colorbar(axi)
        values_on_text_format = '{:s}'
        if values_on_text is None:
            values_on_text = matrixdf
            values_on_text_format = '{:0.2f}'
        if values_on:
            if ordered:
                for (i, j), z in np.ndenumerate(
                        values_on_text.iloc[row_dendogram['leaves'],
                                            col_dendogram['leaves']]):
                    heatmap_subplot.text(j,
                                         i,
                                         values_on_text_format.format(z),
                                         ha='center',
                                         va='center',
                                         weight='medium')
            else:
                for (i, j), z in np.ndenumerate(values_on_text):
                    heatmap_subplot.text(j,
                                         i,
                                         values_on_text_format.format(z),
                                         ha='center',
                                         va='center',
                                         weight='medium')
        if tight_layout is not None:
            if not tight_layout:
                plt.tight_layout()
            else:
                plt.tight_layout(rect=tight_layout)
        if subplot_adjust:
            plt.subplots_adjust(*subplot_adjust)
        if grid:
            plt.grid()
        if x_label:
            plt.xlabel(x_label)
        if y_label:
            plt.ylabel(y_label)
        if main_title:
            plt.suptitle(main_title)
        if output_filename:
            plt.savefig(output_filename)
            #plt.clf()
            plt.close()
Ejemplo n.º 55
0
def plot_heat_dendrogram(Y1,
                         Y2,
                         dist,
                         labels,
                         figname,
                         cmap=pylab.cm.YlGnBu,
                         ratio=0.6):
    """
    This function allows you to compare two clustering method, e.g. centroid vs single,
    @feature is your input feature [nsample, ndim]
    @title is the name of your plot
    @method1/method2, two methods for comparison
    @cmap, color map to use
    """

    Dist_Matrix = squareform(dist)

    # Compute and plot first dendrogram.
    fig = pylab.figure(figsize=(25, 25))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Z1 = dendrogram(Y1,
                    orientation='right',
                    color_threshold=ratio * max(Y1[:, 2]))
    ax1.set_xticks([])
    ax1.set_yticks([])

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Z2 = dendrogram(Y2, color_threshold=ratio * max(Y2[:, 2]))
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Plot distance matrix.
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    Dist_Matrix = Dist_Matrix[idx1, :]
    Dist_Matrix = Dist_Matrix[:, idx2]
    idx1 = labels[Z1['leaves']]
    idx2 = labels[Z2['leaves']]
    im = axmatrix.matshow(Dist_Matrix,
                          aspect='auto',
                          origin='lower',
                          cmap=cmap)

    axmatrix.set_xticks(range(np.shape(Dist_Matrix)[0]))
    axmatrix.set_xticklabels(idx1, minor=False)
    axmatrix.xaxis.set_label_position('bottom')
    axmatrix.xaxis.tick_bottom()

    pylab.xticks(rotation=-90, fontsize=9)

    axmatrix.set_yticks(range(np.shape(Dist_Matrix)[0]))
    axmatrix.set_yticklabels(idx2, minor=False)
    axmatrix.yaxis.set_label_position('right')
    axmatrix.yaxis.tick_right()

    # Plot colorbar
    axcolor = fig.add_axes([0.95, 0.1, 0.02, 0.6])
    pylab.colorbar(im, cax=axcolor)
    if figname:
        fig.savefig(figname)
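
# A hedged usage sketch for plot_heat_dendrogram: compare centroid vs. single
# linkage on made-up points. The data, labels, and filename are assumptions,
# and numpy/scipy imports are repeated here so the sketch stands on its own.
import numpy as np
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist

_rng = np.random.default_rng(0)
_points = _rng.normal(size=(20, 4))            # 20 made-up samples
example_dist = pdist(_points)                  # condensed, as squareform(dist) above expects
example_labels = np.array(['s%d' % i for i in range(20)])

Y_centroid = linkage(example_dist, method='centroid')
Y_single = linkage(example_dist, method='single')
plot_heat_dendrogram(Y_centroid, Y_single, example_dist, example_labels,
                     figname='heat_dendrogram_compare.png')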
Ejemplo n.º 56
0
def cluster(p_y_x, beta, visual):
    # The main clustering function - performs bottom-up clustering using the IB criterion
    # Inputs:
    # p_y_x: Conditional probability p(y|x)
    #    A numpy array of size [N,P]
    # beta: Tradeoff parameter in the IB objective
    #     A scalar
    # visual: Print dendrogram
    #    Boolean value
    #
    # Outputs:
# C: Cluster assignment; an m-partition of X, 1 <= m <= |X|
    #    A numpy array of size [N,1]
    #
    # Objective: Min (1/beta)*I(X,C) - I(Y,C)
    # X: Features at segment-level
    # Y: Relevance variable, typically components from a GMM
    #
    # NOTE: This function ALWAYS creates 2 clusters. Use the fcluster() method to prune the dendrogram
    # variable with the desired criterion. Refer infoBottleneck.py for usage

    print("Performing agglomerative clustering using IB objective...")
    N, P = np.shape(p_y_x)
    np.random.seed(1000)
    p_c = np.empty(N)
    p_y_c = np.empty((N, P))  # p(y|c), NOT p(y,c)
    p_c_x = np.zeros((N, N))
    p_x_c = np.zeros((N, N))
    p_x_y_joint = getJointFromConditional(p_y_x)
    delta_F = np.zeros((N, N))
    N_init = N

    print("Initialization...")
    C = range(N)
    for i in range(N):
        p_c[i] = 1.0 / N
        p_c_x[i, i] = 1.0
        p_x_c[i, i] = 1.0
        for j in range(P):
            p_y_c[i, j] = p_y_x[i, j]

    for i in range(N):
        for j in range(i):
            #            delta_F[i,j] = (p_c[i] + p_c[j])*(JS_div(p_y_c,p_c,i,j,1) - (1/beta)*JS_div(p_x_c,p_c,i,j,2)) # Slower alternative
            delta_F[i,
                    j] = fastverbose_computeDeltaObj(p_y_c[i, :], p_y_c[j, :],
                                                     p_x_c[i, :], p_x_c[j, :],
                                                     p_c, i, j, beta)
        for j in range(i, N):
            delta_F[i, j] = float("inf")

#    print p_y_c
#    print p_c_x
#    print p_x_c

# Clustering
    max_clust_ind = max(C)
    Z = np.empty((max_clust_ind, 4))
    curr_val = 0
    iterIndex = 0
    print("Number of clusters = " + str(N))

    while len(np.unique(C)) > 2:
        if N % 100 == 0:
            print("Number of clusters = " + str(N))

#        print("Performing one iteration of clustering..")
        [i_opt, j_opt] = np.unravel_index(np.argmin(delta_F), delta_F.shape)
        #        print ("Optimal indices: ("+str(i_opt)+","+str(j_opt)+")")
        curr_val += abs(np.min(delta_F))
        Z[iterIndex] = [C[i_opt], C[j_opt], curr_val, 2]
        iterIndex += 1

        # Create temporary variables for storing the new distributions
        C_new = []
        p_c_new = []
        for i in range(N):
            if i != i_opt and i != j_opt:
                C_new.append(C[i])
                p_c_new.append(p_c[i])

        p_y_c_new = np.delete(p_y_c, (i_opt, j_opt), 0)
        p_c_x_new = np.delete(p_c_x, (i_opt, j_opt), 1)
        delta_F = np.delete(np.delete(delta_F, (i_opt, j_opt), 0),
                            (i_opt, j_opt), 1)

        # Update p(y|c)
        C_new.append(max_clust_ind + 1)
        temp1 = np.zeros(P)
        for j in range(P):
            temp1[j] = (p_y_c[i_opt, j] * p_c[i_opt] + p_y_c[j_opt, j] *
                        p_c[j_opt]) / (p_c[i_opt] + p_c[j_opt])
        p_y_c_new = np.vstack((p_y_c_new, temp1))

        # Update p(c|x)
        temp2 = np.zeros(N_init)
        for i in range(N):
            if i != i_opt and i != j_opt:
                temp2[i] = 0
            else:
                temp2[i] = 1
        p_c_x_new = np.concatenate(
            (p_c_x_new, np.reshape(temp2, (len(temp2), 1))), 1)

        # Update p(c)
        p_c_new.append(p_c[i_opt] + p_c[j_opt])
        max_clust_ind += 1
        C = C_new
        p_y_c = p_y_c_new
        p_c_x = p_c_x_new
        p_c = np.asarray(p_c_new)

        # Update p(x|c)
        p_x_c = np.divide(
            p_c_x.T,
            N_init *
            repmat(p_c, N_init, 1).T)  # this should be of shape (N-1,N_init)

        N -= 1
        p_y_c[p_y_c < 10e-10] = 0.
        p_c_x[p_c_x < 10e-10] = 0.
        p_x_c[p_x_c < 10e-10] = 0.
        p_c[p_c < 10e-10] = 0.

        # Update delta_F
        # Add a row
        newrow = np.zeros(N - 1)
        for i in range(N - 1):
            newrow[i] = fastverbose_computeDeltaObj(p_y_c[i, :],
                                                    p_y_c[len(p_c) - 1, :],
                                                    p_x_c[i, :],
                                                    p_x_c[len(p_c) - 1, :],
                                                    p_c, i,
                                                    len(p_c) - 1, beta)
        # Add a column of "inf"
        newcol = float("inf") * np.ones(N)

        delta_F = np.concatenate((np.vstack(
            (delta_F, newrow)), np.reshape(newcol, (len(newcol), 1))), 1)


#        print p_y_c.shape
#        print p_c_x.shape
#        print p_x_c.shape
#        print p_c.shape

#
#        print "p_y_c:"
#        print p_y_c
#        print "p_c_x:"
#        print p_c_x
#        print "p_x_c:"
#        print p_x_c
#        print "p_c:"
#        print p_c

# Complete the dendrogram variable
    max_val = Z[-2, 2]
    Z[-1] = [C[0], C[1], max_val + 0.01, 2]

    # Visualization, not really feasible for large utterances
    if visual == 1:
        plt.figure(figsize=(25, 10))
        dendrogram(Z)
        plt.show()

    return Z, C
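
# A hedged sketch of the pruning step mentioned in the notes above: cluster()
# always merges down to 2 clusters, so fcluster() is applied to the returned
# linkage matrix Z to recover a chosen number of clusters. The synthetic
# p_y_x below is an assumption (in practice it comes from a GMM), and the
# helper functions cluster() relies on are assumed importable as in this module.
import numpy as np
from scipy.cluster.hierarchy import fcluster

_rng = np.random.default_rng(0)
example_p_y_x = _rng.dirichlet(np.ones(8), size=40)      # 40 segments, 8 relevance components
Z_example, C_example = cluster(example_p_y_x, beta=10.0, visual=0)

pruned = fcluster(Z_example, t=5, criterion='maxclust')  # prune the dendrogram to 5 clusters
print(np.unique(pruned))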
# +
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram

Z = linkage(X_train, method="complete", metric="euclidean")
clusters = fcluster(Z, 3, criterion="maxclust")

clusters = pd.Series(clusters).map({1: "r", 2: "b", 3: "y"})

X_train.plot.scatter(x="PetalLength",
                     y="PetalWidth",
                     c=clusters,
                     marker="x",
                     alpha=.5)
# -

dendrogram(Z)

# # Exercises

# **Exercise 1.** Fit a hierarchical clustering model to the Titanic passengers dataset (`https://raw.githubusercontent.com/dlsun/data-science-book/master/data/titanic.csv`). You are free to choose which features to include (but include both categorical and quantitative features) and the linkage function. Then, choose a number of clusters that seems appropriate. Look at the profiles of the passengers in each cluster. Can you come up with an "interpretation" of each cluster based on the passengers in it?

# +
# TYPE YOUR CODE HERE
# -
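
# +
# A possible sketch for Exercise 1, not a definitive answer: the choice of
# features, the ward linkage, and the cut at 4 clusters are assumptions, and
# the column names are assumed to follow the usual titanic3 layout.
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster

titanic = pd.read_csv(
    "https://raw.githubusercontent.com/dlsun/data-science-book/master/data/titanic.csv")
titanic_features = pd.get_dummies(
    titanic[["pclass", "sex", "age", "fare"]].dropna(), drop_first=True).astype(float)
# standardize so no single column dominates the Euclidean distances
titanic_features = (titanic_features - titanic_features.mean()) / titanic_features.std()

Z_titanic = linkage(titanic_features, method="ward")
titanic_clusters = fcluster(Z_titanic, 4, criterion="maxclust")
# per-cluster profiles of the (standardized) features, as a starting point
# for interpreting each cluster
print(titanic_features.groupby(titanic_clusters).mean())
# -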

# **Exercise 2.** The code below reads in the "two moons" dataset, a synthetic dataset that is used to evaluate clustering algorithms. What clusters do you think hierarchical clustering will find if you use single linkage? What if you use average linkage? Once you have a hypothesis for each type of linkage, test out your hypothesis by fitting the model to this dataset and plotting the resulting clusters.

# TYPE YOUR CODE HERE
moons = pd.read_csv(
    "https://raw.githubusercontent.com/dlsun/data-science-book/master/data/two_moons.csv"
)
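
# A hedged sketch for Exercise 2, assuming the first two columns of the file
# are the point coordinates: single linkage tends to follow each moon by
# chaining nearest neighbours, while average linkage tends to cut across them.
from scipy.cluster.hierarchy import linkage, fcluster
import matplotlib.pyplot as plt

moon_coords = moons.iloc[:, :2].values

for moon_method in ["single", "average"]:
    Z_moons = linkage(moon_coords, method=moon_method)
    moon_labels = fcluster(Z_moons, 2, criterion="maxclust")
    plt.figure()
    plt.scatter(moon_coords[:, 0], moon_coords[:, 1], c=moon_labels)
    plt.title("Two moons, {} linkage".format(moon_method))
plt.show()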
#Hierarchical Clustering

#Importing the Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#Importing the dataset
dataset = pd.read_csv('Mall_Customers.csv')
X = dataset.iloc[:, 3:5].values  # it is always better if X is a matrix and not an array

# using the dendrogram to find the optimal number of clusters
import scipy.cluster.hierarchy as sch
dendogram = sch.dendrogram(sch.linkage(
    X, method='ward'))  # ward linkage minimises the variance within the clusters
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean Distance')
plt.show()

#Fitting hierarchical clustering to the mall dataset
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5,
                             affinity='euclidean',
                             linkage='ward')
y_hc = hc.fit_predict(X)

#Visualising the clusters
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Careful')
plt.scatter(X[y_hc == 1, 0],
Ejemplo n.º 59
0
def cluster_dendogram(
        corpus: List[str],
        vectorizer,
        titles: List[str] = None,
        stopwords=get_stopwords,
        cleaning=simple_textcleaning,
        random_samples: float = 0.3,
        ngram: Tuple[int, int] = (1, 3),
        figsize: Tuple[int, int] = (17, 9),
        batch_size: int = 20,
):
    """
    plot hierarchical dendrogram with similar texts.

    Parameters
    ----------

    corpus: List[str]
    vectorizer: class
        vectorizer class.
    titles: List[str], (default=None)
        list of titles, length must same with corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    random_samples: float, (default=0.3)
        random samples from the corpus, 0.3 means 30%.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}
    """

    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')

    validator.validate_object_methods(vectorizer, ['vectorize', 'fit'],
                                      'vectorizer')
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    if not (random_samples < 1 and random_samples > 0):
        raise ValueError('random_samples must be between 0 and 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from scipy.cluster.hierarchy import ward, dendrogram

        sns.set()
    except:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install it and try again.'
        )

    corpus = random.sample(corpus, k=int(random_samples * len(corpus)))

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stopwords]))

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index]))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    dist = 1 - cosine_similarity(transformed_text_clean)
    linkage_matrix = ward(dist)
    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):

            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i].todense())[0])[::-1]
                titles.append(' '.join(
                    [features[i] for i in indices[:ngram[1]]]))
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(' '.join(
                    [i[0] for i in attentions[i][-ngram[1]:]]))
    plt.figure(figsize=figsize)
    ax = dendrogram(linkage_matrix, orientation='right', labels=titles)
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False,
    )
    plt.tight_layout()
    plt.show()
    return {'linkage_matrix': linkage_matrix, 'titles': titles}
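
# A hypothetical usage sketch for cluster_dendogram. Assumptions: malaya's
# helpers used above (validator, get_stopwords, simple_textcleaning) are
# importable in this module, the validator accepts a vectorizer that only
# implements fit/transform/get_feature_names (as sklearn's TfidfVectorizer
# does on versions where get_feature_names still exists), and the corpus
# below is a made-up placeholder.
from sklearn.feature_extraction.text import TfidfVectorizer

example_corpus = [
    'dokumen pertama tentang ekonomi dan kewangan negara',
    'dokumen kedua tentang sukan dan bola sepak tempatan',
    'dokumen ketiga tentang politik dan pilihan raya umum',
] * 10
result = cluster_dendogram(example_corpus, TfidfVectorizer(), random_samples=0.5)
print(result['titles'][:5])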
for novel in novels:
    print ("处理:{}".format(novel))
    with codecs.open('{}.txt'.format(novel), encoding="utf8") as f:
        sentences += [list(jieba.cut(line.strip())) for line in f]

model = gensim.models.Word2Vec(sentences,
                               size=100,
                               window=5,
                               min_count=5,
                               workers=4)
#for k, s in model.most_similar(positive=["牧尘"]):
    #print (k,s)
all_names = np.array(list(filter(lambda c: c in model, novel_names["\ufeff斗破苍穹"])))
word_vectors = np.array(list(map(lambda c: model[c], all_names)))
import scipy.cluster.hierarchy as sch

Y = sch.linkage(word_vectors, method="ward")

_, ax = plt.subplots(figsize=(10, 40))

Z = sch.dendrogram(Y, orientation='right')
idx = Z['leaves']

ax.set_xticks([])
ax.set_yticklabels(all_names[idx],
                  fontproperties=font_yahei_consolas)
ax.set_frame_on(False)

plt.show()

#designed by pwy