Example #1
def snpclust(snp_fn, dist_fn, snpout_fn, clust_fn, thr=0.01):

    # read SNP and distance matrix files
    snp = pd.read_csv(snp_fn, sep=',', index_col=0)
    dist = pd.read_csv(dist_fn, sep=',', index_col=0)

    # hierarchical clustering
    Z = hierarchy.complete(squareform(dist))
    clust_ids = hierarchy.fcluster(Z, t=thr, criterion='distance')

    # compute the cluster representatives and build the cluster dictionary
    clust = dict()
    for ci in np.unique(clust_ids):
        idx = np.where(ci == clust_ids)[0]
        if idx.shape[0] == 1:
            r = dist.index[idx[0]]
            clust[r] = [r]
        else:
            dist_sum = dist.iloc[idx, idx].sum(axis=0)
            clust[dist_sum.idxmin()] = dist.index[idx].tolist()

    # write the SNP output file containing the medoids only
    snp_out = pd.concat((snp["Ref"], snp[list(clust.keys())]), axis=1)
    snp_out.to_csv(snpout_fn, index_label=snp.index.name)

    # write the cluster file
    with open(clust_fn, 'w', newline='') as clust_handle:
        clust_writer = csv.writer(clust_handle,
                                  delimiter='\t',
                                  lineterminator='\n')
        for r, ms in clust.items():
            for m in ms:
                clust_writer.writerow([m, r, "{:.6f}".format(dist.loc[m, r])])
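A minimal usage sketch for the snippet above (file names are hypothetical; assumes the imports the snippet relies on: pandas as pd, numpy as np, csv, scipy.cluster.hierarchy as hierarchy, and scipy.spatial.distance.squareform):

# 'snps.csv': SNP table with a "Ref" column plus one column per sample;
# 'dists.csv': symmetric distance matrix whose labels match those samples.
snpclust("snps.csv", "dists.csv", "snps_medoids.csv", "clusters.tsv", thr=0.01)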
Example #2
def create_hc2(G, t=1.15):
    """Creates a hierarchical clustering of graph G from its distance matrix.
    The return value of this function is an argument to nx.quotient_graph,
    used to create a blockmodel, because nx.blockmodel is not supported by networkx v2.0.
    ----------------------------------------------
    INPUT:
    G, an instantiated networkx graph
    t, the threshold for partition selection, arbitrarily set to t=1.15 by default.
    OUTPUT:
    returns a list of partitions split on the hierarchical clustering"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    dist_matrix = np.zeros((len(G),len(G)))

    for u,p in path_length.items():
        for v,d in p.items():
            dist_matrix[u][v]=d

    # Create hierarchical cluster
    Y = distance.squareform(dist_matrix)

    # Creates HC using farthest point linkage
    Z = hierarchy.complete(Y)

    # This partition selection
    membership = list(hierarchy.fcluster(Z,t=t))

    # Create collection of lists for the blockmodel
    partition = defaultdict(list)
    for n,p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    return list(partition.values())
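A small sketch of how create_hc2 feeds nx.quotient_graph (assumes the imports the snippet uses: networkx as nx, numpy as np, scipy's hierarchy/distance modules, and collections.defaultdict):

G = nx.karate_club_graph()        # integer-labelled graph, as the indexing above expects
parts = create_hc2(G, t=1.15)     # list of node lists, one per flat cluster
BM = nx.quotient_graph(G, parts, relabel=True)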
Example #3
def plotSimpleDendogram(df, linkage):
    """
    Plot a simple dendrogram

    Parameters:
    df (DataFrame): transposed and normalized DataFrame that contains sensor data
    linkage (str): type of linkage, e.g. 'ward', 'average', 'complete', 'single'

    """
    plt.figure(figsize=(18, 10))

    if linkage == 'average':
        linkage_matrix = average(df)
    elif linkage == 'ward':
        linkage_matrix = ward(df)
    elif linkage == 'single':
        linkage_matrix = single(df)
    elif linkage == 'complete':
        linkage_matrix = complete(df)
    else:
        raise ValueError("Unknown linkage: {}".format(linkage))

    dendrogram(linkage_matrix,
               labels=df.index,
               orientation='top',
               distance_sort='descending',
               leaf_rotation=90,
               leaf_font_size=12)
    plt.show()
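A usage sketch under assumed imports (pandas as pd, numpy as np, scipy.cluster.hierarchy's average/ward/single/complete/dendrogram, matplotlib.pyplot as plt); the DataFrame here is hypothetical:

df = pd.DataFrame(np.random.rand(5, 24), index=['s1', 's2', 's3', 's4', 's5'])
plotSimpleDendogram(df, 'complete')   # rows are clustered on Euclidean distance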
Example #4
def create_correlation_tree(corr_matrix, method="average"):
    """ Creates hierarchical clustering (correlation tree)
    from a correlation matrix
    :param corr_matrix: the correlation matrix
    :param method: 'single', 'average', 'fro', or 'complete'
    returns: 'link' of the correlation tree, as in scipy"""

    # Distance matrix for tree method
    if method == "fro":
        dist_matrix = np.around(1 - np.power(corr_matrix, 2), decimals=7)
    else:
        dist_matrix = np.around(1 - np.abs(corr_matrix), decimals=7)
    dist_matrix -= np.diagflat(np.diag(dist_matrix))

    condensed_dist_matrix = ssd.squareform(dist_matrix)

    # Create linkage
    if method == "single":
        link = hierarchy.single(condensed_dist_matrix)
    elif method == "average" or method == "fro":
        link = hierarchy.average(condensed_dist_matrix)
    elif method == "complete":
        link = hierarchy.complete(condensed_dist_matrix)
    else:
        raise ValueError(
            f'Only "single", "complete", "average", "fro" are valid methods, not {method}'
        )

    return link
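A quick sketch with a toy correlation matrix (assumes numpy as np plus the ssd/hierarchy imports the snippet expects):

corr = np.array([[1.0, 0.9, 0.1],
                 [0.9, 1.0, 0.2],
                 [0.1, 0.2, 1.0]])
link = create_correlation_tree(corr, method="average")  # (n-1, 4) scipy linkage matrix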
Example #5
def create_hc(G, t=1.0):
    """
    Creates hierarchical cluster of graph G from distance matrix
    Maksim Tsvetovat ->> Generalized HC pre- and post-processing to work on labelled graphs and return labelled clusters
    The threshold value is now parameterized; useful range should be determined experimentally with each dataset
    """
    """Modified from code by Drew Conway"""

    ## Create a shortest-path distance matrix, while preserving node labels
    labels = list(G.nodes())
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    i = 0
    for u, p in path_length.items():
        j = 0
        for v, d in p.items():
            distances[i][j] = d
            distances[j][i] = d
            if i == j: distances[i][j] = 0
            j += 1
        i += 1

    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(labels[n])
    return list(partition.values())
Example #6
    def _cluster_hierarchically(self, designs, verbose=False):
        import scipy.spatial.distance as sp_dist
        import scipy.cluster.hierarchy as sp_clust
        from itertools import combinations

        num_designs = len(designs)
        if num_designs < 2: return

        dist_matrix = self._get_pairwise_distance_matrix(designs)
        dist_vector = sp_dist.squareform(dist_matrix)
        mean_dist = np.mean(dist_vector)
        hierarchy = sp_clust.complete(dist_vector)
        clusters = sp_clust.fcluster(hierarchy,
                                     mean_dist,
                                     criterion='distance')

        for cluster, design in zip(clusters, designs):
            design.sequence_cluster = cluster

        if verbose:
            import pylab
            print("Made {} clusters.".format(len(set(clusters))))
            pylab.hist(dist_vector, bins=100)
            pylab.axvline(mean_dist)
            pylab.show()
Example #7
def hierarchical_clustering_average(similarity_matrix, linkage_type):
    """
    Hierarchical clustering with a custom distance
    :param similarity_matrix - matrix with the weight function between classes
    :param linkage_type - string representing the type of linkage to be applied
    :return dendrogram from the hierarchical clustering
    """
    if linkage_type == 'average':
        hierarc = hierarchy.average(similarity_matrix)
    elif linkage_type == 'single':
        hierarc = hierarchy.single(similarity_matrix)
    elif linkage_type == 'complete':
        hierarc = hierarchy.complete(similarity_matrix)
    else:
        raise ValueError("Unknown linkage_type: " + linkage_type)

    hierarchy.dendrogram(hierarc,
                         labels=sorted(list(get_all_controller_classes())),
                         distance_sort='descending')

    # Uncomment to see image instead of saving
    #plt.show()

    ###################################################
    # Save the .png file of the dendrogram

    plab.savefig("dendrogram_" + linkage_type + "_2.png",
    plab.savefig("dendrogram_" + linkage_type + "_2.png",
                 format="png",
                 bbox_inches='tight')

    # Closes the open pyplot windows so the dendrograms can be redrawn
    plt.close('all')

    return hierarc, linkage_type
Example #8
def make_modules(dist, min_dist, obs_ids):
    # create linkage matrix using complete linkage
    z = complete(dist)
    # make tree from linkage matrix with names from dist
    tree = TreeNode.from_linkage_matrix(z, obs_ids)
    # get all tips so in the end we can check if we are done
    all_tips = len([i for i in tree.postorder() if i.is_tip()])
    modules = set()
    seen = set()
    dist = pd.DataFrame(squareform(dist), index=obs_ids, columns=obs_ids)
    for node in tree.levelorder():
        if node.is_tip():
            seen.add(node.name)
        else:
            tip_names = frozenset(
                (i.name for i in node.postorder() if i.is_tip()))
            if tip_names.issubset(seen):
                continue
            dists = (dist.loc[tip1, tip2] > min_dist
                     for tip1, tip2 in combinations(tip_names, 2))
            if any(dists):
                continue
            else:
                modules.add(tip_names)
                seen.update(tip_names)
        if len(seen) == all_tips:
            modules = sorted(modules, key=len, reverse=True)
            return modules
    raise ValueError("Well, how did I get here?")
Example #9
def _get_cluster(components, my_inds=None):
    if my_inds is None:
        my_inds = list(components.keys())
    dist = distance.pdist([components[ind] for ind in my_inds])
    hcomp = hierarchy.complete(dist)
    ll = hierarchy.leaves_list(hcomp)
    return ll
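A sketch of what the helper returns (assumes scipy.spatial.distance and scipy.cluster.hierarchy imported as distance/hierarchy, per the snippet):

components = {"a": [0.0, 0.1], "b": [0.9, 1.0], "c": [0.05, 0.12]}
order = _get_cluster(components)  # leaf order, e.g. array([1, 0, 2]): "b" is the outlier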
Example #10
def create_hc(G, t=1.0):
    """
    Creates hierarchical cluster of graph G from distance matrix
    Maksim Tsvetovat ->> Generalized HC pre- and post-processing to work on labelled graphs and return labelled clusters
    The threshold value is now parameterized; useful range should be determined experimentally with each dataset
    """

    """Modified from code by Drew Conway"""
    
    ## Create a shortest-path distance matrix, while preserving node labels
    labels = list(G.nodes())
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    i = 0
    for u, p in path_length.items():
        j = 0
        for v, d in p.items():
            distances[i][j] = d
            distances[j][i] = d
            if i == j: distances[i][j] = 0
            j += 1
        i += 1

    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(labels[n])
    return list(partition.values())
Example #11
def find_clusters(conn_df, max_dist):
    distances = pdist(conn_df[["z", "y", "x"]], "chebyshev")
    linkage = hierarchy.complete(distances)
    fclusters = hierarchy.fcluster(linkage, max_dist, criterion="distance")
    clustered_df = conn_df.copy()
    clustered_df["cluster_id"] = fclusters
    return clustered_df
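A sketch with a hypothetical connector table (assumes pandas as pd plus the pdist/hierarchy imports the snippet uses):

conn_df = pd.DataFrame({"z": [0, 1, 50], "y": [0, 2, 50], "x": [0, 1, 50]})
out = find_clusters(conn_df, max_dist=5)
print(out["cluster_id"].tolist())  # e.g. [1, 1, 2]: the first two points group together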
Example #12
def process_hierarchy(inf, h, method):
    df = pd.read_csv(inf, header=0, index_col=0)
    df = df.fillna(0)
    strains = df.index
    df = 1 - (df / 100)
    df_v = ssd.squareform(
        df, force='tovector',
        checks=False)  # flatten matrix to condensed distance vector
    if method == 'single':
        li = sch.single(df_v)
    elif method == 'complete':
        li = sch.complete(df_v)
    elif method == 'average':
        li = sch.average(df_v)
    elif method == 'weighted':
        li = sch.weighted(df_v)
    else:
        print('\nERROR: Please enter a valid clustering method\n')
        sys.exit()
    hclus = cut_tree(
        li, height=h
    )  # using the height (percent ID as a decimal, for example), cluster OFUs from the dendrogram
    hclus = pd.DataFrame(hclus, index=strains)
    # cut_tree names the first cluster "0"; bump all IDs by +1
    hclus.iloc[:, 0] += 1
    return hclus
Example #13
def complete_dendogram(similarity_matrix, book_names):
    linkage_matrix = complete(
        similarity_matrix
    )  # Define the linkage_matrix using complete linkage on pre-computed distances
    assignments = fcluster(linkage_matrix, 3, depth=5)
    clusters = get_clusters_with_hierarchy(to_tree(linkage_matrix))
    return [assignments, clusters]
Example #14
def hierarchical_cluster(trainx):
    """ See the scipy.cluster.hierarchy documentation for the
    meanings of entries in T.

    The result can be plotted by calling hac.dendrogram(T). """

    T = hac.complete(pdist(trainx.T) + .1)
    return T
Example #15
def hierarchical_cluster(trainx):
    """ See the scipy.cluster.hierarchy documentation for the
    meanings of entries in T.

    The result can be plotted by calling hac.dendrogram(T). """    

    T = hac.complete(pdist(trainx.T) + .1)
    return T
Example #16
 def __init__(self, distance_matrix, labels_out):
     '''
     Constructor
     '''
     z = hac.complete(distance_matrix)
     hac.dendrogram(z, labels=labels_out)
     tree = hac.to_tree(z, False)
     self.nwk = self.getNewick(tree, "", tree.dist, labels_out)
Example #17
def custom_dendrogram(label_type='titles', linkage_method='ward'):
    """
    Plots a dendrogram using cosine similarity
    :param
    label_type: {'titles', 'ids'}
    linkage_method: {'ward', 'average', 'complete'}
    :return: None
    """

    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)

    # Labels
    if label_type == 'titles':
        labels = [
            "(" + book["book_id3"] + ") " + book["title"][:25] +
            ("..." if len(book["title"]) > 25 else "") for book in books
        ]
    else:
        labels = ["(" + book["book_id3"] + ")" for book in books]

    # Create term-document representation
    vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.7, use_idf=True)
    X = vectorizer.fit_transform(documents)

    # Cosine similarity matrix
    dist = 1 - cosine_similarity(X)

    # Define the linkage_matrix using ward clustering pre-computed distances
    if linkage_method == 'ward':
        linkage_matrix = ward(dist)
    elif linkage_method == 'average':
        linkage_matrix = average(dist)
    elif linkage_method == 'complete':
        linkage_matrix = complete(dist)
    else:
        raise Exception("Parameter linkage_method is not recognized!")

    # Calculate metrics

    # Plot dendrogram
    plt.subplots(figsize=(5, 5))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=labels)

    plt.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,  # ticks along the top edge are off
        labelbottom=False)

    print(ax["leaves"])
    print(ax["ivl"])

    # plt.tight_layout()  # show plot with tight layout
    plt.show()
Example #18
def heirarchy_cluster( matrix, threshold_RMSD = 2.0):
	import scipy.cluster.hierarchy as sp_clust

	# Complete linkage clustering
	link_matrix = sp_clust.complete( matrix )
	# Make flat clusters at the tree point where distance threshold is met: default RMSD < 2.0
	clusters 	= sp_clust.fcluster( link_matrix, threshold_RMSD, criterion = 'distance' )

	return clusters
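A sketch on a toy condensed RMSD vector (pdist ordering for four structures; values hypothetical):

import numpy as np
rmsd = np.array([0.5, 3.0, 3.2, 2.9, 3.1, 0.4])   # pairs (0,1),(0,2),(0,3),(1,2),(1,3),(2,3)
print(heirarchy_cluster(rmsd))                     # e.g. [1 1 2 2]: two clusters under RMSD 2.0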
Example #19
 def order_contigs_by_hc(self):
     from scipy.cluster.hierarchy import complete
     from scipy.spatial.distance import squareform
     from scipy.cluster.hierarchy import dendrogram
     g = self.create_contig_graph()
     inverse_edge_weights(g)
     D = squareform(nx.adjacency_matrix(g).todense())
     Z = complete(D)
     return dendrogram(Z)['leaves']
Example #20
 def __get_linkage_method(self, method = LinkageMethod.SINGLE):
     if method == LinkageMethod.SINGLE:
         return single(self.data)
     elif method == LinkageMethod.COMPLETE:
         return complete(self.data)
     elif method == LinkageMethod.AVERAGE:
         return average(self.data)
     else:
         return ward(self.data)
Example #21
def group_tuples(items=None, val_ind=None, dist_thresh = 0.1, distance_matrix=None, 
                 metric='jaccard', linkage='complete', sp_areas=None):
    '''
    items: a dict or list of tuples
    val_ind: the index of the item of interest within each tuple
    '''
    
    if items is None:
        assert distance_matrix is not None, 'distance_matrix must be provided.'
        keys = range(len(distance_matrix))
    elif isinstance(items, dict):
        keys = list(items.keys())
        values = list(items.values())
    elif isinstance(items, list):
        keys = range(len(items))
        if isinstance(items[0], tuple):
            values = [it[val_ind] for it in items]
        else:
            values = items
    else:
        raise Exception('items is not the right type')

    if distance_matrix is None:
        distance_matrix = compute_pairwise_distances(values, metric, sp_areas=sp_areas)

    if linkage == 'complete':
        lk = complete(squareform(distance_matrix))
    elif linkage == 'average':
        lk = average(squareform(distance_matrix))
    elif linkage == 'single':
        lk = single(squareform(distance_matrix))

    # T = fcluster(lk, 1.15, criterion='inconsistent')
    T = fcluster(lk, dist_thresh, criterion='distance')
    
    n_groups = len(set(T))
    groups = [None] * n_groups

    for group_id in range(n_groups):
        groups[group_id] = np.where(T == group_id+1)[0]

    index_groups = [[keys[i] for i in g] for g in groups if len(g) > 0]
    item_groups = [[items[i] for i in g] for g in groups if len(g) > 0]
    
    return index_groups, item_groups, distance_matrix
Example #22
    def order_contigs_by_hc(self):
        from scipy.cluster.hierarchy import complete
        from scipy.spatial.distance import squareform
        from scipy.cluster.hierarchy import dendrogram

        g = self.create_contig_graph()
        inverse_edge_weights(g)
        D = squareform(nx.adjacency_matrix(g).todense())
        Z = complete(D)
        return dendrogram(Z)["leaves"]
Example #23
def klustering():

    from sklearn.metrics.pairwise import cosine_similarity
    from scipy.cluster.hierarchy import ward, dendrogram, single, complete
    import matplotlib.pyplot as plt
    import matplotlib as mpl

    savepkl = get_id('UPLOAD_VEKTOR', session["nama_vektor"], ".pkl")

    with open(savepkl, 'rb') as f:
        tfidf_matrix = pickle.load(f)

    isifile = session["tmp"]
    df = pd.read_csv(isifile,
                     names=['ID', 'Pertanyaan'],
                     sep=';',
                     lineterminator='\r')

    dist = 1 - cosine_similarity(tfidf_matrix)

    if session["linkage_method"] == 'ward':
        linkage_matrix = ward(dist)
    elif session["linkage_method"] == 'single':
        linkage_matrix = single(dist)
    else:
        linkage_matrix = complete(dist)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax = dendrogram(linkage_matrix,
                    orientation="right",
                    labels=df['Pertanyaan'].values.astype('U'))

    plt.tick_params(
        axis='y',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False)

    plt.tight_layout()

    if 'jarak' in request.form:
        plt.axvline(float(request.form['jarak']), color='black')
        t = datetime.datetime.now().time().strftime('%y%m%d%H%M%S')
        fname = ''.join([session['nama_vektor'], "_rev_", t])
        pathfile = get_id('UPLOAD_IMAGE_HIRARKI', fname, '.png')
        plt.savefig(pathfile, dpi=400)

        return jsonify({"link": fname})

    pathfile = get_id('UPLOAD_IMAGE_HIRARKI', session['nama_vektor'], '.png')
    plt.savefig(pathfile, dpi=400)

    return jsonify({'vektor': session['nama_vektor']})
Example #24
def plotDendogramsLineCharts(df_list, n_rows, n_cols, figsize, linkage):
    """
    Plot a simple dendrogram and line chart side by side

    Parameters:
    df_list (list of DataFrames): transposed and normalized DataFrames that contain sensor data
    n_rows (integer): the number of days you want to plot
    n_cols (integer): the number of subplot columns (the code expects 3: dendrogram plus two line charts)
    figsize (tuple): figure size
    linkage (str): type of linkage, e.g. 'ward', 'average', 'complete', 'single'

    """
    #PLOTTING Dendogram and Line Chart
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize)
    fig.subplots_adjust(wspace=.4)
    for ax, df in zip(axes, df_list):

        #PLOTTING DENDOGRAM
        #Compute linkage matrix
        if linkage == 'average':
            linkage_matrix = average(df)
        elif linkage == 'ward':
            linkage_matrix = ward(df)
        elif linkage == 'single':
            linkage_matrix = single(df)
        elif linkage == 'complete':
            linkage_matrix = complete(df)
        #Get dendrogram
        dendrogram(linkage_matrix,
                   ax=ax[0],
                   labels=df.index,
                   orientation='left')
        ax[0].set_title('Dendogram', fontsize='14')

        ##PLOTTING LINE CHART
        #To plot we need to make columns as sensor rows as data points
        df_plot = np.transpose(df)
        #Reset index timestamp to change its data type
        df_plot.reset_index(inplace=True)
        #Change its data type to datatime
        df_plot['Timestamp'] = pd.to_datetime(df_plot['Timestamp'])
        #Get start date and end date
        start_date = str(df_plot['Timestamp'].min())
        end_date = str(df_plot['Timestamp'].max())
        #make it index again
        df_plot.set_index('Timestamp', inplace=True)

        ax[1].plot(df_plot)
        ax[1].set_title('All Sensors\n {} - {}'.format(start_date, end_date),
                        fontsize='14')
        ax[2].plot(df_plot.iloc[:, 2:])
        ax[2].set_title('Temperature & Vibration Sensors\n {} - {}'.format(
            start_date, end_date),
                        fontsize='14')
Example #25
def part2(computedTFIDF, showDendograms=False):
  startTime = time.time()
  runningTotalTime=0

  print("Executing code for Part 2...\n")

  print("Creating and cutting single link clusters...")
  singleCluster = single(computedTFIDF.similarityMatrix)
  singleClusterCut = cut_tree(singleCluster, n_clusters=[i for i in range(0, computedTFIDF.docCount-1)])
  singleClusterTime = round(time.time() - startTime, 3)
  runningTotalTime+=singleClusterTime
  print("Time: " + str(singleClusterTime) + " seconds")

  print("Creating list of single link clusters each document is contained in...")
  finalSingleClustering = singleClusterCut[len(singleClusterCut)-1]
  documentClusters=createDocumentCluster(finalSingleClustering, computedTFIDF)
  singleTrackingTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=singleTrackingTime
  print("Time: " + str(singleTrackingTime) + " seconds")

  print("Writing single link clusters to file...")
  writeToFile(documentClusters, 'single.txt')
  singleWritingTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=singleWritingTime
  print("Time: " + str(singleWritingTime) + " seconds")

  print("Creating and cutting complete link clusters...")
  completeCluster = complete(computedTFIDF.similarityMatrix)
  completeClusterCut = cut_tree(completeCluster, n_clusters=[i for i in range(0, computedTFIDF.docCount-1)])
  completeClusterTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=completeClusterTime
  print("Time: " + str(completeClusterTime) + " seconds")

  print("Creating list of complete link clusters each document is contained in...")
  finalCompleteClustering = completeClusterCut[len(completeClusterCut)-1]
  completeDocumentClusters=createDocumentCluster(finalCompleteClustering, computedTFIDF)
  completeTrackingTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=completeTrackingTime
  print("Time: " + str(completeTrackingTime) + " seconds")

  print("Writing complete link clusters to file...")
  writeToFile(completeDocumentClusters, 'complete.txt')
  completeWritingTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=completeWritingTime
  print("Time: " + str(completeWritingTime) + " seconds")

  if showDendograms:
    displayDendogram(singleCluster, 'Single')
    displayDendogram(completeCluster, 'Complete')

  print('\nPart 2 Complete')
  print("Execution Time: " + str(round(time.time() - startTime, 3)) + " seconds\n")
  
  return documentClusters, completeDocumentClusters
Example #26
def group_clusters(clusters=None,
                   dist_thresh=0.1,
                   distance_matrix=None,
                   metric='jaccard',
                   linkage='complete'):

    if distance_matrix is not None:
        keys = range(len(distance_matrix))
        if clusters is not None:
            values = clusters
        else:
            values = range(len(distance_matrix))
    else:
        if isinstance(clusters, dict):
            keys = list(clusters.keys())
            values = list(clusters.values())
        elif isinstance(clusters, list):
            if isinstance(clusters[0], tuple):
                keys = [i for i, j in clusters]
                values = [j for i, j in clusters]
            else:
                keys = range(len(clusters))
                values = clusters
        else:
            raise Exception('clusters is not the right type')

    if clusters is None:
        assert distance_matrix is not None, 'distance_matrix must be provided.'

    if distance_matrix is None:
        assert clusters is not None, 'clusters must be provided'
        distance_matrix = compute_pairwise_distances(values, metric)

    if linkage == 'complete':
        lk = complete(squareform(distance_matrix))
    elif linkage == 'average':
        lk = average(squareform(distance_matrix))
    elif linkage == 'single':
        lk = single(squareform(distance_matrix))

    # T = fcluster(lk, 1.15, criterion='inconsistent')
    T = fcluster(lk, dist_thresh, criterion='distance')

    n_groups = len(set(T))
    groups = [None] * n_groups

    for group_id in range(n_groups):
        groups[group_id] = np.where(T == group_id + 1)[0]

    index_groups = [[keys[i] for i in g] for g in groups if len(g) > 0]
    res = [[values[i] for i in g] for g in groups if len(g) > 0]

    return index_groups, res, distance_matrix
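A hypothetical call (compute_pairwise_distances is a helper from the surrounding module; complete/average/single, squareform, fcluster and numpy as np are assumed imported as the snippet expects):

clusters = [{1, 2, 3}, {1, 2, 4}, {7, 8}]
idx_groups, grouped, dm = group_clusters(clusters=clusters,
                                         dist_thresh=0.5, metric='jaccard')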
Example #27
def __apply_cluster_alg(cluster_data=[], alg="kmean", prior_cluster_num=2, t=0.155):
    """clustering"""
    if alg == "kmean":
        from scipy.cluster.vq import whiten

        cluster_data = whiten(cluster_data)
        from scipy.cluster.vq import kmeans, vq

        centroids, _ = kmeans(cluster_data, prior_cluster_num, iter=250)
        idx, dist = vq(cluster_data, centroids)
        return idx, prior_cluster_num
    elif alg == "spec":
        from sklearn import cluster
        from sklearn.preprocessing import StandardScaler

        X = cluster_data
        X = StandardScaler().fit_transform(X)
        spectral = cluster.SpectralClustering(n_clusters=prior_cluster_num, eigen_solver="arpack")
        spectral.fit(X)
        import numpy as N

        idx = spectral.labels_.astype(int)
        return idx, prior_cluster_num
    else:
        """hierarchical clustering
		   http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html"""
        import scipy.cluster.hierarchy as hcluster

        """needs distance matrix: 
		   http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html"""
        import scipy.spatial.distance as dist

        distmat = dist.pdist(cluster_data, "minkowski")  #'euclidean')
        if alg == "hflat":
            link = hcluster.linkage(distmat)
        elif alg == "hcomp":
            link = hcluster.complete(distmat)
        elif alg == "hweight":
            link = hcluster.weighted(distmat)
        elif alg == "havg":
            link = hcluster.average(distmat)
        idx = hcluster.fcluster(link, t=t, criterion="distance")
        import numpy as N

        post_cluster_num = len(N.unique(idx))
        print("# of channels established:", post_cluster_num)
        assert post_cluster_num < 64, "number of clusters too large to be biologically meaningful"
        return idx, post_cluster_num
Example #28
def diffCluster(matDist, threshold, labels, clusteringType):
    if clusteringType == 1:
        linkage_matrix = ward(matDist)
    elif clusteringType == 2:
        linkage_matrix = single(matDist)
    elif clusteringType == 3:
        linkage_matrix = complete(matDist)
    elif clusteringType == 4:
        linkage_matrix = average(matDist)
    else:
        return {}
    cluster_labels = fcluster(linkage_matrix, threshold)
    clusters_dict = defaultdict(list)
    for cluster_id, sent in zip(cluster_labels, labels):
        clusters_dict[cluster_id].append(sent)
    return clusters_dict
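A sketch on a condensed distance vector for four sentences (hypothetical values; ward/single/complete/average, fcluster and defaultdict assumed imported as above). Note that fcluster's default criterion, 'inconsistent', applies here:

mat = [0.1, 0.9, 0.8, 0.95, 0.85, 0.05]   # pdist order for 4 items
groups = diffCluster(mat, threshold=0.7,
                     labels=["s1", "s2", "s3", "s4"], clusteringType=3)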
Example #29
    def mock_random_tree(self):

        np.random.seed(0)
        x = np.random.rand(10)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = complete(dm.condensed_form())
        ids = np.arange(len(x)).astype(str)
        tree = TreeNode.from_linkage_matrix(lm, ids)

        # initialize tree with branch length and named internal nodes
        for i, n in enumerate(tree.postorder(include_self=True)):
            n.length = 1
            if not n.is_tip():
                n.name = "y%d" % i

        return tree
Example #30
def hierarchical_clustering(dist_matrix, method='complete'):
    if method == 'complete':
        Z = complete(dist_matrix)
    elif method == 'single':
        Z = single(dist_matrix)
    elif method == 'average':
        Z = average(dist_matrix)
    elif method == 'ward':
        Z = ward(dist_matrix)
    else:
        raise ValueError(f"Unknown method: {method}")

    fig = plt.figure(figsize=(20, 20))
    dn = dendrogram(Z)
    plt.title(f"Dendrogram for {method}-linkage with correlation distance")
    plt.show()

    return Z
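A sketch pairing the function above with a correlation distance, as its plot title suggests (assumes numpy as np and scipy.spatial.distance.pdist alongside the imports the snippet already uses):

X = np.random.rand(6, 30)                      # 6 series, 30 samples each
d = pdist(X, metric="correlation")             # condensed correlation distances
Z = hierarchical_clustering(d, method="complete")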
Example #31
def hierarchical_clustering(distance_matrix, method):
    if method == 'complete':
        Z = complete(distance_matrix)
    elif method == 'single':
        Z = single(distance_matrix)
    elif method == 'average':
        Z = average(distance_matrix)
    elif method == 'ward':
        Z = ward(distance_matrix)
    else:
        raise ValueError(f"Unknown method: {method}")

    # fig = plt.figure(figsize=(16, 8))
    # dn = dendrogram(Z)
    # plt.title(f"Dendrogram for {method}-linkage with dtw distance")
    # plt.show()

    return Z
Example #32
def pc():
    import random
    Y = [random.randint(1, 30) for x in range(10)]

    print(distance.squareform(Y))
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    print(Z)
    membership = list(hierarchy.fcluster(Z, 1))
    # Create collection of lists for blockmodel
    print('membership', membership)
    partition = defaultdict(list)
    for n, p in zip(list(range(10)), membership):
        partition[p].append(n)
    return list(partition.values())
Example #33
 def group(cls,
           project,
           results,
           threshold=0.0,
           use_single_linkage_by_default=False):
     """
     Returns dict with groups, where key are group index and values are list of results indices
     :param project: Project object used for getting matching score function parameters
     :param results: list of results
     :param threshold: the comparison between 2 results should be above this threshold to join them in one group
     :param use_single_linkage_by_default:
     :return: {0: [1,2,3], 1: [4,5], 2: [6]}
     """
     num_results = len(results)
     if num_results == 0:
         raise ValueError(
             f'Can\'t group empty results for project {project}')
     if num_results == 1:
         return {0: [0]}
     result_sim = np.zeros((num_results, num_results), dtype=np.float64)
     for i in range(num_results):
         for j in range(i + 1, num_results):
             result_sim[i, j] = Metrics.apply(project,
                                              results[i],
                                              results[j],
                                              symmetric=True)
     result_sim = result_sim + result_sim.T
     margin = 1.01 * np.max(result_sim)
     dists = margin - squareform(result_sim)
     if project.agreement_method == project.SINGLE or not project.agreement_method:
         linkage_matrix = single(dists)
     elif project.agreement_method == project.COMPLETE:
         linkage_matrix = complete(dists)
     else:
         if use_single_linkage_by_default:
             linkage_matrix = single(dists)
         else:
             raise ValueError(
                 f'Unknown agreement method {project.agreement_method}')
     clusters = fcluster(linkage_matrix,
                         t=margin - threshold,
                         criterion='distance')
     groups = defaultdict(list)
     for i, cluster_idx in enumerate(clusters):
         groups[cluster_idx].append(i)
     return groups
Example #34
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    return list(partition.values())
Example #35
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    return list(partition.values())
Example #36
def consensus_clustering(consensus, n_components=5):
    """
    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: cells x 1 labels
    """
    print('SC3 agglomerative hierarchical clustering.')
    # condensed distance matrix
    cdm = dist.pdist(consensus)
    # hierarchical clustering (SC3: complete agglomeration + cutree)
    hclust = spc.complete(cdm)
    cutree = spc.cut_tree(hclust, n_clusters=n_components)
    labels = cutree.reshape(consensus.shape[0])
    # Below is the hclust code for the older version, fyi
    # hclust = spc.linkage(cdm)
    # labels = spc.fcluster(hclust, n_components, criterion='maxclust')
    return labels, dist.squareform(cdm)
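A sketch on a toy consensus matrix (assumes numpy as np, scipy.spatial.distance as dist and scipy.cluster.hierarchy as spc, matching the snippet):

consensus = np.random.rand(20, 20)             # hypothetical cells x cells matrix
labels, dm = consensus_clustering(consensus, n_components=5)
print(labels.shape, dm.shape)                  # (20,) (20, 20)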
Example #37
def consensus_clustering_here(consensus, n_components=5):
    """
    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: cells x 1 labels
    """
    # print('SC3 agglomerative hierarchical clustering.')
    # condensed distance matrix
    cdm = dist.pdist(consensus)
    # hierarchical clustering (SC3: complete agglomeration + cutree)
    hclust = spc.complete(cdm)
    cutree = spc.cut_tree(hclust, n_clusters=n_components)
    labels = cutree.reshape(consensus.shape[0])
    # Below is the hclust code for the older version, fyi
    # hclust = spc.linkage(cdm)
    # labels = spc.fcluster(hclust, n_components, criterion='maxclust')
    return labels
Example #38
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))

    # l1 = sorted(path_length.items(),key=lambda x: x[0])
    # for u, p in l1:
    #     l2 = sorted(p.items(),key=lambda x: x[0])
    #     for v, d in l2:
    #         x = getIndexOfTuple(l1, 0, u)
    #         y = getIndexOfTuple(l2, 0, v)
    #         distances[x][y] = d
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d

    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    hierarchy.dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
    )
    plt.show()

    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))

    # Create collection of lists for blockmodel
    partition = defaultdict(list)

    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)

    # [0, 179, 305]
    # print "Clustering [0, 179, 305]"
    # print l1[0][0], l1[179][0], l1[305][0]

    return list(partition.values())
Example #39
def blockmodel_output(G, t=1.15):
    # Makes life easier to have consecutively labeled integer nodes
    H = nx.convert_node_labels_to_integers(G, label_attribute='label')
    """Creates hierarchical cluster of graph G from distance matrix"""
    # Create distance matrix
    path_length = dict(nx.all_pairs_shortest_path_length(H))
    distances = np.zeros((len(H), len(H)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create collection of lists for blockmodel
    partitions = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partitions[p].append(n)

    # Build blockmodel graph
    #BM = nx.blockmodel(H, partitions) # change in nx 2.0
    p_values = list(partitions.values())
    BM = nx.quotient_graph(H, p_values, relabel=True)

    label_dict = dict([(n, H.nodes[n]['label']) for n in H])
    order = [label_dict[item] for sublist in p_values for item in sublist]
    nm = nx.to_pandas_adjacency(G)
    nm = nm.reindex(index=order)
    nm.columns = nm.index

    ho = homophily(G, 'type')

    output = {
        'G': G,
        'H': H,
        'partitions': partitions,
        'BM': BM,
        'nm': nm,
        'label_dict': label_dict,
        'order': order,
        'distances': distances
    }
    output.update(ho)
    return output
Example #40
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances = numpy.zeros((len(G), len(G)))

    # l1 = sorted(path_length.items(),key=lambda x: x[0])
    # for u, p in l1:
    #     l2 = sorted(p.items(),key=lambda x: x[0])
    #     for v, d in l2:
    #         x = getIndexOfTuple(l1, 0, u)
    #         y = getIndexOfTuple(l2, 0, v)
    #         distances[x][y] = d
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d

    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage

    plt.figure(figsize=(25, 10))
    plt.title("Hierarchical Clustering Dendrogram")
    plt.xlabel("sample index")
    plt.ylabel("distance")
    hierarchy.dendrogram(
        Z, leaf_rotation=90.0, leaf_font_size=8.0  # rotates the x axis labels  # font size for the x axis labels
    )
    plt.show()

    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))

    # Create collection of lists for blockmodel
    partition = defaultdict(list)

    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)

    # [0, 179, 305]
    # print "Clustering [0, 179, 305]"
    # print l1[0][0], l1[179][0], l1[305][0]

    return list(partition.values())
Example #41
def cluster(distances):
  names = list(set(itertools.chain.from_iterable(distances)))
  mat = numpy.zeros((len(names), len(names)))

  for i, name_i in enumerate(names):
    for j, name_j in enumerate(names):
      mat[i][j] = distances.get((name_i, name_j)) or distances.get((name_j, name_i)) or 0

  condensed = distance.squareform(mat)
  linkage_matrix = hierarchy.complete(condensed)
  leaves_dict = {}
  traverse_tree(hierarchy.to_tree(linkage_matrix), leaves_dict)
  print(leaves_dict)

  with contextlib.closing(sqlite3.connect('voynich.db')) as connection:
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS clusters(name, clusterid)')
    cursor.execute('DELETE FROM clusters')
    for key, values in leaves_dict.items():
      for clusterid in values:
        cursor.execute('INSERT INTO clusters(name, clusterid) VALUES (?, ?)', 
          (names[key], clusterid))
    connection.commit()
Example #42
def do_clustering(types, max_clust):
    """
    Helper method for clustering that takes a list of all of the things being
    clustered (which are assumed to be binary numbers represented as strings),
    and an int representing the maximum number of clusters that are allowed.

    Returns: A dictionary mapping cluster ids to lists of numbers that are part
    of that cluster.
    """
    #Fill in leading zeros to make all numbers same length.
    ls = [list(t[t.find("b")+1:]) for t in types]
    prepend_zeros_to_lists(ls)

    dist_matrix = pdist(ls, weighted_hamming)
    clusters = hierarchicalcluster.complete(dist_matrix)
    clusters = hierarchicalcluster.fcluster(clusters, max_clust, \
                                            criterion="maxclust")

    #Group members of each cluster together
    cluster_dict = dict((c, []) for c in set(clusters))
    for i in range(len(types)):
        cluster_dict[clusters[i]].append(types[i])

    return cluster_dict
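A hypothetical call (weighted_hamming and prepend_zeros_to_lists are helpers from the surrounding module; pdist and the hierarchicalcluster alias are assumed imported as above):

types = ["0b101", "0b100", "0b11"]
print(do_clustering(types, max_clust=2))   # e.g. {1: ['0b101', '0b100'], 2: ['0b11']}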
Example #43
	def CalculateClusterTree(self):
		fullMatrix = self.GenerateFullMatrix(self.results)
		dissMatrix = []
		labels = list(fullMatrix.keys())
		for i in range(0, len(labels)):
			sampleNameI = labels[i]
			for j in range(i+1, len(labels)):
				sampleNameJ = labels[j]
				dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])
				
		# calculate hierarchical cluster tree
		if self.radioSingleLinkage.GetValue():
			linkageMatrix = single(dissMatrix)
		elif self.radioUPGMA.GetValue():
			linkageMatrix = average(dissMatrix)
		elif self.radioCompleteLinkage.GetValue():
			linkageMatrix = complete(dissMatrix)
		elif self.radioWeighted.GetValue():
			linkageMatrix = weighted(dissMatrix)
			
		root = to_tree(linkageMatrix)
		
		# create Newick string
		return self.CreateNewickString(root, labels) + ';'
Example #44
    #order = np.argsort(height, kind='mergesort')
    #a = a[order]
    #b = b[order]
    #height = height[order]
    if 1:
        import pylab as pl
        children = np.c_[a, b].astype(int)
        from sklearn.cluster.hierarchical import _hc_cut, ward_tree
        labels = _hc_cut(n_clusters=4, children=children, n_leaves=N)
        pl.figure(1)
        pl.clf()
        pl.scatter(X[:, 0], X[:, 1], c=labels, cmap=pl.cm.spectral)
        pl.title('Complete linkage')
    if 1:
        from scipy.cluster import hierarchy
        children_s = hierarchy.complete(X)[:, :2].astype(int)
        labels_s = _hc_cut(n_clusters=4, children=children_s, n_leaves=N)
        import pylab as pl
        pl.figure(0)
        pl.clf()
        pl.scatter(X[:, 0], X[:, 1], c=labels_s, cmap=pl.cm.spectral)
        pl.title('Complete linkage (scipy)')
    if 0:
        pl.figure(2)
        pl.clf()
        children_w, _, _ = ward_tree(X)
        labels_w = _hc_cut(n_clusters=4, children=children_w, n_leaves=N)
        pl.scatter(X[:, 0], X[:, 1], c=labels_w, cmap=pl.cm.spectral)
        pl.title('Ward')
        pl.show()
Example #45
hclust_model = cluster.AgglomerativeClustering(n_clusters = 2, linkage = 'average')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))

hclust_model = cluster.AgglomerativeClustering(n_clusters = 2, linkage = 'complete')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))


print('''
*********************************************************************************************************************
                                 scipy: dendrogram
*********************************************************************************************************************
''')

# from: https://github.com/JWarmenhoven/ISLR-python/blob/master/Notebooks/Chapter%2010.ipynb

fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))

for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)], ['c1','c2','c3'],
                                [ax1,ax2,ax3]):
    cluster = hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage')

plt.show()

Example #46
    # entries of distance matrix
    AB = ds.cdtw(query, zchild[100+index], window, True)
    AC = np.array(dist).min()
    BC = ds.cdtw(zchild[100+best], zchild[100+index], window, True)
    
    print(index, best, AC, AB)
    
    # distance matrix
    M = np.array([[0, AB, AC], [AB, 0, BC], [AC, BC, 0]])
    
    # label function
    L = lambda x: {0: "P", 1: "C", 2: "L"}[int(x)]
    
    # render dendrogram
    D = h.dendrogram(h.complete(M), orientation="left", leaf_label_func=L, 
                     link_color_func=lambda k: "b", leaf_font_size=40)
    
    # adjust clipping
    pl.axis((-2**10-200, np.max(D["dcoord"])*1.2, 0, 30))
    
    # colors for time signals
    C = {"P": "b", "C": "b", "L": "r"}
    
    # list of signals
    signals = {"P": query, "C": zchild[100+index], "L": zchild[100+best]}

    # plot signals
    for offset, label in enumerate(D["ivl"]):
        pl.plot(range(-2**10-100, -100), 
                signals[label]+offset*10+5, c=C[label])
Example #47
    # prepare the data
    niks, cols, data, rec = model.get_data("%s%s.csv" % (worker.CSV_PATH, options.filename))
    logging.info("Prepared %d players and %d columns" % (len(niks), len(cols)))

    logging.info("\nFirstStep")

    logging.info("Distance euclidean")
    start = time.time()
    euclid_data = pdist(data, 'euclidean')
    logging.info("Time: %s" % (time.time() - start))

    logging.info("Clustering start")
    start = time.time()
    Z = hierarchy.complete(euclid_data)
    worker.hierarchy_draw(Z, niks, 'study_complete_euclid', 0.4)
    logging.info("Time complete: %s" % (time.time() - start))

    start = time.time()
    Z = hierarchy.average(euclid_data)
    worker.hierarchy_draw(Z, niks, 'study_average_euclid', 0.25)
    logging.info("Time average: %s" % (time.time() - start))

    start = time.time()
    Z = hierarchy.weighted(euclid_data)
    worker.hierarchy_draw(Z, niks, 'study_weighted_euclid', 0.25)
    logging.info("Time weighted: %s" % (time.time() - start))


Example #48
def compute_stability_fold(samples, train, test, method='ward',
                           max_k=None, stack=False,
                           stability=True, cv_likelihood=False,
                           corr_score=None,
                           ground_truth=None, n_neighbors=1,  **kwargs):
    """
    General function to compute the stability on a cross-validation fold.
    
    Parameters:
    -----------
        samples : list of arrays
            List of arrays containing the samples to cluster, each
            array has shape (n_samples, n_features) in PyMVPA terminology.
            We are clustering the features, i.e., the nodes.
        train : list or array
            Indices for the training set.
        test : list or array
            Indices for the test set.
        method : {'complete', 'gmm', 'kmeans', 'ward'}
            Clustering method to use. Default is 'ward'.
        max_k : int or None
            Maximum k to compute the stability testing, starting from 2. By
            default it will compute up to the maximum possible k, i.e.,
            the number of points.
        stack : bool
            Whether to stack or average the datasets. Default is False,
            meaning that the datasets are averaged by default.
        stability : bool
            Whether to compute the stability measure described in Lange et
            al., 2004. Default is True.
        cv_likelihood : bool
            Whether to compute the cross-validated likelihood for mixture
            model; only valid if 'gmm' method is used. Default is False.
        corr_score : {'pearson','spearman'} or None
            Whether to compute the specified type of correlation score. 
            Default is None.
        ground_truth : array or None
            Array containing the ground truth of the clustering of the data,
            useful to compare stability against ground truth for simulations.
        n_neighbors : int
            Number of neighbors to use to predict clustering solution on
            test set using K-nearest neighbors. Currently used only for
            methods `complete` and `ward`. Default is 1.
        kwargs : optional
            Keyword arguments being passed to the clustering method (only for
            'ward' and 'gmm').
    
    Returns:
    --------
        ks : array
            A (max_k-1,) array, where ks[i] is the `k` of the clustering
            solution for iteration `i`.
        ari : array
            A (max_k-1,) array, where ari[i] is the Adjusted Rand Index of the
            predicted clustering solution on the test set and the actual
            clustering solution of the test set for `k` of ks[i].
        ami : array
            A (max_k-1,) array, where ari[i] is the Adjusted Mutual
            Information of the predicted clustering solution on the test set
            and the actual clustering solution of the test set for
            `k` of ks[i].
        stab : array or None
            A (max_k-1,) array, where stab[i] is the stability measure
            described in Lange et al., 2004 for `k` of ks[i]. Note that this
            measure is the un-normalized one. It will be normalized later in
            the process.
        likelihood : array or None
            If method is 'gmm' and cv_likelihood is True, a
            (max_k-1,) array, where likelihood[i] is the cross-validated
            likelihood of the GMM clustering solution for `k` of ks[i].
            Otherwise returns None.
        ari_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where ari_gt[i]
            is the Adjusted Rand Index of the predicted clustering solution on
            the test set for `k` of ks[i] and the ground truth clusters of the
            data.
            Otherwise returns None.
        ami_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where ami_gt[i]
            is the Adjusted Mutual Information of the predicted clustering
            solution on the test set for `k` of ks[i] and the ground truth
            clusters of the data.
            Otherwise returns None.
        stab_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where stab_gt[i]
            is the stability measure of the predicted clustering
            solution on the test set for `k` of ks[i] and the ground truth
            clusters of the data.
            Otherwise returns None.
        corr : array or None
            Average correlation for each fold. TODO
        corr_gt : array or None
            Avg correlation against GT. TODO
    """
    if method not in AVAILABLE_METHODS:
        raise ValueError('Method {0} not implemented'.format(method))

    if cv_likelihood and method != 'gmm':
        raise ValueError(
            "Cross-validated likelihood is only available for 'gmm' method")

    # if max_k is None, set max_k to maximum value
    if not max_k:
        max_k = samples[0].shape[1]

    # preallocate arrays for results
    ks = np.zeros(max_k-1, dtype=int)
    ari = np.zeros(max_k-1)
    ami = np.zeros(max_k-1)
    if stability:
        stab = np.zeros(max_k-1)
    if cv_likelihood:
        likelihood = np.zeros(max_k-1)
    if corr_score is not None:
        corr = np.zeros(max_k-1)
    if ground_truth is not None:
        ari_gt = np.zeros(max_k-1)
        ami_gt = np.zeros(max_k-1)
        if stability:
            stab_gt = np.zeros(max_k-1)
        if corr_score is not None:
            corr_gt = np.zeros(max_k-1)

    # get training and test
    train_set = [samples[x] for x in train]
    test_set = [samples[x] for x in test]
    
    if stack:
        train_ds = np.vstack(train_set)
        test_ds = np.vstack(test_set)
    else:
        train_ds = np.mean(np.dstack(train_set), axis=2)
        test_ds = np.mean(np.dstack(test_set), axis=2)

    # compute clustering on training set
    if method == 'complete':
        train_ds_dist = pdist(train_ds.T, metric='correlation')
        test_ds_dist = pdist(test_ds.T, metric='correlation')
        # I'm computing the full tree and then cutting
        # afterwards to speed computation
        Y_train = complete(train_ds_dist)
        # same on testing set
        Y_test = complete(test_ds_dist)
    elif method == 'ward':
        (children_train, n_comp_train, 
         n_leaves_train, parents_train) = ward_tree(train_ds.T, **kwargs)
        # same on testing set
        (children_test, n_comp_test, 
         n_leaves_test, parents_test) = ward_tree(test_ds.T, **kwargs)
    elif method == 'gmm' or method == 'kmeans':
        pass  # we'll have to run it for each k
    else:
        raise ValueError("We shouldn't get here")

    for i_k, k in enumerate(range(2, max_k+1)):
        if method == 'complete':
            # cut the tree with right K for both train and test
            train_label = cut_tree_scipy(Y_train, k)
            test_label = cut_tree_scipy(Y_test, k)
            # train a classifier on this clustering
            knn = KNeighborsClassifier(#algorithm='brute',
            # metric='correlation',
                                       n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            # predict the clusters in the test set
            prediction_label = knn.predict(test_ds.T)
        elif method == 'ward':
            # cut the tree with right K for both train and test
            train_label = _hc_cut(k, children_train, n_leaves_train)
            test_label = _hc_cut(k, children_test, n_leaves_test)
            # train a classifier on this clustering
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            # predict the clusters in the test set
            prediction_label = knn.predict(test_ds.T)
        elif method == 'gmm':
            gmm = GMM(n_components=k, **kwargs)
            # fit on train and predict test
            gmm.fit(train_ds.T)
            prediction_label = gmm.predict(test_ds.T)
            if cv_likelihood:
                log_prob = np.sum(gmm.score(test_ds.T))
            # fit on test and get labels
            gmm.fit(test_ds.T)
            test_label = gmm.predict(test_ds.T)
        elif method == 'kmeans':
            kmeans = KMeans(n_clusters=k)
            # fit on train and predict test
            kmeans.fit(train_ds.T)
            prediction_label = kmeans.predict(test_ds.T)
            # fit on test and get labels
            kmeans.fit(test_ds.T)
            test_label = kmeans.predict(test_ds.T)
        else:
            raise ValueError("We shouldn't get here")
            
        # append results
        ks[i_k] = k
        ari[i_k] = adjusted_rand_score(prediction_label, test_label)
        ami[i_k] = adjusted_mutual_info_score(prediction_label, test_label)
        if stability:
            stab[i_k] = stability_score(prediction_label, test_label, k)
        if cv_likelihood:
            # log_prob is only computed when method == 'gmm'
            likelihood[i_k] = log_prob
        if corr_score is not None:
            corr[i_k] = correlation_score(prediction_label, test_label,
                                          test_ds, corr_score)
        if ground_truth is not None:
            ari_gt[i_k] = adjusted_rand_score(prediction_label, ground_truth)
            ami_gt[i_k] = adjusted_mutual_info_score(prediction_label,
                                                     ground_truth)
            if stability:
                stab_gt[i_k] = stability_score(prediction_label,
                                               ground_truth, k)
            if corr_score is not None:
                corr_gt[i_k] = correlation_score(prediction_label,
                                                 ground_truth,
                                                 test_ds, corr_score)

    results = [ks, ari, ami]
    if stability:
        results.append(stab)
    else:
        results.append(None)
    if cv_likelihood:
        results.append(likelihood)
    else:
        results.append(None)

    if ground_truth is not None:
        results += [ari_gt, ami_gt]
    else:
        results += [None, None]

    if stability and ground_truth is not None:
        results.append(stab_gt)
    else:
        results.append(None)

    if corr_score is not None:
        results.append(corr)
    else:
        results.append(None)

    if corr_score is not None and ground_truth is not None:
        results.append(corr_gt)
    else:
        results.append(None)

    return results
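
For reference, a minimal sketch (hypothetical caller) of unpacking the positional results list assembled above; entries are None when the corresponding option was not requested:

    # hypothetical caller for the function above; the 10-element
    # layout follows the append order in the code
    (ks, ari, ami, stab, likelihood,
     ari_gt, ami_gt, stab_gt, corr, corr_gt) = results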
Ejemplo n.º 49
0
def _run_complete(data, metric="correlation"):
    """Just to allow caching"""
    return complete(pdist(data, metric=metric))
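
The docstring hints that this wrapper exists so the linkage computation can be memoized; a minimal sketch of one way to do that, assuming joblib is available (the cache directory is illustrative):

    from joblib import Memory

    memory = Memory('/tmp/linkage_cache', verbose=0)  # illustrative location
    run_complete_cached = memory.cache(_run_complete)
    # repeated calls with identical data now hit the on-disk cache:
    # Z = run_complete_cached(data)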
Ejemplo n.º 50
0
    def finalize_learning(self, grouping_method='AHC', spatial_pooler=None):
        """Finalize learning in the following steps:
         1. Remove rare coincidences (done in SpatialPooler)
         2. Compute coincidence priors
         3. Make T symmetric
         4. Normalize T by rows
         5. Temporal grouping
         6. Compute PCG
         """

        def add_to_temporal_group(c_id, g_id=None):
            """Add coincidence to a new or to an existing temporal group.

            Args:
                c_id: coincidence index
                g_id: existing temporal group index

            Returns:
                group id if creating a new one
            """
            if c_id not in nonassigned_coincidences:
                return
            nonassigned_coincidences.remove(c_id)
            if g_id is None:
                self.temporal_groups[len(self.temporal_groups)] = [c_id]
                return len(self.temporal_groups) - 1
            else:
                if (len(self.temporal_groups[g_id]) < self.group_max_size):
                    self.temporal_groups[g_id].append(c_id)

        # 2. Compute coincidence priors
        self.coincidence_prior = dict()
        count_sum = float(sum(self.coincidences_stats))
        for c_id, count in enumerate(self.coincidences_stats):
            self.coincidence_prior[c_id] = count / count_sum

        # visualize.show_image(np.asarray(self.coincidence_prior.values()).reshape(10,20))

        # 3. Make T symmetric
        if self.symmetrizeTAM:
            self.TAM = utils.symmetrize(self.TAM.T)

        # zero-out the diagonal
        for i in range(self.TAM.shape[0]):
            self.TAM[i, i] = 0

        # 4. Normalize T
        # (the original row-wise normalization, kept for reference):
        # for i in xrange(self.TAM.shape[0]):
        #     for j in xrange(self.TAM.shape[1]):
        #         if self.TAM[i].sum() > 0:
        #             self.TAM[i, j] /= float(self.TAM[i].sum())

        # normalize by rows and columns: divide each entry by the geometric
        # mean of its row maximum and column maximum
        row_max = self.TAM.max(axis=1).reshape((self.TAM.shape[0], 1))
        col_max = self.TAM.max(axis=0).reshape((self.TAM.shape[1], 1))
        self.TAM = np.nan_to_num(np.divide(self.TAM, np.sqrt(np.dot(row_max, col_max.T))))

#        visualize.show_matrix(self.TAM)

        # 5. Temporal grouping
        if grouping_method == "AHC":
            # AHC algorithm
            # http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
            # http://math.stanford.edu/~muellner/fastcluster.html
            import scipy.cluster.hierarchy as hier
            from scipy.spatial.distance import squareform

            # AHC needs a condensed distance matrix, so convert the
            # similarity matrix to distances and condense it
            TAM_dist = squareform(1 - self.TAM, checks=False)

            # Z = hier.average(TAM_dist)
            Z = hier.complete(TAM_dist)
            # Z = hier.weighted(TAM_dist)
            # Z = hier.centroid(TAM_dist)
            t = self.requested_group_count
            T = hier.fcluster(Z, t, criterion='maxclust')

            # T holds a 1-based group index for each coincidence;
            # build the temporal groups from it
            for c_id, g_id in enumerate(T):
                g_id = g_id - 1
                if g_id not in self.temporal_groups:
                    self.temporal_groups[g_id] = [c_id]
                else:
                    self.temporal_groups[g_id].append(c_id)

        elif grouping_method == "Numenta":  # greedy algorithm
            nonassigned_coincidences = list(range(len(self.coincidences_stats)))  # ids of coincidences

            while len(nonassigned_coincidences) > 0:
                # 5.1 Select the non-assigned coincidence c_i with the highest
                # temporal connection TC and add it to a new temporal group g_k.
                htc = -1  # highest temporal connection value
                htc_id = None  # id of the coincidence
                for i in nonassigned_coincidences:
                    if self.TAM[i].max() > htc:
                        htc = self.TAM[i].max()
                        htc_id = i
                assert(htc_id is not None)
                # add selected coincidence to a new temporal group
                g_id = add_to_temporal_group(htc_id)

                # 5.2 Pick at most topNeighbors non-assigned coincidences with the
                # highest temporal connection and pool them to the same group g_k
                j = 0
                tmp = dict()
                while len(self.temporal_groups[g_id]) < self.group_max_size and \
                        len(nonassigned_coincidences) > 0:
                    if j >= len(self.temporal_groups[g_id]):
                        break
                    htc_id = self.temporal_groups[g_id][j]
                    tmp.clear()
                    for k in range(self.TAM.shape[1]):
                        tmp[k] = self.TAM[htc_id, k]  # dict(c_id => temporal connection value)
                    del tmp[htc_id]  # remove previously selected c_id
                    sorted_tmp = sorted(tmp.items(), key=itemgetter(1), reverse=True)[0:self.top_neighbors]
                    for c_id, tc in sorted_tmp:
                        add_to_temporal_group(c_id, g_id)
                    j += 1

        # 5.3 Purge the garbage group
        # (taken here to be the largest group; TODO use a better metric)
        if self.purge_garbage and spatial_pooler is not None:
            # find the largest temporal group
            garbage_id = 0
            max_len = 0
            for g_id in self.temporal_groups.keys():
                if len(self.temporal_groups[g_id]) > max_len:
                    max_len = len(self.temporal_groups[g_id])
                    garbage_id = g_id

            # delete all the coincidences in that group
            spatial_pooler.coincidences = utils.multi_delete(spatial_pooler.coincidences, self.temporal_groups[garbage_id])
            self.coincidences_count = len(spatial_pooler.coincidences)
            spatial_pooler.coincidences_matrix = np.vstack(spatial_pooler.coincidences.values())

            # delete the temporal group and change the indices so that they are
            # continuous
            del self.temporal_groups[garbage_id]
            for i in range(garbage_id, len(self.temporal_groups)):
                self.temporal_groups[i] = self.temporal_groups[i + 1]
            del self.temporal_groups[len(self.temporal_groups) - 1]  # delete the last

            count = 0
            for g in self.temporal_groups.values():
                count += len(g)
            assert(count == self.coincidences_count)

        # 6. Compute PCG
        # 6.1
        self.PCG = np.zeros((self.coincidences_count, len(self.temporal_groups)))
        # for i in self.coincidences_stats.keys():
        for i in range(self.PCG.shape[0]):
            for j in range(self.PCG.shape[1]):
                if i in self.temporal_groups[j]:  # if c_i is in g_j
                    self.PCG[i, j] = self.coincidence_prior[i]  # assign P(c_i)
        # 6.2 each column in PCG should sum up to 1; normalize row-wise on the
        # transpose (note that PCG is left transposed: groups x coincidences)
        self.PCG = self.PCG.T
        for i in range(self.PCG.shape[0]):
            tsum = float(self.PCG[i].sum())
            if tsum > 0:
                self.PCG[i] /= tsum
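
A self-contained toy sketch of the AHC grouping step above, with a synthetic 4x4 similarity matrix (values are illustrative):

    import numpy as np
    import scipy.cluster.hierarchy as hier
    from scipy.spatial.distance import squareform

    # synthetic symmetric similarity matrix with a zeroed diagonal
    TAM = np.array([[0.0, 0.9, 0.1, 0.0],
                    [0.9, 0.0, 0.2, 0.1],
                    [0.1, 0.2, 0.0, 0.8],
                    [0.0, 0.1, 0.8, 0.0]])
    Z = hier.complete(squareform(1 - TAM, checks=False))
    labels = hier.fcluster(Z, t=2, criterion='maxclust')
    # labels assigns coincidences 0, 1 to one group and 2, 3 to the other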
Ejemplo n.º 51
0
def test_cut_tree_scipy():
    y = pdist(data, metric='euclidean')
    z = complete(y)
    assert_array_equal(np.sort(cut_tree_scipy(z, 2)),
                       np.hstack((np.zeros(10), np.ones(10))))
    assert_equal(len(np.unique(cut_tree_scipy(z, 10))), 10)
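
cut_tree_scipy itself is not shown in this listing; a plausible implementation consistent with the test above (hypothetical, assuming it wraps hierarchy.fcluster and shifts to 0-based labels):

    from scipy.cluster.hierarchy import fcluster

    def cut_tree_scipy(Y, k):
        # hypothetical helper: cut linkage matrix Y into k flat clusters,
        # returning 0-based labels as the assertions above expect
        return fcluster(Y, t=k, criterion='maxclust') - 1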
Ejemplo n.º 52
0
def new_heatmap(figure_number, expr_values, study, platform, sample_ids, symbols, combined):
    """
    Create a heatmap with row and column dendrograms for the array of expression values. The expression
    values are normalized before clustering is performed.
    The Euclidean distance method is used.
    The Complete clustering method is used.

    This code is based on the iPython notebook located here:
        nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/
        hierarchical_clustering_heatmaps_gridspec.ipynb

    :param figure_number: figure identifier (only referenced by the commented-out file-naming code)
    :param expr_values: a numpy array of expression values
    :param study: study name, used in the figure title
    :param platform: platform name(s), used in the figure title
    :param sample_ids: sample identifiers, used as column labels
    :param symbols: gene symbols, used as row labels
    :param combined: Boolean, is this a combined heatmap?
    """
    symbols = np.array(symbols)
    sample_ids = np.array(sample_ids)

    # Normalize the expression values
    expr_values /= np.max(np.abs(expr_values), axis=0)

    # Get the transpose of the expression array to be used in row
    # distance measurements and hierarchical clustering
    expr_values_transposed = np.transpose(expr_values)

    # Calculate the pairwise distances for the rows and columns
    # The default method is 'euclidean'
    column_distances = pdist(expr_values)
    row_distances = pdist(expr_values_transposed)

    # Create a Figure to hold all of the graphical elements
    # Set its background to white
    # Lay out a GridSpec dividing the figure into 3 rows and 2 columns
    #   Area [1,1] = column dendrogram
    #   Area [2,0] = row dendrogram
    #   Area [2,1] = the heatmap
    #   Area [1,0] = the colorbar legend

    fig = plt.figure()
    # fig = Figure()
    fig.set_tight_layout(True)
    fig.patch.set_facecolor('white')
    heatmap_GS = gridspec.GridSpec(3, 2, wspace=0.0, hspace=0.0, width_ratios=[0.25, 1],
                                   height_ratios=[0.05, 0.25, 1])

    # Perform a cluster analysis on column distances using the complete method.
    # Create and draw a dendrogram
    # Save the reordering values ('leaves') for the columns
    column_cluster = sch.complete(column_distances)
    column_dendrogram_axis = fig.add_subplot(heatmap_GS[1, 1])
    column_dendrogram = sch.dendrogram(column_cluster, orientation='top')
    column_indexes = column_dendrogram['leaves']
    clean_axis(column_dendrogram_axis)

    # Perform a cluster analysis on row distances using the complete method.
    # Create and draw a dendrogram.
    # Save the reordering values ('leaves') for the rows
    row_cluster = sch.complete(row_distances)
    row_dendrogram_axis = fig.add_subplot(heatmap_GS[2, 0])
    row_dendrogram = sch.dendrogram(row_cluster, orientation='right')
    row_indexes = row_dendrogram['leaves']
    clean_axis(row_dendrogram_axis)

    # Reorder the normalized expression value array based on the clustering indexes
    # Create and draw the heatmap. The image itself is used to create the colorbar (below)
    expr_values_transposed = expr_values_transposed[:, column_indexes]
    expr_values_transposed = expr_values_transposed[row_indexes, :]
    heat_map_axis = fig.add_subplot(heatmap_GS[2, 1])
    image = heat_map_axis.matshow(expr_values_transposed, aspect='auto', origin='lower',
                                  #  cmap=RedBlackGreen())
                                  cmap=cm.BrBG)
    clean_axis(heat_map_axis)

    # Prepare the heatmap row labels based on the gene symbols
    gene_symbols = []
    for symbol in symbols:
        match_result = re.match(r"(.+)(_\d+)", symbol)
        if match_result:
            gene_symbols.append(match_result.group(1))
        else:
            gene_symbols.append(symbol)
    genes = np.array(gene_symbols)
    heat_map_axis.set_yticks(np.arange(len(genes)))
    heat_map_axis.yaxis.set_ticks_position('right')
    heat_map_axis.set_yticklabels(genes[row_indexes])

    # Prepare the heatmap column labels based on the sample ids
    # Rotate them 90 degrees
    heat_map_axis.set_xticks(np.arange(sample_ids.shape[0]))
    heat_map_axis.xaxis.set_ticks_position('bottom')
    xlabels = heat_map_axis.set_xticklabels(sample_ids[column_indexes])
    for label in xlabels:
        label.set_rotation(90)

    # Create and draw a scale colorbar. It is based on the values used in the
    # heatmap image.
    scale_cbGSSS = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=heatmap_GS[1, 0],
                                                    wspace=0.0, hspace=0.0)
    scale_cb_axis = fig.add_subplot(scale_cbGSSS[0, 0])
    colorbar = fig.colorbar(image, scale_cb_axis)
    colorbar.ax.yaxis.set_ticks_position('left')
    colorbar.ax.yaxis.set_label_position('left')
    colorbar.outline.set_linewidth(0)
    tick_labels = colorbar.ax.yaxis.get_ticklabels()
    for tick_label in tick_labels:
        tick_label.set_fontsize(tick_label.get_fontsize() - 4)

    # "Tighten" up the whole figure, separating the sub plots by horizontal and vertical spaces
    # Add a title - placed at the very top
    heatmap_GS.tight_layout(fig, h_pad=0.1, w_pad=0.5)

    title = "Study: %s Platform(s): %s" % (study, platform,)
    fig.suptitle(title)

    fig.set_size_inches(12.0, 8.0)

    canvas = FigureCanvas(fig)
    # plot_file_name = 'heatmap%s.png' % figure_number
    # plot_file = os.path.join(settings.MEDIA_ROOT, plot_file_name)
    # canvas.print_figure(plot_file)
    #
    # return '/media/' + plot_file_name
    plot_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    file_name = plot_file.name.split('/')[-1]
    canvas.print_figure(plot_file)

    return settings.MEDIA_URL+file_name
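
The core reordering idiom used in new_heatmap, reduced to a minimal sketch (synthetic data; names are illustrative):

    import numpy as np
    import scipy.cluster.hierarchy as sch
    from scipy.spatial.distance import pdist

    values = np.random.rand(8, 5)          # 8 samples x 5 features (synthetic)
    linkage = sch.complete(pdist(values))  # pdist defaults to Euclidean
    leaves = sch.dendrogram(linkage, no_plot=True)['leaves']
    reordered = values[leaves, :]          # rows now follow the dendrogram order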
def main():
    usage = """
    ./helix_orienation_divergences.py

    Analyze how much the helix-helix orientations diverge between two data sets.
    """
    num_args = 0
    parser = OptionParser()

    parser.add_option('-r', '--resolution', dest='resolution', default=10, help="The resolution of the resulting plot", type='int')
    parser.add_option('-a', '--angle', dest='angle', default=0, help="The angle of the camera", type='float')
    parser.add_option('-f', '--fig-name', dest='fig_name', default='', help="The name of the file to save the figure to. If it is not specified, the figure will not be saved", type='str')
    parser.add_option('-i', '--interior_loops', dest='interior_loops', default=False, help='Cluster only the interior loops', action='store_true')
    parser.add_option('-m', '--multi_loops', dest='multi_loops', default=False, help='Cluster only the multiloops', action='store_true')

    #parser.add_option('-u', '--useless', dest='uselesss', default=False, action='store_true', help='Another useless option')

    (options, args) = parser.parse_args()

    if len(args) < num_args:
        parser.print_help()
        sys.exit(1)

    column_names = ['type', 'pdb', 's1', 's2', 'u', 'v', 't', 'r', 'u1', 'v1', 'atype', 'something1', 'something2', 'sth3', 'sth4']


    real_stats = ftms.ConformationStats('fess/stats/real.stats').angle_stats
    sampled_stats = ftms.ConformationStats('fess/stats/temp.stats').angle_stats

    # count how many statistics we have for each statistic type
    stat_counts = c.defaultdict(int)
    for sc in real_stats.keys():
        stat_counts[sc] += len(real_stats[sc])

    histograms = dict()
    for b in stat_counts.keys():
        if b[2] != 2.:
            # only look at type 2 angles
            continue

        if options.interior_loops:
            if b[0] == 1000 or b[1] == 1000:
                continue
        if options.multi_loops:
            if b[0] != 1000 and b[1] != 1000:
                continue

        (selected_sizes, count) = get_nearest_dimension_sizes(b, stat_counts, 1)

        if count < 3:
            continue

        fud.pv('b, selected_sizes')

        combined_real = []

        # get the statistics that correspond to the selected sampled sizes
        for ss in selected_sizes:
            #ss_r = get_certain_angle_stats(real_stats, ss)
            ss_r = real_stats[ss]

            combined_real += list(ss_r[['u','v']].as_matrix())

        num_points = len(combined_real)
        combined_real = np.array(combined_real)
        #histograms[b] = (np.histogram2d(combined_real[:,0], combined_real[:,1], range=[[0, m.pi], [-m.pi, m.pi]])[0] + 0.5) / float(num_points)
        histograms[b] = combined_real

    dists = []
    named_dists = dict()
    pp_dists = dict()
    for k1, k2 in it.combinations(histograms.keys(), 2):
        per_point_distances = []
        for p1 in histograms[k1]:
            point_distances = []
            for p2 in histograms[k2]:
                point_distances += [ftuv.magnitude(p1 - p2)]
            per_point_distances += [min(point_distances)]

        for p2 in histograms[k2]:
            point_distances = []
            for p1 in histograms[k1]:
                point_distances += [ftuv.magnitude(p1-p2)]
            per_point_distances += [min(point_distances)]

        dists += [max(per_point_distances)]
        named_dists[(k1,k2)] = max(per_point_distances)
        pp_dists[(k1,k2)] = per_point_distances
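        # note: this max-of-min construction is the symmetric Hausdorff
        # distance between the two point sets; a vectorized sketch (assuming
        # scipy.spatial.distance.cdist) would be:
        #   D = cdist(histograms[k1], histograms[k2])
        #   dists += [max(D.min(axis=1).max(), D.min(axis=0).max())]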

        '''
        kl = histograms[k1] * (histograms[k1] / histograms[k2])
        kl = sum(map(sum, kl))
        dists += [kl]
        '''

    fud.pv('dists')
    Z = sch.complete(dists)
    fud.pv('Z')
    sch.dendrogram(Z, labels = histograms.keys(), leaf_rotation=90)
    plt.subplots_adjust(bottom=0.25)
    
    plt.show()

    k1 = (6,7,2)
    k2 = (5,6,2)

    rs = get_certain_angle_stats(real_stats, k1)
    ss = get_certain_angle_stats(real_stats, k2)

    fud.pv('named_dists[(k1,k2)]')
    fud.pv('pp_dists[(k1,k2)]')

    real_us = rs[['u', 'v']].as_matrix()
    sampled_us = ss[['u','v']].as_matrix()

    U_r = real_us[:,0]
    V_r = real_us[:,1]

    U_s = sampled_us[:,0]
    V_s = sampled_us[:,1]

    total_r = len(U_r)
    total_s = len(U_s)

    hr = np.histogram2d(U_r, V_r)
    hs = np.histogram2d(U_s, V_s)

    pseudo_r = (hr[0] + 1) / total_r
    pseudo_s = (hs[0] + 1) / total_s
    # KL divergence needs the log of the ratio
    kl = pseudo_r * np.log(pseudo_r / pseudo_s)
    fud.pv('kl')
    fud.pv('sum(map(sum, kl))')

    X_r = np.sin(U_r) * np.cos(V_r)
    Y_r = np.sin(U_r) * np.sin(V_r)
    Z_r = np.cos(U_r)

    r = 1.
    X_s = r * np.sin(U_s) * np.cos(V_s)
    Y_s = r * np.sin(U_s) * np.sin(V_s)
    Z_s = r * np.cos(U_s)

    fud.pv('real_us')

    real_us_orig = np.copy(real_us)
    sampled_us_orig = np.copy(sampled_us)

    print len(real_us), len(sampled_us)

    fig = plt.figure(figsize=(10,10))
    ax = Axes3D(fig)

    a = Arrow3D([-1.3,1.3],[0,0],[0,0], mutation_scale=20, lw=5, arrowstyle="-|>", color="g")
    ax.add_artist(a)

    ax.plot(X_r, Y_r, Z_r, 'bo', alpha=0.3)
    ax.plot(X_s, Y_s, Z_s, 'ro', alpha=0.3)

    u, v = np.mgrid[0:2*np.pi:20j, 0:np.pi:10j]
    x=np.cos(u)*np.sin(v)
    y=np.sin(u)*np.sin(v)
    z=np.cos(v)
    ax.plot_wireframe(x, y, z, color="y")

    #surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors,
    #       linewidth=0, antialiased=False)

    ax._axis3don=False
    ax.set_zlim3d(-1, 1)
    ax.w_zaxis.set_major_locator(LinearLocator(6))
    ax.view_init(0, options.angle)

    '''
    plt.subplots_adjust(left=0.4, right=0.9, top=0.9, bottom=0.1)

    for i in xrange(0, 360, 40):
        savefig("fig%d.png", (i))
    '''

    '''
    sm = cm.ScalarMappable(cmap=cm.jet)
    sm.set_array(W)
    fig.colorbar(sm)
    '''

    if options.fig_name != "":
        plt.savefig(options.fig_name, bbox_inches='tight')
    else:
        plt.show()