Example No. 1
    def _guide_tree(self, dist_matrix):
        """
        @summary: Build a guide tree from the distance matrix

        @param dist_matrix: The distance matrix
        @type dist_matrix: numpy.ndarray
        @return: Pycluster similarity tree
        @rtype: Pycluster.cluster.Tree

        @author: Woon Wai Keen
        @author: Vladimir Likic
        """

        n = len(dist_matrix)

        print " -> Clustering %d pairwise alignments." % (n * (n - 1)),
        # Pass the matrix via the 'distancematrix' keyword; given
        # positionally it would be treated as a raw data matrix.
        tree = Pycluster.treecluster(distancematrix=dist_matrix, method='a')
        print "Done"

        return tree
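A minimal, self-contained sketch of the pattern above, with made-up data (Pycluster is also distributed as Bio.Cluster, and treecluster may modify the distance matrix in place):

import numpy as np
import Pycluster

points = np.random.rand(6, 2)
# full symmetric matrix of pairwise Euclidean distances
dist_matrix = np.sqrt(((points[:, None, :] - points[None, :, :]) ** 2).sum(axis=2))

tree = Pycluster.treecluster(distancematrix=dist_matrix, method='a')
print(tree.cut(2))  # cluster id for each of the 6 items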
Example No. 2
    def DoClustering(self, nclusters=30):
        '''Main clustering function'''

        gx = self._gx
        func = self._scale_function

        nid, jm, am, fg = zip(*[(x, gx.node[x]['JuvenileMass'],
                                 gx.node[x]['AdultMass'],
                                 gx.node[x]['FunctionalGroup'])
                                for x in gx.node.keys()])
        data = np.c_[func(jm), func(am)]

        if self._normalize_data:
            data = whiten(data)
        data = np.c_[data, 1000 * np.array(fg)]

        if self._algorithm == Aggregation._HIERARCHICAL_CLUSTERING:

            if not self._tree_done:
                if self._distance_matrix is not None:
                    self._tree = pc.treecluster(
                        distancematrix=self._distance_matrix)
                else:
                    self._tree = pc.treecluster(data)

                self._tree_done = True

        self._data = data
        self._nodes_ids = nid
        clusters_ids = self._tree.cut(nclusters)
        self._clusters_ids = clusters_ids
        self._nclusters = len(np.unique(self._clusters_ids))

        cluster_attrib = dict(zip(nid, clusters_ids))
        nx.set_node_attributes(gx, 'cluster', cluster_attrib)
        self._gx = gx

        for cid in np.unique(clusters_ids):

            fg = [
                gx.node[x]['FunctionalGroup'] for x in gx.node.keys()
                if gx.node[x]['cluster'] == cid
            ]
            if len(np.unique(fg)) != 1:
                raise Exception(
                    'Multiple functional groups found inside the same cluster')
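The 1000 * fg column is what makes the final sanity check pass: the offset between functional groups dominates the Euclidean distances, so the tree cannot merge items from different groups. A small illustration with made-up numbers:

import numpy as np
import Pycluster as pc

masses = np.array([[1.0, 2.0], [1.1, 2.1], [1.2, 1.9]])
fg = np.array([0, 0, 1])  # the third item is in another functional group
data = np.c_[masses, 1000 * fg]

tree = pc.treecluster(data)
print(tree.cut(2))  # items 0 and 1 share a cluster; item 2 stands alone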
Example No. 3
    def DoClustering(self, nclusters=30, distance_matrix=None):
        # Avoid doing the work twice
        if not self._tree_done:
            df_nc = self._df_nodes[self._df_nodes['ID'] >= 0].copy()

            data = df_nc[['JuvenileMass', 'AdultMass']].values
            data = self._scale_function(data)

            if self._normalize_data:
                data = whiten(data)

            data = np.c_[data, 100. * df_nc.FunctionalGroup.values]

            if distance_matrix is not None:
                self._tree = pc.treecluster(distancematrix=distance_matrix)
            else:
                self._tree = pc.treecluster(data)

            self._data = data
            self._tree_done = True

        self.FillClusterIndividualData(self._tree.cut(nclusters))
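Caching the tree behind the _tree_done flag pays off because a built hierarchy can be cut at any granularity without re-clustering; a sketch with made-up data:

import numpy as np
import Pycluster as pc

data = np.random.rand(50, 3)
tree = pc.treecluster(data)  # hierarchy built once
for k in (5, 10, 20):
    labels = tree.cut(k)     # only the cut is recomputed
    print(k, len(np.unique(labels)))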
Example No. 4
def generate_network_clusters(G):
    # Create the cluster partitions using hierarchical clustering
    # on geodesic distances
    # First check to make sure the given network is a single fully
    # connected component.
    if len(NX.component.connected_component_subgraphs(G)) > 1:
        raise NX.NetworkXError('G must be single component! Extract main component...')
    # Now generate clusters
    dist_matrix = get_dist_matrix(G)
    # Default hierarchical clustering algorithm used
    hclus = PC.treecluster(data=None, distancematrix=dist_matrix, method='m')
    partitions = {}  # dictionary of the partitioning at each cut in the hierarchy
    for c in range(1, len(hclus) + 1):  # treecluster cuts start at 1
        partitions[c] = hclus.cut(c).tolist()
    return partitions
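get_dist_matrix is not shown in this snippet; a plausible sketch of such a helper, assuming the networkx 1.x API used above (where all_pairs_shortest_path_length returns a dict):

import numpy as np
import networkx as NX

def get_dist_matrix(G):
    nodes = G.nodes()
    lengths = NX.all_pairs_shortest_path_length(G)  # {source: {target: hops}}
    dist = np.zeros((len(nodes), len(nodes)))
    for i, u in enumerate(nodes):
        for j, v in enumerate(nodes):
            dist[i][j] = lengths[u][v]
    return dist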
Example No. 5
def tree_cluster_test(data, real_labels, outputfile=None):
    start = time.time()
    tree = Pycluster.treecluster(data, method='m')

    ks = range(25, 50, 1)
    f = None
    if outputfile is not None:
        f = open(outputfile, 'w')
        f.write(out_result_header())
    for k in ks:
        print 'hierarchical clustering when k=%d' % k
        predicted = tree.cut(k).tolist()
        if f is not None:
            f.write(out_result(predicted, k, real_labels))
    if f is not None:
        f.close()

    elapsed = time.time() - start
    print 'hierarchical clustering time: %.3f' % (elapsed / float(len(ks)))
Example No. 6
def hierarchical(flat_data, data, nclusters, method, distance):
    """ Hierarchical clustering """
    
    tree = pc.treecluster(data=flat_data.values(),
                          mask=None,
                          weight=None,
                          transpose=0,
                          method=method,
                          dist=distance,
                          distancematrix=None)
    
    clusterid = tree.cut(nclusters)
    
    clusters = defaultdict(list)
    for i, j in zip(clusterid, data):
        clusters[i].append(j)
        
    return clusters
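A hypothetical call: flat_data maps items to feature vectors and data lists the same items in the same order, so the zip with the cluster ids lines up (the snippet assumes Python 2, where flat_data.values() is a plain list that iterates in the same order as the dict):

flat_data = {'a': [0.0, 0.1], 'b': [0.1, 0.0], 'c': [5.0, 5.1]}
clusters = hierarchical(flat_data, list(flat_data), nclusters=2,
                        method='m', distance='e')
# clusters maps each cluster id to the items assigned to it,
# e.g. {0: ['a', 'b'], 1: ['c']} (the ids themselves are arbitrary)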
Example No. 7
    def _guide_tree(self, dist_matrix):
        """
        @summary: Build a guide tree from the distance matrix

        @param dist_matrix: The distance matrix
        @type dist_matrix: numpy.ndarray
        @return: Pycluster similarity tree
        @rtype: Pycluster.cluster.Tree

        @author: Woon Wai Keen
        @author: Vladimir Likic
        """

        n = len(dist_matrix)

        print " -> Clustering %d pairwise alignments." % (n * (n - 1)),
        tree = Pycluster.treecluster(distancematrix=dist_matrix, method='a')
        print "Done"

        return tree
Example No. 8
def diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, mindiff=0, minenr=3, minfreq=0.01):
    w_ratio = np.array([14, len(names), len(names) + 1])
    plot_order = [0, 1, 2]

    nbar = 5

    freq = np.array(freq)
    counts = np.array(counts)
    bgfreq = np.array([[x] for x in bgfreq])

    enr = np.log2(np.divide(freq, bgfreq))

    filt = np.ones(len(enr), dtype="bool")
    filters = [
        np.sum(enr > minenr, 1) > 0,
        np.sum(freq > minfreq, 1) > 0,
        (np.max(enr, 1) - np.min(enr, 1)) > mindiff,
        np.sum(counts > 2, 1) > 0,
    ]
    for f in filters:
        filt = np.logical_and(filt, f)

        print "Filter: ", sum(filt)

    motifs = np.array(motifs)[filt]
    freq = freq[filt]
    bgfreq = bgfreq[filt]
    enr = enr[filt]

    for m, f, b, e in zip(motifs, freq, bgfreq, enr):
        sys.stderr.write("{0}\t{1}\t{2}\t{3}\n".format(m, f, b, e))

    if len(freq) == 0:
        sys.stderr.write("No enriched and/or differential motifs found.\n")
        return
    elif len(freq) >= 3:
        tree = Pycluster.treecluster(freq, method="m", dist="c")
        ind = sort_tree(tree, np.arange(len(motifs)))
    else:
        ind = np.arange(len(freq))

    fig = plt.figure(figsize=((5 + 0.75 * len(names)) * 3, (0.3 * len(motifs) + 1.5) * 3))

    gs = GridSpec(
        len(motifs) + 3 + nbar, 3, height_ratios=[1] * nbar + [3] * (len(motifs) + 3), width_ratios=w_ratio[plot_order]
    )

    # Colormaps
    c1 = mpl.cm.RdBu
    c2 = mpl.cm.Blues  # alternative: create_colormap("white", "blue")

    ### Frequency plot ###

    # Create axis
    ax = plt.subplot(gs[nbar:-3, plot_order[2]])

    # Plot frequencies
    vmin = 0
    vmax = 0.3

    pfreq = np.hstack((freq, bgfreq))
    ax.pcolormesh(pfreq[ind], cmap=c2, vmin=vmin, vmax=vmax)

    sm = plt.cm.ScalarMappable(cmap=c2, norm=mpl.colors.Normalize(vmin=vmin, vmax=vmax))

    # Show percentages
    for y, row in enumerate(pfreq[ind]):
        for x, val in enumerate(row):
            v = vmax
            if val >= (vmin + ((vmax - vmin) / 2)):
                v = vmin
            plt.text(x + 0.5, y + 0.5, "{:.1%}".format(val), ha="center", va="center", color=sm.to_rgba(v))

    # Hide most labels
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)
    plt.setp(ax.get_yticklabels(), visible=False)

    # Set the X labels
    ticks = np.arange(len(names) + 1) + 0.5
    plt.xticks(ticks, names + ["background"], rotation=30, ha="right")

    ax.set_ylim(0, len(motifs))

    # Title
    plt.title("Frequency")

    # Colorbar
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[2]])
    cb = fig.colorbar(sm, cax=cax, ticks=[0, 0.3], orientation="horizontal")
    cb.ax.set_xticklabels(["0%", "30%"])

    #### Enrichment plot
    ax = plt.subplot(gs[nbar:-3, plot_order[1]])
    vmin = -10
    vmax = 10
    ax.pcolormesh(enr[ind], cmap=c1, vmin=vmin, vmax=vmax)
    for y, row in enumerate(enr[ind]):
        for x, val in enumerate(row):
            col = "black"
            if val >= (vmin + ((vmax - vmin) / 8.0 * 7)):
                col = "white"
            elif val <= (vmin + ((vmax - vmin) / 8.0)):
                col = "white"
            plt.text(x + 0.5, y + 0.5, "{:.1f}".format(val), ha="center", va="center", color=col)

    ticks = np.arange(len(names)) + 0.5
    plt.xticks(ticks, names, rotation=30, ha="right")
    ticks = np.arange(len(motifs)) + 0.5
    plt.yticks(ticks, motifs[ind])
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)

    ax.set_ylim(0, len(motifs))

    # Title
    plt.title("Enrichment (log2)")

    # Colorbar
    sm = plt.cm.ScalarMappable(cmap=c1, norm=mpl.colors.Normalize(vmin=vmin, vmax=vmax))
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[1]])
    cb = fig.colorbar(sm, cax=cax, ticks=[vmin, 0, vmax], orientation="horizontal")
    cb.ax.set_xticklabels([vmin, 0, vmax])

    #### Motif logos

    for i, motif in enumerate(motifs[ind][::-1]):
        ax = plt.subplot(gs[i + nbar, plot_order[0]])
        axes_off(ax)
        tmp = NamedTemporaryFile(dir=mytmpdir(), suffix=".png")
        pwms[motif].to_img(tmp.name, format="PNG", height=6)
        ax.imshow(plt.imread(tmp.name), interpolation="none")

    # plt.show()
    plt.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.close(fig)
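One detail worth pausing on in diff_plot: bgfreq is built as a column vector, so np.divide(freq, bgfreq) broadcasts each motif's single background frequency across all sample columns. In isolation, with illustrative values:

import numpy as np

freq = np.array([[0.10, 0.20],   # motifs x samples
                 [0.05, 0.40]])
bgfreq = np.array([[0.05],       # motifs x 1
                   [0.10]])
enr = np.log2(np.divide(freq, bgfreq))
print(enr)  # [[ 1.  2.]
            #  [-1.  2.]]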
Example No. 9
                        strand = "-"
                    else:
                        strand = "+"
                    regions[k] = (chrom, start, end, gene, strand)
            n = len(set(labels))
            labels[labels == j] = i
            for k in range(j + 1, n):
                labels[labels == k] = k - 1
            (i, j) = mirror_clusters(data, labels)

    ind = labels.argsort()
    # Other cluster implementation
    #    centres, labels, dist = kmeanssample(clus, options.numclusters, len(clus) / 10,  metric=cl, maxiter=200, verbose=1, delta=0.00001)
elif cluster_type == "h":
    print "Hierarchical clustering"
    tree = Pycluster.treecluster(clus, method="m", dist=METRIC)
    labels = tree.cut(options.numclusters)
    ind = sort_tree(tree, arange(len(regions)))
else:
    ind = arange(len(regions))
    labels = zeros(len(regions))
f = open("{0}_clusters.bed".format(outfile), "w")
for (chrom, start, end, gene, strand), cluster in zip(array(regions, dtype="object")[ind], array(labels)[ind]):
    if not gene:
        f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(chrom, start, end, cluster + 1, strand))
    else:
        f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chrom, start, end, gene, cluster + 1, strand))
f.close()

if not cluster_type == "k":
    labels = None
Example No. 10
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print "ERROR: Data file '{0}' does not exist".format(x)
            sys.exit(1)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(
                x)
            pysam.index(x)

    # Options Parser
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) / args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction[0].lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print "ERROR: -m and -g option CANNOT be used together"
            sys.exit(1)
        if distancefunction == 'e':
            print 'Dynamics can only be identified using Pearson correlation as metric.'
            print 'Assigning metric to Pearson correlation'
            distancefunction = 'p'

    # Warn about too many files
    if len(tracks) > 4:
        print "Warning: Running fluff with too many files might make your system use an enormous amount of memory!"

    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
        if not all(i <= len(tracks) - 1 for i in pick):
            sys.stderr.write(
                "You picked a non-existent file for clustering.\n")
            sys.exit(1)
    else:
        pick = range(len(datafiles))

    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and not args.numclusters >= 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["e", "p"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        if distancefunction == "e":
            METRIC = cfg.DEFAULT_METRIC
            print "Euclidean distance method"
        else:
            METRIC = "c"
            print "Pearson distance method"
    ## Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile,
                  amount_bins,
                  extend_dyn_up,
                  extend_dyn_down,
                  rmdup,
                  rpkm,
                  rmrepeats,
                  fragmentsize,
                  dynam,
                  guard=None):
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print "Loading data"
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(
                    pool.apply_async(load_heatmap_data,
                                     args=(featurefile, datafile, amount_bins,
                                           extend_dyn_up, extend_dyn_down,
                                           rmdup, rpkm, rmrepeats,
                                           fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(
                    featurefile, datafile, amount_bins, extend_dyn_up,
                    extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                    dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : Option to try and get dynamics
    # Extend features 1kb up/down stream
    # Cluster them in one bin
    guard = []
    amount_bins = bins
    extend_dyn_up = extend_up
    extend_dyn_down = extend_down
    if dynam:
        # load the data once to get the features which extend below 0
        guard = check_data(featurefile, extend_dyn_up, extend_dyn_down)
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        amount_bins = 1

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up,
                                     extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE)

    clus = hstack([
        norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick)
    ])

    # Clustering
    if cluster_type == "k":
        print "K-means clustering"
        ## K-means clustering
        # PyCluster
        labels, _, nfound = Pycluster.kcluster(clus,
                                               args.numclusters,
                                               dist=METRIC)
        if not dynam and merge_mirrored:
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in data.keys():
                    data[track][labels == j] = [
                        row[::-1] for row in data[track][labels == j]
                    ]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

        ind = labels.argsort()

        # Hierarchical clustering
    elif cluster_type == "h":
        print "Hierarchical clustering"
        tree = Pycluster.treecluster(clus, method="m", dist=METRIC)
        labels = tree.cut(args.numclusters)
        ind = sort_tree(tree, arange(len(regions)))
    else:
        ind = arange(len(regions))
        labels = zeros(len(regions))

    # Load data for visualization if -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up,
                                         extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(
            array(regions, dtype="object")[ind],
            array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(
                chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                chrom, start, end, gene, cluster + 1, strand))
    f.close()
    # Save read counts
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for row in data[track]:
            readcounts[track]['bins'].append(';'.join(str(b) for b in row))

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    for idx in ind:
        input_fileBins.write('{0}:{1}-{2}\t'.format(
            regions[idx][0], regions[idx][1], regions[idx][2]))
        for track in tracks:
            input_fileBins.write('{0}\t'.format(
                readcounts[track]['bins'][idx]))
        input_fileBins.write('\n')
    input_fileBins.close()

    if not cluster_type == "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors,
                 scale, tscale, labels, fontsize)
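The relabeling step inside the mirror-merge loop is easy to misread; on a toy array (i=0, j=2, illustrative only) it folds cluster j into cluster i and then shifts every id above j down by one so the ids stay contiguous:

import numpy as np

labels = np.array([0, 2, 1, 2, 3, 1])
i, j = 0, 2
n = len(set(labels))              # 4 clusters before the merge
labels[labels == j] = i           # fold cluster 2 into cluster 0
for k in range(j + 1, n):
    labels[labels == k] = k - 1   # shift id 3 down to 2
print(labels)                     # [0 0 1 0 2 1]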
Example No. 11
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print "ERROR: Data file '{0}' does not exist".format(x)
            sys.exit(1)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(x)
            pysam.index(x)

    # Options Parser
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) / args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction[0].lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print "ERROR: -m and -g option CANNOT be used together"
            sys.exit(1)
        if distancefunction == 'e':
            print 'Dynamics can only be identified using Pearson correlation as metric.'
            print 'Assigning metric to Pearson correlation'
            distancefunction = 'p'

    # Warn about too many files
    if len(tracks) > 4:
        print "Warning: Running fluff with too many files might make your system use an enormous amount of memory!"
    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
    else:
        pick = range(len(datafiles))
    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and not args.numclusters >= 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["e", "p"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        if distancefunction == "e":
            METRIC = DEFAULT_METRIC
            print "Euclidean distance method"
        else:
            METRIC = "c"
            print "Pearson distance method"
    ## Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam,
                  guard=None):
        # A mutable default argument ([]) would be shared between calls
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print "Loading data"
        try:
            # Load data in parallel
            import multiprocessing
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(pool.apply_async(load_heatmap_data, args=(
                featurefile, datafile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats,
                fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(featurefile, datafile, amount_bins, extend_dyn_up,
                                                                   extend_dyn_down, rmdup, rpkm, rmrepeats,
                                                                   fragmentsize, dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : Option to try and get dynamics
    # Extend features 1kb up/down stream
    # Cluster them in one bin
    guard = []
    if dynam:
        amount_bins = 1
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        # load the data once to get the features which extend below 0
        guard = check_data(featurefile, guard, dynam, extend_dyn_up, extend_dyn_down)
    else:
        amount_bins = bins
        extend_dyn_up = extend_up
        extend_dyn_down = extend_down

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)
    # Normalize
    norm_data = normalize_data(data, DEFAULT_PERCENTILE)

    clus = hstack([norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick)])

    # Clustering
    if cluster_type == "k":
        print "K-means clustering"
        ## K-means clustering
        # PyCluster
        labels, error, nfound = Pycluster.kcluster(clus, args.numclusters, dist=METRIC)
        if not dynam and merge_mirrored:
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in data.keys():
                    data[track][labels == j] = [row[::-1] for row in data[track][labels == j]]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

        ind = labels.argsort()

        # Hierarchical clustering
    elif cluster_type == "h":
        print "Hierarchical clustering"
        tree = Pycluster.treecluster(clus, method="m", dist=METRIC)
        labels = tree.cut(args.numclusters)
        ind = sort_tree(tree, arange(len(regions)))
    else:
        ind = arange(len(regions))
        labels = zeros(len(regions))


    # Load data for visualization if -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up, extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(array(regions, dtype="object")[ind], array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chrom, start, end, gene, cluster + 1, strand))
    f.close()
    # Save read counts
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for row in data[track]:
            readcounts[track]['bins'].append(';'.join(str(b) for b in row))

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    for idx in ind:
        input_fileBins.write('{0}:{1}-{2}\t'.format(regions[idx][0], regions[idx][1], regions[idx][2]))
        for track in tracks:
            input_fileBins.write('{0}\t'.format(readcounts[track]['bins'][idx]))
        input_fileBins.write('\n')
    input_fileBins.close()

    if not cluster_type == "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors, scale, tscale, labels, fontsize)
Example No. 12
		data = eventutils.trim_data_to_events(data, events, trimming_range)

	input_vecs = []
	if treat_data_differentially:
		input_vecs = utils.make_prices_diffs_vecs(data)
	else:
		input_vecs = utils.make_prices_vecs(data)

	# Run clustering algorithm.

	if algorithm_type == ClusterAlg.KMEANS:
		labels, wcss, n = Pycluster.kcluster(input_vecs, number_of_clusters,
				dist=dist_measure, npass=number_of_iters,
				method=dist_method)
	elif algorithm_type == ClusterAlg.HIERARCHICAL:
		# 'method' selects the linkage; 'dist' takes the distance measure
		tree = Pycluster.treecluster(input_vecs, method=dist_method,
				dist=dist_measure)
		labels = tree.cut(number_of_clusters)
	elif algorithm_type == ClusterAlg.SELFORGMAPS:
		labels, celldata = Pycluster.somcluster(input_vecs, nxgrid=xgrid,
				nygrid=ygrid, niter=number_of_iters)

	# If the algorithm is self-organizing maps, each item is assigned to
	# a particular 2D point, so we need to create groups from the 2D points.
	# See the implementation of make_groups_from_labels for details.

	if algorithm_type == ClusterAlg.SELFORGMAPS:
		clusters = utils.make_groups_from_labels(labels, data, True)
	else:
		clusters = utils.make_groups_from_labels(labels, data)

	# Check which type of key we are dealing with.
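make_groups_from_labels is not shown here; the following is a plausible sketch of what it does, with the name of the third parameter invented for illustration. For self-organizing maps each label is a 2D grid coordinate, which must be made hashable before it can serve as a group key:

def make_groups_from_labels(labels, data, labels_are_coords=False):
    # group the items of 'data' by their cluster label
    groups = {}
    for key, label in zip(data, labels):
        k = tuple(label) if labels_are_coords else label
        groups.setdefault(k, []).append(key)
    return groups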
Example No. 13
red1 = cv2.inRange(imgHSV,a([0,150,80]),a([3,255,255]))
red2 = cv2.inRange(imgHSV,a([150,150,80]),a([180,255,255]))
red = red1 | red2
red = clean(red)
stickers += detect(red)

for i in range(len(stickers)):
	stickers[i].id = i
stickertime = clock() * 1000
print "stickers detected in " + str(int(stickertime - starttime)) + " ms"

# CLUSTERING

D = [[distance(st1, st2) for st1 in stickers] for st2 in stickers]
tree = Pycluster.treecluster(distancematrix=D)

cluster_count = 1
while True:
	clusters = tree.cut(cluster_count)
	for i in range(len(stickers)):  # Debug
		if stickers[i].weight > 1 and cluster_count < 7:
			cv2.drawContours(eval("klastry" + str(cluster_count)), np.array([stickers[i].V]), 0, 255 * (clusters[i] + 1) / (cluster_count + 1), -1)
	weights = [0 for i in range(cluster_count + 1)]
	for i in range(len(stickers)):
		weights[clusters[i]] += stickers[i].size + stickers[i].weight
	maxcluster_weight = 0
	maxcluster_id = 0
	maximum = 0
	for i in range(cluster_count):
		if weights[i] > maximum:
Example No. 14
        nclusters=noClust,
        transpose=0,
        method=mDict[method],
        dist=dDict[distance])
    silScore = metrics.silhouette_score(np.array(rawData)[:, actHistDNaseList],
                                        clusterListActiveHistDNase,
                                        metric='euclidean')
    silhouetteList.append(silScore)

# Hierarchical
if (algorithm == "h"):
    # Method
    mDict = dict([("sl", "s"), ("cl", "m"), ("el", "c"), ("al", "a")])
    # All
    tree = pc.treecluster(np.array(rawData),
                          transpose=0,
                          method=mDict[method],
                          dist=dDict[distance])
    clusterListAll = tree.cut(noClust)
    silScore = metrics.silhouette_score(rawData,
                                        clusterListAll,
                                        metric='euclidean')
    silhouetteList.append(silScore)
    # Single
    clusterListSingle = []
    for i in range(0, len(labelList)):
        tree = pc.treecluster(np.array(rawData)[:, (rawVph * i):(rawVph * i) +
                                                rawVph],
                              transpose=0,
                              method=mDict[method],
                              dist=dDict[distance])
        clusterListTemp = tree.cut(noClust)
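The pattern in this example, reduced to its core with made-up data: cut a Pycluster tree at a fixed number of clusters, then score the partition with scikit-learn's silhouette coefficient:

import numpy as np
import Pycluster as pc
from sklearn import metrics

X = np.random.rand(40, 5)
tree = pc.treecluster(X, method='m', dist='e')
labels = tree.cut(3)
print(metrics.silhouette_score(X, labels, metric='euclidean'))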