Exemple #1
0
 def get_clusterfos_from_partition(self, partition, all_seqs):
     clusterfos = []
     for cluster in partition:
         cfo = {'seqfos' : [{'name' : uid, 'seq' : all_seqs[uid]} for uid in cluster]}  # note that vsearch clustering also adds 'centroid', but I think it isn't subsequently used
         cfo['cons_seq'] = utils.cons_seq(0.1, unaligned_seqfos=cfo['seqfos'])
         clusterfos.append(cfo)
     return clusterfos
Exemple #2
0
    def make_single_tree(self, partitions, annotations, uid_set, get_fasttrees=False, n_max_cons_seqs=10, debug=False):
        # NOTE don't call this externally -- if you want a single tree, call make_trees() with <i_only_cluster> set
        def getline(uidstr, uid_set=None):
            if uidstr in annotations:  # if we have this exact annotation
                return annotations[uidstr]
            else:
                if uid_set is None:
                    uid_set = set(uidstr.split(':'))  # should only get called if it's a singleton
                # note that for internal nodes in a fasttree-derived subtree, the uids will be out of order compared the the annotation keys
                for line in annotations.values():  # we may actually have the annotation for every subcluster (e.g. if --calculate-alternative-annotations was set), but in case we don't, this is fine
                    if len(uid_set & set(line['unique_ids'])) > 0:  # just take the first one with any overlap. Yeah, it's not necessarily the best, but its naive sequence probably isn't that different, and for just getting the fasttree it reeeeeeaaaallly doesn't matter
                        return line
            raise Exception('couldn\'t find uid %s in annotations' % uid)
        def getseq(uid):
            line = getline(uid)
            return line['seqs'][line['unique_ids'].index(uid)]
        def lget(uid_list):
            return ':'.join(uid_list)

        # check for repeated uids (was only from seed uid, which shouldn't happen any more, but the code below throws an infinite loop if we do, so may as well be careful)
        for partition in partitions:
            if sum(len(c) for c in partition) > len(set(u for c in partition for u in c)):
                repeated_uids = [u for u, count in collections.Counter([u for c in partition for u in c]).items() if count > 1]
                raise Exception('found %d uid%s in more than one cluster (%s)' % (len(repeated_uids), utils.plural(len(repeated_uids)), ', '.join(repeated_uids)))

        default_edge_length = 999999  # it's nice to have the edges all set to something that's numeric (so the trees print), but also obvious wrong, if we forget to set somebody
        assert len(partitions[-1]) == 1
        root_label = lget(partitions[-1][0])  # we want the order of the uids in the label to correspond to the order in self.partitions
        tns = dendropy.TaxonNamespace([root_label])
        root_node = dendropy.Node(taxon=tns.get_taxon(root_label))
        root_node.uids = uid_set  # each node keeps track of the uids of its children
        dtree = dendropy.Tree(taxon_namespace=tns, seed_node=root_node)
        if debug:
            print '    starting tree with %d leaves' % len(uid_set)
        for ipart in reversed(range(len(partitions) - 1)):  # dendropy seems to only have fcns to build a tree from the root downward, so we loop starting with the last partition (- 1 is because the last partition is guaranteed to be just one cluster)
            for lnode in dtree.leaf_node_iter():  # look for leaf nodes that contain uids from two clusters in this partition, and add those as children
                tclusts = [c for c in partitions[ipart] if len(set(c) & lnode.uids) > 0]
                if len(tclusts) < 2:
                    continue
                for tclust in tclusts:
                    ttaxon = dendropy.Taxon(lget(tclust))
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set(tclust)
                if debug:
                    print '      ipart %d' % ipart
                    print '        split node: %d --> %s      %s --> %s' % (len(lnode.uids), ' '.join([str(len(tc)) for tc in tclusts]), lnode.taxon.label, ' '.join([c.taxon.label for c in lnode.child_node_iter()]))

        # split existing leaves, which are probably not singletons (they're probably from the initial naive sequence collapse step) into subtrees such that each leaf is a singleton
        for lnode in dtree.leaf_node_iter():
            if len(lnode.uids) == 1:
                continue
            if get_fasttrees and len(lnode.uids) > 2:
                seqfos = [{'name' : uid, 'seq' : getseq(uid)} for uid in lnode.taxon.label.split(':')]  # may as well add them in the right order, although I don't think it matters
                subtree = treeutils.get_fasttree_tree(seqfos, getline(lnode.taxon.label, uid_set=lnode.uids)['naive_seq'], suppress_internal_node_taxa=True)  # note that the fasttree distances get ignored below (no idea if they'd be better than what we set down there, but they probably wouldn't be consistent, so I'd rather ignore them)
                for tmpnode in subtree.postorder_node_iter():
                    if tmpnode.is_leaf():
                        tmpnode.uids = set([tmpnode.taxon.label])
                    else:
                        tmpnode.uids = set([uid for c in tmpnode.child_node_iter() for uid in c.uids])
                        ttaxon = dendropy.Taxon(lget(tmpnode.uids))
                        subtree.taxon_namespace.add_taxon(ttaxon)
                        tmpnode.taxon = ttaxon  # ...and use the string of leaf nodes, even though they'll be in the wrong order (I think these get ignored when I call label_nodes() below, but it's still tidier to have them right in the meantime, and anyway since I'm suppressing internal taxa I think I need to set them to something)

                if debug:
                    print '   adding subtree with %d leaves from fastree at leaf node %s' % (len(seqfos), lnode.taxon.label)
                    print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=subtree))
                dtree.taxon_namespace.add_taxa(subtree.taxon_namespace)
                lnode.add_child(subtree.seed_node)
                assert len(lnode.child_edges()) == 1  # we're iterating over leaves, so this should always be true
                lnode.child_edges()[0].collapse()
            else:  # just add a star subtree
                for uid in lnode.taxon.label.split(':'):  # may as well add them in the right order, although I don't think it matters
                    ttaxon = dendropy.Taxon(uid)
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set([uid])
                if debug:
                    print '      added %d singleton children for %s' % (len(lnode.uids), lnode.taxon.label)

        # in order to set edge lengths, we need node sequences, so first set leaf node seqs
        for lnode in dtree.leaf_node_iter():
            assert len(lnode.uids) == 1
            lnode.seq = getseq(lnode.taxon.label)
            lnode.n_descendent_leaves = 1  # keep track of how many leaf nodes contributed to each node's consensus sequence (these are leaves, so it's trivally 1). This is less accurate than keeping track of all the sequences, but also faster

        # then set internal node seqs as the consensus of their children, and set the distance as hamming distance to child seqs
        if debug:
            print '    adding edge lengths either from fasttree %s or cons seq %s' % (utils.color('blue', 'x'), utils.color('red', 'x'))
        min_edge_length = None  # setting this is nice for better debug viewing
        for node in dtree.postorder_internal_node_iter():  # includes root node
            child_cons_seq_counts = [c.n_descendent_leaves for c in node.child_node_iter()]
            total_descendent_leaves = sum(child_cons_seq_counts)
            if total_descendent_leaves > n_max_cons_seqs:  # if there's tons of descendent leaves, we don't want to pass them all to the consensus fcn since it's slow, so we choose them in proportion to their actual proportions, but scaled down to <n_max_cons_seqs>
                child_cons_seq_counts = [int(n_max_cons_seqs * csc / float(total_descendent_leaves)) for csc in child_cons_seq_counts]
                child_cons_seq_counts = [max(1, csc) for csc in child_cons_seq_counts]  # don't eliminate any sequences entirely (this makes the proportions less accurate (in some cases), but is the easy way to handle the case where there's a ton of singleton children
            if debug:
                print '  %s' % utils.color('green', node.taxon.label)
                csc_str = '  (reduced: %s)' % ' '.join([str(csc) for csc in child_cons_seq_counts]) if total_descendent_leaves > n_max_cons_seqs else ''
                print '      desc leaves per child: %s%s' % (' '.join(str(c.n_descendent_leaves) for c in node.child_node_iter()), csc_str)
            child_seqfos = [{'name' : cn.taxon.label + '-leaf-' + str(il), 'seq' : cn.seq} for cn, count in zip(node.child_node_iter(), child_cons_seq_counts) for il in range(count)]
            node.seq = utils.cons_seq(0.01, aligned_seqfos=child_seqfos, tie_resolver_seq=getline(root_label)['naive_seq'])  #, debug=debug)  # the consensus has an N at every position where the constituent sequences gave a tie. But Ns screw up the distances (especially because once we *get* an N, we can't get rid of it and it's propagated all the way up the tree), and in almost all cases the correct choice should be the naive base, so we use that
            node.n_descendent_leaves = total_descendent_leaves
            for edge in node.child_edge_iter():
                from_fasttree = False
                if edge.length == default_edge_length:  # otherwise it was set by fasttree, and it's probably better than what we'd get from this (it'd be nice to skip the cons seq stuff for the whole fasttree subtree, but then we don't have the cons seqs we need for later)
                    edge.length = utils.hamming_distance(edge.head_node.seq, node.seq) / float(len(node.seq))
                else:
                    from_fasttree = True
                if min_edge_length is not None:
                    edge.length = max(min_edge_length, edge.length)
                if debug:
                    print '       %6.3f   %s  %s' % (edge.length, utils.color('blue' if from_fasttree else 'red', 'x'), edge.head_node.taxon.label)

        if debug:
            print '        naive seq %s' % getline(root_label)['naive_seq'] # NOTE might be worthwhile to add an edge connecting seed node and the actual naive sequence (i.e. for cases where our approximate naive is off)
            print '    root cons seq %s' % utils.color_mutants(getline(root_label)['naive_seq'], dtree.seed_node.seq)

        for node in dtree.preorder_node_iter():
            del node.uids
            del node.seq
            del node.n_descendent_leaves

        treeutils.label_nodes(dtree, ignore_existing_internal_node_labels=True, ignore_existing_internal_taxon_labels=True, debug=debug)
        dtree.update_bipartitions()  # probably don't really need this
        if debug:
            print treeutils.utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=dtree, width=250))

        return dtree
    def make_single_hexbin_shm_vs_identity_plot(self,
                                                cluster,
                                                annotation,
                                                plotdir,
                                                plotname,
                                                debug=False):
        """ shm (identity to naive sequence) vs identity to some reference sequence """
        import matplotlib.pyplot as plt

        fig, ax = self.plotting.mpl_init()

        if self.args.seed_unique_id is not None and self.args.seed_unique_id in cluster:
            seed_index = cluster.index(self.args.seed_unique_id)
            ref_seq = annotation['seqs'][seed_index]
            ref_label = 'seed seq'
            xref = annotation['n_mutations'][seed_index]
        else:
            ref_seq = utils.cons_seq(0.1,
                                     aligned_seqfos=[{
                                         'name':
                                         cluster[iseq],
                                         'seq':
                                         annotation['seqs'][iseq]
                                     } for iseq in range(len(cluster))])
            ref_label = 'consensus seq'
            xref = utils.hamming_distance(annotation['naive_seq'], ref_seq)

        xvals, yvals = zip(*[[
            annotation['n_mutations'][iseq],
            utils.hamming_distance(ref_seq, annotation['seqs'][iseq])
        ] for iseq in range(len(cluster))])
        hb = ax.hexbin(xvals,
                       yvals,
                       gridsize=40,
                       cmap=plt.cm.Blues,
                       bins='log')

        # add a red mark for the reference sequence
        yref = 0
        ax.plot([xref], [yref], color='red', marker='.', markersize=10)
        ax.text(xref, yref, ref_label, color='red', fontsize=8)

        if self.args.queries_to_include is not None:  # note similarity to code in make_single_hexbin_size_vs_shm_plot()
            queries_to_include_in_this_cluster = set(cluster) & set(
                self.args.queries_to_include
            )  # TODO merge with similar code in make_single_hexbin_shm_vs_identity_plot
            for uid in queries_to_include_in_this_cluster:
                iseq = cluster.index(uid)
                xval = annotation['n_mutations'][iseq]
                yval = utils.hamming_distance(ref_seq,
                                              annotation['seqs'][iseq])
                ax.plot([xval], [yval], color='red', marker='.', markersize=10)
                ax.text(xval, yval, uid, color='red', fontsize=8)

        ylabel = 'identity to %s' % ref_label
        self.plotting.mpl_finish(ax,
                                 plotdir,
                                 plotname,
                                 xlabel='N mutations',
                                 ylabel=ylabel,
                                 title='%d sequences' % len(cluster))