Exemple #1
0
class Test(unittest.TestCase):
    """
    Check DiagCharsParser against the module-level fixtures ``tree_str`` and
    ``diag_str``.
    """

    def setUp(self):
        """
        Build the input stream and a tree whose nodes 0, 1 and 2 carry the
        ids 10, 20 and 30 that the assertions in testParser look up.
        """
        self.io = StringIO(diag_str)
        self.io.seek(0)
        self.tree = Tree(tree_str)
        for node_index, node_id in enumerate((10, 20, 30)):
            self.tree.node(node_index).get_data().id = node_id

    def testParser(self):
        """
        Parse the fixture, group the results by ``tree_node_key`` and check
        group sizes plus a few individual entries.
        """
        parser = DiagCharsParser(self.tree)
        index = defaultdict(list)
        for d in parser.parse(self.io):
            index[d.tree_node_key].append(d)
        # Sort each group once, after it is complete.  list.sort is stable,
        # so the result equals re-sorting after every insertion, and key=
        # works on Python 3 where the cmp= argument no longer exists.
        for group in index.values():
            group.sort(key=lambda d: d.column)

        self.assertEqual(len(index[10]), 5)
        self.assertEqual(len(index[20]), 8)
        self.assertEqual(len(index[30]), 5)
        self.assertEqual(index[10][0].column, 0)
        self.assertEqual(index[10][4].aa, "IK")
        self.assertEqual(index[20][2].column, 8)
        self.assertEqual(index[20][2].aa, "D")
Exemple #2
0
    def _subtree(self,root):
        """
        Copy everything below a node into a new tree, then unlink that node
        from the original tree.

        The new tree takes its weight, rooted flag, name, data class and
        max_support from the original.  Node data objects are reused by
        reference, not copied.

        @param root: id (in the original tree) of the node that becomes the
            root of the new tree
        @return: the new tree, after it has been passed to the annotater
        """
        source = self.tree
        clade = Tree(
            weight=source.weight,
            rooted=source.rooted,
            name=source.name,
            data=source.dataclass,
            max_support=source.max_support,
        )
        clade.node(clade.root).data = source.node(root).data

        def _graft(source_id, clade_id):
            """
            Depth-first copy: attach each successor of ``source_id`` under
            ``clade_id`` and descend into it before moving to the next one.
            """
            for child_id in source.node(source_id).succ:
                copied = Node(data=source.node(child_id).data)
                _graft(child_id, clade.add(copied, clade_id))

        _graft(root, clade.root)
        self.annotater.annotate(clade)
        unlink(source, root)
        return clade
Exemple #3
0
    def _subtree(self, root):
        """
        Build a new tree from the subgraph found for ``root``, then collapse
        every node of that subgraph in the original tree.

        The new tree takes its weight, rooted flag, name, data class and
        max_support from the original.  Node data objects are reused by
        reference, not copied.

        @param root: id (in the original tree) of the node that becomes the
            root of the new tree
        @return: the new tree
        """
        # The subgraph decides which nodes are copied and in which order.
        graph = self._subgraph(root)

        source = self.tree
        clade = Tree(
            weight=source.weight,
            rooted=source.rooted,
            name=source.name,
            data=source.dataclass,
            max_support=source.max_support,
        )
        clade.node(clade.root).data = source.node(root).data

        def _graft(source_id, clade_id):
            """
            Depth-first copy: attach each graph successor of ``source_id``
            under ``clade_id`` and descend into it before the next one.
            """
            for child_id in graph.successors_iter(source_id):
                copied = Node(data=source.node(child_id).data)
                _graft(child_id, clade.add(copied, clade_id))

        _graft(root, clade.root)

        # Remove the copied nodes from the original tree.
        for member in graph:
            self.collapse(source, member)

        return clade
Exemple #4
0
class OIDImporter(object):
    """
    Import a set of OID files into the database.

    An instance holds the paths of one family's files.  merge() drives the
    import: family, alignment (with culling logs), tree (with nodes and
    diagnostic characters) and, when a codeml file was given, the
    positive-selection models.
    """

    def __init__(
        self,
        familyName,
        alignFile,
        alignColcullLog,
        alignSeqcullLog,
        treeFile,
        treeDiagCharsFile,
        codemlFile=None,
        alignFormat="fasta",
        oid_key=None,
    ):
        """
        Store the inputs; nothing is read or written until merge() is called.

        @param familyName: name used to look up or create the Family row
        @param alignFile: alignment file, read with Bio.AlignIO
        @param alignColcullLog: column culling log, four whitespace-separated
            fields per line
        @param alignSeqcullLog: sequence culling log, two whitespace-separated
            fields per line (sequence name, score)
        @param treeFile: tree file, parsed with Bio.Nexus.Trees.Tree
        @param treeDiagCharsFile: file handed to DiagCharsParser
        @param codemlFile: optional; the codeml import is skipped when falsy
        @param alignFormat: format name passed to AlignIO
        @param oid_key: stored on the instance; not read by this class
        """
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        """
        Run the import inside a fresh session.

        A family that does not exist yet is created together with its
        alignment and tree; an existing one is reused.  Codeml results are
        imported only when the tree has none.  The session is committed on
        success and closed in every case, so a failed import no longer leaves
        it open.
        """
        from hpf.hddb.db import Session, Family

        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            # NOTE(review): the branch above reads `family.alignment` while
            # this check reads `family.alignments[0]`; left as found, confirm
            # both attributes exist on Family and refer to the same row.
            codeml = self.family.alignments[0].tree.codeml
            if not codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml", codeml.id)

            self.session.commit()
        finally:
            # Closing also discards whatever an exception left uncommitted.
            self.session.close()

    def _index(self, name):
        """
        Extract the numeric key from a name such as ``rice#1182215``.

        Takes the text after the last '#', drops one leading 'N' if present
        and requires the remainder to consist of digits only.

        @param name: taxon or sequence name
        @return: the key as a string of digits (not converted to int)
        @raise ValueError: when no all-digit key is left; an explicit raise
            is used because an assert would vanish under ``python -O``
        """
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        if not n.isdigit():
            raise ValueError("No numeric key in name %r" % (name,))
        return n

    def _tree(self):
        """
        Load the tree file, store it with its nodes and import the
        diagnostic characters.

        Requires self.alignment to be set; sets self.nexus and self.tree.
        """
        from Bio.Nexus.Trees import Tree as NewickTree
        from hpf.amnh.oid import DiagCharsParser
        from hpf.hddb.db import Tree, TreeFactory, TreeNodeFactory

        session = self.session

        # Read through a context manager so the handle is closed promptly.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa to their numeric keys.
        for terminal_id in self.nexus.get_terminals():
            node = self.nexus.node(terminal_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object; the flush assigns its id.
        self.tree = Tree(
            alignment_key=self.alignment.id,
            text=self.nexus.to_string(plain=False, plain_newick=True),
            filename=self.treeFile,
        )
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)

        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # Appending to the relationship should add the node to the session.
            self.tree.nodes.append(node)
            # Flushed per node, presumably so that this node's id exists
            # before a later node reads it via node.ancestor.id -- TODO
            # confirm the factory yields ancestors first.
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
        runtime().debug("DiagChars", len(diagchars))
        for d in diagchars:
            session.add(d)
        session.flush()

    def _codeml(self):
        """
        Import the positive-selection models and their sites from the codeml
        file.  Does nothing when no codeml file was configured.

        Site columns are rewritten to the indices given by
        CulledColumnMapper before the sites are added to the session.
        """
        if not self.codemlFile:
            return
        assert self.family.id is not None
        assert self.tree.id is not None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment, self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            # Flush so model.id is available for the sites below.
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column, site.probability)
                try:
                    self.session.add(site)
                except Exception:
                    # Log which site failed, then let the error propagate.
                    runtime().debug(i, ":", j, " failure on column", orig, "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()

        runtime().debug("finished import codeml")

    def _alignment(self):
        """
        Read the alignment, store it with one row per aligned protein, and
        import the column and sequence culling logs.

        Requires self.family to be set; sets self.alignment.
        """
        from Bio import AlignIO
        from hpf.hddb.db import Alignment, AlignmentProtein, FamilyProtein
        from hpf.hddb.db import AlignmentColcull, AlignmentSeqcull

        session = self.session

        # Read the alignment
        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)
        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)
        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)

        self.alignment = Alignment(
            family_key=self.family.id, format=self.alignFormat, filename=self.alignFile, text=text.getvalue()
        )
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add the records.
        for record in align:
            protein_key = record.id
            # NOTE(review): protein_key is the digit string from _index(), so
            # neither comparison can fail here; kept as found.
            assert protein_key is not None and protein_key != 0, protein_key
            runtime().debug("protein: ", protein_key)

            s = AlignmentProtein(alignment_key=self.alignment.id, protein_key=protein_key, sequence=str(record.seq))
            session.add(s)
            session.flush()

            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            fs = FamilyProtein(family_key=self.family.id, protein_key=protein_key, seed=True)
            session.merge(fs)

        # Now read the column culling log.  Indices start at 0 here.
        with open(self.alignColcullLog) as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines instead of failing to unpack.
                    continue
                column, _gap, _taxa, ratio = fields
                col = AlignmentColcull(alignment_key=self.alignment.id, column=column, gap_percentage=ratio)
                session.merge(col)
        with open(self.alignSeqcullLog) as handle:
            # Example line: rice#1182215    0.712765957446808
            for line in handle:
                parts = line.split()
                if not parts:
                    continue
                seq, score = parts
                # _index() raises ValueError unless the key is all digits,
                # so no further validation is needed.
                seq = self._index(seq)
                cul = AlignmentSeqcull(alignment_key=self.alignment.id, protein_key=seq, score=score)
                # The row used to be built and then dropped; persist it the
                # same way as the column culling rows.
                session.merge(cul)
        session.flush()

    def _family(self):
        """
        Create the Family row for self.familyName (experiment_key=0), add it
        to the session and flush so that self.family.id is assigned.
        """
        session = self.session
        from hpf.hddb.db import Family

        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()
        ### prune taxa we don't want ###
        # NOTE(review): this section reads `mytreeobj` and `oldid_newid`,
        # neither of which is defined in the visible code -- it looks like a
        # fragment of a different function; confirm where it belongs.
        #
        # Builds `badtaxa`, the list of taxa to discard.  Only the selection
        # happens here; no call that removes them from the tree is visible.

        alltaxa = mytreeobj.get_taxa()
        badtaxa = []
        # Key: the part of a taxon name before '_bpgseq'.
        # Value: (taxon, branchlength) with the smallest branch length seen
        # so far for that key.
        slowest_inparalogs = {}

        for taxon in alltaxa:
            # Taxa that are not among the values of oldid_newid are discarded.
            if taxon not in oldid_newid.values():
                badtaxa.append(taxon)

            else:
                sp = taxon.split('_bpgseq')[0]

                if sp in slowest_inparalogs:
                    (old_taxon, old_brlen) = slowest_inparalogs[sp]
                    new_brlen = mytreeobj.node(
                        mytreeobj.search_taxon(taxon)).get_data().branchlength

                    # Strictly shorter branch wins; the previous holder is
                    # discarded.
                    if new_brlen < old_brlen:
                        slowest_inparalogs[sp] = (
                            taxon, mytreeobj.node(mytreeobj.search_taxon(
                                taxon)).get_data().branchlength)
                        badtaxa.append(old_taxon)

                    # On a tie or a longer branch the current taxon is
                    # discarded and the stored one is kept.
                    else:
                        badtaxa.append(taxon)

                # First taxon seen for this key: record it as the current best.
                else:
                    slowest_inparalogs[sp] = (taxon,
                                              mytreeobj.node(
                                                  mytreeobj.search_taxon(taxon)
                                              ).get_data().branchlength)
Exemple #6
0
class OIDImporter(object):
    """
    Import a set of OID files into the database.

    The constructor only records file paths.  merge() performs the import:
    family, alignment (with culling logs), tree (with nodes and diagnostic
    characters) and, when a codeml file was given, the positive-selection
    models.
    """
    def __init__(self,
                 familyName,
                 alignFile,
                 alignColcullLog,
                 alignSeqcullLog,
                 treeFile,
                 treeDiagCharsFile,
                 codemlFile=None,
                 alignFormat="fasta",
                 oid_key=None):
        """
        Store the inputs; no file or database access happens here.

        @param familyName: name used to look up or create the Family row
        @param alignFile: alignment file, read with Bio.AlignIO
        @param alignColcullLog: column culling log, four whitespace-separated
            fields per line
        @param alignSeqcullLog: sequence culling log, two whitespace-separated
            fields per line (sequence name, score)
        @param treeFile: tree file, parsed with Bio.Nexus.Trees.Tree
        @param treeDiagCharsFile: file handed to DiagCharsParser
        @param codemlFile: optional; the codeml import is skipped when falsy
        @param alignFormat: format name passed to AlignIO
        @param oid_key: stored on the instance; not read by this class
        """
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        """
        Run the import inside a fresh session.

        Creates the family, alignment and tree when the family is unknown,
        otherwise reuses the stored ones, and imports codeml results only
        when the tree has none.  Commits on success; the session is closed
        whether or not the import succeeded.
        """
        from hpf.hddb.db import Session, Family
        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(
                Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            # NOTE(review): `family.alignment` is used above but
            # `family.alignments[0]` here; kept as found, confirm that both
            # attributes exist on Family and point at the same row.
            codeml = self.family.alignments[0].tree.codeml
            if not codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml", codeml.id)

            self.session.commit()
        finally:
            # Always release the session; uncommitted work is discarded.
            self.session.close()

    def _index(self, name):
        """
        Return the numeric key embedded in a name such as ``rice#1182215``.

        The key is the text after the last '#', minus one leading 'N' if
        there is one.

        @param name: taxon or sequence name
        @return: the key as a string of digits (not converted to int)
        @raise ValueError: when the remaining text is not all digits; raised
            explicitly because an assert is removed under ``python -O``
        """
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        if not n.isdigit():
            raise ValueError("No numeric key in name %r" % (name, ))
        return n

    def _tree(self):
        """
        Load the tree file, store it with its nodes and import the
        diagnostic characters.

        Requires self.alignment to be set; sets self.nexus and self.tree.
        """
        from Bio.Nexus.Trees import Tree as NewickTree
        from hpf.amnh.oid import DiagCharsParser
        from hpf.hddb.db import Tree, TreeFactory, TreeNodeFactory

        session = self.session

        # Use a context manager so the file handle is not leaked.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa to their numeric keys.
        for terminal_id in self.nexus.get_terminals():
            node = self.nexus.node(terminal_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object; the flush assigns its id.
        self.tree = Tree(alignment_key=self.alignment.id,
                         text=self.nexus.to_string(plain=False,
                                                   plain_newick=True),
                         filename=self.treeFile)
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)

        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # Appending to the relationship should add the node to the session.
            self.tree.nodes.append(node)
            # One flush per node, presumably so its id is assigned before a
            # later node reads it via node.ancestor.id -- TODO confirm the
            # factory yields ancestors before descendants.
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(
            self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
        runtime().debug("DiagChars", len(diagchars))
        for d in diagchars:
            session.add(d)
        session.flush()

    def _codeml(self):
        """
        Import the positive-selection models and their sites from the codeml
        file.  Returns immediately when no codeml file was configured.

        Each site's column is replaced by the index CulledColumnMapper gives
        for it before the site is added to the session.
        """
        if not self.codemlFile:
            return
        assert self.family.id is not None
        assert self.tree.id is not None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment,
                                    self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            # Flush so model.id can be copied onto the sites below.
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column,
                                site.probability)
                try:
                    self.session.add(site)
                except Exception:
                    # Record which site failed before re-raising.
                    runtime().debug(i, ":", j, " failure on column", orig,
                                    "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()

        runtime().debug("finished import codeml")

    def _alignment(self):
        """
        Read the alignment, store it with one row per aligned protein, and
        import the column and sequence culling logs.

        Requires self.family to be set; sets self.alignment.
        """
        from Bio import AlignIO
        from hpf.hddb.db import Alignment, AlignmentProtein, FamilyProtein
        from hpf.hddb.db import AlignmentColcull, AlignmentSeqcull

        session = self.session

        # Read the alignment
        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)
        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)
        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)
        self.alignment = Alignment(family_key=self.family.id,
                                   format=self.alignFormat,
                                   filename=self.alignFile,
                                   text=text.getvalue())
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add the records.
        for record in align:
            protein_key = record.id
            # NOTE(review): protein_key is the digit string from _index(), so
            # neither comparison can fail here; kept as found.
            assert protein_key is not None and protein_key != 0, protein_key
            runtime().debug("protein: ", protein_key)
            s = AlignmentProtein(alignment_key=self.alignment.id,
                                 protein_key=protein_key,
                                 sequence=str(record.seq))
            session.add(s)
            session.flush()

            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            fs = FamilyProtein(family_key=self.family.id,
                               protein_key=protein_key,
                               seed=True)
            session.merge(fs)

        # Now read the column culling log.  Indices start at 0 here.
        with open(self.alignColcullLog) as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    # A blank line carries no record; skip it.
                    continue
                column, _gap, _taxa, ratio = fields
                col = AlignmentColcull(alignment_key=self.alignment.id,
                                       column=column,
                                       gap_percentage=ratio)
                session.merge(col)
        with open(self.alignSeqcullLog) as handle:
            # Example line: rice#1182215    0.712765957446808
            for line in handle:
                parts = line.split()
                if not parts:
                    continue
                seq, score = parts
                # _index() already rejects anything that is not all digits.
                seq = self._index(seq)
                cul = AlignmentSeqcull(alignment_key=self.alignment.id,
                                       protein_key=seq,
                                       score=score)
                # Previously the row was created and silently discarded;
                # merge it like the column culling rows.
                session.merge(cul)
        session.flush()

    def _family(self):
        """
        Create the Family row for self.familyName (experiment_key=0), add it
        to the session and flush so that self.family.id is assigned.
        """
        session = self.session
        from hpf.hddb.db import Family
        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()