Ejemplo n.º 1
0
def add(file, family_key):
    """
    Parse the tree stored in `file` and register it, with its nodes, in the session.

    The file content is parsed with Bio.Nexus.Trees.Tree.  A `Tree` row is
    filled with `family_key`, the re-serialised tree text and the base name
    of `file`, then added to the module-level `session` and flushed.  Every
    object produced by `TreeNodeFactory().create(...)` is added afterwards.
    Nothing is committed here.
    """
    with open(file) as source:
        from Bio.Nexus.Trees import Tree as NexusTree
        raw_text = source.read()
    parsed = NexusTree(raw_text)

    record = Tree()
    record.family_key = family_key
    record.text = parsed.to_string(plain=False, plain_newick=True)
    record.filename = os.path.basename(file)

    session.add(record)
    # The row is flushed before its id is read on the next line.
    session.flush()

    # The parsed tree is named after the stored row's id before the node
    # objects are generated from it.
    parsed.name = record.id
    node_rows = TreeNodeFactory().create(parsed)
    for row in node_rows:
        session.add(row)
Ejemplo n.º 2
0
def add(file, family_key):
    """
    Store the tree found in `file` as a `Tree` row plus its node rows.

    Steps: read and parse the file with Bio.Nexus.Trees.Tree, populate a new
    `Tree` object (family key, serialised text, base file name), add it to
    the module-level `session` and flush, then add each object yielded by
    `TreeNodeFactory().create(...)`.  The session is not committed here.
    """
    with open(file) as stream:
        from Bio.Nexus.Trees import Tree as NexusTree
        contents = stream.read()
    nexus_tree = NexusTree(contents)

    stored = Tree()
    stored.family_key = family_key
    stored.text = nexus_tree.to_string(plain=False, plain_newick=True)
    stored.filename = os.path.basename(file)

    session.add(stored)
    # Flush happens before the id is copied onto the parsed tree below.
    session.flush()

    nexus_tree.name = stored.id
    for tree_node in TreeNodeFactory().create(nexus_tree):
        session.add(tree_node)
Ejemplo n.º 3
0
Archivo: oid.py Proyecto: bsmithers/hpf
class OIDImporter(object):
    """
    Import a set of OID files into the database.

    One importer instance handles one family: its alignment, the column and
    sequence culling logs, the tree with its diagnostic characters and,
    optionally, a codeml result file.  Call `merge()` to run the import.
    """

    def __init__(
        self,
        familyName,
        alignFile,
        alignColcullLog,
        alignSeqcullLog,
        treeFile,
        treeDiagCharsFile,
        codemlFile=None,
        alignFormat="fasta",
        oid_key=None,
    ):
        """
        Remember the input paths and options; no file or database access happens here.

        :param familyName: name used to look up (or create) the Family row.
        :param alignFile: alignment file, read with Bio.AlignIO.
        :param alignColcullLog: column culling log (four whitespace-separated fields per line).
        :param alignSeqcullLog: sequence culling log (two whitespace-separated fields per line).
        :param treeFile: tree file parsed with Bio.Nexus.Trees.Tree.
        :param treeDiagCharsFile: diagnostic characters file handed to DiagCharsParser.
        :param codemlFile: optional codeml output; when falsy the codeml import is skipped.
        :param alignFormat: Bio.AlignIO format name used for reading and re-writing the alignment.
        :param oid_key: stored on the instance; not used by any method of this class.
        """
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        """
        Run the import inside a fresh session and commit it.

        If no Family with `familyName` exists, the family, alignment and tree
        are created.  Otherwise the existing alignment and tree are reused.
        The codeml import then runs only when the tree has no codeml yet.
        The session is closed even when a step raises, so a failed import
        does not leave it open; the exception still propagates.
        """
        from hpf.hddb.db import Session, Family

        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                # NOTE(review): this branch reads `family.alignment` while the
                # check below reads `family.alignments[0]` -- confirm both
                # attributes exist on Family.
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            codeml = self.family.alignments[0].tree.codeml
            if not codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml", codeml.id)

            # Commit the session and finish
            self.session.commit()
        finally:
            self.session.close()

    def _index(self, name):
        """
        Return the numeric key embedded in `name`, as a string of digits.

        Takes the text after the last '#' (e.g. "rice#1182215" -> "1182215")
        and drops one leading "N" if present.  Asserts the result is all digits.
        """
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        assert n.isdigit(), name
        return n

    def _tree(self):
        """
        Parse the tree file, store it with its nodes, then import the diagnostic characters.

        Sets `self.nexus` (the parsed tree with renamed taxa) and `self.tree`
        (the database row).  Requires `self.alignment` to be set already.
        """
        from Bio.Nexus.Trees import Tree as NewickTree
        from hpf.amnh.oid import DiagCharsParser
        from hpf.hddb.db import Tree, TreeFactory, TreeNodeFactory

        session = self.session

        # Read through a context manager so the handle is closed promptly.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa.
        for node_id in self.nexus.get_terminals():
            node = self.nexus.node(node_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        self.tree = Tree(
            alignment_key=self.alignment.id,
            text=self.nexus.to_string(plain=False, plain_newick=True),
            filename=self.treeFile,
        )
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)

        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # Appending to the relationship should add the node to the session.
            self.tree.nodes.append(node)
            # Flushed per node; presumably so that an ancestor's id is
            # available when its children are processed -- confirm.
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
            runtime().debug("DiagChars", len(diagchars))
            for d in diagchars:
                session.add(d)
        session.flush()

    def _codeml(self):
        """
        Import the positive-selection models and sites from the codeml file.

        Does nothing when `codemlFile` is falsy.  Each model is linked to
        `self.tree` and each of its sites has its column mapped back to the
        original alignment through CulledColumnMapper.
        """
        if not self.codemlFile:
            return
        assert self.family.id is not None
        assert self.tree.id is not None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment, self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column, site.probability)
                try:
                    self.session.add(site)
                except Exception:
                    # Log which model/site failed, then let the error propagate.
                    runtime().debug(i, ":", j, " failure on column", orig, "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()

        # NOTE: an older, commented-out CodeML/LRTParser import path used to
        # live here; it was dead code and has been dropped.
        runtime().debug("finished import codeml")

    def _alignment(self):
        """
        Store the alignment, its member proteins and both culling logs.

        Sets `self.alignment`.  Record ids are rewritten to protein keys via
        `_index` before the alignment text is serialised.  Requires
        `self.family` to be set already.
        """
        from Bio import AlignIO
        from hpf.hddb.db import Alignment, AlignmentColcull, AlignmentProtein, AlignmentSeqcull, FamilyProtein

        session = self.session

        # Read the alignment
        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)
        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)
        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)

        self.alignment = Alignment(
            family_key=self.family.id, format=self.alignFormat, filename=self.alignFile, text=text.getvalue()
        )
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add the records.
        for record in align:
            protein_key = record.id
            assert protein_key is not None and protein_key != 0, protein_key
            runtime().debug("protein: ", protein_key)

            s = AlignmentProtein(alignment_key=self.alignment.id, protein_key=protein_key, sequence=str(record.seq))
            session.add(s)
            session.flush()

            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            fs = FamilyProtein(family_key=self.family.id, protein_key=protein_key, seed=True)
            session.merge(fs)

        # Now read the column culling log.  Indices start at 0 here.
        with open(self.alignColcullLog) as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                column, _gap, _taxa, ratio = line.split()
                col = AlignmentColcull(alignment_key=self.alignment.id, column=column, gap_percentage=ratio)
                session.merge(col)
        with open(self.alignSeqcullLog) as handle:
            # Example line: rice#1182215    0.712765957446808
            for line in handle:
                if not line.strip():
                    continue
                parts = line.split()
                seq, score = parts
                seq = self._index(seq)
                if not seq.isdigit():
                    # Only reachable when asserts are disabled (-O), since
                    # _index already asserts this.
                    print("%s SEQ: %s" % (parts, seq))
                    raise AssertionError("non-numeric protein key in seqcull log: %r" % (parts,))
                cul = AlignmentSeqcull(alignment_key=self.alignment.id, protein_key=seq, score=score)
                # Previously the row was built but never stored; merge it the
                # same way the column culling rows are merged.
                session.merge(cul)
        session.flush()

    def _family(self):
        """Create the Family row for `familyName` (experiment_key fixed to 0) and flush it."""
        from hpf.hddb.db import Family

        session = self.session
        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()
                    else:
                        badtaxa.append(taxon)

                else:
                    slowest_inparalogs[sp] = (taxon,
                                              mytreeobj.node(
                                                  mytreeobj.search_taxon(taxon)
                                              ).get_data().branchlength)

        for taxon in badtaxa:
            mytreeobj.prune(taxon)

        ### print tree file ###

        treestr = mytreeobj.to_string(branchlengths_only=True,
                                      plain_newick=True)

        outfname = '%s/%d.tre' % (resultdir, phogacc)
        outf = open(outfname, 'w')
        print >> outf, treestr
        outf.close()

        ### now get the alignment ###

        alnfname = dirbase + bpg[0:6] + '/' + bpg[0:9] + '/user/shmms/' + bpg[
            0:9] + '.subfam'
        outfname = '%s/%d.a2m' % (resultdir, phogacc)
        outfile = open(outfname, 'w')
        handle = open(alnfname, 'r')

        doneit = set([])
Ejemplo n.º 5
0
class OIDImporter(object):
    """
    Import a set of OID files into the database.

    An importer covers a single family: the alignment, its column and
    sequence culling logs, the tree plus diagnostic characters and,
    optionally, a codeml result file.  `merge()` performs the import.
    """
    def __init__(self,
                 familyName,
                 alignFile,
                 alignColcullLog,
                 alignSeqcullLog,
                 treeFile,
                 treeDiagCharsFile,
                 codemlFile=None,
                 alignFormat="fasta",
                 oid_key=None):
        """
        Store the input paths and options; nothing is read or written here.

        :param familyName: name used to look up (or create) the Family row.
        :param alignFile: alignment file, read with Bio.AlignIO.
        :param alignColcullLog: column culling log (four whitespace-separated
            fields per line).
        :param alignSeqcullLog: sequence culling log (two whitespace-separated
            fields per line).
        :param treeFile: tree file parsed with Bio.Nexus.Trees.Tree.
        :param treeDiagCharsFile: diagnostic characters file handed to
            DiagCharsParser.
        :param codemlFile: optional codeml output; when falsy the codeml
            import is skipped.
        :param alignFormat: Bio.AlignIO format name used for reading and
            re-writing the alignment.
        :param oid_key: stored on the instance; not used by any method of
            this class.
        """
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        """
        Run the import inside a fresh session and commit it.

        When no Family named `familyName` exists, the family, alignment and
        tree are created; otherwise the existing alignment and tree are
        reused.  The codeml import then runs only if the tree has no codeml
        yet.  The session is closed even when a step raises, and the
        exception still propagates.
        """
        from hpf.hddb.db import Session, Family
        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(
                Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                # NOTE(review): this branch reads `family.alignment` while
                # the check below reads `family.alignments[0]` -- confirm
                # both attributes exist on Family.
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            codeml = self.family.alignments[0].tree.codeml
            if not codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml", codeml.id)

            # Commit the session and finish
            self.session.commit()
        finally:
            self.session.close()

    def _index(self, name):
        """
        Return the numeric key embedded in `name`, as a string of digits.

        Uses the text after the last '#' (e.g. "rice#1182215" -> "1182215")
        and strips one leading "N" if present.  Asserts the result is all
        digits.
        """
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        assert n.isdigit(), name
        return n

    def _tree(self):
        """
        Parse the tree file, store it with its nodes, then import the
        diagnostic characters.

        Sets `self.nexus` (parsed tree with renamed taxa) and `self.tree`
        (the database row).  Requires `self.alignment` to be set already.
        """
        from Bio.Nexus.Trees import Tree as NewickTree
        from hpf.amnh.oid import DiagCharsParser
        from hpf.hddb.db import Tree, TreeFactory, TreeNodeFactory

        session = self.session

        # Read through a context manager so the handle is closed promptly.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa.
        for node_id in self.nexus.get_terminals():
            node = self.nexus.node(node_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        self.tree = Tree(alignment_key=self.alignment.id,
                         text=self.nexus.to_string(plain=False,
                                                   plain_newick=True),
                         filename=self.treeFile)
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)

        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # Appending to the relationship should add the node to the
            # session.
            self.tree.nodes.append(node)
            # Flushed per node; presumably so that an ancestor's id is
            # available when its children are processed -- confirm.
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(
            self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
            runtime().debug("DiagChars", len(diagchars))
            for d in diagchars:
                session.add(d)
        session.flush()

    def _codeml(self):
        """
        Import the positive-selection models and sites from the codeml file.

        Returns immediately when `codemlFile` is falsy.  Each model is linked
        to `self.tree`; each of its sites has its column mapped back to the
        original alignment through CulledColumnMapper.
        """
        if not self.codemlFile:
            return
        assert self.family.id is not None
        assert self.tree.id is not None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment,
                                    self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column,
                                site.probability)
                try:
                    self.session.add(site)
                except Exception:
                    # Log which model/site failed, then let the error
                    # propagate.
                    runtime().debug(i, ":", j, " failure on column", orig,
                                    "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()

        # NOTE: an older, commented-out CodeML/LRTParser import path used to
        # live here; it was dead code and has been dropped.
        runtime().debug("finished import codeml")

    def _alignment(self):
        """
        Store the alignment, its member proteins and both culling logs.

        Sets `self.alignment`.  Record ids are rewritten to protein keys via
        `_index` before the alignment text is serialised.  Requires
        `self.family` to be set already.
        """
        from Bio import AlignIO
        from hpf.hddb.db import (Alignment, AlignmentColcull,
                                 AlignmentProtein, AlignmentSeqcull,
                                 FamilyProtein)

        session = self.session

        # Read the alignment
        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)
        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)
        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)
        self.alignment = Alignment(family_key=self.family.id,
                                   format=self.alignFormat,
                                   filename=self.alignFile,
                                   text=text.getvalue())
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add the records.
        for record in align:
            protein_key = record.id
            assert protein_key is not None and protein_key != 0, protein_key
            runtime().debug("protein: ", protein_key)
            s = AlignmentProtein(alignment_key=self.alignment.id,
                                 protein_key=protein_key,
                                 sequence=str(record.seq))
            session.add(s)
            session.flush()

            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            fs = FamilyProtein(family_key=self.family.id,
                               protein_key=protein_key,
                               seed=True)
            session.merge(fs)

        # Now read the column culling log.  Indices start at 0 here.
        with open(self.alignColcullLog) as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                column, _gap, _taxa, ratio = line.split()
                col = AlignmentColcull(alignment_key=self.alignment.id,
                                       column=column,
                                       gap_percentage=ratio)
                session.merge(col)
        with open(self.alignSeqcullLog) as handle:
            # Example line: rice#1182215    0.712765957446808
            for line in handle:
                if not line.strip():
                    continue
                parts = line.split()
                seq, score = parts
                seq = self._index(seq)
                if not seq.isdigit():
                    # Only reachable when asserts are disabled (-O), since
                    # _index already asserts this.
                    print("%s SEQ: %s" % (parts, seq))
                    raise AssertionError(
                        "non-numeric protein key in seqcull log: %r" %
                        (parts, ))
                cul = AlignmentSeqcull(alignment_key=self.alignment.id,
                                       protein_key=seq,
                                       score=score)
                # Previously the row was built but never stored; merge it
                # the same way the column culling rows are merged.
                session.merge(cul)
        session.flush()

    def _family(self):
        """Create the Family row for `familyName` (experiment_key fixed to 0) and flush it."""
        from hpf.hddb.db import Family
        session = self.session
        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()