コード例 #1
0
    def addFamilies(self, eventsfile, discard=[]):
        """Insert one row per family of an events table into "Families".

        The "Families" table is created first when tableExists() reports it
        missing.  Every row of the table read from `eventsfile` whose
        "partid" is not in `discard` gets its tree file and FASTA file read,
        and one INSERT is executed through `self.cur`.

        eventsfile -- path handed to tablelib.read_table()
        discard    -- iterable of "partid" values to skip (each one is logged)
        """
        if not tableExists(self.cur, "Families"):
            self.makeFamiliesTable()

        util.tic("add families")
        events_tab = tablelib.read_table(eventsfile)
        # NOTE(review): this result is never read below; the call is kept
        # in case lookup() validates the table -- confirm before removing.
        events_lookup = events_tab.lookup("partid")
        familyGeneNames = self.makeFamilyGeneNames()
        skipped = set(discard)

        for row in events_tab:
            famid = row["partid"]
            if famid in skipped:
                util.logger("discarding '%s'" % famid)
                continue

            # sum of the `dist` attribute over every node of the tree
            tree = treelib.read_tree(self.getTreeFile(famid))
            treelen = sum(node.dist for node in tree)

            # median sequence length of the family (multiplied by 3 below)
            seqs = fasta.read_fasta(self.getFastaFile(famid))
            seqlen = stats.median(map(len, seqs.values()))

            # two-element name entry; empty strings when the family is unknown
            names = familyGeneNames.get(row["partid"], ("", ""))
            values = (row["partid"], names[0],
                      row["famrate"], treelen, seqlen * 3,
                      row["dup"], row["loss"], row["genes"],
                      names[1])

            # NOTE(review): values are interpolated into the SQL text, so a
            # quote inside a name would break the statement; switch to the
            # driver's parameter binding once its paramstyle is confirmed.
            sql = """INSERT INTO Families VALUES
                ("%s", "%s", %f, %f, %f, %d, %d, %d,
                "%s");""" % values
            self.cur.execute(sql)
        util.toc()
コード例 #2
0
    def addFamilies(self, eventsfile, discard=[]):
        """Populate the "Families" table from an events table.

        Creates the table when tableExists() says it is absent, then walks
        the rows read from `eventsfile`.  Rows whose "partid" appears in
        `discard` are logged and skipped; for every other row the family's
        tree and FASTA files are read and one INSERT is run on `self.cur`.

        eventsfile -- path handed to tablelib.read_table()
        discard    -- iterable of "partid" values to leave out
        """
        if not tableExists(self.cur, "Families"):
            self.makeFamiliesTable()

        util.tic("add families")
        events_tab = tablelib.read_table(eventsfile)
        # NOTE(review): the returned lookup is unused in this method; the
        # call is retained in case it has a validating effect -- confirm.
        events_lookup = events_tab.lookup("partid")
        familyGeneNames = self.makeFamilyGeneNames()
        to_skip = set(discard)

        for row in events_tab:
            famid = row["partid"]
            if famid in to_skip:
                util.logger("discarding '%s'" % famid)
                continue

            # tree length = sum of `dist` over all nodes yielded by the tree
            tree = treelib.read_tree(self.getTreeFile(famid))
            treelen = sum(node.dist for node in tree)

            # median of the sequence lengths in the family's FASTA file
            seqs = fasta.read_fasta(self.getFastaFile(famid))
            seqlen = stats.median(map(len, seqs.values()))

            # pair of names for this family, ("", "") when none is recorded
            name_pair = familyGeneNames.get(row["partid"], ("", ""))
            values = (row["partid"], name_pair[0],
                      row["famrate"], treelen, seqlen * 3,
                      row["dup"], row["loss"], row["genes"],
                      name_pair[1])

            # NOTE(review): the statement is built by string interpolation;
            # prefer bound parameters once the cursor's paramstyle is known.
            sql = """INSERT INTO Families VALUES 
                                ("%s", "%s", %f, %f, %f, %d, %d, %d,
                                 "%s");""" % values
            self.cur.execute(sql)
        util.toc()
コード例 #3
0
def revtranslate_align(aaseqs, dnaseqs, check=False, trim=False):
    """Reverse translate an amino-acid alignment into a DNA alignment.

    `dnaseqs` must hold the original, ungapped DNA for every name that
    appears in `aaseqs`.

    aaseqs  -- mapping name -> gapped peptide string ("-" marks a gap)
    dnaseqs -- mapping name -> ungapped DNA string (upper-cased here)
    check   -- forwarded unchanged to seqlib.revtranslate()
    trim    -- when the DNA length is not three times the residue count,
               cut DNA and/or peptide until they agree instead of only
               repairing a trailing "X"

    Returns the alignment created by new_align(aaseqs), filled per name.
    seqlib.TranslateError propagates to the caller.
    """

    result = new_align(aaseqs)

    for name, pep in aaseqs.iteritems():
        try:
            nucs = dnaseqs[name].upper()
            nucs_before = len(nucs)
            residues_before = len(pep) - pep.count("-")

            if nucs_before != residues_before * 3:
                if trim:
                    # drop any partial codon at the end
                    ncodons = len(nucs) // 3
                    nucs = nucs[:ncodons * 3]

                    if ncodons > residues_before:
                        # more codons than residues: shorten the DNA
                        nucs = nucs[:residues_before * 3]
                    else:
                        # fewer codons than residues: gap out every
                        # residue after the last one that has a codon
                        seen = 0
                        for pos, ch in enumerate(pep):
                            if ch == '-':
                                continue
                            seen += 1
                            if seen > ncodons:
                                pep = pep[:pos] + "-" * (len(pep) - pos)
                                break

                    residues_after = len(pep) - pep.count("-")
                    assert len(nucs) == residues_after * 3, (
                        len(nucs), residues_after * 3)

                    util.logger("trim dna (%d) and pep (%d)" %
                                (nucs_before - len(nucs),
                                 residues_before - residues_after))

                else:
                    # only repair: if the last non-gap residue is "X",
                    # gap it out and drop the last three nucleotides
                    last = len(pep.rstrip("-")) - 1
                    if last >= 0 and pep[last] == "X":
                        pep = pep[:last] + "-" * (len(pep) - last)
                        nucs = nucs[:-3]

            result[name] = seqlib.revtranslate(pep, nucs, check=check)
        except seqlib.TranslateError:
            raise

    return result
コード例 #4
0
ファイル: seqlib.py プロジェクト: sarab609/scraps
    def add(self, key, value, errors=False):
        """Store `value` under `key`, keeping the longest value on a clash.

        A new key is appended to `self.names` and stored.  For a key that is
        already present the new value replaces the old one only when it is
        at least as long; with `errors` true the duplicate is also logged.
        """
        if key not in self:
            self.names.append(key)
            dict.__setitem__(self, key, value)
            return

        if errors:
            util.logger("duplicate key", key)

        # ties go to the newcomer: ">=" rather than ">"
        replace = len(value) >= len(self[key])
        if replace:
            dict.__setitem__(self, key, value)
コード例 #5
0
def revtranslate_align(aaseqs, dnaseqs, check=False, trim=False):
    """Reverse translate an amino-acid alignment into a DNA alignment.

    The original ungapped DNA must be supplied in `dnaseqs` for every name
    found in `aaseqs`.

    aaseqs  -- mapping name -> gapped peptide string ("-" marks a gap)
    dnaseqs -- mapping name -> ungapped DNA string (upper-cased here)
    check   -- passed through to revtranslate()
    trim    -- if the DNA length differs from three times the residue
               count, trim DNA and/or peptide until they match; otherwise
               only a trailing "X" residue is repaired

    Returns the alignment built by new_align(aaseqs).  TranslateError is
    not handled here and reaches the caller.
    """

    out = new_align(aaseqs)

    for name, pep in aaseqs.items():
        try:
            nucs = dnaseqs[name].upper()
            start_nucs = len(nucs)
            start_residues = len(pep) - pep.count("-")

            if start_nucs != start_residues * 3:
                if trim:
                    # keep whole codons only
                    whole = len(nucs) // 3
                    nucs = nucs[:whole * 3]

                    if whole > start_residues:
                        # surplus codons: cut the DNA down to the peptide
                        nucs = nucs[:start_residues * 3]
                    else:
                        # surplus residues: replace everything from the
                        # first residue without a codon onwards by gaps
                        count = 0
                        for idx, ch in enumerate(pep):
                            if ch == '-':
                                continue
                            count += 1
                            if count > whole:
                                pep = pep[:idx] + "-" * (len(pep) - idx)
                                break

                    end_residues = len(pep) - pep.count("-")
                    assert len(nucs) == end_residues * 3, (
                        len(nucs), end_residues * 3)

                    util.logger("trim dna (%d) and pep (%d)" %
                                (start_nucs - len(nucs),
                                 start_residues - end_residues))

                else:
                    # repair only when the last non-gap residue is "X":
                    # gap it out and drop the final three nucleotides
                    tail = len(pep.rstrip("-")) - 1
                    if tail >= 0 and pep[tail] == "X":
                        pep = pep[:tail] + "-" * (len(pep) - tail)
                        nucs = nucs[:-3]

            out[name] = revtranslate(pep, nucs, check=check)
        except TranslateError:
            raise

    return out
コード例 #6
0
ファイル: seqlib.py プロジェクト: xysheep/compbio
    def add(self, key, value, errors=False):
        """Insert `key` -> `value`; on a duplicate key keep the longest value.

        First-time keys are recorded in `self.names` and stored.  When the
        key already exists, the stored value is overwritten only if the new
        one is the same length or longer.  Set `errors` to have duplicates
        reported through util.logger().
        """
        already_present = key in self

        if not already_present:
            self.names.append(key)
            dict.__setitem__(self, key, value)
            return

        if errors:
            util.logger("duplicate key", key)

        # an equally long value still wins over the stored one
        if not len(value) < len(self[key]):
            dict.__setitem__(self, key, value)
コード例 #7
0
def exec_phylip(cmd, args, verbose=False):
    """Execute a phylip-like program that expects arguments from stdin.

    The shell command `cat <<EOF | <cmd>` is run through os.system() with
    `args` as the here-document text, so the program reads `args` on its
    standard input.

    cmd     -- shell command line of the program to run
    args    -- text to feed to the program's stdin
    verbose -- when false, the program's stdout and stderr are sent to
               /dev/null; when true they are left alone and the command
               is logged a second time

    Raises AssertionError when os.system() returns a non-zero status.
    """

    util.logger("exec: %s" % cmd)
    util.logger("args: %s" % args)

    if verbose:
        util.logger("exec: %s" % cmd)
        util.logger("args: %s" % args)
        redirect = ""
    else:
        redirect = " >/dev/null 2>&1"

    # Run the command OUTSIDE of an assert statement: assert expressions are
    # not evaluated under `python -O`, which would silently skip the program.
    status = os.system(
        """cat <<EOF | %s%s
%s"""
        % (cmd, redirect, args)
    )

    # Keep raising AssertionError so existing callers see the same type.
    if status != 0:
        raise AssertionError(
            "command '%s' exited with status %s" % (cmd, status))
コード例 #8
0
    def walk(node):
        """Post-order merge of the partitions of a node's two children.

        Children are visited first.  For every non-leaf node the blast files
        linking leaves of child 0 to leaves of child 1 are collected, plus
        (when the node has a parent) the files linking this node's leaves to
        the remaining leaves under the parent.  The children's `parts` are
        then merged with mergeAvg() or mergeBuh() and stored on `node.parts`;
        the result is written to disk when conf has an "output" entry.
        """
        for child in node.children:
            walk(child)

        if node.is_leaf():
            return

        def files_between(names_a, names_b):
            # blast files registered for (a, b), in nested-loop order
            found = []
            for a in names_a:
                for b in names_b:
                    if a in blastFileLookup and b in blastFileLookup[a]:
                        found.append(blastFileLookup[a][b])
            return found

        leaves1 = node.children[0].leaf_names()
        leaves2 = node.children[1].leaf_names()

        # sibling files: child 0 leaves against child 1 leaves
        blastfiles = files_between(leaves1, leaves2)

        # outgroup files: this node's leaves against every other leaf
        # found under the parent
        outblastfiles = []
        if node.parent:
            inleaves = leaves1 + leaves2
            outleaves = set(node.parent.leaf_names()) - set(inleaves)
            outblastfiles = files_between(inleaves, outleaves)

        util.tic("merging")
        util.logger("leaves1: ", leaves1)
        util.logger("leaves2: ", leaves2)

        parts1 = node.children[0].parts
        parts2 = node.children[1].parts
        use_avg = "merge" in conf and conf["merge"] == "avg"
        if use_avg:
            node.parts = mergeAvg(conf, genes, parts1, parts2,
                                  blastfiles, outblastfiles)
        else:
            node.parts = mergeBuh(conf, genes, parts1, parts2, blastfiles)

        nparts = len(node.parts)
        if "output" in conf and nparts > 0:
            outfile = conf["output"] + str(node.name) + ".part"
            util.write_delim(outfile, node.parts)

        util.logger("number of parts: ", len(node.parts))
        if len(node.parts) > 0:
            util.logger("largest part:", max(map(len, node.parts)))

        util.toc()
コード例 #9
0
    def removeFamily(self, *famids):
        """Remove families from the table and archive their directories.

        Rows of `self.famtab` whose "famid" is one of `famids` are dropped,
        the table file is archived and rewritten, and the directory of each
        removed family is moved into `self.olddatadir`.  Ids that were not
        in the table are only reported in the log.

        famids -- one or more family ids to remove
        """
        famids = set(famids)

        # Snapshot the ids BEFORE filtering.  Taking it afterwards (as the
        # code used to) can never contain a removed id, so every family was
        # reported as missing and no directory was ever archived.
        existing_famids = set(self.famtab.cget("famid"))

        self.famtab[:] = [fam for fam in self.famtab
                          if fam["famid"] not in famids]

        # update disk
        self.archive()
        self.saveTable()
        for famid in famids:
            if famid in existing_famids:
                famdir = self.familyDir(famid)
                shutil.move(famdir, os.path.join(self.olddatadir, famid))
                util.logger("famdb: archived '%s'" % famdir)
            else:
                util.logger("famdb: family '%s' does not exist" % famid)
コード例 #10
0
 def removeFamily(self, *famids):
     """Drop families from the table and move their directories away.

     Every row of `self.famtab` whose "famid" is listed in `famids` is
     removed; the table file is archived and saved again; then each removed
     family's directory is moved under `self.olddatadir`.  An id that was
     not present in the table is logged as non-existent.

     famids -- one or more family ids to remove
     """
     famids = set(famids)

     # The set of known ids must be taken before the rows are filtered out;
     # computed afterwards it never contains any id being removed, which
     # made the archive branch below unreachable.
     known_famids = set(self.famtab.cget("famid"))

     self.famtab[:] = [fam for fam in self.famtab
                       if fam["famid"] not in famids]

     # update disk
     self.archive()
     self.saveTable()
     for famid in famids:
         if famid in known_famids:
             famdir = self.familyDir(famid)
             shutil.move(famdir, os.path.join(self.olddatadir, famid))
             util.logger("famdb: archived '%s'" % famdir)
         else:
             util.logger("famdb: family '%s' does not exist" % famid)
コード例 #11
0
    def addFamily(self, parts, famids=None):
        """Add new families to the table and create their directories.

        parts  -- sequence of gene-name collections, one per new family;
                  each is stored as a comma-joined "genes" value
        famids -- ids for the new families.  When omitted, consecutive
                  integer ids following the largest existing "famid" are
                  generated (existing ids are assumed to be integers).

        The table file is archived and rewritten, then one directory per
        id in `famids` is created with os.mkdir().
        """
        if famids is None:
            # assume famids are ints
            maxid = max(map(int, self.famtab.cget("famid")))
            famids = [str(i)
                      for i in range(maxid + 1, maxid + 1 + len(parts))]
        else:
            # famids is walked twice below (table rows, then directories);
            # materialise it so a one-shot iterable is not exhausted early
            famids = list(famids)

        for famid, part in zip(famids, parts):
            self.famtab.add(famid=famid, genes=",".join(part))
            util.logger("famdb: added family '%s'" % famid)

        # update disk
        self.archive()
        self.saveTable()

        for famid in famids:
            os.mkdir(self.familyDir(famid))
コード例 #12
0
 def addFamily(self, parts, famids=None):
     """Register new families in the table and make their directories.

     parts  -- sequence of gene-name collections, one per new family; each
               is written to the "genes" column joined by commas
     famids -- ids to use for the new families.  If not given, integer ids
               continuing after the highest current "famid" are generated
               (current ids are assumed to be integers).

     After the rows are added the table file is archived and saved, and a
     directory is created with os.mkdir() for every id in `famids`.
     """
     if famids is None:
         # assume famids are ints
         maxid = max(map(int, self.famtab.cget("famid")))
         famids = [str(i)
                   for i in range(maxid + 1, maxid + 1 + len(parts))]
     else:
         # the ids are iterated twice (rows, then directories); copy them
         # into a list so an iterator argument is not used up by the
         # first loop
         famids = list(famids)

     for famid, part in zip(famids, parts):
         self.famtab.add(famid=famid,
                         genes=",".join(part))
         util.logger("famdb: added family '%s'" % famid)

     # update disk
     self.archive()
     self.saveTable()

     for famid in famids:
         os.mkdir(self.familyDir(famid))
コード例 #13
0
    def walk(node):
        """Recursively merge child partitions into `node.parts`.

        The recursion handles both children before the node itself.  For a
        non-leaf node it gathers the blast files between the leaves of its
        two children and, if the node has a parent, the blast files between
        its own leaves and the other leaves below that parent.  The merge is
        done by mergeAvg() when conf["merge"] is "avg", otherwise by
        mergeBuh(); non-empty results are written out when conf contains
        "output".
        """
        for child in node.children:
            walk(child)

        if node.is_leaf():
            return

        def collect(first_names, second_names):
            # every registered blast file for a (first, second) leaf pair
            hits = []
            for name1 in first_names:
                for name2 in second_names:
                    if name1 in blastFileLookup and \
                       name2 in blastFileLookup[name1]:
                        hits.append(blastFileLookup[name1][name2])
            return hits

        leaves1 = node.children[0].leaf_names()
        leaves2 = node.children[1].leaf_names()

        # files between the two sibling subtrees
        blastfiles = collect(leaves1, leaves2)

        # files between this subtree and the rest of the parent's subtree
        outblastfiles = []
        if node.parent:
            inleaves = leaves1 + leaves2
            outleaves = set(node.parent.leaf_names()) - set(inleaves)
            outblastfiles = collect(inleaves, outleaves)

        util.tic("merging")
        util.logger("leaves1: ", leaves1)
        util.logger("leaves2: ", leaves2)

        first_parts = node.children[0].parts
        second_parts = node.children[1].parts
        if "merge" in conf and conf["merge"] == "avg":
            merged = mergeAvg(conf, genes, first_parts, second_parts,
                              blastfiles, outblastfiles)
        else:
            merged = mergeBuh(conf, genes, first_parts, second_parts,
                              blastfiles)
        node.parts = merged

        if "output" in conf and len(node.parts) > 0:
            partfile = conf["output"] + str(node.name) + ".part"
            util.write_delim(partfile, node.parts)

        util.logger("number of parts: ", len(node.parts))
        if len(node.parts) > 0:
            util.logger("largest part:", max(map(len, node.parts)))

        util.toc()
コード例 #14
0
def exec_phylip(cmd, args, verbose=False):
    """Execute a phylip-like program that expects arguments from stdin.

    Runs `cat <<EOF | <cmd>` through os.system(), with `args` supplied as
    the here-document text so that the program receives it on stdin.

    cmd     -- shell command line of the program to run
    args    -- text to feed to the program's stdin
    verbose -- if true, the program's output is left visible and the
               command is logged once more; if false, stdout and stderr
               are redirected to /dev/null

    Raises AssertionError if os.system() reports a non-zero status.
    """

    util.logger("exec: %s" % cmd)
    util.logger("args: %s" % args)

    if verbose:
        util.logger("exec: %s" % cmd)
        util.logger("args: %s" % args)
        redirect = ""
    else:
        redirect = " >/dev/null 2>&1"

    # The call must not live inside an assert statement: with `python -O`
    # asserts are removed and the program would never be executed.
    status = os.system("""cat <<EOF | %s%s
%s""" % (cmd, redirect, args))

    # Same exception type as before, so callers are unaffected.
    if status != 0:
        raise AssertionError(
            "command '%s' exited with status %s" % (cmd, status))
コード例 #15
0
ファイル: argweaverc.py プロジェクト: mjhubisz/argweaver
def resample_arg_regions(arg, seqs, niters, width=1000,
                         ntimes=20, rho=1.5e-8, mu=2.5e-8,
                         popsize=1e4, times=None, carg=False,
                         verbose=False):
    """Repeatedly resample the region of an ARG with the most recombinations.

    In each of `niters` iterations the windows produced by
    stats.iter_window_index(recomb_pos, width) are scanned and the one
    spanning the most recombination positions is selected.  It is padded by
    10 on each side, clamped to [10, seqlen - 10], and passed to
    argweaver.resample_arg_region().

    arg     -- the ARG to resample; when is_carg(arg) is true it is unpacked
               as (trees, names) and converted with ctrees2arg() only to
               read the initial recombination positions
    seqs    -- mapping of sequences; the first value gives the length
    niters  -- number of resampling iterations
    width   -- window width handed to stats.iter_window_index()
    rho, mu, times, carg -- forwarded to argweaver.resample_arg_region()
    ntimes, popsize      -- accepted but not used in this function body
    verbose -- enables timing/log output and is forwarded to the helpers

    Returns the ARG returned by the last resampling step.  Raises
    ValueError when no window has been found by the time a region is needed.
    """
    # next(iter(...)) instead of values()[0]: dict views are not indexable
    seqlen = len(next(iter(seqs.values())))

    if is_carg(arg):
        trees, names = arg
        arg2 = ctrees2arg(trees, names, times, verbose=verbose,
                          delete_arg=False)
        recomb_pos = [x.pos for x in arg2 if x.event == "recomb"]
    else:
        recomb_pos = [x.pos for x in arg if x.event == "recomb"]

    # Defined up front so a scan that yields no window is detectable.  A
    # region chosen in an earlier iteration is still reused, as before.
    region = None

    for it in range(niters):
        maxr = 0
        for i, j, a, b in stats.iter_window_index(recomb_pos, width):
            r = j - i + 1
            if r > maxr:
                maxr = r
                region = [max(recomb_pos[i]-10, 10),
                          min(recomb_pos[j]+10, seqlen - 10)]

        if region is None:
            # previously surfaced as an UnboundLocalError on `region`
            raise ValueError("no recombination window found to resample")

        if verbose:
            util.tic("sample ARG region %s" % region)
        # honour the caller's verbose flag (was hard-coded to True, next to
        # a leftover debug print of the whole ARG)
        arg = argweaver.resample_arg_region(arg, seqs, region[0], region[1],
                                            rho=rho, mu=mu, times=times,
                                            carg=carg, verbose=verbose)
        if not carg:
            # positions are refreshed only when the result can be iterated
            # as an ARG here
            recomb_pos = [x.pos for x in arg if x.event == "recomb"]
            if verbose:
                util.logger("%d: # recombs %d" % (it, len(recomb_pos)))
        if verbose:
            util.toc()

    return arg
コード例 #16
0
    def archive(self):
        """Copy the table file into `self.olddatadir` under a stamped name.

        The copy is named `self.filename` followed by "-" and the value of
        `self.timestamp()`; its path is logged afterwards.
        """
        stamped_name = self.filename + "-" + self.timestamp()
        backup_path = os.path.join(self.olddatadir, stamped_name)

        shutil.copy(self.filename, backup_path)
        util.logger("famdb: archived '%s'" % backup_path)
コード例 #17
0
 def saveTable(self):
     """Write `self.famtab` to `self.filename` and log the update."""
     target = self.filename
     self.famtab.write(target)
     util.logger("famdb: updated '%s'" % target)
コード例 #18
0
 def archive(self):
     """Save a timestamped copy of the table file in `self.olddatadir`.

     The destination name is `self.filename` + "-" + `self.timestamp()`;
     the destination path is written to the log once the copy is made.
     """
     copy_name = self.filename + "-" + self.timestamp()
     destination = os.path.join(self.olddatadir, copy_name)

     shutil.copy(self.filename, destination)
     util.logger("famdb: archived '%s'" % destination)
コード例 #19
0
 def saveTable(self):
     """Persist the family table to `self.filename`, then log the path."""
     path = self.filename
     self.famtab.write(path)
     util.logger("famdb: updated '%s'" % path)