def layout_arg_leaves(arg):
    """Layout the leaves of an ARG

    Builds a plain tree (``basetree``) while visiting the nodes of ``arg``
    in order of increasing ``age``, then returns a dict that maps
    ``arg[name]`` to the index of ``name`` in ``basetree.leaf_names()``.
    """

    def topmost(basenode):
        # climb to the current root of the subtree that holds basenode
        while basenode.parent:
            basenode = basenode.parent
        return basenode

    basetree = treelib.Tree()
    by_age = sorted(arg.postorder(), key=lambda x: x.age)
    lookup = {}

    for node in by_age:
        if node.is_leaf():
            lookup[node] = basetree.new_node(node.name)
            continue

        # distinct subtree roots that currently hold this node's children
        roots = util.unique([topmost(lookup[child])
                             for child in node.children])
        if len(roots) <= 1:
            # every child already hangs under one subtree; reuse its root
            lookup[node] = roots[0]
            continue

        # several subtrees meet here: join them under a new base node
        basenode = basetree.new_node(node.name)
        lookup[node] = basenode
        for root in roots:
            basetree.add_child(basenode, root)

    # the last node in age order becomes the root of the base tree
    basetree.root = lookup[by_age[-1]]

    # the leaf order of the base tree gives each leaf its layout index
    return {arg[name]: i for i, name in enumerate(basetree.leaf_names())}
Esempio n. 2
0
def layout_arg_leaves(arg):
    """Layout the leaves of an ARG

    Builds a plain tree (``basetree``) while visiting the nodes of ``arg``
    in order of increasing ``age``, then returns a dict that maps
    ``arg[name]`` to the index of ``name`` in ``basetree.leaf_names()``.
    """

    def topmost(basenode):
        # climb to the current root of the subtree that holds basenode
        while basenode.parent:
            basenode = basenode.parent
        return basenode

    basetree = treelib.Tree()
    by_age = sorted(arg.postorder(), key=lambda x: x.age)
    lookup = {}

    for node in by_age:
        if node.is_leaf():
            lookup[node] = basetree.new_node(node.name)
            continue

        # distinct subtree roots that currently hold this node's children
        roots = util.unique([topmost(lookup[child])
                             for child in node.children])
        if len(roots) <= 1:
            # every child already hangs under one subtree; reuse its root
            lookup[node] = roots[0]
            continue

        # several subtrees meet here: join them under a new base node
        basenode = basetree.new_node(node.name)
        lookup[node] = basenode
        for root in roots:
            basetree.add_child(basenode, root)

    # the last node in age order becomes the root of the base tree
    basetree.root = lookup[by_age[-1]]

    # the leaf order of the base tree gives each leaf its layout index
    return {arg[name]: i for i, name in enumerate(basetree.leaf_names())}
Esempio n. 3
0
def guess_seq(seq):
    """Guesses whether a sequence is 'dna' or 'pep'

    seq -- sequence string; case is ignored

    Returns "dna" when every character of seq is one of "ACTG-N"
    (this includes the empty string), otherwise "pep".
    """
    dna = set("ACTG-N")

    # a plain set comparison is enough here; no project helper is needed
    # just to collect the distinct characters
    if set(seq.upper()) <= dna:
        return "dna"
    return "pep"
def guess_seq(seq):
    """Guesses whether a sequence is 'dna' or 'pep'

    seq -- sequence string; case is ignored

    Returns "dna" when every character of seq is one of "ACTG-N"
    (this includes the empty string), otherwise "pep".
    """
    dna = set("ACTG-N")

    # a plain set comparison is enough here; no project helper is needed
    # just to collect the distinct characters
    if set(seq.upper()) <= dna:
        return "dna"
    return "pep"
Esempio n. 5
0
    def makeFamilyGeneNames(self):
        """Tries to name and describe a family using its genes

        Reads (famid, common_name, description) for all rows of Genes and
        groups them by famid.  Returns a dict mapping each famid to a
        tuple (names, description): names is the comma-joined, sorted list
        of the distinct non-empty common names with digits and "-" removed,
        and description is self.getFamDescription() applied to column 2
        (the descriptions) of the family's rows.
        """

        def strip_digits_and_dashes(name):
            # drop digits and "-" so numbered variants share one stem
            return "".join([ch for ch in name
                            if not ch.isdigit() and ch != "-"])

        self.cur.execute("""SELECT g.famid, g.common_name, g.description
                            FROM Genes g
                         """)

        rows_by_family = util.groupby(lambda x: x[0], self.cur)

        familyGeneNames = {}
        for famid, rows in rows_by_family.iteritems():
            stems = []
            for common_name in util.cget(rows, 1):
                if common_name != "":
                    stems.append(strip_digits_and_dashes(common_name))
            names = util.unique(stems)
            names.sort()

            description = self.getFamDescription(util.cget(rows, 2))

            familyGeneNames[famid] = (",".join(names), description)
        return familyGeneNames
    def makeFamilyGeneNames(self):
        """Tries to name and describe a family using its genes

        Reads (famid, common_name, description) for all rows of Genes and
        groups them by famid.  Returns a dict mapping each famid to a
        tuple (names, description): names is the comma-joined, sorted list
        of the distinct non-empty common names with digits and "-" removed,
        and description is self.getFamDescription() applied to column 2
        (the descriptions) of the family's rows.
        """

        def strip_digits_and_dashes(name):
            # drop digits and "-" so numbered variants share one stem
            return "".join([ch for ch in name
                            if not ch.isdigit() and ch != "-"])

        self.cur.execute("""SELECT g.famid, g.common_name, g.description
                            FROM Genes g
                         """)

        rows_by_family = util.groupby(lambda x: x[0], self.cur)

        familyGeneNames = {}
        for famid, rows in rows_by_family.iteritems():
            stems = []
            for common_name in util.cget(rows, 1):
                if common_name != "":
                    stems.append(strip_digits_and_dashes(common_name))
            names = util.unique(stems)
            names.sort()

            description = self.getFamDescription(util.cget(rows, 2))

            familyGeneNames[famid] = (",".join(names), description)
        return familyGeneNames
Esempio n. 7
0
# make degenerate counts
#
# For every amino acid, count the distinct bases seen at each codon
# position among the codons listed for it in REV_CODON_TABLE.
#
# example (if "R" were listed with exactly these four codons):
#
# CGT => "R"
# CGC => "R"
# CGA => "R"
# CGG => "R"
#
# AA_DEGEN["R"] = [1, 1, 4]
# CODON_DEGEN["CGT"] = [1, 1, 4]
#
CODON_DEGEN = {}
AA_DEGEN = {}
for aa, lst in REV_CODON_TABLE.items():
    # Build a real list: the same object is stored under several keys, and
    # on Python 3 map() would hand out one shared, single-use iterator.
    folds = [len(util.unique(x)) for x in zip(*lst)]
    for codon in lst:
        AA_DEGEN[aa] = folds
        CODON_DEGEN[codon] = folds


# substitution types, numbered consecutively from 0:
#   SUB_NONE  none
#   SUB_TSIT  tranSition
#   SUB_TVER  transVersion
#   SUB_INS   insert
#   SUB_DEL   del
SUB_NONE, SUB_TSIT, SUB_TVER, SUB_INS, SUB_DEL = range(5)
SUBSTITUTION_TYPES = {
    "AA": SUB_NONE, "AC": SUB_TVER, "AG": SUB_TSIT, "AT": SUB_TVER,
    "CA": SUB_TVER, "CC": SUB_NONE, "CG": SUB_TVER, "CT": SUB_TSIT,
    "GA": SUB_TSIT, "GC": SUB_TVER, "GG": SUB_NONE, "GT": SUB_TVER,
Esempio n. 8
0
def _orient_buh_hit(conf, genes, hit, order):
    """Return (gene1, gene2, score) for a blast hit, or None if it is cut.

    When order is true the query is taken as gene1 and the subject as
    gene2; otherwise the roles are swapped.  A hit is cut when its bits per
    aligned site are below conf["bitspersite"], when the larger of the two
    alignment coverages is below conf["coverage"], or when its e-value is
    above conf["signif"].
    """
    if order:
        gene1 = blast.query(hit)
        gene2 = blast.subject(hit)
        alnlen1 = blast.queryLength(hit)
        alnlen2 = blast.subjectLength(hit)
    else:
        gene2 = blast.query(hit)
        gene1 = blast.subject(hit)
        alnlen2 = blast.queryLength(hit)
        alnlen1 = blast.subjectLength(hit)
    score = blast.bitscore(hit)

    len1 = genes[gene1]["length"]
    len2 = genes[gene2]["length"]
    coverage = max(alnlen1 / float(len1), alnlen2 / float(len2))

    # discard a hit that does not pass basic cutoffs
    if (score / float(blast.alignLength(hit)) < conf["bitspersite"] or
            coverage < conf["coverage"] or
            blast.evalue(hit) > conf["signif"]):
        return None
    return gene1, gene2, score


def mergeBuh(conf, genes, parts1, parts2, blastfiles):
    """Merge by Best Unidirectional Hits

    NOTE: this function is deliberately disabled by the assert below.

    conf          -- mapping with the cutoffs "bitspersite", "coverage",
                     "signif" and the factor "relcutoff"
    genes         -- mapping gene -> record with a "length" entry
    parts1/parts2 -- lists of gene lists; genes that appear in a hit but in
                     no part are appended in place as singleton parts
    blastfiles    -- iterable of (filename, order) pairs; when order is
                     true the query belongs to parts1, otherwise to parts2

    The first pass records the best score seen for every part.  The second
    pass unions two parts whenever a hit between them scores at least
    conf["relcutoff"] times the best score of either part.  Returns a list
    of gene lists, one per resulting cluster.
    """

    # don't use this code without double checking it
    assert False, "mergeBuh is disabled: double check this code before use"

    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    best = util.Dict(dim=1, default=(0, None))

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            oriented = _orient_buh_hit(conf, genes, hit, order)
            if oriented is None:
                continue
            gene1, gene2, score = oriented

            # create a key for a partition: (side, index)
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            if score > best[part1][0]:
                best[part1] = (score, part2)
            if score > best[part2][0]:
                best[part2] = (score, part1)
        util.toc()

    util.toc()

    util.tic("determine clusters")
    # This dict must not be called `sets`: that name is needed for the
    # module that provides UnionFind (assumed to be imported as `sets` at
    # file level -- TODO confirm).
    clusters = {}
    for part in best:
        clusters[part] = sets.UnionFind([part])

    for blastfile, order in blastfiles:
        util.tic("read hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            oriented = _orient_buh_hit(conf, genes, hit, order)
            if oriented is None:
                continue
            gene1, gene2, score = oriented

            # every gene passing the cutoffs was registered in the first pass
            part1 = (0, lookup1[gene1])
            part2 = (1, lookup2[gene2])

            if score >= best[part1][0] * conf["relcutoff"]:
                clusters[part1].union(clusters[part2])
            if score >= best[part2][0] * conf["relcutoff"]:
                clusters[part2].union(clusters[part1])
        util.toc()

    roots = util.unique([x.root() for x in clusters.values()])

    # expand every cluster of (side, index) keys back into its genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for i, row in cluster.members():
            merged.extend(joining[i][row])
        parts.append(merged)
    util.toc()

    return parts
Esempio n. 9
0
def mergeAvg(conf, genes, parts1, parts2, blastfiles, outblastfiles):
    """Merge two partitions by best average hit score

    Each part is unioned with the part it has the highest average bitscore
    to, provided that average also exceeds the average outgroup bitscore
    of both parts.

    conf          -- mapping with the cutoffs "bitspersite",
                     "coveragesmall", "coveragebig" and "signif"; if it has
                     a non-empty "accept" entry, hits involving a gene not
                     in it are skipped
    genes         -- mapping gene -> record with a "length" entry
    parts1/parts2 -- lists of gene lists; genes that appear in a hit but in
                     no part are appended in place as singleton parts
    blastfiles    -- iterable of (filename, order) pairs; when order is
                     true the query belongs to parts1, otherwise to parts2
    outblastfiles -- iterable of (filename, order) pairs of outgroup hits;
                     when order is true the query is the ingroup gene

    Returns a list of gene lists, one per merged cluster.
    """
    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # value is [sum, total]
    hits = util.Dict(dim=2, default=[0, 0])

    if "accept" in conf:
        accept = conf["accept"]
    else:
        accept = False

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            cover1 = alnlen1 / float(len1)
            cover2 = alnlen2 / float(len2)
            coveragesmall = min(cover1, cover2)
            coveragebig = max(cover1, cover2)

            # discard a hit that does not pass basic cutoffs
            if (score / float(blast.alignLength(hit)) < conf["bitspersite"] or
                    coveragesmall < conf["coveragesmall"] or
                    coveragebig < conf["coveragebig"] or
                    blast.evalue(hit) > conf["signif"]):
                continue

            if accept and (gene1 not in accept or gene2 not in accept):
                continue

            # create a key for a partition: (side, index)
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # both directions share one [sum, total] accumulator
            val = hits[part1][part2]
            val[0] += score
            val[1] += 1
            hits[part2][part1] = val

        util.toc()
    util.toc()

    util.tic("read outgroup hits")
    outbest = util.Dict(default=[0, 0])
    for blastfile, order in outblastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # only the ingroup gene of an outgroup hit is needed
            if order:
                genein = blast.query(hit)
            else:
                genein = blast.subject(hit)
            score = blast.bitscore(hit)

            # create a key for a partition: (side, index)
            if genein in lookup1:
                partin = (0, lookup1[genein])
            elif genein in lookup2:
                # test genein here, not the stale gene1 of the loop above
                partin = (1, lookup2[genein])
            else:
                continue

            val = outbest[partin]
            val[0] += score
            val[1] += 1

        util.toc()
    util.toc()

    assert len(parts1) == len(unionPart(parts1))
    assert len(parts2) == len(unionPart(parts2))

    util.tic("determine clusters")
    # This dict must not be called `sets`: that name is needed for the
    # module that provides UnionFind (assumed to be imported as `sets` at
    # file level -- TODO confirm).
    clusters = {}
    for i in xrange(len(parts1)):
        clusters[(0, i)] = sets.UnionFind([(0, i)])
    for i in xrange(len(parts2)):
        clusters[(1, i)] = sets.UnionFind([(1, i)])

    # merge top avg hits
    for part1 in hits:
        o1 = outbest[part1]
        outavg1 = float(o1[0]) / max(o1[1], 1)

        top = 0
        toppart = None

        for part2, (tot, num) in hits[part1].iteritems():
            avg = float(tot) / num
            o2 = outbest[part2]
            outavg2 = float(o2[0]) / max(o2[1], 1)

            # the pair must beat the outgroup average of both parts
            if avg > outavg1 and avg > outavg2 and avg > top:
                top = avg
                toppart = part2

        if toppart:
            clusters[part1].union(clusters[toppart])

    roots = util.unique([x.root() for x in clusters.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for i, row in cluster:
            merged.extend(joining[i][row])
        parts.append(merged)
    util.toc()

    assert len(parts) == len(unionPart(parts))

    return parts
Esempio n. 10
0
# make degenerate counts
#
# For every amino acid, count the distinct bases seen at each codon
# position among the codons listed for it in REV_CODON_TABLE.
#
# example (if "R" were listed with exactly these four codons):
#
# CGT => "R"
# CGC => "R"
# CGA => "R"
# CGG => "R"
#
# AA_DEGEN["R"] = [1, 1, 4]
# CODON_DEGEN["CGT"] = [1, 1, 4]
#
CODON_DEGEN = {}
AA_DEGEN = {}
for aa, lst in REV_CODON_TABLE.items():
    # Build a real list: the same object is stored under several keys, and
    # on Python 3 map() would hand out one shared, single-use iterator.
    folds = [len(util.unique(x)) for x in zip(*lst)]
    for codon in lst:
        AA_DEGEN[aa] = folds
        CODON_DEGEN[codon] = folds

# substitution types, numbered consecutively from 0:
#   SUB_NONE  none
#   SUB_TSIT  tranSition
#   SUB_TVER  transVersion
#   SUB_INS   insert
#   SUB_DEL   del
SUB_NONE, SUB_TSIT, SUB_TVER, SUB_INS, SUB_DEL = range(5)
SUBSTITUTION_TYPES = {
    "AA": SUB_NONE,
    "AC": SUB_TVER,
    "AG": SUB_TSIT,
    "AT": SUB_TVER,
 def keys(self):
     """Return self.db's keys passed through util.unique, then util.sort."""
     distinct = util.unique(self.db.keys())
     return util.sort(distinct)
Esempio n. 12
0
 def keys(self):
     """Return self.db's keys passed through util.unique, then util.sort."""
     distinct = util.unique(self.db.keys())
     return util.sort(distinct)
def _orient_buh_hit(conf, genes, hit, order):
    """Return (gene1, gene2, score) for a blast hit, or None if it is cut.

    When order is true the query is taken as gene1 and the subject as
    gene2; otherwise the roles are swapped.  A hit is cut when its bits per
    aligned site are below conf["bitspersite"], when the larger of the two
    alignment coverages is below conf["coverage"], or when its e-value is
    above conf["signif"].
    """
    if order:
        gene1 = blast.query(hit)
        gene2 = blast.subject(hit)
        alnlen1 = blast.queryLength(hit)
        alnlen2 = blast.subjectLength(hit)
    else:
        gene2 = blast.query(hit)
        gene1 = blast.subject(hit)
        alnlen2 = blast.queryLength(hit)
        alnlen1 = blast.subjectLength(hit)
    score = blast.bitscore(hit)

    len1 = genes[gene1]["length"]
    len2 = genes[gene2]["length"]
    coverage = max(alnlen1 / float(len1), alnlen2 / float(len2))

    # discard a hit that does not pass basic cutoffs
    if (score / float(blast.alignLength(hit)) < conf["bitspersite"] or
            coverage < conf["coverage"] or
            blast.evalue(hit) > conf["signif"]):
        return None
    return gene1, gene2, score


def mergeBuh(conf, genes, parts1, parts2, blastfiles):
    """Merge by Best Unidirectional Hits

    NOTE: this function is deliberately disabled by the assert below.

    conf          -- mapping with the cutoffs "bitspersite", "coverage",
                     "signif" and the factor "relcutoff"
    genes         -- mapping gene -> record with a "length" entry
    parts1/parts2 -- lists of gene lists; genes that appear in a hit but in
                     no part are appended in place as singleton parts
    blastfiles    -- iterable of (filename, order) pairs; when order is
                     true the query belongs to parts1, otherwise to parts2

    The first pass records the best score seen for every part.  The second
    pass unions two parts whenever a hit between them scores at least
    conf["relcutoff"] times the best score of either part.  Returns a list
    of gene lists, one per resulting cluster.
    """

    # don't use this code without double checking it
    assert False, "mergeBuh is disabled: double check this code before use"

    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    best = util.Dict(dim=1, default=(0, None))

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            oriented = _orient_buh_hit(conf, genes, hit, order)
            if oriented is None:
                continue
            gene1, gene2, score = oriented

            # create a key for a partition: (side, index)
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            if score > best[part1][0]:
                best[part1] = (score, part2)
            if score > best[part2][0]:
                best[part2] = (score, part1)
        util.toc()

    util.toc()

    util.tic("determine clusters")
    # This dict must not be called `sets`: that name is needed for the
    # module that provides UnionFind (assumed to be imported as `sets` at
    # file level -- TODO confirm).
    clusters = {}
    for part in best:
        clusters[part] = sets.UnionFind([part])

    for blastfile, order in blastfiles:
        util.tic("read hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            oriented = _orient_buh_hit(conf, genes, hit, order)
            if oriented is None:
                continue
            gene1, gene2, score = oriented

            # every gene passing the cutoffs was registered in the first pass
            part1 = (0, lookup1[gene1])
            part2 = (1, lookup2[gene2])

            if score >= best[part1][0] * conf["relcutoff"]:
                clusters[part1].union(clusters[part2])
            if score >= best[part2][0] * conf["relcutoff"]:
                clusters[part2].union(clusters[part1])
        util.toc()

    roots = util.unique([x.root() for x in clusters.values()])

    # expand every cluster of (side, index) keys back into its genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for i, row in cluster.members():
            merged.extend(joining[i][row])
        parts.append(merged)
    util.toc()

    return parts
def mergeAvg(conf, genes, parts1, parts2, blastfiles, outblastfiles):
    """Merge two partitions by best average hit score

    Each part is unioned with the part it has the highest average bitscore
    to, provided that average also exceeds the average outgroup bitscore
    of both parts.

    conf          -- mapping with the cutoffs "bitspersite",
                     "coveragesmall", "coveragebig" and "signif"; if it has
                     a non-empty "accept" entry, hits involving a gene not
                     in it are skipped
    genes         -- mapping gene -> record with a "length" entry
    parts1/parts2 -- lists of gene lists; genes that appear in a hit but in
                     no part are appended in place as singleton parts
    blastfiles    -- iterable of (filename, order) pairs; when order is
                     true the query belongs to parts1, otherwise to parts2
    outblastfiles -- iterable of (filename, order) pairs of outgroup hits;
                     when order is true the query is the ingroup gene

    Returns a list of gene lists, one per merged cluster.
    """
    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # value is [sum, total]
    hits = util.Dict(dim=2, default=[0, 0])

    if "accept" in conf:
        accept = conf["accept"]
    else:
        accept = False

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            cover1 = alnlen1 / float(len1)
            cover2 = alnlen2 / float(len2)
            coveragesmall = min(cover1, cover2)
            coveragebig = max(cover1, cover2)

            # discard a hit that does not pass basic cutoffs
            if (score / float(blast.alignLength(hit)) < conf["bitspersite"] or
                    coveragesmall < conf["coveragesmall"] or
                    coveragebig < conf["coveragebig"] or
                    blast.evalue(hit) > conf["signif"]):
                continue

            if accept and (gene1 not in accept or gene2 not in accept):
                continue

            # create a key for a partition: (side, index)
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # both directions share one [sum, total] accumulator
            val = hits[part1][part2]
            val[0] += score
            val[1] += 1
            hits[part2][part1] = val

        util.toc()
    util.toc()

    util.tic("read outgroup hits")
    outbest = util.Dict(default=[0, 0])
    for blastfile, order in outblastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # only the ingroup gene of an outgroup hit is needed
            if order:
                genein = blast.query(hit)
            else:
                genein = blast.subject(hit)
            score = blast.bitscore(hit)

            # create a key for a partition: (side, index)
            if genein in lookup1:
                partin = (0, lookup1[genein])
            elif genein in lookup2:
                # test genein here, not the stale gene1 of the loop above
                partin = (1, lookup2[genein])
            else:
                continue

            val = outbest[partin]
            val[0] += score
            val[1] += 1

        util.toc()
    util.toc()

    assert len(parts1) == len(unionPart(parts1))
    assert len(parts2) == len(unionPart(parts2))

    util.tic("determine clusters")
    # This dict must not be called `sets`: that name is needed for the
    # module that provides UnionFind (assumed to be imported as `sets` at
    # file level -- TODO confirm).
    clusters = {}
    for i in xrange(len(parts1)):
        clusters[(0, i)] = sets.UnionFind([(0, i)])
    for i in xrange(len(parts2)):
        clusters[(1, i)] = sets.UnionFind([(1, i)])

    # merge top avg hits
    for part1 in hits:
        o1 = outbest[part1]
        outavg1 = float(o1[0]) / max(o1[1], 1)

        top = 0
        toppart = None

        for part2, (tot, num) in hits[part1].iteritems():
            avg = float(tot) / num
            o2 = outbest[part2]
            outavg2 = float(o2[0]) / max(o2[1], 1)

            # the pair must beat the outgroup average of both parts
            if avg > outavg1 and avg > outavg2 and avg > top:
                top = avg
                toppart = part2

        if toppart:
            clusters[part1].union(clusters[toppart])

    roots = util.unique([x.root() for x in clusters.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for i, row in cluster:
            merged.extend(joining[i][row])
        parts.append(merged)
    util.toc()

    assert len(parts) == len(unionPart(parts))

    return parts
Esempio n. 15
0
# make degenerate counts
#
# For every amino acid, count the distinct bases seen at each codon
# position among the codons listed for it in REV_CODON_TABLE.  The same
# list of counts is stored under the amino acid and under each of its
# codons.
#
# example (if "R" were listed with exactly these four codons):
#
# CGT => "R"
# CGC => "R"
# CGA => "R"
# CGG => "R"
#
# AA_DEGEN["R"] = [1, 1, 4]
# CODON_DEGEN["CGT"] = [1, 1, 4]
#
CODON_DEGEN = {}
AA_DEGEN = {}
for aa, lst in REV_CODON_TABLE.items():
    # zip(*lst) regroups the codons by position
    folds = [len(util.unique(x)) for x in zip(*lst)]
    for codon in lst:
        AA_DEGEN[aa] = CODON_DEGEN[codon] = folds

# substitution types, numbered consecutively from 0:
#   SUB_NONE  none
#   SUB_TSIT  tranSition
#   SUB_TVER  transVersion
#   SUB_INS   insert
#   SUB_DEL   del
SUB_NONE, SUB_TSIT, SUB_TVER, SUB_INS, SUB_DEL = range(5)
SUBSTITUTION_TYPES = {
    "AA": SUB_NONE,
    "AC": SUB_TVER,
    "AG": SUB_TSIT,
    "AT": SUB_TVER,