def draw_events(canvas, tree, coords, events, losses,
               lossColor=(0, 0, 1),
               dupColor=(1, 0, 0),
               size=4):
    """Draw duplication and loss markers for a tree onto a canvas.

    canvas    -- drawing target; its rect() and line() methods are called
    tree      -- iterable of tree nodes
    coords    -- mapping of node -> (x, y)
    events    -- mapping of node -> event label; nodes labeled "dup" get
                 a filled square
    losses    -- iterable of (node, schild) pairs, one entry per loss
    lossColor -- color passed to canvas.line() for loss ticks
    dupColor  -- fill color passed to canvas.rect() for duplications
    size      -- side length of a duplication square; each loss tick
                 spans `size` above and below the branch's y coordinate
    """
    half = size / 2.0

    # draw duplications: a square centered on every "dup" node
    for node in tree:
        x, y = coords[node]
        if events[node] == "dup":
            canvas.rect(x - half, y - half,
                        size, size,
                        fillColor=dupColor, strokeColor=(0, 0, 0, 0))

    # draw losses: first count how many losses fall on each branch
    losses_per_branch = util.hist_dict([node for node, schild in losses])

    for node, nlosses in losses_per_branch.items():
        # a node without a parent has no incoming branch to draw on
        if node.parent is None:
            continue

        x1 = coords[node.parent][0]
        x2, y1 = coords[node]

        # spread the ticks evenly between the parent's x and the node's x
        step = (x2 - x1) / float(nlosses + 1)

        # stop half a step short of x2 so rounding cannot add an extra tick
        for x in util.frange(x1 + step, x2 - (step / 2.0), step):
            canvas.line(x, y1 - size, x, y1 + size, color=lossColor)
Exemple #2
0
def find_orthologs(gtree, stree, recon, counts=True):
    """Find all ortholog pairs within a gene tree

    Every pair of leaves that sit under different children of a node
    labeled "spec" is reported. Each pair is ordered by gene name.

    Returns a list of tuples: (name1, name2) when `counts` is false,
    otherwise (name1, name2, count1, count2), where each count is the
    histogram value of the gene's reconciled species (recon[gene]) among
    the leaves of the child subtree that gene came from.
    """
    orths = []

    for node, event in label_events(gtree, recon).items():
        if event != "spec":
            continue

        # one list of leaves per child subtree, plus a species histogram
        # for each of those lists
        clades = [child.leaves() for child in node.children]
        sp_counts = [util.hist_dict(util.mget(recon, clade))
                     for clade in clades]

        nclades = len(clades)
        for i in range(nclades):
            for j in range(i + 1, nclades):
                for gene1 in clades[i]:
                    for gene2 in clades[j]:
                        # keep each gene paired with the index of its clade
                        # while putting the smaller name first
                        ordered = [(gene1, i), (gene2, j)]
                        if gene1.name > gene2.name:
                            ordered.reverse()
                        (g1, a), (g2, b) = ordered

                        if counts:
                            orths.append((g1.name, g2.name,
                                          sp_counts[a][recon[g1]],
                                          sp_counts[b][recon[g2]]))
                        else:
                            orths.append((g1.name, g2.name))

    return orths
Exemple #3
0
def draw_events(canvas,
                tree,
                coords,
                events,
                losses,
                lossColor=(0, 0, 1),
                dupColor=(1, 0, 0),
                size=4):
    """Draw duplication and loss markers for a tree onto a canvas.

    canvas    -- drawing target; its rect() and line() methods are called
    tree      -- iterable of tree nodes
    coords    -- mapping of node -> (x, y)
    events    -- mapping of node -> event label; nodes labeled "dup" get
                 a filled square
    losses    -- iterable of (node, schild) pairs, one entry per loss
    lossColor -- color passed to canvas.line() for loss ticks
    dupColor  -- fill color passed to canvas.rect() for duplications
    size      -- side length of a duplication square; each loss tick
                 spans `size` above and below the branch's y coordinate
    """
    half = size / 2.0

    # draw duplications: a square centered on every "dup" node
    for node in tree:
        x, y = coords[node]
        if events[node] == "dup":
            canvas.rect(x - half,
                        y - half,
                        size,
                        size,
                        fillColor=dupColor,
                        strokeColor=(0, 0, 0, 0))

    # draw losses: first count how many losses fall on each branch
    losses_per_branch = util.hist_dict([node for node, schild in losses])

    for node, nlosses in losses_per_branch.items():
        # a node without a parent has no incoming branch to draw on
        if node.parent is None:
            continue

        x1 = coords[node.parent][0]
        x2, y1 = coords[node]

        # spread the ticks evenly between the parent's x and the node's x
        step = (x2 - x1) / float(nlosses + 1)

        # stop half a step short of x2 so rounding cannot add an extra tick
        for x in util.frange(x1 + step, x2 - (step / 2.0), step):
            canvas.line(x, y1 - size, x, y1 + size, color=lossColor)
Exemple #4
0
    def draw_events(self):
        """Build the drawing group for duplication and loss markers.

        Reads self.tree, self.events, self.losses, self.dup_color,
        self.loss_color and self.xscale, plus the x/y attributes of the
        tree nodes.

        Returns a group containing two sub-groups: the duplication boxes
        and the loss ticks, each preceded by its color.
        """

        # draw duplications: one zoom-clamped unit box per "dup" node
        dups = [color(*self.dup_color)]
        for node in self.tree:
            if self.events[node] == "dup":
                dups.append(
                    zoom_clamp(
                        shapes.box(node.x - .5, node.y - .5,
                                   node.x + .5, node.y + .5),
                        link=True, link_type="smaller",
                        maxx=8, minx=1,
                        maxy=8, miny=1,
                        origin=(node.x, node.y),
                        prezoom=(self.xscale, 1.0)))

        # draw losses: first count how many losses fall on each branch
        losses_per_branch = util.hist_dict(
            [node for node, schild in self.losses])

        losses = [color(*self.loss_color)]
        for node, nlosses in losses_per_branch.items():
            # a node without a parent has no incoming branch to draw on
            if node.parent is None:
                continue

            x1 = node.parent.x
            x2 = node.x

            # spread the ticks evenly between the parent's x and the node's x
            step = (x2 - x1) / float(nlosses + 1)

            # stop half a step short of x2 so rounding cannot add a tick
            for x in util.frange(x1 + step, x2 - (step / 2.0), step):
                losses.append(lines(x, node.y - .2, x, node.y + .2))

        return group(group(*dups), group(*losses))
Exemple #5
0
def histtab(items, headers=None, item="item", count="count", percent="percent",
            cols=None):
    """Make a histogram table.

    items   -- values to count; when `cols` is given, a table-like object
               whose as_tuples(cols=cols) yields the tuples to count
    headers -- explicit Table headers; built from the other arguments
               when None
    item    -- column name for the counted value (used when cols is None)
    count   -- column name for the number of occurrences
    percent -- column name for the fraction of the total, or None to
               leave that column out entirely
    cols    -- column names to histogram jointly

    Returns a Table sorted by the count column, largest first.
    """
    # Only advertise a percent column when one is actually filled in;
    # otherwise the default headers would contain a None entry.
    stat_headers = [count] if percent is None else [count, percent]

    if cols is not None:
        # items is a Table.
        items = items.as_tuples(cols=cols)
        if headers is None:
            # list() so that a tuple of column names is accepted as well
            headers = list(cols) + stat_headers

    if headers is None:
        headers = [item] + stat_headers

    h = util.hist_dict(items)
    tab = Table(headers=headers)
    tot = float(sum(h.values()))

    for key, val in h.items():
        if cols is not None:
            # key is a tuple aligned with cols
            row = dict(zip(cols, key))
        else:
            row = {item: key}
        row[count] = val
        if percent is not None:
            row[percent] = val / tot
        tab.append(row)

    tab.sort(col=count, reverse=True)

    return tab
Exemple #6
0
def mode(vals):
    """Computes the mode of a list of numbers

    Returns the value with the highest count in util.hist_dict(vals),
    or None when no value has a count above zero (e.g. empty input).
    """
    best_key, best_count = None, 0
    for candidate, freq in util.hist_dict(vals).iteritems():
        # only a strictly larger count replaces the current winner
        if freq <= best_count:
            continue
        best_key, best_count = candidate, freq
    return best_key
Exemple #7
0
def mode(vals):
    """Computes the mode of a list of numbers

    Returns the value with the highest count in util.hist_dict(vals),
    or None when no value has a count above zero (e.g. empty input).
    """
    best_key, best_count = None, 0
    for candidate, freq in util.hist_dict(vals).iteritems():
        # only a strictly larger count replaces the current winner
        if freq <= best_count:
            continue
        best_key, best_count = candidate, freq
    return best_key
Exemple #8
0
def make_pep_colors(prop2color=prop2color):
    """Build a mapping from peptide character to color.

    Each character in the fixed alphabet gets prop2color(prop, t), where
    prop is its entry in seqlib.AA_PROPERTY and t grows from 0 towards .5
    with the character's position among those sharing that property.
    The mapping is a util.Dict created with default=color(.5, .5, .5).
    """
    AA = 'ARNDCEQGHILKMFPSTWYVU*'

    pep_colors = util.Dict(default=color(.5, .5, .5))

    # how many characters of the alphabet share each property
    group_sizes = util.hist_dict(util.mget(seqlib.AA_PROPERTY, AA))

    # how many characters of each property have been colored so far
    seen = util.Dict(default=0)
    for residue in AA:
        prop = seqlib.AA_PROPERTY[residue]
        rank = seen[prop]
        tint = rank / float(group_sizes[prop])
        pep_colors[residue] = prop2color(prop, tint * .5)
        seen[prop] = rank + 1

    return pep_colors
def make_pep_colors(prop2color=prop2color):
    """Build a mapping from peptide character to color.

    Each character in the fixed alphabet gets prop2color(prop, t), where
    prop is its entry in seqlib.AA_PROPERTY and t grows from 0 towards .5
    with the character's position among those sharing that property.
    The mapping is a util.Dict created with default=color(.5, .5, .5).
    """
    AA = 'ARNDCEQGHILKMFPSTWYVU*'

    pep_colors = util.Dict(default=color(.5, .5, .5))

    # how many characters of the alphabet share each property
    group_sizes = util.hist_dict(util.mget(seqlib.AA_PROPERTY, AA))

    # how many characters of each property have been colored so far
    seen = util.Dict(default=0)
    for residue in AA:
        prop = seqlib.AA_PROPERTY[residue]
        rank = seen[prop]
        tint = rank / float(group_sizes[prop])
        pep_colors[residue] = prop2color(prop, tint * .5)
        seen[prop] = rank + 1

    return pep_colors
Exemple #10
0
    def test_sample_coal_recomb(self):
        """Compare sampled event frequencies with fixed expected values.

        Draws nsamples results from arglib.sample_coal_recomb, histograms
        element 0 of each result, and asserts that every observed
        frequency matches the expected table to two decimal places.
        """
        rho = 1.5e-8  # recomb/site/gen
        l = 2000  # length of locus
        k = 10  # number of lineages
        n = 2 * 10000  # effective popsize
        r = rho * l  # recomb/locus/gen
        nsamples = 10000

        samples = []
        for _ in range(nsamples):
            samples.append(arglib.sample_coal_recomb(k, n, r))

        # turn raw counts into observed frequencies
        tallies = util.hist_dict(util.cget(samples, 0))
        events = {}
        for event, count in tallies.items():
            events[event] = count / float(nsamples)

        expected = {'coal': 0.88146, 'recomb': 0.11854}

        for key in events:
            self.assertAlmostEqual(events[key], expected[key], places=2)
Exemple #11
0
    def test_sample_coal_recomb(self):
        """Compare sampled event frequencies with fixed expected values.

        Draws nsamples results from arglib.sample_coal_recomb, histograms
        element 0 of each result, and asserts that every observed
        frequency matches the expected table to two decimal places.
        """
        rho = 1.5e-8  # recomb/site/gen
        l = 2000  # length of locus
        k = 10  # number of lineages
        n = 2 * 10000  # effective popsize
        r = rho * l  # recomb/locus/gen
        nsamples = 10000

        samples = []
        for _ in range(nsamples):
            samples.append(arglib.sample_coal_recomb(k, n, r))

        # turn raw counts into observed frequencies
        tallies = util.hist_dict(util.cget(samples, 0))
        events = {}
        for event, count in tallies.items():
            events[event] = count / float(nsamples)

        expected = {'coal': 0.88146, 'recomb': 0.11854}

        for key in events:
            self.assertAlmostEqual(events[key], expected[key], places=2)
def find_four_fold(aln):
    """Returns index of all columns in alignment that are completely
       fourfold degenerate

       Assumes that columns are already filtered for aligned codons

       A codon is considered only when its translated column holds a
       single residue (ignoring '-' and 'X'); within such a codon, every
       position whose AA_DEGEN entry equals 4 is reported.
    """

    # create peptide alignment
    pep_aln = mapalign(aln, valfunc=translate)

    # per peptide column: is it conserved, and which residue represents it
    conserved = []
    residues = []
    for pos in xrange(pep_aln.alignlen()):
        # histogram of the column, without gaps '-' and non-translated 'X'
        counts = util.hist_dict([seq[pos] for seq in pep_aln.itervalues()])
        for ignored in ("-", "X"):
            if ignored in counts:
                del counts[ignored]

        # column is conserved if only one AA appears
        is_conserved = (len(counts) == 1)
        conserved.append(is_conserved)
        residues.append(counts.keys()[0] if is_conserved else "X")

    # find four-fold sites in conserved peptides
    ind = []
    for codon_start in range(0, len(aln.values()[0]), 3):
        codon = codon_start // 3

        # process only those columns that are conserved at the peptide level
        if not conserved[codon]:
            continue

        degen = AA_DEGEN[residues[codon]]
        ind.extend(codon_start + offset
                   for offset in range(3)
                   if degen[offset] == 4)
    return ind
Exemple #13
0
def find_four_fold(aln):
    """Returns index of all columns in alignment that are completely
       fourfold degenerate

       Assumes that columns are already filtered for aligned codons

       A codon is considered only when its translated column holds a
       single residue (ignoring '-' and 'X'); within such a codon, every
       position whose AA_DEGEN entry equals 4 is reported.
    """

    # create peptide alignment
    pep_aln = mapalign(aln, valfunc=translate)

    # per peptide column: is it conserved, and which residue represents it
    conserved = []
    residues = []
    for pos in xrange(pep_aln.alignlen()):
        # histogram of the column, without gaps '-' and non-translated 'X'
        counts = util.hist_dict([seq[pos] for seq in pep_aln.itervalues()])
        for ignored in ("-", "X"):
            if ignored in counts:
                del counts[ignored]

        # column is conserved if only one AA appears
        is_conserved = (len(counts) == 1)
        conserved.append(is_conserved)
        residues.append(counts.keys()[0] if is_conserved else "X")

    # find four-fold sites in conserved peptides
    ind = []
    for codon_start in range(0, len(aln.values()[0]), 3):
        codon = codon_start // 3

        # process only those columns that are conserved at the peptide level
        if not conserved[codon]:
            continue

        degen = AA_DEGEN[residues[codon]]
        ind.extend(codon_start + offset
                   for offset in range(3)
                   if degen[offset] == 4)
    return ind
Exemple #14
0
def debug_test3():
    """Debugging driver for the locus-tree / coalescent-tree simulation.

    Reads the species tree 'examples/nbin.stree', simulates a locus tree
    with sim_DLILS_gene_tree, converts it with locus_to_logged_tree, and
    samples a coalescent tree with dlcoal.sample_locus_coal_tree.  Trees
    are printed along the way; if treelib.assert_tree rejects the
    coalescent tree, the node names that occur more than once in its
    postorder traversal are printed.

    NOTE(review): the parameter meanings (rates, frequencies, times) are
    not documented in this function -- confirm against the callee.
    """
    stree = treelib.read_tree('examples/nbin.stree') # run from ../ of this directory
    for node in stree:
        node.dist *= 1e7 # gen per myr
    popsize = 2e7
    freq = 1e0
    dr = .0000012 / 1e7 #.0012/1e7
    lr = .0000011 / 1e7 #.0006/1e7
    freqdup = freqloss = .05
    forcetime = 1e7
    
    # dump the rescaled species tree
    for node in stree:
        print node.name, node.dist, len(node.children)
    print
    
    locus_tree, locus_extras = sim_DLILS_gene_tree(stree, popsize, freq, \
                                                        dr, lr, \
                                                        freqdup, freqloss, \
                                                        forcetime)
    
    # dump the simulated locus tree
    for node in locus_tree:
        print node.name, node.dist, len(node.children)
    print
    
    logged_locus_tree, logged_extras = locus_to_logged_tree(locus_tree, popsize)
    # logged_extras is indexed positionally: [0] and [1] are passed on as
    # `daughters` and `n`, [2] is used to build leaf names
    daughters = logged_extras[0]
    pops = logged_extras[1]
    
    coal_tree, coal_recon = dlcoal.sample_locus_coal_tree(logged_locus_tree,
                                    n=pops, daughters=daughters,
                                    namefunc=lambda x: logged_extras[2][x] + '_' + str(x))
    
    #begin debug
    print coal_tree.leaf_names()
    try:
#        print set(coal_tree) - set(coal_tree.postorder())
        treelib.assert_tree(coal_tree)
    except AssertionError:
        print 'assertion error thrown on coal_tree being a proper tree'
        from rasmus import util
        # count how often each node name occurs in the postorder traversal
        hd= util.hist_dict(x.name for x in coal_tree.postorder())
        # print only the names seen more than once
        for key in hd.keys():
            print key if hd[key]>1 else '',
        print
        # difference between registered nodes and nodes reached by traversal
        print len(coal_tree.nodes) - len(list(coal_tree.postorder()))
Exemple #15
0
def find_xenologs(gtree,
                  stree,
                  recon,
                  events,
                  trans,
                  counts=True,
                  species_branch=False):
    """Find all xenolog pairs within a gene tree

    NOTE: THIS HAS NOT BEEN TESTED!!!

    gtree, stree   -- accepted for interface symmetry; not read here
    recon          -- mapping of gene-tree node -> species
    events         -- mapping of node -> event label; only nodes labeled
                      "trans" are examined
    trans          -- mapping of "trans" node -> one of its two children
    counts         -- when true, append for each gene the histogram count
                      of its species among the leaves of its child subtree
    species_branch -- when true, append recon[node] of the "trans" node

    Returns a list of tuples, one per pair of leaves taken from the two
    children of a "trans" node: (name1, name2[, count1, count2][, species]).
    The leaf under the child named by trans[node] is always listed second.
    """
    xenos = []

    for node, event in events.items():
        if event != "trans":
            continue

        assert len(node.children) == 2

        # put the child recorded in trans[node] in second position
        if trans[node] == node.children[0]:
            children = (node.children[1], node.children[0])
        else:
            children = node.children
        leavesmat = [x.leaves() for x in children]

        # species histograms are only needed when counts are reported
        sp_counts = None
        if counts:
            sp_counts = [
                util.hist_dict(util.mget(recon, row)) for row in leavesmat
            ]

        for i in range(len(leavesmat)):
            for j in range(i + 1, len(leavesmat)):
                for gene1 in leavesmat[i]:
                    for gene2 in leavesmat[j]:
                        xeno = [gene1.name, gene2.name]
                        if counts:
                            xeno.extend([
                                sp_counts[i][recon[gene1]],
                                sp_counts[j][recon[gene2]]
                            ])
                        if species_branch:
                            xeno.append(recon[node])
                        # append the record just built, not the result list
                        xenos.append(tuple(xeno))

    return xenos
Exemple #16
0
def calc_conservation(aln):
    """Returns a list of percent matching in each column of an alignment

    For every column the value is the count of the most common non-gap
    character divided by the number of sequences in the alignment.  A
    column made only of gaps ('-') scores 0.0.  An alignment without
    sequences yields an empty list.
    """
    nseqs = len(aln)
    if nseqs == 0:
        # no sequences means no columns; avoid indexing into an empty list
        return []

    seqs = aln.values()
    length = len(seqs[0])
    percids = []

    # find identity positions
    for i in xrange(length):
        chars = util.hist_dict(util.cget(seqs, i))
        if "-" in chars:
            del chars["-"]

        if len(chars) == 0:
            percids.append(0.0)
        else:
            percids.append(max(chars.values()) / float(nseqs))
    return percids
def calc_conservation(aln):
    """Returns a list of percent matching in each column of an alignment

    For every column the value is the count of the most common non-gap
    character divided by the number of sequences in the alignment.  A
    column made only of gaps ('-') scores 0.0.  An alignment without
    sequences yields an empty list.
    """
    nseqs = len(aln)
    if nseqs == 0:
        # no sequences means no columns; avoid indexing into an empty list
        return []

    seqs = aln.values()
    length = len(seqs[0])
    percids = []

    # find identity positions
    for i in xrange(length):
        chars = util.hist_dict(util.cget(seqs, i))
        if "-" in chars:
            del chars["-"]

        if len(chars) == 0:
            percids.append(0.0)
        else:
            percids.append(max(chars.values()) / float(nseqs))
    return percids
Exemple #18
0
    def getFamDescription(self, descriptions):
        """Summarize a collection of description strings.

        Each description is split on "; ", uninformative boilerplate
        entries are dropped, and the remaining entries are counted.
        Returns "entry[count]" items joined by "; ", highest count first.
        """

        # TODO: remove this hardcoding
        rmdesc = set([
            "", "Predicted ORF from Assembly 19",
            "Predicted ORF in Assemblies 19 and 20",
            "ORF Predicted by Annotation Working Group",
            "possibly spurious ORF (Annotation Working Group prediction)"
        ])

        # flatten all descriptions into their parts, skipping boilerplate
        descs = [part
                 for d in descriptions
                 for part in d.split("; ")
                 if part not in rmdesc]

        ranked = sorted(util.hist_dict(descs).items(),
                        key=lambda pair: pair[1], reverse=True)

        return "; ".join("%s[%d]" % pair for pair in ranked)
    def getFamDescription(self, descriptions):
        """Summarize a collection of description strings.

        Each description is split on "; ", uninformative boilerplate
        entries are dropped, and the remaining entries are counted.
        Returns "entry[count]" items joined by "; ", highest count first.
        """

        # TODO: remove this hardcoding
        rmdesc = set([
            "",
            "Predicted ORF from Assembly 19",
            "Predicted ORF in Assemblies 19 and 20",
            "ORF Predicted by Annotation Working Group",
            "possibly spurious ORF (Annotation Working Group prediction)"])

        # flatten all descriptions into their parts, skipping boilerplate
        descs = [part
                 for d in descriptions
                 for part in d.split("; ")
                 if part not in rmdesc]

        ranked = sorted(util.hist_dict(descs).items(),
                        key=lambda pair: pair[1], reverse=True)

        return "; ".join("%s[%d]" % pair for pair in ranked)
Exemple #20
0
def histtab(items, headers=None):
    """Make a histogram table of `items`.

    items   -- iterable of values to count
    headers -- two or three column names: value, count and, optionally,
               fraction of the total.  Defaults to
               ["item", "count", "percent"].

    Returns a Table sorted by the count column, largest first.
    Raises Exception when `headers` does not have 2 or 3 entries.
    """
    # A fresh list per call: the headers are handed to Table, so a shared
    # default list could leak changes from one call into the next.
    if headers is None:
        headers = ["item", "count", "percent"]

    # validate before consuming `items`
    if len(headers) not in (2, 3):
        raise Exception("Wrong number of headers (2 or 3 only)")

    h = util.hist_dict(items)
    tab = Table(headers=headers)
    tot = float(sum(h.values()))

    with_percent = (len(headers) == 3)
    for key, val in h.items():
        row = {headers[0]: key,
               headers[1]: val}
        if with_percent:
            row[headers[2]] = val / tot
        tab.append(row)

    tab.sort(col=headers[1], reverse=True)

    return tab
Exemple #21
0
def histtab(items, headers=None):
    """Make a histogram table of `items`.

    items   -- iterable of values to count (may be a one-shot iterator)
    headers -- two or three column names: value, count and, optionally,
               fraction of the total.  Defaults to
               ["item", "count", "percent"].

    Returns a Table sorted by the count column, largest first.
    Raises Exception when `headers` does not have 2 or 3 entries.
    """
    # A fresh list per call: the headers are handed to Table, so a shared
    # default list could leak changes from one call into the next.
    if headers is None:
        headers = ["item", "count", "percent"]

    # validate before consuming `items`
    if len(headers) not in (2, 3):
        raise Exception("Wrong number of headers (2 or 3 only)")

    h = util.hist_dict(items)
    tab = Table(headers=headers)

    # Total from the histogram itself rather than len(items), so inputs
    # without a length (generators, iterators) work too.
    tot = float(sum(h.values()))

    with_percent = (len(headers) == 3)
    for key, val in h.items():
        row = {headers[0]: key,
               headers[1]: val}
        if with_percent:
            row[headers[2]] = val / tot
        tab.append(row)

    tab.sort(col=headers[1], reverse=True)

    return tab
Exemple #22
0
def num_redundant_topology(node, gene2species, leaves=None, all_leaves=False):
    """Returns the number of 'redundant' topologies

    Subtrees are hashed bottom-up, stopping at `leaves` (default: the
    leaves under `node`).  The result is a product of factorials -- of
    the total leaf count when `all_leaves` is set, otherwise of the size
    of each group of leaves sharing a hash -- divided by 2 for every
    internal node whose children all have equal hashes.
    """

    if leaves is None:
        leaves = node.leaves()
    leaves = set(leaves)
    colors = {}

    def _color(subtree):
        # Hash `subtree` into `colors`; return the number of internal
        # nodes below (and including) it whose child hashes are all equal.
        if subtree in leaves:
            colors[subtree] = phylo.hash_tree(subtree, gene2species)
            return 0

        mirrors = 0
        for child in subtree.children:
            mirrors += _color(child)

        child_hashes = util.mget(colors, subtree.children)
        if len(child_hashes) > 1 and util.equal(*child_hashes):
            mirrors += 1

        # sort so the composed hash does not depend on child order
        child_hashes.sort()
        colors[subtree] = phylo.hash_tree_compose(child_hashes)
        return mirrors

    nmirrors = _color(node)

    # sizes of the groups of leaves that share a hash
    colorsizes = util.hist_dict(util.mget(colors, leaves)).values()

    if all_leaves:
        val = stats.factorial(len(leaves))
    else:
        val = 1
        for groupsize in colorsizes:
            if groupsize > 1:
                val *= stats.factorial(groupsize)

    return val / (2**nmirrors)
Exemple #23
0
def num_redundant_topology(node, gene2species, leaves=None, all_leaves=False):
    """Returns the number of 'redundant' topologies

    Subtrees are hashed bottom-up, stopping at `leaves` (default: the
    leaves under `node`).  The result is a product of factorials -- of
    the total leaf count when `all_leaves` is set, otherwise of the size
    of each group of leaves sharing a hash -- divided by 2 for every
    internal node whose children all have equal hashes.
    """

    if leaves is None:
        leaves = node.leaves()
    leaves = set(leaves)
    colors = {}

    def _color(subtree):
        # Hash `subtree` into `colors`; return the number of internal
        # nodes below (and including) it whose child hashes are all equal.
        if subtree in leaves:
            colors[subtree] = phylo.hash_tree(subtree, gene2species)
            return 0

        mirrors = 0
        for child in subtree.children:
            mirrors += _color(child)

        child_hashes = util.mget(colors, subtree.children)
        if len(child_hashes) > 1 and util.equal(*child_hashes):
            mirrors += 1

        # sort so the composed hash does not depend on child order
        child_hashes.sort()
        colors[subtree] = phylo.hash_tree_compose(child_hashes)
        return mirrors

    nmirrors = _color(node)

    # sizes of the groups of leaves that share a hash
    colorsizes = util.hist_dict(util.mget(colors, leaves)).values()

    if all_leaves:
        val = stats.factorial(len(leaves))
    else:
        val = 1
        for groupsize in colorsizes:
            if groupsize > 1:
                val *= stats.factorial(groupsize)

    return val / (2 ** nmirrors)
Exemple #24
0
 def isOne2one(part, gene2species):
     """Return True when no species occurs more than once in `part`.

     Each gene in `part` is mapped through gene2species and the
     resulting species are counted.
     """
     species_counts = util.hist_dict(map(gene2species, part))
     largest = max(species_counts.values())
     return largest == 1
 def isOne2one(part, gene2species):
     """Return True when no species occurs more than once in `part`.

     Each gene in `part` is mapped through gene2species and the
     resulting species are counted.
     """
     species_counts = util.hist_dict(map(gene2species, part))
     largest = max(species_counts.values())
     return largest == 1
Exemple #26
0
def gcContent(seq):
    """Return the fraction of G and C among the A/C/G/T characters of seq.

    Only upper-case "A", "C", "G" and "T" are counted; every other
    character is ignored.  Returns 0.0 when seq contains none of them
    (including the empty sequence).
    """
    if not seq:
        return 0.0

    hist = util.hist_dict(seq)

    # .get() so that a base missing from the sequence counts as zero
    gc = hist.get("C", 0) + hist.get("G", 0)
    total = gc + hist.get("A", 0) + hist.get("T", 0)

    if total == 0:
        # nothing but ambiguous/other characters: avoid dividing by zero
        return 0.0

    return gc / float(total)
def gcContent(seq):
    """Return the fraction of G and C among the A/C/G/T characters of seq.

    Only upper-case "A", "C", "G" and "T" are counted; every other
    character is ignored.  Returns 0.0 when seq contains none of them
    (including the empty sequence).
    """
    if not seq:
        return 0.0

    hist = util.hist_dict(seq)

    # .get() so that a base missing from the sequence counts as zero
    gc = hist.get("C", 0) + hist.get("G", 0)
    total = gc + hist.get("A", 0) + hist.get("T", 0)

    if total == 0:
        # nothing but ambiguous/other characters: avoid dividing by zero
        return 0.0

    return gc / float(total)