Esempio n. 1
0
def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep.

    The file is read as tab-delimited text.  Columns three onward of the
    first line are returned as the species names.  For every following
    line, the first column is returned as a file name, the second is
    converted to an integer gene size, and the remaining columns are
    converted to float lengths.

    Arguments:
        filename   -- path of the matrix file
        minlen     -- lengths smaller than this are raised to this value
        maxlen     -- accepted for interface compatibility; not used here
        nooutliers -- when True, keep only rows whose summed length is
                      less than 5 times the mean summed length

    Returns the tuple (species, lens, gene_sizes, files).

    NOTE: `mean` is not defined in this function; it is expected to be
    available from the enclosing module.
    """

    from rasmus import util

    # read everything up front so the file handle is closed promptly
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]

    species = dat[0][2:]
    lens = util.map2(
        float, util.submatrix(dat, range(1, len(dat)), range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(dat[1:], 1)]
    files = util.cget(dat[1:], 0)

    # with no data rows there is nothing to filter (and no mean to take)
    if nooutliers and len(dat) > 1:
        treelens = [sum(row) for row in lens]
        cutoff = 5 * mean(treelens)
        ind = util.find(lambda x: x < cutoff, treelens)
        files, gene_sizes, lens = [
            util.mget(x, ind) for x in (files, gene_sizes, lens)
        ]

    # clamp very short lengths in place
    for row in lens:
        for i, val in enumerate(row):
            if val < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
Esempio n. 2
0
def read_length_matrix(filename, minlen=.0001, maxlen=1.0,
                       nooutliers=True):
    """Read a length matrix made by spidir-prep.

    The file is read as tab-delimited text.  Columns three onward of the
    first line are returned as the species names.  For every following
    line, the first column is returned as a file name, the second is
    converted to an integer gene size, and the remaining columns are
    converted to float lengths.

    Arguments:
        filename   -- path of the matrix file
        minlen     -- lengths smaller than this are raised to this value
        maxlen     -- accepted for interface compatibility; not used here
        nooutliers -- when True, keep only rows whose summed length is
                      less than 5 times the mean summed length

    Returns the tuple (species, lens, gene_sizes, files).

    NOTE: `mean` is not defined in this function; it is expected to be
    available from the enclosing module.
    """

    from rasmus import util

    # read everything up front so the file handle is closed promptly
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]

    species = dat[0][2:]
    lens = util.map2(float, util.submatrix(dat, range(1, len(dat)),
                                           range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(dat[1:], 1)]
    files = util.cget(dat[1:], 0)

    # with no data rows there is nothing to filter (and no mean to take)
    if nooutliers and len(dat) > 1:
        treelens = [sum(row) for row in lens]
        cutoff = 5 * mean(treelens)
        ind = util.find(lambda x: x < cutoff, treelens)
        files, gene_sizes, lens = [util.mget(x, ind)
                                   for x in (files, gene_sizes, lens)]

    # clamp very short lengths in place
    for row in lens:
        for i, val in enumerate(row):
            if val < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
Esempio n. 3
0
    def _test_ml(self):
        """Test ML code

        Calls the HKY branch length fit 40 times with maxiter=1 on the
        flies test tree, recording the returned value and every node's
        branch length after each call, then writes two plots under
        test/output/ml/.
        """

        # model parameters
        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59

        # input data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        # fix the node order once, by branch length before any fitting
        nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)

        likes = []
        dists = []

        util.tic("find ML")
        for _ in range(40):
            seqs = util.mget(align, tree.leafNames())
            logl = spidir.find_ml_branch_lengths_hky(
                tree, seqs, bgfreq, kappa, parsinit=False, maxiter=1)

            # snapshot the branch lengths as they stand after this call
            dists.append([node.dist for node in nodes])
            likes.append(logl)
        util.toc()

        print(likes)

        prep_dir("test/output/ml/")

        # branch length plot: first node's series, then one line per node
        util.rplot_start("test/output/ml/ml_branches.pdf")
        first_series = util.cget(dists, 0)
        util.rplot("plot", first_series,
                   ylim=[0, max(dists[0])],
                   t="l",
                   main="branch length convergence",
                   xlab="iterations",
                   ylab="branch lengths (sub/site)")
        for series in zip(*dists):
            util.rplot("lines", series)
        util.rplot_end(True)

        print(util.cget(dists, 4))

        # likelihood plot
        util.rplot_start("test/output/ml/ml_likelihood.pdf")
        util.rplot("plot", likes,
                   t="l",
                   xlab="iterations",
                   ylab="log likelihood",
                   main="likelihood convergence")
        util.rplot_end(True)
Esempio n. 4
0
    def _test_ml(self):
        """Test ML code

        Calls the HKY branch length fit 40 times with maxiter=1 on the
        flies test tree, recording the returned value and every node's
        branch length after each call, then writes two plots under
        test/output/ml/.
        """

        # model parameters
        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59

        # input data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        # fix the node order once, by branch length before any fitting
        nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)

        likes = []
        dists = []

        util.tic("find ML")
        for _ in range(40):
            seqs = util.mget(align, tree.leafNames())
            logl = spidir.find_ml_branch_lengths_hky(
                tree, seqs, bgfreq, kappa, parsinit=False, maxiter=1)

            # snapshot the branch lengths as they stand after this call
            dists.append([node.dist for node in nodes])
            likes.append(logl)
        util.toc()

        print(likes)

        prep_dir("test/output/ml/")

        # branch length plot: first node's series, then one line per node
        util.rplot_start("test/output/ml/ml_branches.pdf")
        first_series = util.cget(dists, 0)
        util.rplot("plot", first_series,
                   ylim=[0, max(dists[0])],
                   t="l",
                   main="branch length convergence",
                   xlab="iterations",
                   ylab="branch lengths (sub/site)")
        for series in zip(*dists):
            util.rplot("lines", series)
        util.rplot_end(True)

        print(util.cget(dists, 4))

        # likelihood plot
        util.rplot_start("test/output/ml/ml_likelihood.pdf")
        util.rplot("plot", likes,
                   t="l",
                   xlab="iterations",
                   ylab="log likelihood",
                   main="likelihood convergence")
        util.rplot_end(True)
Esempio n. 5
0
def join_tables(*args, **kwargs):
    """Join together tables into one table.

    Each positional argument is a tuple (table_i, key_i, cols_i).

    key_i is either a column name or a function that maps a table row to
    a unique key.  Only keys present in every table are kept, in the
    order they occur in the first table.  From each table the columns
    cols_i of the matching row are copied into the joined row.

    Keyword arguments:
        headers -- headers of the new table; when absent, the cols_i are
                   concatenated in argument order
    """

    if not args:
        return Table()

    # collect the key order (first table), the shared key set and one
    # key -> row lookup per table
    keys = None
    keyset = None
    lookups = []
    for tab, key, cols in args:
        if isinstance(key, str):
            tabkeys = tab.cget(key)
            lookup = tab.lookup(key)
        else:
            tabkeys = [key(row) for row in tab]
            lookup = {}
            for row in tab:
                lookup[key(row)] = row
        lookups.append(lookup)

        if keyset is None:
            keys = tabkeys
            keyset = set(tabkeys)
        else:
            keyset = keyset & set(tabkeys)

    shared = [k for k in keys if k in keyset]

    # build new table
    if "headers" in kwargs:
        headers = kwargs["headers"]
    else:
        headers = util.concat(*util.cget(args, 2))
    joined = Table(headers=headers)

    for k in shared:
        row = {}
        for (_tab, _key, cols), lookup in zip(args, lookups):
            row.update(util.subdict(lookup[k], cols))
        joined.append(row)

    return joined
Esempio n. 6
0
def join_tables(*args, **kwargs):
    """Join together tables into one table.

    Each positional argument is a tuple (table_i, key_i, cols_i).

    key_i is either a column name or a function that maps a table row to
    a unique key.  Only keys present in every table are kept, in the
    order they occur in the first table.  From each table the columns
    cols_i of the matching row are copied into the joined row.

    Keyword arguments:
        headers -- headers of the new table; when absent, the cols_i are
                   concatenated in argument order
    """

    if not args:
        return Table()

    # collect the key order (first table), the shared key set and one
    # key -> row lookup per table
    keys = None
    keyset = None
    lookups = []
    for tab, key, cols in args:
        if isinstance(key, str):
            tabkeys = tab.cget(key)
            lookup = tab.lookup(key)
        else:
            tabkeys = [key(row) for row in tab]
            lookup = {}
            for row in tab:
                lookup[key(row)] = row
        lookups.append(lookup)

        if keyset is None:
            keys = tabkeys
            keyset = set(tabkeys)
        else:
            keyset = keyset & set(tabkeys)

    shared = [k for k in keys if k in keyset]

    # build new table
    if "headers" in kwargs:
        headers = kwargs["headers"]
    else:
        headers = util.concat(*util.cget(args, 2))
    joined = Table(headers=headers)

    for k in shared:
        row = {}
        for (_tab, _key, cols), lookup in zip(args, lookups):
            row.update(util.subdict(lookup[k], cols))
        joined.append(row)

    return joined
Esempio n. 7
0
    def test_local_trees(self):
        """Blocks from iter_local_trees must equal those of iter_recomb_blocks.

        Samples one ARG and compares, over the window 200..1200, the first
        field of every item yielded by iter_local_trees with the items
        yielded by iter_recomb_blocks.
        """

        recomb_rate = 1.5e-8    # recomb/site/gen
        locus_len = 10000       # length of locus
        nlineages = 10          # number of lineages
        popsize = 2*1e4         # effective popsize

        arg = arglib.sample_arg(nlineages, popsize, recomb_rate, 0, locus_len)

        window = (200, 1200)
        tree_blocks = util.cget(arglib.iter_local_trees(arg, *window), 0)
        recomb_blocks = list(arglib.iter_recomb_blocks(arg, *window))
        self.assertEqual(tree_blocks, recomb_blocks)
Esempio n. 8
0
def is_contig(db, genes):
    """Returns True if genes are contiguous along chromosome

    Genes that are not in db.regions are ignored.  For the remaining
    genes, field 1 of db.get_region_pos_full(gene) must be equal across
    all of them, and the sorted values of field 2 must increase by
    exactly one at every step.

    Returns True when there are fewer than two genes, or when none of
    the genes are in db.regions.
    """

    if len(genes) <= 1:
        return True

    pos = [db.get_region_pos_full(i) for i in genes if i in db.regions]

    # no gene is known to db: nothing contradicts contiguity, and there
    # is no first position to start the walk below from
    if not pos:
        return True

    # ensure hits are on same chromosome
    if not util.equal(*util.cget(pos, 1)):
        return False

    ind = sorted(util.cget(pos, 2))

    # check that each position is present
    for prev, cur in zip(ind, ind[1:]):
        if cur != prev + 1:
            return False

    return True
Esempio n. 9
0
    def makeFamilyGeneNames(self):
        """Tries to name and describe a family using its genes

        Returns a dict mapping each famid to a tuple (names, description):
        names is the comma-joined, sorted list of distinct non-empty
        common names with digits and "-" removed, and description is the
        result of self.getFamDescription on the family's descriptions.
        """

        self.cur.execute("""SELECT g.famid, g.common_name, g.description
                            FROM Genes g
                         """)

        # rows are grouped by their first column (famid)
        fams = util.groupby(lambda row: row[0], self.cur)

        def strip_numbering(name):
            # drop digits and dashes from a gene name
            return "".join(
                ch for ch in name if not (ch.isdigit() or ch == "-"))

        familyGeneNames = {}
        for famid, rows in fams.iteritems():
            common_names = [n for n in util.cget(rows, 1) if n != ""]
            names = util.unique([strip_numbering(n) for n in common_names])
            names.sort()

            description = self.getFamDescription(util.cget(rows, 2))

            familyGeneNames[famid] = (",".join(names), description)
        return familyGeneNames
def is_contig(db, genes):
    """Returns True if genes are contiguous along chromosome

    Genes that are not in db.regions are ignored.  For the remaining
    genes, field 1 of db.get_region_pos_full(gene) must be equal across
    all of them, and the sorted values of field 2 must increase by
    exactly one at every step.

    Returns True when there are fewer than two genes, or when none of
    the genes are in db.regions.
    """

    if len(genes) <= 1:
        return True

    pos = [db.get_region_pos_full(i) for i in genes if i in db.regions]

    # no gene is known to db: nothing contradicts contiguity, and there
    # is no first position to start the walk below from
    if not pos:
        return True

    # ensure hits are on same chromosome
    if not util.equal(*util.cget(pos, 1)):
        return False

    ind = sorted(util.cget(pos, 2))

    # check that each position is present
    for prev, cur in zip(ind, ind[1:]):
        if cur != prev + 1:
            return False

    return True
    def makeFamilyGeneNames(self):
        """Tries to name and describe a family using its genes

        Returns a dict mapping each famid to a tuple (names, description):
        names is the comma-joined, sorted list of distinct non-empty
        common names with digits and "-" removed, and description is the
        result of self.getFamDescription on the family's descriptions.
        """

        self.cur.execute("""SELECT g.famid, g.common_name, g.description
                            FROM Genes g
                         """)

        # rows are grouped by their first column (famid)
        fams = util.groupby(lambda row: row[0], self.cur)

        def strip_numbering(name):
            # drop digits and dashes from a gene name
            return "".join(
                ch for ch in name if not (ch.isdigit() or ch == "-"))

        familyGeneNames = {}
        for famid, rows in fams.iteritems():
            common_names = [n for n in util.cget(rows, 1) if n != ""]
            names = util.unique([strip_numbering(n) for n in common_names])
            names.sort()

            description = self.getFamDescription(util.cget(rows, 2))

            familyGeneNames[famid] = (",".join(names), description)
        return familyGeneNames
Esempio n. 12
0
    def test_sample_coal_recomb(self):
        """Event frequencies from sample_coal_recomb match expectations.

        Draws 10000 samples, tallies the first field of each sample and
        compares every observed frequency with the expected value to two
        decimal places.
        """
        rho = 1.5e-8                  # recomb/site/gen
        locus_len = 2000              # length of locus
        nlineages = 10                # number of lineages
        popsize = 2 * 10000           # effective popsize
        recomb_rate = rho * locus_len  # recomb/locus/gen
        nsamples = 10000

        samples = []
        for _ in range(nsamples):
            samples.append(
                arglib.sample_coal_recomb(nlineages, popsize, recomb_rate))

        # turn the per-event counts into frequencies
        counts = util.hist_dict(util.cget(samples, 0))
        events = {}
        for event, count in counts.items():
            events[event] = count / float(nsamples)

        expected = {'coal': 0.88146, 'recomb': 0.11854}

        for event, freq in events.items():
            self.assertAlmostEqual(freq, expected[event], places=2)
Esempio n. 13
0
    def test_sample_coal_recomb(self):
        """Event frequencies from sample_coal_recomb match expectations.

        Draws 10000 samples, tallies the first field of each sample and
        compares every observed frequency with the expected value to two
        decimal places.
        """
        rho = 1.5e-8                  # recomb/site/gen
        locus_len = 2000              # length of locus
        nlineages = 10                # number of lineages
        popsize = 2*10000             # effective popsize
        recomb_rate = rho * locus_len  # recomb/locus/gen
        nsamples = 10000

        samples = []
        for _ in range(nsamples):
            samples.append(
                arglib.sample_coal_recomb(nlineages, popsize, recomb_rate))

        # turn the per-event counts into frequencies
        counts = util.hist_dict(util.cget(samples, 0))
        events = {}
        for event, count in counts.items():
            events[event] = count / float(nsamples)

        expected = {'coal': 0.88146, 'recomb': 0.11854}

        for event, freq in events.items():
            self.assertAlmostEqual(freq, expected[event], places=2)
Esempio n. 14
0
def calc_conservation(aln):
    """Returns a list of percent matching in each column of an alignment

    aln maps names to equal-length sequences.  For each column, the count
    of the most frequent non-gap character is divided by the number of
    sequences in the alignment.  A column that holds only gaps ("-")
    scores 0.0.  An empty alignment yields an empty list.
    """

    seqs = list(aln.values())
    if not seqs:
        return []

    nseqs = float(len(aln))
    percids = []

    # the first sequence sets the number of columns
    for i in range(len(seqs[0])):
        # tally the non-gap characters of this column
        counts = {}
        for seq in seqs:
            char = seq[i]
            if char != "-":
                counts[char] = counts.get(char, 0) + 1

        if counts:
            percids.append(max(counts.values()) / nseqs)
        else:
            percids.append(0.0)
    return percids
def calc_conservation(aln):
    """Returns a list of percent matching in each column of an alignment

    aln maps names to equal-length sequences.  For each column, the count
    of the most frequent non-gap character is divided by the number of
    sequences in the alignment.  A column that holds only gaps ("-")
    scores 0.0.  An empty alignment yields an empty list.
    """

    seqs = list(aln.values())
    if not seqs:
        return []

    nseqs = float(len(aln))
    percids = []

    # the first sequence sets the number of columns
    for i in range(len(seqs[0])):
        # tally the non-gap characters of this column
        counts = {}
        for seq in seqs:
            char = seq[i]
            if char != "-":
                counts[char] = counts.get(char, 0) + 1

        if counts:
            percids.append(max(counts.values()) / nseqs)
        else:
            percids.append(0.0)
    return percids
Esempio n. 16
0
def drawTreeLogl(tree, out=None, events={}, baserate=1.0):
    """Draw a tree with per-node branch length and likelihood labels.

    Every node is labeled with its name, its branch length and, when
    present, its "logl" value ("*" when absent).  The flags "E" and "U"
    are appended when the node data has "extra" or "unfold".  When the
    node data has "params", a second line with two baserate-scaled
    values derived from "params" and "fracs" is appended.  Summary
    values for the tree are sent to debug() before drawing.

    Arguments:
        tree     -- tree to draw
        out      -- output stream; DEBUG is used when None
        events   -- optional mapping from node to a string appended to
                    that node's label (only read, never modified)
        baserate -- scale factor; overridden by tree.data["baserate"]
                    when that key is present
    """
    labels = {}

    if out is None:
        out = DEBUG

    if "baserate" in tree.data:
        baserate = tree.data["baserate"]

    for node in tree.nodes.values():
        notes = ""
        if "extra" in node.data:
            notes += "E"
        if "unfold" in node.data:
            notes += "U"

        if "logl" in node.data:
            logl = node.data["logl"]
            if isinstance(logl, float):
                labels[node.name] = "[%s]\n%.3f (%.3f) %s" % \
                    (node.name, node.dist, logl, notes)
            else:
                labels[node.name] = "[%s]\n%.3f (%s) %s" % \
                    (node.name, node.dist, str(logl), notes)
        else:
            labels[node.name] = "[%s]\n%.3f (*) %s" % \
                (node.name, node.dist, notes)

        if "params" in node.data:
            # bound before the try so the handler never reports an
            # undefined name or a value left over from an earlier node
            fracs = None
            try:
                fracs = [stats.mean(col)
                         for col in zip(*node.data["fracs"])]
                params = node.data["params"]
                mu = sum(util.vmul(util.cget(params, 0), fracs))
                sdev = sum(util.vmul(util.cget(params, 1), fracs))

                mu *= baserate
                sdev *= baserate

                labels[node.name] += "\n%.3f %.3f" % (mu, sdev)
            except Exception:
                # best effort: report the offending values and keep
                # labeling the remaining nodes
                print("%s %s" % (fracs, node.data['params']))

        if node in events:
            labels[node.name] += " %s" % events[node]

    if "logl" in tree.data:
        debug("logl:      %f" % tree.data["logl"])
        debug("eventlogl: %f" % tree.data["eventlogl"])
        debug("errorlogl: %f" % tree.data["errorlogl"])
    debug("baserate:  %f" % baserate)
    debug("treelen:   %f" % sum(x.dist for x in tree.nodes.values()))
    if "error" in tree.data:
        debug("error:     %f" % tree.data["error"])

    treelib.drawTree(tree, minlen=20, maxlen=100, labels=labels, spacing=4,
                     labelOffset=-3, out=out)