def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):
    """Chi-square goodness-of-fit of `data` against a parametric CDF.

    The sorted data is cut into consecutive bins holding the same number of
    samples.  The observed count in each bin is compared with the count
    predicted by `cdf(x, params)` between the first values of adjacent bins.

    cdf        -- callable invoked as cdf(x, params)
    params     -- parameters passed through to `cdf`
    data       -- sequence of samples
    ndivs      -- requested number of bins
    minsamples -- minimum bin size; `ndivs` is lowered when bins would be
                  smaller than this
    plot       -- when true, plot observed and expected counts with gnuplot
    start, end -- only bins whose last value >= start and first value <= end
                  take part in the test

    Returns the (chi2, pval) pair produced by scipy.stats.chisquare.
    """
    import scipy
    import scipy.stats

    # determine ndivs and binsize.  Floor division keeps binsize an integer
    # (it is used as a slice width and range step) in every division mode.
    nsamples = len(data)
    binsize = nsamples // ndivs
    if binsize < minsamples:
        ndivs = nsamples // minsamples
        binsize = nsamples // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in xrange(0, nsamples, binsize)]
    obs = scipy.array(map(len, bins))
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)

    # left edge (smallest value) of every bin
    x = [b[0] for b in bins]

    # expected counts: first bin takes all mass below the second edge, the
    # last bin takes all mass above its own edge
    expected = [nsamples * cdf(x[1], params)]
    expected.extend([nsamples * (cdf(x[i+1], params) - cdf(x[i], params))
                     for i in range(1, len(x)-1)])
    expected.append(nsamples * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        # imported lazily so that gnuplot is only required when plotting
        from rasmus import gnuplot
        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):
    """Chi-square goodness-of-fit of `data` against a parametric CDF.

    The sorted data is cut into consecutive bins holding the same number of
    samples.  The observed count in each bin is compared with the count
    predicted by `cdf(x, params)` between the first values of adjacent bins.

    cdf        -- callable invoked as cdf(x, params)
    params     -- parameters passed through to `cdf`
    data       -- sequence of samples
    ndivs      -- requested number of bins
    minsamples -- minimum bin size; `ndivs` is lowered when bins would be
                  smaller than this
    plot       -- when true, plot observed and expected counts with gnuplot
    start, end -- only bins whose last value >= start and first value <= end
                  take part in the test

    Returns the (chi2, pval) pair produced by scipy.stats.chisquare.
    """
    import scipy
    import scipy.stats

    # determine ndivs and binsize.  Floor division is required: binsize is
    # used as a slice width and range step, so it must stay an integer.
    nsamples = len(data)
    binsize = nsamples // ndivs
    if binsize < minsamples:
        ndivs = nsamples // minsamples
        binsize = nsamples // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in range(0, nsamples, binsize)]
    obs = scipy.array(list(map(len, bins)))
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)

    # left edge (smallest value) of every bin
    x = [b[0] for b in bins]

    # expected counts: first bin takes all mass below the second edge, the
    # last bin takes all mass above its own edge
    expected = [nsamples * cdf(x[1], params)]
    expected.extend([nsamples * (cdf(x[i+1], params) - cdf(x[i], params))
                     for i in range(1, len(x)-1)])
    expected.append(nsamples * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        # imported lazily so that gnuplot is only required when plotting
        from rasmus import gnuplot
        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
def _write_directive(self, line, out, delim):
    """Write a single '##' directive line to `out`.

    line  -- one of DIR_VERSION, DIR_TYPES, DIR_DEFAULTS, DIR_HEADERS
    out   -- object with a write() method
    delim -- delimiter used to join per-column values

    Raises Exception for any other directive value.
    """
    if line == DIR_VERSION:
        out.write("##version:%s\n" % self.version)

    elif line == DIR_TYPES:
        types = util.mget(self.types, self.headers)
        out.write("##types:" +
                  self._type_lookup.formatTableTypes(types, delim) + "\n")

    elif line == DIR_DEFAULTS:
        defaults = util.mget(self.defaults, self.headers)
        out.write("##defaults:" + delim.join(map(str, defaults)) + "\n")

    elif line == DIR_HEADERS:
        out.write("##headers:%d\n" % self.nheaders)

    else:
        # string exceptions are not valid; raise a real exception object
        raise Exception("unknown directive: %s" % (line,))
def walk(node):
    """Return the hash of the subtree at `node`.

    Leaves hash to smap(node.name).  For an internal node the children are
    hashed first, then both the hashes and node.children are reordered by
    the index list from util.sortrank before composing the node's hash.
    """
    if not node.isLeaf():
        hashes = [walk(child) for child in node.children]
        order = util.sortrank(hashes)
        hashes = util.mget(hashes, order)
        node.children = util.mget(node.children, order)
        return hash_tree_compose(hashes)

    return smap(node.name)
def init_distmats(self):
    """Initialize distance matrices

    Initialization should be done after trees and alignments.

    Builds self.matrices from self.distmats, labels and (optionally)
    reorders each matrix, and creates the viewer in self.visdist.  When
    there are no distance matrices, self.visdist is set to None.

    Raises Exception when a matrix has no source of labels.
    """
    if len(self.distmats) == 0:
        self.visdist = None
        return

    self.matrices = []

    # setup matrices
    for i, distmat in enumerate(self.distmats):
        # convert distmatrix to summon Matrix
        if isinstance(distmat, matrix.Matrix):
            mat = distmat
        else:
            mat = matrix.Matrix()
            mat.from2DList(distmat)

        # set default colormap
        if mat.colormap is None:
            mat.colormap = self.matrix_colormap

        # determine labels
        if self.dist_labels_from_align and self.align_order is not None:
            # determine row/col labels from alignment if it exists
            mat.rowlabels = self.align_order
            mat.collabels = self.align_order
        elif self.distlabels is not None:
            mat.rowlabels = self.distlabels[i]
            mat.collabels = self.distlabels[i]
        else:
            raise Exception("no labels given for matrix")

        # reorder according to any given tree
        if self.order is not None:
            lookup = util.list2lookup(mat.rowlabels)
            mat.rperm = util.mget(lookup, self.order)
            mat.cperm = util.mget(lookup, self.order)

        mat.setup()
        self.matrices.append(mat)

    if self.seqs is None:
        seqs = self.current_align
    else:
        seqs = self.seqs

    # create matrix vis
    self.current_matrix = self.matrices[0]
    self.visdist = distmatrixvis.DistMatrixViewer(self.current_matrix,
                                                  seqs=seqs,
                                                  bgcolor=(1, 1, 1))
def init_distmats(self):
    """Initialize distance matrices

    Initialization should be done after trees and alignments.

    Builds self.matrices from self.distmats, labels and (optionally)
    reorders each matrix, and creates the viewer in self.visdist.  When
    there are no distance matrices, self.visdist is set to None.

    Raises Exception when a matrix has no source of labels.
    """
    if len(self.distmats) == 0:
        self.visdist = None
        return

    self.matrices = []

    # setup matrices
    for i, distmat in enumerate(self.distmats):
        # convert distmatrix to summon Matrix
        if isinstance(distmat, matrix.Matrix):
            mat = distmat
        else:
            mat = matrix.Matrix()
            mat.from2DList(distmat)

        # set default colormap
        if mat.colormap is None:
            mat.colormap = self.matrix_colormap

        # determine labels
        if self.dist_labels_from_align and self.align_order is not None:
            # determine row/col labels from alignment if it exists
            mat.rowlabels = self.align_order
            mat.collabels = self.align_order
        elif self.distlabels is not None:
            mat.rowlabels = self.distlabels[i]
            mat.collabels = self.distlabels[i]
        else:
            raise Exception("no labels given for matrix")

        # reorder according to any given tree
        if self.order is not None:
            lookup = util.list2lookup(mat.rowlabels)
            mat.rperm = util.mget(lookup, self.order)
            mat.cperm = util.mget(lookup, self.order)

        mat.setup()
        self.matrices.append(mat)

    if self.seqs is None:
        seqs = self.current_align
    else:
        seqs = self.seqs

    # create matrix vis
    self.current_matrix = self.matrices[0]
    self.visdist = distmatrixvis.DistMatrixViewer(self.current_matrix,
                                                  seqs=seqs,
                                                  bgcolor=(1, 1, 1))
def test_ml_large(self):
    """Test ML code on the 19520 vertebrate data set.

    Checks that neither the sequence likelihood nor the likelihood after
    one ML branch-length iteration equals -util.INF.
    """
    # params
    bgfreq = [.258, .267, .266, .209]
    kappa = 1.59

    # data
    tree = treelib.readTree("test/data/verts/19520/19520.ensembl.tree")
    align = fasta.readFasta("test/data/verts/19520/19520.nt.mfa")

    loglike = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
    print(loglike)
    # assertNotEqual reports both values on failure
    self.assertNotEqual(loglike, -util.INF)

    loglike = spidir.find_ml_branch_lengths_hky(
        tree,
        util.mget(align, tree.leafNames()),
        bgfreq, kappa,
        parsinit=False,
        maxiter=1)
    print(loglike)
    self.assertNotEqual(loglike, -util.INF)
def _test_ml_speed(self):
    """Time ten runs of ML branch-length fitting on the flies data set."""
    # model parameters
    kappa = 1.59
    bgfreq = [.258, .267, .266, .209]

    # input data
    align = fasta.readFasta("test/data/flies.nt/0/0.align")
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")

    dists = []
    likes = []
    nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

    util.tic("find ML")
    for i in xrange(10):
        seqs = util.mget(align, tree.leafNames())
        l = spidir.find_ml_branch_lengths_hky(tree, seqs, bgfreq, kappa,
                                              maxiter=10)
    util.toc()

    # record the state reached by the final run
    dists.append([n.dist for n in nodes])
    likes.append(l)
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):
    """Fit the parameters of `cdf` to `data` by minimizing chi-square.

    The sorted data is cut into equal-count bins; scipy.optimize.fmin then
    searches, starting from `params_init`, for the parameters minimizing the
    chi-square statistic between observed and expected bin counts.

    cdf         -- callable invoked as cdf(x, params)
    params_init -- starting parameters for the optimizer
    data        -- sequence of samples
    ndivs       -- requested number of bins
    minsamples  -- minimum bin size; `ndivs` is lowered to honor it
    start, end  -- only bins whose last value >= start and first value
                   <= end are used while fitting

    Returns (list(params), pval), pval coming from chi_square_fit.
    """
    import scipy
    import scipy.optimize
    import scipy.stats

    # determine ndivs and binsize.  Floor division is required: binsize is
    # used as a slice width and range step, so it must stay an integer.
    nsamples = len(data)
    binsize = nsamples // ndivs
    if binsize < minsamples:
        ndivs = nsamples // minsamples
        binsize = nsamples // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in range(0, nsamples, binsize)]
    obs = scipy.array(list(map(len, bins)))
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)

    # bin edges do not depend on the parameters; compute them once
    x = [b[0] for b in bins]

    def optfunc(params):
        """Chi-square statistic of the binned data under `params`."""
        expected = [nsamples * cdf(x[1], params)]
        expected.extend([nsamples *
                         (cdf(x[i+1], params) - cdf(x[i], params))
                         for i in range(1, len(x)-1)])
        expected.append(nsamples * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)

    # NOTE(review): start/end are not forwarded here, so the reported pval
    # covers all bins even when the fit used a restricted range -- confirm
    # whether that is intended.
    chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples)

    return list(params), pval
def lookup(self, *keys, **options):
    """Returns a lookup dict based on a column 'key' or multiple keys

    With several keys the result is nested one level per key; the innermost
    level maps the last key's value to the row.

    extra options:
    default=None
    uselast=False    # allow multiple rows, just use last

    Raises Exception on a duplicate key unless uselast is true.
    """
    default = options.get("default", None)
    uselast = options.get("uselast", False)

    lookup = util.Dict(dim=len(keys), default=default)

    for row in self:
        keys2 = util.mget(row, keys)

        # descend one level per leading key.  Each step must continue from
        # the level reached so far, not restart from the top-level dict.
        ptr = lookup
        for key in keys2[:-1]:
            ptr = ptr[key]

        if not uselast and keys2[-1] in ptr:
            raise Exception("duplicate key '%s'" % str(keys2[-1]))
        ptr[keys2[-1]] = row

    lookup.insert = False
    return lookup
def _test_ml_speed(self):
    """Time ten runs of ML branch-length fitting on the flies data set."""
    # model parameters
    kappa = 1.59
    bgfreq = [.258, .267, .266, .209]

    # input data
    align = fasta.readFasta("test/data/flies.nt/0/0.align")
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")

    dists = []
    likes = []
    nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

    util.tic("find ML")
    for i in xrange(10):
        seqs = util.mget(align, tree.leafNames())
        l = spidir.find_ml_branch_lengths_hky(tree, seqs, bgfreq, kappa,
                                              maxiter=10)
    util.toc()

    # record the state reached by the final run
    dists.append([n.dist for n in nodes])
    likes.append(l)
def walk(node):
    """Visit children first, then reconcile this node from their species."""
    node.recurse(walk)

    if node.isLeaf():
        return

    # this node's species is lca of children species
    child_species = util.mget(recon, node.children)
    recon[node] = reconcile_lca(stree, order, child_species)
def find_orthologs(gtree, stree, recon, counts=True):
    """Find all ortholog pairs within a gene tree

    For every node labeled "spec" by label_events, each pair of leaves
    drawn from two different children is reported, ordered by gene name.
    With `counts` true each tuple also carries, for both genes, the number
    of leaves in the gene's child subtree that map to the same recon value.
    """
    orths = []

    for node, event in label_events(gtree, recon).items():
        if event != "spec":
            continue

        leavesmat = [child.leaves() for child in node.children]
        sp_counts = [util.hist_dict(util.mget(recon, row))
                     for row in leavesmat]
        nrows = len(leavesmat)

        for i in range(nrows):
            for j in range(i + 1, nrows):
                for gene1 in leavesmat[i]:
                    for gene2 in leavesmat[j]:
                        # put the gene with the smaller name first
                        if gene1.name > gene2.name:
                            g1, a = gene2, j
                            g2, b = gene1, i
                        else:
                            g1, a = gene1, i
                            g2, b = gene2, j

                        if counts:
                            orths.append((g1.name, g2.name,
                                          sp_counts[a][recon[g1]],
                                          sp_counts[b][recon[g2]]))
                        else:
                            orths.append((g1.name, g2.name))

    return orths
def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep

    The file is tab-delimited.  The first row holds the species names from
    the third column on; each later row holds a file name, a gene size and
    one length per species.

    filename   -- path of the matrix file
    minlen     -- lengths below this value are raised to it
    maxlen     -- accepted for interface compatibility; not used here
    nooutliers -- when true, drop rows whose total length is at least five
                  times the mean total length

    Returns (species, lens, gene_sizes, files).
    """
    from rasmus import util

    # close the file as soon as it is parsed
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]

    species = dat[0][2:]
    lens = util.map2(float, util.submatrix(dat, range(1, len(dat)),
                                           range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(dat[1:], 1)]
    files = util.cget(dat[1:], 0)

    if nooutliers:
        treelens = [sum(row) for row in lens]
        m = mean(treelens)
        ind = util.find(lambda x: x < 5*m, treelens)
        # the tuple must be parenthesized to be valid in every Python version
        files, gene_sizes, lens = [util.mget(x, ind)
                                   for x in (files, gene_sizes, lens)]

    # clamp very small lengths
    for row in lens:
        for i, length in enumerate(row):
            if length < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
def test_ml_large(self):
    """Test ML code on the 19520 vertebrate data set.

    Checks that neither the sequence likelihood nor the likelihood after
    one ML branch-length iteration equals -util.INF.
    """
    # params
    bgfreq = [.258, .267, .266, .209]
    kappa = 1.59

    # data
    tree = treelib.readTree("test/data/verts/19520/19520.ensembl.tree")
    align = fasta.readFasta("test/data/verts/19520/19520.nt.mfa")

    loglike = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
    print(loglike)
    # assertNotEqual reports both values on failure
    self.assertNotEqual(loglike, -util.INF)

    loglike = spidir.find_ml_branch_lengths_hky(
        tree,
        util.mget(align, tree.leafNames()),
        bgfreq, kappa,
        parsinit=False,
        maxiter=1)
    print(loglike)
    self.assertNotEqual(loglike, -util.INF)
def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep

    The file is tab-delimited.  The first row holds the species names from
    the third column on; each later row holds a file name, a gene size and
    one length per species.

    filename   -- path of the matrix file
    minlen     -- lengths below this value are raised to it
    maxlen     -- accepted for interface compatibility; not used here
    nooutliers -- when true, drop rows whose total length is at least five
                  times the mean total length

    Returns (species, lens, gene_sizes, files).
    """
    from rasmus import util

    # close the file as soon as it is parsed
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]

    species = dat[0][2:]
    lens = util.map2(float, util.submatrix(dat, range(1, len(dat)),
                                           range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(dat[1:], 1)]
    files = util.cget(dat[1:], 0)

    if nooutliers:
        treelens = [sum(row) for row in lens]
        m = mean(treelens)
        ind = util.find(lambda x: x < 5*m, treelens)
        # the tuple must be parenthesized to be valid in every Python version
        files, gene_sizes, lens = [util.mget(x, ind)
                                   for x in (files, gene_sizes, lens)]

    # clamp very small lengths
    for row in lens:
        for i, length in enumerate(row):
            if length < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):
    """Fit the parameters of `cdf` to `data` by minimizing chi-square.

    The sorted data is cut into equal-count bins; scipy.optimize.fmin then
    searches, starting from `params_init`, for the parameters minimizing the
    chi-square statistic between observed and expected bin counts.

    cdf         -- callable invoked as cdf(x, params)
    params_init -- starting parameters for the optimizer
    data        -- sequence of samples
    ndivs       -- requested number of bins
    minsamples  -- minimum bin size; `ndivs` is lowered to honor it
    start, end  -- only bins whose last value >= start and first value
                   <= end are used while fitting

    Returns (list(params), pval), pval coming from chi_square_fit.
    """
    import scipy
    import scipy.optimize
    import scipy.stats

    # determine ndivs and binsize.  Floor division keeps binsize an integer
    # (it is used as a slice width and range step) in every division mode.
    nsamples = len(data)
    binsize = nsamples // ndivs
    if binsize < minsamples:
        ndivs = nsamples // minsamples
        binsize = nsamples // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in xrange(0, nsamples, binsize)]
    obs = scipy.array(map(len, bins))
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)

    # bin edges do not depend on the parameters; compute them once
    x = [b[0] for b in bins]

    def optfunc(params):
        """Chi-square statistic of the binned data under `params`."""
        expected = [nsamples * cdf(x[1], params)]
        expected.extend([nsamples *
                         (cdf(x[i+1], params) - cdf(x[i], params))
                         for i in range(1, len(x)-1)])
        expected.append(nsamples * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)

    # NOTE(review): start/end are not forwarded here, so the reported pval
    # covers all bins even when the fit used a restricted range -- confirm
    # whether that is intended.
    chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples)

    return list(params), pval
def on_reorder_leaves(self):
    """Re-permute matrices and alignments to the current tree's leaf order."""
    leaf_order = self.current_tree.leaf_names()

    # reorder matrix
    for mat in self.matrices:
        label_index = util.list2lookup(mat.rowlabels)
        mat.rperm = util.mget(label_index, leaf_order)
        mat.cperm = util.mget(label_index, leaf_order)
        mat.setup()

    if self.visdist:
        self.visdist.redraw()

    # reorder alignment
    for aln in self.aligns:
        aln.names = leaf_order

    if self.visalign:
        self.visalign.show()
def _test_ml(self):
    """Test ML code

    Runs 40 single-iteration ML branch-length fits on the flies data set,
    recording branch lengths and likelihood after each, then plots both.
    """
    # params
    kappa = 1.59
    bgfreq = [.258, .267, .266, .209]

    # data
    align = fasta.readFasta("test/data/flies.nt/0/0.align")
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")

    dists = []
    likes = []
    nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

    util.tic("find ML")
    for i in range(40):
        seqs = util.mget(align, tree.leafNames())
        l = spidir.find_ml_branch_lengths_hky(tree, seqs, bgfreq, kappa,
                                              parsinit=False,
                                              maxiter=1)
        likes.append(l)
        dists.append([n.dist for n in nodes])
    util.toc()

    print(likes)

    prep_dir("test/output/ml/")

    # distances plot
    util.rplot_start("test/output/ml/ml_branches.pdf")
    util.rplot("plot", util.cget(dists, 0),
               ylim=[0, max(dists[0])], t="l",
               main="branch length convergence",
               xlab="iterations",
               ylab="branch lengths (sub/site)")
    for branch in zip(*dists):
        util.rplot("lines", branch)
    util.rplot_end(True)

    print(util.cget(dists, 4))

    # likelihood plot
    util.rplot_start("test/output/ml/ml_likelihood.pdf")
    util.rplot("plot", likes, t="l",
               xlab="iterations",
               ylab="log likelihood",
               main="likelihood convergence")
    util.rplot_end(True)
def parsimony_C(aln, tree):
    """Set node.dist on the tree's nodes from pyspidir.parsimony.

    Sequences are taken from `aln` for every node whose name is a str, in
    the node order returned by makePtree.
    """
    ptree, nodes, nodelookup = makePtree(tree)
    leaf_names = [node.name for node in nodes if isinstance(node.name, str)]

    dists = pyspidir.parsimony(ptree, util.mget(aln, leaf_names))

    # dists[i] belongs to nodes[i]
    for i in xrange(len(dists)):
        node = nodes[i]
        node.dist = dists[i]
def make_degen_str(aln):
    """Returns a string containing the degeneracy for each column
    in an alignment

    Degeneracies 0-4 appear as their digit; -1 appears as a space.
    """
    degenmap = dict((n, str(n)) for n in range(5))
    degenmap[-1] = " "

    chars = util.mget(degenmap, find_degen(aln))
    return "".join(chars)
def optfunc(params):
    """Chi-square statistic of the binned data under `params`.

    Uses bins, data, cdf, ind and obs from the enclosing scope.
    """
    total = len(data)
    x = [b[0] for b in bins]

    # first bin, interior bins, last bin
    expected = [total * cdf(x[1], params)]
    for i in range(1, len(x)-1):
        expected.append(total * (cdf(x[i+1], params) - cdf(x[i], params)))
    expected.append(total * (1.0 - cdf(x[-1], params)))

    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)
    return chi2
def mlhkydist_C(aln, tree, bgfreq, ratio, maxiter):
    """Set node.dist from pyspidir.mlhkydist and return its log likelihood.

    Sequences are taken from `aln` for every node whose name is a str, in
    the node order returned by makePtree.
    """
    ptree, nodes, nodelookup = makePtree(tree)
    leaf_names = [node.name for node in nodes if isinstance(node.name, str)]

    dists, logl = pyspidir.mlhkydist(ptree, util.mget(aln, leaf_names),
                                     bgfreq, ratio, maxiter)

    # dists[i] belongs to nodes[i]
    for i in xrange(len(dists)):
        node = nodes[i]
        node.dist = dists[i]

    return logl
def _test_ml(self):
    """Test ML code

    Runs 40 single-iteration ML branch-length fits on the flies data set,
    recording branch lengths and likelihood after each, then plots both.
    """
    # params
    kappa = 1.59
    bgfreq = [.258, .267, .266, .209]

    # data
    align = fasta.readFasta("test/data/flies.nt/0/0.align")
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")

    dists = []
    likes = []
    nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

    util.tic("find ML")
    for i in range(40):
        seqs = util.mget(align, tree.leafNames())
        l = spidir.find_ml_branch_lengths_hky(tree, seqs, bgfreq, kappa,
                                              parsinit=False,
                                              maxiter=1)
        likes.append(l)
        dists.append([n.dist for n in nodes])
    util.toc()

    print(likes)

    prep_dir("test/output/ml/")

    # distances plot
    util.rplot_start("test/output/ml/ml_branches.pdf")
    util.rplot("plot", util.cget(dists, 0),
               ylim=[0, max(dists[0])], t="l",
               main="branch length convergence",
               xlab="iterations",
               ylab="branch lengths (sub/site)")
    for branch in zip(*dists):
        util.rplot("lines", branch)
    util.rplot_end(True)

    print(util.cget(dists, 4))

    # likelihood plot
    util.rplot_start("test/output/ml/ml_likelihood.pdf")
    util.rplot("plot", likes, t="l",
               xlab="iterations",
               ylab="log likelihood",
               main="likelihood convergence")
    util.rplot_end(True)
def draw_matches(self, sp, chrom, start, end, drawn=None):
    """Build the drawing group for ortholog matches in a chromosome range.

    For every gene in the range, its laid-out orthologs are grouped by
    layout row and each pair of genes on adjacent rows is connected.
    `drawn` collects the (bottom, top) pairs already emitted so that no
    pair is drawn twice; a fresh set is used when it is None.
    """
    if drawn is None:
        drawn = set()
    vis = []
    color = self.colors["matches"]

    # build list of matches in order of drawing
    for gene in iter_chrom(self.db.get_regions(sp, chrom), start, end):
        # need to sort matches by genome order so that mult-genome synteny
        # is drawn top-down

        # get orthologs
        genes2 = [x for x in self.orth_lookup.get(gene.data["ID"], [])
                  if x in self.region_layout]
        if len(genes2) == 0:
            continue

        rows = util.groupby(lambda x: self.region_layout[x].y, genes2)
        keys = util.sort(rows.keys(), reverse=True)
        rows = util.mget(rows, keys)

        l = self.region_layout

        for i in range(1, len(rows)):
            upper_row = rows[i-1]
            for botGene in rows[i]:
                gene1 = self.db.get_region(botGene)

                for topGene in upper_row:
                    pair = (botGene, topGene)
                    if pair in drawn:
                        continue
                    drawn.add(pair)

                    gene2 = self.db.get_region(topGene)
                    top = l[topGene]
                    bot = l[botGene]

                    y1 = top.y
                    y2 = bot.y + 1
                    x1 = top.x
                    x2 = top.x + gene2.length()
                    x3 = bot.x + gene1.length()
                    x4 = bot.x

                    if self.fat_matches:
                        vis.append(quads(color,
                                         x1, y1,
                                         x2, y1,
                                         x3, y2,
                                         x4, y2))

                    vis.append(lines(color, x1, y1, x4, y2))

    return group(*vis)
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    """Chi-square test of binned densities against a density function.

    Observed counts are ybins[i] * binwidth * nsamples.  Expected counts
    use the average of `func` at the two bin edges.  Only bins whose
    expected count passes util.gefunc(minsamples) are tested.

    Returns (result, counts, expected); result is [0, 1] when no bin
    qualifies, otherwise the value of chiSquare.
    """
    nbins = len(xbins) - 1

    sizes = [xbins[i+1] - xbins[i] for i in xrange(nbins)]
    sizes.append(sizes[-1])  # NOTE: assumes bins are of equal size

    # only focus on bins that are large enough
    counts = [ybins[i] * sizes[i] * nsamples for i in xrange(nbins)]

    expected = [(func(xbins[i]) + func(xbins[i+1])) / 2.0
                * sizes[i] * nsamples
                for i in xrange(nbins)]

    # ensure we have enough expected samples in each bin
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) != 0:
        return chiSquare([counts], [expected], nparams), counts, expected
    return [0, 1], counts, expected
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    """Chi-square test of binned densities against a density function.

    Observed counts are ybins[i] * binwidth * nsamples.  Expected counts
    use the average of `func` at the two bin edges.  Only bins whose
    expected count passes util.gefunc(minsamples) are tested.

    Returns (result, counts, expected); result is [0, 1] when no bin
    qualifies, otherwise the value of chiSquare.
    """
    nbins = len(xbins) - 1

    sizes = [xbins[i+1] - xbins[i] for i in xrange(nbins)]
    sizes.append(sizes[-1])  # NOTE: assumes bins are of equal size

    # only focus on bins that are large enough
    counts = [ybins[i] * sizes[i] * nsamples for i in xrange(nbins)]

    expected = [(func(xbins[i]) + func(xbins[i+1])) / 2.0
                * sizes[i] * nsamples
                for i in xrange(nbins)]

    # ensure we have enough expected samples in each bin
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) != 0:
        return chiSquare([counts], [expected], nparams), counts, expected
    return [0, 1], counts, expected
def draw_matches(self, sp, chrom, start, end, drawn=None):
    """Build the drawing group for ortholog matches in a chromosome range.

    For every gene in the range, its laid-out orthologs are grouped by
    layout row and each pair of genes on adjacent rows is connected.
    `drawn` collects the (bottom, top) pairs already emitted so that no
    pair is drawn twice; a fresh set is used when it is None.
    """
    if drawn is None:
        drawn = set()
    vis = []
    color = self.colors["matches"]

    # build list of matches in order of drawing
    for gene in iter_chrom(self.db.get_regions(sp, chrom), start, end):
        # need to sort matches by genome order so that mult-genome synteny
        # is drawn top-down

        # get orthologs
        genes2 = [x for x in self.orth_lookup.get(gene.data["ID"], [])
                  if x in self.region_layout]
        if len(genes2) == 0:
            continue

        rows = util.groupby(lambda x: self.region_layout[x].y, genes2)
        keys = util.sort(rows.keys(), reverse=True)
        rows = util.mget(rows, keys)

        l = self.region_layout

        for i in range(1, len(rows)):
            upper_row = rows[i - 1]
            for botGene in rows[i]:
                gene1 = self.db.get_region(botGene)

                for topGene in upper_row:
                    pair = (botGene, topGene)
                    if pair in drawn:
                        continue
                    drawn.add(pair)

                    gene2 = self.db.get_region(topGene)
                    top = l[topGene]
                    bot = l[botGene]

                    y1 = top.y
                    y2 = bot.y + 1
                    x1 = top.x
                    x2 = top.x + gene2.length()
                    x3 = bot.x + gene1.length()
                    x4 = bot.x

                    if self.fat_matches:
                        vis.append(quads(color,
                                         x1, y1,
                                         x2, y1,
                                         x3, y2,
                                         x4, y2))

                    vis.append(lines(color, x1, y1, x4, y2))

    return group(*vis)
def query_point_regions(point, regions, inc=True):
    """Yield the regions that contain `point`.

    point   -- coordinate to query
    regions -- sequence of regions; r[0] is read as the start and r[1] as
               the end of each region
    inc     -- when true the region ends are inclusive, otherwise exclusive

    A binary search on starts and on ends narrows the index range that is
    scanned.
    """
    ind = util.sortindex(regions, key=lambda r: r[1])
    regions_by_end = util.mget(regions, ind)

    # the query coordinate is the `point` parameter
    end = util.binsearch([r[0] for r in regions], point)[1]
    start = util.binsearch([r[1] for r in regions_by_end], point)[0]

    if start is None:
        start = 0
    if end is None:
        end = len(regions)

    # NOTE(review): `start` is an index into the end-sorted order but is
    # applied to `regions` itself; this presumably assumes both orders
    # agree -- confirm against callers.
    if inc:
        for i in range(start, end):
            if regions[i][0] <= point <= regions[i][1]:
                yield regions[i]
    else:
        for i in range(start, end):
            if regions[i][0] < point < regions[i][1]:
                yield regions[i]
def walk(node):
    """Color `node` by its subtree hash, counting nodes with equal children.

    Nodes in `leaves` get phylo.hash_tree directly.  Any other node is
    colored by composing the sorted hashes of its children, and
    nmirrors[0] is incremented when it has more than one child and all
    child hashes are equal.
    """
    if node in leaves:
        colors[node] = phylo.hash_tree(node, gene2species)
        return

    # recurse
    for child in node.children:
        walk(child)

    childHashes = util.mget(colors, node.children)
    has_mirror = len(childHashes) > 1 and util.equal(*childHashes)
    if has_mirror:
        nmirrors[0] += 1

    childHashes.sort()
    colors[node] = phylo.hash_tree_compose(childHashes)
def make_pep_colors(prop2color=prop2color):
    """Build a dict mapping each amino-acid character to a color.

    Characters sharing a seqlib.AA_PROPERTY value get the same property
    color with a tint that grows with their position within that property
    group.  Characters not listed fall back to color(.5, .5, .5).
    """
    AA = 'ARNDCEQGHILKMFPSTWYVU*'

    pep_colors = util.Dict(default=color(.5, .5, .5))

    # group size per property, and how many of each have been seen so far
    pep_per_prop = util.hist_dict(util.mget(seqlib.AA_PROPERTY, AA))
    prop_counts = util.Dict(default=0)

    for char in AA:
        prop = seqlib.AA_PROPERTY[char]
        seen = prop_counts[prop]
        tint = seen / float(pep_per_prop[prop])
        pep_colors[char] = prop2color(prop, tint * .5)
        prop_counts[prop] += 1

    return pep_colors
def _write_directive(self, line, out, delim):
    """Write a single '##' directive line to `out`.

    line  -- one of DIR_VERSION, DIR_TYPES, DIR_DEFAULTS, DIR_HEADERS
    out   -- object with a write() method
    delim -- delimiter used to join per-column values

    Raises Exception for any other directive value.
    """
    if line == DIR_VERSION:
        out.write("##version:%s\n" % self.version)

    elif line == DIR_TYPES:
        types = util.mget(self.types, self.headers)
        out.write("##types:" +
                  self._type_lookup.formatTableTypes(types, delim) + "\n")

    elif line == DIR_DEFAULTS:
        defaults = util.mget(self.defaults, self.headers)
        out.write("##defaults:" + delim.join(map(str, defaults)) + "\n")

    elif line == DIR_HEADERS:
        out.write("##headers:%d\n" % self.nheaders)

    else:
        # string exceptions are not valid; raise a real exception object
        raise Exception("unknown directive: %s" % (line,))
def findFragments(regiondb, aln, overlapCutoff=.10):
    """Determine if alignment has gene fragments

    Genes of the alignment are grouped by findNeighbors; every group with
    more than one gene is checked with findMerges on the sub-alignment of
    that group.

    Returns the combined list of findMerges results (empty when no group
    has more than one gene, including when there are no groups at all).
    """
    aln_genes = util.mget(regiondb.regions, aln.keys())
    nbrs = findNeighbors(regiondb, aln_genes)

    frags = []

    # do neighbors overlap in alignment?  The per-group size check makes a
    # separate "any neighbors?" test unnecessary, and avoids calling max()
    # on an empty list.
    for nbr in nbrs:
        if len(nbr) > 1:
            aln2 = aln.get(x.data['ID'] for x in nbr)
            frags.extend(findMerges(aln2, overlapCutoff=overlapCutoff))

    return frags
def get_aligns(self, species, chrom, start, end,
               mainspecies=lambda keys: keys[0], collapse=False):
    """Return the alignments overlapping a region, trimmed to the region.

    By default assumes main species is 1st sequence.  With `collapse`
    true, columns where the main species has "-" are removed before
    trimming.
    """
    # get records for this region
    records = self.get(species, chrom, start, end)
    records.sort(key=lambda x: x["start"])

    # read alignments
    alns = []
    for record in records:
        aln = fasta.read_fasta(record["filename"])

        # collapse alignment
        if collapse:
            ind = util.findneq("-", aln[mainspecies(aln.keys())])
            for key, seq in aln.iteritems():
                if len(seq) != 0:
                    aln[key] = "".join(util.mget(seq, ind))

        l2a = alignlib.local2align(aln[mainspecies(aln.keys())])

        # trim front
        trimstart = 0
        if start > record["start"]:
            trimstart = l2a[start - record["start"]]

        # trim end
        if end < record["end"]:
            overhang = record["end"] - end
            trimend = l2a[-overhang]
        else:
            trimend = aln.alignlen()

        # perform trim
        for key, seq in aln.iteritems():
            aln[key] = seq[trimstart:trimend]

        alns.append(aln)

    return alns
def learnModel(trees, stree, gene2species, statsprefix="", filenames=None):
    """Learn branch-length model parameters from a set of gene trees.

    trees        -- gene trees to learn from
    stree        -- species tree
    gene2species -- gene-to-species mapping passed to phylo
    statsprefix  -- when non-empty, branch length statistics are written to
                    statsprefix + ".lens" and statsprefix + "_rates.tab"
    filenames    -- passed through to treeDistrib2table

    Returns a dict keyed by node name holding fitNormal2 results, plus
    "baserate" (mu^2/sigma2, mu/sigma2 of total tree length) and a fixed
    [0, 1] entry for the species tree root.
    """
    util.tic("learn model")

    util.tic("find branch length distributions")
    lengths, used = phylo.find_branch_distrib(trees, stree, gene2species,
                                              False)
    debug("Total trees matching species topology: %d out of %d" %
          (sum(used), len(trees)))
    util.toc()

    params = {}

    totlens = [sum(col) for col in zip(*lengths.values())]

    # print output stats
    if statsprefix != "":
        # make sure the stats file is flushed and closed
        with open(statsprefix + ".lens", "w") as out:
            writeTreeDistrib(out, lengths)
        rates = treeDistrib2table(lengths, filenames=filenames)
        rates.write(statsprefix + "_rates.tab")

    util.tic("fitting params")
    for node, lens in lengths.items():
        # nothing to fit without data or without variation
        if len(lens) == 0 or max(lens) == min(lens):
            continue

        util.tic("fitting params for " + str(node.name))
        params[node.name] = fitNormal2(util.vdiv(lens, totlens))
        util.toc()
    util.toc()

    # calc distribution of total tree length
    trees2 = util.mget(trees, util.findeq(True, used))
    lens = [sum(y.dist for y in x.nodes.values()) for x in trees2]
    # drop extreme totals, then anything at least twice the remaining mean
    lens = [x for x in lens if x < 20]
    mu = stats.mean(lens)
    lens = [x for x in lens if x < 2*mu]
    mu = stats.mean(lens)
    sigma2 = stats.variance(lens)

    params["baserate"] = [mu*mu/sigma2, mu/sigma2]
    params[stree.root.name] = [0, 1]

    util.toc()

    return params
def find_xenologs(gtree, stree, recon, events, trans,
                  counts=True, species_branch=False):
    """Find all xenolog pairs within a gene tree

    NOTE: THIS HAS NOT BEEN TESTED!!!

    For every node whose event is "trans", each pair of leaves taken from
    its two children is reported.  When trans[node] is the first child, the
    children are swapped so that it comes second.

    counts         -- append, for both genes, the number of leaves in the
                      gene's child subtree mapped to the same recon value
    species_branch -- append recon[node]

    Returns a list of tuples (name1, name2[, count1, count2][, branch]).
    """
    xenos = []

    for node, event in events.items():
        if event != "trans":
            continue

        assert len(node.children) == 2
        if trans[node] == node.children[0]:
            children = (node.children[1], node.children[0])
        else:
            children = node.children

        leavesmat = [x.leaves() for x in children]
        sp_counts = [util.hist_dict(util.mget(recon, row))
                     for row in leavesmat]

        for i in range(len(leavesmat)):
            for j in range(i + 1, len(leavesmat)):
                for gene1 in leavesmat[i]:
                    for gene2 in leavesmat[j]:
                        xeno = [gene1.name, gene2.name]
                        if counts:
                            xeno.extend([sp_counts[i][recon[gene1]],
                                         sp_counts[j][recon[gene2]]])
                        if species_branch:
                            xeno.append(recon[node])
                        # append this pair's record, not the result list
                        xenos.append(tuple(xeno))

    return xenos
def get_aligns(self, species, chrom, start, end,
               mainspecies=lambda keys: keys[0], collapse=False):
    """Return the alignments overlapping a region, trimmed to the region.

    By default assumes main species is 1st sequence.  With `collapse`
    true, columns where the main species has "-" are removed before
    trimming.
    """
    # get records for this region
    records = self.get(species, chrom, start, end)
    records.sort(key=lambda x: x["start"])

    # read alignments
    alns = []
    for record in records:
        aln = fasta.read_fasta(record["filename"])

        # collapse alignment
        if collapse:
            ind = util.findneq("-", aln[mainspecies(aln.keys())])
            for key, seq in aln.iteritems():
                if len(seq) != 0:
                    aln[key] = "".join(util.mget(seq, ind))

        l2a = alignlib.local2align(aln[mainspecies(aln.keys())])

        # trim front
        trimstart = 0
        if start > record["start"]:
            trimstart = l2a[start - record["start"]]

        # trim end
        if end < record["end"]:
            overhang = record["end"] - end
            trimend = l2a[-overhang]
        else:
            trimend = aln.alignlen()

        # perform trim
        for key, seq in aln.iteritems():
            aln[key] = seq[trimstart:trimend]

        alns.append(aln)

    return alns
def write(self, out, fullpage=False):
    """Write HTML table

    out      -- stream or filename accepted by util.open_stream
    fullpage -- wrap the output in <html> tags (with a <title> when the
                table has a title)
    """
    out = util.open_stream(out, "w")
    emit = out.write

    if fullpage:
        emit("<html>")
        if self.title:
            emit("<head><title>%s</title></head>\n" % self.title)

    emit("<style>.tab { border-right: 1px solid #777; border-bottom: 1px solid #777;}</style>")  # nopep8

    if self.title is not None:
        emit("<h1>%s</h1>" % self.title)

    # write headers
    emit("<table cellspacing=0 style='border: 1px solid black;'>\n")
    emit("<tr><td class='tab'><b>#</b></td>")
    for header in self.headers:
        emit("<td class='tab'><b>%s</b></td>" % header)
    emit("</tr>\n")

    # write rows
    for i, row in enumerate(self.table):
        emit("<tr><td class='tab'>%d.</td>" % (i + 1))
        for j, item in enumerate(util.mget(row, self.table.headers)):
            formatter = self.formats[j]
            if formatter is None:
                emit("<td class='tab'><nobr>%s </nobr></td>" % str(item))
            else:
                # write formating
                emit("<td class='tab'>%s </td>" % formatter(item))
        emit("</tr>\n")

    emit("</table>")

    if fullpage:
        emit("</html>")
def write(self, out, fullpage=False):
    """Write HTML table

    out      -- stream or filename accepted by util.open_stream
    fullpage -- wrap the output in <html> tags (with a <title> when the
                table has a title)
    """
    out = util.open_stream(out, "w")
    emit = out.write

    if fullpage:
        emit("<html>")
        if self.title:
            emit("<head><title>%s</title></head>\n" % self.title)

    emit("<style>.tab { border-right: 1px solid #777; border-bottom: 1px solid #777;}</style>")  # nopep8

    if self.title is not None:
        emit("<h1>%s</h1>" % self.title)

    # write headers
    emit("<table cellspacing=0 style='border: 1px solid black;'>\n")
    emit("<tr><td class='tab'><b>#</b></td>")
    for header in self.headers:
        emit("<td class='tab'><b>%s</b></td>" % header)
    emit("</tr>\n")

    # write rows
    for i, row in enumerate(self.table):
        emit("<tr><td class='tab'>%d.</td>" % (i+1))
        for j, item in enumerate(util.mget(row, self.table.headers)):
            formatter = self.formats[j]
            if formatter is None:
                emit("<td class='tab'><nobr>%s </nobr></td>" % str(item))
            else:
                # write formating
                emit("<td class='tab'>%s </td>" % formatter(item))
        emit("</tr>\n")

    emit("</table>")

    if fullpage:
        emit("</html>")
def get_matrix(self, rowheader="rlabels"):
    """Returns mat, rlabels, clabels

    where mat is a copy of the table as a 2D list
          rlabels are the row labels
          clabels are the column labels

    When `rowheader` names an existing column, that column supplies the
    row labels and is left out of the matrix; otherwise rows are labeled
    by their index.
    """
    # get labels
    clabels = copy.copy(self.headers)
    use_header = rowheader is not None and rowheader in self.headers
    if use_header:
        rlabels = self.cget(rowheader)
        clabels.remove(rowheader)
    else:
        rlabels = range(len(self))

    # get data
    mat = [util.mget(row, clabels) for row in self]

    return mat, rlabels, clabels
def getRelBranchLens(rates, species=None):
    """Return a copy of `rates` with species columns normalized per row.

    rates   -- table of branch rates (iterable of rows, with .headers and
               .new())
    species -- columns to normalize; defaults to all headers

    Each species value is divided by the sum of the row's species values.
    Columns not listed in `species` are copied unchanged.
    """
    # identity test: `species` may be any sequence type
    if species is None:
        species = rates.headers

    nonspecies = set(rates.headers) - set(species)

    relrates = rates.new()

    for row in rates:
        row2 = {}

        tot = sum(util.mget(row, species))

        for sp in species:
            row2[sp] = row[sp] / tot

        # copy over non-species data
        for key in nonspecies:
            row2[key] = row[key]

        relrates.append(row2)

    return relrates
def num_redundant_topology(node, gene2species, leaves=None, all_leaves=False):
    """Returns the number of 'redundant' topologies

    node         -- root of the (sub)tree to examine
    gene2species -- mapping passed to phylo.hash_tree
    leaves       -- nodes treated as leaves; defaults to node.leaves()
    all_leaves   -- start the count from factorial(len(leaves)) instead
                    of 1
    """
    if leaves is None:
        leaves = node.leaves()
    leaves = set(leaves)

    colors = {}
    nmirrors = [0]

    def walk(node):
        """Color a subtree; count nodes whose children all hash equal."""
        if node in leaves:
            colors[node] = phylo.hash_tree(node, gene2species)
            return

        # recurse
        for child in node.children:
            walk(child)

        childHashes = util.mget(colors, node.children)
        if len(childHashes) > 1 and util.equal(*childHashes):
            nmirrors[0] += 1

        childHashes.sort()
        colors[node] = phylo.hash_tree_compose(childHashes)

    walk(node)

    # number of leaves sharing each color
    colorsizes = util.hist_dict(util.mget(colors, leaves)).values()

    val = stats.factorial(len(leaves)) if all_leaves else 1

    for size in colorsizes:
        if size > 1:
            val *= stats.factorial(size)

    return val / (2**nmirrors[0])
def num_redundant_topology(node, gene2species, leaves=None, all_leaves=False):
    """Returns the number of 'redundant' topologies

    node         -- root of the (sub)tree to examine
    gene2species -- mapping passed to phylo.hash_tree
    leaves       -- nodes treated as leaves; defaults to node.leaves()
    all_leaves   -- start the count from factorial(len(leaves)) instead
                    of 1
    """
    if leaves is None:
        leaves = node.leaves()
    leaves = set(leaves)

    colors = {}
    nmirrors = [0]

    def walk(node):
        """Color a subtree; count nodes whose children all hash equal."""
        if node in leaves:
            colors[node] = phylo.hash_tree(node, gene2species)
            return

        # recurse
        for child in node.children:
            walk(child)

        childHashes = util.mget(colors, node.children)
        if len(childHashes) > 1 and util.equal(*childHashes):
            nmirrors[0] += 1

        childHashes.sort()
        colors[node] = phylo.hash_tree_compose(childHashes)

    walk(node)

    # number of leaves sharing each color
    colorsizes = util.hist_dict(util.mget(colors, leaves)).values()

    val = stats.factorial(len(leaves)) if all_leaves else 1

    for size in colorsizes:
        if size > 1:
            val *= stats.factorial(size)

    return val / (2 ** nmirrors[0])
def subalign(aln, cols):
    """Returns an alignment with a subset of the columns (cols)"""
    def pick_columns(seq):
        # keep only the requested columns of one sequence
        return "".join(util.mget(seq, cols))

    return mapalign(aln, valfunc=pick_columns)
def func(seq):
    """Render the codon-position marks of `seq` as a string.

    Marks 0, 1 and 2 appear as their digit; -1 appears as "-".
    """
    dct = {-1: "-",
           0: "0",
           1: "1",
           2: "2"}
    marks = mark_codon_pos(seq)
    return "".join(util.mget(dct, marks))