def make_pep_colors(prop2color=prop2color):
    """Build the amino-acid -> color table.

    Each residue gets the color of its property class (looked up in
    seqlib.AA_PROPERTY), tinted by how many residues of the same class were
    already assigned, so members of one class receive progressively
    different tints.

    prop2color -- callable (property, tint) -> color

    Returns a util.Dict keyed by residue character; it is created with a
    mid-gray default color.
    """
    palette = util.Dict(default=color(.5, .5, .5))
    residues = 'ARNDCEQGHILKMFPSTWYVU*'

    # number of residues sharing each property class
    class_sizes = util.hist_dict(util.mget(seqlib.AA_PROPERTY, residues))

    # how many residues of each class have been colored so far
    assigned = util.Dict(default=0)

    for residue in residues:
        prop_class = seqlib.AA_PROPERTY[residue]
        rank = assigned[prop_class]
        fraction = rank / float(class_sizes[prop_class])
        palette[residue] = prop2color(prop_class, fraction * .5)
        assigned[prop_class] = rank + 1

    return palette
def lookup(self, *keys, **options):
    """Returns a lookup dict based on a column 'key' or multiple keys

    With one key the result maps key value -> row.  With several keys the
    result is nested one level per key:
        result[row[key1]][row[key2]]... -> row

    extra options:
    default=None     # value passed as the util.Dict default
    uselast=False    # allow multiple rows, just use last

    Raises Exception("duplicate key ...") when two rows share the same
    key path and uselast is False.
    """
    options.setdefault("default", None)
    options.setdefault("uselast", False)
    uselast = options["uselast"]

    table = util.Dict(dim=len(keys), default=options["default"])

    for row in self:
        keys2 = util.mget(row, keys)

        # walk down one nesting level per leading key; each step must
        # continue from the level reached so far, not from the top
        ptr = table
        for key in keys2[:-1]:
            ptr = ptr[key]

        last_key = keys2[-1]
        if not uselast and last_key in ptr:
            raise Exception("duplicate key '%s'" % str(last_key))
        ptr[last_key] = row

    # presumably stops later lookups of missing keys from inserting
    # entries -- TODO confirm against util.Dict
    table.insert = False
    return table
def findNeighbors(regiondb, genes):
    """Cluster 'genes' into streaks of chromosomal neighbors.

    Two genes are linked when they sit at adjacent positions of the same
    chromosome list (regiondb.species[species][seqname]), both belong to
    'genes', and both have the same strand.

    Returns the connected components of that neighbor relation, each
    sorted by start coordinate (descending when the component's first
    gene has strand -1).
    """
    geneset = set(genes)
    neighbors = util.Dict(dim=2)

    def link(gene, candidate):
        # record the adjacency in both directions
        if candidate in geneset and candidate.strand == gene.strand:
            neighbors[gene][candidate] = 1
            neighbors[candidate][gene] = 1

    for gene in genes:
        chrom = regiondb.species[gene.species][gene.seqname]
        ind = regionlib.findRegion(chrom, gene)

        # look at the genes immediately left and right of this one
        if ind > 0:
            link(gene, chrom[ind-1])
        if ind < len(chrom)-1:
            link(gene, chrom[ind+1])

    def adjacent(gene):
        return neighbors[gene].keys()

    comps = graph.connectedComponents(genes, adjacent)

    # order each streak in order of appearance along its strand
    for comp in comps:
        descending = (comp[0].strand == -1)
        comp.sort(key=lambda region: region.start, reverse=descending)

    return comps
def makeBlastFileLookup(blastfiles):
    """Index blast files by the pair of genomes named in the filename.

    Each filename is expected to look like '<dir>/<genome1>_<genome2>.<ext>'.

    Returns a two-level util.Dict where
        lookup[genome1][genome2] = (filename, True)
        lookup[genome2][genome1] = (filename, False)
    i.e. the flag tells whether the genomes are in filename order.
    """
    # raw string: the backslashes belong to the regex, not to Python
    # NOTE(review): the trailing '[\/]*' matches only slashes; possibly
    # '[^/]*' was intended -- confirm before changing
    pattern = r"(|.*/)(?P<genome1>[^/_]+)_(?P<genome2>[^/\.]+)\.[\/]*"

    lookup = util.Dict(dim=2)
    for f in blastfiles:
        m = util.match(pattern, f)
        lookup[m["genome1"]][m["genome2"]] = (f, True)
        lookup[m["genome2"]][m["genome1"]] = (f, False)
    return lookup
def enrichItems(in_items, out_items, M=None, N=None, useq=True, extra=False):
    """Compute enrichment of items in an in-set versus an out-set.

    in_items  -- items drawn from the in-set (repeats are counted)
    out_items -- items drawn from the out-set (repeats are counted)
    M         -- in-set size (default: len(in_items))
    N         -- total size (default: len(in_items) + len(out_items))
    useq      -- if True, add 'qval' and 'qval_under' columns
    extra     -- if True, add size/ratio/fold columns

    Returns a table with one row per item, sorted by 'pval'.
    """
    # DEPRECATED
    # TODO: remove this function

    # tally[item] = [count in in_items, count in out_items]
    tally = util.Dict(default=[0, 0])
    for column, items in enumerate((in_items, out_items)):
        for item in items:
            tally[item][column] += 1

    if N is None:
        N = len(in_items) + len(out_items)
    if M is None:
        M = len(in_items)

    tab = tablelib.Table(
        headers=["item", "in_count", "out_count", "pval", "pval_under"])

    # hypergeometric test for each item
    for item, (n_in, n_out) in tally.iteritems():
        over = rhyper(n_in, n_in + n_out, M, N)
        under = rhyper(n_in, n_in + n_out, M, N, 1)
        tab.add(item=item, in_count=n_in, out_count=n_out,
                pval=over, pval_under=under)

    # q-values for both tails
    if useq:
        qval = qvalues(tab.cget("pval"))
        qval_under = qvalues(tab.cget("pval_under"))
        tab.add_col("qval", data=qval)
        tab.add_col("qval_under", data=qval_under)

    if extra:
        nrows = len(tab)
        tab.add_col("in_size", data=[M] * nrows)
        tab.add_col("out_size", data=[N - M] * nrows)

        item_ratios = []
        for row in tab:
            total = float(row["in_count"] + row["out_count"])
            item_ratios.append(row["in_count"] / total)
        tab.add_col("item_ratio", data=item_ratios)

        tab.add_col("size_ratio", data=[M / float(N) for _ in tab])

        folds = [row["item_ratio"] / row["size_ratio"] for row in tab]
        tab.add_col("fold", data=folds)

    tab.sort(col='pval')
    return tab
def partLookup(parts1, parts2):
    """For each part in parts1, list the parts of parts2 sharing its items.

    Returns one sorted list of parts2 indices per part of parts1.  Items
    that item2part(parts2) does not know are looked up through a util.Dict
    created with default=-1.
    """
    part_of = util.Dict(default=-1)
    part_of.update(item2part(parts2))

    return [sorted(set(part_of[item] for item in part1))
            for part1 in parts1]
def confusionMatrix(parts1, parts2):
    """Build a confusion matrix between two partitions of the same items.

    Returns (matrix, leftovers):
      matrix[i][j] -- number of shared items that are in part i of parts1
                      and part j of parts2
      leftovers    -- list of items present in only one of the partitions
    """
    matrix = util.Dict(dim=2, default=0)

    membership1 = item2part(parts1)
    membership2 = item2part(parts2)

    universe1 = set(util.flatten(parts1, 1))
    universe2 = set(util.flatten(parts2, 1))

    # only items known to both partitions contribute to the matrix
    for item in universe1.intersection(universe2):
        row = matrix[membership1[item]]
        col = membership2[item]
        row[col] += 1

    leftovers = universe1 ^ universe2
    return matrix, list(leftovers)
def __init__(self, regions1, regions2, hits, hitnames=True, style="line",
             color=(0, 0, 0), fill_color=None, trace_color=(0, 1, 1, .2),
             selfhits=True, name=None):
    """Store the two region sets, the hits between them and drawing options.

    hits     -- iterable of hits; when hitnames is True each hit is an
                iterable of region names (matched against
                region.data["ID"]), otherwise hits is stored as given
    hitnames -- if True, resolve names to region objects; hits in which
                no name resolves are dropped
    """
    self.name = name
    self.regions1 = regions1
    self.regions2 = regions2
    self.style = style
    self.color = color
    self.fill_color = fill_color
    self.trace_color = trace_color
    self.selfhits = selfhits

    if not hitnames:
        self.hits = hits
        return

    self.hits = resolved = []

    # index every region of both sets by its ID
    name2region = util.Dict(default=[])
    for region in itertools.chain(self.regions1, self.regions2):
        name2region[region.data["ID"]].append(region)

    # replace the names in each hit by the regions carrying that name
    for hit in hits:
        newhit = [region
                  for name in hit if name in name2region
                  for region in name2region[name]]
        if newhit:
            resolved.append(newhit)
def bestBidir(hits, scorefunc=bitscore):
    """Find best bidirectional hits.

    For every gene the highest scoring hit (score must exceed 0) is
    recorded, whether the gene is the query or the subject.  A hit is
    returned when its two genes are each other's best partner; each such
    pair contributes one hit.
    """
    # best[gene] = [partner, score, hit]
    best = util.Dict(default=[None, 0, None])

    for hit in hits:
        gene1 = query(hit)
        gene2 = subject(hit)
        score = scorefunc(hit)

        for gene, partner in ((gene1, gene2), (gene2, gene1)):
            if score > best[gene][1]:
                best[gene] = [partner, score, hit]

    seen = set()
    hits2 = []
    for gene, (partner, _score, hit) in best.iteritems():
        if best[partner][0] == gene and gene not in seen:
            # remember both genes so the pair is reported only once
            seen.update((gene, partner))
            hits2.append(hit)

    return hits2
def block_bbh_hits(block):
    """Return the bi-directional best hits (BBH) of a block.

    Reads block.data["hits"]; the first three fields of each hit are
    (region a, region b, value).  Regions are identified by data["ID"].
    Each mutual-best pair is reported once (from the side whose ID
    compares smaller).
    """
    # best[id] = (value, partner id, hit); unidirectional best hits
    best = util.Dict(default=[-util.INF, None, None])

    for hit in block.data["hits"]:
        region_a, region_b, val = hit[:3]
        id_a = region_a.data["ID"]
        id_b = region_b.data["ID"]

        for key, partner in ((id_a, id_b), (id_b, id_a)):
            if val > best[key][0]:
                best[key] = (val, partner, hit)

    # keep the hits whose two ends agree on each other
    return [hit
            for a, (val, b, hit) in best.iteritems()
            if best[b][1] == a and a < b]
def __init__(self, master_file=None, seq2species=lambda x: x):
    """Create an empty lookup and optionally load it from a file.

    master_file -- if given, passed to self.read()
    seq2species -- callable stored on the instance (identity by default)
    """
    self.lookup = util.Dict(default=[])
    self.seq2species = seq2species

    # identity test: only a missing argument skips loading
    if master_file is not None:
        self.read(master_file)
def visualize(mat, outfile, format="undirected", options="-Tjpg",
              param="overlap=\"false\";"):
    """Render a graph to 'outfile' by piping it through Graphviz.

    mat     -- graph, handed unchanged to writeGraphviz
    outfile -- output filename given to the Graphviz '-o' option
    format  -- "undirected" (runs neato) or "directed" (runs dot)
    options -- extra command-line options for the Graphviz program
    param   -- handed unchanged to writeGraphviz

    Raises ValueError for any other 'format'.
    """
    if format == "undirected":
        program = "neato"
    elif format == "directed":
        program = "dot"
    else:
        raise ValueError("unknown graph format '%s'" % format)

    # NOTE(review): the command is run through the shell; 'options' and
    # 'outfile' must come from a trusted source
    out = os.popen(program + " " + options + " -o " + outfile, "w")
    try:
        writeGraphviz(mat, out, format, param)
    finally:
        # closing the pipe releases the descriptor even if writing fails
        out.close()


if __name__ == "__main__":
    mat = util.Dict(dim=2)
    mat[1][2] = 1
    mat[1][3] = 1
    mat[2][4] = 1
    mat[3][4] = 1
    mat[4][5] = 1

    #writeGraphviz(mat, sys.stdout, "directed")
    visualize(mat, "out2.jpg", "directed")


class GraphViz:
    pass
def __init__(self, default=' '):
    """Start with an empty two-level util.Dict.

    default -- value handed to util.Dict as its default; also kept on
               the instance as self.default
    """
    self.default = default
    self.mat = util.Dict(dim=2, default=default)
def layout_frags(self, genome_name, chrom_name, start, end, direction=1):
    """Lay out the reference fragment and the fragments syntenic to it.

    genome_name, chrom_name -- reference chromosome (key into
                               self.chroms_lookup)
    start, end              -- reference region; the reference fragment
                               is clamped to [0, ref_chrom.end]
    direction               -- strand/direction of the reference fragment

    For every block returned by self.filter_blocks a fragment is created
    on the other genome, filled with the genes orthologous to the
    reference genes in the region, oriented by a majority vote of gene
    order, positioned in x by the median gene offset and staggered in y
    per genome.  Fragments that receive no genes are not placed.

    Side effects: adds fragments to self.frags, calls
    self.layout_frag_contents on each, and sets self.block_lookup and
    self.refLookup.
    """
    ref_chrom = self.chroms_lookup[(genome_name, chrom_name)]

    # setup genome display order
    order = {}
    for i, genome in enumerate(self.genomes):
        order[genome] = i

    # swap the genome with order 0 and the reference genome
    j = order[self.ref_genome]
    order[self.genomes[0]] = j
    order[self.ref_genome] = 0

    # init reference fragment
    ref_frag = Frag(genome=genome_name, chrom=chrom_name,
                    start=max(start, 0),
                    end=min(end, ref_chrom.end),
                    strand=direction,
                    x=max(start, 0), y=0)
    self.frags.add(ref_frag)
    self.layout_frag_contents(ref_frag)

    # find all synteny blocks in this region
    # sort blocks by appearance in ref_chrom
    blocks = list(self.filter_blocks(self.blocks, ref_chrom, start, end))

    def blocksort(a):
        # a is (block, flip); flip == 0 means region1 is the reference side
        if a[1] == 0:
            starta = a[0].region1.start
        else:
            starta = a[0].region2.start
        # the key must be returned, otherwise every block sorts as None
        return starta
    blocks.sort(key=blocksort)

    # make lookup for genes to block and block to fragment
    block_lookup = {}
    frag_lookup = {}
    for block, flip in blocks:
        if flip == 0:
            other = block.region2
        else:
            other = block.region1

        frag = Frag()
        frag.genome = other.species
        frag.chrom = other.seqname
        frag_lookup[block] = frag

        for gene2 in iter_chrom(
                self.db.get_regions(frag.genome, frag.chrom),
                other.start, other.end):
            block_lookup[gene2] = block
    self.block_lookup = block_lookup

    # find all genes that will be drawn
    # walk along ref_chrom and store drawn genes into fragments
    refLookup = {}
    for gene in iter_chrom(self.db.get_regions(genome_name, chrom_name),
                           start, end):
        for name2 in self.orth_lookup.get(gene.data["ID"], []):
            gene2 = self.db.get_region(name2)
            if gene2 in block_lookup:
                frag_lookup[block_lookup[gene2]].genes.append(gene2)
                refLookup[gene2] = gene
    self.refLookup = refLookup

    # determine fragment dimensions
    for frag in frag_lookup.itervalues():
        if len(frag.genes) == 0:
            # x = None marks a fragment that cannot be placed
            frag.x = None
            continue
        frag.genes.sort(key=lambda a: a.start)

        # set fragment start and end
        frag.start = frag.genes[0].start
        frag.end = frag.genes[-1].end

        # find fragment direction: each pair of consecutive genes votes
        # according to whether their reference genes ascend or descend
        vote = 0
        last = None
        for gene2 in frag.genes:
            pos = refLookup[gene2].start
            if last is not None and pos != last:
                if last < pos:
                    vote += 1
                else:
                    vote -= 1
            last = pos

        if vote > 0:
            frag.direction = direction
        else:
            frag.direction = -direction

        # find fragment x-coordinate
        diffs = []
        for gene2 in frag.genes:
            if direction == 1:
                offset1 = refLookup[gene2].start - ref_frag.start
            else:
                offset1 = ref_frag.end - refLookup[gene2].end

            if frag.direction == 1:
                offset2 = gene2.start - frag.start
            else:
                offset2 = frag.end - gene2.end
            diffs.append(offset2 - offset1)
        frag.x = ref_frag.x - stats.median(diffs)

    # place blocks
    fragY = util.Dict(default=-self.genome_sep)
    for block, flip in blocks:
        frag = frag_lookup[block]
        otherGenome = frag.genome

        if frag.x is None:
            # fragment could not be placed
            continue

        frag.y = fragY[otherGenome] - \
            ((order[otherGenome] - 1) * self.max_genome_sep)

        # store and layout frag
        self.frags.add(frag)
        self.layout_frag_contents(frag)

        # stagger fragments; wrap around once the genome's band is used up
        fragY[otherGenome] -= self.frag_sep
        if fragY[otherGenome] < -self.max_genome_sep:
            fragY[otherGenome] = -self.genome_sep
def mergeBuh(conf, genes, parts1, parts2, blastfiles):
    """Merge by Best Unidirectional Hits

    DISABLED: always raises AssertionError.  The body below is kept for
    reference and must be double checked before the guard is removed.

    Intended behaviour of the body: read every (blastfile, order) pair
    twice -- first to find the best score of each partition, then to
    union partitions whose hit scores reach conf["relcutoff"] times that
    best score -- and return the merged partitions.  Hits are filtered by
    conf["bitspersite"], conf["coverage"] and conf["signif"].  Genes
    missing from parts1/parts2 are appended to them as singletons.
    """
    # don't use this code without double checking it
    # (raised explicitly so the guard also holds when asserts are
    # stripped with `python -O`)
    raise AssertionError(
        "mergeBuh is disabled: double check this code before using it")

    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # best[part] = (best score, partner part)
    best = util.Dict(dim=1, default=(0, None))

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            coverage = max(alnlen1 / float(len1),
                           alnlen2 / float(len2))

            # discard a hit that does not pass basic cutoffs
            if (blast.bitscore(hit) / float(blast.alignLength(hit)) <
                    conf["bitspersite"] or
                    coverage < conf["coverage"] or
                    blast.evalue(hit) > conf["signif"]):
                continue

            # create a key for a partition: (side, index)
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            if score > best[part1][0]:
                best[part1] = (score, part2)
            if score > best[part2][0]:
                best[part2] = (score, part1)
        util.toc()
    util.toc()

    util.tic("determine clusters")
    # the dict must not be called `sets`: that name is needed for
    # sets.UnionFind (presumably a module imported at file level -- confirm)
    part_sets = {}
    for part in best:
        part_sets[part] = sets.UnionFind([part])

    for blastfile, order in blastfiles:
        util.tic("read hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            coverage = max(alnlen1 / float(len1),
                           alnlen2 / float(len2))

            # discard a hit that does not pass basic cutoffs
            if (blast.bitscore(hit) / float(blast.alignLength(hit)) <
                    conf["bitspersite"] or
                    coverage < conf["coverage"] or
                    blast.evalue(hit) > conf["signif"]):
                continue

            part1 = (0, lookup1[gene1])
            part2 = (1, lookup2[gene2])

            if score >= best[part1][0] * conf["relcutoff"]:
                part_sets[part1].union(part_sets[part2])
            if score >= best[part2][0] * conf["relcutoff"]:
                part_sets[part2].union(part_sets[part1])
        util.toc()

    clusters = util.unique([x.root() for x in part_sets.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in clusters:
        parts.append([])
        for i, row in cluster.members():
            parts[-1].extend(joining[i][row])
    util.toc()

    return parts
def mergeAvg(conf, genes, parts1, parts2, blastfiles, outblastfiles):
    """Merge two partitionings by average hit score.

    conf          -- cutoffs: "bitspersite", "coveragesmall",
                     "coveragebig", "signif"; optional "accept"
                     (container of genes; hits touching any other gene
                     are ignored)
    genes         -- genes[name]["length"] gives the gene length
    parts1/parts2 -- partitions (lists of gene lists); genes seen in hits
                     but missing from them are APPENDED as singletons
    blastfiles    -- iterable of (filename, order); order is True when
                     the query side of the file belongs to parts1
    outblastfiles -- same, for outgroup hits; order is True when the
                     query is the ingroup gene

    Each part is joined with the partner part of highest average hit
    score, provided that average exceeds the average outgroup score of
    both parts.

    Returns the merged partition as a list of gene lists.
    """
    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # value is [sum, total]
    hits = util.Dict(dim=2, default=[0, 0])

    if "accept" in conf:
        accept = conf["accept"]
    else:
        accept = False

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            coveragesmall = min(alnlen1 / float(len1),
                                alnlen2 / float(len2))
            coveragebig = max(alnlen1 / float(len1),
                              alnlen2 / float(len2))

            # discard a hit that does not pass basic cutoffs
            if (blast.bitscore(hit) / float(blast.alignLength(hit)) <
                    conf["bitspersite"] or
                    coveragesmall < conf["coveragesmall"] or
                    coveragebig < conf["coveragebig"] or
                    blast.evalue(hit) > conf["signif"]):
                continue

            if accept and \
               (gene1 not in accept or gene2 not in accept):
                continue

            # create a key for a partition: (side, index)
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # both directions share one [sum, total] accumulator
            val = hits[part1][part2]
            val[0] += score
            val[1] += 1
            hits[part2][part1] = val
        util.toc()
    util.toc()

    util.tic("read outgroup hits")
    # outbest[part] = [sum, total] of outgroup hit scores
    outbest = util.Dict(default=[0, 0])
    for blastfile, order in outblastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # only the ingroup gene of an outgroup hit is needed
            if order:
                genein = blast.query(hit)
            else:
                genein = blast.subject(hit)
            score = blast.bitscore(hit)

            # create a key for a partition: (side, index)
            if genein in lookup1:
                partin = (0, lookup1[genein])
            elif genein in lookup2:
                partin = (1, lookup2[genein])
            else:
                continue

            val = outbest[partin]
            val[0] += score
            val[1] += 1
        util.toc()
    util.toc()

    assert len(parts1) == len(unionPart(parts1))
    assert len(parts2) == len(unionPart(parts2))

    util.tic("determine clusters")
    # the dict must not be called `sets`: that name is needed for
    # sets.UnionFind (presumably a module imported at file level -- confirm)
    part_sets = {}
    for side, side_parts in enumerate((parts1, parts2)):
        for i in range(len(side_parts)):
            key = (side, i)
            part_sets[key] = sets.UnionFind([key])

    # merge top avg hits
    for part1 in hits:
        o1 = outbest[part1]
        outavg1 = float(o1[0]) / max(o1[1], 1)

        top = 0
        toppart = None

        for part2, (tot, num) in hits[part1].iteritems():
            avg = float(tot) / num
            o2 = outbest[part2]
            outavg2 = float(o2[0]) / max(o2[1], 1)

            # the pair must beat the outgroup average of both parts
            if avg > outavg1 and avg > outavg2 and avg > top:
                top = avg
                toppart = part2

        if toppart:
            part_sets[part1].union(part_sets[toppart])

    clusters = util.unique([x.root() for x in part_sets.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in clusters:
        parts.append([])
        for i, row in cluster:
            parts[-1].extend(joining[i][row])
    util.toc()

    assert len(parts) == len(unionPart(parts))

    return parts
def find_synteny(species1, species2, regions1, regions2, orths):
    """Find synteny blocks by walking the chromosomes of species1.

    species1, species2 -- species names used to fetch chromosomes from the
                          region db
    regions1, regions2 -- region lists; concatenated to build the
                          regionlib.RegionDb
    orths              -- rows whose first two fields are the IDs of an
                          orthologous gene pair

    Genes are visited in chromosome order.  A gene whose ortholog cluster
    can be appended to the current block (per can_append_orth) extends
    it; otherwise a new SyntenyBlock is started.  Genes without orthologs
    are collected as a "loss streak" and added to the block only if the
    block is later extended.

    Returns the list of SyntenyBlock objects.
    """
    # ortholog db {gene1 -> orthologs of gene1}
    orthdb = util.Dict(default=set())
    for row in orths:
        orthdb[row[0]].add(row[1])
        orthdb[row[1]].add(row[0])

    # TODO: generalize
    # inparalogs db {gene1 -> gene1a | gene1 and gene1a both have same
    # orthologs}; taken as the ortholog set of one arbitrary ortholog
    inpardb = util.Dict(default=set())
    for gene, others in orthdb.iteritems():
        inpardb[gene] = orthdb[iter(others).next()]

    # make region db
    regiondb = regionlib.RegionDb(regions1 + regions2)

    # get chromosome sets
    chroms1 = regiondb.get_chroms(species1)
    # NOTE(review): chroms2 is not used below
    chroms2 = regiondb.get_chroms(species2)

    blocks = []

    for chname1, chrom1 in chroms1.iteritems():
        # skip empty chromosomes
        if len(chrom1) == 0:
            continue

        # start a new block
        need_new_block = True
        loss_streak = []

        for i, gene1 in enumerate(chrom1):
            names2 = orthdb[gene1.data["ID"]]

            # no orthologs, start a loss streak
            if len(names2) == 0:
                #need_new_block = True
                loss_streak.append(make_orth(regiondb, [gene1.data["ID"]], []))
                continue

            # make ortholog cluster
            names1 = inpardb[gene1.data["ID"]]
            orth = make_orth(regiondb, names1, names2)

            # orthologs are not contiguous, stop block
            if not is_orth_contig(regiondb, orth):
                loss_streak = []
                need_new_block = True
                continue

            # try to add to existing block
            if not need_new_block:
                block = blocks[-1]

                # just continue if we are still in the last ortholog pair
                # i.e. gene1 is a paralog in a tandem set
                if orth == block.orths[-1]:
                    continue

                # try to append
                direction = can_append_orth(regiondb,
                                            block.orths[-1], block.dir,
                                            orth, orthdb)
                if direction == 0:
                    # cannot extend: drop pending losses and fall through
                    # to the new-block case below
                    loss_streak = []
                    need_new_block = True
                else:
                    # pending losses lie inside the block, add them first
                    for loss in loss_streak:
                        block.add_orth(loss, direction)
                    loss_streak = []
                    block.add_orth(orth, direction)

            # start a new block
            if need_new_block:
                loss_streak = []

                # the previous block is finished, finalize its regions
                if len(blocks) > 0:
                    blocks[-1].recalc_regions(regiondb)

                blocks.append(SyntenyBlock(* orth_regions(regiondb, orth)))
                blocks[-1].add_orth(orth)
                need_new_block = False

    # finalize the last block
    if len(blocks) > 0:
        blocks[-1].recalc_regions(regiondb)

    return blocks
pep_per_prop = util.hist_dict(util.mget(seqlib.AA_PROPERTY, AA)) prop_counts = util.Dict(default=0) for char in AA: prop = seqlib.AA_PROPERTY[char] tint = prop_counts[prop] / float(pep_per_prop[prop]) pep_colors[char] = prop2color(prop, tint * .5) prop_counts[prop] += 1 return pep_colors dna_colors = util.Dict( { "A": color(1, .5, .5), "T": color(1, 1, .5), "C": color(.5, 1, .5), "G": color(.5, .5, 1) }, default=color(.5, .5, .5)) pep_colors = make_pep_colors(prop2color=prop2color) def guess_seq(seq): """Guesses whether a sequence is 'dna' or 'pep'""" dna = "ACTG-N" chars = util.unique(seq.upper()) for char in chars: if char not in dna: