def rmdup(prog, opts, wd, hitDict, organelle, nucleus):
    """Detect duplicated nuclear hits and report hit/duplicate totals.

    Flanking sequences are written out by extractFlankSeq, indexed with
    lastdb and aligned against their own index with lastal.  formatResult
    turns the alignment output into a collection of duplicates; when
    opts.segdup_db is set, the result of checkSegDup is merged into it.
    The merged collection is finally handed to filterDups together with
    hitDict.

    Args:
        prog: prefix used for the verbose progress messages.
        opts: options object; the attributes last, verbose, flank_len,
            dup_coverage and segdup_db are read.
        wd: directory prefix for the intermediate files flankSeq.fa,
            flankRev.fa and flankIndex.
        hitDict: mapping whose values support len().
        organelle: not used in this function.
        nucleus: passed through to extractFlankSeq.

    Returns:
        A tuple (hitCounts, dupDict, dupCounts).  When extractFlankSeq
        returns a false value the tuple is (total hits, {}, 0) and no
        external command is run.
    """
    # shell command templates for the LAST executables under opts.last
    lastdb_cmd = opts.last + "/lastdb %s %s %s"
    lastal_cmd = opts.last + "/lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    lastex_cmd = opts.last + "/lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # intermediate files: flanking sequences (both files) and their index
    flank_fwd = wd + "/flankSeq.fa"
    flank_rev = wd + "/flankRev.fa"
    flank_index = wd + "/flankIndex"

    if opts.verbose:
        print(prog + ": filter nuclear duplications")
        print(prog + ": compare flanking sequence similarities")

    wrote = extractFlankSeq(flank_fwd, flank_rev, opts.flank_len, hitDict, nucleus)
    if not wrote:
        # nothing was written: report the plain hit total and no duplicates
        total_hits = sum(len(hits) for hits in hitDict.values())
        return total_hits, {}, 0

    # index the flanking sequences and derive the score threshold
    commands.getoutput(lastdb_cmd % ("-c", flank_index, flank_fwd))
    threshold = commands.getoutput(lastex_cmd % ('1e-25', flank_index, flank_index))
    threshold, evalue = evalueSimulation(lastex_cmd, lastal_cmd, flank_index,
                                         flank_index, flank_rev, threshold,
                                         '1e-25')

    # align the flanking sequences against their own index
    alignment = commands.getoutput(lastal_cmd % (threshold, flank_index, flank_fwd))
    duplicates = formatResult(alignment, opts.dup_coverage)

    # crosscheck with segmental duplication database
    if opts.segdup_db:
        if opts.verbose:
            print(prog + ": crosscheck with segmental duplication database")
        seg_dups = checkSegDup(opts.segdup_db, hitDict, opts.dup_coverage)
        duplicates = duplicates.union(seg_dups)

    # remove duplicate hits; the hit total is deliberately taken after
    # filterDups has run (presumably it prunes hitDict in place -- confirm)
    dup_hits = filterDups(duplicates, hitDict)
    total_hits = sum(len(hits) for hits in hitDict.values())
    total_dups = sum(len(hits) for hits in dup_hits.values())
    return total_hits, dup_hits, total_dups
def checkGenomes(prog, coord, ref, labels, gens, spes, spesV, opts, wd):
    """Align bait sequences from the reference against the other genomes.

    Bait sequences are extracted from the reference genome with
    extractBaitSeq, then every remaining entry of spes is indexed with
    lastdb and searched with lastal; each alignment output is handed to
    formatResult together with the species position and the result dict.

    Args:
        prog: prefix used for the verbose progress messages.
        coord: mapping of chromosome name to a sequence of records whose
            first two items are comma-separated integer strings.
        ref: value looked up among the values of spes to find the
            reference entry; IndexError is raised when none matches.
        labels: not used in this function.
        gens: mapping from the keys of spes to genome file paths.
        spes: mapping of species key to name.  The reference entry is
            removed from it in place.
        spesV: mapping of species key to the name printed when verbose.
        opts: options object; the attributes last and verbose are read.
        wd: directory prefix for the intermediate files.

    Returns:
        A dict keyed by "chrom:begins:ends".  Each value is a list holding
        one dict per non-reference species (in sorted key order) followed
        by the summed region length plus 400.
    """
    # shell command templates for the LAST executables under opts.last
    lastdb = opts.last + "/lastdb %s %s %s"
    lastal = opts.last + "/lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    lastex = opts.last + "/lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # intermediate files: bait index, hit plus flanking sequences
    bait_freq = wd + "/bBaseFreq"
    bait_fwd = wd + "/baitSeq.fa"
    bait_rev = wd + "/baitRev.fa"

    # locate the reference entry (IndexError if no value of spes equals ref)
    ref_key = [k for k, name in spes.items() if name == ref][0]
    extractBaitSeq(bait_fwd, bait_rev, gens[ref_key], coord)
    commands.getoutput(lastdb % ("-x", bait_freq, bait_fwd))

    # the reference is dropped from the caller's dict; the rest is surveyed
    spes.pop(ref_key)
    others = sorted(spes.keys())

    homolog = {}
    for chrom, regions in coord.items():
        for region in regions:
            begin_field = region[0]
            end_field = region[1]
            key = ":".join([chrom, begin_field, end_field])
            begins = [int(tok) for tok in begin_field.split(",")]
            ends = [int(tok) for tok in end_field.split(",")]
            # indexed by the begin list on purpose: a shorter end list
            # raises IndexError exactly as before
            span = 0
            for n in range(len(begins)):
                span += ends[n] - begins[n]
            slots = homolog.setdefault(key, [{} for _ in others])
            # 400 presumably accounts for the flanking sequence -- TODO confirm
            slots.append(span + 400)

    for idx, sp in enumerate(others):
        if opts.verbose:
            print(prog + ": ...... surveying: " + spesV[sp])
        genome_index = wd + "/" + spes[sp] + "Index"
        commands.getoutput(lastdb % ("-c", genome_index, gens[sp]))
        score = commands.getoutput(lastex % ('1e-10', genome_index, bait_freq))
        # NOTE(review): the lastdb template is passed as first argument here,
        # whereas rmdup passes the lastex template to evalueSimulation --
        # confirm which one evalueSimulation expects.
        score, _evalue = evalueSimulation(lastdb, lastal, genome_index,
                                          bait_freq, bait_rev, score, '1e-10')
        alignment = commands.getoutput(lastal % (score, genome_index, bait_fwd))
        formatResult(idx, alignment, homolog)

    return homolog
def checkGenomes(prog, coord, ref, labels, gens, spes, spesV, opts, wd):
    """Survey every non-reference genome for homologs of the bait regions.

    The reference genome supplies the bait sequences (extractBaitSeq);
    each other entry of spes is indexed with lastdb, a score threshold is
    obtained from lastex and evalueSimulation, and the lastal output is
    passed to formatResult along with the species position and the
    result dict.

    Args:
        prog: prefix used for the verbose progress messages.
        coord: mapping of chromosome name to a sequence of records whose
            first two items are comma-separated integer strings.
        ref: value searched for among the values of spes; IndexError is
            raised when it is absent.
        labels: not used in this function.
        gens: mapping from the keys of spes to genome file paths.
        spes: mapping of species key to name; the reference entry is
            popped from it in place.
        spesV: mapping of species key to the name printed when verbose.
        opts: options object; the attributes last and verbose are read.
        wd: directory prefix for the intermediate files.

    Returns:
        A dict keyed by "chrom:begins:ends" whose values are lists of one
        dict per non-reference species (sorted by key) followed by the
        summed region length plus 400.

    NOTE(review): an identical checkGenomes definition directly precedes
    this one in the visible source; if both live in the same module this
    later definition is the one in effect -- confirm and drop the duplicate.
    """
    # command line format strings
    tool_dir = opts.last
    lastdb = tool_dir + "/lastdb %s %s %s"
    lastal = tool_dir + "/lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    lastex = tool_dir + "/lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # index file name, then the hit and its flanking sequences
    baseFreqIndex = wd + "/bBaseFreq"
    baitFile = wd + "/baitSeq.fa"
    baitRevFile = wd + "/baitRev.fa"

    # first key of spes whose value equals ref (IndexError when absent)
    matches = [k for k in spes.keys() if spes[k] == ref]
    refKey = matches[0]
    extractBaitSeq(baitFile, baitRevFile, gens[refKey], coord)
    commands.getoutput(lastdb % ("-x", baseFreqIndex, baitFile))

    # remove the reference from the caller's dict and order what is left
    spes.pop(refKey)
    speciesKeys = sorted(spes.keys())

    homolog = {}
    for chrom in coord.keys():
        for rec in coord[chrom]:
            key = ":".join([chrom, rec[0], rec[1]])
            begs = [int(field) for field in rec[0].split(",")]
            ends = [int(field) for field in rec[1].split(",")]
            # iterate over the begin positions; a shorter end list still
            # raises IndexError
            length = sum([ends[pos] - begs[pos] for pos in range(len(begs))])
            homolog.setdefault(key, [{} for _ in speciesKeys])
            # 400 presumably covers the flanking sequence -- TODO confirm
            homolog[key].append(length + 400)

    position = 0
    for spKey in speciesKeys:
        if opts.verbose:
            print(prog + ": ...... surveying: " + spesV[spKey])
        gIndex = wd + "/" + spes[spKey] + "Index"
        commands.getoutput(lastdb % ("-c", gIndex, gens[spKey]))
        score = commands.getoutput(lastex % ("1e-10", gIndex, baseFreqIndex))
        # NOTE(review): lastdb (not lastex) is handed to evalueSimulation
        # here, unlike the call in rmdup -- confirm this is intended.
        score, _unused = evalueSimulation(lastdb, lastal, gIndex,
                                          baseFreqIndex, baitRevFile,
                                          score, "1e-10")
        result = commands.getoutput(lastal % (score, gIndex, baitFile))
        formatResult(position, result, homolog)
        position += 1

    return homolog