def run():
    """
    Assemble the fill sequence for a single gap-support directory.

    Parses the command line with parseArgs(), extracts the flanks from
    <asmdir>/input.fastq, pre-unites the supporting reads, tail-maps them
    onto the flanks, picks the seed sub-sequences and, unless the span seed
    is named "tooShortNs", builds the fill sequence.  The collected
    statistics are written to <asmdir>/fillingMetrics.json.  Unless
    args.keepTemp is set, every file recorded in ALLTEMPFILES is removed.
    """
    global ALLTEMPFILES
    args = parseArgs()

    # normpath drops a trailing separator; without it basename("dir/") is ""
    # and the seed names could not be recovered from the directory name.
    dirName = os.path.basename(os.path.normpath(args.asmdir))
    sameStrand, seeds = orderSeeds(dirName.split('_'))

    inputReads = FastqFile(os.path.join(args.asmdir, "input.fastq"))
    supportFn, flankFn = extractFlanks(inputReads, basedir=args.tempDir)

    preunitereads(supportFn, args)

    # Only the name is needed; the mapper writes the file itself.
    onFlank = NamedTemporaryFile(prefix="onFlank_", suffix=".m5",
                                 delete=False, dir=args.tempDir)
    ALLTEMPFILES.append(onFlank.name)
    onFlank.close()
    tailblasr(supportFn, flankFn, nproc=args.nproc,
              outname=onFlank.name, basedir=args.tempDir)

    data = getSubSeqs(onFlank.name, supportFn, sameStrand, seeds,
                      args.predictedGapSize, args.maxTrim, args.maxWiggle,
                      basedir=args.tempDir)

    if data.stats["spanSeedName"] != "tooShortNs":
        buildFillSeq(data, supportFn, args)

    data.stats["predictedGapSize"] = args.predictedGapSize
    metricsFn = os.path.join(args.asmdir, "fillingMetrics.json")
    with open(metricsFn, 'w') as jOut:
        jOut.write(json.dumps(data.stats, indent=4))

    if not args.keepTemp:
        logging.info("Cleaning %d temp files" % (len(ALLTEMPFILES)))
        for tempFn in ALLTEMPFILES:
            # One missing/unremovable file must not stop the rest of the
            # cleanup or hide the fact that the assembly itself finished.
            try:
                os.remove(tempFn)
            except OSError as err:
                logging.warning("Couldn't remove temp file %s (%s)"
                                % (tempFn, err))

    logging.info("Finished")
def run(argv):
    """
    Extract unaligned read tails, map them and unite them with the
    primary alignments.

    argv -- list of command line tokens handed to parseArgs().

    The alignments are read as M5 when args.m4 ends with "m5", otherwise
    as M4.  Reads must be a fasta or fastq file; any other extension logs
    an error and exits with status 1.  When no tails are found the process
    exits with status 0.  Two temporary files (tail sequences and tail
    alignments) are created in args.temp and are not removed here.
    """
    # Local import so the block does not depend on the file-level imports.
    import sys

    # Was a bare Python 2 `print argv`; keep the information but send it to
    # the debug log instead of stdout.
    logging.debug("Arguments %s" % (str(argv)))
    args = parseArgs(argv)

    if args.m4.endswith("m5"):
        aligns = M5File(args.m4)
    else:
        aligns = M4File(args.m4)

    if args.reads.endswith("fasta"):
        reads = FastaFile(args.reads)
    elif args.reads.endswith("fastq"):
        # Only the sequences are needed downstream, drop the qualities.
        fastq = FastqFile(args.reads)
        reads = dict((name, fastq[name].seq) for name in fastq)
        del fastq
    else:
        logging.error("Expected Fasta or Fastq for READS (%s)" % args.reads)
        sys.exit(1)

    logging.info("Extracting tails")
    # The handle is closed right away: only a unique file name is needed.
    tailHandle = tempfile.NamedTemporaryFile(suffix=".fasta", delete=False,
                                             dir=args.temp)
    tailHandle.close()
    tailfastq = tailHandle.name
    logging.debug("Tail read tmp file %s " % (tailfastq))
    r, t, m = extractTails(aligns, reads, outFq=tailfastq,
                           minLength=args.minTail)

    logging.info("Parsed %d reads" % (r))
    logging.info("Found %d tails" % (t))
    logging.info("%d reads had double tails" % (m))
    if t == 0:
        logging.info("No tails -- Exiting")
        sys.exit(0)

    logging.info("Mapping Tails")
    mapHandle = tempfile.NamedTemporaryFile(suffix=".m4", delete=False,
                                            dir=args.temp)
    mapHandle.close()
    tailmap = mapHandle.name
    logging.debug("Read map tmp file %s " % (tailmap))
    mapTails(tailfastq, args.ref, nproc=args.nproc, out=tailmap,
             useSa=args.noSa)

    logging.info("Consolidating alignments")
    logging.debug("Final file %s " % (args.output))
    n = uniteTails(aligns, tailmap, args.output, args.inplace)
    logging.info("%d tails mapped" % (n))
def iterAssemble(entry, myReads, nproc=4, maxLevels=None):
    """
    Repeatedly run OLCAssembly.py, feeding each level's out.fastq into the
    next level, until the expected number of consensus sequences is made.

    entry     -- object whose .haplotype is "<ref>/<alt>"; the site is
                 treated as heterozygous when the two sides differ, in which
                 case two consensus sequences are expected, otherwise one.
    myReads   -- path of the starting fastq; level directories
                 ("level0", "level1", ...) are created next to it.
    nproc     -- value passed to OLCAssembly.py --nproc (default 4, the
                 previously hard-coded value).
    maxLevels -- optional cap on the number of assembly levels; None (the
                 default) keeps iterating without a limit as before.

    Returns the path of the best fastq available: the consensus out.fastq
    on success, otherwise the input of the level that failed.
    """
    @exeLog_noFail
    def assemble(inputFq, workDir):
        return exe("OLCAssembly.py %s --nproc %d --fqOut --workDir %s"
                   % (inputFq, nproc, workDir))

    level = 0
    curReads = myReads
    ref, alt = entry.haplotype.split('/')
    isHet = ref != alt
    expected = 2 if isHet else 1

    while maxLevels is None or level < maxLevels:
        logging.info("Running assembly level %d" % (level))
        workDir = os.path.join(os.path.dirname(myReads), "level%d" % (level))
        try:
            os.mkdir(workDir)
        except OSError as err:
            # An already existing directory is fine.  Anything else is only
            # reported: the missing out.fastq check below ends the run.
            if not os.path.isdir(workDir):
                logging.warning("Couldn't create %s (%s)" % (workDir, err))

        assemble(curReads, workDir)
        outName = os.path.join(workDir, "out.fastq")
        if not os.path.exists(outName):
            logging.error("Assembly iteration didn't return consensus")
            logging.error("Manual checking is required(?)")
            logging.error("Returning the best answer we have")
            return curReads

        output = FastqFile(outName)
        nContigs = len(output)
        if nContigs == 0:
            logging.error("Couldn't assemble contigs after %d levels"
                          % (level))
            logging.error("Returning the best answer we have")
            return curReads

        if nContigs == expected:
            logging.info("Made consensus after %d levels" % (level))
            return outName

        if isHet and nContigs == 1:
            logging.warning(
                "One consensus sequence created for het at level %d"
                % (level))
            logging.warning("Manual checking is required(?)")
            logging.error("Returning the best answer we have")
            return outName

        # Too many contigs: assemble the contigs themselves next time.
        level += 1
        curReads = outName

    logging.error("No consensus after %d levels" % (level))
    logging.error("Returning the best answer we have")
    return curReads
def tailblasr(query, target, nproc=1, outname="out.m5", basedir="./"):
    """
    Try getting the read to hit each target uniquely instead of hoping that
    bestn reports all possible alignments.

    query   -- reads file handed to blasr and to m4pie
    target  -- reference file handed to blasr and to m4pie
    nproc   -- number of processes for blasr and m4pie
    outname -- output file written by m4pie (-o)
    basedir -- directory in which the primary-alignment temp file is made

    The primary (bestn=1) alignments go to a temp .m4 file that is recorded
    in ALLTEMPFILES; m4pie.run() is then called on it with MINTAIL as the
    -t value.
    """
    global ALLTEMPFILES

    # map to make the primary; only the unique name is needed, so the
    # handle is closed immediately instead of being left dangling.
    primaryHandle = NamedTemporaryFile(prefix="primary_", suffix=".m4",
                                       delete=False, dir=basedir)
    primaryHandle.close()
    primary = primaryHandle.name
    ALLTEMPFILES.append(primary)
    blasr(query, target, fmt="4", nproc=nproc, bestn=1, outname=primary)

    # build the m4pie argument list directly: formatting one string and
    # splitting it on whitespace broke paths that contain spaces.
    args = [primary, query, target,
            "-t", "%d" % MINTAIL,
            "-n", "%d" % nproc,
            "-o", outname]
    m4pie.run(args)
#!/usr/bin/env python
"""
This can be run inside of an assembly folder and create our filling sequence
into polish.out.fasta
"""
from pbsuite.utils.FileHandlers import FastqFile, M5File
from pbsuite.utils.CommandRunner import exe

if __name__ == '__main__':
    # Write every entry of input.fastq whose name starts with "ref" to
    # ref.fasta so the reads can be aligned against them.
    reads = FastqFile("input.fastq")
    with open("ref.fasta", 'w') as fout:
        for entry in reads.values():
            if entry.name.startswith("ref"):
                fout.write(">%s\n%s\n" % (entry.name, entry.seq))

    # Each step prints whatever exe() returns for the command.
    print(exe("blasr input.fastq ref.fasta --bestn 2 -m 5 "
              "--noSplitSubreads > out.m5"))
    # NOTE(review): the two helper scripts are addressed by absolute paths
    # into a personal development tree -- confirm they exist on the host
    # this is run on.
    print(exe("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
              "DevJelly/branches/consensusDev/GetSubs.py out.m5 input.fastq"))
    print(exe("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
              "DevJelly/branches/sv/pbjPolish.py "
              "reads.fastq seed.fasta -n 4 -l"))
#!/usr/bin/python
"""
This can be run inside of an assembly folder and create our filling sequence
into polish.out.fasta
"""
from pbsuite.utils.FileHandlers import FastqFile, M5File
from pbsuite.utils.CommandRunner import exe

if __name__ == '__main__':
    # Copy the "ref"-named entries of input.fastq into ref.fasta; they are
    # the alignment targets for the blasr call below.
    allReads = FastqFile("input.fastq")
    with open("ref.fasta", 'w') as refOut:
        for record in allReads.values():
            if record.name.startswith("ref"):
                refOut.write(">%s\n%s\n" % (record.name, record.seq))

    # The result of every exe() call is echoed to stdout.
    print(exe("blasr input.fastq ref.fasta --bestn 2 -m 5 "
              "--noSplitSubreads > out.m5"))
    # NOTE(review): both scripts below live under a hard-coded absolute
    # development path -- verify it is valid where this script is used.
    print(exe("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
              "DevJelly/branches/consensusDev/GetSubs.py out.m5 input.fastq"))
    print(exe("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
              "DevJelly/branches/sv/pbjPolish.py "
              "reads.fastq seed.fasta -n 4 -l"))
aligns = M5File(alignFile) fout = open(consensusFile, 'w') results = consensus(aligns) fout.write(">pbjpolish_%d_vote_%d_len\n%s\n" % (results.contribBases,\ results.fillBases, results.sequence)) #fout.write(">%s\n%s\n" % consensus(aligns)) fout.close() elif args.super:#All tempfile = open("temp.fasta",'w') if args.reads.endswith(".fasta"): seqs = FastaFile(args.reads) #temp flie for s in seqs: tempfile.write(">%s\n%s\n" % (s, seqs[s])) elif args.reads.endswith(".fastq"): seqs = FastqFile(args.reads) #temp file for s in seqs: tempfile.write(">%s\n%s\n" % (s, seqs[s].seq)) blasr(args.reads, tempfile.name, nproc=args.nproc, bestn=len(seqs), outName=alignFile) aligns = M5File(alignFile) groups = defaultdict(list) for a in aligns: groups[a.tname].append(a) fout = open(consensusFile, 'w') for g in groups: results = consensus(aligns) fout.write(">pbjpolish_%d_vote_%d_len\n" % (results.contribBases,\ results.fillBases, results.sequence)) fout.close()
def _bestAlignmentPerKey(alignments, keyOf):
    """
    Return {key: alignment} holding, for every key, the alignment with the
    lowest score; only scores below 0 are ever kept and ties keep the
    first alignment seen.
    """
    best = {}
    for aln in alignments:
        key = keyOf(aln)
        score = best[key].score if key in best else 0
        if aln.score < score:
            best[key] = aln
    return best


def preunitereads(inputFastq, args):
    """
    sent query, I'm going to pop all of the united reads onto this

    Aligns the reads of inputFastq against themselves, keeps alignments in
    which one read extends the other to the left or right, reduces them to
    the best alignment per query, per target and per read pair, and appends
    one united read per remaining alignment to inputFastq (quality "!").

    inputFastq -- fastq of reads; united reads are appended to this file
    args       -- needs .tempDir (temp file directory) and .nproc

    Both temp files (self-alignments and the fasta copy of the reads) are
    recorded in ALLTEMPFILES.
    """
    global ALLTEMPFILES

    alignHandle = NamedTemporaryFile(prefix="uni_", suffix=".m5",
                                     delete=False, dir=args.tempDir)
    alignHandle.close()
    alignFile = alignHandle.name
    ALLTEMPFILES.append(alignFile)

    readHandle = NamedTemporaryFile(prefix="uni_", suffix=".fasta",
                                    delete=False, dir=args.tempDir)
    ALLTEMPFILES.append(readHandle.name)

    # Parsed once and reused below; the file is not modified until the
    # united reads are appended at the very end.
    reads = FastqFile(inputFastq)
    for name in reads:
        readHandle.write(">%s\n%s\n" % (reads[name].name, reads[name].seq))
    readHandle.close()
    readFile = readHandle.name

    blasr(readFile, readFile, bestn=5, nCandidates=20, nproc=args.nproc,
          outname=alignFile)
    aligns = M5File(alignFile)

    con = AlignmentConnector()
    extenders = []
    for a in aligns:
        if a.tname == a.qname:
            continue
        # Skip alignments covering less than 500 bases of either read.
        # This used to be written start - end, which is never >= 500, so
        # every alignment was dropped and nothing was ever united.
        if a.qend - a.qstart < 500 or a.tend - a.tstart < 500:
            continue
        sup = con.extendsTarget(a, minCovers=500)
        a.support = sup
        if sup in [SUPPORTFLAGS.left, SUPPORTFLAGS.right]:
            extenders.append(a)

    # best of queries, then best of targets, then best of both
    bestQuery = _bestAlignmentPerKey(extenders, lambda aln: aln.qname)
    bestTarget = _bestAlignmentPerKey(bestQuery.values(),
                                      lambda aln: aln.tname)
    bestPair = _bestAlignmentPerKey(
        bestTarget.values(),
        lambda aln: "".join(sorted([aln.qname, aln.tname])))

    count = 0
    with open(inputFastq, 'a') as fout:
        for aln in bestPair.values():
            qseq = None
            # NOTE(review): translate(revComp) maps characters one-to-one;
            # whether the piece should also be reversed depends on how
            # revComp and the target coordinates are defined -- confirm.
            if aln.support == SUPPORTFLAGS.left:
                tail = reads[aln.tname].seq[aln.tend:]
                if aln.qstrand == '0':
                    qseq = reads[aln.qname].seq + tail
                elif aln.qstrand == '1':
                    qseq = reads[aln.qname].seq + tail.translate(revComp)
            if aln.support == SUPPORTFLAGS.right:
                head = reads[aln.tname].seq[:aln.tstart]
                if aln.qstrand == '0':
                    qseq = head + reads[aln.qname].seq
                elif aln.qstrand == '1':
                    qseq = head.translate(revComp) + reads[aln.qname].seq
            if qseq is not None:
                count += 1
                fout.write("@%s_%s\n%s\n+\n%s\n"
                           % (aln.qname, aln.tname, qseq, "!" * len(qseq)))
    logging.info("Preunited %d reads" % (count))
def getSubSeqs(alignmentFile, readsFile, sameStrand, seeds, predictedGapSize,
               maxTrim, maxWiggle, basedir="./"):
    """
    Finds the seqs that align to the flanks the best and writes the best
    spanning / flank-extending sub-sequences as seed fasta files.

    alignmentFile    -- M4 alignments of the reads onto the flanks
    readsFile        -- fastq holding the aligned reads
    sameStrand       -- when true, a spanning read must hit both flanks on
                        the same target strand
    seeds            -- (seed1, seed2) flank names
    predictedGapSize -- expected gap size or None; spans more than maxWiggle
                        smaller than it are rejected
    maxTrim          -- maxFlank handed to extendsTarget for two-hit reads
    maxWiggle        -- allowed shortfall against predictedGapSize
    basedir          -- directory for the seed temp files

    Returns a SubInfo namedtuple (stats, spanSeed, flank1Seed, flank2Seed);
    each seed is the path of a temp fasta (recorded in ALLTEMPFILES) or None.

    Original author notes: Might have a problem with my best read not going
    off the edge fully so I put the maxFlank at 20.  I should do more strand
    correction here.
    """
    global ALLTEMPFILES

    def singleExtendLookup(sup, a):
        """
        For getting how a single read extends a single flank.  Updates the
        extendF1*/extendF2* stats and returns the extending sub-sequence
        when the read is the best-scoring extender seen so far, else None.
        """
        if sup == SUPPORTFLAGS.none:
            return None
        #set the coordinates of the extending sequence
        logging.debug(sup)
        logging.debug(a.qname)
        mystart = None
        myend = None
        if a.tname.endswith("e5") and sup in [SUPPORTFLAGS.left,
                                              SUPPORTFLAGS.span]:
            if a.tstrand == '0':
                mystart = 0
                myend = a.qstart
            else:
                mystart = a.qend
                myend = a.qseqlength
        elif a.tname.endswith("e3") and sup in [SUPPORTFLAGS.right,
                                                SUPPORTFLAGS.span]:
            if a.tstrand == '0':
                mystart = a.qend
                myend = a.qseqlength
            else:
                mystart = 0
                myend = a.qstart
        if mystart is None or myend is None or mystart < 0 \
                or myend > a.qseqlength:
            return None

        #what flank and is it the best
        # NOTE(review): the avgExt*Bases sums add a.qstart rather than
        # myend - mystart -- confirm that is the intended measure.
        # NOTE(review): the returns sit inside the score checks so only a
        # new best read replaces the seed; this nesting was reconstructed
        # from flattened source -- confirm against upstream.
        if a.tname.replace('/', '.') == stats["seed1"]:
            stats["extendF1Count"] += 1
            stats["avgExtF1Bases"] += a.qstart
            stats["support"][1].append(sup)
            if a.score < stats["extendF1SeedScore"]:
                stats["extendF1SeedScore"] = a.score
                stats["extendF1SeedName"] = a.qname
                stats["extendF1SeedStart"] = mystart
                stats["extendF1SeedEnd"] = myend
                stats["extendF1SeedStrand"] = a.tstrand
                return reads[a.qname].subSeq(mystart, myend)
        elif a.tname.replace('/', '.') == stats["seed2"]:
            stats["extendF2Count"] += 1
            stats["avgExtF2Bases"] += a.qstart
            stats["support"][2].append(sup)
            if a.score < stats["extendF2SeedScore"]:
                stats["extendF2SeedScore"] = a.score
                stats["extendF2SeedName"] = a.qname
                stats["extendF2SeedStart"] = mystart
                stats["extendF2SeedEnd"] = myend
                stats["extendF2SeedStrand"] = a.tstrand
                return reads[a.qname].subSeq(mystart, myend)
        return None

    connector = AlignmentConnector()
    #no need to connect with the tailmap: just group the hits by read
    aligns = defaultdict(list)
    for a in M4File(alignmentFile):
        aligns[a.qname].append(a)

    aligns = aligns.values()
    reads = FastqFile(readsFile)

    stats = createStats()
    stats["seed1"], stats["seed2"] = seeds
    stats["sameStrand"] = sameStrand

    bestSpan = None
    bestF1E = None
    bestF2E = None
    for readGroup in aligns:
        # Remember the read name now: the filter below may empty the group.
        groupName = readGroup[0].qname

        if len(readGroup) > 2:
            # NOTE(review): because of the ">= 2" checks this keeps at most
            # ONE alignment (and only those with a score below 0); if the
            # two best hits were intended the checks should be "> 2".
            best = 0
            worst = 0
            keep = []
            for i in readGroup:
                if i.score < best:
                    keep.insert(0, i)
                    if len(keep) >= 2:
                        keep.pop()
                    best = i.score
                elif i.score < worst:
                    keep.insert(1, i)
                    if len(keep) >= 2:
                        keep.pop()
                    worst = i.score
            readGroup = keep

        if len(readGroup) == 2:
            #make sure that the two hits aren't hitting the same target
            if readGroup[0].tname == readGroup[1].tname:
                if readGroup[0].score <= readGroup[1].score:
                    del (readGroup[1])
                else:
                    del (readGroup[0])

        #hit on each flank
        if len(readGroup) == 2:
            r1, r2 = readGroup
            # NOTE(review): compared without the '/' -> '.' replacement
            # that singleExtendLookup applies to tname -- confirm.
            if r1.tname == stats["seed2"]:
                r1, r2 = r2, r1
            a = connector.extendsTarget(r1, maxFlank=maxTrim, minCovers=0)
            logging.debug(a)
            #Also check appropriate orientation
            if r1.tname.endswith('e3'):
                if a not in [SUPPORTFLAGS.right, SUPPORTFLAGS.span]:
                    logging.debug('reset a')
                    a = SUPPORTFLAGS.none
            elif r1.tname.endswith('e5'):
                if a not in [SUPPORTFLAGS.left, SUPPORTFLAGS.span]:
                    logging.debug('reset a')
                    a = SUPPORTFLAGS.none

            b = connector.extendsTarget(r2, maxFlank=maxTrim, minCovers=0)
            if r2.tname.endswith('e3'):
                if b not in [SUPPORTFLAGS.right, SUPPORTFLAGS.span]:
                    logging.debug('reset b')
                    b = SUPPORTFLAGS.none
            elif r2.tname.endswith('e5'):
                if b not in [SUPPORTFLAGS.left, SUPPORTFLAGS.span]:
                    logging.debug('reset b')
                    b = SUPPORTFLAGS.none
        elif len(readGroup) == 1:
            r1 = readGroup[0]
            r2 = None
            a = connector.extendsTarget(r1, maxFlank=10)
            b = SUPPORTFLAGS.none
            if r1.tname == stats["seed2"]:
                r1, r2 = r2, r1
                a, b = b, a
        else:
            # Only reachable with an empty group.  The old code indexed
            # readGroup[0] here (IndexError) and would otherwise have gone
            # on with a/b/r1/r2 left over from the previous read.
            logging.warning("read %s gave too many alignments" % (groupName))
            continue

        #it extends both flanks
        if a != SUPPORTFLAGS.none and b != SUPPORTFLAGS.none:
            logging.debug("%s spans" % r1.qname)
            logging.debug("aflag %d bflag %d" % (a, b))
            logging.debug("hit1- %s (%d, %d)" % (r1.tname, r1.qstart,
                                                 r1.qend))
            logging.debug("hit2- %s (%d, %d)" % (r2.tname, r2.qstart,
                                                 r2.qend))

            # The fill is the stretch of the read between the two hits.
            rStart = min(r1.qend, r2.qend)
            rEnd = max(r1.qstart, r2.qstart)
            sz = rEnd - rStart
            tooShort = False
            if sz < 50:
                logging.info("fill seq is too short to call consensus")
                tooShort = True
                # only used by the disabled early exit further down
                tooShortSeq = reads[r1.qname].subSeq(rStart, rEnd)
            if predictedGapSize is not None \
                    and (predictedGapSize - sz) > maxWiggle:
                logging.info(
                    "fill seq size %d is smaller than allowed predicted gap size wiggle %d"
                    % (sz, maxWiggle))
                continue

            #Need to ensure that it's extending in the correct orientation
            #need to ensure that everything is on the right strand
            if sameStrand and r1.tstrand != r2.tstrand:
                logging.debug("bad strandedness")
                continue

            #check for negative gaps
            stats["spanCount"] += 1
            stats["avgSpanBases"] += rEnd - rStart
            stats["support"][0].append(SUPPORTFLAGS.span)

            #is it the best spanner
            score = r1.score + r2.score
            if score < stats["spanSeedScore"]:
                logging.debug("scoring %s %s" % (r1.qname, r2.qname))
                stats["spanSeedScore"] = score
                stats["spanSeedStrand1"] = r1.tstrand
                bestSpan = reads[r1.qname].subSeq(rStart, rEnd)
                stats["spanSeedName"] = r1.qname
                stats["spanSeedStart"] = rStart
                stats["spanSeedEnd"] = rEnd
                stats["spanSeedStrand2"] = r2.tstrand
                stats["spanShort"] = tooShort
                if r1.tname.endswith('e5'):
                    stats["seed1Trim"] = r1.tstart
                    logging.debug('trim1 %d' % (r1.tstart))
                else:
                    stats["seed1Trim"] = r1.tseqlength - r1.tend
                    logging.debug('trim1else %d' % (r1.tseqlength - r1.tend))

                if r2.tname.endswith('e5'):
                    stats["seed2Trim"] = r2.tstart
                    logging.debug('trim2 %d' % (r2.tstart))
                else:
                    stats["seed2Trim"] = r2.tseqlength - r2.tend
                    logging.debug('trimelse %d' % (r2.tseqlength - r2.tend))

        # Every read is also scored as a single-flank extender.
        c = singleExtendLookup(a, r1)
        if c is not None:
            bestF1E = c
        c = singleExtendLookup(b, r2)
        if c is not None:
            bestF2E = c

    logging.info("%d reads span" % stats["spanCount"])
    logging.info("%d reads extend flank 1" % stats["extendF1Count"])
    logging.info("%d reads extend flank 2" % stats["extendF2Count"])

    nt = namedtuple("SubInfo", "stats spanSeed flank1Seed flank2Seed")

    #seeds out files
    ssfout = None
    f1sfout = None
    f2sfout = None

    # Disabled early exit, kept for reference.  Author's note:
    # This is when I would say "oh, i'm too short - and stop early. Now,
    # I'm still going to try to write the short stuff and treat it like
    # anything else. It'll be up to later processes to catch this guy.
    #if stats["spanCount"] != 0 and stats["spanShort"]:
    #    logging.info("estimated fill len %d" % (stats["avgSpanBases"]))
    #    logging.debug("but I'm too short")
    #    stats["fillSeq"] = tooShortSeq
    #    stats["spanSeedScore"] = -500
    #    stats["spanSeedStrand1"] = '0'
    #    stats["spanSeedStrand2"] = '0'
    #    #stats["spanSeedName"] = "tooShortNs"
    #    ret = nt(stats, None, None, None)
    #    return ret

    if stats["spanCount"] > 0:
        stats["avgSpanBases"] = stats["avgSpanBases"] / stats["spanCount"]
        logging.info("estimated fill len %d" % (stats["avgSpanBases"]))
        #write seed
        if len(bestSpan.seq) < 50:
            logging.warning(
                "fill sequence is small (%dbp) can't call consensus"
                % (len(bestSpan.seq)))
            #I don't know what to return here

        ssfout = NamedTemporaryFile(prefix="span_", suffix=".fasta",
                                    delete=False, dir=basedir)
        ALLTEMPFILES.append(ssfout.name)
        logging.debug("spanning with %s" % (bestSpan.name))
        ssfout.write(">%s\n%s\n" % (bestSpan.name, bestSpan.seq))
        ssfout.close()
        ssfout = ssfout.name

    if bestF1E is not None:
        stats["avgExtF1Bases"] = \
            stats["avgExtF1Bases"] / stats["extendF1Count"]
        logging.info("estimated flank 1 extend len %d"
                     % (stats["avgExtF1Bases"]))
        #write seed
        if len(bestF1E.seq) < 50:
            logging.warning(
                "f1e sequence is small (%dbp) can't call consensus"
                % (len(bestF1E.seq)))
            #I don't know what to return here
        f1sfout = NamedTemporaryFile(prefix="flank1_", suffix=".fasta",
                                     delete=False, dir=basedir)
        ALLTEMPFILES.append(f1sfout.name)
        f1sfout.write(">%s\n%s\n" % (bestF1E.name, bestF1E.seq))
        f1sfout.close()
        f1sfout = f1sfout.name

    if bestF2E is not None:
        stats["avgExtF2Bases"] = \
            stats["avgExtF2Bases"] / stats["extendF2Count"]
        logging.info("estimated flank 2 extend len %d"
                     % (stats["avgExtF2Bases"]))
        #write seed
        if len(bestF2E.seq) < 50:
            logging.warning(
                "f2e sequence is small (%dbp) can't call consensus"
                % (len(bestF2E.seq)))
            #I don't know what to return here
        f2sfout = NamedTemporaryFile(prefix="flank2", suffix=".fasta",
                                     delete=False, dir=basedir)
        ALLTEMPFILES.append(f2sfout.name)
        f2sfout.write(">%s\n%s\n" % (bestF2E.name, bestF2E.seq))
        f2sfout.close()
        f2sfout = f2sfout.name

    #all of the info I need to return... refactor later and create useful
    #objects
    ret = nt(stats, ssfout, f1sfout, f2sfout)
    #seeds writing
    return ret
#!/usr/bin/python
import argparse, json
from pbsuite.jelly.Jelly import JellyProtocol
from pbsuite.utils.FileHandlers import FastaFile, FastqFile
from pbsuite.utils.summarizeAssembly import getStats

USAGE = """Get statistics on fasta/fastq sequences recorded in a Protocol.xml"""

if __name__ == '__main__':
    # Single positional argument: the Protocol.xml listing the inputs.
    cli = argparse.ArgumentParser(
        description=USAGE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    cli.add_argument("xml", metavar="XML", type=str,
                     help="Protocol.xml with inputs listed")
    options = cli.parse_args()

    # Collect the length of every sequence of every fasta/fastq input;
    # inputs with any other extension contribute nothing.
    lengths = []
    for path in JellyProtocol(options.xml).inputs:
        if path.endswith(".fasta"):
            lengths.extend(len(seq) for seq in FastaFile(path).values())
        if path.endswith(".fastq"):
            lengths.extend(len(rec.seq) for rec in FastqFile(path).values())

    # Label and JSON report separated by a single space on one print.
    report = json.dumps(getStats(lengths), indent=4)
    print("%s %s" % ("Read Stats", report))
import sys
from pbsuite.utils.FileHandlers import FastqFile, M5File
from pbsuite.jelly.Support import AlignmentConnector, SUPPORTFLAGS
""" Need to do work here """
# Usage: <script> ALIGNMENTS.m5 READS.fastq
# Counts and reports the reads whose two alignments both support a target.
if __name__ == '__main__':
    connector = AlignmentConnector()
    # Alignments come back grouped; each group is indexed and unpacked as a
    # sequence of hits below.
    aligns = connector.parseAlignments(M5File(sys.argv[1]))
    reads = FastqFile(sys.argv[2])
    # NOTE(review): bestScore and best are initialised but never used in
    # the part of the script visible here.
    bestScore = None
    best = None
    # NOTE(review): reads.fastq is opened for writing but nothing visible
    # writes to or closes it -- the script looks unfinished or cut off.
    fout = open("reads.fastq",'w')
    spanCount = 0
    for readGroup in aligns:
        # Skip groups whose query name starts with "ref".
        if readGroup[0].qname.startswith("ref"):
            continue
        # Only reads with exactly two hits are considered.
        if len(readGroup) == 2:
            r1, r2 = readGroup
            a = connector.extendsTarget(r1)
            b = connector.extendsTarget(r2)
            # Both hits must report some support for the read to count.
            if a != SUPPORTFLAGS.none and b != SUPPORTFLAGS.none:
                spanCount += 1
                print r1.qname, "spans"
                # Stretch of the read between the two hits.
                rStart = min(r1.qend, r2.qend)
                rEnd = max(r1.qstart, r2.qstart)
                t = reads[r1.qname].subSeq(rStart, rEnd)
yield Sequence(name, seq, qul) if __name__ == '__main__': parser = argparse.ArgumentParser(description=USAGE) parser.add_argument("filtered_subreads", type=str, \ help="Fastq of single pass reads") parser.add_argument("ccs_reads", type=str, \ help="Fastq of ccs reads") parser.add_argument("-o", "--output", type=str, default=None, \ help="Output fastq file (STDOUT)") args = parser.parse_args() ccs = FastqFile(args.ccs_reads) cKeys = ccs.keys() if args.output != None: output = open(args.output,'w') else: output = sys.stdout #name: numBases ccsReads = defaultdict(int) subReads = {} ccsPases = defaultdict(int) #sub = FastqFile(args.filtered_subreads) for read in fastqIterator(args.filtered_subreads):