def run(argv):
    """
    Command line driver: extract read tails, map them to the reference and
    merge the tail hits with the original alignments.

    argv -- argument list handed to parseArgs. The parsed result must
            provide m4, reads, temp, minTail, ref, nproc, noSa, output
            and inplace.

    Exits with status 1 when the reads file is neither fasta nor fastq,
    and with status 0 when no tails were found.
    """
    print(argv)
    args = parseArgs(argv)

    def reserveTemp(suffix):
        """Create a closed, persistent temp file in args.temp; return its path"""
        handle = tempfile.NamedTemporaryFile(suffix=suffix,
                                             delete=False,
                                             dir=args.temp)
        handle.close()
        return handle.name

    # Anything that doesn't end with "m5" is read as m4
    alignParser = M5File if args.m4.endswith("m5") else M4File
    aligns = alignParser(args.m4)

    readsFn = args.reads
    if readsFn.endswith("fasta"):
        reads = FastaFile(readsFn)
    elif readsFn.endswith("fastq"):
        # Only the sequences are kept from a fastq
        fastq = FastqFile(readsFn)
        reads = dict((name, fastq[name].seq) for name in fastq)
        del fastq
    else:
        logging.error("Expected Fasta or Fastq for READS (%s)" % args.reads)
        exit(1)

    logging.info("Extracting tails")
    tailfastq = reserveTemp(".fasta")
    logging.debug("Tail read tmp file %s " % (tailfastq))
    nReads, nTails, nDouble = extractTails(aligns,
                                           reads,
                                           outFq=tailfastq,
                                           minLength=args.minTail)
    logging.info("Parsed %d reads" % (nReads))
    logging.info("Found %d tails" % (nTails))
    logging.info("%d reads had double tails" % (nDouble))
    if nTails == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    logging.info("Mapping Tails")
    tailmap = reserveTemp(".m4")
    logging.debug("Read map tmp file %s " % (tailmap))
    mapTails(tailfastq,
             args.ref,
             nproc=args.nproc,
             out=tailmap,
             useSa=args.noSa)

    logging.info("Consolidating alignments")
    logging.debug("Final file %s " % (args.output))
    nMapped = uniteTails(aligns, tailmap, args.output, args.inplace)
    logging.info("%d tails mapped" % (nMapped))
def collectStats(stats, input):
    """
    Accumulate per-read, per-subread and per-hit alignment statistics from
    an m4/m5 alignment file into stats.

    stats -- mapping updated in place. The keys "read", "aln_read",
             "subread", "aln_subread", "hit", "hitsim", "tail" and
             "aln_tail" must hold lists; "unmappedTail" must hold a number.
    input -- alignment file name; must end with ".m4" or ".m5".

    Hits are grouped two ways: by full query name (subread) and by the
    second '/'-separated field of the query name (read).

    Raises ValueError when input has an unrecognized extension.
    """
    if input.endswith(".m4"):
        parser = M4File(input)
    elif input.endswith(".m5"):
        parser = M5File(input)
    else:
        # Fail here with a clear message instead of an UnboundLocalError
        # on the first use of parser further down.
        raise ValueError(
            "Unrecognized File Type (expecting .m4 or .m5): %s" % input)

    reads = defaultdict(list)
    subreads = defaultdict(list)
    for line in parser:
        name = line.qname.split('/')[1]
        reads[name].append(line)
        subreads[line.qname].append(line)

    for hits in reads.values():
        # Keyed by subread name so a subread with several hits only
        # contributes its length once to the read total.
        subLengths = {}
        alignedLength = 0
        for subr in hits:
            subLengths[subr.qname] = subr.qseqlength
            alignedLength += subr.qend - subr.qstart
        stats["read"].append(sum(subLengths.values()))
        stats["aln_read"].append(alignedLength)

    for hits in subreads.values():
        readLength = hits[0].qseqlength
        stats["subread"].append(readLength)
        alignedBases = 0
        pieHits = []
        for hit in hits:
            alnLength = hit.qend - hit.qstart
            alignedBases += alnLength
            pieHits.append(alnLength)
            stats["hit"].append(alnLength)
            stats["hitsim"].append(hit.pctsimilarity)
        stats["aln_subread"].append(alignedBases)
        # More than one hit means at least one tail of this subread mapped
        if len(hits) > 1:
            stats["tail"].append(readLength)
            stats["aln_tail"].extend(pieHits)
            stats["unmappedTail"] += readLength - alignedBases
def uniteTails(origAligns, tailMapFn, outMap="multi.m4", inplace=False):
    """
    Put the tail hits and the original alignments into a single m4.

    Each tail hit's query name is expected to look like
    <readname>_<shift><p|e><length>. The hit is renamed back to <readname>,
    its query length is set to <length>, and its query start/end are
    shifted by <shift> so they are expressed in the original read's
    coordinates.

    origAligns -- iterable of the original alignments; written after the
                  tails unless inplace is set
    tailMapFn  -- m4 file holding the tail alignments
    outMap     -- output file name
    inplace    -- when True, tail hits are appended to outMap and
                  origAligns is not written again

    Returns the number of tail hits written.
    Raises ValueError when a tail hit's name doesn't match the pattern.

    Note: no tags linking prolog/primary/epilog pieces are written here.
    """
    datGrab = re.compile(
        r"^(?P<rn>.*)_(?P<shift>\d+)(?P<log>[pe])(?P<length>\d+)$")
    aligns = M4File(tailMapFn)
    mode = 'a' if inplace else 'w'
    nmapped = 0
    # with-block guarantees the handle is released even if a record is bad
    with open(outMap, mode) as aout:
        for read in aligns:
            match = datGrab.search(read.qname)
            if match is None:
                raise ValueError(
                    "Tail alignment name '%s' is not of the form "
                    "<read>_<shift>[pe]<length>" % read.qname)
            data = match.groupdict()
            shift = int(data["shift"])
            read.qname = data["rn"]
            # Kept numeric like the other coordinates on the record
            read.qseqlength = int(data["length"])
            read.qstart += shift
            read.qend += shift
            aout.write(str(read) + '\n')
            nmapped += 1
        if not inplace:
            aout.write("\n".join([str(x) for x in origAligns]))
    return nmapped
#!/usr/bin/python
import sys
from pbsuite.utils.FileHandlers import M4File, M5File

# Usage: <script> alignments.(m4|m5) [out.bed]
# Writes one toBed() line per alignment to out.bed, or stdout if not given.
if __name__ == '__main__':
    try:
        fn = sys.argv[1]
    except IndexError:
        sys.stderr.write(("Error! Expected One Argument, " \
                          "an m4 or m5 alignment file\n"))
        sys.exit(1)

    if fn.endswith('.m4'):
        alignments = M4File(fn)
    elif fn.endswith('.m5'):
        alignments = M5File(fn)
    else:
        # Errors go to stderr so they never get mixed into bed output
        sys.stderr.write("Unrecognized File Type (expecting .m4 or .m5)\n")
        sys.exit(1)

    if len(sys.argv) == 3:
        out = open(sys.argv[2], 'w')
    else:
        out = sys.stdout
    try:
        out.write("\n".join([x.toBed() for x in alignments]) + "\n")
    finally:
        # Only close what we opened; leave stdout alone
        if out is not sys.stdout:
            out.close()
def getSubSeqs(alignmentFile,
               readsFile,
               sameStrand,
               seeds,
               predictedGapSize,
               maxTrim,
               maxWiggle,
               basedir="./"):
    """
    Finds the seqs that align to the flanks the best and writes the best
    spanning / flank-extending sequences out as seed fasta files.

    alignmentFile    -- m4 file of read hits against the flanks
    readsFile        -- fastq holding the reads named in alignmentFile
    sameStrand       -- when true, a spanning read's two hits must have
                        the same tstrand
    seeds            -- pair (seed1, seed2) of flank names
    predictedGapSize -- expected gap size, or None to skip the size check
    maxTrim          -- passed as maxFlank to extendsTarget for reads with
                        a hit on each flank (single-hit reads use a fixed
                        maxFlank of 10)
    maxWiggle        -- a span is dropped when it is more than this much
                        smaller than predictedGapSize
    basedir          -- directory the seed temp files are created in

    Returns a SubInfo namedtuple (stats, spanSeed, flank1Seed, flank2Seed).
    The seed fields are temp file names, or None when no such seed was
    found. Every temp file created is also appended to ALLTEMPFILES.

    Might have a problem with my best read not going off the edge fully.
    I should do more strand correction here
    """
    global ALLTEMPFILES

    def singleExtendLookup(sup, a):
        """
        For getting how a single read extends a single flank

        Updates the extendF1*/extendF2* stats for the flank hit a is on.
        Returns the extending sub-sequence only when a is the best
        (lowest) scoring extender seen so far for that flank, else None.
        """
        if sup == SUPPORTFLAGS.none:
            return None
        #set the coordinates of the extending sequence
        logging.debug(sup)
        logging.debug(a.qname)
        mystart = None
        myend = None
        if a.tname.endswith("e5") and sup in [
                SUPPORTFLAGS.left, SUPPORTFLAGS.span
        ]:
            if a.tstrand == '0':
                mystart = 0
                myend = a.qstart
            else:
                mystart = a.qend
                myend = a.qseqlength
        elif a.tname.endswith("e3") and sup in [
                SUPPORTFLAGS.right, SUPPORTFLAGS.span
        ]:
            if a.tstrand == '0':
                mystart = a.qend
                myend = a.qseqlength
            else:
                mystart = 0
                myend = a.qstart
        if mystart is None or myend is None \
                or mystart < 0 or myend > a.qseqlength:
            return None

        #what flank and is it the best
        flankName = a.tname.replace('/', '.')
        if flankName == stats["seed1"]:
            stats["extendF1Count"] += 1
            stats["avgExtF1Bases"] += a.qstart
            stats["support"][1].append(sup)
            if a.score < stats["extendF1SeedScore"]:
                stats["extendF1SeedScore"] = a.score
                stats["extendF1SeedName"] = a.qname
                stats["extendF1SeedStart"] = mystart
                stats["extendF1SeedEnd"] = myend
                stats["extendF1SeedStrand"] = a.tstrand
                return reads[a.qname].subSeq(mystart, myend)
        elif flankName == stats["seed2"]:
            stats["extendF2Count"] += 1
            stats["avgExtF2Bases"] += a.qstart
            stats["support"][2].append(sup)
            if a.score < stats["extendF2SeedScore"]:
                stats["extendF2SeedScore"] = a.score
                stats["extendF2SeedName"] = a.qname
                stats["extendF2SeedStart"] = mystart
                stats["extendF2SeedEnd"] = myend
                stats["extendF2SeedStrand"] = a.tstrand
                return reads[a.qname].subSeq(mystart, myend)
        return None

    def orientedSupport(hit, flag, label):
        """
        Reset flag to none when it is not an orientation allowed for the
        flank end (e3 -> right/span, e5 -> left/span) that hit is on
        """
        if hit.tname.endswith('e3'):
            allowed = [SUPPORTFLAGS.right, SUPPORTFLAGS.span]
        elif hit.tname.endswith('e5'):
            allowed = [SUPPORTFLAGS.left, SUPPORTFLAGS.span]
        else:
            return flag
        if flag not in allowed:
            logging.debug('reset %s' % label)
            return SUPPORTFLAGS.none
        return flag

    def writeSeed(entry, prefix):
        """
        Write entry as a single-record fasta temp file in basedir,
        register it in ALLTEMPFILES and return its file name
        """
        fout = NamedTemporaryFile(prefix=prefix,
                                  suffix=".fasta",
                                  delete=False,
                                  dir=basedir)
        ALLTEMPFILES.append(fout.name)
        try:
            fout.write(">%s\n%s\n" % (entry.name, entry.seq))
        finally:
            fout.close()
        return fout.name

    connector = AlignmentConnector()
    #no need to connect with the tailmap
    aligns = defaultdict(list)
    for a in M4File(alignmentFile):
        aligns[a.qname].append(a)
    aligns = aligns.values()

    reads = FastqFile(readsFile)

    stats = createStats()
    stats["seed1"], stats["seed2"] = seeds
    stats["sameStrand"] = sameStrand

    bestSpan = None
    bestF1E = None
    bestF2E = None
    for readGroup in aligns:
        # Every group has at least one hit here; remember the name now
        # because the filtering below can leave the group empty.
        groupName = readGroup[0].qname

        if len(readGroup) > 2:
            # NOTE: keep is trimmed as soon as it reaches two entries, so
            # at most one hit (the lowest scoring) survives this filter,
            # and none do when no hit has a score below 0.
            best = 0
            worst = 0
            keep = []
            for i in readGroup:
                if i.score < best:
                    keep.insert(0, i)
                    if len(keep) >= 2:
                        keep.pop()
                    best = i.score
                elif i.score < worst:
                    keep.insert(1, i)
                    if len(keep) >= 2:
                        keep.pop()
                    worst = i.score
            readGroup = keep

        if len(readGroup) == 2:
            #make sure that the two hits aren't hitting the same target
            if readGroup[0].tname == readGroup[1].tname:
                if readGroup[0].score <= readGroup[1].score:
                    del (readGroup[1])
                else:
                    del (readGroup[0])

        #hit on each flank
        if len(readGroup) == 2:
            r1, r2 = readGroup
            if r1.tname == stats["seed2"]:
                r1, r2 = r2, r1
            a = connector.extendsTarget(r1, maxFlank=maxTrim, minCovers=0)
            logging.debug(a)
            #Also check appropriate orientation
            a = orientedSupport(r1, a, 'a')
            b = connector.extendsTarget(r2, maxFlank=maxTrim, minCovers=0)
            b = orientedSupport(r2, b, 'b')
        elif len(readGroup) == 1:
            r1 = readGroup[0]
            r2 = None
            a = connector.extendsTarget(r1, maxFlank=10)
            b = SUPPORTFLAGS.none
            if r1.tname == stats["seed2"]:
                r1, r2 = r2, r1
                a, b = b, a
        else:
            logging.warning("read %s gave too many alignments" % (groupName))
            # Nothing usable for this read. Without skipping, the code
            # below would reuse a/b/r1/r2 left over from the previous read
            # (or fail on the first one).
            continue

        #it extends both flanks
        if a != SUPPORTFLAGS.none and b != SUPPORTFLAGS.none:
            logging.debug("%s spans" % r1.qname)
            logging.debug("aflag %d bflag %d" % (a, b))
            logging.debug("hit1- %s (%d, %d)" %
                          (r1.tname, r1.qstart, r1.qend))
            logging.debug("hit2- %s (%d, %d)" %
                          (r2.tname, r2.qstart, r2.qend))

            rStart = min(r1.qend, r2.qend)
            rEnd = max(r1.qstart, r2.qstart)
            sz = rEnd - rStart
            tooShort = False
            if sz < 50:
                # Short spans are flagged but still processed
                logging.info("fill seq is too short to call consensus")
                tooShort = True

            if predictedGapSize is not None \
                    and (predictedGapSize - sz) > maxWiggle:
                logging.info(
                    "fill seq size %d is smaller than allowed predicted gap size wiggle %d"
                    % (sz, maxWiggle))
                continue

            #Need to ensure that it's extending in the correct orientation
            #need to ensure that everything is on the right strand
            if sameStrand and r1.tstrand != r2.tstrand:
                logging.debug("bad strandedness")
                continue

            #check for negative gaps
            stats["spanCount"] += 1
            stats["avgSpanBases"] += rEnd - rStart
            stats["support"][0].append(SUPPORTFLAGS.span)
            fillSeq = reads[r1.qname].subSeq(rStart, rEnd)

            #is it the best spanner
            score = r1.score + r2.score
            if score < stats["spanSeedScore"]:
                logging.debug("scoring %s %s" % (r1.qname, r2.qname))
                stats["spanSeedScore"] = score
                stats["spanSeedStrand1"] = r1.tstrand
                bestSpan = fillSeq
                stats["spanSeedName"] = r1.qname
                stats["spanSeedStart"] = rStart
                stats["spanSeedEnd"] = rEnd
                stats["spanSeedStrand2"] = r2.tstrand
                stats["spanShort"] = tooShort
                if r1.tname.endswith('e5'):
                    stats["seed1Trim"] = r1.tstart
                    logging.debug('trim1 %d' % (r1.tstart))
                else:
                    stats["seed1Trim"] = r1.tseqlength - r1.tend
                    logging.debug('trim1else %d' %
                                  (r1.tseqlength - r1.tend))

                if r2.tname.endswith('e5'):
                    stats["seed2Trim"] = r2.tstart
                    logging.debug('trim2 %d' % (r2.tstart))
                else:
                    stats["seed2Trim"] = r2.tseqlength - r2.tend
                    logging.debug('trimelse %d' %
                                  (r2.tseqlength - r2.tend))

        c = singleExtendLookup(a, r1)
        if c is not None:
            bestF1E = c
        c = singleExtendLookup(b, r2)
        if c is not None:
            bestF2E = c

    logging.info("%d reads span" % stats["spanCount"])
    logging.info("%d reads extend flank 1" % stats["extendF1Count"])
    logging.info("%d reads extend flank 2" % stats["extendF2Count"])

    nt = namedtuple("SubInfo", "stats spanSeed flank1Seed flank2Seed")

    #seeds out files
    ssfout = None
    f1sfout = None
    f2sfout = None

    # NOTE: an earlier version stopped early here when the best span was
    # too short. That exit is disabled: short spans are written like any
    # other seed and it is up to later processes to catch them.

    if stats["spanCount"] > 0:
        stats["avgSpanBases"] = stats["avgSpanBases"] / stats["spanCount"]
        logging.info("estimated fill len %d" % (stats["avgSpanBases"]))
        #write seed
        if len(bestSpan.seq) < 50:
            logging.warning(
                "fill sequence is small (%dbp) can't call consensus" %
                (len(bestSpan.seq)))
            #I don't know what to return here
        logging.debug("spanning with %s" % (bestSpan.name))
        ssfout = writeSeed(bestSpan, "span_")

    if bestF1E is not None:
        stats["avgExtF1Bases"] = \
            stats["avgExtF1Bases"] / stats["extendF1Count"]
        logging.info("estimated flank 1 extend len %d" %
                     (stats["avgExtF1Bases"]))
        #write seed
        if len(bestF1E.seq) < 50:
            logging.warning(
                "f1e sequence is small (%dbp) can't call consensus" %
                (len(bestF1E.seq)))
            #I don't know what to return here
        f1sfout = writeSeed(bestF1E, "flank1_")

    if bestF2E is not None:
        stats["avgExtF2Bases"] = \
            stats["avgExtF2Bases"] / stats["extendF2Count"]
        logging.info("estimated flank 2 extend len %d" %
                     (stats["avgExtF2Bases"]))
        #write seed
        if len(bestF2E.seq) < 50:
            logging.warning(
                "f2e sequence is small (%dbp) can't call consensus" %
                (len(bestF2E.seq)))
            #I don't know what to return here
        f2sfout = writeSeed(bestF2E, "flank2")

    #all of the info I need to return... refactor later and create useful objects
    ret = nt(stats, ssfout, f1sfout, f2sfout)
    return ret