def run(argv):
    """
    Extract read tails from existing alignments, map them to the reference,
    and consolidate the tail hits with the original alignments.

    argv is handed to parseArgs; the parsed options supply the alignment
    file (m4 or m5), the reads (fasta or fastq), the reference, the temp
    directory and the output name.

    Exits with status 1 when the reads file is neither fasta nor fastq,
    and with status 0 when no tails are found.
    """
    print(argv)
    args = parseArgs(argv)

    def reserveTempPath(suffix):
        # Create an empty, persistent temp file in args.temp and keep only
        # its path; the tools called below write to it by name.
        handle = tempfile.NamedTemporaryFile(suffix=suffix, delete=False,
                                             dir=args.temp)
        handle.close()
        return handle.name

    # The alignment format is chosen by file name suffix.
    alignReader = M5File if args.m4.endswith("m5") else M4File
    aligns = alignReader(args.m4)

    readsPath = args.reads
    if readsPath.endswith("fasta"):
        reads = FastaFile(readsPath)
    elif readsPath.endswith("fastq"):
        # Only the sequences are kept from fastq entries.
        fastqEntries = FastqFile(readsPath)
        reads = dict((name, fastqEntries[name].seq) for name in fastqEntries)
        del fastqEntries
    else:
        logging.error("Expected Fasta or Fastq for READS (%s)" % readsPath)
        exit(1)

    logging.info("Extracting tails")
    tailfastq = reserveTempPath(".fasta")
    logging.debug("Tail read tmp file %s " % tailfastq)
    nReads, nTails, nDouble = extractTails(aligns, reads, outFq=tailfastq,
                                           minLength=args.minTail)
    logging.info("Parsed %d reads" % nReads)
    logging.info("Found %d tails" % nTails)
    logging.info("%d reads had double tails" % nDouble)
    if nTails == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    logging.info("Mapping Tails")
    tailmap = reserveTempPath(".m4")
    logging.debug("Read map tmp file %s " % tailmap)
    mapTails(tailfastq, args.ref, nproc=args.nproc, out=tailmap,
             useSa=args.noSa)

    logging.info("Consolidating alignments")
    logging.debug("Final file %s " % args.output)
    nMapped = uniteTails(aligns, tailmap, args.output, args.inplace)
    logging.info("%d tails mapped" % nMapped)
class NullDevice():
    """File-like sink whose write() discards everything it is given."""

    def write(self, s):
        """Accept the string *s* and ignore it."""
        pass


if __name__ == '__main__':
    args = parseArgs()
    alignFile = args.outname + ".m5"
    consensusFile = args.outname + ".fasta"

    if args.target is not None:  # Name
        # Extract the read I'm looking for into its own fasta file.
        # The file is closed (and therefore flushed) before blasr is asked
        # to read it; the original left a bare `tempOut.write` no-op here
        # and never closed the handle.
        with open("temp.fasta", 'w') as tempOut:
            fasta = FastaFile(args.reads)
            tempOut.write(">%s\n%s\n" % (args.target, fasta[args.target]))

        blasr(args.reads, tempOut.name, nproc=args.nproc, outName=alignFile)
        aligns = M5File(alignFile)
        results = consensus(aligns)

        # The header carries the two counts and the sequence follows on its
        # own line. The original format string had no slot for the sequence,
        # so the % operator raised TypeError.
        with open(consensusFile, 'w') as fout:
            fout.write(">pbjpolish_%d_vote_%d_len\n%s\n"
                       % (results.contribBases, results.fillBases,
                          results.sequence))
    elif args.Target is not None:  # File
        blasr(args.reads, args.Target, nproc=args.nproc, outName=alignFile)
#!/usr/bin/python
import argparse
import json

from pbsuite.jelly.Jelly import JellyProtocol
from pbsuite.utils.FileHandlers import FastaFile, FastqFile
from pbsuite.utils.summarizeAssembly import getStats

USAGE = """Get statistics on fasta/fastq sequences recorded in a Protocol.xml"""

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=USAGE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("xml", metavar="XML", type=str,
                        help="Protocol.xml with inputs listed")
    args = parser.parse_args()

    protocol = JellyProtocol(args.xml)

    # Gather the length of every sequence in every fasta/fastq input.
    seqLengths = []
    for inputName in protocol.inputs:
        if inputName.endswith(".fasta"):
            seqLengths.extend(
                len(sequence) for sequence in FastaFile(inputName).values())
        if inputName.endswith(".fastq"):
            seqLengths.extend(
                len(entry.seq) for entry in FastqFile(inputName).values())

    # A single parenthesised string prints the same under the Python 2
    # print statement as the original two-item form.
    print("Read Stats " + json.dumps(getStats(seqLengths), indent=4))
#!/usr/bin/env python
import argparse
import json

from pbsuite.jelly.Jelly import JellyProtocol
from pbsuite.utils.FileHandlers import FastaFile, FastqFile
from pbsuite.utils.summarizeAssembly import getStats

USAGE = """Get statistics on fasta/fastq sequences recorded in a Protocol.xml"""

if __name__ == '__main__':
    argParser = argparse.ArgumentParser(
        description=USAGE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    argParser.add_argument("xml", metavar="XML", type=str,
                           help="Protocol.xml with inputs listed")
    args = argParser.parse_args()

    # Collect one length per sequence across all inputs of the protocol.
    seqLengths = []
    for path in JellyProtocol(args.xml).inputs:
        if path.endswith(".fasta"):
            seqLengths += [len(seq) for seq in FastaFile(path).values()]
        if path.endswith(".fastq"):
            seqLengths += [len(rec.seq) for rec in FastqFile(path).values()]

    summary = json.dumps(getStats(seqLengths), indent=4)
    # One pre-joined string keeps the output identical to the original
    # "Read Stats", <json> print statement.
    print("Read Stats " + summary)
def run(self):
    """
    Rename the scaffolds of the reference, tabulate their gaps, and
    optionally build a suffix-array index.

    Every scaffold of self.scaffInput is rewritten with spaces replaced by
    underscores and "|ref%07d" appended to its name; when self.qualInput is
    set the matching quality entries are rewritten the same way. The
    originals are kept next to the new files with a ".original" suffix.

    When self.opts.gapOutput is set, one tab-separated row per gap (or one
    'na' row for a scaffold without gaps) is written to it. When
    self.opts.index is set, sawriter is run on the new reference and the
    process exits with status 1 if it fails.
    """
    # Fasta Ref Output
    scaffTempName = self.scaffInput + ".tempFasta"
    scaffOutput = open(scaffTempName, 'w')

    # Qual Ref Output
    hasQual = self.qualInput is not None
    if hasQual:
        qualTempName = self.qualInput + ".tempQual"
        qualOutput = open(qualTempName, 'w')

    # Gaps Output (optional)
    gapTableOut = None
    if self.opts.gapOutput is not None:
        gapTableOut = open(self.opts.gapOutput, 'w')

    logging.info(
        "Creating reference sequence index names and identifying gaps")

    refTemplate = "ref%07d"
    refId = 1

    # Read References
    reference = FastaFile(self.scaffInput)
    if hasQual:
        qualReference = QualFile(self.qualInput)

    # NOTE(review): the pattern needs a non-N base on both sides, so N runs
    # touching either end of a scaffold are not reported, and because
    # matches cannot overlap, two runs separated by a single base yield
    # only the first. Left unchanged; confirm whether that is intended.
    gapPattern = "[^Nn]([Nn]{%d,%s})[^Nn]" % \
        (self.opts.minGap, self.opts.maxGap)

    for key in reference:
        scaffIndex = refTemplate % refId
        refId += 1
        scaffName = key.replace(' ', '_') + "|" + scaffIndex

        scaffOutput.write(">" + scaffName + "\n" +
                          wrap(reference[key]) + "\n")
        if hasQual:
            qualOutput.write(">" + scaffName + "\n" +
                             qwrap(qualReference[key]) + "\n")

        # Trim the flanking non-N base off each end of every match.
        gapCoords = [[gap.start() + 1, gap.end() - 1]
                     for gap in re.finditer(gapPattern, reference[key])]

        if len(gapCoords) == 0:  # no Gaps
            # The gap table is optional; the original wrote to it
            # unconditionally and failed when no gapOutput was given.
            if gapTableOut is not None:
                gapTableOut.write("\t".join(
                    [scaffName, 'na', 'na', scaffIndex + "_0_0", '3'])
                    + '\n')
            logging.debug("Scaffold %s is empty" % scaffName)
            continue

        # Consolidate gaps that are too close -- indicating LQ regions.
        i = 0
        while i < len(gapCoords) - 1:
            if gapCoords[i + 1][0] - gapCoords[i][1] < 25:
                gapCoords[i + 1][0] = gapCoords[i][0]
                del gapCoords[i]
            else:
                i += 1

        if gapTableOut is None:
            continue

        # The first gap carries Gap.BEGIN and the last carries Gap.END
        # (a lone gap carries both); gaps in between are flagged 0.
        lastIdx = len(gapCoords) - 1
        for idx, (gapStart, gapEnd) in enumerate(gapCoords):
            flag = 0
            if idx == 0:
                flag += Gap.BEGIN
            if idx == lastIdx:
                flag += Gap.END
            gapTableOut.write("%s\t%i\t%i\t%s_%i_%i\t%d\n"
                              % (scaffName, gapStart, gapEnd,
                                 scaffIndex, idx, idx + 1, flag))

    # Close shop
    scaffOutput.close()
    os.rename(self.scaffInput, self.scaffInput + ".original")
    os.rename(scaffTempName, self.scaffInput)

    if hasQual:
        qualOutput.close()
        os.rename(self.qualInput, self.qualInput + ".original")
        os.rename(qualTempName, self.qualInput)

    if gapTableOut is not None:
        gapTableOut.close()

    if self.opts.index:
        logging.info("Creating .sa indexes for references")
        r, o, e = exe("sawriter %s.sa %s" % (self.scaffInput,
                                             self.scaffInput))
        if r != 0:
            logging.error("sawriter returned %d" % r)
            logging.error("Ensure it's in your path")
            exit(1)
        logging.debug(str(o) + ' ' + str(e))

    logging.info("Finished!")
"""
Write the Jelly output fasta to stdout with each sequence named after the
original scaffold names it was built from.

## Arguments
1 - reference.fasta -- input reference created by Setup.py
2 - liftOverTable.json - created at end of Jelly run
3 - jelly.out.fasta -- new reference created by Jelly
"""
from pbsuite.utils.FileHandlers import FastaFile
import json, re, sys

# Map each trailing "|"-separated field of a reference entry name (the
# refNNNNNNN key) back to the name that preceded it.
fasta = FastaFile(sys.argv[1])
nameLookup = {}
for entry in fasta:
    origName, _, refKey = entry.rpartition('|')
    nameLookup[refKey] = origName

# Close the table file as soon as it has been parsed.
with open(sys.argv[2], 'r') as liftOverFile:
    liftOver = json.load(liftOverFile)

jellyFasta = FastaFile(sys.argv[3])

# Raw string so that \d is a regex digit class, not a string escape.
regex = re.compile(r"ref\d{7}")

for key in liftOver:
    # Collect each reference id once, in order of first appearance, so the
    # joined name is the same on every run (set iteration order is
    # arbitrary).
    myRefs = []
    seenRefs = set()
    for seqId, _strand, _size in liftOver[key]:
        for refId in regex.findall(seqId):
            if refId not in seenRefs:
                seenRefs.add(refId)
                myRefs.append(refId)

    newName = [nameLookup[refId] for refId in myRefs]
    sys.stdout.write(">%s\n%s\n" % ("_".join(newName), jellyFasta[key]))
used.append(node) next = getStrongestEdge(node, direction, used) if next is None: break path.append(next) if next.qstrand == '-': if direction == SUPPORTFLAGS.left: direction = SUPPORTFLAGS.right elif direction == SUPPORTFLAGS.right: direction = SUPPORTFLAGS.left node = next.qname paths.append(path) for p in paths: if len(p) == 0: continue for i in p: if not i.tname.startswith("ref"): sys.stdout.write(i.tname.split('/')[1], i.tstrand, "\t") else: sys.stdout.write(i.tname, i.tstrand, "\t") sys.stdout.write('\n') if __name__ == '__main__': reads = sys.argv[1] fasta = FastaFile(reads) #blasr(reads, reads, 4) ovl = m5ToOvlGraph(fasta.keys(), "out.m5") ovlSimplify(ovl) nx.write_gml(ovl, "ovl.gml")
import sys
import random

from pbsuite.utils.FileHandlers import FastaFile, revComp, wrap


def getRandomSeq(length):
    """Return a string of `length` bases, each drawn with random.choice
    from A, T, C and G."""
    bases = ['A', 'T', 'C', 'G']
    picked = []
    for _ in xrange(length):
        picked.append(random.choice(bases))
    return "".join(picked)


if __name__ == '__main__':
    fasta = FastaFile(sys.argv[1])
    # Only the first entry of the input fasta is edited.
    key = fasta.keys()[0]
    ref = list(fasta[key])

    # 800bp insertion in the sample (deletion in the reference)
    # 5000 Insertion
    del ref[5000:5800]

    # Inversion in the sample (inversion in the reference) tails
    # 9000-12000 - INversion
    segment = "".join(ref[10000:13000])
    ref[9000:12000] = list(segment.translate(revComp)[::-1])

    # 1kb deletion in sample (insert into the reference) tails
    # 20000-21000 -- Deletion
    seq = getRandomSeq(1000)
    ref[20000:20000] = list(seq)

    # 100bp insertion in sample (deletion in the reference) spots
    # 30000 - Insertion
    del ref[30000:30100]

    # 200bp deletion in sample (insert into the reference) spots
    seq = getRandomSeq(200)
    ref[35000:35000] = list(seq)