def splitSRA(filename, outFile="auto", splitBy=4000000,
             FASTQ_BINARY="./fastq-dump", FASTQ_ARGS=[]):
    """Dump a paired SRA file and split it into gzipped FASTQ blocks.

    Runs ``FASTQ_BINARY <file> -Z --split-files`` (plus ``FASTQ_ARGS``) and
    reads its standard output.  Every 8 consecutive lines are treated as one
    read pair: the first 4 lines go to the "side 1" file and the next 4 lines
    to the "side 2" file of the current block.  A new pair of output files is
    started after every ``splitBy`` pairs.

    :param filename: path to the input ``.sra`` file.
    :param outFile: output name template with two ``str.format`` fields,
        ``{0}`` = block number and ``{1}`` = side (1 or 2).  ``"auto"``
        derives ``<filename without ".sra">_{0}_side{1}.fastq.gz``.
    :param splitBy: number of read pairs per output block.
    :param FASTQ_BINARY: path to the fastq-dump executable.
    :param FASTQ_ARGS: extra command-line arguments for fastq-dump.  The
        default list is never mutated.
    :raises IOError: if a record does not start with ``@``.
    :returns: None.
    """
    inFile = os.path.abspath(filename)
    if outFile == "auto":
        outFile = filename.replace(".sra", "") + "_{0}_side{1}.fastq.gz"

    pread = subprocess.Popen(
        [FASTQ_BINARY, inFile, "-Z", "--split-files"] + list(FASTQ_ARGS),
        stdout=subprocess.PIPE, bufsize=-1)
    inStream = pread.stdout

    halted = False
    counter = 0
    try:
        # Same upper bound on the number of blocks as before (1,000,000).
        while counter < 1000000 and not halted:
            # gzipWriter is defined elsewhere in this file; it is used here
            # through its ``stdin`` pipe and ``communicate()``.
            outProc1 = gzipWriter(outFile.format(counter, 1))
            outProc2 = gzipWriter(outFile.format(counter, 2))
            outStream1 = outProc1.stdin
            outStream2 = outProc2.stdin

            written = 0
            while written < splitBy:
                line = inStream.readline()
                if not line:
                    # Explicit end-of-stream test.  The previous
                    # ``assert line[0] == "@"`` disappeared under
                    # ``python -O``, so EOF was never detected.
                    halted = True
                    break
                # ``line[:1] != b"@"`` is correct both for Python 2 ``str``
                # and for the ``bytes`` a pipe yields on Python 3.
                if line[:1] != b"@":
                    print('Not fastq')
                    raise IOError("File is not fastq: {0}".format(filename))
                outStream1.writelines((line, inStream.readline(),
                                       inStream.readline(),
                                       inStream.readline()))
                outStream2.writelines((inStream.readline(),
                                       inStream.readline(),
                                       inStream.readline(),
                                       inStream.readline()))
                written += 1

            outProc1.communicate()
            outProc2.communicate()
            # Formatted explicitly so the output is the same on Python 2/3.
            print("finished block number {0}".format(counter))
            counter += 1
    finally:
        # Always reap the dump process so it does not linger as a zombie.
        inStream.close()
        if not halted and pread.poll() is None:
            # We stopped before the dump finished (error or block limit).
            pread.terminate()
        pread.wait()
def splitSRA(filename, outFile="auto", splitBy=4000000,
             FASTQ_BINARY="./fastq-dump", FASTQ_ARGS=[]):
    """Dump a paired SRA file and split it into gzipped FASTQ blocks.

    Runs ``FASTQ_BINARY <file> -Z --split-files`` (plus ``FASTQ_ARGS``) and
    reads its standard output.  Every 8 consecutive lines are treated as one
    read pair: the first 4 lines go to the "side 1" file and the next 4 lines
    to the "side 2" file of the current block.  A new pair of output files is
    started after every ``splitBy`` pairs.

    :param filename: path to the input ``.sra`` file.
    :param outFile: output name template with two ``str.format`` fields,
        ``{0}`` = block number and ``{1}`` = side (1 or 2).  ``"auto"``
        derives ``<filename without ".sra">_{0}_side{1}.fastq.gz``.
    :param splitBy: number of read pairs per output block.
    :param FASTQ_BINARY: path to the fastq-dump executable.
    :param FASTQ_ARGS: extra command-line arguments for fastq-dump.  The
        default list is never mutated.
    :raises IOError: if a record does not start with ``@``.
    :returns: None.
    """
    inFile = os.path.abspath(filename)
    if outFile == "auto":
        outFile = filename.replace(".sra", "") + "_{0}_side{1}.fastq.gz"

    pread = subprocess.Popen(
        [FASTQ_BINARY, inFile, "-Z", "--split-files"] + list(FASTQ_ARGS),
        stdout=subprocess.PIPE, bufsize=-1)
    inStream = pread.stdout

    halted = False
    counter = 0
    try:
        # Same upper bound on the number of blocks as before (1,000,000).
        while counter < 1000000 and not halted:
            # gzipWriter is defined elsewhere in this file; it is used here
            # through its ``stdin`` pipe and ``communicate()``.
            outProc1 = gzipWriter(outFile.format(counter, 1))
            outProc2 = gzipWriter(outFile.format(counter, 2))
            outStream1 = outProc1.stdin
            outStream2 = outProc2.stdin

            written = 0
            while written < splitBy:
                line = inStream.readline()
                if not line:
                    # Explicit end-of-stream test.  The previous
                    # ``assert line[0] == "@"`` disappeared under
                    # ``python -O``, so EOF was never detected.
                    halted = True
                    break
                # ``line[:1] != b"@"`` is correct both for Python 2 ``str``
                # and for the ``bytes`` a pipe yields on Python 3.
                if line[:1] != b"@":
                    print('Not fastq')
                    raise IOError("File is not fastq: {0}".format(filename))
                outStream1.writelines((line, inStream.readline(),
                                       inStream.readline(),
                                       inStream.readline()))
                outStream2.writelines((inStream.readline(),
                                       inStream.readline(),
                                       inStream.readline(),
                                       inStream.readline()))
                written += 1

            outProc1.communicate()
            outProc2.communicate()
            # Formatted explicitly so the output is the same on Python 2/3.
            print("finished block number {0}".format(counter))
            counter += 1
    finally:
        # Always reap the dump process so it does not linger as a zombie.
        inStream.close()
        if not halted and pread.poll() is None:
            # We stopped before the dump finished (error or block limit).
            pread.terminate()
        pread.wait()
def splitSingleFastq(filename, outFile, splitBy=4000000,
                     convertReadID=lambda x: x):
    """Split one gzipped FASTQ file into gzipped blocks of ``splitBy`` reads.

    The input is decompressed with an external ``gunzip -c`` process and read
    four lines at a time.  If the last block ends up with fewer than
    ``splitBy / 3`` reads (and is not the only block) it is appended to the
    previous block and its file is removed.

    :param filename: path to the gzipped FASTQ file.
    :param outFile: output name template with one ``str.format`` field,
        ``{0}`` = block number.
    :param splitBy: number of reads per output block.
    :param convertReadID: callable applied to the header line of every read
        before it is written; the identity by default.
    :raises IOError: if a record does not start with ``@``.
    :returns: list with the number of reads written to each remaining block.
    """
    inFile = os.path.abspath(filename)
    pread = subprocess.Popen(["gunzip", inFile, "-c"],
                             stdout=subprocess.PIPE, bufsize=-1)
    inStream = pread.stdout

    halted = False
    counters = []
    try:
        for counter in range(100000):
            # gzipWriter is defined elsewhere in this file; it is used here
            # through its ``stdin`` pipe and ``communicate()``.
            outProc1 = gzipWriter(outFile.format(counter))
            outStream1 = outProc1.stdin

            for j in range(splitBy):
                line = inStream.readline()
                if not line:
                    # Explicit end-of-stream test; the former assert-based
                    # detection stopped working under ``python -O``.
                    halted = True
                    counters.append(j)
                    break
                if line[:1] != b"@":
                    print('Not fastq')
                    print("bad line: {0}".format(line))
                    raise IOError("File is not fastq: {0}".format(filename))
                fastq_entry = (convertReadID(line), inStream.readline(),
                               inStream.readline(), inStream.readline())
                outStream1.writelines(fastq_entry)

            outProc1.communicate()
            print("finished block number", counter)

            if halted:
                if (counters[-1] < splitBy / 3) and (len(counters) > 1):
                    f1 = outFile.format(counter - 1)
                    f2 = outFile.format(counter)
                    # Byte-wise concatenation, as the former shell ``cat``
                    # did, but without passing file names through a shell.
                    tmpName = f1 + "_tmp"
                    with open(tmpName, "wb") as merged:
                        for part in (f1, f2):
                            with open(part, "rb") as src:
                                shutil.copyfileobj(src, merged)
                    shutil.move(tmpName, f1)
                    os.remove(f2)
                    last = counters.pop()
                    counters[-1] = counters[-1] + last
                print("Read counts", counters)
                return counters
            counters.append(splitBy)
    finally:
        # Always reap the gunzip process so it does not linger as a zombie.
        inStream.close()
        if not halted and pread.poll() is None:
            # We stopped before gunzip finished (error or block limit).
            pread.terminate()
        pread.wait()
    # Only reached when the block limit is exhausted before end of input.
    return counters
def _filter_fastq(ids, inStream, out_fastq, in_filename="none"):  # @UnusedVariable
    '''Filter FASTQ sequences by their IDs.

    Read four-line entries from **inStream** and write to **out_fastq** only
    those whose ID is contained in **ids**.  The ID is the first
    whitespace-separated token of the header line without its leading "@".

    :param ids: container supporting ``in`` lookups of read IDs.
    :param inStream: readable stream yielding FASTQ lines as bytes.
    :param out_fastq: name of the output file handed to ``gzipWriter``.
    :param in_filename: name of the input, used only in the error message.
    :raises AssertionError: if an entry does not start with "@".
    :raises RuntimeError: if the writing process exits with a non-zero code.
    :returns: tuple ``(num_total, num_filtered)``: entries read and entries
        written.
    '''
    # gzipWriter is defined elsewhere in this file; it is used here through
    # its ``stdin`` pipe, ``communicate()`` and ``returncode``.
    writingProcess = gzipWriter(out_fastq)
    num_filtered = 0
    num_total = 0
    while True:
        line = inStream.readline()
        if not line:
            # Explicit end-of-stream test; the former assert-based detection
            # stopped working under ``python -O``.
            break
        if line[:1] != b"@":
            print('Not fastq')
            # AssertionError is kept as the exception type for callers that
            # already catch it.
            raise AssertionError(
                '{0} does not comply with the FASTQ standards.'.format(
                    in_filename))

        fastq_entry = (line, inStream.readline(),
                       inStream.readline(), inStream.readline())
        read_id = line.split()[0][1:]
        if read_id in ids:
            writingProcess.stdin.writelines(fastq_entry)
            num_filtered += 1
        num_total += 1

    # NOTE(review): sleep() is a helper defined elsewhere in this file and is
    # assumed to run once after the loop -- confirm against the original
    # layout.
    sleep()
    writingProcess.communicate()
    if writingProcess.returncode != 0:
        raise RuntimeError("Writing process return code {0}".format(
            writingProcess.returncode))
    return num_total, num_filtered
def _filter_fastq(ids, inStream, out_fastq, in_filename="none"):
    '''Filter FASTQ sequences by their IDs.

    Read four-line entries from **inStream** and write to **out_fastq** only
    those whose ID is contained in **ids**.  The ID is the first
    whitespace-separated token of the header line without its leading "@".

    :param ids: container supporting ``in`` lookups of read IDs.
    :param inStream: readable stream yielding FASTQ lines.
    :param out_fastq: name of the output file handed to ``gzipWriter``.
    :param in_filename: name of the input; currently unused.
    :raises RuntimeError: if the writing process exits with a non-zero code.
    :returns: tuple ``(num_total, num_filtered)``: entries read and entries
        written.
    '''
    # gzipWriter is defined elsewhere in this file; it is used here through
    # its ``stdin`` pipe, ``communicate()`` and ``returncode``.
    writingProcess = gzipWriter(out_fastq)
    num_filtered = 0
    num_total = 0
    while True:
        line = inStream.readline()
        if not line:
            # Explicit end-of-stream test; the former assert-based detection
            # stopped working under ``python -O``.
            break
        # Accept both byte and text header lines, so the check no longer
        # misfires on the bytes a binary stream yields on Python 3.
        if line[:1] not in (b"@", u"@"):
            # NOTE(review): a malformed entry is only reported and processing
            # continues with it, as before -- confirm whether this should
            # raise instead.
            print('Not fastq')

        fastq_entry = (line, inStream.readline(),
                       inStream.readline(), inStream.readline())
        read_id = line.split()[0][1:]
        if read_id in ids:
            writingProcess.stdin.writelines(fastq_entry)
            num_filtered += 1
        num_total += 1

    # NOTE(review): sleep() is a helper defined elsewhere in this file and is
    # assumed to run once after the loop -- confirm against the original
    # layout.
    sleep()
    writingProcess.communicate()
    if writingProcess.returncode != 0:
        raise RuntimeError("Writing process return code {0}".format(
            writingProcess.returncode))
    return num_total, num_filtered
def splitSRA(filename, outFile="auto", splitBy=4000000,
             FASTQ_BINARY="./fastq-dump", FASTQ_ARGS=[]):
    """Dump a paired SRA file and split it into gzipped FASTQ blocks.

    Runs ``FASTQ_BINARY <file> -Z --split-files`` (plus ``FASTQ_ARGS``) and
    reads its standard output.  Every 8 consecutive lines are treated as one
    read pair: the first 4 lines go to the "side 1" file and the next 4 lines
    to the "side 2" file of the current block.  If the last block ends up
    with fewer than ``splitBy / 3`` pairs (and is not the only block) both of
    its files are appended to the previous block's files and removed.

    :param filename: path to the input ``.sra`` file.
    :param outFile: output name template with two ``str.format`` fields,
        ``{0}`` = block number and ``{1}`` = side (1 or 2).  ``"auto"``
        derives ``<filename without ".sra">_{0}_side{1}.fastq.gz``.
    :param splitBy: number of read pairs per output block.
    :param FASTQ_BINARY: path to the fastq-dump executable.
    :param FASTQ_ARGS: extra command-line arguments for fastq-dump.  The
        default list is never mutated.
    :raises ValueError: if ``FASTQ_BINARY`` does not exist.
    :raises IOError: if a record does not start with ``@``.
    :returns: list with the number of read pairs in each remaining block.
    """
    if not os.path.exists(FASTQ_BINARY):
        raise ValueError("(fastq-dump) file not found at {0}".format(
            os.path.abspath(FASTQ_BINARY)))

    inFile = os.path.abspath(filename)
    if outFile == "auto":
        outFile = filename.replace(".sra", "") + "_{0}_side{1}.fastq.gz"

    pread = subprocess.Popen(
        [FASTQ_BINARY, inFile, "-Z", "--split-files"] + list(FASTQ_ARGS),
        stdout=subprocess.PIPE, bufsize=-1)
    inStream = pread.stdout

    halted = False
    counters = []
    try:
        for counter in range(1000000):
            # gzipWriter is defined elsewhere in this file; it is used here
            # through its ``stdin`` pipe and ``communicate()``.
            outProc1 = gzipWriter(outFile.format(counter, 1))
            outProc2 = gzipWriter(outFile.format(counter, 2))
            outStream1 = outProc1.stdin
            outStream2 = outProc2.stdin

            for j in range(splitBy):
                line = inStream.readline()
                if not line:
                    # Explicit end-of-stream test; the former assert-based
                    # detection stopped working under ``python -O``.
                    halted = True
                    counters.append(j)
                    break
                if line[:1] != b"@":
                    print('Not fastq')
                    print("bad line: {0}".format(line))
                    raise IOError("File is not fastq: {0}".format(filename))
                fastq_entry = (line, inStream.readline(),
                               inStream.readline(), inStream.readline())
                outStream1.writelines(fastq_entry)
                outStream2.writelines((inStream.readline(),
                                       inStream.readline(),
                                       inStream.readline(),
                                       inStream.readline()))

            outProc1.communicate()
            outProc2.communicate()
            print("finished block number", counter)

            if halted:
                if (counters[-1] < splitBy / 3) and (len(counters) > 1):
                    for side in [1, 2]:
                        f1 = outFile.format(counter - 1, side)
                        f2 = outFile.format(counter, side)
                        # Byte-wise concatenation, as the former shell
                        # ``cat`` did, but without passing file names
                        # through a shell.
                        tmpName = f1 + "_tmp"
                        with open(tmpName, "wb") as merged:
                            for part in (f1, f2):
                                with open(part, "rb") as src:
                                    shutil.copyfileobj(src, merged)
                        shutil.move(tmpName, f1)
                        os.remove(f2)
                    last = counters.pop()
                    counters[-1] = counters[-1] + last
                return counters
            counters.append(splitBy)
    finally:
        # Always reap the dump process so it does not linger as a zombie.
        inStream.close()
        if not halted and pread.poll() is None:
            # We stopped before the dump finished (error or block limit).
            pread.terminate()
        pread.wait()
    # Only reached when the block limit is exhausted before end of input.
    return counters