def main(argv=None):
    """Filter reads by name from one or more synchronized fastq files.

    Parses the command line, loads the read names listed in the filter file
    and writes every set of reads yielded by ``filterFastqIter`` to the
    output files (one output file per input fastq file).

    Args:
        argv: list of command line arguments. ``None`` lets argparse fall
            back to ``sys.argv``.

    Raises:
        argparse.ArgumentTypeError: if the number of output files does not
            match the number of input fastq files.
    """
    parser = argparse.ArgumentParser(description="A program to filter reads by name from a single/set of fastq file(s). The script looks for reads which have a name line where the string before a space exactly matches a pattern. If multiple files are passed in, then they are processed in sync and if any name matches that read is discarded from all files.")
    parser.add_argument('fastqFiles', help='a fastq (potentially gzipped) file(s) containing the reads with the order of reads the same in all files',type=helper.checkFile,nargs='+')
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument('-f','--filterFile', help='a file (potentially gzipped) file containing the names of reads to be filtered one per line',type=helper.checkFile,required=True)
    parser.add_argument('-o','--outputFiles', help='an output file(s) (one for each input fastq file). default(out1.fastq.gz ... outn.fastq.gz where n is the number of fastqFiles)',type=str,nargs='*')
    args=parser.parse_args(argv)

    if args.outputFiles is None:
        outputFiles=['out'+str(ii)+'.fastq.gz' for ii in range(1,len(args.fastqFiles)+1)]
    else:
        outputFiles=args.outputFiles

    if len(outputFiles)!=len(args.fastqFiles):
        raise argparse.ArgumentTypeError("Input and output file numbers do not match")

    # Read the names up front inside a context manager so the filter file
    # handle is closed instead of being left to the garbage collector.
    with helper.openNormalOrGz(args.filterFile) as filterHandle:
        patterns=set(line.strip() for line in filterHandle)

    outHandles=[]
    try:
        # Opened one at a time so a failure part way through still closes
        # the handles that were already opened.
        for outputFile in outputFiles:
            outHandles.append(helper.openNormalOrGz(outputFile,'w'))

        with filterFastqIter(args.fastqFiles,patterns) as fastqIter:
            for currentReads in fastqIter:
                for read,outFile in zip(currentReads,outHandles):
                    helper.writeFastqRead(outFile,read)
                if args.dots>0 and fastqIter.nGood % args.dots==0:
                    sys.stderr.write('.')
                    # flush so the progress dots show up as they are written
                    sys.stderr.flush()

            if args.dots>0:
                sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
    finally:
        # always release the output files, even if reading or writing failed
        helper.closeFiles(outHandles)
def test_main(capsys,tmpdir):
    """Exercise splitbarcodes.main: usage/help text, argument validation and demultiplexed output."""

    def readLines(path):
        # Read a (possibly gzipped) file into newline-stripped lines and
        # close the handle afterwards.
        with helper.openNormalOrGz(str(path)) as handle:
            return [x.rstrip('\n') for x in handle.readlines()]

    def assertStartsWith(observed,expected):
        # zip() stops at the shorter input, so a missing or truncated output
        # would pass silently. Compare an explicit prefix instead.
        assert list(observed)[:len(expected)]==expected

    with pytest.raises(SystemExit):
        splitbarcodes.main()
    out, err=capsys.readouterr()
    assert 'usage' in err
    with pytest.raises(SystemExit):
        splitbarcodes.main(['-h'])
    out, err=capsys.readouterr()
    assert 'usage' in out

    d = tmpdir.mkdir('dir')
    p1 = d.join('test.fastq')
    p1.write("@seq1\nAAA\n+\n(((\n@seq2\nTT\n+\n(A\n@seq3\nT\n+\n(\n")
    p2 = d.join('test2.fastq')
    p2.write("@seq1z\nTTT\n+\n(((\n@seq2z\nTT\n+\n((\n@seq3\nT\n+\n(\n")
    b = d.join('test.filter')
    o = d.join('test_1.fastq.gz')
    o2 = d.join('test_2.fastq.gz')

    #duplicate barcode
    b.write("test,AAA\ntest2,AAA\ntest3,AAT")
    with pytest.raises(argparse.ArgumentTypeError):
        splitbarcodes.main([str(p1),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1'])
    #duplicate sample name
    b.write("test,AAA\ntest,AAC\ntest3,AAT")
    with pytest.raises(argparse.ArgumentTypeError):
        splitbarcodes.main([str(p1),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1'])
    b.write("test,AAA")
    #two index files, 1 barcode
    with pytest.raises(argparse.ArgumentTypeError):
        splitbarcodes.main([str(p1),'-i',str(p1),str(p1),'-b',str(b),'-o',str(d),'-d1'])

    #one read file, one index file
    splitbarcodes.main([str(p1),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1'])
    out, err=capsys.readouterr()
    assertStartsWith(err.split('\n'),['.','Good reads: 1 Bad reads: 2'])
    assertStartsWith(readLines(o),['@seq1','AAA','+seq1','((('])

    #two read files, one index file
    splitbarcodes.main([str(p1),str(p2),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1'])
    out, err=capsys.readouterr()
    assertStartsWith(err.split('\n'),['.','Good reads: 1 Bad reads: 2'])
    assertStartsWith(readLines(o),['@seq1','AAA','+seq1','((('])
    assertStartsWith(readLines(o2),['@seq1z','TTT','+seq1z','((('])

    #two read files, two index files
    b.write("test,T,T")
    i1 = d.join('test3.fastq')
    i1.write("@seq1\nT\n+\n(\n@seq2\nT\n+\n(\n@seq3\nT\n+\n(\n")
    i2 = d.join('test4.fastq')
    i2.write("@seq1z\nT\n+\n(\n@seq2z\nT\n+\n(\n@seq3\nTT\n+\n((\n")
    splitbarcodes.main([str(p1),str(p2),'-i',str(i1),str(i2),'-b',str(b),'-o',str(d),'-d1'])
    out, err=capsys.readouterr()
    assertStartsWith(err.split('\n'),['..','Good reads: 2 Bad reads: 1'])
    assertStartsWith(readLines(o),['@seq1','AAA','+seq1','(((','@seq2','TT','+seq2','(A'])
    assertStartsWith(readLines(o2),['@seq1z','TTT','+seq1z','(((','@seq2z','TT','+seq2z','(('])
def __init__(self, fastqFiles, indexFiles, barcodes):
    """Open the read and index fastq files and attach a fastq parser to each.

    Args:
        fastqFiles: file names of the read fastq files; each is opened with
            ``helper.openNormalOrGz``.
        indexFiles: file names of the index fastq files; opened the same way.
        barcodes: the barcodes to work with; stored unchanged.
    """
    self.fastqFiles = fastqFiles
    self.indexFiles = indexFiles
    self.barcodes = barcodes
    # counters start at zero; presumably updated while iterating (not shown here)
    self.nGood = 0
    self.nBad = 0

    # one open handle per input file
    openFile = helper.openNormalOrGz
    self.fastqHandles = list(map(openFile, self.fastqFiles))
    self.indexHandles = list(map(openFile, self.indexFiles))

    # one Biopython fastq iterator per handle
    parseFastq = Bio.SeqIO.QualityIO.FastqGeneralIterator
    self.fastqs = list(map(parseFastq, self.fastqHandles))
    self.indexs = list(map(parseFastq, self.indexHandles))
def test_openGzOrNormal(tmpdir):
    """Write two reads to a .gz file with helper.openNormalOrGz and read them back."""
    d = tmpdir.mkdir('dir')
    gzFile = d.join('test.gz')

    gz=helper.openNormalOrGz(str(gzFile),'w')
    helper.writeFastqRead(gz,['1','22','333'])
    helper.writeFastqRead(gz,['55555','666666','7777777'])
    helper.closeFiles([gz])

    pred=['@1','22','+1','333','@55555','666666','+55555','7777777']
    expected=[x+'\n' for x in pred]
    # read everything back and close the handle when done
    with helper.openNormalOrGz(str(gzFile)) as gz:
        observed=list(gz)
    # zip() would stop at the shorter input and let an empty or truncated
    # file pass, so compare an explicit prefix of the lines read
    assert observed[:len(expected)]==expected
def test_openGzOrNormal(tmpdir):
    """Round-trip two fastq reads through a gzipped file opened by helper.openNormalOrGz."""
    d = tmpdir.mkdir('dir')
    gzFile = d.join('test.gz')

    gz = helper.openNormalOrGz(str(gzFile), 'w')
    helper.writeFastqRead(gz, ['1', '22', '333'])
    helper.writeFastqRead(gz, ['55555', '666666', '7777777'])
    helper.closeFiles([gz])

    pred = ['@1', '22', '+1', '333', '@55555', '666666', '+55555', '7777777']
    expected = [x + '\n' for x in pred]
    # the context manager closes the read handle once the lines are collected
    with helper.openNormalOrGz(str(gzFile)) as gz:
        observed = list(gz)
    # An explicit prefix comparison fails on a short or empty file, whereas a
    # zip() loop would silently compare nothing.
    assert observed[:len(expected)] == expected
def test_readSimpleCsv(tmpdir):
    """Check helper.readSimpleCsv on a well formed file and on rows of differing length."""
    workDir = tmpdir.mkdir('dir')
    csvPath = str(workDir.join('test.txt'))

    # nothing has been written yet, so checkFile is expected to reject the path
    with pytest.raises(argparse.ArgumentTypeError):
        helper.checkFile(csvPath)

    # quoted fields, surrounding spaces and whitespace-only lines
    with helper.openNormalOrGz(csvPath,'w') as handle:
        handle.write("1,'2',3 \n \n \na,\"bb\",ccc\n 2,3,4 ")
    parsed = helper.readSimpleCsv(csvPath)
    wanted = [['1','2','3'],['a','bb','ccc'],["2","3","4"]]
    assert(parsed==wanted)

    # one row carries an extra field, which should raise ValueError
    with helper.openNormalOrGz(csvPath,'w') as handle:
        handle.write("1,2,3\n\n \na,bb,ccc,d\n 2,3,4 ")
    with pytest.raises(ValueError):
        helper.readSimpleCsv(csvPath)
def test_closeFiles(tmpdir):
    """Check helper.closeFiles closes handles given as a list or as dict values."""
    workDir = tmpdir.mkdir('dir')
    paths = [workDir.join('test'+str(number)+'.txt') for number in range(10)]

    def openForWriting():
        # a fresh set of writable handles, one per path
        return [helper.openNormalOrGz(str(path),'w') for path in paths]

    # handles passed as a list
    listed = openForWriting()
    helper.closeFiles(listed)
    for handle in listed:
        with pytest.raises(ValueError):
            handle.write("X")

    # handles passed as the values of a dict
    keyed = dict(zip(range(10),openForWriting()))
    helper.closeFiles(keyed)
    for handle in keyed.values():
        with pytest.raises(ValueError):
            handle.write("X")
def test_readSimpleCsv(tmpdir):
    """Verify helper.readSimpleCsv output and its ValueError on inconsistent rows."""
    target = str(tmpdir.mkdir('dir').join('test.txt'))

    # the file does not exist yet, so checkFile should refuse it
    with pytest.raises(argparse.ArgumentTypeError):
        helper.checkFile(target)

    # well formed content: quotes, padding spaces and blank-ish lines
    with helper.openNormalOrGz(target, 'w') as out:
        out.write("1,'2',3 \n \n \na,\"bb\",ccc\n 2,3,4 ")
    expectedRows = [['1', '2', '3'], ['a', 'bb', 'ccc'], ["2", "3", "4"]]
    assert (helper.readSimpleCsv(target) == expectedRows)

    # a four-field row among three-field rows must be rejected
    with helper.openNormalOrGz(target, 'w') as out:
        out.write("1,2,3\n\n \na,bb,ccc,d\n 2,3,4 ")
    with pytest.raises(ValueError):
        helper.readSimpleCsv(target)
def test_closeFiles(tmpdir):
    """Verify helper.closeFiles leaves every handle closed for list and dict input."""
    base = tmpdir.mkdir('dir')
    targets = [base.join('test' + str(idx) + '.txt') for idx in range(10)]

    def assertAllClosed(handles):
        # writing to a closed file raises ValueError
        for handle in handles:
            with pytest.raises(ValueError):
                handle.write("X")

    # list input
    opened = [helper.openNormalOrGz(str(target), 'w') for target in targets]
    helper.closeFiles(opened)
    assertAllClosed(opened)

    # dict input: the handles are the values
    opened = [helper.openNormalOrGz(str(target), 'w') for target in targets]
    byIndex = dict(zip(range(10), opened))
    helper.closeFiles(byIndex)
    assertAllClosed(byIndex.values())
def __init__(self, fastqFiles, patterns):
    """Open every fastq file and wrap each handle in a fastq parser.

    Args:
        fastqFiles: file names of the fastq files; each is opened with
            ``helper.openNormalOrGz``.
        patterns: the read-name patterns to filter on; stored unchanged.
    """
    self.fastqFiles = fastqFiles
    self.patterns = patterns
    # counters start at zero; presumably updated while iterating (not shown here)
    self.nGood = 0
    self.nBad = 0

    handles = []
    for fastqFile in self.fastqFiles:
        handles.append(helper.openNormalOrGz(fastqFile))
    self.fastqHandles = handles

    parsers = []
    for handle in self.fastqHandles:
        parsers.append(Bio.SeqIO.QualityIO.FastqGeneralIterator(handle))
    self.fastqs = parsers
def __init__(self, fastqFile, minLength=10):
    """Open a single fastq file and attach a fastq parser to it.

    Args:
        fastqFile: file name of the fastq file, opened with
            ``helper.openNormalOrGz``.
        minLength: length cutoff stored on the instance (default 10); how it
            is applied is not visible in this method.
    """
    self.fastqFile = fastqFile
    self.minLength = minLength
    # counters start at zero; presumably updated while iterating (not shown here)
    self.nGood = 0
    self.nBad = 0

    handle = helper.openNormalOrGz(self.fastqFile)
    self.fastqHandle = handle
    self.fastq = Bio.SeqIO.QualityIO.FastqGeneralIterator(handle)
def main(argv=None):
    """Convert the alignments in a bam file into an aligned fasta on standard out.

    The reference is printed first, followed by one name line and one padded
    sequence line per alignment. Every sequence is padded with ``padRead``
    using the longest insertion seen at each position.

    Args:
        argv: list of command line arguments. ``None`` lets argparse fall
            back to ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="A program to convert a bam file into an aligned fasta file. The command generates fasta formatted output (two lines for each sequence: a name line prepended by > and a line containing the aligned sequence) to standard out.")
    parser.add_argument('bamFile', help='a bam file containing the alignment',type=helper.checkFile)
    parser.add_argument("-s","--refseq", help="fasta file giving the reference sequence of interest",type=helper.checkFile,required=True)
    parser.add_argument("-q","--minQuality", help="don't count alignments with a mapping quality less than this", type=int,default=0)
    parser.add_argument("-v","--verbose", help="increase output verbosity to stderr", action="store_true")
    parser.add_argument("-r","--region", help="the region to pull reads from (note that the underlying pysam does not like single base regions like ch1:25. These instead be specified as chr1:25-25.)",default=None)
    parser.add_argument("-e","--endSpan", help="ignore spans of matches at the start or end of a read less than this cutoff",default=0,type=int)
    args=parser.parse_args(argv)

    if args.verbose:
        sys.stderr.write("Arguments: \n")
        for key, value in vars(args).items():
            sys.stderr.write(" "+key+": "+str(value)+'\n')

    # getRefFromFasta returns the region to use together with the reference sequence
    with helper.openNormalOrGz(args.refseq) as fasta:
        args.region,ref=getRefFromFasta(helper.readSimpleFasta(fasta),args.region)

    # The alignments are needed twice (insertion scan, then output), so
    # materialize them once.
    aligns=list(getAlignsInFile(args.bamFile,args.region,args.minQuality,args.endSpan))

    # longest insertion seen at each position across all alignments
    maxInserts={}
    for align in aligns:
        for insertion in align['insertions']:
            pos=insertion[0]
            length=len(insertion[1])
            maxInserts[pos]=max(length,maxInserts.get(pos,length))

    nChar=len(ref)
    print('>'+args.region)
    refPad=padRead(ref,0,nChar,[],maxInserts)
    print(refPad)
    for align in aligns:
        padded=padRead(align['seq'],align['start'],nChar,align['insertions'],maxInserts)
        print('>'+align['name'])
        print(padded)
def countKmersInFile(args):
    """Count the k-mers in one fastq file.

    Args:
        args: indexable pair; ``args[0]`` is the fastq file name and
            ``args[1]`` is the k handed to ``countKmersInReads``. Presumably
            packed into one argument so the function can be mapped over a
            list of jobs; confirm against the caller.

    Returns:
        Whatever ``countKmersInReads`` returns for the reads in the file.
    """
    path = args[0]
    kmerSize = args[1]
    sys.stderr.write('Working on file %s\n' % path)
    with helper.openNormalOrGz(path) as handle:
        reads = Bio.SeqIO.QualityIO.FastqGeneralIterator(handle)
        # count while the handle is still open; the iterator reads lazily
        counts = countKmersInReads(reads, kmerSize)
    return (counts)
def test_main(capsys,tmpdir):
    """Exercise removereads.main: usage/help text, argument validation and filtered output."""

    def readLines(path):
        # Read a (possibly gzipped) file into newline-stripped lines and
        # close the handle afterwards.
        with helper.openNormalOrGz(str(path)) as handle:
            return [x.rstrip('\n') for x in handle.readlines()]

    def assertStartsWith(observed,expected):
        # zip() stops at the shorter input, so a missing or truncated output
        # would pass silently. Compare an explicit prefix instead.
        assert list(observed)[:len(expected)]==expected

    with pytest.raises(SystemExit):
        removereads.main()
    out, err=capsys.readouterr()
    assert 'usage' in err
    with pytest.raises(SystemExit):
        removereads.main(['-h'])
    out, err=capsys.readouterr()
    assert 'usage' in out

    d = tmpdir.mkdir('dir')
    p = d.join('test.fastq')
    p.write("@seq1\nAAA\n+\n(((\n@seq2\nTT\n+\n((\n@seq3\nT\n+\n(\n")
    p2 = d.join('test2.fastq')
    p2.write("@seq1z\nTTT\n+\n(((\n@seq2z\nTT\n+\n((\n@seq3\nT\n+\n(\n")
    f = d.join('test.filter')
    f.write("seq2\nseq3")
    o = d.join('test.out')
    o2 = d.join('test2.out')

    #mismatched numbers of input and output files
    with pytest.raises(argparse.ArgumentTypeError):
        removereads.main([str(p),'-f',str(f),'-o',str(o),str(o),'-d1'])
    with pytest.raises(argparse.ArgumentTypeError):
        removereads.main([str(p),str(p),'-f',str(f),'-o',str(o),'-d1'])

    #one input file
    removereads.main([str(p),'-f',str(f),'-o',str(o),'-d1'])
    out, err=capsys.readouterr()
    assertStartsWith(err.split('\n'),['.','Good reads: 1 Bad reads: 2'])
    assertStartsWith([x.rstrip('\n') for x in o.readlines()],['@seq1','AAA','+seq1','((('])

    #two input files
    removereads.main([str(p),str(p2),'-f',str(f),'-o',str(o),str(o2),'-d1'])
    out, err=capsys.readouterr()
    assertStartsWith(err.split('\n'),['.','Good reads: 1 Bad reads: 2'])
    assertStartsWith([x.rstrip('\n') for x in o.readlines()],['@seq1','AAA','+seq1','((('])
    assertStartsWith([x.rstrip('\n') for x in o2.readlines()],['@seq1z','TTT','+seq1z','((('])

    #default output names are written to the working directory; restore the
    #original directory afterwards so later tests are not affected
    startDir=os.getcwd()
    os.chdir(str(d))
    try:
        removereads.main([str(p),str(p2),'-f',str(f),'-d1'])
        out, err=capsys.readouterr()
        assertStartsWith(err.split('\n'),['.','Good reads: 1 Bad reads: 2'])
        assertStartsWith(readLines('out1.fastq.gz'),['@seq1','AAA','+seq1','((('])
        assertStartsWith(readLines('out2.fastq.gz'),['@seq1z','TTT','+seq1z','((('])
    finally:
        os.chdir(startDir)
def __init__(self, fastqFiles, patterns, keep=False):
    """Open every fastq file and wrap each handle in a fastq parser.

    Args:
        fastqFiles: file names of the fastq files; each is opened with
            ``helper.openNormalOrGz``.
        patterns: the read-name patterns to filter on; stored unchanged.
        keep: flag stored on the instance (default False); how it is applied
            is not visible in this method.
    """
    self.keep = keep
    self.fastqFiles = fastqFiles
    self.patterns = patterns
    # counters start at zero; presumably updated while iterating (not shown here)
    self.nGood = 0
    self.nBad = 0

    # one open handle per input file, then one fastq iterator per handle
    self.fastqHandles = list(map(helper.openNormalOrGz, self.fastqFiles))
    self.fastqs = list(
        map(Bio.SeqIO.QualityIO.FastqGeneralIterator, self.fastqHandles))
def __init__(self, fastqFiles, indexFiles, barcodes, returnUnassigned=False):
    """Open the read and index fastq files and attach a fastq parser to each.

    Args:
        fastqFiles: file names of the read fastq files; each is opened with
            ``helper.openNormalOrGz``.
        indexFiles: file names of the index fastq files; opened the same way.
        barcodes: the barcodes to work with; stored unchanged.
        returnUnassigned: flag stored on the instance (default False); how
            it is applied is not visible in this method.
    """
    self.returnUnassigned = returnUnassigned
    self.fastqFiles = fastqFiles
    self.indexFiles = indexFiles
    self.barcodes = barcodes
    # counters start at zero; presumably updated while iterating (not shown here)
    self.nGood = 0
    self.nBad = 0

    # open the read files first, then the index files
    readHandles = []
    for fastqFile in self.fastqFiles:
        readHandles.append(helper.openNormalOrGz(fastqFile))
    self.fastqHandles = readHandles

    indexHandles = []
    for indexFile in self.indexFiles:
        indexHandles.append(helper.openNormalOrGz(indexFile))
    self.indexHandles = indexHandles

    # one Biopython fastq iterator per handle
    makeParser = Bio.SeqIO.QualityIO.FastqGeneralIterator
    self.fastqs = [makeParser(handle) for handle in self.fastqHandles]
    self.indexs = [makeParser(handle) for handle in self.indexHandles]
def __init__(self, fastqFile, minLength=10, removeN=False, suppressBad=False):
    """Open a single fastq file and choose the parser used to read it.

    Args:
        fastqFile: file name of the fastq file, opened with
            ``helper.openNormalOrGz``.
        minLength: length cutoff stored on the instance (default 10).
        removeN: flag stored on the instance (default False).
        suppressBad: when true the file is read with
            ``helper.readSimpleFastq`` instead of Biopython's
            ``FastqGeneralIterator``.

    How ``minLength`` and ``removeN`` are applied is not visible in this
    method.
    """
    self.fastqFile = fastqFile
    self.minLength = minLength
    self.removeN = removeN
    self.suppressBad = suppressBad
    # counters start at zero; presumably updated while iterating (not shown here)
    self.nGood = 0
    self.nBad = 0

    self.fastqHandle = helper.openNormalOrGz(self.fastqFile)
    # pick the reader, then apply it to the open handle
    if suppressBad:
        makeReader = helper.readSimpleFastq
    else:
        makeReader = Bio.SeqIO.QualityIO.FastqGeneralIterator
    self.fastq = makeReader(self.fastqHandle)

    # bound search for the first character that is not A, C, T or G
    nonBase = re.compile('[^ACTG]')
    self.nSearch = nonBase.search
    self.badList = []
def main(argv=None):
    """Split fastq reads into one set of output files per barcoded sample.

    Reads the sample/barcode table, opens ``{outputPath}/{sample}_{n}.fastq.gz``
    for every sample and input fastq file, and writes each set of reads
    yielded by ``barcodeFastqIter`` to the files of the matching barcode.

    Args:
        argv: list of command line arguments. ``None`` lets argparse fall
            back to ``sys.argv``.

    Raises:
        argparse.ArgumentTypeError: if the barcode file is empty, the number
            of index files does not match the barcode columns, or sample
            names or barcode sets are duplicated.
    """
    parser = argparse.ArgumentParser(description="A program to take a list of barcodes and one or more fastq reads and one or two index reads and output reads matching the barcodes into a seperate file for each barcode. The script takes read files and index files where the reads and indexs are in the same order and outputs reads which match the appropriate barcodes into separate files.")
    parser.add_argument('fastqFiles', help='a fastq file(s) (potentially gzipped) containing the sequence reads',type=helper.checkFile,nargs='+')
    parser.add_argument('-i','--indexFiles', help='a fastq file(s) (potentially gzipped) containing the index reads',type=helper.checkFile,nargs='+')
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument('-b','--barcodeFile', help='a file (potentially gzipped) file containing comma separated sample names, first barcode and second barcode (with no header and no commas in the sample names)',type=helper.checkFile,required=True)
    parser.add_argument('-o','--outputPath', help='a string giving the desired output directory',type=helper.checkDir,default='.')
    args=parser.parse_args(argv)

    nFiles=len(args.fastqFiles)
    # -i is optional for argparse and defaults to None; treat that as "no
    # index files" so the mismatch is reported instead of crashing in len()
    indexFiles=args.indexFiles if args.indexFiles is not None else []
    barcodes=helper.readSimpleCsv(args.barcodeFile)
    if not barcodes:
        raise argparse.ArgumentTypeError("No barcodes found in the barcodeFile")
    if len(indexFiles)!=len(barcodes[0])-1:
        raise argparse.ArgumentTypeError("Number of index files and index columns in the barcodeFile do not agree")
    samples=[xx[0] for xx in barcodes]
    if len(set(samples))!=len(samples):
        raise argparse.ArgumentTypeError("Two or more samples share the same name")
    outputFiles=[[os.path.join(args.outputPath,xx)+"_"+str(ii+1)+".fastq.gz" for ii in range(nFiles)] for xx in samples]
    bars=[tuple(xx[1:]) for xx in barcodes]
    if len(set(bars))!=len(bars):
        raise argparse.ArgumentTypeError("Two or more samples share the same set of barcodes")

    # barcode tuple -> list of output handles (one per input fastq file)
    outHandles={}
    try:
        # Opened one at a time so a failure part way through still closes
        # the handles that were already opened.
        for bar,paths in zip(bars,outputFiles):
            outHandles[bar]=[]
            for path in paths:
                outHandles[bar].append(helper.openNormalOrGz(path,'w'))

        with barcodeFastqIter(args.fastqFiles,indexFiles,bars) as fastqIter:
            for currentReads,bar in fastqIter:
                for read,outFile in zip(currentReads,outHandles[bar]):
                    helper.writeFastqRead(outFile,read)
                if args.dots>0 and fastqIter.nGood % args.dots==0:
                    sys.stderr.write('.')
                    # flush so the progress dots show up as they are written
                    sys.stderr.flush()

            if args.dots>0:
                sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
    finally:
        # always release the output files, even if reading or writing failed
        for handles in outHandles.values():
            helper.closeFiles(handles)
def test_main(capsys, tmpdir, bamFile):
    """Exercise bamtoalign.main: usage/help text and the aligned fasta printed for the bam fixture."""
    # no arguments: argparse exits and prints usage to stderr
    with pytest.raises(SystemExit):
        bamtoalign.main()
    stdout, stderr = capsys.readouterr()
    assert 'usage' in stderr

    # -h: argparse exits and prints usage to stdout
    with pytest.raises(SystemExit):
        bamtoalign.main(['-h'])
    stdout, stderr = capsys.readouterr()
    assert 'usage' in stdout

    refPath = tmpdir.mkdir('dir').join('ref.fa')
    with helper.openNormalOrGz(str(refPath), 'w') as refHandle:
        refHandle.write(">ref\nAAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGGAAAAAAAAAA")

    bamtoalign.main(['-v', '-s', str(refPath), str(bamFile)])
    stdout, stderr = capsys.readouterr()
    # -v echoes the parsed arguments to stderr
    assert 'Arguments' in stderr

    expectedLines = [
        '>ref', 'AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGG--GGGGGGGGAA-AAA-AAAAA',
        '>read3', '----------------------------GGGG--AAAAAT--------------',
        '>read2', '--------------------------------A---AAAATTTTT---------',
        '>read1', '--------------------------------CCAAAAACCCCC---GGCC---'
    ]
    observedLines = stdout.split('\n')
    # NOTE(review): zip stops at the shorter sequence, so missing output
    # lines go unchecked here.
    for observed, expected in zip(observedLines, expectedLines):
        assert observed == expected
def main(argv=None):
    """Filter (or keep) reads by name from one or more synchronized fastq files.

    Parses the command line, loads the read names listed in the filter file
    and writes every set of reads yielded by ``filterFastqIter`` to the
    output files (one output file per input fastq file).

    Args:
        argv: list of command line arguments. ``None`` lets argparse fall
            back to ``sys.argv``.

    Raises:
        argparse.ArgumentTypeError: if the number of output files does not
            match the number of input fastq files.
    """
    parser = argparse.ArgumentParser(
        description=
        "A program to filter reads by name from a single/set of fastq file(s). The script looks for reads which have a name line where the string before a space exactly matches a pattern. If multiple files are passed in, then they are processed in sync and if any name matches that read is discarded (or kept) from all files."
    )
    parser.add_argument(
        'fastqFiles',
        help=
        'a (potentially gzipped) fastq file(s) containing the reads with the order of reads the same in all files',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        "-d",
        "--dots",
        help=
        "output dot to stderr every X reads. Input a negative number to suppress output (default:-1)",
        default=-1,
        type=int)
    parser.add_argument(
        '-o',
        '--outputFiles',
        help=
        'an output file(s) (one for each input fastq file). default(out1.fastq.gz ... outn.fastq.gz where n is the number of fastqFiles)',
        type=str,
        nargs='*')
    parser.add_argument(
        '-f',
        '--filterFile',
        help=
        'a (potentially gzipped) file containing the names of reads to be filtered one per line',
        type=helper.checkFile,
        required=True)
    parser.add_argument(
        '-k',
        '--keep',
        help=
        'keep reads matching the filter file and filter all nonmatching reads',
        action='store_true')
    args = parser.parse_args(argv)

    if args.outputFiles is None:
        outputFiles = [
            'out' + str(ii) + '.fastq.gz'
            for ii in range(1, len(args.fastqFiles) + 1)
        ]
    else:
        outputFiles = args.outputFiles

    if len(outputFiles) != len(args.fastqFiles):
        raise argparse.ArgumentTypeError(
            "Input and output file numbers do not match")

    # Read the names up front inside a context manager so the filter file
    # handle is closed instead of being left to the garbage collector.
    with helper.openNormalOrGz(args.filterFile) as filterHandle:
        patterns = set(line.strip() for line in filterHandle)

    outHandles = []
    try:
        # Opened one at a time so a failure part way through still closes
        # the handles that were already opened.
        for outputFile in outputFiles:
            outHandles.append(helper.openNormalOrGz(outputFile, 'w'))

        with filterFastqIter(args.fastqFiles, patterns,
                             args.keep) as fastqIter:
            for currentReads in fastqIter:
                for read, outFile in zip(currentReads, outHandles):
                    helper.writeFastqRead(outFile, read)
                if args.dots > 0 and fastqIter.nGood % args.dots == 0:
                    sys.stderr.write('.')
                    sys.stderr.flush()

            if args.dots > 0:
                sys.stderr.write("\nGood reads: " + str(fastqIter.nGood) +
                                 " Bad reads: " + str(fastqIter.nBad) + "\n")
    finally:
        # always release the output files, even if reading or writing failed
        helper.closeFiles(outHandles)
def main(argv=None):
    """Split fastq reads into per-sample files, optionally saving unassigned reads.

    Reads the sample/barcode table, opens ``{outputPath}/{sample}_{n}.fastq.gz``
    for every sample and input fastq file, and writes each set of reads
    yielded by ``barcodeFastqIter`` to the files of the matching barcode.
    With ``--unassigned`` the reads that match no barcode, and their index
    reads, go to the ``__UNASSIGNED__R#`` / ``__UNASSIGNED__I#`` files.

    Args:
        argv: list of command line arguments. ``None`` lets argparse fall
            back to ``sys.argv``.

    Raises:
        argparse.ArgumentTypeError: if the barcode file is empty, the number
            of index files does not match the barcode columns, sample names
            or barcode sets are duplicated, or a sample name clashes with
            the unassigned output.
    """
    parser = argparse.ArgumentParser(
        description=
        "A program to take a list of barcodes and one or more fastq reads and one or two index reads and output reads matching the barcodes into a seperate file for each barcode. The script takes read files and index files where the reads and indexs are in the same order and outputs reads which match the appropriate barcodes into separate files."
    )
    parser.add_argument(
        'fastqFiles',
        help=
        'a fastq file(s) (potentially gzipped) containing the sequence reads',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        '-i',
        '--indexFiles',
        help='a fastq file(s) (potentially gzipped) containing the index reads',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        "-d",
        "--dots",
        help=
        "output dot to stderr every X reads. Input a negative number to suppress output (default:-1)",
        default=-1,
        type=int)
    parser.add_argument(
        '-b',
        '--barcodeFile',
        help=
        'a (potentially gzipped) file containing comma separated sample names, first barcode and second barcode (with no header and no commas in the sample names)',
        type=helper.checkFile,
        required=True)
    parser.add_argument('-o',
                        '--outputPath',
                        help='a string giving the desired output directory',
                        type=helper.checkDir,
                        default='.')
    parser.add_argument(
        '-u',
        '--unassigned',
        help=
        'if set then store unassigned reads to {outputPath}/__UNASSIGNED__R#.fastq.gz with their corresponding barcodes in {outputPath}/__UNASSIGNED__I#.fastq.gz',
        action='store_true')
    args = parser.parse_args(argv)

    nFiles = len(args.fastqFiles)
    # -i is optional for argparse and defaults to None; treat that as "no
    # index files" so the mismatch is reported instead of crashing in len()
    indexFiles = args.indexFiles if args.indexFiles is not None else []
    nIndexs = len(indexFiles)
    barcodes = helper.readSimpleCsv(args.barcodeFile)
    if not barcodes:
        raise argparse.ArgumentTypeError(
            "No barcodes found in the barcodeFile")
    if nIndexs != len(barcodes[0]) - 1:
        raise argparse.ArgumentTypeError(
            "Number of index files and index columns in the barcodeFile do not agree"
        )
    samples = [xx[0] for xx in barcodes]
    if len(set(samples)) != len(samples):
        raise argparse.ArgumentTypeError(
            "Two or more samples share the same name")
    outputFiles = [[
        os.path.join(args.outputPath, xx) + "_" + str(ii + 1) + ".fastq.gz"
        for ii in range(nFiles)
    ] for xx in samples]
    bars = [tuple(xx[1:]) for xx in barcodes]
    if len(set(bars)) != len(bars):
        raise argparse.ArgumentTypeError(
            "Two or more samples share the same set of barcodes")

    # Validate the unassigned options before any output file is created so a
    # rejected run leaves nothing behind.
    badIndexFiles = []
    badReadFiles = []
    if args.unassigned:
        if any(xx in ('__UNASSIGNED__R', '__UNASSIGNED__I')
               for xx in samples):
            raise argparse.ArgumentTypeError(
                "Sample named __UNASSIGNED__ clashes with unassigned output. Please rename"
            )
        badIndexFiles = [
            os.path.join(args.outputPath, "__UNASSIGNED__I") + str(ii + 1) +
            ".fastq.gz" for ii in range(nIndexs)
        ]
        badReadFiles = [
            os.path.join(args.outputPath, "__UNASSIGNED__R") + str(ii + 1) +
            ".fastq.gz" for ii in range(nFiles)
        ]

    # barcode tuple -> list of output handles (one per input fastq file)
    outHandles = {}
    badIndexHandles = []
    badReadHandles = []
    try:
        # Opened one at a time so a failure part way through still closes
        # the handles that were already opened.
        for bar, paths in zip(bars, outputFiles):
            outHandles[bar] = []
            for path in paths:
                outHandles[bar].append(helper.openNormalOrGz(path, 'w'))
        for path in badIndexFiles:
            badIndexHandles.append(helper.openNormalOrGz(path, 'w'))
        for path in badReadFiles:
            badReadHandles.append(helper.openNormalOrGz(path, 'w'))

        with barcodeFastqIter(args.fastqFiles, indexFiles, bars,
                              args.unassigned) as fastqIter:
            for currentReads, bar, assigned, fullBar in fastqIter:
                if args.unassigned and not assigned:
                    for read, outFile in zip(currentReads, badReadHandles):
                        helper.writeFastqRead(outFile, read)
                    for read, outFile in zip(fullBar, badIndexHandles):
                        helper.writeFastqRead(outFile, read)
                else:
                    for read, outFile in zip(currentReads, outHandles[bar]):
                        helper.writeFastqRead(outFile, read)
                    if args.dots > 0 and fastqIter.nGood % args.dots == 0:
                        sys.stderr.write('.')
                        sys.stderr.flush()

            if args.dots > 0:
                sys.stderr.write("\nReads assigned to barcode: " +
                                 str(fastqIter.nGood) +
                                 " Unassigned reads: " +
                                 str(fastqIter.nBad) + "\n")
    finally:
        # Release every output, including the unassigned files, which were
        # previously never closed.
        for handles in outHandles.values():
            helper.closeFiles(handles)
        helper.closeFiles(badIndexHandles)
        helper.closeFiles(badReadHandles)
def test_main(capsys,tmpdir):
    """Exercise splitbarcodes.main: usage/help text, argument validation, demultiplexed and unassigned output."""

    def readLines(path):
        # Read a (possibly gzipped) file into newline-stripped lines and
        # close the handle afterwards.
        with helper.openNormalOrGz(str(path)) as handle:
            return [x.rstrip('\n') for x in handle.readlines()]

    def assertStartsWith(observed,expected):
        # zip() stops at the shorter input, so a missing or truncated output
        # would pass silently. Compare an explicit prefix instead.
        assert list(observed)[:len(expected)]==expected

    with pytest.raises(SystemExit):
        splitbarcodes.main()
    out, err=capsys.readouterr()
    assert 'usage' in err
    with pytest.raises(SystemExit):
        splitbarcodes.main(['-h'])
    out, err=capsys.readouterr()
    assert 'usage' in out

    d = tmpdir.mkdir('dir')
    p1 = d.join('test.fastq')
    p1.write("@seq1\nAAA\n+\n(((\n@seq2\nTT\n+\n(A\n@seq3\nT\n+\n(\n")
    p2 = d.join('test2.fastq')
    p2.write("@seq1z\nTTT\n+\n(((\n@seq2z\nTT\n+\n((\n@seq3\nATC\n+\n([(\n")
    b = d.join('test.filter')
    o = d.join('test_1.fastq.gz')
    o2 = d.join('test_2.fastq.gz')

    #duplicate barcode
    b.write("test,AAA\ntest2,AAA\ntest3,AAT")
    with pytest.raises(argparse.ArgumentTypeError):
        splitbarcodes.main([str(p1),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1'])
    #duplicate sample name
    b.write("test,AAA\ntest,AAC\ntest3,AAT")
    with pytest.raises(argparse.ArgumentTypeError):
        splitbarcodes.main([str(p1),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1'])
    #sample named __UNASSIGNED__R
    b.write("__UNASSIGNED__R,AAA")
    with pytest.raises(argparse.ArgumentTypeError):
        splitbarcodes.main([str(p1),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1','-u'])
    b.write("__UNASSIGNED__I,AAA")
    with pytest.raises(argparse.ArgumentTypeError):
        splitbarcodes.main([str(p1),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1','-u'])
    b.write("test,AAA")
    #two index files, one barcode
    with pytest.raises(argparse.ArgumentTypeError):
        splitbarcodes.main([str(p1),'-i',str(p1),str(p1),'-b',str(b),'-o',str(d),'-d1'])

    #one read file, one index file
    splitbarcodes.main([str(p1),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1'])
    out, err=capsys.readouterr()
    assertStartsWith(err.split('\n'),['.','Reads assigned to barcode: 1 Unassigned reads: 2'])
    assertStartsWith(readLines(o),['@seq1','AAA','+seq1','((('])

    #two read files, one index file
    splitbarcodes.main([str(p1),str(p2),'-i',str(p1),'-b',str(b),'-o',str(d),'-d1'])
    out, err=capsys.readouterr()
    assertStartsWith(err.split('\n'),['.','Reads assigned to barcode: 1 Unassigned reads: 2'])
    assertStartsWith(readLines(o),['@seq1','AAA','+seq1','((('])
    assertStartsWith(readLines(o2),['@seq1z','TTT','+seq1z','((('])

    #two read, two index files. test unassigned
    b.write("test,T,T\ntest2,A,T")
    i1 = d.join('test3.fastq')
    i1.write("@seq1\nT\n+\n(\n@seq2\nA\n+\n(\n@seq3\nT\n+\n(\n")
    i2 = d.join('test4.fastq')
    i2.write("@seq1z\nT\n+\n(\n@seq2z\nT\n+\n(\n@seq3\nTT\n+\n(!\n")
    splitbarcodes.main([str(p1),str(p2),'-i',str(i1),str(i2),'-b',str(b),'-o',str(d),'-d1','-u'])
    out, err=capsys.readouterr()
    assertStartsWith(err.split('\n'),['..','Reads assigned to barcode: 2 Unassigned reads: 1'])
    #seq1 carries index (T,T) so it belongs to sample "test"; seq2 carries
    #(A,T) so it belongs to "test2". The earlier zip() comparison listed seq2
    #under "test" and only passed because zip stopped after the seq1 lines.
    assertStartsWith(readLines(o),['@seq1','AAA','+seq1','((('])
    assertStartsWith(readLines(o2),['@seq1z','TTT','+seq1z','((('])
    assertStartsWith(readLines(d.join('test2_1.fastq.gz')),['@seq2','TT','+seq2','(A'])
    assertStartsWith(readLines(d.join('test2_2.fastq.gz')),['@seq2z','TT','+seq2z','(('])

    #test the unassigned
    ur1=d.join('__UNASSIGNED__R1.fastq.gz')
    ur2=d.join('__UNASSIGNED__R2.fastq.gz')
    ui1=d.join('__UNASSIGNED__I1.fastq.gz')
    ui2=d.join('__UNASSIGNED__I2.fastq.gz')
    assert len(readLines(ur1))==4
    assert len(readLines(ur2))==4
    assert len(readLines(ui1))==4
    assert len(readLines(ui2))==4
    assertStartsWith(readLines(ur1),['@seq3','T','+seq3','('])
    assertStartsWith(readLines(ur2),['@seq3','ATC','+seq3','([('])
    assertStartsWith(readLines(ui1),['@seq3','T','+seq3','('])
    assertStartsWith(readLines(ui2),['@seq3','TT','+seq3','(!'])
def test_main(capsys, tmpdir):
    """Exercise removereads.main: usage/help text, argument validation, filtering and --keep."""

    def readLines(path):
        # Read a (possibly gzipped) file into newline-stripped lines and
        # close the handle afterwards.
        with helper.openNormalOrGz(str(path)) as handle:
            return [x.rstrip('\n') for x in handle.readlines()]

    def assertStartsWith(observed, expected):
        # zip() stops at the shorter input, so a missing or truncated output
        # would pass silently. Compare an explicit prefix instead.
        assert list(observed)[:len(expected)] == expected

    with pytest.raises(SystemExit):
        removereads.main()
    out, err = capsys.readouterr()
    assert 'usage' in err
    with pytest.raises(SystemExit):
        removereads.main(['-h'])
    out, err = capsys.readouterr()
    assert 'usage' in out

    d = tmpdir.mkdir('dir')
    p = d.join('test.fastq')
    p.write("@seq1\nAAA\n+\n(((\n@seq2\nTT\n+\n((\n@seq3\nT\n+\n(\n")
    p2 = d.join('test2.fastq')
    p2.write("@seq1z\nTTT\n+\n(((\n@seq2z\nTT\n+\n((\n@seq3\nT\n+\n!\n")
    f = d.join('test.filter')
    f.write("seq2\nseq3")
    o = d.join('test.out')
    o2 = d.join('test2.out')

    # mismatched numbers of input and output files
    with pytest.raises(argparse.ArgumentTypeError):
        removereads.main([str(p), '-f', str(f), '-o', str(o), str(o), '-d1'])
    with pytest.raises(argparse.ArgumentTypeError):
        removereads.main([str(p), str(p), '-f', str(f), '-o', str(o), '-d1'])
    # missing required filter file
    with pytest.raises(SystemExit):
        removereads.main([str(p), '-o', str(o), '-d1'])
    #clear
    out, err = capsys.readouterr()

    # one input file
    removereads.main([str(p), '-f', str(f), '-o', str(o), '-d1'])
    out, err = capsys.readouterr()
    assertStartsWith(err.split('\n'), ['.', 'Good reads: 1 Bad reads: 2'])
    assertStartsWith([x.rstrip('\n') for x in o.readlines()],
                     ['@seq1', 'AAA', '+seq1', '((('])

    # two input files
    removereads.main(
        [str(p), str(p2), '-f', str(f), '-o', str(o), str(o2), '-d1'])
    out, err = capsys.readouterr()
    assertStartsWith(err.split('\n'), ['.', 'Good reads: 1 Bad reads: 2'])
    assertStartsWith([x.rstrip('\n') for x in o.readlines()],
                     ['@seq1', 'AAA', '+seq1', '((('])
    assertStartsWith([x.rstrip('\n') for x in o2.readlines()],
                     ['@seq1z', 'TTT', '+seq1z', '((('])

    # Default output names are written to the working directory; restore the
    # original directory afterwards so later tests are not affected.
    startDir = os.getcwd()
    os.chdir(str(d))
    try:
        removereads.main([str(p), str(p2), '-f', str(f), '-d1'])
        out, err = capsys.readouterr()
        assertStartsWith(err.split('\n'), ['.', 'Good reads: 1 Bad reads: 2'])
        assertStartsWith(readLines('out1.fastq.gz'),
                         ['@seq1', 'AAA', '+seq1', '((('])
        assertStartsWith(readLines('out2.fastq.gz'),
                         ['@seq1z', 'TTT', '+seq1z', '((('])

        # --keep inverts the filter: matching reads are kept
        removereads.main([str(p), str(p2), '-f', str(f), '-d1', '-k'])
        out, err = capsys.readouterr()
        assertStartsWith(err.split('\n'),
                         ['..', 'Good reads: 2 Bad reads: 1'])
        assertStartsWith(
            readLines('out1.fastq.gz'),
            ['@seq2', 'TT', '+seq2', '((', '@seq3', 'T', '+seq3', '('])
        assertStartsWith(
            readLines('out2.fastq.gz'),
            ['@seq2z', 'TT', '+seq2z', '((', '@seq3', 'T', '+seq3', '!'])
    finally:
        os.chdir(startDir)