Example #1
0
def main(argv=None):
    """Command-line entry point: filter reads by name from synchronised fastq files.

    Parses ``argv`` with argparse, opens one output file per input fastq file
    and writes every set of reads yielded by ``filterFastqIter`` to the
    corresponding outputs. When ``--dots`` is positive a dot is written to
    stderr whenever the iterator's ``nGood`` count is a multiple of it, and a
    summary of ``nGood``/``nBad`` is written at the end.

    Raises:
        argparse.ArgumentTypeError: if the number of output files differs from
            the number of input fastq files.
    """
    parser = argparse.ArgumentParser(description="A program to filter reads by name from a single/set of fastq file(s). The script looks for reads which have a name line where the string before a space exactly matches a pattern. If multiple files are passed in, then they are processed in sync and if any name matches that read is discarded from all files.")
    parser.add_argument('fastqFiles', help='a fastq (potentially gzipped) file(s) containing the reads with the order of reads the same in all files',type=helper.checkFile,nargs='+')
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument('-f','--filterFile', help='a file (potentially gzipped) file containing the names of reads to be filtered one per line',type=helper.checkFile,required=True)
    parser.add_argument('-o','--outputFiles', help='an output file(s) (one for each input fastq file). default(out1.fastq.gz ... outn.fastq.gz where n is the number of fastqFiles)',type=str,nargs='*')

    args=parser.parse_args(argv)
    if args.outputFiles is None:
        # no -o given: fall back to out1.fastq.gz ... outN.fastq.gz
        outputFiles=['out'+str(ii)+'.fastq.gz' for ii in range(1,len(args.fastqFiles)+1)]
    else:
        outputFiles=args.outputFiles
    if len(outputFiles)!=len(args.fastqFiles):
        raise argparse.ArgumentTypeError("Input and output file numbers do not match")
    outHandles=[helper.openNormalOrGz(x,'w') for x in outputFiles]

    # from here on the outputs are open, so make sure they get closed even on error
    try:
        # close the filter file as soon as the names have been collected
        with helper.openNormalOrGz(args.filterFile) as filterHandle:
            patterns=set(line.strip() for line in filterHandle)

        with filterFastqIter(args.fastqFiles,patterns) as fastqIter:
            for currentReads in fastqIter:
                for read,outFile in zip(currentReads,outHandles):
                    helper.writeFastqRead(outFile,read)
                if args.dots>0 and fastqIter.nGood % args.dots==0:
                    sys.stderr.write('.')
                    # flush so progress shows up immediately on buffered stderr
                    sys.stderr.flush()

            if args.dots>0:
                sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
    finally:
        helper.closeFiles(outHandles)
Example #2
0
def test_main(capsys,tmpdir):
    """Run splitbarcodes.main through usage output, argument errors and barcode splitting."""
    def checkLeading(observed,expected):
        # zip() stops at the shorter sequence, so only the paired leading items are compared
        for got,want in zip(observed,expected):
            assert got==want

    def fileLines(path):
        # lines of the file at path (opened through the helper) without trailing newlines
        return [line.rstrip('\n') for line in helper.openNormalOrGz(str(path)).readlines()]

    def stderrLines():
        # drain the captured streams and return stderr split into lines
        captured=capsys.readouterr()
        return captured[1].split('\n')

    # no arguments: SystemExit with the usage message on stderr
    with pytest.raises(SystemExit):
        splitbarcodes.main()
    captured=capsys.readouterr()
    assert 'usage' in captured[1]
    # -h: SystemExit with the usage message on stdout
    with pytest.raises(SystemExit):
        splitbarcodes.main(['-h'])
    captured=capsys.readouterr()
    assert 'usage' in captured[0]

    workDir = tmpdir.mkdir('dir')
    reads1 = workDir.join('test.fastq')
    reads1.write("@seq1\nAAA\n+\n(((\n@seq2\nTT\n+\n(A\n@seq3\nT\n+\n(\n")
    reads2 = workDir.join('test2.fastq')
    reads2.write("@seq1z\nTTT\n+\n(((\n@seq2z\nTT\n+\n((\n@seq3\nT\n+\n(\n")
    barcodeFile = workDir.join('test.filter')
    sampleOut1 = workDir.join('test_1.fastq.gz')
    sampleOut2 = workDir.join('test_2.fastq.gz')

    def run(*leading):
        # every invocation shares the same barcode file, output directory and dot interval
        splitbarcodes.main(list(leading)+['-b',str(barcodeFile),'-o',str(workDir),'-d1'])

    #duplicate barcode, then duplicate sample name
    for badBarcodes in ("test,AAA\ntest2,AAA\ntest3,AAT","test,AAA\ntest,AAC\ntest3,AAT"):
        barcodeFile.write(badBarcodes)
        with pytest.raises(argparse.ArgumentTypeError):
            run(str(reads1),'-i',str(reads1))
    barcodeFile.write("test,AAA")
    #two index files, 1 barcode
    with pytest.raises(argparse.ArgumentTypeError):
        run(str(reads1),'-i',str(reads1),str(reads1))

    #one read file, one index file
    run(str(reads1),'-i',str(reads1))
    checkLeading(stderrLines(),['.','Good reads: 1 Bad reads: 2'])
    checkLeading(fileLines(sampleOut1),['@seq1','AAA','+seq1','((('])

    #two read files, one index file
    run(str(reads1),str(reads2),'-i',str(reads1))
    checkLeading(stderrLines(),['.','Good reads: 1 Bad reads: 2'])
    checkLeading(fileLines(sampleOut1),['@seq1','AAA','+seq1','((('])
    checkLeading(fileLines(sampleOut2),['@seq1z','TTT','+seq1z','((('])

    #two read files, two index files
    barcodeFile.write("test,T,T")
    index1 = workDir.join('test3.fastq')
    index1.write("@seq1\nT\n+\n(\n@seq2\nT\n+\n(\n@seq3\nT\n+\n(\n")
    index2 = workDir.join('test4.fastq')
    index2.write("@seq1z\nT\n+\n(\n@seq2z\nT\n+\n(\n@seq3\nTT\n+\n((\n")
    run(str(reads1),str(reads2),'-i',str(index1),str(index2))
    checkLeading(stderrLines(),['..','Good reads: 2 Bad reads: 1'])
    checkLeading(fileLines(sampleOut1),['@seq1','AAA','+seq1','(((','@seq2','TT','+seq2','(A'])
    checkLeading(fileLines(sampleOut2),['@seq1z','TTT','+seq1z','(((','@seq2z','TT','+seq2z','(('])
Example #3
0
 def __init__(self, fastqFiles,indexFiles,barcodes):
     """Store the inputs, open every read and index file and build one fastq parser per handle.

     fastqFiles and indexFiles are iterated and each entry is opened with
     helper.openNormalOrGz; barcodes is stored unchanged.
     """
     # counters start at zero (presumably assigned/unassigned read tallies -- confirm against the iterator)
     self.nGood = 0
     self.nBad = 0
     self.fastqFiles=fastqFiles
     self.indexFiles=indexFiles
     self.barcodes=barcodes
     # open the read files first, then the index files
     readHandles=[]
     for path in self.fastqFiles:
         readHandles.append(helper.openNormalOrGz(path))
     self.fastqHandles=readHandles
     idxHandles=[]
     for path in self.indexFiles:
         idxHandles.append(helper.openNormalOrGz(path))
     self.indexHandles=idxHandles
     # one FastqGeneralIterator per open handle, in the same order as the handles
     readParsers=[]
     for handle in self.fastqHandles:
         readParsers.append(Bio.SeqIO.QualityIO.FastqGeneralIterator(handle))
     self.fastqs=readParsers
     idxParsers=[]
     for handle in self.indexHandles:
         idxParsers.append(Bio.SeqIO.QualityIO.FastqGeneralIterator(handle))
     self.indexs=idxParsers
Example #4
0
def test_openGzOrNormal(tmpdir):
    """Write two reads to a .gz file with the helper functions and read them back.

    The lines read back are compared pairwise with the expected fastq lines.
    Note that zip() stops at the shorter input, so only paired lines are checked.
    """
    d = tmpdir.mkdir('dir')
    gzFile = d.join('test.gz')
    gz=helper.openNormalOrGz(str(gzFile),'w')
    helper.writeFastqRead(gz,['1','22','333'])
    helper.writeFastqRead(gz,['55555','666666','7777777'])
    helper.closeFiles([gz])
    pred=['@1','22','+1','333','@55555','666666','+55555','7777777']
    gz=helper.openNormalOrGz(str(gzFile))
    try:
        for x,y in zip(gz,[line+'\n' for line in pred]):
            assert x==y
    finally:
        # the read handle was previously left open
        helper.closeFiles([gz])
Example #5
0
def test_openGzOrNormal(tmpdir):
    """Round-trip two fastq reads through a .gz file using the helper module.

    Reads are written with helper.writeFastqRead, the file is reopened with
    helper.openNormalOrGz and its lines are compared pairwise with the
    expected lines. zip() stops at the shorter input, so only paired lines
    are checked.
    """
    d = tmpdir.mkdir('dir')
    gzFile = d.join('test.gz')
    gz = helper.openNormalOrGz(str(gzFile), 'w')
    helper.writeFastqRead(gz, ['1', '22', '333'])
    helper.writeFastqRead(gz, ['55555', '666666', '7777777'])
    helper.closeFiles([gz])
    pred = ['@1', '22', '+1', '333', '@55555', '666666', '+55555', '7777777']
    expected = [line + '\n' for line in pred]
    gz = helper.openNormalOrGz(str(gzFile))
    try:
        for x, y in zip(gz, expected):
            assert x == y
    finally:
        # close the reopened handle instead of leaking it
        helper.closeFiles([gz])
Example #6
0
def test_readSimpleCsv(tmpdir):
    """Check helper.checkFile on a missing path and helper.readSimpleCsv on good and ragged input."""
    csvPath = str(tmpdir.mkdir('dir').join('test.txt'))
    # nothing has been written to the path yet, so checkFile is expected to reject it
    with pytest.raises(argparse.ArgumentTypeError):
        helper.checkFile(csvPath)

    # quoted fields, padding whitespace and whitespace-only lines
    with helper.openNormalOrGz(csvPath,'w') as handle:
        handle.write("1,'2',3   \n  \n  \na,\"bb\",ccc\n  2,3,4  ")
    parsed=helper.readSimpleCsv(csvPath)
    assert parsed==[['1','2','3'],['a','bb','ccc'],["2","3","4"]]

    # one row has four fields while the others have three
    with helper.openNormalOrGz(csvPath,'w') as handle:
        handle.write("1,2,3\n\n  \na,bb,ccc,d\n  2,3,4  ")
    with pytest.raises(ValueError):
        helper.readSimpleCsv(csvPath)
Example #7
0
def test_closeFiles(tmpdir):
    """helper.closeFiles must close every handle whether given a list or a dict of handles."""
    workDir = tmpdir.mkdir('dir')
    paths = []
    for ii in range(10):
        paths.append(workDir.join('test'+str(ii)+'.txt'))

    # list of handles: writing after closeFiles must raise ValueError
    handleList = []
    for path in paths:
        handleList.append(helper.openNormalOrGz(str(path),'w'))
    helper.closeFiles(handleList)
    for handle in handleList:
        with pytest.raises(ValueError):
            handle.write("X")

    # dict of handles keyed 0..9: same expectation for every value
    handleDict = {}
    for key,handle in zip(range(10),[helper.openNormalOrGz(str(path),'w') for path in paths]):
        handleDict[key]=handle
    helper.closeFiles(handleDict)
    for handle in handleDict.values():
        with pytest.raises(ValueError):
            handle.write("X")
Example #8
0
def test_readSimpleCsv(tmpdir):
    """Exercise helper.checkFile on a missing file and helper.readSimpleCsv on two inputs."""
    target = tmpdir.mkdir('dir').join('test.txt')
    targetName = str(target)

    # the file has not been created yet
    with pytest.raises(argparse.ArgumentTypeError):
        helper.checkFile(targetName)

    # first input: quoted fields, surrounding whitespace and blank lines
    expectedRows = [
        ['1', '2', '3'],
        ['a', 'bb', 'ccc'],
        ["2", "3", "4"],
    ]
    with helper.openNormalOrGz(targetName, 'w') as out:
        out.write("1,'2',3   \n  \n  \na,\"bb\",ccc\n  2,3,4  ")
    assert helper.readSimpleCsv(targetName) == expectedRows

    # second input: a four-field row among three-field rows
    with helper.openNormalOrGz(targetName, 'w') as out:
        out.write("1,2,3\n\n  \na,bb,ccc,d\n  2,3,4  ")
    with pytest.raises(ValueError):
        helper.readSimpleCsv(targetName)
Example #9
0
def test_closeFiles(tmpdir):
    """Verify helper.closeFiles closes handles passed as a list and as a dict."""

    def assertAllClosed(openedHandles):
        # writing to a closed handle is expected to raise ValueError
        for handle in openedHandles:
            with pytest.raises(ValueError):
                handle.write("X")

    base = tmpdir.mkdir('dir')
    paths = [base.join('test' + str(num) + '.txt') for num in range(10)]

    # handles in a list
    asList = [helper.openNormalOrGz(str(path), 'w') for path in paths]
    helper.closeFiles(asList)
    assertAllClosed(asList)

    # handles as the values of a dict keyed 0..9
    reopened = [helper.openNormalOrGz(str(path), 'w') for path in paths]
    asDict = dict(zip(range(10), reopened))
    helper.closeFiles(asDict)
    assertAllClosed([handle for _, handle in asDict.items()])
Example #10
0
 def __init__(self, fastqFiles,patterns):
     """Store the inputs, open each fastq file and create a fastq parser for every handle.

     Each entry of fastqFiles is opened with helper.openNormalOrGz; patterns
     is stored unchanged for later use.
     """
     self.nGood = self.nBad = 0  # both counters start at zero
     self.fastqFiles=fastqFiles
     self.patterns=patterns
     opened=[]
     for path in self.fastqFiles:
         opened.append(helper.openNormalOrGz(path))
     self.fastqHandles=opened
     # parsers are kept in the same order as the handles
     parsers=[]
     for handle in self.fastqHandles:
         parsers.append(Bio.SeqIO.QualityIO.FastqGeneralIterator(handle))
     self.fastqs=parsers
Example #11
0
 def __init__(self, fastqFile, minLength=10):
     """Remember the settings, open fastqFile and wrap the handle in a fastq parser.

     minLength defaults to 10 and is only stored here.
     """
     self.nGood = self.nBad = 0  # counters start at zero
     self.minLength=minLength
     self.fastqFile=fastqFile
     handle=helper.openNormalOrGz(self.fastqFile)
     self.fastqHandle=handle
     self.fastq=Bio.SeqIO.QualityIO.FastqGeneralIterator(handle)
Example #12
0
 def __init__(self, fastqFile, minLength=10):
     """Open a single fastq file for parsing and reset the read counters.

     fastqFile is opened with helper.openNormalOrGz; minLength (default 10)
     is stored on the instance without being used here.
     """
     # zeroed tallies
     self.nGood = 0
     self.nBad = 0
     self.minLength, self.fastqFile = minLength, fastqFile
     # keep the handle on the instance alongside the parser that reads from it
     self.fastqHandle=helper.openNormalOrGz(self.fastqFile)
     parser=Bio.SeqIO.QualityIO.FastqGeneralIterator(self.fastqHandle)
     self.fastq=parser
Example #13
0
def main(argv=None):
    """Command-line entry point: print the alignments of a bam file as aligned fasta.

    The reference is read from ``--refseq`` via ``getRefFromFasta``, the
    alignments come from ``getAlignsInFile`` and every sequence (reference
    first) is padded with ``padRead`` before being printed to stdout as a
    ``>name`` line followed by the padded sequence. With ``--verbose`` the
    parsed arguments are echoed to stderr first.
    """
    parser = argparse.ArgumentParser(description="A program to convert a bam file into an aligned fasta file. The command generates fasta formatted output (two lines for each sequence: a name line prepended by > and a line containing the aligned sequence) to standard out.")
    parser.add_argument('bamFile', help='a bam file containing the alignment',type=helper.checkFile)
    parser.add_argument("-s","--refseq", help="fasta file giving the reference sequence of interest",type=helper.checkFile,required=True)
    parser.add_argument("-q","--minQuality", help="don't count alignments with a mapping quality less than this", type=int,default=0)
    parser.add_argument("-v","--verbose", help="increase output verbosity to stderr", action="store_true")
    parser.add_argument("-r","--region", help="the region to pull reads from (note that the underlying pysam does not like single base regions like ch1:25. These instead be specified as chr1:25-25.)",default=None)
    parser.add_argument("-e","--endSpan", help="ignore spans of matches at the start or end of a read less than this cutoff",default=0,type=int)
    args=parser.parse_args(argv)

    if args.verbose:
        sys.stderr.write("Arguments: \n")
        for key, value in vars(args).items():
            sys.stderr.write("   "+key+": "+str(value)+'\n')

    with helper.openNormalOrGz(args.refseq) as fasta:
        # getRefFromFasta returns the region to use together with the reference sequence
        args.region,ref=getRefFromFasta(helper.readSimpleFasta(fasta),args.region)

    # materialise the alignments: they are walked twice (insert table, then output)
    aligns=list(getAlignsInFile(args.bamFile,args.region,args.minQuality,args.endSpan))

    # longest insertion seen per key, where insertion[0] is the key and
    # len(insertion[1]) the insertion length
    maxInserts={}
    for align in aligns:
        for insertion in align['insertions']:
            pos,length=insertion[0],len(insertion[1])
            maxInserts[pos]=max(length,maxInserts.get(pos,length))

    nChar=len(ref)
    print('>'+args.region)
    # the reference itself starts at 0 and carries no insertions
    refPad=padRead(ref,0,nChar,[],maxInserts)
    print(refPad)
    for align in aligns:
        padded=padRead(align['seq'],align['start'],nChar,align['insertions'],maxInserts)
        print('>'+align['name'])
        print(padded)
Example #14
0
def countKmersInFile(args):
    """Count k-mers in a single fastq file.

    args is indexed rather than unpacked: args[0] is the path handed to
    helper.openNormalOrGz and args[1] is passed to countKmersInReads as k.
    A progress line naming the file is written to stderr. Returns whatever
    countKmersInReads returns.
    """
    path, kmerSize = args[0], args[1]
    sys.stderr.write('Working on file %s\n' % path)
    with helper.openNormalOrGz(path) as handle:
        reads = Bio.SeqIO.QualityIO.FastqGeneralIterator(handle)
        # counting has to finish while the file is still open
        counts = countKmersInReads(reads, kmerSize)
    return counts
Example #15
0
def test_main(capsys,tmpdir):
    """Run removereads.main through usage output, file-count errors and read filtering."""
    def checkLeading(observed,expected):
        # zip() stops at the shorter sequence, so only the paired leading items are compared
        for got,want in zip(observed,expected):
            assert got==want

    def stripped(lines):
        return [line.rstrip('\n') for line in lines]

    def stderrLines():
        # drain the captured streams and return stderr split into lines
        captured=capsys.readouterr()
        return captured[1].split('\n')

    summary=['.','Good reads: 1 Bad reads: 2']
    kept1=['@seq1','AAA','+seq1','(((']
    kept2=['@seq1z','TTT','+seq1z','(((']

    # no arguments: SystemExit with the usage message on stderr
    with pytest.raises(SystemExit):
        removereads.main()
    captured=capsys.readouterr()
    assert 'usage' in captured[1]
    # -h: SystemExit with the usage message on stdout
    with pytest.raises(SystemExit):
        removereads.main(['-h'])
    captured=capsys.readouterr()
    assert 'usage' in captured[0]

    workDir = tmpdir.mkdir('dir')
    reads1 = workDir.join('test.fastq')
    reads1.write("@seq1\nAAA\n+\n(((\n@seq2\nTT\n+\n((\n@seq3\nT\n+\n(\n")
    reads2 = workDir.join('test2.fastq')
    reads2.write("@seq1z\nTTT\n+\n(((\n@seq2z\nTT\n+\n((\n@seq3\nT\n+\n(\n")
    filterFile = workDir.join('test.filter')
    filterFile.write("seq2\nseq3")
    out1 = workDir.join('test.out')
    out2 = workDir.join('test2.out')

    # more outputs than inputs, then more inputs than outputs
    with pytest.raises(argparse.ArgumentTypeError):
        removereads.main([str(reads1),'-f',str(filterFile),'-o',str(out1),str(out1),'-d1'])
    with pytest.raises(argparse.ArgumentTypeError):
        removereads.main([str(reads1),str(reads1),'-f',str(filterFile),'-o',str(out1),'-d1'])

    # one input, one explicit output
    removereads.main([str(reads1),'-f',str(filterFile),'-o',str(out1),'-d1'])
    checkLeading(stderrLines(),summary)
    checkLeading(stripped(out1.readlines()),kept1)

    # two inputs, two explicit outputs
    removereads.main([str(reads1),str(reads2),'-f',str(filterFile),'-o',str(out1),str(out2),'-d1'])
    checkLeading(stderrLines(),summary)
    checkLeading(stripped(out1.readlines()),kept1)
    checkLeading(stripped(out2.readlines()),kept2)

    # no -o: default output names are resolved relative to the working directory
    os.chdir(str(workDir))
    removereads.main([str(reads1),str(reads2),'-f',str(filterFile),'-d1'])
    checkLeading(stderrLines(),summary)
    checkLeading(stripped(helper.openNormalOrGz('out1.fastq.gz').readlines()),kept1)
    checkLeading(stripped(helper.openNormalOrGz('out2.fastq.gz').readlines()),kept2)
Example #16
0
 def __init__(self, fastqFiles, patterns, keep=False):
     """Store the inputs, open every fastq file and build a fastq parser per handle.

     Each entry of fastqFiles is opened with helper.openNormalOrGz. patterns
     and keep (default False) are only stored here.
     """
     self.nGood = self.nBad = 0  # counters start at zero
     self.fastqFiles = fastqFiles
     self.patterns = patterns
     handles = []
     for path in self.fastqFiles:
         handles.append(helper.openNormalOrGz(path))
     self.fastqHandles = handles
     # one parser per handle, in handle order
     readers = []
     for handle in self.fastqHandles:
         readers.append(Bio.SeqIO.QualityIO.FastqGeneralIterator(handle))
     self.fastqs = readers
     self.keep = keep
Example #17
0
 def __init__(self,
              fastqFiles,
              indexFiles,
              barcodes,
              returnUnassigned=False):
     """Store the inputs, open all read and index files and build a fastq parser per handle.

     Entries of fastqFiles and indexFiles are opened with
     helper.openNormalOrGz (read files first). barcodes and returnUnassigned
     (default False) are only stored here.
     """
     self.nGood = self.nBad = 0  # counters start at zero
     self.fastqFiles = fastqFiles
     self.indexFiles = indexFiles
     self.barcodes = barcodes

     readHandles = []
     for path in self.fastqFiles:
         readHandles.append(helper.openNormalOrGz(path))
     self.fastqHandles = readHandles

     idxHandles = []
     for path in self.indexFiles:
         idxHandles.append(helper.openNormalOrGz(path))
     self.indexHandles = idxHandles

     # parsers mirror the order of their handles
     readParsers = []
     for handle in self.fastqHandles:
         readParsers.append(Bio.SeqIO.QualityIO.FastqGeneralIterator(handle))
     self.fastqs = readParsers

     idxParsers = []
     for handle in self.indexHandles:
         idxParsers.append(Bio.SeqIO.QualityIO.FastqGeneralIterator(handle))
     self.indexs = idxParsers

     self.returnUnassigned = returnUnassigned
Example #18
0
 def __init__(self, fastqFile, minLength=10, removeN=False, suppressBad=False):
     """Remember the settings, open fastqFile and choose the fastq reader.

     With suppressBad the handle is read through helper.readSimpleFastq,
     otherwise through Bio's FastqGeneralIterator. minLength and removeN are
     only stored here.
     """
     self.nGood = self.nBad = 0  # counters start at zero
     self.minLength=minLength
     self.removeN=removeN
     self.suppressBad=suppressBad
     self.fastqFile=fastqFile
     handle=helper.openNormalOrGz(self.fastqFile)
     self.fastqHandle=handle
     # the reader depends on suppressBad; both are given the same open handle
     self.fastq=(helper.readSimpleFastq(handle) if suppressBad
                 else Bio.SeqIO.QualityIO.FastqGeneralIterator(handle))
     # bound search method of a pattern matching any character other than A, C, T or G
     nonAcgt=re.compile('[^ACTG]')
     self.nSearch=nonAcgt.search
     self.badList=[]
Example #19
0
def main(argv=None):
    """Command-line entry point: split fastq reads into per-sample files by barcode.

    Reads the barcode csv (sample name followed by one barcode column per
    index file), opens one gzipped output per sample and read file under
    ``--outputPath`` and writes every set of reads yielded by
    ``barcodeFastqIter`` to the outputs of the barcode it is yielded with.

    Raises:
        argparse.ArgumentTypeError: if the barcode file has no rows, the
            number of index files does not match the barcode columns, or
            sample names or barcode sets are duplicated.
    """
    parser = argparse.ArgumentParser(description="A program to take a list of barcodes and one or more fastq reads and one or two index reads and output reads matching the barcodes into a seperate file for each barcode. The script takes read files and index files where the reads and indexs are in the same order and outputs reads which match the appropriate barcodes into separate files.")
    parser.add_argument('fastqFiles', help='a fastq file(s) (potentially gzipped) containing the sequence reads',type=helper.checkFile,nargs='+')
    parser.add_argument('-i','--indexFiles', help='a fastq file(s) (potentially gzipped) containing the index reads',type=helper.checkFile,nargs='+')
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument('-b','--barcodeFile', help='a file (potentially gzipped) file containing comma separated sample names, first barcode and second barcode (with no header and no commas in the sample names)',type=helper.checkFile,required=True)
    parser.add_argument('-o','--outputPath', help='a string giving the desired output directory',type=helper.checkDir,default='.')

    args=parser.parse_args(argv)

    nFiles=len(args.fastqFiles)
    # -i is not marked required, so argparse leaves it as None when omitted;
    # treat that as zero index files rather than failing on len(None)
    indexFiles=args.indexFiles if args.indexFiles is not None else []
    barcodes=helper.readSimpleCsv(args.barcodeFile)
    if not barcodes:
        # without this guard barcodes[0] below would raise a bare IndexError
        raise argparse.ArgumentTypeError("No barcodes found in the barcodeFile")
    if len(indexFiles)!=len(barcodes[0])-1:
        raise argparse.ArgumentTypeError("Number of index files and index columns in the barcodeFile do not agree")

    samples=[xx[0] for xx in barcodes]
    if len(set(samples))!=len(samples):
        raise argparse.ArgumentTypeError("Two or more samples share the same name")
    # one output per sample and read file: <outputPath>/<sample>_<n>.fastq.gz
    outputFiles=[[os.path.join(args.outputPath,xx)+"_"+str(ii+1)+".fastq.gz" for ii in range(nFiles)] for xx in samples]
    bars=[tuple(xx[1:]) for xx in barcodes]
    if len(set(bars))!=len(bars):
        raise argparse.ArgumentTypeError("Two or more samples share the same set of barcodes")
    # barcode tuple -> list of open output handles (one per read file)
    outHandles=dict(zip(bars,[[helper.openNormalOrGz(yy,'w') for yy in xx] for xx in outputFiles]))

    # the outputs are open from here on, so close them even if splitting fails
    try:
        with barcodeFastqIter(args.fastqFiles,indexFiles,bars) as fastqIter:
            for currentReads,bar in fastqIter:
                for read,outFile in zip(currentReads,outHandles[bar]):
                    helper.writeFastqRead(outFile,read)
                if args.dots>0 and fastqIter.nGood % args.dots==0:
                    sys.stderr.write('.')
                    # flush so progress shows up immediately on buffered stderr
                    sys.stderr.flush()

            if args.dots>0:
                sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
    finally:
        for key in outHandles:
            helper.closeFiles(outHandles[key])
Example #20
0
def test_main(capsys, tmpdir, bamFile):
    """Run bamtoalign.main: usage output, then the aligned fasta produced for the bamFile fixture."""
    # no arguments: SystemExit with the usage message on stderr
    with pytest.raises(SystemExit):
        bamtoalign.main()
    captured = capsys.readouterr()
    assert 'usage' in captured[1]
    # -h: SystemExit with the usage message on stdout
    with pytest.raises(SystemExit):
        bamtoalign.main(['-h'])
    captured = capsys.readouterr()
    assert 'usage' in captured[0]

    refPath = str(tmpdir.mkdir('dir').join('ref.fa'))
    with helper.openNormalOrGz(refPath, 'w') as refHandle:
        refHandle.write(
            ">ref\nAAAAAAAAAATTTTTTTTTTCCCCCCCCCCGGGGGGGGGGAAAAAAAAAA")

    bamtoalign.main(['-v', '-s', refPath, str(bamFile)])
    stdout, stderr = capsys.readouterr()
    # -v echoes the parsed arguments to stderr
    assert 'Arguments' in stderr

    expectedLines = [
        '>ref',
        'AAAAAAAAAATTTTTTTTTTCCCCCCCCCCGG--GGGGGGGGAA-AAA-AAAAA',
        '>read3',
        '----------------------------GGGG--AAAAAT--------------',
        '>read2',
        '--------------------------------A---AAAATTTTT---------',
        '>read1',
        '--------------------------------CCAAAAACCCCC---GGCC---',
    ]
    # zip() stops at the shorter sequence, so only paired lines are compared
    for got, want in zip(stdout.split('\n'), expectedLines):
        assert got == want
Example #21
0
def main(argv=None):
    """Command-line entry point: filter (or keep) reads by name from fastq files.

    Parses ``argv`` with argparse, opens one output file per input fastq file
    and writes every set of reads yielded by ``filterFastqIter`` to the
    corresponding outputs. ``--keep`` is passed through to the iterator.
    When ``--dots`` is positive a dot is written to stderr whenever the
    iterator's ``nGood`` count is a multiple of it, followed by a final
    summary of ``nGood``/``nBad``.

    Raises:
        argparse.ArgumentTypeError: if the number of output files differs from
            the number of input fastq files.
    """
    parser = argparse.ArgumentParser(
        description=
        "A program to filter reads by name from a single/set of fastq file(s). The script looks for reads which have a name line where the string before a space exactly matches a pattern. If multiple files are passed in, then they are processed in sync and if any name matches that read is discarded (or kept) from all files."
    )
    parser.add_argument(
        'fastqFiles',
        help=
        'a (potentially gzipped) fastq file(s) containing the reads with the order of reads the same in all files',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        "-d",
        "--dots",
        help=
        "output dot to stderr every X reads. Input a negative number to suppress output (default:-1)",
        default=-1,
        type=int)
    parser.add_argument(
        '-o',
        '--outputFiles',
        help=
        'an output file(s) (one for each input fastq file). default(out1.fastq.gz ... outn.fastq.gz where n is the number of fastqFiles)',
        type=str,
        nargs='*')
    parser.add_argument(
        '-f',
        '--filterFile',
        help=
        'a (potentially gzipped) file containing the names of reads to be filtered one per line',
        type=helper.checkFile,
        required=True)
    parser.add_argument(
        '-k',
        '--keep',
        help=
        'keep reads matching the filter file and filter all nonmatching reads',
        action='store_true')

    args = parser.parse_args(argv)
    if args.outputFiles is None:
        # no -o given: fall back to out1.fastq.gz ... outN.fastq.gz
        outputFiles = [
            'out' + str(ii) + '.fastq.gz'
            for ii in range(1,
                            len(args.fastqFiles) + 1)
        ]
    else:
        outputFiles = args.outputFiles
    if len(outputFiles) != len(args.fastqFiles):
        raise argparse.ArgumentTypeError(
            "Input and output file numbers do not match")
    outHandles = [helper.openNormalOrGz(x, 'w') for x in outputFiles]

    # the outputs are open from here on, so close them even if filtering fails
    try:
        # close the filter file as soon as the names have been collected
        with helper.openNormalOrGz(args.filterFile) as filterHandle:
            patterns = set(line.strip() for line in filterHandle)

        with filterFastqIter(args.fastqFiles, patterns,
                             args.keep) as fastqIter:
            for currentReads in fastqIter:
                for read, outFile in zip(currentReads, outHandles):
                    helper.writeFastqRead(outFile, read)
                if args.dots > 0 and fastqIter.nGood % args.dots == 0:
                    sys.stderr.write('.')
                    sys.stderr.flush()

            if args.dots > 0:
                sys.stderr.write("\nGood reads: " + str(fastqIter.nGood) +
                                 " Bad reads: " + str(fastqIter.nBad) + "\n")
    finally:
        helper.closeFiles(outHandles)
Example #22
0
def main(argv=None):
    """Command-line entry point: split fastq reads into per-sample files by barcode.

    Reads the barcode csv (sample name followed by one barcode column per
    index file), opens one gzipped output per sample and read file under
    ``--outputPath`` and writes every set of reads yielded by
    ``barcodeFastqIter`` to the outputs of the barcode it is yielded with.
    With ``--unassigned`` the reads the iterator reports as not assigned are
    written, together with their index reads, to ``__UNASSIGNED__R#`` and
    ``__UNASSIGNED__I#`` files instead.

    Raises:
        argparse.ArgumentTypeError: if the barcode file has no rows, the
            number of index files does not match the barcode columns, sample
            names or barcode sets are duplicated, or (with ``--unassigned``)
            a sample is named like the unassigned outputs.
    """
    parser = argparse.ArgumentParser(
        description=
        "A program to take a list of barcodes and one or more fastq reads and one or two index reads and output reads matching the barcodes into a seperate file for each barcode. The script takes read files and index files where the reads and indexs are in the same order and outputs reads which match the appropriate barcodes into separate files."
    )
    parser.add_argument(
        'fastqFiles',
        help=
        'a fastq file(s) (potentially gzipped) containing the sequence reads',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        '-i',
        '--indexFiles',
        help='a fastq file(s) (potentially gzipped) containing the index reads',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        "-d",
        "--dots",
        help=
        "output dot to stderr every X reads. Input a negative number to suppress output (default:-1)",
        default=-1,
        type=int)
    parser.add_argument(
        '-b',
        '--barcodeFile',
        help=
        'a (potentially gzipped) file containing comma separated sample names, first barcode and second barcode (with no header and no commas in the sample names)',
        type=helper.checkFile,
        required=True)
    parser.add_argument('-o',
                        '--outputPath',
                        help='a string giving the desired output directory',
                        type=helper.checkDir,
                        default='.')
    parser.add_argument(
        '-u',
        '--unassigned',
        help=
        'if set then store unassigned reads to {outputPath}/__UNASSIGNED__R#.fastq.gz with their corresponding barcodes in {outputPath}/__UNASSIGNED__I#.fastq.gz',
        action='store_true')

    args = parser.parse_args(argv)

    nFiles = len(args.fastqFiles)
    # -i is not marked required, so argparse leaves it as None when omitted;
    # treat that as zero index files rather than failing on len(None)
    indexFiles = args.indexFiles if args.indexFiles is not None else []
    nIndexs = len(indexFiles)
    barcodes = helper.readSimpleCsv(args.barcodeFile)
    if not barcodes:
        # without this guard barcodes[0] below would raise a bare IndexError
        raise argparse.ArgumentTypeError(
            "No barcodes found in the barcodeFile")

    if nIndexs != len(barcodes[0]) - 1:
        raise argparse.ArgumentTypeError(
            "Number of index files and index columns in the barcodeFile do not agree"
        )

    samples = [xx[0] for xx in barcodes]
    if len(set(samples)) != len(samples):
        raise argparse.ArgumentTypeError(
            "Two or more samples share the same name")
    # one output per sample and read file: <outputPath>/<sample>_<n>.fastq.gz
    outputFiles = [[
        os.path.join(args.outputPath, xx) + "_" + str(ii + 1) + ".fastq.gz"
        for ii in range(nFiles)
    ] for xx in samples]
    bars = [tuple(xx[1:]) for xx in barcodes]
    if len(set(bars)) != len(bars):
        raise argparse.ArgumentTypeError(
            "Two or more samples share the same set of barcodes")
    # validate the reserved names before any output file is opened so that a
    # rejected run leaves no open handles behind
    if args.unassigned and any(
            xx[0] in ['__UNASSIGNED__R', '__UNASSIGNED__I']
            for xx in barcodes):
        raise argparse.ArgumentTypeError(
            "Sample named __UNASSIGNED__ clashes with unassigned output. Please rename"
        )

    # barcode tuple -> list of open output handles (one per read file)
    outHandles = dict(
        zip(bars, [[helper.openNormalOrGz(yy, 'w') for yy in xx]
                   for xx in outputFiles]))
    badIndexHandles = []
    badReadHandles = []

    # outputs are open from here on, so close every handle even on failure
    try:
        if args.unassigned:
            badIndexFiles = [
                os.path.join(args.outputPath, "__UNASSIGNED__I") +
                str(ii + 1) + ".fastq.gz" for ii in range(nIndexs)
            ]
            badReadFiles = [
                os.path.join(args.outputPath, "__UNASSIGNED__R") +
                str(ii + 1) + ".fastq.gz" for ii in range(nFiles)
            ]
            badIndexHandles = [
                helper.openNormalOrGz(ii, 'w') for ii in badIndexFiles
            ]
            badReadHandles = [
                helper.openNormalOrGz(ii, 'w') for ii in badReadFiles
            ]

        with barcodeFastqIter(args.fastqFiles, indexFiles, bars,
                              args.unassigned) as fastqIter:
            for currentReads, bar, assigned, fullBar in fastqIter:
                if args.unassigned and not assigned:
                    for read, outFile in zip(currentReads, badReadHandles):
                        helper.writeFastqRead(outFile, read)
                    for read, outFile in zip(fullBar, badIndexHandles):
                        helper.writeFastqRead(outFile, read)
                else:
                    for read, outFile in zip(currentReads, outHandles[bar]):
                        helper.writeFastqRead(outFile, read)
                    # progress dots are only emitted for assigned reads
                    if args.dots > 0 and fastqIter.nGood % args.dots == 0:
                        sys.stderr.write('.')
                        sys.stderr.flush()

            if args.dots > 0:
                sys.stderr.write("\nReads assigned to barcode: " +
                                 str(fastqIter.nGood) +
                                 " Unassigned reads: " +
                                 str(fastqIter.nBad) + "\n")
    finally:
        for key in outHandles:
            helper.closeFiles(outHandles[key])
        # the unassigned outputs were previously never closed
        if badIndexHandles or badReadHandles:
            helper.closeFiles(badIndexHandles + badReadHandles)
Example #23
0
def test_main(capsys,tmpdir):
    """Run splitbarcodes.main through usage output, argument errors, splitting and -u output."""
    def checkLeading(observed,expected):
        # zip() stops at the shorter sequence, so only the paired leading items are compared
        for got,want in zip(observed,expected):
            assert got==want

    def fileLines(path):
        # lines of the file at path (opened through the helper) without trailing newlines
        return [line.rstrip('\n') for line in helper.openNormalOrGz(str(path)).readlines()]

    def stderrLines():
        # drain the captured streams and return stderr split into lines
        captured=capsys.readouterr()
        return captured[1].split('\n')

    # no arguments: SystemExit with the usage message on stderr
    with pytest.raises(SystemExit):
        splitbarcodes.main()
    captured=capsys.readouterr()
    assert 'usage' in captured[1]
    # -h: SystemExit with the usage message on stdout
    with pytest.raises(SystemExit):
        splitbarcodes.main(['-h'])
    captured=capsys.readouterr()
    assert 'usage' in captured[0]

    workDir = tmpdir.mkdir('dir')
    reads1 = workDir.join('test.fastq')
    reads1.write("@seq1\nAAA\n+\n(((\n@seq2\nTT\n+\n(A\n@seq3\nT\n+\n(\n")
    reads2 = workDir.join('test2.fastq')
    reads2.write("@seq1z\nTTT\n+\n(((\n@seq2z\nTT\n+\n((\n@seq3\nATC\n+\n([(\n")
    barcodeFile = workDir.join('test.filter')
    sampleOut1 = workDir.join('test_1.fastq.gz')
    sampleOut2 = workDir.join('test_2.fastq.gz')

    def run(leading,extra=()):
        # every invocation shares the same barcode file, output directory and dot interval
        splitbarcodes.main(list(leading)+['-b',str(barcodeFile),'-o',str(workDir),'-d1']+list(extra))

    oneReadOneIndex=[str(reads1),'-i',str(reads1)]
    rejected=[
        #duplicate barcode
        ("test,AAA\ntest2,AAA\ntest3,AAT",[]),
        #duplicate sample name
        ("test,AAA\ntest,AAC\ntest3,AAT",[]),
        #sample named __UNASSIGNED__R
        ("__UNASSIGNED__R,AAA",['-u']),
        #sample named __UNASSIGNED__I
        ("__UNASSIGNED__I,AAA",['-u']),
    ]
    for contents,extra in rejected:
        barcodeFile.write(contents)
        with pytest.raises(argparse.ArgumentTypeError):
            run(oneReadOneIndex,extra)
    barcodeFile.write("test,AAA")
    #two index files, one barcode
    with pytest.raises(argparse.ArgumentTypeError):
        run([str(reads1),'-i',str(reads1),str(reads1)])

    #one read file, one index file
    run(oneReadOneIndex)
    checkLeading(stderrLines(),['.','Reads assigned to barcode: 1 Unassigned reads: 2'])
    checkLeading(fileLines(sampleOut1),['@seq1','AAA','+seq1','((('])

    #two read files, one index file
    run([str(reads1),str(reads2),'-i',str(reads1)])
    checkLeading(stderrLines(),['.','Reads assigned to barcode: 1 Unassigned reads: 2'])
    checkLeading(fileLines(sampleOut1),['@seq1','AAA','+seq1','((('])
    checkLeading(fileLines(sampleOut2),['@seq1z','TTT','+seq1z','((('])

    #two read, two index files. test unassigned
    barcodeFile.write("test,T,T\ntest2,A,T")
    index1 = workDir.join('test3.fastq')
    index1.write("@seq1\nT\n+\n(\n@seq2\nA\n+\n(\n@seq3\nT\n+\n(\n")
    index2 = workDir.join('test4.fastq')
    index2.write("@seq1z\nT\n+\n(\n@seq2z\nT\n+\n(\n@seq3\nTT\n+\n(!\n")
    run([str(reads1),str(reads2),'-i',str(index1),str(index2)],['-u'])
    checkLeading(stderrLines(),['..','Reads assigned to barcode: 2 Unassigned reads: 1'])
    checkLeading(fileLines(sampleOut1),['@seq1','AAA','+seq1','(((','@seq2','TT','+seq2','(A'])
    checkLeading(fileLines(sampleOut2),['@seq1z','TTT','+seq1z','(((','@seq2z','TT','+seq2z','(('])

    #test the unassigned
    unassignedReads1=workDir.join('__UNASSIGNED__R1.fastq.gz')
    unassignedReads2=workDir.join('__UNASSIGNED__R2.fastq.gz')
    unassignedIndex1=workDir.join('__UNASSIGNED__I1.fastq.gz')
    unassignedIndex2=workDir.join('__UNASSIGNED__I2.fastq.gz')
    # each unassigned file must hold exactly four lines
    for path in (unassignedReads1,unassignedReads2,unassignedIndex1,unassignedIndex2):
        assert len(helper.openNormalOrGz(str(path)).readlines())==4
    checkLeading(fileLines(unassignedReads1),['@seq3','T','+seq3','('])
    checkLeading(fileLines(unassignedReads2),['@seq3','ATC','+seq3','([('])
    checkLeading(fileLines(unassignedIndex1),['@seq3','T','+seq3','('])
    checkLeading(fileLines(unassignedIndex2),['@seq3','TT','+seq3','(!'])
Example #24
0
def test_main(capsys, tmpdir):
    """Run removereads.main through usage output, argument errors, filtering and --keep."""

    def checkLeading(observed, expected):
        # zip() stops at the shorter sequence, so only the paired leading
        # items are compared
        for got, want in zip(observed, expected):
            assert got == want

    def stripped(lines):
        return [line.rstrip('\n') for line in lines]

    def defaultOutput(name):
        # stripped lines of a default-named output in the working directory
        return stripped(helper.openNormalOrGz(name).readlines())

    def stderrLines():
        # drain the captured streams and return stderr split into lines
        captured = capsys.readouterr()
        return captured[1].split('\n')

    summary = ['.', 'Good reads: 1 Bad reads: 2']
    kept1 = ['@seq1', 'AAA', '+seq1', '(((']
    kept2 = ['@seq1z', 'TTT', '+seq1z', '(((']

    # no arguments: SystemExit with the usage message on stderr
    with pytest.raises(SystemExit):
        removereads.main()
    captured = capsys.readouterr()
    assert 'usage' in captured[1]
    # -h: SystemExit with the usage message on stdout
    with pytest.raises(SystemExit):
        removereads.main(['-h'])
    captured = capsys.readouterr()
    assert 'usage' in captured[0]

    workDir = tmpdir.mkdir('dir')
    reads1 = workDir.join('test.fastq')
    reads1.write("@seq1\nAAA\n+\n(((\n@seq2\nTT\n+\n((\n@seq3\nT\n+\n(\n")
    reads2 = workDir.join('test2.fastq')
    reads2.write("@seq1z\nTTT\n+\n(((\n@seq2z\nTT\n+\n((\n@seq3\nT\n+\n!\n")
    filterFile = workDir.join('test.filter')
    filterFile.write("seq2\nseq3")
    out1 = workDir.join('test.out')
    out2 = workDir.join('test2.out')

    # more outputs than inputs, then more inputs than outputs
    with pytest.raises(argparse.ArgumentTypeError):
        removereads.main(
            [str(reads1), '-f', str(filterFile), '-o', str(out1), str(out1),
             '-d1'])
    with pytest.raises(argparse.ArgumentTypeError):
        removereads.main(
            [str(reads1), str(reads1), '-f', str(filterFile), '-o',
             str(out1), '-d1'])
    # the filter file option is missing
    with pytest.raises(SystemExit):
        removereads.main([str(reads1), '-o', str(out1), '-d1'])
    #clear
    capsys.readouterr()

    # one input, one explicit output
    removereads.main(
        [str(reads1), '-f', str(filterFile), '-o', str(out1), '-d1'])
    checkLeading(stderrLines(), summary)
    checkLeading(stripped(out1.readlines()), kept1)

    # two inputs, two explicit outputs
    removereads.main(
        [str(reads1), str(reads2), '-f', str(filterFile), '-o', str(out1),
         str(out2), '-d1'])
    checkLeading(stderrLines(), summary)
    checkLeading(stripped(out1.readlines()), kept1)
    checkLeading(stripped(out2.readlines()), kept2)

    # no -o: default output names are resolved relative to the working directory
    os.chdir(str(workDir))
    removereads.main([str(reads1), str(reads2), '-f', str(filterFile), '-d1'])
    checkLeading(stderrLines(), summary)
    checkLeading(defaultOutput('out1.fastq.gz'), kept1)
    checkLeading(defaultOutput('out2.fastq.gz'), kept2)

    # -k: reads named in the filter file are the ones written out
    removereads.main(
        [str(reads1), str(reads2), '-f', str(filterFile), '-d1', '-k'])
    checkLeading(stderrLines(), ['..', 'Good reads: 2 Bad reads: 1'])
    checkLeading(
        defaultOutput('out1.fastq.gz'),
        ['@seq2', 'TT', '+seq2', '((', '@seq3', 'T', '+seq3', '('])
    checkLeading(
        defaultOutput('out2.fastq.gz'),
        ['@seq2z', 'TT', '+seq2z', '((', '@seq3', 'T', '+seq3', '!'])