Example #1
0
def main(argv=None):
    """Command-line entry point: remove short reads from a fastq file.

    Reads yielded by ``shortFilterFastqIter`` are written to stdout. The
    optional flags (see the help strings below) additionally request removal
    of reads containing non-ACTG characters and of reads whose sequence and
    quality lengths differ. If ``--badOut`` is given, the reads collected in
    the iterator's ``badList`` are written to that file once iteration ends.

    Args:
        argv: list of command-line arguments; ``None`` lets argparse fall
            back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="A program to remove short reads from a fastq file.")
    parser.add_argument('fastqFile', help='a (potentially gzipped) fastq file containing the sequence data',type=helper.checkFile)
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument("-l","--minLength", help="minimum length read to output (default:15)",default=15,type=int)
    parser.add_argument("-n","--removeN", help="remove reads which contain anything other than A, C, T or G",action='store_true')
    parser.add_argument("-p","--removePoor", help="remove reads with different length sequence and qualities.  Note this requires assuming that all reads are 4 lines each",action='store_true')
    parser.add_argument("-b","--badOut", help="a file path in which to save the first 10000 malformed reads with different length sequence and qualities. Note this requires assuming that all reads are 4 lines each",type=argparse.FileType('w'),default=None)
    args=parser.parse_args(argv)

    try:
        with shortFilterFastqIter(args.fastqFile,args.minLength,args.removeN,args.removePoor) as fastqIter:
            for currentRead in fastqIter:
                helper.writeFastqRead(sys.stdout,currentRead)
                if args.dots>0 and fastqIter.nGood % args.dots==0:
                    sys.stderr.write('.')
                    # flush so the progress dot shows up as soon as it is written
                    sys.stderr.flush()

            if args.badOut is not None:
                for currentRead in fastqIter.badList:
                    helper.writeFastqRead(args.badOut,currentRead)

            if args.dots>0:
                sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
    finally:
        # argparse.FileType opened this handle during parsing; close it even
        # when reading or writing fails part way through
        if args.badOut is not None:
            args.badOut.close()
Example #2
0
def main(argv=None):
    """Command-line entry point: filter reads by name from fastq file(s).

    The names listed in ``--filterFile`` (one per line, stripped of
    surrounding whitespace) are collected into a set and handed to
    ``filterFastqIter`` together with the input files. Each group of reads
    the iterator yields is written, one read per input file, to the
    corresponding output file.

    Args:
        argv: list of command-line arguments; ``None`` lets argparse fall
            back to ``sys.argv[1:]``.

    Raises:
        argparse.ArgumentTypeError: if the number of output files does not
            match the number of input fastq files.
    """
    parser = argparse.ArgumentParser(description="A program to filter reads by name from a single/set of fastq file(s). The script looks for reads which have a name line where the string before a space exactly matches a pattern. If multiple files are passed in, then they are processed in sync and if any name matches that read is discarded from all files.")
    parser.add_argument('fastqFiles', help='a fastq (potentially gzipped) file(s) containing the reads with the order of reads the same in all files',type=helper.checkFile,nargs='+')
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument('-f','--filterFile', help='a file (potentially gzipped) file containing the names of reads to be filtered one per line',type=helper.checkFile,required=True)
    parser.add_argument('-o','--outputFiles', help='an output file(s) (one for each input fastq file). default(out1.fastq.gz ... outn.fastq.gz where n is the number of fastqFiles)',type=str,nargs='*')

    args=parser.parse_args(argv)
    if args.outputFiles is None:
        outputFiles=['out'+str(ii)+'.fastq.gz' for ii in range(1,len(args.fastqFiles)+1)]
    else:
        outputFiles=args.outputFiles
    if len(outputFiles)!=len(args.fastqFiles):
        raise argparse.ArgumentTypeError("Input and output file numbers do not match")
    outHandles=[helper.openNormalOrGz(x,'w') for x in outputFiles]

    try:
        # read the filter names through an explicit handle so it can be closed
        filterHandle=helper.openNormalOrGz(args.filterFile)
        try:
            patterns=set(line.strip() for line in filterHandle)
        finally:
            helper.closeFiles([filterHandle])

        with filterFastqIter(args.fastqFiles,patterns) as fastqIter:
            for currentReads in fastqIter:
                for read,outFile in zip(currentReads,outHandles):
                    helper.writeFastqRead(outFile,read)
                if args.dots>0 and fastqIter.nGood % args.dots==0:
                    sys.stderr.write('.')
                    # flush so the progress dot shows up as soon as it is written
                    sys.stderr.flush()

            if args.dots>0:
                sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
    finally:
        # always close the outputs so written data is finalised even when
        # filtering fails part way through
        helper.closeFiles(outHandles)
Example #3
0
def test_openGzOrNormal(tmpdir):
    """Write two reads to ``test.gz`` and check they read back line for line.

    The whole file content is compared against the expected lines rather
    than zipping the two sequences together: ``zip`` stops at the shorter
    input, so an empty or truncated file would otherwise pass silently.
    """
    d = tmpdir.mkdir('dir')
    gzFile = d.join('test.gz')
    gz=helper.openNormalOrGz(str(gzFile),'w')
    helper.writeFastqRead(gz,['1','22','333'])
    helper.writeFastqRead(gz,['55555','666666','7777777'])
    helper.closeFiles([gz])
    gz=helper.openNormalOrGz(str(gzFile))
    try:
        lines=list(gz)
    finally:
        # release the read handle whether or not reading succeeded
        helper.closeFiles([gz])
    pred=['@1','22','+1','333','@55555','666666','+55555','7777777']
    assert lines==[x+'\n' for x in pred]
Example #4
0
def test_openGzOrNormal(tmpdir):
    """Round-trip two fastq reads through ``helper.openNormalOrGz``.

    Two reads are written to ``test.gz``, the file is reopened, and every
    line read back must equal the expected four-lines-per-read layout. The
    line count is checked as well, because pairing the lines with ``zip``
    alone would accept a file that is shorter than expected.
    """
    d = tmpdir.mkdir('dir')
    gzFile = d.join('test.gz')
    gz = helper.openNormalOrGz(str(gzFile), 'w')
    helper.writeFastqRead(gz, ['1', '22', '333'])
    helper.writeFastqRead(gz, ['55555', '666666', '7777777'])
    helper.closeFiles([gz])

    pred = ['@1', '22', '+1', '333', '@55555', '666666', '+55555', '7777777']
    expected = [x + '\n' for x in pred]

    gz = helper.openNormalOrGz(str(gzFile))
    try:
        observed = [line for line in gz]
    finally:
        # release the read handle whether or not reading succeeded
        helper.closeFiles([gz])

    assert len(observed) == len(expected)
    for x, y in zip(observed, expected):
        assert x == y
Example #5
0
def main(argv=None):
    """Command-line entry point: remove short reads from a fastq file.

    Every read yielded by ``shortFilterFastqIter`` (constructed with the
    input file and ``--minLength``) is written to stdout. When ``--dots`` is
    positive, a dot is written to stderr each time the iterator's ``nGood``
    counter is a multiple of it, followed by a final good/bad summary line.

    Args:
        argv: list of command-line arguments; ``None`` lets argparse fall
            back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="A program to remove short reads from a fastq file.")
    parser.add_argument('fastqFile', help='a fastq (potentially gzipped) file containing the alignment',type=helper.checkFile)
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument("-l","--minLength", help="minimum length read to output (default:15)",default=15,type=int)
    args=parser.parse_args(argv)

    with shortFilterFastqIter(args.fastqFile,args.minLength) as fastqIter:
        for currentRead in fastqIter:
            helper.writeFastqRead(sys.stdout,currentRead)
            if args.dots>0 and fastqIter.nGood % args.dots==0:
                sys.stderr.write('.')
                # flush so the progress dot shows up as soon as it is written
                sys.stderr.flush()

        if args.dots>0:
            sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
Example #6
0
def main(argv=None):
    """Command-line entry point: remove short reads from a fastq file.

    Parses the input path, the progress interval and the minimum read
    length, then streams every read produced by ``shortFilterFastqIter`` to
    stdout. Progress dots and the closing good/bad summary go to stderr and
    are only emitted when ``--dots`` is positive.

    Args:
        argv: list of command-line arguments; ``None`` lets argparse fall
            back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="A program to remove short reads from a fastq file.")
    parser.add_argument('fastqFile', help='a fastq (potentially gzipped) file containing the alignment',type=helper.check_file)
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument("-l","--minLength", help="minimum length read to output (default:15)",default=15,type=int)
    args=parser.parse_args(argv)

    showProgress=args.dots>0
    with shortFilterFastqIter(args.fastqFile,args.minLength) as fastqIter:
        for currentRead in fastqIter:
            helper.writeFastqRead(sys.stdout,currentRead)
            if showProgress and fastqIter.nGood % args.dots==0:
                sys.stderr.write('.')
                # flush so the progress dot shows up as soon as it is written
                sys.stderr.flush()

        if showProgress:
            sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
Example #7
0
def main(argv=None):
    """Command-line entry point: split fastq reads into per-barcode files.

    The barcode file is read with ``helper.readSimpleCsv``; the first column
    of each row is used as the sample name and the remaining columns as that
    sample's barcode tuple. For every sample one output file per input fastq
    file is opened, named ``{outputPath}/{sample}_{n}.fastq.gz``. Reads
    yielded by ``barcodeFastqIter`` are written to the handles registered
    for the barcode tuple yielded alongside them.

    Args:
        argv: list of command-line arguments; ``None`` lets argparse fall
            back to ``sys.argv[1:]``.

    Raises:
        argparse.ArgumentTypeError: if the barcode file is empty, if the
            number of index files does not match the number of barcode
            columns, or if sample names or barcode sets are duplicated.
    """
    parser = argparse.ArgumentParser(description="A program to take a list of barcodes and one or more fastq reads and one or two index reads and output reads matching the barcodes into a seperate file for each barcode. The script takes read files and index files where the reads and indexs are in the same order and outputs reads which match the appropriate barcodes into separate files.")
    parser.add_argument('fastqFiles', help='a fastq file(s) (potentially gzipped) containing the sequence reads',type=helper.checkFile,nargs='+')
    # required: the code below takes len(args.indexFiles), which would raise
    # a TypeError on the None default if the option were omitted
    parser.add_argument('-i','--indexFiles', help='a fastq file(s) (potentially gzipped) containing the index reads',type=helper.checkFile,nargs='+',required=True)
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument('-b','--barcodeFile', help='a file (potentially gzipped) file containing comma separated sample names, first barcode and second barcode (with no header and no commas in the sample names)',type=helper.checkFile,required=True)
    parser.add_argument('-o','--outputPath', help='a string giving the desired output directory',type=helper.checkDir,default='.')

    args=parser.parse_args(argv)

    nFiles=len(args.fastqFiles)
    barcodes=helper.readSimpleCsv(args.barcodeFile)
    # guard the barcodes[0] lookup below against an empty barcode file
    if not barcodes:
        raise argparse.ArgumentTypeError("No barcodes found in the barcodeFile")
    if len(args.indexFiles)!=len(barcodes[0])-1:
        raise argparse.ArgumentTypeError("Number of index files and index columns in the barcodeFile do not agree")

    samples=[xx[0] for xx in barcodes]
    if len(set(samples))!=len(samples):
        raise argparse.ArgumentTypeError("Two or more samples share the same name")
    outputFiles=[[os.path.join(args.outputPath,xx)+"_"+str(ii+1)+".fastq.gz" for ii in range(nFiles)] for xx in samples]
    bars=[tuple(xx[1:]) for xx in barcodes]
    if len(set(bars))!=len(bars):
        raise argparse.ArgumentTypeError("Two or more samples share the same set of barcodes")
    outHandles=dict(zip(bars,[[helper.openNormalOrGz(yy,'w') for yy in xx] for xx in outputFiles]))

    try:
        with barcodeFastqIter(args.fastqFiles,args.indexFiles,bars) as fastqIter:
            for currentReads,bar in fastqIter:
                for read,outFile in zip(currentReads,outHandles[bar]):
                    helper.writeFastqRead(outFile,read)
                if args.dots>0 and fastqIter.nGood % args.dots==0:
                    sys.stderr.write('.')
                    # flush so the progress dot shows up as soon as it is written
                    sys.stderr.flush()

            if args.dots>0:
                sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
    finally:
        # close every per-sample output even if demultiplexing fails
        for key in outHandles:
            helper.closeFiles(outHandles[key])
Example #8
0
def main(argv=None):
    """Command-line entry point: filter reads by name from fastq file(s).

    The names listed in ``--filterFile`` (one per line, stripped of
    surrounding whitespace) are collected into a set and passed, together
    with the ``--keep`` flag, to ``filterFastqIter``. Each group of reads the
    iterator yields is written, one read per input file, to the
    corresponding output file.

    Args:
        argv: list of command-line arguments; ``None`` lets argparse fall
            back to ``sys.argv[1:]``.

    Raises:
        argparse.ArgumentTypeError: if the number of output files does not
            match the number of input fastq files.
    """
    parser = argparse.ArgumentParser(
        description=
        "A program to filter reads by name from a single/set of fastq file(s). The script looks for reads which have a name line where the string before a space exactly matches a pattern. If multiple files are passed in, then they are processed in sync and if any name matches that read is discarded (or kept) from all files."
    )
    parser.add_argument(
        'fastqFiles',
        help=
        'a (potentially gzipped) fastq file(s) containing the reads with the order of reads the same in all files',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        "-d",
        "--dots",
        help=
        "output dot to stderr every X reads. Input a negative number to suppress output (default:-1)",
        default=-1,
        type=int)
    parser.add_argument(
        '-o',
        '--outputFiles',
        help=
        'an output file(s) (one for each input fastq file). default(out1.fastq.gz ... outn.fastq.gz where n is the number of fastqFiles)',
        type=str,
        nargs='*')
    parser.add_argument(
        '-f',
        '--filterFile',
        help=
        'a (potentially gzipped) file containing the names of reads to be filtered one per line',
        type=helper.checkFile,
        required=True)
    parser.add_argument(
        '-k',
        '--keep',
        help=
        'keep reads matching the filter file and filter all nonmatching reads',
        action='store_true')

    args = parser.parse_args(argv)
    if args.outputFiles is None:
        outputFiles = [
            'out' + str(ii) + '.fastq.gz'
            for ii in range(1,
                            len(args.fastqFiles) + 1)
        ]
    else:
        outputFiles = args.outputFiles
    if len(outputFiles) != len(args.fastqFiles):
        raise argparse.ArgumentTypeError(
            "Input and output file numbers do not match")
    outHandles = [helper.openNormalOrGz(x, 'w') for x in outputFiles]

    try:
        # read the filter names through an explicit handle so it can be closed
        filterHandle = helper.openNormalOrGz(args.filterFile)
        try:
            patterns = set(line.strip() for line in filterHandle)
        finally:
            helper.closeFiles([filterHandle])

        with filterFastqIter(args.fastqFiles, patterns,
                             args.keep) as fastqIter:
            for currentReads in fastqIter:
                for read, outFile in zip(currentReads, outHandles):
                    helper.writeFastqRead(outFile, read)
                if args.dots > 0 and fastqIter.nGood % args.dots == 0:
                    sys.stderr.write('.')
                    sys.stderr.flush()

            if args.dots > 0:
                sys.stderr.write("\nGood reads: " + str(fastqIter.nGood) +
                                 " Bad reads: " + str(fastqIter.nBad) + "\n")
    finally:
        # always close the outputs so written data is finalised even when
        # filtering fails part way through
        helper.closeFiles(outHandles)
Example #9
0
def main(argv=None):
    """Command-line entry point: split fastq reads into per-barcode files.

    The barcode file is read with ``helper.readSimpleCsv``; the first column
    of each row is used as the sample name and the remaining columns as that
    sample's barcode tuple. For every sample one output file per input fastq
    file is opened, named ``{outputPath}/{sample}_{n}.fastq.gz``.

    With ``--unassigned``, items the iterator yields with a false
    ``assigned`` flag are written to
    ``{outputPath}/__UNASSIGNED__R{n}.fastq.gz`` (reads) and
    ``{outputPath}/__UNASSIGNED__I{n}.fastq.gz`` (the index reads yielded
    alongside them) instead of to a sample file.

    Args:
        argv: list of command-line arguments; ``None`` lets argparse fall
            back to ``sys.argv[1:]``.

    Raises:
        argparse.ArgumentTypeError: if the barcode file is empty, if the
            number of index files does not match the number of barcode
            columns, if sample names or barcode sets are duplicated, or if a
            sample name clashes with the unassigned output names.
    """
    parser = argparse.ArgumentParser(
        description=
        "A program to take a list of barcodes and one or more fastq reads and one or two index reads and output reads matching the barcodes into a seperate file for each barcode. The script takes read files and index files where the reads and indexs are in the same order and outputs reads which match the appropriate barcodes into separate files."
    )
    parser.add_argument(
        'fastqFiles',
        help=
        'a fastq file(s) (potentially gzipped) containing the sequence reads',
        type=helper.checkFile,
        nargs='+')
    # required: the code below takes len(args.indexFiles), which would raise
    # a TypeError on the None default if the option were omitted
    parser.add_argument(
        '-i',
        '--indexFiles',
        help='a fastq file(s) (potentially gzipped) containing the index reads',
        type=helper.checkFile,
        nargs='+',
        required=True)
    parser.add_argument(
        "-d",
        "--dots",
        help=
        "output dot to stderr every X reads. Input a negative number to suppress output (default:-1)",
        default=-1,
        type=int)
    parser.add_argument(
        '-b',
        '--barcodeFile',
        help=
        'a (potentially gzipped) file containing comma separated sample names, first barcode and second barcode (with no header and no commas in the sample names)',
        type=helper.checkFile,
        required=True)
    parser.add_argument('-o',
                        '--outputPath',
                        help='a string giving the desired output directory',
                        type=helper.checkDir,
                        default='.')
    parser.add_argument(
        '-u',
        '--unassigned',
        help=
        'if set then store unassigned reads to {outputPath}/__UNASSIGNED__R#.fastq.gz with their corresponding barcodes in {outputPath}/__UNASSIGNED__I#.fastq.gz',
        action='store_true')

    args = parser.parse_args(argv)

    nFiles = len(args.fastqFiles)
    nIndexs = len(args.indexFiles)
    barcodes = helper.readSimpleCsv(args.barcodeFile)

    # guard the barcodes[0] lookup below against an empty barcode file
    if not barcodes:
        raise argparse.ArgumentTypeError(
            "No barcodes found in the barcodeFile")
    if nIndexs != len(barcodes[0]) - 1:
        raise argparse.ArgumentTypeError(
            "Number of index files and index columns in the barcodeFile do not agree"
        )

    samples = [xx[0] for xx in barcodes]
    if len(set(samples)) != len(samples):
        raise argparse.ArgumentTypeError(
            "Two or more samples share the same name")
    outputFiles = [[
        os.path.join(args.outputPath, xx) + "_" + str(ii + 1) + ".fastq.gz"
        for ii in range(nFiles)
    ] for xx in samples]
    bars = [tuple(xx[1:]) for xx in barcodes]
    if len(set(bars)) != len(bars):
        raise argparse.ArgumentTypeError(
            "Two or more samples share the same set of barcodes")
    outHandles = dict(
        zip(bars, [[helper.openNormalOrGz(yy, 'w') for yy in xx]
                   for xx in outputFiles]))

    # start empty so the cleanup below is valid whether or not --unassigned
    # was requested (or its set-up failed part way through)
    badIndexHandles = []
    badReadHandles = []
    try:
        if args.unassigned:
            if any(xx[0] in ['__UNASSIGNED__R', '__UNASSIGNED__I']
                   for xx in barcodes):
                raise argparse.ArgumentTypeError(
                    "Sample named __UNASSIGNED__ clashes with unassigned output. Please rename"
                )
            badIndexFiles = [
                os.path.join(args.outputPath, "__UNASSIGNED__I") +
                str(ii + 1) + ".fastq.gz" for ii in range(nIndexs)
            ]
            badReadFiles = [
                os.path.join(args.outputPath, "__UNASSIGNED__R") +
                str(ii + 1) + ".fastq.gz" for ii in range(nFiles)
            ]
            # append one at a time so that handles already opened are still
            # tracked (and closed) if a later open fails
            for path in badIndexFiles:
                badIndexHandles.append(helper.openNormalOrGz(path, 'w'))
            for path in badReadFiles:
                badReadHandles.append(helper.openNormalOrGz(path, 'w'))

        with barcodeFastqIter(args.fastqFiles, args.indexFiles, bars,
                              args.unassigned) as fastqIter:
            for currentReads, bar, assigned, fullBar in fastqIter:
                if args.unassigned and not assigned:
                    for read, outFile in zip(currentReads, badReadHandles):
                        helper.writeFastqRead(outFile, read)
                    for read, outFile in zip(fullBar, badIndexHandles):
                        helper.writeFastqRead(outFile, read)
                else:
                    for read, outFile in zip(currentReads, outHandles[bar]):
                        helper.writeFastqRead(outFile, read)
                    if args.dots > 0 and fastqIter.nGood % args.dots == 0:
                        sys.stderr.write('.')
                        sys.stderr.flush()

            if args.dots > 0:
                sys.stderr.write("\nReads assigned to barcode: " +
                                 str(fastqIter.nGood) +
                                 " Unassigned reads: " +
                                 str(fastqIter.nBad) + "\n")
    finally:
        for key in outHandles:
            helper.closeFiles(outHandles[key])
        # the unassigned outputs were previously never closed, which can
        # leave their contents unflushed or truncated
        if badReadHandles:
            helper.closeFiles(badReadHandles)
        if badIndexHandles:
            helper.closeFiles(badIndexHandles)