Esempio n. 1
0
def test_readSimpleCsv(tmpdir):
    """Exercise helper.readSimpleCsv on a well-formed and a ragged file.

    Also confirms that helper.checkFile rejects the path before anything
    has been written to it.
    """
    csvPath = str(tmpdir.mkdir('dir').join('test.txt'))

    # Nothing has been written to the path yet, so checkFile must refuse it.
    with pytest.raises(argparse.ArgumentTypeError):
        helper.checkFile(csvPath)

    # Quoted fields, surrounding whitespace and blank lines should all be
    # cleaned away, leaving three rows of three plain strings.
    with helper.openNormalOrGz(csvPath, 'w') as handle:
        handle.write("1,'2',3   \n  \n  \na,\"bb\",ccc\n  2,3,4  ")
    expectedRows = [['1', '2', '3'], ['a', 'bb', 'ccc'], ["2", "3", "4"]]
    assert helper.readSimpleCsv(csvPath) == expectedRows

    # A row with a different number of fields must be reported as an error.
    with helper.openNormalOrGz(csvPath, 'w') as handle:
        handle.write("1,2,3\n\n  \na,bb,ccc,d\n  2,3,4  ")
    with pytest.raises(ValueError):
        helper.readSimpleCsv(csvPath)
Esempio n. 2
0
def test_readSimpleCsv(tmpdir):
    """Check helper.readSimpleCsv parsing and its ragged-row error.

    The same temporary file is rewritten for each scenario; before the first
    write the path is also used to check that helper.checkFile rejects it.
    """
    fileName = str(tmpdir.mkdir('dir').join('test.txt'))

    def overwrite(text):
        # Replace the file contents using the helper's own opener.
        with helper.openNormalOrGz(fileName, 'w') as handle:
            handle.write(text)

    # The path has not been written to at this point.
    with pytest.raises(argparse.ArgumentTypeError):
        helper.checkFile(fileName)

    # Consistent three-column input with quotes, padding and blank lines.
    overwrite("1,'2',3   \n  \n  \na,\"bb\",ccc\n  2,3,4  ")
    parsed = helper.readSimpleCsv(fileName)
    assert parsed == [
        ['1', '2', '3'],
        ['a', 'bb', 'ccc'],
        ["2", "3", "4"],
    ]

    # One four-column row among three-column rows is expected to fail.
    overwrite("1,2,3\n\n  \na,bb,ccc,d\n  2,3,4  ")
    with pytest.raises(ValueError):
        helper.readSimpleCsv(fileName)
Esempio n. 3
0
def main(argv=None):
    """Split fastq reads into separate output files for each barcoded sample.

    Parses the command line, reads the sample/barcode table given by
    ``--barcodeFile``, opens one ``<sample>_<n>.fastq.gz`` output per sample
    and input read file, and writes every read yielded by
    ``barcodeFastqIter`` to the outputs registered for its barcode.

    Args:
        argv: argument list passed to ``ArgumentParser.parse_args``. ``None``
            lets argparse read the process command line.

    Raises:
        argparse.ArgumentTypeError: if the barcode file yields no rows, if
            the number of index files differs from the number of barcode
            columns, if two samples share a name, or if two samples share
            the same set of barcodes.
    """
    parser = argparse.ArgumentParser(description="A program to take a list of barcodes and one or more fastq reads and one or two index reads and output reads matching the barcodes into a seperate file for each barcode. The script takes read files and index files where the reads and indexs are in the same order and outputs reads which match the appropriate barcodes into separate files.")
    parser.add_argument('fastqFiles', help='a fastq file(s) (potentially gzipped) containing the sequence reads',type=helper.checkFile,nargs='+')
    parser.add_argument('-i','--indexFiles', help='a fastq file(s) (potentially gzipped) containing the index reads',type=helper.checkFile,nargs='+')
    parser.add_argument("-d","--dots", help="output dot to stderr every X reads. Input a negative number to suppress output (default:-1)", default=-1,type=int)
    parser.add_argument('-b','--barcodeFile', help='a file (potentially gzipped) file containing comma separated sample names, first barcode and second barcode (with no header and no commas in the sample names)',type=helper.checkFile,required=True)
    parser.add_argument('-o','--outputPath', help='a string giving the desired output directory',type=helper.checkDir,default='.')

    args = parser.parse_args(argv)

    # argparse leaves an omitted optional argument as None; treat that as
    # "no index files" so the count check below reports the mismatch instead
    # of len(None) raising a TypeError.
    indexFiles = args.indexFiles if args.indexFiles is not None else []

    nFiles = len(args.fastqFiles)
    barcodes = helper.readSimpleCsv(args.barcodeFile)
    if not barcodes:
        # Guard the barcodes[0] lookup below against a table with no rows.
        raise argparse.ArgumentTypeError("No barcodes found in the barcodeFile")
    if len(indexFiles) != len(barcodes[0]) - 1:
        raise argparse.ArgumentTypeError("Number of index files and index columns in the barcodeFile do not agree")

    samples = [row[0] for row in barcodes]
    if len(set(samples)) != len(samples):
        raise argparse.ArgumentTypeError("Two or more samples share the same name")
    outputFiles = [[os.path.join(args.outputPath, sample) + "_" + str(ii + 1) + ".fastq.gz" for ii in range(nFiles)] for sample in samples]
    bars = [tuple(row[1:]) for row in barcodes]
    if len(set(bars)) != len(bars):
        raise argparse.ArgumentTypeError("Two or more samples share the same set of barcodes")

    # Map each barcode tuple to its list of open output handles. Everything
    # from the first open onwards sits inside try/finally so the handles are
    # closed even when reading or writing fails part-way through.
    outHandles = {}
    try:
        for bar, paths in zip(bars, outputFiles):
            outHandles[bar] = [helper.openNormalOrGz(path, 'w') for path in paths]

        with barcodeFastqIter(args.fastqFiles, indexFiles, bars) as fastqIter:
            for currentReads, bar in fastqIter:
                for read, outFile in zip(currentReads, outHandles[bar]):
                    helper.writeFastqRead(outFile, read)
                # Progress dot every args.dots good reads (disabled when <= 0).
                if args.dots > 0 and fastqIter.nGood % args.dots == 0:
                    sys.stderr.write('.')

            if args.dots > 0:
                sys.stderr.write("\nGood reads: "+str(fastqIter.nGood)+" Bad reads: "+str(fastqIter.nBad)+"\n")
    finally:
        for handles in outHandles.values():
            helper.closeFiles(handles)
Esempio n. 4
0
def main(argv=None):
    """Split fastq reads into separate output files for each barcoded sample.

    Parses the command line, reads the sample/barcode table given by
    ``--barcodeFile``, opens one ``<sample>_<n>.fastq.gz`` output per sample
    and input read file, and writes every assigned read yielded by
    ``barcodeFastqIter`` to the outputs registered for its barcode. With
    ``--unassigned`` the reads that were not assigned, and their index reads,
    are written to ``__UNASSIGNED__R<n>.fastq.gz`` and
    ``__UNASSIGNED__I<n>.fastq.gz`` in the output directory.

    Args:
        argv: argument list passed to ``ArgumentParser.parse_args``. ``None``
            lets argparse read the process command line.

    Raises:
        argparse.ArgumentTypeError: if the barcode file yields no rows, if
            the number of index files differs from the number of barcode
            columns, if two samples share a name, if two samples share the
            same set of barcodes, or if ``--unassigned`` is set and a sample
            uses one of the reserved unassigned names.
    """
    parser = argparse.ArgumentParser(
        description=
        "A program to take a list of barcodes and one or more fastq reads and one or two index reads and output reads matching the barcodes into a seperate file for each barcode. The script takes read files and index files where the reads and indexs are in the same order and outputs reads which match the appropriate barcodes into separate files."
    )
    parser.add_argument(
        'fastqFiles',
        help=
        'a fastq file(s) (potentially gzipped) containing the sequence reads',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        '-i',
        '--indexFiles',
        help='a fastq file(s) (potentially gzipped) containing the index reads',
        type=helper.checkFile,
        nargs='+')
    parser.add_argument(
        "-d",
        "--dots",
        help=
        "output dot to stderr every X reads. Input a negative number to suppress output (default:-1)",
        default=-1,
        type=int)
    parser.add_argument(
        '-b',
        '--barcodeFile',
        help=
        'a (potentially gzipped) file containing comma separated sample names, first barcode and second barcode (with no header and no commas in the sample names)',
        type=helper.checkFile,
        required=True)
    parser.add_argument('-o',
                        '--outputPath',
                        help='a string giving the desired output directory',
                        type=helper.checkDir,
                        default='.')
    parser.add_argument(
        '-u',
        '--unassigned',
        help=
        'if set then store unassigned reads to {outputPath}/__UNASSIGNED__R#.fastq.gz with their corresponding barcodes in {outputPath}/__UNASSIGNED__I#.fastq.gz',
        action='store_true')

    args = parser.parse_args(argv)

    # argparse leaves an omitted optional argument as None; treat that as
    # "no index files" so the count check below reports the mismatch instead
    # of len(None) raising a TypeError.
    indexFiles = args.indexFiles if args.indexFiles is not None else []

    nFiles = len(args.fastqFiles)
    nIndexs = len(indexFiles)
    barcodes = helper.readSimpleCsv(args.barcodeFile)

    if not barcodes:
        # Guard the barcodes[0] lookup below against a table with no rows.
        raise argparse.ArgumentTypeError(
            "No barcodes found in the barcodeFile")
    if nIndexs != len(barcodes[0]) - 1:
        raise argparse.ArgumentTypeError(
            "Number of index files and index columns in the barcodeFile do not agree"
        )

    samples = [row[0] for row in barcodes]
    if len(set(samples)) != len(samples):
        raise argparse.ArgumentTypeError(
            "Two or more samples share the same name")
    bars = [tuple(row[1:]) for row in barcodes]
    if len(set(bars)) != len(bars):
        raise argparse.ArgumentTypeError(
            "Two or more samples share the same set of barcodes")
    # Validate the reserved names before any output file is created, so a
    # rejected run does not leave empty sample files behind.
    if args.unassigned and any(
            sample in ('__UNASSIGNED__R', '__UNASSIGNED__I')
            for sample in samples):
        raise argparse.ArgumentTypeError(
            "Sample named __UNASSIGNED__ clashes with unassigned output. Please rename"
        )

    outputFiles = [[
        os.path.join(args.outputPath, sample) + "_" + str(ii + 1) +
        ".fastq.gz" for ii in range(nFiles)
    ] for sample in samples]

    # Every handle opened below is tracked in these containers and closed in
    # the finally clause, including when reading or writing fails part-way.
    outHandles = {}
    badIndexHandles = []
    badReadHandles = []
    try:
        for bar, paths in zip(bars, outputFiles):
            outHandles[bar] = [
                helper.openNormalOrGz(path, 'w') for path in paths
            ]

        if args.unassigned:
            for ii in range(nIndexs):
                badIndexHandles.append(
                    helper.openNormalOrGz(
                        os.path.join(args.outputPath, "__UNASSIGNED__I") +
                        str(ii + 1) + ".fastq.gz", 'w'))
            for ii in range(nFiles):
                badReadHandles.append(
                    helper.openNormalOrGz(
                        os.path.join(args.outputPath, "__UNASSIGNED__R") +
                        str(ii + 1) + ".fastq.gz", 'w'))

        with barcodeFastqIter(args.fastqFiles, indexFiles, bars,
                              args.unassigned) as fastqIter:
            for currentReads, bar, assigned, fullBar in fastqIter:
                if args.unassigned and not assigned:
                    # Keep both the reads and the index reads that failed to
                    # match, so they can be inspected afterwards.
                    for read, outFile in zip(currentReads, badReadHandles):
                        helper.writeFastqRead(outFile, read)
                    for read, outFile in zip(fullBar, badIndexHandles):
                        helper.writeFastqRead(outFile, read)
                else:
                    for read, outFile in zip(currentReads, outHandles[bar]):
                        helper.writeFastqRead(outFile, read)
                    # Progress dot every args.dots assigned reads; flushed so
                    # it shows up immediately.
                    if args.dots > 0 and fastqIter.nGood % args.dots == 0:
                        sys.stderr.write('.')
                        sys.stderr.flush()

            if args.dots > 0:
                sys.stderr.write("\nReads assigned to barcode: " +
                                 str(fastqIter.nGood) + " Unassigned reads: " +
                                 str(fastqIter.nBad) + "\n")
    finally:
        for handles in outHandles.values():
            helper.closeFiles(handles)
        # The unassigned outputs must be closed as well; leaving them open
        # risks incomplete files on disk.
        for handles in (badReadHandles, badIndexHandles):
            if handles:
                helper.closeFiles(handles)