コード例 #1
0
ファイル: mapping.py プロジェクト: bxlab/HiFive_Paper
def splitSRA(filename,
             outFile="auto",
             splitBy=4000000,
             FASTQ_BINARY="./fastq-dump",
             FASTQ_ARGS=[]):
    """Split a paired-end SRA file into gzipped FASTQ chunks.

    Runs ``FASTQ_BINARY filename -Z --split-files`` and reads FASTQ records
    from its standard output.  The stream is assumed to alternate between a
    4-line record for side 1 and a 4-line record for side 2; each pair is
    routed to the two writers of the current chunk.

    Parameters
    ----------
    filename : str
        Path to the SRA file.
    outFile : str
        Output name template taking two ``str.format`` fields: the chunk
        number and the side (1 or 2).  With "auto" the template is derived
        from ``filename`` by dropping ".sra" and appending
        "_{0}_side{1}.fastq.gz".
    splitBy : int
        Maximum number of read pairs written to one chunk.
    FASTQ_BINARY : str
        Path to the fastq-dump executable.
    FASTQ_ARGS : list
        Extra command-line arguments appended to the fastq-dump call.

    Raises
    ------
    IOError
        If a record header does not start with "@".
    """
    inFile = os.path.abspath(filename)
    if outFile == "auto":
        outFile = filename.replace(".sra", "") + "_{0}_side{1}.fastq.gz"

    # list(...) copies the argument, so the shared default list is only read.
    command = [FASTQ_BINARY, inFile, "-Z", "--split-files"] + list(FASTQ_ARGS)
    pread = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=-1)
    inStream = pread.stdout

    for counter in xrange(1000000):

        # gzipWriter is defined elsewhere; it is used here as a Popen-like
        # object exposing .stdin and .communicate().
        outProc1 = gzipWriter(outFile.format(counter, 1))
        outProc2 = gzipWriter(outFile.format(counter, 2))
        outStream1 = outProc1.stdin
        outStream2 = outProc2.stdin

        halted = False
        for _ in xrange(splitBy):

            line = inStream.readline()

            # Explicit checks instead of ``assert``: assertions are stripped
            # under ``python -O``, which would disable both the format check
            # and the end-of-stream detection.
            if not line:
                halted = True
                break
            if line[:1] != b"@":
                print('Not fastq')
                raise IOError("File is not fastq: {0}".format(filename))

            fastq_entry = (line, inStream.readline(), inStream.readline(),
                           inStream.readline())

            outStream1.writelines(fastq_entry)
            outStream2.writelines((inStream.readline(), inStream.readline(),
                                   inStream.readline(), inStream.readline()))

        # communicate() closes the writer's stdin and waits for it to finish.
        outProc1.communicate()
        outProc2.communicate()
        print("finished block number {0}".format(counter))
        if halted:
            # End of stream reached: reap the reader so no zombie is left.
            inStream.close()
            pread.wait()
            return
コード例 #2
0
ファイル: mapping.py プロジェクト: bxlab/HiFive_Paper
def splitSRA(filename, outFile="auto", splitBy=4000000, FASTQ_BINARY="./fastq-dump", FASTQ_ARGS=[]):
    """Dump a paired-end SRA file into numbered, gzipped FASTQ chunk files.

    The command ``FASTQ_BINARY filename -Z --split-files`` (plus
    ``FASTQ_ARGS``) is started and its standard output is consumed eight
    lines at a time: the first four lines go to the side-1 writer and the
    next four to the side-2 writer of the current chunk.  A new pair of
    writers is opened after every ``splitBy`` read pairs.

    Args:
        filename: path to the SRA file.
        outFile: template with two ``str.format`` fields (chunk number,
            side).  "auto" builds it from ``filename`` with ".sra" removed
            and "_{0}_side{1}.fastq.gz" appended.
        splitBy: maximum number of read pairs per chunk.
        FASTQ_BINARY: path to the fastq-dump executable.
        FASTQ_ARGS: additional arguments for fastq-dump.

    Raises:
        IOError: when a record header does not begin with "@".
    """
    inFile = os.path.abspath(filename)
    if outFile == "auto":
        outFile = filename.replace(".sra", "") + "_{0}_side{1}.fastq.gz"

    # The default list is copied, never mutated.
    pread = subprocess.Popen([FASTQ_BINARY, inFile, "-Z", "--split-files"] + list(FASTQ_ARGS),
                             stdout=subprocess.PIPE, bufsize=-1)
    inStream = pread.stdout

    for counter in xrange(1000000):

        # gzipWriter lives elsewhere in the module; only its .stdin and
        # .communicate() members are relied on here.
        outProc1 = gzipWriter(outFile.format(counter, 1))
        outProc2 = gzipWriter(outFile.format(counter, 2))
        outStream1 = outProc1.stdin
        outStream2 = outProc2.stdin

        halted = False
        for _ in xrange(splitBy):

            line = inStream.readline()

            if not line:
                # Empty read means the dump has ended.
                halted = True
                break
            if line[:1] != b"@":
                # A plain ``if`` keeps this check alive under ``python -O``,
                # where ``assert`` statements are removed.
                print('Not fastq')
                raise IOError("File is not fastq: {0}".format(filename))

            fastq_entry = (line, inStream.readline(),
                           inStream.readline(), inStream.readline())

            outStream1.writelines(fastq_entry)
            outStream2.writelines((inStream.readline(), inStream.readline(),
                                   inStream.readline(), inStream.readline()))

        # Close each writer's stdin and wait for it to exit.
        outProc1.communicate()
        outProc2.communicate()
        print("finished block number {0}".format(counter))
        if halted:
            # Collect the finished fastq-dump process before returning.
            inStream.close()
            pread.wait()
            return
コード例 #3
0
def splitSingleFastq(filename,
                     outFile,
                     splitBy=4000000,
                     convertReadID=lambda x: x):
    """Split one gzipped FASTQ file into gzipped chunks of ``splitBy`` reads.

    The input is decompressed with ``gunzip -c`` and read four lines at a
    time.  If the final chunk holds fewer than ``splitBy / 3`` reads and is
    not the only chunk, it is appended to the previous chunk file and its
    own file is removed.

    Parameters
    ----------
    filename : str
        Path to the gzipped FASTQ file.
    outFile : str
        Output name template with one ``str.format`` field, the chunk number.
    splitBy : int
        Maximum number of reads per chunk.
    convertReadID : callable
        Applied to each header line before it is written; identity by
        default.

    Returns
    -------
    list of int
        Number of reads in each chunk that remains on disk.

    Raises
    ------
    IOError
        If a record header does not start with "@".
    """
    inFile = os.path.abspath(filename)

    pread = subprocess.Popen(["gunzip", inFile, "-c"],
                             stdout=subprocess.PIPE,
                             bufsize=-1)
    inStream = pread.stdout

    counters = []
    for counter in range(100000):

        # gzipWriter is defined elsewhere; it is used as a Popen-like object
        # (.stdin, .communicate()) and is assumed to write to the given path.
        outProc1 = gzipWriter(outFile.format(counter))
        outStream1 = outProc1.stdin

        halted = False
        for j in range(splitBy):

            line = inStream.readline()

            # Explicit checks rather than ``assert``: assertions are removed
            # under ``python -O``, which would disable both the format check
            # and the end-of-stream detection.
            if not line:
                halted = True
                counters.append(j)
                break
            if line[:1] != b"@":
                print('Not fastq')
                print("bad line: {0}".format(line))
                raise IOError("File is not fastq: {0}".format(filename))

            fastq_entry = (convertReadID(line), inStream.readline(),
                           inStream.readline(), inStream.readline())
            outStream1.writelines(fastq_entry)

        # Closes the writer's stdin and waits for it to finish.
        outProc1.communicate()
        print("finished block number", counter)

        if halted:
            # End of input: reap the gunzip process.
            inStream.close()
            pread.wait()

            if (counters[-1] < splitBy / 3) and (len(counters) > 1):
                f1 = outFile.format(counter - 1)
                f2 = outFile.format(counter)
                tmp = f1 + "_tmp"
                # Concatenate the two gzip files byte-for-byte in Python
                # instead of a shell "cat": no quoting problems with unusual
                # file names, and an I/O failure raises before f1 is replaced.
                with open(tmp, "wb") as merged:
                    for part in (f1, f2):
                        with open(part, "rb") as src:
                            shutil.copyfileobj(src, merged)
                shutil.move(tmp, f1)
                os.remove(f2)
                last = counters.pop()
                counters[-1] = counters[-1] + last
            print("Read counts", counters)
            return counters
        counters.append(splitBy)
コード例 #4
0
def _filter_fastq(ids,
                  inStream,
                  out_fastq,
                  in_filename="none"):
    '''Filter FASTQ records by read ID.

    Reads 4-line records from **inStream** and writes to **out_fastq**
    (through gzipWriter) only those whose ID is contained in **ids**.  The ID
    is the first whitespace-delimited token of the header line without its
    leading "@".

    Parameters
    ----------
    ids : container
        Read IDs to keep; must support ``in``.  IDs are compared as read
        from the stream (presumably bytes - TODO confirm against callers).
    inStream : file-like
        Source of FASTQ lines; only ``readline()`` is used.
    out_fastq : str
        Destination passed to gzipWriter.
    in_filename : str
        Name of the input, used only in the error message.

    Returns
    -------
    (num_total, num_filtered) : tuple of int
        Number of records read and number of records written.

    Raises
    ------
    AssertionError
        If a record header does not start with "@".
    RuntimeError
        If the writing process exits with a non-zero return code.
    '''
    writingProcess = gzipWriter(out_fastq)

    num_filtered = 0
    num_total = 0
    while True:

        line = inStream.readline()
        if not line:
            # Empty read: end of input.
            break
        if line[:1] != b"@":
            # Raised explicitly (same exception type as before) because
            # ``assert`` statements disappear under ``python -O``.
            print('Not fastq')
            raise AssertionError(
                '{0} does not comply with the FASTQ standards.'.format(
                    in_filename))

        fastq_entry = (line, inStream.readline(), inStream.readline(),
                       inStream.readline())
        read_id = line.split()[0][1:]
        if read_id in ids:
            writingProcess.stdin.writelines(fastq_entry)
            num_filtered += 1
        num_total += 1

    # sleep() is a helper defined elsewhere in the module; its purpose is
    # not visible from this function.
    sleep()
    writingProcess.communicate()

    if writingProcess.returncode != 0:
        raise RuntimeError("Writing process return code {0}".format(
            writingProcess.returncode))
    return num_total, num_filtered
コード例 #5
0
ファイル: mapping.py プロジェクト: bxlab/HiFive_Paper
def _filter_fastq(ids, inStream, out_fastq, in_filename="none"):
    '''Filter FASTQ sequences by their IDs.

    Read 4-line entries from **inStream** and store in **out_fastq** only
    those whose ID is in **ids**.  The ID is the first whitespace-delimited
    token of the header line with the leading "@" removed.

    **in_filename** is used only to name the input in the error message.

    Returns a ``(num_total, num_filtered)`` tuple: entries read and entries
    written.

    Raises IOError if an entry header does not start with "@", and
    RuntimeError if the writing process exits with a non-zero return code.
    '''
    writingProcess = gzipWriter(out_fastq)

    num_filtered = 0
    num_total = 0
    while True:

        line = inStream.readline()

        if not line:
            # Empty read: end of input.
            break
        if line[:1] != b"@":
            # Previously this only printed a warning and kept going, which
            # left the 4-line framing out of sync (and crashed on blank
            # lines).  Fail loudly instead, as splitSRA does.
            print('Not fastq')
            raise IOError('{0} does not comply with the FASTQ standards.'.format(in_filename))

        fastq_entry = (line, inStream.readline(),
                       inStream.readline(), inStream.readline())
        read_id = line.split()[0][1:]
        if read_id in ids:
            writingProcess.stdin.writelines(fastq_entry)
            num_filtered += 1
        num_total += 1

    # sleep() is a helper defined elsewhere in the module; its purpose is
    # not visible from this function.
    sleep()
    writingProcess.communicate()

    if writingProcess.returncode != 0:
        raise RuntimeError("Writing process return code {0}".format(writingProcess.returncode))
    return num_total, num_filtered
コード例 #6
0
def splitSRA(filename,
             outFile="auto",
             splitBy=4000000,
             FASTQ_BINARY="./fastq-dump",
             FASTQ_ARGS=[]):
    """Split a paired-end SRA file into gzipped FASTQ chunks and count reads.

    Runs ``FASTQ_BINARY filename -Z --split-files`` and reads its standard
    output eight lines at a time: four lines for side 1 followed by four
    lines for side 2.  A new pair of chunk files is started every
    ``splitBy`` read pairs.  If the final chunk holds fewer than
    ``splitBy / 3`` pairs and is not the only chunk, both of its files are
    appended to the previous chunk's files and then removed.

    Parameters
    ----------
    filename : str
        Path to the SRA file.
    outFile : str
        Template with two ``str.format`` fields: chunk number and side.
        "auto" derives it from ``filename`` with ".sra" removed and
        "_{0}_side{1}.fastq.gz" appended.
    splitBy : int
        Maximum number of read pairs per chunk.
    FASTQ_BINARY : str
        Path to the fastq-dump executable.
    FASTQ_ARGS : list
        Extra arguments appended to the fastq-dump command line.

    Returns
    -------
    list of int
        Number of read pairs in each chunk that remains on disk.

    Raises
    ------
    ValueError
        If ``FASTQ_BINARY`` does not exist.
    IOError
        If a record header does not start with "@".
    """
    if not os.path.exists(FASTQ_BINARY):
        raise ValueError("(fastq-dump) file not found at {0}".format(
            os.path.abspath(FASTQ_BINARY)))

    inFile = os.path.abspath(filename)
    if outFile == "auto":
        outFile = filename.replace(".sra", "") + "_{0}_side{1}.fastq.gz"

    # list(...) copies the argument, so the shared default list is only read.
    command = [FASTQ_BINARY, inFile, "-Z", "--split-files"] + list(FASTQ_ARGS)
    pread = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=-1)
    inStream = pread.stdout

    counters = []
    for counter in range(1000000):

        # gzipWriter is defined elsewhere; it is used as a Popen-like object
        # (.stdin, .communicate()) and is assumed to write to the given path.
        outProc1 = gzipWriter(outFile.format(counter, 1))
        outProc2 = gzipWriter(outFile.format(counter, 2))
        outStream1 = outProc1.stdin
        outStream2 = outProc2.stdin

        halted = False
        for j in range(splitBy):

            line = inStream.readline()

            # Explicit checks rather than ``assert``: assertions are removed
            # under ``python -O``, which would disable both the format check
            # and the end-of-stream detection.
            if not line:
                halted = True
                counters.append(j)
                break
            if line[:1] != b"@":
                print('Not fastq')
                print("bad line: {0}".format(line))
                raise IOError("File is not fastq: {0}".format(filename))

            fastq_entry = (line, inStream.readline(), inStream.readline(),
                           inStream.readline())

            outStream1.writelines(fastq_entry)
            outStream2.writelines((inStream.readline(), inStream.readline(),
                                   inStream.readline(), inStream.readline()))

        # Close each writer's stdin and wait for it to finish.
        outProc1.communicate()
        outProc2.communicate()
        print("finished block number", counter)
        if halted:
            # End of stream: reap the fastq-dump process.
            inStream.close()
            pread.wait()

            if (counters[-1] < splitBy / 3) and (len(counters) > 1):
                for side in [1, 2]:
                    f1 = outFile.format(counter - 1, side)
                    f2 = outFile.format(counter, side)
                    tmp = f1 + "_tmp"
                    # Byte-wise concatenation in Python instead of a shell
                    # "cat": safe for any file name, and an I/O failure
                    # raises before f1 is replaced.
                    with open(tmp, "wb") as merged:
                        for part in (f1, f2):
                            with open(part, "rb") as src:
                                shutil.copyfileobj(src, merged)
                    shutil.move(tmp, f1)
                    os.remove(f2)
                last = counters.pop()
                counters[-1] = counters[-1] + last
            return counters
        counters.append(splitBy)
    return counters