Ejemplo n.º 1
0
def fastq_sort(fastq, byname=True, bysequence=False, tmpdir=None, chunksize=100000, out=sys.stdout, quiet=False):
    """Sort FASTQ reads with a chunked (external) merge sort and write them to ``out``.

    Reads are pulled from ``fastq.fetch(quiet)``, grouped into chunks of at
    most ``chunksize`` reads, and each chunk is handed to ``_write_tmp``.  The
    chunks are then merged and every read is emitted with ``read.write(out)``.

    Args:
        fastq: object whose ``fetch(quiet)`` yields reads exposing ``name``,
            ``seq`` and ``write(out)``.
        byname: kept for backward compatibility.  Name order is used whenever
            ``bysequence`` is false.
        bysequence: sort by ``read.seq`` instead of ``read.name``.  Takes
            precedence over ``byname`` when both are true.
        tmpdir: accepted for interface compatibility; not used by this version.
        chunksize: number of reads buffered in memory before a chunk is
            written out.
        out: destination passed to ``read.write``.
        quiet: forwarded to ``fastq.fetch``.
    """
    import heapq

    tmpfiles = []
    chunk = []

    sys.stderr.write('Sorting FASTQ file into chunks...\n')
    count = 0
    for read in fastq.fetch(quiet):
        count += 1
        # Exactly one key per read.  Two independent ``if`` tests would append
        # every read twice (with mixed keys) when both flags are true, and
        # would drop every read when both are false.
        key = read.seq if bysequence else read.name
        chunk.append((key, read))

        if len(chunk) >= chunksize:
            tmpfiles.append(_write_tmp(chunk))
            chunk = []

    if chunk:
        tmpfiles.append(_write_tmp(chunk))

    sys.stderr.write('Merging chunks...\n')
    eta = ETA(count)

    # NOTE(review): _write_tmp is assumed to store each chunk in sorted order
    # and to return a handle that fastq_read_file can read from -- confirm.
    # Heap entries are (key, chunk index, read).  At most one entry per chunk
    # is in the heap at a time, so ties on the key are settled by the index
    # and the read objects themselves are never compared.
    heap = []

    def _refill(idx):
        # Any failure while fetching the next read is treated as "this chunk
        # is exhausted"; the end-of-file signal of fastq_read_file is not
        # visible from here.  KeyboardInterrupt/SystemExit still propagate.
        try:
            nxt = fastq_read_file(tmpfiles[idx])
            nxt_key = nxt.seq if bysequence else nxt.name
        except Exception:
            return
        heapq.heappush(heap, (nxt_key, idx, nxt))

    for idx in range(len(tmpfiles)):
        _refill(idx)

    written = 0
    while heap:
        _, idx, read = heapq.heappop(heap)
        written += 1
        eta.print_status(written)
        read.write(out)
        _refill(idx)
    eta.done()
Ejemplo n.º 2
0
def fastq_sort(fastq, bysequence=False, tmpdir=None, tmpprefix='.tmp', chunksize=100000, nogz=False, out=sys.stdout, quiet=False):
    """Sort FASTQ reads with a chunked (external) merge sort and write them to ``out``.

    Reads are pulled from ``fastq.fetch(quiet)``, grouped into chunks of at
    most ``chunksize`` reads, and each chunk is written to a temporary file by
    ``_write_tmp``.  The temporary files are then reopened, merged, and every
    read is emitted with ``read.write(out)``.  Temporary files are closed and
    removed even when sorting or merging fails part-way.

    Args:
        fastq: object whose ``fetch(quiet)`` yields reads exposing ``name``,
            ``seq`` and ``write(out)``.
        bysequence: sort by ``read.seq``; otherwise sort by ``read.name``.
        tmpdir: forwarded to ``_write_tmp``.
        tmpprefix: forwarded to ``_write_tmp``.
        chunksize: number of reads buffered in memory before a chunk is
            written out.
        nogz: forwarded to ``_write_tmp``; when true the temporary files are
            reopened with ``open``, otherwise with ``gzip.open``.
        out: destination passed to ``read.write``.
        quiet: forwarded to ``fastq.fetch``.
    """
    import heapq

    tmpfiles = []
    tmpfobjs = []

    try:
        chunk = []
        sys.stderr.write('Sorting FASTQ file into chunks...\n')
        count = 0
        for read in fastq.fetch(quiet):
            count += 1
            key = read.seq if bysequence else read.name
            chunk.append((key, read))

            if len(chunk) >= chunksize:
                tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))
                chunk = []

        if chunk:
            tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))

        sys.stderr.write('\nMerging chunks...\n')
        sys.stderr.flush()

        eta = ETA(count)

        # Open the handles one at a time so that a failure half-way through
        # still leaves the already-opened ones registered for cleanup.
        opener = open if nogz else gzip.open
        for tmpfile in tmpfiles:
            tmpfobjs.append(opener(tmpfile))

        # NOTE(review): _write_tmp is assumed to store each chunk in sorted
        # order -- confirm.  Heap entries are (key, chunk index, read).  At
        # most one entry per chunk is in the heap at a time, so ties on the
        # key are settled by the index and reads are never compared.
        heap = []

        def _refill(idx):
            # Any failure while fetching the next read is treated as "this
            # chunk is exhausted"; the end-of-file signal of fastq_read_file
            # is not visible from here.  KeyboardInterrupt/SystemExit still
            # propagate.
            try:
                nxt = fastq_read_file(tmpfobjs[idx])
                nxt_key = nxt.seq if bysequence else nxt.name
            except Exception:
                return
            heapq.heappush(heap, (nxt_key, idx, nxt))

        for idx in range(len(tmpfobjs)):
            _refill(idx)

        written = 0
        while heap:
            _, idx, read = heapq.heappop(heap)
            written += 1
            eta.print_status(written)
            read.write(out)
            _refill(idx)
        eta.done()
    finally:
        # Always release handles and remove the temporary files, including
        # when an error interrupted the sort or the merge.
        try:
            for fobj in tmpfobjs:
                fobj.close()
        finally:
            for tmpfile in tmpfiles:
                os.unlink(tmpfile)
Ejemplo n.º 3
0
def fastq_sort(fastq,
               bysequence=False,
               tmpdir=None,
               tmpprefix='.tmp',
               chunksize=100000,
               nogz=False,
               out=sys.stdout,
               quiet=False):
    """Externally sort FASTQ reads by name or sequence and write them to ``out``.

    The input is consumed through ``fastq.fetch(quiet)`` and split into chunks
    of at most ``chunksize`` reads; ``_write_tmp`` stores each chunk in a
    temporary file.  Those files are then reopened and merged, and each read
    is written with ``read.write(out)``.  The temporary files are closed and
    deleted on both success and failure.

    Args:
        fastq: object whose ``fetch(quiet)`` yields reads exposing ``name``,
            ``seq`` and ``write(out)``.
        bysequence: order by ``read.seq`` when true, by ``read.name``
            otherwise.
        tmpdir: forwarded to ``_write_tmp``.
        tmpprefix: forwarded to ``_write_tmp``.
        chunksize: maximum number of reads held in memory per chunk.
        nogz: forwarded to ``_write_tmp``; selects ``open`` (true) or
            ``gzip.open`` (false) for reading the temporary files back.
        out: destination passed to ``read.write``.
        quiet: forwarded to ``fastq.fetch``.
    """
    import heapq

    tmpfiles = []
    tmpfobjs = []

    try:
        chunk = []
        sys.stderr.write('Sorting FASTQ file into chunks...\n')
        count = 0
        for read in fastq.fetch(quiet):
            count += 1
            key = read.seq if bysequence else read.name
            chunk.append((key, read))

            if len(chunk) >= chunksize:
                tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))
                chunk = []

        if chunk:
            tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))

        sys.stderr.write('\nMerging chunks...\n')
        sys.stderr.flush()

        eta = ETA(count)

        # Register each handle as soon as it is opened so the ``finally``
        # clause can close it even if a later open fails.
        opener = open if nogz else gzip.open
        for tmpfile in tmpfiles:
            tmpfobjs.append(opener(tmpfile))

        # NOTE(review): each temporary file is assumed to hold its chunk in
        # sorted order (done by _write_tmp) -- confirm.  The heap holds one
        # (key, chunk index, read) entry per non-exhausted chunk; the unique
        # index breaks key ties, so read objects are never compared.
        heap = []

        def _refill(idx):
            # The end-of-file signal of fastq_read_file is not visible from
            # here, so any ordinary exception is taken to mean the chunk is
            # exhausted.  KeyboardInterrupt/SystemExit are not swallowed.
            try:
                nxt = fastq_read_file(tmpfobjs[idx])
                nxt_key = nxt.seq if bysequence else nxt.name
            except Exception:
                return
            heapq.heappush(heap, (nxt_key, idx, nxt))

        for idx in range(len(tmpfobjs)):
            _refill(idx)

        written = 0
        while heap:
            _, idx, read = heapq.heappop(heap)
            written += 1
            eta.print_status(written)
            read.write(out)
            _refill(idx)
        eta.done()
    finally:
        # Cleanup runs on every exit path so no temporary file is left behind.
        try:
            for fobj in tmpfobjs:
                fobj.close()
        finally:
            for tmpfile in tmpfiles:
                os.unlink(tmpfile)