Beispiel #1
0
def demultiplex_fastq(index, fastq1, fastq2):
    """Emit the records of one or two FASTQ inputs selected by ``index``.

    Every input is opened with ``FastQParser`` using the filter
    ``{"index": index}``.  Records from ``fastq1`` are written to standard
    output.  When ``fastq2`` is given, one record is pulled from it for every
    record of ``fastq1``, checked with ``is_read_pair`` and written to
    standard error.

    Args:
        index: Value handed to the parser under the ``"index"`` filter key.
        fastq1: First FASTQ input, passed straight to ``FastQParser``.
        fastq2: Optional second FASTQ input; ``None`` for single-end data.

    Raises:
        AssertionError: If ``is_read_pair`` rejects a pair of records.
    """
    # Named so that the builtin ``filter`` is not shadowed.
    record_filter = {"index": index}
    paired = fastq2 is not None

    fp1 = FastQParser(fastq1, record_filter)
    fp2 = FastQParser(fastq2, record_filter) if paired else None

    for r1 in fp1:
        if paired:
            # The builtin next() drives the iterator protocol on Python 2 and
            # Python 3 alike; calling the ``.next()`` method is Python 2 only.
            r2 = next(fp2)
            # Raised explicitly (same type and message as the former assert)
            # so the check is not stripped when Python runs with -O.
            if not is_read_pair(r1, r2):
                raise AssertionError(
                    "Mismatching headers for expected read pair")
            sys.stderr.write("{}\n".format("\n".join(r2)))

        sys.stdout.write("{}\n".format("\n".join(r1)))
def demultiplex_fastq(index, fastq1, fastq2):
    """Emit the records of one or two FASTQ inputs selected by ``index``.

    Every input is opened with ``FastQParser`` using the filter
    ``{"index": index}``.  Records from ``fastq1`` are written to standard
    output.  When ``fastq2`` is given, one record is pulled from it for every
    record of ``fastq1``, checked with ``is_read_pair`` and written to
    standard error.

    Args:
        index: Value handed to the parser under the ``"index"`` filter key.
        fastq1: First FASTQ input, passed straight to ``FastQParser``.
        fastq2: Optional second FASTQ input; ``None`` for single-end data.

    Raises:
        AssertionError: If ``is_read_pair`` rejects a pair of records.
    """
    # Named so that the builtin ``filter`` is not shadowed.
    record_filter = {"index": index}
    paired = fastq2 is not None

    fp1 = FastQParser(fastq1, record_filter)
    fp2 = FastQParser(fastq2, record_filter) if paired else None

    for r1 in fp1:
        if paired:
            # The builtin next() drives the iterator protocol on Python 2 and
            # Python 3 alike; calling the ``.next()`` method is Python 2 only.
            r2 = next(fp2)
            # Raised explicitly (same type and message as the former assert)
            # so the check is not stripped when Python runs with -O.
            if not is_read_pair(r1, r2):
                raise AssertionError(
                    "Mismatching headers for expected read pair")
            sys.stderr.write("{}\n".format("\n".join(r2)))

        sys.stdout.write("{}\n".format("\n".join(r1)))
Beispiel #3
0
def _split_fastq(fastq_input, outdir, outprefix, outsuffix, samples):
    """Split FASTQ records into one output file per index.

    The index of a record is whatever follows the last ``:`` in its header
    (``record[0]``), with surrounding whitespace stripped; a header without
    any ``:`` is used whole.  Each distinct index gets its own
    ``FastQWriter`` named ``<outprefix>_<name><outsuffix>`` inside ``outdir``,
    where ``<name>`` is ``samples[index]`` when present and the index itself
    otherwise.

    Args:
        fastq_input: Iterable of inputs, each passed to ``FastQParser``.
        outdir: Output directory; created (one level) if it does not exist.
        outprefix: Leading part of every output file name.
        outsuffix: Trailing part of every output file name.
        samples: Mapping from index to the name used in the file name.

    Returns:
        dict: Maps each index seen to the value of its writer's
        ``rwritten()`` (presumably the number of records written).
    """
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    out_handles = {}
    try:
        for fastq_file in fastq_input:
            for record in FastQParser(fastq_file):
                header = record[0]
                i = header[header.rfind(":") + 1:].strip()
                # open a writer for this index the first time it is seen
                if i not in out_handles:
                    out_file = os.path.join(
                        outdir,
                        "%s_%s%s" % (outprefix, samples.get(i, i), outsuffix))
                    out_handles[i] = FastQWriter(out_file)
                out_handles[i].write(record)

        # summarize the written records while the writers are still open
        counts = {}
        for i, oh in out_handles.items():
            counts[i] = oh.rwritten()
        return counts
    finally:
        # Close every writer even when parsing or writing failed part-way,
        # so no output handle is leaked on error.
        for oh in out_handles.values():
            oh.close()
Beispiel #4
0
def count_top_indexes(count_num, index_file, index_length, progress_interval):
    """
    Determine the most common indexes, sampling at most 200,000 reads.

    The number of reads is estimated as the line count of ``index_file``
    (obtained with ``wc -l``) divided by four.  If that exceeds 200,000 the
    records are subsampled through ``iter_sample_fast``.  The first
    ``index_length`` characters of each record's sequence (``record[1]``) are
    tallied, and the ``count_num`` most frequent ones are printed to standard
    output with their count and their percentage of the reads considered.
    Progress and status messages go to standard error.

    Args:
        count_num: Positive integer, number of indexes to report.
        index_file: Path of the FASTQ file holding the index reads.
        index_length: Number of leading bases that make up an index.
        progress_interval: Report progress every this many reads.

    Raises:
        AssertionError: If ``count_num`` is not a positive integer.
        subprocess.CalledProcessError: If ``wc`` exits with an error.
    """
    # Raised explicitly (same type and message as the former assert) so the
    # validation is not stripped when Python runs with -O.
    if not (isinstance(count_num, int) and count_num > 0):
        raise AssertionError("Number passed must be a positive integer.")

    fqp_ind = FastQParser(index_file)
    # This should perhaps be added to the FastQParser class
    print("Counting total number of lines in fastq file...",
          file=sys.stderr,
          end="")
    # An argument list (rather than a formatted string that is split again)
    # keeps paths containing spaces or quotes intact.
    wc_output = subprocess.check_output(["wc", "-l", index_file])
    total_lines = int(wc_output.split()[0])
    # Four lines per FASTQ record; floor division keeps the count an int.
    total_reads = total_lines // 4
    print(" complete.", file=sys.stderr)

    index_tally = collections.defaultdict(int)
    reads_processed = 0
    # Subsample if file is large
    if total_reads > 200000:
        print("Subsampling 200,000 reads from index file...", file=sys.stderr)
        fqp_ind = iter_sample_fast(fqp_ind, 200000, total_reads)
        print("Complete.", file=sys.stderr)
        total_reads = 200000
    print("Tallying indexes in {} records...".format(total_reads),
          file=sys.stderr)

    start_time = datetime.datetime.now()
    for record in fqp_ind:
        index_seq = record[1][:index_length]
        index_tally[index_seq] += 1
        # Fixed: the counter was misspelled ("reads_processeds"), which raised
        # on the first record and never advanced the progress count.
        reads_processed += 1
        if reads_processed % progress_interval == 0:
            print_progress(reads_processed, total_reads, start_time)
    print("\n", file=sys.stderr)

    found = len(index_tally)
    if count_num > found:
        print(
            "Number of indexes found ({}) is fewer than those requested ({}). Printing all indexes found."
            .format(found, count_num),
            file=sys.stderr)
        print("Printing indexes...", file=sys.stderr)
        count_num = found

    print("{:<20} {:>20} {:>11}".format("Index", "Occurences", "Percentage"))
    # Most frequent first; the sort is stable, so ties keep tally order.
    ranked = sorted(index_tally.items(),
                    key=(lambda item: item[1]),
                    reverse=True)
    for index_seq, occurrences in ranked[:count_num]:
        percentage = (100.0 * occurrences) / total_reads
        print("{:<20} {:>20,} {:>10.2f}%".format(index_seq, occurrences,
                                                 percentage))
Beispiel #5
0
from bloomfaster import Elf
import collections
import sys

# Slight modification to read from input file instead of stdin
from scilifelab.utils.fastq_utils import (FastQParser, FastQWriter)

# Substitute the script name into the module docstring.  The docstring itself
# is not visible in this excerpt -- presumably it holds a usage text with one
# %-placeholder; confirm against the full file.
__doc__ %= sys.argv[0]
# More than one argument: echo the arguments and the usage text, then exit.
# NOTE(review): a call with no argument passes this check and then fails on
# sys.argv[1] below with an IndexError.
if len(sys.argv) > 2:
    print sys.argv
    print __doc__
    sys.exit()

print >> sys.stderr, "Command: ", " ".join(sys.argv)
infile = sys.argv[1]
fp = FastQParser(infile)
# First pass: run the parser to the end without using the records, so that
# the count can be read from it afterwards.
for _ in fp:
    pass
# presumably the number of records read so far -- confirm against FastQParser
records = fp.rread()
print >> sys.stderr, records, "records in file ", infile

# say 1 out of 1000 is false positive.
# NOTE(review): the first argument presumably sizes the filter for `records`
# entries -- confirm against the bloomfaster documentation.
bloom = Elf(records, error_rate=1e-3)
# Rewind for the second pass.
# NOTE(review): assumes seek(0) makes the parser iterate from the start
# again -- confirm against FastQParser.
fp.seek(0)
checks = []
# Second pass: a sequence the filter already reports as present is recorded
# in `checks`; every sequence is then added to the filter.
for _, seq, _, _ in fp:
    if seq in bloom:
        checks.append(seq)
    bloom.add(seq)

# now checks contains anything that could be a duplicate according to
Beispiel #6
0
from bloomfaster import Elf
import collections
import sys

# Slight modification to read from input file instead of stdin
from scilifelab.utils.fastq_utils import (FastQParser, FastQWriter)

# Substitute the script name into the module docstring.  The docstring itself
# is not visible in this excerpt -- presumably it holds a usage text with one
# %-placeholder; confirm against the full file.
__doc__ %= sys.argv[0]
# More than one argument: echo the arguments and the usage text, then exit.
# NOTE(review): a call with no argument passes this check and then fails on
# sys.argv[1] below with an IndexError.
if len(sys.argv) > 2:
    print sys.argv
    print __doc__
    sys.exit()

print >>sys.stderr, "Command: ", " ".join(sys.argv)
infile = sys.argv[1]
fp = FastQParser(infile)
# First pass: run the parser to the end without using the records, so that
# the count can be read from it afterwards.
for _ in fp:
    pass
# presumably the number of records read so far -- confirm against FastQParser
records = fp.rread()
print >>sys.stderr, records, "records in file ", infile

# say 1 out of 1000 is false positive.
# NOTE(review): the first argument presumably sizes the filter for `records`
# entries -- confirm against the bloomfaster documentation.
bloom = Elf(records, error_rate=1e-3)
# Rewind for the second pass.
# NOTE(review): assumes seek(0) makes the parser iterate from the start
# again -- confirm against FastQParser.
fp.seek(0)
checks = []
# Second pass: a sequence the filter already reports as present is recorded
# in `checks`; every sequence is then added to the filter.
for _,seq,_,_ in fp:
    if seq in bloom:
        checks.append(seq)
    bloom.add(seq)

# now checks contains anything that could be a duplicate according to