Exemple #1
0
def verbose_fastq_iter(filename):
    """Yield every FASTQ record in *filename*, reporting progress on stderr.

    A ``'... filtering <n>'`` line is written to ``sys.stderr`` before
    record 0 and before every 10000th record after it.

    Args:
        filename: Path handed to ``open()``; the resulting handle is passed
            to ``screed.fastq.fastq_iter``.

    Yields:
        The records produced by ``fastq_iter``, unchanged and in order.
    """
    from screed.fastq import fastq_iter

    # The ``with`` block closes the handle once the generator is exhausted
    # or closed; previously the handle was never closed.
    with open(filename) as handle:
        for n, record in enumerate(fastq_iter(handle)):
            if n % 10000 == 0:
                # write() behaves the same on Python 2 and 3, unlike the
                # Python 2-only ``print >>stream`` statement.
                sys.stderr.write('... filtering %d\n' % n)
            yield record
Exemple #2
0
def main_ilfq(ilfq_name, re_site, dimer_file=None):
    """Filter and trim read pairs from an interleaved FASTQ file.

    Each pair produced by ``pairitr(fastq_iter(fh))`` has both sequences,
    minus their first ``len(re_site)`` bases, run through the callable
    returned by ``SSW(re_site)``.  Two independent checks follow:

    * If either result does not have ``query_begin == 0`` and
      ``query_end == 4``, the pair is passed to ``is_dimer``.  Pairs it
      rejects are emitted with ``printrecs``; pairs it accepts are counted
      as adaptor hits.
    * If both results report the same ``target_begin``, read 1 is sliced to
      ``[:target_begin]``, read 2 is replaced by a single ``N`` with quality
      ``#``, and the pair is emitted with ``printrecs``.

    Progress (every 1000 pairs) and the final counts go to ``sys.stderr``.

    Args:
        ilfq_name: Path of the interleaved FASTQ file; opened with
            ``gzip.open`` when the name ends in ``"gz"``, otherwise ``open``.
        re_site: Value handed to ``SSW``; its length is also the number of
            leading bases skipped before each read is passed to that
            callable.
        dimer_file: Optional path opened for writing and handed to
            ``is_dimer``; ``None`` is passed through unchanged.
    """
    pst1 = SSW(re_site)
    re_len = len(re_site)
    n_reads = 0
    n_adapt = 0
    n_trimmed = 0

    if dimer_file is not None:
        dimer_file = open(dimer_file, 'w')
    try:
        opener = gzip.open if ilfq_name.endswith("gz") else open
        # Both handles are now released even if a record fails to parse or
        # a helper raises part-way through the file.
        with opener(ilfq_name) as fh:
            for r1, r2 in pairitr(fastq_iter(fh)):
                n_reads += 1
                r1aln = pst1(r1["sequence"][re_len:])
                r2aln = pst1(r2["sequence"][re_len:])
                # NOTE(review): 4 is presumably len(re_site) - 1 for a
                # 5-base site -- confirm before using other site lengths.
                if r1aln.query_begin != 0 or r1aln.query_end != 4 or \
                        r2aln.query_begin != 0 or r2aln.query_end != 4:
                    # Not read-through, but might be dimer.
                    if not is_dimer(r1, r2, re_len, dimer_file):
                        printrecs(r1, r2)
                    else:
                        n_adapt += 1
                # NOTE(review): this check also runs for pairs handled by
                # the branch above, so such a pair can be printed twice --
                # confirm whether a ``continue`` was intended.
                if r1aln.target_begin == r2aln.target_begin:
                    r1["sequence"] = r1["sequence"][:r1aln.target_begin]
                    r1["quality"] = r1["quality"][:r1aln.target_begin]
                    r2["sequence"] = "N"
                    r2["quality"] = "#"
                    n_trimmed += 1
                    printrecs(r1, r2)
                if n_reads % 1000 == 0:
                    print("Processed {:0.0f}K read pairs".format(
                              n_reads / 1000.0),
                          file=sys.stderr,
                          end='\r')
    finally:
        if dimer_file is not None:
            dimer_file.close()

    print("Processed", n_reads, "read pairs", file=sys.stderr)
    print("Trimmed", n_trimmed, file=sys.stderr)
    print("Adaptor in", n_adapt, file=sys.stderr)
Exemple #3
0
def main_ilfq(ilfq_name, re_site, dimer_file=None):
    """Process an interleaved FASTQ file pair by pair, printing or trimming.

    For every pair yielded by ``pairitr(fastq_iter(fh))`` the sequence of
    each read, with its first ``len(re_site)`` bases removed, is passed to
    the callable built by ``SSW(re_site)``.

    * When either result fails ``query_begin == 0 and query_end == 4`` the
      pair goes to ``is_dimer``: a false result prints the pair through
      ``printrecs``, a true result increments the adaptor count.
    * When the two results have equal ``target_begin`` values, read 1's
      sequence and quality are sliced to ``[:target_begin]``, read 2 becomes
      ``N`` / ``#``, the trimmed count is incremented and the pair is
      printed through ``printrecs``.

    A progress line is written to ``sys.stderr`` every 1000 pairs, followed
    by three summary lines once the input is exhausted.

    Args:
        ilfq_name: Input path; names ending in ``"gz"`` are opened with
            ``gzip.open``, all others with ``open``.
        re_site: Argument for ``SSW``; ``len(re_site)`` bases are skipped at
            the start of each read before it is passed to that callable.
        dimer_file: Optional output path, opened in ``'w'`` mode and passed
            to ``is_dimer``.  ``None`` is forwarded as-is.
    """
    pst1 = SSW(re_site)
    if dimer_file is not None:
        dimer_file = open(dimer_file, 'w')
    n_reads = 0
    n_adapt = 0
    n_trimmed = 0
    re_len = len(re_site)
    try:
        if ilfq_name.endswith("gz"):
            fh = gzip.open(ilfq_name)
        else:
            fh = open(ilfq_name)
        try:
            reads = fastq_iter(fh)
            for rpair in pairitr(reads):
                n_reads += 1
                r1, r2 = rpair
                r1aln = pst1(r1["sequence"][re_len:])
                r2aln = pst1(r2["sequence"][re_len:])
                # NOTE(review): the literal 4 looks tied to a 5-base
                # re_site -- confirm.
                if r1aln.query_begin != 0 or r1aln.query_end != 4 or \
                        r2aln.query_begin != 0 or r2aln.query_end != 4:
                    # Not read-through, but might be dimer.
                    if not is_dimer(r1, r2, re_len, dimer_file):
                        printrecs(r1, r2)
                    else:
                        n_adapt += 1
                # NOTE(review): evaluated even after the branch above, so a
                # pair may reach printrecs twice -- confirm this is wanted.
                if r1aln.target_begin == r2aln.target_begin:
                    r1["sequence"] = r1["sequence"][:r1aln.target_begin]
                    r1["quality"] = r1["quality"][:r1aln.target_begin]
                    r2["sequence"] = "N"
                    r2["quality"] = "#"
                    n_trimmed += 1
                    printrecs(r1, r2)
                if n_reads % 1000 == 0:
                    print("Processed {:0.0f}K read pairs".format(
                              n_reads / 1000.0),
                          file=sys.stderr, end='\r')
        finally:
            # Release the input even when a helper raises mid-file.
            fh.close()
    finally:
        # Likewise for the optional dimer output, so buffered text is
        # flushed on every exit path.
        if dimer_file is not None:
            dimer_file.close()
    print("Processed", n_reads, "read pairs", file=sys.stderr)
    print("Trimmed", n_trimmed, file=sys.stderr)
    print("Adaptor in", n_adapt, file=sys.stderr)
Exemple #4
0
import sys

sys.path.insert(0, '/u/t/dev/screed')
import screed
from screed.fastq import fastq_iter

import khmer

# Constructed as before; nothing below reads it.
hasher = khmer.HashtableIntersect(15, *khmer.PRIMES_1m)

filename = '/scratch/gpgc/iowa/850.2.fq'

# Print records 0..10**6 of the FASTQ file as FASTA on stdout, skipping any
# read whose sequence contains 'N'.  Progress goes to stderr every 10000
# records.  The ``with`` block closes the input, which was previously left
# open.
with open(filename) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            # write() and the single-argument print() below run unchanged
            # on Python 2 and 3; the print statements they replace were
            # Python 2-only.
            sys.stderr.write('... %d\n' % n)
        if n > 10**6:
            break

        sequence = record['sequence']
        if 'N' in sequence:
            continue

        print('>%s\n%s' % (record['name'], sequence))
Exemple #5
0
import sys
sys.path.insert(0, '/u/t/dev/screed')
import screed
from screed.fastq import fastq_iter

# Convert the FASTQ file named by the first command-line argument to FASTA
# on stdout, skipping every read whose sequence contains 'N'.  The ``with``
# block closes the input, which was previously left open.
with open(sys.argv[1]) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            # Progress marker; write() works on both Python 2 and 3, unlike
            # the ``print>>stream`` statement it replaces.
            sys.stderr.write('... %d\n' % n)

        sequence = record['sequence']
        name = record['name']

        if 'N' in sequence:
            continue

        # Single-argument print() behaves identically on Python 2 and 3.
        print(">" + name)
        print(sequence)
# python quality-trim.py <input fastq.gz file> <output fasta file>

import sys
import screed
import gzip

filein = sys.argv[1]
fileout = sys.argv[2]

from screed.fastq import fastq_iter

# Write records 0..100000 of the gzipped FASTQ input to the output file in
# FASTA form ('>name' line followed by the sequence line).
#
# The original ended with ``fw.close`` (no parentheses), which only looks
# the method up and never calls it; the ``with`` block closes both files
# reliably, including when an error interrupts the loop.
with open(fileout, 'w') as fw, gzip.open(filein, 'rb') as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n > 100000:
            break
        sequence = record['sequence']
        name = record['name']
        fw.write('>%s\n%s\n' % (name, sequence))

# python quality-trim.py <input fastq.gz file> <output fasta file>

import sys
import screed
import gzip

filein = sys.argv[1]
fileout = sys.argv[2]

from screed.fastq import fastq_iter

# Copy the first 100001 records (n = 0..100000) of the gzipped FASTQ input
# into the output file as FASTA entries.
#
# ``fw.close`` without parentheses was a no-op, so the output was never
# closed explicitly and the gzip handle was never closed at all.  Both are
# now managed by ``with``.
with open(fileout, 'w') as fw:
    with gzip.open(filein, 'rb') as fastq_fp:
        for n, record in enumerate(fastq_iter(fastq_fp)):
            if n <= 100000:
                sequence = record['sequence']
                name = record['name']
                fw.write('>%s\n%s\n' % (name, sequence))
            else:
                break
Exemple #8
0
import sys
sys.path.insert(0, '/u/t/dev/screed')
import screed
from screed.fastq import fastq_iter
from screed.fasta import fasta_iter

K = 15

import khmer

kh = khmer.new_hashtable(K, 1999999973)

filename = '/scratch/gpgc/iowa/850.2.fq'

# First pass: feed every N-free sequence among records 0..10**6 into the
# khmer hashtable.  Progress goes to stderr every 10000 records.  The
# ``with`` block closes the input, which was previously left open.
with open(filename) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            # write() works on Python 2 and 3; ``print>>stream`` is
            # Python 2-only.
            sys.stderr.write('... %d\n' % n)
        if n > 10**6:
            break

        sequence = record['sequence']
        if 'N' in sequence:
            continue

        kh.consume(sequence)

bins = [0] * 256
# Second pass over the same file.
# NOTE(review): as it stands this loop only reports progress and never
# touches ``bins``; the rest of its body appears to be missing -- confirm
# against the full script.
with open(filename) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            sys.stderr.write('... %d\n' % n)
def load_records_fastq(stringio_fp):
    """Parse the text held by *stringio_fp* and return its records as a list.

    The buffer's full contents are read with ``getvalue()``, wrapped in a
    fresh ``StringIO`` and handed to ``fastq_iter``.

    Args:
        stringio_fp: Object exposing ``getvalue()``.

    Returns:
        A list of everything ``fastq_iter`` yields.
    """
    buffered_text = stringio_fp.getvalue()
    return list(fastq_iter(StringIO(buffered_text)))
Exemple #10
0
def load_records_fastq(stringio_fp):
    """Return all records ``fastq_iter`` reads from a copy of *stringio_fp*.

    ``getvalue()`` supplies the complete buffer contents, which are wrapped
    in a new ``StringIO`` before parsing.

    Args:
        stringio_fp: Buffer-like object providing ``getvalue()``.

    Returns:
        The parsed records collected into a list.
    """
    snapshot = StringIO(stringio_fp.getvalue())
    parsed = fastq_iter(snapshot)
    return list(parsed)
Exemple #11
0
import screed
from screed import fastq

# python quality-trim.py <input fastq file> <output filtered fastq file>
# MINLENGTH is the minimum length of read desired.  NCALLS is the percentage of 'N' base calls in a read above which the read is removed.

# sys.argv is read below, but this script never imported sys.
import sys

MINLENGTH = 30

filein = sys.argv[1]
fileout = sys.argv[2]

# Both files are closed by the ``with`` block; previously neither was ever
# closed.
with open(filein) as fp, open(fileout, 'w') as fw:
    for record in fastq.fastq_iter(fp):
        name = record['name']
        sequence = record['sequence']
        accuracy = record['accuracy']

        # Drop trailing 'N' calls and cut the quality string to match.
        sequence = sequence.rstrip('N')
        accuracy = accuracy[:len(sequence)]

        # Any remaining 'N' rejects the read.
        if 'N' in sequence:
            continue

        # Index of the first 'B' in the quality string, or -1 if absent.
        trim = accuracy.find('B')

        if trim > MINLENGTH or (trim == -1 and len(sequence) > MINLENGTH):
            if trim == -1:
                fw.write('@%s\n%s\n+\n%s\n' % (name, sequence, accuracy))
            # NOTE(review): a read whose first 'B' lies beyond MINLENGTH
            # reaches this point but is not written; the branch that
            # handles it appears to be missing -- confirm against the full
            # script.
Exemple #12
0
import sys

sys.path.insert(0, '/u/t/dev/screed')
import screed
from screed.fastq import fastq_iter

# Emit every N-free read of the FASTQ file given as the first command-line
# argument as a two-line FASTA entry on stdout.  A progress marker goes to
# stderr every 10000 records.  The ``with`` block closes the input, which
# was previously left open.
with open(sys.argv[1]) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            # The Python 2-only ``print >> stream`` statement is replaced
            # by write(), which works on both major versions.
            sys.stderr.write('... %d\n' % n)

        sequence = record['sequence']
        name = record['name']

        if 'N' in sequence:
            continue

        # Single-argument print() is valid on Python 2 and 3 alike.
        print(">" + name)
        print(sequence)