def verbose_fastq_iter(filename):
    """Yield every FASTQ record in *filename*, reporting progress on stderr.

    A ``... filtering <n>`` line is written to stderr just before the record
    at each index divisible by 10000 (index 0 included) is yielded.

    Args:
        filename: Path of the FASTQ file to read.

    Yields:
        Each record produced by ``screed.fastq.fastq_iter``, in file order.
    """
    from screed.fastq import fastq_iter

    # The ``with`` block closes the handle once the generator is exhausted
    # or closed, instead of leaving it to the garbage collector.
    with open(filename) as handle:
        for n, record in enumerate(fastq_iter(handle)):
            if n % 10000 == 0:
                # Same text the old ``print >>sys.stderr`` statement produced,
                # written in a form that works on Python 2 and 3.
                sys.stderr.write('... filtering %d\n' % n)
            yield record
def main_ilfq(ilfq_name, re_site, dimer_file=None):
    """Process an interleaved FASTQ file pair by pair and print the results.

    For every pair yielded by ``pairitr(fastq_iter(fh))`` the sequence of each
    read, minus its first ``len(re_site)`` bases, is passed to the callable
    built by ``SSW(re_site)``.  Depending on the returned alignment
    coordinates the pair is either checked with ``is_dimer`` or counted as
    containing adaptor and possibly trimmed, then emitted with ``printrecs``.
    Summary counts are written to stderr at the end.

    Args:
        ilfq_name: Path of the input file; opened with ``gzip.open`` when the
            name ends with ``"gz"``, otherwise with ``open``.
        re_site: Sequence handed to ``SSW``; its length is also the number of
            leading bases skipped in each read before alignment.
        dimer_file: Optional path.  When given, it is opened for writing and
            the handle is passed to ``is_dimer``; otherwise ``None`` is
            passed.
    """
    pst1 = SSW(re_site)
    re_len = len(re_site)
    n_reads = 0
    n_adapt = 0
    n_trimmed = 0

    dimer_fh = None
    if dimer_file is not None:
        dimer_fh = open(dimer_file, 'w')
    try:
        if ilfq_name.endswith("gz"):
            fh = gzip.open(ilfq_name)
        else:
            fh = open(ilfq_name)
        # try/finally guarantees both handles are released even when a record
        # fails to parse or a helper raises part-way through the file.
        try:
            for r1, r2 in pairitr(fastq_iter(fh)):
                n_reads += 1
                r1aln = pst1(r1["sequence"][re_len:])
                r2aln = pst1(r2["sequence"][re_len:])
                # NOTE(review): the literal 4 looks tied to a particular
                # re_site length -- confirm it should not be derived from
                # re_len.
                if r1aln.query_begin != 0 or r1aln.query_end != 4 or \
                        r2aln.query_begin != 0 or r2aln.query_end != 4:
                    # Not read-through, but might be dimer.
                    if not is_dimer(r1, r2, re_len, dimer_fh):
                        printrecs(r1, r2)
                else:
                    # NOTE(review): original indentation was lost; this
                    # ``else`` is assumed to pair with the alignment test
                    # above and the pair is assumed to be printed whether or
                    # not it was trimmed -- confirm.
                    n_adapt += 1
                    if r1aln.target_begin == r2aln.target_begin:
                        cut = r1aln.target_begin
                        r1["sequence"] = r1["sequence"][:cut]
                        r1["quality"] = r1["quality"][:cut]
                        # The second read is replaced by a one-base
                        # placeholder record.
                        r2["sequence"] = "N"
                        r2["quality"] = "#"
                        n_trimmed += 1
                    printrecs(r1, r2)
                if n_reads % 1000 == 0:
                    print("Processed {:0.0f}K read pairs".format(n_reads / 1000.0),
                          file=sys.stderr, end='\r')
        finally:
            fh.close()
    finally:
        if dimer_fh is not None:
            dimer_fh.close()

    print("Processed", n_reads, "read pairs", file=sys.stderr)
    print("Trimmed", n_trimmed, file=sys.stderr)
    print("Adaptor in", n_adapt, file=sys.stderr)
def main_ilfq(ilfq_name, re_site, dimer_file=None):
    """Process an interleaved FASTQ file pair by pair and print the results.

    For every pair yielded by ``pairitr(fastq_iter(fh))`` the sequence of each
    read, minus its first ``len(re_site)`` bases, is passed to the callable
    built by ``SSW(re_site)``.  Depending on the returned alignment
    coordinates the pair is either checked with ``is_dimer`` or counted as
    containing adaptor and possibly trimmed, then emitted with ``printrecs``.
    Summary counts are written to stderr at the end.

    Args:
        ilfq_name: Path of the input file; opened with ``gzip.open`` when the
            name ends with ``"gz"``, otherwise with ``open``.
        re_site: Sequence handed to ``SSW``; its length is also the number of
            leading bases skipped in each read before alignment.
        dimer_file: Optional path.  When given, it is opened for writing and
            the handle is passed to ``is_dimer``; otherwise ``None`` is
            passed.
    """
    pst1 = SSW(re_site)
    re_len = len(re_site)
    n_reads = 0
    n_adapt = 0
    n_trimmed = 0

    dimer_fh = None
    if dimer_file is not None:
        dimer_fh = open(dimer_file, 'w')
    try:
        if ilfq_name.endswith("gz"):
            fh = gzip.open(ilfq_name)
        else:
            fh = open(ilfq_name)
        # try/finally guarantees both handles are released even when a record
        # fails to parse or a helper raises part-way through the file.
        try:
            for r1, r2 in pairitr(fastq_iter(fh)):
                n_reads += 1
                r1aln = pst1(r1["sequence"][re_len:])
                r2aln = pst1(r2["sequence"][re_len:])
                # NOTE(review): the literal 4 looks tied to a particular
                # re_site length -- confirm it should not be derived from
                # re_len.
                if r1aln.query_begin != 0 or r1aln.query_end != 4 or \
                        r2aln.query_begin != 0 or r2aln.query_end != 4:
                    # Not read-through, but might be dimer.
                    if not is_dimer(r1, r2, re_len, dimer_fh):
                        printrecs(r1, r2)
                else:
                    # NOTE(review): original indentation was lost; this
                    # ``else`` is assumed to pair with the alignment test
                    # above and the pair is assumed to be printed whether or
                    # not it was trimmed -- confirm.
                    n_adapt += 1
                    if r1aln.target_begin == r2aln.target_begin:
                        cut = r1aln.target_begin
                        r1["sequence"] = r1["sequence"][:cut]
                        r1["quality"] = r1["quality"][:cut]
                        # The second read is replaced by a one-base
                        # placeholder record.
                        r2["sequence"] = "N"
                        r2["quality"] = "#"
                        n_trimmed += 1
                    printrecs(r1, r2)
                if n_reads % 1000 == 0:
                    print("Processed {:0.0f}K read pairs".format(n_reads / 1000.0),
                          file=sys.stderr, end='\r')
        finally:
            fh.close()
    finally:
        if dimer_fh is not None:
            dimer_fh.close()

    print("Processed", n_reads, "read pairs", file=sys.stderr)
    print("Trimmed", n_trimmed, file=sys.stderr)
    print("Adaptor in", n_adapt, file=sys.stderr)
# Print the N-free reads from the start of a fixed FASTQ file as FASTA records
# on stdout, with a progress line on stderr every 10000 records.
import sys
sys.path.insert(0, '/u/t/dev/screed')
import screed
from screed.fastq import fastq_iter
import khmer

# NOTE(review): ``hasher`` is not used anywhere in the visible script -- kept
# in case later code depends on it.
hasher = khmer.HashtableIntersect(15, *khmer.PRIMES_1m)

filename = '/scratch/gpgc/iowa/850.2.fq'

# ``with`` closes the input file; it was previously left open.
with open(filename) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            # Same output as the old ``print >> sys.stderr`` statement, but
            # valid on both Python 2 and Python 3.
            sys.stderr.write('... %d\n' % n)
        # NOTE(review): original indentation was lost; the cutoff is assumed
        # to sit at loop level rather than inside the progress check.
        if n > 10**6:
            break

        sequence = record['sequence']
        if 'N' in sequence:
            continue

        sys.stdout.write('>%s\n%s\n' % (record['name'], sequence))
# Convert the FASTQ file named by the first command-line argument to FASTA on
# stdout, skipping every read whose sequence contains an 'N'.
import sys
sys.path.insert(0, '/u/t/dev/screed')
import screed
from screed.fastq import fastq_iter

# ``with`` closes the input file; it was previously left open.
with open(sys.argv[1]) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            # Same output as the old ``print>>sys.stderr`` statement, but
            # valid on both Python 2 and Python 3.
            sys.stderr.write('... %d\n' % n)

        sequence = record['sequence']
        name = record['name']
        if 'N' in sequence:
            continue

        sys.stdout.write('>%s\n%s\n' % (name, sequence))
# python quality-trim.py <input fastq.gz file> <output fasta file>
#
# Writes the records at indices 0..100000 of the gzipped FASTQ input to the
# output file as FASTA (">name" line followed by the sequence line).
import sys
import screed
import gzip
from screed.fastq import fastq_iter

filein = sys.argv[1]
fileout = sys.argv[2]

# The original ended with ``fw.close`` (no call parentheses), which never
# closed the output file; the gzip handle was not closed either.  ``with``
# releases both, and keeps the original order of opening output then input.
with open(fileout, 'w') as fw, gzip.open(filein, 'rb') as fastq_gz:
    for n, record in enumerate(fastq_iter(fastq_gz)):
        if n > 100000:
            break
        sequence = record['sequence']
        name = record['name']
        fw.write('>%s\n%s\n' % (name, sequence))
# python quality-trim.py <input fastq.gz file> <output fasta file>
#
# Writes the records at indices 0..100000 of the gzipped FASTQ input to the
# output file as FASTA (">name" line followed by the sequence line).
import sys
import screed
import gzip
from screed.fastq import fastq_iter

filein = sys.argv[1]
fileout = sys.argv[2]

# The original ended with ``fw.close`` (no call parentheses), which never
# closed the output file; the gzip handle was not closed either.  ``with``
# releases both, and keeps the original order of opening output then input.
with open(fileout, 'w') as fw, gzip.open(filein, 'rb') as fastq_gz:
    for n, record in enumerate(fastq_iter(fastq_gz)):
        if n > 100000:
            break
        sequence = record['sequence']
        name = record['name']
        fw.write('>%s\n%s\n' % (name, sequence))
# Feed the N-free reads from the start of a fixed FASTQ file into a khmer
# hashtable, then iterate over the same file a second time.
import sys
sys.path.insert(0, '/u/t/dev/screed')
import screed
from screed.fastq import fastq_iter
from screed.fasta import fasta_iter

K = 15

import khmer
kh = khmer.new_hashtable(K, 1999999973)

filename = '/scratch/gpgc/iowa/850.2.fq'

# First pass: load reads into the hashtable.  ``with`` closes the input file;
# it was previously left open.
with open(filename) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            # Same output as the old ``print>>sys.stderr`` statement, but
            # valid on both Python 2 and Python 3.
            sys.stderr.write('... %d\n' % n)
        # NOTE(review): original indentation was lost; the cutoff is assumed
        # to sit at loop level rather than inside the progress check.
        if n > 10**6:
            break

        sequence = record['sequence']
        if 'N' in sequence:
            continue

        kh.consume(sequence)

# NOTE(review): ``bins`` is never filled in the visible code and the second
# pass only reports progress -- the script looks cut short; confirm.
bins = [0] * 256
with open(filename) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            sys.stderr.write('... %d\n' % n)
def load_records_fastq(stringio_fp):
    """Parse the text held by *stringio_fp* as FASTQ and return the records.

    The buffer's contents are taken with ``getvalue()`` and wrapped in a new
    ``StringIO``, which is what ``fastq_iter`` reads from.

    Args:
        stringio_fp: Object exposing ``getvalue()`` that returns FASTQ text.

    Returns:
        A list of every record yielded by ``fastq_iter``.
    """
    snapshot = StringIO(stringio_fp.getvalue())
    parsed = fastq_iter(snapshot)
    return list(parsed)
# ``sys`` was used (sys.argv) without being imported in the original script.
import sys

import screed
from screed import fastq

# python quality-trim.py <input fastq file> <output filtered fastq file>
# MINLENGTH is the minimum length of read desired. NCALLS is the percentage of
# a read with 'N' base calls above which the read is removed.
# NOTE(review): NCALLS is described here but not defined in the visible code.
MINLENGTH = 30

filein = sys.argv[1]
fileout = sys.argv[2]

# NOTE(review): ``count`` is not used in the visible code -- kept in case
# later code depends on it.
count = 0

# ``with`` closes both files; neither was closed in the visible original.
with open(filein) as fp, open(fileout, 'w') as fw:
    for n, record in enumerate(fastq.fastq_iter(fp)):
        name = record['name']
        sequence = record['sequence']
        accuracy = record['accuracy']

        # Drop trailing 'N' calls and cut the quality string to match.
        sequence = sequence.rstrip('N')
        accuracy = accuracy[:len(sequence)]

        # Any 'N' still present is internal; such reads are skipped.
        if 'N' in sequence:
            continue

        # Position of the first 'B' quality character, or -1 if there is none.
        trim = accuracy.find('B')
        if trim > MINLENGTH or (trim == -1 and len(sequence) > MINLENGTH):
            if trim == -1:
                fw.write('@%s\n%s\n+\n%s\n' % (name, sequence, accuracy))
            # NOTE(review): reads with a 'B' beyond MINLENGTH pass the outer
            # test but nothing is written for them in the visible code -- a
            # trimmed-write branch appears to be missing; confirm.
# Convert the FASTQ file named by the first command-line argument to FASTA on
# stdout, skipping every read whose sequence contains an 'N'.
import sys
sys.path.insert(0, '/u/t/dev/screed')
import screed
from screed.fastq import fastq_iter

# ``with`` closes the input file; it was previously left open.
with open(sys.argv[1]) as fastq_fp:
    for n, record in enumerate(fastq_iter(fastq_fp)):
        if n % 10000 == 0:
            # Same output as the old ``print >> sys.stderr`` statement, but
            # valid on both Python 2 and Python 3.
            sys.stderr.write('... %d\n' % n)

        sequence = record['sequence']
        name = record['name']
        if 'N' in sequence:
            continue

        sys.stdout.write('>%s\n%s\n' % (name, sequence))