def convert_fastq(fq, ofq, out_lnum=4, out_baseQ=33, tickon=10000):
    """Copy every read from `fq` into `ofq`, re-encoding it on the way.

    The read count comes from `preprocess_radtag_lane.get_read_count` and the
    input layout / quality offset from
    `preprocess_radtag_lane.get_fastq_properties`.  Each read's quality
    string is passed through `qs_to_q` with the detected input offset, then
    formatted by `preprocess_radtag_lane.as_fq_line`.

    Parameters:
        fq: path of the input file (opened with `smartopen`).
        ofq: path of the output file (opened with `smartopen` in 'w' mode).
        out_lnum: forwarded to `as_fq_line`; presumably the number of lines
            per output record -- confirm against `as_fq_line`.
        out_baseQ: forwarded to `as_fq_line`; presumably the ASCII offset of
            the output quality encoding -- confirm against `as_fq_line`.
        tickon: a progress line is written to stderr every `tickon` reads.
            Must be non-zero (it is used as a modulus).

    Returns:
        None.
    """
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)

    fh = preprocess_radtag_lane.smartopen(fq)
    try:
        ofh = preprocess_radtag_lane.smartopen(ofq, 'w')
        try:
            for i in xrange(nreads):
                if i % tickon == 0:
                    # Trailing comma keeps the cursor on the same line so the
                    # '\r' of the next tick overwrites this one.
                    print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (
                        i, nreads, (float(i) / nreads) * 100),
                n, s, qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
                ofh.write(preprocess_radtag_lane.as_fq_line(
                    n, s, qs_to_q(qs, baseQ), out_baseQ, out_lnum))
        finally:
            # Always close the output so buffered data reaches disk, even if
            # a read fails part-way through.
            ofh.close()
    finally:
        fh.close()
    print >> sys.stderr, '\n'
def convert_fastq(fq, ofq, out_lnum=4, out_baseQ=33, tickon=10000):
    """Copy every read from `fq` into `ofq`, re-encoding it on the way.

    The read count comes from `preprocess_radtag_lane.get_read_count` and the
    input layout / quality offset from
    `preprocess_radtag_lane.get_fastq_properties`.  Each read's quality
    string is passed through `qs_to_q` with the detected input offset, then
    formatted by `preprocess_radtag_lane.as_fq_line`.

    Parameters:
        fq: path of the input file (opened with `smartopen`).
        ofq: path of the output file (opened with `smartopen` in 'w' mode).
        out_lnum: forwarded to `as_fq_line`; presumably the number of lines
            per output record -- confirm against `as_fq_line`.
        out_baseQ: forwarded to `as_fq_line`; presumably the ASCII offset of
            the output quality encoding -- confirm against `as_fq_line`.
        tickon: a progress line is written to stderr every `tickon` reads.
            Must be non-zero (it is used as a modulus).

    Returns:
        None.
    """
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)

    fh = preprocess_radtag_lane.smartopen(fq)
    try:
        ofh = preprocess_radtag_lane.smartopen(ofq, 'w')
        try:
            for i in xrange(nreads):
                if i % tickon == 0:
                    # Trailing comma keeps the cursor on the same line so the
                    # '\r' of the next tick overwrites this one.
                    print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (
                        i, nreads, (float(i) / nreads) * 100),
                n, s, qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
                ofh.write(preprocess_radtag_lane.as_fq_line(
                    n, s, qs_to_q(qs, baseQ), out_baseQ, out_lnum))
        finally:
            # Always close the output so buffered data reaches disk, even if
            # a read fails part-way through.
            ofh.close()
    finally:
        fh.close()
    print >> sys.stderr, '\n'
def get_fastq_properties(fq):
    """Inspect `fq` and report its layout, quality offset and read length.

    The layout value `lnum` is 4 when the file's first character is '@' and
    1 otherwise.  Reads are then pulled with
    `preprocess_radtag_lane.next_read_from_fh` until
    `preprocess_radtag_lane.get_baseQ` returns something other than None for
    a quality string.  The read length is taken from the last read examined.
    Each detected property is also reported on stderr.

    Parameters:
        fq: path of the file to inspect (opened with `smartopen`).

    Returns:
        A `(lnum, baseQ, readlen)` tuple.
    """
    # Peek at the first character only; close the handle right away instead
    # of leaving it for the garbage collector.
    probe = smartopen(fq)
    try:
        first_char = probe.read(1)
    finally:
        probe.close()

    if first_char == '@':
        lnum = 4
    else:
        lnum = 1
    print >> sys.stderr, 'fastq format lnum: %s' % lnum

    baseQ = None
    qfh = smartopen(fq)
    try:
        # Keep reading until a quality string yields a base offset.
        while baseQ is None:
            t, r, q = preprocess_radtag_lane.next_read_from_fh(qfh, lnum)
            baseQ = preprocess_radtag_lane.get_baseQ(q)
    finally:
        qfh.close()
    print >> sys.stderr, 'fastq format baseQ: %s' % baseQ

    readlen = len(r)
    print >> sys.stderr, 'fastq format readlen: %s' % readlen
    return lnum, baseQ, readlen
def get_fastq_properties(fq):
    """Inspect `fq` and report its layout, quality offset and read length.

    The layout value `lnum` is 4 when the file's first character is '@' and
    1 otherwise.  Reads are then pulled with
    `preprocess_radtag_lane.next_read_from_fh` until
    `preprocess_radtag_lane.get_baseQ` returns something other than None for
    a quality string.  The read length is taken from the last read examined.
    Each detected property is also reported on stderr.

    Parameters:
        fq: path of the file to inspect (opened with `smartopen`).

    Returns:
        A `(lnum, baseQ, readlen)` tuple.
    """
    # Peek at the first character only; close the handle right away instead
    # of leaving it for the garbage collector.
    probe = smartopen(fq)
    try:
        first_char = probe.read(1)
    finally:
        probe.close()

    if first_char == '@':
        lnum = 4
    else:
        lnum = 1
    print >> sys.stderr, 'fastq format lnum: %s' % lnum

    baseQ = None
    qfh = smartopen(fq)
    try:
        # Keep reading until a quality string yields a base offset.
        while baseQ is None:
            t, r, q = preprocess_radtag_lane.next_read_from_fh(qfh, lnum)
            baseQ = preprocess_radtag_lane.get_baseQ(q)
    finally:
        qfh.close()
    print >> sys.stderr, 'fastq format baseQ: %s' % baseQ

    readlen = len(r)
    print >> sys.stderr, 'fastq format readlen: %s' % readlen
    return lnum, baseQ, readlen
if __name__ == "__main__":
    if len(sys.argv) == 4:
        # Single-file mode: <cutsite> <fastq> <outfile>.
        # Keeps only reads whose sequence has `cutsite` starting at offset
        # `barcode_len` (`barcode_len` and `tick` are defined elsewhere in
        # this file).
        cutsite, fq, outfile = sys.argv[1:]
        rc = preprocess_radtag_lane.get_read_count(fq)
        lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)
        fh = preprocess_radtag_lane.smartopen(fq)
        ofh = preprocess_radtag_lane.smartopen(outfile, 'w')
        found = 0
        try:
            for i in range(rc):
                # `i > 0` guards the found/i percentage against division by
                # zero on the first read.
                if i > 0 and i % tick == 0:
                    print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \
                        (i, rc, (float(i) / rc) * 100, found, (float(found) / i) * 100),
                n, s, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
                if s[barcode_len:barcode_len + len(cutsite)] == cutsite:
                    line = preprocess_radtag_lane.as_fq_line(n, s, q, None, lnum)
                    ofh.write(line)
                    found += 1
        finally:
            # Close both handles even if a read fails part-way through.
            ofh.close()
            fh.close()
    elif len(sys.argv) == 6:
        # Paired-file mode: <cutsite> <fastq1> <fastq2> <outfile1> <outfile2>.
        cutsite, fq1, fq2, outfile1, outfile2 = sys.argv[1:]
        rc1 = preprocess_radtag_lane.get_read_count(fq1)
        rc2 = preprocess_radtag_lane.get_read_count(fq2)
        if rc1 != rc2:
            # The message has four placeholders: both file names and both
            # counts are supplied as separate values.
            errstr = 'read count for %s = %s; %s = %s. counts must match' % (
                fq1, rc1, fq2, rc2)
            raise ValueError(errstr)
        lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq1)
''' import os, sys import numpy from editdist import distance from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count idx_bp = 5 cut_bp = 5 lnum = 4 min_seqs = 7 uniqued, fastq = sys.argv[1:] readlen = len(next_read_from_fh(smartopen(fastq), 4)[1]) print >> sys.stderr, 'readlen: %s' % readlen num_reads = get_read_count(fastq, 4) tickon = num_reads / 200 useqs = [] for l in open(uniqued): s, cntstr = l.strip().split()[0], l.strip().split()[4] cnt = numpy.mean([int(i) for i in cntstr.split(',')]) if cnt >= min_seqs: useqs.append(s[cut_bp:readlen - idx_bp]) useqs = list(set(useqs)) print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (
''' import os,sys import numpy from editdist import distance from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count idx_bp = 5 cut_bp = 5 lnum = 4 min_seqs = 7 uniqued, fastq = sys.argv[1:] readlen = len(next_read_from_fh(smartopen(fastq),4)[1]) print >> sys.stderr, 'readlen: %s' % readlen num_reads = get_read_count(fastq,4) tickon = num_reads/200 useqs = [] for l in open(uniqued): s,cntstr = l.strip().split()[0], l.strip().split()[4] cnt = numpy.mean([int(i) for i in cntstr.split(',')]) if cnt >= min_seqs: useqs.append(s[cut_bp:readlen-idx_bp]) useqs = list(set(useqs)) print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (len(useqs),len(s[cut_bp:readlen-idx_bp]))
# An empty `end` means "through the end of the read".
if end == '':
    end = readlen

readcount = preprocess_radtag_lane.get_read_count(fq)
qsc_n = 0
qsc_tot = numpy.zeros(readlen)   # running per-position sum of quality scores
qsc_by_read = []                 # mean score over [start:end] for each read
fh = smartopen(fq)

# Progress is reported roughly every 0.1% of reads.  Integer division gives
# 0 for fewer than 1000 reads, which would make the modulo below raise
# ZeroDivisionError, so clamp to at least 1.
tickon = max(1, readcount // 1000)
for i in range(readcount):
    if i % tickon == 0:
        print >> sys.stderr, '\r%0.1f' % ((i / float(readcount)) * 100),
    t, r, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
    # Convert each quality character to a score relative to baseQ.
    qsc = [ord(c) - baseQ for c in q]
    qsc_n += 1
    qsc_tot += qsc
    qsc_by_read.append(numpy.mean(qsc[start:end]))

qsc_by_base = list(qsc_tot / qsc_n)

print >> sys.stderr, 'write per-base mean qual ...',
per_base_out = open(fq + '-per_base_qual.list', 'w')
try:
    per_base_out.write(repr(qsc_by_base))
finally:
    per_base_out.close()
print >> sys.stderr, 'done'

print >> sys.stderr, 'write per-read qual ..',
per_read_out = open(fq + '-per_read_qual.list', 'w')
try:
    per_read_out.write(repr(qsc_by_read))
finally:
    per_read_out.close()
print >> sys.stderr, 'done'
# An empty `end` means "through the end of the read".
if end == '':
    end = readlen

readcount = preprocess_radtag_lane.get_read_count(fq)
qsc_n = 0
qsc_tot = numpy.zeros(readlen)   # running per-position sum of quality scores
qsc_by_read = []                 # mean score over [start:end] for each read
fh = smartopen(fq)

# Progress is reported roughly every 0.1% of reads.  Integer division gives
# 0 for fewer than 1000 reads, which would make the modulo below raise
# ZeroDivisionError, so clamp to at least 1.
tickon = max(1, readcount // 1000)
for i in range(readcount):
    if i % tickon == 0:
        print >> sys.stderr, '\r%0.1f' % ((i / float(readcount)) * 100),
    t, r, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
    # Convert each quality character to a score relative to baseQ.
    qsc = [ord(c) - baseQ for c in q]
    qsc_n += 1
    qsc_tot += qsc
    qsc_by_read.append(numpy.mean(qsc[start:end]))

qsc_by_base = list(qsc_tot / qsc_n)

print >> sys.stderr, 'write per-base mean qual ...',
per_base_out = open(fq + '-per_base_qual.list', 'w')
try:
    per_base_out.write(repr(qsc_by_base))
finally:
    per_base_out.close()
print >> sys.stderr, 'done'

print >> sys.stderr, 'write per-read qual ..',
per_read_out = open(fq + '-per_read_qual.list', 'w')
try:
    per_read_out.write(repr(qsc_by_read))
finally:
    per_read_out.close()
print >> sys.stderr, 'done'
if __name__ == "__main__":
    if len(sys.argv) == 4:
        # Single-file mode: <cutsite> <fastq> <outfile>.
        # Keeps only reads whose sequence has `cutsite` starting at offset
        # `barcode_len` (`barcode_len` and `tick` are defined elsewhere in
        # this file).
        cutsite, fq, outfile = sys.argv[1:]
        rc = preprocess_radtag_lane.get_read_count(fq)
        lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)
        fh = preprocess_radtag_lane.smartopen(fq)
        ofh = preprocess_radtag_lane.smartopen(outfile, 'w')
        found = 0
        try:
            for i in range(rc):
                # `i > 0` guards the found/i percentage against division by
                # zero on the first read.
                if i > 0 and i % tick == 0:
                    print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \
                        (i, rc, (float(i) / rc) * 100, found, (float(found) / i) * 100),
                n, s, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
                if s[barcode_len:barcode_len + len(cutsite)] == cutsite:
                    line = preprocess_radtag_lane.as_fq_line(n, s, q, None, lnum)
                    ofh.write(line)
                    found += 1
        finally:
            # Close both handles even if a read fails part-way through.
            ofh.close()
            fh.close()
    elif len(sys.argv) == 6:
        # Paired-file mode: <cutsite> <fastq1> <fastq2> <outfile1> <outfile2>.
        cutsite, fq1, fq2, outfile1, outfile2 = sys.argv[1:]
        rc1 = preprocess_radtag_lane.get_read_count(fq1)
        rc2 = preprocess_radtag_lane.get_read_count(fq2)
        if rc1 != rc2:
            # The message has four placeholders: both file names and both
            # counts are supplied as separate values.
            errstr = 'read count for %s = %s; %s = %s. counts must match' % (
                fq1, rc1, fq2, rc2)
            raise ValueError(errstr)
        lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq1)
        fh1 = preprocess_radtag_lane.smartopen(fq1)