コード例 #1
0
ファイル: rtd_run.py プロジェクト: alexagrf/rtd
def load_uniqued(all_quality,uniqued,readlen=None,nticks=20,baseQ=None,count_by_ind=False):
    '''Merge the records of a .uniqued file into all_quality.

    Each line is expected to split into seven whitespace-separated columns.
    The first five are used here: sequence, count, quality string,
    comma-separated individual IDs and comma-separated per-individual counts.
    Lines with a different number of columns are reported on stderr and
    skipped.

    all_quality maps sequence -> record; this function reads and writes the
    record keys:
        'mIDs'          list of the individual IDs seen with the sequence
        'sum_quality'   numpy array, running sum of (quality scores * count)
        'tot'           running sum of the counts
        'count_by_ind'  dict individual -> count (only if count_by_ind is set)
    Records for unseen sequences are obtained by indexing all_quality[s], so
    all_quality presumably creates missing entries itself (e.g. a
    defaultdict(dict)) -- verify against the caller.

    Arguments:
    all_quality   -- mapping updated in place (nothing is returned)
    uniqued       -- path of the .uniqued file, opened with smartopen
    readlen       -- if not None, sequence and qualities are cut to this
                     length before merging
    nticks        -- approximate number of progress lines written to stderr
    baseQ         -- offset subtracted from ord() of each quality character;
                     if None it is taken from get_baseQ applied to the
                     quality column, reading lines until a value is found
    count_by_ind  -- also accumulate the per-individual counts
    '''

    nreads = get_read_count(uniqued)

    if baseQ is None:
        # read from the top of the file until get_baseQ yields an offset
        qfh = smartopen(uniqued)
        try:
            while baseQ is None:
                line = next(qfh)
                qstr = line.strip().split()[2]
                baseQ = get_baseQ(qstr)
        finally:
            qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    # clamp to 1 so the modulo below never divides by zero on small files
    tickon = max(1, nreads//nticks)
    print >> sys.stderr, '\tloading'

    fh = smartopen(uniqued)
    try:
        for i,line in enumerate(fh):
            if i % tickon == 0:
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i,nreads,(float(i)/nreads)*100)

            fields = line.strip().split()
            if len(fields) != 7:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (i,len(fields),line,fields)
                # skip the record; carrying on would merge the values parsed
                # from the previous line a second time
                continue

            # the last two columns are not used by this loader
            s,c,qstr,indivstr,indcnt = fields[:5]
            q = numpy.array([ord(ch)-baseQ for ch in qstr])
            c = int(c)
            indivs = indivstr.split(',')

            if count_by_ind:
                indcntd = dict(zip(indivs,map(int,indcnt.split(','))))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                rec = all_quality[s]
                rec['mIDs'] = list(set(rec['mIDs']).union(indivs))
                rec['sum_quality'] += q*c
                rec['tot'] += c
                if count_by_ind:
                    by_ind = rec['count_by_ind']
                    for ind,cnt in indcntd.items():
                        by_ind[ind] = by_ind.get(ind,0) + cnt
            else:
                rec = all_quality[s]
                rec['mIDs'] = list(set(indivs))
                rec['sum_quality'] = q*c
                rec['tot'] = c
                if count_by_ind:
                    rec['count_by_ind'] = indcntd
    finally:
        fh.close()
コード例 #2
0
ファイル: iterative_rtd.py プロジェクト: xguse/rtd
def uniqued_to_fastq(uniqued, id_prefix=''):
    '''Write the sequences of a .uniqued file as a gzipped fastq file.

    One four-line fastq record is written per input line: the read name is
    id_prefix followed by the zero-based line index, the sequence is the
    first whitespace-separated column and the quality string is the third.

    The output path is remove_ext(uniqued) + '-fromuni.fastq.gz'. If that
    file already exists and get_read_count reports as many reads as the
    input has lines, it is reused and nothing is rewritten.

    Arguments:
    uniqued    -- path of the .uniqued file (plain, or gzipped if the name
                  ends in 'gz')
    id_prefix  -- string prepended to every read name

    Returns the output path.
    '''
    # Count input lines with cat/zcat | wc -l, built from argument lists
    # rather than a shell string so the path is never parsed by a shell.
    reader = 'zcat' if uniqued.endswith('gz') else 'cat'
    cat_proc = Popen([reader, uniqued], stdout=PIPE)
    wc_proc = Popen(['wc', '-l'], stdin=cat_proc.stdout, stdout=PIPE)
    # drop our copy of the pipe so only wc holds the read end
    cat_proc.stdout.close()
    len_uni = int(wc_proc.communicate()[0].strip())
    cat_proc.wait()

    outname = remove_ext(uniqued) + '-fromuni.fastq.gz'
    if os.path.exists(outname) and get_read_count(outname) == len_uni:
        print >> sys.stderr, 'output %s exists' % outname
        return outname

    # the input is opened only once a conversion is actually needed, and
    # both handles are closed even if a line fails to parse
    fh = smartopen(uniqued)
    try:
        ofh = smartopen(outname, 'w')
        try:
            print >> sys.stderr, 'convert %s to fastq' % uniqued
            for i, l in enumerate(fh):
                fields = l.strip().split()
                fq_line = '@%s%s\n%s\n+\n%s\n' % (id_prefix, i, fields[0],
                                                  fields[2])
                ofh.write(fq_line)
                if i % 1000 == 0: print >> sys.stderr, '\r\t%s done' % i,
        finally:
            ofh.close()
    finally:
        fh.close()
    print >> sys.stderr, '%s done' % outname

    return outname
コード例 #3
0
ファイル: overlap_preprocess.py プロジェクト: alexagrf/rtd
def convert_fastq(fq,ofq,out_lnum=4,out_baseQ=33,tickon = 10000):
    '''Rewrite the reads of fastq file fq into ofq.

    The number of reads and the input layout (lnum, baseQ) are obtained from
    preprocess_radtag_lane.get_read_count and get_fastq_properties. Every
    read is fetched with next_read_from_fh, its quality string is converted
    with qs_to_q(qs,baseQ) and the record is formatted by as_fq_line.

    Arguments:
    fq         -- input path, opened with preprocess_radtag_lane.smartopen
    ofq        -- output path, opened for writing the same way
    out_lnum   -- passed to as_fq_line; presumably lines per output record
                  -- confirm against as_fq_line
    out_baseQ  -- passed to as_fq_line; presumably the output quality
                  offset -- confirm against as_fq_line
    tickon     -- a progress line is written to stderr every tickon reads

    Returns None. Both file handles are closed before returning, also when
    reading or writing fails part-way.
    '''
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum,baseQ = preprocess_radtag_lane.get_fastq_properties(fq)
    fh = preprocess_radtag_lane.smartopen(fq)
    try:
        ofh = preprocess_radtag_lane.smartopen(ofq,'w')
        try:
            for i in xrange(nreads):
                if i%tickon == 0:
                    print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (i,nreads,(float(i)/nreads)*100),
                n,s,qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
                ofh.write(preprocess_radtag_lane.as_fq_line(n,s,qs_to_q(qs,baseQ),out_baseQ,out_lnum))
        finally:
            # closing explicitly ensures buffered output reaches ofq
            ofh.close()
    finally:
        fh.close()
    print >> sys.stderr,'\n'
コード例 #4
0
ファイル: overlap_preprocess.py プロジェクト: xguse/rtd
def convert_fastq(fq, ofq, out_lnum=4, out_baseQ=33, tickon=10000):
    '''Rewrite the reads of fastq file fq into ofq.

    The number of reads and the input layout (lnum, baseQ) are obtained from
    preprocess_radtag_lane.get_read_count and get_fastq_properties. Every
    read is fetched with next_read_from_fh, its quality string is converted
    with qs_to_q(qs, baseQ) and the record is formatted by as_fq_line.

    Arguments:
    fq         -- input path, opened with preprocess_radtag_lane.smartopen
    ofq        -- output path, opened for writing the same way
    out_lnum   -- passed to as_fq_line; presumably lines per output record
                  -- confirm against as_fq_line
    out_baseQ  -- passed to as_fq_line; presumably the output quality
                  offset -- confirm against as_fq_line
    tickon     -- a progress line is written to stderr every tickon reads

    Returns None. Both file handles are closed before returning, also when
    reading or writing fails part-way.
    '''
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)
    fh = preprocess_radtag_lane.smartopen(fq)
    try:
        ofh = preprocess_radtag_lane.smartopen(ofq, 'w')
        try:
            for i in xrange(nreads):
                if i % tickon == 0:
                    print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (
                        i, nreads, (float(i) / nreads) * 100),
                n, s, qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
                ofh.write(
                    preprocess_radtag_lane.as_fq_line(n, s,
                                                      qs_to_q(qs, baseQ),
                                                      out_baseQ, out_lnum))
        finally:
            # closing explicitly ensures buffered output reaches ofq
            ofh.close()
    finally:
        fh.close()
    print >> sys.stderr, '\n'
コード例 #5
0
ファイル: iterative_rtd.py プロジェクト: alexagrf/rtd
def uniqued_to_fastq(uniqued,id_prefix=''):
    '''Write the sequences of a .uniqued file as a gzipped fastq file.

    One four-line fastq record is written per input line: the read name is
    id_prefix followed by the zero-based line index, the sequence is the
    first whitespace-separated column and the quality string is the third.

    The output path is remove_ext(uniqued)+'-fromuni.fastq.gz'. If that file
    already exists and get_read_count reports as many reads as the input has
    lines, it is reused and nothing is rewritten.

    Arguments:
    uniqued    -- path of the .uniqued file (plain, or gzipped if the name
                  ends in 'gz')
    id_prefix  -- string prepended to every read name

    Returns the output path.
    '''
    # Count input lines with cat/zcat | wc -l, built from argument lists
    # rather than a shell string so the path is never parsed by a shell.
    reader = 'zcat' if uniqued.endswith('gz') else 'cat'
    cat_proc = Popen([reader,uniqued],stdout=PIPE)
    wc_proc = Popen(['wc','-l'],stdin=cat_proc.stdout,stdout=PIPE)
    # drop our copy of the pipe so only wc holds the read end
    cat_proc.stdout.close()
    len_uni = int(wc_proc.communicate()[0].strip())
    cat_proc.wait()

    outname = remove_ext(uniqued)+'-fromuni.fastq.gz'
    if os.path.exists(outname) and get_read_count(outname) == len_uni:
        print >> sys.stderr, 'output %s exists' % outname
        return outname

    # the input is opened only once a conversion is actually needed, and
    # both handles are closed even if a line fails to parse
    fh = smartopen(uniqued)
    try:
        ofh = smartopen(outname,'w')
        try:
            print >> sys.stderr, 'convert %s to fastq' % uniqued
            for i,l in enumerate(fh):
                fields = l.strip().split()
                fq_line = '@%s%s\n%s\n+\n%s\n' % (id_prefix,i,fields[0],fields[2])
                ofh.write(fq_line)
                if i % 1000 == 0: print >> sys.stderr, '\r\t%s done' % i,
        finally:
            ofh.close()
    finally:
        fh.close()
    print >> sys.stderr, '%s done' % outname

    return outname
コード例 #6
0
ファイル: iterative_rtd.py プロジェクト: alexagrf/rtd
        for uniqued in uniqueds:
            load_uniqued(all_quality,uniqued,count_by_ind=True)
            
        print >> sys.stderr, 'LOAD COMPLETE. WRITE BY-SIZE.'
        ofbysize = write_uniqued_by_size(all_quality,bysize_dir)
        del all_quality
        ret = os.system('touch %s' % bysize_done) 
    
    sizes = sorted(ofbysize.keys(),reverse=True)

    for i in sizes:
        print >> sys.stderr, '\nSTART %s' % i
        uni = ofbysize[i]

        ufq = uniqued_to_fastq(uni)
        nreads = get_read_count(ufq)

        if os.path.exists(denovo_ref):
            dn_len = ref_len(denovo_ref)
            noncontam_ubam = subtractive_map(ufq,contam_fa,stampy=False,readnames_only=False)
            unmapped = subtractive_map(noncontam_ubam,denovo_ref,force_index=True)
        else:
            dn_len = 0
            unmapped = subtractive_map(ufq,contam_fa,stampy=False)

        print >> sys.stderr, '\nGET %s UNMAPPED' % len(unmapped)
        funi = os.path.splitext(uni)[0]+'.filtered.gz'
        filter_uniqued(uni,funi,map(int,unmapped))

        outdir = os.path.splitext(uni)[0]+'-rtd'
        print >> sys.stderr, '\nRTD'
コード例 #7
0
for paired end, argv:
cutsite,fq1,fq2,outfile1,outfile2

'''

import preprocess_radtag_lane
import os, sys

barcode_len = 5
tick = 10000  #update progress every this-many reads

# Script entry point; the branch shown handles three arguments:
#   argv: cutsite, fq, outfile
# Reads from fq whose sequence has `cutsite` directly after the first
# barcode_len bases are written to outfile.
if __name__ == "__main__":
    if len(sys.argv) == 4:
        cutsite, fq, outfile = sys.argv[1:]
        rc = preprocess_radtag_lane.get_read_count(fq)
        lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)

        fh = preprocess_radtag_lane.smartopen(fq)
        ofh = preprocess_radtag_lane.smartopen(outfile, 'w')

        found = 0  # reads written to outfile so far
        for i in range(rc):
            # progress every `tick` reads; the i > 0 test also keeps the
            # found/i percentage from dividing by zero
            if i > 0 and i % tick == 0:
                print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \
                      (i,rc,(float(i)/rc)*100,found,(float(found)/i)*100),
            n, s, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
            # keep the read only when the bases after the barcode equal the cut site
            if s[barcode_len:barcode_len + len(cutsite)] == cutsite:
                # NOTE(review): the fourth argument is None; presumably this
                # leaves the quality encoding as read -- confirm in as_fq_line
                line = preprocess_radtag_lane.as_fq_line(n, s, q, None, lnum)
                ofh.write(line)
                found += 1
コード例 #8
0
ファイル: iterative_rtd.py プロジェクト: xguse/rtd
        for uniqued in uniqueds:
            load_uniqued(all_quality, uniqued, count_by_ind=True)

        print >> sys.stderr, 'LOAD COMPLETE. WRITE BY-SIZE.'
        ofbysize = write_uniqued_by_size(all_quality, bysize_dir)
        del all_quality
        ret = os.system('touch %s' % bysize_done)

    sizes = sorted(ofbysize.keys(), reverse=True)

    for i in sizes:
        print >> sys.stderr, '\nSTART %s' % i
        uni = ofbysize[i]

        ufq = uniqued_to_fastq(uni)
        nreads = get_read_count(ufq)

        if os.path.exists(denovo_ref):
            dn_len = ref_len(denovo_ref)
            noncontam_ubam = subtractive_map(ufq,
                                             contam_fa,
                                             stampy=False,
                                             readnames_only=False)
            unmapped = subtractive_map(noncontam_ubam,
                                       denovo_ref,
                                       force_index=True)
        else:
            dn_len = 0
            unmapped = subtractive_map(ufq, contam_fa, stampy=False)

        print >> sys.stderr, '\nGET %s UNMAPPED' % len(unmapped)
コード例 #9
0
def load_uniqued(all_quality,
                 uniqued,
                 readlen=None,
                 nticks=20,
                 baseQ=None,
                 count_by_ind=False):
    '''Merge the records of a .uniqued file into all_quality.

    Each line is expected to split into seven whitespace-separated columns.
    The first five are used here: sequence, count, quality string,
    comma-separated individual IDs and comma-separated per-individual counts.
    Lines with a different number of columns are reported on stderr and
    skipped.

    all_quality maps sequence -> record; this function reads and writes the
    record keys:
        'mIDs'          list of the individual IDs seen with the sequence
        'sum_quality'   numpy array, running sum of (quality scores * count)
        'tot'           running sum of the counts
        'count_by_ind'  dict individual -> count (only if count_by_ind is set)
    Records for unseen sequences are obtained by indexing all_quality[s], so
    all_quality presumably creates missing entries itself (e.g. a
    defaultdict(dict)) -- verify against the caller.

    Arguments:
    all_quality   -- mapping updated in place (nothing is returned)
    uniqued       -- path of the .uniqued file, opened with smartopen
    readlen       -- if not None, sequence and qualities are cut to this
                     length before merging
    nticks        -- approximate number of progress lines written to stderr
    baseQ         -- offset subtracted from ord() of each quality character;
                     if None it is taken from get_baseQ applied to the
                     quality column, reading lines until a value is found
    count_by_ind  -- also accumulate the per-individual counts
    '''

    nreads = get_read_count(uniqued)

    if baseQ is None:
        # read from the top of the file until get_baseQ yields an offset
        qfh = smartopen(uniqued)
        try:
            while baseQ is None:
                line = next(qfh)
                qstr = line.strip().split()[2]
                baseQ = get_baseQ(qstr)
        finally:
            qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    # clamp to 1 so the modulo below never divides by zero on small files
    tickon = max(1, nreads // nticks)
    print >> sys.stderr, '\tloading'

    fh = smartopen(uniqued)
    try:
        for i, line in enumerate(fh):
            if i % tickon == 0:
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (
                    i, nreads, (float(i) / nreads) * 100)

            fields = line.strip().split()
            if len(fields) != 7:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (
                    i, len(fields), line, fields)
                # skip the record; carrying on would merge the values parsed
                # from the previous line a second time
                continue

            # the last two columns are not used by this loader
            s, c, qstr, indivstr, indcnt = fields[:5]
            q = numpy.array([ord(ch) - baseQ for ch in qstr])
            c = int(c)
            indivs = indivstr.split(',')

            if count_by_ind:
                indcntd = dict(zip(indivs, map(int, indcnt.split(','))))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                rec = all_quality[s]
                rec['mIDs'] = list(set(rec['mIDs']).union(indivs))
                rec['sum_quality'] += q * c
                rec['tot'] += c
                if count_by_ind:
                    by_ind = rec['count_by_ind']
                    for ind, cnt in indcntd.items():
                        by_ind[ind] = by_ind.get(ind, 0) + cnt
            else:
                rec = all_quality[s]
                rec['mIDs'] = list(set(indivs))
                rec['sum_quality'] = q * c
                rec['tot'] = c
                if count_by_ind:
                    rec['count_by_ind'] = indcntd
    finally:
        fh.close()
コード例 #10
0
ファイル: find_perfect_match_reads.py プロジェクト: xguse/rtd
from editdist import distance
from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count

# Bases removed from each sequence before it is stored: cut_bp from the
# start and idx_bp counted back from readlen (see the slices below).
idx_bp = 5
cut_bp = 5

# NOTE(review): lnum is assigned but the calls below pass the literal 4
# instead; presumably both mean lines per fastq record -- confirm.
lnum = 4
# minimum mean of a line's comma-separated counts for its sequence to be kept
min_seqs = 7

# argv: uniqued file, fastq file
uniqued, fastq = sys.argv[1:]

# length of element [1] of the first record in the fastq file
# (presumably the sequence -- confirm against next_read_from_fh)
readlen = len(next_read_from_fh(smartopen(fastq), 4)[1])

print >> sys.stderr, 'readlen: %s' % readlen

num_reads = get_read_count(fastq, 4)
# not used in the lines shown here; presumably a progress interval
tickon = num_reads / 200

# collect the trimmed sequence of every uniqued line whose mean count
# (column 5, comma-separated integers) reaches min_seqs
useqs = []
for l in open(uniqued):
    s, cntstr = l.strip().split()[0], l.strip().split()[4]
    cnt = numpy.mean([int(i) for i in cntstr.split(',')])
    if cnt >= min_seqs:
        useqs.append(s[cut_bp:readlen - idx_bp])

# de-duplicate the trimmed sequences
useqs = list(set(useqs))
# NOTE(review): `s` here is whatever the last loop iteration left behind;
# it is unbound if the uniqued file has no lines.
print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (
    len(useqs), len(s[cut_bp:readlen - idx_bp]))

fh = smartopen(fastq)
コード例 #11
0
from editdist import distance
from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count

# Bases removed from each sequence before it is stored: cut_bp from the
# start and idx_bp counted back from readlen (see the slices below).
idx_bp = 5
cut_bp = 5

# NOTE(review): lnum is assigned but the calls below pass the literal 4
# instead; presumably both mean lines per fastq record -- confirm.
lnum = 4
# minimum mean of a line's comma-separated counts for its sequence to be kept
min_seqs = 7

# argv: uniqued file, fastq file
uniqued, fastq = sys.argv[1:]

# length of element [1] of the first record in the fastq file
# (presumably the sequence -- confirm against next_read_from_fh)
readlen = len(next_read_from_fh(smartopen(fastq),4)[1])

print >> sys.stderr, 'readlen: %s' % readlen

num_reads = get_read_count(fastq,4)
# not used in the lines shown here; presumably a progress interval
tickon = num_reads/200

# collect the trimmed sequence of every uniqued line whose mean count
# (column 5, comma-separated integers) reaches min_seqs
useqs = []
for l in open(uniqued):
    s,cntstr = l.strip().split()[0], l.strip().split()[4]
    cnt = numpy.mean([int(i) for i in cntstr.split(',')])
    if cnt >= min_seqs:
        useqs.append(s[cut_bp:readlen-idx_bp])

# de-duplicate the trimmed sequences
useqs = list(set(useqs))
# NOTE(review): `s` here is whatever the last loop iteration left behind;
# it is unbound if the uniqued file has no lines.
print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (len(useqs),len(s[cut_bp:readlen-idx_bp]))

fh = smartopen(fastq)

for i in range(num_reads):
コード例 #12
0
ファイル: read_quality_statistics.py プロジェクト: xguse/rtd
    if len(sys.argv) == 2:
        fq = sys.argv[1]
        boundstr = "0:"
    else:
        fq, boundstr = sys.argv[1:]

    start, end = boundstr.split(':')
    start = int(start)

    lnum, baseQ, readlen = get_fastq_properties(fq)

    if end == '':
        end = readlen

    readcount = preprocess_radtag_lane.get_read_count(fq)

    qsc_n = 0
    qsc_tot = numpy.zeros(readlen)
    qsc_by_read = []

    fh = smartopen(fq)

    tickon = readcount / 1000
    for i in range(readcount):
        if i % tickon == 0:
            print >> sys.stderr, '\r%0.1f' % ((i / float(readcount)) * 100),
        t, r, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
        qsc = [ord(c) - baseQ for c in q]
        qsc_n += 1
        qsc_tot += qsc
コード例 #13
0
    if len(sys.argv) == 2:
        fq = sys.argv[1]
        boundstr = "0:"
    else:
        fq, boundstr = sys.argv[1:]

    start,end = boundstr.split(':')
    start = int(start)

    lnum,baseQ,readlen = get_fastq_properties(fq)

    if end == '':
        end = readlen

    readcount = preprocess_radtag_lane.get_read_count(fq)

    qsc_n = 0
    qsc_tot = numpy.zeros(readlen)
    qsc_by_read = []

    fh = smartopen(fq)

    tickon = readcount/1000
    for i in range(readcount):
        if i % tickon == 0:
            print >> sys.stderr, '\r%0.1f' % ((i/float(readcount)) * 100),
        t,r,q = preprocess_radtag_lane.next_read_from_fh(fh,lnum)
        qsc = [ord(c)-baseQ for c in q]
        qsc_n += 1
        qsc_tot += qsc
コード例 #14
0
for paired end, argv:
cutsite,fq1,fq2,outfile1,outfile2

'''

import preprocess_radtag_lane
import os,sys

barcode_len = 5
tick = 10000 #update progress every this-many reads

# Script entry point; the branch shown handles three arguments:
#   argv: cutsite, fq, outfile
# Reads from fq whose sequence has `cutsite` directly after the first
# barcode_len bases are written to outfile.
if __name__ == "__main__":
    if len(sys.argv) == 4:
        cutsite,fq,outfile = sys.argv[1:]
        rc = preprocess_radtag_lane.get_read_count(fq)
        lnum,baseQ = preprocess_radtag_lane.get_fastq_properties(fq)

        fh = preprocess_radtag_lane.smartopen(fq)
        ofh = preprocess_radtag_lane.smartopen(outfile,'w')

        found = 0 # reads written to outfile so far
        for i in range(rc):
            # progress every `tick` reads; the i>0 test also keeps the
            # found/i percentage from dividing by zero
            if i>0 and i % tick == 0:
                print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \
                      (i,rc,(float(i)/rc)*100,found,(float(found)/i)*100),
            n,s,q = preprocess_radtag_lane.next_read_from_fh(fh,lnum)
            # keep the read only when the bases after the barcode equal the cut site
            if s[barcode_len:barcode_len+len(cutsite)] == cutsite:
                # NOTE(review): the fourth argument is None; presumably this
                # leaves the quality encoding as read -- confirm in as_fq_line
                line = preprocess_radtag_lane.as_fq_line(n,s,q,None,lnum)
                ofh.write(line)
                found += 1