Example #1
0
def load_uniqued(all_quality,uniqued,readlen=None,nticks=20,baseQ=None,count_by_ind=False):
    '''Merge the records of a .uniqued file into all_quality.

    Each line of the file (as produced by preprocess_radtag_lane.py) must hold
    seven whitespace-separated fields: sequence, count, quality string,
    comma-separated individual IDs, comma-separated per-individual counts and
    two trailing fields that are not used here.  Lines with any other number
    of fields are reported on stderr and skipped.

    all_quality  -- mapping keyed by sequence, updated in place.  New keys are
                    created by plain indexing (all_quality[s]), so the mapping
                    has to auto-create its entries (e.g. a
                    collections.defaultdict(dict)).  Each entry receives:
                      'mIDs'         list of individual IDs seen for the sequence
                      'sum_quality'  numpy array, sum of (quality - baseQ) * count
                      'tot'          summed count
                      'count_by_ind' {individual: count}; only if count_by_ind
    uniqued      -- path passed to get_read_count() and smartopen()
    readlen      -- if not None, sequence and qualities are cut to this length
                    before merging
    nticks       -- approximate number of progress lines written to stderr
    baseQ        -- quality offset; if None it is taken from the first line
                    for which get_baseQ() returns a value
    count_by_ind -- also accumulate per-individual counts

    Raises StopIteration if baseQ is None and the file is exhausted before
    get_baseQ() yields an offset.
    '''

    nreads = get_read_count(uniqued)

    if baseQ is None:
        qfh = smartopen(uniqued)
        try:
            while baseQ is None:
                line = next(qfh)
                qstr = line.strip().split()[2]
                baseQ = get_baseQ(qstr)
        finally:
            # release the probe handle even if a line is malformed
            qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    # report progress roughly nticks times, but at least every line
    tickon = max(nreads // nticks, 1)
    print >> sys.stderr, '\tloading'

    fh = smartopen(uniqued)
    try:
        for i,line in enumerate(fh):
            if i % tickon == 0:
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i,nreads,(float(i)/nreads)*100)

            fields = line.strip().split()
            if len(fields) != 7:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (i,len(fields),line,fields)
                # skip the record: falling through would merge the previous
                # line's values a second time (or fail on the first line)
                continue

            s,c,qstr,indivstr,indcnt = fields[:5]
            q = numpy.array([ord(ch)-baseQ for ch in qstr])
            c = int(c)
            indiv = set(indivstr.split(','))

            if count_by_ind:
                indcntd = dict(zip(indivstr.split(','),map(int,indcnt.split(','))))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                entry = all_quality[s]
                entry['mIDs'] = list(set(entry['mIDs']).union(indiv))
                entry['sum_quality'] += q*c
                entry['tot'] += c
                if count_by_ind:
                    counts = entry['count_by_ind']
                    for ind,cnt in indcntd.items():
                        counts[ind] = counts.get(ind,0) + cnt
            else:
                # indexing a missing key relies on all_quality creating the entry
                entry = all_quality[s]
                entry['mIDs'] = list(indiv)
                entry['sum_quality'] = q*c
                entry['tot'] = c
                if count_by_ind:
                    entry['count_by_ind'] = indcntd
    finally:
        fh.close()
Example #2
0
File: rtd_run.py Project: ekay/rtd
def load_uniqued(all_quality,uniqued,readlen=None,nticks=20,baseQ=None):
    '''Merge the records of a .uniqued file into all_quality.

    Each line of the file (as produced by preprocess_radtag_lane.py) must hold
    seven whitespace-separated fields: sequence, count, quality string,
    comma-separated individual IDs and three further fields that are not used
    here.  Lines with any other number of fields are reported on stderr and
    skipped.

    all_quality -- mapping keyed by sequence, updated in place.  New keys are
                   created by plain indexing (all_quality[s]), so the mapping
                   has to auto-create its entries (e.g. a
                   collections.defaultdict(dict)).  Each entry receives:
                     'mIDs'         list of individual IDs seen for the sequence
                     'sum_quality'  numpy array, sum of (quality - baseQ) * count
                     'tot'          summed count
    uniqued     -- path of the file; its line count is obtained from `wc -l`
    readlen     -- if not None, sequence and qualities are cut to this length
                   before merging
    nticks      -- approximate number of progress lines written to stderr
    baseQ       -- quality offset; if None it is taken from the first line for
                   which get_baseQ() returns a value

    Raises StopIteration if baseQ is None and the file is exhausted before
    get_baseQ() yields an offset.
    '''

    print >> sys.stderr, '%s readcount: ' % (uniqued),
    #number of sequences
    # argument list instead of a shell string: the path is handed to wc
    # verbatim, so spaces or shell metacharacters in it are harmless
    wc_out = Popen(['wc','-l',uniqued],stdout=PIPE).communicate()[0]
    nreads = int(wc_out.split()[0])
    print >> sys.stderr, nreads

    if baseQ is None:
        qfh = open(uniqued)
        try:
            while baseQ is None:
                line = next(qfh)
                qstr = line.strip().split()[2]
                baseQ = get_baseQ(qstr)
        finally:
            # release the probe handle even if a line is malformed
            qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    # report progress roughly nticks times, but at least every line
    tickon = max(nreads // nticks, 1)
    print >> sys.stderr, '\tloading'

    fh = open(uniqued)
    try:
        for i,line in enumerate(fh):
            if i % tickon == 0:
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i,nreads,(float(i)/nreads)*100)

            fields = line.strip().split()
            if len(fields) != 7:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (i,len(fields),line,fields)
                # skip the record: falling through would merge the previous
                # line's values a second time (or fail on the first line)
                continue

            s,c,qstr,indivstr = fields[:4]
            q = numpy.array([ord(ch)-baseQ for ch in qstr])
            c = int(c)
            indiv = set(indivstr.split(','))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                entry = all_quality[s]
                entry['mIDs'] = list(set(entry['mIDs']).union(indiv))
                entry['sum_quality'] += q*c
                entry['tot'] += c
            else:
                # indexing a missing key relies on all_quality creating the entry
                entry = all_quality[s]
                entry['mIDs'] = list(indiv)
                entry['sum_quality'] = q*c
                entry['tot'] = c
    finally:
        fh.close()
Example #3
0
def load_lines_from_uniqued(source_uniques,
                            rv_sort=True,
                            sort_key=lambda x: (len(x[0]), int(x[1])),
                            keep_source_id=False):
    '''
    Load every file in source_uniques, split each line on whitespace and
    return all records as one sorted list.

    Field 2 of each record is the quality string; when get_baseQ reports
    base 64 for a file, that field is rewritten to base 33.

    keep_source_id False: the list holds the parsed records themselves.
    keep_source_id True: the list holds (parsed_record, uniqued_id) pairs,
        uniqued_id being the file's basename without extension
        (eg 100617_lane6_PE for "data/100617/100617_lane6_PE.uniqued").

    sort_key is always applied to the parsed record; rv_sort selects
    descending order.
    '''
    collected = []
    for f in source_uniques:
        print >> sys.stderr, 'load %s ...' % f,
        records = tuple(raw.strip().split() for raw in open(f).readlines())
        print >> sys.stderr, '%s lines' % len(records)

        # first record whose quality string pins down the encoding wins
        baseQ = None
        for fields in records:
            baseQ = get_baseQ(fields[2])
            if baseQ is not None:
                break
        print >> sys.stderr, 'qual base: %s' % baseQ

        if baseQ == 64:
            print >> sys.stderr, 'Translate quality encoding to base 33 ...',
            shift = 33 - 64
            for fields in records:
                fields[2] = ''.join(chr(ord(ch) + shift) for ch in fields[2])
            print >> sys.stderr, 'done'

        if not keep_source_id:
            collected.extend(records)
        else:
            uniqued_id = os.path.basename(os.path.splitext(f)[0])
            collected.extend((fields, uniqued_id) for fields in records)

    print >> sys.stderr, 'sort',
    if keep_source_id:
        # records are wrapped in pairs; hand only the record to sort_key
        effective_key = lambda pair: sort_key(pair[0])
    else:
        effective_key = sort_key
    collected.sort(key=effective_key, reverse=rv_sort)
    print >> sys.stderr, 'done'
    return collected
Example #4
0
def load_lines_from_uniqued(source_uniques,rv_sort = True, sort_key = lambda x: (len(x[0]),int(x[1])), keep_source_id = False):
    '''
    Parse the given .uniqued files into whitespace-split records, convert
    base-64 quality strings (field 2) to base 33, and return everything
    sorted by sort_key (descending when rv_sort is true).

    if keep_source_id is True
        returns a list of 2-tuples (parsed_line, uniqued_id), where uniqued_id
        is the file name without directory and extension
        (eg 100617_lane6_PE for "data/100617/100617_lane6_PE.uniqued")

    else returns a list of parsed lines.
    '''
    result = []
    for path in source_uniques:
        print >> sys.stderr, 'load %s ...' % path,
        records = tuple(map(lambda raw: raw.strip().split(), open(path).readlines()))
        print >> sys.stderr, '%s lines' % len(records)

        #get qual base: keep asking until get_baseQ commits to an answer
        baseQ = None
        for rec in records:
            baseQ = get_baseQ(rec[2])
            if baseQ is None:
                continue
            break
        print >> sys.stderr, 'qual base: %s' % baseQ

        if baseQ == 64:
            print >> sys.stderr, 'Translate quality encoding to base 33 ...',
            for rec in records:
                shifted = [chr(ord(ch)-64+33) for ch in rec[2]]
                rec[2] = ''.join(shifted)
            print >> sys.stderr, 'done'

        if keep_source_id:
            uniqued_id = os.path.basename(os.path.splitext(path)[0])
            result.extend([(rec,uniqued_id) for rec in records])
        else:
            result.extend(records)

    print >> sys.stderr, 'sort',
    # with source ids attached, sort_key must only see the parsed line
    key_fn = (lambda pair: sort_key(pair[0])) if keep_source_id else sort_key
    result.sort(reverse = rv_sort,key = key_fn)
    print >> sys.stderr, 'done'
    return result
Example #5
0
def get_fastq_properties(fq):
    '''Inspect a fastq file and return (lnum, baseQ, readlen).

    lnum    -- 4 if the file's first character is '@', otherwise 1
    baseQ   -- first non-None result of preprocess_radtag_lane.get_baseQ over
               the reads, taken in file order
    readlen -- length of the read that yielded baseQ

    The three values are also reported on stderr.
    '''
    first_char = smartopen(fq).read(1)
    lnum = 4 if first_char == '@' else 1
    print >> sys.stderr, 'fastq format lnum: %s' % lnum

    qfh = smartopen(fq)
    baseQ = None
    while baseQ is None:
        _header, read, qual = preprocess_radtag_lane.next_read_from_fh(qfh, lnum)
        baseQ = preprocess_radtag_lane.get_baseQ(qual)
    qfh.close()
    print >> sys.stderr, 'fastq format baseQ: %s' % baseQ

    # the last read consumed is the one that settled baseQ
    readlen = len(read)
    print >> sys.stderr, 'fastq format readlen: %s' % readlen

    return lnum, baseQ, readlen
Example #6
0
def get_fastq_properties(fq):
    '''Return (lnum, baseQ, readlen) for the fastq file fq.

    lnum is 4 when the file starts with '@' and 1 otherwise.  Reads are then
    pulled with preprocess_radtag_lane.next_read_from_fh until
    preprocess_radtag_lane.get_baseQ returns something other than None; that
    value is baseQ and the length of the corresponding read is readlen.
    Each value is echoed to stderr as it is determined.
    '''
    if smartopen(fq).read(1) != '@':
        lnum = 1
    else:
        lnum = 4
    print >> sys.stderr, 'fastq format lnum: %s' % lnum

    qfh = smartopen(fq)
    while True:
        tag,seq,qual = preprocess_radtag_lane.next_read_from_fh(qfh,lnum)
        baseQ = preprocess_radtag_lane.get_baseQ(qual)
        if baseQ is not None:
            break
    qfh.close()
    print >> sys.stderr, 'fastq format baseQ: %s' % baseQ

    readlen = len(seq)
    print >> sys.stderr, 'fastq format readlen: %s' % readlen

    return lnum,baseQ,readlen
Example #7
0
def load_uniqued(all_quality,
                 uniqued,
                 readlen=None,
                 nticks=20,
                 baseQ=None,
                 count_by_ind=False):
    '''Merge the records of a .uniqued file into all_quality.

    Each line of the file (as produced by preprocess_radtag_lane.py) must hold
    seven whitespace-separated fields: sequence, count, quality string,
    comma-separated individual IDs, comma-separated per-individual counts and
    two trailing fields that are not used here.  Lines with any other number
    of fields are reported on stderr and skipped.

    all_quality  -- mapping keyed by sequence, updated in place.  New keys are
                    created by plain indexing (all_quality[s]), so the mapping
                    has to auto-create its entries (e.g. a
                    collections.defaultdict(dict)).  Each entry receives:
                      'mIDs'         list of individual IDs seen for the sequence
                      'sum_quality'  numpy array, sum of (quality - baseQ) * count
                      'tot'          summed count
                      'count_by_ind' {individual: count}; only if count_by_ind
    uniqued      -- path passed to get_read_count() and smartopen()
    readlen      -- if not None, sequence and qualities are cut to this length
                    before merging
    nticks       -- approximate number of progress lines written to stderr
    baseQ        -- quality offset; if None it is taken from the first line
                    for which get_baseQ() returns a value
    count_by_ind -- also accumulate per-individual counts

    Raises StopIteration if baseQ is None and the file is exhausted before
    get_baseQ() yields an offset.
    '''

    nreads = get_read_count(uniqued)

    if baseQ is None:
        qfh = smartopen(uniqued)
        try:
            while baseQ is None:
                line = next(qfh)
                qstr = line.strip().split()[2]
                baseQ = get_baseQ(qstr)
        finally:
            # release the probe handle even if a line is malformed
            qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    # report progress roughly nticks times, but at least every line
    tickon = max(nreads // nticks, 1)
    print >> sys.stderr, '\tloading'

    fh = smartopen(uniqued)
    try:
        for i, line in enumerate(fh):
            if i % tickon == 0:
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (
                    i, nreads, (float(i) / nreads) * 100)

            fields = line.strip().split()
            if len(fields) != 7:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (
                    i, len(fields), line, fields)
                # skip the record: falling through would merge the previous
                # line's values a second time (or fail on the first line)
                continue

            s, c, qstr, indivstr, indcnt = fields[:5]
            q = numpy.array([ord(ch) - baseQ for ch in qstr])
            c = int(c)
            indiv = set(indivstr.split(','))

            if count_by_ind:
                indcntd = dict(
                    zip(indivstr.split(','), map(int, indcnt.split(','))))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                entry = all_quality[s]
                entry['mIDs'] = list(set(entry['mIDs']).union(indiv))
                entry['sum_quality'] += q * c
                entry['tot'] += c
                if count_by_ind:
                    counts = entry['count_by_ind']
                    for ind, cnt in indcntd.items():
                        counts[ind] = counts.get(ind, 0) + cnt
            else:
                # indexing a missing key relies on all_quality creating the entry
                entry = all_quality[s]
                entry['mIDs'] = list(indiv)
                entry['sum_quality'] = q * c
                entry['tot'] = c
                if count_by_ind:
                    entry['count_by_ind'] = indcntd
    finally:
        fh.close()