Example #1
def run_local_blat(subjects, queries, blattile, blatargstr='', num_cores=1):
    '''
    runs blat commands using os.system()
    runs all jobs as a single batch; to use multiple cores/computers, consider run_parallel_blat()
    '''

    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile) / 2)

    cmds = []
    labf = []
    for q in queries:
        for subject in subjects:
            subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
            outbase = q.rstrip('.fa').rstrip(
                '_query') + '_blat' + '-subj' + subjname + blatargstr.replace(
                    '=', '').replace(' ', '')
            labf.append(outbase + '.label.gz')
            cmd = '%s %s %s %s "%s" %s' % (
                sys.executable,
                os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'),
                subject, q, blatargstr, outbase)
            cmds.append(run_safe.safe_script(cmd, outbase))

    shscr = os.path.join(os.path.dirname(subjects[0]), 'runblat.sh')
    smartopen(shscr, 'w').writelines([cmd + ';\n' for cmd in cmds])
    os.system('chmod +x ' + shscr)
    ret = os.system(shscr)
    if ret != 0 or not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed with code %s' % ret
    return labf
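A minimal invocation might look like the sketch below; the paths and BLAT options are illustrative assumptions, not values from the original project. Note that rstrip('.fa') strips a trailing character set rather than a suffix, so this sketch assumes inputs following the *_subj.fa / *_query.fa naming convention whose stems do not end in '.', 'f', or 'a'.

# hypothetical usage sketch for run_local_blat (all paths invented)
subjects = ['/data/rtd/run1_subj.fa']
queries = ['/data/rtd/run1_query.fa']
label_files = run_local_blat(subjects, queries, blattile=10,
                             blatargstr='-minScore=20')
print >> sys.stderr, 'wrote: %s' % label_files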
Example #2
def load_uniqued(all_quality,uniqued,readlen=None,nticks=20,baseQ=None,count_by_ind=False):
    '''given a .uniqued file produced by preprocess_radtag_lane.py

    loads data into all_quality, ensuring sequences remain unique

    all_quality follows the 20101114 UPDATE structure (see the preprocess_sequence_for_match docstring)
    '''

    nreads = get_read_count(uniqued)
    
    qfh = smartopen(uniqued)
    while baseQ is None:
        line = qfh.next()
        qstr = line.strip().split()[2]
        baseQ = get_baseQ(qstr)
    qfh.close()
    
    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    
    tickon = nreads/nticks
    if tickon < 1:
        tickon = 1
    print >> sys.stderr, '\tloading'


    for i,line in enumerate(smartopen(uniqued)):
        if i % tickon == 0: print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i,nreads,(float(i)/nreads)*100)

        try:
            s,c,qstr,indivstr,indcnt,r2,r2cnt = line.strip().split()
        except ValueError:
            print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (i,len(line.strip().split()),line,line.strip().split())
            continue # skip malformed lines rather than reusing stale values
        q = numpy.array([ord(ch)-baseQ for ch in qstr])
        c = int(c)
        indiv = set(indivstr.split(','))

        if count_by_ind:
            indcntd = dict(zip(indivstr.split(','),map(int,indcnt.split(','))))

        if readlen is not None:
            s = s[:readlen]
            q = q[:readlen]

        if s in all_quality:
            all_quality[s]['mIDs'] = list(set(all_quality[s]['mIDs']).union(indiv))
            all_quality[s]['sum_quality'] += q*c
            all_quality[s]['tot'] += c
            if count_by_ind:
                for ind,cnt in indcntd.items():
                    if ind in all_quality[s]['count_by_ind']:
                        all_quality[s]['count_by_ind'][ind] += cnt
                    else:
                        all_quality[s]['count_by_ind'][ind] = cnt
        else:
            all_quality[s]['mIDs'] = list(indiv)
            all_quality[s]['sum_quality'] = q*c
            all_quality[s]['tot'] = c
            if count_by_ind:
                all_quality[s]['count_by_ind'] = indcntd
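load_uniqued assigns into all_quality[s]['mIDs'] even for sequences it has not seen before, so all_quality is presumably a collections.defaultdict(dict); that is an assumption inferred from the else branch, not stated in the original. A minimal sketch with an invented filename:

# sketch, assuming all_quality is a defaultdict(dict) as the else branch implies
from collections import defaultdict
all_quality = defaultdict(dict)
load_uniqued(all_quality, 'lane1.uniqued.gz', readlen=95, count_by_ind=True)
# each entry now carries 'mIDs', 'sum_quality', 'tot' (and 'count_by_ind')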
Example #3
def run_local_blat(subjects,queries,blattile,blatargstr='',num_cores=1):
    '''
    runs blat commands using os.system()
    runs all jobs as a single batch; to use multiple cores/computers, consider run_parallel_blat()
    '''

    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile)/2)

    cmds = []
    labf = []
    for q in queries:
        for subject in subjects:
            subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
            outbase = q.rstrip('.fa').rstrip('_query')+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','')
            labf.append(outbase+'.label.gz')
            cmd = '%s %s %s %s "%s" %s' % (sys.executable, os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'),subject,q,blatargstr,outbase)
            cmds.append(run_safe.safe_script(cmd,outbase))

    shscr = os.path.join(os.path.dirname(subjects[0]) , 'runblat.sh')
    smartopen(shscr, 'w').writelines([cmd+';\n' for cmd in cmds])
    os.system('chmod +x '+shscr)
    ret = os.system(shscr)
    if ret != 0 or not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed with code %s' % ret
    return labf
Example #4
def uniqued_to_fastq(uniqued, id_prefix=''):
    if uniqued.endswith('gz'):
        len_uni = int(
            Popen('zcat %s | wc -l' % uniqued, shell=True,
                  stdout=PIPE).stdout.read().strip())
    else:
        len_uni = int(
            Popen('cat %s | wc -l' % uniqued, shell=True,
                  stdout=PIPE).stdout.read().strip())
    fh = smartopen(uniqued)
    outname = remove_ext(uniqued) + '-fromuni.fastq.gz'
    if os.path.exists(outname) and get_read_count(outname) == len_uni:
        print >> sys.stderr, 'output %s exists' % outname
        return outname
    ofh = smartopen(outname, 'w')
    print >> sys.stderr, 'convert %s to fastq' % uniqued
    for i, l in enumerate(fh):
        fields = l.strip().split()
        fq_line = '@%s%s\n%s\n+\n%s\n' % (id_prefix, i, fields[0], fields[2])
        ofh.write(fq_line)
        if i % 1000 == 0: print >> sys.stderr, '\r\t%s done' % i,
    ofh.close()
    print >> sys.stderr, '%s done' % outname

    return outname
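For reference, a .uniqued line is whitespace-separated as sequence, count, quality string, comma-joined individual IDs, comma-joined per-individual counts, and two read-2 fields (the s,c,qstr,indivstr,indcnt,r2,r2cnt unpacking in load_uniqued); uniqued_to_fastq keeps only fields 0 and 2. An invented record for illustration:

# hypothetical .uniqued record and the fastq record it becomes
# fields: seq  count  qual  indivIDs  indivCounts  read2  read2Counts
uni_line = 'ACGTACGT\t3\tIIIIIIII\tind1,ind2\t2,1\t.\t.'
fields = uni_line.strip().split()
print '@myprefix0\n%s\n+\n%s' % (fields[0], fields[2])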
Example #5
def run_parallel_blat(subjects,
                      queries,
                      blattile,
                      blatargstr='',
                      num_cores='+0'):
    '''
    runs blat commands using GNU parallel.

    '''

    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile) / 2)

    cmds = []
    labf = []
    for q in queries:
        for subject in subjects:
            subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
            outbase = q.rstrip('.fa').rstrip(
                '_query') + '_blat' + '-subj' + subjname + blatargstr.replace(
                    '=', '').replace(' ', '')
            labf.append(outbase + '.label.gz')
            cmd = '%smcl_id_triples_by_blat.py %s %s "%s" %s' % (
                radtag_denovo, subject, q, blatargstr, outbase)
            cmds.append(run_safe.safe_script(cmd, outbase))

    shscr = os.path.join(os.path.dirname(subjects[0]), 'runblat.sh')
    smartopen(shscr, 'w').writelines([cmd + ';\n' for cmd in cmds])
    os.system('chmod +x ' + shscr)
    ret = os.system('parallel --progress -j %s < %s' % (num_cores, shscr))
    if ret != 0 or not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed with code %s' % ret

    return labf
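The default num_cores='+0' is GNU parallel's -j syntax: '+N' means N jobs more than the number of CPU cores, so '+0' runs one job per core. A usage sketch with invented paths:

# hypothetical usage: one BLAT job per CPU core via GNU parallel
label_files = run_parallel_blat(['/data/rtd/run1_subj.fa'],
                                ['/data/rtd/run1_query0.fa',
                                 '/data/rtd/run1_query1.fa'],
                                blattile=10, num_cores='+0')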
Example #6
def run_parallel_blat(subjects,queries,blattile,blatargstr='',num_cores='+0'):
    '''
    runs blat commands using GNU parallel.

    '''

    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile)/2)

    cmds = []
    labf = []
    for q in queries:
        for subject in subjects:
            subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
            outbase = q.rstrip('.fa').rstrip('_query')+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','')
            labf.append(outbase+'.label')
            cmds.append('%smcl_id_triples_by_blat.py %s %s "%s" %s' % (radtag_denovo,subject,q,blatargstr,outbase))

    shscr = os.path.join(os.path.dirname(subjects[0]) , 'runblat.sh')
    smartopen(shscr, 'w').writelines([cmd+';\n' for cmd in cmds])
    os.system('chmod +x '+shscr)
    ret = os.system('parallel --progress -j %s < %s' % (num_cores,shscr))
    if ret != 0 or not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed with code %s' % ret

    return labf
Example #7
def convert_fastq(fq,ofq,out_lnum=4,out_baseQ=33,tickon = 10000):
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum,baseQ = preprocess_radtag_lane.get_fastq_properties(fq)
    fh = preprocess_radtag_lane.smartopen(fq)
    ofh = preprocess_radtag_lane.smartopen(ofq,'w')
    for i in xrange(nreads):
        if i%tickon == 0:
            print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (i,nreads,(float(i)/nreads)*100),
        n,s,qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
        ofh.write(preprocess_radtag_lane.as_fq_line(n,s,qs_to_q(qs,baseQ),out_baseQ,out_lnum))
    print >> sys.stderr,'\n'
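A typical use would be re-encoding a phred+64 fastq as phred+33 with 4-line records; the input base and record length are auto-detected by get_fastq_properties, and qs_to_q is assumed to be a helper from the same module. Filenames below are illustrative:

# hypothetical usage: convert an old Illumina (phred+64) file to Sanger encoding
convert_fastq('old_illumina.fastq.gz', 'sanger.fastq.gz',
              out_lnum=4, out_baseQ=33)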
Example #8
def append_to_ref(target_ref, new_ref, id_prefix):
    nfh = smartopen(new_ref)
    tfh = smartopen(target_ref, 'a')
    for l in nfh:
        if l.startswith('>'):
            newl = l.replace('>', '>%s_' % id_prefix)
            tfh.write(newl)
        else:
            tfh.write(l)
    nfh.close()
    tfh.close()
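append_to_ref prefixes every fasta header in new_ref with id_prefix and appends the records to target_ref in place; for example (invented names):

# hypothetical usage: '>seq1' in denovo.fa is appended to ref.fa as '>batch2_seq1'
append_to_ref('ref.fa', 'denovo.fa', 'batch2')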
Example #9
def append_to_ref(target_ref,new_ref,id_prefix):
    nfh = smartopen(new_ref)
    tfh = smartopen(target_ref,'a')
    for l in nfh:
        if l.startswith('>'):
            newl = l.replace('>','>%s_' % id_prefix)
            tfh.write(newl)
        else:
            tfh.write(l)
    nfh.close()
    tfh.close()
Example #10
def convert_fastq(fq, ofq, out_lnum=4, out_baseQ=33, tickon=10000):
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)
    fh = preprocess_radtag_lane.smartopen(fq)
    ofh = preprocess_radtag_lane.smartopen(ofq, 'w')
    for i in xrange(nreads):
        if i % tickon == 0:
            print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (i, nreads,
                                                          (float(i) / nreads) *
                                                          100),
        n, s, qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
        ofh.write(
            preprocess_radtag_lane.as_fq_line(n, s, qs_to_q(qs, baseQ),
                                              out_baseQ, out_lnum))
    print >> sys.stderr, '\n'
Example #11
def get_shortest_readlen(unifiles):
    readlen = numpy.inf
    for uniqued in unifiles:
        rl = len(smartopen(uniqued).readline().strip().split()[0])
        if rl < readlen:
            readlen = rl
    return readlen
Example #12
def get_shortest_readlen(unifiles):
    readlen = numpy.inf
    for uniqued in unifiles:
        rl = len(smartopen(uniqued).readline().strip().split()[0])
        if rl < readlen:
            readlen = rl
    return readlen
Example #13
def cat(filelist,targetfile):
    '''cats an arbitrarily large filelist to targetfile'''
    fh = smartopen(targetfile,'w')
    print >> sys.stderr, '\n'
    for i,f in enumerate(filelist):
        print >> sys.stderr, '\r%s / %s' % (i,len(filelist)),
        for l in open(f):
            fh.write(l)
    fh.close()
Example #14
def cat(filelist, targetfile):
    '''cats an arbitrarily large filelist to targetfile'''
    fh = smartopen(targetfile, 'w')
    print >> sys.stderr, '\n'
    for i, f in enumerate(filelist):
        print >> sys.stderr, '\r%s / %s' % (i, len(filelist)),
        for l in open(f):
            fh.write(l)
    fh.close()
Example #15
def uniqued_to_fastq(uniqued,id_prefix=''):
    if uniqued.endswith('gz'):
        len_uni = int(Popen('zcat %s | wc -l' % uniqued,shell=True,stdout=PIPE).stdout.read().strip())
    else:
        len_uni = int(Popen('cat %s | wc -l' % uniqued,shell=True,stdout=PIPE).stdout.read().strip())
    fh = smartopen(uniqued)
    outname = remove_ext(uniqued)+'-fromuni.fastq.gz'
    if os.path.exists(outname) and get_read_count(outname) == len_uni:
        print >> sys.stderr, 'output %s exists' % outname
        return outname
    ofh = smartopen(outname,'w')
    print >> sys.stderr, 'convert %s to fastq' % uniqued
    for i,l in enumerate(fh):
        fields = l.strip().split()
        fq_line = '@%s%s\n%s\n+\n%s\n' % (id_prefix,i,fields[0],fields[2])
        ofh.write(fq_line)
        if i % 1000 == 0: print >> sys.stderr, '\r\t%s done' % i,
    ofh.close()
    print >> sys.stderr, '%s done' % outname

    return outname
Example #16
def get_uniqued_error(infiles,cdest_searchbase):
    from glob import glob
    print >> sys.stderr, '\nset cluster dirt threshold from per-lane error estimates'
    err_by_uni = {}
    for uniqued in infiles:
        rl = readlen_from_uniqued(uniqued)
        cdest_search = uniqued.rstrip('.gz')+'-rtd/'+cdest_searchbase
        cdests = glob(cdest_search)
        if len(cdests) != 1:
            raise ValueError, 'search string %s did not result in a single .cdest file %s' % (cdest_search,cdests)
        else:
            cd = float(smartopen(cdests[0]).read())
        print >> sys.stderr, '%s: found cluster dirt %s for read length %s. Estimated error: %s' % (uniqued,cd,rl,cd/rl)
        err_by_uni[uniqued] = cd/rl
    return err_by_uni
Example #17
def get_counts_by_pool(uniqued,db):
    ufields = get_uniqued_info(uniqued)
    pool_lookup = get_pool_lookup(db,ufields[0],ufields[1],ufields[3])
    counts_by_pool = {}
    fh = preprocess_radtag_lane.smartopen(uniqued)
    for l in fh:
        f = l.split()
        for ind,ct in zip(f[3].split(','),[int(i) for i in f[4].split(',')]):
            pool = pool_lookup[ind]
            
            try:
                counts_by_pool[pool][ind] += ct
            except KeyError: # pool not seen yet; create its counter dict
                counts_by_pool[pool] = defaultdict(int)
                counts_by_pool[pool][ind] += ct

    return counts_by_pool
Example #18
def get_counts_by_pool(uniqued, db):
    ufields = get_uniqued_info(uniqued)
    pool_lookup = get_pool_lookup(db, ufields[0], ufields[1], ufields[3])
    counts_by_pool = {}
    fh = preprocess_radtag_lane.smartopen(uniqued)
    for l in fh:
        f = l.split()
        for ind, ct in zip(f[3].split(','), [int(i) for i in f[4].split(',')]):
            pool = pool_lookup[ind]

            try:
                counts_by_pool[pool][ind] += ct
            except KeyError:  # pool not seen yet; create its counter dict
                counts_by_pool[pool] = defaultdict(int)
                counts_by_pool[pool][ind] += ct

    return counts_by_pool
Example #19
def get_uniqued_error(infiles, cdest_searchbase):
    from glob import glob
    print >> sys.stderr, '\nset cluster dirt threshold from per-lane error estimates'
    err_by_uni = {}
    for uniqued in infiles:
        rl = readlen_from_uniqued(uniqued)
        cdest_search = uniqued.rstrip('.gz') + '-rtd/' + cdest_searchbase
        cdests = glob(cdest_search)
        if len(cdests) != 1:
            raise ValueError, 'search string %s did not result in a single .cdest file %s' % (
                cdest_search, cdests)
        else:
            cd = float(smartopen(cdests[0]).read())
        print >> sys.stderr, '%s: found cluster dirt %s for read length %s. Estimated error: %s' % (
            uniqued, cd, rl, cd / rl)
        err_by_uni[uniqued] = cd / rl
    return err_by_uni
Example #20
def write_uniqued_by_size(all_quality,outbase,baseQ=33):
    outdir = os.path.dirname(outbase)
    if not os.path.exists(outdir): os.makedirs(outdir)

    outfhs = {}
    ofbysize = {}
    for seq,aqd in all_quality.items():
        #s,c,qstr,indivstr,indcnt,r2,r2cnt
        ind_li,cnt_li = zip(*aqd['count_by_ind'].items())
        outl =  '\t'.join((seq, str(aqd['tot']), ''.join([chr(i+baseQ) for i in map(int,aqd['sum_quality']/float(aqd['tot']))]), ','.join(ind_li),','.join(map(str,cnt_li)),'.','.')) + '\n'
        outf = outbase+'-%s.uniqued.gz' % len(seq)
        if not outf in outfhs:
            outfhs[outf] = smartopen(outf,'w')
            ofbysize[len(seq)] = outf
        outfhs[outf].write(outl)

    for outf,ofh in outfhs.items():
        ofh.close()

    return ofbysize
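Each output line re-encodes the running mean quality: sum_quality/tot is truncated to int per position and written as chr(q+baseQ), the inverse of the ord(ch)-baseQ decoding in load_uniqued. A round-trip sketch with invented values:

# round-trip sketch for the quality encoding used above
import numpy
baseQ = 33
sum_quality, tot = numpy.array([60, 58, 90]), 2
qstr = ''.join([chr(i + baseQ) for i in map(int, sum_quality / float(tot))])
decoded = [ord(ch) - baseQ for ch in qstr]   # -> [30, 29, 45]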
Example #21
def write_uniqued_by_size(all_quality, outbase, baseQ=33):
    outdir = os.path.dirname(outbase)
    if not os.path.exists(outdir): os.makedirs(outdir)

    outfhs = {}
    ofbysize = {}
    for seq, aqd in all_quality.items():
        #s,c,qstr,indivstr,indcnt,r2,r2cnt
        ind_li, cnt_li = zip(*aqd['count_by_ind'].items())
        outl = '\t'.join((seq, str(aqd['tot']), ''.join([
            chr(i + baseQ)
            for i in map(int, aqd['sum_quality'] / float(aqd['tot']))
        ]), ','.join(ind_li), ','.join(map(str, cnt_li)), '.', '.')) + '\n'
        outf = outbase + '-%s.uniqued.gz' % len(seq)
        if not outf in outfhs:
            outfhs[outf] = smartopen(outf, 'w')
            ofbysize[len(seq)] = outf
        outfhs[outf].write(outl)

    for outf, ofh in outfhs.items():
        ofh.close()

    return ofbysize
Example #22
'''

import preprocess_radtag_lane
import os, sys

barcode_len = 5
tick = 10000  #update progress every this-many reads

if __name__ == "__main__":
    if len(sys.argv) == 4:
        cutsite, fq, outfile = sys.argv[1:]
        rc = preprocess_radtag_lane.get_read_count(fq)
        lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)

        fh = preprocess_radtag_lane.smartopen(fq)
        ofh = preprocess_radtag_lane.smartopen(outfile, 'w')

        found = 0
        for i in range(rc):
            if i > 0 and i % tick == 0:
                print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \
                      (i,rc,(float(i)/rc)*100,found,(float(found)/i)*100),
            n, s, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
            if s[barcode_len:barcode_len + len(cutsite)] == cutsite:
                line = preprocess_radtag_lane.as_fq_line(n, s, q, None, lnum)
                ofh.write(line)
                found += 1
        ofh.close()
    elif len(sys.argv) == 6:
        cutsite, fq1, fq2, outfile1, outfile2 = sys.argv[1:]
Example #23
def filter_uniqued(uniqued, outfile, lines_to_write):
    ofh = smartopen(outfile, 'w')
    for i, l in enumerate(smartopen(uniqued)):
        if i in lines_to_write:
            ofh.write(l)
    ofh.close()
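lines_to_write is tested with the in operator once per input line, so passing a set keeps each test O(1); a list would make the whole pass quadratic. A hypothetical call keeping the first 1000 records:

# hypothetical usage: keep only the first 1000 uniqued records
filter_uniqued('lane1.uniqued.gz', 'lane1-head.uniqued.gz', set(range(1000)))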
Example #24
def preprocess_sequence_for_match(all_quality, cutsite, mIDfile, subjects, queries, minlen=20):
    '''given a quality dictionary
    {
    20101114 - UPDATE:
    modified dict structure:
    
    
    <sequence> : {
                 "tot" : int
                 "mIDs" : [  <sampleID> ,<sampleID>,  ]
                 "sum_quality" : array([int,int,int ...])
                 }
    }

    generates three types of files:
    1x mIDlookup file containing header\tmID\tmID ... for each sequence
    1x "subject" contains all sequences that start with <cutsite>
    Nx "query" each contain a partition (<nparts> total) of fasta formatted sequence.  All seqs greater than <minlen> included

    '''
    import random

    mID_fh = smartopen(mIDfile,'w')

    
    if len(subjects) == 1: #write all subjects to single file
        this_subj_outfile = subjects[0]
        this_subj_fh = smartopen(this_subj_outfile,'w')
        print >> sys.stderr, this_subj_outfile
    else: #write to multiple subject files for parallel execution
        this_subj_outfile = None
        subj_break_at = int(len(all_quality)/(len(subjects)))
        scopy = deepcopy(subjects)

    print >> sys.stderr, 'write sequences'

    gen_queries = []
    gen_subjects = []
    
    if len(queries) == 1: #write all queries to single file
        this_outfile = queries[0]
        this_query_fh = smartopen(this_outfile,'w')
        print >> sys.stderr, this_outfile
    else: #write to multiple query files for parallel execution
        this_outfile = None
        break_at = int(len(all_quality)/(len(queries)))
        qcopy = deepcopy(queries)

    aqkeys = all_quality.keys()
    random.shuffle(aqkeys)
    for i,s in enumerate(aqkeys):
        c = all_quality[s]['tot']
        qsum = all_quality[s]['sum_quality']
        q = qsum / c

        if len(queries) > 1 and break_at and i%break_at==0 and len(qcopy) > 0: #move to the next query chunk
            if this_outfile:
                gen_queries.append(this_outfile)
                this_query_fh.close()
            this_outfile = qcopy.pop(0)
            print >> sys.stderr, i,this_outfile
            this_query_fh = smartopen(this_outfile,'w')

        if len(subjects) > 1 and subj_break_at and i%subj_break_at==0 and len(scopy) > 0: #move to the next subject chunk
            if this_subj_outfile:
                gen_subjects.append(this_subj_outfile)
                this_subj_fh.close()
            this_subj_outfile = scopy.pop(0)
            print >> sys.stderr, i,this_subj_outfile
            this_subj_fh = smartopen(this_subj_outfile,'w')

        if 2 in q:
            first2 = numpy.arange(len(q))[q==2][0]
        else:
            first2 = len(q)

        if first2 > minlen:
            header = '%s.%s.%s.%s' % (i,c,s[:first2],''.join([chr(int(n)+64) for n in q[:first2]]))
            this_query_fh.write('>%s\n%s\n' % (header,s[:first2]))
            mID_fh.write(header+'\t'+('\t'.join(all_quality[s]['mIDs']))+'\n')
            if s.startswith(cutsite) and c > 1:
                this_subj_fh.write('>%s\n%s\n' % (header,s[:first2]))

    gen_queries.append(this_outfile)
    this_query_fh.close()
    gen_subjects.append(this_subj_outfile)
    this_subj_fh.close()
    return gen_subjects, gen_queries
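The first2 logic clips each read at the first position whose mean quality equals 2, Illumina's low-quality-tail indicator ('B' in phred+64 encoding); if no position has quality 2 the full length is kept. In isolation:

# sketch of the first2 clipping rule used above (values invented)
import numpy
q = numpy.array([30, 28, 25, 2, 2])
first2 = numpy.arange(len(q))[q == 2][0] if 2 in q else len(q)
print first2   # -> 3: the read would be used as s[:3]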
Example #25
'''

import preprocess_radtag_lane
import os,sys

barcode_len = 5
tick = 10000 #update progress every this-many reads

if __name__ == "__main__":
    if len(sys.argv) == 4:
        cutsite,fq,outfile = sys.argv[1:]
        rc = preprocess_radtag_lane.get_read_count(fq)
        lnum,baseQ = preprocess_radtag_lane.get_fastq_properties(fq)

        fh = preprocess_radtag_lane.smartopen(fq)
        ofh = preprocess_radtag_lane.smartopen(outfile,'w')

        found = 0
        for i in range(rc):
            if i>0 and i % tick == 0:
                print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \
                      (i,rc,(float(i)/rc)*100,found,(float(found)/i)*100),
            n,s,q = preprocess_radtag_lane.next_read_from_fh(fh,lnum)
            if s[barcode_len:barcode_len+len(cutsite)] == cutsite:
                line = preprocess_radtag_lane.as_fq_line(n,s,q,None,lnum)
                ofh.write(line)
                found += 1
        ofh.close()
    elif len(sys.argv) == 6:
        cutsite,fq1,fq2,outfile1,outfile2 = sys.argv[1:]
Example #26
def readlen_from_uniqued(uniqued):
    return len(smartopen(uniqued).readline().strip().split()[0])
Example #27
'''

import os, sys
import numpy
from editdist import distance
from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count

idx_bp = 5
cut_bp = 5

lnum = 4
min_seqs = 7

uniqued, fastq = sys.argv[1:]

readlen = len(next_read_from_fh(smartopen(fastq), 4)[1])

print >> sys.stderr, 'readlen: %s' % readlen

num_reads = get_read_count(fastq, 4)
tickon = num_reads / 200

useqs = []
for l in open(uniqued):
    s, cntstr = l.strip().split()[0], l.strip().split()[4]
    cnt = numpy.mean([int(i) for i in cntstr.split(',')])
    if cnt >= min_seqs:
        useqs.append(s[cut_bp:readlen - idx_bp])

useqs = list(set(useqs))
print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (len(useqs),len(s[cut_bp:readlen-idx_bp]))
Example #28
def load_vcf(vcf,allele_map,indiv_gt_phred_cut=None,ding_on=100000,return_map=False):
	'''processes a vcf file, adding genotypes satisfying GQ cutoff indiv_gt_phred_cut to a returned cross genotype object
	sites corresponding to keys in allele_map are retained
	'''

	if return_map:
		new_map = defaultdict(dict)
	else:
		vcf_data = {}
	
	i = 0
	for line in preprocess_radtag_lane.smartopen(vcf):
		if i % ding_on == 0: print >> sys.stderr, 'reading',i
		i += 1

		if line.startswith('#CHROM'):
			headers = line[1:].split()
			exp_elements = len(line.split())
			FORMAT = headers.index('FORMAT')
		elif line.startswith('#'):
			continue
		else:
			#extract site stats
			fields = line.split()
			if len(fields) != exp_elements:
				print >>sys.stderr, 'unexpected length, line %s (exp %s obs %s)' % (i,exp_elements,len(fields))
				continue

			#populate site metrics
			sd = dict(zip(headers[:FORMAT],fields[:FORMAT]))
			loc = '%s.%s' % (sd['CHROM'],sd['POS'])
			key = (sd['CHROM'],sd['POS'])

			if not loc in allele_map.keys(): #not interested; skip!
				continue
				

			#temp hack for multiallelic sites
			if ',' in sd['ALT']:
				print >> sys.stderr, '!MULTIALLELIC SITE AT %s' % (key,)
				continue
			#temp hack for GQ-absent sites
			if not 'GQ' in fields[FORMAT]:
				print >> sys.stderr, '!GQ NOT CALCULATED AT %s' % (key,)
				continue

			try:
				infostr = sd.pop('INFO')
				sd.update(dict([el.split('=') for el in infostr.split(';') if '=' in el]))
			except KeyError:
				pass

			print >> sys.stderr, '%s found ...' % loc,
			#populate individual genotype metrics provided each GQ >= indiv_gt_phred_cut if defined
			sd['indiv_gt'] = {}
			for ind,gt in zip(headers[FORMAT+1:],fields[FORMAT+1:]):
				if not gt.startswith('./.') and ':' in gt:
					this_gt = dict(zip(fields[FORMAT].split(':'),gt.split(':')))
					if indiv_gt_phred_cut is None or float(this_gt['GQ'] != '.' and this_gt['GQ'] or '0') >= indiv_gt_phred_cut:
						sd['indiv_gt'][ind] = this_gt
						if return_map:
							new_map[ind].update({loc:''.join([allele_map[loc][n] for n in sd['indiv_gt'][ind]['GT'].split('/')])})
			if not return_map:
				vcf_data[key] = sd
			print >> sys.stderr, '%s individuals processed' % len(sd['indiv_gt'])

	if return_map:
		return new_map
	else:
		return vcf_data
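A hedged usage sketch: allele_map is keyed by 'CHROM.POS' strings (matching the loc built above) and, given the '/'-split GT lookup, presumably maps allele indices ('0', '1') to bases. All names below are assumptions for illustration:

# hypothetical call: only sites whose 'CHROM.POS' key appears in allele_map are kept
allele_map = {'scaffold1.1234': {'0': 'A', '1': 'G'}}
geno_map = load_vcf('calls.vcf', allele_map, indiv_gt_phred_cut=20, return_map=True)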
Example #29
def preprocess_sequence_for_match(all_quality,
                                  cutsite,
                                  mIDfile,
                                  subjects,
                                  queries,
                                  minlen=20):
    '''given a quality dictionary
    {
    20101114 - UPDATE:
    modified dict structure:
    
    
    <sequence> : {
                 "tot" : int
                 "mIDs" : [  <sampleID> ,<sampleID>,  ]
                 "sum_quality" : array([int,int,int ...])
                 }
    }

    generates three types of files:
    1x mIDlookup file containing header\tmID\tmID ... for each sequence
    1x "subject" contains all sequences that start with <cutsite>
    Nx "query" each contain a partition (<nparts> total) of fasta formatted sequence.  All seqs greater than <minlen> included

    '''
    import random

    mID_fh = smartopen(mIDfile, 'w')

    if len(subjects) == 1:  #write all subjects to single file
        this_subj_outfile = subjects[0]
        this_subj_fh = smartopen(this_subj_outfile, 'w')
        print >> sys.stderr, this_subj_outfile
    else:  #write to multiple subject files for parallel execution
        this_subj_outfile = None
        subj_break_at = int(len(all_quality) / (len(subjects)))
        scopy = deepcopy(subjects)

    print >> sys.stderr, 'write sequences'

    gen_queries = []
    gen_subjects = []

    if len(queries) == 1:  #write all queries to single file
        this_outfile = queries[0]
        this_query_fh = smartopen(this_outfile, 'w')
        print >> sys.stderr, this_outfile
    else:  #write to multiple query files for parallel execution
        this_outfile = None
        break_at = int(len(all_quality) / (len(queries)))
        qcopy = deepcopy(queries)

    aqkeys = all_quality.keys()
    random.shuffle(aqkeys)
    for i, s in enumerate(aqkeys):
        c = all_quality[s]['tot']
        qsum = all_quality[s]['sum_quality']
        q = qsum / c

        if len(queries) > 1 and break_at and i % break_at == 0 and len(
                qcopy) > 0:  #move to the next query chunk
            if this_outfile:
                gen_queries.append(this_outfile)
                this_query_fh.close()
            this_outfile = qcopy.pop(0)
            print >> sys.stderr, i, this_outfile
            this_query_fh = smartopen(this_outfile, 'w')

        if len(subjects
               ) > 1 and subj_break_at and i % subj_break_at == 0 and len(
                   scopy) > 0:  #move to the next subject chunk
            if this_subj_outfile:
                gen_subjects.append(this_subj_outfile)
                this_subj_fh.close()
            this_subj_outfile = scopy.pop(0)
            print >> sys.stderr, i, this_subj_outfile
            this_subj_fh = smartopen(this_subj_outfile, 'w')

        if 2 in q:
            first2 = numpy.arange(len(q))[q == 2][0]
        else:
            first2 = len(q)

        if first2 > minlen:
            header = '%s.%s.%s.%s' % (i, c, s[:first2], ''.join(
                [chr(int(n) + 64) for n in q[:first2]]))
            this_query_fh.write('>%s\n%s\n' % (header, s[:first2]))
            mID_fh.write(header + '\t' + ('\t'.join(all_quality[s]['mIDs'])) +
                         '\n')
            if s.startswith(cutsite) and c > 1:
                this_subj_fh.write('>%s\n%s\n' % (header, s[:first2]))

    gen_queries.append(this_outfile)
    this_query_fh.close()
    gen_subjects.append(this_subj_outfile)
    this_subj_fh.close()
    return gen_subjects, gen_queries
Example #30
def load_vcf(vcf,
             allele_map,
             indiv_gt_phred_cut=None,
             ding_on=100000,
             return_map=False):
    '''processes a vcf file, adding genotypes satisfying GQ cutoff indiv_gt_phred_cut to a returned cross genotype object
    sites corresponding to keys in allele_map are retained
    '''

    if return_map:
        new_map = defaultdict(dict)
    else:
        vcf_data = {}

    i = 0
    for line in preprocess_radtag_lane.smartopen(vcf):
        if i % ding_on == 0: print >> sys.stderr, 'reading', i
        i += 1

        if line.startswith('#CHROM'):
            headers = line[1:].split()
            exp_elements = len(line.split())
            FORMAT = headers.index('FORMAT')
        elif line.startswith('#'):
            continue
        else:
            #extract site stats
            fields = line.split()
            if len(fields) != exp_elements:
                print >> sys.stderr, 'unexpected length, line %s (exp %s obs %s)' % (
                    i, exp_elements, len(fields))
                continue

            #populate site metrics
            sd = dict(zip(headers[:FORMAT], fields[:FORMAT]))
            loc = '%s.%s' % (sd['CHROM'], sd['POS'])
            key = (sd['CHROM'], sd['POS'])

            if loc not in allele_map:  #not interested; skip!
                continue

            #temp hack for multiallelic sites
            if ',' in sd['ALT']:
                print >> sys.stderr, '!MULTIALLELIC SITE AT %s' % (key, )
                continue
            #temp hack for GQ-absent sites
            if not 'GQ' in fields[FORMAT]:
                print >> sys.stderr, '!GQ NOT CALCULATED AT %s' % (key, )
                continue

            try:
                infostr = sd.pop('INFO')
                sd.update(
                    dict([
                        el.split('=') for el in infostr.split(';') if '=' in el
                    ]))
            except KeyError:
                pass

            print >> sys.stderr, '%s found ...' % loc,
            #populate individual genotype metrics provided each GQ >= indiv_gt_phred_cut if defined
            sd['indiv_gt'] = {}
            for ind, gt in zip(headers[FORMAT + 1:], fields[FORMAT + 1:]):
                if not gt.startswith('./.') and ':' in gt:
                    this_gt = dict(
                        zip(fields[FORMAT].split(':'), gt.split(':')))
                    if indiv_gt_phred_cut is None or float(
                            this_gt['GQ'] != '.' and this_gt['GQ']
                            or '0') >= indiv_gt_phred_cut:
                        sd['indiv_gt'][ind] = this_gt
                        if return_map:
                            new_map[ind].update({
                                loc:
                                ''.join([
                                    allele_map[loc][n] for n in sd['indiv_gt']
                                    [ind]['GT'].split('/')
                                ])
                            })
            if not return_map:
                vcf_data[key] = sd
            print >> sys.stderr, '%s individuals processed' % len(
                sd['indiv_gt'])

    if return_map:
        return new_map
    else:
        return vcf_data
Example #31
def filter_uniqued(uniqued,outfile,lines_to_write):
    ofh = smartopen(outfile,'w')
    for i,l in enumerate(smartopen(uniqued)):
        if i in lines_to_write:
            ofh.write(l)
    ofh.close()
Example #32
def load_uniqued(all_quality,
                 uniqued,
                 readlen=None,
                 nticks=20,
                 baseQ=None,
                 count_by_ind=False):
    '''given a .uniqued file produced by preprocess_radtag_lane.py

    loads data into all_quality, ensuring sequences remain unique

    all_quality follows the 20101114 UPDATE structure (see the preprocess_sequence_for_match docstring)
    '''

    nreads = get_read_count(uniqued)

    qfh = smartopen(uniqued)
    while baseQ is None:
        line = qfh.next()
        qstr = line.strip().split()[2]
        baseQ = get_baseQ(qstr)
    qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    tickon = nreads / nticks
    if tickon < 1:
        tickon = 1
    print >> sys.stderr, '\tloading'

    for i, line in enumerate(smartopen(uniqued)):
        if i % tickon == 0:
            print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i, nreads,
                                                         (float(i) / nreads) *
                                                         100)

        try:
            s, c, qstr, indivstr, indcnt, r2, r2cnt = line.strip().split()
        except ValueError:
            print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (
                i, len(line.strip().split()), line, line.strip().split())
            continue  # skip malformed lines rather than reusing stale values
        q = numpy.array([ord(ch) - baseQ for ch in qstr])
        c = int(c)
        indiv = set(indivstr.split(','))

        if count_by_ind:
            indcntd = dict(
                zip(indivstr.split(','), map(int, indcnt.split(','))))

        if readlen is not None:
            s = s[:readlen]
            q = q[:readlen]

        if s in all_quality:
            all_quality[s]['mIDs'] = list(
                set(all_quality[s]['mIDs']).union(indiv))
            all_quality[s]['sum_quality'] += q * c
            all_quality[s]['tot'] += c
            if count_by_ind:
                for ind, cnt in indcntd.items():
                    if ind in all_quality[s]['count_by_ind']:
                        all_quality[s]['count_by_ind'][ind] += cnt
                    else:
                        all_quality[s]['count_by_ind'][ind] = cnt
        else:
            all_quality[s]['mIDs'] = list(indiv)
            all_quality[s]['sum_quality'] = q * c
            all_quality[s]['tot'] = c
            if count_by_ind:
                all_quality[s]['count_by_ind'] = indcntd
Example #33
'''

import os,sys
import numpy
from editdist import distance
from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count

idx_bp = 5
cut_bp = 5

lnum = 4
min_seqs = 7

uniqued, fastq = sys.argv[1:]

readlen = len(next_read_from_fh(smartopen(fastq),4)[1])

print >> sys.stderr, 'readlen: %s' % readlen

num_reads = get_read_count(fastq,4)
tickon = num_reads/200

useqs = []
for l in open(uniqued):
    s,cntstr = l.strip().split()[0], l.strip().split()[4]
    cnt = numpy.mean([int(i) for i in cntstr.split(',')])
    if cnt >= min_seqs:
        useqs.append(s[cut_bp:readlen-idx_bp])

useqs = list(set(useqs))
print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (len(useqs),len(s[cut_bp:readlen-idx_bp]))
Example #34
def readlen_from_uniqued(uniqued):
    return len(smartopen(uniqued).readline().strip().split()[0])