def run_local_blat(subjects, queries, blattile, blatargstr='', num_cores=1):
    '''runs blat commands using os.system().

    Runs all jobs as a single batch; to run on multiple cores/computers,
    consider run_parallel_blat().
    '''
    # num_cores is accepted for API symmetry with run_parallel_blat but unused here
    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile) / 2)
    cmds = []
    labf = []
    for q in queries:
        for subject in subjects:
            # NB: rstrip removes trailing characters from the set given, not a
            # literal suffix; this works for the fixed '.fa'/'_subj' names used here
            subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
            outbase = q.rstrip('.fa').rstrip('_query') + '_blat' + '-subj' + subjname \
                      + blatargstr.replace('=', '').replace(' ', '')
            labf.append(outbase + '.label.gz')
            cmd = '%s %s %s %s "%s" %s' % (sys.executable,
                                           os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'),
                                           subject, q, blatargstr, outbase)
            cmds.append(run_safe.safe_script(cmd, outbase))
    shscr = os.path.join(os.path.dirname(subjects[0]), 'runblat.sh')
    smartopen(shscr, 'w').writelines([cmd + ';\n' for cmd in cmds])
    os.system('chmod +x ' + shscr)
    ret = os.system(shscr)
    if ret != 0 or not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed with code %s' % ret
    return labf

def load_uniqued(all_quality, uniqued, readlen=None, nticks=20, baseQ=None, count_by_ind=False):
    '''given a .uniqued file produced by preprocess_radtag_lane.py
    loads data into all_quality, ensuring sequences remain unique

    all_quality per 20101114 - UPDATE below
    '''
    nreads = get_read_count(uniqued)

    qfh = smartopen(uniqued)
    while baseQ is None:
        line = qfh.next()
        qstr = line.strip().split()[2]
        baseQ = get_baseQ(qstr)
    qfh.close()
    print >> sys.stderr, 'uniqued qualities base %s' % baseQ

    tickon = nreads / nticks
    if tickon < 1:
        tickon = 1

    print >> sys.stderr, '\tloading'
    for i, line in enumerate(smartopen(uniqued)):
        if i % tickon == 0:
            print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i, nreads, (float(i) / nreads) * 100)

        try:
            s, c, qstr, indivstr, indcnt, r2, r2cnt = line.strip().split()
        except ValueError:
            print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % \
                (i, len(line.strip().split()), line, line.strip().split())
            continue  # skip malformed lines rather than proceeding with unbound values

        q = numpy.array([ord(ch) - baseQ for ch in qstr])
        c = int(c)
        indiv = set(indivstr.split(','))
        if count_by_ind:
            indcntd = dict(zip(indivstr.split(','), map(int, indcnt.split(','))))

        if readlen is not None:
            s = s[:readlen]
            q = q[:readlen]

        if all_quality.has_key(s):
            all_quality[s]['mIDs'] = list(set(all_quality[s]['mIDs']).union(indiv))
            all_quality[s]['sum_quality'] += q * c
            all_quality[s]['tot'] += c
            if count_by_ind:
                for ind, cnt in indcntd.items():
                    if all_quality[s]['count_by_ind'].has_key(ind):
                        all_quality[s]['count_by_ind'][ind] += cnt
                    else:
                        all_quality[s]['count_by_ind'][ind] = cnt
        else:
            all_quality[s]['mIDs'] = list(indiv)
            all_quality[s]['sum_quality'] = q * c
            all_quality[s]['tot'] = c
            if count_by_ind:
                all_quality[s]['count_by_ind'] = indcntd

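# Minimal usage sketch (an assumption, not part of the pipeline proper): the
# filename below is hypothetical. Each .uniqued line carries seven fields:
# seq, count, qualstr, ind1,ind2,..., cnt1,cnt2,..., read2_seq, read2_count.
def _example_load_uniqued(uniqued='example.uniqued.gz'):
    from collections import defaultdict
    all_quality = defaultdict(dict)  # auto-creates the per-sequence record dict
    load_uniqued(all_quality, uniqued, count_by_ind=True)
    # per-base mean quality array for each retained sequence:
    return dict((s, d['sum_quality'] / float(d['tot'])) for s, d in all_quality.items())
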
def uniqued_to_fastq(uniqued, id_prefix=''):
    if uniqued.endswith('gz'):
        len_uni = int(Popen('zcat %s | wc -l' % uniqued, shell=True, stdout=PIPE).stdout.read().strip())
    else:
        len_uni = int(Popen('cat %s | wc -l' % uniqued, shell=True, stdout=PIPE).stdout.read().strip())

    fh = smartopen(uniqued)
    outname = remove_ext(uniqued) + '-fromuni.fastq.gz'
    if os.path.exists(outname) and get_read_count(outname) == len_uni:
        print >> sys.stderr, 'output %s exists' % outname
        return outname

    ofh = smartopen(outname, 'w')
    print >> sys.stderr, 'convert %s to fastq' % uniqued
    for i, l in enumerate(fh):
        fields = l.strip().split()
        fq_line = '@%s%s\n%s\n+\n%s\n' % (id_prefix, i, fields[0], fields[2])
        ofh.write(fq_line)
        if i % 1000 == 0:
            print >> sys.stderr, '\r\t%s done' % i,
    ofh.close()
    print >> sys.stderr, '%s done' % outname
    return outname

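# Usage sketch (path hypothetical): each uniqued record becomes one fastq read
# named '@<id_prefix><line_index>', reusing the stored sequence and quality string.
def _example_uniqued_to_fastq():
    return uniqued_to_fastq('lane1.uniqued.gz', id_prefix='lane1_')
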
def run_parallel_blat(subjects, queries, blattile, blatargstr='', num_cores='+0'):
    '''runs blat commands using GNU parallel.'''
    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile) / 2)
    cmds = []
    labf = []
    for q in queries:
        for subject in subjects:
            subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
            outbase = q.rstrip('.fa').rstrip('_query') + '_blat' + '-subj' + subjname \
                      + blatargstr.replace('=', '').replace(' ', '')
            labf.append(outbase + '.label.gz')
            cmd = '%smcl_id_triples_by_blat.py %s %s "%s" %s' % (radtag_denovo, subject, q, blatargstr, outbase)
            cmds.append(run_safe.safe_script(cmd, outbase))
    shscr = os.path.join(os.path.dirname(subjects[0]), 'runblat.sh')
    smartopen(shscr, 'w').writelines([cmd + ';\n' for cmd in cmds])
    os.system('chmod +x ' + shscr)
    ret = os.system('parallel --progress -j %s < %s' % (num_cores, shscr))
    if ret != 0 or not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed with code %s' % ret
    return labf

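# Note on num_cores: GNU parallel reads '-j +0' as "one job per CPU core plus
# zero extra", so the default saturates the local machine; pass an integer
# string (e.g. '4') to cap concurrency.
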
def convert_fastq(fq, ofq, out_lnum=4, out_baseQ=33, tickon=10000):
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)
    fh = preprocess_radtag_lane.smartopen(fq)
    ofh = preprocess_radtag_lane.smartopen(ofq, 'w')
    for i in xrange(nreads):
        if i % tickon == 0:
            print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (i, nreads, (float(i) / nreads) * 100),
        n, s, qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
        ofh.write(preprocess_radtag_lane.as_fq_line(n, s, qs_to_q(qs, baseQ), out_baseQ, out_lnum))
    ofh.close()  # flush/close explicitly; gzip output is truncated otherwise
    print >> sys.stderr, '\n'

def append_to_ref(target_ref, new_ref, id_prefix):
    nfh = smartopen(new_ref)
    tfh = smartopen(target_ref, 'a')
    for l in nfh:
        if l.startswith('>'):
            newl = l.replace('>', '>%s_' % id_prefix)
            tfh.write(newl)
        else:
            tfh.write(l)
    nfh.close()
    tfh.close()

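# Usage sketch (paths and prefix hypothetical): append de novo RAD contigs to an
# existing reference, tagging each new fasta header so its origin stays traceable.
def _example_append_to_ref():
    append_to_ref('reference.fa', 'rad_contigs.fa', 'rtd1')  # '>seq1' -> '>rtd1_seq1'
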
def get_shortest_readlen(unifiles):
    readlen = numpy.inf
    for uniqued in unifiles:
        rl = len(smartopen(uniqued).readline().strip().split()[0])
        if rl < readlen:
            readlen = rl
    return readlen

def cat(filelist, targetfile):
    '''cats an arbitrarily large filelist to targetfile'''
    fh = smartopen(targetfile, 'w')
    print >> sys.stderr, '\n'
    for i, f in enumerate(filelist):
        print >> sys.stderr, '\r%s / %s' % (i, len(filelist)),
        for l in open(f):
            fh.write(l)
    fh.close()

def get_uniqued_error(infiles, cdest_searchbase):
    from glob import glob
    print >> sys.stderr, '\nset cluster dirt threshold from per-lane error estimates'
    err_by_uni = {}
    for uniqued in infiles:
        rl = readlen_from_uniqued(uniqued)
        cdest_search = uniqued.rstrip('.gz') + '-rtd/' + cdest_searchbase
        cdests = glob(cdest_search)
        if len(cdests) != 1:
            raise ValueError, 'search string %s did not result in a single .cdest file %s' % (cdest_search, cdests)
        else:
            cd = float(smartopen(cdests[0]).read())
            print >> sys.stderr, '%s: found cluster dirt %s for read length %s. Estimated error: %s' % (uniqued, cd, rl, cd / rl)
            err_by_uni[uniqued] = cd / rl
    return err_by_uni

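# Usage sketch (filenames and glob pattern hypothetical): each lane's
# '<lane>-rtd/' directory must contain exactly one file matching the pattern,
# holding a single cluster-dirt value; the per-base error estimate is
# cluster dirt divided by read length.
def _example_get_uniqued_error():
    return get_uniqued_error(['lane1.uniqued.gz', 'lane2.uniqued.gz'], '*.cdest')
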
def get_counts_by_pool(uniqued, db):
    ufields = get_uniqued_info(uniqued)
    pool_lookup = get_pool_lookup(db, ufields[0], ufields[1], ufields[3])
    counts_by_pool = {}
    fh = preprocess_radtag_lane.smartopen(uniqued)
    for l in fh:
        f = l.split()
        for ind, ct in zip(f[3].split(','), [int(i) for i in f[4].split(',')]):
            pool = pool_lookup[ind]
            try:
                counts_by_pool[pool][ind] += ct
            except KeyError:  # first read for this pool; start a per-individual counter
                counts_by_pool[pool] = defaultdict(int)
                counts_by_pool[pool][ind] += ct
    return counts_by_pool

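# Return-shape sketch (pool and individual IDs hypothetical):
#   {'pool1': {'indA': 1200, 'indB': 980}, 'pool2': {'indC': 1432}}
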
def write_uniqued_by_size(all_quality, outbase, baseQ=33):
    outdir = os.path.dirname(outbase)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    outfhs = {}
    ofbysize = {}
    for seq, aqd in all_quality.items():
        # output fields: s, c, qstr, indivstr, indcnt, r2, r2cnt
        ind_li, cnt_li = zip(*aqd['count_by_ind'].items())
        outl = '\t'.join((seq,
                          str(aqd['tot']),
                          ''.join([chr(i + baseQ) for i in map(int, aqd['sum_quality'] / float(aqd['tot']))]),
                          ','.join(ind_li),
                          ','.join(map(str, cnt_li)),
                          '.', '.')) + '\n'
        outf = outbase + '-%s.uniqued.gz' % len(seq)
        if outf not in outfhs:
            outfhs[outf] = smartopen(outf, 'w')
            ofbysize[len(seq)] = outf
        outfhs[outf].write(outl)
    for outf, ofh in outfhs.items():
        ofh.close()
    return ofbysize

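# Round-trip sketch (paths hypothetical): merge several lanes truncated to the
# shortest read length, then re-emit one .uniqued file per sequence length.
def _example_write_uniqued_by_size(unifiles=('lane1.uniqued.gz', 'lane2.uniqued.gz')):
    from collections import defaultdict
    all_quality = defaultdict(dict)
    rl = get_shortest_readlen(unifiles)
    for u in unifiles:
        load_uniqued(all_quality, u, readlen=rl, count_by_ind=True)
    return write_uniqued_by_size(all_quality, 'merged/by_size')
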
'''
import preprocess_radtag_lane
import os, sys

barcode_len = 5
tick = 10000  # update progress every this-many reads

if __name__ == "__main__":

    if len(sys.argv) == 4:
        cutsite, fq, outfile = sys.argv[1:]

        rc = preprocess_radtag_lane.get_read_count(fq)
        lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)
        fh = preprocess_radtag_lane.smartopen(fq)
        ofh = preprocess_radtag_lane.smartopen(outfile, 'w')

        found = 0
        for i in range(rc):
            if i > 0 and i % tick == 0:
                print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \
                    (i, rc, (float(i) / rc) * 100, found, (float(found) / i) * 100),
            n, s, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
            if s[barcode_len:barcode_len + len(cutsite)] == cutsite:
                line = preprocess_radtag_lane.as_fq_line(n, s, q, None, lnum)
                ofh.write(line)
                found += 1
        ofh.close()

    elif len(sys.argv) == 6:
        cutsite, fq1, fq2, outfile1, outfile2 = sys.argv[1:]

def filter_uniqued(uniqued, outfile, lines_to_write):
    ofh = smartopen(outfile, 'w')
    for i, l in enumerate(smartopen(uniqued)):
        if i in lines_to_write:
            ofh.write(l)
    ofh.close()

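# Usage sketch (paths hypothetical): pass lines_to_write as a set, since
# membership is tested once per input line.
def _example_filter_uniqued():
    filter_uniqued('lane1.uniqued.gz', 'lane1-subset.uniqued.gz', set([0, 5, 42]))
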
def preprocess_sequence_for_match(all_quality, cutsite, mIDfile, subjects, queries, minlen=20):
    '''given a quality dictionary
    {
    20101114 - UPDATE: modified dict structure:
    <sequence> :
        { "tot" : int
          "mIDs" : [ <sampleID>, <sampleID>, ]
          "sum_quality" : array([int,int,int ...]) }
    }
    generates three types of files:
    1x mIDlookup file containing header\tmID\tmID ... for each sequence
    1x "subject" contains all sequences that start with <cutsite>
    Nx "query" each contain a partition (<nparts> total) of fasta formatted
       sequence. All seqs greater than <minlen> included
    '''
    import random

    mID_fh = smartopen(mIDfile, 'w')

    if len(subjects) == 1:  # write all subjects to single file
        this_subj_outfile = subjects[0]
        this_subj_fh = smartopen(this_subj_outfile, 'w')
        print >> sys.stderr, this_subj_outfile
    else:  # write to multiple subject files for parallel execution
        this_subj_outfile = None
        subj_break_at = int(len(all_quality) / (len(subjects)))
        scopy = deepcopy(subjects)

    print >> sys.stderr, 'write sequences'
    gen_queries = []
    gen_subjects = []

    if len(queries) == 1:  # write all queries to single file
        this_outfile = queries[0]
        this_query_fh = smartopen(this_outfile, 'w')
        print >> sys.stderr, this_outfile
    else:  # write to multiple query files for parallel execution
        this_outfile = None
        break_at = int(len(all_quality) / (len(queries)))
        qcopy = deepcopy(queries)

    aqkeys = all_quality.keys()
    random.shuffle(aqkeys)

    for i, s in enumerate(aqkeys):
        c = all_quality[s]['tot']
        qsum = all_quality[s]['sum_quality']
        q = qsum / c

        if len(queries) > 1 and break_at and i % break_at == 0 and len(qcopy) > 0:
            # move to the next query chunk
            if this_outfile:
                gen_queries.append(this_outfile)
                this_query_fh.close()
            this_outfile = qcopy.pop(0)
            print >> sys.stderr, i, this_outfile
            this_query_fh = smartopen(this_outfile, 'w')

        if len(subjects) > 1 and subj_break_at and i % subj_break_at == 0 and len(scopy) > 0:
            # move to the next subject chunk
            if this_subj_outfile:
                gen_subjects.append(this_subj_outfile)
                this_subj_fh.close()
            this_subj_outfile = scopy.pop(0)
            print >> sys.stderr, i, this_subj_outfile
            this_subj_fh = smartopen(this_subj_outfile, 'w')

        if 2 in q:
            first2 = numpy.arange(len(q))[q == 2][0]
        else:
            first2 = len(q)

        if first2 > minlen:
            header = '%s.%s.%s.%s' % (i, c, s[:first2], ''.join([chr(int(n) + 64) for n in q[:first2]]))
            this_query_fh.write('>%s\n%s\n' % (header, s[:first2]))
            mID_fh.write(header + '\t' + ('\t'.join(all_quality[s]['mIDs'])) + '\n')
            if s.startswith(cutsite) and c > 1:
                this_subj_fh.write('>%s\n%s\n' % (header, s[:first2]))

    gen_queries.append(this_outfile)
    this_query_fh.close()
    gen_subjects.append(this_subj_outfile)
    this_subj_fh.close()

    return gen_subjects, gen_queries

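# Pipeline sketch (paths and cutsite hypothetical): partition sequences into four
# query files plus one subject file; the returned lists feed run_local_blat()
# or run_parallel_blat().
def _example_match_prep(all_quality):
    queries = ['work/part%s_query.fa' % n for n in range(4)]
    subjects = ['work/all_subj.fa']
    return preprocess_sequence_for_match(all_quality, 'AATTC', 'work/mID.lookup',
                                         subjects, queries, minlen=20)
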
def readlen_from_uniqued(uniqued):
    return len(smartopen(uniqued).readline().strip().split()[0])

'''
import os, sys
import numpy
from editdist import distance
from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count

idx_bp = 5
cut_bp = 5
lnum = 4
min_seqs = 7

uniqued, fastq = sys.argv[1:]

readlen = len(next_read_from_fh(smartopen(fastq), 4)[1])
print >> sys.stderr, 'readlen: %s' % readlen

num_reads = get_read_count(fastq, 4)
tickon = num_reads / 200

useqs = []
for l in open(uniqued):
    s, cntstr = l.strip().split()[0], l.strip().split()[4]
    cnt = numpy.mean([int(i) for i in cntstr.split(',')])
    if cnt >= min_seqs:
        useqs.append(s[cut_bp:readlen - idx_bp])
useqs = list(set(useqs))

print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (len(useqs), len(s[cut_bp:readlen - idx_bp]))

def load_vcf(vcf, allele_map, indiv_gt_phred_cut=None, ding_on=100000, return_map=False):
    '''processes a vcf file, adding genotypes satisfying GQ cutoff
    indiv_gt_phred_cut to a returned cross genotype object

    sites corresponding to keys in allele_map are retained
    '''
    if return_map:
        new_map = defaultdict(dict)
    else:
        vcf_data = {}

    i = 0
    for line in preprocess_radtag_lane.smartopen(vcf):
        if i % ding_on == 0:
            print >> sys.stderr, 'reading', i
        i += 1

        if line.startswith('#CHROM'):
            headers = line[1:].split()
            exp_elements = len(line.split())
            FORMAT = headers.index('FORMAT')
        elif line.startswith('#'):
            continue
        else:
            # extract site stats
            fields = line.split()
            if len(fields) != exp_elements:
                print >> sys.stderr, 'unexpected length, line %s (exp %s obs %s)' % (i, exp_elements, len(fields))
                continue

            # populate site metrics
            sd = dict(zip(headers[:FORMAT], fields[:FORMAT]))
            loc = '%s.%s' % (sd['CHROM'], sd['POS'])
            key = (sd['CHROM'], sd['POS'])
            if loc not in allele_map:  # not interested; skip!
                continue
            # temp hack for multiallelic sites
            if ',' in sd['ALT']:
                print >> sys.stderr, '!MULTIALLELIC SITE AT %s' % (key,)
                continue
            # temp hack for GQ-absent sites
            if not 'GQ' in fields[FORMAT]:
                print >> sys.stderr, '!GQ NOT CALCULATED AT %s' % (key,)
                continue
            try:
                infostr = sd.pop('INFO')
                sd.update(dict([el.split('=') for el in infostr.split(';') if '=' in el]))
            except KeyError:
                pass

            print >> sys.stderr, '%s found ...' % loc,
            # populate individual genotype metrics provided each GQ >= indiv_gt_phred_cut if defined
            sd['indiv_gt'] = {}
            for ind, gt in zip(headers[FORMAT + 1:], fields[FORMAT + 1:]):
                if not gt.startswith('./.') and ':' in gt:
                    this_gt = dict(zip(fields[FORMAT].split(':'), gt.split(':')))
                    if indiv_gt_phred_cut is None or \
                       float(this_gt['GQ'] != '.' and this_gt['GQ'] or '0') >= indiv_gt_phred_cut:
                        sd['indiv_gt'][ind] = this_gt
                        if return_map:
                            new_map[ind].update({loc: ''.join([allele_map[loc][n]
                                                               for n in sd['indiv_gt'][ind]['GT'].split('/')])})
            if not return_map:
                vcf_data[key] = sd
            print >> sys.stderr, '%s individuals processed' % len(sd['indiv_gt'])

    if return_map:
        return new_map
    else:
        return vcf_data

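# allele_map shape sketch (names hypothetical): keys are '<CHROM>.<POS>' strings;
# values map VCF allele indices ('0' ref, '1' alt) to the genotype letters used
# to build the returned per-individual map.
def _example_load_vcf():
    allele_map = {'chr1.1042': {'0': 'A', '1': 'B'}}
    return load_vcf('calls.vcf.gz', allele_map, indiv_gt_phred_cut=20, return_map=True)
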