def process_locus(lr, srin, args):
    """Evaluate every combined fuzzy gene prediction of one locus.

    Args:
        lr: sized collection handed to ``get_total_range`` and to
            ``GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list``
            (presumably the long-read genePred entries of the locus).
        srin: iterable of entries, each wrapped in a ``FuzzyGenePred``
            (presumably the short-read genePred entries).
        args: options object; ``args.junction_tolerance`` is read here and
            the whole object is forwarded to ``do_fuzzy``.

    Returns:
        ``None`` when ``lr`` is empty, otherwise ``[outputs, totalrange]``
        where ``outputs`` concatenates the results of ``do_fuzzy`` for each
        combined fuzzy gene prediction and ``totalrange`` is
        ``get_total_range(lr)``.
    """
    if len(lr) == 0:
        return None
    totalrange = get_total_range(lr)

    # Tally the short-read junctions.  They are keyed by
    # "chr:left_end,right_end"; the first junction object seen for a key is
    # kept under 'fzjun' and every occurrence increments 'cnt'.
    sr = {}
    for srgpd in srin:
        srfz = GenePredFuzzyBasics.FuzzyGenePred(
            srgpd, juntol=args.junction_tolerance)
        for j in srfz.fuzzy_junctions:
            junstr = j.left.chr + ':' + str(j.left.end) + ',' + str(
                j.right.end)
            record = sr.setdefault(junstr, {'cnt': 0, 'fzjun': j})
            record['cnt'] += 1

    fzs = GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list(
        lr, args.junction_tolerance)

    # Evaluate each combined prediction in turn and flatten the results.
    outputs = []
    for fz in fzs:
        outputs.extend(do_fuzzy(fz, sr, args))
    return [outputs, totalrange]
def process_locus(lr, srin, args):
    """Evaluate every combined fuzzy gene prediction of one locus.

    Args:
        lr: sized collection handed to ``get_total_range`` and to
            ``GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list``
            (presumably the long-read genePred entries of the locus).
        srin: iterable of entries, each wrapped in a ``FuzzyGenePred``
            (presumably the short-read genePred entries).
        args: options object; ``args.junction_tolerance`` is read here and
            the whole object is forwarded to ``do_fuzzy``.

    Returns:
        ``None`` when ``lr`` is empty, otherwise ``[outputs, totalrange]``
        where ``outputs`` concatenates the results of ``do_fuzzy`` for each
        combined fuzzy gene prediction and ``totalrange`` is
        ``get_total_range(lr)``.
    """
    if len(lr) == 0:
        return None
    totalrange = get_total_range(lr)

    # Count short-read junctions per "chr:left_end,right_end" key, keeping
    # the first junction object seen for each key.
    sr = {}
    for srgpd in srin:
        srfz = GenePredFuzzyBasics.FuzzyGenePred(srgpd, juntol=args.junction_tolerance)
        for j in srfz.fuzzy_junctions:
            junstr = j.left.chr + ":" + str(j.left.end) + "," + str(j.right.end)
            record = sr.get(junstr)
            if record is None:
                record = {"cnt": 0, "fzjun": j}
                sr[junstr] = record
            record["cnt"] += 1

    fzs = GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list(lr, args.junction_tolerance)

    # One do_fuzzy call per combined prediction, flattened into one list.
    outputs = [o for fz in fzs for o in do_fuzzy(fz, sr, args)]
    return [outputs, totalrange]
def main():
    """Stream loci from the input and send each processed locus to out_gpds.

    Options come from ``do_inputs()``.  ``args.output`` is stored in the
    module-level ``gout``.  Each locus read from ``args.input`` is run
    through ``process_buffer``; with ``args.threads > 1`` the work is
    submitted to a ``Pool`` with ``out_gpds`` as the callback, otherwise it
    is processed inline and ``out_gpds`` is called directly.  A newline is
    written to stderr once everything has finished.
    """
    global gout
    args = do_inputs()
    gout = args.output
    gls = GenePredBasics.GenePredLocusStream(args.input)
    # NOTE: this object is never used below; the construction is kept so
    # that any side effect of the constructor is preserved.
    fgs = GenePredFuzzyBasics.FuzzyGenePredSeparator()

    if args.threads > 1:
        p = Pool(processes=args.threads)

    # Priming read, then loop until read_locus() returns something falsy.
    locus = gls.read_locus()
    while locus:
        if args.threads > 1:
            p.apply_async(process_buffer, args=(locus, args),
                          callback=out_gpds)
        else:
            out_gpds(process_buffer(locus, args))
        locus = gls.read_locus()

    if args.threads > 1:
        # Stop accepting work and wait for the outstanding tasks.
        p.close()
        p.join()
    sys.stderr.write("\n")
def evaluate_junctions(fz, sr, args):
    """Build genePred-style lines for the supported junction runs of *fz*.

    Works on ``fz.copy()``.  For every junction the 'junc' payload lists of
    its left and right sides are reset and refilled from the entries of
    *sr* that overlap the corresponding junction of *fz*.  A junction is
    supported when its left 'junc' list holds at least
    ``args.required_evidence`` items.  Each maximal run of consecutive
    supported junctions is turned into one tab-delimited line.

    Args:
        fz: object providing ``gpds``, ``copy()`` and ``fuzzy_junctions``
            (presumably a FuzzyGenePred - confirm against callers).
        sr: dict whose values are dicts with a 'cnt' count and a 'fzjun'
            junction object.
        args: options; ``junction_tolerance``, ``downsample`` and
            ``required_evidence`` are read.

    Returns:
        A list of ``[part, source_names]`` pairs, where ``part`` is the
        tab-delimited line (without name columns) and ``source_names`` is
        the list of ``entry['name']`` values of ``fz.gpds``.  Empty when
        there are no junctions or none is supported.  Lines rejected by
        ``GenePredEntry.is_valid()`` are reported on stderr and skipped.
    """
    source_names = [gpd.entry['name'] for gpd in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Pass 1: refill the 'junc' payloads of the working copy.
    for idx, newjun in enumerate(working.fuzzy_junctions):
        newjun.left.get_payload()['junc'] = []
        newjun.right.get_payload()['junc'] = []
        oldjun = fz.fuzzy_junctions[idx]
        for record in sr.values():
            sjun = record['fzjun']
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            # Each overlapping junction contributes its first 'junc' value
            # once per observation, capped at args.downsample.
            copies = min(record['cnt'], args.downsample)
            if copies > 0:
                newjun.left.get_payload()['junc'].extend(
                    [sjun.left.get_payload()['junc'][0]] * copies)
                newjun.right.get_payload()['junc'].extend(
                    [sjun.right.get_payload()['junc'][0]] * copies)

    # Pass 2: per junction, record the evidence and, when supported, the
    # flanking start/end and the most common left/right junction value.
    # Unsupported junctions get [] placeholders so indexes stay aligned.
    total = len(fz.fuzzy_junctions)
    juncs = []
    starts = []
    ends = []
    evidences = []
    for idx in range(total):
        junction = working.fuzzy_junctions[idx]
        left_payload = junction.left.get_payload()
        evidence = len(left_payload['junc'])
        evidences.append(evidence)
        if evidence < args.required_evidence:
            starts.append([])
            ends.append([])
            juncs.append([])
            continue

        if idx == 0:
            starts.append(working.start.start)
        elif left_payload['start']:
            starts.append(left_payload['start'].start)
        else:
            starts.append(working.fuzzy_junctions[idx - 1].right.start)

        right_payload = junction.right.get_payload()
        if idx == total - 1:
            ends.append(working.end.end)
        elif right_payload['end']:
            ends.append(right_payload['end'].end)
        else:
            ends.append(working.fuzzy_junctions[idx + 1].left.end)

        bestleft = GenePredFuzzyBasics.mode(left_payload['junc'])
        bestright = GenePredFuzzyBasics.mode(right_payload['junc'])
        juncs.append([bestleft, bestright])

    # Pass 3: group consecutive supported junction indexes into runs.
    runs = []
    current_run = []
    for idx, evidence in enumerate(evidences):
        if evidence >= args.required_evidence:
            current_run.append(idx)
        elif current_run:
            runs.append(current_run)
            current_run = []
    if current_run:
        runs.append(current_run)

    # Pass 4: one line per run.
    parts = []
    for run in runs:
        sarr = [starts[run[0]] - 1]  # put back to zero index
        sarr.extend(juncs[j][1] - 1 for j in run)
        earr = [juncs[j][0] for j in run]
        earr.append(ends[run[-1]])
        # The strand column is always written as '+'.
        part = "\t".join([
            str(working.start.chr),
            '+',
            str(sarr[0]),
            str(earr[-1]),
            str(sarr[0]),
            str(earr[-1]),
            str(len(sarr)),
            ','.join([str(x) for x in sarr]) + ',',
            ','.join([str(x) for x in earr]) + ',',
        ])
        # Final quality check: parse with placeholder name columns.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" +
                             gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
def evaluate_junctions(fz, sr, args):
    """Build genePred-style lines for the supported junction runs of *fz*.

    Works on ``fz.copy()``.  For every junction the "junc" payload lists of
    its left and right sides are reset and refilled from the entries of
    *sr* that overlap the corresponding junction of *fz*.  A junction is
    supported when its left "junc" list holds at least
    ``args.required_evidence`` items.  Each maximal run of consecutive
    supported junctions is turned into one tab-delimited line.

    Args:
        fz: object providing ``gpds``, ``copy()`` and ``fuzzy_junctions``
            (presumably a FuzzyGenePred - confirm against callers).
        sr: dict whose values are dicts with a "cnt" count and a "fzjun"
            junction object.
        args: options; ``junction_tolerance``, ``downsample`` and
            ``required_evidence`` are read.

    Returns:
        A list of ``[part, source_names]`` pairs, where ``part`` is the
        tab-delimited line (without name columns) and ``source_names`` is
        the list of ``entry["name"]`` values of ``fz.gpds``.  Empty when
        there are no junctions or none is supported.  Lines rejected by
        ``GenePredEntry.is_valid()`` are reported on stderr and skipped.
    """
    source_names = [gpd.entry["name"] for gpd in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Step 1: reset and refill the "junc" payloads on the working copy.
    for pos in range(len(working.fuzzy_junctions)):
        newjun = working.fuzzy_junctions[pos]
        newjun.left.get_payload()["junc"] = []
        newjun.right.get_payload()["junc"] = []
        oldjun = fz.fuzzy_junctions[pos]
        for key in sr:
            sjun = sr[key]["fzjun"]
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            # One copy per observation, capped at args.downsample.
            for _ in range(min(sr[key]["cnt"], args.downsample)):
                newjun.left.get_payload()["junc"].append(sjun.left.get_payload()["junc"][0])
                newjun.right.get_payload()["junc"].append(sjun.right.get_payload()["junc"][0])

    # Step 2: collect evidence, boundaries and best junction values.
    # Unsupported junctions get [] placeholders so indexes stay aligned.
    last_pos = len(fz.fuzzy_junctions) - 1
    juncs, starts, ends, evidences = [], [], [], []
    for pos in range(last_pos + 1):
        here = working.fuzzy_junctions[pos]
        evidence = len(here.left.get_payload()["junc"])
        evidences.append(evidence)
        if evidence < args.required_evidence:
            starts.append([])
            ends.append([])
            juncs.append([])
            continue

        if pos == 0:
            start_value = working.start.start
        elif here.left.get_payload()["start"]:
            start_value = here.left.get_payload()["start"].start
        else:
            start_value = working.fuzzy_junctions[pos - 1].right.start
        starts.append(start_value)

        if pos == last_pos:
            end_value = working.end.end
        elif here.right.get_payload()["end"]:
            end_value = here.right.get_payload()["end"].end
        else:
            end_value = working.fuzzy_junctions[pos + 1].left.end
        ends.append(end_value)

        juncs.append(
            [
                GenePredFuzzyBasics.mode(here.left.get_payload()["junc"]),
                GenePredFuzzyBasics.mode(here.right.get_payload()["junc"]),
            ]
        )

    # Step 3: split the junction indexes into runs of supported junctions.
    runs = []
    current_run = []
    for pos, evidence in enumerate(evidences):
        if evidence >= args.required_evidence:
            current_run.append(pos)
            continue
        if current_run:
            runs.append(current_run)
        current_run = []
    if current_run:
        runs.append(current_run)

    # Step 4: emit one line per run.
    parts = []
    for run in runs:
        sarr = [starts[run[0]] - 1] + [juncs[j][1] - 1 for j in run]  # put back to zero index
        earr = [juncs[j][0] for j in run] + [ends[run[-1]]]
        # The strand column is always written as "+".
        columns = [
            str(working.start.chr),
            "+",
            str(sarr[0]),
            str(earr[-1]),
            str(sarr[0]),
            str(earr[-1]),
            str(len(sarr)),
            ",".join([str(x) for x in sarr]) + ",",
            ",".join([str(x) for x in earr]) + ",",
        ]
        part = "\t".join(columns)
        # Final quality check: parse with placeholder name columns.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if gpd.is_valid():
            parts.append([part, source_names])
        else:
            sys.stderr.write("\nWARNING skipping invalid GPD\n" + gpd.get_line() + "\n")
    return parts
def process_buffer(buffer, args):
    """Combine one locus buffer into fuzzy gene predictions.

    Calls ``GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list`` on
    *buffer* with ``args.junction_tolerance``.

    Returns:
        ``[fzs, args]`` - the combined list together with the unchanged
        *args* object, so the consumer of the result receives the options
        as well.
    """
    tolerance = args.junction_tolerance
    combined = GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list(
        buffer, tolerance)
    return [combined, args]
def process_buffer(buffer, args):
    """Combine one locus buffer into fuzzy gene predictions.

    Calls ``GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list`` on
    *buffer* with ``args.junction_tolerance``.

    Returns:
        ``[fzs, args]`` - the combined list together with the unchanged
        *args* object, so the consumer of the result receives the options
        as well.
    """
    return [
        GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list(
            buffer, args.junction_tolerance
        ),
        args,
    ]