def do_reduction(subset, args, nrfuzzykey, location):
    """Report one genepred line per family found through subset relationships.

    Keys of ``nrfuzzykey`` that occur in ``subset`` (as a key or inside one of
    its values) are handled by ``get_subset_evidence``; every remaining key is
    reported as a family of its own.

    Args:
        subset: mapping from a key of ``nrfuzzykey`` to an iterable of keys.
        args: options object.  ``junction_tolerance`` and
            ``output_original_table`` are read here and the object is also
            forwarded to ``get_subset_evidence``.
        nrfuzzykey: mapping from key to fuzzy genepred object.
        location: range accumulated so far, or a falsy value if there is none.

    Returns:
        ``[gpdlines, tablelines, locstring]`` where ``gpdlines`` holds the
        newline-terminated genepred lines, ``tablelines`` holds
        ``reported_name<TAB>source_name`` rows (empty unless
        ``args.output_original_table`` is truthy) and ``locstring`` is the
        range string of the merged location ('' when there is no location).
    """
    # Every key that takes part in a subset relationship, on either side.
    linked = set()
    for parent in subset:
        linked.add(parent)
        linked.update(subset[parent])

    # Keys outside the graph are picked before get_subset_evidence runs,
    # and appended to its result afterwards.
    unlinked = [key for key in nrfuzzykey if key not in linked]
    families = get_subset_evidence(subset, nrfuzzykey, args)
    families.extend(nrfuzzykey[key] for key in unlinked)

    gpd_rows = []
    table_rows = []
    for family in families:
        family.get_info_string()  # return value is not used
        line = family.get_genepred_line()
        entry = GenePredEntry(line)
        if not entry.is_valid():
            sys.stderr.write(
                "WARNING: invalid genepred entry generated\n"
                + line + "\n" + family.get_info_string() + "\n"
            )
            # Fall back to the source entry with the highest exon count and
            # rebuild the fuzzy object from it with a doubled tolerance.
            entry = sorted(
                family.gpds, key=lambda x: x.get_exon_count(), reverse=True
            )[0]
            family = FuzzyGenePred(entry, juntol=args.junction_tolerance * 2)
            line = family.get_genepred_line()
        if not entry.is_valid():
            sys.stderr.write("WARNING: still problem skilling\n")
            continue

        gpd_rows.append(line + "\n")
        if args.output_original_table:
            reported = entry.entry['name']
            table_rows.extend(
                reported + "\t" + source.entry['name'] + "\n"
                for source in family.gpds
            )

        # Grow the overall location by this entry's range, ignoring direction.
        span = entry.get_bed()
        span.direction = None
        if not location:
            location = span
        location = location.merge(span)

    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_rows), "".join(table_rows), locstring]
def do_reduction(subset, args, nrfuzzykey, location):
    """Build the genepred output for a locus from its subset graph.

    Args:
        subset: mapping from a key of ``nrfuzzykey`` to an iterable of keys.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read here, and the object is passed
            on to ``get_subset_evidence``.
        nrfuzzykey: mapping from key to fuzzy genepred object.
        location: range gathered so far (falsy when there is none yet).

    Returns:
        A three-item list: the concatenated genepred lines, the concatenated
        ``reported_name<TAB>source_name`` table rows (only filled when
        ``args.output_original_table`` is truthy) and the range string of the
        merged location, or '' when no location exists.
    """
    in_graph = set()
    for key in subset:
        in_graph.add(key)
        for member in subset[key]:
            in_graph.add(member)

    # Entries that never appear in the graph are reported unchanged.
    singles = [num for num in nrfuzzykey if num not in in_graph]

    families = get_subset_evidence(subset, nrfuzzykey, args)
    for num in singles:
        families.append(nrfuzzykey[num])

    gpd_chunks = []
    table_chunks = []
    for fuzzy in families:
        fuzzy.get_info_string()  # result is discarded
        text = fuzzy.get_genepred_line()
        candidate = GenePredEntry(text)
        if not candidate.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" + text + "\n" + fuzzy.get_info_string() + "\n")
            # Use the source entry with the most exons instead.
            by_exons = sorted(fuzzy.gpds, key=lambda x: x.get_exon_count(), reverse=True)
            candidate = by_exons[0]
            fuzzy = FuzzyGenePred(candidate, juntol=args.junction_tolerance * 2)
            text = fuzzy.get_genepred_line()
        if not candidate.is_valid():
            sys.stderr.write("WARNING: still problem skilling\n")
            continue

        gpd_chunks.append(text + "\n")
        if args.output_original_table:
            label = candidate.entry['name']
            for original in fuzzy.gpds:
                table_chunks.append(label + "\t" + original.entry['name'] + "\n")

        # Direction is cleared before the range is merged into the location.
        rng = candidate.get_bed()
        rng.direction = None
        if not location:
            location = rng
        location = location.merge(rng)

    if location:
        locstring = location.get_range_string()
    else:
        locstring = ''
    return ["".join(gpd_chunks), "".join(table_chunks), locstring]
def evaluate_junctions(fz, sr, args):
    """Split a fuzzy genepred into fragments whose junctions are supported.

    Each junction of ``fz`` gets its coordinate observations replaced by those
    of the overlapping junctions in ``sr`` (overlap is tested with
    ``args.junction_tolerance``; each ``sr`` junction contributes
    ``min(cnt, args.downsample)`` observations).  A junction is kept when it
    has at least ``args.required_evidence`` observations, and every maximal
    run of consecutive kept junctions is written as one genepred fragment.

    Args:
        fz: fuzzy genepred exposing ``gpds``, ``fuzzy_junctions`` and
            ``copy()``.  ``fz`` itself is only read; the copy is modified.
        sr: mapping whose values are dicts with the keys ``'fzjun'`` (a fuzzy
            junction) and ``'cnt'`` (an integer count).
        args: options object; ``junction_tolerance``, ``downsample`` and
            ``required_evidence`` are read.

    Returns:
        A list of ``[part, source_names]`` pairs.  ``part`` is a tab-separated
        string (chromosome, '+', start, end, start, end, exon count, exon
        starts, exon ends) and ``source_names`` is the list of
        ``entry['name']`` values of ``fz.gpds`` (the same list object in every
        pair).  Returns ``[]`` when there are no junctions.  Fragments that
        fail ``GenePredEntry.is_valid()`` are skipped with a warning on
        stderr.
    """
    source_names = [g.entry['name'] for g in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Replace the observations of every junction in the copy with the
    # coordinates of the sr junctions that overlap the original junction.
    for idx, newjun in enumerate(working.fuzzy_junctions):
        newjun.left.get_payload()['junc'] = []
        newjun.right.get_payload()['junc'] = []
        oldjun = fz.fuzzy_junctions[idx]
        for key in sr:
            sjun = sr[key]['fzjun']
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            # A throwaway loop variable is used so the junction index is
            # never rebound by the repeat loop.
            for _ in range(min(sr[key]['cnt'], args.downsample)):
                newjun.left.get_payload()['junc'].append(
                    sjun.left.get_payload()['junc'][0])
                newjun.right.get_payload()['junc'].append(
                    sjun.right.get_payload()['junc'][0])

    junctions = working.fuzzy_junctions
    total = len(fz.fuzzy_junctions)
    juncs = []
    starts = []
    ends = []
    evidences = []
    for idx in range(total):
        jun = junctions[idx]
        left_payload = jun.left.get_payload()
        evidence = len(left_payload['junc'])
        evidences.append(evidence)
        if evidence < args.required_evidence:
            # Placeholders keep the three lists aligned with the junction
            # index; they are never read because such indices join no run.
            starts.append([])
            ends.append([])
            juncs.append([])
            continue

        if idx == 0:
            starts.append(working.start.start)
        elif left_payload['start']:
            starts.append(left_payload['start'].start)
        else:
            starts.append(junctions[idx - 1].right.start)

        right_payload = jun.right.get_payload()
        if idx == total - 1:
            ends.append(working.end.end)
        elif right_payload['end']:
            ends.append(right_payload['end'].end)
        else:
            ends.append(junctions[idx + 1].left.end)

        bestleft = GenePredFuzzyBasics.mode(left_payload['junc'])
        bestright = GenePredFuzzyBasics.mode(right_payload['junc'])
        juncs.append([bestleft, bestright])

    # Group consecutive supported junction indices into runs.
    runs = []
    current_run = []
    for idx, evidence in enumerate(evidences):
        if evidence >= args.required_evidence:
            current_run.append(idx)
        elif current_run:
            runs.append(current_run)
            current_run = []
    if current_run:
        runs.append(current_run)

    parts = []
    for run in runs:
        # Start coordinates are put back to zero index.
        exon_starts = [starts[run[0]] - 1] + [juncs[idx][1] - 1 for idx in run]
        exon_ends = [juncs[idx][0] for idx in run] + [ends[run[-1]]]
        part = "\t".join([
            str(working.start.chr),
            '+',
            str(exon_starts[0]),
            str(exon_ends[-1]),
            str(exon_starts[0]),
            str(exon_ends[-1]),
            str(len(exon_starts)),
            ','.join(str(x) for x in exon_starts) + ',',
            ','.join(str(x) for x in exon_ends) + ',',
        ])
        # Final quality check: parse the fragment with placeholder names.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" +
                             gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
def evaluate_junctions(fz, sr, args):
    """Return the genepred fragments of ``fz`` whose junctions have evidence.

    The junctions of a copy of ``fz`` have their ``"junc"`` observation lists
    rebuilt from the junctions in ``sr`` that overlap the corresponding
    junction of ``fz`` (within ``args.junction_tolerance``).  Junctions with
    at least ``args.required_evidence`` observations are kept; consecutive
    kept junctions form a run, and each run becomes one fragment.

    Args:
        fz: fuzzy genepred; ``gpds``, ``fuzzy_junctions`` and ``copy()`` are
            used.
        sr: mapping whose values are dicts holding ``"fzjun"`` (a fuzzy
            junction) and ``"cnt"`` (an integer count); at most
            ``args.downsample`` observations are taken per entry.
        args: options object providing ``junction_tolerance``, ``downsample``
            and ``required_evidence``.

    Returns:
        List of ``[part, source_names]`` pairs, where ``part`` is the
        tab-separated fragment text without its two leading name columns and
        ``source_names`` lists ``entry["name"]`` for every item of
        ``fz.gpds``.  An empty list is returned when ``fz`` has no junctions.
        Invalid fragments are reported on stderr and left out.
    """
    source_names = [x.entry["name"] for x in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    for position, newjun in enumerate(working.fuzzy_junctions):
        newjun.left.get_payload()["junc"] = []
        newjun.right.get_payload()["junc"] = []
        oldjun = fz.fuzzy_junctions[position]
        for srjun in sr:
            sjun = sr[srjun]["fzjun"]
            if oldjun.overlaps(sjun, args.junction_tolerance):
                # The repeat counter is deliberately unnamed: it must not
                # share a name with the junction position.
                for _ in range(min(sr[srjun]["cnt"], args.downsample)):
                    newjun.left.get_payload()["junc"].append(sjun.left.get_payload()["junc"][0])
                    newjun.right.get_payload()["junc"].append(sjun.right.get_payload()["junc"][0])

    # Per supported junction index: fragment start, fragment end and the
    # [left, right] junction coordinates chosen by GenePredFuzzyBasics.mode.
    starts = {}
    ends = {}
    juncs = {}
    runs = []
    current_run = []
    count = len(fz.fuzzy_junctions)
    for position in range(count):
        jun = working.fuzzy_junctions[position]
        left_payload = jun.left.get_payload()
        if len(left_payload["junc"]) < args.required_evidence:
            # An unsupported junction closes the run in progress, if any.
            if current_run:
                runs.append(current_run)
                current_run = []
            continue

        if position == 0:
            starts[position] = working.start.start
        elif left_payload["start"]:
            starts[position] = left_payload["start"].start
        else:
            starts[position] = working.fuzzy_junctions[position - 1].right.start

        right_payload = jun.right.get_payload()
        if position == count - 1:
            ends[position] = working.end.end
        elif right_payload["end"]:
            ends[position] = right_payload["end"].end
        else:
            ends[position] = working.fuzzy_junctions[position + 1].left.end

        juncs[position] = [
            GenePredFuzzyBasics.mode(left_payload["junc"]),
            GenePredFuzzyBasics.mode(right_payload["junc"]),
        ]
        current_run.append(position)
    if current_run:
        runs.append(current_run)

    parts = []
    for run in runs:
        sarr = [starts[run[0]] - 1]  # put back to zero index
        earr = []
        for position in run:
            sarr.append(juncs[position][1] - 1)
            earr.append(juncs[position][0])
        earr.append(ends[run[-1]])

        fields = [
            str(working.start.chr),
            "+",
            str(sarr[0]),
            str(earr[-1]),
            str(sarr[0]),
            str(earr[-1]),
            str(len(sarr)),
            ",".join([str(x) for x in sarr]) + ",",
            ",".join([str(x) for x in earr]) + ",",
        ]
        part = "\t".join(fields)

        # Final quality check here
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" + gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
def do_prediction(compatible, args, nrfuzzykey, location):
    """Build genepred output for a locus by joining compatible entries.

    Every entry of ``nrfuzzykey`` starts as a family of its own and gets
    ``params['proper_set']`` set to ``False`` (this modifies the caller's
    objects).  For each pair listed in ``compatible`` whose members share at
    least one identical genepred line, the result of ``concat_fuzzy_gpd`` is
    added as a further family when it is truthy.  Families that compare equal
    with ``is_equal_fuzzy`` are then merged, and the ``gpds`` of each family
    are replaced by one ``GenePredEntry`` per distinct line.

    NOTE(review): pairs that share no line are skipped, although the original
    inline comment ("see if its already in there") could be read the other
    way round; confirm the direction of that test against the callers.

    Args:
        compatible: mapping from a key of ``nrfuzzykey`` to an iterable of
            keys.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read.
        nrfuzzykey: mapping from key to fuzzy genepred object.
        location: range accumulated so far, or a falsy value if there is none.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the ``reported_name<TAB>source_name`` rows (empty unless
        ``args.output_original_table`` is truthy) and the range string of the
        merged location ('' when there is no location).
    """
    # all reads could be standing alone version
    families = []
    for key in nrfuzzykey:
        families.append(nrfuzzykey[key])
        nrfuzzykey[key].params['proper_set'] = False  # partial overlap is enough

    for i in compatible:
        for j in compatible[i]:
            first_lines = {g.get_line() for g in nrfuzzykey[i].gpds}
            if not any(g.get_line() in first_lines for g in nrfuzzykey[j].gpds):
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # now we need to find any duplicate entries and combine them
    merged = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if not fam.is_equal_fuzzy(other):
                remaining.append(other)
                continue
            added = fam.add_fuzzy_gpd(other)
            if added:
                fam = added
            else:
                # A failed merge used to overwrite fam with the falsy result,
                # which broke the following comparisons and lost both
                # families.  Keep fam and leave the other family in the pool.
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                remaining.append(other)
        families = remaining
        merged.append(fam)
    families = merged

    # Replace the family with a set where we haven't used the same gpd line
    # twice.  This may damage the fuzzy object.
    for fam in families:
        distinct_lines = {g.get_line() for g in fam.gpds}
        fam.gpds = [GenePredEntry(line) for line in distinct_lines]

    gpd_rows = []
    table_rows = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n"
                             + gpdline + "\n" + fz.get_info_string() + "\n")
            # just grab one that has all the exons
            gpd = sorted(fz.gpds, key=lambda x: x.get_exon_count(),
                         reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue

        gpd_rows.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            table_rows.extend(name + "\t" + g.entry['name'] + "\n"
                              for g in fz.gpds)

        # Grow the overall location by this entry's range, ignoring direction.
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_rows), "".join(table_rows), locstring]
def _merge_equal_families(families):
    """Merge families that compare equal with ``is_equal_fuzzy``.

    Returns a new list; the input list is emptied in the process.  When
    ``add_fuzzy_gpd`` reports a failure (falsy result) a warning is written to
    stderr and both families are kept unmerged.
    """
    merged = []
    while len(families) > 0:
        current = families.pop(0)
        leftover = []
        for candidate in families:
            if fam_equal(current, candidate):
                combined = current.add_fuzzy_gpd(candidate)
                if not combined:
                    # Previously the falsy result replaced the family, so the
                    # next comparison or later attribute access failed.
                    sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                    leftover.append(candidate)
                else:
                    current = combined
            else:
                leftover.append(candidate)
        families = leftover
        merged.append(current)
    return merged


def fam_equal(first, second):
    """Return the result of ``first.is_equal_fuzzy(second)``."""
    return first.is_equal_fuzzy(second)


def do_prediction(compatible, args, nrfuzzykey, location):
    """Predict genepred entries for a locus from compatible fuzzy entries.

    Each entry of ``nrfuzzykey`` is a family by itself and has
    ``params['proper_set']`` set to ``False`` on the caller's object.  Pairs
    from ``compatible`` that have at least one genepred line in common are
    joined with ``concat_fuzzy_gpd`` and, when the result is truthy, added as
    an extra family.  Equal families are merged and the ``gpds`` of every
    family are rebuilt with one ``GenePredEntry`` per distinct line.

    NOTE(review): only pairs sharing a line are joined; the original inline
    comment ("see if its already in there") does not make the intent clear,
    so confirm this against the callers.

    Args:
        compatible: mapping from a key of ``nrfuzzykey`` to an iterable of
            keys.
        args: options object providing ``junction_tolerance`` and
            ``output_original_table``.
        nrfuzzykey: mapping from key to fuzzy genepred object.
        location: range gathered so far (falsy when there is none yet).

    Returns:
        A three-item list: the concatenated genepred lines, the concatenated
        ``reported_name<TAB>source_name`` rows (only filled when
        ``args.output_original_table`` is truthy) and the range string of the
        merged location, or '' when no location exists.
    """
    # all reads could be standing alone version
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params['proper_set'] = False  # partial overlap is enough

    for i in compatible:
        for j in compatible[i]:
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            shares_line = False
            for g2 in nrfuzzykey[j].gpds:
                if g2.get_line() in g1lines:
                    shares_line = True
                    break
            if not shares_line:
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # now we need to find any duplicate entries and combine them
    families = _merge_equal_families(families)

    # Replace the family with a set where we haven't used the same gpd line
    # twice.  This may damage the fuzzy object.
    for family in families:
        gset = set()
        for g in family.gpds:
            gset.add(g.get_line())
        family.gpds = [GenePredEntry(x) for x in gset]

    gpd_chunks = []
    table_chunks = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # just grab one that has all the exons
            gpd = sorted(
                fz.gpds, key=lambda x: x.get_exon_count(), reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue

        gpd_chunks.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_chunks.append(name + "\t" + g.entry['name'] + "\n")

        # Direction is cleared before the range is merged into the location.
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = ''
    if location:
        locstring = location.get_range_string()
    return ["".join(gpd_chunks), "".join(table_chunks), locstring]