def do_reduction(subset, args, nrfuzzykey, location):
    """Render the reduced families of a locus as genepred and table text.

    Families are obtained from ``get_subset_evidence``.  Every entry of
    *nrfuzzykey* whose key is neither a key of *subset* nor listed under any
    key of *subset* is appended afterwards as a family of its own.

    Args:
        subset: Mapping from a key to an iterable of keys.
        args: Options object; ``junction_tolerance`` and
            ``output_original_table`` are read here.
        nrfuzzykey: Mapping from a key to a fuzzy gene prediction object.
        location: Falsy, or a range object offering ``merge`` and
            ``get_range_string``; it is extended by every emitted entry.

    Returns:
        ``[gpdlines, tablelines, locstring]`` where the first two are
        newline-terminated text blocks and the last is ``''`` when no
        location is available.
    """
    # Keys that take part in any subset relationship, on either side.
    linked = set()
    for parent in subset:
        linked.add(parent)
        linked.update(subset[parent])
    standalone = [key for key in nrfuzzykey if key not in linked]

    families = get_subset_evidence(subset, nrfuzzykey, args)
    # Entries that are not part of the graph stand alone.
    for key in standalone:
        families.append(nrfuzzykey[key])

    gpd_parts = []
    table_parts = []
    for fz in families:
        # NOTE(review): the result is discarded; the call can be dropped if
        # get_info_string() is confirmed to have no side effects.
        fz.get_info_string()
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to a single member: just grab one that has all the
            # exons (the first of those with the highest exon count).
            by_exons = list(fz.gpds)
            by_exons.sort(key=lambda x: x.get_exon_count(), reverse=True)
            gpd = by_exons[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_parts.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for member in fz.gpds:
                table_parts.append(name + "\t" + member.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_parts), "".join(table_parts), locstring]
def main():
    """Join the entries of one genepred file against another.

    All entries of ``gpd_right`` are loaded into memory as FuzzyGenePred
    objects.  ``gpd_left`` is then streamed line by line; each entry is
    passed to ``get_compatible`` together with the in-memory right entries
    and its 1-based line number, and the result is handed to ``do_output``.
    With ``--threads`` greater than one the calls are dispatched through a
    ``Pool`` and ``do_output`` runs as the completion callback.

    Output goes to the module-level ``of`` handle: the file named by
    ``-o/--output`` when given, otherwise ``sys.stdout``.
    """
    global of

    parser = argparse.ArgumentParser(description="do join of gpd overlaps")
    parser.add_argument('gpd_left')
    parser.add_argument('gpd_right')
    parser.add_argument('-j', '--junction_tolerance', default=5, type=int)
    parser.add_argument('--threads', type=int, default=1)
    parser.add_argument('--left_outer_join', action='store_true')
    parser.add_argument('-o', '--output')
    args = parser.parse_args()

    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    try:
        # Read the right-hand file into memory.
        rightlines = []
        with open(args.gpd_right) as inf:
            for z, line in enumerate(inf, 1):
                if z % 100 == 0:
                    sys.stderr.write("reading in " + str(z) + " \r")
                fgpd = FuzzyGenePred(GPD(line.rstrip()),
                                     juntol=args.junction_tolerance)
                rightlines.append(fgpd)
        sys.stderr.write("\n")
        sys.stderr.write("finished reading in " + str(len(rightlines)) +
                         " gpd entries\n")

        pool = None
        if args.threads > 1:
            pool = Pool(processes=args.threads)
        try:
            with open(args.gpd_left) as inf:
                for i, line in enumerate(inf, 1):
                    fgpd1 = FuzzyGenePred(GPD(line.rstrip()),
                                          juntol=args.junction_tolerance)
                    if pool is not None:
                        pool.apply_async(get_compatible,
                                         args=(rightlines, fgpd1, i, args),
                                         callback=do_output)
                    else:
                        do_output(get_compatible(rightlines, fgpd1, i, args))
        finally:
            # Wait for outstanding work before the output handle goes away,
            # also when reading the left file failed part-way through.
            if pool is not None:
                pool.close()
                pool.join()
    finally:
        # Never close the interpreter's stdout; only close a file that was
        # opened here.
        if of is sys.stdout:
            of.flush()
        else:
            of.close()
def do_reduction(subset, args, nrfuzzykey, location):
    """Produce genepred lines, name-table lines and a range string.

    The families returned by ``get_subset_evidence`` are extended with every
    *nrfuzzykey* entry whose key does not occur anywhere in *subset* (as a
    key or inside one of its value collections).  Each family is then
    rendered to one genepred line.

    Args:
        subset: Mapping from a key to an iterable of keys.
        args: Options object; reads ``junction_tolerance`` and
            ``output_original_table``.
        nrfuzzykey: Mapping from a key to a fuzzy gene prediction object.
        location: Falsy, or a range object with ``merge`` and
            ``get_range_string``; grown by each rendered entry.

    Returns:
        The list ``[gpdlines, tablelines, locstring]``; ``locstring`` is
        ``''`` when there is no location to report.
    """
    in_graph = set()
    for src in subset:
        in_graph.add(src)
        for dst in subset[src]:
            in_graph.add(dst)
    singles = [num for num in nrfuzzykey if num not in in_graph]

    families = get_subset_evidence(subset, nrfuzzykey, args)
    # Add the entries that are not in the graph.
    for num in singles:
        families.append(nrfuzzykey[num])

    out_gpd = []
    out_table = []
    for fuzzy in families:
        # NOTE(review): return value unused; kept as a bare call in case
        # get_info_string() has side effects - confirm and remove.
        fuzzy.get_info_string()
        line = fuzzy.get_genepred_line()
        entry = GenePredEntry(line)
        if not entry.is_valid():
            message = ("WARNING: invalid genepred entry generated\n" + line +
                       "\n" + fuzzy.get_info_string() + "\n")
            sys.stderr.write(message)
            # Just grab one member that has all the exons.
            entry = sorted(fuzzy.gpds,
                           key=lambda x: x.get_exon_count(),
                           reverse=True)[0]
            fuzzy = FuzzyGenePred(entry, juntol=args.junction_tolerance * 2)
            line = fuzzy.get_genepred_line()
            if not entry.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        out_gpd.append(line + "\n")
        if args.output_original_table:
            name = entry.entry['name']
            out_table.extend(
                [name + "\t" + g.entry['name'] + "\n" for g in fuzzy.gpds])
        grng = entry.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    if location:
        locstring = location.get_range_string()
    else:
        locstring = ''
    return ["".join(out_gpd), "".join(out_table), locstring]
def get_nr_fuzzy(nrlocuskey, args):
    """Collapse each group of gene predictions into one fuzzy prediction.

    For every key of *nrlocuskey* the associated entries are wrapped in
    FuzzyGenePred objects (with twice the configured junction tolerance) and
    combined with ``greedy_combine_down_fuzzies``.  Only the first combined
    result is kept; when more than one comes back, the module-level
    ``warning_count`` is incremented under ``glock`` and, if
    ``args.verbose`` is set, a warning is written to stderr.

    Args:
        nrlocuskey: Mapping from a key to an iterable of gene prediction
            entries.
        args: Options object; reads ``junction_tolerance`` and ``verbose``.

    Returns:
        A dict mapping each key of *nrlocuskey* to its first combined fuzzy
        prediction.
    """
    global warning_count
    global glock
    nrfuzzykey = {}
    for num in nrlocuskey:
        # Create FuzzyGenePreds out of all of the GPDs and reduce the set of
        # many gpds down to a single fuzzy gpd representing them.
        fuzzies = [
            FuzzyGenePred(x, juntol=args.junction_tolerance * 2)
            for x in nrlocuskey[num]
        ]
        v = greedy_combine_down_fuzzies(fuzzies)
        if len(v) > 1:
            if args.verbose:
                sys.stderr.write("WARNING expected only 1 fuzzy genepred\n")
            # The context manager releases the lock even if the increment
            # raises, so other workers can never be left blocked on it.
            with glock:
                warning_count += 1
        nrfuzzykey[num] = v[0]
    return nrfuzzykey
def do_prediction(compatible, args, nrfuzzykey, location):
    """Build the predicted families of a locus and render them as text.

    Every entry of *nrfuzzykey* starts out as a family of its own and has
    ``params['proper_set']`` set to ``False``.  For each pair listed in
    *compatible* whose entries share at least one gpd line, the result of
    ``concat_fuzzy_gpd`` is added as a further family when it is truthy.
    Families that compare equal via ``is_equal_fuzzy`` are merged with
    ``add_fuzzy_gpd``, the gpds of each family are de-duplicated by line, and
    every family is rendered to one genepred line.

    Args:
        compatible: Mapping from a key to an iterable of keys; both index
            *nrfuzzykey*.
        args: Options object; reads ``junction_tolerance`` and
            ``output_original_table``.
        nrfuzzykey: Mapping from a key to a fuzzy gene prediction object.
        location: Falsy, or a range object with ``merge`` and
            ``get_range_string``; extended by each rendered entry.

    Returns:
        ``[gpdlines, tablelines, locstring]``; ``locstring`` is ``''`` when
        there is no location.
    """
    # All reads could be standing alone.
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params['proper_set'] = False  # partial overlap is enough

    for i in compatible:
        for j in compatible[i]:
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            shared = any(g2.get_line() in g1lines
                         for g2 in nrfuzzykey[j].gpds)
            # NOTE(review): pairs without a common gpd line are skipped, so
            # concatenation only happens when the two entries already share
            # a line - confirm this is the intended direction of the test.
            if not shared:
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # Find duplicate entries and combine them.
    merged = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if not fam.is_equal_fuzzy(other):
                remaining.append(other)
                continue
            added = fam.add_fuzzy_gpd(other)
            if added:
                fam = added
            else:
                # Keep the current family and leave the other one in play
                # rather than replacing ``fam`` with the falsy result, which
                # would fail on the next attribute access.
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                remaining.append(other)
        families = remaining
        merged.append(fam)
    families = merged

    # Replace each family's gpds with a set where the same gpd line is not
    # used twice.  This may damage the fuzzy object.
    for fam in families:
        unique_lines = set(g.get_line() for g in fam.gpds)
        fam.gpds = [GenePredEntry(x) for x in unique_lines]

    gpd_parts = []
    table_parts = []
    for fz in families:
        # NOTE(review): the result is discarded; the call can be dropped if
        # get_info_string() is confirmed to have no side effects.
        fz.get_info_string()
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Just grab one that has all the exons.
            gpd = sorted(fz.gpds,
                         key=lambda x: x.get_exon_count(),
                         reverse=True)[0]
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            if not gpd.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_parts.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_parts.append(name + "\t" + g.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = ''
    if location:
        locstring = location.get_range_string()
    return ["".join(gpd_parts), "".join(table_parts), locstring]
def do_prediction(compatible, args, nrfuzzykey, location):
    """Assemble candidate families for a locus and emit them as text.

    Each *nrfuzzykey* entry becomes a family and gets
    ``params['proper_set'] = False``.  Pairs from *compatible* that have a
    gpd line in common contribute the truthy result of ``concat_fuzzy_gpd``
    as an extra family.  Equal families (``is_equal_fuzzy``) are merged via
    ``add_fuzzy_gpd``; afterwards each family's gpds are rebuilt from its
    distinct lines and the family is written out as a genepred line.

    Args:
        compatible: Mapping from a key to an iterable of keys; both index
            *nrfuzzykey*.
        args: Options object; reads ``junction_tolerance`` and
            ``output_original_table``.
        nrfuzzykey: Mapping from a key to a fuzzy gene prediction object.
        location: Falsy, or a range object with ``merge`` and
            ``get_range_string``; grown by every emitted entry.

    Returns:
        The list ``[gpdlines, tablelines, locstring]``; ``locstring`` is
        ``''`` when no location is available.
    """
    # Standing-alone version: every entry is a family by itself.
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        # Partial overlap is enough.
        nrfuzzykey[num].params['proper_set'] = False

    for i in compatible:
        for j in compatible[i]:
            left_lines = set()
            for g1 in nrfuzzykey[i].gpds:
                left_lines.add(g1.get_line())
            has_common_line = False
            for g2 in nrfuzzykey[j].gpds:
                if g2.get_line() in left_lines:
                    has_common_line = True
                    break
            # NOTE(review): only pairs that already share a gpd line are
            # concatenated; verify that this condition is not inverted.
            if has_common_line:
                together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
                if together:
                    families.append(together)

    # Combine duplicate entries.
    combined = []
    while len(families) > 0:
        current = families.pop(0)
        leftover = []
        for candidate in families:
            if current.is_equal_fuzzy(candidate):
                added = current.add_fuzzy_gpd(candidate)
                if not added:
                    # A failed merge must not overwrite ``current`` with a
                    # falsy value; warn and keep both families instead.
                    sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                    leftover.append(candidate)
                else:
                    current = added
            else:
                leftover.append(candidate)
        families = leftover
        combined.append(current)
    families = combined

    # Rebuild each family's gpds so that no gpd line is used twice.
    # This may damage the fuzzy object.
    for family in families:
        distinct = set()
        for g in family.gpds:
            distinct.add(g.get_line())
        family.gpds = [GenePredEntry(x) for x in distinct]

    out_gpd = []
    out_table = []
    for fuzzy in families:
        # NOTE(review): return value unused; kept as a bare call in case
        # get_info_string() has side effects - confirm and remove.
        fuzzy.get_info_string()
        line = fuzzy.get_genepred_line()
        entry = GenePredEntry(line)
        if not entry.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             line + "\n" + fuzzy.get_info_string() + "\n")
            # Just grab one member that has all the exons.
            entry = sorted(fuzzy.gpds,
                           key=lambda x: x.get_exon_count(),
                           reverse=True)[0]
            fuzzy = FuzzyGenePred(entry, juntol=args.junction_tolerance * 2)
            line = fuzzy.get_genepred_line()
            if not entry.is_valid():
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        out_gpd.append(line + "\n")
        if args.output_original_table:
            name = entry.entry['name']
            for g in fuzzy.gpds:
                out_table.append(name + "\t" + g.entry['name'] + "\n")
        grng = entry.get_bed()
        grng.direction = None
        if not location:
            location = grng
        location = location.merge(grng)

    locstring = location.get_range_string() if location else ''
    return ["".join(out_gpd), "".join(out_table), locstring]