Exemple #1
0
def do_reduction(subset, args, nrfuzzykey, location):
    """Build the genepred output, name table and locus string for one reduction.

    The families returned by ``get_subset_evidence(subset, nrfuzzykey, args)``
    are extended with every ``nrfuzzykey`` entry whose key is absent from
    ``subset`` (neither one of its keys nor a member of one of its values).
    Each family is rendered with ``get_genepred_line()``.  A line that fails
    ``GenePredEntry.is_valid()`` gets one fallback attempt, rebuilt from the
    member entry with the most exons and twice the junction tolerance.

    Args:
        subset: mapping of key -> iterable of keys.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read here, and the object is also
            handed to ``get_subset_evidence``.
        nrfuzzykey: mapping of key -> fuzzy genepred object.
        location: range that every emitted entry is merged into, or a falsy
            value to start from the first emitted entry.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the tab-separated "family name / member name" rows (empty
        unless ``args.output_original_table`` is set), and
        ``location.get_range_string()`` (``''`` when there is no location).
    """
    # Every key that takes part in ``subset``, as a key or as a member.
    seen = set()
    for key in subset:
        seen.add(key)
        seen.update(subset[key])
    # Keys outside of ``subset`` become families of their own.
    singles = [num for num in nrfuzzykey if num not in seen]
    families = get_subset_evidence(subset, nrfuzzykey, args)
    families.extend(nrfuzzykey[num] for num in singles)

    # Collect the pieces in lists and join once instead of growing strings.
    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to the first member entry with the most exons.
            gpd = max(fz.gpds, key=lambda x: x.get_exon_count())
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            # The regenerated line is what gets written, so it has to pass
            # validation too; checking only the source entry would let an
            # invalid line through.
            # NOTE(review): after the fallback ``gpd`` stays the chosen member
            # entry, so the table name and the range below come from it
            # rather than from the regenerated line -- confirm intended.
            if not (gpd.is_valid() and GenePredEntry(gpdline).is_valid()):
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name + "\t" + g.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        # When ``location`` was just seeded this merges the range with itself.
        location = location.merge(grng)
    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_out), "".join(table_out), locstring]
def do_reduction(subset, args, nrfuzzykey, location):
    """Build the genepred output, name table and locus string for one reduction.

    The families returned by ``get_subset_evidence(subset, nrfuzzykey, args)``
    are extended with every ``nrfuzzykey`` entry whose key is absent from
    ``subset`` (neither one of its keys nor a member of one of its values).
    Each family is rendered with ``get_genepred_line()``.  A line that fails
    ``GenePredEntry.is_valid()`` gets one fallback attempt, rebuilt from the
    member entry with the most exons and twice the junction tolerance.

    Args:
        subset: mapping of key -> iterable of keys.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read here, and the object is also
            handed to ``get_subset_evidence``.
        nrfuzzykey: mapping of key -> fuzzy genepred object.
        location: range that every emitted entry is merged into, or a falsy
            value to start from the first emitted entry.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the tab-separated "family name / member name" rows (empty
        unless ``args.output_original_table`` is set), and
        ``location.get_range_string()`` (``''`` when there is no location).
    """
    # Every key that takes part in ``subset``, as a key or as a member.
    seen = set()
    for key in subset:
        seen.add(key)
        seen.update(subset[key])
    # Keys outside of ``subset`` become families of their own.
    singles = [num for num in nrfuzzykey if num not in seen]
    families = get_subset_evidence(subset, nrfuzzykey, args)
    families.extend(nrfuzzykey[num] for num in singles)

    # Collect the pieces in lists and join once instead of growing strings.
    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to the first member entry with the most exons.
            gpd = max(fz.gpds, key=lambda x: x.get_exon_count())
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            # The regenerated line is what gets written, so it has to pass
            # validation too; checking only the source entry would let an
            # invalid line through.
            # NOTE(review): after the fallback ``gpd`` stays the chosen member
            # entry, so the table name and the range below come from it
            # rather than from the regenerated line -- confirm intended.
            if not (gpd.is_valid() and GenePredEntry(gpdline).is_valid()):
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name + "\t" + g.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        # When ``location`` was just seeded this merges the range with itself.
        location = location.merge(grng)
    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_out), "".join(table_out), locstring]
def evaluate_junctions(fz, sr, args):
    """Re-score the junctions of ``fz`` against ``sr`` and emit supported runs.

    A copy of ``fz`` has the ``'junc'`` payload list of every junction side
    emptied and refilled from the ``sr`` records whose junction overlaps the
    corresponding junction of ``fz``.  Consecutive junctions whose refilled
    list reaches ``args.required_evidence`` form a run, and every run is
    written out as one tab-separated genepred fragment.

    Args:
        fz: fuzzy genepred object providing ``gpds``, ``fuzzy_junctions`` and
            ``copy()``.
        sr: mapping whose values hold ``'fzjun'`` (a junction) and ``'cnt'``
            (how often it counts, capped by ``args.downsample``).
        args: options object; ``junction_tolerance``, ``downsample`` and
            ``required_evidence`` are read.

    Returns:
        A list of ``[part, source_names]`` pairs.  ``part`` holds the
        chromosome, a literal ``+``, start, end, start, end, the block count
        and the comma-terminated start and end lists; ``source_names`` are the
        ``entry['name']`` values of ``fz.gpds``.  Empty when ``fz`` has no
        junctions or no run is supported.
    """
    source_names = [member.entry['name'] for member in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Pass 1: empty the junction payloads of the copy and refill them from sr.
    for idx, newjun in enumerate(working.fuzzy_junctions):
        left_payload = newjun.left.get_payload()
        left_payload['junc'] = []
        right_payload = newjun.right.get_payload()
        right_payload['junc'] = []
        oldjun = fz.fuzzy_junctions[idx]
        for key in sr:
            record = sr[key]
            sjun = record['fzjun']
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            # One copy of the junction per counted observation, capped.
            for _ in range(min(record['cnt'], args.downsample)):
                left_payload['junc'].append(sjun.left.get_payload()['junc'][0])
                right_payload['junc'].append(
                    sjun.right.get_payload()['junc'][0])

    # Pass 2: per junction, record the boundaries and the best junction sites,
    # or empty placeholders when the junction lacks evidence.
    juncs = []
    starts = []
    ends = []
    evidences = []
    total = len(fz.fuzzy_junctions)
    wjuns = working.fuzzy_junctions
    for pos in range(total):
        jun = wjuns[pos]
        left_payload = jun.left.get_payload()
        support = len(left_payload['junc'])
        if support >= args.required_evidence:
            if pos == 0:
                first = working.start.start
            elif left_payload['start']:
                first = left_payload['start'].start
            else:
                first = wjuns[pos - 1].right.start
            starts.append(first)

            right_payload = jun.right.get_payload()
            if pos == total - 1:
                last = working.end.end
            elif right_payload['end']:
                last = right_payload['end'].end
            else:
                last = wjuns[pos + 1].left.end
            ends.append(last)

            juncs.append([
                GenePredFuzzyBasics.mode(left_payload['junc']),
                GenePredFuzzyBasics.mode(right_payload['junc']),
            ])
        else:
            starts.append([])
            ends.append([])
            juncs.append([])
        evidences.append(support)

    # Group consecutive supported junction indices into runs.
    runs = []
    current_run = []
    for pos, support in enumerate(evidences):
        if support < args.required_evidence:
            if current_run:
                runs.append(current_run)
                current_run = []
            continue
        current_run.append(pos)
    if current_run:
        runs.append(current_run)

    # One genepred fragment per run.
    parts = []
    for run in runs:
        # Starts are shifted by one to put them back to zero index.
        sarr = [starts[run[0]] - 1] + [juncs[pos][1] - 1 for pos in run]
        earr = [juncs[pos][0] for pos in run] + [ends[run[-1]]]
        fields = [
            str(working.start.chr),
            '+',
            str(sarr[0]),
            str(earr[-1]),
            str(sarr[0]),
            str(earr[-1]),
            str(len(sarr)),
            ','.join([str(x) for x in sarr]) + ',',
            ','.join([str(x) for x in earr]) + ',',
        ]
        part = "\t".join(fields)
        # Final quality check before keeping the fragment.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" +
                             gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
def evaluate_junctions(fz, sr, args):
    """Re-score the junctions of ``fz`` against ``sr`` and emit supported runs.

    A copy of ``fz`` has the ``"junc"`` payload list of every junction side
    emptied and refilled from the ``sr`` records whose junction overlaps the
    corresponding junction of ``fz``.  Consecutive junctions whose refilled
    list reaches ``args.required_evidence`` form a run, and every run is
    written out as one tab-separated genepred fragment.

    Args:
        fz: fuzzy genepred object providing ``gpds``, ``fuzzy_junctions`` and
            ``copy()``.
        sr: mapping whose values hold ``"fzjun"`` (a junction) and ``"cnt"``
            (how often it counts, capped by ``args.downsample``).
        args: options object; ``junction_tolerance``, ``downsample`` and
            ``required_evidence`` are read.

    Returns:
        A list of ``[part, source_names]`` pairs.  ``part`` holds the
        chromosome, a literal ``+``, start, end, start, end, the block count
        and the comma-terminated start and end lists; ``source_names`` are the
        ``entry["name"]`` values of ``fz.gpds``.  Empty when ``fz`` has no
        junctions or no run is supported.
    """
    source_names = [member.entry["name"] for member in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Pass 1: empty the junction payloads of the copy and refill them from sr.
    for idx, newjun in enumerate(working.fuzzy_junctions):
        left_payload = newjun.left.get_payload()
        left_payload["junc"] = []
        right_payload = newjun.right.get_payload()
        right_payload["junc"] = []
        oldjun = fz.fuzzy_junctions[idx]
        for key in sr:
            record = sr[key]
            sjun = record["fzjun"]
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            # One copy of the junction per counted observation, capped.
            for _ in range(min(record["cnt"], args.downsample)):
                left_payload["junc"].append(sjun.left.get_payload()["junc"][0])
                right_payload["junc"].append(sjun.right.get_payload()["junc"][0])

    # Pass 2: per junction, record the boundaries and the best junction sites,
    # or empty placeholders when the junction lacks evidence.
    juncs = []
    starts = []
    ends = []
    evidences = []
    total = len(fz.fuzzy_junctions)
    wjuns = working.fuzzy_junctions
    for pos in range(total):
        jun = wjuns[pos]
        left_payload = jun.left.get_payload()
        support = len(left_payload["junc"])
        if support >= args.required_evidence:
            if pos == 0:
                first = working.start.start
            elif left_payload["start"]:
                first = left_payload["start"].start
            else:
                first = wjuns[pos - 1].right.start
            starts.append(first)

            right_payload = jun.right.get_payload()
            if pos == total - 1:
                last = working.end.end
            elif right_payload["end"]:
                last = right_payload["end"].end
            else:
                last = wjuns[pos + 1].left.end
            ends.append(last)

            juncs.append(
                [
                    GenePredFuzzyBasics.mode(left_payload["junc"]),
                    GenePredFuzzyBasics.mode(right_payload["junc"]),
                ]
            )
        else:
            starts.append([])
            ends.append([])
            juncs.append([])
        evidences.append(support)

    # Group consecutive supported junction indices into runs.
    runs = []
    current_run = []
    for pos, support in enumerate(evidences):
        if support < args.required_evidence:
            if current_run:
                runs.append(current_run)
                current_run = []
            continue
        current_run.append(pos)
    if current_run:
        runs.append(current_run)

    # One genepred fragment per run.
    parts = []
    for run in runs:
        # Starts are shifted by one to put them back to zero index.
        sarr = [starts[run[0]] - 1] + [juncs[pos][1] - 1 for pos in run]
        earr = [juncs[pos][0] for pos in run] + [ends[run[-1]]]
        fields = [
            str(working.start.chr),
            "+",
            str(sarr[0]),
            str(earr[-1]),
            str(sarr[0]),
            str(earr[-1]),
            str(len(sarr)),
            ",".join([str(x) for x in sarr]) + ",",
            ",".join([str(x) for x in earr]) + ",",
        ]
        part = "\t".join(fields)
        # Final quality check before keeping the fragment.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" + gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
def do_prediction(compatible, args, nrfuzzykey, location):
    """Build the genepred output, name table and locus string for a prediction.

    Every entry of ``nrfuzzykey`` is a family of its own (its
    ``params['proper_set']`` is switched off), and pairs listed in
    ``compatible`` that share at least one identical gpd line contribute their
    concatenation as an extra family.  Families that compare equal with
    ``is_equal_fuzzy`` are then folded together before the output is built.

    Args:
        compatible: mapping of key -> iterable of keys into ``nrfuzzykey``.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read.
        nrfuzzykey: mapping of key -> fuzzy genepred object.  The objects are
            modified (``params['proper_set']`` and, for families that reach
            the output, ``gpds``).
        location: range that every emitted entry is merged into, or a falsy
            value to start from the first emitted entry.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the tab-separated "family name / member name" rows (empty
        unless ``args.output_original_table`` is set), and
        ``location.get_range_string()`` (``''`` when there is no location).
    """
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params['proper_set'] = False  # partial overlap is enough

    for i in compatible:
        for j in compatible[i]:
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            # NOTE(review): only pairs that already share a gpd line are
            # concatenated; the original comment reads "see if its already in
            # there" -- confirm this polarity is the intended one.
            if not any(g2.get_line() in g1lines for g2 in nrfuzzykey[j].gpds):
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # Fold families that compare equal into one.
    merged = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if not fam.is_equal_fuzzy(other):
                remaining.append(other)
                continue
            added = fam.add_fuzzy_gpd(other)
            if added:
                fam = added
            else:
                # Keep both families: replacing ``fam`` with the failed result
                # would break the following comparisons and drop the family.
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                remaining.append(other)
        families = remaining
        merged.append(fam)
    families = merged

    # Replace the members of each family so the same gpd line is not used
    # twice.  This may damage the fuzzy object.
    for fam in families:
        unique_lines = set(g.get_line() for g in fam.gpds)
        fam.gpds = [GenePredEntry(x) for x in unique_lines]

    # Collect the pieces in lists and join once instead of growing strings.
    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to the first member entry with the most exons.
            gpd = max(fz.gpds, key=lambda x: x.get_exon_count())
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            # The regenerated line is what gets written, so it has to pass
            # validation too; checking only the source entry would let an
            # invalid line through.
            if not (gpd.is_valid() and GenePredEntry(gpdline).is_valid()):
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name + "\t" + g.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        # When ``location`` was just seeded this merges the range with itself.
        location = location.merge(grng)
    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_out), "".join(table_out), locstring]
Exemple #6
0
def do_prediction(compatible, args, nrfuzzykey, location):
    """Build the genepred output, name table and locus string for a prediction.

    Every entry of ``nrfuzzykey`` is a family of its own (its
    ``params['proper_set']`` is switched off), and pairs listed in
    ``compatible`` that share at least one identical gpd line contribute their
    concatenation as an extra family.  Families that compare equal with
    ``is_equal_fuzzy`` are then folded together before the output is built.

    Args:
        compatible: mapping of key -> iterable of keys into ``nrfuzzykey``.
        args: options object; ``junction_tolerance`` and
            ``output_original_table`` are read.
        nrfuzzykey: mapping of key -> fuzzy genepred object.  The objects are
            modified (``params['proper_set']`` and, for families that reach
            the output, ``gpds``).
        location: range that every emitted entry is merged into, or a falsy
            value to start from the first emitted entry.

    Returns:
        ``[gpdlines, tablelines, locstring]``: the newline-terminated genepred
        lines, the tab-separated "family name / member name" rows (empty
        unless ``args.output_original_table`` is set), and
        ``location.get_range_string()`` (``''`` when there is no location).
    """
    families = []
    for num in nrfuzzykey:
        families.append(nrfuzzykey[num])
        nrfuzzykey[num].params['proper_set'] = False  # partial overlap is enough

    for i in compatible:
        for j in compatible[i]:
            g1lines = set(g1.get_line() for g1 in nrfuzzykey[i].gpds)
            # NOTE(review): only pairs that already share a gpd line are
            # concatenated; the original comment reads "see if its already in
            # there" -- confirm this polarity is the intended one.
            if not any(g2.get_line() in g1lines for g2 in nrfuzzykey[j].gpds):
                continue
            together = nrfuzzykey[i].concat_fuzzy_gpd(nrfuzzykey[j])
            if together:
                families.append(together)

    # Fold families that compare equal into one.
    merged = []
    while families:
        fam = families.pop(0)
        remaining = []
        for other in families:
            if not fam.is_equal_fuzzy(other):
                remaining.append(other)
                continue
            added = fam.add_fuzzy_gpd(other)
            if added:
                fam = added
            else:
                # Keep both families: replacing ``fam`` with the failed result
                # would break the following comparisons and drop the family.
                sys.stderr.write("WARNING NOT SURE WHY NOT ADDED EQUAL\n")
                remaining.append(other)
        families = remaining
        merged.append(fam)
    families = merged

    # Replace the members of each family so the same gpd line is not used
    # twice.  This may damage the fuzzy object.
    for fam in families:
        unique_lines = set(g.get_line() for g in fam.gpds)
        fam.gpds = [GenePredEntry(x) for x in unique_lines]

    # Collect the pieces in lists and join once instead of growing strings.
    gpd_out = []
    table_out = []
    for fz in families:
        gpdline = fz.get_genepred_line()
        gpd = GenePredEntry(gpdline)
        if not gpd.is_valid():
            sys.stderr.write("WARNING: invalid genepred entry generated\n" +
                             gpdline + "\n" + fz.get_info_string() + "\n")
            # Fall back to the first member entry with the most exons.
            gpd = max(fz.gpds, key=lambda x: x.get_exon_count())
            fz = FuzzyGenePred(gpd, juntol=args.junction_tolerance * 2)
            gpdline = fz.get_genepred_line()
            # The regenerated line is what gets written, so it has to pass
            # validation too; checking only the source entry would let an
            # invalid line through.
            if not (gpd.is_valid() and GenePredEntry(gpdline).is_valid()):
                sys.stderr.write("WARNING: still problem skilling\n")
                continue
        gpd_out.append(gpdline + "\n")
        if args.output_original_table:
            name = gpd.entry['name']
            for g in fz.gpds:
                table_out.append(name + "\t" + g.entry['name'] + "\n")
        grng = gpd.get_bed()
        grng.direction = None
        if not location:
            location = grng
        # When ``location`` was just seeded this merges the range with itself.
        location = location.merge(grng)
    locstring = location.get_range_string() if location else ''
    return ["".join(gpd_out), "".join(table_out), locstring]