def get_loci(transcripts_genepred):
    loci = Loci()
    loci.verbose = True
    with open(transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#': continue
            gpd = GenePredEntry(line.rstrip())
            rng = Bed(gpd.value('chrom'), gpd.value('txStart'),
                      gpd.value('txEnd'))
            rng.set_payload(gpd.value('name'))
            loc1 = Locus()
            loc1.add_member(rng)
            loci.add_locus(loc1)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
    loci.update_loci()
    sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")

    m = 0
    locus2name = {}
    name2locus = {}
    for locus in loci.loci:
        m += 1
        for member in locus.members:
            name = member.get_payload()
            if m not in locus2name: locus2name[m] = set()
            locus2name[m].add(name)
            name2locus[name] = m
    return [locus2name, name2locus]
def get_loci(transcripts_genepred):
  loci = Loci()
  loci.verbose= True
  with open(transcripts_genepred) as inf:
    for line in inf:
      if line[0]=='#': continue
      gpd = GenePredEntry(line.rstrip())
      rng = Bed(gpd.value('chrom'),gpd.value('txStart'),gpd.value('txEnd'))
      rng.set_payload(gpd.value('name'))
      loc1 = Locus()
      loc1.add_member(rng)
      loci.add_locus(loc1)
  sys.stderr.write("Organizing genepred data into overlapping loci\n")
  sys.stderr.write("Started with "+str(len(loci.loci))+" loci\n")
  loci.update_loci()
  sys.stderr.write("Ended with "+str(len(loci.loci))+" loci\n")

  m = 0
  locus2name = {}
  name2locus = {}
  for locus in loci.loci:
    m+=1
    for member in locus.members:
      name = member.get_payload()
      if m not in locus2name:  locus2name[m] = set()
      locus2name[m].add(name)
      name2locus[name] = m
  return [locus2name,name2locus]
class FuzzyGenePred:
    # set use_dir true if you want to use direction and make it direction specific
    # set proper_set false if you want to do awesome extending that doesn't really work yet
    def __init__(self, ingpd=None, params=None, juntol=10):
        # Here is the basic data
        self.fuzzy_junctions = []
        self.gpds = []  # contributing member genepreds
        self.start = None
        self.end = None
        self.dir = None
        # Higher level data
        self.simple_junction_set = set()  # quickly search for if a multi exon gene has been added
        # Here is the parameters
        self.params = {}
        self.params["use_dir"] = False
        self.params["junction_tolerance"] = juntol
        # Not fully implemented.  Do we require a full length match
        self.params["proper_set"] = True
        # Define thresholds for overlapping single exons
        self.params["do_add_single_exon"] = True
        self.params["single_exon_minimum_length"] = 200
        self.params[
            "single_exon_minimum_overlap_fraction"
        ] = 0.8  # reciprocal ... must be this fraction or more on both
        self.params["single_exon_minimum_overlap_bases"] = 1  # minimum number of bases
        self.params["single_exon_maximum_endpoint_distance"] = 1000
        if params:
            for pname in params:
                self.params[pname] = params[pname]
        if ingpd:
            self.add_gpd(ingpd)

    def get_genepred_line(self, end_select="extremes", junction_select="mode", name=None):
        if not name:
            name = "fuzGPD_" + random_string(8) + "_" + str(len(self.fuzzy_junctions) + 1) + "_" + str(len(self.gpds))
        ostr = ""
        ostr += name + "\t"
        ostr += name + "\t"
        ostr += self.start.chr + "\t"
        ostr += self.gpds[0].value("strand") + "\t"
        ostr += str(self.start.start - 1) + "\t"
        ostr += str(self.end.end) + "\t"
        ostr += str(self.start.start - 1) + "\t"
        ostr += str(self.end.end) + "\t"
        ostr += str(len(self.fuzzy_junctions) + 1) + "\t"
        exonstarts = []
        exonends = []
        exonstarts.append(self.start.start - 1)
        for j in self.fuzzy_junctions:
            exonends.append(mode(j.left.get_payload()["junc"]))
            exonstarts.append(mode(j.right.get_payload()["junc"]) - 1)
        exonends.append(self.end.end)
        ostr += ",".join([str(x) for x in exonstarts]) + "," + "\t"
        ostr += ",".join([str(x) for x in exonends]) + ","
        return ostr

    # Return a copy of the fuzzy geneprep
    def copy(self):
        g = FuzzyGenePred()  # start with a blank one why not
        # get the settings
        for pname in self.params:
            g.params[pname] = self.params[pname]
        # copy the genepreds
        for orig in self.gpds:
            g.gpds.append(GenePredEntry(orig.get_line()))
        # store direction
        g.dir = self.dir
        # copy the fuzzy junctions
        for orig in self.fuzzy_junctions:
            g.fuzzy_junctions.append(orig.copy())
        # copy the simple junction set
        for orig in self.simple_junction_set:
            g.simple_junction_set.add(orig)
        # copy the start
        if self.start:
            g.start = Bed(self.start.chr, self.start.start - 1, self.start.end, self.start.direction)
            g.start.set_payload([])
            for v in self.start.get_payload():
                g.start.get_payload().append(v)
        # copy the end
        if self.end:
            g.end = Bed(self.end.chr, self.end.start - 1, self.end.end, self.end.direction)
            g.end.set_payload([])
            for v in self.end.get_payload():
                g.end.get_payload().append(v)
        return g

    def exon_count(self):
        return len(self.fuzzy_junctions) + 1

    def gpd_count(self):
        return len(self.gpds)

    def get_bed(self):
        return Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction)

    # This is an inspection tool for a fuzzy gpd
    def get_info_string(self):
        ostr = ""
        ostr += "== FUZZY GENEPRED INFO ==" + "\n"
        ostr += str(len(self.gpds)) + " total GPDs" + "\n"
        totalbounds = Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction)
        ostr += totalbounds.get_range_string() + " total bounds\n"
        ostr += "---- start ----" + "\n"
        ostr += str(len(self.start.get_payload())) + " reads supporting start" + "\n"
        ostr += "  " + str(mean(self.start.get_payload())) + " mean" + "\n"
        ostr += "  " + str(mode(self.start.get_payload())) + " mode" + "\n"
        ostr += "  " + self.start.get_range_string() + " start range\n"
        ostr += "---- end ----" + "\n"
        ostr += str(len(self.end.get_payload())) + " reads supporting end" + "\n"
        ostr += "  " + str(mean(self.end.get_payload())) + " mean" + "\n"
        ostr += "  " + str(mode(self.end.get_payload())) + " mode" + "\n"
        ostr += "  " + self.end.get_range_string() + " end range\n"
        ostr += "---- junctions ----" + "\n"
        ostr += str(len(self.fuzzy_junctions)) + " total fuzzy junctions" + "\n"
        cnt = 0
        for j in self.fuzzy_junctions:
            cnt += 1
            ostr += (
                "  "
                + str(cnt)
                + ". "
                + str(mode(j.left.get_payload()["junc"]))
                + " ^ "
                + str(mode(j.right.get_payload()["junc"]))
                + "\n"
            )
            ostr += "     " + j.left.get_range_string() + " ^ " + j.right.get_range_string() + "\n"
            ostr += "     " + str(len(j.left.get_payload()["junc"])) + " read support" + "\n"
            if j.left.get_payload()["start"]:
                ostr += "       " + "---starts----" + "\n"
                ostr += (
                    "       "
                    + str(len(j.left.get_payload()["start"].get_payload()))
                    + " starts at "
                    + j.left.get_payload()["start"].get_range_string()
                    + "\n"
                )
            if j.right.get_payload()["end"]:
                ostr += "       " + "---ends----" + "\n"
                ostr += (
                    "       "
                    + str(len(j.right.get_payload()["end"].get_payload()))
                    + " ends at "
                    + j.right.get_payload()["end"].get_range_string()
                    + "\n"
                )
        return ostr

    # Add a new gpd return true if successful
    # Return false if it didn't work, return the new combined if it worked
    def add_gpd(self, ingpd):
        if len(self.gpds) == 0:  # first one
            self.read_first(ingpd)
            return self  # return ourself if we are adding our first
        # more difficult situation where we must try to combine
        # See if it can match first before actually adding stuff to it
        # if self.
        newfuz = FuzzyGenePred(ingpd, params=self.params)
        output = self.add_fuzzy_gpd(newfuz)
        return output

    # combine together compatible overlapping sets
    def concat_fuzzy_gpd(self, fuz2):
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0:
            return False
        if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0:
            return False
        # Lets work combine the single exon step and exit
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0:
            return self.do_add_single_exon_fuzzy_gpd(fuz2)
        # For now don't add them if one is single exon
        if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0:
            return False

        # See if its already a subset
        easy_subset = False
        for simplejunction in fuz2.simple_junction_set:
            if simplejunction in self.simple_junction_set:
                easy_subset = True
        # If its not already a subset look deeper
        # 1. First we need perfect junctions for a run of them
        if not easy_subset:
            if not self.compatible_overlap(fuz2):
                return False
        # still here. we will work on combining these
        output = self.copy()
        # first lets put add any overlapping junctions
        for i in range(0, len(output.fuzzy_junctions)):
            for j in range(0, len(fuz2.fuzzy_junctions)):
                if output.fuzzy_junctions[i].overlaps(fuz2.fuzzy_junctions[j], fuz2.params["junction_tolerance"]):
                    output.fuzzy_junctions[i].add_fuzzy_junction(fuz2.fuzzy_junctions[j])
                    if j == 0:  # put the start in too
                        if not output.fuzzy_junctions[i].left.get_payload()["start"]:
                            output.fuzzy_junctions[i].left.get_payload()["start"] = fuz2.start.copy()
                        else:  # merge
                            starts = output.fuzzy_junctions[i].left.get_payload()["start"].get_payload()
                            for v in fuz2.start.get_payload():
                                starts.append(v)
                            nrange = output.fuzzy_junctions[i].left.get_payload()["start"].merge(fuz2.start)
                            nrange.set_payload(starts[:])
                            output.fuzzy_junctions[i].left.get_payload()["start"] = nrange
                    if j == len(fuz2.fuzzy_junctions) - 1:  # put the end in too
                        if not output.fuzzy_junctions[i].right.get_payload()["end"]:
                            output.fuzzy_junctions[i].right.get_payload()["end"] = fuz2.end.copy()
                        else:  # merge
                            ends = output.fuzzy_junctions[i].right.get_payload()["end"].get_payload()
                            for v in fuz2.end.get_payload():
                                ends.append(v)
                            nrange = output.fuzzy_junctions[i].right.get_payload()["end"].merge(fuz2.end)
                            nrange.set_payload(ends[:])
                            output.fuzzy_junctions[i].right.get_payload()["end"] = nrange
        # see if we should build onto the left
        leftnum = -1
        leftmost = self.fuzzy_junctions[0]
        if fuz2.fuzzy_junctions[0].right.end < leftmost.left.start:
            for i in range(0, len(fuz2.fuzzy_junctions)):
                if fuz2.fuzzy_junctions[i].overlaps(leftmost, fuz2.params["junction_tolerance"]):
                    leftnum = i
                    break
        # leftnum is now -1 if no additions to the left zero if it starts on the same
        if leftnum > 0:
            for i in reversed(range(0, leftnum)):
                output.fuzzy_junctions.insert(0, fuz2.fuzzy_junctions[i].copy())
            output.start = fuz2.start.copy()
        rightnum = -1  # get the right point ... our first one comes after this
        rightmost = self.fuzzy_junctions[-1]
        if fuz2.fuzzy_junctions[-1].left.start > rightmost.right.end:
            for i in reversed(range(0, len(fuz2.fuzzy_junctions))):
                if fuz2.fuzzy_junctions[i].overlaps(rightmost, fuz2.params["junction_tolerance"]):
                    rightnum = i
                    break
        if rightnum != -1:
            rightnum += 1
            if rightnum < len(fuz2.fuzzy_junctions):
                for i in range(rightnum, len(fuz2.fuzzy_junctions)):
                    output.fuzzy_junctions.append(fuz2.fuzzy_junctions[i].copy())
                output.end = fuz2.end.copy()
        # print leftnum
        # print rightnum
        # print fuz2.params['junction_tolerance']
        # print 'combining'
        return output

    # add together subsets
    def add_fuzzy_gpd(self, fuz2):
        # see if we can add this fuzzy gpd to another
        # We treat single exon genes seprately so if only one of them is
        # single exon we can't compare them
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0:
            return False
        if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0:
            return False
        # Lets work combine the single exon step and exit
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0:
            return self.do_add_single_exon_fuzzy_gpd(fuz2)

        # For now don't add them if one is single exon
        if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0:
            return False

        # See if its already a subset
        easy_subset = False
        for simplejunction in fuz2.simple_junction_set:
            if simplejunction in self.simple_junction_set:
                easy_subset = True
        # If its not already a subset look deeper
        # 1. First we need perfect junctions for a run of them
        if not easy_subset:
            if not self.compatible_overlap(fuz2):
                return False
        # still here. we will work on combining these
        output = self.copy()
        # switch over to working on the output now

        # If we are still here we can add the two of them together
        # If they have the same starting junction we can add their starting points together
        if output.fuzzy_junctions[0].overlaps(fuz2.fuzzy_junctions[0], output.params["junction_tolerance"]):
            # print 'samestart'
            newstart = output.start.merge(fuz2.start)
            newstart.set_payload(output.start.get_payload())
            for s in fuz2.start.get_payload():
                newstart.get_payload().append(s)
            output.start = newstart

        # Check if the other one is new start
        elif mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"]) < mode(
            output.fuzzy_junctions[0].left.get_payload()["junc"]
        ):
            # print "2 start"
            output.start = fuz2.start
        elif mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"]) > mode(
            output.fuzzy_junctions[0].left.get_payload()["junc"]
        ):
            True
        #  #print "1 start"
        #  #we're good to go
        else:
            sys.stderr.write("WARNING: strange start case abort merge\n")
            return False
        # lets work the ends now
        if output.fuzzy_junctions[-1].overlaps(fuz2.fuzzy_junctions[-1], output.params["junction_tolerance"]):
            # print 'sameend'
            newend = output.end.merge(fuz2.end)
            newend.set_payload(output.end.get_payload())
            for s in fuz2.end.get_payload():
                newend.get_payload().append(s)
            output.end = newend

        # Check if the other one is new start
        elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()["junc"]) > mode(
            output.fuzzy_junctions[-1].right.get_payload()["junc"]
        ):
            # print "2 end"
            output.end = fuz2.end
        elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()["junc"]) < mode(
            output.fuzzy_junctions[-1].right.get_payload()["junc"]
        ):
            True
        #  #print "1 end"
        #  #we're good to go
        else:
            sys.stderr.write("WARNING: strange end case abort merge\n")
            u1 = mode(output.fuzzy_junctions[-1].left.get_payload()["junc"])
            u2 = mode(fuz2.fuzzy_junctions[-1].left.get_payload()["junc"])
            v1 = mode(output.fuzzy_junctions[-1].right.get_payload()["junc"])
            v2 = mode(fuz2.fuzzy_junctions[-1].right.get_payload()["junc"])
            sys.stderr.write(str(u1) + "\t" + str(u2) + "\n")
            sys.stderr.write(str(v1) + "\t" + str(v2) + "\n")
            return False
        # now the starts and ends have been updated in output.
        # iterate through the junctions.
        # check for a left overhang.
        numfuz2left = 0
        numoutleft = 0
        if not output.fuzzy_junctions[0].overlaps(fuz2.fuzzy_junctions[0], output.params["junction_tolerance"]):
            # see if we need to add sequences from fuz2
            if mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"]) < mode(
                output.fuzzy_junctions[0].left.get_payload()["junc"]
            ):
                # print 'left over2'
                i = 0
                while not output.fuzzy_junctions[0].overlaps(
                    fuz2.fuzzy_junctions[i], output.params["junction_tolerance"]
                ) and i < len(fuz2.fuzzy_junctions):
                    i += 1
                numfuz2left = i  # number to push on from the fuz2 and increment in
                # print numfuz2left
            elif mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"]) > mode(
                output.fuzzy_junctions[0].left.get_payload()["junc"]
            ):
                # print 'left over1'
                i = 0
                while not output.fuzzy_junctions[i].overlaps(
                    fuz2.fuzzy_junctions[0], output.params["junction_tolerance"]
                ) and i < len(output.fuzzy_junctions):
                    i += 1
                numoutleft = i  # number to increment in from output
                # print numoutleft
            else:
                sys.stderr.write("WARNING: strange case \n")
                return False
        # next we can check how long we have a run of the same
        ind1 = numoutleft
        ind2 = numfuz2left
        overlap_size = 0
        while (
            ind1 < len(output.fuzzy_junctions)
            and ind2 < len(fuz2.fuzzy_junctions)
            and output.fuzzy_junctions[ind1].overlaps(fuz2.fuzzy_junctions[ind2], output.params["junction_tolerance"])
        ):
            overlap_size += 1
            ind1 += 1
            ind2 += 1
        # print 'overlap size '+str(overlap_size)
        numoutright = len(output.fuzzy_junctions) - overlap_size - numoutleft
        numfuz2right = len(fuz2.fuzzy_junctions) - overlap_size - numfuz2left
        if min(numoutright, numfuz2right) != 0:
            sys.stderr.write("WARNING: expected one of them to be zero\n")
            # print self.get_info_string()
            # print '====================='
            # print fuz2.get_info_string()
            # sys.exit()
            return False
        if min(numoutleft, numfuz2left) != 0:
            sys.stderr.write("WARNING: expected one of them to be zero\n")
            return False
        # print numoutright
        # print numfuz2right
        # print output.fuzzy_junctions[numoutleft].overlaps(fuz2.fuzzy_junctions[numfuz2left],output.junction_tolerance)
        # print 'add'
        # Now we have what we need to go through and do some updating
        # Lets just make new fuzzy junctions
        newjuncs = []
        for i in range(0, numfuz2left):
            newjuncs.append(fuz2.fuzzy_junctions[i])
        for i in range(0, numoutleft):
            newjuncs.append(output.fuzzy_junctions[i])
        # Now we do both down the center
        range1 = range(numoutleft, overlap_size + numoutleft)
        range2 = range(numfuz2left, overlap_size + numfuz2left)
        for i in range(0, len(range1)):
            newjuncs.append(output.fuzzy_junctions[range1[i]])
            newjuncs[-1].add_fuzzy_junction(fuz2.fuzzy_junctions[range2[i]])
            # print i
        # Make the right size
        for i in range(overlap_size + numfuz2left, overlap_size + numfuz2left + numfuz2right):
            newjuncs.append(fuz2.fuzzy_junctions[i])
        for i in range(overlap_size + numoutleft, overlap_size + numoutleft + numoutright):
            newjuncs.append(output.fuzzy_junctions[i])
        output.fuzzy_junctions = newjuncs
        # print 'adding gpd '+str(len(fuz2.gpds))+' entries'
        for g in fuz2.gpds:
            output.gpds.append(g)
            sjun = get_simple_junction(g)
            if sjun:
                output.simple_junction_set.add(sjun)
        # print 'new entry'
        # print self.get_info_string()
        return output

    def do_add_single_exon_fuzzy_gpd(self, fuz2):
        if not self.params["do_add_single_exon"]:
            return False  # make sure we are allowed to be doing this
        # build the bounds from the average start and end
        s1 = mean(self.start.get_payload())
        e1 = mean(self.end.get_payload())
        s2 = mean(fuz2.start.get_payload())
        e2 = mean(fuz2.end.get_payload())
        l1 = e1 - s1 + 1
        l2 = e2 - s2 + 1
        if l1 < self.params["single_exon_minimum_length"]:
            return False
        if l2 < self.params["single_exon_minimum_length"]:
            return False
        if l1 < 1 or l2 < 1:
            return False  # shouldn't happen
        chr1 = self.start.chr
        chr2 = self.end.chr
        if chr1 != chr2:
            return False  # shouldn't happen
        r1 = Bed(chr1, s1 - 1, e1, self.dir)
        r2 = Bed(chr2, s2 - 1, e2, self.dir)
        over = r1.overlap_size(r2)
        if over < self.params["single_exon_minimum_overlap_bases"]:
            return False
        # print r1.get_range_string()
        # print r2.get_range_string()
        cov = min(float(over) / float(l1), float(over) / float(l2))
        if cov < self.params["single_exon_minimum_overlap_fraction"]:
            return False
        if abs(e1 - e2) > self.params["single_exon_maximum_endpoint_distance"]:
            return False
        if abs(s1 - s2) > self.params["single_exon_maximum_endpoint_distance"]:
            return False
        # If we're still here, we can add result
        output = self.copy()
        newstart = output.start.merge(fuz2.start)
        newstart.set_payload([])
        for s in output.start.get_payload():
            newstart.get_payload().append(s)
        for s in fuz2.start.get_payload():
            newstart.get_payload().append(s)
        newend = output.end.merge(fuz2.end)
        newend.set_payload([])
        for e in output.end.get_payload():
            newend.get_payload().append(e)
        for e in fuz2.end.get_payload():
            newend.get_payload().append(e)
        output.start = newstart
        output.end = newend
        for gpd in fuz2.gpds:
            output.gpds.append(gpd)
            sjun = get_simple_junction(gpd)
            if sjun:
                output.simple_junction_set.add(gpd)
        return output

    # Return true if these fuzzy genepreds can be added together
    def compatible_overlap(self, fingpd):
        f1 = self
        f2 = fingpd

        #### Forget about trying zero exon cases for now
        if len(f1.fuzzy_junctions) == 0 or len(f2.fuzzy_junctions) == 0:
            return False

        # Find all matches
        matches = []
        for i in range(0, len(f1.fuzzy_junctions)):
            for j in range(0, len(f2.fuzzy_junctions)):
                if f1.fuzzy_junctions[i].overlaps(f2.fuzzy_junctions[j], self.params["junction_tolerance"]):
                    matches.append([i, j])

        # This is our matched junctions in f1 and f2
        if len(matches) == 0:
            return False  # Nothing matched.. certainly no overlap

        # This is the number of extra exons it would take in the middle of the run (shifts)
        if len(set([x[0] - x[1] for x in matches])) != 1:
            return False

        # Lets make sure all our exons are consecutive
        if len(matches) > 1:
            consec1 = list(set([matches[i + 1][0] - matches[i][0] for i in range(0, len(matches) - 1)]))
            consec2 = list(set([matches[i + 1][1] - matches[i][1] for i in range(0, len(matches) - 1)]))
            if len(consec1) != 1:
                return False
            if len(consec2) != 1:
                return False
            if consec1[0] != 1:
                return False
            if consec2[0] != 1:
                return False
        # one of them should be zero
        if not (matches[0][1] == 0 or matches[0][0] == 0):
            return False

        # and one of our last matches should be the last junction
        if not (len(f1.fuzzy_junctions) - 1 == matches[-1][0] or len(f2.fuzzy_junctions) - 1 == matches[-1][1]):
            return False

        #### most of the time we will probably be looking for a proper set
        #### unless we are extending the long read for isoform prediction
        if self.params["proper_set"]:
            # check those last overhangs
            # one of the two needs to have the start and end points in the consecutive matches
            if (matches[0][0] == 0 and len(f1.fuzzy_junctions) - 1 == matches[-1][0]) or (
                matches[0][1] == 0 and len(f2.fuzzy_junctions) - 1 == matches[-1][1]
            ):
                return True
            return False

        return True

    def read_first(self, ingpd):
        self.gpds.append(ingpd)
        sjun = get_simple_junction(ingpd)
        if sjun:
            self.simple_junction_set.add(sjun)
        if self.params["use_dir"]:
            self.dir = ingpd.value("strand")
        # add fuzzy junctions
        chr = ingpd.value("chrom")
        for i in range(0, len(ingpd.value("exonStarts")) - 1):
            self.fuzzy_junctions.append(
                FuzzyJunction(chr, ingpd.value("exonEnds")[i], ingpd.value("exonStarts")[i + 1] + 1, self.dir)
            )
        if len(ingpd.value("exonStarts")) > 1:  # we have junctions
            self.fuzzy_junctions[0].left.get_payload()["start"] = Bed(
                chr, ingpd.value("txStart"), ingpd.value("txStart") + 1, self.dir
            )
            self.fuzzy_junctions[0].left.get_payload()["start"].set_payload([])
            self.fuzzy_junctions[0].left.get_payload()["start"].get_payload().append(ingpd.value("txStart") + 1)
            self.fuzzy_junctions[-1].right.get_payload()["end"] = Bed(
                chr, ingpd.value("txEnd") - 1, ingpd.value("txEnd"), self.dir
            )
            self.fuzzy_junctions[-1].right.get_payload()["end"].set_payload([])
            self.fuzzy_junctions[-1].right.get_payload()["end"].get_payload().append(ingpd.value("txEnd"))
        # add fuzzy starts
        self.start = Bed(ingpd.value("chrom"), ingpd.value("txStart"), ingpd.value("txStart") + 1, self.dir)
        self.start.set_payload([])
        self.start.get_payload().append(ingpd.value("txStart") + 1)
        self.end = Bed(ingpd.value("chrom"), ingpd.value("txEnd") - 1, ingpd.value("txEnd"), self.dir)
        self.end.set_payload([])
        self.end.get_payload().append(ingpd.value("txEnd"))
        # Have finished reading in the first case

    # Pre: another fuzzy gpd
    # Post: True if they are all overlapping junctions
    def is_equal_fuzzy(self, fuz2, use_direction=False):
        if use_direction:
            if self.dir != fuz2.dir:
                return False
        if len(self.fuzzy_junctions) < 0:
            return False
        if len(fuz2.fuzzy_junctions) < 0:
            return False
        if len(self.fuzzy_junctions) != len(fuz2.fuzzy_junctions):
            return False
        for i in range(0, len(self.fuzzy_junctions)):
            if not self.fuzzy_junctions[i].overlaps(fuz2.fuzzy_junctions[i], self.params["junction_tolerance"]):
                return False
        return True
class FuzzyJunction:
    # Pre: inleft is 1-indexed last exonic base on the left
    #      inright is 1-indexed first exonic base on the right
    #      direction doesn't need to be used
    def __init__(self, inchr=None, inleft=None, inright=None, indir=None):
        self.chr = inchr
        self.left = None  # range with payloads being the actual left and rights
        self.right = None
        self.dir = indir
        if inchr and inleft and inright:
            self.add_junction(inchr, inleft, inright, indir)

    def copy(self):
        newjunc = FuzzyJunction()
        newjunc.chr = self.chr
        newjunc.left = Bed(self.left.chr, self.left.start - 1, self.left.end, self.left.direction)
        t1 = {}
        t1["junc"] = []
        t1["start"] = None
        newjunc.left.set_payload(t1)
        for j in self.left.get_payload()["junc"]:
            newjunc.left.get_payload()["junc"].append(j)
        newjunc.right = Bed(self.right.chr, self.right.start - 1, self.right.end, self.right.direction)
        # copy any starts for the junction
        if self.left.get_payload()["start"]:
            ls = self.left.get_payload()["start"]
            newjunc.left.get_payload()["start"] = Bed(ls.chr, ls.start - 1, ls.end, ls.direction)
            newjunc.left.get_payload()["start"].set_payload([])
            for p in self.left.get_payload()["start"].get_payload():
                newjunc.left.get_payload()["start"].get_payload().append(p)
        t2 = {}
        t2["junc"] = []
        t2["end"] = None
        newjunc.right.set_payload(t2)
        for j in self.right.get_payload()["junc"]:
            newjunc.right.get_payload()["junc"].append(j)
        # copy any ends for the junction
        if self.right.get_payload()["end"]:
            ren = self.right.get_payload()["end"]
            newjunc.right.get_payload()["end"] = Bed(ren.chr, ren.start - 1, ren.end, ren.direction)
            newjunc.right.get_payload()["end"].set_payload([])
            for p in self.right.get_payload()["end"].get_payload():
                newjunc.right.get_payload()["end"].get_payload().append(p)
        return newjunc

    # return chr, and the left and right mode as an array
    def get_mode(self):
        m1 = mode(self.left.get_payload()["junc"])
        m2 = mode(self.right.get_payload()["junc"])
        return [Bed(self.chr, m1 - 1, m1, self.dir), Bed(self.chr, m2 - 1, m2, self.dir)]

    # Find the mode of the junction and see if this overlaps
    def overlaps(self, fjun2, juntol):
        m1 = self.get_mode()
        m2 = fjun2.get_mode()
        if m1[0].chr != m2[0].chr:
            return False
        if m1[0].direction != m2[0].direction:
            return False  # usually they are both off
        if not m1[0].overlaps_with_padding(m2[0], juntol):
            return False
        if not m1[1].overlaps_with_padding(m2[1], juntol):
            return False
        return True

    # Right now assumes these are overlap verified prior to calling
    def add_junction(self, inchr, inleft, inright, indir=None):
        if not self.left:  # this is our first one
            t1 = {}
            t1["junc"] = []
            t1["start"] = None
            self.left = Bed(inchr, inleft - 1, inleft, indir)
            self.left.set_payload(t1)
            self.left.get_payload()["junc"].append(inleft)
            self.right = Bed(inchr, inright - 1, inright, indir)
            t2 = {}
            t2["junc"] = []
            t2["end"] = None
            self.right = Bed(inchr, inright - 1, inright, indir)
            self.right.set_payload(t2)
            self.right.get_payload()["junc"].append(inright)
            return
        # Lets add this one to our current one
        newfuz = FuzzyJunction(inchar, inleft, inright, indir)
        self.add_fuzzy_junction(newfuz)

    def add_fuzzy_junction(self, newfuz):
        # print 'add fuzzy'
        mergeleft = self.left.merge(newfuz.left)
        mergeleft.set_payload(self.left.get_payload())
        mergeright = self.right.merge(newfuz.right)
        mergeright.set_payload(self.right.get_payload())
        for j1 in newfuz.left.get_payload()["junc"]:
            mergeleft.get_payload()["junc"].append(j1)
        for j2 in newfuz.right.get_payload()["junc"]:
            mergeright.get_payload()["junc"].append(j2)
        # fix the starts
        if newfuz.left.get_payload()["start"] and not self.left.get_payload()["start"]:
            mergeleft.get_payload()["start"] = newfuz.left.get_payload()["start"]
        elif newfuz.left.get_payload()["start"] and self.left.get_payload()["start"]:
            newrange = self.left.get_payload()["start"].merge(newfuz.left.get_payload()["start"])
            newrange.set_payload([])
            for s in self.left.get_payload()["start"].get_payload():
                newrange.get_payload().append(s)
            for s in newfuz.left.get_payload()["start"].get_payload():
                newrange.get_payload().append(s)
            mergeleft.get_payload()["start"] = newrange
            # print 'update left starts'
        # fix the ends
        if newfuz.right.get_payload()["end"] and not self.right.get_payload()["end"]:
            mergeright.get_payload()["end"] = newfuz.right.get_payload()["end"]
        elif newfuz.right.get_payload()["end"] and self.right.get_payload()["end"]:
            newrange = newfuz.right.get_payload()["end"].merge(self.right.get_payload()["end"])
            newrange.set_payload([])
            for s in self.right.get_payload()["end"].get_payload():
                newrange.get_payload().append(s)
            for s in newfuz.right.get_payload()["end"].get_payload():
                newrange.get_payload().append(s)
            mergeright.get_payload()["end"] = newrange
            # print 'update right ends'
        # We finished the changes
        self.left = mergeleft
        self.right = mergeright
예제 #5
0
class FuzzyGenePred:
    #set use_dir true if you want to use direction and make it direction specific
    #set proper_set false if you want to do awesome extending that doesn't really work yet
    def __init__(self, ingpd=None, params=None, juntol=10):
        # Here is the basic data
        self.fuzzy_junctions = []
        self.gpds = []  #contributing member genepreds
        self.start = None
        self.end = None
        self.dir = None
        # Higher level data
        self.simple_junction_set = set(
        )  # quickly search for if a multi exon gene has been added
        #Here is the parameters
        self.params = {}
        self.params['use_dir'] = False
        self.params['junction_tolerance'] = juntol
        #Not fully implemented.  Do we require a full length match
        self.params['proper_set'] = True
        # Define thresholds for overlapping single exons
        self.params['do_add_single_exon'] = True
        self.params['single_exon_minimum_length'] = 200
        self.params[
            'single_exon_minimum_overlap_fraction'] = 0.8  #reciprocal ... must be this fraction or more on both
        self.params[
            'single_exon_minimum_overlap_bases'] = 1  #minimum number of bases
        self.params['single_exon_maximum_endpoint_distance'] = 1000
        if params:
            for pname in params:
                self.params[pname] = params[pname]
        if ingpd:
            self.add_gpd(ingpd)

    def get_genepred_line(self,
                          end_select='extremes',
                          junction_select='mode',
                          name=None):
        if not name:
            name = 'fuzGPD_' + random_string(8) + '_' + str(
                len(self.fuzzy_junctions) + 1) + '_' + str(len(self.gpds))
        ostr = ''
        ostr += name + "\t"
        ostr += name + "\t"
        ostr += self.start.chr + "\t"
        ostr += self.gpds[0].value('strand') + "\t"
        ostr += str(self.start.start - 1) + "\t"
        ostr += str(self.end.end) + "\t"
        ostr += str(self.start.start - 1) + "\t"
        ostr += str(self.end.end) + "\t"
        ostr += str(len(self.fuzzy_junctions) + 1) + "\t"
        exonstarts = []
        exonends = []
        exonstarts.append(self.start.start - 1)
        for j in self.fuzzy_junctions:
            exonends.append(mode(j.left.get_payload()['junc']))
            exonstarts.append(mode(j.right.get_payload()['junc']) - 1)
        exonends.append(self.end.end)
        ostr += ','.join([str(x) for x in exonstarts]) + ',' + "\t"
        ostr += ','.join([str(x) for x in exonends]) + ','
        return ostr

    # Return a copy of the fuzzy geneprep
    def copy(self):
        g = FuzzyGenePred()  # start with a blank one why not
        # get the settings
        for pname in self.params:
            g.params[pname] = self.params[pname]
        # copy the genepreds
        for orig in self.gpds:
            g.gpds.append(GenePredEntry(orig.get_line()))
        #store direction
        g.dir = self.dir
        # copy the fuzzy junctions
        for orig in self.fuzzy_junctions:
            g.fuzzy_junctions.append(orig.copy())
        # copy the simple junction set
        for orig in self.simple_junction_set:
            g.simple_junction_set.add(orig)
        # copy the start
        if self.start:
            g.start = Bed(self.start.chr,\
                          self.start.start-1,\
                          self.start.end,\
                          self.start.direction)
            g.start.set_payload([])
            for v in self.start.get_payload():
                g.start.get_payload().append(v)
        # copy the end
        if self.end:
            g.end = Bed(self.end.chr, self.end.start - 1, self.end.end,
                        self.end.direction)
            g.end.set_payload([])
            for v in self.end.get_payload():
                g.end.get_payload().append(v)
        return g

    def exon_count(self):
        return len(self.fuzzy_junctions) + 1

    def gpd_count(self):
        return len(self.gpds)

    def get_bed(self):
        return Bed(self.start.chr, self.start.start - 1, self.end.end,
                   self.start.direction)

    #This is an inspection tool for a fuzzy gpd
    def get_info_string(self):
        ostr = ''
        ostr += "== FUZZY GENEPRED INFO ==" + "\n"
        ostr += str(len(self.gpds)) + ' total GPDs' + "\n"
        totalbounds = Bed(self.start.chr, self.start.start - 1, self.end.end,
                          self.start.direction)
        ostr += totalbounds.get_range_string() + " total bounds\n"
        ostr += '---- start ----' + "\n"
        ostr += str(len(
            self.start.get_payload())) + " reads supporting start" + "\n"
        ostr += '  ' + str(mean(self.start.get_payload())) + ' mean' + "\n"
        ostr += '  ' + str(mode(self.start.get_payload())) + ' mode' + "\n"
        ostr += '  ' + self.start.get_range_string() + " start range\n"
        ostr += '---- end ----' + "\n"
        ostr += str(len(
            self.end.get_payload())) + " reads supporting end" + "\n"
        ostr += '  ' + str(mean(self.end.get_payload())) + ' mean' + "\n"
        ostr += '  ' + str(mode(self.end.get_payload())) + ' mode' + "\n"
        ostr += '  ' + self.end.get_range_string() + " end range\n"
        ostr += '---- junctions ----' + "\n"
        ostr += str(len(
            self.fuzzy_junctions)) + ' total fuzzy junctions' + "\n"
        cnt = 0
        for j in self.fuzzy_junctions:
            cnt += 1
            ostr += '  ' + str(cnt) + '. ' + str(
                mode(j.left.get_payload()['junc'])) + " ^ " + str(
                    mode(j.right.get_payload()['junc'])) + "\n"
            ostr += "     " + j.left.get_range_string(
            ) + " ^ " + j.right.get_range_string() + "\n"
            ostr += "     " + str(len(
                j.left.get_payload()['junc'])) + " read support" + "\n"
            if j.left.get_payload()['start']:
                ostr += "       " + "---starts----" + "\n"
                ostr += "       " + str(
                    len(j.left.get_payload()['start'].get_payload())
                ) + " starts at " + j.left.get_payload(
                )['start'].get_range_string() + "\n"
            if j.right.get_payload()['end']:
                ostr += "       " + "---ends----" + "\n"
                ostr += "       " + str(
                    len(j.right.get_payload()['end'].get_payload())
                ) + " ends at " + j.right.get_payload(
                )['end'].get_range_string() + "\n"
        return ostr

    #Add a new gpd return true if successful
    #Return false if it didn't work, return the new combined if it worked
    def add_gpd(self, ingpd):
        if len(self.gpds) == 0:  # first one
            self.read_first(ingpd)
            return self  #return ourself if we are adding our first
        # more difficult situation where we must try to combine
        # See if it can match first before actually adding stuff to it
        #if self.
        newfuz = FuzzyGenePred(ingpd, params=self.params)
        output = self.add_fuzzy_gpd(newfuz)
        return output

    # combine together compatible overlapping sets
    def concat_fuzzy_gpd(self, fuz2):
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0:
            return False
        if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0:
            return False
        # Lets work combine the single exon step and exit
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0:
            return self.do_add_single_exon_fuzzy_gpd(fuz2)
        # For now don't add them if one is single exon
        if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0:
            return False

        # See if its already a subset
        easy_subset = False
        for simplejunction in fuz2.simple_junction_set:
            if simplejunction in self.simple_junction_set:
                easy_subset = True
        # If its not already a subset look deeper
        #1. First we need perfect junctions for a run of them
        if not easy_subset:
            if not self.compatible_overlap(fuz2): return False
        # still here. we will work on combining these
        output = self.copy()
        # first lets put add any overlapping junctions
        for i in range(0, len(output.fuzzy_junctions)):
            for j in range(0, len(fuz2.fuzzy_junctions)):
                if output.fuzzy_junctions[i].overlaps(
                        fuz2.fuzzy_junctions[j],
                        fuz2.params['junction_tolerance']):
                    output.fuzzy_junctions[i].add_fuzzy_junction(
                        fuz2.fuzzy_junctions[j])
                    if j == 0:  # put the start in too
                        if not output.fuzzy_junctions[i].left.get_payload(
                        )['start']:
                            output.fuzzy_junctions[i].left.get_payload(
                            )['start'] = fuz2.start.copy()
                        else:  # merge
                            starts = output.fuzzy_junctions[
                                i].left.get_payload()['start'].get_payload()
                            for v in fuz2.start.get_payload():
                                starts.append(v)
                            nrange = output.fuzzy_junctions[
                                i].left.get_payload()['start'].merge(
                                    fuz2.start)
                            nrange.set_payload(starts[:])
                            output.fuzzy_junctions[i].left.get_payload(
                            )['start'] = nrange
                    if j == len(
                            fuz2.fuzzy_junctions) - 1:  # put the end in too
                        if not output.fuzzy_junctions[i].right.get_payload(
                        )['end']:
                            output.fuzzy_junctions[i].right.get_payload(
                            )['end'] = fuz2.end.copy()
                        else:  # merge
                            ends = output.fuzzy_junctions[i].right.get_payload(
                            )['end'].get_payload()
                            for v in fuz2.end.get_payload():
                                ends.append(v)
                            nrange = output.fuzzy_junctions[
                                i].right.get_payload()['end'].merge(fuz2.end)
                            nrange.set_payload(ends[:])
                            output.fuzzy_junctions[i].right.get_payload(
                            )['end'] = nrange
        # see if we should build onto the left
        leftnum = -1
        leftmost = self.fuzzy_junctions[0]
        if fuz2.fuzzy_junctions[0].right.end < leftmost.left.start:
            for i in range(0, len(fuz2.fuzzy_junctions)):
                if fuz2.fuzzy_junctions[i].overlaps(
                        leftmost, fuz2.params['junction_tolerance']):
                    leftnum = i
                    break
        #leftnum is now -1 if no additions to the left zero if it starts on the same
        if leftnum > 0:
            for i in reversed(range(0, leftnum)):
                output.fuzzy_junctions.insert(0,
                                              fuz2.fuzzy_junctions[i].copy())
            output.start = fuz2.start.copy()
        rightnum = -1  # get the right point ... our first one comes after this
        rightmost = self.fuzzy_junctions[-1]
        if fuz2.fuzzy_junctions[-1].left.start > rightmost.right.end:
            for i in reversed(range(0, len(fuz2.fuzzy_junctions))):
                if fuz2.fuzzy_junctions[i].overlaps(
                        rightmost, fuz2.params['junction_tolerance']):
                    rightnum = i
                    break
        if rightnum != -1:
            rightnum += 1
            if rightnum < len(fuz2.fuzzy_junctions):
                for i in range(rightnum, len(fuz2.fuzzy_junctions)):
                    output.fuzzy_junctions.append(
                        fuz2.fuzzy_junctions[i].copy())
                output.end = fuz2.end.copy()
        #print leftnum
        #print rightnum
        #print fuz2.params['junction_tolerance']
        #print 'combining'
        return output

    # add together subsets
    def add_fuzzy_gpd(self, fuz2):
        # see if we can add this fuzzy gpd to another
        # We treat single exon genes seprately so if only one of them is
        # single exon we can't compare them
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0:
            return False
        if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0:
            return False
        # Lets work combine the single exon step and exit
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0:
            return self.do_add_single_exon_fuzzy_gpd(fuz2)

        # For now don't add them if one is single exon
        if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0:
            return False

        # See if its already a subset
        easy_subset = False
        for simplejunction in fuz2.simple_junction_set:
            if simplejunction in self.simple_junction_set:
                easy_subset = True
        # If its not already a subset look deeper
        #1. First we need perfect junctions for a run of them
        if not easy_subset:
            if not self.compatible_overlap(fuz2): return False
        # still here. we will work on combining these
        output = self.copy()
        #switch over to working on the output now

        # If we are still here we can add the two of them together
        # If they have the same starting junction we can add their starting points together
        if output.fuzzy_junctions[0].overlaps(
                fuz2.fuzzy_junctions[0], output.params['junction_tolerance']):
            #print 'samestart'
            newstart = output.start.merge(fuz2.start)
            newstart.set_payload(output.start.get_payload())
            for s in fuz2.start.get_payload():
                newstart.get_payload().append(s)
            output.start = newstart

        # Check if the other one is new start
        elif mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) < mode(
                output.fuzzy_junctions[0].left.get_payload()['junc']):
            #print "2 start"
            output.start = fuz2.start
        elif mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) > mode(
                output.fuzzy_junctions[0].left.get_payload()['junc']):
            True
        #  #print "1 start"
        #  #we're good to go
        else:
            sys.stderr.write("WARNING: strange start case abort merge\n")
            return False
        # lets work the ends now
        if output.fuzzy_junctions[-1].overlaps(
                fuz2.fuzzy_junctions[-1], output.params['junction_tolerance']):
            #print 'sameend'
            newend = output.end.merge(fuz2.end)
            newend.set_payload(output.end.get_payload())
            for s in fuz2.end.get_payload():
                newend.get_payload().append(s)
            output.end = newend

        # Check if the other one is new start
        elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc']) > mode(
                output.fuzzy_junctions[-1].right.get_payload()['junc']):
            #print "2 end"
            output.end = fuz2.end
        elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc']) < mode(
                output.fuzzy_junctions[-1].right.get_payload()['junc']):
            True
        #  #print "1 end"
        #  #we're good to go
        else:
            sys.stderr.write("WARNING: strange end case abort merge\n")
            u1 = mode(output.fuzzy_junctions[-1].left.get_payload()['junc'])
            u2 = mode(fuz2.fuzzy_junctions[-1].left.get_payload()['junc'])
            v1 = mode(output.fuzzy_junctions[-1].right.get_payload()['junc'])
            v2 = mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc'])
            sys.stderr.write(str(u1) + "\t" + str(u2) + "\n")
            sys.stderr.write(str(v1) + "\t" + str(v2) + "\n")
            return False
        # now the starts and ends have been updated in output.
        # iterate through the junctions.
        # check for a left overhang.
        numfuz2left = 0
        numoutleft = 0
        if not output.fuzzy_junctions[0].overlaps(
                fuz2.fuzzy_junctions[0], output.params['junction_tolerance']):
            # see if we need to add sequences from fuz2
            if mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) < mode(
                    output.fuzzy_junctions[0].left.get_payload()['junc']):
                #print 'left over2'
                i = 0
                while not output.fuzzy_junctions[0].overlaps(
                        fuz2.fuzzy_junctions[i],
                        output.params['junction_tolerance']) and i < len(
                            fuz2.fuzzy_junctions):
                    i += 1
                numfuz2left = i  # number to push on from the fuz2 and increment in
                #print numfuz2left
            elif mode(
                    fuz2.fuzzy_junctions[0].left.get_payload()['junc']) > mode(
                        output.fuzzy_junctions[0].left.get_payload()['junc']):
                #print 'left over1'
                i = 0
                while not output.fuzzy_junctions[i].overlaps(
                        fuz2.fuzzy_junctions[0],
                        output.params['junction_tolerance']) and i < len(
                            output.fuzzy_junctions):
                    i += 1
                numoutleft = i  # number to increment in from output
                #print numoutleft
            else:
                sys.stderr.write("WARNING: strange case \n")
                return False
        # next we can check how long we have a run of the same
        ind1 = numoutleft
        ind2 = numfuz2left
        overlap_size = 0
        while ind1 < len(output.fuzzy_junctions) and ind2 < len(fuz2.fuzzy_junctions) \
          and output.fuzzy_junctions[ind1].overlaps(fuz2.fuzzy_junctions[ind2],output.params['junction_tolerance']):
            overlap_size += 1
            ind1 += 1
            ind2 += 1
        #print 'overlap size '+str(overlap_size)
        numoutright = len(output.fuzzy_junctions) - overlap_size - numoutleft
        numfuz2right = len(fuz2.fuzzy_junctions) - overlap_size - numfuz2left
        if min(numoutright, numfuz2right) != 0:
            sys.stderr.write("WARNING: expected one of them to be zero\n")
            #print self.get_info_string()
            #print '====================='
            #print fuz2.get_info_string()
            #sys.exit()
            return False
        if min(numoutleft, numfuz2left) != 0:
            sys.stderr.write("WARNING: expected one of them to be zero\n")
            return False
        #print numoutright
        #print numfuz2right
        #print output.fuzzy_junctions[numoutleft].overlaps(fuz2.fuzzy_junctions[numfuz2left],output.junction_tolerance)
        #print 'add'
        #Now we have what we need to go through and do some updating
        #Lets just make new fuzzy junctions
        newjuncs = []
        for i in range(0, numfuz2left):
            newjuncs.append(fuz2.fuzzy_junctions[i])
        for i in range(0, numoutleft):
            newjuncs.append(output.fuzzy_junctions[i])
        #Now we do both down the center
        range1 = range(numoutleft, overlap_size + numoutleft)
        range2 = range(numfuz2left, overlap_size + numfuz2left)
        for i in range(0, len(range1)):
            newjuncs.append(output.fuzzy_junctions[range1[i]])
            newjuncs[-1].add_fuzzy_junction(fuz2.fuzzy_junctions[range2[i]])
            #print i
        #Make the right size
        for i in range(overlap_size + numfuz2left,
                       overlap_size + numfuz2left + numfuz2right):
            newjuncs.append(fuz2.fuzzy_junctions[i])
        for i in range(overlap_size + numoutleft,
                       overlap_size + numoutleft + numoutright):
            newjuncs.append(output.fuzzy_junctions[i])
        output.fuzzy_junctions = newjuncs
        #print 'adding gpd '+str(len(fuz2.gpds))+' entries'
        for g in fuz2.gpds:
            output.gpds.append(g)
            sjun = get_simple_junction(g)
            if sjun:
                output.simple_junction_set.add(sjun)
        #print 'new entry'
        #print self.get_info_string()
        return output

    def do_add_single_exon_fuzzy_gpd(self, fuz2):
        if not self.params['do_add_single_exon']:
            return False  # make sure we are allowed to be doing this
        #build the bounds from the average start and end
        s1 = mean(self.start.get_payload())
        e1 = mean(self.end.get_payload())
        s2 = mean(fuz2.start.get_payload())
        e2 = mean(fuz2.end.get_payload())
        l1 = e1 - s1 + 1
        l2 = e2 - s2 + 1
        if l1 < self.params['single_exon_minimum_length']:
            return False
        if l2 < self.params['single_exon_minimum_length']:
            return False
        if l1 < 1 or l2 < 1: return False  #shouldn't happen
        chr1 = self.start.chr
        chr2 = self.end.chr
        if chr1 != chr2: return False  #shouldn't happen
        r1 = Bed(chr1, s1 - 1, e1, self.dir)
        r2 = Bed(chr2, s2 - 1, e2, self.dir)
        over = r1.overlap_size(r2)
        if over < self.params['single_exon_minimum_overlap_bases']:
            return False
        #print r1.get_range_string()
        #print r2.get_range_string()
        cov = min(float(over) / float(l1), float(over) / float(l2))
        if cov < self.params['single_exon_minimum_overlap_fraction']:
            return False
        if abs(e1 - e2) > self.params['single_exon_maximum_endpoint_distance']:
            return False
        if abs(s1 - s2) > self.params['single_exon_maximum_endpoint_distance']:
            return False
        #If we're still here, we can add result
        output = self.copy()
        newstart = output.start.merge(fuz2.start)
        newstart.set_payload([])
        for s in output.start.get_payload():
            newstart.get_payload().append(s)
        for s in fuz2.start.get_payload():
            newstart.get_payload().append(s)
        newend = output.end.merge(fuz2.end)
        newend.set_payload([])
        for e in output.end.get_payload():
            newend.get_payload().append(e)
        for e in fuz2.end.get_payload():
            newend.get_payload().append(e)
        output.start = newstart
        output.end = newend
        for gpd in fuz2.gpds:
            output.gpds.append(gpd)
            sjun = get_simple_junction(gpd)
            if sjun:
                output.simple_junction_set.add(gpd)
        return output

    #Return true if these fuzzy genepreds can be added together
    def compatible_overlap(self, fingpd):
        f1 = self
        f2 = fingpd

        #### Forget about trying zero exon cases for now
        if len(f1.fuzzy_junctions) == 0 or len(f2.fuzzy_junctions) == 0:
            return False

        #Find all matches
        matches = []
        for i in range(0, len(f1.fuzzy_junctions)):
            for j in range(0, len(f2.fuzzy_junctions)):
                if f1.fuzzy_junctions[i].overlaps(
                        f2.fuzzy_junctions[j],
                        self.params['junction_tolerance']):
                    matches.append([i, j])

        # This is our matched junctions in f1 and f2
        if len(matches) == 0:
            return False  # Nothing matched.. certainly no overlap

        # This is the number of extra exons it would take in the middle of the run (shifts)
        if len(set([x[0] - x[1] for x in matches])) != 1: return False

        # Lets make sure all our exons are consecutive
        if len(matches) > 1:
            consec1 = list(
                set([
                    matches[i + 1][0] - matches[i][0]
                    for i in range(0,
                                   len(matches) - 1)
                ]))
            consec2 = list(
                set([
                    matches[i + 1][1] - matches[i][1]
                    for i in range(0,
                                   len(matches) - 1)
                ]))
            if len(consec1) != 1: return False
            if len(consec2) != 1: return False
            if consec1[0] != 1: return False
            if consec2[0] != 1: return False
        # one of them should be zero
        if not (matches[0][1] == 0 or matches[0][0] == 0):
            return False

        # and one of our last matches should be the last junction
        if not (len(f1.fuzzy_junctions) - 1 == matches[-1][0]
                or len(f2.fuzzy_junctions) - 1 == matches[-1][1]):
            return False

        #### most of the time we will probably be looking for a proper set
        #### unless we are extending the long read for isoform prediction
        if self.params['proper_set']:
            # check those last overhangs
            # one of the two needs to have the start and end points in the consecutive matches
            if (matches[0][0] == 0 and len(f1.fuzzy_junctions)-1 == matches[-1][0]) or \
               (matches[0][1] == 0 and len(f2.fuzzy_junctions)-1 == matches[-1][1]):
                return True
            return False

        return True

    def read_first(self, ingpd):
        self.gpds.append(ingpd)
        sjun = get_simple_junction(ingpd)
        if sjun:
            self.simple_junction_set.add(sjun)
        if self.params['use_dir']: self.dir = ingpd.value('strand')
        # add fuzzy junctions
        chr = ingpd.value('chrom')
        for i in range(0, len(ingpd.value('exonStarts')) - 1):
            self.fuzzy_junctions.append(
                FuzzyJunction(chr,
                              ingpd.value('exonEnds')[i],
                              ingpd.value('exonStarts')[i + 1] + 1, self.dir))
        if len(ingpd.value('exonStarts')) > 1:  # we have junctions
            self.fuzzy_junctions[0].left.get_payload()['start'] = Bed(
                chr, ingpd.value('txStart'),
                ingpd.value('txStart') + 1, self.dir)
            self.fuzzy_junctions[0].left.get_payload()['start'].set_payload([])
            self.fuzzy_junctions[0].left.get_payload()['start'].get_payload(
            ).append(ingpd.value('txStart') + 1)
            self.fuzzy_junctions[-1].right.get_payload()['end'] = Bed(
                chr,
                ingpd.value('txEnd') - 1, ingpd.value('txEnd'), self.dir)
            self.fuzzy_junctions[-1].right.get_payload()['end'].set_payload([])
            self.fuzzy_junctions[-1].right.get_payload()['end'].get_payload(
            ).append(ingpd.value('txEnd'))
        # add fuzzy starts
        self.start = Bed(ingpd.value('chrom'), ingpd.value('txStart'),
                         ingpd.value('txStart') + 1, self.dir)
        self.start.set_payload([])
        self.start.get_payload().append(ingpd.value('txStart') + 1)
        self.end = Bed(ingpd.value('chrom'),
                       ingpd.value('txEnd') - 1, ingpd.value('txEnd'),
                       self.dir)
        self.end.set_payload([])
        self.end.get_payload().append(ingpd.value('txEnd'))
        # Have finished reading in the first case

    # Pre: another fuzzy gpd
    # Post: True if they are all overlapping junctions
    def is_equal_fuzzy(self, fuz2, use_direction=False):
        if use_direction:
            if self.dir != fuz2.dir: return False
        if len(self.fuzzy_junctions) < 0: return False
        if len(fuz2.fuzzy_junctions) < 0: return False
        if len(self.fuzzy_junctions) != len(fuz2.fuzzy_junctions):
            return False
        for i in range(0, len(self.fuzzy_junctions)):
            if not self.fuzzy_junctions[i].overlaps(
                    fuz2.fuzzy_junctions[i],
                    self.params['junction_tolerance']):
                return False
        return True
예제 #6
0
class FuzzyJunction:
    # Pre: inleft is 1-indexed last exonic base on the left
    #      inright is 1-indexed first exonic base on the right
    #      direction doesn't need to be used
    def __init__(self, inchr=None, inleft=None, inright=None, indir=None):
        self.chr = inchr
        self.left = None  #range with payloads being the actual left and rights
        self.right = None
        self.dir = indir
        if inchr and inleft and inright:
            self.add_junction(inchr, inleft, inright, indir)

    def copy(self):
        newjunc = FuzzyJunction()
        newjunc.chr = self.chr
        newjunc.left = Bed(self.left.chr,\
                           self.left.start-1,\
                           self.left.end,\
                           self.left.direction)
        t1 = {}
        t1['junc'] = []
        t1['start'] = None
        newjunc.left.set_payload(t1)
        for j in self.left.get_payload()['junc']:
            newjunc.left.get_payload()['junc'].append(j)
        newjunc.right = Bed(self.right.chr, self.right.start - 1,
                            self.right.end, self.right.direction)
        #copy any starts for the junction
        if self.left.get_payload()['start']:
            ls = self.left.get_payload()['start']
            newjunc.left.get_payload()['start'] = Bed(ls.chr, ls.start - 1,
                                                      ls.end, ls.direction)
            newjunc.left.get_payload()['start'].set_payload([])
            for p in self.left.get_payload()['start'].get_payload():
                newjunc.left.get_payload()['start'].get_payload().append(p)
        t2 = {}
        t2['junc'] = []
        t2['end'] = None
        newjunc.right.set_payload(t2)
        for j in self.right.get_payload()['junc']:
            newjunc.right.get_payload()['junc'].append(j)
        #copy any ends for the junction
        if self.right.get_payload()['end']:
            ren = self.right.get_payload()['end']
            newjunc.right.get_payload()['end'] = Bed(ren.chr, ren.start - 1,
                                                     ren.end, ren.direction)
            newjunc.right.get_payload()['end'].set_payload([])
            for p in self.right.get_payload()['end'].get_payload():
                newjunc.right.get_payload()['end'].get_payload().append(p)
        return newjunc

    # return chr, and the left and right mode as an array
    def get_mode(self):
        m1 = mode(self.left.get_payload()['junc'])
        m2 = mode(self.right.get_payload()['junc'])
        return [
            Bed(self.chr, m1 - 1, m1, self.dir),
            Bed(self.chr, m2 - 1, m2, self.dir)
        ]

    # Find the mode of the junction and see if this overlaps
    def overlaps(self, fjun2, juntol):
        m1 = self.get_mode()
        m2 = fjun2.get_mode()
        if m1[0].chr != m2[0].chr: return False
        if m1[0].direction != m2[0].direction:
            return False  # usually they are both off
        if not m1[0].overlaps_with_padding(m2[0], juntol): return False
        if not m1[1].overlaps_with_padding(m2[1], juntol): return False
        return True

    #Right now assumes these are overlap verified prior to calling
    def add_junction(self, inchr, inleft, inright, indir=None):
        if not self.left:  # this is our first one
            t1 = {}
            t1['junc'] = []
            t1['start'] = None
            self.left = Bed(inchr, inleft - 1, inleft, indir)
            self.left.set_payload(t1)
            self.left.get_payload()['junc'].append(inleft)
            self.right = Bed(inchr, inright - 1, inright, indir)
            t2 = {}
            t2['junc'] = []
            t2['end'] = None
            self.right = Bed(inchr, inright - 1, inright, indir)
            self.right.set_payload(t2)
            self.right.get_payload()['junc'].append(inright)
            return
        #Lets add this one to our current one
        newfuz = FuzzyJunction(inchar, inleft, inright, indir)
        self.add_fuzzy_junction(newfuz)

    def add_fuzzy_junction(self, newfuz):
        #print 'add fuzzy'
        mergeleft = self.left.merge(newfuz.left)
        mergeleft.set_payload(self.left.get_payload())
        mergeright = self.right.merge(newfuz.right)
        mergeright.set_payload(self.right.get_payload())
        for j1 in newfuz.left.get_payload()['junc']:
            mergeleft.get_payload()['junc'].append(j1)
        for j2 in newfuz.right.get_payload()['junc']:
            mergeright.get_payload()['junc'].append(j2)
        #fix the starts
        if newfuz.left.get_payload(
        )['start'] and not self.left.get_payload()['start']:
            mergeleft.get_payload()['start'] = newfuz.left.get_payload(
            )['start']
        elif newfuz.left.get_payload()['start'] and self.left.get_payload(
        )['start']:
            newrange = self.left.get_payload()['start'].merge(
                newfuz.left.get_payload()['start'])
            newrange.set_payload([])
            for s in self.left.get_payload()['start'].get_payload():
                newrange.get_payload().append(s)
            for s in newfuz.left.get_payload()['start'].get_payload():
                newrange.get_payload().append(s)
            mergeleft.get_payload()['start'] = newrange
            #print 'update left starts'
        #fix the ends
        if newfuz.right.get_payload(
        )['end'] and not self.right.get_payload()['end']:
            mergeright.get_payload()['end'] = newfuz.right.get_payload()['end']
        elif newfuz.right.get_payload()['end'] and self.right.get_payload(
        )['end']:
            newrange = newfuz.right.get_payload()['end'].merge(
                self.right.get_payload()['end'])
            newrange.set_payload([])
            for s in self.right.get_payload()['end'].get_payload():
                newrange.get_payload().append(s)
            for s in newfuz.right.get_payload()['end'].get_payload():
                newrange.get_payload().append(s)
            mergeright.get_payload()['end'] = newrange
            #print 'update right ends'
        # We finished the changes
        self.left = mergeleft
        self.right = mergeright