def get_loci(transcripts_genepred):
    """Group the transcripts of a genepred file into loci.

    Every line that does not start with '#' is parsed as a GenePredEntry
    and wrapped in its own single-member Locus spanning txStart..txEnd,
    with the transcript name as the range payload.  ``Loci.update_loci()``
    is then called to regroup them (the progress message describes this
    step as organising the data into overlapping loci).

    Returns a two-item list ``[locus2name, name2locus]``: ``locus2name``
    maps a 1-based locus number to the set of transcript names in that
    locus, and ``name2locus`` maps each transcript name to its locus number.
    """
    collection = Loci()
    collection.verbose = True
    with open(transcripts_genepred) as handle:
        for record in handle:
            if record[0] == '#':
                continue  # comment / header line
            entry = GenePredEntry(record.rstrip())
            span = Bed(entry.value('chrom'), entry.value('txStart'),
                       entry.value('txEnd'))
            span.set_payload(entry.value('name'))
            single = Locus()
            single.add_member(span)
            collection.add_locus(single)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(collection.loci)) + " loci\n")
    collection.update_loci()
    sys.stderr.write("Ended with " + str(len(collection.loci)) + " loci\n")

    locus2name = {}
    name2locus = {}
    # Locus numbers are 1-based and follow the order of collection.loci.
    for number, locus in enumerate(collection.loci, 1):
        for member in locus.members:
            name = member.get_payload()
            locus2name.setdefault(number, set()).add(name)
            name2locus[name] = number
    return [locus2name, name2locus]
 def read_first(self, ingpd):
     """Seed this fuzzy genepred from its first GenePredEntry.

     Records the entry, its simple junction (when there is one) and,
     if ``params["use_dir"]`` is set, its strand.  One FuzzyJunction is
     created per adjacent exon pair, and single-base start/end Beds are
     created whose payload lists hold the observed ``txStart + 1`` and
     ``txEnd``.  When junctions exist, the first junction's left side
     and the last junction's right side also get such start/end Beds.
     """
     self.gpds.append(ingpd)
     simple = get_simple_junction(ingpd)
     if simple:
         self.simple_junction_set.add(simple)
     if self.params["use_dir"]:
         self.dir = ingpd.value("strand")

     chrom = ingpd.value("chrom")
     exon_starts = ingpd.value("exonStarts")
     exon_ends = ingpd.value("exonEnds")
     tx_start = ingpd.value("txStart")
     tx_end = ingpd.value("txEnd")

     # one fuzzy junction between each pair of neighbouring exons
     for idx in range(len(exon_starts) - 1):
         junction = FuzzyJunction(chrom, exon_ends[idx], exon_starts[idx + 1] + 1, self.dir)
         self.fuzzy_junctions.append(junction)

     if len(exon_starts) > 1:  # we have junctions
         # the outermost junction sides remember the transcript boundaries
         first_left = self.fuzzy_junctions[0].left.get_payload()
         first_left["start"] = Bed(chrom, tx_start, tx_start + 1, self.dir)
         first_left["start"].set_payload([])
         first_left["start"].get_payload().append(tx_start + 1)
         last_right = self.fuzzy_junctions[-1].right.get_payload()
         last_right["end"] = Bed(chrom, tx_end - 1, tx_end, self.dir)
         last_right["end"].set_payload([])
         last_right["end"].get_payload().append(tx_end)

     # fuzzy start and end of the whole transcript
     self.start = Bed(chrom, tx_start, tx_start + 1, self.dir)
     self.start.set_payload([])
     self.start.get_payload().append(tx_start + 1)
     self.end = Bed(chrom, tx_end - 1, tx_end, self.dir)
     self.end.set_payload([])
     self.end.get_payload().append(tx_end)
def get_loci(transcripts_genepred):
    """Read a genepred file and number the loci its transcripts fall into.

    Lines starting with '#' are skipped.  Each remaining line becomes a
    one-member Locus (chrom, txStart, txEnd, payload = transcript name)
    added to a verbose Loci container, after which ``update_loci()`` is
    called on the container.  Progress is reported on stderr.

    Returns ``[locus2name, name2locus]`` where locus numbers start at 1:
    ``locus2name[number]`` is the set of transcript names in that locus and
    ``name2locus[name]`` is the locus number of a transcript.
    """
    loci = Loci()
    loci.verbose = True
    with open(transcripts_genepred) as inf:
        for raw in inf:
            if raw[0] != '#':
                gpd = GenePredEntry(raw.rstrip())
                chrom = gpd.value('chrom')
                tx_start = gpd.value('txStart')
                tx_end = gpd.value('txEnd')
                rng = Bed(chrom, tx_start, tx_end)
                rng.set_payload(gpd.value('name'))
                singleton = Locus()
                singleton.add_member(rng)
                loci.add_locus(singleton)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
    loci.update_loci()
    sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")

    locus2name = {}
    name2locus = {}
    index = 0
    for locus in loci.loci:
        index += 1
        names = [member.get_payload() for member in locus.members]
        if names:
            # a locus without members gets a number but no entry
            locus2name[index] = set(names)
        for name in names:
            name2locus[name] = index
    return [locus2name, name2locus]
Ejemplo n.º 4
0
 def get_mode(self):
     """Return the modal junction as ``[left_bed, right_bed]``.

     Each Bed is one base wide (``mode - 1`` to ``mode``), on this
     junction's chromosome and direction, where the mode is taken over
     the 'junc' observations stored in the left and right payloads.
     """
     sides = (self.left, self.right)
     modes = [mode(side.get_payload()['junc']) for side in sides]
     return [Bed(self.chr, m - 1, m, self.dir) for m in modes]
Ejemplo n.º 5
0
 def copy(self):
     """Return a duplicate of this FuzzyGenePred.

     Settings, direction and the simple-junction set are carried over;
     genepreds are rebuilt from their lines, fuzzy junctions are copied
     through their own ``copy()``, and the start/end Beds (when set) are
     rebuilt with fresh payload lists.
     """
     duplicate = FuzzyGenePred()  # start from a blank instance
     # settings
     duplicate.params.update(self.params)
     # contributing genepreds, re-parsed from their text lines
     duplicate.gpds.extend(GenePredEntry(entry.get_line()) for entry in self.gpds)
     # direction
     duplicate.dir = self.dir
     # fuzzy junctions and the simple junction lookup set
     duplicate.fuzzy_junctions.extend(junction.copy() for junction in self.fuzzy_junctions)
     duplicate.simple_junction_set.update(self.simple_junction_set)
     # start and end ranges, each with its own payload list
     for attr in ("start", "end"):
         source = getattr(self, attr)
         if not source:
             continue
         clone = Bed(source.chr, source.start - 1, source.end, source.direction)
         clone.set_payload([])
         clone.get_payload().extend(source.get_payload())
         setattr(duplicate, attr, clone)
     return duplicate
 def do_add_single_exon_fuzzy_gpd(self, fuz2):
     """Merge the single-exon fuzzy genepred ``fuz2`` into a copy of this one.

     The mean of each side's start and end observations defines its span.
     ``False`` is returned when single-exon merging is disabled in
     ``self.params``, when either span is shorter than
     ``single_exon_minimum_length``, when the two sides are on different
     chromosomes, when the overlap is smaller than
     ``single_exon_minimum_overlap_bases`` or covers less than
     ``single_exon_minimum_overlap_fraction`` of either span, or when the
     starts or the ends are more than
     ``single_exon_maximum_endpoint_distance`` apart.

     Otherwise returns ``self.copy()`` whose start/end are the merged
     ranges carrying the pooled observations of both sides, with
     ``fuz2``'s genepreds (and their simple junctions) appended.
     """
     params = self.params
     if not params["do_add_single_exon"]:
         return False  # make sure we are allowed to be doing this
     # build the bounds from the average start and end
     s1 = mean(self.start.get_payload())
     e1 = mean(self.end.get_payload())
     s2 = mean(fuz2.start.get_payload())
     e2 = mean(fuz2.end.get_payload())
     l1 = e1 - s1 + 1
     l2 = e2 - s2 + 1
     if l1 < params["single_exon_minimum_length"]:
         return False
     if l2 < params["single_exon_minimum_length"]:
         return False
     if l1 < 1 or l2 < 1:
         return False  # shouldn't happen
     chr1 = self.start.chr
     # The second chromosome has to come from fuz2; otherwise the check
     # below can never reject a candidate on another chromosome.
     chr2 = fuz2.start.chr
     if chr1 != chr2:
         return False
     r1 = Bed(chr1, s1 - 1, e1, self.dir)
     r2 = Bed(chr2, s2 - 1, e2, self.dir)
     over = r1.overlap_size(r2)
     if over < params["single_exon_minimum_overlap_bases"]:
         return False
     # reciprocal coverage: the overlap must be large relative to both spans
     cov = min(float(over) / float(l1), float(over) / float(l2))
     if cov < params["single_exon_minimum_overlap_fraction"]:
         return False
     if abs(e1 - e2) > params["single_exon_maximum_endpoint_distance"]:
         return False
     if abs(s1 - s2) > params["single_exon_maximum_endpoint_distance"]:
         return False
     # If we're still here, we can add result
     output = self.copy()
     newstart = output.start.merge(fuz2.start)
     newstart.set_payload([])
     newstart.get_payload().extend(output.start.get_payload())
     newstart.get_payload().extend(fuz2.start.get_payload())
     newend = output.end.merge(fuz2.end)
     newend.set_payload([])
     newend.get_payload().extend(output.end.get_payload())
     newend.get_payload().extend(fuz2.end.get_payload())
     output.start = newstart
     output.end = newend
     for gpd in fuz2.gpds:
         output.gpds.append(gpd)
         sjun = get_simple_junction(gpd)
         if sjun:
             # the set holds simple junctions (as read_first stores them),
             # not genepred entries
             output.simple_junction_set.add(sjun)
     return output
Ejemplo n.º 7
0
 def do_add_single_exon_fuzzy_gpd(self, fuz2):
     """Combine this single-exon fuzzy genepred with ``fuz2`` if they agree.

     Spans are built from the mean start and mean end observations of
     each side.  Returns ``False`` if ``params['do_add_single_exon']`` is
     off, if either span is shorter than
     ``params['single_exon_minimum_length']``, if the chromosomes differ,
     if the overlap is under ``params['single_exon_minimum_overlap_bases']``
     or under ``params['single_exon_minimum_overlap_fraction']`` of either
     span, or if the starts or ends differ by more than
     ``params['single_exon_maximum_endpoint_distance']``.

     On success returns a copy of ``self`` with start/end replaced by the
     merged ranges (payload = observations of both sides) and with
     ``fuz2``'s genepreds and simple junctions added.
     """
     if not self.params['do_add_single_exon']:
         return False  # make sure we are allowed to be doing this
     min_length = self.params['single_exon_minimum_length']
     max_end_distance = self.params['single_exon_maximum_endpoint_distance']
     # build the bounds from the average start and end
     s1 = mean(self.start.get_payload())
     e1 = mean(self.end.get_payload())
     s2 = mean(fuz2.start.get_payload())
     e2 = mean(fuz2.end.get_payload())
     l1 = e1 - s1 + 1
     l2 = e2 - s2 + 1
     if l1 < min_length or l2 < min_length:
         return False
     if l1 < 1 or l2 < 1:
         return False  # shouldn't happen
     chr1 = self.start.chr
     # compare against the other genepred's chromosome, not our own end
     chr2 = fuz2.start.chr
     if chr1 != chr2:
         return False
     r1 = Bed(chr1, s1 - 1, e1, self.dir)
     r2 = Bed(chr2, s2 - 1, e2, self.dir)
     over = r1.overlap_size(r2)
     if over < self.params['single_exon_minimum_overlap_bases']:
         return False
     # the overlap fraction must hold for both spans (reciprocal)
     cov = min(float(over) / float(l1), float(over) / float(l2))
     if cov < self.params['single_exon_minimum_overlap_fraction']:
         return False
     if abs(e1 - e2) > max_end_distance:
         return False
     if abs(s1 - s2) > max_end_distance:
         return False
     # If we're still here, we can add result
     output = self.copy()
     newstart = output.start.merge(fuz2.start)
     newstart.set_payload([])
     for s in output.start.get_payload():
         newstart.get_payload().append(s)
     for s in fuz2.start.get_payload():
         newstart.get_payload().append(s)
     newend = output.end.merge(fuz2.end)
     newend.set_payload([])
     for e in output.end.get_payload():
         newend.get_payload().append(e)
     for e in fuz2.end.get_payload():
         newend.get_payload().append(e)
     output.start = newstart
     output.end = newend
     for gpd in fuz2.gpds:
         output.gpds.append(gpd)
         sjun = get_simple_junction(gpd)
         if sjun:
             # store the junction itself so membership tests against
             # simple_junction_set keep working
             output.simple_junction_set.add(sjun)
     return output
 def get_info_string(self):
     """Return a multi-line, human-readable report on this fuzzy genepred.

     The report lists the number of contributing genepreds, the total
     bounds, the support / mean / mode / range of the start and end
     observations, and for every fuzzy junction its modal left and right
     positions, ranges, read support and any start/end ranges attached to
     it.
     """
     start_reads = self.start.get_payload()
     end_reads = self.end.get_payload()
     totalbounds = Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction)
     parts = [
         "== FUZZY GENEPRED INFO ==" + "\n",
         str(len(self.gpds)) + " total GPDs" + "\n",
         totalbounds.get_range_string() + " total bounds\n",
         "---- start ----" + "\n",
         str(len(start_reads)) + " reads supporting start" + "\n",
         "  " + str(mean(start_reads)) + " mean" + "\n",
         "  " + str(mode(start_reads)) + " mode" + "\n",
         "  " + self.start.get_range_string() + " start range\n",
         "---- end ----" + "\n",
         str(len(end_reads)) + " reads supporting end" + "\n",
         "  " + str(mean(end_reads)) + " mean" + "\n",
         "  " + str(mode(end_reads)) + " mode" + "\n",
         "  " + self.end.get_range_string() + " end range\n",
         "---- junctions ----" + "\n",
         str(len(self.fuzzy_junctions)) + " total fuzzy junctions" + "\n",
     ]
     for number, junction in enumerate(self.fuzzy_junctions, 1):
         left = junction.left.get_payload()
         right = junction.right.get_payload()
         parts.append(
             "  " + str(number) + ". " + str(mode(left["junc"])) + " ^ " + str(mode(right["junc"])) + "\n"
         )
         parts.append(
             "     " + junction.left.get_range_string() + " ^ " + junction.right.get_range_string() + "\n"
         )
         parts.append("     " + str(len(left["junc"])) + " read support" + "\n")
         if left["start"]:
             parts.append("       " + "---starts----" + "\n")
             parts.append(
                 "       "
                 + str(len(left["start"].get_payload()))
                 + " starts at "
                 + left["start"].get_range_string()
                 + "\n"
             )
         if right["end"]:
             parts.append("       " + "---ends----" + "\n")
             parts.append(
                 "       "
                 + str(len(right["end"].get_payload()))
                 + " ends at "
                 + right["end"].get_range_string()
                 + "\n"
             )
     return "".join(parts)
Ejemplo n.º 9
0
 def target_distance(self, psl_entry, use_direction=False):
     """Return the gap between this alignment's target span and another's.

     Returns -1 when the two entries are on different targets (``tName``)
     or, with ``use_direction`` set, on different strands.  Returns 0
     when the two target ranges overlap.  Otherwise returns the number of
     positions strictly between the two ranges, computed from the Bed
     ``start``/``end`` attributes.

     If none of those cases applies, an error is written to stderr and
     the process exits with a non-zero status.
     """
     if self.value('tName') != psl_entry.value('tName'):
         return -1
     if use_direction and self.value('strand') != psl_entry.value('strand'):
         return -1
     b1 = Bed(self.entry['tName'], self.entry['tStart'], self.entry['tEnd'])
     b2 = Bed(psl_entry.entry['tName'], psl_entry.entry['tStart'],
              psl_entry.entry['tEnd'])
     if b1.overlaps(b2):
         return 0
     if b1.end < b2.start:
         return b2.start - b1.end - 1
     if b1.start > b2.end:
         return b1.start - b2.end - 1
     sys.stderr.write("ERROR un accounted for state\n")
     # exit with a failure status so callers and shells can see the error
     sys.exit(1)
Ejemplo n.º 10
0
 def set_conversion_string(self, conversion_string):
     """Store ``conversion_string`` and derive the ARS name and bounds from it.

     The string is split on '/'; every part must have the form
     ``<chrom>,<start>-<end>|<strand>`` with integer start/end and a
     strand of '+' or '-'.  Each part becomes one Bed in ``self.bounds``.
     ``self.ars_name`` is set from ``encode_ars_name(conversion_string,
     self.name)``.

     Raises:
         ValueError: if a part does not have the expected form.  Nothing
             on ``self`` is changed in that case.
     """
     # Parse everything before touching self so that a malformed string
     # cannot leave the object half-updated.
     parsed = []
     for part in conversion_string.split('/'):
         m = re.match(r'^([^,]+),(\d+)-(\d+)\|([+-])$', part)
         if m is None:
             raise ValueError(
                 "malformed conversion string part: %r" % (part,))
         parsed.append(
             (m.group(1), int(m.group(2)), int(m.group(3)), m.group(4)))
     self.conversion_string = conversion_string
     self.ars_name = encode_ars_name(conversion_string, self.name)
     self.bounds = []
     for chrom, start, end, strand in parsed:
         self.bounds.append(Bed(chrom, start, end, strand))
Ejemplo n.º 11
0
 def get_info_string(self):
     """Describe this fuzzy genepred as a block of text.

     Sections, in order: a header with the genepred count and total
     bounds; the start observations (count, mean, mode, range); the end
     observations (same fields); and one numbered entry per fuzzy
     junction giving modal positions, ranges, read support and, when
     present, the start/end ranges stored on the junction sides.
     """
     chunks = []
     emit = chunks.append
     emit("== FUZZY GENEPRED INFO ==" + "\n")
     emit(str(len(self.gpds)) + ' total GPDs' + "\n")
     totalbounds = Bed(self.start.chr, self.start.start - 1, self.end.end,
                       self.start.direction)
     emit(totalbounds.get_range_string() + " total bounds\n")

     emit('---- start ----' + "\n")
     emit(str(len(self.start.get_payload())) + " reads supporting start" + "\n")
     emit('  ' + str(mean(self.start.get_payload())) + ' mean' + "\n")
     emit('  ' + str(mode(self.start.get_payload())) + ' mode' + "\n")
     emit('  ' + self.start.get_range_string() + " start range\n")

     emit('---- end ----' + "\n")
     emit(str(len(self.end.get_payload())) + " reads supporting end" + "\n")
     emit('  ' + str(mean(self.end.get_payload())) + ' mean' + "\n")
     emit('  ' + str(mode(self.end.get_payload())) + ' mode' + "\n")
     emit('  ' + self.end.get_range_string() + " end range\n")

     emit('---- junctions ----' + "\n")
     emit(str(len(self.fuzzy_junctions)) + ' total fuzzy junctions' + "\n")
     for position, junc in enumerate(self.fuzzy_junctions, 1):
         left_mode = mode(junc.left.get_payload()['junc'])
         right_mode = mode(junc.right.get_payload()['junc'])
         emit('  ' + str(position) + '. ' + str(left_mode) + " ^ " +
              str(right_mode) + "\n")
         emit("     " + junc.left.get_range_string() + " ^ " +
              junc.right.get_range_string() + "\n")
         emit("     " + str(len(junc.left.get_payload()['junc'])) +
              " read support" + "\n")
         start_range = junc.left.get_payload()['start']
         if start_range:
             emit("       " + "---starts----" + "\n")
             emit("       " + str(len(start_range.get_payload())) +
                  " starts at " + start_range.get_range_string() + "\n")
         end_range = junc.right.get_payload()['end']
         if end_range:
             emit("       " + "---ends----" + "\n")
             emit("       " + str(len(end_range.get_payload())) +
                  " ends at " + end_range.get_range_string() + "\n")
     return ''.join(chunks)
Ejemplo n.º 12
0
def get_beds_from_entry(entry, use_direction=False):
    """Turn the aligned blocks of a PSL entry dict into query and target Beds.

    For each of the ``entry['blockCount']`` blocks, one target Bed
    (``tName``, ``tStarts[i]`` to ``tStarts[i] + blockSizes[i]``) and one
    query Bed (``qName``, ``qStarts_actual[i]`` to
    ``qStarts_actual[i] + blockSizes[i]``) is built.  When
    ``use_direction`` is true the target Beds also carry
    ``entry['strand']``.

    Returns ``[query_beds, target_beds]``, two lists of equal length in
    block order.
    """
    query_beds = []
    target_beds = []
    for i in range(entry['blockCount']):
        size = entry['blockSizes'][i]
        t_start = entry['tStarts'][i]
        if use_direction:
            tb = Bed(entry['tName'], t_start, t_start + size,
                     entry['strand'])
        else:
            tb = Bed(entry['tName'], t_start, t_start + size)
        target_beds.append(tb)
        q_start = entry['qStarts_actual'][i]
        qb = Bed(entry['qName'], q_start, q_start + size)
        # the query list must receive the query bed, not the target bed
        query_beds.append(qb)
    return [query_beds, target_beds]
def window_break(inranges, window_size):
    """Cut each input range into consecutive windows and return them all.

    For every range, a window ``Bed(chr, start, start + window_size - 1)``
    is emitted and ``start`` advanced by ``window_size`` for as long as
    ``start + window_size`` is still below the range's ``end``; whatever
    remains at the tail of a range is dropped.  Returns a flat list of
    the windows (empty when there are no input ranges).
    """
    windows = []
    if len(inranges) > 0:
        for region in inranges:
            cursor = region.start
            while cursor + window_size < region.end:
                window = Bed(region.chr, cursor, cursor + window_size - 1)
                windows.append(window)
                cursor += window_size
    return windows
Ejemplo n.º 14
0
 def get_query_bed(self):
     """Return a Bed on the query sequence spanning this alignment.

     By default the span runs from the first actual query block start to
     the end of the last block.  For a '-' strand entry the bounds are
     instead derived from the raw ``qStarts`` values passed through
     ``convert_coordinate_query_to_actual_query``.
     """
     actual_starts = self.value('qStarts_actual')
     lower = actual_starts[0]
     upper = self.value('qStarts_actual')[-1] + self.value('blockSizes')[-1]
     if self.value('strand') == '-':
         to_actual = self.convert_coordinate_query_to_actual_query
         last_block_end = self.value('qStarts')[-1] + self.value('blockSizes')[-1]
         lower = to_actual(last_block_end) - 1
         upper = to_actual(self.value('qStarts')[0] + 1)
     return Bed(self.value('qName'), lower, upper)
 def add_junction(self, inchr, inleft, inright, indir=None):
     """Add one observed junction to this fuzzy junction.

     If nothing has been stored yet (``self.left`` is unset), the left
     and right sides are created as one-base Beds at ``inleft`` and
     ``inright``; their payload dicts hold the observed position under
     ``"junc"`` and an empty ``"start"`` / ``"end"`` slot respectively.
     Otherwise the observation is wrapped in a new FuzzyJunction and
     handed to ``add_fuzzy_junction``.
     """
     if not self.left:  # this is our first one
         self.left = Bed(inchr, inleft - 1, inleft, indir)
         self.left.set_payload({"junc": [], "start": None})
         self.left.get_payload()["junc"].append(inleft)
         self.right = Bed(inchr, inright - 1, inright, indir)
         self.right.set_payload({"junc": [], "end": None})
         self.right.get_payload()["junc"].append(inright)
         return
     # Lets add this one to our current one
     newfuz = FuzzyJunction(inchr, inleft, inright, indir)
     self.add_fuzzy_junction(newfuz)
Ejemplo n.º 16
0
 def query_overlap_size(self, psl2):
     """Return the summed overlap between the query blocks of two entries.

     Returns 0 when the entries have different ``qName`` values.
     Otherwise every block of ``self`` is compared with every block of
     ``psl2`` (query coordinates from ``qStarts_actual`` and
     ``blockSizes``) and the ``overlap_size`` of each pair is added up.
     """
     if self.value('qName') != psl2.value('qName'):
         return 0

     def block(psl, idx):
         # query-side Bed for block number idx of the given entry
         begin = psl.value('qStarts_actual')[idx]
         return Bed(psl.value('qName'), begin,
                    begin + psl.value('blockSizes')[idx])

     # on same query
     total = 0
     for mine in range(self.value('blockCount')):
         for theirs in range(psl2.value('blockCount')):
             total += block(self, mine).overlap_size(block(psl2, theirs))
     return total
Ejemplo n.º 17
0
 def target_overlap_size(self, psl2, use_direction=False):
     """Return the summed overlap between the target blocks of two entries.

     Returns 0 when the entries are on different targets (``tName``) or,
     with ``use_direction`` set, on different strands.  Otherwise the
     ``overlap_size`` of every (self block, psl2 block) pair, built from
     ``tStarts`` and ``blockSizes``, is added up.
     """
     different_target = self.value('tName') != psl2.value('tName')
     if different_target:
         return 0
     if use_direction and self.value('strand') != psl2.value('strand'):
         return 0
     # on same chromosome
     return sum(
         Bed(self.value('tName'),
             self.value('tStarts')[i],
             self.value('tStarts')[i] + self.value('blockSizes')[i]
             ).overlap_size(
                 Bed(psl2.value('tName'),
                     psl2.value('tStarts')[j],
                     psl2.value('tStarts')[j] + psl2.value('blockSizes')[j]))
         for i in range(0, self.value('blockCount'))
         for j in range(0, psl2.value('blockCount')))
Ejemplo n.º 18
0
 def copy(self):
     """Return a duplicate of this fuzzy junction.

     The chromosome and direction are carried over, and the left and
     right Beds are rebuilt with fresh payload dicts: the 'junc'
     observation lists are copied, and any 'start' (left) or 'end'
     (right) range is rebuilt with its own payload list.
     """
     def clone_with_list_payload(source):
         # rebuild a Bed whose payload is a list, giving it its own list
         twin = Bed(source.chr, source.start - 1, source.end,
                    source.direction)
         twin.set_payload([])
         twin.get_payload().extend(source.get_payload())
         return twin

     newjunc = FuzzyJunction()
     newjunc.chr = self.chr
     # get_mode() builds its Beds from self.dir, so the duplicate has to
     # carry the direction as well.
     newjunc.dir = getattr(self, 'dir', None)

     left_data = self.left.get_payload()
     newjunc.left = Bed(self.left.chr, self.left.start - 1,
                        self.left.end, self.left.direction)
     newjunc.left.set_payload({'junc': list(left_data['junc']),
                               'start': None})
     # copy any starts for the junction
     if left_data['start']:
         newjunc.left.get_payload()['start'] = clone_with_list_payload(
             left_data['start'])

     right_data = self.right.get_payload()
     newjunc.right = Bed(self.right.chr, self.right.start - 1,
                         self.right.end, self.right.direction)
     newjunc.right.set_payload({'junc': list(right_data['junc']),
                                'end': None})
     # copy any ends for the junction
     if right_data['end']:
         newjunc.right.get_payload()['end'] = clone_with_list_payload(
             right_data['end'])
     return newjunc
def main():
    """Write one artificial reference sequence per genepred line.

    Command line: ``gpd_file reference_fasta [-o OUTPUT]``.  The
    reference fasta is loaded with ``read_fasta_into_hash``; for every
    line of the genepred file an ARS is built from the entry's exons
    (one Bed per exon, with strand), named after the entry, filled from
    the reference and written as fasta to OUTPUT, or to stdout when no
    output file is given.
    """
    parser = argparse.ArgumentParser(description='Create artifical reference sequences from a genepred')
    parser.add_argument('gpd_file')
    parser.add_argument('reference_fasta')
    parser.add_argument('-o', '--output', help="output file to write to or STDOUT if not set")
    args = parser.parse_args()
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    try:
        f = read_fasta_into_hash(args.reference_fasta)
        with open(args.gpd_file) as inf:
            for line in inf:
                gpd = GenePredBasics.GenePredEntry()
                gpd.line_to_entry(line.rstrip())
                ars = ARS()
                beds = []
                for i in range(0, gpd.value('exonCount')):
                    b = Bed(gpd.value('chrom'), gpd.value('exonStarts')[i],
                            gpd.value('exonEnds')[i], gpd.value('strand'))
                    beds.append(b)
                ars.set_bounds(beds)
                ars.set_name(gpd.value('name'))
                ars.set_sequence_from_original_reference_hash(f)
                of.write(ars.get_fasta())
    finally:
        # close (and thereby flush) the output file we opened ourselves;
        # stdout is left alone
        if of is not sys.stdout:
            of.close()
Ejemplo n.º 20
0
 def add_junction(self, inchr, inleft, inright, indir=None):
     """Record a junction observation on this fuzzy junction.

     The first observation (``self.left`` not yet set) creates the left
     and right sides as one-base Beds ending at ``inleft`` / ``inright``,
     each with a payload dict: ``'junc'`` lists the observed positions,
     and ``'start'`` (left) / ``'end'`` (right) start out as None.
     Any later observation is turned into its own FuzzyJunction and
     passed to ``add_fuzzy_junction``.
     """
     if self.left:
         # Lets add this one to our current one
         newfuz = FuzzyJunction(inchr, inleft, inright, indir)
         self.add_fuzzy_junction(newfuz)
         return
     # this is our first one
     t1 = {'junc': [inleft], 'start': None}
     self.left = Bed(inchr, inleft - 1, inleft, indir)
     self.left.set_payload(t1)
     t2 = {'junc': [inright], 'end': None}
     self.right = Bed(inchr, inright - 1, inright, indir)
     self.right.set_payload(t2)
Ejemplo n.º 21
0
 def read_first(self, ingpd):
     """Initialise this fuzzy genepred from the first genepred entry.

     The entry is appended to ``self.gpds`` and its simple junction, if
     any, added to ``self.simple_junction_set``.  With
     ``params['use_dir']`` set, the entry's strand becomes ``self.dir``.
     A FuzzyJunction is added for each pair of neighbouring exons, and
     ``self.start`` / ``self.end`` become one-base Beds whose payload
     lists contain ``txStart + 1`` and ``txEnd``.  If there are
     junctions, the same kind of Bed is stored under 'start' on the
     first junction's left payload and under 'end' on the last
     junction's right payload.
     """
     self.gpds.append(ingpd)
     sjun = get_simple_junction(ingpd)
     if sjun:
         self.simple_junction_set.add(sjun)
     if self.params['use_dir']:
         self.dir = ingpd.value('strand')

     chrom = ingpd.value('chrom')

     def single_base(lower, upper, observed):
         # one-base Bed whose payload list starts with the observed position
         point = Bed(chrom, lower, upper, self.dir)
         point.set_payload([])
         point.get_payload().append(observed)
         return point

     # add fuzzy junctions
     junction_count = len(ingpd.value('exonStarts')) - 1
     for k in range(0, junction_count):
         left_edge = ingpd.value('exonEnds')[k]
         right_edge = ingpd.value('exonStarts')[k + 1] + 1
         self.fuzzy_junctions.append(
             FuzzyJunction(chrom, left_edge, right_edge, self.dir))
     if junction_count > 0:  # we have junctions
         self.fuzzy_junctions[0].left.get_payload()['start'] = single_base(
             ingpd.value('txStart'),
             ingpd.value('txStart') + 1,
             ingpd.value('txStart') + 1)
         self.fuzzy_junctions[-1].right.get_payload()['end'] = single_base(
             ingpd.value('txEnd') - 1,
             ingpd.value('txEnd'),
             ingpd.value('txEnd'))
     # add fuzzy starts
     self.start = single_base(ingpd.value('txStart'),
                              ingpd.value('txStart') + 1,
                              ingpd.value('txStart') + 1)
     self.end = single_base(ingpd.value('txEnd') - 1,
                            ingpd.value('txEnd'),
                            ingpd.value('txEnd'))
Ejemplo n.º 22
0
 def get_range(self):
     """Return the reference range covered by this alignment as a Bed.

     The range starts at ``pos - 1`` and its end is advanced by the
     ``val`` of every cigar element whose ``op`` begins with one of
     M, D, N, X or =.  The Bed is on ``rname`` with this entry's strand.
     """
     origin = self.value('pos') - 1
     consumed = sum(element['val']
                    for element in self.value('cigar_array')
                    if re.match('[MDNX=]', element['op']))
     return Bed(self.value('rname'),
                self.value('pos') - 1, origin + consumed, self.strand())
def _collect_region_depth(curr, regions, finished, depth, use_off_regions,
                          on_finished=None):
    """Fold one depth interval into the head of a sorted region list.

    Regions at the front of ``regions`` for which ``curr.cmp(region) > 0``
    (``curr`` has moved past them) are removed; unless a region collected
    no depth values and ``use_off_regions`` is false, its average is
    appended to ``finished`` and ``on_finished`` (if given) is called.
    If ``curr`` then compares equal (``cmp == 0``) with the new head
    region, ``depth`` is added to that region's payload once per
    overlapping base.
    """
    # Check for emptiness before indexing: the last region may just have
    # been retired.
    while regions and curr.cmp(regions[0]) > 0:
        passed = regions.pop(0)
        if len(passed.get_payload()) == 0 and not use_off_regions:
            continue
        # NOTE(review): the Bed itself is passed to average() here, while
        # the final flush in main() passes the payload list -- confirm
        # which form average() expects.
        finished.append(average(passed))
        if on_finished is not None:
            on_finished()
    if regions and curr.cmp(regions[0]) == 0:
        covered = curr.overlap_size(regions[0])
        regions[0].get_payload().extend([depth] * covered)


def main():
    """Collect read depth over exonic, intronic and intergenic regions.

    Command line: ``gpd_input bam_input`` plus ``--intergenic_buffer``,
    ``--window_size``, ``--bin_size``, ``--use_off_regions`` and
    ``--get_exons``.

    The genepred file is read to build per-chromosome, gene and exon
    ranges.  Intergenic regions are the chromosome ranges minus the
    genes padded by ``--intergenic_buffer``; introns are the genes minus
    the merged exons; both are cut into ``--window_size`` windows.  The
    output of ``sam_to_bed_depth.py <bam_input>`` (tab-separated chrom,
    start, end, depth) is then streamed and each depth interval is
    assigned to the region it overlaps.  Average depths are gathered per
    region class, and ``display`` is called whenever an intergenic
    region is completed.  Exon regions are only tracked with
    ``--get_exons``; with ``--use_off_regions`` regions that received no
    depth are included too.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions',
        action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()
    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            # grow one range per chromosome that spans all of its genes
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            for i in range(0, g.get_exon_count()):
                erng = Bed(g.value('chrom'),
                           g.value('exonStarts')[i],
                           g.value('exonEnds')[i])
                exon_beds.append(erng)
    avglen = float(asum) / float(atot)
    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges([chr_beds[x] for x in chr_beds.keys()])
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds,
                                      padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)
    sys.stderr.write("Going through short reads\n")
    # argument list rather than a split string, so that input paths
    # containing whitespace stay intact
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)
    for x in intron_beds:
        x.set_payload([])  # payloads are read depths
    for x in intergenic_beds:
        x.set_payload([])  # payloads are read depths
    for x in exon_beds:
        x.set_payload([])  # payloads are read depths
    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    if not args.get_exons:
        exon_beds = []
    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        if section_count % 100 == 0:
            # progress indicator, overwritten in place
            sys.stderr.write(curr.get_range_string() + "          \r")
        pseudoreadcount += depth
        _collect_region_depth(curr, exon_beds, exondepth, depth,
                              args.use_off_regions)
        _collect_region_depth(curr, intron_beds, introndepth, depth,
                              args.use_off_regions)
        _collect_region_depth(
            curr, intergenic_beds, intergenicdepth, depth,
            args.use_off_regions,
            on_finished=lambda: display(curr, introndepth, intergenicdepth,
                                        pseudoreadcount, avglen))
    if args.use_off_regions:
        # flush the regions the depth stream never moved past
        for x in exon_beds:
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()
Ejemplo n.º 24
0
 def get_target_bed(self):
     """Return a Bed for this entry's target span (tName, tStart, tEnd) with its strand."""
     name = self.value('tName')
     begin = self.value('tStart')
     finish = self.value('tEnd')
     strand = self.value('strand')
     return Bed(name, begin, finish, strand)
Ejemplo n.º 25
0
class FuzzyGenePred:
    # Set params['use_dir'] to True to make matching direction (strand) specific.
    # Set params['proper_set'] to False to allow the experimental extension mode,
    # which does not fully work yet.
    def __init__(self, ingpd=None, params=None, juntol=10):
        # Here is the basic data
        self.fuzzy_junctions = []
        self.gpds = []  #contributing member genepreds
        self.start = None
        self.end = None
        self.dir = None
        # Higher level data
        self.simple_junction_set = set(
        )  # quickly search for if a multi exon gene has been added
        #Here is the parameters
        self.params = {}
        self.params['use_dir'] = False
        self.params['junction_tolerance'] = juntol
        #Not fully implemented.  Do we require a full length match
        self.params['proper_set'] = True
        # Define thresholds for overlapping single exons
        self.params['do_add_single_exon'] = True
        self.params['single_exon_minimum_length'] = 200
        self.params[
            'single_exon_minimum_overlap_fraction'] = 0.8  #reciprocal ... must be this fraction or more on both
        self.params[
            'single_exon_minimum_overlap_bases'] = 1  #minimum number of bases
        self.params['single_exon_maximum_endpoint_distance'] = 1000
        if params:
            for pname in params:
                self.params[pname] = params[pname]
        if ingpd:
            self.add_gpd(ingpd)

    def get_genepred_line(self,
                          end_select='extremes',
                          junction_select='mode',
                          name=None):
        if not name:
            name = 'fuzGPD_' + random_string(8) + '_' + str(
                len(self.fuzzy_junctions) + 1) + '_' + str(len(self.gpds))
        ostr = ''
        ostr += name + "\t"
        ostr += name + "\t"
        ostr += self.start.chr + "\t"
        ostr += self.gpds[0].value('strand') + "\t"
        ostr += str(self.start.start - 1) + "\t"
        ostr += str(self.end.end) + "\t"
        ostr += str(self.start.start - 1) + "\t"
        ostr += str(self.end.end) + "\t"
        ostr += str(len(self.fuzzy_junctions) + 1) + "\t"
        exonstarts = []
        exonends = []
        exonstarts.append(self.start.start - 1)
        for j in self.fuzzy_junctions:
            exonends.append(mode(j.left.get_payload()['junc']))
            exonstarts.append(mode(j.right.get_payload()['junc']) - 1)
        exonends.append(self.end.end)
        ostr += ','.join([str(x) for x in exonstarts]) + ',' + "\t"
        ostr += ','.join([str(x) for x in exonends]) + ','
        return ostr

    # Return a copy of the fuzzy geneprep
    def copy(self):
        g = FuzzyGenePred()  # start with a blank one why not
        # get the settings
        for pname in self.params:
            g.params[pname] = self.params[pname]
        # copy the genepreds
        for orig in self.gpds:
            g.gpds.append(GenePredEntry(orig.get_line()))
        #store direction
        g.dir = self.dir
        # copy the fuzzy junctions
        for orig in self.fuzzy_junctions:
            g.fuzzy_junctions.append(orig.copy())
        # copy the simple junction set
        for orig in self.simple_junction_set:
            g.simple_junction_set.add(orig)
        # copy the start
        if self.start:
            g.start = Bed(self.start.chr,\
                          self.start.start-1,\
                          self.start.end,\
                          self.start.direction)
            g.start.set_payload([])
            for v in self.start.get_payload():
                g.start.get_payload().append(v)
        # copy the end
        if self.end:
            g.end = Bed(self.end.chr, self.end.start - 1, self.end.end,
                        self.end.direction)
            g.end.set_payload([])
            for v in self.end.get_payload():
                g.end.get_payload().append(v)
        return g

    def exon_count(self):
        return len(self.fuzzy_junctions) + 1

    def gpd_count(self):
        return len(self.gpds)

    def get_bed(self):
        """Return a Bed spanning from the fuzzy start to the fuzzy end."""
        begin = self.start
        return Bed(begin.chr, begin.start - 1, self.end.end, begin.direction)

    #This is an inspection tool for a fuzzy gpd
    def get_info_string(self):
        """Return a multi-line, human readable summary for inspection.

        Covers the number of member genepreds, the total bounds, the read
        support / mean / mode / range of the start and end, and for every
        fuzzy junction its mode coordinates, ranges, read support and any
        start / end ranges stored in the junction payloads.
        """
        begin = self.start
        finish = self.end
        pieces = []
        pieces.append("== FUZZY GENEPRED INFO ==" + "\n")
        pieces.append(str(len(self.gpds)) + ' total GPDs' + "\n")
        totalbounds = Bed(begin.chr, begin.start - 1, finish.end,
                          begin.direction)
        pieces.append(totalbounds.get_range_string() + " total bounds\n")

        pieces.append('---- start ----' + "\n")
        pieces.append(
            str(len(begin.get_payload())) + " reads supporting start" + "\n")
        pieces.append('  ' + str(mean(begin.get_payload())) + ' mean' + "\n")
        pieces.append('  ' + str(mode(begin.get_payload())) + ' mode' + "\n")
        pieces.append('  ' + begin.get_range_string() + " start range\n")

        pieces.append('---- end ----' + "\n")
        pieces.append(
            str(len(finish.get_payload())) + " reads supporting end" + "\n")
        pieces.append('  ' + str(mean(finish.get_payload())) + ' mean' + "\n")
        pieces.append('  ' + str(mode(finish.get_payload())) + ' mode' + "\n")
        pieces.append('  ' + finish.get_range_string() + " end range\n")

        pieces.append('---- junctions ----' + "\n")
        pieces.append(
            str(len(self.fuzzy_junctions)) + ' total fuzzy junctions' + "\n")
        for number, junction in enumerate(self.fuzzy_junctions, 1):
            left = junction.left
            right = junction.right
            pieces.append('  ' + str(number) + '. ' +
                          str(mode(left.get_payload()['junc'])) + " ^ " +
                          str(mode(right.get_payload()['junc'])) + "\n")
            pieces.append("     " + left.get_range_string() + " ^ " +
                          right.get_range_string() + "\n")
            pieces.append("     " + str(len(left.get_payload()['junc'])) +
                          " read support" + "\n")
            # Start / end ranges are only present on some junctions
            if left.get_payload()['start']:
                start_range = left.get_payload()['start']
                pieces.append("       " + "---starts----" + "\n")
                pieces.append("       " + str(len(start_range.get_payload())) +
                              " starts at " + start_range.get_range_string() +
                              "\n")
            if right.get_payload()['end']:
                end_range = right.get_payload()['end']
                pieces.append("       " + "---ends----" + "\n")
                pieces.append("       " + str(len(end_range.get_payload())) +
                              " ends at " + end_range.get_range_string() +
                              "\n")
        return ''.join(pieces)

    #Add a new gpd return true if successful
    #Return false if it didn't work, return the new combined if it worked
    def add_gpd(self, ingpd):
        if len(self.gpds) == 0:  # first one
            self.read_first(ingpd)
            return self  #return ourself if we are adding our first
        # more difficult situation where we must try to combine
        # See if it can match first before actually adding stuff to it
        #if self.
        newfuz = FuzzyGenePred(ingpd, params=self.params)
        output = self.add_fuzzy_gpd(newfuz)
        return output

    # combine together compatible overlapping sets
    def concat_fuzzy_gpd(self, fuz2):
        """Combine fuz2 with a copy of this fuzzy genepred, extending either side.

        Returns False when exactly one of the two is single exon, or when
        they neither share a simple junction nor pass compatible_overlap.
        When both are single exon the result of do_add_single_exon_fuzzy_gpd
        is returned.  Otherwise returns a copy of self in which overlapping
        junctions have absorbed fuz2's junctions, and junctions of fuz2 lying
        beyond this one's first / last junction have been added as copies.

        Note: this method does not append fuz2.gpds to the result.
        """
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0:
            return False
        if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0:
            return False
        # Lets work combine the single exon step and exit
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0:
            return self.do_add_single_exon_fuzzy_gpd(fuz2)
        # For now don't add them if one is single exon
        # (every such case has already returned above; kept as a safety net)
        if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0:
            return False

        # See if its already a subset
        easy_subset = False
        for simplejunction in fuz2.simple_junction_set:
            if simplejunction in self.simple_junction_set:
                easy_subset = True
        # If its not already a subset look deeper
        #1. First we need perfect junctions for a run of them
        if not easy_subset:
            if not self.compatible_overlap(fuz2): return False
        # still here. we will work on combining these
        output = self.copy()
        # first lets put add any overlapping junctions
        # (every pair of junctions is compared, using fuz2's tolerance)
        for i in range(0, len(output.fuzzy_junctions)):
            for j in range(0, len(fuz2.fuzzy_junctions)):
                if output.fuzzy_junctions[i].overlaps(
                        fuz2.fuzzy_junctions[j],
                        fuz2.params['junction_tolerance']):
                    output.fuzzy_junctions[i].add_fuzzy_junction(
                        fuz2.fuzzy_junctions[j])
                    # fuz2's first junction matched: record fuz2's start on
                    # the left side of the matching output junction
                    if j == 0:  # put the start in too
                        if not output.fuzzy_junctions[i].left.get_payload(
                        )['start']:
                            output.fuzzy_junctions[i].left.get_payload(
                            )['start'] = fuz2.start.copy()
                        else:  # merge
                            starts = output.fuzzy_junctions[
                                i].left.get_payload()['start'].get_payload()
                            for v in fuz2.start.get_payload():
                                starts.append(v)
                            nrange = output.fuzzy_junctions[
                                i].left.get_payload()['start'].merge(
                                    fuz2.start)
                            nrange.set_payload(starts[:])
                            output.fuzzy_junctions[i].left.get_payload(
                            )['start'] = nrange
                    # fuz2's last junction matched: record fuz2's end on the
                    # right side of the matching output junction
                    if j == len(
                            fuz2.fuzzy_junctions) - 1:  # put the end in too
                        if not output.fuzzy_junctions[i].right.get_payload(
                        )['end']:
                            output.fuzzy_junctions[i].right.get_payload(
                            )['end'] = fuz2.end.copy()
                        else:  # merge
                            ends = output.fuzzy_junctions[i].right.get_payload(
                            )['end'].get_payload()
                            for v in fuz2.end.get_payload():
                                ends.append(v)
                            nrange = output.fuzzy_junctions[
                                i].right.get_payload()['end'].merge(fuz2.end)
                            nrange.set_payload(ends[:])
                            output.fuzzy_junctions[i].right.get_payload(
                            )['end'] = nrange
        # see if we should build onto the left
        leftnum = -1
        leftmost = self.fuzzy_junctions[0]
        # only look when fuz2's first junction lies entirely before ours
        if fuz2.fuzzy_junctions[0].right.end < leftmost.left.start:
            for i in range(0, len(fuz2.fuzzy_junctions)):
                if fuz2.fuzzy_junctions[i].overlaps(
                        leftmost, fuz2.params['junction_tolerance']):
                    leftnum = i
                    break
        #leftnum is now -1 if no additions to the left zero if it starts on the same
        if leftnum > 0:
            # prepend fuz2 junctions leftnum-1 .. 0 so they stay in order
            for i in reversed(range(0, leftnum)):
                output.fuzzy_junctions.insert(0,
                                              fuz2.fuzzy_junctions[i].copy())
            output.start = fuz2.start.copy()
        rightnum = -1  # get the right point ... our first one comes after this
        rightmost = self.fuzzy_junctions[-1]
        # only look when fuz2's last junction lies entirely after ours
        if fuz2.fuzzy_junctions[-1].left.start > rightmost.right.end:
            for i in reversed(range(0, len(fuz2.fuzzy_junctions))):
                if fuz2.fuzzy_junctions[i].overlaps(
                        rightmost, fuz2.params['junction_tolerance']):
                    rightnum = i
                    break
        if rightnum != -1:
            # step past the matching junction; everything after it is new
            rightnum += 1
            if rightnum < len(fuz2.fuzzy_junctions):
                for i in range(rightnum, len(fuz2.fuzzy_junctions)):
                    output.fuzzy_junctions.append(
                        fuz2.fuzzy_junctions[i].copy())
                output.end = fuz2.end.copy()
        #print leftnum
        #print rightnum
        #print fuz2.params['junction_tolerance']
        #print 'combining'
        return output

    # add together subsets
    def add_fuzzy_gpd(self, fuz2):
        # see if we can add this fuzzy gpd to another
        # We treat single exon genes seprately so if only one of them is
        # single exon we can't compare them
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) != 0:
            return False
        if len(fuz2.fuzzy_junctions) != 0 and len(self.fuzzy_junctions) == 0:
            return False
        # Lets work combine the single exon step and exit
        if len(fuz2.fuzzy_junctions) == 0 and len(self.fuzzy_junctions) == 0:
            return self.do_add_single_exon_fuzzy_gpd(fuz2)

        # For now don't add them if one is single exon
        if len(self.fuzzy_junctions) == 0 or len(fuz2.fuzzy_junctions) == 0:
            return False

        # See if its already a subset
        easy_subset = False
        for simplejunction in fuz2.simple_junction_set:
            if simplejunction in self.simple_junction_set:
                easy_subset = True
        # If its not already a subset look deeper
        #1. First we need perfect junctions for a run of them
        if not easy_subset:
            if not self.compatible_overlap(fuz2): return False
        # still here. we will work on combining these
        output = self.copy()
        #switch over to working on the output now

        # If we are still here we can add the two of them together
        # If they have the same starting junction we can add their starting points together
        if output.fuzzy_junctions[0].overlaps(
                fuz2.fuzzy_junctions[0], output.params['junction_tolerance']):
            #print 'samestart'
            newstart = output.start.merge(fuz2.start)
            newstart.set_payload(output.start.get_payload())
            for s in fuz2.start.get_payload():
                newstart.get_payload().append(s)
            output.start = newstart

        # Check if the other one is new start
        elif mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) < mode(
                output.fuzzy_junctions[0].left.get_payload()['junc']):
            #print "2 start"
            output.start = fuz2.start
        elif mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) > mode(
                output.fuzzy_junctions[0].left.get_payload()['junc']):
            True
        #  #print "1 start"
        #  #we're good to go
        else:
            sys.stderr.write("WARNING: strange start case abort merge\n")
            return False
        # lets work the ends now
        if output.fuzzy_junctions[-1].overlaps(
                fuz2.fuzzy_junctions[-1], output.params['junction_tolerance']):
            #print 'sameend'
            newend = output.end.merge(fuz2.end)
            newend.set_payload(output.end.get_payload())
            for s in fuz2.end.get_payload():
                newend.get_payload().append(s)
            output.end = newend

        # Check if the other one is new start
        elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc']) > mode(
                output.fuzzy_junctions[-1].right.get_payload()['junc']):
            #print "2 end"
            output.end = fuz2.end
        elif mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc']) < mode(
                output.fuzzy_junctions[-1].right.get_payload()['junc']):
            True
        #  #print "1 end"
        #  #we're good to go
        else:
            sys.stderr.write("WARNING: strange end case abort merge\n")
            u1 = mode(output.fuzzy_junctions[-1].left.get_payload()['junc'])
            u2 = mode(fuz2.fuzzy_junctions[-1].left.get_payload()['junc'])
            v1 = mode(output.fuzzy_junctions[-1].right.get_payload()['junc'])
            v2 = mode(fuz2.fuzzy_junctions[-1].right.get_payload()['junc'])
            sys.stderr.write(str(u1) + "\t" + str(u2) + "\n")
            sys.stderr.write(str(v1) + "\t" + str(v2) + "\n")
            return False
        # now the starts and ends have been updated in output.
        # iterate through the junctions.
        # check for a left overhang.
        numfuz2left = 0
        numoutleft = 0
        if not output.fuzzy_junctions[0].overlaps(
                fuz2.fuzzy_junctions[0], output.params['junction_tolerance']):
            # see if we need to add sequences from fuz2
            if mode(fuz2.fuzzy_junctions[0].left.get_payload()['junc']) < mode(
                    output.fuzzy_junctions[0].left.get_payload()['junc']):
                #print 'left over2'
                i = 0
                while not output.fuzzy_junctions[0].overlaps(
                        fuz2.fuzzy_junctions[i],
                        output.params['junction_tolerance']) and i < len(
                            fuz2.fuzzy_junctions):
                    i += 1
                numfuz2left = i  # number to push on from the fuz2 and increment in
                #print numfuz2left
            elif mode(
                    fuz2.fuzzy_junctions[0].left.get_payload()['junc']) > mode(
                        output.fuzzy_junctions[0].left.get_payload()['junc']):
                #print 'left over1'
                i = 0
                while not output.fuzzy_junctions[i].overlaps(
                        fuz2.fuzzy_junctions[0],
                        output.params['junction_tolerance']) and i < len(
                            output.fuzzy_junctions):
                    i += 1
                numoutleft = i  # number to increment in from output
                #print numoutleft
            else:
                sys.stderr.write("WARNING: strange case \n")
                return False
        # next we can check how long we have a run of the same
        ind1 = numoutleft
        ind2 = numfuz2left
        overlap_size = 0
        while ind1 < len(output.fuzzy_junctions) and ind2 < len(fuz2.fuzzy_junctions) \
          and output.fuzzy_junctions[ind1].overlaps(fuz2.fuzzy_junctions[ind2],output.params['junction_tolerance']):
            overlap_size += 1
            ind1 += 1
            ind2 += 1
        #print 'overlap size '+str(overlap_size)
        numoutright = len(output.fuzzy_junctions) - overlap_size - numoutleft
        numfuz2right = len(fuz2.fuzzy_junctions) - overlap_size - numfuz2left
        if min(numoutright, numfuz2right) != 0:
            sys.stderr.write("WARNING: expected one of them to be zero\n")
            #print self.get_info_string()
            #print '====================='
            #print fuz2.get_info_string()
            #sys.exit()
            return False
        if min(numoutleft, numfuz2left) != 0:
            sys.stderr.write("WARNING: expected one of them to be zero\n")
            return False
        #print numoutright
        #print numfuz2right
        #print output.fuzzy_junctions[numoutleft].overlaps(fuz2.fuzzy_junctions[numfuz2left],output.junction_tolerance)
        #print 'add'
        #Now we have what we need to go through and do some updating
        #Lets just make new fuzzy junctions
        newjuncs = []
        for i in range(0, numfuz2left):
            newjuncs.append(fuz2.fuzzy_junctions[i])
        for i in range(0, numoutleft):
            newjuncs.append(output.fuzzy_junctions[i])
        #Now we do both down the center
        range1 = range(numoutleft, overlap_size + numoutleft)
        range2 = range(numfuz2left, overlap_size + numfuz2left)
        for i in range(0, len(range1)):
            newjuncs.append(output.fuzzy_junctions[range1[i]])
            newjuncs[-1].add_fuzzy_junction(fuz2.fuzzy_junctions[range2[i]])
            #print i
        #Make the right size
        for i in range(overlap_size + numfuz2left,
                       overlap_size + numfuz2left + numfuz2right):
            newjuncs.append(fuz2.fuzzy_junctions[i])
        for i in range(overlap_size + numoutleft,
                       overlap_size + numoutleft + numoutright):
            newjuncs.append(output.fuzzy_junctions[i])
        output.fuzzy_junctions = newjuncs
        #print 'adding gpd '+str(len(fuz2.gpds))+' entries'
        for g in fuz2.gpds:
            output.gpds.append(g)
            sjun = get_simple_junction(g)
            if sjun:
                output.simple_junction_set.add(sjun)
        #print 'new entry'
        #print self.get_info_string()
        return output

    def do_add_single_exon_fuzzy_gpd(self, fuz2):
        if not self.params['do_add_single_exon']:
            return False  # make sure we are allowed to be doing this
        #build the bounds from the average start and end
        s1 = mean(self.start.get_payload())
        e1 = mean(self.end.get_payload())
        s2 = mean(fuz2.start.get_payload())
        e2 = mean(fuz2.end.get_payload())
        l1 = e1 - s1 + 1
        l2 = e2 - s2 + 1
        if l1 < self.params['single_exon_minimum_length']:
            return False
        if l2 < self.params['single_exon_minimum_length']:
            return False
        if l1 < 1 or l2 < 1: return False  #shouldn't happen
        chr1 = self.start.chr
        chr2 = self.end.chr
        if chr1 != chr2: return False  #shouldn't happen
        r1 = Bed(chr1, s1 - 1, e1, self.dir)
        r2 = Bed(chr2, s2 - 1, e2, self.dir)
        over = r1.overlap_size(r2)
        if over < self.params['single_exon_minimum_overlap_bases']:
            return False
        #print r1.get_range_string()
        #print r2.get_range_string()
        cov = min(float(over) / float(l1), float(over) / float(l2))
        if cov < self.params['single_exon_minimum_overlap_fraction']:
            return False
        if abs(e1 - e2) > self.params['single_exon_maximum_endpoint_distance']:
            return False
        if abs(s1 - s2) > self.params['single_exon_maximum_endpoint_distance']:
            return False
        #If we're still here, we can add result
        output = self.copy()
        newstart = output.start.merge(fuz2.start)
        newstart.set_payload([])
        for s in output.start.get_payload():
            newstart.get_payload().append(s)
        for s in fuz2.start.get_payload():
            newstart.get_payload().append(s)
        newend = output.end.merge(fuz2.end)
        newend.set_payload([])
        for e in output.end.get_payload():
            newend.get_payload().append(e)
        for e in fuz2.end.get_payload():
            newend.get_payload().append(e)
        output.start = newstart
        output.end = newend
        for gpd in fuz2.gpds:
            output.gpds.append(gpd)
            sjun = get_simple_junction(gpd)
            if sjun:
                output.simple_junction_set.add(gpd)
        return output

    #Return true if these fuzzy genepreds can be added together
    def compatible_overlap(self, fingpd):
        f1 = self
        f2 = fingpd

        #### Forget about trying zero exon cases for now
        if len(f1.fuzzy_junctions) == 0 or len(f2.fuzzy_junctions) == 0:
            return False

        #Find all matches
        matches = []
        for i in range(0, len(f1.fuzzy_junctions)):
            for j in range(0, len(f2.fuzzy_junctions)):
                if f1.fuzzy_junctions[i].overlaps(
                        f2.fuzzy_junctions[j],
                        self.params['junction_tolerance']):
                    matches.append([i, j])

        # This is our matched junctions in f1 and f2
        if len(matches) == 0:
            return False  # Nothing matched.. certainly no overlap

        # This is the number of extra exons it would take in the middle of the run (shifts)
        if len(set([x[0] - x[1] for x in matches])) != 1: return False

        # Lets make sure all our exons are consecutive
        if len(matches) > 1:
            consec1 = list(
                set([
                    matches[i + 1][0] - matches[i][0]
                    for i in range(0,
                                   len(matches) - 1)
                ]))
            consec2 = list(
                set([
                    matches[i + 1][1] - matches[i][1]
                    for i in range(0,
                                   len(matches) - 1)
                ]))
            if len(consec1) != 1: return False
            if len(consec2) != 1: return False
            if consec1[0] != 1: return False
            if consec2[0] != 1: return False
        # one of them should be zero
        if not (matches[0][1] == 0 or matches[0][0] == 0):
            return False

        # and one of our last matches should be the last junction
        if not (len(f1.fuzzy_junctions) - 1 == matches[-1][0]
                or len(f2.fuzzy_junctions) - 1 == matches[-1][1]):
            return False

        #### most of the time we will probably be looking for a proper set
        #### unless we are extending the long read for isoform prediction
        if self.params['proper_set']:
            # check those last overhangs
            # one of the two needs to have the start and end points in the consecutive matches
            if (matches[0][0] == 0 and len(f1.fuzzy_junctions)-1 == matches[-1][0]) or \
               (matches[0][1] == 0 and len(f2.fuzzy_junctions)-1 == matches[-1][1]):
                return True
            return False

        return True

    def read_first(self, ingpd):
        """Initialise this object from its first genepred entry.

        Records the entry and its simple junction (when there is one), takes
        the strand as direction if params['use_dir'] is set, creates one
        FuzzyJunction per pair of neighbouring exons, and stores one-base
        start / end ranges (each with a one-element payload) both on self
        and on the payloads of the first / last junction.
        """
        self.gpds.append(ingpd)
        simple = get_simple_junction(ingpd)
        if simple:
            self.simple_junction_set.add(simple)
        if self.params['use_dir']:
            self.dir = ingpd.value('strand')

        chrom = ingpd.value('chrom')
        tx_start = ingpd.value('txStart')
        tx_end = ingpd.value('txEnd')
        exon_starts = ingpd.value('exonStarts')
        exon_ends = ingpd.value('exonEnds')

        def start_range():
            # one-base range at the transcript start, payload holds txStart+1
            rng = Bed(chrom, tx_start, tx_start + 1, self.dir)
            rng.set_payload([])
            rng.get_payload().append(tx_start + 1)
            return rng

        def end_range():
            # one-base range at the transcript end, payload holds txEnd
            rng = Bed(chrom, tx_end - 1, tx_end, self.dir)
            rng.set_payload([])
            rng.get_payload().append(tx_end)
            return rng

        # add fuzzy junctions: end of exon i joined to start of exon i+1
        for idx in range(len(exon_starts) - 1):
            self.fuzzy_junctions.append(
                FuzzyJunction(chrom, exon_ends[idx], exon_starts[idx + 1] + 1,
                              self.dir))
        if len(exon_starts) > 1:  # we have junctions
            self.fuzzy_junctions[0].left.get_payload()['start'] = start_range()
            self.fuzzy_junctions[-1].right.get_payload()['end'] = end_range()
        # add fuzzy starts and ends (separate objects from the junction ones)
        self.start = start_range()
        self.end = end_range()
        # Have finished reading in the first case

    # Pre: another fuzzy gpd
    # Post: True if they are all overlapping junctions
    def is_equal_fuzzy(self, fuz2, use_direction=False):
        if use_direction:
            if self.dir != fuz2.dir: return False
        if len(self.fuzzy_junctions) < 0: return False
        if len(fuz2.fuzzy_junctions) < 0: return False
        if len(self.fuzzy_junctions) != len(fuz2.fuzzy_junctions):
            return False
        for i in range(0, len(self.fuzzy_junctions)):
            if not self.fuzzy_junctions[i].overlaps(
                    fuz2.fuzzy_junctions[i],
                    self.params['junction_tolerance']):
                return False
        return True
def _collect_gene_features(gpds):
    """Gather chrom, strand, first-exon starts, last-exon ends and junctions.

    Chromosome and strand come from the first entry; entries on a different
    strand or chromosome are ignored.
    """
    chrom = gpds[0].value('chrom')
    strand = gpds[0].value('strand')
    starts = []
    ends = []
    junctions = set()
    for gpd in gpds:
        if gpd.value('strand') != strand:
            continue
        if gpd.value('chrom') != chrom:
            continue
        starts.append(gpd.value('exonStarts')[0])
        ends.append(gpd.value('exonEnds')[-1])
        junctions.update(gpd.calculate_junctions())
    return chrom, strand, starts, ends, junctions


def get_random_gpds_from_pair(pair, genes, ref):
    """Build new genepreds joining a random junction from each of two genes.

    pair  -- two keys into genes (first gene, second gene)
    genes -- mapping from key to a list of genepred entries
    ref   -- passed through to ARS as its ref argument

    One junction is picked at random from each gene (shuffle, first gene
    first).  Depending on strand, the junction's first or second coordinate
    becomes that gene's site (presumably the fusion breakpoint -- confirm),
    and bounds padded by 500 bases are registered with an ACF.  Returns
    [gpds, ars] where gpds comes from make_new_genepreds.

    Raises IndexError if either gene yields no junctions.
    """
    j1chrom, j1strand, j1starts, j1ends, j1s = _collect_gene_features(
        genes[pair[0]])
    j2chrom, j2strand, j2starts, j2ends, j2s = _collect_gene_features(
        genes[pair[1]])
    # Keep the shuffle order (gene 1 then gene 2) so seeded runs reproduce
    j1shuf = list(j1s)
    shuffle(j1shuf)
    j2shuf = list(j2s)
    shuffle(j2shuf)
    # Raw strings: '\d' in a plain literal is an invalid escape sequence
    if j1strand == '+':
        m = re.match(r'[^:]+:(\d+)', j1shuf[0])
        fsite1 = int(m.group(1))
        left = Bed(j1chrom, min(j1starts) - 500, fsite1 + 500, j1strand)
    else:
        m = re.match(r'[^:]+:(\d+),[^:]+:(\d+)', j1shuf[0])
        fsite1 = int(m.group(2))
        left = Bed(j1chrom, fsite1 - 500, max(j1ends) + 500, j1strand)
    m = re.match(r'[^:]+:(\d+),[^:]+:(\d+)', j2shuf[0])
    if j2strand == '+':
        fsite2 = int(m.group(2))
        right = Bed(j2chrom, fsite2 - 500, max(j2ends) + 500, j2strand)
    else:
        fsite2 = int(m.group(1))
        right = Bed(j2chrom, min(j2starts) - 500, fsite2 + 500, j2strand)
    [leftcomp, rightcomp] = get_compatible_transcripts(genes[pair[0]], fsite1,
                                                       genes[pair[1]], fsite2)
    acf = ACF()
    acf.add_bounds(left)
    acf.add_bounds(right)
    ln = leftcomp[0].value('gene_name')
    rn = rightcomp[0].value('gene_name')
    site_string = leftcomp[0].value('chrom') + ":" + str(
        fsite1) + leftcomp[0].value('strand') + '/' + rightcomp[0].value(
            'chrom') + ":" + str(fsite2) + rightcomp[0].value('strand')
    ars = ARS(ref=ref,
              conversion_string=acf.get_conversion_string(),
              name=ln + "," + rn + "," + site_string)
    gpds = make_new_genepreds(leftcomp, fsite1, rightcomp, fsite2, ars)
    return [gpds, ars]
Ejemplo n.º 27
0
def _collect_region_depth(curr, depth, beds, depths, use_off_regions, on_append=None):
  """Fold one depth section into the head of a sorted queue of regions.

  curr            -- Bed for the section of constant read depth being processed
  depth           -- integer read depth of that section
  beds            -- list of region Beds, consumed from the front (mutated)
  depths          -- list receiving the average of each finished region (mutated)
  use_off_regions -- when false, regions that collected no depth are dropped
  on_append       -- optional zero-argument callable run after each average
                     is appended to depths
  """
  # Test the queue length BEFORE indexing it: the queue can run dry while
  # we are still popping regions the current section has moved past.
  while len(beds) > 0 and curr.cmp(beds[0]) > 0: # we've passed the region
    v = beds.pop(0)
    if len(v.get_payload()) == 0 and not use_off_regions: continue
    # NOTE(review): average() receives the Bed here but a payload list in the
    # final flush at the bottom of main(); confirm which form average() expects.
    depths.append(average(v))
    if on_append is not None: on_append()
  # Only look at the head region if one is left after the flush above.
  if len(beds) > 0 and curr.cmp(beds[0]) == 0: # overlaps the head region
    size = curr.overlap_size(beds[0])
    # one payload entry per overlapping base
    for i in range(0,size): beds[0].get_payload().append(depth)

def main():
  """Collect per-region read depth for introns, intergenic windows and exons.

  Reads a genepred file to derive merged gene, exon, intron and (padded)
  intergenic ranges, then streams the output of `sam_to_bed_depth.py` for the
  BAM file and accumulates, for every region, the depth of each overlapping
  base.  Finished regions are averaged into introndepth / intergenicdepth /
  exondepth; display() is called each time an intergenic region is finished.
  Exon regions are only tracked when --get_exons is given.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('gpd_input')
  parser.add_argument('bam_input')
  parser.add_argument('--intergenic_buffer',default=10000,type=int)
  parser.add_argument('--window_size',default=10000,type=int)
  parser.add_argument('--bin_size',default=1000,type=int)
  parser.add_argument('--use_off_regions',action='store_true',help="Use a region even if there is no reads mapped to it.")
  parser.add_argument('--get_exons',action='store_true')
  args = parser.parse_args()
  chr_beds = {}
  gene_beds = []
  exon_beds = []
  sys.stderr.write("Reading genepred file\n")
  asum = 0
  atot = 0
  with open(args.gpd_input) as inf:
    for line in inf:
      g = GenePredEntry(line)
      asum += g.length()
      atot += 1
      grng = g.get_bed()
      grng.direction = None
      # grow one covering range per chromosome
      if grng.chr not in chr_beds:
        chr_beds[grng.chr] = grng.copy()
      chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
      gene_beds.append(grng)
      for i in range(0,g.get_exon_count()):
        erng = Bed(g.value('chrom'),g.value('exonStarts')[i],g.value('exonEnds')[i])
        exon_beds.append(erng)
  # An empty genepred file would otherwise divide by zero here.
  avglen = float(asum)/float(atot) if atot else 0.0
  sys.stderr.write("Sorting gene bed\n")
  gene_beds = sort_ranges(gene_beds)
  gene_beds = merge_ranges(gene_beds,already_sorted=True)
  sys.stderr.write("Sorting chromosome beds\n")
  chr_beds = sort_ranges([chr_beds[x] for x in chr_beds.keys()])
  sys.stderr.write("Sorting exon beds\n")
  exon_beds = sort_ranges(exon_beds)
  sys.stderr.write("Get padded genes\n")
  padded_gene_beds = pad_ranges(gene_beds,args.intergenic_buffer,chr_beds)
  padded_gene_beds = merge_ranges(padded_gene_beds,already_sorted=True)
  sys.stderr.write("Get intergenic regions\n")
  intergenic_beds = subtract_ranges(chr_beds,padded_gene_beds,already_sorted=True)
  intergenic_beds = merge_ranges(intergenic_beds,already_sorted=True)
  intergenic_beds = window_break(intergenic_beds,args.window_size)
  sys.stderr.write("Get merged exons\n")
  exon_beds = merge_ranges(exon_beds)
  sys.stderr.write("Get introns\n")
  intron_beds = subtract_ranges(gene_beds,exon_beds,already_sorted=True)
  intron_beds = merge_ranges(intron_beds,already_sorted=True)
  intron_beds = window_break(intron_beds,args.window_size)
  sys.stderr.write("Going through short reads\n")
  # Pass the arguments as a list so a BAM path containing whitespace stays
  # one argument.  universal_newlines makes the pipe yield text lines, so the
  # "\t" split below also works when stdout would otherwise be bytes.
  cmd = ["sam_to_bed_depth.py", args.bam_input]
  p = Popen(cmd,stdout=PIPE,universal_newlines=True)
  for x in intron_beds: x.set_payload([]) # payloads are read depths
  for x in intergenic_beds: x.set_payload([]) # payloads are read depths
  for x in exon_beds: x.set_payload([]) # payloads are read depths
  introndepth = []
  intergenicdepth = []
  exondepth = []
  pseudoreadcount = 0
  if not args.get_exons: exon_beds = []
  section_count = 0
  while True:
    section_count += 1
    line = p.stdout.readline()
    if not line: break
    f = line.split("\t")
    depth = int(f[3])
    curr = Bed(f[0],int(f[1]),int(f[2]))
    # periodic progress indicator on stderr
    if section_count %100==0: sys.stderr.write(curr.get_range_string()+"          \r")
    pseudoreadcount += depth
    _collect_region_depth(curr,depth,exon_beds,exondepth,args.use_off_regions)
    _collect_region_depth(curr,depth,intron_beds,introndepth,args.use_off_regions)
    _collect_region_depth(curr,depth,intergenic_beds,intergenicdepth,args.use_off_regions,
                          on_append=lambda: display(curr,introndepth,intergenicdepth,pseudoreadcount,avglen))
  if args.use_off_regions:
    # Flush whatever regions were never passed by a depth section.
    # Exon averages belong in exondepth (they used to land in introndepth).
    for x in exon_beds: exondepth.append(average(x.get_payload()))
    for x in intron_beds: introndepth.append(average(x.get_payload()))
    for x in intergenic_beds: intergenicdepth.append(average(x.get_payload()))
  p.communicate()
class FuzzyGenePred:
    """A group of genepred entries merged through tolerant ("fuzzy") junctions.

    The object keeps the contributing genepred entries (gpds), an ordered list
    of fuzzy junctions, and start/end ranges whose payloads are lists of the
    observed start/end coordinates.  Merging methods never modify self; they
    return a new FuzzyGenePred on success and False when the two inputs
    cannot be combined.
    """

    # set use_dir true if you want to use direction and make it direction specific
    # set proper_set false if you want to do awesome extending that doesn't really work yet
    def __init__(self, ingpd=None, params=None, juntol=10):
        """Create an empty fuzzy genepred, optionally seeded with one entry.

        ingpd  -- first genepred entry to add (optional)
        params -- dict of settings overriding the defaults below (optional)
        juntol -- default for params["junction_tolerance"]
        """
        # Here is the basic data
        self.fuzzy_junctions = []
        self.gpds = []  # contributing member genepreds
        self.start = None
        self.end = None
        self.dir = None
        # Higher level data
        self.simple_junction_set = set()  # quickly search for if a multi exon gene has been added
        # Here is the parameters
        self.params = {}
        self.params["use_dir"] = False
        self.params["junction_tolerance"] = juntol
        # Not fully implemented.  Do we require a full length match
        self.params["proper_set"] = True
        # Define thresholds for overlapping single exons
        self.params["do_add_single_exon"] = True
        self.params["single_exon_minimum_length"] = 200
        # reciprocal ... must be this fraction or more on both
        self.params["single_exon_minimum_overlap_fraction"] = 0.8
        self.params["single_exon_minimum_overlap_bases"] = 1  # minimum number of bases
        self.params["single_exon_maximum_endpoint_distance"] = 1000
        if params:
            for pname in params:
                self.params[pname] = params[pname]
        if ingpd:
            self.add_gpd(ingpd)

    def get_genepred_line(self, end_select="extremes", junction_select="mode", name=None):
        """Return this fuzzy genepred as one tab-delimited genepred line.

        The transcript bounds come from self.start / self.end and each junction
        coordinate is the mode of the observed junction positions.  When no
        name is given a random "fuzGPD_..." name is generated.  end_select and
        junction_select are accepted but not consulted by this implementation.
        """
        if not name:
            name = "fuzGPD_" + random_string(8) + "_" + str(len(self.fuzzy_junctions) + 1) + "_" + str(len(self.gpds))
        exonstarts = [self.start.start - 1]
        exonends = []
        for j in self.fuzzy_junctions:
            exonends.append(mode(j.left.get_payload()["junc"]))
            exonstarts.append(mode(j.right.get_payload()["junc"]) - 1)
        exonends.append(self.end.end)
        fields = [
            name,  # the name fills the first two columns
            name,
            self.start.chr,
            self.gpds[0].value("strand"),
            str(self.start.start - 1),
            str(self.end.end),
            str(self.start.start - 1),
            str(self.end.end),
            str(len(self.fuzzy_junctions) + 1),
            ",".join([str(x) for x in exonstarts]) + ",",
            ",".join([str(x) for x in exonends]) + ",",
        ]
        return "\t".join(fields)

    # Return a copy of the fuzzy geneprep
    def copy(self):
        """Return a new FuzzyGenePred with copied settings, entries and ranges."""
        g = FuzzyGenePred()  # start with a blank one why not
        # get the settings
        for pname in self.params:
            g.params[pname] = self.params[pname]
        # copy the genepreds
        for orig in self.gpds:
            g.gpds.append(GenePredEntry(orig.get_line()))
        # store direction
        g.dir = self.dir
        # copy the fuzzy junctions
        for orig in self.fuzzy_junctions:
            g.fuzzy_junctions.append(orig.copy())
        # copy the simple junction set
        for orig in self.simple_junction_set:
            g.simple_junction_set.add(orig)
        # copy the start
        # (the "- 1" presumably converts the stored start back to the 0-based
        #  value the Bed constructor takes -- TODO confirm against Bed)
        if self.start:
            g.start = Bed(self.start.chr, self.start.start - 1, self.start.end, self.start.direction)
            g.start.set_payload([])
            for v in self.start.get_payload():
                g.start.get_payload().append(v)
        # copy the end
        if self.end:
            g.end = Bed(self.end.chr, self.end.start - 1, self.end.end, self.end.direction)
            g.end.set_payload([])
            for v in self.end.get_payload():
                g.end.get_payload().append(v)
        return g

    def exon_count(self):
        """Return the number of exons (junction count plus one)."""
        return len(self.fuzzy_junctions) + 1

    def gpd_count(self):
        """Return the number of contributing genepred entries."""
        return len(self.gpds)

    def get_bed(self):
        """Return a Bed spanning from the start range to the end range."""
        return Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction)

    # This is an inspection tool for a fuzzy gpd
    def get_info_string(self):
        """Return a multi-line, human-readable summary of this fuzzy genepred."""
        totalbounds = Bed(self.start.chr, self.start.start - 1, self.end.end, self.start.direction)
        parts = [
            "== FUZZY GENEPRED INFO ==" + "\n",
            str(len(self.gpds)) + " total GPDs" + "\n",
            totalbounds.get_range_string() + " total bounds\n",
            "---- start ----" + "\n",
            str(len(self.start.get_payload())) + " reads supporting start" + "\n",
            "  " + str(mean(self.start.get_payload())) + " mean" + "\n",
            "  " + str(mode(self.start.get_payload())) + " mode" + "\n",
            "  " + self.start.get_range_string() + " start range\n",
            "---- end ----" + "\n",
            str(len(self.end.get_payload())) + " reads supporting end" + "\n",
            "  " + str(mean(self.end.get_payload())) + " mean" + "\n",
            "  " + str(mode(self.end.get_payload())) + " mode" + "\n",
            "  " + self.end.get_range_string() + " end range\n",
            "---- junctions ----" + "\n",
            str(len(self.fuzzy_junctions)) + " total fuzzy junctions" + "\n",
        ]
        cnt = 0
        for j in self.fuzzy_junctions:
            cnt += 1
            left_payload = j.left.get_payload()
            right_payload = j.right.get_payload()
            parts.append(
                "  "
                + str(cnt)
                + ". "
                + str(mode(left_payload["junc"]))
                + " ^ "
                + str(mode(right_payload["junc"]))
                + "\n"
            )
            parts.append("     " + j.left.get_range_string() + " ^ " + j.right.get_range_string() + "\n")
            parts.append("     " + str(len(left_payload["junc"])) + " read support" + "\n")
            if left_payload["start"]:
                parts.append("       " + "---starts----" + "\n")
                parts.append(
                    "       "
                    + str(len(left_payload["start"].get_payload()))
                    + " starts at "
                    + left_payload["start"].get_range_string()
                    + "\n"
                )
            if right_payload["end"]:
                parts.append("       " + "---ends----" + "\n")
                parts.append(
                    "       "
                    + str(len(right_payload["end"].get_payload()))
                    + " ends at "
                    + right_payload["end"].get_range_string()
                    + "\n"
                )
        return "".join(parts)

    # Add a new gpd return true if successful
    # Return false if it didn't work, return the new combined if it worked
    def add_gpd(self, ingpd):
        """Add a genepred entry.

        The first entry is read into self and self is returned.  Any later
        entry is wrapped in its own FuzzyGenePred and combined through
        add_fuzzy_gpd, whose result (a new object, or False) is returned;
        self is not modified in that case.
        """
        if len(self.gpds) == 0:  # first one
            self.read_first(ingpd)
            return self  # return ourself if we are adding our first
        # more difficult situation where we must try to combine
        newfuz = FuzzyGenePred(ingpd, params=self.params)
        return self.add_fuzzy_gpd(newfuz)

    # combine together compatible overlapping sets
    def concat_fuzzy_gpd(self, fuz2):
        """Combine self with an overlapping, compatible fuzzy genepred.

        Overlapping junctions are merged, and junctions of fuz2 lying beyond
        the first/last junction of self are added on the corresponding side.
        Returns the combined copy, or False when the two are not compatible.
        """
        n_self = len(self.fuzzy_junctions)
        n_other = len(fuz2.fuzzy_junctions)
        # Lets work combine the single exon step and exit
        if n_self == 0 and n_other == 0:
            return self.do_add_single_exon_fuzzy_gpd(fuz2)
        # For now don't add them if only one is single exon
        if n_self == 0 or n_other == 0:
            return False

        # See if its already a subset
        easy_subset = any(sj in self.simple_junction_set for sj in fuz2.simple_junction_set)
        # If its not already a subset look deeper
        # 1. First we need perfect junctions for a run of them
        if not easy_subset and not self.compatible_overlap(fuz2):
            return False
        # still here. we will work on combining these
        output = self.copy()
        tol = fuz2.params["junction_tolerance"]
        last_j = n_other - 1
        # first lets put add any overlapping junctions
        for out_junc in output.fuzzy_junctions:
            for j, other_junc in enumerate(fuz2.fuzzy_junctions):
                if not out_junc.overlaps(other_junc, tol):
                    continue
                out_junc.add_fuzzy_junction(other_junc)
                if j == 0:  # put the start in too
                    left_payload = out_junc.left.get_payload()
                    if not left_payload["start"]:
                        left_payload["start"] = fuz2.start.copy()
                    else:  # merge
                        starts = left_payload["start"].get_payload()
                        for v in fuz2.start.get_payload():
                            starts.append(v)
                        nrange = left_payload["start"].merge(fuz2.start)
                        nrange.set_payload(starts[:])
                        left_payload["start"] = nrange
                if j == last_j:  # put the end in too
                    right_payload = out_junc.right.get_payload()
                    if not right_payload["end"]:
                        right_payload["end"] = fuz2.end.copy()
                    else:  # merge
                        ends = right_payload["end"].get_payload()
                        for v in fuz2.end.get_payload():
                            ends.append(v)
                        nrange = right_payload["end"].merge(fuz2.end)
                        nrange.set_payload(ends[:])
                        right_payload["end"] = nrange
        # see if we should build onto the left
        leftnum = -1
        leftmost = self.fuzzy_junctions[0]
        if fuz2.fuzzy_junctions[0].right.end < leftmost.left.start:
            for i in range(0, n_other):
                if fuz2.fuzzy_junctions[i].overlaps(leftmost, tol):
                    leftnum = i
                    break
        # leftnum is now -1 if no additions to the left zero if it starts on the same
        if leftnum > 0:
            for i in reversed(range(0, leftnum)):
                output.fuzzy_junctions.insert(0, fuz2.fuzzy_junctions[i].copy())
            output.start = fuz2.start.copy()
        rightnum = -1  # get the right point ... our first one comes after this
        rightmost = self.fuzzy_junctions[-1]
        if fuz2.fuzzy_junctions[-1].left.start > rightmost.right.end:
            for i in reversed(range(0, n_other)):
                if fuz2.fuzzy_junctions[i].overlaps(rightmost, tol):
                    rightnum = i
                    break
        if rightnum != -1:
            rightnum += 1
            if rightnum < n_other:
                for i in range(rightnum, n_other):
                    output.fuzzy_junctions.append(fuz2.fuzzy_junctions[i].copy())
                output.end = fuz2.end.copy()
        return output

    # add together subsets
    def add_fuzzy_gpd(self, fuz2):
        """Merge another fuzzy genepred into a copy of self.

        Returns the merged FuzzyGenePred, or False when the two cannot be
        merged (one single-exon and one multi-exon input, incompatible
        junction chains, or an inconsistent start/end arrangement).
        """
        # see if we can add this fuzzy gpd to another
        # We treat single exon genes seprately so if only one of them is
        # single exon we can't compare them
        n_self = len(self.fuzzy_junctions)
        n_other = len(fuz2.fuzzy_junctions)
        # Lets work combine the single exon step and exit
        if n_self == 0 and n_other == 0:
            return self.do_add_single_exon_fuzzy_gpd(fuz2)
        if n_self == 0 or n_other == 0:
            return False

        # See if its already a subset
        easy_subset = any(sj in self.simple_junction_set for sj in fuz2.simple_junction_set)
        # If its not already a subset look deeper
        # 1. First we need perfect junctions for a run of them
        if not easy_subset and not self.compatible_overlap(fuz2):
            return False
        # still here. we will work on combining these
        output = self.copy()
        # switch over to working on the output now
        tol = output.params["junction_tolerance"]

        # If they have the same starting junction we can add their starting points together
        same_start = output.fuzzy_junctions[0].overlaps(fuz2.fuzzy_junctions[0], tol)
        fuz2_starts_first = False
        if same_start:
            newstart = output.start.merge(fuz2.start)
            newstart.set_payload(output.start.get_payload())
            for s in fuz2.start.get_payload():
                newstart.get_payload().append(s)
            output.start = newstart
        else:
            fuz2_left = mode(fuz2.fuzzy_junctions[0].left.get_payload()["junc"])
            out_left = mode(output.fuzzy_junctions[0].left.get_payload()["junc"])
            if fuz2_left < out_left:
                # the other one is the new start
                fuz2_starts_first = True
                output.start = fuz2.start
            elif fuz2_left > out_left:
                pass  # our own start stays; we're good to go
            else:
                sys.stderr.write("WARNING: strange start case abort merge\n")
                return False
        # lets work the ends now
        if output.fuzzy_junctions[-1].overlaps(fuz2.fuzzy_junctions[-1], tol):
            newend = output.end.merge(fuz2.end)
            newend.set_payload(output.end.get_payload())
            for s in fuz2.end.get_payload():
                newend.get_payload().append(s)
            output.end = newend
        else:
            fuz2_right = mode(fuz2.fuzzy_junctions[-1].right.get_payload()["junc"])
            out_right = mode(output.fuzzy_junctions[-1].right.get_payload()["junc"])
            if fuz2_right > out_right:
                # the other one is the new end
                output.end = fuz2.end
            elif fuz2_right < out_right:
                pass  # our own end stays; we're good to go
            else:
                sys.stderr.write("WARNING: strange end case abort merge\n")
                u1 = mode(output.fuzzy_junctions[-1].left.get_payload()["junc"])
                u2 = mode(fuz2.fuzzy_junctions[-1].left.get_payload()["junc"])
                sys.stderr.write(str(u1) + "\t" + str(u2) + "\n")
                sys.stderr.write(str(out_right) + "\t" + str(fuz2_right) + "\n")
                return False
        # now the starts and ends have been updated in output.
        # iterate through the junctions.
        # check for a left overhang.
        numfuz2left = 0
        numoutleft = 0
        if not same_start:
            if fuz2_starts_first:
                # count the fuz2 junctions lying before output's first junction.
                # The index bound is tested first so the scan cannot run off
                # the end of the list when nothing overlaps.
                i = 0
                while i < n_other and not output.fuzzy_junctions[0].overlaps(fuz2.fuzzy_junctions[i], tol):
                    i += 1
                if i == n_other:  # no anchoring junction found
                    sys.stderr.write("WARNING: strange case \n")
                    return False
                numfuz2left = i  # number to push on from the fuz2 and increment in
            else:
                # count the output junctions lying before fuz2's first junction
                i = 0
                while i < len(output.fuzzy_junctions) and not output.fuzzy_junctions[i].overlaps(
                    fuz2.fuzzy_junctions[0], tol
                ):
                    i += 1
                if i == len(output.fuzzy_junctions):  # no anchoring junction found
                    sys.stderr.write("WARNING: strange case \n")
                    return False
                numoutleft = i  # number to increment in from output
        # next we can check how long we have a run of the same
        ind1 = numoutleft
        ind2 = numfuz2left
        overlap_size = 0
        while (
            ind1 < len(output.fuzzy_junctions)
            and ind2 < n_other
            and output.fuzzy_junctions[ind1].overlaps(fuz2.fuzzy_junctions[ind2], tol)
        ):
            overlap_size += 1
            ind1 += 1
            ind2 += 1
        numoutright = len(output.fuzzy_junctions) - overlap_size - numoutleft
        numfuz2right = n_other - overlap_size - numfuz2left
        # only one of the two may stick out on each side
        if min(numoutright, numfuz2right) != 0:
            sys.stderr.write("WARNING: expected one of them to be zero\n")
            return False
        if min(numoutleft, numfuz2left) != 0:
            sys.stderr.write("WARNING: expected one of them to be zero\n")
            return False
        # Now we have what we need to go through and do some updating
        # Lets just make new fuzzy junctions
        newjuncs = []
        for i in range(0, numfuz2left):
            newjuncs.append(fuz2.fuzzy_junctions[i])
        for i in range(0, numoutleft):
            newjuncs.append(output.fuzzy_junctions[i])
        # Now we do both down the center
        for k in range(0, overlap_size):
            newjuncs.append(output.fuzzy_junctions[numoutleft + k])
            newjuncs[-1].add_fuzzy_junction(fuz2.fuzzy_junctions[numfuz2left + k])
        # Make the right side
        for i in range(overlap_size + numfuz2left, overlap_size + numfuz2left + numfuz2right):
            newjuncs.append(fuz2.fuzzy_junctions[i])
        for i in range(overlap_size + numoutleft, overlap_size + numoutleft + numoutright):
            newjuncs.append(output.fuzzy_junctions[i])
        output.fuzzy_junctions = newjuncs
        for g in fuz2.gpds:
            output.gpds.append(g)
            sjun = get_simple_junction(g)
            if sjun:
                output.simple_junction_set.add(sjun)
        return output

    def do_add_single_exon_fuzzy_gpd(self, fuz2):
        """Merge two single-exon fuzzy genepreds when they overlap well enough.

        The comparison uses the mean observed start and end of each side and
        the single_exon_* thresholds in self.params.  Returns the merged copy,
        or False when merging is disabled or a threshold is not met.
        """
        if not self.params["do_add_single_exon"]:
            return False  # make sure we are allowed to be doing this
        # build the bounds from the average start and end
        s1 = mean(self.start.get_payload())
        e1 = mean(self.end.get_payload())
        s2 = mean(fuz2.start.get_payload())
        e2 = mean(fuz2.end.get_payload())
        l1 = e1 - s1 + 1
        l2 = e2 - s2 + 1
        if l1 < self.params["single_exon_minimum_length"]:
            return False
        if l2 < self.params["single_exon_minimum_length"]:
            return False
        if l1 < 1 or l2 < 1:
            return False  # shouldn't happen
        chr1 = self.start.chr
        # Take the second chromosome from fuz2 (it used to be read from
        # self.end), so the check below really compares the two inputs and
        # r2 is built on fuz2's own chromosome.
        chr2 = fuz2.start.chr
        if chr1 != chr2:
            return False  # shouldn't happen
        r1 = Bed(chr1, s1 - 1, e1, self.dir)
        r2 = Bed(chr2, s2 - 1, e2, self.dir)
        over = r1.overlap_size(r2)
        if over < self.params["single_exon_minimum_overlap_bases"]:
            return False
        # reciprocal coverage: the smaller of the two covered fractions
        cov = min(float(over) / float(l1), float(over) / float(l2))
        if cov < self.params["single_exon_minimum_overlap_fraction"]:
            return False
        if abs(e1 - e2) > self.params["single_exon_maximum_endpoint_distance"]:
            return False
        if abs(s1 - s2) > self.params["single_exon_maximum_endpoint_distance"]:
            return False
        # If we're still here, we can add result
        output = self.copy()
        newstart = output.start.merge(fuz2.start)
        newstart.set_payload(list(output.start.get_payload()) + list(fuz2.start.get_payload()))
        newend = output.end.merge(fuz2.end)
        newend.set_payload(list(output.end.get_payload()) + list(fuz2.end.get_payload()))
        output.start = newstart
        output.end = newend
        for gpd in fuz2.gpds:
            output.gpds.append(gpd)
            sjun = get_simple_junction(gpd)
            if sjun:
                # store the junction key itself (the gpd object used to be added)
                output.simple_junction_set.add(sjun)
        return output

    # Return true if these fuzzy genepreds can be added together
    def compatible_overlap(self, fingpd):
        """Return True when the junction chains of self and fingpd line up.

        The matching junctions must form one consecutive run with a constant
        index shift, start at the first junction of one input and finish at
        the last junction of one input.  With params["proper_set"] set, one
        input must additionally be matched over its whole junction chain.
        """
        f1 = self
        f2 = fingpd

        #### Forget about trying zero exon cases for now
        if len(f1.fuzzy_junctions) == 0 or len(f2.fuzzy_junctions) == 0:
            return False

        # Find all matches
        matches = []
        for i in range(0, len(f1.fuzzy_junctions)):
            for j in range(0, len(f2.fuzzy_junctions)):
                if f1.fuzzy_junctions[i].overlaps(f2.fuzzy_junctions[j], self.params["junction_tolerance"]):
                    matches.append([i, j])

        # This is our matched junctions in f1 and f2
        if len(matches) == 0:
            return False  # Nothing matched.. certainly no overlap

        # This is the number of extra exons it would take in the middle of the run (shifts)
        if len(set([x[0] - x[1] for x in matches])) != 1:
            return False

        # Lets make sure all our exons are consecutive
        if len(matches) > 1:
            consec1 = list(set([matches[i + 1][0] - matches[i][0] for i in range(0, len(matches) - 1)]))
            consec2 = list(set([matches[i + 1][1] - matches[i][1] for i in range(0, len(matches) - 1)]))
            if len(consec1) != 1 or len(consec2) != 1:
                return False
            if consec1[0] != 1 or consec2[0] != 1:
                return False
        # one of them should be zero
        if not (matches[0][1] == 0 or matches[0][0] == 0):
            return False

        last1 = len(f1.fuzzy_junctions) - 1
        last2 = len(f2.fuzzy_junctions) - 1
        # and one of our last matches should be the last junction
        if not (last1 == matches[-1][0] or last2 == matches[-1][1]):
            return False

        #### most of the time we will probably be looking for a proper set
        #### unless we are extending the long read for isoform prediction
        if self.params["proper_set"]:
            # check those last overhangs
            # one of the two needs to have the start and end points in the consecutive matches
            return bool(
                (matches[0][0] == 0 and last1 == matches[-1][0])
                or (matches[0][1] == 0 and last2 == matches[-1][1])
            )

        return True

    def read_first(self, ingpd):
        """Initialise this object from its first genepred entry.

        Records the entry, builds one FuzzyJunction per pair of adjacent
        exons, attaches the transcript start/end to the outermost junctions,
        and sets self.start / self.end with one observed coordinate each.
        """
        self.gpds.append(ingpd)
        sjun = get_simple_junction(ingpd)
        if sjun:
            self.simple_junction_set.add(sjun)
        if self.params["use_dir"]:
            self.dir = ingpd.value("strand")
        # add fuzzy junctions
        chrom = ingpd.value("chrom")
        for i in range(0, len(ingpd.value("exonStarts")) - 1):
            self.fuzzy_junctions.append(
                FuzzyJunction(chrom, ingpd.value("exonEnds")[i], ingpd.value("exonStarts")[i + 1] + 1, self.dir)
            )
        if len(ingpd.value("exonStarts")) > 1:  # we have junctions
            first_left = self.fuzzy_junctions[0].left.get_payload()
            first_left["start"] = Bed(chrom, ingpd.value("txStart"), ingpd.value("txStart") + 1, self.dir)
            first_left["start"].set_payload([])
            first_left["start"].get_payload().append(ingpd.value("txStart") + 1)
            last_right = self.fuzzy_junctions[-1].right.get_payload()
            last_right["end"] = Bed(chrom, ingpd.value("txEnd") - 1, ingpd.value("txEnd"), self.dir)
            last_right["end"].set_payload([])
            last_right["end"].get_payload().append(ingpd.value("txEnd"))
        # add fuzzy starts
        self.start = Bed(ingpd.value("chrom"), ingpd.value("txStart"), ingpd.value("txStart") + 1, self.dir)
        self.start.set_payload([])
        self.start.get_payload().append(ingpd.value("txStart") + 1)
        self.end = Bed(ingpd.value("chrom"), ingpd.value("txEnd") - 1, ingpd.value("txEnd"), self.dir)
        self.end.set_payload([])
        self.end.get_payload().append(ingpd.value("txEnd"))
        # Have finished reading in the first case

    # Pre: another fuzzy gpd
    # Post: True if they are all overlapping junctions
    def is_equal_fuzzy(self, fuz2, use_direction=False):
        """Return True when both have the same number of junctions, all overlapping.

        With use_direction set, differing directions make the result False.
        Two inputs without any junctions compare as equal.
        """
        if use_direction and self.dir != fuz2.dir:
            return False
        # NOTE(review): this method used to test len(...) < 0 on both junction
        # lists, which can never be true; possibly "== 0" was intended.
        # Behaviour is left unchanged here -- confirm before tightening.
        if len(self.fuzzy_junctions) != len(fuz2.fuzzy_junctions):
            return False
        tol = self.params["junction_tolerance"]
        for i in range(0, len(self.fuzzy_junctions)):
            if not self.fuzzy_junctions[i].overlaps(fuz2.fuzzy_junctions[i], tol):
                return False
        return True
# Example no. 29
# 0
class FuzzyJunction:
    """A splice junction whose two sides are kept as ranges with payloads.

    ``left`` and ``right`` are ranges built with ``Bed``. The left payload is
    ``{'junc': [...], 'start': ...}`` and the right payload is
    ``{'junc': [...], 'end': ...}``. ``'junc'`` collects every coordinate added
    for that side; ``'start'`` / ``'end'`` are either None or a range whose
    payload is a list of positions.
    """

    # Pre: inleft is 1-indexed last exonic base on the left
    #      inright is 1-indexed first exonic base on the right
    #      direction doesn't need to be used
    def __init__(self, inchr=None, inleft=None, inright=None, indir=None):
        self.chr = inchr
        self.left = None  # range with payloads being the actual left and rights
        self.right = None
        self.dir = indir
        # Seed the junction only when chromosome and both sides were given.
        if inchr and inleft and inright:
            self.add_junction(inchr, inleft, inright, indir)

    @staticmethod
    def _clone_range(rng):
        # Rebuild a range from its attributes. The start is passed as
        # ``start - 1``, mirroring how ranges are constructed elsewhere in
        # this class (presumably Bed stores a 1-based start -- confirm).
        return Bed(rng.chr, rng.start - 1, rng.end, rng.direction)

    @staticmethod
    def _clone_terminal(terminal):
        # Duplicate a 'start'/'end' range together with its list payload.
        duplicate = FuzzyJunction._clone_range(terminal)
        duplicate.set_payload(list(terminal.get_payload()))
        return duplicate

    def copy(self):
        """Return a new FuzzyJunction with its own ranges and payload lists."""
        newjunc = FuzzyJunction()
        newjunc.chr = self.chr
        # get_mode() builds its ranges from ``dir``; without carrying it over,
        # a copy of a stranded junction would report no direction.
        newjunc.dir = self.dir

        left_payload = self.left.get_payload()
        right_payload = self.right.get_payload()

        newjunc.left = self._clone_range(self.left)
        newjunc.left.set_payload({'junc': list(left_payload['junc']),
                                  'start': None})
        newjunc.right = self._clone_range(self.right)
        newjunc.right.set_payload({'junc': list(right_payload['junc']),
                                   'end': None})

        # copy any starts for the junction
        if left_payload['start']:
            newjunc.left.get_payload()['start'] = self._clone_terminal(
                left_payload['start'])
        # copy any ends for the junction
        if right_payload['end']:
            newjunc.right.get_payload()['end'] = self._clone_terminal(
                right_payload['end'])
        return newjunc

    # return chr, and the left and right mode as an array
    def get_mode(self):
        """Return ``[left, right]`` one-base ranges at the mode of each side.

        NOTE(review): ``mode`` is defined outside this class; presumably it
        returns the most frequent value of the list -- confirm.
        """
        m1 = mode(self.left.get_payload()['junc'])
        m2 = mode(self.right.get_payload()['junc'])
        return [
            Bed(self.chr, m1 - 1, m1, self.dir),
            Bed(self.chr, m2 - 1, m2, self.dir),
        ]

    # Find the mode of the junction and see if this overlaps
    def overlaps(self, fjun2, juntol):
        """Return True when both sides' mode ranges overlap those of ``fjun2``.

        ``juntol`` is forwarded to ``overlaps_with_padding`` for each side.
        Different chromosomes or different directions never overlap.
        """
        mine = self.get_mode()
        theirs = fjun2.get_mode()
        if mine[0].chr != theirs[0].chr:
            return False
        if mine[0].direction != theirs[0].direction:
            return False  # usually they are both off
        for own_side, other_side in zip(mine, theirs):
            if not own_side.overlaps_with_padding(other_side, juntol):
                return False
        return True

    # Right now assumes these are overlap verified prior to calling
    def add_junction(self, inchr, inleft, inright, indir=None):
        """Add one observed junction, creating or widening the stored ranges."""
        if not self.left:  # this is our first one
            self.left = Bed(inchr, inleft - 1, inleft, indir)
            self.left.set_payload({'junc': [inleft], 'start': None})
            self.right = Bed(inchr, inright - 1, inright, indir)
            self.right.set_payload({'junc': [inright], 'end': None})
            return
        # Lets add this one to our current one
        newfuz = FuzzyJunction(inchr, inleft, inright, indir)
        self.add_fuzzy_junction(newfuz)

    def add_fuzzy_junction(self, newfuz):
        """Fold ``newfuz`` into this junction.

        Both sides are merged with the matching side of ``newfuz``; the merged
        ranges keep this junction's payload dicts, extended with the 'junc'
        values of ``newfuz``. 'start' / 'end' ranges are adopted from
        ``newfuz`` when only it has one, or merged (payload lists
        concatenated, own values first) when both have one.
        """
        mergeleft = self.left.merge(newfuz.left)
        mergeleft.set_payload(self.left.get_payload())
        mergeright = self.right.merge(newfuz.right)
        mergeright.set_payload(self.right.get_payload())
        mergeleft.get_payload()['junc'].extend(
            newfuz.left.get_payload()['junc'])
        mergeright.get_payload()['junc'].extend(
            newfuz.right.get_payload()['junc'])

        # fix the starts
        own_start = self.left.get_payload()['start']
        new_start = newfuz.left.get_payload()['start']
        if new_start and not own_start:
            mergeleft.get_payload()['start'] = new_start
        elif new_start and own_start:
            newrange = own_start.merge(new_start)
            newrange.set_payload(
                list(own_start.get_payload()) + list(new_start.get_payload()))
            mergeleft.get_payload()['start'] = newrange

        # fix the ends
        own_end = self.right.get_payload()['end']
        new_end = newfuz.right.get_payload()['end']
        if new_end and not own_end:
            mergeright.get_payload()['end'] = new_end
        elif new_end and own_end:
            newrange = new_end.merge(own_end)
            newrange.set_payload(
                list(own_end.get_payload()) + list(new_end.get_payload()))
            mergeright.get_payload()['end'] = newrange

        # We finished the changes
        self.left = mergeleft
        self.right = mergeright
class FuzzyJunction:
    """One splice junction, with each side held as a range plus a payload.

    ``left`` / ``right`` are ranges created with ``Bed``. Their payloads are
    dicts: ``{"junc": [...], "start": ...}`` on the left and
    ``{"junc": [...], "end": ...}`` on the right. ``"junc"`` accumulates the
    coordinates added for that side; ``"start"`` / ``"end"`` hold None or a
    range whose own payload is a list of positions.
    """

    # Pre: inleft is 1-indexed last exonic base on the left
    #      inright is 1-indexed first exonic base on the right
    #      direction doesn't need to be used
    def __init__(self, inchr=None, inleft=None, inright=None, indir=None):
        self.chr = inchr
        self.left = None  # range with payloads being the actual left and rights
        self.right = None
        self.dir = indir
        # A junction is only recorded when all three coordinates are supplied.
        if inchr and inleft and inright:
            self.add_junction(inchr, inleft, inright, indir)

    @staticmethod
    def _duplicate_range(source):
        # New range with the same attributes. ``start - 1`` follows the way
        # ranges are built throughout this class (presumably Bed keeps a
        # 1-based start internally -- confirm).
        return Bed(source.chr, source.start - 1, source.end, source.direction)

    @staticmethod
    def _duplicate_terminal(terminal):
        # Copy a "start"/"end" range along with a fresh copy of its list payload.
        twin = FuzzyJunction._duplicate_range(terminal)
        twin.set_payload(list(terminal.get_payload()))
        return twin

    def copy(self):
        """Return an independent FuzzyJunction holding the same data."""
        newjunc = FuzzyJunction()
        newjunc.chr = self.chr
        # ``dir`` feeds the ranges produced by get_mode(); carry it over so
        # the copy keeps the direction of its source.
        newjunc.dir = self.dir

        src_left = self.left.get_payload()
        src_right = self.right.get_payload()

        newjunc.left = self._duplicate_range(self.left)
        newjunc.left.set_payload({"junc": list(src_left["junc"]), "start": None})
        newjunc.right = self._duplicate_range(self.right)
        newjunc.right.set_payload({"junc": list(src_right["junc"]), "end": None})

        # copy any starts for the junction
        if src_left["start"]:
            newjunc.left.get_payload()["start"] = self._duplicate_terminal(src_left["start"])
        # copy any ends for the junction
        if src_right["end"]:
            newjunc.right.get_payload()["end"] = self._duplicate_terminal(src_right["end"])
        return newjunc

    # return chr, and the left and right mode as an array
    def get_mode(self):
        """Return ``[left, right]`` one-base ranges located at each side's mode.

        NOTE(review): ``mode`` comes from outside this class; presumably the
        most common value in the list -- confirm.
        """
        m1 = mode(self.left.get_payload()["junc"])
        m2 = mode(self.right.get_payload()["junc"])
        return [Bed(self.chr, m1 - 1, m1, self.dir), Bed(self.chr, m2 - 1, m2, self.dir)]

    # Find the mode of the junction and see if this overlaps
    def overlaps(self, fjun2, juntol):
        """Return True when the mode ranges of both sides overlap ``fjun2``'s.

        ``juntol`` is passed through to ``overlaps_with_padding``. A differing
        chromosome or direction yields False.
        """
        own_modes = self.get_mode()
        other_modes = fjun2.get_mode()
        if own_modes[0].chr != other_modes[0].chr:
            return False
        if own_modes[0].direction != other_modes[0].direction:
            return False  # usually they are both off
        for own_side, other_side in zip(own_modes, other_modes):
            if not own_side.overlaps_with_padding(other_side, juntol):
                return False
        return True

    # Right now assumes these are overlap verified prior to calling
    def add_junction(self, inchr, inleft, inright, indir=None):
        """Record one junction observation, initialising or widening the ranges."""
        if not self.left:  # this is our first one
            self.left = Bed(inchr, inleft - 1, inleft, indir)
            self.left.set_payload({"junc": [inleft], "start": None})
            self.right = Bed(inchr, inright - 1, inright, indir)
            self.right.set_payload({"junc": [inright], "end": None})
            return
        # Lets add this one to our current one
        newfuz = FuzzyJunction(inchr, inleft, inright, indir)
        self.add_fuzzy_junction(newfuz)

    def add_fuzzy_junction(self, newfuz):
        """Merge ``newfuz`` into this junction in place.

        Each side is merged with the same side of ``newfuz`` and keeps this
        junction's payload dict, extended by ``newfuz``'s "junc" values. A
        "start" / "end" range is taken from ``newfuz`` when only it has one,
        or merged when both have one (payload lists concatenated with this
        junction's values first).
        """
        mergeleft = self.left.merge(newfuz.left)
        mergeleft.set_payload(self.left.get_payload())
        mergeright = self.right.merge(newfuz.right)
        mergeright.set_payload(self.right.get_payload())
        mergeleft.get_payload()["junc"].extend(newfuz.left.get_payload()["junc"])
        mergeright.get_payload()["junc"].extend(newfuz.right.get_payload()["junc"])

        # fix the starts
        own_start = self.left.get_payload()["start"]
        new_start = newfuz.left.get_payload()["start"]
        if new_start and not own_start:
            mergeleft.get_payload()["start"] = new_start
        elif new_start and own_start:
            newrange = own_start.merge(new_start)
            newrange.set_payload(list(own_start.get_payload()) + list(new_start.get_payload()))
            mergeleft.get_payload()["start"] = newrange

        # fix the ends
        own_end = self.right.get_payload()["end"]
        new_end = newfuz.right.get_payload()["end"]
        if new_end and not own_end:
            mergeright.get_payload()["end"] = new_end
        elif new_end and own_end:
            newrange = new_end.merge(own_end)
            newrange.set_payload(list(own_end.get_payload()) + list(new_end.get_payload()))
            mergeright.get_payload()["end"] = newrange

        # We finished the changes
        self.left = mergeleft
        self.right = mergeright
Ejemplo n.º 31
0
 def get_bed(self):
     """Return a Bed running from this object's start range to its end range.

     Chromosome and direction are taken from ``self.start``; the first
     coordinate is ``self.start.start - 1`` and the second is ``self.end.end``.
     """
     first = self.start
     return Bed(first.chr, first.start - 1, self.end.end, first.direction)