Beispiel #1
0
 def LoadExonBoundaryCoordinatesFromFile(self, path): #{
   """Load exon boundary coordinates from a gene annotation file.

   Every transcript read from the file (except those whose normalized
   chromosome name is rejected by NonStandardChr) contributes its exon
   boundaries to self.exon_bound_coords, laid out as
     exon_bound_coords[chrom][side][coord][other_coord] -> transcript ids
   The left side of a transcript's first exon and the right side of its
   last exon are not recorded.

   Args:
     path: path of the gene annotation file to read.

   Raises:
     ExonBoundCounterError: if the exons returned by SortedExons() are not
       in non-decreasing order of their "min" coordinate.
   """
   DebugMsg(self, "Gene annotation file: %s" % path)
   # open the annotations file
   annotations_file = GeneAnnotationParserCls(path, log_info=self.log_info)
   skipped_chroms = set()
   # the try/finally guarantees that the file is closed even when a
   # transcript with badly ordered exons (or a parser error) aborts the load
   try: #{
     # get the coordinates from the file
     for transcript in annotations_file: #{
       # fix chromosome names, if needed
       chrom = NormalizeChrID(transcript.chrom)
       if (NonStandardChr(chrom)): #{
         ExtremeDebugMsg(self, "Skipping transcript in strange chromosome: "
           "%s (%s)" % (chrom, transcript.chrom))
         skipped_chroms.add(chrom)
         continue
       #} end if
       prev_exon = None
       for (index, exon) in enumerate(transcript.SortedExons()): #{
         (exon.left, exon.right) = (exon.min, exon.max)
         # the exon list is expected to be sorted by left coordinate
         if (prev_exon is not None and prev_exon.min > exon.min): #{
           raise ExonBoundCounterError("Transcript %s exons are not in "
             "order: %s, %s" % (transcript.transcript_id,
             prev_exon.ToString(), exon.ToString()))
         #} end if
         prev_exon = exon
         # do not include the left side of the first exon
         if (0 == index): #{
           exon.left = None
         #} end if
         # do not include the right side of the last exon
         if (len(transcript.exons) == (index+1)): #{
           exon.right = None
         #} end if
         # exon_bound_coords[chrom][prime_side][coord1][coord2] = gene_list
         for side in SIDES: #{
           coord = getattr(exon, side)
           # a side that was blanked out above must not be recorded
           if (coord is not None): #{
             keys = [chrom, side, coord, getattr(exon, OtherSide(side))]
             AddToMultiDict(self.exon_bound_coords, keys,
               transcript.transcript_id)
           #} end if
         #} end for
       #} end for
     #} end for
     if (skipped_chroms): #{
       DebugMsg(self, "Skipped transcripts in chromosomes: %s" %
         ", ".join(sorted(skipped_chroms)))
     #} end if
   finally:
     # close the file
     annotations_file.close()
   #} end try
Beispiel #2
0
 def __init__(self, path, log_info=None): #{
   """Store the logging settings and create the annotation parser.

   Args:
     path: path of the annotations file handed to GeneAnnotationParserCls.
     log_info: optional logging settings, kept on the instance and also
       forwarded to the parser.
   """
   # remember the logging settings for this object's own messages
   self.log_info = log_info
   # build the parser that reads the annotations file
   self.parser = GeneAnnotationParserCls(path, log_info=log_info)
Beispiel #3
0
class AnnotationsFileCls: #{
  def __init__(self, path, log_info=None): #{
    """Store the logging settings and create the annotation parser.

    Args:
      path: path of the annotations file handed to GeneAnnotationParserCls.
      log_info: optional logging settings, kept on the instance and also
        forwarded to the parser.
    """
    # remember the logging settings for this object's own messages
    self.log_info = log_info
    # build the parser that reads the annotations file
    self.parser = GeneAnnotationParserCls(path, log_info=log_info)
  #} end def

  def __iter__(self): #{
    return self
  #} end def

  def next(self): #{
    """Return the next usable transcript from the underlying parser.

    Transcripts are pulled from self.parser and passed through
    FixAnnotation until one is found that is on a standard chromosome,
    is not on chromosome "M", and whose gene name neither starts with
    "trna_" nor ends with "_rrna" (case-insensitive). The accepted
    transcript gets isoform set to 1, is flagged non-coding when its CDS
    start is not before its CDS end, has its exons split by SeparateUTRs,
    and has its exon lists reversed when it lies on the "-" strand.

    Whatever self.parser.next() raises when it runs out of transcripts
    propagates to the caller.
    """
    while (True): #{
      # clean up the alias and chromosome name of the raw annotation
      # (no "chr" prefix is requested)
      transcript = FixAnnotation(self.parser.next(), use_chr=False)
      ExtremeDebugMsg(self, "T: %s" % transcript)
      # accept only "normal", non-mitochondrial chromosomes ...
      chrom_ok = (not NonStandardChr(transcript.chrom) and
        "M" != transcript.chrom)
      if (chrom_ok): #{
        # ... and only genes that are not tRNAs or rRNAs
        gene_name = transcript.gene_name.lower()
        if (not (gene_name.startswith("trna_") or
            gene_name.endswith("_rrna"))): #{
          break
        #} end if
      #} end if
      ExtremeDebugMsg(self, "  Skipping...")
    #} end while
    transcript.isoform = 1
    # a CDS start that is not before the CDS end marks a non-coding transcript
    if (transcript.cdsStart >= transcript.cdsEnd): #{
      transcript.non_coding = True
    #} end if
    # split the exons into UTR and coding pieces
    self.SeparateUTRs(transcript)
    # negative-strand transcripts get their exon lists in reverse order
    if ("-" == transcript.strand): #{
      for list_name in ("exons", "split_exons", "utr_flags"): #{
        getattr(transcript, list_name).reverse()
      #} end for
    #} end if
    return transcript
  #} end def

  def SeparateUTRs(self, transcript): #{
    """Split the exons of a transcript into UTR and coding pieces.

    Resets transcript.num_coding_exons, transcript.utr_flags and
    transcript.split_exons, then (for coding transcripts) walks
    transcript.exons and appends one [start, end] entry to split_exons,
    plus a matching True (UTR) / False (coding) entry to utr_flags, for
    every piece. An exon that straddles cdsStart or cdsEnd is cut into a
    UTR piece and a coding piece. For transcripts with a true non_coding
    attribute both lists are left empty.

    The +1/-1 arithmetic at the cut points suggests that coordinates are
    inclusive on both ends -- TODO confirm against the parser.

    Raises:
      ExonCoordsError: if an exon's (possibly trimmed) start lies after
        its end.
      ChimeraSimulatorError: if split_exons and utr_flags end up with
        different lengths.
    """
    # ensure that exons are ordered by start coordinate
    # (only the first and last exons are compared; a descending list is
    # simply reversed in place)
    if (transcript.exons[0][0] > transcript.exons[-1][0]): #{
      transcript.exons.reverse()
    #} end if
    transcript.num_coding_exons = 0
    transcript.utr_flags = list()
    transcript.split_exons = list()
    # non-coding transcripts keep empty split_exons/utr_flags lists
    if (transcript.non_coding): #{
      ExtremeDebugMsg(self, "Not separating UTRs for non-coding gene")
      return
    #} end if
    ExtremeDebugMsg(self, "Separating UTRs from coding exons...\n"
      "cdsStart: %i, cdsEnd: %i" % (transcript.cdsStart, transcript.cdsEnd))
    for (e_start, e_end) in transcript.exons: #{
      ExtremeDebugMsg(self, "Exon start: %i, end: %i" % (e_start, e_end))
      # if the exon ends before the CDS start or
      # the exon starts after the CDS end,
      # the full exon is a UTR
      if (e_end < transcript.cdsStart or transcript.cdsEnd < e_start): #{
        transcript.utr_flags.append(True)
        transcript.split_exons.append([e_start, e_end])
        ExtremeDebugMsg(self, "  full UTR")
      else:
        # if the exon starts before the CDS start and
        # ends after the CDS start,
        # the first part of the exon is a UTR
        if (e_start < transcript.cdsStart): #{
          transcript.utr_flags.append(True)
          transcript.split_exons.append([e_start, transcript.cdsStart-1])
          # the remainder of the exon now begins at the CDS start
          e_start = transcript.cdsStart
          ExtremeDebugMsg(self, "  UTR start: %i-%i\n  New start: %i" %
            (transcript.split_exons[-1][0],
            transcript.split_exons[-1][1], e_start))
        #} end if
        # if the exon starts before the CDS end and
        # ends after the CDS end,
        # the second part of the exon is a UTR
        if (transcript.cdsEnd < e_end): #{
          # coding piece up to the CDS end, then the trailing UTR piece
          transcript.num_coding_exons += 1
          transcript.utr_flags.append(False)
          transcript.split_exons.append([e_start, transcript.cdsEnd])
          transcript.utr_flags.append(True)
          transcript.split_exons.append([transcript.cdsEnd+1, e_end])
          ExtremeDebugMsg(self, "  exon start: %i-%i\n  UTR end: %i-%i" %
            (transcript.split_exons[-2][0], transcript.split_exons[-2][1],
             transcript.split_exons[-1][0], transcript.split_exons[-1][1]))
        # if the exon starts after the CDS start and
        # ends before the CDS end,
        # the full exon is really an exon
        elif (e_start <= e_end):
          transcript.num_coding_exons += 1
          transcript.utr_flags.append(False)
          transcript.split_exons.append([e_start, e_end])
          ExtremeDebugMsg(self, "  full exon: %i-%i" % (e_start, e_end))
        else:
          # reached only when the (possibly trimmed) start is past the end
          raise ExonCoordsError("cannot determine exon type: "
            "%s: CDS:%i-%i, Exon:%i-%i" % (transcript.alias,
            transcript.cdsStart, transcript.cdsEnd, e_start, e_end))
        #} end if
      #} end if
    #} end for
    # sanity check: every split exon must have exactly one UTR flag
    # NOTE(review): the message reports len(transcript.exons), although the
    # condition compares len(transcript.split_exons) -- confirm which count
    # is meant to be shown
    if (len(transcript.split_exons) != len(transcript.utr_flags)): #{
      raise ChimeraSimulatorError("error loading transcript: # exons (%i)" %
        len(transcript.exons) + " not equal to # UTR flags (%i)" %
        len(transcript.utr_flags))