Esempio n. 1
0
def repeat_masker_iterator(fh, alignment_index=None,
                           header=True, verbose=False):
  """
  Iterator for repeatmasker coordinate annotation files. These files describe
  the location of repeat occurrences. There is (optionally) a two-line header
  with the names of the fields (ignored by the iterator, if present). Each
  non-blank line is a record of an occurrence and is parsed by
  retrotransposon.from_repeat_masker_string, which also describes the fields.

  :param fh:              stream-like object, or string filename, to load the
                          annotations from. A file opened here from a filename
                          is closed when the iterator is exhausted or closed;
                          a stream passed in by the caller is left open.
  :param alignment_index: an IndexedFile for full alignments; keys should be
                          repeat-masker IDs. If given, each yielded record gets
                          a JustInTimePairwiseAlignment built from this index
                          and the record's uniq_id as its pairwise_alignment.
  :param header:          if True, expect and discard the two-line header;
                          otherwise we will expect there is no header. Input
                          shorter than the header simply yields nothing.
  :param verbose:         if True, output additional status messages about
                          progress to stderr.
  :return: a generator yielding one parsed record per non-blank line.
  """

  # only close the stream at the end if we were the ones to open it
  opened_here = isinstance(fh, str)
  strm = open(fh) if opened_here else fh

  try:
    # try to get an idea of how much data we have...
    if verbose:
      try:
        total = os.path.getsize(strm.name)
        pind = ProgressIndicator(totalToDo=total, messagePrefix="completed",
                                 messageSuffix="of processing " + strm.name)
      except (AttributeError, OSError) as e:
        # progress reporting is best-effort: the stream may have no name, or
        # a name that is not a real path; carry on without it
        sys.stderr.write(str(e))
        sys.stderr.write("completed [unknown] of processing index")
        verbose = False

    if header:
      # chomp first 2 lines; the default stops a short or empty input from
      # raising StopIteration inside this generator (a RuntimeError under
      # PEP 479)
      next(strm, None)
      next(strm, None)

    for line in strm:
      if verbose:
        # NOTE(review): Python 3 text files disable tell() while they are
        # being iterated -- confirm the target version before relying on this
        pind.done = strm.tell()
        pind.showProgress()

      line = line.strip()
      if line == "":
        continue
      rto = retrotransposon.from_repeat_masker_string(line)
      if alignment_index is not None:
        rto.pairwise_alignment =\
            JustInTimePairwiseAlignment(alignment_index, rto.uniq_id)
      yield rto
  finally:
    if opened_here:
      strm.close()
Esempio n. 2
0
  def test_basic_iterator(self):
    """
    Iterate over self.ann without an alignment index and check that each
    record matches the one parsed directly from self.indv_an, then check a
    coordinate-only liftover on the first record.
    """
    ann_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(ann_stream))
    self.assertEqual(len(elems), 6)
    for idx, elem in enumerate(elems):
      expected = retrotransposon.from_repeat_masker_string(self.indv_an[idx])
      self.assertEqual(elem, expected)

    # alignments are not available, so liftover should work only on coords;
    # just check one to make sure it's working
    # elem[0]: 15, 67 -> 85, 141 - (53 to 57; gap_length = 14)
    lifted = elems[0].liftover(GenomicInterval("chr1", 10, 100))
    expected_intervals = [
        GenomicInterval("A#B", 128, 142, strand='+'),
        GenomicInterval("A#B", 113, 127, strand='+'),
        GenomicInterval("A#B", 98, 112, strand='+'),
        GenomicInterval("A#B", 86, 97, strand='+'),
    ]
    self.assertEqual(lifted, expected_intervals)
Esempio n. 3
0
  def test_iterator_with_alignment_index(self):
    """
    Iterate over self.ann with an alignment index built from
    self.rm_rc_1_input and check that each record matches the one parsed
    directly from self.indv_an, then check liftover on a record with an
    alignment and on one without.
    """
    def uid_from_alignment(alignment):
      # key each alignment in the index by its repeat-masker ID
      return alignment.meta[repeatmaskerAlignments.RM_ID_KEY]

    alignment_stream = StringIO.StringIO(self.rm_rc_1_input)
    index = IndexedFile(alignment_stream, repeat_masker_alignment_iterator,
                        uid_from_alignment)

    ann_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(ann_stream, alignment_index=index))
    self.assertEqual(len(elems), 6)
    for idx, elem in enumerate(elems):
      expected = retrotransposon.from_repeat_masker_string(self.indv_an[idx])
      self.assertEqual(elem, expected)

    # alignments were provided, so liftover should be using them; test one
    # to make sure they were matched up properly
    lifted = elems[0].liftover(GenomicInterval("chr1", 10, 100))
    self.assertEqual(lifted, [(132, 142), (120, 131), (88, 118), (85, 87)])

    # also test one of the ones that had no alignment; here we expect failure
    unaligned_query = GenomicInterval("chr1", 15200, 15400)
    self.assertRaises(IndexError, elems[4].liftover, unaligned_query)