def processBED(infh, outhandle, scheme, verbose=False):
    """
    Convert reads mapped against exon-exon junction sequences into genomic
    intervals and write them, one BED line per interval, to outhandle.

    The chrom field of each read is expected to look like
    <chrom>_<exon1Start>_<exon1End>_<exon2Start>_<field5>; the chromosome
    name itself may contain underscores. Only the first three numeric
    fields are used.

    :param infh: input passed straight to BEDIterator.
    :param outhandle: object with a write() method that receives the output.
    :param scheme: one of FIRST_EXON, SECOND_EXON, BIGGEST_EXON,
                   FIVE_PRIME_END or BOTH_EXONS; selects which exon(s) of
                   the junction are reported for each read. The output names
                   get a "%1" (first exon) or "%2" (second exon) suffix.
    :param verbose: forwarded to BEDIterator.
    :raise ValueError: if no exon can be selected for a read (unknown scheme,
                       or FIVE_PRIME_END with a strand that is neither "+"
                       nor "-"), or if an output line has an empty chrom.
    """
    for read in BEDIterator(infh, verbose=verbose):
        # split the chrom field to get the genomic indices; splitting from
        # the right keeps any underscores in the chromosome name intact
        fields = read.chrom.rsplit("_", 4)
        chrom = fields[0]
        chrom1SeqStart = int(fields[1])
        chrom1SeqEnd = int(fields[2])
        chrom2SeqStart = int(fields[3])

        # only build the exons that the scheme might need; we add %1 or %2
        # to the end of the read names so they can be distinguished later
        # NOTE(review): read.start / read.end look like offsets within the
        # junction sequence -- confirm against the mapping pipeline
        firstExon = None
        secondExon = None
        if scheme != SECOND_EXON:
            firstExon = GenomicInterval(chrom, chrom1SeqStart + read.start - 1,
                                        chrom1SeqEnd, read.name, read.score,
                                        read.strand)
            firstExon.name = firstExon.name + "%1"
        if scheme != FIRST_EXON:
            end = chrom2SeqStart + (read.end -
                                    (chrom1SeqEnd - chrom1SeqStart)) - 1
            secondExon = GenomicInterval(chrom, chrom2SeqStart, end,
                                         read.name, read.score, read.strand)
            secondExon.name = secondExon.name + "%2"

        # arbitrarily decide the first exon contains the largest portion of
        # the read if both are the same
        if (scheme == FIRST_EXON) or \
           (scheme == BIGGEST_EXON and
                len(firstExon) >= len(secondExon)) or \
           (scheme == FIVE_PRIME_END and read.strand == "+"):
            out = str(firstExon)
        elif (scheme == SECOND_EXON) or \
             (scheme == BIGGEST_EXON and
                  len(secondExon) > len(firstExon)) or \
             (scheme == FIVE_PRIME_END and read.strand == "-"):
            out = str(secondExon)
        elif scheme == BOTH_EXONS:
            out = str(firstExon) + "\n" + str(secondExon)
        else:
            # without this branch 'out' would be unbound for the first read
            # and would silently repeat the previous read's output afterwards
            raise ValueError("unable to select an exon for read " +
                             str(read) + " using scheme " + str(scheme))

        # sanity check -- make sure we create a valid output string
        for line in out.split("\n"):
            e = parseBEDString(line)
            if e.chrom.strip() == "":
                raise ValueError(" got an emtpy chrom -> " + str(read))

        # write output
        outhandle.write(out + "\n")
Exemple #2
0
  def test_basic_iterator(self):
    # iterate over the annotation fixture with no alignment index and make
    # sure each element matches the one parsed from its individual string
    annotation_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(annotation_stream))
    self.assertEqual(len(elems), 6)
    for idx, observed in enumerate(elems):
      expected = retrotransposon.from_repeat_masker_string(self.indv_an[idx])
      self.assertEqual(observed, expected)

    # alignments are not available, so liftover should work only on coords;
    # just check one to make sure it's working
    # elem[0]: 15, 67 -> 85, 141 - (53 to 57; gap_length = 14)
    lifted = elems[0].liftover(GenomicInterval("chr1", 10, 100))
    expected_regions = [GenomicInterval("A#B", 128, 142, strand='+'),
                        GenomicInterval("A#B", 113, 127, strand='+'),
                        GenomicInterval("A#B", 98, 112, strand='+'),
                        GenomicInterval("A#B", 86, 97, strand='+')]
    self.assertEqual(lifted, expected_regions)
Exemple #3
0
 def __init__(self, chrom, genomic_start, genomic_end, consensus_match_strand,
              consensus_start, consensus_end, consensus_len, retrotransposon,
              pairwise_alignment=None, uniq_id=None):
   """
   Build a RetrotransposonOccurrence; the meaning of each parameter is
   given in the class docstring.

   The underlying genomic interval is named after the retrotransposon and
   is created with a score of 0 on the "+" strand.
   """
   occurrence_name = retrotransposon.name
   GenomicInterval.__init__(self, chrom, genomic_start, genomic_end,
                            occurrence_name, 0, "+")

   # identity of this occurrence
   self.retrotransposon = retrotransposon
   self.uniq_id = uniq_id

   # where (and on which strand) the occurrence matches the consensus
   # TODO check that consensus length is at least as long as end index
   self.consensus_match_strand = consensus_match_strand
   self.consensus_start = consensus_start
   self.consensus_end = consensus_end
   self.consensus_len = consensus_len

   # TODO would be good to provide some error-checking here to make sure the
   # alignment matches up properly with this occurrence...
   self.pairwise_alignment = pairwise_alignment
Exemple #4
0
 def __str__(self):
   """
   :return: a string representation of this occurrence: the genomic
            interval details, then " --> ", then the tab-separated fields
            match_start, match_end, consensus_len, repeat_name and
            match_strand.
   """
   interval_part = GenomicInterval.__str__(self)
   consensus_fields = [str(self.consensus_start),
                       str(self.consensus_end),
                       str(self.consensus_len),
                       self.repeat_name(),
                       str(self.consensus_match_strand)]
   return interval_part + " --> " + "\t".join(consensus_fields)
Exemple #5
0
  def test_iterator_with_alignment_index(self):
    # key function handed to the index: pulls the RepeatMasker ID out of an
    # alignment's meta-data
    def alignment_uid(rm_alignment):
      return rm_alignment.meta[repeatmaskerAlignments.RM_ID_KEY]

    alignment_stream = StringIO.StringIO(self.rm_rc_1_input)
    index = IndexedFile(alignment_stream, repeat_masker_alignment_iterator,
                        alignment_uid)

    annotation_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(annotation_stream,
                                        alignment_index=index))
    self.assertEqual(len(elems), 6)
    for idx, observed in enumerate(elems):
      expected = retrotransposon.from_repeat_masker_string(self.indv_an[idx])
      self.assertEqual(observed, expected)

    # alignments were provided, liftover should be using them; test one
    # to make sure they were matched up properly
    lifted = elems[0].liftover(GenomicInterval("chr1", 10, 100))
    self.assertEqual(lifted, [(132, 142), (120, 131), (88, 118), (85, 87)])
    # also test one of the ones that had no alignment; here we expect failure
    unaligned_region = GenomicInterval("chr1", 15200, 15400)
    self.assertRaises(IndexError, elems[4].liftover, unaligned_region)
Exemple #6
0
def processBED(infh, outhandle, scheme, verbose=False):
    """
    Convert reads mapped against exon-exon junction sequences into genomic
    intervals and write them, one BED line per interval, to outhandle.

    The chrom field of each read is expected to look like
    <chrom>_<exon1Start>_<exon1End>_<exon2Start>_<field5>; the chromosome
    name itself may contain underscores. Only the first three numeric
    fields are used.

    :param infh: input passed straight to BEDIterator.
    :param outhandle: object with a write() method that receives the output.
    :param scheme: one of FIRST_EXON, SECOND_EXON, BIGGEST_EXON,
                   FIVE_PRIME_END or BOTH_EXONS; selects which exon(s) of
                   the junction are reported for each read. The output names
                   get a "%1" (first exon) or "%2" (second exon) suffix.
    :param verbose: forwarded to BEDIterator.
    :raise ValueError: if no exon can be selected for a read (unknown scheme,
                       or FIVE_PRIME_END with a strand that is neither "+"
                       nor "-"), or if an output line has an empty chrom.
    """
    for read in BEDIterator(infh, verbose=verbose):
        # split the chrom field to get the genomic indices; splitting from
        # the right keeps any underscores in the chromosome name intact
        fields = read.chrom.rsplit("_", 4)
        chrom = fields[0]
        chrom1SeqStart = int(fields[1])
        chrom1SeqEnd = int(fields[2])
        chrom2SeqStart = int(fields[3])

        # only build the exons that the scheme might need; we add %1 or %2
        # to the end of the read names so they can be distinguished later
        # NOTE(review): read.start / read.end look like offsets within the
        # junction sequence -- confirm against the mapping pipeline
        firstExon = None
        secondExon = None
        if scheme != SECOND_EXON:
            firstExon = GenomicInterval(chrom, chrom1SeqStart + read.start - 1,
                                        chrom1SeqEnd, read.name, read.score,
                                        read.strand)
            firstExon.name = firstExon.name + "%1"
        if scheme != FIRST_EXON:
            end = chrom2SeqStart + (read.end -
                                    (chrom1SeqEnd - chrom1SeqStart)) - 1
            secondExon = GenomicInterval(chrom, chrom2SeqStart, end,
                                         read.name, read.score, read.strand)
            secondExon.name = secondExon.name + "%2"

        # arbitrarily decide the first exon contains the largest portion of
        # the read if both are the same
        if (scheme == FIRST_EXON) or \
           (scheme == BIGGEST_EXON and
                len(firstExon) >= len(secondExon)) or \
           (scheme == FIVE_PRIME_END and read.strand == "+"):
            out = str(firstExon)
        elif (scheme == SECOND_EXON) or \
             (scheme == BIGGEST_EXON and
                  len(secondExon) > len(firstExon)) or \
             (scheme == FIVE_PRIME_END and read.strand == "-"):
            out = str(secondExon)
        elif scheme == BOTH_EXONS:
            out = str(firstExon) + "\n" + str(secondExon)
        else:
            # without this branch 'out' would be unbound for the first read
            # and would silently repeat the previous read's output afterwards
            raise ValueError("unable to select an exon for read " +
                             str(read) + " using scheme " + str(scheme))

        # sanity check -- make sure we create a valid output string
        for line in out.split("\n"):
            e = parseBEDString(line)
            if e.chrom.strip() == "":
                raise ValueError(" got an emtpy chrom -> " + str(read))

        # write output
        outhandle.write(out + "\n")
Exemple #7
0
 def __init__(self, whole_chrom_files, partial_chrom_files, factory):
   """
   Constructor; see class docstring for param details.

   Builds one interval tree per chromosome from the keys of
   partial_chrom_files, which are (chrom, start, end) triples.

   :raise GenomeAlignmentError: if a chromosome with partial files also
                                appears in whole_chrom_files, or if a
                                partial region does not intersect exactly
                                one region in its chromosome's tree.
   """
   self.factory = factory
   self.whole_chrom_files = whole_chrom_files
   self.current = None
   self.current_key = None
   self.partial_trees = {}

   # group the partial-chromosome regions by the chromosome they are on
   regions_by_chrom = {}
   for key in partial_chrom_files:
     chrom, start, end = key
     payload = partial_chrom_files[key]
     if chrom in whole_chrom_files:
       raise GenomeAlignmentError("Oops")
     region = JITGenomeAlignmentKeyInterval(
         GenomicInterval(chrom, start, end), payload)
     regions_by_chrom.setdefault(chrom, []).append(region)

   for chrom, regions in regions_by_chrom.items():
     self.partial_trees[chrom] = IntervalTree(regions)

   # each region must hit exactly one entry in its tree (presumably itself)
   for chrom, start, end in partial_chrom_files:
     tree = self.partial_trees[chrom]
     if len(tree.intersectingInterval(start, end)) != 1:
       raise GenomeAlignmentError("Oops")
Exemple #8
0
def pairedBEDIterator(inputStreams,
                      mirror=False,
                      mirrorScore=None,
                      ignoreStrand=False,
                      ignoreScore=True,
                      ignoreName=True,
                      sortedby=ITERATOR_SORTED_END,
                      scoreType=float,
                      verbose=False):
    """
    Iterate over multiple BED format files simultaneously and yield lists of
    genomic intervals for each matching set of intervals found. By default,
    regions which are not found in all files will be skipped (mirror = false).
    Optionally (by setting mirror to true) if a file is missing an interval,
    it can be added on-the-fly, and will have the same chrom, start, end,
    name and strand as in other files. The score will be taken from the
    first stream that has the interval if mirrorScore is not set, otherwise
    that value will be used.

    If every input stream is empty, nothing is yielded.

    :param inputStreams: a list of (at least two) input streams in BED format
    :param mirror: if true, add missing elements so all streams contain the
                   same elements. Inserted elements will have the same
                   chrom, start, end, name and strand as the element they
                   mirror.
    :param mirrorScore: score to give mirrored elements; if None, the score
                        of the mirrored element is used.
    :param ignoreStrand: ignore strand when comparing elements for equality?
    :param ignoreScore: ignore score when comparing elements for equality?
    :param ignoreName: ignore name when comparing elements for equality?
    :param sortedby: must be set to one of the sorting orders for BED streams;
                     we require the streams to be sorted in some fashion.
    :param scoreType: interpret scores as what type? Defaults to float, which
                      is generally the most flexible.
    :param verbose: forwarded to each underlying BEDIterator.
    :return: generator of lists; each list holds one element per input
             stream, in the same order as inputStreams.
    """

    # let's build our sorting order; it doubles as the equality key
    sortOrder = ["chrom"]
    if sortedby == ITERATOR_SORTED_START:
        sortOrder.extend(["start", "end"])
    elif sortedby == ITERATOR_SORTED_END:
        sortOrder.extend(["end", "start"])
    if not ignoreStrand:
        sortOrder.append("strand")
    if not ignoreName:
        sortOrder.append("name")
    if not ignoreScore:
        sortOrder.append("score")
    keyFunc = attrgetter(*sortOrder)

    def next_item(iterator):
        """ little internal function to return the next item, or None """
        return next(iterator, None)

    bIterators = [
        BEDIterator(bfh,
                    verbose=verbose,
                    sortedby=sortedby,
                    scoreType=scoreType) for bfh in inputStreams
    ]
    elements = [next_item(it) for it in bIterators]
    assert (len(elements) >= 2)

    # keep going until all streams are exhausted; testing this up-front
    # (rather than at the bottom of the loop) means streams that are all
    # empty yield nothing instead of failing on min() of an empty list
    while any(e is not None for e in elements):
        if None not in elements and len(set([keyFunc(x)
                                             for x in elements])) == 1:
            # All equal -- yield and move on for all streams
            yield list(elements)
            elements = [next_item(it) for it in bIterators]
            continue

        # something wasn't equal.. find the smallest thing, it's about to
        # drop out of range and will never have the chance to match anything
        # again
        minElement = min([x for x in elements if x is not None], key=keyFunc)
        minKey = keyFunc(minElement)
        minIndices = [
            i for i, e in enumerate(elements)
            if e is not None and keyFunc(e) == minKey
        ]
        if mirror:
            # mirror the min item for any streams in which it doesn't match
            score = minElement.score if mirrorScore is None else mirrorScore
            yield [
                elements[i] if i in minIndices else GenomicInterval(
                    minElement.chrom,
                    minElement.start,
                    minElement.end,
                    minElement.name,
                    score,
                    minElement.strand,
                    scoreType=scoreType) for i in range(0, len(elements))
            ]

        # move the smallest element onwards now, we're done with it
        for index in minIndices:
            elements[index] = next_item(bIterators[index])
Exemple #9
0
def pairedWigIterator(inputStreams,
                      mirror=False,
                      mirrorScore=None,
                      ignoreScore=True,
                      sortedby=ITERATOR_SORTED_END,
                      scoreType=int,
                      verbose=False,
                      debug=False):
    """
    @summary: iterate over multiple wig streams, and yield a list of wig
              elements that match for each location (locations with 0 matching
              items are skipped). If every stream is empty, nothing is
              yielded.
    @param inputStreams: list of (at least two) wig streams; each one is
                         passed to wigIterator
    @param mirror:      if true, an element missing from some streams is
                        still yielded, with a stand-in element (same chrom,
                        start and end, no name) filling in for each stream
                        that lacks it
    @param mirrorScore: score given to stand-in elements; if None, the score
                        of the element being mirrored is used
    @param ignoreScore: Don't consider score when determining if two elements
                        are equal
    @param sortedby:    sorting order of the streams (ITERATOR_SORTED_START
                        or ITERATOR_SORTED_END); forwarded to wigIterator and
                        used to order elements across streams
    @param scoreType:   forwarded to wigIterator
    @param verbose:     forwarded to wigIterator
    @param debug:       currently unused
    @note: streams must be sorted -- sortedby parameter determines what sorting
           order will be acceptable
    """
    # let's build our sorting order; it doubles as the equality key
    sortOrder = ["chrom"]
    if sortedby == ITERATOR_SORTED_START:
        sortOrder.extend(["start", "end"])
    elif sortedby == ITERATOR_SORTED_END:
        sortOrder.extend(["end", "start"])
    if not ignoreScore:
        sortOrder.append("score")
    keyFunc = attrgetter(*sortOrder)

    def next_item(iterator):
        """ little internal function to return the next item, or None """
        return next(iterator, None)

    wIterators = [
        wigIterator(fh,
                    verbose=verbose,
                    sortedby=sortedby,
                    scoreType=scoreType) for fh in inputStreams
    ]
    elements = [next_item(it) for it in wIterators]
    assert (len(elements) >= 2)

    # keep going until all streams are exhausted; testing this up-front
    # (rather than at the bottom of the loop) means streams that are all
    # empty yield nothing instead of failing on min() of an empty list
    while any(e is not None for e in elements):
        if None not in elements and len(set([keyFunc(x)
                                             for x in elements])) == 1:
            # All equal -- yield and move on for all streams
            yield list(elements)
            elements = [next_item(it) for it in wIterators]
            continue

        # something wasn't equal.... find the smallest thing, it's about
        # to drop out of range...
        minElement = min([x for x in elements if x is not None], key=keyFunc)
        minKey = keyFunc(minElement)
        minIndices = [
            i for i, e in enumerate(elements)
            if e is not None and keyFunc(e) == minKey
        ]
        if mirror:
            # mirror the min item for any streams in which it doesn't match
            # NOTE(review): unlike pairedBEDIterator, scoreType is not passed
            # on to the mirrored element here -- confirm that is intended
            score = minElement.score if mirrorScore is None else mirrorScore
            yield [
                elements[i] if i in minIndices else GenomicInterval(
                    minElement.chrom, minElement.start, minElement.end,
                    None, score) for i in range(0, len(elements))
            ]

        # move the smallest element onwards now, we're done with it
        for index in minIndices:
            elements[index] = next_item(wIterators[index])
Exemple #10
0
def fixedWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
    """
    @summary: iterate over a fixedStep wig stream, yielding one
              GenomicInterval per data line. Blank lines and 'track' lines
              are skipped; each 'fixedStep' line resets the current chrom,
              position and step (read from its 2nd, 3rd and 4th fields).
    @param fd:        the input; passed to openFD to get a stream
    @param verbose:   if true, show progress; switched off with a warning
                      if the stream's size cannot be determined
    @param sortedby:  if ITERATOR_SORTED_START, check that elements are in
                      order of start index within a chrom, and that chroms
                      appear in increasing order without repeats
    @param scoreType: forwarded to GenomicInterval for each element
    @raise WigIteratorError: if a requested sort order is violated
    """
    fh = openFD(fd)
    # name used in error messages; neither fd nor the opened stream is
    # guaranteed to have a 'name' attribute, so fall back gracefully rather
    # than masking the real error with an AttributeError
    sourceName = str(getattr(fd, "name", None) or
                     getattr(fh, "name", None) or "<stream>")

    if verbose:
        try:
            pind = ProgressIndicator(totalToDo=os.path.getsize(fh.name),
                                     messagePrefix="completed",
                                     messageSuffix="of processing " + fh.name)
        except AttributeError:
            sys.stderr.write("WigIterator -- warning: " +
                             "unable to show progress for stream")
            verbose = False

    chromsSeen = set()
    prev = None
    checkOrder = (sortedby == ITERATOR_SORTED_START)

    currentChrom, at, step = None, None, None
    for line in fh:
        line = line.strip()
        if line == "":
            continue

        if line[0] in ('t', 'f'):
            # header line; anything other than a fixedStep declaration
            # (e.g. a track line) is ignored
            parts = line.split()
            if parts[0] == "fixedStep":
                currentChrom = parts[1].split("=")[1]
                at = int(parts[2].split("=")[1])
                step = int(parts[3].split("=")[1])
            continue

        # data line
        # NOTE(review): the interval end is start + step; any 'span' given
        # in the header is not consulted -- confirm this is intended
        val = float(line)
        e = GenomicInterval(currentChrom,
                            at,
                            at + step,
                            None,
                            val,
                            scoreType=scoreType)

        if prev is not None:
            if e.chrom == prev.chrom:
                # on same chrom as the prev item, make sure order is right
                if checkOrder and prev.start > e.start:
                    raise WigIteratorError(
                        "Wig file " + sourceName +
                        " not sorted by start index - saw item " + str(prev) +
                        " before " + str(e))
            else:
                # starting a new chrom.. make sure we haven't already seen it
                if checkOrder and \
                   (e.chrom in chromsSeen or prev.chrom > e.chrom):
                    raise WigIteratorError("Wig file " + sourceName +
                                           " not sorted by chrom")
                chromsSeen.add(e.chrom)

        # all good..
        yield e
        prev = e
        at += step
        if verbose:
            pind.done = fh.tell()
            pind.showProgress()