Esempio n. 1
0
  def testBothExonsScheme(self):
    """
    Run processBED with the BOTH_EXONS scheme and inspect the output pairs.

    Non-blank output lines are consumed two at a time. For every pair the
    test checks that both elements share the same name once the last two
    characters are dropped, that their combined length equals the expected
    length recorded for that name, that the first element ends at the
    expected first-chromosome end, and that the second element starts at
    the expected second-chromosome start.
    """
    source = StringIO.StringIO("\n".join(self.readLines))
    sink = StringIO.StringIO()
    processBED(source, sink, BOTH_EXONS)

    # drop blank lines, then walk what is left in steps of two
    rawLines = sink.getvalue().split("\n")
    produced = [ln for ln in rawLines if ln.strip() != ""]
    for idx in range(0, len(produced), 2):
      headLine, tailLine = produced[idx], produced[idx + 1]
      e1 = parseBEDString(headLine)
      e2 = parseBEDString(tailLine)

      # names carry a two-character suffix; strip it to get the lookup key
      key = e1.name[:-2]
      self.assertTrue(key == e2.name[:-2])
      combined = len(e1) + len(e2)
      self.assertTrue(self.lengths[key] == combined)
      self.assertTrue(e1.end == self.firstChromEnds[key])
      self.assertTrue(e2.start == self.secondChromStarts[e2.name[:-2]])
Esempio n. 2
0
    def testBothExonsScheme(self):
        """
        Run processBED with the BOTH_EXONS scheme and inspect output pairs.

        Non-blank output lines are consumed two at a time. For every pair
        the test checks that both elements share the same name once the last
        two characters are dropped, that their combined length equals the
        expected length recorded for that name, that the first element ends
        at the expected first-chromosome end, and that the second element
        starts at the expected second-chromosome start.
        """
        source = StringIO.StringIO("\n".join(self.readLines))
        sink = StringIO.StringIO()
        processBED(source, sink, BOTH_EXONS)

        # drop blank lines, then walk what is left in steps of two
        rawLines = sink.getvalue().split("\n")
        produced = [ln for ln in rawLines if ln.strip() != ""]
        for idx in range(0, len(produced), 2):
            headLine, tailLine = produced[idx], produced[idx + 1]
            e1 = parseBEDString(headLine)
            e2 = parseBEDString(tailLine)

            # names carry a two-character suffix; strip it for the lookups
            key = e1.name[:-2]
            self.assertTrue(key == e2.name[:-2])
            combined = len(e1) + len(e2)
            self.assertTrue(self.lengths[key] == combined)
            self.assertTrue(e1.end == self.firstChromEnds[key])
            self.assertTrue(e2.start == self.secondChromStarts[e2.name[:-2]])
Esempio n. 3
0
  def testSecondExonScheme(self):
    """
    Run processBED with the SECOND_EXON scheme and check every output line.

    For each non-blank output element the test checks that its length
    equals the read length minus the distance from the read's global start
    to the first-chromosome end, minus one, and that it starts at the
    expected second-chromosome start.
    """
    source = DummyInputStream(self.readLines)
    sink = DummyOutputStream()
    processBED(source, sink, SECOND_EXON)

    produced = [w.strip() for w in sink.itemsWritten() if w.strip() != ""]

    for line in produced:
      e2 = parseBEDString(line)
      # names carry a two-character suffix; strip it to get the lookup key
      key = e2.name[:-2]

      gotAnswer = len(e2)
      readLength = self.readEnds[key] - self.readStarts[key]
      globalReadStart = self.firstChromStarts[key] + self.readStarts[key]
      beforeFirstEnd = self.firstChromEnds[key] - globalReadStart
      expectedAns = readLength - beforeFirstEnd - 1
      self.assertTrue(gotAnswer == expectedAns)
      self.assertTrue(e2.start == self.secondChromStarts[key])
Esempio n. 4
0
  def testFirstExonScheme(self):
    """
    Run processBED with the FIRST_EXON scheme and check every output line.

    For each non-blank output element the test checks that its length
    equals the distance from the read's global start to the expected
    first-chromosome end, plus one, and that it ends at that expected end.
    """
    source = DummyInputStream(self.readLines)
    sink = DummyOutputStream()
    processBED(source, sink, FIRST_EXON)

    produced = [w.strip() for w in sink.itemsWritten() if w.strip() != ""]

    for line in produced:
      e1 = parseBEDString(line)
      # names carry a two-character suffix; strip it to get the lookup key
      key = e1.name[:-2]

      gotAnswer = len(e1)
      globalReadStart = self.firstChromStarts[key] + self.readStarts[key]
      expectedAns = self.firstChromEnds[key] - globalReadStart + 1
      self.assertTrue(gotAnswer == expectedAns)
      self.assertTrue(e1.end == self.firstChromEnds[key])
Esempio n. 5
0
    def testSecondExonScheme(self):
        """
        Run processBED with the SECOND_EXON scheme and check each output.

        For each non-blank output element the test checks that its length
        equals the read length minus the distance from the read's global
        start to the first-chromosome end, minus one, and that it starts at
        the expected second-chromosome start.
        """
        source = DummyInputStream(self.readLines)
        sink = DummyOutputStream()
        processBED(source, sink, SECOND_EXON)

        written = sink.itemsWritten()
        produced = [w.strip() for w in written if w.strip() != ""]

        for line in produced:
            e2 = parseBEDString(line)
            # names carry a two-character suffix; strip it for the lookups
            key = e2.name[:-2]

            gotAnswer = len(e2)
            readLength = self.readEnds[key] - self.readStarts[key]
            globalReadStart = (self.firstChromStarts[key] +
                               self.readStarts[key])
            beforeFirstEnd = self.firstChromEnds[key] - globalReadStart
            expectedAns = readLength - beforeFirstEnd - 1
            self.assertTrue(gotAnswer == expectedAns)
            self.assertTrue(e2.start == self.secondChromStarts[key])
Esempio n. 6
0
    def testFirstExonScheme(self):
        """
        Run processBED with the FIRST_EXON scheme and check each output.

        For each non-blank output element the test checks that its length
        equals the distance from the read's global start to the expected
        first-chromosome end, plus one, and that it ends at that expected
        end.
        """
        source = DummyInputStream(self.readLines)
        sink = DummyOutputStream()
        processBED(source, sink, FIRST_EXON)

        written = sink.itemsWritten()
        produced = [w.strip() for w in written if w.strip() != ""]

        for line in produced:
            e1 = parseBEDString(line)
            # names carry a two-character suffix; strip it for the lookups
            key = e1.name[:-2]

            gotAnswer = len(e1)
            globalReadStart = (self.firstChromStarts[key] +
                               self.readStarts[key])
            expectedAns = self.firstChromEnds[key] - globalReadStart + 1
            self.assertTrue(gotAnswer == expectedAns)
            self.assertTrue(e1.end == self.firstChromEnds[key])
Esempio n. 7
0
def processBED(infh, outhandle, scheme, verbose=False):
    """
    Rewrite each BED read from ``infh`` as one or two genomic intervals.

    The chrom field of every read is split on "_" from the right into at
    most five fields, so any extra underscores stay in the first field. The
    first field is used as the chromosome name and the next three are parsed
    as integers: start of the first region, end of the first region and
    start of the second region.

    :param infh: input handed straight to BEDIterator.
    :param outhandle: object with a write() method; receives one line per
                      output interval.
    :param scheme: selects which interval(s) to emit; compared against
                   FIRST_EXON, SECOND_EXON, BIGGEST_EXON, FIVE_PRIME_END
                   and BOTH_EXONS.
    :param verbose: forwarded to BEDIterator.
    :raise ValueError: if no output can be selected for the scheme and the
                       read's strand, or if an output line parses to an
                       element with an empty chrom.
    """
    for read in BEDIterator(infh, verbose=verbose):
        # split the chrom field to get the genomic indices; splitting from
        # the right keeps underscores that belong to the chromosome name
        fields = read.chrom.rsplit("_", 4)
        chrom = fields[0]
        chrom1SeqStart = int(fields[1])
        chrom1SeqEnd = int(fields[2])
        chrom2SeqStart = int(fields[3])

        # only build the intervals the scheme can actually need
        firstExon = None
        secondExon = None
        if scheme != SECOND_EXON:
            firstExon = GenomicInterval(chrom, chrom1SeqStart + read.start - 1,
                                        chrom1SeqEnd, read.name, read.score,
                                        read.strand)
        if scheme != FIRST_EXON:
            end = chrom2SeqStart + (read.end -
                                    (chrom1SeqEnd - chrom1SeqStart)) - 1
            secondExon = GenomicInterval(chrom, chrom2SeqStart, end, read.name,
                                         read.score, read.strand)

        # we add %1 or %2 to the end of the read names so they can
        # be distinguished later
        if firstExon is not None:
            firstExon.name = firstExon.name + "%1"
        if secondExon is not None:
            secondExon.name = secondExon.name + "%2"

        # for BIGGEST_EXON, ties arbitrarily go to the first exon
        if (scheme == FIRST_EXON) or \
           (scheme == BIGGEST_EXON and len(firstExon) >= len(secondExon)) or \
           (scheme == FIVE_PRIME_END and read.strand == "+"):
            out = str(firstExon)
        elif (scheme == SECOND_EXON) or \
             (scheme == BIGGEST_EXON and len(secondExon) > len(firstExon)) or \
             (scheme == FIVE_PRIME_END and read.strand == "-"):
            out = str(secondExon)
        elif scheme == BOTH_EXONS:
            out = str(firstExon) + "\n" + str(secondExon)
        else:
            # without this branch 'out' would be unbound on the first read,
            # or would silently repeat the previous read's output
            raise ValueError("cannot select output for scheme " +
                             str(scheme) + " and strand " +
                             str(read.strand) + " -> " + str(read))

        # sanity check -- make sure we create a valid output string
        for line in out.split("\n"):
            e = parseBEDString(line)
            if e.chrom.strip() == "":
                raise ValueError(" got an emtpy chrom -> " + str(read))

        # write output
        outhandle.write(out + "\n")
Esempio n. 8
0
def processBED(infh, outhandle, scheme, verbose=False):
  """
  Rewrite each BED read from ``infh`` as one or two genomic intervals.

  The chrom field of every read is split on "_" from the right into at most
  five fields, so any extra underscores stay in the first field. The first
  field is used as the chromosome name and the next three are parsed as
  integers: start of the first region, end of the first region and start of
  the second region.

  :param infh: input handed straight to BEDIterator.
  :param outhandle: object with a write() method; receives one line per
                    output interval.
  :param scheme: selects which interval(s) to emit; compared against
                 FIRST_EXON, SECOND_EXON, BIGGEST_EXON, FIVE_PRIME_END and
                 BOTH_EXONS.
  :param verbose: forwarded to BEDIterator.
  :raise ValueError: if no output can be selected for the scheme and the
                     read's strand, or if an output line parses to an
                     element with an empty chrom.
  """
  for read in BEDIterator(infh, verbose=verbose):
    # split the chrom field to get the genomic indices; splitting from the
    # right keeps underscores that belong to the chromosome name
    fields = read.chrom.rsplit("_", 4)
    chrom = fields[0]
    chrom1SeqStart = int(fields[1])
    chrom1SeqEnd = int(fields[2])
    chrom2SeqStart = int(fields[3])

    # only build the intervals the scheme can actually need
    firstExon = None
    secondExon = None
    if scheme != SECOND_EXON:
      firstExon = GenomicInterval(chrom, chrom1SeqStart + read.start - 1,
                                  chrom1SeqEnd, read.name, read.score,
                                  read.strand)
    if scheme != FIRST_EXON:
      end = chrom2SeqStart + (read.end - (chrom1SeqEnd - chrom1SeqStart)) - 1
      secondExon = GenomicInterval(chrom, chrom2SeqStart, end, read.name,
                                   read.score, read.strand)

    # we add %1 or %2 to the end of the read names so they can
    # be distinguished later
    if firstExon is not None:
      firstExon.name = firstExon.name + "%1"
    if secondExon is not None:
      secondExon.name = secondExon.name + "%2"

    # for BIGGEST_EXON, ties arbitrarily go to the first exon
    if (scheme == FIRST_EXON) or \
       (scheme == BIGGEST_EXON and len(firstExon) >= len(secondExon)) or \
       (scheme == FIVE_PRIME_END and read.strand == "+"):
      out = str(firstExon)
    elif (scheme == SECOND_EXON) or \
         (scheme == BIGGEST_EXON and len(secondExon) > len(firstExon)) or \
         (scheme == FIVE_PRIME_END and read.strand == "-"):
      out = str(secondExon)
    elif scheme == BOTH_EXONS:
      out = str(firstExon) + "\n" + str(secondExon)
    else:
      # without this branch 'out' would be unbound on the first read, or
      # would silently repeat the previous read's output
      raise ValueError("cannot select output for scheme " + str(scheme) +
                       " and strand " + str(read.strand) + " -> " +
                       str(read))

    # sanity check -- make sure we create a valid output string
    for line in out.split("\n"):
      e = parseBEDString(line)
      if e.chrom.strip() == "":
        raise ValueError(" got an emtpy chrom -> " + str(read))

    # write output
    outhandle.write(out + "\n")
Esempio n. 9
0
  def testInclusion(self):
    """
    Check that processBED neither drops nor invents reads, for each scheme.

    For every scheme in self.schemes the set of read names found in the
    output (with the two-character suffix removed) must equal the set of
    input names. Under BOTH_EXONS every read produces two output lines, so
    the number of output names must be exactly twice the number of input
    names. Occurrence counts are not checked for the other schemes.
    """

    for scheme in self.schemes:
      infh = StringIO.StringIO("\n".join(self.readLines))
      outfh = StringIO.StringIO()
      processBED(infh, outfh, scheme)

      # see what we get..
      outlines = outfh.getvalue().split("\n")
      outlines = [l for l in outlines if l.strip() != ""]
      outnames = [parseBEDString(line).name[:-2] for line in outlines]
      self.assertEqual(set(outnames), set(self.names.keys()))

      if scheme == BOTH_EXONS:
        # compare against 2 * n rather than dividing, so an odd number of
        # output lines cannot be hidden by integer division
        self.assertEqual(len(outnames), 2 * len(self.names))
Esempio n. 10
0
def randomBEDElement(name=None, chrom=None, start=None, end=None, delim="\t",
                     maxIndex=1000000):
  """
  Build a BED element, drawing random values for anything not supplied.

  :param name: element name; a randomName(10) value is used when None.
  :param chrom: chromosome name; a randomName(10) value is used when None.
  :param start: start index; when None, drawn from [0, maxIndex - 1).
  :param end: end index; when None, drawn between start and maxIndex.
  :param delim: separator used to join the fields into the line that is
                handed to parseBEDString.
  :param maxIndex: upper limit used when drawing start and end.
  :return: whatever parseBEDString returns for the assembled line. The score
           is always random (below 30) and the strand is "+" or "-".
  """
  MAX_SCORE = 30

  # the draws below happen in a fixed order: name, chrom, start, end,
  # score, strand
  name = randomName(10) if name is None else name
  chrom = randomName(10) if chrom is None else chrom
  if start is None:
    start = int(random.random() * (maxIndex - 1))
  if end is None:
    end = int(random.random() * (maxIndex - start) + start)
  score = int(random.random() * MAX_SCORE)
  strand = "+" if random.random() <= 0.5 else "-"

  fields = [chrom, str(start), str(end), name, str(score), strand]
  return parseBEDString(delim.join(fields))
Esempio n. 11
0
    def testInclusion(self):
        """
        Check that processBED neither drops nor invents reads, per scheme.

        For every scheme in self.schemes the set of read names found in the
        output (with the two-character suffix removed) must equal the set of
        input names. Under BOTH_EXONS every read produces two output lines,
        so the number of output names must be exactly twice the number of
        input names. Occurrence counts are not checked for other schemes.
        """

        for scheme in self.schemes:
            infh = StringIO.StringIO("\n".join(self.readLines))
            outfh = StringIO.StringIO()
            processBED(infh, outfh, scheme)

            # see what we get..
            outlines = outfh.getvalue().split("\n")
            outlines = [l for l in outlines if l.strip() != ""]
            outnames = [parseBEDString(line).name[:-2] for line in outlines]
            self.assertEqual(set(outnames), set(self.names.keys()))

            if scheme == BOTH_EXONS:
                # compare against 2 * n rather than dividing, so an odd
                # number of output lines cannot be hidden by integer division
                self.assertEqual(len(outnames), 2 * len(self.names))
Esempio n. 12
0
def randomBEDElement(name=None,
                     chrom=None,
                     start=None,
                     end=None,
                     delim="\t",
                     maxIndex=1000000):
    """
    Build a BED element, drawing random values for anything not supplied.

    :param name: element name; a randomName(10) value is used when None.
    :param chrom: chromosome name; a randomName(10) value is used when None.
    :param start: start index; when None, drawn from [0, maxIndex - 1).
    :param end: end index; when None, drawn between start and maxIndex.
    :param delim: separator used to join the fields into the line that is
                  handed to parseBEDString.
    :param maxIndex: upper limit used when drawing start and end.
    :return: whatever parseBEDString returns for the assembled line. The
             score is always random (below 30) and the strand is "+" or "-".
    """
    MAX_SCORE = 30

    # the draws below happen in a fixed order: name, chrom, start, end,
    # score, strand
    name = randomName(10) if name is None else name
    chrom = randomName(10) if chrom is None else chrom
    if start is None:
        start = int(random.random() * (maxIndex - 1))
    if end is None:
        end = int(random.random() * (maxIndex - start) + start)
    score = int(random.random() * MAX_SCORE)
    strand = "+" if random.random() <= 0.5 else "-"

    fields = [chrom, str(start), str(end), name, str(score), strand]
    return parseBEDString(delim.join(fields))
Esempio n. 13
0
    def testPairedIterator(self):
        """
        Exercise pairedBEDIterator on three small in-memory BED inputs.

        Two configurations are checked: first with strand, score and name
        ignored and no mirroring; then with strand respected and missing
        elements mirrored using a score of 0. In both cases the i-th member
        of every yielded triple is collected and the resulting list is
        compared with the expected elements for the i-th input.
        """
        debug = False

        in1 = ("chr1\t10\t15\tX\t1\t+\n"
               "chr1\t20\t25\tX\t2\t-\n"
               "chr1\t40\t47\tX\t3\t+\n"
               "chr2\t10\t15\tX\t4\t-\n")
        in2 = ("chr1\t10\t15\tX\t1\t+\n"
               "chr1\t40\t47\tX\t2\t+\n"
               "chr2\t10\t15\tX\t3\t-\n")
        in3 = ("chr1\t20\t25\tX\t1\t+\n"
               "chr1\t40\t47\tX\t2\t+\n"
               "chr2\t10\t15\tX\t3\t-\n"
               "chr3\t20\t25\tX\t4\t+\n")

        def runAndCompare(expectedPerStream, mirror, mirrorScore,
                          ignoreStrand):
            # fresh streams for every run; score and name are always ignored
            streams = [DummyInputStream(x) for x in [in1, in2, in3]]
            triples = list(pairedBEDIterator(streams,
                                             mirror=mirror,
                                             mirrorScore=mirrorScore,
                                             ignoreStrand=ignoreStrand,
                                             ignoreScore=True,
                                             ignoreName=True))
            collected = ([], [], [])
            for x1, x2, x3 in triples:
                collected[0].append(x1)
                collected[1].append(x2)
                collected[2].append(x3)
            wanted = [[parseBEDString(s, scoreType=float) for s in lines]
                      for lines in expectedPerStream]
            for g, e in zip(collected, wanted):
                if debug:
                    sys.stderr.write("expect\n" +
                                     "\n".join([str(x) for x in e]) + "\n")
                    sys.stderr.write("got\n" +
                                     "\n".join([str(x) for x in g]) + "\n")
                assert g == e

        # first, ignore strand, name and score and don't mirror missing
        # elements
        e1 = ["chr1\t40\t47\tX\t3\t+",
              "chr2\t10\t15\tX\t4\t-"]
        e2 = ["chr1\t40\t47\tX\t2\t+",
              "chr2\t10\t15\tX\t3\t-"]
        e3 = ["chr1\t40\t47\tX\t2\t+",
              "chr2\t10\t15\tX\t3\t-"]
        runAndCompare([e1, e2, e3], mirror=False, mirrorScore=None,
                      ignoreStrand=True)

        # now, same sort order but include strand, and mirror missing
        # elements using a score of 0
        e1 = ["chr1\t10\t15\tX\t1\t+",
              "chr1\t20\t25\tX\t0\t+",
              "chr1\t20\t25\tX\t2\t-",
              "chr1\t40\t47\tX\t3\t+",
              "chr2\t10\t15\tX\t4\t-",
              "chr3\t20\t25\tX\t0\t+"]
        e2 = ["chr1\t10\t15\tX\t1\t+",
              "chr1\t20\t25\tX\t0\t+",
              "chr1\t20\t25\tX\t0\t-",
              "chr1\t40\t47\tX\t2\t+",
              "chr2\t10\t15\tX\t3\t-",
              "chr3\t20\t25\tX\t0\t+"]
        e3 = ["chr1\t10\t15\tX\t0\t+",
              "chr1\t20\t25\tX\t1\t+",
              "chr1\t20\t25\tX\t0\t-",
              "chr1\t40\t47\tX\t2\t+",
              "chr2\t10\t15\tX\t3\t-",
              "chr3\t20\t25\tX\t4\t+"]
        runAndCompare([e1, e2, e3], mirror=True, mirrorScore=0,
                      ignoreStrand=False)
Esempio n. 14
0
def BEDIterator(filehandle,
                sortedby=None,
                verbose=False,
                scoreType=int,
                dropAfter=None):
    """
    Get an iterator for a BED file

    :param filehandle: this can be either a string, or a stream-like object.
                       In the former case, it is treated as a filename; the
                       file is opened here and closed again when iteration
                       finishes. The format of the file/stream must be BED.
    :param sortedby: if None, order is not checked.
                     if == ITERATOR_SORTED_START, elements in file must
                     be sorted by chrom and start index (an exception
                     is raised if they are not)
                     if == ITERATOR_SORTED_END, element must be sorted
                     by chrom and end index.
                     if == ITERATOR_SORTED_CHROM, only the chrom order is
                     checked.
                     if == ITERATOR_SORTED_NAME, elements must be sorted by
                     name.
    :param verbose: if True, output additional progress messages to stderr
    :param scoreType: The data type for scores (the fifth column) in the BED
                      file.
    :param dropAfter: an int indicating that any fields after and including
                      this field should be ignored as they don't conform to
                      the BED format. By default, None, meaning we use all
                      fields. Index from zero.
    :return: iterator where subsequent calls to next() yield the next BED
             element in the stream as a GenomicInterval object.
    :raise BEDError: if a line cannot be parsed, or if the requested sort
                     order is violated.
    """
    chromsSeen = set()
    prev = None
    openedHere = isinstance(filehandle, str)
    if openedHere:
        filehandle = open(filehandle)

    # name used in error messages; not every stream has a .name attribute
    streamName = str(getattr(filehandle, "name", "UNNAMED STREAM"))

    try:
        if verbose:
            try:
                pind = ProgressIndicator(
                    totalToDo=os.path.getsize(filehandle.name),
                    messagePrefix="completed",
                    messageSuffix="of processing " + filehandle.name)
            except (AttributeError, OSError):
                sys.stderr.write("BEDIterator -- warning: " +
                                 "unable to show progress for stream")
                verbose = False

        for line in filehandle:
            if verbose:
                pind.done = filehandle.tell()
                pind.showProgress()

            if line.strip() == "":
                continue
            try:
                e = parseBEDString(line, scoreType, dropAfter=dropAfter)
            except GenomicIntervalError as err:
                raise BEDError(str(err) + " on line " + line)

            # sorting by name?
            if ((sortedby == ITERATOR_SORTED_NAME and prev is not None)
                    and (prev.name > e.name)):
                raise BEDError("bed file " + streamName +
                               " not sorted by element name" + " found " +
                               e.name + " after " + prev.name)

            # first item
            if prev is None:
                chromsSeen.add(e.chrom)

            # on same chrom as the prev item, make sure order is right
            if (prev is not None and sortedby is not None
                    and e.chrom == prev.chrom):
                if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
                    raise BEDError("bed file " + streamName +
                                   " not sorted by start index - saw item " +
                                   str(prev) + " before " + str(e))
                if sortedby == ITERATOR_SORTED_END and prev.end > e.end:
                    raise BEDError("bed file " + streamName +
                                   " not sorted by end index - saw item " +
                                   str(prev) + " before " + str(e))

            # starting a new chrom.. make sure we haven't already seen it
            if prev is not None and prev.chrom != e.chrom:
                if (sortedby == ITERATOR_SORTED_START or
                    sortedby == ITERATOR_SORTED_END or
                    sortedby == ITERATOR_SORTED_CHROM) and\
                   (e.chrom in chromsSeen or prev.chrom > e.chrom):
                    raise BEDError("BED file " + streamName +
                                   " not sorted by chrom")
                chromsSeen.add(e.chrom)

            # all good..
            yield e
            prev = e
    finally:
        # only close what we opened; caller-supplied streams stay open
        if openedHere:
            filehandle.close()
Esempio n. 15
0
  def testPairedIterator(self):
    """
    Exercise pairedBEDIterator on three small in-memory BED inputs.

    Two configurations are checked: first with strand, score and name
    ignored and no mirroring; then with strand respected and missing
    elements mirrored using a score of 0. In both cases the i-th member of
    every yielded triple is collected and the resulting list is compared
    with the expected elements for the i-th input.
    """
    debug = False

    in1 = ("chr1\t10\t15\tX\t1\t+\n"
           "chr1\t20\t25\tX\t2\t-\n"
           "chr1\t40\t47\tX\t3\t+\n"
           "chr2\t10\t15\tX\t4\t-\n")
    in2 = ("chr1\t10\t15\tX\t1\t+\n"
           "chr1\t40\t47\tX\t2\t+\n"
           "chr2\t10\t15\tX\t3\t-\n")
    in3 = ("chr1\t20\t25\tX\t1\t+\n"
           "chr1\t40\t47\tX\t2\t+\n"
           "chr2\t10\t15\tX\t3\t-\n"
           "chr3\t20\t25\tX\t4\t+\n")

    def runAndCompare(expectedPerStream, mirror, mirrorScore, ignoreStrand):
      # fresh streams for every run; score and name are always ignored
      streams = [DummyInputStream(x) for x in [in1, in2, in3]]
      triples = list(pairedBEDIterator(streams, mirror=mirror,
                                       mirrorScore=mirrorScore,
                                       ignoreStrand=ignoreStrand,
                                       ignoreScore=True, ignoreName=True))
      collected = ([], [], [])
      for x1, x2, x3 in triples:
        collected[0].append(x1)
        collected[1].append(x2)
        collected[2].append(x3)
      wanted = [[parseBEDString(s, scoreType=float) for s in lines]
                for lines in expectedPerStream]
      for g, e in zip(collected, wanted):
        if debug:
          sys.stderr.write("expect\n" + "\n".join([str(x) for x in e]) + "\n")
          sys.stderr.write("got\n" + "\n".join([str(x) for x in g]) + "\n")
        assert g == e

    # first, ignore strand, name and score and don't mirror missing elements
    e1 = ["chr1\t40\t47\tX\t3\t+",
          "chr2\t10\t15\tX\t4\t-"]
    e2 = ["chr1\t40\t47\tX\t2\t+",
          "chr2\t10\t15\tX\t3\t-"]
    e3 = ["chr1\t40\t47\tX\t2\t+",
          "chr2\t10\t15\tX\t3\t-"]
    runAndCompare([e1, e2, e3], mirror=False, mirrorScore=None,
                  ignoreStrand=True)

    # now, same sort order but include strand, and mirror missing elements
    # using a score of 0
    e1 = ["chr1\t10\t15\tX\t1\t+",
          "chr1\t20\t25\tX\t0\t+",
          "chr1\t20\t25\tX\t2\t-",
          "chr1\t40\t47\tX\t3\t+",
          "chr2\t10\t15\tX\t4\t-",
          "chr3\t20\t25\tX\t0\t+"]
    e2 = ["chr1\t10\t15\tX\t1\t+",
          "chr1\t20\t25\tX\t0\t+",
          "chr1\t20\t25\tX\t0\t-",
          "chr1\t40\t47\tX\t2\t+",
          "chr2\t10\t15\tX\t3\t-",
          "chr3\t20\t25\tX\t0\t+"]
    e3 = ["chr1\t10\t15\tX\t0\t+",
          "chr1\t20\t25\tX\t1\t+",
          "chr1\t20\t25\tX\t0\t-",
          "chr1\t40\t47\tX\t2\t+",
          "chr2\t10\t15\tX\t3\t-",
          "chr3\t20\t25\tX\t4\t+"]
    runAndCompare([e1, e2, e3], mirror=True, mirrorScore=0,
                  ignoreStrand=False)
Esempio n. 16
0
def BEDIterator(filehandle, sortedby=None, verbose=False, scoreType=int,
                dropAfter=None):
  """
  Get an iterator for a BED file

  :param filehandle: this can be either a string, or a stream-like object. In
                     the former case, it is treated as a filename; the file
                     is opened here and closed again when iteration
                     finishes. The format of the file/stream must be BED.
  :param sortedby: if None, order is not checked.
                   if == ITERATOR_SORTED_START, elements in file must
                   be sorted by chrom and start index (an exception
                   is raised if they are not)
                   if == ITERATOR_SORTED_END, element must be sorted
                   by chrom and end index.
                   if == ITERATOR_SORTED_CHROM, only the chrom order is
                   checked.
                   if == ITERATOR_SORTED_NAME, elements must be sorted by
                   name.
  :param verbose: if True, output additional progress messages to stderr
  :param scoreType: The data type for scores (the fifth column) in the BED
                    file.
  :param dropAfter: an int indicating that any fields after and including this
                    field should be ignored as they don't conform to the BED
                    format. By default, None, meaning we use all fields. Index
                    from zero.
  :return: iterator where subsequent calls to next() yield the next BED
           element in the stream as a GenomicInterval object.
  :raise BEDError: if a line cannot be parsed, or if the requested sort
                   order is violated.
  """
  chromsSeen = set()
  prev = None
  openedHere = isinstance(filehandle, str)
  if openedHere:
    filehandle = open(filehandle)

  # name used in error messages; not every stream has a .name attribute
  streamName = str(getattr(filehandle, "name", "UNNAMED STREAM"))

  try:
    if verbose:
      try:
        pind = ProgressIndicator(totalToDo=os.path.getsize(filehandle.name),
                                 messagePrefix="completed",
                                 messageSuffix="of processing " +
                                               filehandle.name)
      except (AttributeError, OSError):
        sys.stderr.write("BEDIterator -- warning: " +
                         "unable to show progress for stream")
        verbose = False

    for line in filehandle:
      if verbose:
        pind.done = filehandle.tell()
        pind.showProgress()

      if line.strip() == "":
        continue
      try:
        e = parseBEDString(line, scoreType, dropAfter=dropAfter)
      except GenomicIntervalError as err:
        raise BEDError(str(err) + " on line " + line)

      # sorting by name?
      if ((sortedby == ITERATOR_SORTED_NAME and prev is not None) and
         (prev.name > e.name)):
        raise BEDError("bed file " + streamName +
                       " not sorted by element name" +
                       " found " + e.name + " after " +
                       prev.name)

      # first item
      if prev is None:
        chromsSeen.add(e.chrom)

      # on same chrom as the prev item, make sure order is right
      if prev is not None and sortedby is not None and e.chrom == prev.chrom:
        if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
          raise BEDError("bed file " + streamName +
                         " not sorted by start index - saw item " +
                         str(prev) + " before " + str(e))
        if sortedby == ITERATOR_SORTED_END and prev.end > e.end:
          raise BEDError("bed file " + streamName +
                         " not sorted by end index - saw item " +
                         str(prev) + " before " + str(e))

      # starting a new chrom.. make sure we haven't already seen it
      if prev is not None and prev.chrom != e.chrom:
        if (sortedby == ITERATOR_SORTED_START or
            sortedby == ITERATOR_SORTED_END or
            sortedby == ITERATOR_SORTED_CHROM) and\
           (e.chrom in chromsSeen or prev.chrom > e.chrom):
          raise BEDError("BED file " + streamName + " not sorted by chrom")
        chromsSeen.add(e.chrom)

      # all good..
      yield e
      prev = e
  finally:
    # only close what we opened; caller-supplied streams stay open
    if openedHere:
      filehandle.close()