Beispiel #1
0
    def test_random_intervals(self):
        """Cross-check IntervalTree queries against a brute-force scan.

        For many randomly generated interval lists, build an IntervalTree,
        query it with random regions, and assert that it reports the same
        number of intervals as a linear scan over the list, and that every
        interval found by the scan is also reported by the tree.
        """
        num_trials = 1000
        coordinate_span = 10000
        queries_per_trial = 50
        interval_count_limit = 50
        # Exclusive upper bound on the length of a generated interval.
        length_limit = coordinate_span / 10

        for _trial in xrange(num_trials):
            intervals = []
            for _ in xrange(random.randrange(1, interval_count_limit)):
                length = random.randrange(length_limit)
                # Pick the start so the interval ends inside the span.
                start = random.randrange(coordinate_span - length)
                intervals.append(
                    interval_tree.Interval(start, start + length))

            tree = interval_tree.IntervalTree(intervals)

            query_starts = random.sample(range(coordinate_span),
                                         queries_per_trial)
            for query_start in query_starts:
                query_end = query_start + random.randrange(
                    coordinate_span - query_start)

                # Reference answer: linear scan with the half-open
                # overlap rule.
                expected = [ival for ival in intervals
                            if ival.start < query_end
                            and query_start < ival.stop]

                # find_overlapping appends its hits to the list passed in.
                found = []
                tree.find_overlapping(query_start, query_end, found)

                self.assertEqual(len(expected), len(found))
                for ival in expected:
                    self.assertIn(ival, found)
Beispiel #2
0
    def test_boundaries(self):
        """Check that the IntervalTree treats intervals as half-open.

        Intervals are expected to behave as the usual [start, end) ranges:
        a query touching only an interval's end coordinate must not report
        it, while a query starting at an interval's start coordinate must.
        """
        tree = interval_tree.IntervalTree([
            interval_tree.Interval(5, 10),
            interval_tree.Interval(5, 11),
            interval_tree.Interval(10, 15),
            interval_tree.Interval(20, 25),
        ])

        # Each case: ((query_start, query_end), expected (start, stop)
        # pairs in the order the tree is expected to report them).
        cases = [
            ((25, 26), []),
            ((20, 21), [(20, 25)]),
            ((5, 10), [(5, 10), (5, 11)]),
            ((10, 11), [(5, 11), (10, 15)]),
        ]

        for (query_start, query_end), expected in cases:
            hits = []
            tree.find_overlapping(query_start, query_end, hits)
            self.assertEqual(len(hits), len(expected))
            for hit, (want_start, want_stop) in zip(hits, expected):
                self.assertEqual(hit.start, want_start)
                self.assertEqual(hit.stop, want_stop)
Beispiel #3
0
def generate_gff_records(interval_list, readers, ref_id, region_size_func,
                         untruncator):
    """Generator for Gff coverage-region records for a ref_id.

    The reference is walked in steps of ``region_size_func(ref_length)``
    bases. For each region the coverage is taken from a per-batch
    coverage array (built from the intervals overlapping the batch) and
    turned into one GFF3 "region" record. The final partial region is
    merged into the one before it.

    Because this is a generator, the exceptions below are raised when the
    generator is first advanced, not when it is created.

    :param interval_list: a sequence of interval_tree.Intervals of
        alignments to this reference
    :param readers: iterable of reader objects (presumably CmpH5Reader or
        SamfileAdapter instances) for the files containing the
        alignments. Each is asked in turn for ``referenceInfo(ref_id)``;
        the first one that does not raise KeyError supplies the reference
        length and full name.
    :param ref_id: ID for this reference
    :param region_size_func: function from reference length to region
        size
    :param untruncator: dict that maps from truncated name to full name.
        Currently unused: records are emitted with the truncated name
        (first whitespace-delimited token of the full name).

    :yields: GffIO.Gff3Records
    :raises ValueError: if no reader knows ``ref_id``, if the computed
        region size is 0, or if a region straddles a batch boundary
    """
    # Find the first reader that knows this reference.
    for reader in readers:
        try:
            ref_info = reader.referenceInfo(ref_id)
        except KeyError:
            continue
        ref_length = ref_info.Length
        ref_full_name = ref_info.FullName
        break
    else:
        # Without this, an unknown ref_id (or an empty readers list) would
        # surface below as an opaque UnboundLocalError.
        raise ValueError(
            'ref_id {r} not found in any reader'.format(r=str(ref_id)))

    short_name = ref_full_name.split()[0]
    # Get the appropriate region size for this reference
    region_size = region_size_func(ref_length)

    if region_size == 0:
        # bug 25079 - /by0 err
        raise ValueError(
            'region_size == 0 for ref_id {r}'.format(r=str(ref_id)))

    log.debug("Chosen region size for reference {i} is {r}".format(
        i=ref_id, r=region_size))

    log.debug("reference {i} has full name {n} and length {L}".format(
        i=ref_id, n=ref_full_name, L=ref_length))

    itree = interval_tree.IntervalTree(interval_list)

    # To improve performance, we batch the interval lookups and projections
    # into ranges.
    # Clamp to at least one region per batch: with integer division the
    # quotient is 0 whenever region_size exceeds BATCH_SIZE, and a batch of
    # zero regions would never advance batch_end.
    regions_per_batch = max(
        1, int(math.ceil(Constants.BATCH_SIZE / region_size)))
    batch_start, batch_end = 0, 0
    batch_coverage_arr = None

    for region_start in xrange(0, ref_length, region_size):
        region_end = region_start + region_size
        # pbpy summarizeCoverage would merge the last region into the
        # penultimate region, so we do that here
        if region_end >= ref_length and region_start > 0:
            continue
        if region_end + region_size >= ref_length:
            region_end = ref_length

        # Check if we need to step to the next batch
        if region_end > batch_end:
            if region_start < batch_end:
                raise ValueError("A region overlaps a batch, which should not "
                                 "happen.")

            batch_start = region_start
            batch_end = region_size * regions_per_batch + batch_end
            # Stretch the batch to the end of the reference under the same
            # condition that stretches the last emitted region, so that
            # region always fits inside its batch.
            if ref_length - region_size <= batch_end:
                batch_end = ref_length
            log.debug("Processing batch ({s}, {e})".format(s=batch_start,
                                                           e=batch_end))

            overlapping_intervals = []
            itree.find_overlapping(batch_start, batch_end,
                                   overlapping_intervals)
            batch_coverage_arr = project_into_region(overlapping_intervals,
                                                     batch_start, batch_end)

        # Translate reference coordinates into offsets within the batch.
        region_start_in_batch = region_start - batch_start
        region_end_in_batch = region_end - batch_start
        region_coverage_arr = batch_coverage_arr[
            region_start_in_batch:region_end_in_batch]

        gff_attributes = get_attributes_from_coverage(region_coverage_arr)

        # Note the region_start + 1. GFF is 1-based and uses closed intervals
        # XXX using truncated name (identifier field), see ticket 28667
        gff_record = GffIO.Gff3Record(
            short_name,  # untruncator.get(ref_full_name, ref_full_name),
            region_start + 1,
            region_end,
            "region",
            score='0.00',
            strand='+',
            attributes=gff_attributes)

        yield gff_record