# Example #1
def summarize_coverage(aln_set, aln_summ_gff, ref_set=None,
                       num_regions=Constants.NUM_REGIONS,
                       region_size=Constants.REGION_SIZE,
                       force_num_regions=Constants.FORCE_NUM_REGIONS,
                       max_region_size=Constants.MAX_REGION_SIZE):
    """Main point of entry.

    Summarize the alignment coverage found in aln_set and write one GFF
    record per region to aln_summ_gff.

    :param aln_set: path to the AlignmentSet to summarize
    :param aln_summ_gff: path of the output coverage-summary GFF file
    :param ref_set: optional ReferenceSet used to map truncated reference
        names back to their full names
    :param num_regions: target number of regions per reference
    :param region_size: fixed region size passed to get_region_size
    :param force_num_regions: whether to force exactly num_regions regions
    :param max_region_size: upper bound on the computed region size
    """

    if ref_set:
        untruncator = get_name_untruncator(ref_set)
    else:
        # this dict is always used with get(x, x), so when it's empty it will
        # just preserve the original name
        untruncator = {}

    readers = openDataSet(aln_set).resourceReaders()
    gff_writer = GffIO.GffWriter(aln_summ_gff)

    # First write the metadata. Names of references, command line used, things
    # like that
    metadata_lines = get_metadata_lines(readers, untruncator)
    for metadata_line in metadata_lines:
        gff_writer.writeHeader(metadata_line)
    log.debug("Wrote {n} header lines to {f}"
              .format(n=len(metadata_lines), f=aln_summ_gff))

    # Build lists of intervals for each reference
    interval_lists = build_interval_lists(readers)
    log.debug("Finished creating interval lists for {n} references"
              .format(n=len(interval_lists)))

    # Create a function that gets region size from the reference length by
    # freezing the constant parameters
    get_region_size_frozen = functools.partial(
        get_region_size, num_refs=len(interval_lists),
        region_size=region_size, num_regions=num_regions,
        force_num_regions=force_num_regions,
        max_region_size=max_region_size)

    # Create Gff records and write them
    for ref_group_id in sorted(interval_lists):
        log.debug("Generating coverage GFF records for refGroupID {r}"
                  .format(r=ref_group_id))

        gff_generator = generate_gff_records(
            interval_lists[ref_group_id], readers,
            ref_group_id, get_region_size_frozen,
            untruncator)

        try:
            for gff_record in gff_generator:
                gff_writer.writeRecord(gff_record)
        except ValueError as e:
            # generate_gff_records raises ValueError on a zero region size
            # (bug 25079); log it and continue with the next reference.
            # Logger.warn is deprecated in favor of Logger.warning.
            log.warning(e)
    def test_coverages(self):
        """Test that the regions and calculated coverages are the same for both the bfx and pbpy summarize_coverage results.
        """
        # Index the pbpy records by (short seqid, start, end) so that each
        # bfx record can be matched up and consumed as it is checked
        expected_records = {}
        for rec in GffIO.GffReader(self.gff_path):
            expected_records[(rec.seqid.split()[0], rec.start, rec.end)] = rec

        # Recapitulate the first few steps of summarize_coverage.main
        ds_reader, readers = self._get_readers()
        interval_lists = summarize_coverage.build_interval_lists(readers)
        frozen_region_size = functools.partial(
            summarize_coverage.get_region_size,
            num_refs=len(interval_lists),
            region_size=0,
            num_regions=500,
            force_num_regions=False)

        for group_id in sorted(interval_lists):
            record_iter = summarize_coverage.generate_gff_records(
                interval_lists[group_id], readers, group_id,
                frozen_region_size, {})

            for actual in record_iter:
                key = (actual.seqid, actual.start, actual.end)
                self.assertIn(key, expected_records)
                expected = expected_records.pop(key)

                self.assertEqual(expected.cov, actual.cov)
                self.assertEqual(expected.gaps, actual.gaps)

                # cov2 is a comma-separated list of floats, so compare the
                # parsed values approximately rather than as strings
                expected_cov2 = [float(v) for v in expected.cov2.split(',')]
                actual_cov2 = [float(v) for v in actual.cov2.split(',')]
                for exp_val, act_val in zip(expected_cov2, actual_cov2):
                    self.assertAlmostEqual(exp_val, act_val)

                self.assertEqual(expected.source, actual.source)
                self.assertEqual(expected.type, actual.type)
                self.assertEqual(expected.score, actual.score)
                self.assertEqual(expected.strand, actual.strand)
                self.assertEqual(expected.phase, actual.phase)

        # Every pbpy record (restricted to the selected reference, when one
        # is set) must have been consumed by a matching bfx record
        if self.selected_reference is not None:
            leftovers = {k: v for k, v in expected_records.items()
                         if k[0] == self.selected_reference}
        else:
            leftovers = expected_records

        self.assertEqual(len(leftovers), 0)
    def test_metadata(self):
        """Test that the metadata lines match those from the pbpy version of summarize_coverage.
        """
        ds_reader, bam_readers = self._get_readers()
        bfx_metadata_lines = summarize_coverage.get_metadata_lines(
            bam_readers, {})

        pbpy_gff_reader = GffIO.GffReader(self.gff_path)
        pbpy_metadata_lines = pbpy_gff_reader.headers

        # GffWriter emits the gff-version line itself, not
        # get_metadata_lines, so it only appears in the pbpy file. Check it
        # is there, then compare the remainder line by line.
        self.assertEqual(pbpy_metadata_lines[0], "##gff-version 3")
        pbpy_tail = pbpy_metadata_lines[1:]
        self.assertEqual(len(pbpy_tail), len(bfx_metadata_lines))

        for line_no, line_pair in enumerate(zip(bfx_metadata_lines,
                                                pbpy_tail)):
            bfx_line, pbpy_line = line_pair
            if line_no in (0, 2):
                # Line 0 is the date line and line 2 is the command line;
                # both contain run-specific text, so only the leading token
                # has to agree
                self.assertEqual(bfx_line.split(' ')[0],
                                 pbpy_line.split(' ')[0])
            else:
                # All other metadata lines must match exactly
                self.assertEqual(pbpy_line, bfx_line)
# Example #4
def generate_gff_records(interval_list, readers, ref_id, region_size_func,
                         untruncator):
    """Generator for Gff records for a ref_id.

    :param interval_list: a sequence of interval_tree.Intervals of
        alignments to this reference
    :param readers: readers (e.g. CmpH5Reader or SamfileAdapter) for the
        files containing the alignments
    :param ref_id: ID for this reference
    :param region_size_func: function from reference length to region
        size
    :param untruncator: dict that maps from truncated name to full name.
        If a truncated name does not appear in the dict, then it just
        uses the truncated name.

    :yields: GffIO.Gff3Records
    :raises ValueError: if no reader knows ref_id, or if the computed
        region size is 0 (bug 25079)
    """
    # Get the appropriate region size for this reference. Previously, if no
    # reader had this ref_id, ref_length/ref_full_name were left unbound and
    # the code below died with a NameError; raise a clear error instead.
    for reader in readers:
        try:
            ref_length = reader.referenceInfo(ref_id).Length
            ref_full_name = reader.referenceInfo(ref_id).FullName
            break
        except KeyError:
            pass
    else:
        raise ValueError(
            'No reader has reference information for ref_id {r}'
            .format(r=str(ref_id)))

    short_name = ref_full_name.split()[0]
    region_size = region_size_func(ref_length)

    if region_size == 0:
        # bug 25079 - /by0 err
        raise ValueError(
            'region_size == 0 for ref_id {r}'.format(r=str(ref_id)))

    log.debug("Chosen region size for reference {i} is {r}".format(
        i=ref_id, r=region_size))

    log.debug("reference {i} has full name {n} and length {L}".format(
        i=ref_id, n=ref_full_name, L=ref_length))

    itree = interval_tree.IntervalTree(interval_list)

    # To improve performance, we batch the interval lookups and projections
    # into ranges
    # NOTE(review): under Python 2, BATCH_SIZE / region_size is integer
    # (floor) division when both are ints, making math.ceil a no-op here --
    # confirm whether a true ceiling was intended before changing it.
    regions_per_batch = int(math.ceil(Constants.BATCH_SIZE / region_size))
    batch_start, batch_end = 0, 0
    batch_coverage_arr = None

    for region_start in xrange(0, ref_length, region_size):
        region_end = region_start + region_size
        # pbpy summarizeCoverage would merge the last region into the
        # penultimate region, so we do that here
        if region_end >= ref_length and region_start > 0:
            continue
        if region_end + region_size >= ref_length:
            region_end = ref_length

        # Check if we need to step to the next batch
        if region_end > batch_end:
            if region_start < batch_end:
                raise ValueError("A region overlaps a batch, which should not "
                                 "happen.")

            batch_start = region_start
            batch_end = region_size * regions_per_batch + batch_end
            if ref_length - region_size <= batch_end:
                batch_end = ref_length
            log.debug("Processing batch ({s}, {e})".format(s=batch_start,
                                                           e=batch_end))

            # Look up all intervals overlapping the batch once, then project
            # them into a coverage array that every region in the batch
            # slices from
            overlapping_intervals = []
            itree.find_overlapping(batch_start, batch_end,
                                   overlapping_intervals)
            batch_coverage_arr = project_into_region(overlapping_intervals,
                                                     batch_start, batch_end)

        region_start_in_batch = region_start - batch_start
        region_end_in_batch = region_end - batch_start
        region_coverage_arr = batch_coverage_arr[
            region_start_in_batch:region_end_in_batch]

        gff_attributes = get_attributes_from_coverage(region_coverage_arr)

        # Note the region_start + 1. GFF is 1-based and used closed intervals
        # XXX using truncated name (identifier field), see ticket 28667
        gff_record = GffIO.Gff3Record(
            short_name,  # untruncator.get(ref_full_name, ref_full_name),
            region_start + 1,
            region_end,
            "region",
            score='0.00',
            strand='+',
            attributes=gff_attributes)

        yield gff_record