def summarize_coverage(aln_set, aln_summ_gff, ref_set=None,
                       num_regions=Constants.NUM_REGIONS,
                       region_size=Constants.REGION_SIZE,
                       force_num_regions=Constants.FORCE_NUM_REGIONS,
                       max_region_size=Constants.MAX_REGION_SIZE):
    """Main point of entry.

    Summarize alignment coverage per reference region and write the result
    as a GFF file.

    :param aln_set: path to the input AlignmentSet
    :param aln_summ_gff: path of the output alignment-summary GFF file
    :param ref_set: optional ReferenceSet used to restore full (untruncated)
        reference names; when absent, names are used as-is
    :param num_regions: requested number of regions per reference
    :param region_size: fixed region size (0 lets the size be derived)
    :param force_num_regions: if True, always use num_regions per reference
    :param max_region_size: upper bound on the derived region size
    """
    if ref_set:
        untruncator = get_name_untruncator(ref_set)
    else:
        # this dict is always used with get(x, x), so when it's empty it will
        # just preserve the original name
        untruncator = {}
    readers = openDataSet(aln_set).resourceReaders()
    gff_writer = GffIO.GffWriter(aln_summ_gff)

    # First write the metadata. Names of references, command line used,
    # things like that
    metadata_lines = get_metadata_lines(readers, untruncator)
    for metadata_line in metadata_lines:
        gff_writer.writeHeader(metadata_line)
    log.debug("Wrote {n} header lines to {f}"
              .format(n=len(metadata_lines), f=aln_summ_gff))

    # Build lists of intervals for each reference
    interval_lists = build_interval_lists(readers)
    log.debug("Finished creating interval lists for {n} references"
              .format(n=len(interval_lists)))

    # Create a function that gets region size from the reference length by
    # freezing the constant parameters
    get_region_size_frozen = functools.partial(
        get_region_size, num_refs=len(interval_lists),
        region_size=region_size, num_regions=num_regions,
        force_num_regions=force_num_regions,
        max_region_size=max_region_size)

    # Create Gff records and write them
    for ref_group_id in sorted(interval_lists):
        log.debug("Generating coverage GFF records for refGroupID {r}"
                  .format(r=ref_group_id))
        gff_generator = generate_gff_records(
            interval_lists[ref_group_id], readers, ref_group_id,
            get_region_size_frozen, untruncator)
        try:
            for gff_record in gff_generator:
                gff_writer.writeRecord(gff_record)
        except ValueError as e:
            # generate_gff_records raises ValueError for degenerate cases
            # (e.g. region_size == 0); skip the reference but keep going.
            # logging's warn() alias is deprecated; use warning().
            log.warning(e)
def test_coverages(self):
    """Test that the regions and calculated coverages are the same for both
    the bfx and pbpy summarize_coverage results.
    """
    # Read the pbpy gff into a dictionary for comparison, keyed by
    # (reference short name, region start, region end)
    pbpy_gff_reader = GffIO.GffReader(self.gff_path)
    pbpy_gff_records = {}
    for gff_record in pbpy_gff_reader:
        record_key = (gff_record.seqid.split()[0], gff_record.start,
                      gff_record.end)
        record_val = gff_record
        pbpy_gff_records[record_key] = record_val

    # Recapitulate the first few steps of summarize_coverage.main
    ds_reader, readers = self._get_readers()
    interval_lists = summarize_coverage.build_interval_lists(readers)
    # Freeze the pbpy-equivalent sizing parameters: derived region size,
    # 500 regions, not forced
    get_region_size_frozen = functools.partial(
        summarize_coverage.get_region_size, num_refs=len(interval_lists),
        region_size=0, num_regions=500, force_num_regions=False)

    for ref_group_id in sorted(interval_lists):
        gff_generator = summarize_coverage.generate_gff_records(
            interval_lists[ref_group_id], readers, ref_group_id,
            get_region_size_frozen, {})

        # Each bfx record must have a matching pbpy record; pop matches so
        # that leftovers can be checked at the end
        for bfx_gff_record in gff_generator:
            bfx_key = (bfx_gff_record.seqid, bfx_gff_record.start,
                       bfx_gff_record.end)
            self.assertIn(bfx_key, pbpy_gff_records)
            pbpy_gff_record = pbpy_gff_records.pop(bfx_key)
            self.assertEqual(pbpy_gff_record.cov, bfx_gff_record.cov)
            self.assertEqual(pbpy_gff_record.gaps, bfx_gff_record.gaps)
            # cov2 is a comma-separated list of floats; compare elementwise
            # with a tolerance rather than string equality
            pbpy_cov2 = [float(k) for k in pbpy_gff_record.cov2.split(',')]
            bfx_cov2 = [float(k) for k in bfx_gff_record.cov2.split(',')]
            for pair in zip(pbpy_cov2, bfx_cov2):
                self.assertAlmostEqual(pair[0], pair[1])
            self.assertEqual(pbpy_gff_record.source, bfx_gff_record.source)
            self.assertEqual(pbpy_gff_record.type, bfx_gff_record.type)
            self.assertEqual(pbpy_gff_record.score, bfx_gff_record.score)
            self.assertEqual(pbpy_gff_record.strand, bfx_gff_record.strand)
            self.assertEqual(pbpy_gff_record.phase, bfx_gff_record.phase)

    # If only one reference was selected, pbpy records for other references
    # are legitimately unmatched; only records for the selected reference
    # must all have been consumed
    if self.selected_reference is not None:
        remaining_pbpy_records = {}
        for record_key in pbpy_gff_records:
            if record_key[0] == self.selected_reference:
                remaining_pbpy_records[record_key] = pbpy_gff_records[
                    record_key]
    else:
        remaining_pbpy_records = pbpy_gff_records
    self.assertEqual(len(remaining_pbpy_records), 0)
def test_metadata(self):
    """Check that get_metadata_lines reproduces the GFF headers written by
    the pbpy version of summarize_coverage.
    """
    ds_reader, bam_readers = self._get_readers()
    bfx_metadata_lines = summarize_coverage.get_metadata_lines(
        bam_readers, {})
    pbpy_metadata_lines = GffIO.GffReader(self.gff_path).headers

    # The ##gff-version pragma is emitted by GffWriter, not by
    # get_metadata_lines, so it appears only in the pbpy file; every
    # remaining pbpy header must have a bfx counterpart.
    self.assertEqual(pbpy_metadata_lines[0], "##gff-version 3")
    self.assertEqual(len(pbpy_metadata_lines[1:]), len(bfx_metadata_lines))
    pbpy_rest = pbpy_metadata_lines[1:]

    # The date line embeds a timestamp; compare only the leading token.
    bfx_dateline = bfx_metadata_lines[0]
    pbpy_dateline = pbpy_rest[0]
    self.assertEqual(
        bfx_dateline.split(' ')[0], pbpy_dateline.split(' ')[0])

    # The source line must match exactly.
    self.assertEqual(bfx_metadata_lines[1], pbpy_rest[1])

    # The command line embeds run-specific arguments; compare only the
    # leading token.
    bfx_commandline = bfx_metadata_lines[2]
    pbpy_commandline = pbpy_rest[2]
    self.assertEqual(
        bfx_commandline.split(' ')[0], pbpy_commandline.split(' ')[0])

    # All subsequent header lines must match verbatim. The length check
    # above guarantees the two sequences align one-to-one here.
    for pbpy_line, bfx_line in zip(pbpy_rest[3:], bfx_metadata_lines[3:]):
        self.assertEqual(pbpy_line, bfx_line)
def generate_gff_records(interval_list, readers, ref_id, region_size_func,
                         untruncator):
    """Generator for Gff records for a ref_id.

    :param interval_list: a sequence of interval_tree.Intervals of alignments
        to this reference
    :param readers: readers (e.g. CmpH5Reader or SamfileAdapter) for the
        files containing the alignments
    :param ref_id: ID for this reference
    :param region_size_func: function from reference length to region size
    :param untruncator: dict that maps from truncated name to full name. If a
        truncated name does not appear in the dict, then it just uses the
        truncated name.
    :yields: GffIO.Gff3Records
    :raises ValueError: if no reader knows ref_id, or if the computed region
        size is zero
    """
    # Find the reference in whichever reader knows about it
    ref_length = None
    ref_full_name = None
    for reader in readers:
        try:
            # Single lookup; previously referenceInfo was called twice
            ref_info = reader.referenceInfo(ref_id)
        except KeyError:
            continue
        ref_length = ref_info.Length
        ref_full_name = ref_info.FullName
        break
    if ref_full_name is None:
        # Previously this fell through to an opaque NameError on
        # ref_full_name; fail with a clear message instead
        raise ValueError(
            'Reference {r} was not found in any reader'.format(r=str(ref_id)))
    short_name = ref_full_name.split()[0]

    # Get the appropriate region size for this reference
    region_size = region_size_func(ref_length)
    if region_size == 0:
        # bug 25079 - /by0 err
        raise ValueError(
            'region_size == 0 for ref_id {r}'.format(r=str(ref_id)))
    log.debug("Chosen region size for reference {i} is {r}".format(
        i=ref_id, r=region_size))
    log.debug("reference {i} has full name {n} and length {L}".format(
        i=ref_id, n=ref_full_name, L=ref_length))

    itree = interval_tree.IntervalTree(interval_list)

    # To improve performance, we batch the interval lookups and projections
    # into ranges. Use true (float) division so the ceil actually rounds up:
    # with integer operands, Python 2's `/` floors, making the original
    # math.ceil a no-op (and regions_per_batch could be 0 when
    # region_size > BATCH_SIZE).
    regions_per_batch = int(
        math.ceil(float(Constants.BATCH_SIZE) / region_size))
    batch_start, batch_end = 0, 0
    batch_coverage_arr = None

    for region_start in xrange(0, ref_length, region_size):
        region_end = region_start + region_size
        # pbpy summarizeCoverage would merge the last region into the
        # penultimate region, so we do that here
        if region_end >= ref_length and region_start > 0:
            continue
        if region_end + region_size >= ref_length:
            region_end = ref_length

        # Check if we need to step to the next batch
        if region_end > batch_end:
            if region_start < batch_end:
                raise ValueError("A region overlaps a batch, which should "
                                 "not happen.")
            batch_start = region_start
            batch_end = region_size * regions_per_batch + batch_end
            # Clamp the final batch to the end of the reference
            if ref_length - region_size <= batch_end:
                batch_end = ref_length
            log.debug("Processing batch ({s}, {e})".format(s=batch_start,
                                                           e=batch_end))
            overlapping_intervals = []
            itree.find_overlapping(batch_start, batch_end,
                                   overlapping_intervals)
            batch_coverage_arr = project_into_region(overlapping_intervals,
                                                     batch_start, batch_end)

        # Slice this region's coverage out of the batch projection
        region_start_in_batch = region_start - batch_start
        region_end_in_batch = region_end - batch_start
        region_coverage_arr = batch_coverage_arr[
            region_start_in_batch:region_end_in_batch]
        gff_attributes = get_attributes_from_coverage(region_coverage_arr)

        # Note the region_start + 1. GFF is 1-based and used closed intervals
        # XXX using truncated name (identifier field), see ticket 28667
        gff_record = GffIO.Gff3Record(
            short_name,  # untruncator.get(ref_full_name, ref_full_name),
            region_start + 1, region_end, "region", score='0.00', strand='+',
            attributes=gff_attributes)

        yield gff_record