def test_labels_trimmed_back(self):
    # Two truth alignments partially overlap in this window:
    #   (318288, 417741)
    #   (417732, 422799)
    # The first is more than twice as long as the second, so filtering is
    # expected to trim back the start of the second one. Afterwards the
    # positions/labels of the two alignments must not overlap.
    raw = TruthAlignment.bam_to_alignments(
        __truth_bam__, __ref_name__, start=318288, end=422799)
    raw_bounds = ((318288, 417741), (417732, 422799))
    for idx, (start, end) in enumerate(raw_bounds):
        assert raw[idx].start == start
        assert raw[idx].end == end

    trimmed = TruthAlignment.filter_alignments(raw)
    trimmed_bounds = ((318288, 417741), (417741, 422799))
    for idx, (start, end) in enumerate(trimmed_bounds):
        assert trimmed[idx].start == start
        assert trimmed[idx].end == end

    # Fetch positions for both alignments before checking either of them.
    first_positions, _ = trimmed[0].get_positions_and_labels()
    second_positions, _ = trimmed[1].get_positions_and_labels()

    # Major positions must span exactly [start, end) of each alignment.
    checks = ((trimmed[0], first_positions), (trimmed[1], second_positions))
    for aln, positions in checks:
        assert positions[0]['major'] == aln.start
        assert positions[-1]['major'] == aln.end - 1
def test_labels_trimmed_back(self):
    # Two truth alignments partially overlap in this window:
    #   (318288, 417741)
    #   (417732, 422799)
    # The first is more than twice as long as the second, so the second is
    # trimmed back; the resulting intervals must be non-overlapping.
    window = Region(__ref_name__, start=318288, end=422799)
    alignments = TruthAlignment.bam_to_alignments(__truth_bam__, window)

    expected_bounds = ((318288, 417741), (417741, 422799))
    for idx, (start, end) in enumerate(expected_bounds):
        head = alignments[idx][0]
        self.assertEqual(head.start, start)
        self.assertEqual(head.end, end)
def bams_to_training_samples(self, truth_bam, bam, region, reference=None,
                             read_fraction=None):
    """Prepare training data chunks.

    :param truth_bam: .bam file of truth aligned to ref to generate labels.
    :param bam: input .bam file.
    :param region: `Region` obj; its `ref_name`, `start` and `end` select
        the truth alignments that are used.
    :param reference: reference `.fasta`, should correspond to `bam`.
    :param read_fraction: forwarded unchanged to `bam_to_sample`.

    :returns: tuple of `Sample` objects.

    .. note:: Chunks might be missing if `truth_bam` is provided and
        regions with multiple mappings were encountered.
    """
    ref_rle = self.process_ref_seq(region.ref_name, reference)

    # Filter truth alignments to restrict ourselves to regions of the ref
    # where the truth is unambiguous.
    alignments = TruthAlignment.bam_to_alignments(
        truth_bam, region.ref_name, start=region.start, end=region.end)
    filtered_alignments = TruthAlignment.filter_alignments(
        alignments, start=region.start, end=region.end)
    if len(filtered_alignments) == 0:
        self.logger.info(
            "Filtering removed all alignments of truth to ref from {}.".
            format(region))

    # Depends only on instance configuration, so compute it once.
    mock_compr = self.max_hp_len > 1 and not self.is_compressed

    samples = []
    for aln in filtered_alignments:
        truth_pos, truth_labels = aln.get_positions_and_labels(
            ref_compr_rle=ref_rle, mock_compr=mock_compr,
            is_compressed=self.is_compressed, rle_dtype=True)
        aln_samples = self.bam_to_sample(
            bam, Region(region.ref_name, aln.start, aln.end), ref_rle,
            read_fraction=read_fraction)

        # The lookup from truth position to label is the same for every
        # sample of this alignment, so build it once per alignment rather
        # than once per sample.
        if len(truth_labels.dtype) > 0:
            # Structured label dtype: pad with a (gap, 1) record.
            pad = (encoding[_gap_], 1)
        else:
            pad = encoding[_gap_]
        position_to_label = dict(zip(map(tuple, truth_pos), truth_labels))

        for sample in aln_samples:
            # Create labels according to positions in the pileup; pileup
            # positions with no truth label receive the gap padding.
            padded_labels = np.fromiter(
                (position_to_label.get(tuple(p), pad)
                 for p in sample.positions),
                dtype=truth_labels.dtype, count=len(sample.positions))

            fields = sample._asdict()
            fields['labels'] = padded_labels
            samples.append(Sample(**fields))

    return tuple(samples)