コード例 #1
0
    def test_labels_trimmed_back(self):
        """Check that a shorter, partially overlapping alignment is trimmed back.

        The truth bam is expected to yield two alignments over the queried
        interval which partially overlap:

            (318288, 417741)
            (417732, 422799)

        The first is more than twice as long as the second, so filtering
        should trim the second one back to start where the first ends.  The
        positions reported for each filtered alignment must then span exactly
        ``[start, end - 1]`` of that alignment.
        """
        raw = TruthAlignment.bam_to_alignments(
            __truth_bam__, __ref_name__, start=318288, end=422799)

        # Spans as loaded from the bam: still overlapping.
        spans_before = ((318288, 417741), (417732, 422799))
        for idx, (want_start, want_end) in enumerate(spans_before):
            assert raw[idx].start == want_start
            assert raw[idx].end == want_end

        trimmed = TruthAlignment.filter_alignments(raw)

        # Spans after filtering: the second now begins at the end of the first.
        spans_after = ((318288, 417741), (417741, 422799))
        for idx, (want_start, want_end) in enumerate(spans_after):
            assert trimmed[idx].start == want_start
            assert trimmed[idx].end == want_end

        # Labels are unpacked to keep the two-value contract of the call,
        # but only the positions are inspected below.
        all_positions = []
        for idx in (0, 1):
            positions, _labels = trimmed[idx].get_positions_and_labels()
            all_positions.append(positions)

        # First/last major position of each alignment must match its
        # half-open [start, end) span.
        for idx, positions in enumerate(all_positions):
            assert positions[0]['major'] == trimmed[idx].start
            assert positions[-1]['major'] == trimmed[idx].end - 1
コード例 #2
0
 def test_labels_trimmed_back(self):
     """Check the alignment spans returned for a region with overlapping truth.

     Two truth alignments partially overlap in the queried region:

         (318288, 417741)
         (417732, 422799)

     The first is more than twice as long as the second, so the second is
     expected to come back trimmed to start where the first ends.
     """
     query = Region(__ref_name__, start=318288, end=422799)
     alignments = TruthAlignment.bam_to_alignments(__truth_bam__, query)

     # Each returned entry is indexed with [0] to reach the object carrying
     # start/end; spans are listed in the order the entries are returned.
     expected_spans = ((318288, 417741), (417741, 422799))
     for idx, (want_start, want_end) in enumerate(expected_spans):
         self.assertEqual(alignments[idx][0].start, want_start)
         self.assertEqual(alignments[idx][0].end, want_end)
コード例 #3
0
    def bams_to_training_samples(self,
                                 truth_bam,
                                 bam,
                                 region,
                                 reference=None,
                                 read_fraction=None):
        """Prepare labelled training data chunks for a region.

        Truth alignments overlapping `region` are loaded from `truth_bam` and
        filtered.  For every alignment that survives filtering, samples are
        built from `bam` over that alignment's span, and each sample receives
        a `labels` array with one entry per entry of `sample.positions`.
        Positions with no truth label are padded with `encoding[_gap_]`.

        :param truth_bam: .bam file of truth aligned to ref to generate labels.
        :param bam: input .bam file.
        :param region: `Region` obj; its `ref_name`, `start` and `end` are
            used to select truth alignments.
        :param reference: reference `.fasta`, should correspond to `bam`;
            passed to `process_ref_seq` together with `region.ref_name`.
        :param read_fraction: passed through unchanged to `bam_to_sample`.

        :returns: tuple of `Sample` objects with their `labels` field set.

        .. note:: Chunks might be missing for parts of `region`: samples are
            only produced for truth alignments that survive filtering (e.g.
            regions with multiple mappings may be dropped).  If filtering
            removes every alignment an empty tuple is returned and a message
            is logged.

        """
        ref_rle = self.process_ref_seq(region.ref_name, reference)

        # filter truth alignments to restrict ourselves to regions of the ref
        # where the truth is unambiguous
        alignments = TruthAlignment.bam_to_alignments(truth_bam,
                                                      region.ref_name,
                                                      start=region.start,
                                                      end=region.end)
        filtered_alignments = TruthAlignment.filter_alignments(
            alignments, start=region.start, end=region.end)
        if len(filtered_alignments) == 0:
            self.logger.info(
                "Filtering removed all alignments of truth to ref from {}.".
                format(region))

        # Depends only on instance settings, so evaluate it once rather than
        # once per alignment.
        mock_compr = self.max_hp_len > 1 and not self.is_compressed

        samples = []
        for aln in filtered_alignments:
            truth_pos, truth_labels = aln.get_positions_and_labels(
                ref_compr_rle=ref_rle,
                mock_compr=mock_compr,
                is_compressed=self.is_compressed,
                rle_dtype=True)
            aln_samples = self.bam_to_sample(bam,
                                             Region(region.ref_name, aln.start,
                                                    aln.end),
                                             ref_rle,
                                             read_fraction=read_fraction)

            # Label used for pileup positions that have no truth label.  A
            # structured label dtype (one with named fields) needs a tuple
            # pad; the second element is presumably a run length of 1 --
            # NOTE(review): confirm against get_positions_and_labels.
            if len(truth_labels.dtype) > 0:
                pad = (encoding[_gap_], 1)
            else:
                pad = encoding[_gap_]

            # The position -> label lookup depends only on the alignment, so
            # build it once here instead of once per sample.  A plain dict
            # with `.get` also avoids inserting a new key for every missing
            # position, which a defaultdict would do.
            position_to_label = dict(
                zip((tuple(p) for p in truth_pos), truth_labels))

            for sample in aln_samples:
                # Create labels according to positions in pileup
                padded_labels = np.fromiter(
                    (position_to_label.get(tuple(p), pad)
                     for p in sample.positions),
                    dtype=truth_labels.dtype,
                    count=len(sample.positions))

                # Rebuild the sample with the labels field replaced.
                fields = sample._asdict()
                fields['labels'] = padded_labels
                samples.append(Sample(**fields))
        return tuple(samples)