def test_030_bams_to_training_samples_simple(self):
    """Check training samples built from small synthetic read and truth BAMs.

    The first call record has its quality set to ``None`` before the reads
    BAM is written; the resulting labels, positions and features are then
    compared against hand-written expectations.
    """
    import os

    # we had a bug caused by missing qualities and bad indexing...
    data = copy.deepcopy(simple_data['calls'])
    data[0]['quality'] = None

    encoder = medaka.features.CountsFeatureEncoder(normalise='total')
    label_scheme = medaka.labels.HaploidLabelScheme()
    region = Region('ref', 0, 100)

    # Write the BAMs inside a temporary directory so that they (and anything
    # written alongside them) are removed when the block exits. Taking
    # `.name` from a discarded NamedTemporaryFile reuses a path whose file
    # has already been deleted and leaves the new file behind afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        reads_bam = os.path.join(tmp_dir, 'reads.bam')
        truth_bam = os.path.join(tmp_dir, 'truth.bam')
        create_simple_bam(reads_bam, data)
        create_simple_bam(truth_bam, [simple_data['truth']])
        result = encoder.bams_to_training_samples(
            truth_bam, reads_bam, region, label_scheme, min_length=0)[0]

    expected = Sample(
        ref_name='ref',
        features=np.array([
            [0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ],
            [0.  , 0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  ],
            [0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ],
            [0.  , 0.25, 0.  , 0.25, 0.  , 0.  , 0.  , 0.25, 0.  , 0.25],
            [0.25, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
            [0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  ],
            [0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ],
            [0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  ],
            [0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  ]],
            dtype='float32'),
        # the two insertions with respect to the draft are dropped
        labels=np.array([1, 2, 1, 4, 1, 3, 1, 4, 3]),
        #                A  C  A  T  A  G  A  T  C
        ref_seq=None,
        positions=np.array([
            (0, 0), (1, 0), (2, 0), (3, 0), (3, 1),
            (4, 0), (5, 0), (6, 0), (7, 0)],
            dtype=[('major', '<i8'), ('minor', '<i8')]),
        label_probs=None
    )

    np.testing.assert_equal(result.labels, expected.labels)
    np.testing.assert_equal(result.positions, expected.positions)
    np.testing.assert_equal(result.features, expected.features)
def test_size(self):
    """A region spanning 50..100 reports a size of 50."""
    contig_region = Region('contig1', 50, 100)
    self.assertEqual(contig_region.size, 50)
def bams_to_training_samples(self, truth_bam, bam, region, reference=None, read_fraction=None):
    """Prepare training data chunks.

    :param truth_bam: .bam file of truth aligned to ref to generate labels.
    :param bam: input .bam file.
    :param region: `Region` obj. whose `ref_name`, `start` and `end` select
        the truth alignments to use.
    :param reference: reference `.fasta`, should correspond to `bam`.
    :param read_fraction: passed through unchanged to `bam_to_sample`.

    :returns: tuple of `Sample` objects.

    .. note:: Chunks might be missing if regions with multiple truth
        mappings were encountered, since only alignments surviving
        `TruthAlignment.filter_alignments` contribute samples.
    """
    ref_rle = self.process_ref_seq(region.ref_name, reference)

    # filter truth alignments to restrict ourselves to regions of the ref
    # where the truth is unambiguous
    alignments = TruthAlignment.bam_to_alignments(
        truth_bam, region.ref_name, start=region.start, end=region.end)
    filtered_alignments = TruthAlignment.filter_alignments(
        alignments, start=region.start, end=region.end)
    if len(filtered_alignments) == 0:
        self.logger.info(
            "Filtering removed all alignments of truth to ref from {}.".format(region))

    # Depends only on encoder settings, so compute it once for all alignments.
    mock_compr = self.max_hp_len > 1 and not self.is_compressed

    samples = []
    for aln in filtered_alignments:
        truth_pos, truth_labels = aln.get_positions_and_labels(
            ref_compr_rle=ref_rle, mock_compr=mock_compr,
            is_compressed=self.is_compressed, rle_dtype=True)
        aln_samples = self.bam_to_sample(
            bam, Region(region.ref_name, aln.start, aln.end), ref_rle,
            read_fraction=read_fraction)

        # The pad value and the position -> label lookup depend only on the
        # truth alignment, so build them once here rather than once per
        # sample. A structured label dtype (non-zero number of fields) needs
        # a (label, 1) pad; a plain dtype needs a bare gap label.
        pad = (encoding[_gap_], 1) if len(truth_labels.dtype) > 0 else encoding[_gap_]
        position_to_label = dict(zip(map(tuple, truth_pos), truth_labels))

        for sample in aln_samples:
            # Create labels according to positions in pileup; positions
            # absent from the truth alignment receive the pad value.
            padded_labels = np.fromiter(
                (position_to_label.get(tuple(p), pad) for p in sample.positions),
                dtype=truth_labels.dtype, count=len(sample.positions))

            fields = sample._asdict()
            fields['labels'] = padded_labels
            samples.append(Sample(**fields))
    return tuple(samples)
ref_start=None, ref_end=None, ref_len=None, query_seq='ACATGCAAGACACGAT', ref_seq='AAAGGCAAGACACGAT'): self.reference_start = ref_start self.reference_end = ref_end self.reference_length = ref_len self.query_sequence = query_seq self.reference_sequence = ref_seq def get_reference_sequence(self): return self.reference_sequence full_region = Region('Mock', 0, float('inf')) class TruthAlignmentTest(unittest.TestCase): def test_case1(self): # case 1: longer < 2 x len shorter and >= 50% of shorter overlaps longer both should be removed starts_ends = [(2000, 2999), (2500, 3000)] expected = [] alignments = [ TruthAlignment(MockAlignment(start, end, end - start)) for start, end in starts_ends ] filtered = [ (f.start, f.end) for f in TruthAlignment._filter_alignments(alignments, full_region)
import numpy as np import os import unittest from medaka.features import FeatureEncoder, pileup_counts from medaka.common import Region __reads_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'test_reads.bam') __two_type_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'test_two_type.bam') __gapped_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'reads_gapped.bam') __region__ = Region('Consensus_Consensus_Consensus_Consensus_utg000001l', start=50000, end=100000) __region_start__ = Region('Consensus_Consensus_Consensus_Consensus_utg000001l', start=0, end=200) __kwargs__ = { 'consensus_as_ref': False, 'is_compressed': False, 'log_min': None, 'max_hp_len': 1, 'normalise': 'total', 'ref_mode': None, 'with_depth': False } class CountsTest(unittest.TestCase):
def test_004_trim_mid(self):
    """Reads for ref:1-7 equal the stored reads minus first and last element."""
    fetched = self.get_reads(Region('ref', start=1, end=7))
    expected = [read[1:-1] for read in self.reads]
    self.assertEqual(fetched, expected)
def test_003_trim_end(self):
    """Reads for ref:6-8 equal the last two elements of each stored read."""
    fetched = self.get_reads(Region('ref', start=6, end=8))
    expected = [read[-2:] for read in self.reads]
    self.assertEqual(fetched, expected)
def test_002_trim_start(self):
    """Reads for ref:0-2 equal the first two elements of each stored read."""
    fetched = self.get_reads(Region('ref', start=0, end=2))
    expected = [read[:2] for read in self.reads]
    self.assertEqual(fetched, expected)
def test_001_full_region(self):
    """Reads for ref:0-100000 equal the stored reads unchanged."""
    fetched = self.get_reads(Region('ref', start=0, end=100000))
    self.assertEqual(fetched, self.reads)
import numpy as np
import pysam

from .mock_data import simple_data, create_simple_bam
import libmedaka
import medaka.features
from medaka.common import Region, Sample
import medaka.labels

# Test fixtures living next to this file, and the regions queried from them.
__reads_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'test_reads.bam')
__reads_truth__ = os.path.join(os.path.dirname(__file__), 'data', 'truth_to_ref.bam')
__gapped_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'reads_gapped.bam')
__region__ = Region('utg000001l', start=50000, end=100000)
__region_start__ = Region('utg000001l', start=0, end=200)


class CountsTest(unittest.TestCase):
    """Tests of `CountsFeatureEncoder` against the bundled reads BAM."""

    @classmethod
    def setUpClass(cls):
        """Record the number of positions expected for `__region__`."""
        cls.expected_width = 86294

    def test_001_basic_counting(self):
        """Encoding `__region__` yields one sample of the expected width."""
        kwargs = {'normalise': None}
        encoder = medaka.features.CountsFeatureEncoder(**kwargs)
        sample = encoder.bam_to_sample(__reads_bam__, __region__)
        self.assertEqual(len(sample), 1)
        sample = sample[0]
        # Use the unittest assertion rather than a bare `assert`: it is not
        # stripped under `python -O` and reports both values on failure.
        self.assertEqual(
            tuple(sample.positions.shape), (self.expected_width, ))
def main():
    """Entry point for testing/checking.

    Parses a BAM path and a region string from the command line, then, for
    every combination of datatype and normalisation option, builds a
    `FeatureEncoder` and calls `bam_to_sample` twice: once with
    `force_py=True` and once without. The shape of each result (and, with
    `--print`, its contents) is printed, followed by the two wall-clock
    timings.
    """
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO)
    np.set_printoptions(precision=4, linewidth=100)

    parser = argparse.ArgumentParser(
        'medaka', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bam', help='alignment file.')
    parser.add_argument('region', help='alignment region to sample.')
    parser.add_argument('--print', action='store_true', help='print counts.')
    parser.add_argument('--dtypes', nargs='+', help='perform a multi-datatype tests.')
    parser.add_argument('--norm', nargs='+', help='additional normalisation tests. (total, fwd_rev)')
    args = parser.parse_args()

    region = Region.from_string(args.region)

    # Key order matters: the dict is printed verbatim for every run below.
    kwargs = {
        'log_min': None,
        'max_hp_len': 1,
        'is_compressed': False,
        'consensus_as_ref': False,
        'ref_mode': None,
        'with_depth': False,
    }

    def _dump_rows(result):
        # One tab-separated row per position; non-positive counts show as '-'.
        if not args.print:
            return
        for pos, feats in zip(result.positions, result.features):
            cells = '\t'.join(
                '{:.3f}'.format(x) if x > 0.0 else '-' for x in feats)
            print('{}\t{}\t0\t{}\t{}'.format(pos[0], pos[1], cells, sum(feats)))

    def _report(result):
        # Shared summary for both the forced-python and default code paths.
        if result.is_empty:
            print("Samples is empty")
        else:
            print(result.features.shape)
            _dump_rows(result)

    if args.dtypes is None:
        dtype_options = [('',)]
    else:
        dtype_options = [('',), args.dtypes]

    norm_options = [None, ]
    if args.norm is not None:
        norm_options += list(args.norm)

    for dtypes in dtype_options:
        kwargs['dtypes'] = dtypes
        for norm in norm_options:
            kwargs['normalise'] = norm
            print("###########################################################")
            print(kwargs)
            encoder = FeatureEncoder(**kwargs)

            # py-style
            started = now()
            py_samples = encoder.bam_to_sample(args.bam, region, force_py=True)[0]
            pysam_elapsed = now() - started
            _report(py_samples)

            print("---------------------")

            # C-style
            started = now()
            c_samples = encoder.bam_to_sample(args.bam, region)[0]
            hts_elapsed = now() - started
            _report(c_samples)

            print("pysam time:", pysam_elapsed)
            print("hts time:", hts_elapsed)