def test_reservoir_sample_frequency(self, iterable_size, k):
  """Tests observed frequency is close to expected frequency.

  Args:
    iterable_size: int. Size of the range we sample from.
    k: int. Reservoir size passed to utils.reservoir_sample.
  """
  # Use a fixed random number so our test is deterministic.
  random = np.random.RandomState(123456789)
  n_replicates = 100000
  counts = collections.Counter(
      item for _ in range(n_replicates)
      for item in utils.reservoir_sample(range(iterable_size), k, random))
  # Each element should appear with probability k / iterable_size,
  # capped at 1.0 when k >= iterable_size.
  expected_frequency = min(k / float(iterable_size), 1.0)
  # dict.itervalues() only exists in Python 2; .values() works on both
  # Python 2 (list) and Python 3 (view) with identical iteration behavior.
  for c in counts.values():
    observed_frequency = c / float(n_replicates)
    npt.assert_allclose(observed_frequency, expected_frequency, atol=0.01)
def test_reservoir_sample_frequency(self, iterable_size, k):
  """Tests observed frequency is close to expected frequency.

  Args:
    iterable_size: int. Size of the range we sample from.
    k: int. Reservoir size passed to utils.reservoir_sample.
  """
  # Use a fixed random number so our test is deterministic.
  random = np.random.RandomState(123456789)
  n_replicates = 100000
  counts = collections.Counter(item
                               for _ in range(n_replicates)
                               for item in utils.reservoir_sample(
                                   range(iterable_size), k, random))
  # Each element should appear with probability k / iterable_size,
  # capped at 1.0 when k >= iterable_size.
  expected_frequency = min(k / float(iterable_size), 1.0)
  # dict.itervalues() only exists in Python 2; .values() works on both
  # Python 2 (list) and Python 3 (view) with identical iteration behavior.
  for c in counts.values():
    observed_frequency = c / float(n_replicates)
    npt.assert_allclose(observed_frequency, expected_frequency, atol=0.01)
def test_reservoir_sample_length(self):
  """Tests samples have expected length."""
  first_ten_ints = range(10)
  # assertEquals is a deprecated alias of assertEqual; use the
  # canonical spelling.
  # Test sampling with k > len(iterable).
  self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 11)), 10)
  # Test sampling with k == len(iterable).
  self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 10)), 10)
  # Test sampling with k < len(iterable).
  self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 9)), 9)
  # Test sampling with k == 0.
  self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 0)), 0)
  # Test sampling with k < 0 (bad args).
  with self.assertRaises(ValueError):
    utils.reservoir_sample(first_ten_ints, -1)
def region_reads(self, region):
  """Returns the read alignments overlapping region, possibly realigned.

  Queries self.sam_reader for reads in the region, optionally down-samples
  them, and — when self.realigner is set — replaces them with realigned
  reads; otherwise the original reads are returned.

  Args:
    region: A nucleus.genomics.v1.Range object specifying the region we want
      to realign reads.

  Returns:
    [genomics.deepvariant.core.genomics.Read], reads overlapping the region.
  """
  queried_reads = self.sam_reader.query(region)
  partition_cap = self.options.max_reads_per_partition
  # A positive cap means we uniformly down-sample the reads in the region.
  if partition_cap > 0:
    queried_reads = utils.reservoir_sample(
        queried_reads, partition_cap, self.random)
  result = list(queried_reads)
  if self.realigner:
    _, result = self.realigner.realign_reads(result, region)
  return result
def build_pileup(self, dv_call, refbases, reads, alt_alleles,
                 custom_ref=False):
  """Creates a pileup tensor for dv_call.

  Args:
    dv_call: learning.genomics.deepvariant.DeepVariantCall object with
      information on our candidate call and allele support information.
    refbases: A string options.width in length containing the reference base
      sequence to encode. The middle base of this string should be at the
      start of the variant in dv_call.
    reads: Iterable of third_party.nucleus.protos.Read objects that we'll
      use to encode the read information supporting our call. Assumes each
      read is aligned and is well-formed (e.g., has bases and quality
      scores, cigar). Rows of the image are encoded in the same order as
      reads.
    alt_alleles: A collection of alternative_bases from dv_call.variant that
      we are treating as "alt" when constructing this pileup image. A read
      will be considered supporting the "alt" allele if it occurs in the
      support list for any alt_allele in this collection.
    custom_ref: True if refbases should not be checked for matching against
      variant's reference_bases.

  Returns:
    A [self.width, self.height, DEFAULT_NUM_CHANNEL] uint8 Tensor image.

  Raises:
    ValueError: if any arguments are invalid.
  """
  # Validate arguments before doing any encoding work.
  if len(refbases) != self.width:
    raise ValueError('refbases is {} long but width is {}'.format(
        len(refbases), self.width))
  if not alt_alleles:
    raise ValueError('alt_alleles cannot be empty')
  variant = dv_call.variant
  if any(alt not in variant.alternate_bases for alt in alt_alleles):
    raise ValueError(
        'all elements of alt_alleles must be the alternate bases'
        ' of dv_call.variant', alt_alleles, variant)

  image_start_pos = variant.start - self.half_width
  middle_base = refbases[self.half_width]
  # Unless the caller supplied a custom reference window, the center of the
  # window must line up with the first base of the variant's reference.
  if not custom_ref and middle_base != variant.reference_bases[0]:
    raise ValueError('The middle base of reference sequence in the window '
                     "({} at base {}) doesn't match first "
                     'character of variant.reference_bases ({}).'.format(
                         middle_base, self.half_width,
                         variant.reference_bases))

  # The image starts with reference_band_height identical rows of the
  # encoded reference sequence.
  reference_row = self._encoder.encode_reference(refbases)
  rows = [reference_row] * self.reference_band_height

  def _encoded_read_rows():
    """Yields (alignment position, row) for each read that encodes cleanly."""
    for read in reads:
      encoded = self._encoder.encode_read(dv_call, refbases, read,
                                          image_start_pos, alt_alleles)
      if encoded is not None:
        yield read.alignment.position.position, encoded

  # Down-sample to at most self.max_reads rows, then order the surviving
  # rows by each read's alignment position.
  random_for_image = np.random.RandomState(self._options.random_seed)
  sampled = utils.reservoir_sample(
      _encoded_read_rows(), self.max_reads, random=random_for_image)
  for _, encoded in sorted(sampled, key=lambda entry: entry[0]):
    rows.append(encoded)

  # Pad with empty (all black) rows up to self.height.
  missing = self.height - len(rows)
  if missing > 0:
    rows.extend([self._empty_image_row()] * missing)

  # Vertically stack the rows into one h x w x DEFAULT_NUM_CHANNEL image.
  return np.vstack(rows)
def build_pileup(self, dv_call, refbases, reads, alt_alleles):
  """Creates a pileup tensor for dv_call.

  Args:
    dv_call: learning.genomics.deepvariant.DeepVariantCall object with
      information on our candidate call and allele support information.
    refbases: A string options.width in length containing the reference base
      sequence to encode. The middle base of this string should be at the
      start of the variant in dv_call.
    reads: Iterable of third_party.nucleus.protos.Read objects that we'll
      use to encode the read information supporting our call. Assumes each
      read is aligned and is well-formed (e.g., has bases and quality
      scores, cigar). Rows of the image are encoded in the same order as
      reads.
    alt_alleles: A collection of alternative_bases from dv_call.variant that
      we are treating as "alt" when constructing this pileup image. A read
      will be considered supporting the "alt" allele if it occurs in the
      support list for any alt_allele in this collection.

  Returns:
    A [self.width, self.height, DEFAULT_NUM_CHANNEL] uint8 Tensor image.

  Raises:
    ValueError: if any arguments are invalid.
  """
  # Validate arguments before doing any encoding work.
  if len(refbases) != self.width:
    raise ValueError('refbases is {} long but width is {}'.format(
        len(refbases), self.width))
  if not alt_alleles:
    raise ValueError('alt_alleles cannot be empty')
  variant = dv_call.variant
  if any(alt not in variant.alternate_bases for alt in alt_alleles):
    raise ValueError('all elements of alt_alleles must be the alternate bases'
                     ' of dv_call.variant', alt_alleles, variant)

  image_start_pos = variant.start - self.half_width
  # Only when the variant's reference is a single base can we compare it
  # directly against the middle base of the reference window.
  if (len(variant.reference_bases) == 1 and
      refbases[self.half_width] != variant.reference_bases):
    raise ValueError('center of refbases doesnt match variant.refbases',
                     self.half_width, refbases[self.half_width], variant)

  # The image starts with reference_band_height identical rows of the
  # encoded reference sequence.
  reference_row = self._encoder.encode_reference(refbases)
  rows = [reference_row] * self.reference_band_height

  def _encoded_read_rows():
    """Yields (alignment position, row) for each read that encodes cleanly."""
    for read in reads:
      encoded = self._encoder.encode_read(dv_call, refbases, read,
                                          image_start_pos, alt_alleles)
      if encoded is not None:
        yield read.alignment.position.position, encoded

  # Down-sample to at most self.max_reads rows, then order the surviving
  # rows by each read's alignment position.
  sampled = utils.reservoir_sample(
      _encoded_read_rows(), self.max_reads, random=self._random)
  for _, encoded in sorted(sampled, key=lambda entry: entry[0]):
    rows.append(encoded)

  # Pad with empty (all black) rows up to self.height.
  missing = self.height - len(rows)
  if missing > 0:
    rows.extend([_empty_image_row(len(refbases))] * missing)

  # Vertically stack the rows into one h x w x DEFAULT_NUM_CHANNEL image.
  return np.vstack(rows)
def build_pileup_for_one_sample(reads, sample):
  """Create read pileup image section for one sample.

  NOTE(review): this is a closure — `self`, `dv_call`, `refbases`,
  `image_start_pos`, and `alt_alleles` come from the enclosing method's
  scope, which is not visible here.

  Args:
    reads: Iterable of reads to encode as pileup rows for this sample.
    sample: Object with a `pileup_height` attribute; when not None it
      overrides self.height for this sample's section.

  Returns:
    A list of image rows: the reference band, the (sorted, down-sampled)
    encoded read rows, then zero-padding up to the pileup height.
  """
  # We start with n copies of our encoded reference bases.
  rows = ([self._encoder.encode_reference(refbases)] *
          self.reference_band_height)

  def _update_hap_index(read, sort_by_haplotypes_sample_hp_tag):
    # Maps a read's HP tag to a haplotype sort key for row ordering.
    default_hap_idx = 0  # By default, reads with no HP is set to 0.
    if 'HP' not in read.info:
      return default_hap_idx
    hp_field = next(iter(read.info.get('HP').values))
    if not hp_field.HasField('int_value'):
      return default_hap_idx
    hp_value = hp_field.int_value
    if (sort_by_haplotypes_sample_hp_tag > 0 and
        hp_value == sort_by_haplotypes_sample_hp_tag):
      # For the target HP tag, set it to -1 so it will be sorted on
      # top of the pileup image.
      return -1
    elif hp_value < 0:
      return 0  # For reads with HP < 0, assume it is not tagged.
    else:
      return hp_value

  # A generator that yields tuples of the form (haplotype, position, row),
  # if the read can be encoded as a valid row to be used in the pileup
  # image.
  def _row_generator():
    """A generator that yields tuples of (haplotype, position, row)."""
    for read in reads:
      read_row = self._encoder.encode_read(dv_call, refbases, read,
                                           image_start_pos, alt_alleles)
      if read_row is None:
        # Read could not be encoded as a pileup row; skip it.
        continue
      hap_idx = 0  # By default, reads with no HP is set to 0.
      if self._options.sort_by_haplotypes:
        hap_idx = _update_hap_index(
            read, self._options.sort_by_haplotypes_sample_hp_tag)
      yield hap_idx, read.alignment.position.position, read_row

  # We add a row for each read in order, down-sampling if the number of
  # reads is greater than the max reads for each sample. Sort the reads by
  # their alignment position.
  random_for_image = np.random.RandomState(self._options.random_seed)
  # Use sample height or default to pic height.
  if sample.pileup_height is not None:
    pileup_height = sample.pileup_height
  else:
    pileup_height = self.height
  # Reserve room for the reference band; the rest is available for reads.
  max_reads = pileup_height - self.reference_band_height
  # Sort by (haplotype, alignment position) so haplotype groups stay
  # together and reads within a group are position-ordered.
  pileup_of_reads = sorted(
      utils.reservoir_sample(
          _row_generator(), max_reads, random=random_for_image),
      key=lambda x: (x[0], x[1]))
  rows += [read_row for _, _, read_row in pileup_of_reads]
  # Finally, fill in any missing rows to bring our image to pileup_height
  # rows with empty (all black) pixels.
  n_missing_rows = pileup_height - len(rows)
  if n_missing_rows > 0:
    # Add values to rows to fill it out with zeros.
    rows += [self._empty_image_row()] * n_missing_rows
  return rows