Esempio n. 1
0
 def test_reservoir_sample_frequency(self, iterable_size, k):
     """Tests observed frequency is close to expected frequency.

     Repeatedly samples `k` items from `range(iterable_size)` with
     utils.reservoir_sample and checks that every item of the iterable is
     selected with a frequency close to `min(k / iterable_size, 1.0)`.

     Args:
       iterable_size: int. Number of items in the iterable being sampled.
       k: int. Sample size requested from utils.reservoir_sample.
     """
     # Use a fixed random seed so our test is deterministic.
     rng = np.random.RandomState(123456789)
     n_replicates = 100000
     counts = collections.Counter(item for _ in range(n_replicates)
                                  for item in utils.reservoir_sample(
                                      range(iterable_size), k, rng))
     expected_frequency = min(k / float(iterable_size), 1.0)
     # Walk over every item of the iterable rather than only over the items
     # present in `counts`: an item that was never sampled has no entry in the
     # Counter and would otherwise be skipped silently. Counter returns 0 for
     # missing keys, so such an item is compared with an observed frequency of 0.
     for item in range(iterable_size):
         observed_frequency = counts[item] / float(n_replicates)
         npt.assert_allclose(observed_frequency,
                             expected_frequency,
                             atol=0.01)
Esempio n. 2
0
 def test_reservoir_sample_length(self):
     """Tests samples have expected length.

     Covers sample sizes above, equal to and below the iterable's length, a
     sample size of zero, and a negative sample size, which must raise
     ValueError.
     """
     first_ten_ints = range(10)
     # Test sampling with k > len(iterable).
     self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 11)), 10)
     # Test sampling with k == len(iterable).
     self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 10)), 10)
     # Test sampling with k < len(iterable).
     self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 9)), 9)
     # Test sampling with k == 0.
     self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 0)), 0)
     # Test sampling with k < 0 (bad args).
     with self.assertRaises(ValueError):
         utils.reservoir_sample(first_ten_ints, -1)
    def region_reads(self, region):
        """Returns the reads overlapping `region`, realigned when possible.

        Reads are fetched with self.sam_reader.query. When
        self.options.max_reads_per_partition is positive, the reads are first
        passed through utils.reservoir_sample with that limit and self.random.
        If self.realigner is set, the reads produced by
        self.realigner.realign_reads are returned instead of the originals.

        Args:
          region: The region to query. It is handed to self.sam_reader.query
            and, when realigning, to self.realigner.realign_reads.

        Returns:
          A list of the (possibly sampled) reads when no realigner is set,
          otherwise the second element returned by
          self.realigner.realign_reads.
        """
        overlapping = self.sam_reader.query(region)
        max_reads = self.options.max_reads_per_partition
        if max_reads > 0:
            overlapping = utils.reservoir_sample(
                overlapping, max_reads, self.random)
        # Materialize the reads so the realigner receives a concrete list.
        overlapping = list(overlapping)

        if not self.realigner:
            return overlapping

        # The first element of the realigner's result is not needed here.
        _, realigned = self.realigner.realign_reads(overlapping, region)
        return realigned
Esempio n. 4
0
    def build_pileup(self, dv_call, refbases, reads, alt_alleles):
        """Builds the pileup image for the candidate call `dv_call`.

        The image rows are, in order: self.reference_band_height copies of the
        encoded reference bases, one row for each read that the encoder could
        encode (sampled with utils.reservoir_sample using self.max_reads and
        sorted by alignment position), and finally empty rows added until the
        image has self.height rows.

        Args:
          dv_call: The candidate call. Its `variant` field supplies `start`,
            `reference_bases` and `alternate_bases`.
          refbases: The reference base sequence to encode. Its length must equal
            self.width, and for a single-base reference allele the base at index
            self.half_width must equal the variant's reference bases.
          reads: Iterable of reads to encode. Each read must expose
            `alignment.position.position`.
          alt_alleles: A non-empty collection of alternate alleles, each of
            which must appear in dv_call.variant.alternate_bases. It is
            forwarded to the encoder together with every read.

        Returns:
          The result of vertically stacking all rows with np.vstack (described
          by the original code comments as an h x w x DEFAULT_NUM_CHANNEL image).

        Raises:
          ValueError: if refbases has the wrong length, alt_alleles is empty or
            contains an allele that is not an alternate base of the variant, or
            the center of refbases does not match a single-base reference
            allele.
        """
        n_bases = len(refbases)
        if n_bases != self.width:
            raise ValueError('refbases is {} long but width is {}'.format(
                n_bases, self.width))

        if not alt_alleles:
            raise ValueError('alt_alleles cannot be empty')

        variant = dv_call.variant
        for allele in alt_alleles:
            if allele not in variant.alternate_bases:
                raise ValueError(
                    'all elements of alt_alleles must be the alternate bases'
                    ' of dv_call.variant', alt_alleles, variant)

        image_start_pos = variant.start - self.half_width
        # The center check only applies to single-base reference alleles.
        if len(variant.reference_bases) == 1:
            center_base = refbases[self.half_width]
            if center_base != variant.reference_bases:
                raise ValueError(
                    'center of refbases doesnt match variant.refbases',
                    self.half_width, center_base, variant)

        # The image begins with a band of identical encoded reference rows.
        encoded_reference = self._encoder.encode_reference(refbases)
        rows = [encoded_reference] * self.reference_band_height

        def _positioned_rows():
            """Yields (alignment position, encoded row) for encodable reads."""
            for read in reads:
                encoded = self._encoder.encode_read(
                    dv_call, refbases, read, image_start_pos, alt_alleles)
                if encoded is None:
                    # The encoder could not turn this read into a row.
                    continue
                yield read.alignment.position.position, encoded

        def _position_of(entry):
            """Returns the alignment position of a (position, row) pair."""
            return entry[0]

        # Sample the encodable reads, then order the sample by alignment
        # position before appending the rows to the image.
        sampled = list(utils.reservoir_sample(
            _positioned_rows(), self.max_reads, random=self._random))
        sampled.sort(key=_position_of)
        rows.extend(encoded for _, encoded in sampled)

        # Pad with empty rows until the image has self.height rows.
        n_padding_rows = self.height - len(rows)
        if n_padding_rows > 0:
            rows.extend([_empty_image_row(n_bases)] * n_padding_rows)

        return np.vstack(rows)