Example #1
0
def _get_variant_type(variant):
  """Classifies `variant` into one of the module's variant-type labels.

  Args:
    variant: the variant record to classify; it is passed unchanged to the
      `variant_utils` predicates.

  Returns:
    REFCALL when `variant_utils.is_variant_call` is false. Otherwise one of the
    BIALLELIC_* or MULTIALLELIC_* labels, chosen by checking SNP, insertion and
    deletion in that order; the fallback is BIALLELIC_MNP for biallelic
    variants and MULTIALLELIC_COMPLEX for the rest.
  """
  if not variant_utils.is_variant_call(variant):
    return REFCALL

  is_biallelic = variant_utils.is_biallelic(variant)
  is_snp = variant_utils.is_snp(variant)
  is_insertion = variant_utils.variant_is_insertion(variant)
  is_deletion = variant_utils.variant_is_deletion(variant)

  # Select the label family once, then apply the same precedence to both.
  if is_biallelic:
    snp_label, insertion_label, deletion_label, fallback_label = (
        BIALLELIC_SNP, BIALLELIC_INSERTION, BIALLELIC_DELETION, BIALLELIC_MNP)
  else:
    snp_label, insertion_label, deletion_label, fallback_label = (
        MULTIALLELIC_SNP, MULTIALLELIC_INSERTION, MULTIALLELIC_DELETION,
        MULTIALLELIC_COMPLEX)

  if is_snp:
    return snp_label
  if is_insertion:
    return insertion_label
  if is_deletion:
    return deletion_label
  return fallback_label
def generate_positions(vcf_reader, ref_reader, baseline_contig):
    """Collects every INDEL position plus sampled SNP and baseline positions.

    At most as many SNPs and baseline positions as there are INDELs are
    sampled; fewer are returned when not enough SNPs (or contig bases) exist.

    Args:
      vcf_reader: a nucleus.io.VcfReader.
      ref_reader: a nucleus.io.IndexedFastaReader.
      baseline_contig: contig from which to sample baseline positions.

    Returns:
      A sorted list of PositionWrapper.
    """
    all_variants = list(vcf_reader)

    indel_wrappers = []
    for record in all_variants:
        if variant_utils.is_indel(record):
            indel_wrappers.append(
                PositionWrapper(record.reference_name, record.start,
                                _INDEL_LABEL))
    indel_count = len(indel_wrappers)

    snp_records = [
        record for record in all_variants if variant_utils.is_snp(record)
    ]
    sampled_snps = random.sample(snp_records,
                                 min(len(snp_records), indel_count))
    snp_wrappers = [
        PositionWrapper(record.reference_name, record.start, _SNP_LABEL)
        for record in sampled_snps
    ]

    contig_length = ref_reader.contig(baseline_contig).n_bases
    # NOTE: Though unlikely, these random positions can end up on actual
    # variants.
    sampled_offsets = random.sample(
        xrange(contig_length), min(contig_length, indel_count))
    baseline_wrappers = [
        PositionWrapper(baseline_contig, offset, _REF_LABEL)
        for offset in sampled_offsets
    ]

    # We sort by position for better data locality.
    combined = indel_wrappers + snp_wrappers + baseline_wrappers
    combined.sort()
    return combined
Example #3
0
def _create_cvo_proto(encoded_variant,
                      gls,
                      encoded_alt_allele_indices,
                      true_labels=None,
                      logits=None,
                      prelogits=None):
    """Builds a CallVariantsOutput proto from serialized inputs.

    Args:
      encoded_variant: serialized Variant proto, parsed with
        variants_pb2.Variant.FromString.
      gls: genotype probabilities stored on the output; their argmax is
        recorded as the predicted label when debug info is produced.
      encoded_alt_allele_indices: serialized AltAlleleIndices proto.
      true_labels: optional value stored as DebugInfo.true_label.
      logits: optional value stored as DebugInfo.logits.
      prelogits: optional array; when given it must have shape (1, 1, 2048)
        and its [0][0] slice is stored as DebugInfo.prelogits.

    Returns:
      A deepvariant_pb2.CallVariantsOutput. Its debug_info is populated only
      when FLAGS.include_debug_info or FLAGS.debugging_true_label_mode is set.
    """
    parsed_variant = variants_pb2.Variant.FromString(encoded_variant)
    cvo_cls = deepvariant_pb2.CallVariantsOutput
    parsed_indices = cvo_cls.AltAlleleIndices.FromString(
        encoded_alt_allele_indices)

    wants_debug = (FLAGS.include_debug_info
                   or FLAGS.debugging_true_label_mode)
    if not wants_debug:
        debug = None
    else:
        if prelogits is not None:
            assert prelogits.shape == (1, 1, 2048)
            # Drop the two leading singleton axes.
            prelogits = prelogits[0][0]
        debug = cvo_cls.DebugInfo(
            has_insertion=variant_utils.has_insertion(parsed_variant),
            has_deletion=variant_utils.has_deletion(parsed_variant),
            is_snp=variant_utils.is_snp(parsed_variant),
            predicted_label=np.argmax(gls),
            true_label=true_labels,
            logits=logits,
            prelogits=prelogits)

    return cvo_cls(
        variant=parsed_variant,
        alt_allele_indices=parsed_indices,
        genotype_probabilities=gls,
        debug_info=debug)
Example #4
0
def _create_cvo_proto(encoded_variant, gls, encoded_alt_allele_indices):
  """Builds a CallVariantsOutput proto from serialized inputs.

  Args:
    encoded_variant: serialized Variant proto, parsed with
      variants_pb2.Variant.FromString.
    gls: genotype probabilities stored on the output; their argmax is recorded
      as the predicted label when debug info is produced.
    encoded_alt_allele_indices: serialized AltAlleleIndices proto.

  Returns:
    A deepvariant_pb2.CallVariantsOutput. Its debug_info is populated only when
    FLAGS.include_debug_info is set.
  """
  parsed_variant = variants_pb2.Variant.FromString(encoded_variant)
  cvo_cls = deepvariant_pb2.CallVariantsOutput
  parsed_indices = cvo_cls.AltAlleleIndices.FromString(
      encoded_alt_allele_indices)

  if FLAGS.include_debug_info:
    debug = cvo_cls.DebugInfo(
        has_insertion=variant_utils.has_insertion(parsed_variant),
        has_deletion=variant_utils.has_deletion(parsed_variant),
        is_snp=variant_utils.is_snp(parsed_variant),
        predicted_label=np.argmax(gls))
  else:
    debug = None

  return cvo_cls(
      variant=parsed_variant,
      alt_allele_indices=parsed_indices,
      genotype_probabilities=gls,
      debug_info=debug)
Example #5
0
def _create_cvo_proto(encoded_variant, gls, encoded_alt_allele_indices):
  """Assembles a CallVariantsOutput proto for one called variant.

  Args:
    encoded_variant: serialized Variant proto bytes.
    gls: genotype probabilities to store on the output proto.
    encoded_alt_allele_indices: serialized AltAlleleIndices proto bytes.

  Returns:
    A deepvariant_pb2.CallVariantsOutput whose debug_info is filled in only
    when FLAGS.include_debug_info is true.
  """
  decoded_variant = variants_pb2.Variant.FromString(encoded_variant)
  output_type = deepvariant_pb2.CallVariantsOutput
  decoded_indices = output_type.AltAlleleIndices.FromString(
      encoded_alt_allele_indices)

  fields = {
      'variant': decoded_variant,
      'alt_allele_indices': decoded_indices,
      'genotype_probabilities': gls,
      'debug_info': None,
  }
  if FLAGS.include_debug_info:
    fields['debug_info'] = output_type.DebugInfo(
        has_insertion=variant_utils.has_insertion(decoded_variant),
        has_deletion=variant_utils.has_deletion(decoded_variant),
        is_snp=variant_utils.is_snp(decoded_variant),
        predicted_label=np.argmax(gls))
  return output_type(**fields)
Example #6
0
def encoded_variant_type(variant):
    """Maps a variant to its EncodedVariantType.

    The SNP check takes precedence over the indel check. For example, a
    variant with `reference_bases = "A"` and `alternative_bases = ["C"]`
    yields EncodedVariantType.SNP.

    Args:
      variant: nucleus.Variant proto whose EncodedVariantType is wanted.

    Returns:
      EncodedVariantType.SNP, EncodedVariantType.INDEL, or
      EncodedVariantType.UNKNOWN when the variant is neither.
    """
    if variant_utils.is_snp(variant):
        return EncodedVariantType.SNP
    if variant_utils.is_indel(variant):
        return EncodedVariantType.INDEL
    return EncodedVariantType.UNKNOWN
Example #7
0
 def test_is_snp(self, variant, expected):
   # variant_utils.is_snp must return `expected` for the supplied variant.
   observed = variant_utils.is_snp(variant)
   self.assertEqual(observed, expected)
Example #8
0
    def test_call_end2end(self, model, shard_inputs, include_debug_info):
        """Checks the CallVariantsOutput protos produced for the golden examples.

        Args:
          model: model forwarded to self._call_end2end_helper.
          shard_inputs: forwarded to self._call_end2end_helper.
          include_debug_info: value assigned to FLAGS.include_debug_info; also
            selects which debug_info assertions are made.
        """
        FLAGS.include_debug_info = include_debug_info
        (call_variants_outputs, examples, batch_size,
         max_batches) = self._call_end2end_helper(
             testdata.GOLDEN_CALLING_EXAMPLES, model, shard_inputs)
        # Check that we have the right number of output protos.
        self.assertEqual(
            len(call_variants_outputs),
            batch_size * max_batches if max_batches else len(examples))

        # Check that our CallVariantsOutput (CVO) have the following critical
        # properties:
        # - we have one CVO for each example we processed.
        # - the variant in the CVO is exactly what was in the example.
        # - the alt_allele_indices of the CVO match those of its corresponding
        #   example.
        # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
        # We can only do this test when processing all of the variants (max_batches
        # is None), since we processed all of the examples with that model.
        if max_batches is None:
            self.assertItemsEqual(
                [cvo.variant for cvo in call_variants_outputs],
                [tf_utils.example_variant(ex) for ex in examples])

        # Check the CVO debug_info: not filled if include_debug_info is False;
        # else, filled by logic based on CVO.
        if not include_debug_info:
            for cvo in call_variants_outputs:
                self.assertEqual(
                    cvo.debug_info,
                    deepvariant_pb2.CallVariantsOutput.DebugInfo())
        else:
            for cvo in call_variants_outputs:
                self.assertEqual(cvo.debug_info.has_insertion,
                                 variant_utils.has_insertion(cvo.variant))
                self.assertEqual(cvo.debug_info.has_deletion,
                                 variant_utils.has_deletion(cvo.variant))
                self.assertEqual(cvo.debug_info.is_snp,
                                 variant_utils.is_snp(cvo.variant))
                self.assertEqual(cvo.debug_info.predicted_label,
                                 np.argmax(cvo.genotype_probabilities))

        def example_matches_call_variants_output(example,
                                                 call_variants_output):
            # An example matches when both its variant and its alt allele
            # indices equal those stored on the output proto.
            return (tf_utils.example_variant(example)
                    == call_variants_output.variant
                    and tf_utils.example_alt_alleles_indices(example)
                    == call_variants_output.alt_allele_indices.indices)

        for call_variants_output in call_variants_outputs:
            # Find all matching examples.
            matches = [
                ex for ex in examples if example_matches_call_variants_output(
                    ex, call_variants_output)
            ]
            # We should have exactly one match.
            self.assertEqual(len(matches), 1)
            example = matches[0]
            # Check that we've faithfully copied in the alt alleles (though currently
            # as implemented we find our example using this information so it cannot
            # fail). Included here in case that changes in the future.
            self.assertEqual(
                list(tf_utils.example_alt_alleles_indices(example)),
                list(call_variants_output.alt_allele_indices.indices))
            # We should have exactly three genotype probabilities (assuming our
            # ploidy == 2).
            self.assertEqual(len(call_variants_output.genotype_probabilities),
                             3)
            # These are probabilities so they should be between 0 and 1.
            # all() is required: a bare generator expression is always truthy,
            # which would make assertTrue pass regardless of the values.
            self.assertTrue(
                all(0 <= gp <= 1
                    for gp in call_variants_output.genotype_probabilities))
 def test_is_snp_symbolic_allele(self, variant, exclude_alleles, expected):
   # is_snp must return `expected` when called with the given exclude_alleles.
   observed = variant_utils.is_snp(variant, exclude_alleles=exclude_alleles)
   self.assertEqual(observed, expected)
Example #10
0
 def test_is_snp(self, variant, expected):
   # The SNP predicate's result for `variant` must equal `expected`.
   result = variant_utils.is_snp(variant)
   self.assertEqual(result, expected)
    def test_call_end2end(self, model, shard_inputs, include_debug_info):
        """Runs call_variants on the golden examples and checks its output.

        Args:
          model: model passed to call_variants.call_variants; when its name is
            'random_guess' every example is processed, otherwise only a single
            batch is run.
          shard_inputs: if true, the golden examples are first rewritten to a
            sharded temporary file that is used as the input path.
          include_debug_info: value assigned to FLAGS.include_debug_info; also
            selects which debug_info assertions are made.
        """
        FLAGS.include_debug_info = include_debug_info
        examples = list(
            io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

        if shard_inputs:
            # Create a sharded version of our golden examples.
            source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
            io_utils.write_tfrecords(examples, source_path)
        else:
            source_path = testdata.GOLDEN_CALLING_EXAMPLES

        batch_size = 4
        if model.name == 'random_guess':
            # For the random guess model we can run everything.
            max_batches = None
        else:
            # For all other models we only run a single batch for inference.
            max_batches = 1

        outfile = test_utils.test_tmpfile('call_variants.tfrecord')
        call_variants.call_variants(
            examples_filename=source_path,
            checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
            model=model,
            output_file=outfile,
            batch_size=batch_size,
            max_batches=max_batches)

        call_variants_outputs = list(
            io_utils.read_tfrecords(outfile,
                                    deepvariant_pb2.CallVariantsOutput))

        # Check that we have the right number of output protos.
        self.assertEqual(
            len(call_variants_outputs),
            batch_size * max_batches if max_batches else len(examples))

        # Check that our CallVariantsOutput (CVO) have the following critical
        # properties:
        # - we have one CVO for each example we processed.
        # - the variant in the CVO is exactly what was in the example.
        # - the alt_allele_indices of the CVO match those of its corresponding
        #   example.
        # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
        # We can only do this test when processing all of the variants (max_batches
        # is None), since we processed all of the examples with that model.
        if max_batches is None:
            self.assertItemsEqual(
                [cvo.variant for cvo in call_variants_outputs],
                [tf_utils.example_variant(ex) for ex in examples])

        # Check the CVO debug_info: not filled if include_debug_info is False;
        # else, filled by logic based on CVO.
        if not include_debug_info:
            for cvo in call_variants_outputs:
                self.assertEqual(
                    cvo.debug_info,
                    deepvariant_pb2.CallVariantsOutput.DebugInfo())
        else:
            for cvo in call_variants_outputs:
                self.assertEqual(cvo.debug_info.has_insertion,
                                 variant_utils.has_insertion(cvo.variant))
                self.assertEqual(cvo.debug_info.has_deletion,
                                 variant_utils.has_deletion(cvo.variant))
                self.assertEqual(cvo.debug_info.is_snp,
                                 variant_utils.is_snp(cvo.variant))
                self.assertEqual(cvo.debug_info.predicted_label,
                                 np.argmax(cvo.genotype_probabilities))

        def example_matches_call_variants_output(example,
                                                 call_variants_output):
            # An example matches when both its variant and its alt allele
            # indices equal those stored on the output proto.
            return (tf_utils.example_variant(example)
                    == call_variants_output.variant
                    and tf_utils.example_alt_alleles_indices(example)
                    == call_variants_output.alt_allele_indices.indices)

        for call_variants_output in call_variants_outputs:
            # Find all matching examples.
            matches = [
                ex for ex in examples if example_matches_call_variants_output(
                    ex, call_variants_output)
            ]
            # We should have exactly one match.
            self.assertEqual(len(matches), 1)
            example = matches[0]
            # Check that we've faithfully copied in the alt alleles (though currently
            # as implemented we find our example using this information so it cannot
            # fail). Included here in case that changes in the future.
            self.assertEqual(
                list(tf_utils.example_alt_alleles_indices(example)),
                list(call_variants_output.alt_allele_indices.indices))
            # We should have exactly three genotype probabilities (assuming our
            # ploidy == 2).
            self.assertEqual(len(call_variants_output.genotype_probabilities),
                             3)
            # These are probabilities so they should be between 0 and 1.
            # all() is required: a bare generator expression is always truthy,
            # which would make assertTrue pass regardless of the values.
            self.assertTrue(
                all(0 <= gp <= 1
                    for gp in call_variants_output.genotype_probabilities))
Example #12
0
  def test_call_end2end(self, model, shard_inputs, include_debug_info):
    """Runs call_variants on the golden examples and checks its output.

    Args:
      model: model passed to call_variants.call_variants; when its name is
        'random_guess' every example is processed, otherwise only a single
        batch is run.
      shard_inputs: if true, the golden examples are first rewritten to a
        sharded temporary file that is used as the input path.
      include_debug_info: value assigned to FLAGS.include_debug_info; also
        selects which debug_info assertions are made.
    """
    FLAGS.include_debug_info = include_debug_info
    examples = list(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

    if shard_inputs:
      # Create a sharded version of our golden examples.
      source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
      io_utils.write_tfrecords(examples, source_path)
    else:
      source_path = testdata.GOLDEN_CALLING_EXAMPLES

    batch_size = 4
    if model.name == 'random_guess':
      # For the random guess model we can run everything.
      max_batches = None
    else:
      # For all other models we only run a single batch for inference.
      max_batches = 1

    outfile = test_utils.test_tmpfile('call_variants.tfrecord')
    call_variants.call_variants(
        examples_filename=source_path,
        checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
        model=model,
        output_file=outfile,
        batch_size=batch_size,
        max_batches=max_batches)

    call_variants_outputs = list(
        io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

    # Check that we have the right number of output protos.
    self.assertEqual(
        len(call_variants_outputs), batch_size * max_batches
        if max_batches else len(examples))

    # Check that our CallVariantsOutput (CVO) have the following critical
    # properties:
    # - we have one CVO for each example we processed.
    # - the variant in the CVO is exactly what was in the example.
    # - the alt_allele_indices of the CVO match those of its corresponding
    #   example.
    # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
    # We can only do this test when processing all of the variants (max_batches
    # is None), since we processed all of the examples with that model.
    if max_batches is None:
      self.assertItemsEqual([cvo.variant for cvo in call_variants_outputs],
                            [tf_utils.example_variant(ex) for ex in examples])

    # Check the CVO debug_info: not filled if include_debug_info is False;
    # else, filled by logic based on CVO.
    if not include_debug_info:
      for cvo in call_variants_outputs:
        self.assertEqual(cvo.debug_info,
                         deepvariant_pb2.CallVariantsOutput.DebugInfo())
    else:
      for cvo in call_variants_outputs:
        self.assertEqual(cvo.debug_info.has_insertion,
                         variant_utils.has_insertion(cvo.variant))
        self.assertEqual(cvo.debug_info.has_deletion,
                         variant_utils.has_deletion(cvo.variant))
        self.assertEqual(cvo.debug_info.is_snp, variant_utils.is_snp(
            cvo.variant))
        self.assertEqual(cvo.debug_info.predicted_label,
                         np.argmax(cvo.genotype_probabilities))

    def example_matches_call_variants_output(example, call_variants_output):
      # An example matches when both its variant and its alt allele indices
      # equal those stored on the output proto.
      return (tf_utils.example_variant(example) == call_variants_output.variant
              and tf_utils.example_alt_alleles_indices(
                  example) == call_variants_output.alt_allele_indices.indices)

    for call_variants_output in call_variants_outputs:
      # Find all matching examples.
      matches = [
          ex for ex in examples
          if example_matches_call_variants_output(ex, call_variants_output)
      ]
      # We should have exactly one match.
      self.assertEqual(len(matches), 1)
      example = matches[0]
      # Check that we've faithfully copied in the alt alleles (though currently
      # as implemented we find our example using this information so it cannot
      # fail). Included here in case that changes in the future.
      self.assertEqual(
          list(tf_utils.example_alt_alleles_indices(example)),
          list(call_variants_output.alt_allele_indices.indices))
      # We should have exactly three genotype probabilities (assuming our
      # ploidy == 2).
      self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
      # These are probabilities so they should be between 0 and 1.
      # all() is required: a bare generator expression is always truthy, which
      # would make assertTrue pass regardless of the values.
      self.assertTrue(
          all(0 <= gp <= 1
              for gp in call_variants_output.genotype_probabilities))