Ejemplo n.º 1
0
  def test_realigner_end2end(self):
    """Realigns the chr20 test region chunk by chunk and sanity-checks it.

    For each chunk produced by `partition(1000)`, the realigner must return
    exactly the reads it was given, and every window it reports must list the
    reference sequence among its haplotypes. Over the whole region at least
    one window has to be produced.
    """
    ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    reads_realigner = realigner.Realigner(
        realigner.realigner_config(FLAGS), ref_reader)

    total_windows = 0
    region_set = ranges.RangeSet.from_regions(['chr20:10,000,000-10,009,999'])
    for partition in region_set.partition(1000):
      with sam.SamReader(
          testdata.CHR20_BAM,
          read_requirements=reads_pb2.ReadRequirements()) as sam_reader:
        reads_before = list(sam_reader.query(partition))
      windows, reads_after = reads_realigner.realign_reads(
          reads_before, partition)

      # Comparing fragment names (rather than only the counts) shows that
      # every read we sent in came back out.
      names_before = [read.fragment_name for read in reads_before]
      names_after = [read.fragment_name for read in reads_after]
      self.assertCountEqual(names_before, names_after)

      # The reference sequence is always expected to be one of the haplotypes
      # of a window.
      for window in windows:
        self.assertIn(ref_reader.query(window.span), set(window.haplotypes))
      total_windows += len(windows)

    self.assertGreater(total_windows, 0)
Ejemplo n.º 2
0
 def setUp(self):
     """Creates the reference reader, config and realigner used by the tests.

     The window selector model flag is switched on before the config is built.
     """
     reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
     self.ref_reader = reader
     # redacted
     FLAGS.ws_use_window_selector_model = True
     cfg = realigner.realigner_config(FLAGS)
     self.config = cfg
     self.reads_realigner = realigner.Realigner(cfg, reader)
Ejemplo n.º 3
0
    def test_realigner_end2end(self):
        """Realigns the chr20 test region chunk by chunk and checks each chunk.

        For each chunk produced by `partition(1000)`: the realigner must
        return exactly the reads it was given, must report at least one
        window, and every window must list the reference bases among its
        haplotypes.
        """
        ref_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        reads_realigner = realigner.Realigner(
            realigner.realigner_config(FLAGS), ref_reader)

        region_set = ranges.RangeSet.from_regions(
            ['chr20:10,000,000-10,009,999'])
        for partition in region_set.partition(1000):
            with genomics_io.make_sam_reader(
                    test_utils.CHR20_BAM,
                    core_pb2.ReadRequirements()) as sam_reader:
                reads_before = list(sam_reader.query(partition))
            windows, reads_after = reads_realigner.realign_reads(
                reads_before, partition)

            # Comparing fragment names (rather than only the counts) shows
            # that every read we sent in came back out.
            names_before = [read.fragment_name for read in reads_before]
            names_after = [read.fragment_name for read in reads_after]
            self.assertCountEqual(names_before, names_after)

            # At least one window must have been assembled in this chunk.
            self.assertNotEqual(0, len(windows))

            # The reference sequence is always expected to be one of the
            # haplotypes of a window.
            for window in windows:
                self.assertIn(
                    ref_reader.bases(window.span), set(window.haplotypes))
Ejemplo n.º 4
0
    def test_realigner_diagnostics(self, enabled, emit_reads):
        """Checks which diagnostic files the realigner writes.

        Args:
          enabled: Value assigned to `config.diagnostics.enabled`. When false,
            the diagnostics directory must not exist after realignment.
          emit_reads: Value assigned to
            `config.diagnostics.emit_realigned_reads`. When diagnostics are
            enabled, the realigned reads file must exist exactly when this is
            true.
        """
        # The directory name encodes the parameters, so each combination gets
        # its own name.
        dx_dir = test_utils.test_tmpfile('dx_enabled{}_emitreads_{}'.format(
            enabled, emit_reads))
        region_str = 'chr20:10046178-10046188'
        region = ranges.parse_literal(region_str)
        assembled_region_str = 'chr20:10046096-10046267'
        reads, header = _get_reads_and_header(region)
        self.config = realigner.realigner_config(FLAGS)
        self.config.diagnostics.enabled = enabled
        self.config.diagnostics.output_root = dx_dir
        self.config.diagnostics.emit_realigned_reads = emit_reads
        self.reads_realigner = realigner.Realigner(self.config,
                                                   self.ref_reader, header)
        self.reads_realigner.realign_reads(reads, region)
        # Force close all resources before inspecting the output files.
        self.reads_realigner.diagnostic_logger.close()

        if not enabled:
            # Make sure our diagnostic output isn't emitted.
            self.assertFalse(tf.io.gfile.exists(dx_dir))
        else:
            # Our root directory exists.
            self.assertTrue(tf.io.gfile.isdir(dx_dir))

            # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
            metrics_file = os.path.join(
                dx_dir,
                self.reads_realigner.diagnostic_logger.metrics_filename)
            self.assertTrue(tf.io.gfile.exists(metrics_file))
            with tf.io.gfile.GFile(metrics_file) as fin:
                rows = list(csv.DictReader(fin))
                self.assertLen(rows, 1)
                self.assertEqual(set(rows[0].keys()),
                                 {'window', 'k', 'n_haplotypes', 'time'})
                self.assertEqual(rows[0]['window'], assembled_region_str)
                self.assertEqual(int(rows[0]['k']), 25)
                # assertTrue(x, 2) would treat 2 as the failure message and
                # pass for any non-zero count; compare the value instead.
                self.assertEqual(int(rows[0]['n_haplotypes']), 2)
                # Check that our runtime is reasonable (greater than 0, less than 10 s).
                self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

            # As does the subdirectory for this region.
            region_subdir = os.path.join(dx_dir, assembled_region_str)
            self.assertTrue(tf.io.gfile.isdir(region_subdir))

            # We always have a graph.dot
            self.assertTrue(
                tf.io.gfile.exists(
                    os.path.join(
                        region_subdir, self.reads_realigner.diagnostic_logger.
                        graph_filename)))

            reads_file = os.path.join(
                dx_dir, region_str, self.reads_realigner.diagnostic_logger.
                realigned_reads_filename)

            # if emit_reads=False then file should not exist and vice versa.
            self.assertEqual(emit_reads, tf.io.gfile.exists(reads_file))
Ejemplo n.º 5
0
    def test_window_selector_model_flags_failures(self):
        """Checks that inconsistent window-selector flags raise ValueError."""

        def expect_config_error(message_regex, max_reads, min_reads, model,
                                use_model):
            # Sets the four flags and requires realigner_config to raise a
            # ValueError whose message matches `message_regex`.
            with six.assertRaisesRegex(self, ValueError, message_regex):
                FLAGS.ws_max_num_supporting_reads = max_reads
                FLAGS.ws_min_num_supporting_reads = min_reads
                FLAGS.ws_window_selector_model = model
                FLAGS.ws_use_window_selector_model = use_model
                realigner.realigner_config(FLAGS)

        # Minimum larger than maximum.
        expect_config_error(
            'ws_min_supporting_reads should be smaller than ws_'
            'max_supporting_reads.',
            max_reads=1,
            min_reads=2,
            model=None,
            use_model=False)

        # A model file is given while the model is switched off.
        expect_config_error(
            'Cannot specify a ws_window_selector_model '
            'if ws_use_window_selector_model is False.',
            max_reads=-1,
            min_reads=-1,
            model=testdata.WS_ALLELE_COUNT_LINEAR_MODEL,
            use_model=False)

        # Minimum supporting reads combined with the model.
        expect_config_error(
            'Cannot use both ws_min_num_supporting_reads and '
            'ws_use_window_selector_model flags.',
            max_reads=-1,
            min_reads=1,
            model=None,
            use_model=True)

        # Maximum supporting reads combined with the model.
        expect_config_error(
            'Cannot use both ws_max_num_supporting_reads and '
            'ws_use_window_selector_model flags.',
            max_reads=1,
            min_reads=-1,
            model=None,
            use_model=True)
Ejemplo n.º 6
0
 def test_window_selector_model_flags(self, model, min_supporting,
                                      max_supporting, use_ws_model):
   """Checks that realigner_config accepts the given flag combination.

   `model` is a symbolic key (or None) that is translated to a testdata path
   here, because the testdata symbols are not yet set when the
   @parameterized decorator is evaluated.
   """
   model_paths = {None: None}
   model_paths['VARIANT_READS_THRESHOLD'] = (
       testdata.WS_VARIANT_READS_THRESHOLD_MODEL)
   model_paths['ALLELE_COUNT_LINEAR'] = testdata.WS_ALLELE_COUNT_LINEAR_MODEL

   FLAGS.ws_max_num_supporting_reads = max_supporting
   FLAGS.ws_min_num_supporting_reads = min_supporting
   FLAGS.ws_window_selector_model = model_paths[model]
   FLAGS.ws_use_window_selector_model = use_ws_model

   # The only expectation is that building the config neither crashes nor
   # raises.
   realigner.realigner_config(FLAGS)
Ejemplo n.º 7
0
 def setUp(self):
   """Creates the reference reader, config and realigner used by the tests."""
   reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
   self.ref_reader = reader
   cfg = realigner.realigner_config(FLAGS)
   self.config = cfg
   self.reads_realigner = realigner.Realigner(cfg, reader)
Ejemplo n.º 8
0
def shared_flags_to_options(
        add_flags, flags_obj, samples_in_order, sample_role_to_train,
        main_sample_index) -> deepvariant_pb2.MakeExamplesOptions:
    """Creates options from flags that are shared, along with given samples.

    Args:
      add_flags: If true, the values of `flags_obj` are copied into the
        returned options. If false, only the base options are built; note
        that the read requirements and allele counter options are read from
        `flags_obj` in either case.
      flags_obj: Object exposing the flag values as attributes.
      samples_in_order: Stored as `sample_options` of the returned proto.
      sample_role_to_train: Stored as `sample_role_to_train` of the returned
        proto.
      main_sample_index: Stored as `main_sample_index`; also selects the
        entry of `sample_options` whose variant caller options receive
        `training_random_emit_ref_sites` in training mode.

    Returns:
      The populated deepvariant_pb2.MakeExamplesOptions proto.

    Raises:
      KeyError: If `flags_obj.multi_allelic_mode` is set to a value other
        than 'include_het_alt_images' or 'exclude_het_alt_images'.

    Invalid channels, variant type selectors, haplotype-tag settings and
    allele-frequency settings are reported through `errors.log_and_raise`
    with `errors.CommandLineError`.
    """
    read_reqs = reads_pb2.ReadRequirements(
        keep_duplicates=flags_obj.keep_duplicates,
        keep_supplementary_alignments=flags_obj.keep_supplementary_alignments,
        keep_secondary_alignments=flags_obj.keep_secondary_alignments,
        min_base_quality=flags_obj.min_base_quality,
        min_mapping_quality=flags_obj.min_mapping_quality,
        min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

    logging.vlog(3, 'ReadRequirements are: %s', read_reqs)

    pic_options = pileup_image.default_options(read_requirements=read_reqs)

    allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=flags_obj.partition_size,
        read_requirements=read_reqs,
        track_ref_reads=flags_obj.track_ref_reads,
        normalize_reads=flags_obj.normalize_reads,
        keep_legacy_behavior=flags_obj.keep_legacy_allele_counter_behavior)

    options = deepvariant_pb2.MakeExamplesOptions(
        exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=609314161,
        # # Not specified by default: calling_regions = 3;
        read_requirements=read_reqs,
        allele_counter_options=allele_counter_options,
        pic_options=pic_options,
        n_cores=1,
        task_id=0,
        num_shards=0,
        min_shared_contigs_basepairs=0.9,
        sample_options=samples_in_order,
        main_sample_index=main_sample_index,
        sample_role_to_train=sample_role_to_train)

    if add_flags:
        options.mode = make_examples_core.parse_proto_enum_flag(
            deepvariant_pb2.MakeExamplesOptions.Mode, flags_obj.mode.upper())

        options.labeler_algorithm = make_examples_core.parse_proto_enum_flag(
            deepvariant_pb2.MakeExamplesOptions.LabelerAlgorithm,
            flags_obj.labeler_algorithm.upper())

        options.variant_caller = make_examples_core.parse_proto_enum_flag(
            deepvariant_pb2.MakeExamplesOptions.VariantCaller,
            flags_obj.variant_caller.upper())

        # Optional inputs are only copied when the corresponding flag is set.
        if flags_obj.ref:
            options.reference_filename = flags_obj.ref
        if flags_obj.confident_regions:
            options.confident_regions_filename = flags_obj.confident_regions
        if flags_obj.truth_variants:
            options.truth_variants_filename = flags_obj.truth_variants
        if flags_obj.sequencing_type:
            options.pic_options.sequencing_type = make_examples_core.parse_proto_enum_flag(
                deepvariant_pb2.PileupImageOptions.SequencingType,
                flags_obj.sequencing_type)

        if flags_obj.channels:
            # Comma-separated list of optional channels; every non-empty
            # entry must be a known opt channel.
            channel_set = flags_obj.channels.split(',')
            for channel in channel_set:
                if channel and channel not in dv_constants.OPT_CHANNELS:
                    err_msg = 'Channel "{}" is not one of the available opt channels: {}'.format(
                        channel, ', '.join(dv_constants.OPT_CHANNELS))
                    errors.log_and_raise(err_msg, errors.CommandLineError)
            options.pic_options.channels[:] = channel_set
            options.pic_options.num_channels += len(channel_set)

        if flags_obj.multi_allelic_mode:
            multi_allelic_enum = {
                'include_het_alt_images':
                deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
                'exclude_het_alt_images':
                deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
            }[flags_obj.multi_allelic_mode]
            options.pic_options.multi_allelic_mode = multi_allelic_enum

        if flags_obj.pileup_image_width:
            options.pic_options.width = flags_obj.pileup_image_width

        options.pic_options.alt_aligned_pileup = flags_obj.alt_aligned_pileup
        options.pic_options.types_to_alt_align = flags_obj.types_to_alt_align

        if flags_obj.add_supporting_other_alt_color:
            options.pic_options.other_allele_supporting_read_alpha = 0.3

        if flags_obj.select_variant_types:
            # Whitespace-separated list of variant type selectors.
            options.select_variant_types[:] = flags_obj.select_variant_types.split(
            )
            for svt in options.select_variant_types:
                if svt not in make_examples_core.VARIANT_TYPE_SELECTORS:
                    errors.log_and_raise(
                        'Select variant type {} not recognized. Allowed values are {}'
                        .format(
                            svt, ', '.join(
                                make_examples_core.VARIANT_TYPE_SELECTORS)),
                        errors.CommandLineError)

        num_shards, examples, candidates, gvcf, runtime_by_region = (
            sharded_file_utils.resolve_filespecs(
                flags_obj.task, flags_obj.examples or '', flags_obj.candidates
                or '', flags_obj.gvcf or '', flags_obj.runtime_by_region
                or ''))
        options.examples_filename = examples
        options.candidates_filename = candidates
        options.gvcf_filename = gvcf
        options.include_med_dp = flags_obj.include_med_dp
        options.task_id = flags_obj.task
        options.num_shards = num_shards
        options.runtime_by_region = runtime_by_region

        options.parse_sam_aux_fields = make_examples_core.resolve_sam_aux_fields(
            flags_obj=flags_obj)
        if flags_obj.aux_fields_to_keep:
            options.aux_fields_to_keep[:] = flags_obj.aux_fields_to_keep.split(
                ',')
        else:
            # aux_fields_to_keep is a repeated field (it is slice-assigned
            # above), and protobuf rejects assigning None to a repeated field.
            # Clear it explicitly to express "keep no aux fields".
            options.ClearField('aux_fields_to_keep')
        options.use_original_quality_scores = flags_obj.use_original_quality_scores

        if flags_obj.add_hp_channel:
            options.pic_options.num_channels += 1
            options.pic_options.add_hp_channel = True

        if flags_obj.hp_tag_for_assembly_polishing < 0:
            errors.log_and_raise(
                '--hp_tag_for_assembly_polishing has to be set to a positive int.',
                errors.CommandLineError)
        if (flags_obj.hp_tag_for_assembly_polishing > 0
                and not flags_obj.sort_by_haplotypes):
            errors.log_and_raise(
                '--hp_tag_for_assembly_polishing requires --sort_by_haplotypes to be '
                'set ', errors.CommandLineError)

        options.pic_options.sort_by_haplotypes = flags_obj.sort_by_haplotypes
        options.pic_options.hp_tag_for_assembly_polishing = flags_obj.hp_tag_for_assembly_polishing

        if flags_obj.write_run_info:
            options.run_info_filename = examples + _RUN_INFO_FILE_EXTENSION

        options.calling_regions.extend(
            make_examples_core.parse_regions_flag(flags_obj.regions))
        options.exclude_calling_regions.extend(
            make_examples_core.parse_regions_flag(flags_obj.exclude_regions))

        options.realigner_enabled = flags_obj.realign_reads
        options.realigner_options.CopyFrom(
            realigner.realigner_config(flags_obj))

        # Random reference-site emission only applies to the main sample in
        # training mode.
        if (options.mode == deepvariant_pb2.MakeExamplesOptions.TRAINING
                and flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF):
            options.sample_options[
                main_sample_index].variant_caller_options.fraction_reference_sites_to_emit = (
                    flags_obj.training_random_emit_ref_sites)

        if (flags_obj.use_allele_frequency and not flags_obj.population_vcfs):
            errors.log_and_raise(
                'If use_allele_frequency is set then population_vcfs '
                'must be provided.', errors.CommandLineError)
        if flags_obj.use_allele_frequency:
            options.use_allele_frequency = flags_obj.use_allele_frequency
            options.pic_options.num_channels += 1
            options.pic_options.use_allele_frequency = True
        if flags_obj.population_vcfs:
            # The VCF list may be separated by commas or by spaces.
            options.population_vcf_filenames.extend(
                re.split(',| ', flags_obj.population_vcfs))
        options.max_reads_per_partition = flags_obj.max_reads_per_partition
        options.use_ref_for_cram = flags_obj.use_ref_for_cram
        options.hts_block_size = flags_obj.hts_block_size
        options.logging_every_n_candidates = flags_obj.logging_every_n_candidates
        options.customized_classes_labeler_classes_list = flags_obj.customized_classes_labeler_classes_list
        options.customized_classes_labeler_info_field_name = flags_obj.customized_classes_labeler_info_field_name

    return options
Ejemplo n.º 9
0
def default_options(add_flags=True, flags_obj=None):
  """Builds a DeepVariantOptions proto filled in with reasonable defaults.

  Args:
    add_flags: bool, True by default. When True, the values of certain flags
      are copied into the returned options; when False those option fields
      are left uninitialized.
    flags_obj: object. Source of the flag values; the global FLAGS is used
      when this is falsy.

  Returns:
    deepvariant_pb2.DeepVariantOptions protobuf.

  Raises:
    ValueError: If we observe invalid flag values.
  """
  flags_obj = flags_obj or FLAGS

  requirements = reads_pb2.ReadRequirements(
      min_base_quality=10,
      min_mapping_quality=10,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
  image_options = pileup_image.default_options(read_requirements=requirements)
  counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=flags_obj.partition_size, read_requirements=requirements)

  # Sample name precedence: the explicit flag, then the name extracted from
  # the reads file, then the unknown-sample placeholder.
  sample_name = _UNKNOWN_SAMPLE
  if flags_obj.sample_name:
    sample_name = flags_obj.sample_name
  elif flags_obj.reads:
    with sam.SamReader(flags_obj.reads) as sam_reader:
      sample_name = extract_sample_name_from_sam_reader(sam_reader)

  caller_options = deepvariant_pb2.VariantCallerOptions(
      min_count_snps=flags_obj.vsc_min_count_snps,
      min_count_indels=flags_obj.vsc_min_count_indels,
      min_fraction_snps=flags_obj.vsc_min_fraction_snps,
      min_fraction_indels=flags_obj.vsc_min_fraction_indels,
      # Not specified by default: fraction_reference_sites_to_emit,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=1400605801,
      sample_name=sample_name,
      p_error=0.001,
      max_gq=50,
      gq_resolution=flags_obj.gvcf_gq_binsize,
      ploidy=2)

  options = deepvariant_pb2.DeepVariantOptions(
      exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=609314161,
      # # Not specified by default: calling_regions = 3;
      read_requirements=requirements,
      allele_counter_options=counter_options,
      variant_caller_options=caller_options,
      pic_options=image_options,
      n_cores=1,
      task_id=0,
      num_shards=0,
      min_shared_contigs_basepairs=0.9,
  )

  # Without flags, the defaults built above are the complete result.
  if not add_flags:
    return options

  options.mode = parse_proto_enum_flag(
      deepvariant_pb2.DeepVariantOptions.Mode, flags_obj.mode.upper())
  options.labeler_algorithm = parse_proto_enum_flag(
      deepvariant_pb2.DeepVariantOptions.LabelerAlgorithm,
      flags_obj.labeler_algorithm.upper())

  # Optional file inputs are copied only when the corresponding flag is set.
  for flag_name, option_field in (
      ('ref', 'reference_filename'),
      ('reads', 'reads_filename'),
      ('confident_regions', 'confident_regions_filename'),
      ('truth_variants', 'truth_variants_filename'),
  ):
    flag_value = getattr(flags_obj, flag_name)
    if flag_value:
      setattr(options, option_field, flag_value)

  if flags_obj.downsample_fraction != NO_DOWNSAMPLING:
    options.downsample_fraction = flags_obj.downsample_fraction

  if flags_obj.multi_allelic_mode:
    image_modes = {
        'include_het_alt_images':
            deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
        'exclude_het_alt_images':
            deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
    }
    options.pic_options.multi_allelic_mode = (
        image_modes[flags_obj.multi_allelic_mode])

  if flags_obj.pileup_image_height:
    options.pic_options.height = flags_obj.pileup_image_height
  if flags_obj.pileup_image_width:
    options.pic_options.width = flags_obj.pileup_image_width

  shard_count, examples_path, candidates_path, gvcf_path = (
      io_utils.resolve_filespecs(
          flags_obj.task, flags_obj.examples or '', flags_obj.candidates or '',
          flags_obj.gvcf or ''))
  options.examples_filename = examples_path
  options.candidates_filename = candidates_path
  options.gvcf_filename = gvcf_path

  options.calling_regions.extend(parse_regions_flag(flags_obj.regions))
  options.exclude_calling_regions.extend(
      parse_regions_flag(flags_obj.exclude_regions))

  options.task_id = flags_obj.task
  if shard_count is None:
    options.num_shards = 0
  else:
    options.num_shards = shard_count

  # The realigner config is only built when realignment is switched on.
  options.realigner_enabled = flags_obj.realign_reads
  if options.realigner_enabled:
    options.realigner_options.CopyFrom(realigner.realigner_config(flags_obj))

  options.max_reads_per_partition = flags_obj.max_reads_per_partition

  in_training = options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING
  if in_training and flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF:
    options.variant_caller_options.fraction_reference_sites_to_emit = (
        flags_obj.training_random_emit_ref_sites)

  return options
Ejemplo n.º 10
0
    def test_realigner_diagnostics(self, enabled, emit_reads):
        """Checks which diagnostic files the realigner writes.

        Args:
          enabled: Value assigned to `config.diagnostics.enabled`. When false,
            the diagnostics directory must not exist after realignment.
          emit_reads: Value assigned to
            `config.diagnostics.emit_realigned_reads`. When diagnostics are
            enabled, the realigned reads file must exist (and hold exactly
            the realigned reads) when this is true, and must be absent
            otherwise.
        """
        # Use a name unique to this parameter combination so that output left
        # behind by one combination cannot affect the existence checks of
        # another.
        # NOTE(review): assumes test_tmpfile maps equal names to the same
        # path across test cases - confirm.
        dx_dir = test_utils.test_tmpfile('dx_enabled{}_emitreads_{}'.format(
            enabled, emit_reads))
        region_str = 'chr20:10046179-10046188'
        region = ranges.parse_literal(region_str)
        assembled_region_str = 'chr20:10046109-10046257'
        reads = _get_reads(region)
        self.config = realigner.realigner_config(FLAGS)
        self.config.diagnostics.enabled = enabled
        self.config.diagnostics.output_root = dx_dir
        self.config.diagnostics.emit_realigned_reads = emit_reads
        self.reads_realigner = realigner.Realigner(self.config,
                                                   self.ref_reader)
        _, realigned_reads = self.reads_realigner.realign_reads(reads, region)
        # Force close all resources before inspecting the output files.
        self.reads_realigner.diagnostic_logger.close()

        if not enabled:
            # Make sure our diagnostic output isn't emitted.
            self.assertFalse(tf.gfile.Exists(dx_dir))
        else:
            # Our root directory exists.
            self.assertTrue(tf.gfile.IsDirectory(dx_dir))

            # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
            metrics_file = os.path.join(
                dx_dir,
                self.reads_realigner.diagnostic_logger.metrics_filename)
            self.assertTrue(tf.gfile.Exists(metrics_file))
            with tf.gfile.FastGFile(metrics_file) as fin:
                rows = list(csv.DictReader(fin))
                self.assertEqual(len(rows), 1)
                self.assertEqual(set(rows[0].keys()),
                                 {'window', 'k', 'n_haplotypes', 'time'})
                self.assertEqual(rows[0]['window'], assembled_region_str)
                self.assertEqual(int(rows[0]['k']), 25)
                # assertTrue(x, 2) would treat 2 as the failure message and
                # pass for any non-zero count; compare the value instead.
                self.assertEqual(int(rows[0]['n_haplotypes']), 2)
                # Check that our runtime is reasonable (greater than 0, less than 10 s).
                self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

            # As does the subdirectory for this region.
            region_subdir = os.path.join(dx_dir, assembled_region_str)
            self.assertTrue(tf.gfile.IsDirectory(region_subdir))

            # We always have a graph.dot
            self.assertTrue(
                tf.gfile.Exists(
                    os.path.join(
                        region_subdir, self.reads_realigner.diagnostic_logger.
                        graph_filename)))

            reads_file = os.path.join(
                dx_dir, region_str, self.reads_realigner.diagnostic_logger.
                realigned_reads_filename)
            if emit_reads:
                self.assertTrue(tf.gfile.Exists(reads_file))
                reads_from_dx = io_utils.read_tfrecords(
                    reads_file, reads_pb2.Read)
                self.assertCountEqual(reads_from_dx, realigned_reads)
            else:
                self.assertFalse(tf.gfile.Exists(reads_file))
Ejemplo n.º 11
0
 def setUp(self):
     """Creates the reference reader, config and realigner used by the tests."""
     reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
     self.ref_reader = reader
     cfg = realigner.realigner_config(FLAGS)
     self.config = cfg
     self.reads_realigner = realigner.Realigner(cfg, reader)
Ejemplo n.º 12
0
def default_options(add_flags=True, flags=None):
    """Builds a DeepVariantOptions proto filled in with reasonable defaults.

    Args:
      add_flags: bool, True by default. When True, the values of certain
        flags are copied into the returned options; when False those option
        fields are left uninitialized.
      flags: object. Source of the flag values; the global FLAGS is used when
        this is falsy.

    Returns:
      deepvariant_pb2.DeepVariantOptions protobuf.

    Raises:
      ValueError: If we observe invalid flag values.
    """
    flags = flags or FLAGS

    requirements = core_pb2.ReadRequirements(
        min_base_quality=10,
        min_mapping_quality=10,
        min_base_quality_mode=core_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
    image_options = pileup_image.default_options(
        read_requirements=requirements)
    counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=flags.partition_size, read_requirements=requirements)

    # Sample name precedence: the explicit flag, then the name extracted from
    # the reads file, then the unknown-sample placeholder.
    sample_name = _UNKNOWN_SAMPLE
    if flags.sample_name:
        sample_name = flags.sample_name
    elif flags.reads:
        sample_name = extract_sample_name_from_reads(flags.reads)

    caller_options = deepvariant_pb2.VariantCallerOptions(
        min_count_snps=flags.vsc_min_count_snps,
        min_count_indels=flags.vsc_min_count_indels,
        min_fraction_snps=flags.vsc_min_fraction_snps,
        min_fraction_indels=flags.vsc_min_fraction_indels,
        # Not specified by default: fraction_reference_sites_to_emit,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=1400605801,
        sample_name=sample_name,
        p_error=0.001,
        max_gq=50,
        gq_resolution=1,
        ploidy=2)

    # Contigs excluded by default.
    excluded_contigs = [
        # The two canonical names for the contig representing the human
        # mitochondrial sequence.
        'chrM',
        'MT',
        # From hs37d5.
        # (ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/README_human_reference_20110707)  # pylint:disable=line-too-long
        'GL000207.1',
        'GL000226.1',
        'GL000229.1',
        'GL000231.1',
        'GL000210.1',
        'GL000239.1',
        'GL000235.1',
        'GL000201.1',
        'GL000247.1',
        'GL000245.1',
        'GL000197.1',
        'GL000203.1',
        'GL000246.1',
        'GL000249.1',
        'GL000196.1',
        'GL000248.1',
        'GL000244.1',
        'GL000238.1',
        'GL000202.1',
        'GL000234.1',
        'GL000232.1',
        'GL000206.1',
        'GL000240.1',
        'GL000236.1',
        'GL000241.1',
        'GL000243.1',
        'GL000242.1',
        'GL000230.1',
        'GL000237.1',
        'GL000233.1',
        'GL000204.1',
        'GL000198.1',
        'GL000208.1',
        'GL000191.1',
        'GL000227.1',
        'GL000228.1',
        'GL000214.1',
        'GL000221.1',
        'GL000209.1',
        'GL000218.1',
        'GL000220.1',
        'GL000213.1',
        'GL000211.1',
        'GL000199.1',
        'GL000217.1',
        'GL000216.1',
        'GL000215.1',
        'GL000205.1',
        'GL000219.1',
        'GL000224.1',
        'GL000223.1',
        'GL000195.1',
        'GL000212.1',
        'GL000222.1',
        'GL000200.1',
        'GL000193.1',
        'GL000194.1',
        'GL000225.1',
        'GL000192.1',
        'NC_007605',
        'hs37d5',
    ]

    options = deepvariant_pb2.DeepVariantOptions(
        exclude_contigs=excluded_contigs,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=609314161,
        # # Not specified by default: calling_regions = 3;
        read_requirements=requirements,
        allele_counter_options=counter_options,
        variant_caller_options=caller_options,
        pic_options=image_options,
        n_cores=1,
        task_id=0,
        num_shards=0,
        min_shared_contigs_basepairs=0.9,
    )

    # Without flags, the defaults built above are the complete result.
    if not add_flags:
        return options

    if flags.mode == 'training':
        options.mode = deepvariant_pb2.DeepVariantOptions.TRAINING
    elif flags.mode == 'calling':
        options.mode = deepvariant_pb2.DeepVariantOptions.CALLING
    else:
        raise ValueError('Unexpected mode', flags.mode)

    # Optional file inputs are copied only when the corresponding flag is set.
    for flag_name, option_field in (
        ('ref', 'reference_filename'),
        ('reads', 'reads_filename'),
        ('confident_regions', 'confident_regions_filename'),
        ('truth_variants', 'truth_variants_filename'),
    ):
        flag_value = getattr(flags, flag_name)
        if flag_value:
            setattr(options, option_field, flag_value)

    if flags.downsample_fraction != NO_DOWNSAMPLING:
        options.downsample_fraction = flags.downsample_fraction

    if flags.multi_allelic_mode:
        image_modes = {
            'include_het_alt_images':
            deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
            'exclude_het_alt_images':
            deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
        }
        options.pic_options.multi_allelic_mode = (
            image_modes[flags.multi_allelic_mode])

    if flags.pileup_image_height:
        options.pic_options.height = flags.pileup_image_height
    if flags.pileup_image_width:
        options.pic_options.width = flags.pileup_image_width

    shard_count, examples_path, candidates_path, gvcf_path = (
        io_utils.resolve_filespecs(
            flags.task, flags.examples or '', flags.candidates or '',
            flags.gvcf or ''))
    options.examples_filename = examples_path
    options.candidates_filename = candidates_path
    options.gvcf_filename = gvcf_path

    # redacted
    # A string-valued regions flag is split on whitespace; any other value is
    # passed to extend() as is.
    requested_regions = flags.regions
    if isinstance(requested_regions, str):
        requested_regions = requested_regions.split()
    options.calling_regions.extend(requested_regions)

    options.task_id = flags.task
    if shard_count is None:
        options.num_shards = 0
    else:
        options.num_shards = shard_count

    # The realigner fields are only touched when realignment is requested.
    if flags.realign_reads:
        options.realigner_enabled = True
        options.realigner_options.CopyFrom(realigner.realigner_config(flags))

    options.max_reads_per_partition = flags.max_reads_per_partition

    in_training = options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING
    if in_training and flags.training_random_emit_ref_sites != NO_RANDOM_REF:
        options.variant_caller_options.fraction_reference_sites_to_emit = (
            flags.training_random_emit_ref_sites)

    return options