Example #1
0
def default_options(read_requirements=None):
    """Builds a PileupImageOptions proto filled in with default settings.

    Args:
      read_requirements: optional ReadRequirements to embed in the returned
        options. When falsy, a new one is built from DEFAULT_MIN_BASE_QUALITY
        and DEFAULT_MIN_MAPPING_QUALITY with the ENFORCED_BY_CLIENT
        base-quality mode.

    Returns:
      deepvariant_pb2.PileupImageOptions.
    """
    requirements = read_requirements or core_pb2.ReadRequirements(
        min_base_quality=DEFAULT_MIN_BASE_QUALITY,
        min_mapping_quality=DEFAULT_MIN_MAPPING_QUALITY,
        min_base_quality_mode=core_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

    option_fields = {
        # Image dimensions and the reference band.
        'height': 100,
        'width': 221,
        'reference_band_height': 5,
        'reference_alpha': 0.4,
        'reference_base_quality': 60,
        # Base colour settings.
        'base_color_offset_a_and_g': 40,
        'base_color_offset_t_and_c': 30,
        'base_color_stride': 70,
        'indel_anchoring_base_char': '*',
        # Alpha settings.
        'allele_supporting_read_alpha': 1.0,
        'allele_unsupporting_read_alpha': 0.6,
        'reference_matching_read_alpha': 0.2,
        'reference_mismatching_read_alpha': 1.0,
        # Strand colours.
        'positive_strand_color': 70,
        'negative_strand_color': 240,
        # Quality caps.
        'base_quality_cap': 40,
        'mapping_quality_cap': 60,
        # Read handling.
        'read_overlap_buffer_bp': 5,
        'read_requirements': requirements,
        'multi_allelic_mode':
            deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        'random_seed': 2101079370,
    }
    return deepvariant_pb2.PileupImageOptions(**option_fields)
Example #2
0
    def test_realigner_end2end(self):
        """Runs the realigner over each partition of a chr20 region.

        For every partition yielded by `partition(1000)` this checks that the
        realigner returns exactly the reads it was given (compared by
        fragment name), that at least one window was produced, and that each
        window lists the reference sequence among its haplotypes.
        """
        reference = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        realigner_config = realigner.realigner_config(FLAGS)
        aligner = realigner.Realigner(realigner_config, reference)

        whole_span = ranges.RangeSet.from_regions(
            ['chr20:10,000,000-10,009,999'])
        for chunk in whole_span.partition(1000):
            # Collect the reads for this chunk and release the reader before
            # realigning.
            with genomics_io.make_sam_reader(
                    test_utils.CHR20_BAM,
                    core_pb2.ReadRequirements()) as bam:
                reads_before = list(bam.query(chunk))
            assembled, reads_after = aligner.realign_reads(
                reads_before, chunk)

            # Every read we sent in must come back. Comparing the names is
            # stricter than comparing only the number of reads.
            names_before = [read.fragment_name for read in reads_before]
            names_after = [read.fragment_name for read in reads_after]
            self.assertCountEqual(names_before, names_after)

            # At least one window must have been assembled in the chunk.
            self.assertNotEqual(0, len(assembled))

            # The reference sequence is always expected to be one of the
            # haplotypes of each window.
            for win in assembled:
                expected_ref = reference.bases(win.span)
                self.assertIn(expected_ref, set(win.haplotypes))
def default_options(add_flags=True, flags=None):
    """Creates a DeepVariantOptions proto populated with reasonable defaults.

    The proto is first built from fixed defaults plus the flag values that
    are always consulted (partition_size, sample_name / reads and the vsc_*
    thresholds). When `add_flags` is True the remaining flag-controlled
    fields (mode, input/output filenames, regions, sharding, realigner
    settings, ...) are copied in afterwards.

    Args:
      add_flags: bool. defaults to True. If True, we will push the value of
        certain FLAGS into our options. If False, those option fields are
        left uninitialized.
      flags: object.  If not None, use as the source of flags,
        else use global FLAGS.

    Returns:
      deepvariant_pb2.DeepVariantOptions protobuf.

    Raises:
      ValueError: If we observe invalid flag values, i.e. `flags.mode` is
        neither 'training' nor 'calling', or `flags.multi_allelic_mode` is
        set to an unrecognized value. Only checked when `add_flags` is True.
    """
    if not flags:
        flags = FLAGS

    # The same read requirements are shared by the pileup image options, the
    # allele counter options and the top-level options.
    read_reqs = core_pb2.ReadRequirements(
        min_base_quality=10,
        min_mapping_quality=10,
        min_base_quality_mode=core_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

    pic_options = pileup_image.default_options(read_requirements=read_reqs)

    allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=flags.partition_size, read_requirements=read_reqs)

    # Sample name precedence: explicit flag, then whatever can be extracted
    # from the reads file, then the placeholder for an unknown sample.
    if flags.sample_name:
        sample_name = flags.sample_name
    elif flags.reads:
        sample_name = extract_sample_name_from_reads(flags.reads)
    else:
        sample_name = _UNKNOWN_SAMPLE

    variant_caller_options = deepvariant_pb2.VariantCallerOptions(
        min_count_snps=flags.vsc_min_count_snps,
        min_count_indels=flags.vsc_min_count_indels,
        min_fraction_snps=flags.vsc_min_fraction_snps,
        min_fraction_indels=flags.vsc_min_fraction_indels,
        # Not specified by default: fraction_reference_sites_to_emit,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=1400605801,
        sample_name=sample_name,
        p_error=0.001,
        max_gq=50,
        gq_resolution=1,
        ploidy=2)

    options = deepvariant_pb2.DeepVariantOptions(
        exclude_contigs=[
            # The two canonical names for the contig representing the human
            # mitochondrial sequence.
            'chrM',
            'MT',
            # From hs37d5.
            # (ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/README_human_reference_20110707)  # pylint:disable=line-too-long
            'GL000207.1',
            'GL000226.1',
            'GL000229.1',
            'GL000231.1',
            'GL000210.1',
            'GL000239.1',
            'GL000235.1',
            'GL000201.1',
            'GL000247.1',
            'GL000245.1',
            'GL000197.1',
            'GL000203.1',
            'GL000246.1',
            'GL000249.1',
            'GL000196.1',
            'GL000248.1',
            'GL000244.1',
            'GL000238.1',
            'GL000202.1',
            'GL000234.1',
            'GL000232.1',
            'GL000206.1',
            'GL000240.1',
            'GL000236.1',
            'GL000241.1',
            'GL000243.1',
            'GL000242.1',
            'GL000230.1',
            'GL000237.1',
            'GL000233.1',
            'GL000204.1',
            'GL000198.1',
            'GL000208.1',
            'GL000191.1',
            'GL000227.1',
            'GL000228.1',
            'GL000214.1',
            'GL000221.1',
            'GL000209.1',
            'GL000218.1',
            'GL000220.1',
            'GL000213.1',
            'GL000211.1',
            'GL000199.1',
            'GL000217.1',
            'GL000216.1',
            'GL000215.1',
            'GL000205.1',
            'GL000219.1',
            'GL000224.1',
            'GL000223.1',
            'GL000195.1',
            'GL000212.1',
            'GL000222.1',
            'GL000200.1',
            'GL000193.1',
            'GL000194.1',
            'GL000225.1',
            'GL000192.1',
            'NC_007605',
            'hs37d5',
        ],
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=609314161,
        # # Not specified by default: calling_regions = 3;
        read_requirements=read_reqs,
        allele_counter_options=allele_counter_options,
        variant_caller_options=variant_caller_options,
        pic_options=pic_options,
        n_cores=1,
        task_id=0,
        num_shards=0,
        min_shared_contigs_basepairs=0.9,
    )

    if add_flags:
        if flags.mode == 'training':
            options.mode = deepvariant_pb2.DeepVariantOptions.TRAINING
        elif flags.mode == 'calling':
            options.mode = deepvariant_pb2.DeepVariantOptions.CALLING
        else:
            raise ValueError('Unexpected mode', flags.mode)

        # Filenames are only copied over when the corresponding flag is set.
        if flags.ref:
            options.reference_filename = flags.ref
        if flags.reads:
            options.reads_filename = flags.reads
        if flags.confident_regions:
            options.confident_regions_filename = flags.confident_regions
        if flags.truth_variants:
            options.truth_variants_filename = flags.truth_variants

        if flags.downsample_fraction != NO_DOWNSAMPLING:
            options.downsample_fraction = flags.downsample_fraction

        if flags.multi_allelic_mode:
            multi_allelic_modes = {
                'include_het_alt_images':
                deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
                'exclude_het_alt_images':
                deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
            }
            # Report an unrecognized value as the documented ValueError
            # instead of letting a bare KeyError escape from the lookup.
            if flags.multi_allelic_mode not in multi_allelic_modes:
                raise ValueError('Unexpected multi_allelic_mode',
                                 flags.multi_allelic_mode)
            options.pic_options.multi_allelic_mode = (
                multi_allelic_modes[flags.multi_allelic_mode])

        # A falsy height/width flag keeps the pileup image default.
        if flags.pileup_image_height:
            options.pic_options.height = flags.pileup_image_height
        if flags.pileup_image_width:
            options.pic_options.width = flags.pileup_image_width

        # Unset output filespecs are passed to the resolver as empty strings.
        num_shards, examples, candidates, gvcf = io_utils.resolve_filespecs(
            flags.task, flags.examples or '', flags.candidates or '',
            flags.gvcf or '')
        options.examples_filename = examples
        options.candidates_filename = candidates
        options.gvcf_filename = gvcf

        # The regions flag may be a single whitespace-separated string; split
        # it so each region becomes its own calling_regions entry.
        regions_flag = flags.regions
        if isinstance(regions_flag, str):
            regions_flag = regions_flag.split()
        options.calling_regions.extend(regions_flag)

        options.task_id = flags.task
        # The resolver may report no shard count; store that as 0.
        options.num_shards = 0 if num_shards is None else num_shards

        if flags.realign_reads:
            options.realigner_enabled = True
            options.realigner_options.CopyFrom(
                realigner.realigner_config(flags))

        options.max_reads_per_partition = flags.max_reads_per_partition

        # Random emission of reference sites only applies in training mode.
        if (options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING
                and flags.training_random_emit_ref_sites != NO_RANDOM_REF):
            options.variant_caller_options.fraction_reference_sites_to_emit = (
                flags.training_random_emit_ref_sites)

    return options
Example #4
0
def default_options(add_flags=True, flags_obj=None):
    """Creates a DeepVariantOptions proto populated with reasonable defaults.

    The proto is first built from fixed defaults plus the flag values that
    are always consulted (partition_size, sample_name / reads, the vsc_*
    thresholds and gvcf_gq_binsize). When `add_flags` is True the remaining
    flag-controlled fields (mode, input/output filenames, regions, sharding,
    realigner settings, ...) are copied in afterwards.

    Args:
      add_flags: bool. defaults to True. If True, we will push the value of
        certain FLAGS into our options. If False, those option fields are
        left uninitialized.
      flags_obj: object.  If not None, use as the source of flags,
        else use global FLAGS.

    Returns:
      deepvariant_pb2.DeepVariantOptions protobuf.

    Raises:
      ValueError: If we observe invalid flag values, i.e. `flags_obj.mode`
        is neither 'training' nor 'calling', or
        `flags_obj.multi_allelic_mode` is set to an unrecognized value. Only
        checked when `add_flags` is True.
    """
    if not flags_obj:
        flags_obj = FLAGS

    # The same read requirements are shared by the pileup image options, the
    # allele counter options and the top-level options.
    read_reqs = core_pb2.ReadRequirements(
        min_base_quality=10,
        min_mapping_quality=10,
        min_base_quality_mode=core_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

    pic_options = pileup_image.default_options(read_requirements=read_reqs)

    allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=flags_obj.partition_size, read_requirements=read_reqs)

    # Sample name precedence: explicit flag, then whatever can be extracted
    # from the reads file, then the placeholder for an unknown sample.
    if flags_obj.sample_name:
        sample_name = flags_obj.sample_name
    elif flags_obj.reads:
        # The reader is only needed to extract the sample name, so it is
        # scoped to this lookup by the `with` block.
        with genomics_io.make_sam_reader(flags_obj.reads) as sam_reader:
            sample_name = extract_sample_name_from_sam_reader(sam_reader)
    else:
        sample_name = _UNKNOWN_SAMPLE

    variant_caller_options = deepvariant_pb2.VariantCallerOptions(
        min_count_snps=flags_obj.vsc_min_count_snps,
        min_count_indels=flags_obj.vsc_min_count_indels,
        min_fraction_snps=flags_obj.vsc_min_fraction_snps,
        min_fraction_indels=flags_obj.vsc_min_fraction_indels,
        # Not specified by default: fraction_reference_sites_to_emit,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=1400605801,
        sample_name=sample_name,
        p_error=0.001,
        max_gq=50,
        gq_resolution=flags_obj.gvcf_gq_binsize,
        ploidy=2)

    options = deepvariant_pb2.DeepVariantOptions(
        exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=609314161,
        # # Not specified by default: calling_regions = 3;
        read_requirements=read_reqs,
        allele_counter_options=allele_counter_options,
        variant_caller_options=variant_caller_options,
        pic_options=pic_options,
        n_cores=1,
        task_id=0,
        num_shards=0,
        min_shared_contigs_basepairs=0.9,
    )

    if add_flags:
        if flags_obj.mode == 'training':
            options.mode = deepvariant_pb2.DeepVariantOptions.TRAINING
        elif flags_obj.mode == 'calling':
            options.mode = deepvariant_pb2.DeepVariantOptions.CALLING
        else:
            raise ValueError('Unexpected mode', flags_obj.mode)

        # Filenames are only copied over when the corresponding flag is set.
        if flags_obj.ref:
            options.reference_filename = flags_obj.ref
        if flags_obj.reads:
            options.reads_filename = flags_obj.reads
        if flags_obj.confident_regions:
            options.confident_regions_filename = flags_obj.confident_regions
        if flags_obj.truth_variants:
            options.truth_variants_filename = flags_obj.truth_variants

        if flags_obj.downsample_fraction != NO_DOWNSAMPLING:
            options.downsample_fraction = flags_obj.downsample_fraction

        if flags_obj.multi_allelic_mode:
            multi_allelic_modes = {
                'include_het_alt_images':
                deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
                'exclude_het_alt_images':
                deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
            }
            # Report an unrecognized value as the documented ValueError
            # instead of letting a bare KeyError escape from the lookup.
            if flags_obj.multi_allelic_mode not in multi_allelic_modes:
                raise ValueError('Unexpected multi_allelic_mode',
                                 flags_obj.multi_allelic_mode)
            options.pic_options.multi_allelic_mode = (
                multi_allelic_modes[flags_obj.multi_allelic_mode])

        # A falsy height/width flag keeps the pileup image default.
        if flags_obj.pileup_image_height:
            options.pic_options.height = flags_obj.pileup_image_height
        if flags_obj.pileup_image_width:
            options.pic_options.width = flags_obj.pileup_image_width

        # Unset output filespecs are passed to the resolver as empty strings.
        num_shards, examples, candidates, gvcf = io_utils.resolve_filespecs(
            flags_obj.task,
            flags_obj.examples or '',
            flags_obj.candidates or '',
            flags_obj.gvcf or '')
        options.examples_filename = examples
        options.candidates_filename = candidates
        options.gvcf_filename = gvcf

        options.calling_regions.extend(parse_regions_flag(flags_obj.regions))
        options.exclude_calling_regions.extend(
            parse_regions_flag(flags_obj.exclude_regions))

        options.task_id = flags_obj.task
        # The resolver may report no shard count; store that as 0.
        options.num_shards = 0 if num_shards is None else num_shards

        # The realigner configuration is only built when realignment is on.
        options.realigner_enabled = flags_obj.realign_reads
        if options.realigner_enabled:
            options.realigner_options.CopyFrom(
                realigner.realigner_config(flags_obj))

        options.max_reads_per_partition = flags_obj.max_reads_per_partition

        # Random emission of reference sites only applies in training mode.
        if (options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING
                and flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF):
            options.variant_caller_options.fraction_reference_sites_to_emit = (
                flags_obj.training_random_emit_ref_sites)

    return options