def default_options(read_requirements=None):
  """Builds a PileupImageOptions proto filled in with default values.

  Args:
    read_requirements: optional reads_pb2.ReadRequirements to store in the
      returned options. When it is None (or otherwise falsy), a
      ReadRequirements with min_base_quality=10, min_mapping_quality=10 and
      min_base_quality_mode=ENFORCED_BY_CLIENT is used instead.

  Returns:
    A deepvariant_pb2.PileupImageOptions.
  """
  if read_requirements:
    requirements = read_requirements
  else:
    requirements = reads_pb2.ReadRequirements(
        min_base_quality=10,
        min_mapping_quality=10,
        min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

  field_values = dict(
      reference_band_height=5,
      base_color_offset_a_and_g=40,
      base_color_offset_t_and_c=30,
      base_color_stride=70,
      allele_supporting_read_alpha=1.0,
      allele_unsupporting_read_alpha=0.6,
      reference_matching_read_alpha=0.2,
      reference_mismatching_read_alpha=1.0,
      indel_anchoring_base_char='*',
      reference_alpha=0.4,
      reference_base_quality=60,
      positive_strand_color=70,
      negative_strand_color=240,
      base_quality_cap=40,
      mapping_quality_cap=60,
      height=dv_constants.PILEUP_DEFAULT_HEIGHT,
      width=dv_constants.PILEUP_DEFAULT_WIDTH,
      num_channels=dv_constants.PILEUP_NUM_CHANNELS,
      read_overlap_buffer_bp=5,
      read_requirements=requirements,
      multi_allelic_mode=deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=2101079370,
  )
  return deepvariant_pb2.PileupImageOptions(**field_values)
def test_realigner_end2end(self):
  """Realigns reads over chr20:10,000,000-10,009,999 in 1000bp partitions.

  For every partition this checks that the realigner returns exactly the
  reads it was given (compared by fragment name) and that each reported
  window lists the reference sequence among its haplotypes. Across all
  partitions at least one window must have been produced.
  """
  ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
  reads_realigner = realigner.Realigner(
      realigner.realigner_config(FLAGS), ref_reader)
  regions = ranges.RangeSet.from_regions(['chr20:10,000,000-10,009,999'])

  total_windows = 0
  for region in regions.partition(1000):
    with sam.SamReader(
        testdata.CHR20_BAM,
        read_requirements=reads_pb2.ReadRequirements()) as sam_reader:
      in_reads = list(sam_reader.query(region))
    windows, out_reads = reads_realigner.realign_reads(in_reads, region)

    # Comparing names rather than just list lengths ensures that no read was
    # dropped or swapped for another one during realignment.
    names_in = [read.fragment_name for read in in_reads]
    names_out = [read.fragment_name for read in out_reads]
    self.assertCountEqual(names_in, names_out)

    # The reference sequence is always expected to be one of the haplotypes
    # of a window.
    for window in windows:
      self.assertIn(ref_reader.query(window.span), set(window.haplotypes))

    total_windows += len(windows)

  self.assertGreater(total_windows, 0)
def generate_data(vcf_reader, ref_reader, sam_reader, baseline_contig,
                  exclude_contig):
  """Builds a shuffled pandas.DataFrame of AlleleCount features per position.

  The features included are:
    - 'ref_nonconfident_read_count'
    - 'ref_supporting_read_count'
    - 'SUBSTITUTION'
    - 'INSERTION'
    - 'DELETION'
    - 'SOFT_CLIP'
    - 'label'
  They are extracted from the AlleleCount proto at each position of interest.

  Args:
    vcf_reader: a nucleus.io.VcfReader.
    ref_reader: a nucleus.io.IndexedFastaReader.
    sam_reader: a nucleus.io.SamReader.
    baseline_contig: string, contig from which to sample baseline positions.
    exclude_contig: string, contig to exclude for test purposes.

  Returns:
    pandas.DataFrame with one row per retained position, missing values
    replaced by 0, and rows shuffled.
  """
  # Same read filtering parameters as the ones used in make_examples.
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=1,
      read_requirements=reads_pb2.ReadRequirements(
          min_base_quality=10,
          min_mapping_quality=10,
          min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))

  def _features_at(position):
    # Count alleles over the single-base region starting at this position.
    region = ranges.make_range(position.reference_name, position.start,
                               position.start + 1)
    counter = allelecounter.AlleleCounter(ref_reader.c_reader, region,
                                          allele_counter_options)
    return _position_to_features(sam_reader, counter, region, position,
                                 exclude_contig)

  candidate_rows = (
      _features_at(position)
      for position in generate_positions(vcf_reader, ref_reader,
                                         baseline_contig))
  # Positions for which no feature row was produced are skipped.
  records = [row for row in candidate_rows if row is not None]

  return shuffle(pd.DataFrame(records).fillna(0))
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
  """Outputs precision-recall for a sklearn model using AlleleCount features.

  The model is evaluated on the indels of `truth_variants` that fall inside
  `eval_region`, at every threshold in _THRESHOLDS, and the results are
  written as a CSV with the columns 'threshold', 'precision' and 'recall'.

  Args:
    truth_variants: path to the VCF.
    reads: path to the reads BAM.
    ref: path to the reference FASTA.
    input_model_pckl: path to read the LogisticRegression pickle from.
    eval_region: str, region to evaluate on in the 'chr:start-end',
      'chr:position' or 'chr' format.
    output_report_csv: path to the output report csv.

  Raises:
    ValueError: if eval_region cannot be parsed.
  """
  # These parameters are the ones used in make_examples.
  read_reqs = reads_pb2.ReadRequirements(
      min_base_quality=10,
      min_mapping_quality=10,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=1, read_requirements=read_reqs)

  # The readers are opened as context managers so that their underlying file
  # handles are released once evaluation is done, even if it raises.
  with sam.SamReader(reads) as sam_reader:
    with fasta.IndexedFastaReader(ref) as ref_reader:
      # NOTE: joblib.load unpickles the file, which can execute arbitrary
      # code; only point input_model_pckl at trusted model files.
      model = joblib.load(input_model_pckl)

      with vcf.VcfReader(truth_variants) as vcf_reader:
        region = ranges.parse_literal(
            eval_region,
            contig_map=ranges.contigs_dict(ref_reader.header.contigs))
        true_indels = [
            var for var in vcf_reader.query(region)
            if variant_utils.is_indel(var)
        ]

      precisions = compute_precision(model, true_indels, sam_reader,
                                     ref_reader, allele_counter_options,
                                     _THRESHOLDS, region)
      recalls = compute_effective_recall(model, true_indels, sam_reader,
                                         ref_reader, allele_counter_options,
                                         _THRESHOLDS)

  with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
    fieldnames = ['threshold', 'precision', 'recall']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for threshold in _THRESHOLDS:
      writer.writerow({
          'threshold': threshold,
          'precision': precisions[threshold],
          'recall': recalls[threshold]
      })
def _candidates_from_reads(config, ref_reader, reads, region):
  """Computes candidate positions from the alleles observed in `reads`.

  The region is first expanded by config.region_expansion_in_bp, every read
  is added to an AlleleCounter over that expanded region, and the configured
  window selector model is then applied to the resulting counts.

  Args:
    config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
      options determining the behavior of this window selector.
    ref_reader: GenomeReference. Indexed reference genome to query bases.
    reads: list[nucleus.protos.Read]. The reads we are processing into
      candidate positions.
    region: nucleus.protos.Range. The region we are processing.

  Returns:
    A list. The elements are reference positions within region.

  Raises:
    ValueError: if config.window_selector_model.model_type is not one of the
      model types handled here (VARIANT_READS, ALLELE_COUNT_LINEAR).
  """
  read_requirements = reads_pb2.ReadRequirements(
      min_mapping_quality=config.min_mapq,
      min_base_quality=config.min_base_quality)
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      read_requirements=read_requirements,
      keep_legacy_behavior=config.keep_legacy_behavior)

  contig_map = ranges.contigs_dict(ref_reader.header.contigs)
  expanded_region = ranges.expand(
      region, config.region_expansion_in_bp, contig_map=contig_map)

  allele_counter = allelecounter.AlleleCounter(
      ref_reader.c_reader, expanded_region, [], allele_counter_options)
  for read in reads:
    allele_counter.add(read, 'placeholder_sample_id')

  selector_model = config.window_selector_model
  model_type = selector_model.model_type

  if model_type == realigner_pb2.WindowSelectorModel.VARIANT_READS:
    return _variant_reads_threshold_selector(
        allele_counter, selector_model.variant_reads_model, expanded_region)

  if model_type == realigner_pb2.WindowSelectorModel.ALLELE_COUNT_LINEAR:
    return _allele_count_linear_selector(
        allele_counter, selector_model.allele_count_linear_model,
        expanded_region)

  raise ValueError('Unknown enum option "{}" for '
                   'WindowSelectorModel.model_type'.format(model_type))
def test_ignores_reads_with_low_mapping_quality(self, min_base_qual,
                                                min_mapping_qual):
  """Checks that reads below the mapping quality threshold are discarded.

  Scenario: the read 'AAA' (cigar 3M) starts at position 1 and is encoded
  against the reference string 'AACAG' for a chr1 A->C variant at position 2:

    position    0    1    2    3
    read             A    A    A
    variant               C

  Every base of the read has base quality `min_base_qual`. The mapping
  quality of the read takes every value in [0, min_mapping_qual + 5). The
  read must be rejected (encode_read returns None) when its mapping quality
  is below `min_mapping_qual` and kept when it is greater than or equal to
  `min_mapping_qual`.

  Args:
    min_base_qual: Reads are discarded if the base at a variant start
      position does not meet this base quality requirement.
    min_mapping_qual: Reads are discarded if they do not meet this mapping
      quality requirement.
  """
  variant = variants_pb2.Variant(
      reference_name='chr1',
      start=2,
      end=3,
      reference_bases='A',
      alternate_bases=['C'])
  dv_call = deepvariant_pb2.DeepVariantCall(variant=variant)

  pie = _make_encoder(
      read_requirements=reads_pb2.ReadRequirements(
          min_base_quality=min_base_qual,
          min_mapping_quality=min_mapping_qual,
          min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))

  for mapping_qual in range(min_mapping_qual + 5):
    read = test_utils.make_read(
        'AAA',
        start=1,
        cigar='3M',
        quals=[min_base_qual] * 3,
        mapq=mapping_qual)
    actual = pie.encode_read(dv_call, 'AACAG', read, 1, 'C')
    if mapping_qual >= min_mapping_qual:
      self.assertIsNotNone(actual)
    else:
      self.assertIsNone(actual)
def test_keeps_reads_with_low_quality_bases(self, min_base_qual,
                                            min_mapping_qual):
  """Checks that only the base at the variant start gates read retention.

  Scenario: the read 'AAA' (cigar 3M) starts at position 1 and is encoded
  against the reference string 'AACAG' for a chr1 A->C variant at position 2:

    position    0    1    2    3
    read             A    A    A
    variant               C

  The first and third bases of the read get base qualities `base_qual - 1`
  and `base_qual + 1`, for every `base_qual` in [0, min_base_qual + 5). The
  middle base, which sits at the variant start, always has base quality
  exactly `min_base_qual`, and the read's mapping quality is exactly
  `min_mapping_qual`. The read is therefore expected to be kept in every
  iteration.

  Args:
    min_base_qual: Reads are discarded if the base at a variant start
      position does not meet this base quality requirement.
    min_mapping_qual: Reads are discarded if they do not meet this mapping
      quality requirement.
  """
  variant = variants_pb2.Variant(
      reference_name='chr1',
      start=2,
      end=3,
      reference_bases='A',
      alternate_bases=['C'])
  dv_call = deepvariant_pb2.DeepVariantCall(variant=variant)

  pie = _make_encoder(
      read_requirements=reads_pb2.ReadRequirements(
          min_base_quality=min_base_qual,
          min_mapping_quality=min_mapping_qual,
          min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))

  for base_qual in range(min_base_qual + 5):
    flanking_low, flanking_high = base_qual - 1, base_qual + 1
    read = test_utils.make_read(
        'AAA',
        start=1,
        cigar='3M',
        quals=[flanking_low, min_base_qual, flanking_high],
        mapq=min_mapping_qual)
    self.assertIsNotNone(pie.encode_read(dv_call, 'AACAG', read, 1, 'C'))
def shared_flags_to_options(
    add_flags, flags_obj, samples_in_order, sample_role_to_train,
    main_sample_index) -> deepvariant_pb2.MakeExamplesOptions:
  """Creates options from flags that are shared, along with given samples.

  Read filtering, allele counting and pileup image options are always derived
  from `flags_obj`. All remaining flag-driven fields (mode, file names,
  channels, regions, realigner settings, ...) are only filled in when
  `add_flags` is true.

  Args:
    add_flags: bool. If true, flag values are copied into the returned
      options; if false, only the base options are populated.
    flags_obj: object exposing the flag values as attributes.
    samples_in_order: sample options, stored as options.sample_options.
    sample_role_to_train: stored as options.sample_role_to_train.
    main_sample_index: index into options.sample_options of the main sample;
      also stored as options.main_sample_index.

  Returns:
    A deepvariant_pb2.MakeExamplesOptions.

  Raises:
    KeyError: if flags_obj.multi_allelic_mode is set to something other than
      'include_het_alt_images' or 'exclude_het_alt_images'.
    Invalid flag combinations (unknown channel, unknown variant type
    selector, negative --hp_tag_for_assembly_polishing,
    --hp_tag_for_assembly_polishing without --sort_by_haplotypes,
    use_allele_frequency without population_vcfs) are reported through
    errors.log_and_raise with errors.CommandLineError.
  """
  read_reqs = reads_pb2.ReadRequirements(
      keep_duplicates=flags_obj.keep_duplicates,
      keep_supplementary_alignments=flags_obj.keep_supplementary_alignments,
      keep_secondary_alignments=flags_obj.keep_secondary_alignments,
      min_base_quality=flags_obj.min_base_quality,
      min_mapping_quality=flags_obj.min_mapping_quality,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

  logging.vlog(3, 'ReadRequirements are: %s', read_reqs)

  pic_options = pileup_image.default_options(read_requirements=read_reqs)

  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=flags_obj.partition_size,
      read_requirements=read_reqs,
      track_ref_reads=flags_obj.track_ref_reads,
      normalize_reads=flags_obj.normalize_reads,
      keep_legacy_behavior=flags_obj.keep_legacy_allele_counter_behavior)

  options = deepvariant_pb2.MakeExamplesOptions(
      exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=609314161,
      # # Not specified by default: calling_regions = 3;
      read_requirements=read_reqs,
      allele_counter_options=allele_counter_options,
      pic_options=pic_options,
      n_cores=1,
      task_id=0,
      num_shards=0,
      min_shared_contigs_basepairs=0.9,
      sample_options=samples_in_order,
      main_sample_index=main_sample_index,
      sample_role_to_train=sample_role_to_train)

  if add_flags:
    options.mode = make_examples_core.parse_proto_enum_flag(
        deepvariant_pb2.MakeExamplesOptions.Mode, flags_obj.mode.upper())

    options.labeler_algorithm = make_examples_core.parse_proto_enum_flag(
        deepvariant_pb2.MakeExamplesOptions.LabelerAlgorithm,
        flags_obj.labeler_algorithm.upper())

    options.variant_caller = make_examples_core.parse_proto_enum_flag(
        deepvariant_pb2.MakeExamplesOptions.VariantCaller,
        flags_obj.variant_caller.upper())

    if flags_obj.ref:
      options.reference_filename = flags_obj.ref
    if flags_obj.confident_regions:
      options.confident_regions_filename = flags_obj.confident_regions
    if flags_obj.truth_variants:
      options.truth_variants_filename = flags_obj.truth_variants
    if flags_obj.sequencing_type:
      options.pic_options.sequencing_type = (
          make_examples_core.parse_proto_enum_flag(
              deepvariant_pb2.PileupImageOptions.SequencingType,
              flags_obj.sequencing_type))

    if flags_obj.channels:
      channel_set = flags_obj.channels.split(',')
      for channel in channel_set:
        if channel and channel not in dv_constants.OPT_CHANNELS:
          err_msg = 'Channel "{}" is not one of the available opt channels: {}'.format(
              channel, ', '.join(dv_constants.OPT_CHANNELS))
          errors.log_and_raise(err_msg, errors.CommandLineError)
      options.pic_options.channels[:] = channel_set
      # Each requested channel is added on top of the default channel count.
      options.pic_options.num_channels += len(channel_set)

    if flags_obj.multi_allelic_mode:
      multi_allelic_enum = {
          'include_het_alt_images':
              deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
          'exclude_het_alt_images':
              deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
      }[flags_obj.multi_allelic_mode]
      options.pic_options.multi_allelic_mode = multi_allelic_enum

    if flags_obj.pileup_image_width:
      options.pic_options.width = flags_obj.pileup_image_width

    options.pic_options.alt_aligned_pileup = flags_obj.alt_aligned_pileup
    options.pic_options.types_to_alt_align = flags_obj.types_to_alt_align

    if flags_obj.add_supporting_other_alt_color:
      options.pic_options.other_allele_supporting_read_alpha = 0.3

    if flags_obj.select_variant_types:
      options.select_variant_types[:] = flags_obj.select_variant_types.split(
      )
      for svt in options.select_variant_types:
        if svt not in make_examples_core.VARIANT_TYPE_SELECTORS:
          errors.log_and_raise(
              'Select variant type {} not recognized. Allowed values are {}'
              .format(svt,
                      ', '.join(make_examples_core.VARIANT_TYPE_SELECTORS)),
              errors.CommandLineError)

    num_shards, examples, candidates, gvcf, runtime_by_region = (
        sharded_file_utils.resolve_filespecs(
            flags_obj.task,
            flags_obj.examples or '',
            flags_obj.candidates or '',
            flags_obj.gvcf or '',
            flags_obj.runtime_by_region or ''))
    options.examples_filename = examples
    options.candidates_filename = candidates
    options.gvcf_filename = gvcf
    options.include_med_dp = flags_obj.include_med_dp
    options.task_id = flags_obj.task
    options.num_shards = num_shards
    options.runtime_by_region = runtime_by_region

    options.parse_sam_aux_fields = make_examples_core.resolve_sam_aux_fields(
        flags_obj=flags_obj)
    if flags_obj.aux_fields_to_keep:
      options.aux_fields_to_keep[:] = flags_obj.aux_fields_to_keep.split(',')
    else:
      # aux_fields_to_keep is slice-assigned above, i.e. it is a repeated
      # field, and protobuf does not allow assigning a value (such as None)
      # directly to a repeated field. Clear it explicitly instead.
      options.ClearField('aux_fields_to_keep')
    options.use_original_quality_scores = flags_obj.use_original_quality_scores

    if flags_obj.add_hp_channel:
      options.pic_options.num_channels += 1
      options.pic_options.add_hp_channel = True

    if flags_obj.hp_tag_for_assembly_polishing < 0:
      errors.log_and_raise(
          '--hp_tag_for_assembly_polishing has to be set to a positive int.',
          errors.CommandLineError)
    if (flags_obj.hp_tag_for_assembly_polishing > 0 and
        not flags_obj.sort_by_haplotypes):
      errors.log_and_raise(
          '--hp_tag_for_assembly_polishing requires --sort_by_haplotypes to be '
          'set ', errors.CommandLineError)

    options.pic_options.sort_by_haplotypes = flags_obj.sort_by_haplotypes
    options.pic_options.hp_tag_for_assembly_polishing = (
        flags_obj.hp_tag_for_assembly_polishing)

    if flags_obj.write_run_info:
      options.run_info_filename = examples + _RUN_INFO_FILE_EXTENSION

    options.calling_regions.extend(
        make_examples_core.parse_regions_flag(flags_obj.regions))
    options.exclude_calling_regions.extend(
        make_examples_core.parse_regions_flag(flags_obj.exclude_regions))

    options.realigner_enabled = flags_obj.realign_reads
    options.realigner_options.CopyFrom(realigner.realigner_config(flags_obj))

    if (options.mode == deepvariant_pb2.MakeExamplesOptions.TRAINING and
        flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF):
      # Only the main sample's variant caller is asked to emit random
      # reference sites.
      main_sample = options.sample_options[main_sample_index]
      main_sample.variant_caller_options.fraction_reference_sites_to_emit = (
          flags_obj.training_random_emit_ref_sites)

    if flags_obj.use_allele_frequency and not flags_obj.population_vcfs:
      errors.log_and_raise(
          'If use_allele_frequency is set then population_vcfs '
          'must be provided.', errors.CommandLineError)
    if flags_obj.use_allele_frequency:
      options.use_allele_frequency = flags_obj.use_allele_frequency
      options.pic_options.num_channels += 1
      options.pic_options.use_allele_frequency = True
    if flags_obj.population_vcfs:
      # The flag accepts file names separated by commas or single spaces.
      options.population_vcf_filenames.extend(
          re.split(',| ', flags_obj.population_vcfs))
    options.max_reads_per_partition = flags_obj.max_reads_per_partition
    options.use_ref_for_cram = flags_obj.use_ref_for_cram
    options.hts_block_size = flags_obj.hts_block_size
    options.logging_every_n_candidates = flags_obj.logging_every_n_candidates
    options.customized_classes_labeler_classes_list = (
        flags_obj.customized_classes_labeler_classes_list)
    options.customized_classes_labeler_info_field_name = (
        flags_obj.customized_classes_labeler_info_field_name)

  return options
def default_options(add_flags=True, flags_obj=None):
  """Builds a DeepVariantOptions proto filled in with reasonable defaults.

  Args:
    add_flags: bool. defaults to True. If True, the values of certain flags
      are pushed into the options. If False, those option fields are left
      uninitialized.
    flags_obj: object. Source of the flag values; when it is None (or
      otherwise falsy) the global FLAGS is used.

  Returns:
    deepvariant_pb2.DeepVariantOptions protobuf.

  Raises:
    ValueError: If we observe invalid flag values.
  """
  flags_obj = flags_obj if flags_obj else FLAGS

  read_reqs = reads_pb2.ReadRequirements(
      min_base_quality=10,
      min_mapping_quality=10,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
  pic_options = pileup_image.default_options(read_requirements=read_reqs)
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=flags_obj.partition_size, read_requirements=read_reqs)

  # Sample name precedence: explicit flag, then the reads file, then the
  # unknown-sample placeholder.
  if flags_obj.sample_name:
    sample_name = flags_obj.sample_name
  elif flags_obj.reads:
    with sam.SamReader(flags_obj.reads) as sam_reader:
      sample_name = extract_sample_name_from_sam_reader(sam_reader)
  else:
    sample_name = _UNKNOWN_SAMPLE

  variant_caller_options = deepvariant_pb2.VariantCallerOptions(
      min_count_snps=flags_obj.vsc_min_count_snps,
      min_count_indels=flags_obj.vsc_min_count_indels,
      min_fraction_snps=flags_obj.vsc_min_fraction_snps,
      min_fraction_indels=flags_obj.vsc_min_fraction_indels,
      # Not specified by default: fraction_reference_sites_to_emit,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=1400605801,
      sample_name=sample_name,
      p_error=0.001,
      max_gq=50,
      gq_resolution=flags_obj.gvcf_gq_binsize,
      ploidy=2)

  options = deepvariant_pb2.DeepVariantOptions(
      exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=609314161,
      # # Not specified by default: calling_regions = 3;
      read_requirements=read_reqs,
      allele_counter_options=allele_counter_options,
      variant_caller_options=variant_caller_options,
      pic_options=pic_options,
      n_cores=1,
      task_id=0,
      num_shards=0,
      min_shared_contigs_basepairs=0.9,
  )

  if not add_flags:
    return options

  options.mode = parse_proto_enum_flag(
      deepvariant_pb2.DeepVariantOptions.Mode, flags_obj.mode.upper())
  options.labeler_algorithm = parse_proto_enum_flag(
      deepvariant_pb2.DeepVariantOptions.LabelerAlgorithm,
      flags_obj.labeler_algorithm.upper())

  if flags_obj.ref:
    options.reference_filename = flags_obj.ref
  if flags_obj.reads:
    options.reads_filename = flags_obj.reads
  if flags_obj.confident_regions:
    options.confident_regions_filename = flags_obj.confident_regions
  if flags_obj.truth_variants:
    options.truth_variants_filename = flags_obj.truth_variants

  if flags_obj.downsample_fraction != NO_DOWNSAMPLING:
    options.downsample_fraction = flags_obj.downsample_fraction

  if flags_obj.multi_allelic_mode:
    mode_by_flag_value = {
        'include_het_alt_images':
            deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
        'exclude_het_alt_images':
            deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
    }
    options.pic_options.multi_allelic_mode = mode_by_flag_value[
        flags_obj.multi_allelic_mode]

  if flags_obj.pileup_image_height:
    options.pic_options.height = flags_obj.pileup_image_height
  if flags_obj.pileup_image_width:
    options.pic_options.width = flags_obj.pileup_image_width

  num_shards, examples, candidates, gvcf = io_utils.resolve_filespecs(
      flags_obj.task,
      flags_obj.examples or '',
      flags_obj.candidates or '',
      flags_obj.gvcf or '')
  options.examples_filename = examples
  options.candidates_filename = candidates
  options.gvcf_filename = gvcf

  options.calling_regions.extend(parse_regions_flag(flags_obj.regions))
  options.exclude_calling_regions.extend(
      parse_regions_flag(flags_obj.exclude_regions))

  options.task_id = flags_obj.task
  # resolve_filespecs may report no shard count at all; store 0 in that case.
  if num_shards is None:
    options.num_shards = 0
  else:
    options.num_shards = num_shards

  options.realigner_enabled = flags_obj.realign_reads
  if options.realigner_enabled:
    options.realigner_options.CopyFrom(realigner.realigner_config(flags_obj))

  options.max_reads_per_partition = flags_obj.max_reads_per_partition

  in_training_mode = (
      options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING)
  if (in_training_mode and
      flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF):
    options.variant_caller_options.fraction_reference_sites_to_emit = (
        flags_obj.training_random_emit_ref_sites)

  return options