def test_call_from_allele_counter(self):
  """Checks that calls_from_allele_counter yields DeepVariantCall protos.

  Every read overlapping a 1000bp window on chr20 is added to an
  AlleleCounter, which is then passed to a VariantCaller.
  """
  window_bp = 1000
  ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
  sam_reader = sam.SamReader(testdata.CHR20_BAM)
  region = ranges.make_range('chr20', 10000000, 10000000 + window_bp)

  counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=window_bp)
  allele_counter = _allelecounter.AlleleCounter(ref.c_reader, region,
                                                counter_options)

  caller_options = deepvariant_pb2.VariantCallerOptions(
      min_count_snps=2,
      min_count_indels=2,
      min_fraction_snps=0.12,
      min_fraction_indels=0.12,
      sample_name='sample_name',
      p_error=0.001,
      max_gq=50,
      gq_resolution=1,
      ploidy=2)
  caller = variant_calling.VariantCaller(caller_options)

  # Feed every read overlapping the region into the allele counter; the test
  # is meaningless if the query comes back empty.
  reads = list(sam_reader.query(region))
  self.assertNotEmpty(reads)
  for read in reads:
    allele_counter.add(read)

  # Ask the caller for the candidates covering the whole region.
  candidates = caller.calls_from_allele_counter(allele_counter)

  # There must be at least one candidate, and each must be a DeepVariantCall.
  self.assertNotEmpty(candidates)
  for candidate in candidates:
    self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)
def generate_data(vcf_reader, ref_reader, sam_reader, baseline_contig,
                  exclude_contig):
  """Builds a shuffled pandas.DataFrame of per-position AlleleCount features.

  The features included are:
      - 'ref_nonconfident_read_count'
      - 'ref_supporting_read_count'
      - 'SUBSTITUTION'
      - 'INSERTION'
      - 'DELETION'
      - 'SOFT_CLIP'
      - 'label'
  These features are extracted from the AlleleCount proto at the concerned
  position.

  Args:
    vcf_reader: a nucleus.io.VcfReader.
    ref_reader: a nucleus.io.IndexedFastaReader.
    sam_reader: a nucleus.io.SamReader.
    baseline_contig: string, contig from which to sample baseline positions.
    exclude_contig: string, contig to exclude for test purposes.

  Returns:
    pandas.DataFrame object with missing values replaced by 0 and rows
    shuffled.
  """
  # Read filtering mirrors the parameters used in make_examples.
  counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=1,
      read_requirements=reads_pb2.ReadRequirements(
          min_base_quality=10,
          min_mapping_quality=10,
          min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))

  def _features_at(position):
    # Each position gets its own single-base region and a fresh counter.
    site = ranges.make_range(position.reference_name, position.start,
                             position.start + 1)
    counter = allelecounter.AlleleCounter(ref_reader.c_reader, site,
                                          counter_options)
    return _position_to_features(sam_reader, counter, site, position,
                                 exclude_contig)

  candidate_rows = (
      _features_at(position)
      for position in generate_positions(vcf_reader, ref_reader,
                                         baseline_contig))
  # Positions for which no feature row was produced are dropped.
  rows = [row for row in candidate_rows if row is not None]

  return shuffle(pd.DataFrame(rows).fillna(0))
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
  """Outputs precision-recall for a sklearn model using AlleleCount features.

  The truth indels inside eval_region are loaded from the VCF, precision and
  recall are computed for every threshold in _THRESHOLDS, and one CSV row per
  threshold is written to output_report_csv.

  Args:
    truth_variants: path to the VCF.
    reads: path to the reads BAM.
    ref: path to the reference FASTA.
    input_model_pckl: path to read the LogisticRegression pickle from.
    eval_region: str, region to evaluate on in the 'chr:start-end',
      'chr:position' or 'chr' format.
    output_report_csv: path to the output report csv.

  Raises:
    ValueError: if eval_region cannot be parsed.
  """
  read_reqs = reads_pb2.ReadRequirements(
      min_base_quality=10,
      min_mapping_quality=10,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=1, read_requirements=read_reqs)

  # The readers are scoped with `with` so their underlying file handles are
  # released even when model loading, region parsing or evaluation raises.
  with sam.SamReader(reads) as sam_reader:
    with fasta.IndexedFastaReader(ref) as ref_reader:
      # NOTE(security): joblib.load unpickles its input, which can execute
      # arbitrary code. Only pass model files from a trusted source.
      model = joblib.load(input_model_pckl)

      with vcf.VcfReader(truth_variants) as vcf_reader:
        region = ranges.parse_literal(
            eval_region,
            contig_map=ranges.contigs_dict(ref_reader.header.contigs))
        # Materialized as a list so it stays usable after the VCF is closed.
        true_indels = [
            var for var in vcf_reader.query(region)
            if variant_utils.is_indel(var)
        ]

      precisions = compute_precision(model, true_indels, sam_reader,
                                     ref_reader, allele_counter_options,
                                     _THRESHOLDS, region)
      recalls = compute_effective_recall(model, true_indels, sam_reader,
                                         ref_reader, allele_counter_options,
                                         _THRESHOLDS)

  with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
    fieldnames = ['threshold', 'precision', 'recall']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for threshold in _THRESHOLDS:
      writer.writerow({
          'threshold': threshold,
          'precision': precisions[threshold],
          'recall': recalls[threshold]
      })
def test_wrap(self):
  """Checks that the wrapped AlleleCounter reports one count per base.

  Reads overlapping a 100bp window on chr20 are added to the counter, and the
  number of returned counts must equal the window size.
  """
  window_bp = 100
  ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
  sam_reader = sam.SamReader(testdata.CHR20_BAM)
  region = ranges.make_range('chr20', 10000000, 10000000 + window_bp)
  allele_counter = _allelecounter.AlleleCounter(
      ref.c_reader, region,
      deepvariant_pb2.AlleleCounterOptions(partition_size=window_bp))

  # Guard against a vacuous pass: the query has to return some reads.
  overlapping_reads = list(sam_reader.query(region))
  self.assertGreater(len(overlapping_reads), 0)
  for read in overlapping_reads:
    allele_counter.add(read)

  per_base_counts = allele_counter.counts()
  self.assertEqual(len(per_base_counts), window_bp)
def _candidates_from_reads(config, ref_reader, reads, region):
  """Returns a list of candidate positions.

  An AlleleCounter is built over `region` expanded by
  config.region_expansion_in_bp, all reads are added to it, and the selector
  chosen by config.window_selector_model.model_type is run on the result.

  Args:
    config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
      options determining the behavior of this window selector.
    ref_reader: GenomeReference. Indexed reference genome to query bases.
    reads: list[nucleus.protos.Read]. The reads we are processing into
      candidate positions.
    region: nucleus.protos.Range. The region we are processing.

  Returns:
    A list. The elements are reference positions within region.

  Raises:
    ValueError: if config.window_selector_model.model_type is neither
      VARIANT_READS nor ALLELE_COUNT_LINEAR.
  """
  read_requirements = reads_pb2.ReadRequirements(
      min_mapping_quality=config.min_mapq,
      min_base_quality=config.min_base_quality)
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      read_requirements=read_requirements,
      keep_legacy_behavior=config.keep_legacy_behavior)

  contig_map = ranges.contigs_dict(ref_reader.header.contigs)
  expanded_region = ranges.expand(
      region, config.region_expansion_in_bp, contig_map=contig_map)

  allele_counter = allelecounter.AlleleCounter(
      ref_reader.c_reader, expanded_region, [], allele_counter_options)
  for read in reads:
    # Every read is registered under the same fixed sample id string.
    allele_counter.add(read, 'placeholder_sample_id')

  selector_model = config.window_selector_model
  model_type = selector_model.model_type

  if model_type == realigner_pb2.WindowSelectorModel.VARIANT_READS:
    return _variant_reads_threshold_selector(
        allele_counter, selector_model.variant_reads_model, expanded_region)

  if model_type == realigner_pb2.WindowSelectorModel.ALLELE_COUNT_LINEAR:
    return _allele_count_linear_selector(
        allele_counter, selector_model.allele_count_linear_model,
        expanded_region)

  raise ValueError('Unknown enum option "{}" for '
                   'WindowSelectorModel.model_type'.format(model_type))
def shared_flags_to_options(
    add_flags, flags_obj, samples_in_order, sample_role_to_train,
    main_sample_index) -> deepvariant_pb2.MakeExamplesOptions:
  """Creates options from flags that are shared, along with given samples.

  Read requirements, pileup image options and allele counter options are
  always derived from flags_obj. The remaining flag-driven fields are only
  filled in when add_flags is true.

  Args:
    add_flags: bool. If True, flag values are copied into the returned options
      and validated. If False, only the base options are populated.
    flags_obj: object exposing the command-line flags as attributes.
    samples_in_order: stored as the `sample_options` of the returned proto.
    sample_role_to_train: stored as `sample_role_to_train` of the returned
      proto.
    main_sample_index: stored as `main_sample_index`; also indexes
      `options.sample_options` when setting fraction_reference_sites_to_emit
      in training mode.

  Returns:
    deepvariant_pb2.MakeExamplesOptions protobuf.

  Raises:
    Invalid flag combinations are reported through errors.log_and_raise with
    errors.CommandLineError.
  """
  read_reqs = reads_pb2.ReadRequirements(
      keep_duplicates=flags_obj.keep_duplicates,
      keep_supplementary_alignments=flags_obj.keep_supplementary_alignments,
      keep_secondary_alignments=flags_obj.keep_secondary_alignments,
      min_base_quality=flags_obj.min_base_quality,
      min_mapping_quality=flags_obj.min_mapping_quality,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

  logging.vlog(3, 'ReadRequirements are: %s', read_reqs)

  pic_options = pileup_image.default_options(read_requirements=read_reqs)

  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=flags_obj.partition_size,
      read_requirements=read_reqs,
      track_ref_reads=flags_obj.track_ref_reads,
      normalize_reads=flags_obj.normalize_reads,
      keep_legacy_behavior=flags_obj.keep_legacy_allele_counter_behavior)

  options = deepvariant_pb2.MakeExamplesOptions(
      exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=609314161,
      # # Not specified by default: calling_regions = 3;
      read_requirements=read_reqs,
      allele_counter_options=allele_counter_options,
      pic_options=pic_options,
      n_cores=1,
      task_id=0,
      num_shards=0,
      min_shared_contigs_basepairs=0.9,
      sample_options=samples_in_order,
      main_sample_index=main_sample_index,
      sample_role_to_train=sample_role_to_train)

  if not add_flags:
    return options

  options.mode = make_examples_core.parse_proto_enum_flag(
      deepvariant_pb2.MakeExamplesOptions.Mode, flags_obj.mode.upper())

  options.labeler_algorithm = make_examples_core.parse_proto_enum_flag(
      deepvariant_pb2.MakeExamplesOptions.LabelerAlgorithm,
      flags_obj.labeler_algorithm.upper())

  options.variant_caller = make_examples_core.parse_proto_enum_flag(
      deepvariant_pb2.MakeExamplesOptions.VariantCaller,
      flags_obj.variant_caller.upper())

  if flags_obj.ref:
    options.reference_filename = flags_obj.ref
  if flags_obj.confident_regions:
    options.confident_regions_filename = flags_obj.confident_regions
  if flags_obj.truth_variants:
    options.truth_variants_filename = flags_obj.truth_variants
  if flags_obj.sequencing_type:
    options.pic_options.sequencing_type = (
        make_examples_core.parse_proto_enum_flag(
            deepvariant_pb2.PileupImageOptions.SequencingType,
            flags_obj.sequencing_type))

  if flags_obj.channels:
    channel_set = flags_obj.channels.split(',')
    # NOTE(review): empty entries (e.g. from a trailing comma) skip this
    # validation but are still stored and counted below -- confirm intended.
    for channel in channel_set:
      if channel and channel not in dv_constants.OPT_CHANNELS:
        err_msg = 'Channel "{}" is not one of the available opt channels: {}'.format(
            channel, ', '.join(dv_constants.OPT_CHANNELS))
        errors.log_and_raise(err_msg, errors.CommandLineError)
    options.pic_options.channels[:] = channel_set
    options.pic_options.num_channels += len(channel_set)

  if flags_obj.multi_allelic_mode:
    multi_allelic_enum = {
        'include_het_alt_images':
            deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
        'exclude_het_alt_images':
            deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
    }[flags_obj.multi_allelic_mode]
    options.pic_options.multi_allelic_mode = multi_allelic_enum

  if flags_obj.pileup_image_width:
    options.pic_options.width = flags_obj.pileup_image_width

  options.pic_options.alt_aligned_pileup = flags_obj.alt_aligned_pileup
  options.pic_options.types_to_alt_align = flags_obj.types_to_alt_align

  if flags_obj.add_supporting_other_alt_color:
    options.pic_options.other_allele_supporting_read_alpha = 0.3

  if flags_obj.select_variant_types:
    options.select_variant_types[:] = flags_obj.select_variant_types.split()
    for svt in options.select_variant_types:
      if svt not in make_examples_core.VARIANT_TYPE_SELECTORS:
        errors.log_and_raise(
            'Select variant type {} not recognized. Allowed values are {}'
            .format(svt, ', '.join(make_examples_core.VARIANT_TYPE_SELECTORS)),
            errors.CommandLineError)

  num_shards, examples, candidates, gvcf, runtime_by_region = (
      sharded_file_utils.resolve_filespecs(flags_obj.task,
                                           flags_obj.examples or '',
                                           flags_obj.candidates or '',
                                           flags_obj.gvcf or '',
                                           flags_obj.runtime_by_region or ''))
  options.examples_filename = examples
  options.candidates_filename = candidates
  options.gvcf_filename = gvcf
  options.include_med_dp = flags_obj.include_med_dp
  options.task_id = flags_obj.task
  options.num_shards = num_shards
  options.runtime_by_region = runtime_by_region

  options.parse_sam_aux_fields = make_examples_core.resolve_sam_aux_fields(
      flags_obj=flags_obj)
  if flags_obj.aux_fields_to_keep:
    options.aux_fields_to_keep[:] = flags_obj.aux_fields_to_keep.split(',')
  else:
    # aux_fields_to_keep is a repeated field (it is slice-assigned above), and
    # protobuf rejects direct assignment to repeated fields, so the previous
    # `= None` raised instead of leaving the list empty. Clear it explicitly.
    options.ClearField('aux_fields_to_keep')
  options.use_original_quality_scores = flags_obj.use_original_quality_scores

  if flags_obj.add_hp_channel:
    options.pic_options.num_channels += 1
    options.pic_options.add_hp_channel = True

  # Negative values are rejected; zero passes both checks below.
  if flags_obj.hp_tag_for_assembly_polishing < 0:
    errors.log_and_raise(
        '--hp_tag_for_assembly_polishing has to be set to a positive int.',
        errors.CommandLineError)
  if (flags_obj.hp_tag_for_assembly_polishing > 0 and
      not flags_obj.sort_by_haplotypes):
    errors.log_and_raise(
        '--hp_tag_for_assembly_polishing requires --sort_by_haplotypes to be '
        'set ', errors.CommandLineError)

  options.pic_options.sort_by_haplotypes = flags_obj.sort_by_haplotypes
  options.pic_options.hp_tag_for_assembly_polishing = (
      flags_obj.hp_tag_for_assembly_polishing)

  if flags_obj.write_run_info:
    options.run_info_filename = examples + _RUN_INFO_FILE_EXTENSION

  options.calling_regions.extend(
      make_examples_core.parse_regions_flag(flags_obj.regions))
  options.exclude_calling_regions.extend(
      make_examples_core.parse_regions_flag(flags_obj.exclude_regions))

  options.realigner_enabled = flags_obj.realign_reads
  options.realigner_options.CopyFrom(realigner.realigner_config(flags_obj))

  if (options.mode == deepvariant_pb2.MakeExamplesOptions.TRAINING and
      flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF):
    main_sample = options.sample_options[main_sample_index]
    main_sample.variant_caller_options.fraction_reference_sites_to_emit = (
        flags_obj.training_random_emit_ref_sites)

  if flags_obj.use_allele_frequency and not flags_obj.population_vcfs:
    errors.log_and_raise(
        'If use_allele_frequency is set then population_vcfs '
        'must be provided.', errors.CommandLineError)
  if flags_obj.use_allele_frequency:
    options.use_allele_frequency = flags_obj.use_allele_frequency
    options.pic_options.num_channels += 1
    options.pic_options.use_allele_frequency = True
  if flags_obj.population_vcfs:
    # The flag may separate file names with commas or with single spaces.
    options.population_vcf_filenames.extend(
        re.split(',| ', flags_obj.population_vcfs))

  options.max_reads_per_partition = flags_obj.max_reads_per_partition
  options.use_ref_for_cram = flags_obj.use_ref_for_cram
  options.hts_block_size = flags_obj.hts_block_size
  options.logging_every_n_candidates = flags_obj.logging_every_n_candidates
  options.customized_classes_labeler_classes_list = (
      flags_obj.customized_classes_labeler_classes_list)
  options.customized_classes_labeler_info_field_name = (
      flags_obj.customized_classes_labeler_info_field_name)

  return options
def default_options(add_flags=True, flags_obj=None):
  """Builds a DeepVariantOptions proto populated with reasonable defaults.

  Args:
    add_flags: bool. defaults to True. If True, we will push the value of
      certain FLAGS into our options. If False, those option fields are left
      uninitialized.
    flags_obj: object. If not None, use as the source of flags, else use global
      FLAGS.

  Returns:
    deepvariant_pb2.DeepVariantOptions protobuf.

  Raises:
    ValueError: If we observe invalid flag values.
  """
  flags_obj = flags_obj or FLAGS

  read_reqs = reads_pb2.ReadRequirements(
      min_base_quality=10,
      min_mapping_quality=10,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
  pic_options = pileup_image.default_options(read_requirements=read_reqs)
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=flags_obj.partition_size, read_requirements=read_reqs)

  # Sample name precedence: explicit flag, then the reads file, then the
  # unknown-sample placeholder.
  sample_name = flags_obj.sample_name
  if not sample_name:
    if flags_obj.reads:
      with sam.SamReader(flags_obj.reads) as sam_reader:
        sample_name = extract_sample_name_from_sam_reader(sam_reader)
    else:
      sample_name = _UNKNOWN_SAMPLE

  variant_caller_options = deepvariant_pb2.VariantCallerOptions(
      min_count_snps=flags_obj.vsc_min_count_snps,
      min_count_indels=flags_obj.vsc_min_count_indels,
      min_fraction_snps=flags_obj.vsc_min_fraction_snps,
      min_fraction_indels=flags_obj.vsc_min_fraction_indels,
      # Not specified by default: fraction_reference_sites_to_emit,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=1400605801,
      sample_name=sample_name,
      p_error=0.001,
      max_gq=50,
      gq_resolution=flags_obj.gvcf_gq_binsize,
      ploidy=2)

  options = deepvariant_pb2.DeepVariantOptions(
      exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=609314161,
      # # Not specified by default: calling_regions = 3;
      read_requirements=read_reqs,
      allele_counter_options=allele_counter_options,
      variant_caller_options=variant_caller_options,
      pic_options=pic_options,
      n_cores=1,
      task_id=0,
      num_shards=0,
      min_shared_contigs_basepairs=0.9,
  )

  if not add_flags:
    return options

  options.mode = parse_proto_enum_flag(deepvariant_pb2.DeepVariantOptions.Mode,
                                       flags_obj.mode.upper())
  options.labeler_algorithm = parse_proto_enum_flag(
      deepvariant_pb2.DeepVariantOptions.LabelerAlgorithm,
      flags_obj.labeler_algorithm.upper())

  # Optional path flags are copied only when they hold a truthy value.
  optional_path_flags = (
      ('ref', 'reference_filename'),
      ('reads', 'reads_filename'),
      ('confident_regions', 'confident_regions_filename'),
      ('truth_variants', 'truth_variants_filename'),
  )
  for flag_name, field_name in optional_path_flags:
    flag_value = getattr(flags_obj, flag_name)
    if flag_value:
      setattr(options, field_name, flag_value)

  if flags_obj.downsample_fraction != NO_DOWNSAMPLING:
    options.downsample_fraction = flags_obj.downsample_fraction

  if flags_obj.multi_allelic_mode:
    multi_allelic_modes = {
        'include_het_alt_images':
            deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
        'exclude_het_alt_images':
            deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
    }
    options.pic_options.multi_allelic_mode = multi_allelic_modes[
        flags_obj.multi_allelic_mode]

  if flags_obj.pileup_image_height:
    options.pic_options.height = flags_obj.pileup_image_height
  if flags_obj.pileup_image_width:
    options.pic_options.width = flags_obj.pileup_image_width

  resolved = io_utils.resolve_filespecs(flags_obj.task,
                                        flags_obj.examples or '',
                                        flags_obj.candidates or '',
                                        flags_obj.gvcf or '')
  num_shards, examples, candidates, gvcf = resolved
  options.examples_filename = examples
  options.candidates_filename = candidates
  options.gvcf_filename = gvcf

  options.calling_regions.extend(parse_regions_flag(flags_obj.regions))
  options.exclude_calling_regions.extend(
      parse_regions_flag(flags_obj.exclude_regions))

  options.task_id = flags_obj.task
  # resolve_filespecs may hand back None for the shard count; store 0 then.
  options.num_shards = num_shards if num_shards is not None else 0

  options.realigner_enabled = flags_obj.realign_reads
  if options.realigner_enabled:
    options.realigner_options.CopyFrom(realigner.realigner_config(flags_obj))

  options.max_reads_per_partition = flags_obj.max_reads_per_partition

  in_training_mode = (
      options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING)
  if (in_training_mode and
      flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF):
    options.variant_caller_options.fraction_reference_sites_to_emit = (
        flags_obj.training_random_emit_ref_sites)

  return options
def default_options(add_flags=True, flags=None):
  """Builds a DeepVariantOptions proto populated with reasonable defaults.

  Args:
    add_flags: bool. defaults to True. If True, we will push the value of
      certain FLAGS into our options. If False, those option fields are left
      uninitialized.
    flags: object. If not None, use as the source of flags, else use global
      FLAGS.

  Returns:
    deepvariant_pb2.DeepVariantOptions protobuf.

  Raises:
    ValueError: If we observe invalid flag values.
  """
  flags = flags or FLAGS

  read_reqs = core_pb2.ReadRequirements(
      min_base_quality=10,
      min_mapping_quality=10,
      min_base_quality_mode=core_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
  pic_options = pileup_image.default_options(read_requirements=read_reqs)
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=flags.partition_size, read_requirements=read_reqs)

  # Sample name precedence: explicit flag, then the reads file, then the
  # unknown-sample placeholder.
  sample_name = flags.sample_name
  if not sample_name:
    if flags.reads:
      sample_name = extract_sample_name_from_reads(flags.reads)
    else:
      sample_name = _UNKNOWN_SAMPLE

  variant_caller_options = deepvariant_pb2.VariantCallerOptions(
      min_count_snps=flags.vsc_min_count_snps,
      min_count_indels=flags.vsc_min_count_indels,
      min_fraction_snps=flags.vsc_min_fraction_snps,
      min_fraction_indels=flags.vsc_min_fraction_indels,
      # Not specified by default: fraction_reference_sites_to_emit,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=1400605801,
      sample_name=sample_name,
      p_error=0.001,
      max_gq=50,
      gq_resolution=1,
      ploidy=2)

  excluded_contigs = [
      # The two canonical names for the contig representing the human
      # mitochondrial sequence.
      'chrM', 'MT',
      # From hs37d5.
      # (ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/README_human_reference_20110707)  # pylint:disable=line-too-long
      'GL000207.1', 'GL000226.1', 'GL000229.1', 'GL000231.1',
      'GL000210.1', 'GL000239.1', 'GL000235.1', 'GL000201.1',
      'GL000247.1', 'GL000245.1', 'GL000197.1', 'GL000203.1',
      'GL000246.1', 'GL000249.1', 'GL000196.1', 'GL000248.1',
      'GL000244.1', 'GL000238.1', 'GL000202.1', 'GL000234.1',
      'GL000232.1', 'GL000206.1', 'GL000240.1', 'GL000236.1',
      'GL000241.1', 'GL000243.1', 'GL000242.1', 'GL000230.1',
      'GL000237.1', 'GL000233.1', 'GL000204.1', 'GL000198.1',
      'GL000208.1', 'GL000191.1', 'GL000227.1', 'GL000228.1',
      'GL000214.1', 'GL000221.1', 'GL000209.1', 'GL000218.1',
      'GL000220.1', 'GL000213.1', 'GL000211.1', 'GL000199.1',
      'GL000217.1', 'GL000216.1', 'GL000215.1', 'GL000205.1',
      'GL000219.1', 'GL000224.1', 'GL000223.1', 'GL000195.1',
      'GL000212.1', 'GL000222.1', 'GL000200.1', 'GL000193.1',
      'GL000194.1', 'GL000225.1', 'GL000192.1',
      'NC_007605',
      'hs37d5',
  ]

  options = deepvariant_pb2.DeepVariantOptions(
      exclude_contigs=excluded_contigs,
      # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
      random_seed=609314161,
      # # Not specified by default: calling_regions = 3;
      read_requirements=read_reqs,
      allele_counter_options=allele_counter_options,
      variant_caller_options=variant_caller_options,
      pic_options=pic_options,
      n_cores=1,
      task_id=0,
      num_shards=0,
      min_shared_contigs_basepairs=0.9,
  )

  if not add_flags:
    return options

  # Only these two mode strings are accepted; anything else is an error.
  modes_by_name = {
      'training': deepvariant_pb2.DeepVariantOptions.TRAINING,
      'calling': deepvariant_pb2.DeepVariantOptions.CALLING,
  }
  if flags.mode not in modes_by_name:
    raise ValueError('Unexpected mode', flags.mode)
  options.mode = modes_by_name[flags.mode]

  # Optional path flags are copied only when they hold a truthy value.
  optional_path_flags = (
      ('ref', 'reference_filename'),
      ('reads', 'reads_filename'),
      ('confident_regions', 'confident_regions_filename'),
      ('truth_variants', 'truth_variants_filename'),
  )
  for flag_name, field_name in optional_path_flags:
    flag_value = getattr(flags, flag_name)
    if flag_value:
      setattr(options, field_name, flag_value)

  if flags.downsample_fraction != NO_DOWNSAMPLING:
    options.downsample_fraction = flags.downsample_fraction

  if flags.multi_allelic_mode:
    multi_allelic_modes = {
        'include_het_alt_images':
            deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
        'exclude_het_alt_images':
            deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
    }
    options.pic_options.multi_allelic_mode = multi_allelic_modes[
        flags.multi_allelic_mode]

  if flags.pileup_image_height:
    options.pic_options.height = flags.pileup_image_height
  if flags.pileup_image_width:
    options.pic_options.width = flags.pileup_image_width

  resolved = io_utils.resolve_filespecs(flags.task, flags.examples or '',
                                        flags.candidates or '',
                                        flags.gvcf or '')
  num_shards, examples, candidates, gvcf = resolved
  options.examples_filename = examples
  options.candidates_filename = candidates
  options.gvcf_filename = gvcf

  # A string-valued regions flag is split on whitespace; any other value is
  # passed to extend() as-is.
  regions = flags.regions
  options.calling_regions.extend(
      regions.split() if isinstance(regions, str) else regions)

  options.task_id = flags.task
  # resolve_filespecs may hand back None for the shard count; store 0 then.
  options.num_shards = num_shards if num_shards is not None else 0

  if flags.realign_reads:
    options.realigner_enabled = True
    options.realigner_options.CopyFrom(realigner.realigner_config(flags))

  options.max_reads_per_partition = flags.max_reads_per_partition

  in_training_mode = (
      options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING)
  if (in_training_mode and
      flags.training_random_emit_ref_sites != NO_RANDOM_REF):
    options.variant_caller_options.fraction_reference_sites_to_emit = (
        flags.training_random_emit_ref_sites)

  return options