def main(args):
    hl.init(default_reference="GRCh38", log="/qc_annotations.log")

    if args.compute_info:
        compute_info().write(get_info(split=False).path, overwrite=args.overwrite)

    if args.split_info:
        split_info().write(get_info(split=True).path, overwrite=args.overwrite)

    if args.export_info_vcf:
        info_ht = get_info(split=False).ht()
        hl.export_vcf(ht_to_vcf_mt(info_ht), info_vcf_path())

    if args.generate_allele_data:
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
        generate_allele_data(mt.rows()).write(
            allele_data.path, overwrite=args.overwrite
        )

    if args.generate_ac:
        # TODO: compute AC and qc_AC as part of compute_info
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
        ht = generate_ac(mt).checkpoint(
            "gs://gnomad-tmp/ac_tmp.ht",
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        ht.repartition(10000, shuffle=False).write(
            qc_ac.path, overwrite=args.overwrite
        )

    if args.generate_fam_stats:
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
        fam_stats_ht = generate_fam_stats(mt, trios.path)
        fam_stats_ht = fam_stats_ht.checkpoint(
            "gs://gnomad-tmp/fam_stats_tmp.ht",
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        fam_stats_ht = fam_stats_ht.repartition(10000, shuffle=False)
        fam_stats_ht.write(fam_stats.path, overwrite=args.overwrite)

    if args.export_transmitted_singletons_vcf:
        export_transmitted_singletons_vcf()

    if args.vep:
        run_vep(vep_version=args.vep_version).write(vep.path, overwrite=args.overwrite)
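# A hedged sketch of the argument parser implied by `main` above: the flag
# names mirror the attributes read from `args` (argparse converts dashes to
# underscores), but the description, help strings, and defaults are
# assumptions rather than the original script's parser.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Generate variant QC annotations.")
    parser.add_argument("--compute-info", action="store_true")
    parser.add_argument("--split-info", action="store_true")
    parser.add_argument("--export-info-vcf", action="store_true")
    parser.add_argument("--generate-allele-data", action="store_true")
    parser.add_argument("--generate-ac", action="store_true")
    parser.add_argument("--generate-fam-stats", action="store_true")
    parser.add_argument("--export-transmitted-singletons-vcf", action="store_true")
    parser.add_argument("--vep", action="store_true")
    parser.add_argument("--vep-version", help="VEP version to run (assumed string flag).")
    parser.add_argument("--overwrite", action="store_true")
    main(parser.parse_args())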
def split_info() -> hl.Table:
    """
    Generate an info Table with multi-allelic sites split from the multi-allelic info Table.

    :return: Info Table with split multi-allelics
    :rtype: Table
    """
    info_ht = get_info(split=False).ht()

    # Create split version
    info_ht = hl.split_multi(info_ht)

    info_ht = info_ht.annotate(
        info=info_ht.info.annotate(
            **split_info_annotation(info_ht.info, info_ht.a_index),
        ),
        AS_lowqual=split_lowqual_annotation(info_ht.AS_lowqual, info_ht.a_index),
    )
    return info_ht
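# A minimal, self-contained sketch of what the split above does, using a toy
# Table with one hypothetical allele-specific array field (AS_QD).
# hl.split_multi turns each multi-allelic row into one bi-allelic row per alt
# allele and adds `a_index`/`was_split`; split_info_annotation then keeps only
# the a_index-th element of each AS_* array, which the manual indexing below
# mimics.
import hail as hl

toy_ht = hl.Table.parallelize(
    [
        {
            "locus": hl.Locus("chr1", 10000, reference_genome="GRCh38"),
            "alleles": ["A", "C", "T"],
            "info": hl.Struct(AS_QD=[12.3, 4.5]),
        }
    ],
    hl.tstruct(
        locus=hl.tlocus("GRCh38"),
        alleles=hl.tarray(hl.tstr),
        info=hl.tstruct(AS_QD=hl.tarray(hl.tfloat64)),
    ),
    key=["locus", "alleles"],
)
split_ht = hl.split_multi(toy_ht)
split_ht = split_ht.annotate(
    info=split_ht.info.annotate(AS_QD=split_ht.info.AS_QD[split_ht.a_index - 1])
)
split_ht.show()  # two bi-allelic rows: AS_QD=12.3 for A->C, 4.5 for A->T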
def create_rf_ht(
    impute_features: bool = True,
    adj: bool = False,
    n_partitions: int = 5000,
    checkpoint_path: Optional[str] = None,
) -> hl.Table:
    """
    Create a Table with all necessary annotations for the random forest model.

    Annotations that are included:

        Features for RF:
            - InbreedingCoeff
            - variant_type
            - allele_type
            - n_alt_alleles
            - has_star
            - AS_QD
            - AS_pab_max
            - AS_MQRankSum
            - AS_SOR
            - AS_ReadPosRankSum

        Training sites (bool):
            - transmitted_singleton
            - fail_hard_filters - (ht.QD < 2) | (ht.FS > 60) | (ht.MQ < 30)

    :param bool impute_features: Whether to impute features using feature medians (this is done by variant type)
    :param bool adj: Whether to use adj genotypes
    :param int n_partitions: Number of partitions to use for final annotated Table
    :param str checkpoint_path: Optional checkpoint path for the Table before median imputation and/or aggregate summary
    :return: Hail Table ready for RF
    :rtype: Table
    """
    group = "adj" if adj else "raw"

    ht = get_info(split=True).ht()
    ht = ht.transmute(**ht.info)
    ht = ht.select("lowqual", "AS_lowqual", "FS", "MQ", "QD", *INFO_FEATURES)

    inbreeding_ht = get_freq().ht()
    inbreeding_ht = inbreeding_ht.select(
        InbreedingCoeff=hl.if_else(
            hl.is_nan(inbreeding_ht.InbreedingCoeff),
            hl.null(hl.tfloat32),
            inbreeding_ht.InbreedingCoeff,
        )
    )
    trio_stats_ht = fam_stats.ht()
    trio_stats_ht = trio_stats_ht.select(
        f"n_transmitted_{group}", f"ac_children_{group}"
    )

    truth_data_ht = get_truth_ht()
    allele_data_ht = allele_data.ht()
    allele_counts_ht = qc_ac.ht()

    logger.info("Annotating Table with all columns from multiple annotation Tables")
    ht = ht.annotate(
        **inbreeding_ht[ht.key],
        **trio_stats_ht[ht.key],
        **truth_data_ht[ht.key],
        **allele_data_ht[ht.key].allele_data,
        **allele_counts_ht[ht.key],
    )
    # Filter to variants that are found in high-quality samples and are not lowqual
    ht = ht.filter((ht[f"ac_qc_samples_{group}"] > 0) & ~ht.AS_lowqual)
    ht = ht.select(
        "a_index",
        "was_split",
        *FEATURES,
        *TRUTH_DATA,
        **{
            "transmitted_singleton": (ht[f"n_transmitted_{group}"] == 1)
            & (ht[f"ac_qc_samples_{group}"] == 2),
            "fail_hard_filters": (ht.QD < 2) | (ht.FS > 60) | (ht.MQ < 30),
        },
        singleton=ht.ac_release_samples_raw == 1,
        ac_raw=ht.ac_qc_samples_raw,
        ac=ht.ac_release_samples_adj,
        ac_qc_samples_unrelated_raw=ht.ac_qc_samples_unrelated_raw,
    )

    ht = ht.repartition(n_partitions, shuffle=False)
    if checkpoint_path:
        ht = ht.checkpoint(checkpoint_path, overwrite=True)

    if impute_features:
        ht = median_impute_features(ht, {"variant_type": ht.variant_type})

    summary = ht.group_by("omni", "mills", "transmitted_singleton").aggregate(
        n=hl.agg.count()
    )
    logger.info("Summary of truth data annotations:")
    summary.show(20)

    return ht
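# A minimal sketch (my own toy code, not the gnomad_methods implementation) of
# the per-stratum median imputation that median_impute_features performs above:
# missing feature values are replaced by the feature's median within the same
# variant_type stratum.
import hail as hl

toy_ht = hl.Table.parallelize(
    [
        {"variant_type": "snv", "AS_QD": 10.0},
        {"variant_type": "snv", "AS_QD": 12.0},
        {"variant_type": "snv", "AS_QD": 14.0},
        {"variant_type": "snv", "AS_QD": None},  # to be imputed
        {"variant_type": "multi-indel", "AS_QD": 4.0},
    ],
    hl.tstruct(variant_type=hl.tstr, AS_QD=hl.tfloat64),
)
medians_ht = toy_ht.group_by("variant_type").aggregate(
    AS_QD_median=hl.agg.approx_median(toy_ht.AS_QD)
)
toy_ht = toy_ht.annotate(
    AS_QD_imputed=hl.is_missing(toy_ht.AS_QD),  # flag imputed rows for QC
    AS_QD=hl.coalesce(toy_ht.AS_QD, medians_ht[toy_ht.variant_type].AS_QD_median),
)
toy_ht.show()  # the missing snv AS_QD is filled with (approximately) the snv median, 12.0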
def add_release_annotations(freq_ht: hl.Table) -> hl.Table:
    """
    Load and join all Tables with variant annotations.

    :param freq_ht: Table with frequency annotations
    :return: Table containing joined annotations
    """
    logger.info("Loading annotation tables...")
    filters_ht = final_filter.ht()
    vep_ht = vep.ht()
    dbsnp_ht = dbsnp.ht().select("rsid")
    info_ht = get_info().ht()
    in_silico_ht = analyst_annotations.ht()

    logger.info("Filtering lowqual variants and assembling 'info' field...")
    info_fields = SITE_FIELDS + AS_FIELDS
    missing_info_fields = set(info_fields).difference(info_ht.info.keys())
    logger.info(
        "The following fields are not found in the info HT: %s", missing_info_fields
    )
    select_info_fields = set(info_fields).intersection(info_ht.info.keys())
    info_ht = info_ht.transmute(info=info_ht.info.select(*select_info_fields))
    score_name = hl.eval(filters_ht.filtering_model.score_name)
    filters = filters_ht[info_ht.key]
    info_ht = info_ht.annotate(
        info=info_ht.info.annotate(
            AS_SOR=filters.AS_SOR,  # NOTE: AS_SOR will be incorporated into the info HT after v3.1, so no need to add this annotation in future releases
            SOR=filters.SOR,
            singleton=filters.singleton,
            transmitted_singleton=filters.transmitted_singleton,
            omni=filters.omni,
            mills=filters.mills,
            monoallelic=filters.monoallelic,
            **{f"{score_name}": filters[f"{score_name}"]},
        )
    )

    logger.info("Adding annotations...")
    filters_ht = filters_ht.select(
        "filters",
        "vqsr",
        allele_info=hl.struct(
            variant_type=filters_ht.variant_type,
            allele_type=filters_ht.allele_type,
            n_alt_alleles=filters_ht.n_alt_alleles,
            was_mixed=filters_ht.was_mixed,
        ),
    )

    ht = freq_ht.filter(info_ht[freq_ht.key].AS_lowqual, keep=False)
    ht = ht.annotate(
        a_index=info_ht[ht.key].a_index,
        was_split=info_ht[ht.key].was_split,
        rsid=dbsnp_ht[ht.key].rsid,
        info=info_ht[ht.key].info,
        vep=vep_ht[ht.key].vep.drop("colocated_variants"),
        vqsr=filters_ht[ht.key].vqsr,
        region_flag=region_flag_expr(
            ht,
            non_par=False,
            prob_regions={
                "lcr": lcr_intervals.ht(),
                "segdup": seg_dup_intervals.ht(),
            },
        ),
        # Drop vqsr from the splat so it is not passed twice (set explicitly above)
        **filters_ht[ht.key].drop("vqsr"),
        **in_silico_ht[ht.key],
    )
    ht = ht.transmute(info=ht.info.annotate(InbreedingCoeff=ht.InbreedingCoeff))
    ht = ht.annotate_globals(
        vep_version=vep_ht.index_globals().version,
        vep_csq_header=VEP_CSQ_HEADER,
        dbsnp_version=dbsnp.default_version,
        filtering_model=filters_ht.index_globals().filtering_model,
    )

    return ht
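# A minimal sketch of the interval-membership joins behind the region_flag
# annotation above: joining a locus against an interval-keyed Table is defined
# exactly when the locus falls inside an interval, which is how flags like
# lcr/segdup can be derived. All names below are toy stand-ins.
import hail as hl

toy_lcr_ht = hl.Table.parallelize(
    [
        {
            "interval": hl.Interval(
                hl.Locus("chr1", 1000, reference_genome="GRCh38"),
                hl.Locus("chr1", 2000, reference_genome="GRCh38"),
            )
        }
    ],
    hl.tstruct(interval=hl.tinterval(hl.tlocus("GRCh38"))),
    key=["interval"],
)
toy_variants_ht = hl.Table.parallelize(
    [
        {"locus": hl.Locus("chr1", 1500, reference_genome="GRCh38")},
        {"locus": hl.Locus("chr1", 5000, reference_genome="GRCh38")},
    ],
    hl.tstruct(locus=hl.tlocus("GRCh38")),
    key=["locus"],
)
toy_variants_ht = toy_variants_ht.annotate(
    region_flag=hl.struct(lcr=hl.is_defined(toy_lcr_ht[toy_variants_ht.locus]))
)
toy_variants_ht.show()  # lcr=True for chr1:1500, False for chr1:5000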
def main(args):
    hl.init(log="/variant_qc_finalize.log")

    ht = get_score_bins(args.model_id, aggregated=False).ht()
    if args.filter_centromere_telomere:
        ht = ht.filter(~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))

    info_ht = get_info(split=True).ht()
    ht = ht.filter(~info_ht[ht.key].AS_lowqual)

    if args.model_id.startswith("vqsr_"):
        ht = ht.drop("info")

    freq_ht = get_freq().ht()
    ht = ht.annotate(InbreedingCoeff=freq_ht[ht.key].InbreedingCoeff)
    freq_idx = freq_ht[ht.key]

    aggregated_bin_path = get_score_bins(args.model_id, aggregated=True).path
    if not file_exists(aggregated_bin_path):
        sys.exit(
            f"Could not find binned HT for model: {args.model_id} ({aggregated_bin_path}). Please run create_ranked_scores.py for that hash."
        )
    aggregated_bin_ht = get_score_bins(args.model_id, aggregated=True).ht()

    ht = generate_final_filter_ht(
        ht,
        args.model_name,
        args.score_name,
        ac0_filter_expr=freq_idx.freq[0].AC == 0,
        ts_ac_filter_expr=freq_idx.freq[1].AC == 1,
        mono_allelic_flag_expr=(freq_idx.freq[1].AF == 1) | (freq_idx.freq[1].AF == 0),
        snp_bin_cutoff=args.snp_bin_cutoff,
        indel_bin_cutoff=args.indel_bin_cutoff,
        snp_score_cutoff=args.snp_score_cutoff,
        indel_score_cutoff=args.indel_score_cutoff,
        inbreeding_coeff_cutoff=args.inbreeding_coeff_threshold,
        aggregated_bin_ht=aggregated_bin_ht,
        bin_id="bin",
        vqsr_ht=get_vqsr_filters(args.vqsr_model_id, split=True).ht()
        if args.vqsr_model_id
        else None,
    )
    ht = ht.annotate_globals(
        filtering_model=ht.filtering_model.annotate(model_id=args.model_id)
    )
    if args.model_id.startswith("vqsr_"):
        ht = ht.annotate_globals(
            filtering_model=ht.filtering_model.annotate(
                snv_training_variables=[
                    "AS_QD",
                    "AS_MQRankSum",
                    "AS_ReadPosRankSum",
                    "AS_FS",
                    "AS_SOR",
                    "AS_MQ",
                ],
                indel_training_variables=[
                    "AS_QD",
                    "AS_MQRankSum",
                    "AS_ReadPosRankSum",
                    "AS_FS",
                    "AS_SOR",
                ],
            )
        )
    else:
        ht = ht.annotate_globals(
            filtering_model=ht.filtering_model.annotate(
                snv_training_variables=ht.features,
                indel_training_variables=ht.features,
            )
        )

    ht.write(final_filter.path, args.overwrite)

    final_filter_ht = final_filter.ht()
    final_filter_ht.summarize()
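# A small sketch of the freq-array convention assumed by the filter
# expressions above (the standard layout in gnomAD frequency HTs): freq[0]
# holds adj (high-quality genotype) metrics and freq[1] holds raw metrics.
# Toy rows showing how the three expressions evaluate:
import hail as hl

toy_ht = hl.Table.parallelize(
    [
        {"freq": [hl.Struct(AC=0, AF=0.0), hl.Struct(AC=1, AF=0.0005)]},
        {"freq": [hl.Struct(AC=10, AF=1.0), hl.Struct(AC=12, AF=1.0)]},
    ],
    hl.tstruct(freq=hl.tarray(hl.tstruct(AC=hl.tint32, AF=hl.tfloat64))),
)
toy_ht = toy_ht.annotate(
    ac0_filter=toy_ht.freq[0].AC == 0,  # no adj carriers -> AC0 filter
    ts_ac_filter=toy_ht.freq[1].AC == 1,  # raw singleton (transmitted-singleton candidate)
    monoallelic=(toy_ht.freq[1].AF == 1) | (toy_ht.freq[1].AF == 0),
)
toy_ht.show()  # row 1: AC0 and raw singleton; row 2: monoallelic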