def main(args):
    """
    Run the annotation steps that were switched on through the parsed arguments.

    Every step is guarded by its own boolean flag and the steps run in the order
    they appear below. All table writes are passed ``overwrite=args.overwrite``.

    :param args: Parsed arguments. The attributes read here are ``compute_info``,
        ``split_info``, ``export_info_vcf``, ``generate_fam_stats``,
        ``export_transmitted_singletons_vcf``, ``vep`` and ``overwrite``.
    """
    hl.init(default_reference='GRCh38', log='/qc_annotations.log')

    if args.compute_info:
        unsplit_info_ht = compute_info()
        unsplit_info_ht.write(get_info(split=False).path, overwrite=args.overwrite)

    if args.split_info:
        split_info_ht = split_info()
        split_info_ht.write(get_info(split=True).path, overwrite=args.overwrite)

    if args.export_info_vcf:
        # The VCF is exported from the unsplit (multi-allelic) info table.
        hl.export_vcf(ht_to_vcf_mt(get_info(split=False).ht()), info_vcf_path)

    # if args.generate_ac: # TODO: compute AC and qc_AC as part of compute_info
    #     mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
    #     mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    #
    #     ht = generate_ac(mt, ).checkpoint('gs://gnomad-tmp/v3_ac_tmp.ht', overwrite=args.overwrite, _read_if_exists=not args.overwrite)
    #     ht.repartition(10000, shuffle=False).write(ac_ht_path, overwrite=args.overwrite)

    if args.generate_fam_stats:
        split_mt = hl.experimental.sparse_split_multi(
            get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True),
            filter_changed_loci=True,
        )
        # Checkpoint to a temporary location first; ``_read_if_exists`` is the
        # negation of ``overwrite``, so an existing checkpoint is only read back
        # when we are not overwriting.
        checkpointed_ht = generate_fam_stats(split_mt, trios.path).checkpoint(
            'gs://gnomad-tmp/v3_fam_stats_tmp.ht',
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        checkpointed_ht.repartition(10000, shuffle=False).write(
            fam_stats.path,
            overwrite=args.overwrite,
        )

    if args.export_transmitted_singletons_vcf:
        export_transmitted_singletons_vcf()

    if args.vep:
        vep_ht = run_vep()
        vep_ht.write(vep.path, overwrite=args.overwrite)
def create_bin_ht(model_id: str, n_bins: int) -> hl.Table:
    """
    Build a table with bin annotations for one RF or VQSR run.

    The table is returned to the caller; this function does not write it.

    Model IDs starting with ``"vqsr"`` are loaded from the VQSR filters and joined
    with the RF annotations; any other ID is loaded as an RF result. In both cases
    the table gets ``info``, ``score``, ``positive_train_site`` and
    ``negative_train_site`` annotations, is filtered on ``AS_lowqual`` and on
    telomere/centromere overlap, and is annotated with ``non_lcr`` before binning.

    :param model_id: Which variant QC model (RF or VQSR model ID) to annotate with bin
    :param n_bins: Number of bins to bin the data into
    :return: Table with bin annotations
    """
    logger.info(f"Annotating {model_id} HT with bins using {n_bins} bins")
    info_ht = get_info(split=True).ht()

    if not model_id.startswith("vqsr"):
        ht = get_rf_result(model_id=model_id).ht()
        ht = ht.annotate(
            info=info_ht[ht.key].info,
            positive_train_site=ht.tp,
            negative_train_site=ht.fp,
            score=ht.rf_probability["TP"],
        )
    else:
        rf_ht = get_rf_annotations().ht()
        ht = get_vqsr_filters(model_id, split=True).ht()
        # Grab the VQSR info struct before annotating: the same annotate call
        # replaces ``info`` with the one from the split info table.
        vqsr_info = ht.info
        ht = ht.annotate(
            **rf_ht[ht.key],
            info=info_ht[ht.key].info,
            score=vqsr_info.AS_VQSLOD,
            positive_train_site=vqsr_info.POSITIVE_TRAIN_SITE,
            negative_train_site=vqsr_info.NEGATIVE_TRAIN_SITE,
            AS_culprit=vqsr_info.AS_culprit,
        )
        # Drop rows with an undefined ac_raw: ac_raw was calculated on the high
        # quality samples only, whereas VQSR was run before sample filtering.
        ht = ht.filter(hl.is_defined(ht.ac_raw))

    # Keep rows that are not flagged AS_lowqual and whose locus has no entry in
    # the telomeres/centromeres table.
    is_lowqual = info_ht[ht.key].AS_lowqual
    in_telomere_or_centromere = hl.is_defined(telomeres_and_centromeres.ht()[ht.locus])
    ht = ht.filter(~is_lowqual & ~in_telomere_or_centromere)

    # Rows that survive the LCR / segdup filtering are the ones flagged non_lcr.
    ht_non_lcr = filter_low_conf_regions(
        ht,
        filter_lcr=True,
        # TODO: Uncomment when we have decoy path
        filter_decoy=False,  # True,
        filter_segdup=True,
    )
    ht = ht.annotate(non_lcr=hl.is_defined(ht_non_lcr[ht.key]))

    return create_binned_ht(ht, n_bins, add_substrat={"non_lcr": ht.non_lcr})
def split_info() -> hl.Table:
    """
    Generate the split info table from the multi-allelic info table.

    After splitting multi-allelic sites, every info field whose name starts with
    ``AC`` or ``AS_`` (other than ``AS_SB_TABLE``) is indexed down to the element
    at ``a_index - 1``, as is the row-level ``lowqual`` annotation.
    ``AS_SB_TABLE`` is rebuilt by extending its element 0 with its element at
    ``a_index``.

    :return: Info table with split multi-allelics
    :rtype: Table
    """
    split_ht = hl.split_multi(get_info(split=False).ht())

    info = split_ht.info
    # a_index is 1-based over the alternate alleles, hence the offset when
    # indexing the fields below.
    alt_index = split_ht.a_index - 1

    def _is_indexed_field(field_name: str) -> bool:
        """Return True for the AC*/AS_* fields that are indexed by allele."""
        return field_name.startswith(("AC", "AS_")) and field_name != 'AS_SB_TABLE'

    indexed_fields = {
        field_name: info[field_name][alt_index]
        for field_name in info
        if _is_indexed_field(field_name)
    }

    # AS_SB_TABLE is indexed with a_index itself (not a_index - 1) and keeps its
    # element 0 in front -- presumably the reference counts; confirm upstream.
    sb_table = info.AS_SB_TABLE[0].extend(info.AS_SB_TABLE[split_ht.a_index])

    return split_ht.annotate(
        info=info.annotate(**indexed_fields, AS_SB_TABLE=sb_table),
        lowqual=split_ht.lowqual[alt_index],
    )
def main(args):
    """
    Run the variant QC evaluation steps that were switched on through the parsed arguments.

    Every step is guarded by its own boolean flag and the steps run in the order
    they appear below. All writes are passed ``overwrite=args.overwrite``.

    :param args: Parsed arguments. The attributes read here are ``create_bin_ht``,
        ``run_sanity_checks``, ``create_aggregated_bin_ht``, ``extract_truth_samples``,
        ``merge_with_truth_data``, ``bin_truth_sample_concordance``, ``model_id``,
        ``n_bins``, ``n_partitions`` and ``overwrite``.
    """
    hl.init(log="/variant_qc_evaluation.log")

    if args.create_bin_ht:
        bin_ht = create_bin_ht(args.model_id, args.n_bins)
        bin_ht.write(
            get_score_bins(args.model_id, aggregated=False).path,
            overwrite=args.overwrite,
        )

    if args.run_sanity_checks:
        ht = get_score_bins(args.model_id, aggregated=False).ht()
        logger.info("Running sanity checks...")
        # Each "was_*" counter is paired with a counter of whether the matching
        # bin annotation is defined, so the two can be compared by eye.
        sanity_counts = hl.struct(
            was_biallelic=hl.agg.counter(~ht.was_split),
            has_biallelic_rank=hl.agg.counter(hl.is_defined(ht.biallelic_bin)),
            was_singleton=hl.agg.counter(ht.singleton),
            has_singleton_rank=hl.agg.counter(hl.is_defined(ht.singleton_bin)),
            was_biallelic_singleton=hl.agg.counter(ht.singleton & ~ht.was_split),
            has_biallelic_singleton_rank=hl.agg.counter(
                hl.is_defined(ht.biallelic_singleton_bin)
            ),
        )
        print(ht.aggregate(sanity_counts))

    if args.create_aggregated_bin_ht:
        logger.warning("Use only workers, it typically crashes with preemptibles")
        aggregated_ht = create_aggregated_bin_ht(args.model_id)
        aggregated_ht.write(
            get_score_bins(args.model_id, aggregated=True).path,
            overwrite=args.overwrite,
        )

    if args.extract_truth_samples:
        logger.info("Extracting truth samples from MT...")
        mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True,
            remove_hard_filtered_samples=False,
        )
        truth_sample_ids = hl.literal(
            [sample_info["s"] for sample_info in TRUTH_SAMPLES.values()]
        )
        mt = mt.filter_cols(truth_sample_ids.contains(mt.s))
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        # Checkpoint to prevent needing to go through the large table a second time
        mt = mt.checkpoint(
            get_checkpoint_path("truth_samples", mt=True),
            overwrite=args.overwrite,
        )

        for truth_sample, sample_info in TRUTH_SAMPLES.items():
            truth_sample_mt = mt.filter_cols(mt.s == sample_info["s"])
            # Keep only the rows where this sample has a non-reference genotype.
            truth_sample_mt = truth_sample_mt.filter_rows(
                hl.agg.any(truth_sample_mt.GT.is_non_ref())
            )
            truth_sample_mt.naive_coalesce(args.n_partitions).write(
                get_callset_truth_data(truth_sample).path,
                overwrite=args.overwrite,
            )

    if args.merge_with_truth_data:
        for truth_sample, sample_info in TRUTH_SAMPLES.items():
            logger.info(
                f"Creating a merged table with callset truth sample and truth data for {truth_sample}..."
            )

            # Load truth data
            mt = get_callset_truth_data(truth_sample).mt()
            truth_hc_intervals = sample_info["hc_intervals"].ht()
            truth_mt = sample_info["truth_mt"].mt()
            # Re-key the truth MT columns by the callset's sample ID for this
            # truth sample.
            truth_mt = truth_mt.key_cols_by(s=hl.str(sample_info["s"]))

            # Remove low quality sites
            info_ht = get_info(split=True).ht()
            mt = mt.filter_rows(~info_ht[mt.row_key].AS_lowqual)

            merged_ht = create_truth_sample_ht(mt, truth_mt, truth_hc_intervals)
            merged_ht.write(
                get_callset_truth_data(truth_sample, mt=False).path,
                overwrite=args.overwrite,
            )

    if args.bin_truth_sample_concordance:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating binned concordance table for {truth_sample} for model {args.model_id}"
            )
            ht = get_callset_truth_data(truth_sample, mt=False).ht()

            # Keep rows that are not flagged AS_lowqual and whose locus has no
            # entry in the telomeres/centromeres table.
            info_ht = get_info(split=True).ht()
            is_lowqual = info_ht[ht.key].AS_lowqual
            in_telomere_or_centromere = hl.is_defined(
                telomeres_and_centromeres.ht()[ht.locus]
            )
            ht = ht.filter(~is_lowqual & ~in_telomere_or_centromere)

            logger.info("Filtering out low confidence regions and segdups...")
            ht = filter_low_conf_regions(
                ht,
                filter_lcr=True,
                # TODO: Uncomment when we have decoy path
                filter_decoy=False,  # True,
                filter_segdup=True,
            )

            logger.info(
                "Loading HT containing RF or VQSR scores annotated with a bin based on the rank of score..."
            )
            metric_ht = get_score_bins(args.model_id, aggregated=False).ht()
            # Restrict to rows present in the score table, then copy the score over.
            ht = ht.filter(hl.is_defined(metric_ht[ht.key]))
            ht = ht.annotate(score=metric_ht[ht.key].score)

            concordance_ht = compute_binned_truth_sample_concordance(
                ht, metric_ht, args.n_bins
            )
            concordance_ht.write(
                get_binned_concordance(args.model_id, truth_sample).path,
                overwrite=args.overwrite,
            )