def main(args):
    hl.init(default_reference='GRCh38')
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")

    if args.compute_coverage_ht:
        print("Building reference context HT")
        ref_ht = get_reference_ht(
            hl.get_reference('GRCh38'),
            excluded_intervals=telomeres_and_centromeres.ht().interval.collect())
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht", overwrite=True)
        logger.info("Done building reference context HT")

        mt = get_gnomad_v3_mt()
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

        coverage_ht = compute_coverage_stats(mt, ref_ht)
        coverage_ht = coverage_ht.checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht', overwrite=True)
        coverage_ht = coverage_ht.naive_coalesce(5000)
        coverage_ht.write(coverage('genomes').versions['3.0'].path,
                          overwrite=args.overwrite)

    if args.export_coverage:
        ht = coverage('genomes').versions['3.0'].ht()
        if 'count_array' in ht.row_value:
            # Note that count_array isn't computed any more, so this is v3.0-specific
            ht = ht.drop('count_array')
        ht.export(coverage_tsv_path('genomes', '3.0'))
def run_vep() -> hl.Table:
    def get_mt_partitions(mt_path: str) -> List[hl.Interval]:
        """
        This function loads the partitioning from a given MT.

        Note that because it relies on hardcoded paths within the MT that are still in flux,
        it isn't guaranteed to work on future versions of the MT format.

        :param str mt_path: MT path
        :return: MT partitions
        :rtype: List of Interval
        """
        logger.info(f'Reading partitions for {mt_path}')
        import json
        from os import path

        mt = hl.read_matrix_table(mt_path)
        with hl.hadoop_open(path.join(mt_path, 'rows', 'rows', 'metadata.json.gz')) as f:
            intervals_json = json.load(f)['jRangeBounds']
        return hl.tarray(
            hl.tinterval(hl.tstruct(locus=mt.locus.dtype))
        )._convert_from_json(intervals_json)

    ht = get_gnomad_v3_mt(key_by_locus_and_alleles=True).rows()
    ht = ht.filter(hl.len(ht.alleles) > 1)
    return vep_or_lookup_vep(ht, reference='GRCh38')
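# A small usage sketch (not part of the original module): materialise the VEP HT and
# spot-check that annotation succeeded. `vep_or_lookup_vep` (gnomad.utils.vep) returns the
# input HT with a `vep` annotation; the tmp checkpoint path below is hypothetical.
#
#     vep_ht = run_vep()
#     vep_ht = vep_ht.checkpoint("gs://gnomad-tmp/vep_spot_check.ht", overwrite=True)
#     print(vep_ht.aggregate(hl.agg.count_where(hl.is_defined(vep_ht.vep))))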
def run_mendel_errors() -> hl.Table:
    meta_ht = meta.ht()
    ped = pedigree.versions["raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    hl.rand_bool(0.01)
                    & ((hl.len(meta_ht.qc_metrics_filters) == 0)
                       & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)),
                    hl.agg.collect_as_set(meta_ht.s),
                ))),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    ped_samples = hl.literal(
        set([
            s for trio in merged_ped.trios
            for s in [trio.s, trio.pat_id, trio.mat_id]
        ]))
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval("chr20", reference_genome='GRCh38')])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
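# A hedged follow-up sketch (not in the original): the first table returned by
# hl.mendel_errors is keyed by [locus, alleles] and carries the proband sample ID in `s`,
# so grouping by `s` gives a per-sample error count that lets the real trios be compared
# against the fake-pedigree baseline generated above.
def summarize_mendel_errors() -> hl.Table:
    errors_ht = run_mendel_errors()
    return errors_ht.group_by(errors_ht.s).aggregate(n_errors=hl.agg.count())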
def compute_info() -> hl.Table:
    """
    Computes a HT with the typical GATK AS and site-level info fields
    as well as ACs and lowqual fields.

    Note that this table doesn't split multi-allelic sites.

    :return: Table with info fields
    :rtype: Table
    """
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                          remove_hard_filtered_samples=False)
    mt = mt.filter_rows((hl.len(mt.alleles) > 1))
    mt = mt.transmute_entries(**mt.gvcf_info)

    # Compute AS and site level info expr
    # Note that production defaults have changed:
    # For new releases, the `RAW_MQandDP` field replaces the `RAW_MQ` and `MQ_DP` fields
    info_expr = get_site_info_expr(
        mt,
        sum_agg_fields=INFO_SUM_AGG_FIELDS + ['RAW_MQ'],
        int32_sum_agg_fields=INFO_INT32_SUM_AGG_FIELDS + ['MQ_DP'],
        array_sum_agg_fields=['SB'])
    info_expr = info_expr.annotate(
        **get_as_info_expr(
            mt,
            sum_agg_fields=INFO_SUM_AGG_FIELDS + ['RAW_MQ'],
            int32_sum_agg_fields=INFO_INT32_SUM_AGG_FIELDS + ['MQ_DP'],
            array_sum_agg_fields=['SB']))

    # Add AC and AC_raw:
    # First compute ACs for each non-ref allele, grouped by adj
    grp_ac_expr = hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai),
            hl.agg.group_by(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.agg.sum(
                    mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[
                        mt.LA.index(ai)]))),
        hl.range(1, hl.len(mt.alleles)))

    # Then, for each non-ref allele, compute
    # AC as the adj group
    # AC_raw as the sum of adj and non-adj groups
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))))

    info_ht = mt.select_rows(info=info_expr).rows()

    # Add lowqual flag
    info_ht = info_ht.annotate(
        lowqual=get_lowqual_expr(
            info_ht.alleles,
            info_ht.info.QUALapprox,
            # The indel het prior used for gnomad v3 was 1/10k bases (phred=40).
            # This value is usually 1/8k bases (phred=39).
            indel_phred_het_prior=40))

    return info_ht.naive_coalesce(5000)
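# A toy illustration (not from the pipeline) of the adj-grouped AC logic above:
# `hl.agg.group_by(adj_expr, hl.agg.sum(...))` produces a dict keyed by True/False, so per
# allele the adj-only count is `d.get(True, 0)` and the raw count adds the non-adj group:
#
#     d = hl.literal({True: 10, False: 3})
#     hl.eval(d.get(True, 0))                      # AC     -> 10
#     hl.eval(d.get(True, 0) + d.get(False, 0))    # AC_raw -> 13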
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
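# A brief read-back sketch (assumptions noted): in the gnomad `annotate_freq` convention the
# `freq_meta` global is a list of grouping dicts that indexes the `freq` array row field, with
# the overall adj group assumed to sit at index 0.
#
#     ht = hl.read_table("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht")  # test-run output above
#     print(hl.eval(ht.freq_meta[:2]))   # expected to start with the adj and raw groups
#     ht.select(AF_adj=ht.freq[0].AF).show()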
def main(args):
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.annotate_entries(
        gvcf_info=mt.gvcf_info.drop('ClippingRankSum', 'ReadPosRankSum'))
    mt = mt.annotate_rows(
        n_unsplit_alleles=hl.len(mt.alleles),
        mixed_site=(hl.len(mt.alleles) > 2)
        & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
        & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]))
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt.write(args.split_mt_location, overwrite=args.overwrite)
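# A minimal argparse stub (assumed; not shown in the original) wiring up the two attributes
# `main` reads above. argparse maps `--split-mt-location` to `args.split_mt_location`.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--split-mt-location', required=True,
                        help='Output path for the split MT')
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite an existing output MT')
    main(parser.parse_args())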
def compute_stats(stats_path: str):
    mt = get_gnomad_v3_mt()
    mt = mt.filter_entries(hl.is_defined(mt.END))
    ref_block_stats = mt.aggregate_entries(
        hl.struct(
            ref_block_stats=hl.struct(
                stats=hl.agg.stats(mt.END - mt.locus.position),
                hist=hl.agg.hist(mt.END - mt.locus.position, 0, 9999, 10000),
                hist_log=hl.agg.hist(hl.log10(1 + mt.END - mt.locus.position), 0, 5, 100)),
            adj_ref_block_stats=hl.agg.filter(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.struct(
                    stats=hl.agg.stats(mt.END - mt.locus.position),
                    hist=hl.agg.hist(mt.END - mt.locus.position, 0, 9999, 10000),
                    hist_log=hl.agg.hist(
                        hl.log10(1 + mt.END - mt.locus.position), 0, 5, 100)))))

    with hl.hadoop_open(stats_path, 'wb') as f:
        pickle.dump(ref_block_stats, f)
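# A companion sketch (not in the original) for reading the pickled stats back; the aggregation
# above returns a Python-side `hl.Struct`, so fields are available as attributes after loading.
def load_ref_block_stats(stats_path: str):
    with hl.hadoop_open(stats_path, 'rb') as f:
        return pickle.load(f)

# e.g. load_ref_block_stats(stats_path).ref_block_stats.stats.mean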
def main(args):
    hl.init(default_reference='GRCh38')
    coverage_version = args.coverage_version if args.coverage_version else CURRENT_GENOME_COVERAGE_RELEASE
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")
    logger.warning(
        "Last time this was run (July 2020), this script required high-mem machines.")

    if args.compute_coverage_ht:
        print("Building reference context HT")
        ref_ht = get_reference_ht(
            hl.get_reference('GRCh38'),
            contigs=[f'chr{x}' for x in range(1, 23)] + ['chrX', 'chrY'],
            excluded_intervals=telomeres_and_centromeres.ht().interval.collect())
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht", overwrite=True)
        logger.info("Done building reference context HT")

        mt = get_gnomad_v3_mt()
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

        coverage_ht = compute_coverage_stats(mt, ref_ht)
        coverage_ht = coverage_ht.checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht', overwrite=True)
        coverage_ht = coverage_ht.naive_coalesce(5000)
        coverage_ht.write(coverage('genomes').versions[coverage_version].path,
                          overwrite=args.overwrite)

    if args.export_coverage:
        ht = coverage('genomes').versions[coverage_version].ht()
        if 'count_array' in ht.row_value:
            # Note that count_array isn't computed any more, so this is v3.0-specific
            ht = ht.drop('count_array')
        ht.export(coverage_tsv_path('genomes', coverage_version))
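# An optional spot check (not part of the script): peek at the finished coverage HT for a
# single region before exporting the genome-wide TSV. Only generic Table operations are used,
# so this does not depend on the exact coverage schema.
#
#     ht = coverage('genomes').versions[CURRENT_GENOME_COVERAGE_RELEASE].ht()
#     ht = hl.filter_intervals(
#         ht, [hl.parse_locus_interval('chr20:1-2000000', reference_genome='GRCh38')])
#     ht.show(10)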
def main(args):
    hl.init(default_reference='GRCh38', log='/qc_annotations.log')

    if args.compute_info:
        compute_info().write(get_info(split=False).path, overwrite=args.overwrite)

    if args.split_info:
        split_info().write(get_info(split=True).path, overwrite=args.overwrite)

    if args.export_info_vcf:
        info_ht = get_info(split=False).ht()
        hl.export_vcf(ht_to_vcf_mt(info_ht), info_vcf_path)

    # if args.generate_ac:  # TODO: compute AC and qc_AC as part of compute_info
    #     mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
    #     mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    #
    #     ht = generate_ac(mt, ).checkpoint('gs://gnomad-tmp/v3_ac_tmp.ht', overwrite=args.overwrite, _read_if_exists=not args.overwrite)
    #     ht.repartition(10000, shuffle=False).write(ac_ht_path, overwrite=args.overwrite)

    if args.generate_fam_stats:
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
        fam_stats_ht = generate_fam_stats(mt, trios.path)
        fam_stats_ht = fam_stats_ht.checkpoint(
            'gs://gnomad-tmp/v3_fam_stats_tmp.ht',
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite)
        fam_stats_ht = fam_stats_ht.repartition(10000, shuffle=False)
        fam_stats_ht.write(fam_stats.path, overwrite=args.overwrite)

    if args.export_transmitted_singletons_vcf:
        export_transmitted_singletons_vcf()

    if args.vep:
        run_vep().write(vep.path, overwrite=args.overwrite)
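# A defensive sketch (assumption, not in the original): the split-info and export-info-vcf
# steps read the unsplit info HT written by the compute-info step, so a guard like this
# catches out-of-order runs early.
def check_info_ht_exists():
    if not hl.hadoop_exists(get_info(split=False).path):
        raise FileNotFoundError(
            "Unsplit info HT not found; run the compute-info step first")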
def main(args): hl.init(log="/variant_qc_evaluation.log") if args.create_bin_ht: create_bin_ht( args.model_id, args.n_bins, ).write( get_score_bins(args.model_id, aggregated=False).path, overwrite=args.overwrite, ) if args.run_sanity_checks: ht = get_score_bins(args.model_id, aggregated=False).ht() logger.info("Running sanity checks...") print( ht.aggregate( hl.struct( was_biallelic=hl.agg.counter(~ht.was_split), has_biallelic_rank=hl.agg.counter( hl.is_defined(ht.biallelic_bin)), was_singleton=hl.agg.counter(ht.singleton), has_singleton_rank=hl.agg.counter( hl.is_defined(ht.singleton_bin)), was_biallelic_singleton=hl.agg.counter(ht.singleton & ~ht.was_split), has_biallelic_singleton_rank=hl.agg.counter( hl.is_defined(ht.biallelic_singleton_bin)), ))) if args.create_aggregated_bin_ht: logger.warning( "Use only workers, it typically crashes with preemptibles") create_aggregated_bin_ht(args.model_id).write( get_score_bins(args.model_id, aggregated=True).path, overwrite=args.overwrite, ) if args.extract_truth_samples: logger.info(f"Extracting truth samples from MT...") mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, remove_hard_filtered_samples=False) mt = mt.filter_cols( hl.literal([v["s"] for k, v in TRUTH_SAMPLES.items()]).contains(mt.s)) mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True) # Checkpoint to prevent needing to go through the large table a second time mt = mt.checkpoint( get_checkpoint_path("truth_samples", mt=True), overwrite=args.overwrite, ) for truth_sample in TRUTH_SAMPLES: truth_sample_mt = mt.filter_cols( mt.s == TRUTH_SAMPLES[truth_sample]["s"]) # Filter to variants in truth data truth_sample_mt = truth_sample_mt.filter_rows( hl.agg.any(truth_sample_mt.GT.is_non_ref())) truth_sample_mt.naive_coalesce(args.n_partitions).write( get_callset_truth_data(truth_sample).path, overwrite=args.overwrite, ) if args.merge_with_truth_data: for truth_sample in TRUTH_SAMPLES: logger.info( f"Creating a merged table with callset truth sample and truth data for {truth_sample}..." ) # Load truth data mt = get_callset_truth_data(truth_sample).mt() truth_hc_intervals = TRUTH_SAMPLES[truth_sample][ "hc_intervals"].ht() truth_mt = TRUTH_SAMPLES[truth_sample]["truth_mt"].mt() truth_mt = truth_mt.key_cols_by( s=hl.str(TRUTH_SAMPLES[truth_sample]["s"])) # Remove low quality sites info_ht = get_info(split=True).ht() mt = mt.filter_rows(~info_ht[mt.row_key].AS_lowqual) ht = create_truth_sample_ht(mt, truth_mt, truth_hc_intervals) ht.write( get_callset_truth_data(truth_sample, mt=False).path, overwrite=args.overwrite, ) if args.bin_truth_sample_concordance: for truth_sample in TRUTH_SAMPLES: logger.info( f"Creating binned concordance table for {truth_sample} for model {args.model_id}" ) ht = get_callset_truth_data(truth_sample, mt=False).ht() info_ht = get_info(split=True).ht() ht = ht.filter( ~info_ht[ht.key].AS_lowqual & ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus])) logger.info("Filtering out low confidence regions and segdups...") ht = filter_low_conf_regions( ht, filter_lcr=True, # TODO: Uncomment when we have decoy path filter_decoy=False, # True, filter_segdup=True, ) logger.info( "Loading HT containing RF or VQSR scores annotated with a bin based on the rank of score..." 
) metric_ht = get_score_bins(args.model_id, aggregated=False).ht() ht = ht.filter(hl.is_defined(metric_ht[ht.key])) ht = ht.annotate(score=metric_ht[ht.key].score) ht = compute_binned_truth_sample_concordance( ht, metric_ht, args.n_bins) ht.write( get_binned_concordance(args.model_id, truth_sample).path, overwrite=args.overwrite, )
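# The shape of TRUTH_SAMPLES implied by the accesses above (illustrative only; the real
# constant and its resource classes live in the project's variant QC resources):
#
#     TRUTH_SAMPLES = {
#         "<truth_sample_name>": {
#             "s": "<sample ID as it appears in the callset>",
#             "truth_mt": <MatrixTable resource holding the truth genotypes>,
#             "hc_intervals": <Table resource of high-confidence intervals>,
#         },
#         ...
#     }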
import hail as hl

from gnomad_qc.v3.resources import get_gnomad_v3_mt, last_END_position

# END RESOURCES

mt = get_gnomad_v3_mt()
mt = mt.select_entries('END')
t = mt._localize_entries('__entries', '__cols')
t = t.select(
    last_END_position=hl.or_else(
        hl.min(
            hl.scan.array_agg(
                lambda entry: hl.scan._prev_nonnull(
                    hl.or_missing(hl.is_defined(entry.END),
                                  hl.tuple([t.locus, entry.END]))),
                t.__entries,
            ).map(lambda x: hl.or_missing(
                (x[1] >= t.locus.position)
                & (x[0].contig == t.locus.contig),
                x[0].position))),
        t.locus.position))
t.write(last_END_position.path, overwrite=True)
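# A hedged usage note (not part of this script): downstream interval filtering on the sparse MT
# can consult this table to find, for each locus, the smallest start position whose reference
# block still reaches that locus, so blocks beginning before an interval but extending into it
# are not lost.
#
#     last_end_ht = hl.read_table(last_END_position.path)
#     last_end_ht.describe()  # one row per locus with the `last_END_position` field computed above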