def main(args):
    """Compute and/or export the gnomAD v3.0 genome coverage summary.

    Driven by CLI flags:

    - ``--compute_coverage_ht``: build a reference-context HT (excluding
      telomere/centromere intervals), compute per-site coverage stats over
      release samples only, and write the versioned coverage HT.
    - ``--export_coverage``: export the coverage HT to TSV.

    :param args: Parsed argparse namespace with ``compute_coverage_ht``,
        ``export_coverage`` and ``overwrite`` attributes.
    """
    hl.init(default_reference='GRCh38')
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")

    if args.compute_coverage_ht:
        # Use the logger (not print) so all progress messages share one
        # output stream and format.
        logger.info("Building reference context HT")
        ref_ht = get_reference_ht(
            hl.get_reference('GRCh38'),
            excluded_intervals=telomeres_and_centromeres.ht().interval.collect())
        # Checkpoint: the reference-context HT is expensive to recompute.
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht", overwrite=True)
        logger.info("Done building reference context HT")

        mt = get_gnomad_v3_mt()
        # Restrict coverage computation to release samples.
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

        coverage_ht = compute_coverage_stats(mt, ref_ht)
        coverage_ht = coverage_ht.checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht',
            overwrite=True)
        # Reduce partition count before the final (permanent) write.
        coverage_ht = coverage_ht.naive_coalesce(5000)
        coverage_ht.write(
            coverage('genomes').versions['3.0'].path,
            overwrite=args.overwrite)

    if args.export_coverage:
        ht = coverage('genomes').versions['3.0'].ht()
        if 'count_array' in ht.row_value:
            # Note that count_array isn't computed any more, so this is v3.0-specific
            ht = ht.drop('count_array')
        ht.export(coverage_tsv_path('genomes', '3.0'))
def run_mendel_errors() -> hl.Table:
    """Compute Mendel errors on chr20 for inferred trios plus fake control trios.

    Builds a merged pedigree of the raw inferred trios and 100 fake trios
    (members drawn from a ~1% random sample of QC-passing samples), filters
    the v3 sparse MT to pedigree samples on chr20, splits multi-allelics,
    densifies, and runs ``hl.mendel_errors`` on biallelic sites.

    :return: Per-variant/per-sample Mendel error Table (first element of the
        ``hl.mendel_errors`` result tuple).
    """
    meta_ht = meta.ht()
    ped = pedigree.versions["raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    # Fake trios of unrelated QC-passing samples serve as a negative control
    # for calibrating the Mendel-error rate of the real trios.
    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    # ~1% random downsample of samples passing both the outlier
                    # metric filters and the hard filters. hard_filters can be
                    # missing, hence the or_else(..., False) guard.
                    hl.rand_bool(0.01)
                    & ((hl.len(meta_ht.qc_metrics_filters) == 0)
                       & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)),
                    hl.agg.collect_as_set(meta_ht.s),
                ))),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    # Every sample ID appearing in any trio (proband, father, mother).
    ped_samples = hl.literal(
        set([
            s for trio in merged_ped.trios
            for s in [trio.s, trio.pat_id, trio.mat_id]
        ]))

    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    # Restrict to chr20 to keep the densify tractable.
    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval("chr20", reference_genome='GRCh38')])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    # Keep only the fields densify needs: GT plus the sparse END marker.
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    # hl.mendel_errors expects biallelic variants.
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
def main(args):
    """Generate gnomAD v3 genome frequency data (freq, FAF, popmax, qual hists).

    Reads the sparse v3 MT, splits multi-allelics, restricts to release
    samples, adjusts genotypes (sex ploidy + adj flag) BEFORE densifying,
    then computes frequency, FAF, popmax and quality-metric histograms and
    writes the resulting rows Table.

    :param args: Parsed argparse namespace with ``test`` and ``overwrite``
        attributes.
    """
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        # Small interval for a quick end-to-end test run.
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    # Release samples only.
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    # Sex-ploidy adjustment and the adj (genotype quality) flag must be
    # computed on the pre-densified entries.
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    # Drop rows left reference-only (single allele) after densification.
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    # faf_meta describes the grouping of each element of the faf array.
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
def run_infer_families() -> hl.Pedigree:
    """Infer trios, then drop every trio containing a QC-filtered sample.

    :return: Pedigree whose trios have a proband, father and mother that all
        pass sample QC (hard filters and outlier metric filters).
    """
    logger.info("Inferring families")
    inferred_ped = infer_families(
        get_v3_relatedness_annotated_ht(), v3_sex.ht(), duplicates.ht()
    )

    # Collect the set of samples failing either the outlier-metric filters or
    # the hard filters (hard_filters may be missing, hence the or_else guard).
    sample_meta = meta.ht()
    excluded = sample_meta.aggregate(
        hl.agg.filter(
            (hl.len(sample_meta.qc_metrics_filters) > 0)
            | hl.or_else(hl.len(sample_meta.hard_filters) > 0, False),
            hl.agg.collect_as_set(sample_meta.s),
        ))

    def _trio_is_clean(trio) -> bool:
        # Keep a trio only when none of its three members was excluded.
        return not any(
            member in excluded for member in (trio.s, trio.pat_id, trio.mat_id)
        )

    return hl.Pedigree(trios=list(filter(_trio_is_clean, inferred_ped.trios)))
def main(args):
    """Build a joint v2-exomes/v3-genomes QC MT and run PC-Relate on it.

    Driven by CLI flags:

    - ``--join_qc_mt``: lift over the LD-pruned v2 exomes QC MT, re-key both
      datasets' columns by ``(s, data_type)`` so sample IDs cannot collide,
      and union their columns into a temporary joint MT.
    - ``--run_pc_relate``: run PCA on a 10% row sample of the joint MT, then
      ``hl.pc_relate`` to estimate cross-dataset relatedness.

    :param args: Parsed argparse namespace with ``join_qc_mt``,
        ``run_pc_relate`` and ``overwrite`` attributes.
    """
    if args.join_qc_mt:
        v2_qc_mt_liftover = get_liftover_v2_qc_mt('exomes', ld_pruned=True, release_only=True)
        # data_type in the col key disambiguates samples present in both datasets.
        v2_qc_mt_liftover = v2_qc_mt_liftover.key_cols_by(
            s=v2_qc_mt_liftover.s, data_type="v2_exomes")
        v3_qc_mt = v3_qc.mt()
        # Restrict v3 to release samples and strip row/col annotations so the
        # two MTs have matching schemas for union_cols.
        v3_qc_mt = v3_qc_mt.filter_cols(meta.ht()[v3_qc_mt.col_key].release)
        v3_qc_mt = v3_qc_mt.select_rows().select_cols()
        v3_qc_mt = v3_qc_mt.key_cols_by(s=v3_qc_mt.s, data_type="v3_genomes")
        joint_qc_mt = v2_qc_mt_liftover.union_cols(v3_qc_mt)
        joint_qc_mt.write("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt", overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PC-Relate')
        logger.warning("PC-relate requires SSDs and doesn't work with preemptible workers!")
        joint_qc_mt = hl.read_matrix_table("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt")
        # 10% row sample keeps the PCA tractable.
        joint_qc_mt = joint_qc_mt.sample_rows(0.1)
        # Eigenvalues and loadings are not used downstream; keep scores only
        # (was previously bound to an unused `eig` local).
        _, scores, _ = hl.hwe_normalized_pca(joint_qc_mt.GT, k=10, compute_loadings=False)
        scores = scores.checkpoint(
            v2_v3_pc_relate_pca_scores.path,
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite)
        relatedness_ht = hl.pc_relate(
            joint_qc_mt.GT,
            min_individual_maf=0.01,
            scores_expr=scores[joint_qc_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.1,
            statistics='all')
        relatedness_ht.write(v2_v3_relatedness.path, args.overwrite)
def main(args):
    """Compute and/or export gnomAD v3 genome coverage for a release version.

    Driven by CLI flags:

    - ``--compute_coverage_ht``: build a reference-context HT over the primary
      contigs (chr1-22, X, Y) excluding telomere/centromere intervals, compute
      per-site coverage stats over release samples, and write the versioned
      coverage HT.
    - ``--export_coverage``: export the coverage HT to TSV.

    :param args: Parsed argparse namespace with ``coverage_version``,
        ``compute_coverage_ht``, ``export_coverage`` and ``overwrite``
        attributes.
    """
    hl.init(default_reference='GRCh38')
    # Fall back to the current release when no version is given on the CLI.
    coverage_version = args.coverage_version if args.coverage_version else CURRENT_GENOME_COVERAGE_RELEASE
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")
    logger.warning(
        "Last time this was run (July 2020), this script required high-mem machines."
    )

    if args.compute_coverage_ht:
        # Use the logger (not print) so progress messages share one output
        # stream with the warnings above.
        logger.info("Building reference context HT")
        ref_ht = get_reference_ht(
            hl.get_reference('GRCh38'),
            contigs=[f'chr{x}' for x in range(1, 23)] + ['chrX', 'chrY'],
            excluded_intervals=telomeres_and_centromeres.ht().interval.collect())
        # Checkpoint: the reference-context HT is expensive to recompute.
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht", overwrite=True)
        logger.info("Done building reference context HT")

        mt = get_gnomad_v3_mt()
        # Restrict coverage computation to release samples.
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

        coverage_ht = compute_coverage_stats(mt, ref_ht)
        coverage_ht = coverage_ht.checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht',
            overwrite=True)
        # Reduce partition count before the final (permanent) write.
        coverage_ht = coverage_ht.naive_coalesce(5000)
        coverage_ht.write(
            coverage('genomes').versions[coverage_version].path,
            overwrite=args.overwrite)

    if args.export_coverage:
        ht = coverage('genomes').versions[coverage_version].ht()
        if 'count_array' in ht.row_value:
            # Note that count_array isn't computed any more, so this is v3.0-specific
            ht = ht.drop('count_array')
        ht.export(coverage_tsv_path('genomes', coverage_version))