def main(args):
    """
    Compute per-sample QC metrics for the exome cohort and derive
    population/platform-stratified metric filters.

    The sample QC table is checkpointed, annotated with the predicted
    population and the QC platform of every sample, and then used to compute
    the stratified filters, which are checkpointed as well. Both tables are
    optionally exported as bgzipped TSV next to their HT path.

    :param args: parsed command-line namespace; the attributes read here are
        ``default_ref_genome``, ``exome_cohort``, ``overwrite`` and
        ``write_to_file``
    """
    hl.init(default_reference=args.default_ref_genome)

    cohort = args.exome_cohort
    # Forwarded to Hail's private `_read_if_exists` checkpoint option;
    # presumably lets an existing table be reused when not overwriting.
    reuse_existing = not args.overwrite

    # Unfiltered split MT is the input of the sample QC computation.
    unfiltered_mt = get_mt_data(dataset=cohort, part='unfiltered')

    # Sample QC metrics (per the original note: stratified by bi-allelic and
    # multi-allelic sites), persisted before any further use.
    metrics_ht = compute_sample_qc(unfiltered_mt)
    metrics_path = get_sample_qc_ht_path(dataset=cohort,
                                         part='high_conf_autosomes')
    metrics_ht = metrics_ht.checkpoint(metrics_path,
                                       overwrite=args.overwrite,
                                       _read_if_exists=reuse_existing)

    # Attach the population and platform labels used as strata below.
    population_ht = hl.read_table(get_sample_qc_ht_path(part='population_qc'))
    platform_ht = hl.read_table(get_sample_qc_ht_path(part='platform_pca'))
    metrics_ht = metrics_ht.annotate(
        qc_pop=population_ht[metrics_ht.s].predicted_pop,
        qc_platform=platform_ht[metrics_ht.s].qc_platform,
    )

    if args.write_to_file:
        metrics_tsv = f"{get_sample_qc_ht_path(dataset=args.exome_cohort, part='high_conf_autosomes')}.tsv.bgz"
        metrics_ht.flatten().export(metrics_tsv)

    # QC metrics on which the stratified sample filters are evaluated.
    exome_qc_metrics = [
        'n_snp',
        'r_ti_tv',
        'r_insertion_deletion',
        'n_insertion',
        'n_deletion',
        'r_het_hom_var',
    ]
    strata = ['qc_pop', 'qc_platform']

    print('Computing stratified metrics filters...')
    filters_ht = compute_stratified_metrics_filter(metrics_ht,
                                                   exome_qc_metrics,
                                                   strata)
    filters_path = get_sample_qc_ht_path(dataset=cohort,
                                         part='stratified_metrics_filter')
    filters_ht = filters_ht.checkpoint(filters_path,
                                       overwrite=args.overwrite,
                                       _read_if_exists=reuse_existing)

    if args.write_to_file:
        filters_tsv = f"{get_sample_qc_ht_path(dataset=args.exome_cohort, part='stratified_metrics_filter')}.tsv.bgz"
        filters_ht.export(filters_tsv)

    hl.stop()

    print("Finished!")
def apply_sample_qc_filtering(mt: hl.MatrixTable,
                              keep_rare_variants: bool = True,
                              maf_threshold: float = 0.01) -> hl.MatrixTable:
    """
    Keep only the samples passing sample QC and annotate cohort-internal
    allele statistics on the remaining samples.

    Row fields added: ``gt_stats`` (call statistics over ``GT``),
    ``internal_af`` and ``internal_ac`` (element 1 of the ``AF`` / ``AC``
    arrays of ``gt_stats``).

    :param mt: input MatrixTable; must carry a ``GT`` entry field and an
        ``alleles`` row field
    :param keep_rare_variants: when True, additionally filter rows with
        ``af_filter_expr`` evaluated on ``internal_af``
    :param maf_threshold: allele-frequency cutoff handed to ``af_filter_expr``
    :return: the filtered and annotated MatrixTable
    """
    # Final sample QC table, restricted to samples flagged as passing.
    passing_samples = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
    passing_samples = passing_samples.filter(passing_samples.pass_filters)
    mt = mt.filter_cols(hl.is_defined(passing_samples[mt.col_key]))

    # Internal allele frequency / count, computed after the sample filter so
    # that only QC-passing samples contribute.
    mt = mt.annotate_rows(gt_stats=hl.agg.call_stats(mt.GT, mt.alleles))
    mt = mt.annotate_rows(internal_af=mt.gt_stats.AF[1],
                          internal_ac=mt.gt_stats.AC[1])

    if not keep_rare_variants:
        return mt

    # Drop common variants based on the internal allele frequency.
    return mt.filter_rows(af_filter_expr(mt, 'internal_af', maf_threshold))
def main(args):
    """
    Compute sample QC metrics for the exome cohort, checkpoint them under the
    NFS data directory and annotate population / platform labels.

    The output location is built from the module-level ``nfs_dir``. When
    requested, the annotated table is flattened and exported as bgzipped TSV
    at ``<output path>.tsv.bgz``.

    :param args: parsed command-line namespace; the attributes read here are
        ``default_ref_genome``, ``exome_cohort``, ``overwrite`` and
        ``write_to_file``
    """
    hl.init(default_reference=args.default_ref_genome)

    # Unfiltered split MT is the input of the sample QC computation.
    unfiltered_mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

    # Sample QC metrics (per the original note: stratified by bi-allelic and
    # multi-allelic sites).
    qc_ht = compute_sample_qc(unfiltered_mt)

    output_path = (
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_qc.high_conf.autosomes.cds.capture_intervals.rare_common.ht'
    )
    qc_ht = qc_ht.checkpoint(output_path,
                             overwrite=args.overwrite,
                             _read_if_exists=not args.overwrite)

    # Attach the predicted population and QC platform of every sample.
    population_ht = hl.read_table(get_sample_qc_ht_path(part='population_qc'))
    platform_ht = hl.read_table(get_sample_qc_ht_path(part='platform_pca'))
    qc_ht = qc_ht.annotate(
        qc_pop=population_ht[qc_ht.s].predicted_pop,
        qc_platform=platform_ht[qc_ht.s].qc_platform,
    )

    if args.write_to_file:
        flat_ht = qc_ht.flatten()
        flat_ht.export(f"{output_path}.tsv.bgz")

    hl.stop()

    print("Finished!")
def main(args):
    """
    Compute mean coverage statistics on the bi-allelic variants of the
    unfiltered exome MT and store them as the ``sex_chrom_coverage`` table.

    :param args: parsed command-line namespace; the attributes read here are
        ``default_reference``, ``exome_cohort``, ``normalization_contig``,
        ``interval_to_include``, ``interval_to_exclude``, ``chr_x``, ``chr_y``,
        ``overwrite`` and ``write_to_file``
    """
    hl.init(default_reference=args.default_reference)

    def _get_interval_table(interval: str) -> Union[None, hl.Table]:
        # No interval name given: pass the None straight through.
        if interval is None:
            return interval
        return get_capture_interval_ht(name=interval,
                                       reference=args.default_reference)

    logger.info("Importing data...")

    unfiltered_mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

    # Only bi-allelic variants enter the coverage computation.
    biallelic_mt = unfiltered_mt.filter_rows(bi_allelic_expr(unfiltered_mt),
                                             keep=True)

    # Optional interval tables restricting the variants (mainly for exomes).
    included_intervals = _get_interval_table(args.interval_to_include)
    excluded_intervals = _get_interval_table(args.interval_to_exclude)

    coverage_ht = compute_mean_coverage(
        mt=biallelic_mt,
        normalization_contig=args.normalization_contig,
        included_calling_intervals=included_intervals,
        excluded_calling_intervals=excluded_intervals,
        chr_x=args.chr_x,
        chr_y=args.chr_y)

    logger.info("Exporting data...")

    output_ht_path = get_sample_qc_ht_path(part='sex_chrom_coverage')
    coverage_ht.write(output=output_ht_path, overwrite=args.overwrite)

    # Optional bgzipped TSV copy next to the HT.
    if args.write_to_file:
        coverage_ht.export(f'{output_ht_path}.tsv.bgz')

    hl.stop()

    print("Done!")
def main(args):
    """
    Run a joint PCA of the exome cohort and the 1000 Genomes dataset.

    Unless ``args.skip_filter_step`` is set, the cohort MT and the 1KG MT are
    joined on shared rows, filtered to bi-allelic, high-callrate, common SNPs,
    LD-pruned and written to disk. The LD-pruned MT is then read back (so a
    previous run's output can be reused when the filter step is skipped), an
    HWE-normalized PCA is run on it and the per-sample scores are written as
    a Hail Table with one ``PC<i>`` field per component.

    :param args: parsed command-line namespace; the attributes read here are
        ``default_reference``, ``skip_filter_step``, ``exome_cohort``,
        ``ld_prune_r2``, ``overwrite``, ``n_pcs`` and ``write_to_file``
    """
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # Unfiltered cohort MT.
        mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

        # 1000 Genomes MT for the same reference.
        mt_1kg = get_1kg_mt(args.default_reference)

        # Inner join on rows; only the 'GT' entry field is kept on both sides
        # so that the entry schemas match.
        mt_joint = (mt.select_entries('GT').union_cols(
            mt_1kg.select_entries('GT'), row_join_type='inner'))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        mt_joint = (mt_joint.filter_rows(
            bi_allelic_expr(mt_joint)
            & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1])
            # mean alt-allele count / 2 is the alt allele frequency
            & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99))
            .naive_coalesce(1000))

        logger.info(
            "Checkpoint: writing joint filtered MT before LD pruning...")
        # The intermediate checkpoint is always overwritten (hard-coded).
        mt_joint = mt_joint.checkpoint(
            get_mt_checkpoint_path(
                dataset=args.exome_cohort,
                part='joint_1kg_high_callrate_common_snp_biallelic'),
            overwrite=True)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..."
        )
        # Remove correlated variants.
        pruned_variant_table = hl.ld_prune(mt_joint.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt_joint = (mt_joint.filter_rows(
            hl.is_defined(pruned_variant_table[mt_joint.row_key])))

        logger.info("Writing filtered joint MT with variants in LD pruned...")
        mt_joint.write(
            get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                           part='joint_high_callrate_common_snp_biallelic',
                           split=True,
                           ld_pruned=True),
            overwrite=args.overwrite)

    logger.info("Importing filtered joint MT...")
    mt_joint = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                       part='joint_high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA with {mt_joint.count_rows()} variants...")
    # PCA on the merged (cohort + 1KG) dataset; loadings are not used.
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT,
                                                      k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")  # TODO: save eigenvalues?

    # Expand the 'scores' array into independent PC1..PCn fields.
    pca_table = (pc_scores.annotate(
        **{f'PC{k + 1}': pc_scores.scores[k] for k in range(args.n_pcs)})
        .drop('scores'))

    logger.info("Writing HT with PCA results...")
    output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='joint_pca_1kg')
    # Honour --overwrite here as well; without it a re-run fails as soon as
    # the output table already exists, even when overwriting was requested.
    pca_table.write(output=output_ht_path, overwrite=args.overwrite)

    if args.write_to_file:
        pca_table.export(f'{output_ht_path}.tsv.bgz')

    hl.stop()

    print("Done!")
def main(args):
    """
    Run a PCA on the QC-passing samples of the exome cohort.

    Unless ``args.skip_filter_step`` is set, the adjusted-genotype MT is
    restricted to samples passing the final sample QC, filtered to bi-allelic,
    high-callrate, common SNPs, LD-pruned and written to disk. The LD-pruned
    MT is then read back, an HWE-normalized PCA is run on it, and the scores
    (one ``PC<i>`` field per component, eigenvalues stored as a global field)
    are checkpointed to ``args.output_ht``.

    :param args: parsed command-line namespace; the attributes read here are
        ``default_reference``, ``skip_filter_step``, ``exome_cohort``,
        ``maf_threshold``, ``ld_prune_r2``, ``overwrite``, ``n_pcs``,
        ``output_ht`` and ``write_to_file``
    """
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # MT of unphased, adjusted genotypes.
        adj_mt_path = get_qc_mt_path(dataset=args.exome_cohort,
                                     part='unphase_adj_genotypes',
                                     split=True)
        mt = hl.read_matrix_table(adj_mt_path)

        logger.info(
            "Filtering MT to samples passing QC filters (hard filters, relatedness, european ancestries)..."
        )
        passing_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
        passing_ht = passing_ht.filter(passing_ht.pass_filters)
        mt = mt.filter_cols(hl.is_defined(passing_ht[mt.col_key]))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        maf = args.maf_threshold
        is_biallelic_snp = (bi_allelic_expr(mt)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
        # mean alt-allele count / 2 is the alt allele frequency
        is_common = hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf
        is_high_callrate = hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99
        mt = mt.filter_rows(is_biallelic_snp & is_common & is_high_callrate)
        mt = mt.naive_coalesce(500)

        logger.info("Checkpoint: writing filtered MT before LD pruning...")
        checkpoint_path = get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='high_callrate_common_snp_biallelic')
        mt = mt.checkpoint(checkpoint_path, overwrite=args.overwrite)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants..."
        )
        # Keep only the variants retained by LD pruning.
        retained_variants = hl.ld_prune(mt.GT,
                                        r2=args.ld_prune_r2,
                                        bp_window_size=500000,
                                        memory_per_core=512)
        mt = mt.filter_rows(hl.is_defined(retained_variants[mt.row_key]))

        logger.info("Writing filtered MT with ld-pruned variants...")
        pruned_mt_path = get_qc_mt_path(
            dataset=args.exome_cohort,
            part='high_callrate_common_snp_biallelic',
            split=True,
            ld_pruned=True)
        mt.write(pruned_mt_path, overwrite=args.overwrite)

    logger.info("Importing filtered ld-pruned MT...")
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA on {mt.count_rows()} variants...")
    # PCA on the LD-pruned cohort MT; loadings are not used.
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")

    # Keep the eigenvalues with the scores as a global field.
    pc_scores = pc_scores.annotate_globals(eigenvalues=eigenvalues)

    # Expand the 'scores' array into independent PC1..PCn fields.
    pc_fields = {}
    for k in range(0, args.n_pcs):
        pc_fields['PC' + str(k + 1)] = pc_scores.scores[k]
    pca_table = pc_scores.annotate(**pc_fields).drop('scores')

    logger.info(f"Writing HT with PCA results...")
    output_ht_path = args.output_ht
    pca_table = pca_table.checkpoint(output=output_ht_path,
                                     overwrite=args.overwrite)

    if args.write_to_file:
        pca_table.export(f'{output_ht_path}.tsv.bgz')

    hl.stop()

    print("PCA pipeline finalised...")
def main(args):
    """
    Build the final sample QC table and release the MT of unphased, adjusted
    genotypes.

    Every sample of the raw split MT is annotated with its hard filters,
    predicted population, relatedness flag and population/platform-stratified
    filters. ``pass_filters`` is True when both filter collections are empty,
    the predicted population is 'EUR' and the sample is not flagged as
    related. The MT written at the end is NOT restricted to passing samples;
    only its entries are filtered to ``adj`` genotypes.

    :param args: parsed command-line namespace; the attributes read here are
        ``default_ref_genome``, ``exome_cohort``, ``overwrite`` and
        ``write_to_file``
    """
    hl.init(default_reference=args.default_ref_genome)

    cohort = args.exome_cohort

    # Raw split MT with all column annotations dropped; its samples seed the
    # final QC table.
    mt = get_mt_data(dataset=cohort, part='raw', split=True).select_cols()
    ht = mt.cols().key_by('s')

    # 1. Sample hard filters.
    hard_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=cohort, part='hard_filters'))

    # 2. Population QC.
    population_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=cohort, part='population_qc'))

    # 3. Relatedness: collect the ids of the samples to drop into a set.
    to_drop_ht = get_related_samples_to_drop()
    related_ids = hl.set(
        to_drop_ht.aggregate(hl.agg.collect_as_set(to_drop_ht.node.id)))

    # 4. Stratified (population/platform) sample QC filters.
    stratified_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=cohort,
                              part='stratified_metrics_filter'))

    ht = ht.annotate(
        hard_filters=hard_filters_ht[ht.s]['hard_filters'],
        predicted_pop=population_ht[ht.s]['predicted_pop'],
        is_related=related_ids.contains(ht.s),
        pop_platform_filters=stratified_ht[ht.s]['pop_platform_filters'],
    )

    # Joint expression combining all sample QC criteria.
    passes_all = ((hl.len(ht.hard_filters) == 0)
                  & (hl.len(ht.pop_platform_filters) == 0)
                  & (ht.predicted_pop == 'EUR')
                  & ~ht.is_related)
    ht = ht.annotate(pass_filters=hl.cond(passes_all, True, False))

    logger.info('Writing final sample qc HT to disk...')
    output_path_ht = get_sample_qc_ht_path(dataset=cohort, part='final_qc')
    ht = ht.checkpoint(output_path_ht, overwrite=args.overwrite)

    # Optional bgzipped TSV copy of the final sample QC annotations.
    if args.write_to_file:
        ht.export(f'{output_path_ht}.tsv.bgz')

    # Release MT: unphase, flag adjusted genotypes, keep only adj entries and
    # the entry fields needed downstream.
    release_mt = annotate_adj(unphase_mt(mt))
    release_mt = release_mt.filter_entries(release_mt.adj)
    release_mt = release_mt.select_entries('GT', 'DP', 'GQ', 'adj')

    logger.info('Writing unphase MT with adjusted genotypes to disk...')
    release_path = get_qc_mt_path(dataset=cohort,
                                  part='unphase_adj_genotypes',
                                  split=True)
    release_mt.write(release_path, overwrite=args.overwrite)

    hl.stop()

    print("Finished!")