def main(args):
    """Run the rare-variant Fisher-exact burden-test pipeline.

    Steps: (1) sample-QC filtering, (2) variant-QC filtering, (3) allele-
    frequency filtering against internal and external AF sources, (4) per
    gene/consequence-group aggregation followed by a Fisher exact test
    stratified by proband type. Intermediate MatrixTables are checkpointed
    to HDFS so individual steps can be skipped and resumed.
    """
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        # 10% row sample of chr20 keeps the test run fast
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable wih adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')
        mt = apply_sample_qc_filtering(mt)
        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        # FIX: MatrixTable.write returns None, so `mt = mt.write(...)` left
        # `mt` unusable when the _SUCCESS re-read below was skipped.
        # checkpoint writes AND returns the read-back MatrixTable.
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                           overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')
        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                           overwrite=True)

    # 3. Annotate AFs
    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:
        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()
        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        # keep only variants rare in EVERY AF source (logical AND)
        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]
        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT filtered to external maf with to disk...')
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                           overwrite=True)

    # 4. ##### Burden Test ######
    logger.info('Running burden test...')
    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()
    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))
    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    # missC: missense supported by >= 2 of 3 pathogenicity predictors
    score_expr_ann.update({
        'missC':
        (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                 (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                 (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })
    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })
    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))
    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2)
        | (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())

    # Select the genotype aggregation(s) requested on the command line;
    # each becomes a `mac` entry plus an `agg_genotype` row label.
    mts = []
    if args.homs:
        # select homs genotypes.
        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))
        mts.append(mt_homs)
    if args.chets:
        # select compound hets (chets) genotypes.
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))
        mts.append(mt_chets)
    if args.homs_chets:
        # select chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))
        mts.append(mt_homs_chets)
    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))
        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts (per proband stratum and controls)
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                                    hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run fet stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']
    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')
        colCases = None
        colTotalCases = None
        colControls = 'n_controls'
        colTotalControls = 'n_total_controls'
        if proband == 'all_cases':
            colCases = 'n_cases'
            colTotalCases = 'n_total_cases'
        if proband == 'syndromic':
            colCases = 'n_syndromic'
            colTotalCases = 'n_total_syndromic'
        if proband == 'nonsyndromic':
            colCases = 'n_nonsyndromic'
            colTotalCases = 'n_total_nonsyndromic'
        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=colCases,
                                      n_control_col=colControls,
                                      total_cases_col=colTotalCases,
                                      total_controls_col=colTotalControls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })
        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True))
        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)
    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'
    tb_final = (tb_final.checkpoint(output=output_path))
    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
def main(args):
    """Run a joint PCA of the exome cohort merged with 1000 Genomes samples.

    Optionally (unless --skip_filter_step) builds the joint MT, restricts it
    to bi-allelic, high-callrate, common SNPs, LD-prunes it and writes it to
    disk; then always reads the pruned MT back, runs HWE-normalized PCA and
    writes the per-sample PC scores as a Hail Table.
    """
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

        # Read MT from 1kgenome and keep only locus defined in interval
        mt_1kg = get_1kg_mt(args.default_reference)

        # Joining dataset (inner join). Keep only 'GT' entry field
        mt_joint = (mt.select_entries('GT').union_cols(
            mt_1kg.select_entries('GT'), row_join_type='inner'))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        mt_joint = (mt_joint.filter_rows(
            bi_allelic_expr(mt_joint)
            & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1])
            & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99)).
                    naive_coalesce(1000))

        logger.info(
            "Checkpoint: writing joint filtered MT before LD pruning...")
        mt_joint = mt_joint.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='joint_1kg_high_callrate_common_snp_biallelic'),
                                       overwrite=True)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt_joint.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt_joint = (mt_joint.filter_rows(
            hl.is_defined(pruned_variant_table[mt_joint.row_key])))

        logger.info("Writing filtered joint MT with variants in LD pruned...")
        (mt_joint.write(get_qc_mt_path(
            dataset=args.exome_cohort + '_1kg',
            part='joint_high_callrate_common_snp_biallelic',
            split=True,
            ld_pruned=True),
                        overwrite=args.overwrite))

    logger.info("Importing filtered joint MT...")
    mt_joint = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                       part='joint_high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA with {mt_joint.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT,
                                                      k=args.n_pcs)
    logger.info(f"Eigenvalues: {eigenvalues}")  # TODO: save eigenvalues?

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(0, args.n_pcs)}).drop('scores'))

    logger.info("Writing HT with PCA results...")
    # write as HT
    output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='joint_pca_1kg')
    # FIX: pass overwrite so reruns don't fail on an existing output HT
    # (the rest of this pipeline already honors args.overwrite).
    pca_table.write(output=output_ht_path, overwrite=args.overwrite)

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("Done!")
def main(args):
    """Run PCA on the exome cohort alone.

    Optionally (unless --skip_filter_step) restricts the adjusted-genotype MT
    to QC-passing samples and to bi-allelic, high-callrate, common SNPs,
    LD-prunes it and writes it to disk; then always reads the pruned MT back,
    runs HWE-normalized PCA and checkpoints the PC-score table.
    """
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=args.exome_cohort,
                           part='unphase_adj_genotypes',
                           split=True))

        # filter to samples passing QC filters
        logger.info(
            "Filtering MT to samples passing QC filters (hard filters, relatedness, european ancestries)..."
        )
        sample_qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
        sample_qc_ht = (sample_qc_ht.filter(sample_qc_ht.pass_filters))
        mt = (mt.filter_cols(hl.is_defined(sample_qc_ht[mt.col_key])))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        maf = args.maf_threshold
        # mean(n_alt_alleles)/2 is the alternate-allele frequency;
        # callrate must exceed 0.99
        mt = (mt.filter_rows(
            bi_allelic_expr(mt)
            & hl.is_snp(mt.alleles[0], mt.alleles[1])
            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf)
            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)).naive_coalesce(
                500))

        logger.info("Checkpoint: writing filtered MT before LD pruning...")
        mt = mt.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='high_callrate_common_snp_biallelic'),
                           overwrite=args.overwrite)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt = (mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key])))

        logger.info("Writing filtered MT with ld-pruned variants...")
        (mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                                 part='high_callrate_common_snp_biallelic',
                                 split=True,
                                 ld_pruned=True),
                  overwrite=args.overwrite))

    logger.info("Importing filtered ld-pruned MT...")
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA on {mt.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs)
    logger.info(f"Eigenvalues: {eigenvalues}")

    # Annotate eigenvalues as global field
    pc_scores = (pc_scores.annotate_globals(**{'eigenvalues': eigenvalues}))

    # Annotate PC array as independent fields (PC1..PCk).
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(0, args.n_pcs)}).drop('scores'))

    logger.info(f"Writing HT with PCA results...")
    # write as HT
    output_ht_path = args.output_ht
    pca_table = (pca_table.checkpoint(output=output_ht_path,
                                      overwrite=args.overwrite))

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("PCA pipeline finalised...")
def main(args):
    """Build the final per-variant QC table.

    Combines hard filters (inbreeding coefficient, AC0), the VQSR filter
    field, a random-forest filter, gnomAD genome coverage and capture-interval
    membership into one `pass_variant_qc_filters` flag, summarizes filter
    counts as a global field and checkpoints the result.
    """
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import adj genotype MT and remove samples failing QC
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='sample_qc_adj_genotypes',
                       split=True))

    # keep samples passing QC filtering
    mt = (mt.filter_cols(mt.pass_filters).select_cols().select_rows())

    # import variant info fields (vcf info)
    variant_info_ht = (get_vep_annotation_ht().drop('vep'))

    # Add useful annotation for variant hard filter
    ht = (
        mt.annotate_rows(
            inbreeding_coeff=variant_info_ht[mt.row_key].info.InbreedingCoeff,
            vqsr_filter=variant_info_ht[mt.row_key].filters,
            VQSLOD=variant_info_ht[mt.row_key].info.VQSLOD,
            gt_counts=hl.agg.count_where(hl.is_defined(
                mt.GT))  # expected MT filtered to high-quality GT
        ).rows())

    # 1. Apply variant hard filters
    # hard filter expression
    variant_hard_filter_expr = {
        'fail_inbreeding_coeff':
        ht.inbreeding_coeff < INBREEDING_COEFFICIENT_CUTOFF,
        'AC0': ht.gt_counts == 0
    }
    ht = (ht.annotate(**variant_hard_filter_expr))

    # 2. Apply VQSR filter (non-empty `filters` set means VQSR-failed)
    ht = (ht.annotate(fail_vqsr=hl.len(ht.vqsr_filter) != 0))

    # 3. Apply RF filter
    # import/parse rf final HT
    ht_rf = hl.read_table(get_variant_qc_ht_path(part='rf_result'))
    ht_rf = (ht_rf.select(rf_probability_tp=ht_rf.rf_probability['TP'],
                          variant_type=ht_rf.variant_type))
    ht = (ht.annotate(**ht_rf[ht.key]))
    # fail when TP probability is below the type-specific cutoff
    ht = (ht.annotate(fail_rf=hl.case().when(
        (ht.rf_probability_tp < RF_PROBABILITY_SNV_CUTOFF)
        & (ht.variant_type == 'snv'), True).when(
            (ht.rf_probability_tp < RF_PROBABILITY_INDEL_CUTOFF)
            & (ht.variant_type == 'indel'), True).default(False)))

    # 4. Apply coverage/capture interval filters
    ## gnomad genome coverage
    gnomad_coverage_ht = get_gnomad_genomes_coverage_ht().key_by()
    # locus arrives as a string; re-parse and re-key to enable the join
    gnomad_coverage_ht = (gnomad_coverage_ht.annotate(locus=hl.parse_locus(
        gnomad_coverage_ht.locus, reference_genome='GRCh38')).key_by('locus'))
    ht = (ht.annotate(gnomad_cov_10X=gnomad_coverage_ht[ht.locus].over_10))
    # covered: >= 90% of gnomAD genomes at 10X on this locus
    ht = (ht.annotate(is_coveraged_gnomad_genomes=ht.gnomad_cov_10X >= 0.9))

    ## defined in capture intervals
    # filter to capture intervals (intersect)
    ht_defined_intervals = filter_capture_intervals(ht)
    ht = (ht.annotate(is_defined_capture_intervals=hl.is_defined(
        ht_defined_intervals[ht.key])))

    # 5. Summary final variant QC
    # final variant qc filter joint expression
    final_variant_qc_ann_expr = {
        'pass_variant_qc_filters':
        hl.cond(
            ~ht.fail_inbreeding_coeff & ~ht.AC0 & ~ht.fail_vqsr & ~ht.fail_rf
            & ht.is_coveraged_gnomad_genomes & ht.is_defined_capture_intervals,
            True, False)
    }
    ht = (ht.annotate(**final_variant_qc_ann_expr))

    # Counts the number of variants (snv and indels) affected by every filter and add as global field
    filter_flags = [
        'fail_inbreeding_coeff', 'AC0', 'fail_vqsr', 'fail_rf',
        'is_coveraged_gnomad_genomes', 'is_defined_capture_intervals',
        'pass_variant_qc_filters'
    ]
    summary_filter_expr = {
        v: hl.struct(
            **{
                f: hl.agg.filter(ht.variant_type == v, hl.agg.counter(ht[f]))
                for f in filter_flags
            })
        for v in ['snv', 'indel']
    }
    ht = ht.annotate_globals(
        summary_filter=ht.aggregate(summary_filter_expr, _localize=False))

    # write HT variant QC final table
    output_path = get_variant_qc_ht_path(dataset=args.exome_cohort,
                                         part='final_qc')
    ht = ht.checkpoint(output_path, overwrite=args.overwrite)

    # print filter summary
    logger.info(f'Variant QC filter summary: {ht.summary_filter.collect()}')

    # export HT to file
    if args.write_to_file:
        ht.export(f'{output_path}.tsv.bgz')

    # Stop Hail
    hl.stop()

    print("Finished!")
def main(args):
    """Export a per-variant carrier table for a user-supplied gene set.

    Filters the adjusted-genotype MT to the gene set, optionally applies
    sample/variant/AF QC filtering, annotates VEP fields, pathogenicity
    scores and blinded carrier IDs, and exports one row per variant with
    het/hom counts split by case, syndromic, nonsyndromic and control.
    """
    ## Init Hail
    hl.init(default_reference=args.default_ref_genome)

    ## Import unfiltered MT with adjusted genotypes
    ds = args.exome_cohort
    mt = hl.read_matrix_table(get_qc_mt_path(dataset=ds,
                                             part='unphase_adj_genotypes',
                                             split=True))

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()
    mt = (mt
          .annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                         Consequence=vep_ht[mt.row_key].vep.Consequence,
                         DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                         SYMBOL=vep_ht[mt.row_key].vep.SYMBOL)
          )

    ## Parse geneset
    geneset = parse_geneset(args.geneset_file)

    ## Filter to geneset (checkpoint keeps downstream steps small)
    mt = (mt
          .filter_rows(hl.set(geneset).contains(mt.SYMBOL))
          .checkpoint(f'{nfs_tmp}/tmp.mt', overwrite=True)
          )

    ## Sample-QC filtering
    if args.apply_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info('Writing sample qc-filtered MT to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                          overwrite=True)
              )

    ## Variant-QC filtering
    if args.apply_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info('Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                          overwrite=True)
              )

    ## Filtering by AFs
    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if args.apply_af_filtering:
        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt
              .annotate_rows(**af_ht[mt.row_key])
              )

        # keep only variants rare in EVERY AF source (logical AND)
        filter_expressions = [af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
                              ]

        mt = (mt
              .filter_rows(functools.reduce(operator.iand, filter_expressions),
                           keep=True)
              )

        logger.info('Writing AF-filtered MT to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                          overwrite=True)
              )

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Generate blind sample IDs (BLIND_ID_<column index>)
    mt = mt.add_col_index()
    mt = (mt
          .annotate_cols(BIID=hl.str('BLIND_ID_') + hl.str(mt.col_idx))
          )

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt
          .annotate_cols(**tb_sample[mt.s])
          )
    mt = (mt
          .filter_cols(mt['phe.is_case'] | mt['phe.is_control'])
          )

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Annotate variants ID
    mt = annotate_variant_id(mt)

    # annotate samples: per-variant carrier counts and blinded carrier IDs,
    # split by genotype (het/hom) and phenotype stratum
    ann_expr = {'n_het_cases': hl.agg.filter(mt.GT.is_het() & mt['phe.is_case'], hl.agg.count()),
                'n_hom_cases': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_case'], hl.agg.count()),
                'n_het_syndromic': hl.agg.filter(mt.GT.is_het() & mt['phe.is_syndromic'], hl.agg.count()),
                'n_hom_syndromic': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_syndromic'], hl.agg.count()),
                'n_het_nonsyndromic': hl.agg.filter(mt.GT.is_het() & mt['phe.is_nonsyndromic'], hl.agg.count()),
                'n_hom_nonsyndromic': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_nonsyndromic'], hl.agg.count()),
                'n_het_controls': hl.agg.filter(mt.GT.is_het() & ~mt['phe.is_case'], hl.agg.count()),
                'n_hom_controls': hl.agg.filter(mt.GT.is_hom_var() & ~mt['phe.is_case'], hl.agg.count()),
                'het_case_ids': hl.agg.filter(mt.GT.is_het() & mt['phe.is_case'],
                                              hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
                'hom_case_ids': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_case'],
                                              hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
                'het_control_ids': hl.agg.filter(mt.GT.is_het() & ~mt['phe.is_case'],
                                                 hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
                'hom_control_ids': hl.agg.filter(mt.GT.is_hom_var() & ~mt['phe.is_case'],
                                                 hl.delimit(hl.agg.collect_as_set(mt.BIID), '|'))
                }

    ht = (mt
          .annotate_rows(**ann_expr)
          .rows()
          .key_by()
          .select(*list(['vid', 'Consequence', 'SYMBOL', 'internal_af',
                         'gnomAD_AF', 'vep.MVP_score', 'vep.REVEL_score',
                         'vep.MPC_score', 'vep.CADD_PHRED']) +
                  list(ann_expr.keys()))
          )

    # export results
    (ht
     .export(args.output_file)
     )
def main(args):
    """Build the final per-sample QC table and release the QCed genotype MT.

    Joins hard-filter, population, relatedness and stratified-metric QC
    annotations into one `pass_filters` flag per sample, checkpoints the
    table, then writes an unphased MT restricted to adj-passing genotypes.
    """
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import raw split MT
    mt = (get_mt_data(dataset=args.exome_cohort, part='raw',
                      split=True).select_cols())
    ht = (mt.cols().key_by('s'))

    # Annotate samples filters
    sample_qc_filters = {}

    # 1. Add sample hard filters annotation expr
    sample_qc_hard_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort,
                              part='hard_filters'))
    sample_qc_filters.update(
        {'hard_filters': sample_qc_hard_filters_ht[ht.s]['hard_filters']})

    # 2. Add population qc filters annotation expr
    sample_qc_pop_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort,
                              part='population_qc'))
    sample_qc_filters.update(
        {'predicted_pop': sample_qc_pop_ht[ht.s]['predicted_pop']})

    # 3. Add relatedness filters annotation expr
    related_samples_to_drop = get_related_samples_to_drop()
    # collect the related-sample IDs locally so membership is a set lookup
    related_samples = hl.set(
        related_samples_to_drop.aggregate(
            hl.agg.collect_as_set(related_samples_to_drop.node.id)))
    sample_qc_filters.update({'is_related': related_samples.contains(ht.s)})

    # 4. Add stratified sample qc (population/platform) annotation expr
    sample_qc_pop_platform_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort,
                              part='stratified_metrics_filter'))
    sample_qc_filters.update({
        'pop_platform_filters':
        sample_qc_pop_platform_filters_ht[ht.s]['pop_platform_filters']
    })

    ht = (ht.annotate(**sample_qc_filters))

    # Final sample qc filter joint expression: pass = no hard filters,
    # no pop/platform filters, predicted European ancestry, not related
    final_sample_qc_ann_expr = {
        'pass_filters':
        hl.cond((hl.len(ht.hard_filters) == 0)
                & (hl.len(ht.pop_platform_filters) == 0)
                & (ht.predicted_pop == 'EUR') & ~ht.is_related, True, False)
    }
    ht = (ht.annotate(**final_sample_qc_ann_expr))

    logger.info('Writing final sample qc HT to disk...')
    output_path_ht = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='final_qc')
    ht = ht.checkpoint(output_path_ht, overwrite=args.overwrite)

    # Export final sample QC annotations to file
    if args.write_to_file:
        (ht.export(f'{output_path_ht}.tsv.bgz'))

    ## Release final unphase MT with adjusted genotypes filtered
    mt = unphase_mt(mt)
    mt = annotate_adj(mt)
    # keep only adj-passing entries and a minimal set of entry fields
    mt = mt.filter_entries(mt.adj).select_entries('GT', 'DP', 'GQ', 'adj')

    logger.info('Writing unphase MT with adjusted genotypes to disk...')
    # write MT
    mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                            part='unphase_adj_genotypes',
                            split=True),
             overwrite=args.overwrite)

    # Stop Hail
    hl.stop()

    print("Finished!")
def main(args):
    """Run the gene-set logistic-regression burden-test pipeline.

    Steps: (1) sample-QC filtering, (2) variant-QC filtering, (3) allele-
    frequency filtering, (4) two-step aggregation — per gene/consequence
    group, then per gene-set/consequence group — followed by a logistic
    regression of the aggregated allele count against case status,
    stratified by proband type. Intermediates are checkpointed to HDFS.
    """
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        # 10% row sample of chr20 keeps the test run fast
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable wih adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')
        mt = apply_sample_qc_filtering(mt)
        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        # FIX: MatrixTable.write returns None, so `mt = mt.write(...)` left
        # `mt` unusable when the _SUCCESS re-read below was skipped.
        # checkpoint writes AND returns the read-back MatrixTable.
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                           overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')
        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                           overwrite=True)

    # 3. Annotate AFs
    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:
        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()
        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        # keep only variants rare in EVERY AF source (logical AND)
        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]
        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            f'Writing sample/variant QCed MT with rare variants at maf: {args.af_max_threshold}.'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                           overwrite=True)

    # 4. ##### Run gene-set burden logistic regression ######
    logger.info('Running gene-set burden logistic regression test...')
    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()
    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    # missC: missense supported by >= 2 of 3 pathogenicity predictors
    score_expr_ann.update({
        'missC':
        (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                 (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                 (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })
    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })
    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))
    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group and gene clusters before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # First-step aggregation:
    # Generate a sample per gene/variant_type (binary) matrix aggregating genotypes as follow:
    #
    # a) entry: hets
    # b) entry: homs
    # c) entry: chets (compound hets)
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(
            mt.GT.is_het()) >= 2).repartition(100).persist())

    # Import/generate gene clusters
    clusters = hl.import_table(args.set_file,
                               no_header=True,
                               delimiter="\t",
                               min_partitions=50,
                               impute=False)
    clusters = generate_clusters_map(clusters)

    # Annotate gene-set info
    mt_grouped = (mt_grouped.annotate_rows(**clusters[mt_grouped.SYMBOL]))

    # Explode nested csq_group before grouping
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_id))

    # filter rows with defined consequence and gene-set name
    mt_grouped = (mt_grouped.filter_rows(
        hl.is_defined(mt_grouped.csq_group)
        & hl.is_defined(mt_grouped.cluster_id)))

    # 2. Second-step aggregation
    # Generate a sample per gene-sets/variant type matrix aggregating genotypes as follow:
    # if dominant -> sum hets (default)
    # if recessive -> sum (homs)
    # if recessive (a) -> sum (chets)
    # if recessive (b) -> sum (chets and/or homs)
    mts = []
    if args.homs:
        # Group mt by gene-sets/csq_group aggregating homs genotypes.
        mt_homs = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.homs))).repartition(
                    100).persist().annotate_rows(agg_genotype='homs'))
        mts.append(mt_homs)
    if args.chets:
        # Group mt by gene-sets/csq_group aggregating compound hets (chets) genotypes.
        mt_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.chets))).repartition(
                    100).persist().annotate_rows(agg_genotype='chets'))
        mts.append(mt_chets)
    if args.homs_chets:
        # Group mt by gene-sets/csq_group aggregating chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group,
            mt_grouped.cluster_id).aggregate(mac=hl.int(
                hl.agg.count_where(mt_grouped.chets
                                   | mt_grouped.homs))).repartition(100).
                         persist().annotate_rows(agg_genotype='homs_chets'))
        mts.append(mt_homs_chets)
    if args.hets:
        # Group mt by gene-sets/csq_group aggregating hets genotypes (default)
        mt_hets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.hets))).repartition(
                    100).persist().annotate_rows(agg_genotype='hets'))
        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_joint = hl.MatrixTable.union_rows(*mts)

    ## Add samples annotations
    # annotate sample covs
    covariates = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_covariates.ht')
    mt_joint = (mt_joint.annotate_cols(**covariates[mt_joint.s]))

    # annotate case/control phenotype info
    tb_sample = get_sample_meta_data()
    mt_joint = (mt_joint.annotate_cols(**tb_sample[mt_joint.s]))
    mt_joint = (mt_joint.filter_cols(mt_joint['phe.is_case']
                                     | mt_joint['phe.is_control']))

    ## Run logistic regression stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']
    tbs = []
    covs = ['sex', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    for proband in analysis:
        logger.info(f'Running burden test for {proband}...')
        # FIX: the original placeholder `mt_tmp = hl.MatrixTable` bound the
        # class object; an if/elif chain over the fixed `analysis` values
        # guarantees mt_tmp is always a real MatrixTable.
        if proband == 'all_cases':
            mt_tmp = mt_joint
        elif proband == 'syndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_nonsyndromic'])
        else:  # 'nonsyndromic'
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_syndromic'])
        tb_logreg = logistic_regression(mt=mt_tmp,
                                        x_expr='mac',
                                        response='phe.is_case',
                                        covs=covs,
                                        pass_through=['agg_genotype'],
                                        extra_fields={
                                            'analysis': proband,
                                            'maf': maf_cutoff,
                                            'covs': '|'.join(covs)
                                        })
        tbs.append(tb_logreg)

    tb_final = hl.Table.union(*tbs)

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.logreg_burden.{run_hash}.ht'
    tb_final = (tb_final.checkpoint(output=output_path))
    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()