def main(args):
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Read Hail MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # compute variant qc
    mt = hl.variant_qc(mt)

    # write variant qc HailTable
    tb_variant_qc = (mt
                     .select_rows('variant_qc')
                     .rows()
                     .flatten()
                     .key_by('locus', 'alleles')
                     )

    output_path_ht = f'{args.ht_output_path}_variant_qc.ht'
    tb_variant_qc.write(output=output_path_ht)

    if args.write_to_file:
        (hl.read_table(output_path_ht)
         .export(f'{output_path_ht}_variant_qc.tsv.bgz')
         )

    # Stop Hail
    hl.stop()

    print("Finished!")
def main(args): # Start Hail hl.init(default_reference=args.default_ref_genome) # Import unfiltered split MT mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered') # Compute stratified sample_qc (biallelic and multi-allelic sites) sample_qc_ht = compute_sample_qc(mt) # Write HT with sample QC metrics sample_qc_ht = sample_qc_ht.checkpoint(get_sample_qc_ht_path( dataset=args.exome_cohort, part='high_conf_autosomes'), overwrite=args.overwrite, _read_if_exists=not args.overwrite) # annotate sample population and platform qc info pop_qc = hl.read_table(get_sample_qc_ht_path(part='population_qc')) platform_qc = hl.read_table(get_sample_qc_ht_path(part='platform_pca')) ann_expr = { 'qc_pop': pop_qc[sample_qc_ht.s].predicted_pop, 'qc_platform': platform_qc[sample_qc_ht.s].qc_platform } sample_qc_ht = sample_qc_ht.annotate(**ann_expr) # Export HT to file if args.write_to_file: (sample_qc_ht.flatten().export( f"{get_sample_qc_ht_path(dataset=args.exome_cohort, part='high_conf_autosomes')}.tsv.bgz" )) # Apply stratified sample filters based on defined QC metrics exome_qc_metrics = [ 'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion', 'n_deletion', 'r_het_hom_var' ] print('Computing stratified metrics filters...') exome_pop_platform_filter_ht = compute_stratified_metrics_filter( sample_qc_ht, exome_qc_metrics, ['qc_pop', 'qc_platform']) exome_pop_platform_filter_ht = exome_pop_platform_filter_ht.checkpoint( get_sample_qc_ht_path(dataset=args.exome_cohort, part='stratified_metrics_filter'), overwrite=args.overwrite, _read_if_exists=not args.overwrite) # Export HT to file if args.write_to_file: (exome_pop_platform_filter_ht.export( f"{get_sample_qc_ht_path(dataset=args.exome_cohort, part='stratified_metrics_filter')}.tsv.bgz" )) # Stop Hail hl.stop() print("Finished!")
def download_data(): global _data_dir, _mt _data_dir = os.environ.get('HAIL_BENCHMARK_DIR', '/tmp/hail_benchmark_data') print(f'using benchmark data directory {_data_dir}') os.makedirs(_data_dir, exist_ok=True) files = map(lambda f: os.path.join(_data_dir, f), [ 'profile.vcf.bgz', 'profile.mt', 'table_10M_par_1000.ht', 'table_10M_par_100.ht', 'table_10M_par_10.ht', 'gnomad_dp_simulation.mt', 'many_strings_table.ht' ]) if not all(os.path.exists(file) for file in files): hl.init() # use all cores vcf = os.path.join(_data_dir, 'profile.vcf.bgz') print('files not found - downloading...', end='', flush=True) urlretrieve( 'https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz', vcf) print('done', flush=True) print('importing...', end='', flush=True) hl.import_vcf(vcf, min_partitions=16).write(os.path.join( _data_dir, 'profile.mt'), overwrite=True) ht = hl.utils.range_table( 10_000_000, 1000).annotate(**{f'f_{i}': hl.rand_unif(0, 1) for i in range(5)}) ht = ht.checkpoint(os.path.join(_data_dir, 'table_10M_par_1000.ht'), overwrite=True) ht = ht.naive_coalesce(100).checkpoint(os.path.join( _data_dir, 'table_10M_par_100.ht'), overwrite=True) ht.naive_coalesce(10).write(os.path.join(_data_dir, 'table_10M_par_10.ht'), overwrite=True) mt = hl.utils.range_matrix_table(n_rows=250_000, n_cols=1_000, n_partitions=32) mt = mt.annotate_entries(x=hl.int(hl.rand_unif(0, 4.5)**3)) mt.write(os.path.join(_data_dir, 'gnomad_dp_simulation.mt'), overwrite=True) print('downloading many strings table...') mst_tsv = os.path.join(_data_dir, 'many_strings_table.tsv.bgz') mst_ht = os.path.join(_data_dir, 'many_strings_table.ht') urlretrieve( 'https://storage.googleapis.com/hail-common/benchmark/many_strings_table.tsv.bgz', mst_tsv) print('importing...') hl.import_table(mst_tsv).write(mst_ht, overwrite=True) hl.stop() else: print('all files found.', flush=True)
def main(args): # init hail hl.init(default_reference=args.default_ref_genome) # input MT mt = hl.read_matrix_table(args.mt_input_path) # filter high-quality genotype # mt = filter_genotypes_ab(mt) # import capture interval table (intersect) intervals = hl.read_table(args.ht_intervals) # generate an interval x sample MT by computing per intervals callrate mt_callrate = compute_callrate_mt(mt=mt, intervals_ht=intervals) # run pca eigenvalues, ht_pca, _ = run_platform_pca( callrate_mt=mt_callrate, binarization_threshold=args.binarization_threshold) # normalize eigenvalues (0-100) eigenvalues_norm = [x / sum(eigenvalues) * 100 for x in eigenvalues] # compute eigenvalues cumulative sum ev_cumsum = hl.array_scan(lambda i, j: i + j, 0, hl.array(eigenvalues_norm)) # getting optimal number of PCs (those which explain 99% of the variance) n_optimal_pcs = hl.eval(hl.len(ev_cumsum.filter(lambda x: x < 99.0))) logger.info( f"Keep only principal components which explain up to 99% of the variance. Number of optimal PCs found: {n_optimal_pcs}" ) # filter out uninformative PCs ht_pca = ht_pca.annotate(scores=ht_pca.scores[:n_optimal_pcs]) # apply unsupervised clustering on PCs to infer samples platform ht_platform = assign_platform_from_pcs( platform_pca_scores_ht=ht_pca, pc_scores_ann='scores', hdbscan_min_cluster_size=args.hdbscan_min_cluster_size, hdbscan_min_samples=args.hdbscan_min_cluster_size) ht_platform.show() # write HT ht_platform.write(output=args.ht_output_path, overwrite=args.overwrite) # export to file if true if args.write_to_file: (ht_platform.export(f'{args.ht_output_path}.tsv.bgz')) hl.stop()
def test_init_hail_context_twice(self):
    hl.init(idempotent=True)  # Should be no error
    hl.stop()

    hl.init(idempotent=True)
    hl.experimental.define_function(lambda x: x + 2, hl.tint32)
    # ensure functions are cleaned up without error
    hl.stop()

    hl.init(idempotent=True)  # Should be no error
    hl.init(hl.spark_context(), idempotent=True)  # Should be no error
def ensure_resources(data_dir, resources):
    logging.info(f'using benchmark data directory {data_dir}')
    os.makedirs(data_dir, exist_ok=True)

    to_create = []
    for rg in resources:
        if not rg.exists(data_dir):
            to_create.append(rg)

    if to_create:
        hl.init()
        for rg in to_create:
            rg.create(data_dir)
        hl.stop()
def main(args):
    # Start Hail on local mode
    hl.init()

    # getting list of VCF files from given path
    vcf_files_list = get_files_names(args.vcf_path, ext='vcf.gz')

    # import VCF(s) as Hail MatrixTable
    mt = hl.import_vcf(vcf_files_list, force_bgz=args.force_bgz)

    # write MatrixTable
    mt.write(output=args.output_path, overwrite=args.overwrite)

    # Stop Hail
    hl.stop()
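A driver like the one above is normally launched through argparse; the following is a minimal, hypothetical sketch of the entry point it expects, where the flag names simply mirror the attributes used in the function body and are otherwise an assumption.

if __name__ == '__main__':
    import argparse

    # Hypothetical CLI wiring for the main() above; adjust flag names to
    # match the real script if they differ.
    parser = argparse.ArgumentParser()
    parser.add_argument('--vcf_path', required=True,
                        help='directory containing the input vcf.gz files')
    parser.add_argument('--output_path', required=True,
                        help='path of the MatrixTable to write')
    parser.add_argument('--force_bgz', action='store_true')
    parser.add_argument('--overwrite', action='store_true')

    main(parser.parse_args())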
def main(): p = argparse.ArgumentParser() p.add_argument('input_dataset', help='input VCF file') p.add_argument( '--matrixtable-file', help= 'file name (includes path) of the MatrixTable for data imported from VCF input' ) p.add_argument( '--overwrite-matrixtable', action='store_true', help='always import vcf data ignoring any existing matrixtable file') p.add_argument('--skip-sample-subset', action='store_true') p.add_argument('--ignore-missing-samples', action='store_true') p.add_argument('--project-guid', required=True, help='the guid of the target seqr project') p.add_argument('--gencode-release', type=int, default=29) p.add_argument('--gencode-path', help='path for downloaded Gencode data') p.add_argument('--es-host', default='localhost') p.add_argument('--es-port', default='9200') p.add_argument('--num-shards', type=int, default=1) p.add_argument('--block-size', type=int, default=2000) args = p.parse_args() start_time = time.time() hl.init() mt = load_mt(args.input_dataset, args.matrixtable_file, args.overwrite_matrixtable) mt = subset_mt(args.project_guid, mt, skip_sample_subset=args.skip_sample_subset, ignore_missing_samples=args.ignore_missing_samples) rows = annotate_fields(mt, args.gencode_release, args.gencode_path) export_to_es(rows, args.input_dataset, args.project_guid, args.es_host, args.es_port, args.block_size, args.num_shards) logger.info( 'Total time for subsetting, annotating, and exporting: {}'.format( time.time() - start_time)) hl.stop()
def download_data(data_dir, group=None):
    logging.info(f'using benchmark data directory {data_dir}')
    os.makedirs(data_dir, exist_ok=True)

    if group:
        resources = [r for r in all_resources if r.name() == group]
        if not resources:
            raise RuntimeError(f"no group {group!r}")
    else:
        resources = all_resources

    to_create = []
    for rg in resources:
        if not rg.exists(data_dir):
            to_create.append(rg)

    if to_create:
        hl.init()
        for rg in to_create:
            rg.create(data_dir)
        hl.stop()
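The `group` argument above is matched against `r.name()` for each entry of `all_resources`, so valid group names depend entirely on how that list is populated. A hedged usage sketch, not taken from the original module:

# List the available resource groups, then create any that are missing.
# Passing group=None (the default) materializes every resource group.
for r in all_resources:
    print(r.name())

download_data('/tmp/hail_benchmark_data', group=None)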
def main(args): # nfs_dir = 'file:///home/ubuntu/data' hl.init(default_reference=args.default_reference) logger.info("Importing data...") # import unfiltered MT mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered') # keep bi-allelic variants mt = (mt .filter_rows(bi_allelic_expr(mt), keep=True) ) # read intervals for filtering variants (used mainly for exomes) def _get_interval_table(interval: str) -> Union[None, hl.Table]: return get_capture_interval_ht(name=interval, reference=args.default_reference) if interval is not None else interval ht = compute_mean_coverage(mt=mt, normalization_contig=args.normalization_contig, included_calling_intervals=_get_interval_table(args.interval_to_include), excluded_calling_intervals=_get_interval_table(args.interval_to_exclude), chr_x=args.chr_x, chr_y=args.chr_y) logger.info("Exporting data...") # write HT output_ht_path = get_sample_qc_ht_path(part='sex_chrom_coverage') ht.write(output=output_ht_path, overwrite=args.overwrite) # export to file if true if args.write_to_file: (ht .export(f'{output_ht_path}.tsv.bgz') ) hl.stop() print("Done!")
def main(args):
    # Start Hail on local mode
    hl.init(default_reference='GRCh38')

    # getting list of VCF files from given path
    # vcf_files_list = get_files_names(args.vcf_path, ext='vcf.gz')

    # import VCF(s) as Hail MatrixTable
    mt = hl.import_vcf(path=args.vcf_path, force_bgz=args.force_bgz)

    if args.split_multi:
        mt = hl.split_multi_hts(mt)

    # write MatrixTable
    mt.write(output=args.output_path, overwrite=args.overwrite)

    # Stop Hail
    hl.stop()

    print("Finished!")
def main(args): # Start Hail hl.init(default_reference=args.default_ref_genome) # Import unfiltered split MT mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered') # Compute stratified sample_qc (biallelic and multi-allelic sites) sample_qc_ht = compute_sample_qc(mt) # Write HT with sample QC metrics output_path = ( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_qc.high_conf.autosomes.cds.capture_intervals.rare_common.ht' ) sample_qc_ht = sample_qc_ht.checkpoint(output_path, overwrite=args.overwrite, _read_if_exists=not args.overwrite) # annotate sample population and platform qc info pop_qc = hl.read_table(get_sample_qc_ht_path(part='population_qc')) platform_qc = hl.read_table(get_sample_qc_ht_path(part='platform_pca')) ann_expr = { 'qc_pop': pop_qc[sample_qc_ht.s].predicted_pop, 'qc_platform': platform_qc[sample_qc_ht.s].qc_platform } sample_qc_ht = sample_qc_ht.annotate(**ann_expr) # Export HT to file if args.write_to_file: (sample_qc_ht.flatten().export(f"{output_path}.tsv.bgz")) # Stop Hail hl.stop() print("Finished!")
def main(args): # init hail hl.init(default_reference=args.default_ref_genome) # import MT mt = hl.read_matrix_table(args.mt_input_path) n_variants, n_samples = mt.count() # Getting variant table. Basically, a table keyed by <locus> or <locus, alleles> # with all variants in the dataset and no extra fields (a.k.a reference table). tb_variants = (mt.select_rows().rows()) # compute overall coverage if args.compute_overall_coverage: logger.info( f"Computing coverage stats for {n_variants} variant over {n_samples} samples..." ) ht_cov_overall = compute_coverage_stats(mt=mt, reference_ht=tb_variants) tb_variants = (tb_variants.annotate( overall=ht_cov_overall[tb_variants.key])) # compute coverage stratified by phenotype status (expected binary) # force the input MT to have a case_control bool filed (is_case) # *** if args.compute_phe_coverage: logger.info( f"Computing coverage stats stratified by phenotype status...") # Annotate sample meta info # Note: Temporal solution, better to import annotated MT mt = (mt.annotate_cols(**get_sample_meta_data()[mt.col_key])) mt = (mt.annotate_cols( case_control=hl.if_else(mt[args.phe_field], 'case', 'control'))) strata = (mt.aggregate_cols(hl.agg.collect_as_set(mt['case_control']))) dict_strata_ht = { s: compute_coverage_stats(mt=mt.filter_cols(mt['case_control'] == s), reference_ht=tb_variants) for s in strata } for k in dict_strata_ht.keys(): _tb = dict_strata_ht.get(k) tb_variants = tb_variants.annotate(**{k: _tb[tb_variants.key]}) if args.run_binomial_test: logger.info(f"Running binomial test...") # perform a binomial test on coverage and case/control status # DOI: https://doi.org/10.1002/acn3.582 tb_binomial = (tb_variants.annotate( n_cases_over_10=hl.int(tb_variants.case.over_10 * 100), n_controls_over_10=hl.int(tb_variants.control.over_10 * 100), total_cases=tb_variants.case.n_samples, total_controls=tb_variants.control.n_samples, ).select('n_cases_over_10', 'n_controls_over_10', 'total_cases', 'total_controls')) binomial_expr = { 'p_value': hl.binom_test( x=tb_binomial.n_cases_over_10, n=tb_binomial.n_cases_over_10 + tb_binomial.n_controls_over_10, p=tb_binomial.total_cases / (tb_binomial.total_cases + tb_binomial.total_controls), alternative='two.sided') } tb_binomial = (tb_binomial.annotate(**binomial_expr)) tb_variants = (tb_variants.annotate( binomial_stats=tb_binomial[tb_variants.key])) # make coverage filter expressions # Note: the default number of reads is set to 10X logger.info(f"Assigning per site coverage filters...") significant_level = args.pvalue_threshold min_sample_prop = args.min_sample_proportion coverage_filter_dict_expr = {} if args.compute_overall_coverage: coverage_filter_dict_expr.update({ 'overall_hard_cutoff': hl.if_else((tb_variants.overall.over_10 >= min_sample_prop), "pass", "fail") }) if args.compute_phe_coverage: # DOI: https://doi.org/10.1016/j.ajhg.2018.08.016 coverage_filter_dict_expr.update({ 'phe_hard_cutoff': hl.if_else((tb_variants.case.over_10 >= min_sample_prop) & (tb_variants.control.over_10 >= min_sample_prop), "concordant", "discordant") }) if args.run_binomial_test: coverage_filter_dict_expr.update({ 'phe_binomial': hl.if_else(tb_variants.binomial_stats.p_value < significant_level, 'dependent', 'independent') }) # annotate coverage filters tb_variants = (tb_variants.annotate(coverage_filter=hl.struct( **coverage_filter_dict_expr))) # add useful global annotations to final coverage stats ht # as well as affected/non-affected summary counts per filters global_ann_dict_expr = { 'date': current_date(), 
'mt_path': args.mt_input_path, 'min_sample_prop': min_sample_prop } if args.compute_overall_coverage: global_ann_dict_expr.update({ 'overall_hard_cutoff': tb_variants.aggregate( hl.agg.counter( tb_variants.coverage_filter.overall_hard_cutoff)) }) if args.compute_phe_coverage: global_ann_dict_expr.update({ 'phe_hard_cutoff': tb_variants.aggregate( hl.agg.counter(tb_variants.coverage_filter.phe_hard_cutoff)) }) if args.run_binomial_test: global_ann_dict_expr.update({ 'phe_binomial': tb_variants.aggregate( hl.agg.counter(tb_variants.coverage_filter.phe_binomial)), 'binomial_pvalue_cutoff': significant_level if args.run_binomial_test else hl.float('') }) tb_variants = (tb_variants.annotate_globals(**global_ann_dict_expr)) # check tb_variants.globals.show() tb_variants.describe() # write HT tb_variants = tb_variants.checkpoint(output=args.ht_output_path, overwrite=args.overwrite) # export to file if true if args.write_to_file: (tb_variants.export(f'{args.ht_output_path}.tsv.bgz')) hl.stop()
(freq.freq[1].AF == 0), snp_cutoff=args.snp_cutoff, indel_cutoff=args.indel_cutoff, determine_cutoff_from_bin=False, aggregated_bin_ht=bin_ht, bin_id=bin_ht.bin, inbreeding_coeff_cutoff=INBREEDING_COEFF_HARD_CUTOFF, ) # This column is added by the RF module based on a 0.5 threshold which doesn't correspond to what we use # ht = ht.drop(ht[PREDICTION_COL]) ht.write(f'{tmp_dir}/rf_final.ht', overwrite=True) if __name__ == "__main__": hl.stop() hl.init(default_reference="GRCh38") # s3 credentials required for user to access the datasets in farm flexible compute s3 environment # you may use your own here from your .s3fg file in your home directory n_partitions = 500 parser = argparse.ArgumentParser() parser.add_argument( "--run_hash", help= "Run hash. Created by --train_rf and only needed for --apply_rf without running --train_rf", required=False, )
def main(args): hl.init(default_reference='GRCh38') transcript_field = args.transcript_field # import variant HT ht_variants_path = args.ht_variant ht_variants = hl.read_table(ht_variants_path).select(transcript_field) # import dbNSFP HT ht_dbnsfp_path = args.ht_dbnsfp ht_dbnsfp = hl.read_table(ht_dbnsfp_path) # annotate scores from dbNSFP # prediction scores fields to annotate score_fields = [ f for f in ht_dbnsfp.row if f.endswith('_score') or f == 'CADD_phred' ] ht_variants = (ht_variants.annotate(**ht_dbnsfp.select( *score_fields)[ht_variants.key])) # Match score with specific transcript ht_variants = (ht_variants.annotate( **{ f: ht_variants[f].get(ht_variants[transcript_field]) for f in score_fields })) # Annotate extra info from dbNSFP # Note: Expected extra fields (as struct) from dbNSFP: ['gnomAD', 'ExAC', '1000Gp3', 'ESP6500', 'clinvar'] ann_expr_dict = {} if args.add_clinvar: ann_expr_dict.update( {'clinvar': ht_dbnsfp[ht_variants.key]['clinvar']}) if args.add_gnomad: ann_expr_dict.update({'gnomAD': ht_dbnsfp[ht_variants.key]['gnomAD']}) if args.add_exac: ann_expr_dict.update({'ExAC': ht_dbnsfp[ht_variants.key]['ExAC']}) if args.add_1000Gp3: ann_expr_dict.update( {'1000Gp3': ht_dbnsfp[ht_variants.key]['1000Gp3']}) if args.add_ESP6500: ann_expr_dict.update( {'ESP6500': ht_dbnsfp[ht_variants.key]['ESP6500']}) if len(ann_expr_dict) > 0: ht_variants = (ht_variants.annotate(**ann_expr_dict)) # write annotated table # write HT ht_variants = ht_variants.checkpoint(output=args.ht_output, overwrite=args.overwrite) # export to file if true if args.write_to_file: (ht_variants.export(f'{args.ht_output}.tsv.bgz')) hl.stop()
def main(args): # Init Hail hl.init(default_reference=args.default_reference) if not args.skip_compute_pc_relate: if not args.skip_filter_data: # Read MatrixTable mt = hl.read_matrix_table(args.mt_input_path) # filter variants (bi-allelic, high-callrate, common SNPs) logger.info( f"Filtering to bi-allelic, high-callrate, common SNPs ({args.maf_threshold}) for pc_relate..." ) mt = (mt.filter_rows( (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > args.maf_threshold) & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99) & ~mt.was_split).repartition(500, shuffle=False)) # keep only GT entry field and force to evaluate expression (mt.select_entries(mt.GT).write( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt', overwrite=args.overwrite)) mt = hl.read_matrix_table( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt' ) if not args.skip_prune_ld: # LD pruning # Avoid filtering / missingness entries (genotypes) before run LP pruning # Zulip Hail support issue -> "BlockMatrix trouble when running pc_relate" # mt = mt.unfilter_entries() # Prune variants in linkage disequilibrium. # Return a table with nearly uncorrelated variants logger.info( f'Pruning variants in LD from MT with {mt.count_rows()} variants...' ) pruned_variant_table = hl.ld_prune(mt.GT, r2=args.r2) # Keep LD-pruned variants pruned_mt = (mt.filter_rows(hl.is_defined( pruned_variant_table[mt.row_key]), keep=True)) pruned_mt.write( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt', overwrite=args.overwrite) pruned_mt = hl.read_matrix_table( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt') v, s = pruned_mt.count() logger.info(f'{s} samples, {v} variants found in LD-pruned MT') pruned_mt = pruned_mt.select_entries( GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles())) # run pc_relate method...compute all stats logger.info('Running PCA for PC-Relate...') eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False) scores.write( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht', overwrite=args.overwrite) logger.info(f'Running PC-Relate...') scores = hl.read_table( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht' ) relatedness_ht = hl.pc_relate( call_expr=pruned_mt.GT, min_individual_maf=args.min_individual_maf, scores_expr=scores[pruned_mt.col_key].scores, block_size=4096, min_kinship=args.min_kinship, statistics='all') logger.info(f'Writing relatedness table...') # Write/export table to file relatedness_ht.write( output= f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht', overwrite=args.overwrite) # Write PCs table to file (if specified) # if args.write_to_file: # # Export table to file # relatedness_ht.export(output=f'{args.ht_output_path}.tsv.bgz') # retrieve maximal independent set of related samples logger.info('Getting optimal set of related samples to prune...') relatedness_ht = hl.read_table( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht') relatedness_ht = (relatedness_ht.flatten().rename({ 'i.s': 'i', 'j.s': 'j' }).repartition(100)) # import trios info fam = import_fam_ht() mat_ids = hl.set(fam.mat_id.collect()) fat_ids = hl.set(fam.pat_id.collect()) # rank samples by retention priority (e.g. 
cases over controls) tb_rank = make_sample_rank_table(get_sample_meta_data()) # apply min kinship to consider related pairs relatedness_ht = (relatedness_ht.filter(relatedness_ht.kin > MIN_KINSHIP)) # run maximal_independent_set stratified by groups # Note: This method fails when considering all pairs together (e.g. it removes most of the index in trios, we want # keep them (index) since they are mostly affected individuals rather than parents). # defining pairs group # TODO: check groups with updated fam file relatedness_ht = (relatedness_ht.annotate(pairs_group=hl.case().when( relatedness_ht.kin > 0.40, 'twins_or_dups').when( mat_ids.contains(relatedness_ht.i) | mat_ids.contains(relatedness_ht.j), 'pairs_child_mat').when( fat_ids.contains(relatedness_ht.i) | fat_ids.contains(relatedness_ht.j), 'pairs_child_fat').default('pairs_others'))) groups = (relatedness_ht.aggregate( hl.agg.collect_as_set(relatedness_ht['pairs_group']))) tbs = [] for pair_group in groups: pair_ht = relatedness_ht.filter( relatedness_ht.pairs_group == pair_group) tb = get_related_samples_to_drop(rank_table=tb_rank, relatedness_ht=pair_ht) tbs.append(tb) related_samples_to_remove = hl.Table.union(*tbs) related_samples_to_remove.describe() related_samples_to_remove = related_samples_to_remove.checkpoint( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.ht', overwrite=args.overwrite) if args.write_to_file: (related_samples_to_remove.flatten().export( f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.tsv' )) hl.stop()
def stopTestHailContext():
    hail.stop()
def test_init_hail_context_twice(self):
    hl.init(idempotent=True)  # Should be no error
    hl.stop()

    hl.init(idempotent=True)  # Should be no error
    hl.init(hl.spark_context(), idempotent=True)  # Should be no error
def tearDown(self):
    hl.stop()
    os.remove(self.vcf_file)
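The tearDown above implies a setUp that starts Hail and writes `self.vcf_file`; a plausible sketch follows, where the simulated dataset and the temporary path are assumptions rather than the original fixture.

def setUp(self):
    hl.init(idempotent=True)
    # assumed temporary path; the original fixture may use tempfile instead
    self.vcf_file = 'test_tmp.vcf.bgz'
    # write a small simulated dataset for the test to read back
    mt = hl.balding_nichols_model(n_populations=2, n_samples=10, n_variants=50)
    hl.export_vcf(mt, self.vcf_file)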
def main(args): # Start Hail hl.init(default_reference=args.default_reference) if not args.skip_filter_step: logger.info("Importing data...") # import unfiltered MT mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered') # Read MT from 1kgenome and keep only locus defined in interval mt_1kg = get_1kg_mt(args.default_reference) # Joining dataset (inner join). Keep only 'GT' entry field mt_joint = (mt.select_entries('GT').union_cols( mt_1kg.select_entries('GT'), row_join_type='inner')) logger.info( "Filtering joint MT to bi-allelic, high-callrate, common SNPs...") mt_joint = (mt_joint.filter_rows( bi_allelic_expr(mt_joint) & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1]) & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001) & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99)). naive_coalesce(1000)) logger.info( "Checkpoint: writing joint filtered MT before LD pruning...") mt_joint = mt_joint.checkpoint(get_mt_checkpoint_path( dataset=args.exome_cohort, part='joint_1kg_high_callrate_common_snp_biallelic'), overwrite=True) logger.info( f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..." ) # remove correlated variants pruned_variant_table = hl.ld_prune(mt_joint.GT, r2=args.ld_prune_r2, bp_window_size=500000, memory_per_core=512) mt_joint = (mt_joint.filter_rows( hl.is_defined(pruned_variant_table[mt_joint.row_key]))) logger.info("Writing filtered joint MT with variants in LD pruned...") (mt_joint.write(get_qc_mt_path( dataset=args.exome_cohort + '_1kg', part='joint_high_callrate_common_snp_biallelic', split=True, ld_pruned=True), overwrite=args.overwrite)) logger.info("Importing filtered joint MT...") mt_joint = hl.read_matrix_table( get_qc_mt_path(dataset=args.exome_cohort + '_1kg', part='joint_high_callrate_common_snp_biallelic', split=True, ld_pruned=True)) logger.info(f"Running PCA with {mt_joint.count_rows()} variants...") # run pca on merged dataset eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT, k=args.n_pcs) logger.info(f"Eigenvalues: {eigenvalues}") # TODO: save eigenvalues? # Annotate PC array as independent fields. pca_table = (pc_scores.annotate( ** {'PC' + str(k + 1): pc_scores.scores[k] for k in range(0, args.n_pcs)}).drop('scores')) logger.info(f"Writing HT with PCA results...") # write as HT output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort, part='joint_pca_1kg') pca_table.write(output=output_ht_path) if args.write_to_file: (pca_table.export(f'{output_ht_path}.tsv.bgz')) # Stop Hail hl.stop() print("Done!")
def main(args): # Start Hail hl.init(default_reference=args.default_ref_genome) # Import adj genotype MT and remove mt = hl.read_matrix_table( get_qc_mt_path(dataset=args.exome_cohort, part='sample_qc_adj_genotypes', split=True)) # keep samples passing QC filtering mt = (mt.filter_cols(mt.pass_filters).select_cols().select_rows()) # import variant info fields (vcf info) variant_info_ht = (get_vep_annotation_ht().drop('vep')) # Add useful annotation for variant hard filter ht = ( mt.annotate_rows( inbreeding_coeff=variant_info_ht[mt.row_key].info.InbreedingCoeff, vqsr_filter=variant_info_ht[mt.row_key].filters, VQSLOD=variant_info_ht[mt.row_key].info.VQSLOD, gt_counts=hl.agg.count_where(hl.is_defined( mt.GT)) # expected MT filtered to high-quality GT ).rows()) # 1. Apply variant hard filters # hard filter expression variant_hard_filter_expr = { 'fail_inbreeding_coeff': ht.inbreeding_coeff < INBREEDING_COEFFICIENT_CUTOFF, 'AC0': ht.gt_counts == 0 } ht = (ht.annotate(**variant_hard_filter_expr)) # 2. Apply VQSR filter ht = (ht.annotate(fail_vqsr=hl.len(ht.vqsr_filter) != 0)) # 3. Apply RF filter # import/parse rf final HT ht_rf = hl.read_table(get_variant_qc_ht_path(part='rf_result')) ht_rf = (ht_rf.select(rf_probability_tp=ht_rf.rf_probability['TP'], variant_type=ht_rf.variant_type)) ht = (ht.annotate(**ht_rf[ht.key])) ht = (ht.annotate(fail_rf=hl.case().when( (ht.rf_probability_tp < RF_PROBABILITY_SNV_CUTOFF) & (ht.variant_type == 'snv'), True).when( (ht.rf_probability_tp < RF_PROBABILITY_INDEL_CUTOFF) & (ht.variant_type == 'indel'), True).default(False))) # 5. Apply coverage/capture interval filters ## gnomad genome coverage gnomad_coverage_ht = get_gnomad_genomes_coverage_ht().key_by() gnomad_coverage_ht = (gnomad_coverage_ht.annotate(locus=hl.parse_locus( gnomad_coverage_ht.locus, reference_genome='GRCh38')).key_by('locus')) ht = (ht.annotate(gnomad_cov_10X=gnomad_coverage_ht[ht.locus].over_10)) ht = (ht.annotate(is_coveraged_gnomad_genomes=ht.gnomad_cov_10X >= 0.9)) ## defined in capture intervals # filter to capture intervals (intersect) ht_defined_intervals = filter_capture_intervals(ht) ht = (ht.annotate(is_defined_capture_intervals=hl.is_defined( ht_defined_intervals[ht.key]))) # 6. Summary final variant QC # final variant qc filter joint expression final_variant_qc_ann_expr = { 'pass_variant_qc_filters': hl.cond( ~ht.fail_inbreeding_coeff & ~ht.AC0 & ~ht.fail_vqsr & ~ht.fail_rf & ht.is_coveraged_gnomad_genomes & ht.is_defined_capture_intervals, True, False) } ht = (ht.annotate(**final_variant_qc_ann_expr)) # Counts the number of variants (snv and indels) affected by every filter and add as global field filter_flags = [ 'fail_inbreeding_coeff', 'AC0', 'fail_vqsr', 'fail_rf', 'is_coveraged_gnomad_genomes', 'is_defined_capture_intervals', 'pass_variant_qc_filters' ] summary_filter_expr = { v: hl.struct( **{ f: hl.agg.filter(ht.variant_type == v, hl.agg.counter(ht[f])) for f in filter_flags }) for v in ['snv', 'indel'] } ht = ht.annotate_globals( summary_filter=ht.aggregate(summary_filter_expr, _localize=False)) # write HT variant QC final table output_path = get_variant_qc_ht_path(dataset=args.exome_cohort, part='final_qc') ht = ht.checkpoint(output_path, overwrite=args.overwrite) # print filter summary logger.info(f'Variant QC filter summary: {ht.summary_filter.collect()}') # export HT to file if args.write_to_file: ht.export(f'{output_path}.tsv.bgz') # Stop Hail hl.stop() print("Finished!")
def main(args): # Start Hail hl.init(default_reference=args.default_reference) if not args.skip_filter_step: logger.info("Importing data...") # import unfiltered MT mt = hl.read_matrix_table( get_qc_mt_path(dataset=args.exome_cohort, part='unphase_adj_genotypes', split=True)) # filter to samples passing QC filters logger.info( "Filtering MT to samples passing QC filters (hard filters, relatedness, european ancestries)..." ) sample_qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc')) sample_qc_ht = (sample_qc_ht.filter(sample_qc_ht.pass_filters)) mt = (mt.filter_cols(hl.is_defined(sample_qc_ht[mt.col_key]))) logger.info( "Filtering joint MT to bi-allelic, high-callrate, common SNPs...") maf = args.maf_threshold mt = (mt.filter_rows( bi_allelic_expr(mt) & hl.is_snp(mt.alleles[0], mt.alleles[1]) & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf) & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)).naive_coalesce( 500)) logger.info("Checkpoint: writing filtered MT before LD pruning...") mt = mt.checkpoint(get_mt_checkpoint_path( dataset=args.exome_cohort, part='high_callrate_common_snp_biallelic'), overwrite=args.overwrite) logger.info( f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants..." ) # remove correlated variants pruned_variant_table = hl.ld_prune(mt.GT, r2=args.ld_prune_r2, bp_window_size=500000, memory_per_core=512) mt = (mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key]))) logger.info("Writing filtered MT with ld-pruned variants...") (mt.write(get_qc_mt_path(dataset=args.exome_cohort, part='high_callrate_common_snp_biallelic', split=True, ld_pruned=True), overwrite=args.overwrite)) logger.info("Importing filtered ld-pruned MT...") mt = hl.read_matrix_table( get_qc_mt_path(dataset=args.exome_cohort, part='high_callrate_common_snp_biallelic', split=True, ld_pruned=True)) logger.info(f"Running PCA on {mt.count_rows()} variants...") # run pca on merged dataset eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs) logger.info(f"Eigenvalues: {eigenvalues}") # Annotate eigenvalues as global field pc_scores = (pc_scores.annotate_globals(**{'eigenvalues': eigenvalues})) # Annotate PC array as independent fields. pca_table = (pc_scores.annotate( ** {'PC' + str(k + 1): pc_scores.scores[k] for k in range(0, args.n_pcs)}).drop('scores')) logger.info(f"Writing HT with PCA results...") # write as HT output_ht_path = args.output_ht pca_table = (pca_table.checkpoint(output=output_ht_path, overwrite=args.overwrite)) if args.write_to_file: (pca_table.export(f'{output_ht_path}.tsv.bgz')) # Stop Hail hl.stop() print("PCA pipeline finalised...")
def main(args): hl.init(default_reference=args.default_ref_genome) if args.run_test_mode: logger.info('Running pipeline on test data...') mt = (get_mt_data(part='raw_chr20').sample_rows(0.1)) else: logger.info( 'Running pipeline on MatrixTable wih adjusted genotypes...') ds = args.exome_cohort mt = hl.read_matrix_table( get_qc_mt_path(dataset=ds, part='unphase_adj_genotypes', split=True)) # 1. Sample-QC filtering if not args.skip_sample_qc_filtering: logger.info('Applying per sample QC filtering...') mt = apply_sample_qc_filtering(mt) logger.info( 'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...' ) mt = (mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt', overwrite=True)) # 2. Variant-QC filtering if not args.skip_variant_qc_filtering: logger.info('Applying per variant QC filtering...') if hl.hadoop_is_file( f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'): logger.info('Reading pre-existing sample qc-filtered MT...') mt = hl.read_matrix_table( f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt') mt = apply_variant_qc_filtering(mt) # write hard filtered MT to disk logger.info( 'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...' ) mt = (mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt', overwrite=True)) # 3. Annotate AFs # allelic frequency cut-off maf_cutoff = args.af_max_threshold if not args.skip_af_filtering: if hl.hadoop_is_file( f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'): logger.info( 'Reading pre-existing sample/variant qc-filtered MT...') mt = hl.read_matrix_table( f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt') # Annotate allelic frequencies from external source, # and compute internal AF on samples passing QC af_ht = get_af_annotation_ht() mt = (mt.annotate_rows(**af_ht[mt.row_key])) filter_expressions = [ af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff), af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff), af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff), af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff), af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff), af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff) ] mt = (mt.filter_rows(functools.reduce(operator.iand, filter_expressions), keep=True)) logger.info( 'Writing qc-filtered MT filtered to external maf with to disk...') mt = (mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt', overwrite=True)) # 4. ##### Burden Test ###### logger.info('Running burden test...') if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'): logger.info( 'Reading pre-existing sample/variant qc-filtered MT with rare variants...' 
) mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt') ## Add VEP-annotated fields vep_ht = get_vep_annotation_ht() mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF, Consequence=vep_ht[mt.row_key].vep.Consequence, DOMAINS=vep_ht[mt.row_key].vep.DOMAINS, SYMBOL=vep_ht[mt.row_key].vep.SYMBOL)) ## Filter to bi-allelic variants if args.filter_biallelic: logger.info('Running burden test on biallelic variants...') mt = mt.filter_rows(bi_allelic_expr(mt)) ## Filter to variants within protein domain(s) if args.filter_protein_domain: logger.info( 'Running burden test on variants within protein domain(s)...') mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS), keep=True) ## Add cases/controls sample annotations tb_sample = get_sample_meta_data() mt = (mt.annotate_cols(**tb_sample[mt.s])) mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control'])) ## Annotate pathogenic scores ht_scores = get_vep_scores_ht() mt = mt.annotate_rows(**ht_scores[mt.row_key]) ## Classify variant into (major) consequence groups score_expr_ann = { 'hcLOF': mt.LoF == 'HC', 'syn': mt.Consequence == 'synonymous_variant', 'miss': mt.Consequence == 'missense_variant' } # Update dict expr annotations with combinations of variant consequences categories score_expr_ann.update({ 'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD), (mt['vep.REVEL_score'] >= REVEL_THRESHOLD), (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2) & score_expr_ann.get('miss') }) score_expr_ann.update({ 'hcLOF_missC': score_expr_ann.get('hcLOF') | score_expr_ann.get('missC') }) mt = (mt.annotate_rows(csq_group=score_expr_ann)) # Transmute csq_group and convert dict to set where the group is defined # (easier to explode and grouping later) mt = (mt.transmute_rows(csq_group=hl.set( hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys())))) mt = (mt.filter_rows(hl.len(mt.csq_group) > 0)) # Explode nested csq_group before grouping mt = (mt.explode_rows(mt.csq_group)) # print('Number of samples/variants: ') # print(mt.count()) # Group mt by gene/csq_group. mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate( hets=hl.agg.any(mt.GT.is_het()), homs=hl.agg.any(mt.GT.is_hom_var()), chets=hl.agg.count_where(mt.GT.is_het()) >= 2, homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2) | (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist()) mts = [] if args.homs: # select homs genotypes. mt_homs = (mt_grouped.select_entries( mac=mt_grouped.homs).annotate_rows(agg_genotype='homs')) mts.append(mt_homs) if args.chets: # select compound hets (chets) genotypes. mt_chets = (mt_grouped.select_entries( mac=mt_grouped.chets).annotate_rows(agg_genotype='chets')) mts.append(mt_chets) if args.homs_chets: # select chets and/or homs genotypes. 
mt_homs_chets = (mt_grouped.select_entries( mac=mt_grouped.homs_chets).annotate_rows( agg_genotype='homs_chets')) mts.append(mt_homs_chets) if args.hets: # select hets genotypes mt_hets = (mt_grouped.select_entries( mac=mt_grouped.hets).annotate_rows(agg_genotype='hets')) mts.append(mt_hets) ## Joint MatrixTables mt_grouped = hl.MatrixTable.union_rows(*mts) # Generate table of counts tb_gene = (mt_grouped.annotate_rows( n_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.sum(mt_grouped.mac)), n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'], hl.agg.sum(mt_grouped.mac)), n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'], hl.agg.sum(mt_grouped.mac)), n_controls=hl.agg.filter(mt_grouped['phe.is_control'], hl.agg.sum(mt_grouped.mac)), n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.count()), n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'], hl.agg.count()), n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'], hl.agg.count()), n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'], hl.agg.count())).rows()) # run fet stratified by proband type analysis = ['all_cases', 'syndromic', 'nonsyndromic'] tbs = [] for proband in analysis: logger.info(f'Running test for {proband}...') colCases = None colTotalCases = None colControls = 'n_controls' colTotalControls = 'n_total_controls' if proband == 'all_cases': colCases = 'n_cases' colTotalCases = 'n_total_cases' if proband == 'syndromic': colCases = 'n_syndromic' colTotalCases = 'n_total_syndromic' if proband == 'nonsyndromic': colCases = 'n_nonsyndromic' colTotalCases = 'n_total_nonsyndromic' tb_fet = compute_fisher_exact(tb=tb_gene, n_cases_col=colCases, n_control_col=colControls, total_cases_col=colTotalCases, total_controls_col=colTotalControls, correct_total_counts=True, root_col_name='fet', extra_fields={ 'analysis': proband, 'maf': maf_cutoff }) # filter out zero-count genes tb_fet = (tb_fet.filter( hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True)) tbs.append(tb_fet) tb_final = hl.Table.union(*tbs) tb_final.describe() # export results date = current_date() run_hash = str(uuid.uuid4())[:6] output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht' tb_final = (tb_final.checkpoint(output=output_path)) if args.write_to_file: # write table to disk as TSV file (tb_final.export(f'{output_path}.tsv')) hl.stop()
def main(args): # Initializing Hail on cluster mode hl.init() # 1- Aggregate MatrixTable per gene/consequences creating gene/csq X sample matrix # Read MatrixTable mt = hl.read_matrix_table(args.mt_input_path) # Annotate csq group info per variants # Define consequences variant rules with hail expressions # TODO: check if fields exist in dataset csq_group_rules = {} if args.ptv: csq_group_rules.update({'PTV': mt.csq_type == 'PTV'}) if args.pav: csq_group_rules.update({'PAV': mt.csq_type == 'PAV'}) if args.syn: csq_group_rules.update({'SYN': mt.csq_type == 'SYN'}) if args.cadd: csq_group_rules.update({ 'CADD': (mt.csq_type == 'PAV') & (mt.cadd_phred >= args.cadd_threshold) }) # Annotate groups per variants mt = (mt.annotate_rows(csq_group=csq_group_rules)) # Transmute csq_group and convert to set (easier to explode and grouping later) mt = (mt.transmute_rows(csq_group=hl.set( hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys())))) # Explode nested csq_group before grouping mt = (mt.explode_rows(mt.csq_group)) # Group mt by gene/csq_group. mt_grouped = (mt.group_rows_by( mt.csq_group, mt.symbol).partition_hint(100).aggregate( n_het=hl.agg.count_where(mt.GT.is_het()))) # force to eval all aggregation operation by writing mt to disk # mt_grouped = mt_grouped.persist(storage_level='DISK_ONLY') if args.logistic_regression: # covariates list covs = list(args.covs_list) # Define x expression (entries/genotype) x_expr = 'n_het' extra_annotations = {'analysis': 'all_cases', 'covariates': covs} tb_stats = logistic_regression(mt=mt_grouped, x_expr=x_expr, response=args.phenotype_field, covs=covs, pass_through=[], extra_fields=extra_annotations) # export table tb_stats.export(args.output_path) if args.fet: None # TODO: implement gene-based Fisher Exact burden test hl.stop()
def stop():
    global _initialized
    _initialized = False
    hl.stop()
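A module-level flag like `_initialized` is usually paired with a guarded init wrapper; a minimal sketch of what that counterpart could look like (hypothetical, not taken from the same module):

def init(**kwargs):
    # Start Hail only once per process and record the state in the same
    # module-level flag that stop() above resets.
    global _initialized
    if not _initialized:
        hl.init(**kwargs)
        _initialized = True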
def main(args): # Init Hail with hg38 genome build as default hl.init(default_reference=args.default_ref_genome) # Import VEPed VCF file as MatrixTable and get VCF file meta-data vcf_path = args.vcf_vep_path mt = hl.import_vcf(path=vcf_path, force_bgz=args.force_bgz) # getting annotated VEP fields names from VCF-header vep_fields = get_vep_fields(vcf_path=vcf_path, vep_csq_field=args.csq_field) if args.exclude_multi_allelic: # TODO: This option should skip the split_multi step... # Filter out multi-allelic variants. Keep only bi-allelic mt = filter_biallelic(mt) # split multi-allelic variants mt = hl.split_multi_hts(mt) # flatten nested structure (e.g. 'info') and get a HailTable with all rows fields tb_csq = (mt.rows().flatten().key_by('locus', 'alleles')) # rename info[CSQ] field to 'csq_array'. # Simpler field name are easier to parse later... tb_csq = (tb_csq.rename({'info.' + args.csq_field: 'csq_array'})) # Convert/annotate all transcripts per variants with a structure of type array<dict<str, str>>. # The transcript(s) are represented as a dict<k,v>, the keys are the field names extracted from the VCF header, the # values are the current annotated values in the CSQ field. tb_csq = (tb_csq.annotate(csq_array=tb_csq.csq_array.map( lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]')))))) # Keep transcript(s) matching with the allele index. # It requires having the flag "ALLELE_NUM" annotated by VEP # Apply only were the alleles were split. # TODO: Handle exception when the flag "ALLELE_NUM" is not present tb_csq = (tb_csq.annotate(csq_array=hl.cond( tb_csq.was_split, tb_csq.csq_array.filter(lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq. a_index)), tb_csq.csq_array))) # select and annotate one transcript per variant based on pre-defined rules tb_csq = pick_transcript(ht=tb_csq, csq_array='csq_array') # Expand selected transcript (dict) annotations adding independent fields. tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx') # Parse the "Consequence" field. Keep only the more severe consequence. # Avoid the notation "consequence_1&consequence_2" tb_csq = (tb_csq.transmute(Consequence=tb_csq.Consequence.split('&')[0])) # print fields overview tb_csq.describe() # drop unnecessary fields tb_csq = (tb_csq.drop('csq_array', 'tx')) # write table as HailTable to disk (tb_csq.write(output=args.tb_output_path)) if args.write_to_file: # write table to disk as a BGZ-compressed TSV file (tb_csq.export(args.tb_output_path + '.tsv.bgz')) # Stop Hail hl.stop()
def handler(signum, frame):
    global _timeout_state
    _timeout_state = True
    hl.stop()
    hl.init(**_init_args)
    raise BenchmarkTimeoutError()
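The handler above only takes effect once it is installed for SIGALRM; below is a minimal sketch of how a benchmark call might be wrapped with a deadline. `run_with_timeout` is a hypothetical helper name, while `_init_args`, `_timeout_state`, and `BenchmarkTimeoutError` are assumed to be defined alongside the handler.

import signal

def run_with_timeout(fn, timeout_seconds, *args, **kwargs):
    # Deliver SIGALRM after the deadline; the handler stops and restarts
    # Hail and raises BenchmarkTimeoutError inside the running benchmark.
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(timeout_seconds)
    try:
        return fn(*args, **kwargs)
    finally:
        # Always cancel the pending alarm so it cannot fire later.
        signal.alarm(0)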
def main(args):
    # Initializing Hail on cluster mode
    init_hail_on_cluster(tmp_dir=HAIL_TMP_DIR,
                         log_file=HAIL_LOG_PATH,
                         local_mode=True)

    # 1- Aggregate MatrixTable per gene/consequences creating gene/csq X sample matrix

    # Read MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # Annotate csq group info per variants
    # Define consequences variant rules with hail expressions
    # TODO: check if field exist in dataset
    csq_group_rules = {}
    if args.ptv:
        csq_group_rules.update({'PTV': mt.csq_type == 'PTV'})
    if args.pav:
        csq_group_rules.update({'PAV': mt.csq_type == 'PAV'})
    if args.syn:
        csq_group_rules.update({'SYN': mt.csq_type == 'SYN'})
    if args.cadd:
        csq_group_rules.update({
            'CADD': (mt.csq_type == 'PAV') & (mt.cadd_phred >= args.cadd_threshold)
        })
    if args.mpc:
        csq_group_rules.update(
            {'MPC': (mt.csq_type == 'PAV') & (mt.mpc >= args.mpc_threshold)})

    # Annotate groups per variants
    mt = (mt.annotate_rows(csq_group=csq_group_rules))

    # Transmute csq_group and convert to set (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(
        mt.csq_group, mt.symbol).partition_hint(100).aggregate(
            n_het=hl.agg.count_where(mt.GT.is_het())))

    # 2- Annotate gene set information

    # Import/parsing gene cluster table
    clusters = hl.import_table(args.gene_set_path, no_header=True)

    # parsing gene set column
    clusters = (clusters.transmute(genes=hl.set(clusters['f1'].split(
        delim='[|]'))))
    clusters = (clusters.explode(clusters.genes))
    clusters = (clusters.group_by('genes').partition_hint(100).aggregate(
        cluster_name=hl.agg.collect_as_set(clusters['f0'])).key_by('genes'))

    # annotate gene set info
    mt_grouped = (mt_grouped.annotate_rows(
        cluster_name=clusters[mt_grouped.symbol].cluster_name))

    # 3- Aggregate per gene set and consequences

    # Group mt by gene set/csq_group.
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_name))
    mt_grouped = (mt_grouped.group_rows_by(
        mt_grouped.cluster_name,
        mt_grouped.csq_group).partition_hint(100).aggregate(
            n_het=hl.agg.sum(mt_grouped.n_het)))

    # force to eval all aggregation operation by writing mt to disk
    mt_grouped = mt_grouped.persist(storage_level='DISK_ONLY')

    if args.logistic_regression:
        # covariates list
        covs = list(args.covs_list)

        # Define x expression (entries/genotype)
        x_expr = 'n_het'

        extra_annotations = {'analysis': 'all_cases', 'covariates': covs}

        tb_stats = logistic_regression(mt=mt_grouped,
                                       x_expr=x_expr,
                                       response=args.phenotype_field,
                                       covs=covs,
                                       pass_through=[],
                                       extra_fields=extra_annotations)

        # export table
        tb_stats.export(args.output_path)

    if args.fet:
        None
        # TODO: implement Fisher Exact-based burden gene set test

    hl.stop()
def main(args): # Init Hail hl.init(default_reference=args.default_ref_genome) # Import VEPed VCF file as MatrixTable and get VCF file meta-data # vcf_path = args.vcf_vep_path mt = hl.import_vcf(path=get_vep_vqsr_vcf_path(), force_bgz=args.force_bgz) # getting annotated VEP fields names from VCF-header vep_fields = get_vep_fields(vcf_path=get_vep_vqsr_vcf_path(), vep_csq_field=args.csq_field) if args.split_multi_allelic: # split multi-allelic variants mt = hl.split_multi_hts(mt) # split/annotate fields in the info field (use allele index ) mt = mt.annotate_rows(info=mt.info.annotate( **{field: mt.info[field][mt.a_index - 1] for field in INFO_FIELDS})) # parse/annotate the CSQ field in a different structure tb_csq = mt.rows() tb_csq = (tb_csq.annotate(csq_raw=tb_csq.info[args.csq_field])) # Convert/annotate all transcripts per variants with a structure of type array<dict<str, str>>. # The transcript(s) are represented as a dict<k,v>, where keys are the field names extracted from the VCF header and # the values are the current annotated values in the CSQ field. tb_csq = (tb_csq.annotate(csq_raw=tb_csq.csq_raw.map( lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]')))))) # Keep transcript(s) matching with the allele index (only used if variant were split with split_multi_hts) # It requires having the flag "ALLELE_NUM" annotated by VEP # Apply only were the alleles were split. # TODO: Handle exception when the flag "ALLELE_NUM" is not present if all( [x in list(tb_csq._fields.keys()) for x in ['was_split', 'a_index']]): tb_csq = (tb_csq.annotate(csq_raw=hl.cond( tb_csq.was_split, tb_csq.csq_raw.filter(lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq. a_index)), tb_csq.csq_raw))) # select and annotate one transcript per variant based on pre-defined rules tb_csq = pick_transcript( ht=tb_csq, csq_array='csq_raw', ) # Expand selected transcript (dict) annotations adding independent fields. tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx', output_filed='vep') # Parse the "Consequence" field. Keep only the more severe consequence. # Avoid the notation "consequence_1&consequence_2" tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate( Consequence=tb_csq.vep.Consequence.split('&')[0]))) # Parse the protein DOMAIN field if 'DOMAINS' in vep_fields: tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate( DOMAINS=vep_protein_domain_ann_expr(tb_csq.vep['DOMAINS'])))) # drop redundant/temp fields tb_csq = (tb_csq.drop('csq_raw', 'tx').repartition(500)) # print fields overview tb_csq.describe() # write table as HailTable to disk # (tb_csq # .write(output=args.tb_output_path, # overwrite=args.overwrite) # ) output_path = get_variant_qc_ht_path(part='vep_vqsr', split=args.split_multi_allelic) tb_csq = (tb_csq.checkpoint(output=output_path, overwrite=args.overwrite)) if args.write_to_file: # write table to disk as a BGZ-compressed TSV file (tb_csq.export(f'{output_path}.tsv.bgz')) # Stop Hail hl.stop()
def download_data(data_dir): global _data_dir, _mt _data_dir = data_dir or os.environ.get( 'HAIL_BENCHMARK_DIR') or '/tmp/hail_benchmark_data' logging.info(f'using benchmark data directory {_data_dir}') os.makedirs(_data_dir, exist_ok=True) files = map(lambda f: os.path.join(_data_dir, f), [ 'profile.vcf.bgz', 'profile.mt', 'table_10M_par_1000.ht', 'table_10M_par_100.ht', 'table_10M_par_10.ht', 'gnomad_dp_simulation.mt', 'many_strings_table.ht', 'many_ints_table.ht', 'sim_ukb.bgen' ]) if not all(os.path.exists(file) for file in files): hl.init() # use all cores vcf = os.path.join(_data_dir, 'profile.vcf.bgz') logging.info('downloading profile.vcf.bgz...') urlretrieve( 'https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz', vcf) logging.info('done downloading profile.vcf.bgz.') logging.info('importing profile.vcf.bgz...') hl.import_vcf(vcf, min_partitions=16).write(os.path.join( _data_dir, 'profile.mt'), overwrite=True) logging.info('done importing profile.vcf.bgz.') logging.info('writing 10M row partitioned tables...') ht = hl.utils.range_table( 10_000_000, 1000).annotate(**{f'f_{i}': hl.rand_unif(0, 1) for i in range(5)}) ht = ht.checkpoint(os.path.join(_data_dir, 'table_10M_par_1000.ht'), overwrite=True) ht = ht.naive_coalesce(100).checkpoint(os.path.join( _data_dir, 'table_10M_par_100.ht'), overwrite=True) ht.naive_coalesce(10).write(os.path.join(_data_dir, 'table_10M_par_10.ht'), overwrite=True) logging.info('done writing 10M row partitioned tables.') logging.info('creating gnomad_dp_simulation matrix table...') mt = hl.utils.range_matrix_table(n_rows=250_000, n_cols=1_000, n_partitions=32) mt = mt.annotate_entries(x=hl.int(hl.rand_unif(0, 4.5)**3)) mt.write(os.path.join(_data_dir, 'gnomad_dp_simulation.mt'), overwrite=True) logging.info('done creating gnomad_dp_simulation matrix table.') logging.info('downloading many_strings_table.tsv.bgz...') mst_tsv = os.path.join(_data_dir, 'many_strings_table.tsv.bgz') mst_ht = os.path.join(_data_dir, 'many_strings_table.ht') urlretrieve( 'https://storage.googleapis.com/hail-common/benchmark/many_strings_table.tsv.bgz', mst_tsv) logging.info('done downloading many_strings_table.tsv.bgz.') logging.info('importing many_strings_table.tsv.bgz...') hl.import_table(mst_tsv).write(mst_ht, overwrite=True) logging.info('done importing many_strings_table.tsv.bgz.') logging.info('downloading many_ints_table.tsv.bgz...') mit_tsv = os.path.join(_data_dir, 'many_ints_table.tsv.bgz') mit_ht = os.path.join(_data_dir, 'many_ints_table.ht') urlretrieve( 'https://storage.googleapis.com/hail-common/benchmark/many_ints_table.tsv.bgz', mit_tsv) logging.info('done downloading many_ints_table.tsv.bgz.') logging.info('importing many_ints_table.tsv.bgz...') hl.import_table(mit_tsv, types={ 'idx': 'int', **{f'i{i}': 'int' for i in range(5)}, **{f'array{i}': 'array<int>' for i in range(2)} }).write(mit_ht, overwrite=True) logging.info('done importing many_ints_table.tsv.bgz.') bgen = 'sim_ukb.bgen' sample = 'sim_ukb.sample' logging.info(f'downloading {bgen}...') local_bgen = os.path.join(_data_dir, bgen) local_sample = os.path.join(_data_dir, sample) urlretrieve( f'https://storage.googleapis.com/hail-common/benchmark/{bgen}', local_bgen) urlretrieve( f'https://storage.googleapis.com/hail-common/benchmark/{sample}', local_sample) logging.info(f'done downloading {bgen}...') logging.info(f'indexing {bgen}...') hl.index_bgen(local_bgen) logging.info(f'done indexing {bgen}.') hl.stop() else: logging.info('all files found.')