def get_r_human_readable(pop: str, var1: str, var2: str, ref_genome: str = "GRCh37"):
    """Look up r for two variants written as ``chrom-pos-ref-alt`` strings.

    :param pop: Population passed to ``ld_matrix`` and ``ld_index``.
    :param var1: First variant, formatted ``chrom-pos-ref-alt``.
    :param var2: Second variant, formatted ``chrom-pos-ref-alt``.
    :param ref_genome: Reference genome used when parsing each locus.
    :return: Whatever ``get_r_for_pair_of_variants`` returns for the pair.
    """

    def _as_locus_and_alleles(variant_id):
        # Exactly four dash-separated parts are required; anything else
        # raises ValueError from the tuple unpacking.
        contig, position, ref_allele, alt_allele = variant_id.split("-")
        parsed_locus = hl.parse_locus(f"{contig}:{position}", ref_genome)
        return (parsed_locus, [ref_allele, alt_allele])

    # Load the LD block matrix and its variant index before touching the
    # variant strings, as the original ordering did.
    bm = ld_matrix(pop).bm()
    ht = ld_index(pop).ht()

    first_variant = _as_locus_and_alleles(var1)
    second_variant = _as_locus_and_alleles(var2)
    return get_r_for_pair_of_variants(bm, ht, first_variant, second_variant)
def run_gwas(mt, phen: str, sim_name: str, subset_idx: int, param_suffix: str, wd: str, is_logreg=True):
    """Run (or reload) a single-phenotype GWAS on one sample subset.

    The matrix table is restricted to columns whose ``subset_idx`` equals
    ``subset_idx`` and whose ``phen`` field is defined, and an ``EAF`` row
    field is computed as half the mean genotype. If the results file does
    not exist yet, a regression is run and ``EAF``, ``beta``,
    ``standard_error`` and ``p_value`` are exported to it; otherwise the
    existing file is imported and re-keyed by ``locus`` and ``alleles``.

    :param mt: MatrixTable with a ``dosage`` or ``GT`` entry field and a
        ``subset_idx`` column field.
    :param phen: Name of the column field holding the phenotype.
    :param sim_name: Simulation name, used in the output file name.
    :param subset_idx: Index of the sample subset to analyse.
    :param param_suffix: Suffix used in the output file name.
    :param wd: Directory in which the results file lives.
    :param is_logreg: Use Wald logistic regression when true, linear
        regression otherwise.
    :return: The regression results table (fresh run) or the table imported
        from the existing results file.
    :raises ValueError: If ``mt`` has neither a ``dosage`` nor a ``GT`` entry
        field.
    """
    has_dosage = 'dosage' in mt.entry
    has_gt = 'GT' in mt.entry
    # The previous guard compared a set with ``{}`` (an empty dict), which is
    # always unequal, so it could never fire. Check membership explicitly.
    if not (has_dosage or has_gt):
        raise ValueError(
            "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data")

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    # Effect-allele frequency: dosage is preferred over hard calls.
    if has_dosage:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    else:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    model_tag = "logreg" if is_logreg else "linreg"
    gwas_path = f'{wd}/gwas.{model_tag}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        gt_field = mt.dosage if has_dosage else mt.GT.n_alt_alleles()
        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error', 'p_value').export(gwas_path)
    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # The exported alleles column looks like ["A","T"]; strip the
        # brackets/quotes (regex patterns) and split on the "," separator.
        gwas_ht = gwas_ht.annotate(
            locus=hl.parse_locus(gwas_ht.locus),
            alleles=gwas_ht.alleles.replace(r'\["', '').replace(r'"\]', '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')
    return gwas_ht
def load_variant_data(directory: str, pheno_key_dict, ukb_vep_path: str, extension: str = 'single.txt',
                      n_cases: int = -1, n_controls: int = -1, heritability: float = -1.0,
                      saige_version: str = 'NA', inv_normalized: str = 'NA', overwrite: bool = False,
                      legacy_annotations: bool = False, num_partitions: int = 1000):
    """Import per-variant result files and checkpoint them as a Hail Table.

    Every space-delimited file matching ``{directory}/*.{extension}`` is
    imported, keyed by locus, alleles and the entries of ``pheno_key_dict``,
    annotated with run metadata globals and VEP-derived fields, and
    checkpointed to ``{directory}/variant_results.ht``.

    :param directory: Directory holding the result files and the output table.
    :param pheno_key_dict: Extra key fields, splatted into ``key_by``.
    :param ukb_vep_path: Path handed to ``get_vep_formatted_data``.
    :param extension: File extension to load; ``'single.txt'`` selects the
        ``markerID`` column, anything else selects ``SNPID``.
    :param n_cases: Case count global; ``-1`` is stored as missing.
    :param n_controls: Control count global; ``-1`` is stored as missing.
    :param heritability: Heritability global; ``-1.0`` is stored as missing.
    :param saige_version: Version global; ``'NA'`` is stored as missing.
    :param inv_normalized: Global; ``'NA'`` is stored as missing.
    :param overwrite: Overwrite the checkpoint; when false an existing
        checkpoint is read instead.
    :param legacy_annotations: Forwarded to ``get_vep_formatted_data``.
    :param num_partitions: Target partition count for ``naive_coalesce``.
    """
    output_ht_path = f'{directory}/variant_results.ht'
    ht = hl.import_table(f'{directory}/*.{extension}', delimiter=' ', impute=True)
    print(f'Loading: {directory}/*.{extension} ...')

    marker_id_col = 'markerID' if extension == 'single.txt' else 'SNPID'
    # The marker ID is split on '_' into a locus string and a '/'-separated
    # allele string.
    locus_alleles = ht[marker_id_col].split('_')

    # Sentinel arguments become typed missing values in the globals.
    n_cases = hl.null(hl.tint) if n_cases == -1 else n_cases
    n_controls = hl.null(hl.tint) if n_controls == -1 else n_controls
    heritability = hl.null(hl.tfloat) if heritability == -1.0 else heritability
    saige_version = hl.null(hl.tstr) if saige_version == 'NA' else saige_version
    inv_normalized = hl.null(hl.tstr) if inv_normalized == 'NA' else inv_normalized

    ht = ht.key_by(locus=hl.parse_locus(locus_alleles[0]),
                   alleles=locus_alleles[1].split('/'),
                   **pheno_key_dict)
    ht = ht.distinct()
    ht = ht.naive_coalesce(num_partitions)

    if marker_id_col == 'SNPID':
        ht = ht.drop('CHR', 'POS', 'rsid', 'Allele1', 'Allele2')

    ht = ht.transmute(Pvalue=ht['p.value'])
    ht = ht.annotate_globals(n_cases=n_cases,
                             n_controls=n_controls,
                             heritability=heritability,
                             saige_version=saige_version,
                             inv_normalized=inv_normalized)
    ht = ht.drop('varT', 'varTstar', 'N', 'Tstat')

    # TODO: fix this for variants that overlap multiple genes
    vep_ht = get_vep_formatted_data(ukb_vep_path, legacy_annotations=legacy_annotations)
    ht = ht.annotate(**vep_ht[hl.struct(locus=ht.locus, alleles=ht.alleles)])

    ht = ht.checkpoint(output_ht_path, overwrite=overwrite, _read_if_exists=not overwrite)
    # NOTE(review): the table with these globals dropped is built but never
    # returned or written in the visible code; confirm whether a return or
    # further processing is missing here.
    ht = ht.drop('n_cases', 'n_controls', 'heritability')
def test_constructors(self):
    """Locus and interval constructors yield the expected row field types."""
    rg = hl.ReferenceGenome("foo", ["1"], {"1": 100})

    base_schema = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32, d=hl.tint32)
    records = [{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}]
    table = hl.Table.parallelize(records, base_schema)
    table = table.annotate(d=hl.int64(table.d))

    # l1 uses the default reference; the rest use the custom genome.
    constructed = {
        'l1': hl.parse_locus("1:51"),
        'l2': hl.locus("1", 51, reference_genome=rg),
        'i1': hl.parse_locus_interval("1:51-56", reference_genome=rg),
        'i2': hl.interval(hl.locus("1", 51, reference_genome=rg),
                          hl.locus("1", 56, reference_genome=rg)),
    }
    table = table.annotate(**constructed)

    expected_schema = {
        'a': hl.tfloat64,
        'b': hl.tfloat64,
        'c': hl.tint32,
        'd': hl.tint64,
        'l1': hl.tlocus(),
        'l2': hl.tlocus(rg),
        'i1': hl.tinterval(hl.tlocus(rg)),
        'i2': hl.tinterval(hl.tlocus(rg)),
    }

    type_matches = [
        expected_schema[field_name] == field_type
        for field_name, field_type in table.row.dtype.items()
    ]
    self.assertTrue(all(type_matches))
def import_var(seqr: str) -> hl.Table:
    '''
    Load a seqr variant tsv as a hail Table keyed by locus and alleles.

    :param str seqr: Path to variants tsv
    :return: Table of variants keyed by ``locus`` and ``alleles``
    :rtype: hl.Table
    '''
    variants = hl.import_table(seqr, impute=True)

    # Prefix the chromosome with 'chr'; hail will not recognize a locus as a
    # valid b38 locus unless the chromosome is prefixed with 'chr'.
    prefixed_chrom = hl.format('chr%s', variants.chrom)
    variants = variants.transmute(chrom=prefixed_chrom)

    # Build locus and alleles (both fields are needed to successfully join
    # with gnomAD data).
    locus_text = hl.format('%s:%s', variants.chrom, variants.pos)
    variants = variants.transmute(locus=hl.parse_locus(locus_text),
                                  alleles=[variants.ref, variants.alt])
    variants = variants.key_by('locus', 'alleles')

    variants.describe()
    return variants
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point.

    For each principal component, counts how often each absolute loading
    value occurs, writes the counts to ``loadings_pc<N>.txt`` and copies that
    file to ``output`` with ``gsutil``. Then writes the rows of the combined
    dataset (NFE and TOB samples only) at a fixed list of loci to
    ``{output}/capped_loadings_intervals.mt``.

    :param output: Destination prefix for the copied text files and the
        written MatrixTable.
    """
    hl.init(default_reference='GRCh38')

    # get frequency of loadings values
    loadings = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings.loadings).take(1)[0]
    print(loadings.count())
    for i in range(number_of_pcs):
        pc = i + 1
        freq = Counter(hl.abs(loadings.loadings[i]).collect())
        filename = 'loadings_pc' + str(pc) + '.txt'
        # The context manager closes the file; no explicit close() is needed.
        with open(filename, 'w') as f:
            f.writelines(
                repr(key) + ' ' + repr(value) + '\n' for key, value in freq.items()
            )
        # Best effort: a failed copy does not abort the remaining PCs.
        subprocess.run(['gsutil', 'cp', filename, output], check=False)

    # pull out variants that looked like they're capped in the loadings plot
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get NFE samples only
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    # These are single positions (loci), not intervals.
    loci = [
        hl.parse_locus(x, reference_genome='GRCh38')
        for x in [
            'chr1:176163025',
            'chr5:272714',
            'chr5:36104012',
            'chr1:183565810',
            'chr3:58111799',
        ]
    ]
    mt_hits = mt.filter_rows(hl.literal(loci).contains(mt.locus))
    mt_path = f'{output}/capped_loadings_intervals.mt'
    mt_hits.write(mt_path)
def annotate_with_coding(ht, fname):
    """Flag each summary-statistics row whose locus is present in ``ht``.

    Imports ``{wd_data}/{fname}``, derives a GRCh37 ``locus`` either from a
    ``variant`` column (``chrom:pos:...``) or from ``chr``/``pos`` columns,
    adds a boolean ``coding`` field that is true when ``ht`` has a row for
    that locus, drops a fixed set of helper columns and exports the result
    next to the input with ``.coding`` inserted before ``.tsv``.

    :param ht: Table that is indexed by locus to decide ``coding``.
    :param fname: File name (relative to ``wd_data``) containing ``.tsv``.
    :raises ValueError: If the file has neither a ``variant`` column nor both
        ``chr`` and ``pos`` columns.
    """
    ss0 = hl.import_table(f'{wd_data}/{fname}', impute=True, force=True, types={'chr': hl.tstr})
    columns = list(ss0.row)

    if 'variant' in columns:
        variant = ss0.variant.split(':')
        # Keep only rows whose chrom:pos is a valid GRCh37 locus.
        ss = ss0.filter(hl.is_valid_locus(variant[0], hl.int(variant[1]), 'GRCh37'))
        locus = ss.variant.split(':')
        ss = ss.annotate(locus=hl.parse_locus(locus[0] + ':' + locus[1], reference_genome='GRCh37'))
        if 'ytx' in ss.row:  # a proxy for checking if the sumstats are from UKB
            variant = ss.variant.split(':')
            # Hail tables are immutable: the annotated table must be
            # re-assigned, otherwise A1/A2 are silently never added.
            ss = ss.annotate(A1=variant[2], A2=variant[3])
    elif 'chr' in columns and 'pos' in columns:
        ss = ss0.annotate(locus=hl.locus(contig=ss0.chr, pos=ss0.pos, reference_genome='GRCh37'))
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError(
            f"{fname} has neither a 'variant' column nor both 'chr' and 'pos' columns")

    ss = ss.annotate(coding=hl.is_defined(ht[ss.locus]))

    fields = ['locus', 'AC', 'ytx', 'tstat', 'effect_allele', 'other_allele']
    fields_to_drop = [field for field in fields if field in ss.row]
    ss = ss.drop(*fields_to_drop)

    name_parts = fname.split('.tsv')
    ss.export(f"{wd_data}/{name_parts[0]}.coding.tsv{name_parts[1]}")
def generate_datasets(doctest_namespace):
    """Fill ``doctest_namespace`` with the objects that doctests refer to.

    Registers the ``hl`` and ``np`` modules plus a set of example matrix
    tables, tables and expressions under fixed names. Inputs are read from
    files under ``data/``; two matrix tables and one burden dataset are
    checkpointed under ``output/``. Prints a message when done.

    :param doctest_namespace: Mapping that receives the names; presumably the
        pytest ``doctest_namespace`` fixture (confirm against the caller).
    """
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    # Main example dataset: a 3% row sample of the sample VCF with made-up
    # row, column and global annotations.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # Replaces the string-valued 'gene' row field set above with an array.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # NOTE: this f-string has no placeholders; it is a plain path.
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)

    # The same checkpointed dataset is exposed under several names.
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row- and column-level metadata tables derived from ds.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt', overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    # Struct-typed columns are given explicit types; the rest are imputed.
    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    # Whitespace-delimited file (the delimiter is the regex \s+).
    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   }, key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    # Unlike 'interval' above, this is built with hl.Interval (capital I),
    # not the hl.interval expression constructor.
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden example: per-sample burden values, a per-variant weight equal to
    # the locus position, and a gene annotation from an interval list.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    # NOTE: this f-string has no placeholders; it is a plain path.
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    # LD score regression summary statistics, single phenotype, typed on
    # import and keyed by locus and alleles.
    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    # All-phenotype version: imported as strings, then each entry string is
    # split on ',' into chi_squared (first part) and n (second part).
    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus), alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]), n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
'/ludc/Home/daniel_c/dva/files/ukb_index/chr{}.idx2'.format(ch) } # Creating MatrixTable mt = hl.import_bgen(bgen_file, entry_fields=['GT'], sample_file=ukb_sf, index_file_map=file_map, _row_fields=['rsid']) # Extracting SNPs of interest mt_f = hl.filter_intervals(mt, ploci) mt_f = hl.variant_qc(mt_f) chromdat['chrompos'] = chromdat['chrom'] + ':' + chromdat[ 'hg19_pos'].astype(str) chromdat_hl = hl.Table.from_pandas(chromdat) chromdat_hl = chromdat_hl.annotate( locus=hl.parse_locus(chromdat_hl.chrompos, reference_genome='GRCh37')) chromdat_hl = chromdat_hl.key_by('locus') mt_f = mt_f.annotate_rows(**chromdat_hl[mt_f.locus]) flip = hl.case().when(mt_f.ea == mt_f.alleles[0], True).when(mt_f.ea == mt_f.alleles[1], False).or_missing() mt_f = mt_f.annotate_rows(flip=flip) mt_f = mt_f.annotate_rows( prior=2 * hl.if_else(mt_f.flip, mt_f.variant_qc.AF[0], mt_f.variant_qc.AF[1])) mt_f = mt_f.select_entries(G=hl.coalesce( hl.if_else(mt_f.flip, 2 - mt_f.GT.n_alt_alleles(), mt_f.GT.n_alt_alleles()), mt_f.prior)) ## Exporting result output = '/ludc/Home/daniel_c/dva/files/ukbgeno/chrom{}.vcf.bgz'.format(ch) hl.export_vcf(mt_f, output)
# TODO: write out matrix table with sim results? Should always be able to get same exact mt if global seed is set elif sim_name[:3] != 'bn_' and hl.hadoop_is_file( betas_path) and hl.hadoop_is_file(phens_path): mt = get_mt( remove_withdrawn=False ) # no need to remove withdrawn samples because phenotypes have only been calculated for non-withdrawn samples betas = hl.import_table(betas_path, impute=True, force=True) phens = hl.import_table(phens_path, key=['s'], types={'s': hl.tstr}, impute=True, force=True) betas = betas.annotate(locus=hl.parse_locus(betas.locus), alleles=betas.alleles.replace( '\[\"', '').replace('\"\]', '').split('\",\"')) betas = betas.key_by('locus', 'alleles') # sim_mt = mt.annotate_rows(beta=betas[mt.locus, mt.alleles].beta) sim_mt = sim_mt.annotate_cols(y_binarized=phens[sim_mt.s].y_binarized) else: mt = get_mt(remove_withdrawn=True) sim_mt = get_sim_mt(mt=mt, h2=h2, pi=pi, K=K) sim_mt.rows().select('beta').export(betas_path) sim_mt.cols().select('y', 'y_binarized').export(phens_path)
for variant in args.variant: print("------------") try: chrom, pos, ref, alt = variant.split("-") chrom_without_prefix = chrom.replace("chr", "") chrom = "chr" + chrom_without_prefix pos = int(pos) except: p.error(f"Unable to parse variant: {variant}") break print(f"locus: {chrom}:{pos-200}-{pos+200}") locus = hl.parse_locus(f"{chrom}:{pos}", reference_genome="GRCh38") print(f"checking v3: {chrom}-{pos}") ht_v3 = hl.read_table(v3_table) matches = ht_v3.filter(ht_v3.locus == locus, keep=True).collect() print() for match in matches: print("----") print(f" {match}") print_bam_paths(match) print(f"checking v3.1: {chrom}-{pos}") ht_v3_1 = hl.read_table(v3_1_table) matches = ht_v3_1.filter(ht_v3_1.locus == locus, keep=True).collect() print() for match in matches:
# Import modules and init Hail
import hail as hl
from hail_init import DEFAULT_REF

# Load the matrix table of variants that passed variant QC.
variantqc_pass = hl.read_matrix_table("variantqc_pass.mt")

# Example 1: keep rows inside genomic intervals. Coordinates may be exact
# or approximate (the "M" suffix form).
intervals = ["chr10:52765380-52772784", "chr1:100M-200M"]
filtered_mt = hl.filter_intervals(
    variantqc_pass,
    intervals=[
        hl.parse_locus_interval(x, reference_genome=DEFAULT_REF)
        for x in intervals
    ],
)

# Example 2: keep rows within a window of 100,000 nucleotides on either
# side of a single locus. This starts again from the full table.
locus = hl.parse_locus("chrX:23833353", reference_genome=DEFAULT_REF)
window = locus.window(before=100000, after=100000)
filtered_mt = variantqc_pass.filter_rows(window.contains(variantqc_pass.locus))

# Example 3: of those rows, keep the ones whose second AF entry is below 1%.
filtered_mt = filtered_mt.filter_rows(filtered_mt.variant_qc.AF[1] < 0.01)
def test_xpos_1(self):
    """The xpos expression for GRCh37 locus 1:55505463 evaluates to 1055505463."""
    xpos_expr = get_expr_for_xpos(hl.parse_locus("1:55505463", "GRCh37"))
    evaluated = hl.eval(xpos_expr)
    self.assertEqual(evaluated, 1055505463)
def test_xpos_grch38(self):
    """The xpos expression for GRCh38 locus chr2:166847734 evaluates to 2166847734."""
    xpos_expr = get_expr_for_xpos(hl.parse_locus("chr2:166847734", "GRCh38"))
    evaluated = hl.eval(xpos_expr)
    self.assertEqual(evaluated, 2166847734)
def test_xpos_2(self):
    """The xpos expression for GRCh37 locus X:18525192 evaluates to 23018525192."""
    xpos_expr = get_expr_for_xpos(hl.parse_locus("X:18525192", "GRCh37"))
    evaluated = hl.eval(xpos_expr)
    self.assertEqual(evaluated, 23018525192)
def main(args):
    """Build, summarise and write the final variant QC table.

    Reads the adj-genotype MatrixTable for ``args.exome_cohort``, keeps
    samples with ``pass_filters`` set, and annotates each variant with hard
    filter, VQSR, random-forest, gnomAD coverage and capture-interval flags
    plus a combined ``pass_variant_qc_filters`` flag. Per-variant-type
    counts of every flag are stored in the ``summary_filter`` global. The
    table is checkpointed to the ``final_qc`` path and, when
    ``args.write_to_file`` is set, exported as ``.tsv.bgz``.

    :param args: Parsed arguments; reads ``default_ref_genome``,
        ``exome_cohort``, ``overwrite`` and ``write_to_file``.
    """
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Load the adj-genotype MT, keep QC-passing samples and strip all
    # non-key column and row fields.
    qc_mt_path = get_qc_mt_path(dataset=args.exome_cohort,
                                part='sample_qc_adj_genotypes',
                                split=True)
    mt = hl.read_matrix_table(qc_mt_path)
    mt = mt.filter_cols(mt.pass_filters)
    mt = mt.select_cols().select_rows()

    # Variant info fields (vcf info), without the VEP struct.
    variant_info_ht = get_vep_annotation_ht().drop('vep')

    # Per-variant inputs for the hard filters.
    variant_info = variant_info_ht[mt.row_key]
    annotated_mt = mt.annotate_rows(
        inbreeding_coeff=variant_info.info.InbreedingCoeff,
        vqsr_filter=variant_info.filters,
        VQSLOD=variant_info.info.VQSLOD,
        # expected MT filtered to high-quality GT
        gt_counts=hl.agg.count_where(hl.is_defined(mt.GT)))
    ht = annotated_mt.rows()

    # Hard filters: low inbreeding coefficient, or no defined genotype.
    variant_hard_filter_expr = {
        'fail_inbreeding_coeff': ht.inbreeding_coeff < INBREEDING_COEFFICIENT_CUTOFF,
        'AC0': ht.gt_counts == 0,
    }
    ht = ht.annotate(**variant_hard_filter_expr)

    # VQSR filter: any entry in the filters field counts as a failure.
    ht = ht.annotate(fail_vqsr=hl.len(ht.vqsr_filter) != 0)

    # RF filter: type-specific cutoffs on the TP probability.
    ht_rf = hl.read_table(get_variant_qc_ht_path(part='rf_result'))
    ht_rf = ht_rf.select(rf_probability_tp=ht_rf.rf_probability['TP'],
                         variant_type=ht_rf.variant_type)
    ht = ht.annotate(**ht_rf[ht.key])

    snv_below_cutoff = ((ht.rf_probability_tp < RF_PROBABILITY_SNV_CUTOFF)
                        & (ht.variant_type == 'snv'))
    indel_below_cutoff = ((ht.rf_probability_tp < RF_PROBABILITY_INDEL_CUTOFF)
                          & (ht.variant_type == 'indel'))
    fail_rf_expr = (hl.case()
                    .when(snv_below_cutoff, True)
                    .when(indel_below_cutoff, True)
                    .default(False))
    ht = ht.annotate(fail_rf=fail_rf_expr)

    # Coverage filter: gnomAD genomes coverage, re-keyed by a parsed GRCh38
    # locus so it can be joined on ht.locus.
    gnomad_coverage_ht = get_gnomad_genomes_coverage_ht().key_by()
    parsed_locus = hl.parse_locus(gnomad_coverage_ht.locus, reference_genome='GRCh38')
    gnomad_coverage_ht = gnomad_coverage_ht.annotate(locus=parsed_locus)
    gnomad_coverage_ht = gnomad_coverage_ht.key_by('locus')
    ht = ht.annotate(gnomad_cov_10X=gnomad_coverage_ht[ht.locus].over_10)
    ht = ht.annotate(is_coveraged_gnomad_genomes=ht.gnomad_cov_10X >= 0.9)

    # Capture-interval filter: is the variant present after intersecting
    # with the capture intervals?
    ht_defined_intervals = filter_capture_intervals(ht)
    in_capture = hl.is_defined(ht_defined_intervals[ht.key])
    ht = ht.annotate(is_defined_capture_intervals=in_capture)

    # Final flag: passes only when no failure flag is set and both
    # coverage/capture flags are set.
    passes_everything = (~ht.fail_inbreeding_coeff
                         & ~ht.AC0
                         & ~ht.fail_vqsr
                         & ~ht.fail_rf
                         & ht.is_coveraged_gnomad_genomes
                         & ht.is_defined_capture_intervals)
    final_variant_qc_ann_expr = {
        'pass_variant_qc_filters': hl.cond(passes_everything, True, False)
    }
    ht = ht.annotate(**final_variant_qc_ann_expr)

    # Count, separately for snvs and indels, the values taken by every flag
    # and store the result as a global field.
    filter_flags = [
        'fail_inbreeding_coeff', 'AC0', 'fail_vqsr', 'fail_rf',
        'is_coveraged_gnomad_genomes', 'is_defined_capture_intervals',
        'pass_variant_qc_filters'
    ]
    summary_filter_expr = {}
    for variant_kind in ['snv', 'indel']:
        per_flag_counts = {
            flag: hl.agg.filter(ht.variant_type == variant_kind,
                                hl.agg.counter(ht[flag]))
            for flag in filter_flags
        }
        summary_filter_expr[variant_kind] = hl.struct(**per_flag_counts)
    summary = ht.aggregate(summary_filter_expr, _localize=False)
    ht = ht.annotate_globals(summary_filter=summary)

    # Write the final variant QC table.
    output_path = get_variant_qc_ht_path(dataset=args.exome_cohort,
                                         part='final_qc')
    ht = ht.checkpoint(output_path, overwrite=args.overwrite)

    # Log the filter summary.
    logger.info(f'Variant QC filter summary: {ht.summary_filter.collect()}')

    # Optionally export the table as a compressed tsv.
    if args.write_to_file:
        ht.export(f'{output_path}.tsv.bgz')

    # Stop Hail
    hl.stop()

    print("Finished!")