def maf_filter(mt, maf, filter_ac0_after_pruning=False):
    """
    Takes a matrix table, filters out variants below the given minor allele
    frequency, and returns the filtered matrix table.

    :param mt: matrix table to prune (should be LD pruned and have x chrom removed).
    :param maf: minor allele frequency threshold; variants with AF[1] <= maf are removed.
    :param filter_ac0_after_pruning: filter variants no longer in the data, e.g. sum(AC) = 0?
    :return: returns MAF filtered matrix table.
    """
    # Run hl.variant_qc() to get AFs
    mt = hl.variant_qc(mt)

    # Filter MAF
    logging.info(f'Filtering out variants with minor allele frequency < {maf}')
    mt = mt.filter_rows(mt.row.variant_qc.AF[1] > maf, keep=True)
    mt = mt.annotate_globals(maf_threshold_LDpruning=maf)

    if filter_ac0_after_pruning:
        logging.info('Removing variants with alt allele count = 0 (monomorphic variants).')
        mt = hl.variant_qc(mt)
        mt = mt.filter_rows(hl.sum(mt.row.variant_qc.AC) == hl.int(0), keep=False)
        count = mt.count()
        logging.info(f"MT count after removing monomorphic variants and MAF filtering: {count}")
    else:
        logging.info("MAF pruned mt count:" + str(mt.count()))

    return mt
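# A minimal usage sketch for maf_filter above, assuming hail is imported as hl
# and logging is configured; the input path and the 0.01 threshold are
# hypothetical, and the matrix table is assumed to already be LD pruned with
# chrX removed, as the docstring requires.
import logging
import hail as hl

logging.basicConfig(level=logging.INFO)
mt = hl.read_matrix_table('data/ld_pruned_autosomes.mt')  # hypothetical path
mt = maf_filter(mt, maf=0.01, filter_ac0_after_pruning=True)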
def variant_and_sample_qc_nested_with_filters_2(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.call_rate >= 0.8)
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols(mt.sample_qc.call_rate >= 0.8)
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.call_rate >= 0.98)
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols(mt.sample_qc.call_rate >= 0.98)
    mt.count()
def query(output):
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    eigenvalues_path = f'{output}/eigenvalues_10k.csv'
    scores_path = f'{output}/scores_10k.ht'
    loadings_path = f'{output}/loadings_10k.ht'
    downsampled_mt_path = f'{output}/downsampled_mt.mt'

    # Filter out variants with a call rate < 0.99 and variants where there
    # is no non-reference allele called.
    mt_qc = hl.variant_qc(mt)
    filt_mt = mt_qc.filter_rows(
        (mt_qc.variant_qc.call_rate >= 0.99) & (mt_qc.variant_qc.n_non_ref >= 1))
    nrows = filt_mt.count_rows()

    # Downsample the dataset to approximately 10k randomly-selected rows
    # (the input must be a proportion)
    downsampled_mt = filt_mt.sample_rows(10000 / nrows, seed=12345)
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        downsampled_mt.GT, compute_loadings=True, k=20)

    # Save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)

    # Save the scores, loadings, and downsampled matrix table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
    downsampled_mt.write(downsampled_mt_path, overwrite=True)
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Read Hail MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # Compute variant qc
    mt = hl.variant_qc(mt)

    # Write variant qc Hail Table
    tb_variant_qc = (mt
                     .select_rows('variant_qc')
                     .rows()
                     .flatten()
                     .key_by('locus', 'alleles')
                     )
    output_path_ht = f'{args.ht_output_path}_variant_qc.ht'
    tb_variant_qc.write(output=output_path_ht)

    if args.write_to_file:
        (hl.read_table(output_path_ht)
         .export(f'{output_path_ht}_variant_qc.tsv.bgz')
         )

    # Stop Hail
    hl.stop()

    print("Finished!")
def calculate_hail_variant_qc(mt: hl.MatrixTable) -> hl.MatrixTable:
    '''
    Compute variant qc metrics

    :param mt: the original matrixtable
    :return: annotated matrixtable with variant_qc struct
    '''
    mt_with_variantqc = hl.variant_qc(mt, name='variant_qc')
    return mt_with_variantqc
def test_sample_and_variant_qc_call_rate(self):
    mt = hl.import_vcf(resource('sample.vcf'))
    n_rows, n_cols = mt.count()
    mt = mt.filter_entries(mt.GQ > 5)
    mt = hl.variant_qc(hl.sample_qc(mt))

    assert mt.aggregate_cols(hl.agg.all(hl.approx_equal(
        mt.sample_qc.call_rate, mt.sample_qc.n_called / n_rows)))
    assert mt.aggregate_rows(hl.agg.all(hl.approx_equal(
        mt.variant_qc.call_rate, mt.variant_qc.n_called / n_cols)))
def test_variant_qc(self):
    data = [
        {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
        {'v': '1:1:A:T', 's': '2', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
        {'v': '1:1:A:T', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 11, 'DP': 100},
        {'v': '1:1:A:T', 's': '4', 'GT': None, 'GQ': None, 'DP': 100},
        {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1, 2]), 'GQ': 10, 'DP': 5},
        {'v': '1:2:A:T,C', 's': '2', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 5},
        {'v': '1:2:A:T,C', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 10, 'DP': 5},
        {'v': '1:2:A:T,C', 's': '4', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
    ]

    ht = hl.Table.parallelize(
        data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
    mt = hl.variant_qc(mt, 'vqc')
    r = mt.rows().collect()

    self.assertEqual(r[0].vqc.AF, [0.5, 0.5])
    self.assertEqual(r[0].vqc.AC, [3, 3])
    self.assertEqual(r[0].vqc.AN, 6)
    self.assertEqual(r[0].vqc.homozygote_count, [1, 1])
    self.assertEqual(r[0].vqc.n_called, 3)
    self.assertEqual(r[0].vqc.n_not_called, 1)
    self.assertEqual(r[0].vqc.call_rate, 0.75)
    self.assertEqual(r[0].vqc.n_het, 1)
    self.assertEqual(r[0].vqc.n_non_ref, 2)
    self.assertEqual(r[0].vqc.het_freq_hwe, 0.6)
    self.assertEqual(r[0].vqc.p_value_hwe, 0.7)
    self.assertEqual(r[0].vqc.dp_stats.min, 0)
    self.assertEqual(r[0].vqc.dp_stats.max, 100)
    self.assertEqual(r[0].vqc.dp_stats.mean, 51.25)
    self.assertAlmostEqual(r[0].vqc.dp_stats.stdev, 48.782040752719645)
    self.assertEqual(r[0].vqc.gq_stats.min, 10)
    self.assertEqual(r[0].vqc.gq_stats.max, 11)
    self.assertAlmostEqual(r[0].vqc.gq_stats.mean, 10.333333333333334)
    self.assertAlmostEqual(r[0].vqc.gq_stats.stdev, 0.47140452079103168)

    self.assertEqual(r[1].vqc.AF, [0.125, 0.5, 0.375])
    self.assertEqual(r[1].vqc.AC, [1, 4, 3])
    self.assertEqual(r[1].vqc.AN, 8)
    self.assertEqual(r[1].vqc.homozygote_count, [0, 1, 1])
    self.assertEqual(r[1].vqc.n_called, 4)
    self.assertEqual(r[1].vqc.n_not_called, 0)
    self.assertEqual(r[1].vqc.call_rate, 1.0)
    self.assertEqual(r[1].vqc.n_het, 2)
    self.assertEqual(r[1].vqc.n_non_ref, 4)
    self.assertEqual(r[1].vqc.p_value_hwe, None)
    self.assertEqual(r[1].vqc.het_freq_hwe, None)
    self.assertEqual(r[1].vqc.dp_stats.min, 5)
    self.assertEqual(r[1].vqc.dp_stats.max, 5)
    self.assertEqual(r[1].vqc.dp_stats.mean, 5)
    self.assertEqual(r[1].vqc.dp_stats.stdev, 0.0)
    self.assertEqual(r[1].vqc.gq_stats.min, 10)
    self.assertEqual(r[1].vqc.gq_stats.max, 10)
    self.assertEqual(r[1].vqc.gq_stats.mean, 10)
    self.assertEqual(r[1].vqc.gq_stats.stdev, 0)
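# Worked check for the first row in the test above (a sketch, following the
# test's own expected values). Sample 4 has a missing GT, so 3 of 4 samples
# are called, with genotypes 0/0, 1/1, and 0/1:
#   AN = 6 called alleles; alt count = 2 (hom var) + 1 (het) = 3, so
#   AC = [3, 3] and AF = [0.5, 0.5]; call_rate = 3/4 = 0.75; n_het = 1;
#   n_non_ref = 2 (the 1/1 and 0/1 calls).
# Note that dp_stats is computed over all four samples (DP is defined even
# where GT is missing), giving mean (0 + 5 + 100 + 100) / 4 = 51.25.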
def compute_qc_metrics(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Compute per-sample metrics and common variant statistics useful for quality control

    :param mt: Hail MatrixTable
    :return: Hail MatrixTable with variant and sample qc metrics
    """
    mt = hl.variant_qc(mt)
    mt = hl.sample_qc(mt)
    return mt
def genetics_pipeline():
    mt = get_mt()
    mt = hl.split_multi_hts(mt)
    mt = hl.variant_qc(mt)
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols(mt.sample_qc.call_rate > 0.95)
    mt = mt.filter_rows(mt.variant_qc.AC[1] > 5)
    mt = mt.filter_entries(
        hl.case()
        .when(hl.is_indel(mt.alleles[0], mt.alleles[1]), mt.GQ > 20)
        .default(mt.GQ > 10))
    mt.write('/tmp/genetics_pipeline.mt', overwrite=True)
def ld_prune_filter(intersect_out, prune_out, overwrite: bool = False):
    mt = hl.read_matrix_table(intersect_out)
    print(mt.count())
    mt = hl.variant_qc(mt)
    mt_filt = mt.filter_rows(
        (mt.variant_qc.AF[0] > 0.001) & (mt.variant_qc.AF[0] < 0.999))
    print(mt_filt.count())
    mt_intersect_prune = hl.ld_prune(mt_filt.GT, r2=0.8, bp_window_size=500000)
    mt_intersect_pruned = mt_filt.filter_rows(
        hl.is_defined(mt_intersect_prune[mt_filt.row_key]))
    mt_intersect_pruned.write(prune_out, overwrite)
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = hl.variant_qc(tob_wgs)
    # Keep variants with at least one alt call, then count those with MAF > 0.05
    tob_wgs = tob_wgs.filter_rows(tob_wgs.variant_qc.AF[0] < 1)
    snp_maf_05 = tob_wgs.aggregate_rows(
        hl.agg.count_where(tob_wgs.variant_qc.AF[1] > 0.05))
    print(f'Variant MAF > 0.05 = {snp_maf_05}')
def compute_qc(mt: hl.MatrixTable,
               root_col_name='sample_qc',
               root_row_name='variant_qc') -> hl.MatrixTable:
    """
    Given a MatrixTable, compute sample/variant quality control metrics

    :param mt: Input MatrixTable
    :param root_col_name: prefix for the sample qc field
    :param root_row_name: prefix for the variant qc field
    :return: MatrixTable with quality control metrics computed
    """
    mt = hl.sample_qc(mt, name=root_col_name)
    mt = hl.variant_qc(mt, name=root_row_name)
    return mt
def filter_snps(mt, maf):
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt = mt.filter_rows(mt.maf > maf)

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt = hl.filter_intervals(
        mt,
        [hl.parse_locus_interval(x, reference_genome='GRCh38') for x in intervals],
        keep=False)
    return mt
def pca_filter_mt(in_mt: hl.MatrixTable,
                  maf: float = 0.05,
                  hwe: float = 1e-3,
                  call_rate: float = 0.98,
                  ld_cor: float = 0.2,
                  ld_window: int = 250000):
    print("\nInitial number of SNPs before filtering: {}".format(in_mt.count_rows()))
    mt = hl.variant_qc(in_mt)

    print(f'\nFiltering out variants with MAF < {maf}')
    mt_filt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt_filt = mt_filt.filter_rows(mt_filt.maf > maf)

    print(f'\nFiltering out variants with HWE < {hwe:.1e}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.p_value_hwe > hwe)

    print(f'\nFiltering out variants with call rate < {call_rate}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.call_rate >= call_rate)

    # No strand ambiguity
    print('\nFiltering out strand ambiguous variants')
    mt_filt = mt_filt.filter_rows(
        ~hl.is_strand_ambiguous(mt_filt.alleles[0], mt_filt.alleles[1]))

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    print('\nFiltering out variants in MHC [chr6:25M-35M] and chromosome 8 inversions [chr8:7M-13M]')
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt_filt = hl.filter_intervals(
        mt_filt,
        [hl.parse_locus_interval(x, reference_genome='GRCh38') for x in intervals],
        keep=False)

    # This step is expensive (on a local machine)
    print(f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}')
    mt_ld_prune = hl.ld_prune(mt_filt.GT, r2=ld_cor, bp_window_size=ld_window)
    mt_ld_pruned = mt_filt.filter_rows(
        hl.is_defined(mt_ld_prune[mt_filt.row_key]))

    print("\nNumber of SNPs after filtering: {}".format(mt_ld_pruned.count_rows()))

    return mt_ld_pruned
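# Hedged example of feeding pca_filter_mt into PCA; the input path is
# hypothetical, and k=10 is an illustrative choice rather than a value from
# the source.
mt = hl.read_matrix_table('data/genotypes_grch38.mt')  # hypothetical path
mt_pruned = pca_filter_mt(mt, maf=0.05, hwe=1e-3, call_rate=0.98)
eigenvalues, scores, loadings = hl.hwe_normalized_pca(
    mt_pruned.GT, k=10, compute_loadings=True)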
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')

    # Filter to loci that are contained in both matrix tables after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()

    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)

    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])

    # Choose variants based off of gnomAD v3 parameters
    hgdp1kg_tobwgs_joined = hl.variant_qc(hgdp1kg_tobwgs_joined)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_rows(
        IB=hl.agg.inbreeding(hgdp1kg_tobwgs_joined.GT,
                             hgdp1kg_tobwgs_joined.variant_qc.AF[1]))
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        (hl.len(hgdp1kg_tobwgs_joined.alleles) == 2)
        & (hgdp1kg_tobwgs_joined.locus.in_autosome())
        & (hgdp1kg_tobwgs_joined.variant_qc.AF[1] > 0.01)
        & (hgdp1kg_tobwgs_joined.variant_qc.call_rate > 0.99)
        & (hgdp1kg_tobwgs_joined.IB.f_stat > -0.25))

    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.cache()
    nrows = hgdp1kg_tobwgs_joined.count_rows()
    print(f'hgdp1kg_tobwgs_joined.count_rows() = {nrows}')
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.sample_rows(
        NUM_ROWS_BEFORE_LD_PRUNE / nrows, seed=12345)

    pruned_variant_table = hl.ld_prune(hgdp1kg_tobwgs_joined.GT,
                                       r2=0.1,
                                       bp_window_size=500000)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        hl.is_defined(pruned_variant_table[hgdp1kg_tobwgs_joined.row_key]))
    mt_path = f'{output}/tob_wgs_hgdp_1kg_filtered_variants.mt'
    hgdp1kg_tobwgs_joined.write(mt_path)
def ld_prune_filter(mt: hl.MatrixTable, mt_ld: str, overwrite: bool = False):
    """
    Runs variant QC, filters out rare variants and variants with missingness,
    and LD prunes to independent variants

    :param mt: Matrix table to run variant QC on and filter variants from
    :param mt_ld: Path to write intermediate filtered mt
    :param overwrite: if True, overwrites existing data
    :return:
    """
    mt.describe()
    mt = hl.variant_qc(mt)
    # mt_filt = mt.filter_rows((mt.variant_qc.AF[0] > 0.01) & (mt.variant_qc.AF[0] < 0.99))
    mt_filt = mt.filter_rows(
        (mt.variant_qc.AF[0] > 0.05)
        & (mt.variant_qc.AF[0] < 0.95)
        & (mt.variant_qc.call_rate > 0.999))

    # pruned = hl.ld_prune(mt_filt.GT, r2=0.2, bp_window_size=500000)
    pruned = hl.ld_prune(mt_filt.GT, r2=0.1, bp_window_size=500000)
    mt_filt = mt_filt.filter_rows(hl.is_defined(pruned[mt_filt.row_key]))
    mt_filt.write(mt_ld, overwrite)
def query(output):
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    mt_path = f'{output}/filtered_mt.mt'
    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # Reproduce gnomAD genotype filtering
    mt = annotate_adj(mt)
    mt = mt.filter_entries(mt.adj)
    mt = hl.variant_qc(mt)

    # Filter to common and biallelic variants
    mt = mt.filter_rows((hl.len(mt.alleles) == 2) & (mt.variant_qc.AF[1] > 0.05))
    pruned_variant_table = hl.ld_prune(mt.GT, r2=0.2, bp_window_size=500000)
    filtered_mt = mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key]))

    # Save filtered mt table
    filtered_mt.write(mt_path, overwrite=True)
def load_files(file_prefix, overwrite, gencove, mt):
    """
    Loads VCFs, runs sample QC and variant QC, and writes a matrix table

    :param file_prefix:
    :param overwrite:
    :return:
    """
    if gencove:
        ngap_downsample = hl.read_matrix_table(file_prefix + '_grch38.mt')
    else:
        ngap_downsample = hl.import_vcf(file_prefix + '.vcf.gz',
                                        force_bgz=True,
                                        reference_genome='GRCh38',
                                        min_partitions=200)
    ngap_downsample = hl.split_multi_hts(ngap_downsample)
    ngap_downsample = ngap_downsample.filter_cols(
        (ngap_downsample.s != 'NGE0018') & (ngap_downsample.s != 'NGE0130'))
    ngap_sample_qc = hl.sample_qc(ngap_downsample)
    ngap_sample_variant_qc = hl.variant_qc(ngap_sample_qc)
    ngap_sample_variant_qc.write(file_prefix + '.mt', overwrite=overwrite)
def compute_kinship_ht(mt, genome_version="GRCh38"):
    mt = filter_to_biallelics(mt)
    mt = filter_to_autosomes(mt)
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.call_rate > 0.99)
    # mt = mt.filter_rows(mt.info.AF > 0.001)  # leaves 100% of variants
    mt = ld_prune(mt, genome_version=genome_version)

    ibd_results_ht = hl.identity_by_descent(mt, maf=mt.info.AF, min=0.10, max=1.0)
    ibd_results_ht = ibd_results_ht.annotate(
        ibd0=ibd_results_ht.ibd.Z0,
        ibd1=ibd_results_ht.ibd.Z1,
        ibd2=ibd_results_ht.ibd.Z2,
        pi_hat=ibd_results_ht.ibd.PI_HAT).drop("ibs0", "ibs1", "ibs2", "ibd")
    kin_ht = ibd_results_ht

    # Filter to anything above the relationship of a grandparent
    first_degree_pi_hat = 0.40
    grandparent_pi_hat = 0.20
    grandparent_ibd1 = 0.25
    grandparent_ibd2 = 0.15

    kin_ht = kin_ht.key_by("i", "j")
    kin_ht = kin_ht.filter(
        (kin_ht.pi_hat > first_degree_pi_hat)
        | ((kin_ht.pi_hat > grandparent_pi_hat)
           & (kin_ht.ibd1 > grandparent_ibd1)
           & (kin_ht.ibd2 < grandparent_ibd2)))

    kin_ht = kin_ht.annotate(relation=hl.sorted([kin_ht.i, kin_ht.j]))  # better variable name

    return kin_ht
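# A minimal usage sketch for compute_kinship_ht; the input path is
# hypothetical, and the helpers filter_to_biallelics, filter_to_autosomes, and
# ld_prune referenced inside the function are assumed to be defined in scope.
mt = hl.read_matrix_table('data/cohort_grch38.mt')  # hypothetical path
kin_ht = compute_kinship_ht(mt, genome_version='GRCh38')
kin_ht.export('output/kinship_pairs.tsv')  # hypothetical output path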
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=table[mt.s])

    mt = hl.sample_qc(mt)
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4)
                        & (mt.sample_qc.call_rate >= 0.97))

    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1))
                           | (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75))
                           | (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)

    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)
    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.pheno.isFemale,
                    mt.scores[0], mt.scores[1], mt.scores[2]])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
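# Usage sketch for run_gwas; the file names are hypothetical. The phenotypes
# file is assumed to be keyed by a 'Sample' column and to contain the
# 'CaffeineConsumption' and 'isFemale' fields referenced inside the function.
run_gwas('data/cohort.vcf', 'data/phenotypes.tsv', 'output/gwas_results')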
mt_split = hl.split_multi(mt)
mt_split = mt_split.select_entries(GT=hl.downcode(mt_split.GT, mt_split.a_index))
mt_split = mt_split.annotate_rows(info=hl.struct(
    AC=mt_split.info.AC[mt_split.a_index - 1],
    VT=(hl.case()
        .when((mt_split.alleles[0].length() == 1)
              & (mt_split.alleles[1].length() == 1), 'SNP')
        .when(mt_split.alleles[0].matches('<CN*>')
              | mt_split.alleles[1].matches('<CN*>'), 'SV')
        .default('INDEL'))))

n_rows, n_cols = mt_split.count()
n_partitions = mt_split.n_partitions()

mt_split = hl.sample_qc(mt_split)
mt_split = hl.variant_qc(mt_split)

mt_split = mt_split.annotate_globals(
    metadata=hl.struct(name='1000_Genomes_phase3_chrMT',
                       reference_genome='GRCh37',
                       n_rows=n_rows,
                       n_cols=n_cols,
                       n_partitions=n_partitions))

mt_split.write(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_chrMT.GRCh37.mt',
    overwrite=True)

mt = hl.read_matrix_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_chrMT.GRCh37.mt')
mt.describe()
def relatedness_check(in_mt: hl.MatrixTable = None,
                      method: str = 'pc_relate',
                      outdir: str = None,
                      kin_estimate: float = 0.98):
    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't put this in Python, keep it as a Hail expr
    call_rate_dict = in_mt.aggregate_cols(
        hl.dict(hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))),
        _localize=False)

    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT,
                                      0.01,
                                      k=10,
                                      min_kinship=0.1,
                                      statistics='kin')
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.kin > kin_estimate)

        # Get call rates for both samples so we remove the one with the lower
        # call rate of the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i.s],
            cr_s2=call_rate_dict[samples_to_remove_ht.j.s])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        # hl.identity_by_descent returns a Hail Table with the sample pairs
        relatedness_ht = hl.identity_by_descent(in_mt, maf=in_mt['maf'])
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.ibd.PI_HAT > kin_estimate)

        # Get call rates for both samples so we remove the one with the lower
        # call rate of the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i],
            cr_s2=call_rate_dict[samples_to_remove_ht.j])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            raise Exception("\nThe maximum kinship coefficient for KING is 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s)
            & (relatedness_mt.phi >= kin_estimate),
            keep=True)
        samples_to_remove_ht = filtered_relatedness_mt.entries()
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.s_1],
            cr_s2=call_rate_dict[samples_to_remove_ht.s])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.s_1, samples_to_remove.s))

    samples = samples_list.sample_to_remove.collect()

    if len(samples) > 0:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']),
                                  keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(
            len(samples)))
        with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f:
            for sample in samples:
                f.write(sample + "\n")
    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
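# A hedged example invoking relatedness_check with the IBD method; the input
# path is hypothetical and the 0.2 PI_HAT cutoff is illustrative (the default
# of 0.98 targets near-duplicates). Note the function writes
# relatedness_removed_samples.tsv into outdir, so outdir should end in '/'.
mt = hl.read_matrix_table('data/qc_passed.mt')  # hypothetical path
mt_unrelated = relatedness_check(in_mt=mt, method='ibd',
                                 outdir='output/', kin_estimate=0.2)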
def get_data(a2_reference):
    mt_imported = hl.import_plink(bfile + '.bed',
                                  bfile + '.bim',
                                  bfile + '.fam',
                                  a2_reference=a2_reference)
    return (hl.variant_qc(mt_imported)
            .rows()
            .key_by('rsid'))
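# Minimal sketch of calling get_data; bfile is a module-level PLINK prefix
# referenced inside the function, so a hypothetical value is assigned here.
# With a2_reference=True, hl.import_plink treats the .bim A2 allele as the
# reference allele.
bfile = 'data/cohort_grch37'  # hypothetical .bed/.bim/.fam prefix
qc_rows_by_rsid = get_data(a2_reference=True)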
##
# Main script
#
logger.info(f"Reading pedigree file {args.fam}")
pedigree = hl.Pedigree.read(args.fam)

logger.info(f"Importing vcf file {args.vcf}")
data = hl.import_vcf(args.vcf,
                     call_fields=['GT'],
                     skip_invalid_loci=True,
                     force_bgz=True)
data = hl.split_multi_hts(data)
data = data.annotate_rows(AC=data.info.AC[data.a_index - 1],
                          iAF=data.info.AF[data.a_index - 1])
data = hl.variant_qc(data)

logger.info("Applying de novo filter...")
de_novo_scores = hl.de_novo(data,
                            pedigree,
                            pop_frequency_prior=data.variant_qc.AF[-1])
de_novo_mt = de_novo_scores.to_matrix_table(row_key=['locus', 'alleles'],
                                            col_key=['id'])
de_novo_data = data.annotate_entries(
    p_de_novo=de_novo_mt[(data.locus, data.alleles), data.s].p_de_novo)

logger.info("Annotating trio data...")
trio_mt = hl.trio_matrix(de_novo_data, pedigree, complete_trios=True)
de_novo_data = de_novo_data.annotate_entries(
    mother=trio_mt[(de_novo_data.locus, de_novo_data.alleles),
                   filter=r'\d/\d/\d',
                   skip_invalid_loci=True,
                   force_bgz=True,
                   reference_genome='GRCh38',
                   contig_recoding=recoding_dict)

### Filter to PASS variants and split_multi
mt2 = mt.filter_rows(mt.filters.size() > 0, keep=False)
mt2 = hl.split_multi_hts(mt2)

# Require alt allele read support (AD >= 2), with at least one read in both
# forward and reverse orientations; then remove monomorphic variants
mt3 = mt2.filter_entries(
    ((mt2.AD[1] < 2) | (mt2.F1R2[1] == 0) | (mt2.F2R1[1] == 0)),
    keep=False)
mt3 = hl.variant_qc(mt3)
mt3 = mt3.filter_rows(
    (mt3.variant_qc.AF[1] > 0) & (mt3.variant_qc.AF[1] < 1), keep=True)

mt4 = mt3.annotate_rows(
    v=hl.variant_str(mt3.locus, mt3.alleles),
    NumAltAlleles=hl.agg.max(mt3.GT.n_alt_alleles()),
    VAF=hl.agg.explode(lambda x: hl.agg.mean(x), mt3.AF),
    TLOD=mt3.info.TLOD[0],
    GERMQ=mt3.info.GERMQ,
    STR=mt3.info.STR,
    AD_alt=hl.agg.mean(mt3.AD[1]),
    AD_ref=hl.agg.mean(mt3.AD[0]))
mt4 = mt4.annotate_entries(
    Binomial_Prob=hl.binom_test(mt4.AD[1], mt4.DP, 0.5, 'greater'))
mt4 = mt4.key_rows_by("v")
def main():
    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1  # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Set the maximum number of cores
    hl.init(master="local[{}]".format(cores))

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Create my own rg38 with altered names
    rg38_custom_contigs = [contig.replace('chr', '') for contig in rg38.contigs]
    rg38_custom_lens = {}
    for contig in rg38.lengths:
        rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig]
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs,
                                     rg38_custom_lens)

    print('Processing chromosome {0}'.format(chrom))

    # Index bgen if not existing
    if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'):
        hl.index_bgen(in_bgen.format(chrom=chrom),
                      contig_recoding={"01": "1", "02": "2", "03": "3",
                                       "04": "4", "05": "5", "06": "6",
                                       "07": "7", "08": "8", "09": "9"},
                      reference_genome='GRCh37')

    # Load bgen
    mt = hl.import_bgen(in_bgen.format(chrom=chrom),
                        entry_fields=['GT'],
                        sample_file=in_sample)

    # Load list of samples to keep
    samples_to_keep = hl.import_table(to_keep_list,
                                      no_header=True,
                                      impute=False,
                                      types={'f0': hl.tstr}).key_by('f0')

    # Downsample to required subset of samples
    mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s]))

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on MAF
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate(
        MAF=hl.min(mt.variant_qc.AF)))
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"] for f in files: if os.path.isdir(f): shutil.rmtree(f) ds = hl.import_vcf('data/sample.vcf.bgz') ds = ds.sample_rows(0.03) ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5), panel_maf=0.1, anno1=5, anno2=0, consequence="LOF", gene="A", score=5.0) ds = ds.annotate_rows(a_index=1) ds = hl.sample_qc(hl.variant_qc(ds)) ds = ds.annotate_cols(is_case=True, pheno=hl.struct(is_case=hl.rand_bool(0.5), is_female=hl.rand_bool(0.5), age=hl.rand_norm(65, 10), height=hl.rand_norm(70, 10), blood_pressure=hl.rand_norm(120, 20), cohort_name="cohort1"), cov=hl.struct(PC1=hl.rand_norm(0, 1)), cov1=hl.rand_norm(0, 1), cov2=hl.rand_norm(0, 1), cohort="SIGMA") ds = ds.annotate_globals(global_field_1=5, global_field_2=10, pli={'SCN1A': 0.999, 'SONIC': 0.014}, populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
def variant_qc():
    hl.variant_qc(get_mt()).rows()._force_count()
hl.init(default_reference='GRCh37')

## Variant level annotations (VEP annotations; annotated separately)
mt5 = hl.read_table('gs://ukbb_v2/projects/mzekavat/ukbb_v3.AllAutosomalANDchrX.annotations.ht')

## UKBB imputed bgens:
ds = hl.import_bgen(
    'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen',
    entry_fields=['GT'],
    sample_file='gs://ukbb_v2/data/ukb7089_imp_chr3_v3_s487395.sample')

## Phenotype file
phenos = hl.import_table(
    'gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/ukbb_PhenoFile.ALL_500k_incidPrevCases.plusRespPhenos.plusBPMeds.plusPFTs.plusCHIP.QCed.txt.gz',
    force_bgz=True,
    key='id',
    types={'id': hl.tstr},
    impute=True)

ds = ds.annotate_rows(**mt5.index(ds.row_key))
ds = ds.annotate_cols(pheno=phenos[ds.col_key])
ds = ds.annotate_cols(array=hl.if_else((ds.pheno.genotyping_array == "UKBB"), 1, 0))
ds = ds.filter_cols(hl.is_defined(ds.pheno.age), keep=True)

### Variant qc
mt = hl.variant_qc(ds, name='variant_qc')
mt = mt.filter_rows(
    ((mt.variant_qc.AF[1] > 0.001)
     & (mt.variant_qc.AF[1] < 0.999)
     & (mt.info > 0.4)
     & (mt.variant_qc.p_value_hwe >= 0.0000000001)),
    keep=True)
final = mt.annotate_rows(AF=mt.variant_qc.AF[1],
                         AC=mt.variant_qc.AC[1],
                         AN=mt.variant_qc.AN)
#final_annot = final.annotate_rows(HWE = final.variant_qc.p_value_hwe, callRate = final.variant_qc.call_rate)
#final_annot = final_annot.drop('variant_qc').rows()

### GWAS: logistic regression (Wald test)
gwas = hl.logistic_regression_rows(
    test='wald',
    y=final.pheno.All_Pneumonia,
    x=final.GT.n_alt_alleles(),
    covariates=[1, final.pheno.age, final.pheno.age2, final.pheno.Sex_numeric,
                final.pheno.ever_smoked, final.pheno.PC1, final.pheno.PC2,
                final.pheno.PC3, final.pheno.PC4, final.pheno.PC5,
                final.pheno.PC6, final.pheno.PC7, final.pheno.PC8,
                final.pheno.PC9, final.pheno.PC10, final.array],
    pass_through=['rsid', 'Gene', 'Consequence', 'clin_sig', 'metasvm',
                  'LOF_LOFTEE', 'PolyPhen', 'SIFT', 'hgvsp', 'AF', 'AC', 'AN',
                  'info'])

### Writing out the annotated GWAS results:
gwas.flatten().export('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.tsv.bgz')
gwas.write('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.ht')
gwas = hl.read_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.ht')
sample_annotations = hl.read_table(PHENOTYPES_TABLE)
impute_sex_annotations = hl.read_table(IMPUTESEX_TABLE)
annotation_annotations = hl.read_table(ANNOTATION_TABLE)

mt = hl.read_matrix_table(MT)
mt = mt.drop('a_index', 'qual', 'info', 'filters', 'was_split')
mt = mt.filter_cols(hl.is_defined(ht_final_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_final_variants[mt.row_key]))
mt = mt.annotate_cols(phenotype=sample_annotations[mt.col_key])
mt = mt.annotate_cols(imputesex=impute_sex_annotations[mt.col_key])
mt = mt.annotate_rows(annotation=annotation_annotations[mt.row_key])

mt = hl.variant_qc(mt, name='qc')

mt = mt.annotate_rows(qc=mt.qc.annotate(
    p_value_hwe=hl.case()
    .when(mt.locus.in_autosome(), mt.qc.het_freq_hwe)
    .default(hl.agg.filter(
        mt.imputesex.impute_sex.is_female,
        hl.agg.hardy_weinberg_test(mt.GT).het_freq_hwe))))

mt = mt.annotate_rows(annotation=mt.annotation.annotate(
    info=mt.annotation.info.annotate(
        AC=mt.annotation.info.AC[mt.annotation.a_index - 1],
        AF=mt.annotation.info.AF[mt.annotation.a_index - 1],
    )))

mt = hl.sample_qc(mt)

mt_pca = mt.filter_rows(hl.is_defined(ht_final_pruned_variants[mt.row_key]))
def variant_and_sample_qc():
    mt = get_mt()
    hl.sample_qc(hl.variant_qc(mt))._force_count_rows()
def init(doctest_namespace):
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    doctest_namespace['hl'] = hl
    doctest_namespace['agg'] = agg

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")

    yield

    os.chdir(olddir)
import gcsfs
import hail as hl
import hail.expr.aggregators as agg
# Assumed import: 'client' was undefined in the original snippet, so a
# google-cloud-storage client is created here.
from google.cloud import storage

fs = gcsfs.GCSFileSystem(project='your-project')
client = storage.Client()  # assumption: the original intended a GCS client
bucket = client.get_bucket('your-bucket')

hl.init()

# Read mt file
mt = hl.read_matrix_table(
    "gs://1k_genome/1000-genomes/VDS-of-all/ALL.chr.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.mt"
)
#print(mt.count())  (39706715, 1092)

# Filter MAF
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
#print(mt.count())  (13404583, 1092)

# Filter only SNPs
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
#print(mt.count())  (12194564, 1092)

# Annotate MT file
table = (hl.import_table('gs://ines-work/KG-annotation-with-sexencoder.csv',
                         delimiter=',',
                         missing='',
                         quote='"',
                         types={'Gender_Classification': hl.tfloat64})
         .key_by('Sample'))
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={'locus': hl.tlocus('GRCh37'),
               'alleles': hl.tarray(hl.tstr),
               'chi_squared': hl.tfloat64,
               'n': hl.tint32,
               'ld_score': hl.tfloat64,
               'phenotype': hl.tstr,
               'chi_squared_50_irnt': hl.tfloat64,
               'n_50_irnt': hl.tint32,
               'chi_squared_20160': hl.tfloat64,
               'n_20160': hl.tint32},
        key=['locus', 'alleles'])
    doctest_namespace['ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={'locus': hl.tstr,
                    'alleles': hl.tstr,
                    'ld_score': hl.tfloat64},
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")