def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a small GWAS on a VCF and export the p-values plus PLINK files.

    The VCF is first materialised as the matrix table ``tmp.mt`` in the
    working directory and read back. Principal components are computed on a
    1% row sample (fixed seed) and the first three are used as covariates.

    Args:
        vcf_file: path of the VCF to import.
        phenotypes_file: delimited file with a ``Sample`` column and a
            ``CaffeineConsumption`` column.
        output_file: output prefix; ``<prefix>.assoc`` and the PLINK file
            set are written under it.
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # Round-trip through native format before doing any work on the data.
    hl.import_vcf(vcf_file).write('tmp.mt')
    dataset = hl.read_matrix_table('tmp.mt')
    dataset = dataset.annotate_cols(pheno=phenotypes[dataset.s])

    # PCA on a thinned set of variants; eigenvalues and loadings are unused.
    thinned = dataset.sample_rows(0.01, seed=11223344)
    _, pc_scores, _ = hl.hwe_normalized_pca(thinned.GT)
    dataset = dataset.annotate_cols(scores=pc_scores[dataset.s].scores)

    # Intercept plus the first three principal components.
    covariates = [1.0] + [dataset.scores[i] for i in range(3)]
    results = hl.linear_regression_rows(
        y=dataset.pheno.CaffeineConsumption,
        x=dataset.GT.n_alt_alleles(),
        covariates=covariates,
    )

    # Reduce to a two-column table keyed by the variant string.
    results = results.select(
        SNP=hl.variant_str(results.locus, results.alleles),
        P=results.p_value,
    )
    results = results.key_by('SNP').select('P')
    results.export(f'{output_file}.assoc', header=True)

    hl.export_plink(dataset, output_file, fam_id=dataset.s, ind_id=dataset.s)
def export_qced_file(mt: hl.MatrixTable, out_dir: str, basename: str, export_type='hail'):
    """Write a QC'ed matrix table under ``<out_dir>GWASpy/Preimp_QC/``.

    Args:
        mt: matrix table to export.
        out_dir: output root. It is concatenated verbatim, so it must
            already end with a path separator.
        basename: file stem; ``_qced`` is appended to it.
        export_type: ``'hail'`` writes a native ``.mt`` (overwriting),
            ``'plink'`` writes a PLINK file set, and any other value writes
            a block-gzipped VCF.
    """
    outname = basename + '_qced'
    target = f'{out_dir}GWASpy/Preimp_QC/{outname}'

    if export_type == 'hail':
        mt.write(f'{target}.mt', overwrite=True)
        return

    if export_type == 'plink':
        hl.export_plink(
            dataset=mt,
            output=target,
            fam_id=mt.fam_id,
            ind_id=mt.s,
            pat_id=mt.pat_id,
            mat_id=mt.mat_id,
            is_female=mt.is_female,
            pheno=mt.is_case,
            varid=mt.rsid,
        )
        return

    # Fallback for every other export_type value.
    hl.export_vcf(mt, f'{target}.vcf.bgz')
def test_import_plink_a1_major(self):
    """Importing with ``a2_reference=False`` must mirror ``a2_reference=True``.

    The same exported PLINK files are imported both ways; alleles and
    homozygote counts must be swapped, while the het and no-call counts
    must be identical.
    """
    dataset = get_dataset()
    bfile = '/tmp/sample_plink'
    hl.export_plink(dataset, bfile, ind_id=dataset.s)

    def qc_rows_by_rsid(a2_reference):
        # Import the exported files and return per-variant QC keyed by rsid.
        imported = hl.import_plink(bfile + '.bed',
                                   bfile + '.bim',
                                   bfile + '.fam',
                                   a2_reference=a2_reference)
        with_qc = hl.variant_qc(imported)
        return with_qc.rows().key_by('rsid')

    a2 = qc_rows_by_rsid(a2_reference=True)
    a1 = qc_rows_by_rsid(a2_reference=False)

    # Put both views of each variant side by side.
    joined = a2.annotate(a1_alleles=a1[a2.rsid].alleles,
                         a1_vqc=a1[a2.rsid].variant_qc)
    joined = joined.rename({'variant_qc': 'a2_vqc', 'alleles': 'a2_alleles'})

    alleles_swapped = ((joined.a1_alleles[0] == joined.a2_alleles[1])
                       & (joined.a1_alleles[1] == joined.a2_alleles[0]))
    symmetric_counts_equal = ((joined.a1_vqc.n_not_called == joined.a2_vqc.n_not_called)
                              & (joined.a1_vqc.n_het == joined.a2_vqc.n_het))
    homozygotes_swapped = ((joined.a1_vqc.homozygote_count[0] == joined.a2_vqc.homozygote_count[1])
                           & (joined.a1_vqc.homozygote_count[1] == joined.a2_vqc.homozygote_count[0]))

    self.assertTrue(joined.all(alleles_swapped & symmetric_counts_equal & homozygotes_swapped))
def test_import_plink_empty_bim(self):
    """Importing a PLINK file set exported from a row-less dataset must fail."""
    prefix = '/tmp/test_empty_bim'
    rowless = get_dataset().drop_rows()
    hl.export_plink(rowless, prefix, ind_id=rowless.s)

    bed, bim, fam = (prefix + extension for extension in ('.bed', '.bim', '.fam'))
    with self.assertRaisesRegex(FatalError, ".bim file does not contain any variants"):
        hl.import_plink(bed, bim, fam)
def to_plink(pops: list,
             subsets_dir,
             mt,
             ht_sample,
             bfile_path,
             export_varid: bool = True,
             overwrite=False):
    r'''
    Export the samples of `mt` that are present in `ht_sample` as a PLINK
    file set at prefix `bfile_path`.

    Variant IDs are written as ``<locus>:<allele 0>:<allele 1>``. The export
    is skipped when `overwrite` is false and both the ``.bed`` and ``.bim``
    files already exist.

    `pops`, `subsets_dir` and `export_varid` are accepted but not used by
    this function body.

    NOTE: These files will need to be split up by chromosome before
    plink_clump.py can be run.
    '''
    assert 'GT' in mt.entry, "mt must have 'GT' as an entry field"
    assert mt.GT.dtype == hl.tcall, "entry field 'GT' must be of type `Call`"

    # Existence is only probed when overwriting was not requested.
    already_exported = not overwrite and all([
        hl.hadoop_exists(f'{bfile_path}.{suffix}')
        for suffix in ['bed', 'bim']
    ])
    if already_exported:
        print(f'\nPLINK .bed and .bim files already exist for {bfile_path}')
        print(bfile_path)
        return

    print(f'Saving to bfile prefix {bfile_path}')
    variant_id = hl.str(mt.locus) + ':' + mt.alleles[0] + ':' + mt.alleles[1]
    mt_sample = mt.annotate_rows(varid=variant_id)

    # Keep only the columns whose sample ID has a row in ht_sample.
    in_sample_table = hl.is_defined(ht_sample[mt_sample.s])
    mt_sample = mt_sample.filter_cols(in_sample_table)

    # varid used to be rsid
    hl.export_plink(dataset=mt_sample,
                    output=bfile_path,
                    ind_id=mt_sample.s,
                    varid=mt_sample.varid)
def query():
    """Query script entry point.

    Reads the matrix table at ``TOB_WGS``, densifies it, splits multi-allelic
    sites and exports the result as PLINK files at
    ``output_path('tob_wgs_plink')``, using ``s`` as the individual ID.
    """
    hl.init(default_reference='GRCh38')

    sparse_mt = hl.read_matrix_table(TOB_WGS)
    dense_mt = hl.experimental.densify(sparse_mt)
    split_mt = hl.split_multi_hts(dense_mt)

    plink_prefix = output_path('tob_wgs_plink')
    hl.export_plink(split_mt, plink_prefix, ind_id=split_mt.s)
def main():
    """Lift a GRCh37 PLINK dataset over to GRCh38 and re-export it as PLINK.

    Loci are lifted with the chain file given on the command line, contig
    names lose their ``chr`` prefix, and variants that fail to lift over are
    dropped before writing.

    Returns:
        0 once the export has finished.
    """
    # Parse args
    args = parse_args()

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(args.chainfile, rg38)

    # Create my own rg38 with altered names
    bare_contigs = [name.replace('chr', '') for name in rg38.contigs]
    bare_lengths = {
        name.replace('chr', ''): length
        for name, length in rg38.lengths.items()
    }
    rg38_custom = hl.ReferenceGenome('rg38_custom', bare_contigs, bare_lengths)

    # Load plink
    plink_prefix = args.in_plink
    mt = hl.import_plink(bed=plink_prefix + '.bed',
                         bim=plink_prefix + '.bim',
                         fam=plink_prefix + '.fam',
                         reference_genome='GRCh37',
                         min_partitions=args.min_partitions)

    # # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Liftover, and strip chr from contig name (causes problems with GCTA)
    lifted = hl.liftover(mt.locus, 'GRCh38')
    mt = mt.annotate_rows(locus_GRCh38=lifted,
                          contig_GRCh38=lifted.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom).
    # The row key has to be dropped first because `locus` is part of it.
    mt = mt.key_rows_by()
    new_locus = hl.locus(mt.contig_GRCh38,
                         mt.locus_GRCh38.position,
                         reference_genome=rg38_custom)
    mt = mt.annotate_rows(locus=new_locus)
    mt = mt.key_rows_by('locus', 'alleles')

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=args.out_plink)

    return 0
def test_export_import_plink_same(self):
    """A dataset shaped like PLINK import output must survive export + import."""
    mt = get_dataset()

    # Rows: contig:position:allele0:allele1 as the ID plus a constant cM position.
    variant_id = hl.delimit(
        [mt.locus.contig, hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]],
        ':')
    mt = mt.select_rows(rsid=variant_id, cm_position=15.0)

    # Columns: every pedigree/phenotype field is missing.
    mt = mt.select_cols(fam_id=hl.null(hl.tstr),
                        pat_id=hl.null(hl.tstr),
                        mat_id=hl.null(hl.tstr),
                        is_female=hl.null(hl.tbool),
                        is_case=hl.null(hl.tbool))
    mt = mt.select_entries('GT')

    bfile = '/tmp/test_import_export_plink'
    hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

    round_tripped = hl.import_plink(bfile + '.bed',
                                    bfile + '.bim',
                                    bfile + '.fam',
                                    a2_reference=True,
                                    reference_genome='GRCh37')

    self.assertTrue(mt._same(round_tripped))
    all_positions_kept = mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0))
    self.assertTrue(all_positions_kept)
def test_import_plink_contig_recoding_w_reference(self):
    """Import PLINK files with contig recoding onto an explicit reference.

    A GRCh38 dataset (contig ``chr22``) is exported to PLINK and imported
    back as GRCh37 with ``chr22`` recoded to ``22``. Every row must land on
    contig ``22``, no row may be lost, and the locus type must be GRCh37.
    """
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))

    bfile = '/tmp/sample_plink'
    hl.export_plink(vcf, bfile)

    plink = hl.import_plink(bfile + '.bed',
                            bfile + '.bim',
                            bfile + '.fam',
                            a2_reference=True,
                            contig_recoding={'chr22': '22'},
                            reference_genome='GRCh37').rows()

    self.assertTrue(plink.all(plink.locus.contig == "22"))
    self.assertEqual(vcf.count_rows(), plink.count())
    # assertTrue(a, b) treats b as the failure message and passes for any
    # truthy a, so compare the types explicitly.
    self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
def test_import_plink(self):
    """Round-trip a GRCh38 VCF through PLINK and import it as GRCh37.

    The exported ``chr22`` contig is recoded to ``22`` on import. The test
    checks the contig of every row, the row count, and the locus type.
    """
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))

    bfile = '/tmp/sample_plink'
    hl.export_plink(vcf, bfile)

    plink = hl.import_plink(bfile + '.bed',
                            bfile + '.bim',
                            bfile + '.fam',
                            a2_reference=True,
                            contig_recoding={'chr22': '22'},
                            reference_genome='GRCh37').rows()

    self.assertTrue(plink.all(plink.locus.contig == "22"))
    self.assertEqual(vcf.count_rows(), plink.count())
    # The two-argument assertTrue never compared the types (the second
    # argument is only the failure message); assertEqual does.
    self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
def test_export_plink(self):
    """Compare Hail's PLINK export with PLINK's own conversion of the same data.

    The dataset (with its columns shuffled) is exported both as a VCF, which
    the ``plink`` binary converts to a bed/bim/fam set, and directly as PLINK
    files by Hail. The two file sets are then passed to
    ``plink --bmerge --merge-mode 6``; the test passes when the resulting
    ``.diff`` file contains nothing but header rows.
    """
    mt = hl.split_multi_hts(hl.import_vcf(resource('sample.vcf'), min_partitions=10))

    # permute columns so not in alphabetical order!
    import random
    col_order = list(range(mt.count_cols()))
    random.shuffle(col_order)
    mt = mt.choose_cols(col_order)

    split_vcf_file, hl_output, plink_output, merge_output = (
        uri_path(new_temp_file()) for _ in range(4))

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    run_command(["plink", "--vcf", split_vcf_file, "--make-bed",
                 "--out", plink_output, "--const-fid", "--keep-allele-order"])

    # Rewrite column 2 of PLINK's .bim as col1:col4:col6:col5 -- presumably so
    # the variant IDs match the ones Hail writes by default.
    relabelled = []
    with open(uri_path(plink_output + ".bim")) as bim_in:
        for line in bim_in:
            fields = line.strip().split()
            fields[1] = ":".join([fields[0], fields[3], fields[5], fields[4]])
            relabelled.append("\t".join(fields) + "\n")

    with open(plink_output + ".bim", 'w') as bim_out:
        bim_out.writelines(relabelled)

    run_command(["plink", "--bfile", plink_output, "--bmerge", hl_output,
                 "--merge-mode", "6", "--out", merge_output])

    # Any row other than the header is a reported difference.
    header = ["SNP", "FID", "IID", "NEW", "OLD"]
    with open(merge_output + ".diff") as diff_file:
        same = all(line.strip().split() == header for line in diff_file)

    self.assertTrue(same)
def test_export_plink(self):
    """Check Hail's PLINK export against the ``plink`` binary's VCF conversion.

    Both tools convert the same split dataset. The two bed/bim/fam sets are
    then compared with ``plink --bmerge --merge-mode 6``, and the ``.diff``
    output may contain header rows only.
    """
    source_vcf = resource('sample.vcf')
    mt = hl.split_multi_hts(hl.import_vcf(source_vcf, min_partitions=10))

    split_vcf_file = uri_path(new_temp_file())
    hl_output = uri_path(new_temp_file())
    plink_output = uri_path(new_temp_file())
    merge_output = uri_path(new_temp_file())

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    convert_cmd = ["plink", "--vcf", split_vcf_file, "--make-bed",
                   "--out", plink_output, "--const-fid", "--keep-allele-order"]
    run_command(convert_cmd)

    def with_composite_id(bim_line):
        # Replace the ID column with col1:col4:col6:col5 and re-join with tabs.
        columns = bim_line.strip().split()
        columns[1] = ":".join([columns[0], columns[3], columns[5], columns[4]])
        return "\t".join(columns) + "\n"

    with open(uri_path(plink_output + ".bim")) as bim_in:
        rewritten = [with_composite_id(bim_line) for bim_line in bim_in]
    with open(plink_output + ".bim", 'w') as bim_out:
        bim_out.writelines(rewritten)

    compare_cmd = ["plink", "--bfile", plink_output, "--bmerge", hl_output,
                   "--merge-mode", "6", "--out", merge_output]
    run_command(compare_cmd)

    # The report must not list anything besides its header.
    with open(merge_output + ".diff") as report:
        same = all(
            entry.strip().split() == ["SNP", "FID", "IID", "NEW", "OLD"]
            for entry in report)
    self.assertTrue(same)
def test_export_plink(self):
    """Exercise ``export_plink`` keyword handling.

    Three exports with valid keyword combinations must succeed. The test
    then expects ``ValueError`` when ``is_case`` and ``quant_pheno`` are
    both given, ``ValueError`` for the unknown keyword ``foo``, and
    ``TypeError`` for a float ``is_case``.
    """
    ds = self.get_dataset()

    # Pedigree keywords shared by the second and third export.
    pedigree = dict(id=ds.s, fam_id=ds.s, pat_id="nope", mat_id="nada", is_female=True)

    hl.export_plink(ds, '/tmp/plink_example', id=ds.s)
    hl.export_plink(ds, '/tmp/plink_example2', is_case=False, **pedigree)
    hl.export_plink(ds, '/tmp/plink_example3',
                    quant_pheno=hl.float64(hl.len(ds.s)), **pedigree)

    rejected_calls = [
        (ValueError, dict(is_case=True, quant_pheno=0.0)),
        (ValueError, dict(foo=0.0)),
        (TypeError, dict(is_case=0.0)),
    ]
    for expected_error, bad_kwargs in rejected_calls:
        with self.assertRaises(expected_error):
            hl.export_plink(ds, '/tmp/plink_example', **bad_kwargs)
def test_import_plink_a1_major(self):
    """Check that the ``a2_reference`` flag of ``import_plink`` flips alleles.

    One exported file set is imported twice, once per flag value. Per
    variant, the allele pair and the homozygote counts must be reversed
    between the two imports; ``n_not_called`` and ``n_het`` must agree.
    """
    source = get_dataset()
    bfile = '/tmp/sample_plink'
    hl.export_plink(source, bfile, ind_id=source.s)

    def get_data(a2_reference):
        # Variant-QC rows of the import, keyed by rsid for joining.
        paths = [bfile + ext for ext in ('.bed', '.bim', '.fam')]
        imported = hl.import_plink(*paths, a2_reference=a2_reference)
        return hl.variant_qc(imported).rows().key_by('rsid')

    a2 = get_data(a2_reference=True)
    a1 = get_data(a2_reference=False)

    j = a2.annotate(a1_alleles=a1[a2.rsid].alleles,
                    a1_vqc=a1[a2.rsid].variant_qc)
    j = j.rename({'variant_qc': 'a2_vqc', 'alleles': 'a2_alleles'})

    expectations = (
        (j.a1_alleles[0] == j.a2_alleles[1])
        & (j.a1_alleles[1] == j.a2_alleles[0])
        & (j.a1_vqc.n_not_called == j.a2_vqc.n_not_called)
        & (j.a1_vqc.n_het == j.a2_vqc.n_het)
        & (j.a1_vqc.homozygote_count[0] == j.a2_vqc.homozygote_count[1])
        & (j.a1_vqc.homozygote_count[1] == j.a2_vqc.homozygote_count[0])
    )
    self.assertTrue(j.all(expectations))
def test_export_import_plink_same(self):
    """Export to PLINK and import again; the result must equal the input.

    The input is first reduced to the fields a PLINK import produces: an
    ``rsid`` and ``cm_position`` per row, missing pedigree and phenotype
    fields per column, and ``GT`` as the only entry field.
    """
    mt = get_dataset()

    id_parts = [mt.locus.contig, hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]]
    mt = mt.select_rows(rsid=hl.delimit(id_parts, ':'), cm_position=15.0)

    mt = mt.select_cols(
        fam_id=hl.null(hl.tstr),
        pat_id=hl.null(hl.tstr),
        mat_id=hl.null(hl.tstr),
        is_female=hl.null(hl.tbool),
        is_case=hl.null(hl.tbool),
    )
    mt = mt.select_entries('GT')

    bfile = '/tmp/test_import_export_plink'
    hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

    bed_path, bim_path, fam_path = bfile + '.bed', bfile + '.bim', bfile + '.fam'
    mt_imported = hl.import_plink(bed_path, bim_path, fam_path,
                                  a2_reference=True,
                                  reference_genome='GRCh37')

    self.assertTrue(mt._same(mt_imported))
    self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run sample, genotype and variant QC, then a GWAS, and export the results.

    The VCF is materialised as ``tmp.mt`` in the working directory and read
    back before any processing.

    Args:
        vcf_file: path of the VCF to import.
        phenotypes_file: delimited file with ``Sample``,
            ``CaffeineConsumption`` and ``isFemale`` columns.
        output_file: output prefix for ``<prefix>.assoc`` and the PLINK
            file set.
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=phenotypes[mt.s])

    # Sample QC: mean depth >= 4 and call rate >= 0.97.
    mt = hl.sample_qc(mt)
    deep_enough = mt.sample_qc.dp_stats.mean >= 4
    well_called = mt.sample_qc.call_rate >= 0.97
    mt = mt.filter_cols(deep_enough & well_called)

    # Genotype QC: the fraction of reads at AD index 1 must fit the call.
    ab = mt.AD[1] / hl.sum(mt.AD)
    hom_ref_ok = mt.GT.is_hom_ref() & (ab <= 0.1)
    het_ok = mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)
    hom_var_ok = mt.GT.is_hom_var() & (ab >= 0.9)
    mt = mt.filter_entries(hom_ref_ok | het_ok | hom_var_ok)

    # Variant QC: keep rows with AF[1] above 1%.
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    # Only the PCA scores are used; eigenvalues and loadings are discarded.
    _, pc_scores, _ = hl.hwe_normalized_pca(mt.GT)
    mt = mt.annotate_cols(scores=pc_scores[mt.s].scores)

    # Intercept, sex, and the first three principal components.
    covariates = [1.0, mt.pheno.isFemale] + [mt.scores[i] for i in range(3)]
    gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=covariates)

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by('SNP').select('P')
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
(mt['gnomad2_popmax_gnomad.AF'] <= rare_threshold) & (mt['exac_af_adj'] <= rare_threshold) & (mt.symbol == 'HEY2') & (mt.csq_type != 'SYN'), keep=True) # Annotate sex info mt = (mt.annotate_cols(is_female=hl.case().when(mt.f_stat <= 0.4, True).when( mt.f_stat >= 0.6, False).or_missing())) # Export plink files date = time.strftime("%d-%m-%Y") PLINK_OUTPUT_PATH = f'/mnt/nfs/mdatanode/wes10k_resources/wes1k/plink_output/hey2_{date}' hl.export_plink(dataset=mt, output=PLINK_OUTPUT_PATH, ind_id=mt.s, pheno=mt.isCase, is_female=mt.is_female) # Export useful info (e.g. covariates, annotation) delimiter = '|' sample_expr_annotations = dict( cases_het=hl.delimit(hl.agg.filter(mt.GT.is_het() & mt.isCase, hl.agg.collect(mt.s)), delimiter=delimiter), cases_hom=hl.delimit(hl.agg.filter(mt.GT.is_hom_var() & mt.isCase, hl.agg.collect(mt.s)), delimiter=delimiter), controls_het=hl.delimit(hl.agg.filter(mt.GT.is_het() & ~mt.isCase, hl.agg.collect(mt.s)), delimiter=delimiter),
# ---------------------------------------------------------------------------
# Read the split-multi matrix table (path defined earlier in the script).
# ---------------------------------------------------------------------------
print("importing vds...")
vds = hl.read_matrix_table(vds_splitmulti_file)

# ---------------------------------------------------------------------------
# III. Remove rare variants
# ---------------------------------------------------------------------------
print("removing rare variants...")
first_af = vds.info.AF[0]
# Keep rows whose first AF value lies strictly between 1% and 99%.
vds = vds.filter_rows((first_af > 0.01) & (first_af < 0.99))

# ---------------------------------------------------------------------------
# IV. Remove indels
# ---------------------------------------------------------------------------
print("removing indels...")
vds = vds.filter_rows(~hl.is_indel(vds.alleles[0], vds.alleles[1]))

# ---------------------------------------------------------------------------
# V. Write output
# ---------------------------------------------------------------------------
print("writing out...")
hl.export_plink(vds, plink_files_out)

# ---------------------------------------------------------------------------
# print Runtime (`start` is taken earlier in the script)
# ---------------------------------------------------------------------------
stop = timeit.default_timer()
print(f"runtime: {stop - start} seconds")
#! /usr/bin/python
"""Simulate a one-population dataset and export it as VCF and PLINK.

Usage: <script> N_SAMPLES N_VARIANTS OUTPUT_PREFIX
"""
import sys

import hail as hl

n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)

# Use the stringified sample index as the column key `s`.
sample_name = hl.str(mt.sample_idx)
mt = mt.key_cols_by(s=sample_name)

# Replace every genotype with a coin flip between genotype index 0 and 2.
random_gt_index = hl.rand_bool(0.5) * 2
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(random_gt_index))

hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)
"""Generate a small, seeded Balding-Nichols dataset as PLINK files."""
import hail as hl

hl.set_global_seed(0)

mt = hl.balding_nichols_model(n_populations=3, n_samples=4, n_variants=1024)

# Sample IDs become 's<index>'.
mt = mt.key_cols_by(s='s' + hl.str(mt.sample_idx))

# Keep each genotype with probability 0.99; otherwise set it to missing.
keep_call = hl.rand_bool(0.99)
mt = mt.annotate_entries(GT=hl.or_missing(keep_call, mt.GT))

# Family IDs are the sample IDs prefixed with 'f'.
family_id = 'f' + mt.s
hl.export_plink(mt,
                'balding-nichols-1024-variants-4-samples-3-populations',
                fam_id=family_id)
def main(args):
    """LD-prune a matrix table, remove related samples and run population PCA.

    Every intermediate result is written under ``args.output_dir``; the
    final PC1/PC2 scatter plot is saved to ``<args.plot_dir>/pca.html``.

    Args:
        args: namespace providing ``matrixtable``, ``output_dir`` and
            ``plot_dir``.
    """
    out_dir = args.output_dir
    mt = hl.read_matrix_table(args.matrixtable)

    # ld pruning
    pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    survives_pruning = hl.is_defined(pruned_ht[mt.row_key])
    pruned_mt = mt.filter_rows(survives_pruning)
    pruned_mt.write(f"{out_dir}/mt_ldpruned.mt", overwrite=True)

    # PC relate
    # GT is rebuilt from its alt-allele count as an unphased diploid call,
    # and becomes the only entry field.
    unphased_gt = hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles())
    pruned_mt = pruned_mt.select_entries(GT=unphased_gt)

    _, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False)
    scores.write(f"{out_dir}/mt_pruned.pca_scores.ht", overwrite=True)

    relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                  min_individual_maf=0.05,
                                  scores_expr=scores[pruned_mt.col_key].scores,
                                  block_size=4096,
                                  min_kinship=0.05,
                                  statistics='kin2')
    relatedness_ht.write(f"{out_dir}/mt_relatedness.ht", overwrite=True)

    # Pairs with kinship above 0.125 count as related.
    pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)
    related_samples_to_remove.write(
        f"{out_dir}/mt_related_samples_to_remove.ht", overwrite=True)

    # Split the columns into samples to keep and samples flagged for removal.
    flagged = hl.is_defined(related_samples_to_remove[pruned_mt.col_key])
    pca_mt = pruned_mt.filter_cols(flagged, keep=False)
    related_mt = pruned_mt.filter_cols(flagged, keep=True)

    _, n_kept = pca_mt.count()
    print(f"{n_kept} samples after relatedness step.")

    # Population pca
    plink_mt = pca_mt.annotate_cols(uid=pca_mt.s).key_cols_by('uid')
    hl.export_plink(plink_mt,
                    f"{out_dir}/mt_unrelated.plink",
                    fam_id=plink_mt.uid,
                    ind_id=plink_mt.uid)

    _, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pca_mt.GT, k=20, compute_loadings=True)

    # Store the allele frequency used for each variant next to its loadings.
    alt_allele_freq = hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2
    pca_af_ht = pca_mt.annotate_rows(pca_af=alt_allele_freq).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af)

    pca_scores.write(f"{out_dir}/mt_pca_scores.ht", overwrite=True)
    pca_loadings.write(f"{out_dir}/mt_pca_loadings.ht", overwrite=True)

    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    _, n_related = related_mt.count()
    print(f'Projecting population PCs for {n_related} related samples...')
    # The projection itself is currently disabled:
    #related_scores = pc_project(related_mt, pca_loadings)
    #relateds = related_mt.cols()
    #relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    pca_mt.write(f"{out_dir}/mt_pca.mt", overwrite=True)

    figure = hl.plot.scatter(pca_mt.scores[0],
                             pca_mt.scores[1],
                             title='PCA',
                             xlabel='PC1',
                             ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(figure)
def test_export_plink_exprs(self):
    """Check how ``export_plink`` renders its optional FAM/BIM expressions.

    Covers the defaults, explicit pedigree fields, a quantitative
    phenotype, explicit variant ID / cM position, a custom call field, and
    the errors raised for IDs containing white space.
    """
    ds = get_dataset()

    fam_columns = {'f0': 'fam_id', 'f1': 'ind_id', 'f2': 'pat_id',
                   'f3': 'mat_id', 'f4': 'is_female', 'f5': 'pheno'}
    bim_columns = {'f0': 'contig', 'f1': 'varid', 'f2': 'cm_position',
                   'f3': 'position', 'f4': 'a1', 'f5': 'a2'}

    def read_fam(prefix):
        # Load <prefix>.fam as an all-string table with named columns.
        raw = hl.import_table(prefix + '.fam', no_header=True, impute=False, missing="")
        return raw.rename(fam_columns)

    def read_bim(prefix):
        # Load <prefix>.bim as an all-string table with named columns.
        raw = hl.import_table(prefix + '.bim', no_header=True, impute=False)
        return raw.rename(bim_columns)

    # Test default arguments
    out1 = new_temp_file()
    hl.export_plink(ds, out1)
    fam1 = read_fam(out1)
    bim1 = read_bim(out1)

    fam1_defaults = ((fam1.fam_id == "0") & (fam1.pat_id == "0")
                     & (fam1.mat_id == "0") & (fam1.is_female == "0")
                     & (fam1.pheno == "NA"))
    self.assertTrue(fam1.all(fam1_defaults))

    default_varid = bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1
    self.assertTrue(bim1.all((bim1.varid == default_varid)
                             & (bim1.cm_position == "0.0")))

    # Test non-default FAM arguments
    out2 = new_temp_file()
    hl.export_plink(ds, out2, ind_id=ds.s, fam_id=ds.s, pat_id="nope",
                    mat_id="nada", is_female=True, pheno=False)
    fam2 = read_fam(out2)
    # is_female=True is expected as "2" and pheno=False as "1".
    fam2_expected = ((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope")
                     & (fam2.mat_id == "nada") & (fam2.is_female == "2")
                     & (fam2.pheno == "1"))
    self.assertTrue(fam2.all(fam2_expected))

    # Test quantitative phenotype
    out3 = new_temp_file()
    hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
    fam3 = read_fam(out3)
    fam3_expected = ((fam3.fam_id == "0") & (fam3.pat_id == "0")
                     & (fam3.mat_id == "0") & (fam3.is_female == "0")
                     & (fam3.pheno != "0") & (fam3.pheno != "NA"))
    self.assertTrue(fam3.all(fam3_expected))

    # Test non-default BIM arguments
    out4 = new_temp_file()
    hl.export_plink(ds, out4, varid="hello", cm_position=100)
    bim4 = read_bim(out4)
    self.assertTrue(bim4.all((bim4.varid == "hello")
                             & (bim4.cm_position == "100.0")))

    # Test call expr
    out5 = new_temp_file()
    ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
    hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
    ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
    not_hom_ref = ~ds_all_hom_ref.GT.is_hom_ref()
    nerrors = ds_all_hom_ref.aggregate_entries(hl.agg.count_where(not_hom_ref))
    self.assertTrue(nerrors == 0)

    # Test white-space in FAM id expr raises error
    with self.assertRaisesRegex(TypeError, "has spaces in the following values:"):
        hl.export_plink(ds, new_temp_file(), mat_id="hello world")

    # Test white-space in varid expr raises error
    with self.assertRaisesRegex(FatalError, "no white space allowed:"):
        hl.export_plink(ds, new_temp_file(), varid="hello world")
#! /usr/bin/python
"""Simulate a dataset plus a two-sample "chimera" and export both.

Usage: <script> N_SAMPLES N_VARIANTS OUTPUT_PREFIX

Writes <prefix>.vcf and the <prefix> PLINK set for the full simulation, and
<prefix>-chimera.vcf plus the <prefix>-chimera PLINK set for a single
sample "0" stitched together from sample "0" (positions below
N_VARIANTS / 2) and sample "1" (the remaining positions).
"""
import sys

import hail as hl

n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))

# Replace every genotype with a coin flip between genotype index 0 and 2.
coin_flip_index = hl.rand_bool(0.5) * 2
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(coin_flip_index))

hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)

midpoint = n_variants / 2

# First half of the positions, taken from sample "0".
chimera0 = mt.filter_rows(mt.locus.position < midpoint)
chimera0 = chimera0.filter_cols(chimera0.s == "0")

# Second half of the positions, taken from sample "1" and relabelled "0"
# so that the two halves share a column key.
chimera1 = mt.filter_rows(mt.locus.position >= midpoint)
chimera1 = chimera1.filter_cols(chimera1.s == "1")
chimera1 = chimera1.key_cols_by(s="0")

mt2 = chimera0.union_rows(chimera1)
chimera_prefix = path + "-chimera"
hl.export_vcf(mt2, chimera_prefix + ".vcf")
hl.export_plink(mt2, chimera_prefix)
def main(args):
    """Run joint exome/genome sample QC: relatedness, ancestry and QC filters.

    The pipeline is a sequence of stages, each guarded by a flag on `args`.
    Every stage writes its result and the following code reads that result
    back from disk, so a stage can be skipped when its output already
    exists.

    Stages:
        1. Join exome and genome QC matrix tables and LD-prune them
           (skipped when ``args.load_joint_pruned_qc_mt`` is set).
        2. PCA followed by PC-Relate (``args.skip_pc_relate``).
        3. Pedigree inference and ranking of related samples to drop
           (``args.skip_relatedness``).
        4. PLINK export of unrelated samples and population PCA
           (``args.skip_pop_pca``).
        5. Projection of related samples, population assignment, and
           platform/population-stratified sample QC filtering
           (metric computation guarded by
           ``args.skip_calculate_sample_metrics``).

    Args:
        args: parsed command-line namespace providing the flags above plus
            ``overwrite``.
    """
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(
            qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(
            qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))

        joint_qc_mt = exome_qc_mt.union_cols(genome_qc_mt)  # NOTE: this is an inner join on rows
        # Keep rows with mean alt-allele frequency > 0.1% and call rate > 99%.
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint mt of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(
            hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True), args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        _eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False)
        scores.write(qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht', args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no pre-emptibles while the BlockMatrix writes
        relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                      min_individual_maf=0.05,
                                      scores_expr=scores[pruned_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=0.05,
                                      statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht', args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the callset by running KING on the output file below
        # Build the PLINK ID with both replacements applied to the sample-ID
        # string. The second .replace() used to be chained onto the
        # MatrixTable returned by annotate_cols, which has no such method.
        plink_uid = pca_mt.data_type + '_' + pca_mt.s.replace(" ", "_").replace("/", "_")
        plink_mt = pca_mt.annotate_cols(uid=plink_uid).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)

        logger.info('Computing population PCs and annotating with known population labels...')
        _pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info('Projecting population PCs for {} related samples...'.format(samples))
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations,
                                    impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    # FIXME: temporarily doing the underscore thing until known_population_annotations is fixed
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_'
                                + joint_ht.s.replace(' ', '_')].known_pop)
    joint_pca_ht = joint_ht.select(*pop_colnames)
    joint_pca_ht, _joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select(
        'pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True)
                   .annotate(data_type='genomes')
                   .key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(
        qc_pop=hl.case(missing_false=True)
        .when(hl.is_defined(joint_ht.pop) & (joint_ht.batch == 1), 'est_b1')
        .when(hl.is_defined(joint_ht.pop) & (joint_ht.batch == 2), 'est_b2')
        .default(joint_ht.pop)).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes', adj=False, split=False, meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes', adj=False, split=False, meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info('Running mini sample QC for platform- and population-specific filtering...')
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(qc_temp_data_prefix('genomes') + '.sample_qc.ht')

    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info('Calculating platform- and population-specific sample QC thresholds...')
    exome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    # NOTE(review): current Hail declares Table.annotate_globals with keyword
    # arguments only -- confirm this positional call works with the Hail
    # version in use (it may need to be unpacked with `**`).
    exome_ht = exome_ht.annotate_globals(hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(**exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    # NOTE(review): same positional annotate_globals call as for exomes above.
    genome_ht = genome_ht.annotate_globals(hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(**genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Annotate samples that fail their respective filters
    # (an empty `pop_platform_filters` collection is counted as passing).
    checkpoint = exome_ht.aggregate(
        hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(f'{checkpoint} exome samples found passing pop/platform-specific filtering')
    exome_ht.key_by(data_type='exomes', s=exome_ht.s).write(
        qc_ht_path('exomes', 'pop_platform'), args.overwrite)

    checkpoint = genome_ht.aggregate(
        hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(f'{checkpoint} genome samples found passing pop/platform-specific filtering')
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(
        qc_ht_path('genomes', 'pop_platform'), args.overwrite)
def test_import_plink_empty_fam(self):
    """Importing a PLINK file set exported from a column-less dataset must fail."""
    prefix = '/tmp/test_empty_fam'
    colless = get_dataset().drop_cols()
    hl.export_plink(colless, prefix, ind_id=colless.s)

    bed, bim, fam = (prefix + extension for extension in ('.bed', '.bim', '.fam'))
    with self.assertRaisesRegex(FatalError, "Empty .fam file"):
        hl.import_plink(bed, bim, fam)
def _sim_covariates(n, sample_ids, seed=0):
    """Draw ``n`` normally distributed covariates per sample, indexed by ``sample_id``."""
    rng = np.random.RandomState(seed)
    values = rng.normal(size=(len(sample_ids), n))
    frame = pd.DataFrame(values, columns=[f"X{i:03d}" for i in range(n)])
    return frame.assign(sample_id=sample_ids).set_index("sample_id")


def _sim_betas(n_traits, gt, df_cov, seed=0):
    """Draw covariate and variant effect sizes; the trailing half of each is zeroed."""
    rng = np.random.RandomState(seed)
    n_cov = df_cov.shape[1]
    n_var = gt.shape[0]
    trait_names = [f"Y{i:04d}" for i in range(n_traits)]
    # Draw order (covariate effects, then variant effects) fixes the values
    # obtained from the seeded stream, so it must not be swapped.
    cov_effects = rng.normal(loc=2.0, scale=1, size=(n_cov, n_traits))
    var_effects = rng.normal(loc=-2.0, scale=1, size=(n_var, n_traits))
    # Set last half of all betas to 0
    cov_effects[(cov_effects.shape[0] // 2):, :] = 0
    var_effects[(var_effects.shape[0] // 2):, :] = 0
    cov_frame = pd.DataFrame(cov_effects,
                             index=[f"B-{c}" for c in df_cov.columns],
                             columns=trait_names)
    var_frame = pd.DataFrame(var_effects,
                             index=[f"B-V{i:07d}" for i in range(n_var)],
                             columns=trait_names)
    return cov_frame, var_frame


def _sim_traits(gt, df_cov, df_beta_var, df_beta_cov, scale=0.001, seed=0):
    """Combine genotype and covariate contributions with Gaussian noise into traits."""
    _, n_samples = gt.shape
    assert gt.shape[1] == df_cov.shape[0]
    assert df_beta_var.shape[1] == df_beta_cov.shape[1]
    n_traits = df_beta_var.shape[1]
    rng = np.random.RandomState(seed)
    noise = rng.normal(scale=scale, loc=0, size=(n_samples, n_traits))
    genotype_part = gt.T @ df_beta_var.values
    covariate_part = df_cov.values @ df_beta_cov.values
    trait_values = genotype_part + covariate_part + noise
    frame = pd.DataFrame(trait_values,
                         index=df_cov.index,
                         columns=df_beta_cov.columns)
    assert frame.notnull().all().all()
    return frame


def run(
    n_variants: int,
    n_samples: int,
    n_contigs: int,
    n_covars: int,
    n_traits: int,
    output_dir: str,
):
    """Simulate genotypes, covariates, betas and traits and write them to ``output_dir``.

    Any existing ``output_dir`` is removed first. The directory then receives a
    PLINK fileset (prefix ``genotypes``) plus ``covariates.csv``, ``traits.csv``,
    ``beta_covariate.csv`` and ``beta_variant.csv``.

    NOTE(review): the three generators all use their default ``seed=0``, so
    they draw from identically seeded streams -- confirm this is intended.
    """
    hl.init()
    mt = get_plink_sim_dataset(n_variants=n_variants,
                               n_samples=n_samples,
                               n_contigs=n_contigs)
    alt_allele_counts = hl.linalg.BlockMatrix.from_entry_expr(
        mt.GT.n_alt_alleles())
    gt = alt_allele_counts.to_numpy()
    logger.info(f"Created calls w/ shape {gt.shape}")

    sample_ids = mt.s.collect()
    logger.info(f"Num samples: {len(sample_ids)}")
    logger.info(f"First samples: {sample_ids[:5]}")

    df_cov = _sim_covariates(n_covars, sample_ids)
    logger.info(f"Covariate info:\n{_info(df_cov)}")
    logger.info(f"Covariate head:\n{df_cov.head()}")

    df_beta_cov, df_beta_var = _sim_betas(n_traits, gt, df_cov)
    logger.info(f"Beta cov info:\n{_info(df_beta_cov)}")
    logger.info(f"Beta cov head:\n{df_beta_cov.head()}")
    logger.info(f"Beta var info:\n{_info(df_beta_var)}")
    logger.info(f"Beta var head:\n{df_beta_var.head()}")

    df_trait = _sim_traits(gt, df_cov, df_beta_var, df_beta_cov, scale=0.001)
    logger.info(f"Trait info: {_info(df_trait)}")
    logger.info(f"Trait head:\n{df_trait.head()}")

    # Start from an empty output directory.
    output_path = Path(output_dir)
    if output_path.exists():
        logger.info(f"Clearing old output path at {output_path}")
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True)
    logger.info(f"Writing results to {output_path}")

    path = str(output_path / "genotypes")
    hl.export_plink(mt, path)
    logger.info(f"PLINK written to {path}")

    # Sample-indexed frames are written with sample_id as a regular column.
    path = str(output_path / "covariates.csv")
    df_cov.reset_index().to_csv(path, index=False)
    logger.info(f"Covariates written to {path}")

    path = str(output_path / "traits.csv")
    df_trait.reset_index().to_csv(path, index=False)
    logger.info(f"Traits written to {path}")

    # Beta frames keep their label index in the CSV.
    path = str(output_path / "beta_covariate.csv")
    df_beta_cov.to_csv(path, index=True)
    logger.info(f"Covariate betas written to {path}")

    path = str(output_path / "beta_variant.csv")
    df_beta_var.to_csv(path, index=True)
    logger.info(f"Variant betas written to {path}")

    logger.info("Simulated data generation complete")
def test_import_plink_empty_bim(self):
    """A PLINK fileset exported with all rows filtered out must be rejected on import."""
    prefix = '/tmp/test_empty_bim'
    no_variants = get_dataset().filter_rows(False)
    hl.export_plink(no_variants, prefix, ind_id=no_variants.s)

    expected_error = ".bim file does not contain any variants"
    with self.assertRaisesRegex(FatalError, expected_error):
        hl.import_plink(prefix + '.bed', prefix + '.bim', prefix + '.fam')
# Export the sample ID, F-statistic, is_female and ydp columns of `ct`.
sex_fstat_table = ct.select(ID=ct.s,
                            sexFstat=ct.f_stat,
                            isFemale=ct.is_female,
                            ydp=ct.ydp)
sex_fstat_table.export(sample_sex_fstat_file)

# ----------------------------------------------------------------------------
# LD pruning
# ----------------------------------------------------------------------------
print("LD pruning...")
vds5_ldp = hl.ld_prune(vds5, n_cores=1600, r2=0.1)
# Alternative settings kept for reference:
# vds5_ldp = hl.ld_prune(vds5, n_cores=60, r2=0.2, window=1000000, memory_per_core=512)

print("writing LD pruned VDS...")
vds5_ldp.write(vds_ldpruned_common_file, overwrite=True)

# The sample ID is written as both the family ID and the individual ID.
plink_fam_fields = dict(fam_id=vds5_ldp.s, id=vds5_ldp.s)
hl.export_plink(vds5_ldp, vds_ldpruned_common_plink, **plink_fam_fields)

# ----------------------------------------------------------------------------
# IBD analysis
# ----------------------------------------------------------------------------
# use king until pcrelate works
# vds.ibd(min=0.1).flatten().rename({'ibd.Z0': 'Z0', 'ibd.Z1': 'Z1', 'ibd.Z2': 'Z2', 'ibd.PI_HAT': 'PI_HAT'}).export(ibd_results_file)

# Report wall-clock time elapsed since `start`.
stop = timeit.default_timer()
elapsed_seconds = stop - start
print("runtime: " + str(elapsed_seconds) + " seconds")
import hail as hl

# Samples to export, keyed by sample ID `s`.
target_samples = hl.import_table(
    'gs://apcdr/ukb_holdout/ukb31063.gwas_samples.holdout_and_target.txt',
    key='s')

contig = 'autosomes'
# Brace glob that matches the per-chromosome BGEN files for chr1-chr22.
contig_expr = 'chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}'

ht_variants = hl.read_table('gs://ukb31063/ukb31063.neale_gwas_variants.ht')

# export_plink writes hard calls and, when no `call` argument is given, uses
# the `GT` entry field. Importing only `dosage` leaves the export below with
# no call to write, so `GT` is loaded as well (`dosage` is kept so existing
# uses of it keep working).
mt = hl.import_bgen(
    path=f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen',
    sample_file=f'gs://ukb31063/ukb31063.{contig}.sample',
    entry_fields=['GT', 'dosage'],
    variants=ht_variants)

# Keep only the columns whose sample ID appears in the target sample table.
mt_target = mt.filter_cols(hl.is_defined(target_samples[mt.s]))

# target
hl.export_plink(mt_target,
                'gs://apcdr/ukb_holdout/ukb31063.holdout.target_individuals',
                ind_id=mt_target.s,
                varid=mt_target.rsid)
# ----------------------------------------------------------------------------
# LD prune
# ----------------------------------------------------------------------------
onekg = hl.ld_prune(onekg, n_cores=800, r2=0.2)

# ----------------------------------------------------------------------------
# Write the pruned dataset
# ----------------------------------------------------------------------------
onekg.write(onekg_ldpruned_file, overwrite=True)

# ----------------------------------------------------------------------------
# Write PLINK files
# ----------------------------------------------------------------------------
print('export plink')
# The sample ID is written as both the family ID and the individual ID.
onekg_fam_fields = dict(fam_id=onekg.s, id=onekg.s)
hl.export_plink(onekg, onekg_plink_prefix, **onekg_fam_fields)

# ----------------------------------------------------------------------------
# PCA (currently disabled)
# ----------------------------------------------------------------------------
# print('PCA...')
# eigenvalues, scores, loadings = hl.hwe_normalized_pca(onekg, k=10)
# with hl.utils.hadoop_open(pca_value_prefix + '_all.txt', 'w') as f:
#     for val in eigenvalues:
#         f.write(str(val) + '\n')
# scores.flatten().export(pca_score_prefix + '_all.txt')
# onekgeur = onekg.filter_cols(onekg.super == 'EUR', keep=True)
def test_export_plink_exprs(self):
    """Check the .fam/.bim contents produced by each group of export_plink arguments."""
    ds = get_dataset()

    fam_mapping = {
        'f0': 'fam_id',
        'f1': 'ind_id',
        'f2': 'pat_id',
        'f3': 'mat_id',
        'f4': 'is_female',
        'f5': 'pheno'
    }
    bim_mapping = {
        'f0': 'contig',
        'f1': 'varid',
        'f2': 'cm_position',
        'f3': 'position',
        'f4': 'a1',
        'f5': 'a2'
    }

    def read_fam(prefix):
        # Read the exported .fam as untyped strings with named columns.
        table = hl.import_table(prefix + '.fam',
                                no_header=True,
                                impute=False,
                                missing="")
        return table.rename(fam_mapping)

    def read_bim(prefix):
        # Read the exported .bim as untyped strings with named columns.
        table = hl.import_table(prefix + '.bim', no_header=True, impute=False)
        return table.rename(bim_mapping)

    # Test default arguments
    default_prefix = new_temp_file()
    hl.export_plink(ds, default_prefix)
    fam = read_fam(default_prefix)
    bim = read_bim(default_prefix)
    fam_defaults_ok = ((fam.fam_id == "0") & (fam.pat_id == "0") &
                       (fam.mat_id == "0") & (fam.is_female == "0") &
                       (fam.pheno == "NA"))
    self.assertTrue(fam.all(fam_defaults_ok))
    default_varid = (bim.contig + ":" + bim.position + ":" + bim.a2 + ":" +
                     bim.a1)
    bim_defaults_ok = (bim.varid == default_varid) & (bim.cm_position == "0.0")
    self.assertTrue(bim.all(bim_defaults_ok))

    # Test non-default FAM arguments
    fam_args_prefix = new_temp_file()
    hl.export_plink(ds,
                    fam_args_prefix,
                    ind_id=ds.s,
                    fam_id=ds.s,
                    pat_id="nope",
                    mat_id="nada",
                    is_female=True,
                    pheno=False)
    fam = read_fam(fam_args_prefix)
    fam_args_ok = ((fam.fam_id == fam.ind_id) & (fam.pat_id == "nope") &
                   (fam.mat_id == "nada") & (fam.is_female == "2") &
                   (fam.pheno == "1"))
    self.assertTrue(fam.all(fam_args_ok))

    # Test quantitative phenotype
    quant_prefix = new_temp_file()
    hl.export_plink(ds,
                    quant_prefix,
                    ind_id=ds.s,
                    pheno=hl.float64(hl.len(ds.s)))
    fam = read_fam(quant_prefix)
    quant_ok = ((fam.fam_id == "0") & (fam.pat_id == "0") &
                (fam.mat_id == "0") & (fam.is_female == "0") &
                (fam.pheno != "0") & (fam.pheno != "NA"))
    self.assertTrue(fam.all(quant_ok))

    # Test non-default BIM arguments
    bim_args_prefix = new_temp_file()
    hl.export_plink(ds, bim_args_prefix, varid="hello", cm_position=100)
    bim = read_bim(bim_args_prefix)
    bim_args_ok = (bim.varid == "hello") & (bim.cm_position == "100.0")
    self.assertTrue(bim.all(bim_args_ok))

    # Test call expr
    call_prefix = new_temp_file()
    ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
    hl.export_plink(ds_call, call_prefix, call=ds_call.gt_fake)
    reimported = hl.import_plink(call_prefix + '.bed',
                                 call_prefix + '.bim',
                                 call_prefix + '.fam')
    not_hom_ref = hl.agg.count_where(~reimported.GT.is_hom_ref())
    nerrors = reimported.aggregate_entries(not_hom_ref)
    self.assertTrue(nerrors == 0)

    # Test white-space in FAM id expr raises error
    with self.assertRaisesRegex(TypeError,
                                "has spaces in the following values:"):
        hl.export_plink(ds, new_temp_file(), mat_id="hello world")

    # Test white-space in varid expr raises error
    with self.assertRaisesRegex(FatalError, "no white space allowed:"):
        hl.export_plink(ds, new_temp_file(), varid="hello world")
def main():
    """Subset one chromosome's BGEN to the kept samples, MAF-filter, lift over and write PLINK.

    Reads the chromosome from ``sys.argv[1]`` and the local core count from
    ``sys.argv[2]``. Returns 0.
    """
    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1 # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Set the maximum number of cores
    spark_master = "local[{}]".format(cores)
    hl.init(master=spark_master)

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Build a copy of GRCh38 whose contig names have 'chr' removed
    stripped_contigs = [name.replace('chr', '') for name in rg38.contigs]
    stripped_lengths = {
        name.replace('chr', ''): rg38.lengths[name]
        for name in rg38.lengths
    }
    rg38_custom = hl.ReferenceGenome('rg38_custom', stripped_contigs,
                                     stripped_lengths)

    print('Processing chromosome {0}'.format(chrom))

    # Resolve the per-chromosome BGEN path once; it is used three times below.
    bgen_path = in_bgen.format(chrom=chrom)

    # Index bgen if not existing
    index_exists = hl.hadoop_exists(bgen_path + '.idx2')
    if not index_exists:
        zero_padded_contigs = {
            "01": "1",
            "02": "2",
            "03": "3",
            "04": "4",
            "05": "5",
            "06": "6",
            "07": "7",
            "08": "8",
            "09": "9"
        }
        hl.index_bgen(bgen_path,
                      contig_recoding=zero_padded_contigs,
                      reference_genome='GRCh37')

    # Load bgen
    mt = hl.import_bgen(bgen_path, entry_fields=['GT'], sample_file=in_sample)

    # Load list samples to keep
    keep_table = hl.import_table(to_keep_list,
                                 no_header=True,
                                 impute=False,
                                 types={'f0': hl.tstr})
    samples_to_keep = keep_table.key_by('f0')

    # Downsample to required subset of samples
    is_kept = hl.is_defined(samples_to_keep[mt.s])
    mt = mt.filter_cols(is_kept)

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on MAF, taken as the smallest allele frequency of the variant
    mt = hl.variant_qc(mt)
    qc_with_maf = mt.variant_qc.annotate(MAF=hl.min(mt.variant_qc.AF))
    mt = mt.annotate_rows(variant_qc=qc_with_maf)
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    stripped_contig = mt.locus_GRCh38.contig.replace('chr', '')
    mt = mt.annotate_rows(contig_GRCh38=stripped_contig)

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    lifted_locus = hl.locus(mt.contig_GRCh38,
                            mt.locus_GRCh38.position,
                            reference_genome=rg38_custom)
    mt = mt.annotate_rows(locus=lifted_locus)
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    plink_prefix = out_plink.format(chrom=chrom)
    hl.export_plink(dataset=mt, output=plink_prefix)

    return 0
print_count(mt_clean)

# PLINK export takes sex as a boolean, so recode reported_sex ('F' -> True).
reported_female = (mt_clean.reported_sex == 'F')
mt_clean = mt_clean.annotate_cols(sex=hl.if_else(reported_female, True, False))

# Sanity check sex count after changing format
sex_counter = hl.agg.counter(mt_clean.sex == True)
mt_clean.aggregate_cols(sex_counter)

# Read the CSV with FID info
fid = hl.import_table('gs://neurogap/Pilot_Data_HailQC/fid_info.csv',
                      delimiter=',')

# Key it by siteID so it can be joined against mt_clean's columns
fid = fid.key_by(fid.siteID)

# Attach the family ID for each sample's site (for plink output)
site_fid = fid[mt_clean.siteID].FID
mt_clean = mt_clean.annotate_cols(fid=site_fid)

# Key rows by locus and alleles for the plink output
mt_clean = mt_clean.key_rows_by(locus=mt_clean['locus'],
                                alleles=mt_clean['alleles'])

# Output to plink, specifying desired fields
hl.export_plink(mt_clean,
                'gs://neurogap-pilot-clean/NeuroGAP_pilot_clean',
                ind_id=mt_clean.collab_PID,
                fam_id=mt_clean.fid,
                is_female=mt_clean.sex,
                varid=mt_clean.rsid)
def test_grm(self):
    """Compare Hail's GRM exports (rel, gcta-grm, gcta-grm-bin) with PLINK's output."""
    tolerance = 0.001

    def load_id_file(path):
        # Each line must hold two tab-separated fields; keep the second.
        sample_column = []
        with hl.hadoop_open(path) as handle:
            for line in handle:
                fields = line.strip().split('\t')
                self.assertEqual(len(fields), 2)
                sample_column.append(fields[1])
        return sample_column

    def load_rel(ns, path):
        # Lower-triangular text matrix: line i must hold i + 1 values.
        matrix = np.zeros((ns, ns))
        with hl.hadoop_open(path) as handle:
            for i, line in enumerate(handle):
                tokens = line.strip().split('\t')
                for j, value in enumerate(map(float, tokens)):
                    matrix[i, j] = value
                self.assertEqual(j, i)
            self.assertEqual(i, ns - 1)
        return matrix

    def load_grm(ns, nv, path):
        # Rows are (i, j, n_variants, value) with 1-based indices.
        matrix = np.zeros((ns, ns))
        with utils.hadoop_open(path) as handle:
            n_lines = 0
            for line in handle:
                fields = line.strip().split('\t')
                self.assertEqual(int(fields[2]), nv)
                row_idx = int(fields[0]) - 1
                col_idx = int(fields[1]) - 1
                matrix[row_idx, col_idx] = float(fields[3])
                n_lines += 1
            self.assertEqual(n_lines, ns * (ns + 1) / 2)
        return matrix

    def load_bin(ns, path):
        # Lower triangle stored as consecutive little-endian 4-byte floats.
        matrix = np.zeros((ns, ns))
        with utils.hadoop_open(path, 'rb') as handle:
            for i in range(ns):
                for j in range(i + 1):
                    chunk = handle.read(4)
                    self.assertEqual(len(chunk), 4)
                    matrix[i, j] = unpack('<f', bytearray(chunk))[0]
            trailing = handle.read()
            self.assertEqual(len(trailing), 0)
        return matrix

    def run_plink(command_template):
        # Run PLINK on the exported fileset and return the new output prefix.
        out_prefix = utils.new_temp_file(prefix="plink")
        command = command_template.format(utils.uri_path(b_file),
                                          utils.uri_path(out_prefix))
        syscall(command, shell=True, stdout=DEVNULL, stderr=DEVNULL)
        return out_prefix

    b_file = utils.new_temp_file(prefix="plink")
    rel_file = utils.new_temp_file(prefix="test", suffix="rel")
    rel_id_file = utils.new_temp_file(prefix="test", suffix="rel.id")
    grm_file = utils.new_temp_file(prefix="test", suffix="grm")
    grm_bin_file = utils.new_temp_file(prefix="test", suffix="grm.bin")
    grm_nbin_file = utils.new_temp_file(prefix="test", suffix="grm.N.bin")

    dataset = self.get_dataset()
    n_samples = dataset.count_cols()
    dataset = dataset.annotate_rows(
        AC=agg.sum(dataset.GT.n_alt_alleles()),
        n_called=agg.count_where(hl.is_defined(dataset.GT)))
    # Keep variants that are neither all-ref nor all-alt among called samples...
    is_polymorphic = (dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called)
    dataset = dataset.filter_rows(is_polymorphic)
    # ...and that are called in every sample.
    dataset = dataset.filter_rows(dataset.n_called == n_samples).persist()

    hl.export_plink(dataset, b_file, id=dataset.s)

    sample_ids = [row.s for row in dataset.cols().select('s').collect()]
    n_variants = dataset.count_rows()
    self.assertGreater(n_variants, 0)

    grm = hl.genetic_relatedness_matrix(dataset)
    grm.export_id_file(rel_id_file)

    ############
    ### rel

    p_file = run_plink('''plink --bfile {} --make-rel --out {}''')
    self.assertEqual(load_id_file(p_file + ".rel.id"), sample_ids)

    grm.export_rel(rel_file)
    self.assertEqual(load_id_file(rel_id_file), sample_ids)
    plink_rel = load_rel(n_samples, p_file + ".rel")
    hail_rel = load_rel(n_samples, rel_file)
    self.assertTrue(np.allclose(plink_rel, hail_rel, atol=tolerance))

    ############
    ### gcta-grm

    p_file = run_plink('''plink --bfile {} --make-grm-gz --out {}''')
    self.assertEqual(load_id_file(p_file + ".grm.id"), sample_ids)

    grm.export_gcta_grm(grm_file)
    plink_grm = load_grm(n_samples, n_variants, p_file + ".grm.gz")
    hail_grm = load_grm(n_samples, n_variants, grm_file)
    self.assertTrue(np.allclose(plink_grm, hail_grm, atol=tolerance))

    ############
    ### gcta-grm-bin

    p_file = run_plink('''plink --bfile {} --make-grm-bin --out {}''')
    self.assertEqual(load_id_file(p_file + ".grm.id"), sample_ids)

    grm.export_gcta_grm_bin(grm_bin_file, grm_nbin_file)
    plink_bin = load_bin(n_samples, p_file + ".grm.bin")
    hail_bin = load_bin(n_samples, grm_bin_file)
    self.assertTrue(np.allclose(plink_bin, hail_bin, atol=tolerance))
    plink_nbin = load_bin(n_samples, p_file + ".grm.N.bin")
    hail_nbin = load_bin(n_samples, grm_nbin_file)
    self.assertTrue(np.allclose(plink_nbin, hail_nbin, atol=tolerance))
mt.count()

# def rename_samples(mt, mapping):
#     return mt.key_cols_by(s = hl.literal(mapping).get(mt.s, default=mt.s))
# mt = rename_samples(mt, {'431-BG00852 D':'431-BG00852_D'})

# Export one PLINK fileset per autosome (chr1..chr22).
for x in range(1, 23):
    # The contig name is a plain Python string; there is no need to build a
    # Hail expression and evaluate it on the backend for every chromosome.
    contig_name = 'chr' + str(x)
    mt_chr = hl.filter_intervals(mt, [
        hl.parse_locus_interval(contig_name, reference_genome='GRCh38')
    ])
    n_chr = mt_chr.count_rows()

    print('\nn variants in chr')
    print(x)
    print(n_chr)
    hl.export_plink(mt_chr, PLINK_FILES + '.chr' + str(x))

# chrX is exported separately, after the autosomes.
mt_chr = hl.filter_intervals(
    mt, [hl.parse_locus_interval('chrX', reference_genome='GRCh38')])
n_chr = mt_chr.count_rows()

print('\nn variants in chrX')
print(n_chr)
hl.export_plink(mt_chr, PLINK_FILES + '.chr' + 'X')
def test_import_plink_empty_fam(self):
    """A PLINK fileset exported with all columns filtered out must be rejected on import."""
    prefix = '/tmp/test_empty_fam'
    sample_free = get_dataset().filter_cols(False)
    hl.export_plink(sample_free, prefix, ind_id=sample_free.s)

    expected_error = "Empty .fam file"
    with self.assertRaisesRegex(FatalError, expected_error):
        hl.import_plink(prefix + '.bed', prefix + '.bim', prefix + '.fam')
def run_qc(mt: hl.MatrixTable,
           dirname: str,
           basename: str,
           input_type: str,
           pre_geno: float,
           mind: float,
           fhet_y: int,
           fhet_x: int,
           geno: float,
           midi: float,
           maf: float,
           hwe_th_co: float,
           hwe_th_ca: float,
           qc_round: int,
           withpna: int = 0) -> tuple:
    """
    Apply the QC filters in sequence, export the filtered data to PLINK and
    return the report tables and plots.

    :param mt: Hail MatrixTable to filter
    :param dirname: output location; concatenated directly with ``basename``
    :param basename: output file stem; the PLINK prefix is
        ``dirname + basename + '_qc<qc_round>'``
    :param input_type: passed to the sex-violation and sex-warning checks
    :param pre_geno: threshold passed to the first variant call-rate filter
    :param mind: threshold passed to the sample call-rate filter and plots
    :param fhet_y: first threshold passed to the sex-check filter
    :param fhet_x: second threshold passed to the sex-check filter
    :param geno: threshold passed to the second variant call-rate filter and
        to the variant call-rate plots
    :param midi: currently unused; step 7 is announced but not implemented here
    :param maf: threshold passed to the MAF filter
    :param hwe_th_co: HWE threshold passed to the filter for controls
    :param hwe_th_ca: HWE threshold passed to the filter for cases
    :param qc_round: round number embedded in the output prefix
    :param withpna: the invariant-SNP filter (step 8) runs only when this is 0
    :return: ``(qc_tables_list, qc_plots_list)``
    """
    # compute qc metrics
    mt = qc.compute_qc_metrics(mt)

    # Pre-qc counts
    pre_qc_counts = qc.collect_counts(mt)

    # pre-qc plots
    print("Generating pre-QC plots")
    pre_cas_var_base64, pre_con_var_base64 = plt.cr_var_plts(mt, geno)
    pre_cas_id_base64, pre_con_id_base64 = plt.cr_id_plts(mt, mind)
    pre_man_qq_base64 = plt.man_qq_plts(mt)

    # 1. SNP QC: call rate ≥ 0.95
    print("1. SNP QC: call rate ≥ 0.95")
    mt, var_pre_filter = qc.filter_var_cr(mt, pre_geno)
    print("Pre QC call rate < 0.95: {}".format(var_pre_filter['geno_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 2. Sample QC: call rate in cases or controls ≥ 0.98
    print("2. Sample QC: call rate in cases or controls ≥ 0.98")
    mt, id_cr_filter = qc.filter_sample_cr(mt, mind)
    print("Sample QC < 0.98: {}".format(id_cr_filter['sample_miss_cases'] +
                                        id_cr_filter['sample_miss_controls']))
    print("Samples: {}".format(mt.count_cols()))

    # 3. Sample QC: F_stats
    print("3. Sample QC: F_stats")
    mt, f_stat_results = qc.filter_sex_check(mt, fhet_y, fhet_x)
    print("Sex check filtered: {}".format(f_stat_results['sex_check_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 4. Sample QC: Sex violations (excluded) - genetic sex does not match pedigree sex
    print(
        "4. Sample QC: Sex violations (excluded) - genetic sex does not match pedigree sex"
    )
    mt, sex_violations = qc.sex_violations(mt, input_type)
    print("Sex violations: {}".format(sex_violations['sex_excluded']))
    print("Samples: {}".format(mt.count_cols()))

    # 5. Sample QC: Sex warnings (not excluded) - undefined phenotype / ambiguous genotypes
    print(
        "# 5. Sample QC: Sex warnings (not excluded) - undefined phenotype / ambiguous genotypes"
    )
    sex_warnings_count = qc.sex_warnings(mt, input_type)
    print("Sex warning: {}".format(sex_warnings_count))
    print("Samples: {}".format(mt.count_cols()))

    # 6. SNP QC: call rate ≥ 0.98
    print("# 6. SNP QC: call rate ≥ 0.98")
    mt, var_filter = qc.filter_var_cr(mt, geno)
    print("SNP QC call rate < 0.98: {}".format(var_filter['geno_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 7. SNP QC: missing difference > 0.02
    # Only the banner is printed; no filter is applied for this step.
    print("# 7. SNP QC: missing difference > 0.02")

    # 8. SNP QC: SNPs with no valid association p value are excluded (i.e., invariant SNP)
    print(
        "# 8. SNP QC: SNPs with no valid association p value are excluded (i.e., invariant SNP)"
    )
    # The summary table below always reads this count, so give it a value even
    # when the filter is skipped; otherwise withpna != 0 fails with an
    # unbound-variable error.
    invariant_snps = {'monomorphic_snps': 0}
    if withpna == 0:
        mt, invariant_snps = qc.filter_invariant_snps(mt)
        print("Monormorphic SNPs: {}".format(
            invariant_snps['monomorphic_snps']))
        print("Samples: {}".format(mt.count_cols()))

    # 9. SNP QC: with MAF ≥ 0.01
    print("# 9. SNP QC: with MAF ≥ 0.01")
    mt, maf_results = qc.filter_maf(mt, maf)
    print("MAF: {}".format(maf_results['maf_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 10. SNP QC: Hardy-Weinberg equilibrium (HWE) in controls p value ≥ 1e-06
    print(
        "# 10. SNP QC: Hardy-Weinberg equilibrium (HWE) in controls p value ≥ 1e-06"
    )
    mt, hwe_con_results = qc.filter_hwe(mt, 'Control', hwe_th_co)
    print("HWE Controls: {}".format(hwe_con_results['maf_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 11. SNP QC: Hardy-Weinberg equilibrium (HWE) in cases p value ≥ 1e-10
    print(
        "# 11. SNP QC: Hardy-Weinberg equilibrium (HWE) in cases p value ≥ 1e-10"
    )
    mt, hwe_cas_results = qc.filter_hwe(mt, 'Case', hwe_th_ca)
    print("HWE Cases: {}".format(hwe_cas_results['maf_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # Post-qc counts
    post_qc_counts = qc.collect_counts(mt)

    # Post-QC plots
    print("Generating post-QC plots")
    print("Generating variant call rate plots")
    pos_cas_var_base64, pos_con_var_base64 = plt.cr_var_plts(mt, geno)
    print("Generating sample call rate plots")
    pos_cas_id_base64, pos_con_id_base64 = plt.cr_id_plts(mt, mind)
    print("Generating Manhattand & QQ plots")
    pos_man_qq_base64 = plt.man_qq_plts(mt)

    # Plots, in the order handed back to the caller.
    qc_plots_list = [
        pre_man_qq_base64, pos_man_qq_base64, pre_con_id_base64,
        pre_cas_id_base64, pos_con_id_base64, pos_cas_id_base64,
        f_stat_results['sex_check_plot'], pre_con_var_base64,
        pre_cas_var_base64, pos_con_var_base64, pos_cas_var_base64
    ]

    # Tables
    # NOTE(review): the MAF step's removed count (maf_results) is not part of
    # this list -- confirm against what generate_tables expects.
    filter_counts_list = [
        var_pre_filter['geno_removed'],
        id_cr_filter['sample_miss_cases'] +
        id_cr_filter['sample_miss_controls'],
        f_stat_results['sex_check_removed'], sex_violations['sex_excluded'],
        sex_warnings_count, var_filter['geno_removed'],
        invariant_snps['monomorphic_snps'], hwe_con_results['maf_removed'],
        hwe_cas_results['maf_removed']
    ]

    size_of_sample_html, exlusion_overview_html = generate_tables(
        pre_qc_counts, post_qc_counts, filter_counts_list)

    qc_tables_list = [size_of_sample_html, exlusion_overview_html]

    # Write the filtered dataset as a PLINK fileset for this QC round.
    outplink = dirname + basename + '_qc{}'.format(qc_round)
    hl.export_plink(mt, outplink)

    return qc_tables_list, qc_plots_list