# This test shells out to PLINK; beyond `import hail as hl` and the test
# suite's `utils` helpers, it assumes:
#     from subprocess import DEVNULL, call as syscall
def test_ibd(self):
    dataset = self.get_dataset()

    def plinkify(ds, min=None, max=None):
        vcf = utils.new_temp_file(prefix="plink", suffix="vcf")
        plinkpath = utils.new_temp_file(prefix="plink")
        hl.export_vcf(ds, vcf)
        threshold_string = "{} {}".format("--min {}".format(min) if min else "",
                                          "--max {}".format(max) if max else "")

        plink_command = "plink --double-id --allow-extra-chr --vcf {} --genome full --out {} {}" \
            .format(utils.uri_path(vcf),
                    utils.uri_path(plinkpath),
                    threshold_string)
        result_file = utils.uri_path(plinkpath + ".genome")

        syscall(plink_command, shell=True, stdout=DEVNULL, stderr=DEVNULL)

        ### format of .genome file is:
        # _, fid1, iid1, fid2, iid2, rt, ez, z0, z1, z2, pihat, phe,
        # dst, ppc, ratio, ibs0, ibs1, ibs2, homhom, hethet (+ separated)

        ### format of ibd is:
        # i (iid1), j (iid2), ibd: {Z0, Z1, Z2, PI_HAT}, ibs0, ibs1, ibs2

        results = {}
        with open(result_file) as f:
            f.readline()  # skip the header line
            for line in f:
                row = line.strip().split()
                results[(row[1], row[3])] = (list(map(float, row[6:10])),
                                             list(map(int, row[14:17])))
        return results

    def compare(ds, min=None, max=None):
        plink_results = plinkify(ds, min, max)
        hail_results = hl.identity_by_descent(ds, min=min, max=max).collect()

        for row in hail_results:
            key = (row.i, row.j)
            self.assertAlmostEqual(plink_results[key][0][0], row.ibd.Z0, places=4)
            self.assertAlmostEqual(plink_results[key][0][1], row.ibd.Z1, places=4)
            self.assertAlmostEqual(plink_results[key][0][2], row.ibd.Z2, places=4)
            self.assertAlmostEqual(plink_results[key][0][3], row.ibd.PI_HAT, places=4)
            self.assertEqual(plink_results[key][1][0], row.ibs0)
            self.assertEqual(plink_results[key][1][1], row.ibs1)
            self.assertEqual(plink_results[key][1][2], row.ibs2)

    compare(dataset)
    compare(dataset, min=0.0, max=1.0)

    # `maf` may be passed as a float64 or float32 row expression
    dataset = dataset.annotate_rows(dummy_maf=0.01)
    hl.identity_by_descent(dataset, dataset['dummy_maf'], min=0.0, max=1.0)
    hl.identity_by_descent(dataset, hl.float32(dataset['dummy_maf']), min=0.0, max=1.0)
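# For reference, a minimal standalone call (a sketch: `mt` is assumed to be a
# MatrixTable with a called `GT` entry field). `hl.identity_by_descent` returns
# a Table keyed by the sample pair (i, j) with an `ibd` struct {Z0, Z1, Z2,
# PI_HAT} plus the ibs0/ibs1/ibs2 counts, the same quantities parsed out of
# the PLINK .genome file above.
def ibd_sketch(mt):
    ibd_ht = hl.identity_by_descent(mt, min=0.0, max=1.0)
    ibd_ht.describe()  # i, j, ibd: struct{Z0, Z1, Z2, PI_HAT}, ibs0, ibs1, ibs2
    return ibd_ht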
def compute_kinship_ht(mt, genome_version="GRCh38"):
    mt = filter_to_biallelics(mt)
    mt = filter_to_autosomes(mt)
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.call_rate > 0.99)
    #mt = mt.filter_rows(mt.info.AF > 0.001)  # leaves 100% of variants
    mt = ld_prune(mt, genome_version=genome_version)

    ibd_results_ht = hl.identity_by_descent(mt, maf=mt.info.AF, min=0.10, max=1.0)
    ibd_results_ht = ibd_results_ht.annotate(
        ibd0=ibd_results_ht.ibd.Z0,
        ibd1=ibd_results_ht.ibd.Z1,
        ibd2=ibd_results_ht.ibd.Z2,
        pi_hat=ibd_results_ht.ibd.PI_HAT).drop("ibs0", "ibs1", "ibs2", "ibd")
    kin_ht = ibd_results_ht

    # filter to anything above the relationship of a grandparent
    first_degree_pi_hat = .40
    grandparent_pi_hat = .20
    grandparent_ibd1 = 0.25
    grandparent_ibd2 = 0.15

    kin_ht = kin_ht.key_by("i", "j")
    kin_ht = kin_ht.filter(
        (kin_ht.pi_hat > first_degree_pi_hat) |
        ((kin_ht.pi_hat > grandparent_pi_hat) &
         (kin_ht.ibd1 > grandparent_ibd1) &
         (kin_ht.ibd2 < grandparent_ibd2)))

    kin_ht = kin_ht.annotate(relation=hl.sorted([kin_ht.i, kin_ht.j]))  # TODO: better variable name
    return kin_ht
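# Context for the cutoffs above: under ideal conditions the expected
# (Z0, Z1, Z2, PI_HAT) values are roughly (0, 1, 0, 0.5) for parent-offspring,
# (0.25, 0.5, 0.25, 0.5) for full siblings, (0.5, 0.5, 0, 0.25) for
# grandparent/half-sibling/avuncular pairs, and (0, 0, 1, 1.0) for duplicates
# or monozygotic twins. A rough classifier built from the same cutoffs, for
# illustration only (hypothetical helper, not part of the pipeline):
def classify_relation(pi_hat, ibd1, ibd2):
    if pi_hat > 0.40:
        return "first-degree (or closer)"
    elif pi_hat > 0.20 and ibd1 > 0.25 and ibd2 < 0.15:
        return "second-degree (e.g. grandparent)"
    return "more distant / unrelated"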
def relatedness_check(in_mt: hl.MatrixTable = None,
                      method: str = 'pc_relate',
                      outdir: str = None,
                      kin_estimate: float = 0.98):

    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't bring this into Python; keep it as a Hail expression
    call_rate_dict = in_mt.aggregate_cols(
        hl.dict(hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))),
        _localize=False)

    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT,
                                      0.01,
                                      k=10,
                                      min_kinship=0.1,
                                      statistics='kin')
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.kin > kin_estimate)
        # get call rates for both samples so we remove the one with the lower
        # call rate of the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i.s],
            cr_s2=call_rate_dict[samples_to_remove_ht.j.s])
        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        # returns a Hail Table with one row per sample pair
        relatedness_ht = hl.identity_by_descent(in_mt, maf=in_mt['maf'])
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.ibd.PI_HAT > kin_estimate)
        # get call rates for both samples so we remove the one with the lower
        # call rate of the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i],
            cr_s2=call_rate_dict[samples_to_remove_ht.j])
        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            raise Exception("\nThe maximum kinship coefficient for KING is 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s) &
            (relatedness_mt.phi >= kin_estimate),
            keep=True)
        samples_to_remove_ht = filtered_relatedness_mt.entries()
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.s_1],
            cr_s2=call_rate_dict[samples_to_remove_ht.s])
        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.s_1, samples_to_remove.s))

    samples = samples_list.sample_to_remove.collect()

    if len(samples) > 0:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']),
                                  keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(
            len(samples)))
        with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f:
            for sample in samples:
                f.write(sample + "\n")
    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
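# Hypothetical usage of the function above (path and directory are
# illustrative; note the removed-samples file is written with plain `open`,
# so `outdir` should be a local path):
#
#     qc_mt = hl.read_matrix_table('cohort.mt')  # a post-QC MatrixTable with GT
#     qc_mt = relatedness_check(in_mt=qc_mt, method='ibd', outdir='./',
#                               kin_estimate=0.98)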
mt = hl.sample_qc(mt)

# Calculate statistics on sample QC metrics
stats_singleton = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_singleton))
stats_ti_tv = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_ti_tv))
stats_het_hom_var = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_het_hom_var))
stats_het = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_het))

######## 3.2 Sex check on chromosome X (inbreeding coefficient)
# Determine sex from GT calls in sex chromosomes
t = hl.impute_sex(mt.GT)
# Only keep samples whose genetic sex matches their self-reported sex
mt = mt.filter_cols(t[mt.s].is_female == mt.is_female)

######## 3.3 Check for genetic relationship / "duplicates"
# Calculate the identity-by-descent table
relatedness = hl.identity_by_descent(mt)
# Keep pairs of samples with PI_HAT > 0.2 (with no `maf` argument, allele
# frequencies are computed from the dataset itself)
t_ibd = relatedness.filter(relatedness.ibd.PI_HAT > 0.2)
t_ibd = t_ibd.key_by('i')
mt = mt.key_cols_by('s')
# Collect the IDs of the related samples in t_ibd
ibd_idx = t_ibd.aggregate(hl.agg.collect_as_set(t_ibd.i))
mt_ibd = mt.filter_cols(hl.literal(ibd_idx).contains(mt.s))

######## 3.3 Filter samples for outliers more than (6 * SD) from mean (Part 2)
# Number of singletons
mt = mt.filter_cols(mt.sample_qc.n_singleton <
                    (stats_singleton.mean + (6 * stats_singleton.stdev)))
mt = mt.filter_cols(mt.sample_qc.n_singleton >
                    (stats_singleton.mean - (6 * stats_singleton.stdev)))
# Ti/Tv ratio
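# The same mean +/- 6*SD pattern presumably repeats for the remaining metrics
# (Ti/Tv, het/hom ratio, n_het); a small hypothetical helper to avoid the
# repetition might look like this:
def filter_outlier_cols(mt, metric_expr, stats, n_sd=6):
    """Keep samples whose metric lies within mean +/- n_sd standard deviations."""
    return mt.filter_cols((metric_expr < stats.mean + n_sd * stats.stdev) &
                          (metric_expr > stats.mean - n_sd * stats.stdev))

# e.g. mt = filter_outlier_cols(mt, mt.sample_qc.r_ti_tv, stats_ti_tv)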
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 11 09:46:37 2018

@author: nbaya
"""
import hail as hl

phen = '50'
variant_set = 'hm3'
n_chunks = 300
batch = '1'

mt = hl.read_matrix_table('gs://nbaya/split/ukb31063.' + variant_set + '_variants.gwas_samples_' +
                          phen + '_grouped' + str(n_chunks) + '_batch_' + batch + '.mt')

# Convert dosages to integer hard calls, then build a call from the result
mt1 = mt.annotate_entries(gt=hl.int(hl.int(mt.dosage * 3 / 2) * 2 / 3))
mt1 = mt1.annotate_entries(GT=hl.call(mt1.gt))

hl.identity_by_descent(mt1).write('gs://nbaya/split/ibd.' + variant_set + '_variants.' +
                                  phen + '_grouped' + str(n_chunks) + '_batch_' + batch + '.ht')
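# Pure-Python check of the hard-call rounding used above, assuming `hl.int`
# truncates toward zero for non-negative floats (like Python's int()):
def hard_call(dosage):
    return int(int(dosage * 3 / 2) * 2 / 3)

# hard_call(0.0) == 0, hard_call(1.0) == 0, hard_call(1.4) == 1, hard_call(2.0) == 2:
# dosages below 4/3 truncate to 0, those in [4/3, 2) to 1, and exactly 2 to 2.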
def main(args):
    output_dir = args.output_dir
    output_name = args.output_name
    inferred_sex = args.inferred_sex
    mt_path = args.mt_path
    input_pedigree = args.input_pedigree
    gnomad_ld = args.gnomad_ld
    run_ibd = args.run_ibd
    first_degree_pi_hat = args.first_degree_pi_hat
    grandparent_pi_hat = args.grandparent_pi_hat
    grandparent_ibd1 = args.grandparent_ibd1
    grandparent_ibd2 = args.grandparent_ibd2
    filter_kinship_ht = args.filter_kinship_ht

    logger.info("Reading in inputs...")
    mt = hl.read_matrix_table(mt_path)
    pedigree = hl.import_table(input_pedigree, impute=True)

    # Infer build of the MatrixTable
    build = get_reference_genome(mt.locus).name

    logger.info("Filtering to biallelic SNVs on autosomes and performing LD pruning...")
    mt = filter_rows_for_qc(mt, min_af=0.001, min_callrate=0.99,
                            apply_hard_filters=False)
    mt = ld_prune(mt, build, gnomad_ld)
    out_mt = f"{output_dir}/{output_name}_processed_mt.mt"

    logger.info("Remapping sample names...")
    mt, sex_ht = remap_samples(mt_path, mt, pedigree, inferred_sex)

    mt = mt.checkpoint(out_mt, overwrite=True)

    if run_ibd:
        logger.info("Running identity by descent...")
        ibd_results_ht = hl.identity_by_descent(mt, maf=mt.AF, min=0.10, max=1.0)
        ibd_results_ht = ibd_results_ht.annotate(
            ibd0=ibd_results_ht.ibd.Z0,
            ibd1=ibd_results_ht.ibd.Z1,
            ibd2=ibd_results_ht.ibd.Z2,
            pi_hat=ibd_results_ht.ibd.PI_HAT,
        ).drop("ibs0", "ibs1", "ibs2", "ibd")
        out_ht = f"{output_dir}/{output_name}_ibd_kinship.tsv"
        ibd_results_ht.export(out_ht)
    else:
        logger.warning("Skipping IBD - using previous calculations...")
        if not file_exists(f"{output_dir}/{output_name}_ibd_kinship.tsv"):
            logger.warning(
                "IBD calculation was skipped but no file with previous calculations was found...")

    logger.info("Reading in kinship ht...")
    kin_ht = hl.import_table(f"{output_dir}/{output_name}_ibd_kinship.tsv",
                             impute=True)

    # Subset MatrixTable and sex ht to the samples in the pedigree
    mt_subset, sex_ht, expected_samples, vcf_samples = subset_samples(
        mt, pedigree, sex_ht, output_dir, output_name)

    # Subset Table to the samples in the pedigree
    subset = hl.set(expected_samples)
    kin_ht = kin_ht.filter(subset.contains(kin_ht.i) | subset.contains(kin_ht.j))

    # Key the Table
    kin_ht = kin_ht.key_by("i", "j")

    # Set up the output file
    out_summary = hl.hadoop_open(
        f"{output_dir}/{output_name}_ped_check_summary.txt", "w")

    if filter_kinship_ht:
        logger.info("Filtering kinship table to remove unrelated individuals from analysis...")
        kin_ht = filter_kin_ht(kin_ht, out_summary)

    # Output basic stats
    out_summary.write("Number of individuals in pedigree: " +
                      str(len(expected_samples)) + "\n")
    out_summary.write("Number of individuals in subset from the VCF: " +
                      str(len(vcf_samples)) + "\n")
    out_summary.write("Number of relationships in the kinship table: " +
                      str(kin_ht.count()) + "\n\n")
    out_summary.close()

    seqr_projects, family_ids, given_sex = write_functional_pedigree(
        input_pedigree, vcf_samples, output_dir, output_name)

    # Compare inferred and given sex
    check_sex(sex_ht, output_dir, output_name)

    kin_ht = add_project_and_family_annotations(kin_ht, seqr_projects, family_ids)

    logger.info("Writing kinship ht per project...")
    # Output original ht per project
    for project in set(seqr_projects.values()):
        full_ht = kin_ht.filter(
            (kin_ht.seqr_proj_i == project) | (kin_ht.seqr_proj_j == project))
        full_ht.drop("seqr_proj_i", "seqr_proj_j").export(
            f"{output_dir}/{project}/{output_name}_{project}_annotated_kin.txt")
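# A hypothetical argparse setup matching the attribute names read at the top
# of main() above; flag spellings and defaults are inferred for illustration,
# not taken from the original script.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Pedigree / kinship check")
    for flag in ("output_dir", "output_name", "inferred_sex", "mt_path",
                 "input_pedigree", "gnomad_ld"):
        parser.add_argument("--" + flag.replace("_", "-"), dest=flag, required=True)
    parser.add_argument("--run-ibd", dest="run_ibd", action="store_true")
    parser.add_argument("--filter-kinship-ht", dest="filter_kinship_ht",
                        action="store_true")
    for flag, default in (("first_degree_pi_hat", 0.40),
                          ("grandparent_pi_hat", 0.20),
                          ("grandparent_ibd1", 0.25),
                          ("grandparent_ibd2", 0.15)):
        parser.add_argument("--" + flag.replace("_", "-"), dest=flag,
                            type=float, default=default)
    return parser.parse_args()

# main(parse_args())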