def test_king_filtered_entries_no_error():
    """Smoke test: KING must run on a matrix table with filtered entries.

    Imports the small Balding-Nichols PLINK fileset, randomly filters its
    entries with ``hl.rand_bool(0.5)``, and forces evaluation of the KING
    result. There is no assertion; the test passes if nothing raises.
    """
    prefix = resource('balding-nichols-1024-variants-4-samples-3-populations')
    # Build the bed/bim/fam keyword arguments from the shared file prefix.
    plink_files = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    dataset = hl.import_plink(**plink_files)
    sparse = dataset.filter_entries(hl.rand_bool(0.5))
    kinship = hl.king(sparse.GT)
    kinship._force_count_rows()
def query():
    """Query script entry point.

    Steps performed:

    1. Initialise Hail with GRCh38 as the default reference.
    2. Read the ``HGDP1KG_TOBWGS`` matrix table and keep only columns whose
       inferred population is ``'nfe'`` or whose sample ID contains ``'TOB'``.
    3. Compute KING kinship estimates and store them at
       ``output_path('king_kinship_estimate_NFE.ht')``.
    4. Take every distinct-sample pair with ``phi > 0.125`` and use
       ``hl.maximal_independent_set`` (with ``keep=False``) to obtain the
       samples to remove.
    5. Print the number of such samples and write them as an HTML table to
       ``output_path('related_samples.html', 'web')``.
    """
    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )

    # Remove related samples (at the 2nd degree or closer)
    king_path = output_path('king_kinship_estimate_NFE.ht')
    # checkpoint() writes the result and returns the on-disk copy, so the
    # entries() call below reads the stored estimates. A plain write() would
    # leave `king` lazy and KING would be computed a second time.
    king = hl.king(mt.GT).checkpoint(king_path)

    ht = king.entries()
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    # keep=False returns the nodes outside the maximal independent set,
    # i.e. the samples to drop.
    related_samples_to_remove = hl.maximal_independent_set(
        related_samples.s_1, related_samples.s, False
    )
    n_related_samples = related_samples_to_remove.count()
    print(f'related_samples_to_remove.count() = {n_related_samples}')

    # save as html
    html = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()}
    ).to_html()
    plot_filename_html = output_path('related_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)
def test_king_large():
    """Compare Hail's KING output with the reference on ``fastlmmTest``.

    The PLINK fileset is imported with ``reference_genome=None`` and the
    resulting kinship matrix is checked against ``fastlmmTest.kin0.bgz``
    by ``assert_c_king_same_as_hail_king``.
    """
    prefix = resource('fastlmmTest')
    # Build the bed/bim/fam keyword arguments from the shared file prefix.
    plink_files = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    dataset = hl.import_plink(reference_genome=None, **plink_files)
    hail_kinship = hl.king(dataset.GT)
    expected_kin0 = resource('fastlmmTest.kin0.bgz')
    assert_c_king_same_as_hail_king(expected_kin0, hail_kinship)
def test_king_small():
    """Compare Hail's KING output with the reference on the small dataset.

    Uses the Balding-Nichols PLINK fileset (1024 variants, 4 samples,
    3 populations) and checks the kinship matrix against the matching
    ``.kin0`` resource via ``assert_c_king_same_as_hail_king``.
    """
    dataset_name = 'balding-nichols-1024-variants-4-samples-3-populations'
    prefix = resource(dataset_name)
    # Build the bed/bim/fam keyword arguments from the shared file prefix.
    plink_files = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    dataset = hl.import_plink(**plink_files)
    hail_kinship = hl.king(dataset.GT)
    expected_kin0 = resource(
        'balding-nichols-1024-variants-4-samples-3-populations.kin0')
    assert_c_king_same_as_hail_king(expected_kin0, hail_kinship)
def _flag_lower_call_rate(pairs_ht, call_rate_dict, first_id, second_id):
    """Annotate related pairs with call rates and the pair member to drop.

    Args:
        pairs_ht: Hail Table with one row per related sample pair.
        call_rate_dict: Hail dict expression mapping sample ID to call rate.
        first_id: Callable that takes a table and returns the sample-ID
            expression of the pair's first member on that table.
        second_id: Same as ``first_id``, for the second member.

    Returns:
        A tuple ``(with_rates, flagged)``: the pairs annotated with
        ``cr_s1`` / ``cr_s2``, and that table further annotated with
        ``sample_to_remove``.
    """
    with_rates = pairs_ht.annotate(
        cr_s1=call_rate_dict[first_id(pairs_ht)],
        cr_s2=call_rate_dict[second_id(pairs_ht)])
    # The member with the lower call rate is dropped; on a tie (<=) the
    # first member is the one chosen.
    flagged = with_rates.annotate(sample_to_remove=hl.cond(
        with_rates.cr_s1 <= with_rates.cr_s2,
        first_id(with_rates),
        second_id(with_rates)))
    return with_rates, flagged


def relatedness_check(in_mt: hl.MatrixTable = None, method: str = 'pc_relate', outdir: str = None,
                      kin_estimate: float = 0.98):
    """Remove one sample from every pair that is too closely related.

    Variant and sample QC are run first so each sample's call rate is
    available. Related pairs are found with the chosen method, and from
    each pair the sample with the lower call rate is removed.

    Args:
        in_mt: Matrix table with a ``GT`` entry field and column key ``s``.
        method: ``'pc_relate'`` (pairs with ``kin > kin_estimate``),
            ``'ibd'`` (pairs with ``ibd.PI_HAT > kin_estimate``); any other
            value selects KING (pairs with ``phi >= kin_estimate``).
        outdir: Prefix for the removed-samples report. It is concatenated
            directly with ``'relatedness_removed_samples.tsv'``, so it needs
            its own trailing separator, and it must not be ``None`` when any
            sample is removed.
        kin_estimate: Relatedness threshold. Must not exceed 0.5 for KING.

    Returns:
        ``in_mt`` with QC annotations added (plus a ``maf`` row field for the
        ``'ibd'`` method) and the flagged samples filtered out.

    Raises:
        Exception: If KING is selected and ``kin_estimate > 0.5``.

    Side effects:
        Sets the module-level ``samples_to_remove`` to the pair table
        annotated with both call rates, prints progress messages, and
        writes the removed sample IDs (one per line) when any are found.
    """
    # Kept from the original: the call-rate-annotated pair table is
    # published as a module-level global.
    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't put this in Python, keep it as a Hail expr
    call_rate_dict = in_mt.aggregate_cols(hl.dict(
        hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))), _localize=False)

    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT, 0.01, k=10, min_kinship=0.1, statistics='kin')
        related_pairs = relatedness_ht.filter(relatedness_ht.kin > kin_estimate)

        # The sample ID is the `s` field inside `i` / `j` here. Selecting the
        # whole struct (as the code did before) yields values that can
        # neither match the string column key `s` nor be written as text.
        def first_id(t):
            return t.i.s

        def second_id(t):
            return t.j.s
    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        # this returns a Hail Table with the sample pairs
        relatedness_ht = hl.identity_by_descent(in_mt, maf=in_mt['maf'])
        related_pairs = relatedness_ht.filter(relatedness_ht.ibd.PI_HAT > kin_estimate)

        def first_id(t):
            return t.i

        def second_id(t):
            return t.j
    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            raise Exception(
                "\nThe maximum kinship coefficient is for KING 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        # Drop self-comparisons and keep only pairs at or above the threshold.
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s)
            & (relatedness_mt.phi >= kin_estimate), keep=True)
        related_pairs = filtered_relatedness_mt.entries()

        def first_id(t):
            return t.s_1

        def second_id(t):
            return t.s

    # get call rates for both samples so we remove the one with lower call
    # rate between the two
    samples_to_remove, samples_list = _flag_lower_call_rate(
        related_pairs, call_rate_dict, first_id, second_id)

    # A sample can be flagged by several pairs. Deduplicate (keeping
    # first-seen order) so the count and the report list each sample once.
    samples = list(dict.fromkeys(samples_list.sample_to_remove.collect()))

    if len(samples) > 0:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']), keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(
            len(samples)))
        with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f:
            f.writelines(sample + "\n" for sample in samples)
    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
def king():
    """Run KING on simulated genotypes and write the result to a temp file.

    Simulates a Balding-Nichols dataset (6 populations, 10000 variants,
    4096 samples), computes kinship on its ``GT`` field, and writes the
    resulting matrix table to a new temporary ``.mt`` path, overwriting
    anything already there.
    """
    simulated = hl.balding_nichols_model(6, n_variants=10000, n_samples=4096)
    out_path = hl.utils.new_temp_file(extension='mt')
    kinship = hl.king(simulated.GT)
    kinship.write(out_path, overwrite=True)