def test_from_entry_expr(self):
    mt = get_dataset()
    mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

    a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
    a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
    a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

    self._assert_eq(a1, a2)
    self._assert_eq(a1, a3)

    path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
    a4 = BlockMatrix.read(path).to_numpy()
    self._assert_eq(a1, a4)
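# Hedged sketch of the same entry-expression round-trip outside the test
# harness. The simulated dataset and its dimensions are illustrative
# assumptions, not part of the test above.
import hail as hl
from hail.linalg import BlockMatrix

mt = hl.balding_nichols_model(n_populations=2, n_samples=10, n_variants=50)
bm = BlockMatrix.from_entry_expr(mt.GT.n_alt_alleles(), block_size=32)
# Rows are variants, columns are samples.
assert bm.to_numpy().shape == (50, 10)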
def test_aggregate2(self):
    schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)

    rows = [{'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
            {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13}]

    kt = hl.Table.parallelize(rows, schema)

    result = convert_struct_to_dict(
        kt.group_by(status=kt.status)
        .aggregate(
            x1=agg.collect(kt.qPheno * 2),
            x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
            x3=agg.min(kt.qPheno),
            x4=agg.max(kt.qPheno),
            x5=agg.sum(kt.qPheno),
            x6=agg.product(hl.int64(kt.qPheno)),
            x7=agg.count(),
            x8=agg.count_where(kt.qPheno == 3),
            x9=agg.fraction(kt.qPheno == 1),
            x10=agg.stats(hl.float64(kt.qPheno)),
            x11=agg.hardy_weinberg_test(kt.GT),
            x13=agg.inbreeding(kt.GT, 0.1),
            x14=agg.call_stats(kt.GT, ["A", "T"]),
            x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
            x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
            x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
            x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
            x19=agg.take(kt.GT, 1, ordering=-kt.qPheno)
        ).take(1)[0])

    expected = {u'status': 0,
                u'x13': {u'n_called': 2, u'expected_homs': 1.64,
                         u'f_stat': -1.777777777777777, u'observed_homs': 1},
                u'x14': {u'AC': [3, 1], u'AF': [0.75, 0.25], u'AN': 4,
                         u'homozygote_count': [1, 0]},
                u'x15': {u'a': 5, u'c': {u'banana': u'apple'}, u'b': u'foo'},
                u'x10': {u'min': 3.0, u'max': 13.0, u'sum': 16.0, u'stdev': 5.0,
                         u'n': 2, u'mean': 8.0},
                u'x8': 1, u'x9': 0.0, u'x16': u'apple',
                u'x11': {u'het_freq_hwe': 0.5, u'p_value': 0.5},
                u'x2': [3, 4, 13, 14], u'x3': 3, u'x1': [6, 26], u'x6': 39,
                u'x7': 2, u'x4': 13, u'x5': 16,
                u'x17': [], u'x18': [], u'x19': [hl.Call([0, 1])]}

    self.maxDiff = None
    self.assertDictEqual(result, expected)
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship=lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
def test_export_plink_exprs(self):
    ds = get_dataset()
    fam_mapping = {'f0': 'fam_id', 'f1': 'ind_id', 'f2': 'pat_id', 'f3': 'mat_id',
                   'f4': 'is_female', 'f5': 'pheno'}
    bim_mapping = {'f0': 'contig', 'f1': 'varid', 'f2': 'cm_position',
                   'f3': 'position', 'f4': 'a1', 'f5': 'a2'}

    # Test default arguments
    out1 = new_temp_file()
    hl.export_plink(ds, out1)
    fam1 = (hl.import_table(out1 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))
    bim1 = (hl.import_table(out1 + '.bim', no_header=True, impute=False)
            .rename(bim_mapping))

    self.assertTrue(fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0") &
                             (fam1.mat_id == "0") & (fam1.is_female == "0") &
                             (fam1.pheno == "NA")))
    self.assertTrue(bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1) &
                             (bim1.cm_position == "0.0")))

    # Test non-default FAM arguments
    out2 = new_temp_file()
    hl.export_plink(ds, out2, ind_id=ds.s, fam_id=ds.s, pat_id="nope",
                    mat_id="nada", is_female=True, pheno=False)
    fam2 = (hl.import_table(out2 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))

    self.assertTrue(fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope") &
                             (fam2.mat_id == "nada") & (fam2.is_female == "2") &
                             (fam2.pheno == "1")))

    # Test quantitative phenotype
    out3 = new_temp_file()
    hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
    fam3 = (hl.import_table(out3 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))

    self.assertTrue(fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0") &
                             (fam3.mat_id == "0") & (fam3.is_female == "0") &
                             (fam3.pheno != "0") & (fam3.pheno != "NA")))

    # Test non-default BIM arguments
    out4 = new_temp_file()
    hl.export_plink(ds, out4, varid="hello", cm_position=100)
    bim4 = (hl.import_table(out4 + '.bim', no_header=True, impute=False)
            .rename(bim_mapping))

    self.assertTrue(bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

    # Test call expr
    out5 = new_temp_file()
    ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
    hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
    ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
    nerrors = ds_all_hom_ref.aggregate_entries(
        hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
    self.assertTrue(nerrors == 0)

    # Test white-space in FAM id expr raises error
    with self.assertRaisesRegex(TypeError, "has spaces in the following values:"):
        hl.export_plink(ds, new_temp_file(), mat_id="hello world")

    # Test white-space in varid expr raises error
    with self.assertRaisesRegex(FatalError, "no white space allowed:"):
        hl.export_plink(ds, new_temp_file(), varid="hello world")
def parse_as_doubles(string, has_non_ref):
    """Split a pipe-delimited string and parse each field as a float64,
    treating empty fields and '.' as missing. If `has_non_ref` is true,
    the trailing field (for the <NON_REF> allele) is dropped."""
    values = string.split(r'\|')
    values = hl.cond(has_non_ref, values[:-1], values)
    return values.map(lambda v: hl.cond(
        (hl.len(v) == 0) | (v == '.'),
        hl.null(hl.tfloat64),
        hl.float64(v)))
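# Hedged usage sketch: a local check of the parsing behavior with hl.eval,
# which compiles and evaluates an expression without a table. The input
# string and flags below are illustrative assumptions, not from the source.
import hail as hl

print(hl.eval(parse_as_doubles(hl.literal('0.5|.|1.25'), False)))
# [0.5, None, 1.25]
print(hl.eval(parse_as_doubles(hl.literal('0.5|.|1.25'), True)))
# [0.5, None]  (the trailing field is dropped when has_non_ref is true)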
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt', overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.missing(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl.nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen', entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={'locus': hl.tlocus('GRCh37'),
               'alleles': hl.tarray(hl.tstr),
               'chi_squared': hl.tfloat64,
               'n': hl.tint32,
               'ld_score': hl.tfloat64,
               'phenotype': hl.tstr,
               'chi_squared_50_irnt': hl.tfloat64,
               'n_50_irnt': hl.tint32,
               'chi_squared_20160': hl.tfloat64,
               'n_20160': hl.tint32},
        key=['locus', 'alleles'])
    doctest_namespace['ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={'locus': hl.tstr, 'alleles': hl.tstr, 'ld_score': hl.tfloat64},
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus), alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]), n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
def _to_expr(e, dtype):
    if e is None:
        return None
    elif isinstance(e, Expression):
        if e.dtype != dtype:
            assert is_numeric(dtype), 'expected {}, got {}'.format(dtype, e.dtype)
            if dtype == tfloat64:
                return hl.float64(e)
            elif dtype == tfloat32:
                return hl.float32(e)
            elif dtype == tint64:
                return hl.int64(e)
            else:
                assert dtype == tint32
                return hl.int32(e)
        return e
    elif not is_compound(dtype):
        # these are not container types and cannot contain expressions if we got here
        return e
    elif isinstance(dtype, tstruct):
        new_fields = []
        found_expr = False
        for f, t in dtype.items():
            value = _to_expr(e[f], t)
            found_expr = found_expr or isinstance(value, Expression)
            new_fields.append(value)

        if not found_expr:
            return e
        else:
            exprs = [new_fields[i] if isinstance(new_fields[i], Expression)
                     else hl.literal(new_fields[i], dtype[i])
                     for i in range(len(new_fields))]
            fields = {name: expr for name, expr in zip(dtype.keys(), exprs)}
            from .typed_expressions import StructExpression
            return StructExpression._from_fields(fields)
    elif isinstance(dtype, tarray):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [element if isinstance(element, Expression)
                     else hl.literal(element, dtype.element_type)
                     for element in elements]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeArray([e._ir for e in exprs], None)
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tset):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [element if isinstance(element, Expression)
                     else hl.literal(element, dtype.element_type)
                     for element in elements]
            indices, aggregations = unify_all(*exprs)
            x = ir.ToSet(ir.ToStream(ir.MakeArray([e._ir for e in exprs], None)))
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, ttuple):
        elements = []
        found_expr = False
        assert len(e) == len(dtype.types)
        for i in range(len(e)):
            value = _to_expr(e[i], dtype.types[i])
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            exprs = [elements[i] if isinstance(elements[i], Expression)
                     else hl.literal(elements[i], dtype.types[i])
                     for i in range(len(elements))]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeTuple([expr._ir for expr in exprs])
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tdict):
        keys = []
        values = []
        found_expr = False
        for k, v in e.items():
            k_ = _to_expr(k, dtype.key_type)
            v_ = _to_expr(v, dtype.value_type)
            found_expr = found_expr or isinstance(k_, Expression)
            found_expr = found_expr or isinstance(v_, Expression)
            keys.append(k_)
            values.append(v_)
        if not found_expr:
            return e
        else:
            assert len(keys) > 0
            # Here I use `to_expr` to call `lit` on the keys and values separately.
            # I anticipate a common mode is statically-known keys and Expression
            # values.
            key_array = to_expr(keys, tarray(dtype.key_type))
            value_array = to_expr(values, tarray(dtype.value_type))
            return hl.dict(hl.zip(key_array, value_array))
    elif isinstance(dtype, hl.tndarray):
        return hl.nd.array(e)
    else:
        raise NotImplementedError(dtype)
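# Illustrative sketch (hedged): _to_expr is a private helper, so these direct
# calls are for exposition only; in normal use they happen inside hl.literal
# and to_expr. A container with no Expression leaves is returned unchanged,
# while a single Expression leaf lifts the whole container to an Expression.
import hail as hl
from hail.expr.types import tarray, tint32

plain = _to_expr([1, 2, 3], tarray(tint32))
assert plain == [1, 2, 3]  # still a plain Python list

lifted = _to_expr([1, hl.int32(2)], tarray(tint32))
# `lifted` is now an ArrayExpression of type array<int32>; the plain 1 was
# wrapped with hl.literal and combined into a single MakeArray IR node.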
import os
import hail as hl


def dmg_case_control_filter(mt, chrm='0', start=0, end=0, out_dir=os.getcwd(), name=""):
    """Filters a matrix table that has been annotated with annovar for a case-control test."""
    # Convert chromosome to string in case an integer was passed
    chrm = str(chrm)
    if name == "":
        name = "case_control_filtering_{}_{}-{}".format(chrm, start, end)
    child_dir = os.path.join(out_dir, "case_control_filtering", name)
    os.makedirs(child_dir)
    out_f = os.path.join(child_dir, "case_control_filtering_{}_{}-{}.out".format(chrm, start, end))
    with open(out_f, "w") as f:
        # Import matrix table already annotated with bravo and MetaSVM (in this case, using annovar)
        mt = hl.read_matrix_table(mt)

        # Make annotations easier to filter on. If a variant has no bravo frequency, set it to 0
        mt = mt.annotate_rows(bravo=hl.cond(hl.is_defined(mt.info["bravo"][0]),
                                            hl.float64(mt.info["bravo"][0]),
                                            0.0))
        mt = mt.annotate_rows(meta_svm_pred=mt.info.MetaSVM_pred[0])

        db_ht = hl.read_table('/gpfs/ycga/project/kahle/sp2349/datasets/dbNSFP/dbNSFp4.1a/dbNSFP4.1a_chr_all_GRCh37.ht')
        # Annotate mt with CADD16 scores. The original vcf was annotated with CADD13
        mt = mt.annotate_rows(cadd16=hl.cond(hl.is_defined(db_ht[mt.row_key]),
                                             db_ht[mt.row_key].CADD_phred_hg19,
                                             0.0))

        # Subset probands for case-control.
        # Step 1: Create a text file with the sample IDs you want to keep and import it as a hail table.
        table = (hl.import_table('/gpfs/ycga/project/kahle/sp2349/moyamoya/case_control/cc_output/final_probands_for_caseControl.txt',
                                 impute=True)
                 .key_by('Sample'))
        # The IDs_keep.txt file has the following format (including headers):
        #   Sample   should_retain
        #   1-00005  yes
        #   1-00187  yes
        #   1-00252  yes
        #   1-00386  yes
        #   1-00668  yes
        #   etc etc

        # Annotate the matrix table columns with the retention flag and filter on it
        mt = mt.annotate_cols(should_retain=table[mt.s].should_retain)
        mt = mt.filter_cols(mt.should_retain == 'yes', keep=True)

        sample_count = mt.cols().count()
        print("Sample count: {}".format(sample_count), file=f)
        print("Total allele count: {}".format(sample_count * 2), file=f)

        # Filter on bravo frequency
        mt_filtered = mt.filter_rows(mt.bravo <= 0.0005)

        # Filter on the requested genomic interval (e.g., DIAPH1 coordinates)
        mt_filtered = mt_filtered.filter_rows((mt_filtered.locus >= hl.locus(chrm, start)) &
                                              (mt_filtered.locus <= hl.locus(chrm, end)))
        print("Unique variants post bravo and coordinate filtering: {}".format(mt_filtered.rows().count()), file=f)

        # Keep only exonic and splice-site variants that are predicted damaging
        mt_filtered = mt_filtered.filter_rows(
            (mt_filtered.vep.most_severe_consequence == "stop_gained") |
            (mt_filtered.vep.most_severe_consequence == "splice_acceptor_variant") |
            (mt_filtered.vep.most_severe_consequence == "splice_donor_variant") |
            (mt_filtered.vep.most_severe_consequence == "frameshift_variant") |
            (mt_filtered.vep.most_severe_consequence == "stop_lost") |
            (mt_filtered.vep.most_severe_consequence == "start_lost") |
            ((mt_filtered.vep.most_severe_consequence == 'missense_variant') & (mt_filtered.cadd16 >= 20)) |
            ((mt_filtered.vep.most_severe_consequence == 'missense_variant') & (mt_filtered.meta_svm_pred == "D")) |
            ((mt_filtered.vep.most_severe_consequence == 'protein_altering_variant') & (mt_filtered.cadd16 >= 20)) |
            ((mt_filtered.vep.most_severe_consequence == 'protein_altering_variant') & (mt_filtered.meta_svm_pred == "D")))
        print("Variants kept: {}".format(mt_filtered.rows().count()))

        # Convert the matrix table to a table for easier dropping of homozygous-reference samples
        mt_filtered = mt_filtered.key_cols_by()
        mt_filtered_table = mt_filtered.entries()

        # Drop entries with homozygous reference calls (WT)
        mt_filtered_table = mt_filtered_table.filter(mt_filtered_table.GT.is_hom_ref(), keep=False)

        # Filter entries on GQ >= 20 and DP >= 10
        mt_filtered_table = mt_filtered_table.filter(mt_filtered_table.DP > 9)
        mt_filtered_table = mt_filtered_table.filter(mt_filtered_table.GQ > 19)

        # Write the results to a hail table
        mt_filtered_table.write(os.path.join(out_dir, 'Damaging_Cases_chr{}_{}-{}.ht'.format(chrm, start, end)),
                                overwrite=True)
        print("Total variants post filtering: {}".format(mt_filtered_table.count()), file=f)
        print("Total cases (alleles): {}".format(mt_filtered_table.aggregate(hl.agg.sum(mt_filtered_table.GT.n_alt_alleles()))), file=f)
        print("Total homozygous cases: {}".format(mt_filtered_table.aggregate(hl.agg.count_where(mt_filtered_table.GT.is_hom_var()))), file=f)
        print("Samples with variants: {}".format(mt_filtered_table.aggregate(hl.agg.counter(mt_filtered_table.s))), file=f)

        # Write the results to a text file
        df = mt_filtered_table.to_pandas()
        df.to_csv(os.path.join(out_dir, 'Damaging_Cases_chr{}_{}-{}_table.txt'.format(chrm, start, end)), sep="\t")
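# Hypothetical invocation (hedged): the input path, interval coordinates, and
# output directory below are illustrative placeholders, not values from the
# source script.
dmg_case_control_filter('annotated_cases.mt',
                        chrm='5',
                        start=140894582,
                        end=140994582,
                        out_dir='/scratch/case_control',
                        name='DIAPH1_region')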
def test_aggregate2(self):
    schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)

    rows = [{'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
            {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13}]

    kt = hl.Table.parallelize(rows, schema)

    result = convert_struct_to_dict(
        kt.group_by(status=kt.status).aggregate(
            x1=agg.collect(kt.qPheno * 2),
            x2=agg.collect(agg.explode([kt.qPheno, kt.qPheno + 1])),
            x3=agg.min(kt.qPheno),
            x4=agg.max(kt.qPheno),
            x5=agg.sum(kt.qPheno),
            x6=agg.product(hl.int64(kt.qPheno)),
            x7=agg.count(),
            x8=agg.count_where(kt.qPheno == 3),
            x9=agg.fraction(kt.qPheno == 1),
            x10=agg.stats(hl.float64(kt.qPheno)),
            x11=agg.hardy_weinberg_test(kt.GT),
            x13=agg.inbreeding(kt.GT, 0.1),
            x14=agg.call_stats(kt.GT, ["A", "T"]),
            x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
            x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
            x17=agg.collect(agg.explode(hl.null(hl.tarray(hl.tint32)))),
            x18=agg.collect(agg.explode(hl.null(hl.tset(hl.tint32)))),
            x19=agg.take(kt.GT, 1, ordering=-kt.qPheno)).take(1)[0])

    expected = {u'status': 0,
                u'x13': {u'n_called': 2, u'expected_homs': 1.64,
                         u'f_stat': -1.777777777777777, u'observed_homs': 1},
                u'x14': {u'AC': [3, 1], u'AF': [0.75, 0.25], u'AN': 4,
                         u'homozygote_count': [1, 0]},
                u'x15': {u'a': 5, u'c': {u'banana': u'apple'}, u'b': u'foo'},
                u'x10': {u'min': 3.0, u'max': 13.0, u'sum': 16.0, u'stdev': 5.0,
                         u'n': 2, u'mean': 8.0},
                u'x8': 1, u'x9': 0.0, u'x16': u'apple',
                u'x11': {u'het_freq_hwe': 0.5, u'p_value': 0.5},
                u'x2': [3, 4, 13, 14], u'x3': 3, u'x1': [6, 26], u'x6': 39,
                u'x7': 2, u'x4': 13, u'x5': 16,
                u'x17': [], u'x18': [], u'x19': [hl.Call([0, 1])]}

    self.maxDiff = None
    self.assertDictEqual(result, expected)
for phen in gwas_phens:
    print('####################')
    print('Starting phen ' + phen)
    print('####################')
    phen_starttime = datetime.datetime.now()
    phen_tb = tb.select(tb['"' + phen + '"']).rename({'"' + phen + '"': 'phen'})
    mt1 = variants.annotate_cols(
        phen_str=hl.str(phen_tb[variants.s]['phen']).replace('\"', ''))
    mt1 = mt1.filter_cols(mt1.phen_str == '', keep=False)
    if phen_tb.phen.dtype == hl.dtype('bool'):
        mt1 = mt1.annotate_cols(phen=hl.bool(mt1.phen_str)).drop('phen_str')
    else:
        mt1 = mt1.annotate_cols(phen=hl.float64(mt1.phen_str)).drop('phen_str')
    for sex in ['female', 'male']:
        mt2 = mt1.filter_cols(mt1.isFemale == (sex == 'female'))
        mt3 = mt2.add_col_index()
        mt3 = mt3.rename({'dosage': 'x', 'phen': 'y'})
        n_samples = mt3.count_cols()
        print('\n>>> phen ' + sex + ' ' + phen + ': N samples = ' + str(n_samples) + ' <<<')
        mt = mt3
        cov_list = [mt['isFemale'],
                    mt['age'],
                    mt['age_squared'],
                    mt['age_isFemale'],
def compute_ranked_bin(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    bin_expr: Dict[str, hl.expr.BooleanExpression] = {"bin": True},
    compute_snv_indel_separately: bool = True,
    n_bins: int = 100,
    desc: bool = True,
) -> hl.Table:
    r"""
    Return a table with a bin for each row based on the ranking of `score_expr`.

    The bin is computed by dividing the `score_expr` into `n_bins` bins containing approximately equal numbers of elements.
    This is done by ranking the rows by `score_expr` (and a random number in cases where multiple variants have the same score)
    and then assigning the variant to a bin based on its ranking.

    If `compute_snv_indel_separately` is True all items in `bin_expr` will be stratified by snv / indels for the ranking and
    bin calculation. Because SNV and indel rows are mutually exclusive, they are re-combined into a single annotation. For
    example if we have the following four variants and scores and `n_bins` of 2:

    ========   =======   ======   =================   =================
    Variant    Type      Score    bin - `compute_snv_indel_separately`:
    --------   -------   ------   -------------------------------------
    \          \         \        False               True
    ========   =======   ======   =================   =================
    Var1       SNV       0.1      1                   1
    Var2       SNV       0.2      1                   2
    Var3       Indel     0.3      2                   1
    Var4       Indel     0.4      2                   2
    ========   =======   ======   =================   =================

    .. note::

        The `bin_expr` defines which data the bin(s) should be computed on. E.g., to get biallelic specific binning
        and singleton specific binning, the following could be used:

        .. code-block:: python

            bin_expr={
                'biallelic_bin': ~ht.was_split,
                'singleton_bin': ht.singleton
            }

    :param ht: Input Table
    :param score_expr: Expression containing the score
    :param bin_expr: Specific row grouping(s) to perform ranking and binning on (see note)
    :param compute_snv_indel_separately: Should all `bin_expr` items be stratified by SNVs / indels
    :param n_bins: Number of bins to bin the data into
    :param desc: Whether to bin the score in descending order
    :return: Table with the requested bin annotations
    """
    if compute_snv_indel_separately:
        # For each bin, add a SNV / indel stratification
        bin_expr = {
            f"{bin_id}_{snv}": (bin_expr & snv_expr)
            for bin_id, bin_expr in bin_expr.items()
            for snv, snv_expr in [
                ("snv", hl.is_snp(ht.alleles[0], ht.alleles[1])),
                ("indel", ~hl.is_snp(ht.alleles[0], ht.alleles[1])),
            ]
        }

    bin_ht = ht.select(
        **{f"_filter_{bin_id}": bin_expr for bin_id, bin_expr in bin_expr.items()},
        _score=score_expr,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        _rand=hl.rand_unif(0, 1),
    )

    logger.info(
        "Sorting the HT by score_expr followed by a random float between 0 and 1. "
        "Then adding a row index per grouping defined by bin_expr..."
    )
    bin_ht = bin_ht.order_by("_score", "_rand")
    bin_ht = bin_ht.annotate(
        **{
            f"{bin_id}_rank": hl.or_missing(
                bin_ht[f"_filter_{bin_id}"],
                hl.scan.count_where(bin_ht[f"_filter_{bin_id}"]),
            )
            for bin_id in bin_expr
        }
    )
    bin_ht = bin_ht.key_by("locus", "alleles")

    # Annotate globals with variant counts per group defined by bin_expr.
    # This is used to determine bin assignment.
    bin_ht = bin_ht.annotate_globals(
        bin_group_variant_counts=bin_ht.aggregate(
            hl.Struct(
                **{
                    bin_id: hl.agg.filter(
                        bin_ht[f"_filter_{bin_id}"],
                        hl.agg.count(),
                    )
                    for bin_id in bin_expr
                }
            )
        )
    )

    logger.info("Binning ranked rows into %d bins...", n_bins)
    bin_ht = bin_ht.select(
        "snv",
        **{
            bin_id: hl.int(
                hl.floor(
                    (
                        n_bins
                        * (
                            bin_ht[f"{bin_id}_rank"]
                            / hl.float64(bin_ht.bin_group_variant_counts[bin_id])
                        )
                    )
                    + 1
                )
            )
            for bin_id in bin_expr
        },
    )

    if desc:
        bin_ht = bin_ht.annotate(
            **{bin_id: n_bins - bin_ht[bin_id] + 1 for bin_id in bin_expr}
        )

    # Because SNV and indel rows are mutually exclusive, re-combine them into a single bin.
    # Update the global bin_group_variant_counts struct to reflect the change in bin names in the table
    if compute_snv_indel_separately:
        bin_expr_no_snv = {
            bin_id.rsplit("_", 1)[0] for bin_id in bin_ht.bin_group_variant_counts
        }
        bin_ht = bin_ht.annotate_globals(
            bin_group_variant_counts=hl.struct(
                **{
                    bin_id: hl.struct(
                        **{
                            snv: bin_ht.bin_group_variant_counts[f"{bin_id}_{snv}"]
                            for snv in ["snv", "indel"]
                        }
                    )
                    for bin_id in bin_expr_no_snv
                }
            )
        )
        bin_ht = bin_ht.transmute(
            **{
                bin_id: hl.if_else(
                    bin_ht.snv,
                    bin_ht[f"{bin_id}_snv"],
                    bin_ht[f"{bin_id}_indel"],
                )
                for bin_id in bin_expr_no_snv
            }
        )

    return bin_ht
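# Hedged usage sketch: `ht` must carry `locus` and `alleles` row fields; the
# score and singleton field names below (rf_probability, singleton) are
# illustrative assumptions, not part of compute_ranked_bin's contract.
binned_ht = compute_ranked_bin(
    ht,
    score_expr=ht.rf_probability,  # hypothetical per-variant score
    bin_expr={"bin": True, "singleton_bin": ht.singleton},
    compute_snv_indel_separately=True,
    n_bins=100,
    desc=True,  # the highest-scoring variants land in bin 1
)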
def sample_qc(mt, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix and stores
    the results as a new column-indexed struct field in the matrix, named based on the
    `name` parameter.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `call_rate` (``float64``) -- Fraction of calls not missing or filtered.
      Equivalent to `n_called` divided by :meth:`.count_rows`.
    - `n_called` (``int64``) -- Number of non-missing calls.
    - `n_not_called` (``int64``) -- Number of missing calls.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_hom_ref` (``int64``) -- Number of homozygous reference calls.
    - `n_het` (``int64``) -- Number of heterozygous calls.
    - `n_hom_var` (``int64``) -- Number of homozygous alternate calls.
    - `n_non_ref` (``int64``) -- Sum of `n_het` and `n_hom_var`.
    - `n_snp` (``int64``) -- Number of SNP alternate alleles.
    - `n_insertion` (``int64``) -- Number of insertion alternate alleles.
    - `n_deletion` (``int64``) -- Number of deletion alternate alleles.
    - `n_singleton` (``int64``) -- Number of private alleles.
    - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles.
    - `n_transversion` (``int64``) -- Number of transversion alternate alleles.
    - `n_star` (``int64``) -- Number of star (upstream deletion) alleles.
    - `r_ti_tv` (``float64``) -- Transition/Transversion ratio.
    - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio.
    - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio.

    Missing values ``NA`` may result from division by zero.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.
    """

    require_row_key_variant(mt, 'sample_qc')

    from hail.expr.functions import _num_allele_type, _allele_types

    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

    def allele_type(ref, alt):
        return hl.bind(
            lambda at: hl.cond(
                at == allele_ints['SNP'],
                hl.cond(hl.is_transition(ref, alt),
                        allele_ints['Transition'],
                        allele_ints['Transversion']),
                at),
            _num_allele_type(ref, alt))

    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()
    mt = mt.annotate_rows(**{
        variant_ac: hl.agg.call_stats(mt.GT, mt.alleles).AC,
        variant_atypes: mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))
    })

    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'sample_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    n_rows_ref = hl.expr.construct_expr(hl.ir.Ref('n_rows'), hl.tint64,
                                        mt._col_indices,
                                        hl.utils.LinkedList(hl.expr.Aggregation))
    bound_exprs['n_filtered'] = n_rows_ref - hl.agg.count()
    bound_exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref())
    bound_exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het())
    bound_exprs['n_singleton'] = hl.agg.sum(
        hl.sum(hl.range(0, mt['GT'].ploidy).map(lambda i: mt[variant_ac][mt['GT'][i]] == 1)))

    def get_allele_type(allele_idx):
        return hl.cond(allele_idx > 0, mt[variant_atypes][allele_idx - 1], hl.null(hl.tint32))

    bound_exprs['allele_type_counts'] = hl.agg.explode(
        lambda elt: hl.agg.counter(elt),
        hl.range(0, mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i])))

    zero = hl.int64(0)

    result_struct = hl.rbind(
        hl.struct(**bound_exprs),
        lambda x: hl.rbind(
            hl.struct(**{
                **gq_dp_exprs,
                'call_rate': hl.float64(x.n_called) / (x.n_called + x.n_not_called + x.n_filtered),
                'n_called': x.n_called,
                'n_not_called': x.n_not_called,
                'n_filtered': x.n_filtered,
                'n_hom_ref': x.n_hom_ref,
                'n_het': x.n_het,
                'n_hom_var': x.n_called - x.n_hom_ref - x.n_het,
                'n_non_ref': x.n_called - x.n_hom_ref,
                'n_singleton': x.n_singleton,
                'n_snp': (x.allele_type_counts.get(allele_ints["Transition"], zero)
                          + x.allele_type_counts.get(allele_ints["Transversion"], zero)),
                'n_insertion': x.allele_type_counts.get(allele_ints["Insertion"], zero),
                'n_deletion': x.allele_type_counts.get(allele_ints["Deletion"], zero),
                'n_transition': x.allele_type_counts.get(allele_ints["Transition"], zero),
                'n_transversion': x.allele_type_counts.get(allele_ints["Transversion"], zero),
                'n_star': x.allele_type_counts.get(allele_ints["Star"], zero)
            }),
            lambda s: s.annotate(
                r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion),
                r_het_hom_var=divide_null(hl.float64(s.n_het), s.n_hom_var),
                r_insertion_deletion=divide_null(hl.float64(s.n_insertion), s.n_deletion)
            )))

    mt = mt.annotate_cols(**{name: result_struct})
    mt = mt.drop(variant_ac, variant_atypes)

    return mt
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship=lmmreg_ds.variant_qc.AF > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds')
def sample_qc(vds: 'VariantDataset',
              *,
              gq_bins: 'Sequence[int]' = (0, 20, 60),
              dp_bins: 'Sequence[int]' = (0, 1, 10, 20, 30),
              dp_field=None) -> 'Table':
    """Compute sample quality metrics about a :class:`.VariantDataset`.

    If the `dp_field` parameter is not specified, the ``DP`` field is used for
    depth if present. If no ``DP`` field is present, the ``MIN_DP`` field is
    used. If neither a ``DP`` nor a ``MIN_DP`` field is present, no depth
    statistics will be calculated.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.
    gq_bins : :class:`tuple` of :obj:`int`
        Tuple containing cutoffs for genotype quality (GQ) scores.
    dp_bins : :class:`tuple` of :obj:`int`
        Tuple containing cutoffs for depth (DP) scores.
    dp_field : :obj:`str`
        Name of depth field. If not supplied, DP or MIN_DP will be used, in that order.

    Returns
    -------
    :class:`.Table`
        Hail Table of results, keyed by sample.
    """
    require_first_key_field_locus(vds.reference_data, 'sample_qc')
    require_first_key_field_locus(vds.variant_data, 'sample_qc')

    ref = vds.reference_data
    # Honor an explicitly supplied depth field; otherwise fall back to DP, then MIN_DP.
    if dp_field is not None:
        ref_dp_field_to_use = dp_field
    elif 'DP' in ref.entry:
        ref_dp_field_to_use = 'DP'
    elif 'MIN_DP' in ref.entry:
        ref_dp_field_to_use = 'MIN_DP'
    else:
        ref_dp_field_to_use = None

    from hail.expr.functions import _num_allele_type, _allele_types

    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

    def allele_type(ref, alt):
        return hl.bind(
            lambda at: hl.if_else(
                at == allele_ints['SNP'],
                hl.if_else(hl.is_transition(ref, alt),
                           allele_ints['Transition'],
                           allele_ints['Transversion']),
                at),
            _num_allele_type(ref, alt))

    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()

    vmt = vds.variant_data
    if 'GT' not in vmt.entry:
        vmt = vmt.annotate_entries(GT=hl.experimental.lgt_to_gt(vmt.LGT, vmt.LA))

    vmt = vmt.annotate_rows(**{
        variant_ac: hl.agg.call_stats(vmt.GT, vmt.alleles).AC,
        variant_atypes: vmt.alleles[1:].map(lambda alt: allele_type(vmt.alleles[0], alt))
    })

    bound_exprs = {}

    bound_exprs['n_het'] = hl.agg.count_where(vmt['GT'].is_het())
    bound_exprs['n_hom_var'] = hl.agg.count_where(vmt['GT'].is_hom_var())
    bound_exprs['n_singleton'] = hl.agg.sum(
        hl.rbind(
            vmt['GT'],
            lambda gt: hl.sum(
                hl.range(0, gt.ploidy).map(
                    lambda i: hl.rbind(
                        gt[i],
                        lambda gti: (gti != 0) & (vmt[variant_ac][gti] == 1))))))

    bound_exprs['allele_type_counts'] = hl.agg.explode(
        lambda allele_type: hl.tuple(
            hl.agg.count_where(allele_type == i) for i in range(len(allele_ints))),
        (hl.range(0, vmt['GT'].ploidy)
         .map(lambda i: vmt['GT'][i])
         .filter(lambda allele_idx: allele_idx > 0)
         .map(lambda allele_idx: vmt[variant_atypes][allele_idx - 1])))

    dp_exprs = {}
    if ref_dp_field_to_use is not None and 'DP' in vmt.entry:
        dp_exprs['dp'] = hl.tuple(hl.agg.count_where(vmt.DP >= x) for x in dp_bins)

    gq_dp_exprs = hl.struct(
        **{'gq': hl.tuple(hl.agg.count_where(vmt.GQ >= x) for x in gq_bins)},
        **dp_exprs)

    result_struct = hl.rbind(
        hl.struct(**bound_exprs),
        lambda x: hl.rbind(
            hl.struct(**{
                'gq_dp_exprs': gq_dp_exprs,
                'n_het': x.n_het,
                'n_hom_var': x.n_hom_var,
                'n_non_ref': x.n_het + x.n_hom_var,
                'n_singleton': x.n_singleton,
                'n_snp': (x.allele_type_counts[allele_ints['Transition']]
                          + x.allele_type_counts[allele_ints['Transversion']]),
                'n_insertion': x.allele_type_counts[allele_ints['Insertion']],
                'n_deletion': x.allele_type_counts[allele_ints['Deletion']],
                'n_transition': x.allele_type_counts[allele_ints['Transition']],
                'n_transversion': x.allele_type_counts[allele_ints['Transversion']],
                'n_star': x.allele_type_counts[allele_ints['Star']]
            }),
            lambda s: s.annotate(
                r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion),
                r_het_hom_var=divide_null(hl.float64(s.n_het), s.n_hom_var),
                r_insertion_deletion=divide_null(hl.float64(s.n_insertion), s.n_deletion))))

    variant_results = vmt.select_cols(**result_struct).cols()

    rmt = vds.reference_data

    ref_dp_expr = {}
    if ref_dp_field_to_use is not None:
        ref_dp_expr['ref_bases_over_dp_threshold'] = hl.tuple(
            hl.agg.filter(rmt[ref_dp_field_to_use] >= x,
                          hl.agg.sum(1 + rmt.END - rmt.locus.position))
            for x in dp_bins)

    ref_results = rmt.select_cols(
        ref_bases_over_gq_threshold=hl.tuple(
            hl.agg.filter(rmt.GQ >= x, hl.agg.sum(1 + rmt.END - rmt.locus.position))
            for x in gq_bins),
        **ref_dp_expr).cols()

    joined = ref_results[variant_results.key]

    joined_dp_expr = {}
    dp_bins_field = {}
    if ref_dp_field_to_use is not None:
        joined_dp_expr['bases_over_dp_threshold'] = hl.tuple(
            x + y for x, y in zip(variant_results.gq_dp_exprs.dp,
                                  joined.ref_bases_over_dp_threshold))
        dp_bins_field['dp_bins'] = hl.tuple(dp_bins)

    joined_results = variant_results.transmute(
        bases_over_gq_threshold=hl.tuple(
            x + y for x, y in zip(variant_results.gq_dp_exprs.gq,
                                  joined.ref_bases_over_gq_threshold)),
        **joined_dp_expr)

    joined_results = joined_results.annotate_globals(gq_bins=hl.tuple(gq_bins),
                                                     **dp_bins_field)
    return joined_results
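# Hedged usage sketch: the VDS path below is an illustrative placeholder. The
# result is keyed by sample; bases_over_gq_threshold[i] counts reference and
# variant bases with GQ >= gq_bins[i].
vds = hl.vds.read_vds('gs://my-bucket/dataset.vds')
qc_ht = sample_qc(vds, gq_bins=(0, 20, 60), dp_bins=(0, 1, 10, 20, 30))
qc_ht.show()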
def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table:
    """Return a table containing the vertices in a near `maximal independent set
    <https://en.wikipedia.org/wiki/Maximal_independent_set>`_ of an undirected
    graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...     pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...         s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----
    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those in
    the maximal independent set, or those in the complement of this set. This
    is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always be
    perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

    - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
    - if ``l == r`` then ``tie_breaker`` evaluates to 0
    - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    If `keyed` is ``False``, then a node may appear twice in the resulting
    table.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field, this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.
    """
    if i.dtype != j.dtype:
        raise ValueError("'maximal_independent_set' expects arguments `i` and `j` to have same type. "
                         "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError("'maximal_independent_set' expects an expression of 'Table'. Found {}".format(
            "expression of '{}'".format(source.__class__) if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_variable('l', wrapped_node_t)
        r = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.float64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

    mis_nodes = construct_expr(
        JavaIR(Env.hail().utils.Graph.pyMaximalIndependentSet(
            Env.spark_backend('maximal_independent_set')._to_java_ir(
                edges.collect(_localize=False)._ir),
            node_t._parsable_string(),
            joption(tie_breaker_str))),
        hl.tset(node_t))

    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    nodes = nodes.annotate_globals(mis_nodes=mis_nodes)
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    nodes = nodes.select_globals()
    if keyed:
        return nodes.key_by('node').distinct()
    return nodes
def sample_qc(mt, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix and stores
    the results as a new column-indexed struct field in the matrix, named based on the
    `name` parameter.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `call_rate` (``float64``) -- Fraction of calls non-missing.
    - `n_called` (``int64``) -- Number of non-missing calls.
    - `n_not_called` (``int64``) -- Number of missing calls.
    - `n_hom_ref` (``int64``) -- Number of homozygous reference calls.
    - `n_het` (``int64``) -- Number of heterozygous calls.
    - `n_hom_var` (``int64``) -- Number of homozygous alternate calls.
    - `n_non_ref` (``int64``) -- Sum of ``n_het`` and ``n_hom_var``.
    - `n_snp` (``int64``) -- Number of SNP alternate alleles.
    - `n_insertion` (``int64``) -- Number of insertion alternate alleles.
    - `n_deletion` (``int64``) -- Number of deletion alternate alleles.
    - `n_singleton` (``int64``) -- Number of private alleles.
    - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles.
    - `n_transversion` (``int64``) -- Number of transversion alternate alleles.
    - `n_star` (``int64``) -- Number of star (upstream deletion) alleles.
    - `r_ti_tv` (``float64``) -- Transition/Transversion ratio.
    - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio.
    - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio.

    Missing values ``NA`` may result from division by zero.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.
    """

    require_row_key_variant(mt, 'sample_qc')

    from hail.expr.functions import _num_allele_type, _allele_types

    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

    def allele_type(ref, alt):
        return hl.bind(
            lambda at: hl.cond(at == allele_ints['SNP'],
                               hl.cond(hl.is_transition(ref, alt),
                                       allele_ints['Transition'],
                                       allele_ints['Transversion']),
                               at),
            _num_allele_type(ref, alt))

    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()
    mt = mt.annotate_rows(**{
        variant_ac: hl.agg.call_stats(mt.GT, mt.alleles).AC,
        variant_atypes: mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))
    })

    exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'sample_qc': expect an entry field 'GT' of type 'call'")

    exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref())
    exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het())
    exprs['n_singleton'] = hl.agg.sum(
        hl.sum(hl.range(0, mt['GT'].ploidy).map(lambda i: mt[variant_ac][mt['GT'][i]] == 1)))

    def get_allele_type(allele_idx):
        return hl.cond(allele_idx > 0, mt[variant_atypes][allele_idx - 1], hl.null(hl.tint32))

    exprs['allele_type_counts'] = hl.agg.explode(
        lambda elt: hl.agg.counter(elt),
        hl.range(0, mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i])))

    mt = mt.annotate_cols(**{name: hl.struct(**exprs)})

    zero = hl.int64(0)

    select_exprs = {}
    if 'dp_stats' in exprs:
        select_exprs['dp_stats'] = mt[name].dp_stats
    if 'gq_stats' in exprs:
        select_exprs['gq_stats'] = mt[name].gq_stats

    select_exprs = {
        **select_exprs,
        'call_rate': hl.float64(mt[name].n_called) / (mt[name].n_called + mt[name].n_not_called),
        'n_called': mt[name].n_called,
        'n_not_called': mt[name].n_not_called,
        'n_hom_ref': mt[name].n_hom_ref,
        'n_het': mt[name].n_het,
        'n_hom_var': mt[name].n_called - mt[name].n_hom_ref - mt[name].n_het,
        'n_non_ref': mt[name].n_called - mt[name].n_hom_ref,
        'n_singleton': mt[name].n_singleton,
        'n_snp': (mt[name].allele_type_counts.get(allele_ints["Transition"], zero)
                  + mt[name].allele_type_counts.get(allele_ints["Transversion"], zero)),
        'n_insertion': mt[name].allele_type_counts.get(allele_ints["Insertion"], zero),
        'n_deletion': mt[name].allele_type_counts.get(allele_ints["Deletion"], zero),
        'n_transition': mt[name].allele_type_counts.get(allele_ints["Transition"], zero),
        'n_transversion': mt[name].allele_type_counts.get(allele_ints["Transversion"], zero),
        'n_star': mt[name].allele_type_counts.get(allele_ints["Star"], zero)
    }

    mt = mt.annotate_cols(**{name: mt[name].select(**select_exprs)})

    mt = mt.annotate_cols(**{name: mt[name].annotate(
        r_ti_tv=divide_null(hl.float64(mt[name].n_transition), mt[name].n_transversion),
        r_het_hom_var=divide_null(hl.float64(mt[name].n_het), mt[name].n_hom_var),
        r_insertion_deletion=divide_null(hl.float64(mt[name].n_insertion), mt[name].n_deletion)
    )})

    mt = mt.drop(variant_ac, variant_atypes)

    return mt
def merge_sample_qc_expr(
    sample_qc_exprs: List[hl.expr.StructExpression],
) -> hl.expr.StructExpression:
    """
    Create an expression that merges results from non-overlapping strata of hail.sample_qc.

    E.g.:

    - Compute autosomes and sex chromosomes metrics separately, then merge results
    - Compute bi-allelic and multi-allelic metrics separately, then merge results

    Note regarding the merging of ``dp_stats`` and ``gq_stats``:
    Because ``n`` is needed to aggregate ``stdev``, ``n_called`` is used for this purpose.
    This should work very well on a standard GATK VCF and it essentially assumes that:

    - samples that are called have `DP` and `GQ` fields
    - samples that are not called do not have `DP` and `GQ` fields

    Even if these assumptions are broken for some genotypes, it shouldn't matter too much.

    :param sample_qc_exprs: List of sample QC struct expressions for each stratification
    :return: Combined sample QC results
    """
    # List of metrics that can be aggregated by summing
    additive_metrics = [
        "n_called",
        "n_not_called",
        "n_filtered",
        "n_hom_ref",
        "n_het",
        "n_hom_var",
        "n_snp",
        "n_insertion",
        "n_deletion",
        "n_singleton",
        "n_transition",
        "n_transversion",
        "n_star",
    ]

    # List of metrics that are ratios of summed metrics (name, numerator, denominator)
    ratio_metrics = [
        ("r_ti_tv", "n_transition", "n_transversion"),
        ("r_het_hom_var", "n_het", "n_hom_var"),
        ("r_insertion_deletion", "n_insertion", "n_deletion"),
    ]

    # List of metrics that are structs generated by a stats counter
    stats_metrics = ["gq_stats", "dp_stats"]

    # Gather metrics present in sample qc fields
    sample_qc_fields = set(sample_qc_exprs[0])
    for sample_qc_expr in sample_qc_exprs[1:]:
        sample_qc_fields = sample_qc_fields.union(set(sample_qc_expr))

    # Merge additive metrics in sample qc fields
    merged_exprs = {
        metric: hl.sum([sample_qc_expr[metric] for sample_qc_expr in sample_qc_exprs])
        for metric in additive_metrics
        if metric in sample_qc_fields
    }

    # Merge ratio metrics in sample qc fields
    merged_exprs.update({
        metric: hl.float64(divide_null(merged_exprs[num], merged_exprs[denom]))
        for metric, num, denom in ratio_metrics
        if num in sample_qc_fields and denom in sample_qc_fields
    })

    # call_rate is a fraction rather than a ratio of two disjoint counts, so it
    # is recomputed from the summed counts: n_called / (n_called + n_not_called)
    if "n_called" in sample_qc_fields and "n_not_called" in sample_qc_fields:
        merged_exprs["call_rate"] = hl.float64(
            divide_null(merged_exprs["n_called"],
                        merged_exprs["n_called"] + merged_exprs["n_not_called"]))

    # Merge stats counter metrics in sample qc fields
    # Use n_called as n for DP and GQ stats
    if "n_called" in sample_qc_fields:
        merged_exprs.update({
            metric: merge_stats_counters_expr([
                sample_qc_expr[metric].annotate(n=sample_qc_expr.n_called)
                for sample_qc_expr in sample_qc_exprs
            ]).drop("n")
            for metric in stats_metrics
            if metric in sample_qc_fields
        })

    return hl.struct(**merged_exprs)
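# Hedged usage sketch: assumes a table `ht` carrying two hl.sample_qc result
# structs for disjoint strata; the field names `autosome_qc` and `sex_chr_qc`
# are illustrative assumptions, not part of this function's contract.
ht = ht.annotate(
    combined_qc=merge_sample_qc_expr([ht.autosome_qc, ht.sex_chr_qc]))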