def make_call(gt):
    """Map a dosage-style genotype value to a diploid ``hl.Call``.

    ``0.0`` maps to alleles ``[0, 0]``, ``1.0`` to ``[0, 1]`` and ``2.0`` to
    ``[1, 1]``. Any other value yields ``None``.
    """
    dosage_to_alleles = (
        (0.0, [0, 0]),
        (1.0, [0, 1]),
        (2.0, [1, 1]),
    )
    # Compare with ``==`` in a fixed order (rather than hashing) so values
    # that are merely equal to a dosage still match.
    for dosage, alleles in dosage_to_alleles:
        if gt == dosage:
            return hl.Call(alleles)
    return None
def test_lgt_to_gt():
    """Check ``hl.vds.lgt_to_gt`` on phased, unphased and haploid calls.

    Each local call is translated through the local-alleles array
    ``[0, 3, 5]``; the expected result replaces every local index with the
    value stored at that index and keeps the phasing flag.
    """
    local_alleles = [0, 3, 5]

    # (local call expression, expected evaluated call)
    cases = [
        (hl.call(0, 0, phased=False), hl.Call([0, 0], phased=False)),
        (hl.call(0, 0, phased=True), hl.Call([0, 0], phased=True)),
        (hl.call(0, 1, phased=False), hl.Call([0, 3], phased=False)),
        (hl.call(2, 0, phased=True), hl.Call([5, 0], phased=True)),
        (hl.call(1, phased=False), hl.Call([3], phased=False)),
    ]
    local_calls, expected = zip(*cases)

    translated = tuple(
        hl.vds.lgt_to_gt(local_call, local_alleles)
        for local_call in local_calls
    )
    assert hl.eval(translated) == expected
def test_errors(self):
    """Assigning an attribute on a ``Table`` must raise ``NotImplementedError``."""
    row_type = hl.tstruct(status=hl.tint32, gt=hl.tcall, qPheno=hl.tint32)
    records = [
        {'status': 0, 'gt': hl.Call([0, 0]), 'qPheno': 3},
        {'status': 0, 'gt': hl.Call([0, 1]), 'qPheno': 13},
        {'status': 1, 'gt': hl.Call([0, 1]), 'qPheno': 20},
    ]
    kt = hl.Table.parallelize(records, row_type)

    with self.assertRaises(NotImplementedError):
        kt.a = 5
def test_aggregate2(self):
    """Exercise many aggregators through ``group_by(...).aggregate``.

    A two-row table (both rows with ``status == 0``) is grouped by
    ``status``; the first row of the aggregated result is converted to a
    dict and compared against fixed expected values.
    """
    row_type = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)
    records = [
        {'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
        {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13},
    ]
    kt = hl.Table.parallelize(records, row_type)

    def collect_each(elt):
        return agg.collect(elt)

    grouped = kt.group_by(status=kt.status)

    # Keyword order is preserved when the dict is splatted below, so the
    # aggregated fields come out in the order listed here.
    aggregations = {
        'x1': agg.collect(kt.qPheno * 2),
        'x2': agg.explode(collect_each, [kt.qPheno, kt.qPheno + 1]),
        'x3': agg.min(kt.qPheno),
        'x4': agg.max(kt.qPheno),
        'x5': agg.sum(kt.qPheno),
        'x6': agg.product(hl.int64(kt.qPheno)),
        'x7': agg.count(),
        'x8': agg.count_where(kt.qPheno == 3),
        'x9': agg.fraction(kt.qPheno == 1),
        'x10': agg.stats(hl.float64(kt.qPheno)),
        'x11': agg.hardy_weinberg_test(kt.GT),
        'x13': agg.inbreeding(kt.GT, 0.1),
        'x14': agg.call_stats(kt.GT, ["A", "T"]),
        'x15': agg.collect(
            hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
        'x16': agg.collect(
            hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
        'x17': agg.explode(collect_each, hl.null(hl.tarray(hl.tint32))),
        'x18': agg.explode(collect_each, hl.null(hl.tset(hl.tint32))),
        'x19': agg.take(kt.GT, 1, ordering=-kt.qPheno),
    }
    first_group = grouped.aggregate(**aggregations).take(1)[0]
    result = convert_struct_to_dict(first_group)

    expected = {
        'status': 0,
        'x1': [6, 26],
        'x2': [3, 4, 13, 14],
        'x3': 3,
        'x4': 13,
        'x5': 16,
        'x6': 39,
        'x7': 2,
        'x8': 1,
        'x9': 0.0,
        'x10': {'min': 3.0, 'max': 13.0, 'sum': 16.0,
                'stdev': 5.0, 'n': 2, 'mean': 8.0},
        'x11': {'het_freq_hwe': 0.5, 'p_value': 0.5},
        'x13': {'n_called': 2, 'expected_homs': 1.64,
                'f_stat': -1.777777777777777, 'observed_homs': 1},
        'x14': {'AC': [3, 1], 'AF': [0.75, 0.25], 'AN': 4,
                'homozygote_count': [1, 0]},
        'x15': {'a': 5, 'c': {'banana': 'apple'}, 'b': 'foo'},
        'x16': 'apple',
        'x17': [],
        'x18': [],
        'x19': [hl.Call([0, 1])],
    }

    # Show the complete dict diff if the comparison fails.
    self.maxDiff = None
    self.assertDictEqual(result, expected)
def test_variant_qc(self):
    """Check every ``hl.variant_qc`` field on a small hand-built dataset.

    The fixture has two variants across four samples: a biallelic site with
    one missing genotype and a triallelic site with all genotypes called.
    """
    field_names = ('v', 's', 'GT', 'GQ', 'DP')
    records = [
        ('1:1:A:T', '1', hl.Call([0, 0]), 10, 0),
        ('1:1:A:T', '2', hl.Call([1, 1]), 10, 5),
        ('1:1:A:T', '3', hl.Call([0, 1]), 11, 100),
        ('1:1:A:T', '4', None, None, 100),
        ('1:2:A:T,C', '1', hl.Call([1, 2]), 10, 5),
        ('1:2:A:T,C', '2', hl.Call([2, 2]), 10, 5),
        ('1:2:A:T,C', '3', hl.Call([0, 1]), 10, 5),
        ('1:2:A:T,C', '4', hl.Call([1, 1]), 10, 5),
    ]
    data = [dict(zip(field_names, record)) for record in records]

    ht = hl.Table.parallelize(
        data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
    mt = hl.variant_qc(mt, 'vqc')
    r = mt.rows().collect()

    # First collected row: the biallelic site with one missing genotype.
    biallelic = r[0].vqc
    self.assertEqual(biallelic.AF, [0.5, 0.5])
    self.assertEqual(biallelic.AC, [3, 3])
    self.assertEqual(biallelic.AN, 6)
    self.assertEqual(biallelic.homozygote_count, [1, 1])
    self.assertEqual(biallelic.n_called, 3)
    self.assertEqual(biallelic.n_not_called, 1)
    self.assertEqual(biallelic.call_rate, 0.75)
    self.assertEqual(biallelic.n_het, 1)
    self.assertEqual(biallelic.n_non_ref, 2)
    self.assertEqual(biallelic.het_freq_hwe, 0.6)
    self.assertEqual(biallelic.p_value_hwe, 0.7)
    self.assertEqual(biallelic.dp_stats.min, 0)
    self.assertEqual(biallelic.dp_stats.max, 100)
    self.assertEqual(biallelic.dp_stats.mean, 51.25)
    self.assertAlmostEqual(biallelic.dp_stats.stdev, 48.782040752719645)
    self.assertEqual(biallelic.gq_stats.min, 10)
    self.assertEqual(biallelic.gq_stats.max, 11)
    self.assertAlmostEqual(biallelic.gq_stats.mean, 10.333333333333334)
    self.assertAlmostEqual(biallelic.gq_stats.stdev, 0.47140452079103168)

    # Second collected row: the triallelic site; the HWE fields are
    # expected to be missing here.
    triallelic = r[1].vqc
    self.assertEqual(triallelic.AF, [0.125, 0.5, 0.375])
    self.assertEqual(triallelic.AC, [1, 4, 3])
    self.assertEqual(triallelic.AN, 8)
    self.assertEqual(triallelic.homozygote_count, [0, 1, 1])
    self.assertEqual(triallelic.n_called, 4)
    self.assertEqual(triallelic.n_not_called, 0)
    self.assertEqual(triallelic.call_rate, 1.0)
    self.assertEqual(triallelic.n_het, 2)
    self.assertEqual(triallelic.n_non_ref, 4)
    self.assertEqual(triallelic.p_value_hwe, None)
    self.assertEqual(triallelic.het_freq_hwe, None)
    self.assertEqual(triallelic.dp_stats.min, 5)
    self.assertEqual(triallelic.dp_stats.max, 5)
    self.assertEqual(triallelic.dp_stats.mean, 5)
    self.assertEqual(triallelic.dp_stats.stdev, 0.0)
    self.assertEqual(triallelic.gq_stats.min, 10)
    self.assertEqual(triallelic.gq_stats.max, 10)
    self.assertEqual(triallelic.gq_stats.mean, 10)
    self.assertEqual(triallelic.gq_stats.stdev, 0)
def _convert_from_json(self, x):
    """Parse the string encoding of a call into an ``hl.Call``.

    Recognised forms:

    * ``'-'``   -> call with no alleles, unphased
    * ``'|-'``  -> call with no alleles, phased
    * ``'|N'``  -> single allele ``N``, phased
    * ``'N'``   -> single allele ``N``, unphased
    * ``'A/B'`` -> two alleles, unphased
    * ``'A|B'`` -> two alleles, phased
    """
    if x == '-':
        return hl.Call([])
    if x == '|-':
        return hl.Call([], phased=True)
    if x[0] == '|':
        return hl.Call([int(x[1:])], phased=True)

    # Position of the first allele separator, if there is one.
    split_at = next(
        (pos for pos, ch in enumerate(x) if ch in '|/'),
        None,
    )
    if split_at is None:
        return hl.Call([int(x)])

    return hl.Call(
        [int(x[:split_at]), int(x[split_at + 1:])],
        phased=(x[split_at] == '|'),
    )
def test_sample_qc(self):
    """Check every ``hl.sample_qc`` field for one sample over six variants.

    The fixture mixes a haploid call, multi-allelic sites, an insertion,
    a missing genotype and missing GQ/DP values.
    """
    field_names = ('v', 's', 'GT', 'GQ', 'DP')
    records = [
        ('1:1:A:T', '1', hl.Call([0, 0]), 10, 0),
        ('1:2:A:T,C', '1', hl.Call([1]), 15, 5),
        ('1:3:A:G,C', '1', hl.Call([2, 2]), 10, 4),
        ('1:4:G:A', '1', hl.Call([0, 1]), None, 5),
        ('1:5:C:CG', '1', hl.Call([1, 1]), 20, 3),
        ('1:6:C:A', '1', None, 0, None),
    ]
    data = [dict(zip(field_names, record)) for record in records]

    ht = hl.Table.parallelize(
        data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
    mt = hl.sample_qc(mt, 'sqc')
    r = mt.cols().select('sqc').collect()

    sqc = r[0].sqc

    # Summary statistics over the defined GQ / DP values.
    self.assertAlmostEqual(sqc.gq_stats.mean, 11)
    self.assertAlmostEqual(sqc.gq_stats.stdev, 6.6332495807)
    self.assertAlmostEqual(sqc.gq_stats.min, 0)
    self.assertAlmostEqual(sqc.gq_stats.max, 20)
    self.assertAlmostEqual(sqc.dp_stats.mean, 3.399999999)
    self.assertAlmostEqual(sqc.dp_stats.stdev, 1.8547236990)
    self.assertAlmostEqual(sqc.dp_stats.min, 0)
    self.assertAlmostEqual(sqc.dp_stats.max, 5)

    # Genotype and allele-class counts.
    self.assertAlmostEqual(sqc.call_rate, 0.8333333333)
    self.assertEqual(sqc.n_called, 5)
    self.assertEqual(sqc.n_not_called, 1)
    self.assertEqual(sqc.n_hom_ref, 1)
    self.assertEqual(sqc.n_het, 1)
    self.assertEqual(sqc.n_hom_var, 3)
    self.assertEqual(sqc.n_insertion, 2)
    self.assertEqual(sqc.n_deletion, 0)
    self.assertEqual(sqc.n_singleton, 3)
    self.assertEqual(sqc.n_transition, 1)
    self.assertEqual(sqc.n_transversion, 3)
    self.assertEqual(sqc.n_star, 0)
    self.assertEqual(sqc.n_non_ref, 4)

    # Derived ratios.
    self.assertAlmostEqual(sqc.r_ti_tv, 0.333333333)
    self.assertAlmostEqual(sqc.r_het_hom_var, 0.3333333333)
    # n_deletion is 0, so the ratio is expected to be missing. Use
    # assertIsNone: assertAlmostEqual(value, None) only passes through its
    # equality shortcut and raises TypeError (not a readable assertion
    # failure) when the value is a number.
    self.assertIsNone(sqc.r_insertion_deletion)
def values(self):
    """Return ``(hail type, sample Python value)`` pairs.

    The list has one entry per type: the primitive types, a struct, the
    array/set/dict containers, and interval, locus and call.
    """
    primitives = [
        (hl.tbool, True),
        (hl.tint32, 0),
        (hl.tint64, 0),
        (hl.tfloat32, 0.5),
        (hl.tfloat64, 0.5),
        (hl.tstr, "foo"),
    ]
    composites = [
        (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
        (hl.tarray(hl.tint32), [0, 1, 4]),
        (hl.tset(hl.tint32), {0, 1, 4}),
        (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
    ]
    domain_specific = [
        (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
        (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
        (hl.tcall, hl.Call([0, 1])),
    ]
    return primitives + composites + domain_specific
def get_ldsim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=None):
    """Build a small random-genotype MatrixTable.

    One entry is generated for every (variant, sample, contig) combination,
    so each variant position appears on every contig. Each genotype is two
    independent ``randint(0, 2)`` draws (alleles 0 or 1) from a
    ``RandomState`` seeded with ``seed``.

    Returns the result of ``add_default_plink_fields`` applied to a
    MatrixTable keyed by ``locus``/``alleles`` (rows) and ``s`` (columns),
    with ``cm`` as a row field.
    """
    rng = np.random.RandomState(seed)

    # Iteration order (variant, then sample, then contig) fixes the order of
    # the RNG draws and therefore the genotypes produced for a given seed.
    entries = [
        {
            'v': f'{contig + 1}:{variant + 1}:A:C',
            's': f's{sample + 1:09d}',
            'cm': .1,
            'GT': hl.Call([rng.randint(0, 2), rng.randint(0, 2)]),
        }
        for variant in range(n_variants)
        for sample in range(n_samples)
        for contig in range(n_contigs)
    ]

    entry_type = hl.dtype('struct{v: str, s: str, cm: float64, GT: call}')
    ht = hl.Table.parallelize(entries, entry_type)
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(
        row_key=['locus', 'alleles'],
        col_key=['s'],
        row_fields=['cm'],
    )
    return add_default_plink_fields(mt)
def get_plink_sim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=0):
    """Build a small random-genotype MatrixTable with variants split over contigs.

    ``dividx(n_variants, n_contigs)`` supplies one contig index per variant;
    it must be a 1-D array of length ``n_variants``. One entry is generated
    per (variant, sample) pair, with each genotype made of two independent
    ``randint(0, 2)`` draws from a ``RandomState`` seeded with ``seed``.

    Returns the result of ``add_default_plink_fields`` applied to a
    MatrixTable keyed by ``locus``/``alleles`` (rows) and ``s`` (columns),
    with ``cm`` as a row field.
    """
    rng = np.random.RandomState(seed)
    contig_index = dividx(n_variants, n_contigs)
    assert contig_index.ndim == 1
    assert contig_index.size == n_variants

    def generate_entries():
        # Variant-major, sample-minor order fixes the RNG draw order.
        for variant in range(n_variants):
            contig = contig_index[variant]
            for sample in range(n_samples):
                yield {
                    "v": f"{contig + 1}:{variant + 1}:A:C",
                    "s": f"S{sample + 1:07d}",
                    "cm": 0.1,
                    "GT": hl.Call([rng.randint(0, 2), rng.randint(0, 2)]),
                }

    entries = list(generate_entries())

    entry_type = hl.dtype("struct{v: str, s: str, cm: float64, GT: call}")
    ht = hl.Table.parallelize(entries, entry_type)
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(
        row_key=["locus", "alleles"],
        col_key=["s"],
        row_fields=["cm"],
    )
    return add_default_plink_fields(mt)
def test_lgt_to_gt_invalid():
    """Check ``hl.vds.lgt_to_gt`` with a large global allele index (17495).

    An unphased local call ``1/1`` translated through local alleles
    ``[0, 17495]`` must evaluate to ``17495/17495``.
    """
    unphased_call = hl.call(1, 1)
    # NOTE(review): this phased call is built but never used by an assertion
    # in this block -- presumably a placeholder for a phased check; confirm.
    phased_call = hl.call(1, 1, phased=True)

    local_alleles = [0, 17495]
    translated = hl.eval(hl.vds.lgt_to_gt(unphased_call, local_alleles))
    assert translated == hl.Call([17495, 17495])
# Count, per family, variants matching a "transmitted" and an "untransmitted"
# genotype pattern across proband/father/mother entries, restricted to chr5.
# NOTE(review): `mt_filtered` is defined before this fragment; it is assumed to
# be a trio-style MatrixTable with `proband_entry`, `father_entry`,
# `mother_entry` entry structs and a `fam_id` column field -- confirm upstream.

# Keep only rows that fall inside the listed GRCh38 intervals.
intervals = ["chr5"]
mt_filtered = hl.filter_intervals(mt_filtered, [
    hl.parse_locus_interval(x, reference_genome='GRCh38') for x in intervals
])

# Split rows on the first element of info.AC: rows with AC == 2 feed the
# transmitted count, rows with AC == 1 feed the untransmitted count.
mt_trans = mt_filtered.filter_rows(mt_filtered.info.AC[0] == 2, keep=True)
mt_untrans = mt_filtered.filter_rows(mt_filtered.info.AC[0] == 1, keep=True)
#.filter_rows(mt_filtered.info.AC[0] ==, keep=True)

# Diagnostic output for the AC field.
# NOTE(review): summarize()/show() appear to display their output themselves;
# wrapping them in print() may additionally print their return value -- confirm.
print(mt_filtered.info.AC.summarize())
print(mt_filtered.info.AC.show())

#group_cols_by(mt_trans.id)
# Per fam_id: count entries where the proband is 0/1 and exactly one parent
# is 0/1 while the other parent is 0/0. The AC == 2 condition repeats the
# row filter already applied to mt_trans.
mt_trans_count = mt_trans.group_cols_by(mt_trans.fam_id).aggregate(
    transmitted_singletons_count=hl.agg.count_where(
        (mt_trans.info.AC[0] == 2) &
        (mt_trans.proband_entry.GT == hl.Call([0, 1])) &
        (((mt_trans.father_entry.GT == hl.Call([0, 1])) &
          (mt_trans.mother_entry.GT == hl.Call([0, 0]))) |
         ((mt_trans.mother_entry.GT == hl.Call([0, 1])) &
          (mt_trans.father_entry.GT == hl.Call([0, 0]))))))

#print(mt_untrans.father_entry.GT.show())
#print(mt_untrans.mother_entry.GT.show())
#print(mt_untrans.proband_entry.GT.show())
#group_cols_by(mt_untrans.id)
# Per fam_id: count entries where the proband is 0/0 and either parent is
# heterozygous; both the [0, 1] and [1, 0] allele orders are tested.
mt_untrans_count = mt_untrans.group_cols_by(mt_untrans.fam_id).aggregate(
    untransmitted_singletons_count=hl.agg.count_where(
        (mt_untrans.proband_entry.GT == hl.Call([0, 0])) &
        ((mt_untrans.father_entry.GT == hl.Call([0, 1])) |
         (mt_untrans.father_entry.GT == hl.Call([1, 0])) |
         (mt_untrans.mother_entry.GT == hl.Call([0, 1])) |
         (mt_untrans.mother_entry.GT == hl.Call([1, 0])))))
def test_concordance_n_discordant(self):
    """Check ``hl.concordance`` tables and their ``n_discordant`` counts.

    First, a dataset compared with itself must have no column with a
    non-zero ``n_discordant``. Then two small hand-built MatrixTables are
    compared and the global, per-column and per-row concordance matrices
    are checked against fixed expected values.
    """
    dataset = get_dataset()
    _, self_cols_conc, _self_rows_conc = hl.concordance(dataset, dataset)
    n_cols_with_discordance = self_cols_conc.aggregate(
        hl.agg.count_where(self_cols_conc.n_discordant != 0))
    assert n_cols_with_discordance == 0

    def entry(position, sample, call):
        # One (variant, sample) record on contig '1' with alleles A/T.
        return hl.Struct(
            locus=hl.Locus('1', position),
            alleles=['A', 'T'],
            s=sample,
            GT=call,
        )

    left_rows = [
        entry(100, '1', hl.Call([0, 0])),
        entry(100, '2', hl.Call([0, 0])),
        entry(100, '3', hl.Call([1, 1])),
        entry(100, '4', hl.Call([1, 1])),
        entry(101, '1', hl.Call([1, 1])),
    ]
    right_rows = [
        entry(100, '1', None),
        entry(100, '2', hl.Call([0, 1])),
        entry(100, '3', hl.Call([0, 1])),
        entry(100, '4', hl.Call([1, 1])),
    ]

    def make_mt(rows):
        table = hl.Table.parallelize(
            rows,
            schema='struct{locus:locus<GRCh37>,alleles:array<str>,s:str,GT:call}')
        return table.to_matrix_table(row_key=['locus', 'alleles'],
                                     col_key=['s'])

    global_conc_2, cols_conc_2, rows_conc_2 = hl.concordance(
        make_mt(left_rows), make_mt(right_rows))

    expected_cols = [
        hl.Struct(s='1',
                  concordance=[[0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 1, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [1, 0, 0, 0, 0]],
                  n_discordant=0),
        hl.Struct(s='2',
                  concordance=[[1, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0]],
                  n_discordant=1),
        hl.Struct(s='3',
                  concordance=[[1, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 0]],
                  n_discordant=1),
        hl.Struct(s='4',
                  concordance=[[1, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 1]],
                  n_discordant=0),
    ]
    assert cols_conc_2.collect() == expected_cols

    expected_global = [[3, 0, 0, 0, 0],
                       [0, 0, 0, 0, 0],
                       [0, 1, 0, 1, 0],
                       [0, 0, 0, 0, 0],
                       [1, 0, 0, 1, 1]]
    assert global_conc_2 == expected_global

    expected_rows = [
        hl.Struct(locus=hl.Locus('1', 100),
                  alleles=['A', 'T'],
                  concordance=[[0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 1, 0, 1, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 1]],
                  n_discordant=2),
        hl.Struct(locus=hl.Locus('1', 101),
                  alleles=['A', 'T'],
                  concordance=[[3, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [1, 0, 0, 0, 0]],
                  n_discordant=0),
    ]
    assert rows_conc_2.collect() == expected_rows
def test_to_dense_mt():
    """Check ``hl.vds.to_dense_mt`` on the two-sample 1kg VDS, chr22 only.

    Verifies that (1) the dense MatrixTable has the same rows as the
    variant data, (2) keeping only entries with a defined ``LA`` recovers
    the variant data, and (3) a handful of (locus, sample) entries have the
    expected values, including entries that must be missing.
    """
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_2samples_starts.vds'))
    vds = hl.vds.filter_chromosomes(vds, keep='chr22')

    entry_fields = ('LGT', 'LA', 'GQ', 'DP')
    dense = hl.vds.to_dense_mt(vds).select_entries(*entry_fields)

    assert dense.rows().select()._same(
        vds.variant_data.rows().select()
    ), "rows differ between variant data and dense mt"

    assert dense.filter_entries(hl.is_defined(dense.LA))._same(
        vds.variant_data.select_entries(*entry_fields)
    ), "cannot recover variant data"

    # Map (locus string, sample id) -> entry struct for spot checks.
    as_dict = dense.aggregate_entries(
        hl.dict(
            hl.zip(hl.agg.collect((hl.str(dense.locus), dense.s)),
                   hl.agg.collect(dense.entry))))

    def het_entry(gq, dp):
        # Entry with a 0/1 local call and local alleles [0, 1].
        return hl.Struct(LGT=hl.Call([0, 1]), LA=[0, 1], GQ=gq, DP=dp)

    def hom_ref_entry(gq, dp):
        # Entry with a 0/0 call and no local alleles (LA is missing).
        return hl.Struct(LGT=hl.Call([0, 0]), LA=None, GQ=gq, DP=dp)

    # ``None`` means the dictionary lookup itself must yield None.
    expectations = [
        (('chr22:10514784', 'NA12891'), None),
        (('chr22:10514784', 'NA12878'), het_entry(23, 4)),
        (('chr22:10516150', 'NA12891'), het_entry(64, 4)),
        (('chr22:10516150', 'NA12878'), het_entry(99, 10)),
        (('chr22:10519088', 'NA12891'), het_entry(99, 21)),
        (('chr22:10519088', 'NA12878'), None),
        (('chr22:10562435', 'NA12891'), het_entry(99, 15)),
        (('chr22:10562435', 'NA12878'), hom_ref_entry(21, 9)),
        (('chr22:10562436', 'NA12891'), het_entry(99, 15)),
        (('chr22:10562436', 'NA12878'), hom_ref_entry(21, 9)),
    ]
    for key, want in expectations:
        got = as_dict.get(key)
        if want is None:
            # Identity check: ``== None`` is non-idiomatic and defers to the
            # left operand's ``__eq__``.
            assert got is None, f"expected no entry for {key}, got {got}"
        else:
            assert got == want, f"unexpected entry for {key}: {got}"