def test_select(self):
    """Check the row fields and key fields produced by ``Table.select``."""
    row_type = hl.tstruct(
        a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
        f=hl.tarray(hl.tint32), g=hl.tstruct(x=hl.tbool, y=hl.tint32))
    data = [
        {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}},
        {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}},
        {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None},
    ]
    kt = hl.Table.parallelize(data, row_type)

    # Positional selection on the unkeyed table.
    unkeyed = kt.select(kt.a, kt.e)
    self.assertEqual(list(unkeyed.row), ['a', 'e'])
    self.assertEqual(list(unkeyed.key), [])

    # After key_by('e'), the key field comes first in the row.
    keyed = kt.key_by('e')
    keyed = keyed.select(keyed.a)
    self.assertEqual(list(keyed.row), ['e', 'a'])
    self.assertEqual(list(keyed.key), ['e'])

    # Named expression, then named expression plus a splatted struct field.
    with_foo = kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d)
    self.assertEqual(list(with_foo.row), ['a', 'foo'])
    with_foo_and_g = kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d, **kt.g)
    self.assertEqual(list(with_foo_and_g.row), ['a', 'foo', 'x', 'y'])

    # select no fields
    nothing = kt.select()
    self.assertEqual(list(nothing.row), [])
    self.assertEqual(list(nothing.key), [])
def test_annotate(self):
    """Annotate globals, rows, columns and entries, checking the resulting schemas."""
    vds = self.get_vds()
    vds = vds.annotate_globals(foo=5)
    self.assertEqual(vds.globals.dtype, hl.tstruct(foo=hl.tint32))

    vds = vds.annotate_rows(x1=agg.count(),
                            x2=agg.fraction(False),
                            x3=agg.count_where(True),
                            x4=vds.info.AC + vds.foo)
    vds = vds.annotate_cols(apple=6)
    vds = vds.annotate_cols(y1=agg.count(),
                            y2=agg.fraction(False),
                            y3=agg.count_where(True),
                            y4=vds.foo + vds.apple)

    expected_schema = hl.tstruct(s=hl.tstr, apple=hl.tint32, y1=hl.tint64,
                                 y2=hl.tfloat64, y3=hl.tint64, y4=hl.tint32)
    # The failure message labels the schemas correctly: "expected" is the
    # hand-written schema, "actual" is the dtype computed on the dataset.
    self.assertTrue(schema_eq(vds.col.dtype, expected_schema),
                    "expected: " + str(expected_schema) + "\nactual: " + str(vds.col.dtype))

    vds = vds.select_entries(z1=vds.x1 + vds.foo,
                             z2=vds.x1 + vds.y1 + vds.foo)
    self.assertTrue(schema_eq(vds.entry.dtype, hl.tstruct(z1=hl.tint64, z2=hl.tint64)))
def visit_struct(self, node, visited_children):
    """Build an ``hl.tstruct`` from the five visited children of a struct node.

    Only the fourth child (the optional field list) is used. When it is
    falsy, an empty struct type is returned; otherwise its first element is
    turned into a dict and passed as the struct's fields.
    """
    _, _, _, maybe_fields, _ = visited_children
    if maybe_fields:
        return hl.tstruct(**dict(maybe_fields[0]))
    return hl.tstruct()
def test_range_table(self):
    """Check the schema, key and collected rows of ``hl.utils.range_table``."""
    n_rows = 26
    t = hl.utils.range_table(n_rows, n_partitions=5)

    self.assertEqual(t.globals.dtype, hl.tstruct())
    self.assertEqual(t.row.dtype, hl.tstruct(idx=hl.tint32))
    self.assertEqual(t.row_value.dtype, hl.tstruct())
    self.assertEqual(list(t.key), ['idx'])

    collected_idx = [row.idx for row in t.collect()]
    self.assertEqual(collected_idx, list(range(n_rows)))
def _compute_type(self):
    """Set ``self._type`` to a matrix type derived from the child's type.

    Globals, column key, row type and row key are taken from the child
    unchanged; every column-value field type and every entry field type is
    wrapped in ``hl.tarray``.
    """
    child_typ = self.child.typ

    def as_arrays(struct_type):
        # Same field names, each field type wrapped in an array.
        return hl.tstruct(**{name: hl.tarray(typ) for name, typ in struct_type.items()})

    col_type = child_typ.col_key_type._concat(as_arrays(child_typ.col_value_type))
    entry_type = as_arrays(child_typ.entry_type)
    self._type = hl.tmatrix(
        child_typ.global_type,
        col_type,
        child_typ.col_key,
        child_typ.row_type,
        child_typ.row_key,
        entry_type)
def test_localize_entries(self):
    """Compare ``_localize_entries`` on a range matrix table with a hand-built table."""
    n_rows, n_cols = 8, 6

    ref_schema = hl.tstruct(row_idx=hl.tint32, __entries=hl.tarray(hl.tstruct(v=hl.tint32)))
    ref_data = [
        {'row_idx': i, '__entries': [{'v': i + j} for j in range(n_cols)]}
        for i in range(n_rows)
    ]
    ref_tab = hl.Table.parallelize(ref_data, ref_schema).key_by('row_idx')
    ref_tab = ref_tab.select_globals(__cols=[hl.struct(col_idx=i) for i in range(n_cols)])

    mt = hl.utils.range_matrix_table(n_rows, n_cols)
    mt = mt.annotate_entries(v=mt.row_idx + mt.col_idx)
    localized = mt._localize_entries('__entries', '__cols')
    self.assertTrue(localized._same(ref_tab))
def test_transmute(self):
    """Check the resulting dtypes of the four ``transmute_*`` matrix-table methods."""
    mt = hl.utils.range_matrix_table(1, 1)
    mt = mt.annotate_globals(g1=0, g2=0)
    mt = mt.annotate_cols(c1=0, c2=0)
    mt = mt.annotate_rows(r1=0, r2=0)
    mt = mt.annotate_entries(e1=0, e2=0)

    globals_type = mt.transmute_globals(g3=mt.g2 + 1).globals.dtype
    self.assertEqual(globals_type, hl.tstruct(g1=hl.tint, g3=hl.tint))

    row_value_type = mt.transmute_rows(r3=mt.r2 + 1).row_value.dtype
    self.assertEqual(row_value_type, hl.tstruct(r1=hl.tint, r3=hl.tint))

    col_value_type = mt.transmute_cols(c3=mt.c2 + 1).col_value.dtype
    self.assertEqual(col_value_type, hl.tstruct(c1=hl.tint, c3=hl.tint))

    entry_type = mt.transmute_entries(e3=mt.e2 + 1).entry.dtype
    self.assertEqual(entry_type, hl.tstruct(e1=hl.tint, e3=hl.tint))
def test_reference_genome_liftover(self):
    """Exercise locus and interval liftover between GRCh37 and GRCh38.

    The test adds a liftover chain file to each reference genome object
    returned by ``hl.get_reference`` and removes both at the end. The removal
    runs in a ``finally`` block so that a failing assertion does not leave
    the liftovers attached to the reference objects.
    """
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')

    self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
    grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
    try:
        self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

        # Round trip: GRCh37 -> GRCh38 -> GRCh37 must give back the original locus.
        ds = hl.import_vcf(resource('sample.vcf'))
        t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
        self.assertTrue(t.all(t.locus == t.liftover))

        null_locus = hl.null(hl.tlocus('GRCh38'))

        rows = [
            {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
            {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
            {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
            {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
            {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
            {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
            {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
        ]
        schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
        t = hl.Table.parallelize(rows, schema)
        # Where an expected GRCh38 locus is given the liftover must equal it;
        # where it is missing the liftover must be missing too.
        self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                      hl.liftover(t.l37, 'GRCh38') == t.l38,
                                      hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

        t = t.filter(hl.is_defined(t.l38))
        self.assertEqual(t.count(), 6)

        t = t.key_by('l38')
        t.count()
        self.assertEqual(list(t.key), ['l38'])

        null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
        rows = [
            {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
            {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
             'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
        ]
        schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
    finally:
        # Always detach the chain files, even if an assertion above failed.
        grch37.remove_liftover("GRCh38")
        grch38.remove_liftover("GRCh37")
def test_range_matrix_table(self):
    """Check schemas, keys and collected indices of ``hl.utils.range_matrix_table``."""
    n_rows, n_cols = 13, 7
    mt = hl.utils.range_matrix_table(n_rows, n_cols, n_partitions=5)

    self.assertEqual(mt.globals.dtype, hl.tstruct())
    self.assertEqual(mt.row.dtype, hl.tstruct(row_idx=hl.tint32))
    self.assertEqual(mt.col.dtype, hl.tstruct(col_idx=hl.tint32))
    self.assertEqual(mt.entry.dtype, hl.tstruct())

    self.assertEqual(list(mt.row_key), ['row_idx'])
    self.assertEqual(list(mt.col_key), ['col_idx'])

    row_indices = [row.row_idx for row in mt.rows().collect()]
    self.assertEqual(row_indices, list(range(n_rows)))
    col_indices = [col.col_idx for col in mt.cols().collect()]
    self.assertEqual(col_indices, list(range(n_cols)))
def test_localize_self_join(self):
    """Outer self-join of a localized matrix table must match the same join on a reference table."""
    n_rows, n_cols = 8, 6

    ref_schema = hl.tstruct(row_idx=hl.tint32, __entries=hl.tarray(hl.tstruct(v=hl.tint32)))
    ref_data = [
        {'row_idx': i, '__entries': [{'v': i + j} for j in range(n_cols)]}
        for i in range(n_rows)
    ]
    ref_tab = hl.Table.parallelize(ref_data, ref_schema).key_by('row_idx')
    ref_tab = ref_tab.join(ref_tab, how='outer')

    mt = hl.utils.range_matrix_table(n_rows, n_cols)
    mt = mt.annotate_entries(v=mt.row_idx + mt.col_idx)
    localized = mt._localize_entries('__entries', '__cols').drop('__cols')
    localized = localized.join(localized, how='outer')
    self.assertTrue(localized._same(ref_tab))
def test_maximal_independent_set(self):
    """Check result, dtypes and argument validation of ``hl.maximal_independent_set``."""
    # prefer to remove nodes with higher index
    t = hl.utils.range_table(10)
    graph = t.select(i=hl.int64(t.idx), j=hl.int64(t.idx + 10), bad_type=hl.float32(t.idx))

    mis_table = hl.maximal_independent_set(graph.i, graph.j, True, lambda l, r: l - r)
    kept_nodes = sorted(row['node'] for row in mis_table.collect())
    self.assertEqual(kept_nodes, list(range(0, 10)))

    node_struct = hl.tstruct(node=hl.tint64)
    self.assertEqual(mis_table.row.dtype, node_struct)
    self.assertEqual(mis_table.key.dtype, node_struct)

    # Each of these argument combinations must raise ValueError.
    with self.assertRaises(ValueError):
        hl.maximal_independent_set(graph.i, graph.bad_type, True)
    with self.assertRaises(ValueError):
        hl.maximal_independent_set(graph.i, hl.utils.range_table(10).idx, True)
    with self.assertRaises(ValueError):
        hl.maximal_independent_set(hl.literal(1), hl.literal(2), True)
def test_transmute(self):
    """``Table.transmute`` adds ``h`` and drops the fields referenced by its expression."""
    row_type = hl.tstruct(
        a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
        f=hl.tarray(hl.tint32), g=hl.tstruct(x=hl.tbool, y=hl.tint32))
    data = [
        {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}},
        {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}},
        {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None},
    ]
    df = hl.Table.parallelize(data, row_type)
    df = df.transmute(h=df.a + df.b + df.c + df.g.y)
    collected = df.select('h').collect()

    self.assertEqual(list(df.row), ['d', 'e', 'f', 'h'])
    # The third input row has g=None, so its expected h is None.
    expected = [hl.Struct(h=10), hl.Struct(h=20), hl.Struct(h=None)]
    self.assertEqual(collected, expected)
def test_maximal_independent_set2(self):
    """Run ``maximal_independent_set`` on a 12-edge graph and check the result is an accepted set."""
    edge_pairs = [(0, 4), (0, 1), (0, 2), (1, 5), (1, 3), (2, 3), (2, 6),
                  (3, 7), (4, 5), (4, 6), (5, 7), (6, 7)]
    edge_rows = [{"i": left, "j": right} for left, right in edge_pairs]

    t = hl.Table.parallelize(edge_rows, hl.tstruct(i=hl.tint64, j=hl.tint64))
    mis_t = hl.maximal_independent_set(t.i, t.j)

    self.assertTrue(mis_t.row.dtype == hl.tstruct(node=hl.tint64)
                    and mis_t.globals.dtype == hl.tstruct())

    mis = {row.node for row in mis_t.collect()}

    maximal_indep_sets = [{0, 6, 5, 3}, {1, 4, 7, 2}]
    non_maximal_indep_sets = [{0, 7}, {6, 1}]
    accepted = non_maximal_indep_sets + maximal_indep_sets
    self.assertTrue(mis in accepted)
def test_group_cols_by_aggregate(self):
    """Chain ``aggregate_cols`` / ``aggregate_entries`` on grouped columns and compare entries."""
    mt, mt2 = self.get_groupable_matrix2()

    grouped = mt.group_cols_by(group=mt2.cols()[mt.col_idx].col_idx2 < 2)
    grouped = grouped.aggregate_cols(collect=hl.agg.collect(mt.col_idx))
    grouped = grouped.aggregate_cols(count=hl.agg.count())
    # tests fixed indices
    grouped = grouped.aggregate_entries(
        sum=hl.agg.sum(mt2[mt.row_idx, mt.col_idx].x + mt.glob) + mt.glob - 15 - mt.row_idx)
    grouped = grouped.aggregate_entries(x=5)
    col_result = grouped.result()

    expected_rows = [
        {'group': True, 'row_idx': 0, 'sum': 1, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
        {'group': True, 'row_idx': 1, 'sum': 2, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
        {'group': True, 'row_idx': 2, 'sum': 3, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
        {'group': True, 'row_idx': 3, 'sum': 4, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
        {'group': False, 'row_idx': 0, 'sum': 5, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
        {'group': False, 'row_idx': 1, 'sum': 6, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
        {'group': False, 'row_idx': 2, 'sum': 7, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
        {'group': False, 'row_idx': 3, 'sum': 8, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
    ]
    expected_schema = hl.tstruct(row_idx=hl.tint32, r1=hl.tint32, group=hl.tbool,
                                 collect=hl.tarray(hl.tint32), count=hl.tint64,
                                 sum=hl.tint64, x=hl.tint32)
    col_expected = hl.Table.parallelize(expected_rows, expected_schema)
    col_expected = col_expected.annotate_globals(glob=5).key_by('row_idx', 'group')

    self.assertTrue(col_result.entries()._same(col_expected))
def test_import_gen_no_reference_specified(self):
    """Import a GEN file with ``reference_genome=None`` and check locus type and row count.

    The locus dtype is compared with ``assertEqual`` so that a failure
    reports both dtypes instead of just "False is not true".
    """
    gen = hl.import_gen(resource('example.gen'),
                        sample_file=resource('example.sample'),
                        reference_genome=None)

    self.assertEqual(gen.locus.dtype, hl.tstruct(contig=hl.tstr, position=hl.tint32))
    self.assertEqual(gen.count_rows(), 199)
def test_index_maintains_count(self):
    """Annotating ``t1`` by indexing ``t2`` (duplicate keys on both sides) keeps ``t1``'s row count."""
    left_rows = [
        {'a': 'foo', 'b': 1},
        {'a': 'bar', 'b': 2},
        {'a': 'bar', 'b': 2},
    ]
    t1 = hl.Table.parallelize(left_rows, hl.tstruct(a=hl.tstr, b=hl.tint32), key='a')

    right_rows = [
        {'t': 'foo', 'x': 3.14},
        {'t': 'bar', 'x': 2.78},
        {'t': 'bar', 'x': -1},
        {'t': 'quam', 'x': 0},
    ]
    t2 = hl.Table.parallelize(right_rows, hl.tstruct(t=hl.tstr, x=hl.tfloat64), key='t')

    annotated = t1.annotate(f=t2[t1.a].x)
    self.assertEqual(annotated.count(), t1.count())
def _compute_type(self):
    """Set ``self._type`` to a table type built from the first child's type.

    The globals are a struct with one field, ``self.global_name``, holding an
    array of the child's global type. The row is the child's key type with
    one inserted field, ``self.data_name``, holding an array of the child's
    value type. The key is the child's ``row_key``.
    """
    # Access .typ on every child before using the first one (the original
    # code marks this loop "force").
    for child in self.children:
        child.typ  # force
    first_typ = self.children[0].typ

    globals_type = hl.tstruct(**{self.global_name: hl.tarray(first_typ.global_type)})
    row_type = first_typ.key_type._insert_field(self.data_name, hl.tarray(first_typ.value_type))
    self._type = hl.ttable(globals_type, row_type, first_typ.row_key)
def test_import_bgen_dosage_entry(self):
    """Index and import a BGEN file with only the ``dosage`` entry field."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    mt = hl.import_bgen(resource('example.8bits.bgen'), entry_fields=['dosage'])

    expected_entry_type = hl.tstruct(dosage=hl.tfloat64)
    self.assertEqual(mt.entry.dtype, expected_entry_type)
    self.assertEqual(mt.count_rows(), 199)
def test_import_bgen_GT_GP_entries(self):
    """Index and import a BGEN file with the ``GT`` and ``GP`` entry fields."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    mt = hl.import_bgen(resource('example.8bits.bgen'),
                        entry_fields=['GT', 'GP'],
                        sample_file=resource('example.sample'))

    expected_entry_type = hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64))
    self.assertEqual(mt.entry.dtype, expected_entry_type)
def test_import_bgen_no_reference(self):
    """Index a BGEN file with ``reference_genome=None`` and check the locus is a plain struct."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome=None)

    mt = hl.import_bgen(resource('example.8bits.bgen'),
                        entry_fields=['GT', 'GP', 'dosage'])

    expected_locus_type = hl.tstruct(contig=hl.tstr, position=hl.tint32)
    self.assertEqual(mt.locus.dtype, expected_locus_type)
    self.assertEqual(mt.count_rows(), 199)
def test_make_table_empty_entry_field(self):
    """``make_table`` with an entry field named ``''`` yields row fields ``'0'`` and ``'1'``."""
    mt = hl.utils.range_matrix_table(3, 2)
    # The single entry field is named with the empty string.
    mt = mt.select_entries(**{'': mt.row_idx * mt.col_idx})
    mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

    table = mt.make_table()

    expected_fields = {'row_idx': hl.tint32, '0': hl.tint32, '1': hl.tint32}
    self.assertEqual(table.row.dtype, hl.tstruct(**expected_fields))
def table_irs(self):
    """Return a list of table IR nodes built on one table read and one matrix read.

    The two unused locals of the original (a parsed row type and a
    ``TableRange`` bound to the name ``range``, which shadowed the builtin)
    are dropped; the returned list is unchanged.
    """
    b = ir.TrueIR()
    table_read = ir.TableRead(
        ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht')), False)
    matrix_read = ir.MatrixRead(
        ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')), False, False)

    table_irs = [
        ir.TableKeyBy(table_read, ['m', 'd'], False),
        ir.TableFilter(table_read, b),
        table_read,
        ir.MatrixColsTable(matrix_read),
        ir.TableAggregateByKey(
            table_read,
            ir.MakeStruct([('a', ir.I32(5))])),
        ir.TableKeyByAndAggregate(
            table_read,
            ir.MakeStruct([('a', ir.I32(5))]),
            ir.MakeStruct([('b', ir.I32(5))]),
            1, 2),
        ir.TableJoin(
            table_read,
            ir.TableRange(100, 10),
            'inner',
            1),
        ir.MatrixEntriesTable(matrix_read),
        ir.MatrixRowsTable(matrix_read),
        ir.TableParallelize(ir.MakeStruct([
            ('rows', ir.Literal(hl.tarray(hl.tstruct(a=hl.tint32)), [{'a': None}, {'a': 5}, {'a': -3}])),
            ('global', ir.MakeStruct([]))]), None),
        ir.TableMapRows(
            ir.TableKeyBy(table_read, []),
            ir.MakeStruct([
                ('a', ir.GetField(ir.Ref('row'), 'f32')),
                ('b', ir.F64(-2.11))])),
        ir.TableMapGlobals(
            table_read,
            ir.MakeStruct([
                ('foo', ir.NA(hl.tarray(hl.tint32)))])),
        ir.TableRange(100, 10),
        ir.TableRepartition(table_read, 10, ir.RepartitionStrategy.COALESCE),
        ir.TableUnion(
            [ir.TableRange(100, 10), ir.TableRange(50, 10)]),
        ir.TableExplode(table_read, ['mset']),
        ir.TableHead(table_read, 10),
        ir.TableOrderBy(ir.TableKeyBy(table_read, []), [('m', 'A'), ('m', 'D')]),
        ir.TableDistinct(table_read),
        ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
        ir.TableRename(table_read, {'idx': 'idx_foo'}, {'global_f32': 'global_foo'}),
        ir.TableMultiWayZipJoin([table_read, table_read], '__data', '__globals'),
        ir.MatrixToTableApply(matrix_read, {'name': 'LinearRegressionRowsSingle',
                                            'yFields': ['col_m'],
                                            'xField': 'entry_m',
                                            'covFields': [],
                                            'rowBlockSize': 10,
                                            'passThrough': []}),
        ir.TableToTableApply(table_read, {'name': 'TableFilterPartitions',
                                          'parts': [0],
                                          'keep': True})
    ]

    return table_irs
def test_aggregate_by_key_partitioning(self):
    """Group a keyed table by its key and check the per-key mean of ``b``."""
    rows = [
        {'k': 'foo', 'b': 1},
        {'k': 'bar', 'b': 2},
        {'k': 'bar', 'b': 2},
    ]
    ht1 = hl.Table.parallelize(rows, hl.tstruct(k=hl.tstr, b=hl.tint32), key='k')

    aggregated = ht1.group_by('k').aggregate(mean_b=hl.agg.mean(ht1.b))
    observed = set(aggregated.collect())

    expected = {hl.Struct(k='foo', mean_b=1.0), hl.Struct(k='bar', mean_b=2.0)}
    self.assertEqual(observed, expected)
def test_import_bgen_no_entries(self):
    """Import a BGEN file with no entry fields, then typecheck the underlying dataset."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    mt = hl.import_bgen(resource('example.8bits.bgen'),
                        entry_fields=[],
                        sample_file=resource('example.sample'))

    self.assertEqual(mt.entry.dtype, hl.tstruct())
    mt._jvds.typecheck()
def test_joins_work_correctly(self):
    """Aggregate grouped columns and grouped rows using joins against a second matrix table."""
    mt, mt2 = self.get_groupable_matrix2()

    # --- group columns ---
    cols_grouped = mt.group_cols_by(group=mt2.cols()[mt.col_idx].col_idx2 < 2)
    col_result = cols_grouped.aggregate(
        sum=hl.agg.sum(mt2[mt.row_idx, mt.col_idx].x + mt.glob) + mt.glob - 15)
    col_result = col_result.drop('r1')

    col_expected_rows = [
        {'row_idx': 0, 'group': True, 'sum': 1},
        {'row_idx': 0, 'group': False, 'sum': 5},
        {'row_idx': 1, 'group': True, 'sum': 3},
        {'row_idx': 1, 'group': False, 'sum': 7},
        {'row_idx': 2, 'group': True, 'sum': 5},
        {'row_idx': 2, 'group': False, 'sum': 9},
        {'row_idx': 3, 'group': True, 'sum': 7},
        {'row_idx': 3, 'group': False, 'sum': 11},
    ]
    col_expected = hl.Table.parallelize(
        col_expected_rows,
        hl.tstruct(row_idx=hl.tint32, group=hl.tbool, sum=hl.tint64))
    col_expected = col_expected.annotate_globals(glob=5).key_by('row_idx', 'group')

    self.assertTrue(col_result.entries()._same(col_expected))

    # --- group rows ---
    rows_grouped = mt.group_rows_by(group=mt2.rows()[mt.row_idx].row_idx2 < 2)
    row_result = rows_grouped.aggregate(
        sum=hl.agg.sum(mt2[mt.row_idx, mt.col_idx].x + mt.glob) + mt.glob - 15)
    row_result = row_result.drop('c1')

    row_expected_rows = [
        {'group': True, 'col_idx': 0, 'sum': 1},
        {'group': True, 'col_idx': 1, 'sum': 3},
        {'group': True, 'col_idx': 2, 'sum': 5},
        {'group': True, 'col_idx': 3, 'sum': 7},
        {'group': False, 'col_idx': 0, 'sum': 5},
        {'group': False, 'col_idx': 1, 'sum': 7},
        {'group': False, 'col_idx': 2, 'sum': 9},
        {'group': False, 'col_idx': 3, 'sum': 11},
    ]
    row_expected = hl.Table.parallelize(
        row_expected_rows,
        hl.tstruct(group=hl.tbool, col_idx=hl.tint32, sum=hl.tint64))
    row_expected = row_expected.annotate_globals(glob=5).key_by('group', 'col_idx')

    self.assertTrue(row_result.entries()._same(row_expected))
def test_maximal_independent_set3(self):
    """``maximal_independent_set`` on struct nodes with a tie breaker based on ``is_case``."""
    is_case = {"A", "C", "E", "G", "H"}
    edge_pairs = [("A", "B"), ("C", "D"), ("E", "F"), ("G", "H")]
    edge_rows = [
        {"i": {"id": left, "is_case": left in is_case},
         "j": {"id": right, "is_case": right in is_case}}
        for left, right in edge_pairs
    ]

    node_type = hl.tstruct(id=hl.tstr, is_case=hl.tbool)
    t = hl.Table.parallelize(edge_rows, hl.tstruct(i=node_type, j=node_type))

    def tiebreaker(l, r):
        # -1 when only the left node is a case, 1 when only the right node
        # is a case, 0 otherwise.
        return (hl.case()
                .when(l.is_case & (~r.is_case), -1)
                .when(~(l.is_case) & r.is_case, 1)
                .default(0))

    mis = hl.maximal_independent_set(t.i, t.j, tie_breaker=tiebreaker)

    expected_sets = [{"A", "C", "E", "G"}, {"A", "C", "E", "H"}]

    self.assertTrue(mis.all(mis.node.is_case))
    selected_ids = {row.id for row in mis.select(mis.node.id).collect()}
    self.assertTrue(selected_ids in expected_sets)
def test_import_vcf_missing_info_field_elements(self):
    """Import a VCF with ``array_elements_required=False`` and compare two INFO array fields."""
    mt = hl.import_vcf(resource('missingInfoArray.vcf'),
                       reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)

    expected_rows = [
        {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
         'FOO': [1, None], 'BAR': [2, None, None]},
        {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
         'FOO': [None, 2, None], 'BAR': [None, 1.0, None]},
    ]
    expected_schema = hl.tstruct(locus=hl.tlocus('GRCh37'),
                                 alleles=hl.tarray(hl.tstr),
                                 FOO=hl.tarray(hl.tint),
                                 BAR=hl.tarray(hl.tfloat64))
    expected = hl.Table.parallelize(expected_rows, expected_schema, key=['locus', 'alleles'])

    self.assertTrue(mt.rows()._same(expected))
def test_computed_key_join_1(self):
    """Join a small keyed table onto rows via a computed key (locus position mod 2)."""
    ds = self.get_vds()

    lookup_rows = [
        {'key': 0, 'value': True},
        {'key': 1, 'value': False},
    ]
    kt = hl.Table.parallelize(lookup_rows,
                              hl.tstruct(key=hl.tint32, value=hl.tbool),
                              key=['key'])

    ds = ds.annotate_rows(key=ds.locus.position % 2)
    ds = ds.annotate_rows(value=kt[ds['key']]['value'])
    rt = ds.rows()

    # The joined value must be True exactly when the position is even.
    position_is_even = (rt.locus.position % 2) == 0
    self.assertTrue(rt.all(position_is_even == rt['value']))
def test_multi_way_zip_join(self):
    """Zip-join three tables keyed by ``id`` and compare against hand-built results."""
    d1 = [{"id": 0, "name": "a", "data": 0.0},
          {"id": 1, "name": "b", "data": 3.14},
          {"id": 2, "name": "c", "data": 2.78}]
    d2 = [{"id": 0, "name": "d", "data": 1.1},
          {"id": 0, "name": "x", "data": 2.2},
          {"id": 2, "name": "v", "data": 7.89}]
    d3 = [{"id": 1, "name": "f", "data": 9.99},
          {"id": 2, "name": "g", "data": -1.0},
          {"id": 3, "name": "z", "data": 0.01}]
    input_schema = hl.tstruct(id=hl.tint32, name=hl.tstr, data=hl.tfloat64)
    ts = [hl.Table.parallelize(rows, schema=input_schema, key='id') for rows in [d1, d2, d3]]

    joined = hl.Table._multi_way_zip_join(ts, '__data', '__globals').drop('__globals')

    dexpected = [
        {"id": 0, "__data": [{"name": "a", "data": 0.0},
                             {"name": "d", "data": 1.1},
                             None]},
        {"id": 0, "__data": [None,
                             {"name": "x", "data": 2.2},
                             None]},
        {"id": 1, "__data": [{"name": "b", "data": 3.14},
                             None,
                             {"name": "f", "data": 9.99}]},
        {"id": 2, "__data": [{"name": "c", "data": 2.78},
                             {"name": "v", "data": 7.89},
                             {"name": "g", "data": -1.0}]},
        {"id": 3, "__data": [None,
                             None,
                             {"name": "z", "data": 0.01}]},
    ]
    expected = hl.Table.parallelize(
        dexpected,
        schema=hl.tstruct(id=hl.tint32,
                          __data=hl.tarray(hl.tstruct(name=hl.tstr, data=hl.tfloat64))),
        key='id')
    self.assertTrue(expected._same(joined))

    # Same join with output names 'data' / 'globals'.
    expected2 = expected.transmute(data=expected['__data'])
    joined_same_name = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('globals')
    self.assertTrue(expected2._same(joined_same_name))

    # Dropping both output fields still leaves five rows.
    joined_nothing = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('data', 'globals')
    self.assertEqual(joined_nothing._force_count(), 5)
def test_parses(self):
    """Parse the string form of every IR from ``self.value_irs()`` with the JVM IR parser."""
    hail_types = {
        'c': hl.tbool,
        'a': hl.tarray(hl.tint32),
        'aa': hl.tarray(hl.tarray(hl.tint32)),
        'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
        'v': hl.tint32,
        's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
        't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
        'call': hl.tcall,
        'x': hl.tint32,
    }
    # The parser is given each type's ``_jtype`` rather than the Python type object.
    env = {name: typ._jtype for name, typ in hail_types.items()}

    for value_ir in self.value_irs():
        Env.hail().expr.ir.IRParser.parse_value_ir(str(value_ir), env, {})
def test_select(self):
    """Verify row field order and key fields for several ``Table.select`` calls."""
    schema = hl.tstruct(a=hl.tint32,
                        b=hl.tint32,
                        c=hl.tint32,
                        d=hl.tint32,
                        e=hl.tstr,
                        f=hl.tarray(hl.tint32),
                        g=hl.tstruct(x=hl.tbool, y=hl.tint32))
    first = {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}}
    second = {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}}
    third = {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None}
    kt = hl.Table.parallelize([first, second, third], schema)

    t1 = kt.select(kt.a, kt.e)
    self.assertEqual(list(t1.row), ['a', 'e'])
    self.assertEqual(list(t1.key), [])

    t2 = kt.key_by('e')
    t2 = t2.select(t2.a)
    self.assertEqual(list(t2.row), ['e', 'a'])
    self.assertEqual(list(t2.key), ['e'])

    fields_with_foo = list(kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d).row)
    self.assertEqual(fields_with_foo, ['a', 'foo'])

    fields_with_foo_and_g = list(kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d, **kt.g).row)
    self.assertEqual(fields_with_foo_and_g, ['a', 'foo', 'x', 'y'])

    # select no fields
    s = kt.select()
    self.assertEqual(list(s.row), [])
    self.assertEqual(list(s.key), [])
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the Hail objects the doctests refer to.

    Imports data from the ``data/`` directory, checkpoints some datasets
    under ``output/`` and stores tables, matrix tables and expressions in
    ``doctest_namespace`` under fixed names.
    """
    doctest_namespace['hl'] = hl

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # Plain string: the original f-string had no placeholders.
    ds = ds.checkpoint('output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    # Plain string: the original f-string had no placeholders.
    burden_ds = burden_ds.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    # Each entry string is split on "," into chi_squared and n.
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
def main():
    """Build a genes table from Gencode, HGNC and (optionally) MANE Select data and write it.

    Command-line arguments:
      --gencode VERSION GTF_PATH CANONICAL_TRANSCRIPTS_PATH  (repeatable, required)
      --hgnc PATH                      (required; it is read unconditionally)
      --mane-select-transcripts PATH   (optional)
      --min-partitions N               (default 32)
      --output PATH                    (required)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gencode",
        action="append",
        default=[],
        metavar=("version", "gtf_path", "canonical_transcripts_path"),
        nargs=3,
        required=True,
    )
    # The HGNC file is imported unconditionally below, so it is marked
    # required: a missing path fails at argument parsing, not in import_table.
    parser.add_argument("--hgnc", required=True)
    parser.add_argument("--mane-select-transcripts")
    parser.add_argument("--min-partitions", type=int, default=32)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    genes = None

    all_gencode_versions = [
        gencode_version for gencode_version, _, _ in args.gencode
    ]

    for gencode_version, gtf_path, canonical_transcripts_path in args.gencode:
        gencode_genes = load_gencode_gene_models(
            gtf_path, min_partitions=args.min_partitions)

        # Canonical transcripts file is a TSV with two columns: gene ID and transcript ID and no header row
        # NOTE(review): import_table is called without no_header here while the
        # note above says the file has no header row - confirm the file format.
        canonical_transcripts = hl.import_table(
            canonical_transcripts_path,
            key="gene_id",
            min_partitions=args.min_partitions)
        gencode_genes = gencode_genes.annotate(
            canonical_transcript_id=canonical_transcripts[
                gencode_genes.gene_id].transcript_id)

        # Nest this version's fields under a single "v<version>" struct.
        gencode_genes = gencode_genes.select(
            **{f"v{gencode_version}": gencode_genes.row_value})

        # Test for "no table yet" explicitly instead of relying on the
        # truthiness of a Table object.
        if genes is None:
            genes = gencode_genes
        else:
            genes = genes.join(gencode_genes, "outer")

    genes = genes.select(gencode=genes.row_value)

    hgnc = hl.import_table(args.hgnc, missing="")

    hgnc = hgnc.select(
        hgnc_id=hgnc["HGNC ID"],
        symbol=hgnc["Approved symbol"],
        name=hgnc["Approved name"],
        previous_symbols=hgnc["Previous symbols"],
        alias_symbols=hgnc["Alias symbols"],
        omim_id=hgnc["OMIM ID(supplied by OMIM)"],
        gene_id=hl.or_else(hgnc["Ensembl gene ID"],
                           hgnc["Ensembl ID(supplied by Ensembl)"]),
    )
    hgnc = hgnc.filter(hl.is_defined(hgnc.gene_id)).key_by("gene_id")
    hgnc = hgnc.annotate(
        previous_symbols=hl.cond(
            hgnc.previous_symbols == "",
            hl.empty_array(hl.tstr),
            hgnc.previous_symbols.split(",").map(lambda s: s.strip()),
        ),
        alias_symbols=hl.cond(
            hgnc.alias_symbols == "",
            hl.empty_array(hl.tstr),
            hgnc.alias_symbols.split(",").map(lambda s: s.strip())),
    )

    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol),
                                                 "hgnc", hl.null(hl.tstr)))

    # If an HGNC gene symbol was not present, use the symbol from Gencode
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(
            symbol=hl.or_else(
                genes.symbol,
                genes.gencode[f"v{gencode_version}"].gene_symbol),
            symbol_source=hl.cond(
                hl.is_missing(genes.symbol) & hl.is_defined(
                    genes.gencode[f"v{gencode_version}"].gene_symbol),
                f"gencode (v{gencode_version})",
                genes.symbol_source,
            ),
        )

    # Collect all fields that can be used to search by gene name
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.empty_array(hl.tstr).append(genes.symbol).extend(
            genes.previous_symbols).extend(genes.alias_symbols),
    )
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(search_terms=hl.rbind(
            genes.gencode[f"v{gencode_version}"].gene_symbol,
            lambda symbol_in_gencode: hl.cond(
                hl.is_defined(symbol_in_gencode),
                genes.search_terms.append(symbol_in_gencode),
                genes.search_terms),
        ))

    # Upper-case every term and convert the array to a set.
    genes = genes.annotate(
        search_terms=hl.set(genes.search_terms.map(lambda s: s.upper())))

    if args.mane_select_transcripts:
        mane_select_transcripts = hl.import_table(args.mane_select_transcripts,
                                                  force=True)

        # Each identifier is split on "." into its ID and version parts.
        mane_select_transcripts = mane_select_transcripts.select(
            gene_id=mane_select_transcripts.Ensembl_Gene.split("\\.")[0],
            matched_gene_version=mane_select_transcripts.Ensembl_Gene.split(
                "\\.")[1],
            ensembl_id=mane_select_transcripts.Ensembl_nuc.split("\\.")[0],
            ensembl_version=mane_select_transcripts.Ensembl_nuc.split("\\.")[1],
            refseq_id=mane_select_transcripts.RefSeq_nuc.split("\\.")[0],
            refseq_version=mane_select_transcripts.RefSeq_nuc.split("\\.")[1],
        )
        mane_select_transcripts = mane_select_transcripts.key_by("gene_id")

        # For GRCh38 (Gencode >= 20) transcripts, use the MANE Select transcripts to annotate transcripts
        # with their matching RefSeq transcript.
        ensembl_to_refseq_map = {}
        for transcript in mane_select_transcripts.collect():
            ensembl_to_refseq_map[transcript.ensembl_id] = {
                transcript.ensembl_version:
                hl.Struct(refseq_id=transcript.refseq_id,
                          refseq_version=transcript.refseq_version)
            }

        ensembl_to_refseq_map = hl.literal(ensembl_to_refseq_map)

        # NOTE(review): these versions are hard-coded rather than taken from
        # all_gencode_versions, so this step needs both v19 and v29 to have
        # been passed via --gencode - confirm that is intended.
        for gencode_version in ["19", "29"]:
            if int(gencode_version) >= 20:
                transcript_annotation = lambda transcript: transcript.annotate(
                    **ensembl_to_refseq_map.get(
                        transcript.transcript_id,
                        hl.empty_dict(
                            hl.tstr,
                            hl.tstruct(refseq_id=hl.tstr,
                                       refseq_version=hl.tstr)),
                    ).get(
                        transcript.transcript_version,
                        hl.struct(refseq_id=hl.null(hl.tstr),
                                  refseq_version=hl.null(hl.tstr)),
                    ))
            else:
                transcript_annotation = lambda transcript: transcript.annotate(
                    refseq_id=hl.null(hl.tstr),
                    refseq_version=hl.null(hl.tstr))

            genes = genes.annotate(gencode=genes.gencode.annotate(
                **{
                    f"v{gencode_version}":
                    genes.gencode[f"v{gencode_version}"].annotate(
                        transcripts=genes.gencode[f"v{gencode_version}"].
                        transcripts.map(transcript_annotation))
                }))

        # Annotate genes with their MANE Select transcript
        genes = genes.annotate(
            mane_select_transcript=mane_select_transcripts[genes.gene_id])

    genes.describe()

    genes.write(args.output, overwrite=True)
def _compute_type(self):
    """Set ``self._type`` to the table type produced by the configured method.

    The method is selected by ``self.config['name']``.  The regression
    methods all produce a table with the child's globals, keyed by the
    child's row key, whose row is the row key plus the configured
    pass-through fields plus a method-specific result schema.  ``Skat``,
    ``PCA`` and ``LocalLDPrune`` each build their own table type.
    """
    name = self.config['name']
    child_typ = self.child.typ

    # Result schema per regression method.  The values are thunks so that the
    # schema is only built for the selected method, after 'passThrough' has
    # been read.
    regression_schemas = {
        'LinearRegressionRowsChained': lambda: hl.dtype(
            'struct{n:array<int32>,sum_x:array<float64>,y_transpose_x:array<array<float64>>,beta:array<array<float64>>,standard_error:array<array<float64>>,t_stat:array<array<float64>>,p_value:array<array<float64>>}'),
        'LinearRegressionRowsSingle': lambda: hl.dtype(
            'struct{n:int32,sum_x:float64,y_transpose_x:array<float64>,beta:array<float64>,standard_error:array<float64>,t_stat:array<float64>,p_value:array<float64>}'),
        'LogisticRegression': lambda: hl.tstruct(
            logistic_regression=hl.tarray(regression_test_type(self.config['test']))),
        'PoissonRegression': lambda: regression_test_type(self.config['test']),
    }

    if name in regression_schemas:
        pass_through = self.config['passThrough']
        result_schema = regression_schemas[name]()
        carried_fields = {f: child_typ.row_type[f] for f in pass_through}
        row_type = child_typ.row_key_type._insert_fields(**carried_fields)._concat(result_schema)
        self._type = hl.ttable(child_typ.global_type, row_type, child_typ.row_key)
        return

    if name == 'Skat':
        key_field = self.config['keyField']
        key_type = child_typ.row_type[key_field]
        skat_type = hl.dtype(f'struct{{id:{key_type},size:int32,q_stat:float64,p_value:float64,fault:int32}}')
        self._type = hl.ttable(hl.tstruct(), skat_type, ['id'])
        return

    if name == 'PCA':
        scores_element = child_typ.col_key_type._insert_field('scores', hl.tarray(hl.tfloat64))
        pca_globals = hl.tstruct(eigenvalues=hl.tarray(hl.tfloat64),
                                 scores=hl.tarray(scores_element))
        pca_row = child_typ.row_key_type._insert_field('loadings', dtype('array<float64>'))
        self._type = hl.ttable(pca_globals, pca_row, child_typ.row_key)
        return

    # Only one method name remains.
    assert name == 'LocalLDPrune', name
    prune_row = child_typ.row_key_type._insert_fields(mean=hl.tfloat64,
                                                      centered_length_rec=hl.tfloat64)
    self._type = hl.ttable(hl.tstruct(), prune_row, list(child_typ.row_key))
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`. A :class:`ValueError` is raised if any of
    them is missing from the entry fields of `mt`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

     - `locus` (``locus``) -- Variant locus.
     - `alleles` (``array<str>``) -- Variant alleles.
     - `id` (``str``) -- Proband sample ID.
     - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
       the computed dataset alternate allele frequency, the
       `pop_frequency_prior` parameter, and the minimum population prior
       ``100 / 3e7``.
     - `proband` (``struct``) -- Proband column fields from `mt`.
     - `father` (``struct``) -- Father column fields from `mt`.
     - `mother` (``struct``) -- Mother column fields from `mt`.
     - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
     - `father_entry` (``struct``) -- Father entry fields from `mt`.
     - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
     - `is_female` (``bool``) -- ``True`` if proband is female.
     - `p_de_novo` (``float64``) -- Unfiltered posterior probability
       that the event is *de novo* rather than a missed heterozygous
       event in a parent.
     - `confidence` (``str``) Validation confidence. One of: ``'HIGH'``,
       ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the
    simplifying assumption that when this configuration ``x = (AA, AA, AB)``
    of calls occurs, exactly one of the following is true:

     - ``d``: a de novo mutation occurred in the proband and all calls are
       accurate.
     - ``m``: at least one parental allele is actually heterozygous and
       the proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)}

    Applying Bayes rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) +
        \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the
    literature:

    .. math::

        \mathrm{P}(d) = \frac{1 \text{mutation}}{30,000,000\, \text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x\,|\,d)` and :math:`\mathrm{P}(x\,|\,m)`
    are computed from the PL (genotype likelihood) fields using these
    factorizations:

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big(
        &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB) \Big)

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big( &
        \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\
        + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AB)
        \Big) \\
        \cdot \, &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB)

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)

    While this posterior probability is a good metric for grouping putative de
    novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for by
    the phred-scaled genotype likelihoods. To this end, a number of hard filters
    are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the below
    rules, the following variables are used:

     - ``DR`` refers to the ratio of the read depth in the proband to the
       combined read depth in the parents.
     - ``DP`` refers to the read depth (DP field) of the proband.
     - ``AB`` refers to the read allele balance of the proband (number of
       alternate reads divided by total reads).
     - ``AC`` refers to the count of alternate alleles across all individuals
       in the dataset at the site.
     - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
     - ``min_p`` refers to the ``min_p`` function parameter.

    HIGH-quality SNV:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)
            OR
        (p > 0.99) AND (AB > 0.3) AND (DR > 0.2)
            OR
        (p > 0.5) AND (AB > 0.3) AND (AC < 10) AND (DP > 10)

    MEDIUM-quality SNV:

    .. code-block:: text

        (p > 0.5) AND ((AB > 0.3) OR (AC == 1))

    LOW-quality SNV:

    .. code-block:: text

        (AB > 0.2)

    HIGH-quality indel:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)

    MEDIUM-quality indel:

    .. code-block:: text

        (p > 0.5) AND (AB > 0.3) AND (AC <= 5)

    LOW-quality indel:

    .. code-block:: text

        (AB > 0.2)

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the ``min_gq`` parameter, if the proband allele balance is
    lower than the ``min_child_ab`` parameter, if the depth ratio between the
    proband and parents is smaller than the ``min_dp_ratio`` parameter, if
    the allele balance in a parent is above the ``max_parent_ab`` parameter, or
    if the posterior probability `p` is smaller than the `min_p` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.

    Returns
    -------
    :class:`.Table`
    """
    # Prior probability of a de novo mutation at a site, and the floor applied
    # to the site frequency prior below.
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(
            f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
            f"missing {missing_fields}")

    # Per-row aggregates: total alternate alleles and twice the number of
    # defined genotypes.
    mt = mt.annotate_rows(__prior=pop_frequency_prior,
                          __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()),
                          __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT)))
    # subtract 1 from __alt_alleles to correct for the observed genotype
    mt = mt.annotate_rows(
        __site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles, mt.__prior, MIN_POP_PRIOR))
    mt = require_biallelic(mt, 'de_novo')

    # FIXME check that __site_freq is between 0 and 1 when possible in expr
    tm = trio_matrix(mt, pedigree, complete_trios=True)

    # Ploidy classes that select which likelihood model is applied.
    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    # NOTE(review): only female probands are matched at mitochondrial loci;
    # presumably male probands there fall through every branch below and get
    # a missing call -- confirm this is intended.
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    # Candidate configuration: het proband with both parents hom-ref.
    het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref() & tm.mother_entry.GT.is_hom_ref()
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD) < min_child_ab

    # Missing struct used for every rejected candidate; entries with a missing
    # call are filtered out before the result table is built.
    failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    # Convert each PL array to the linear scale and normalize it to sum to 1.
    kid_linear_pl = 10**(-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10**(-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10**(-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        # Two-parent model: posterior of a de novo event versus a missed het
        # in exactly one parent.
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            # Hard filters first (each yields `failure`), then the indel
            # confidence tiers (~is_snp), then the SNV tiers as the default.
            return (
                hl.case()
                .when(kid.GQ < min_gq, failure)
                .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio) |
                      ~(kid_ad_ratio >= min_child_ab), failure)
                .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
                .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab) |
                      (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
                .when(p_de_novo < min_p, failure)
                .when(~is_snp, hl.case()
                      .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                            hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                      .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                            hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                      .when(kid_ad_ratio > 0.2,
                            hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                      .or_missing())
                .default(hl.case()
                         .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                               ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                               ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                               hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                         .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                               hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                         .when(kid_ad_ratio > 0.2,
                               hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                         .or_missing()))

        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        # Single-parent model used for the hemizygous classes; `parent` is the
        # one parent whose entry is consulted.
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = (parent_pp[1] + parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            # Same filter/tier layout as the two-parent model, using only the
            # one parent for the depth-ratio and allele-balance filters.
            # NOTE(review): the HIGH SNV tier below still uses the two-parent
            # `dp_ratio` from the enclosing scope, and the LOW indel tier uses
            # AB > 0.3 where the two-parent model uses AB > 0.2 -- confirm
            # both are intended.
            return (
                hl.case()
                .when(kid.GQ < min_gq, failure)
                .when((kid.DP / (parent.DP) < min_dp_ratio) |
                      (kid_ad_ratio < min_child_ab), failure)
                .when((hl.sum(parent.AD) == 0), failure)
                .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure)
                .when(p_de_novo < min_p, failure)
                .when(~is_snp, hl.case()
                      .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                            hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                      .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                            hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                      .when(kid_ad_ratio > 0.3,
                            hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                      .or_missing())
                .default(hl.case()
                         .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                               ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                               ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                               hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                         .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                               hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                         .when(kid_ad_ratio > 0.2,
                               hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                         .or_missing()))

        return hl.bind(solve, p_de_novo)

    # Pick the model by ploidy class: the mother is the consulted parent for
    # hemi_x / hemi_mt, the father for hemi_y.  Anything unmatched is missing.
    de_novo_call = (
        hl.case()
        .when(~het_hom_hom | kid_ad_fail, failure)
        .when(autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio))
        .when(hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio))
        .when(hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio))
        .or_missing())

    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    # Splice the call struct's fields into the row and expose the site
    # frequency under the public name 'prior'.
    return (entries.select('__site_freq',
                           'proband',
                           'father',
                           'mother',
                           'proband_entry',
                           'father_entry',
                           'mother_entry',
                           'is_female',
                           **entries.__call).rename({'__site_freq': 'prior'}))
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If the interval point type is neither the type of the first key field
        nor a struct that is a prefix of the key, or if `intervals` contains
        a missing value.
    """
    is_matrix = isinstance(ds, MatrixTable)
    if is_matrix:
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        # Field names must match positionally, and every shared field must
        # have the same type.
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        # A bare point of the first key field's type: wrap it in a
        # single-field struct so it can be passed as a key prefix.
        needs_wrapper = True
        point_type = hl.tstruct(foo=point_type)
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    # Evaluate the interval expression locally and serialise it with the
    # (possibly wrapped) point type.
    intervals = hl.eval(intervals)
    intervals = hl.tarray(hl.tinterval(point_type))._convert_to_json(
        [wrap_input(i) for i in intervals])

    # The matrix and table variants share every setting except the name.
    config = {
        'name': 'MatrixFilterIntervals' if is_matrix else 'TableFilterIntervals',
        'keyType': point_type._parsable_string(),
        'intervals': intervals,
        'keep': keep
    }
    if is_matrix:
        return MatrixTable(MatrixToMatrixApply(ds._mir, config))
    return Table(TableToTableApply(ds._tir, config))
def test_update(self):
    """Selecting DP and GQ under new names yields an all-int32 entry schema."""
    mt = self.get_vds()
    selected = mt.select_entries(dp=mt.DP, gq=mt.GQ)

    expected_entry_type = hl.tstruct(dp=hl.tint32, gq=hl.tint32)
    self.assertTrue(schema_eq(selected.entry.dtype, expected_entry_type))
def test_str_annotation_regression(self):
    """Annotating with an element of a string array must execute without error."""
    row_type = hl.tstruct(alleles=hl.tarray(hl.tstr))
    table = hl.Table.parallelize([{'alleles': ['A', 'T']}], row_type)

    with_ref = table.annotate(ref=table.alleles[0])
    with_ref._force_count()
def matrix_irs(self):
    """Return one instance of each matrix IR node exercised by the tests.

    The BGEN resource is indexed first so that the BGEN reader below can be
    constructed.
    """
    hl.index_bgen(resource('example.8bits.bgen'),
                  reference_genome=hl.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})

    collect = ir.MakeStruct([('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])

    native_reader = ir.MatrixNativeReader(
        resource('backward_compatability/1.0.0/matrix_table/0.hmt'), None, False)
    matrix_read = ir.MatrixRead(native_reader, False, False)

    table_reader = ir.TableNativeReader(
        resource('backward_compatability/1.0.0/table/0.ht'), None, False)
    table_read = ir.TableRead(table_reader, False)

    matrix_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))

    def constant_x():
        # A fresh `struct{x: 20L}` literal for each node that needs one.
        return ir.MakeStruct([('x', ir.I64(20))])

    return [
        ir.MatrixRepartition(matrix_range, 100, ir.RepartitionStrategy.SHUFFLE),
        ir.MatrixUnionRows(matrix_range, matrix_range),
        ir.MatrixDistinctByRow(matrix_range),
        ir.MatrixRowsHead(matrix_read, 5),
        ir.MatrixColsHead(matrix_read, 5),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            '__entries',
            '__cols',
            []),
        ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
        ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
        matrix_read,
        matrix_range,
        ir.MatrixRead(ir.MatrixVCFReader(
            resource('sample.vcf'), ['GT'], hl.tfloat64,
            None, None, None, None,
            False, True, False, True,
            None, None, None)),
        ir.MatrixRead(ir.MatrixBGENReader(
            resource('example.8bits.bgen'), None, {}, 10, 1, None)),
        ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
        ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
        ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
        ir.MatrixChooseCols(matrix_read, [1, 0]),
        ir.MatrixMapCols(matrix_read, constant_x(), ['x']),
        ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []), constant_x()),
        ir.MatrixMapEntries(matrix_read, constant_x()),
        ir.MatrixMapGlobals(matrix_read, constant_x()),
        ir.MatrixCollectColsByKey(matrix_read),
        ir.MatrixExplodeRows(matrix_read, ['row_aset']),
        ir.MatrixExplodeCols(matrix_read, ['col_aset']),
        ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo'),
        ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
        ir.MatrixToMatrixApply(
            matrix_read,
            {'name': 'MatrixFilterPartitions', 'parts': [0], 'keep': True}),
        ir.MatrixRename(matrix_read,
                        {'global_f32': 'global_foo'},
                        {'col_f32': 'col_foo'},
                        {'row_aset': 'row_aset2'},
                        {'entry_f32': 'entry_foo'}),
        ir.MatrixFilterIntervals(
            matrix_read,
            [hl.utils.Interval(hl.utils.Struct(row_idx=0), hl.utils.Struct(row_idx=10))],
            hl.tstruct(row_idx=hl.tint32),
            keep=False),
    ]
def test_aggregate2(self):
    """Grouped aggregation over a two-row table returns the expected values."""
    row_type = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)
    kt = hl.Table.parallelize(
        [
            {'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
            {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13},
        ],
        row_type)

    aggregated = kt.group_by(status=kt.status).aggregate(
        x1=agg.collect(kt.qPheno * 2),
        x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
        x3=agg.min(kt.qPheno),
        x4=agg.max(kt.qPheno),
        x5=agg.sum(kt.qPheno),
        x6=agg.product(hl.int64(kt.qPheno)),
        x7=agg.count(),
        x8=agg.count_where(kt.qPheno == 3),
        x9=agg.fraction(kt.qPheno == 1),
        x10=agg.stats(hl.float64(kt.qPheno)),
        x11=agg.hardy_weinberg_test(kt.GT),
        x13=agg.inbreeding(kt.GT, 0.1),
        x14=agg.call_stats(kt.GT, ["A", "T"]),
        x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
        x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
        x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
        x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
        x19=agg.take(kt.GT, 1, ordering=-kt.qPheno))
    first_group = aggregated.take(1)[0]
    result = convert_struct_to_dict(first_group)

    expected = {
        'status': 0,
        'x1': [6, 26],
        'x2': [3, 4, 13, 14],
        'x3': 3,
        'x4': 13,
        'x5': 16,
        'x6': 39,
        'x7': 2,
        'x8': 1,
        'x9': 0.0,
        'x10': {'min': 3.0, 'max': 13.0, 'sum': 16.0, 'stdev': 5.0, 'n': 2, 'mean': 8.0},
        'x11': {'het_freq_hwe': 0.5, 'p_value': 0.5},
        'x13': {'n_called': 2, 'expected_homs': 1.64, 'f_stat': -1.777777777777777, 'observed_homs': 1},
        'x14': {'AC': [3, 1], 'AF': [0.75, 0.25], 'AN': 4, 'homozygote_count': [1, 0]},
        'x15': {'a': 5, 'b': 'foo', 'c': {'banana': 'apple'}},
        'x16': 'apple',
        'x17': [],
        'x18': [],
        'x19': [hl.Call([0, 1])],
    }

    # Show the complete diff if the dictionaries differ.
    self.maxDiff = None
    self.assertDictEqual(result, expected)
def test_locus_windows(self):
    """Check window bounds and error reporting of ``locus_windows``."""
    locus_windows = hl.linalg.utils.locus_windows

    def assert_eq(a, b):
        self.assertTrue(np.array_equal(a, np.array(b)))

    def assert_windows(bounds, expected_starts, expected_stops):
        starts, stops = bounds
        assert_eq(starts, expected_starts)
        assert_eq(stops, expected_stops)

    def assert_fails(exc_type, fragment, thunk):
        # The call is passed as a thunk so that everything, including argument
        # construction, runs inside the assertRaises context.
        with self.assertRaises(exc_type) as ctx:
            thunk()
        self.assertTrue(fragment in str(ctx.exception))

    centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

    mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
    mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

    # Windows by base-pair position.
    assert_windows(locus_windows(mt.locus, 2),
                   [0, 0, 0, 1, 2], [3, 4, 5, 5, 5])

    # Windows by an explicit coordinate field.
    assert_windows(locus_windows(mt.locus, 0.5, coord_expr=mt.cm),
                   [0, 1, 1, 1, 3], [1, 4, 4, 5, 5])

    # Doubling both the coordinates and the radius gives the same windows.
    assert_windows(
        locus_windows(mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)]),
        [0, 1, 1, 1, 3], [1, 4, 4, 5, 5])

    table_type = hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64)
    rows = [
        {'locus': hl.Locus('1', 1), 'cm': 1.0},
        {'locus': hl.Locus('1', 2), 'cm': 3.0},
        {'locus': hl.Locus('1', 4), 'cm': 4.0},
        {'locus': hl.Locus('2', 1), 'cm': 2.0},
        {'locus': hl.Locus('2', 1), 'cm': 2.0},
        {'locus': hl.Locus('3', 3), 'cm': 5.0},
    ]
    ht = hl.Table.parallelize(rows, table_type, key=['locus'])

    assert_windows(locus_windows(ht.locus, 1),
                   [0, 0, 2, 3, 3, 5], [2, 2, 3, 5, 5, 6])
    assert_windows(locus_windows(ht.locus, 1.0, coord_expr=ht.cm),
                   [0, 1, 1, 3, 3, 5], [1, 3, 3, 5, 5, 6])

    assert_fails(ValueError, 'ascending order',
                 lambda: locus_windows(ht.order_by(ht.cm).locus, 1.0))
    assert_fails(ExpressionException, 'different source',
                 lambda: locus_windows(ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx))
    assert_fails(ExpressionException, "no source",
                 lambda: locus_windows(hl.locus('1', 1), 1.0))
    assert_fails(ExpressionException, "no source",
                 lambda: locus_windows(ht.locus, 1.0, coord_expr=0.0))

    ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)
    assert_fails(ExpressionException, "row-indexed",
                 lambda: locus_windows(ht.x, 1.0))
    assert_fails(ExpressionException, "row-indexed",
                 lambda: locus_windows(ht.locus, 1.0, ht.y))

    ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                              table_type,
                              key=['locus'])
    assert_fails(ValueError, "missing value for 'locus_expr'",
                 lambda: locus_windows(ht.locus, 1.0))
    assert_fails(ValueError, "missing value for 'locus_expr'",
                 lambda: locus_windows(ht.locus, 1.0, coord_expr=ht.cm))

    ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                              table_type,
                              key=['locus'])
    assert_fails(ValueError, "missing value for 'coord_expr'",
                 lambda: locus_windows(ht.locus, 1.0, coord_expr=ht.cm))
def test_reference_genome_liftover(self):
    """Lift loci and intervals between GRCh37 and GRCh38 using chr20 chains.

    The chain files are registered on the shared reference genome objects, so
    they are removed in a ``finally`` block: a failing assertion must not
    leave a liftover registered, which would break the precondition check
    here and could affect other tests using the same references.
    """
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')

    self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))

    try:
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
        grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')

        self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

        # Round trip: 37 -> 38 -> 37 must give back the original locus.
        ds = hl.import_vcf(resource('sample.vcf'))
        t = ds.annotate_rows(
            liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
        self.assertTrue(t.all(t.locus == t.liftover))

        # Individual loci; a missing 'l38' means the liftover is expected to
        # be missing.
        null_locus = hl.null(hl.tlocus('GRCh38'))
        rows = [
            {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
            {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
            {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
            {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
            {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
            {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
            {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus},
        ]
        schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(
            t.all(hl.cond(hl.is_defined(t.l38),
                          hl.liftover(t.l37, 'GRCh38') == t.l38,
                          hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

        # The lifted locus must be usable as a table key.
        t = t.filter(hl.is_defined(t.l38))
        self.assertTrue(t.count() == 6)

        t = t.key_by('l38')
        t.count()
        self.assertTrue(list(t.key) == ['l38'])

        # Intervals.
        null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
        rows = [
            {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'),
             'i38': null_locus_interval},
            {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
             'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')},
        ]
        schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)),
                            i38=hl.tinterval(hl.tlocus(grch38)))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
    finally:
        # Only remove what was actually registered, so a failure in
        # add_liftover itself is not masked by a second error here.
        if grch37.has_liftover('GRCh38'):
            grch37.remove_liftover("GRCh38")
        if grch38.has_liftover('GRCh37'):
            grch38.remove_liftover("GRCh37")
def test_transmute_globals(self):
    """transmute_globals drops the globals it consumes and keeps the rest."""
    ht = hl.utils.range_table(1).annotate_globals(a=5, b=10)

    transmuted = ht.transmute_globals(c=ht.a + 5)
    expected_type = hl.tstruct(b=hl.tint, c=hl.tint)

    self.assertEqual(transmuted.globals.dtype, expected_type)
def test_multi_way_zip_join(self):
    """Zip-join three keyed tables and compare against the expected rows."""

    def row(key, name, data):
        return {"id": key, "name": name, "data": data}

    def payload(name, data):
        return {"name": name, "data": data}

    d1 = [row(0, "a", 0.0), row(1, "b", 3.14), row(2, "c", 2.78)]
    d2 = [row(0, "d", 1.1), row(0, "x", 2.2), row(2, "v", 7.89)]
    d3 = [row(1, "f", 9.99), row(2, "g", -1.0), row(3, "z", 0.01)]

    s = hl.tstruct(id=hl.tint32, name=hl.tstr, data=hl.tfloat64)
    ts = [hl.Table.parallelize(r, schema=s, key='id') for r in [d1, d2, d3]]

    joined = hl.Table._multi_way_zip_join(ts, '__data', '__globals').drop('__globals')

    # One slot per input table, None where that table has no matching row.
    dexpected = [
        {"id": 0, "__data": [payload("a", 0.0), payload("d", 1.1), None]},
        {"id": 0, "__data": [None, payload("x", 2.2), None]},
        {"id": 1, "__data": [payload("b", 3.14), None, payload("f", 9.99)]},
        {"id": 2, "__data": [payload("c", 2.78), payload("v", 7.89), payload("g", -1.0)]},
        {"id": 3, "__data": [None, None, payload("z", 0.01)]},
    ]
    expected_schema = hl.tstruct(
        id=hl.tint32,
        __data=hl.tarray(hl.tstruct(name=hl.tstr, data=hl.tfloat64)))
    expected = hl.Table.parallelize(dexpected, schema=expected_schema, key='id')
    self.assertTrue(expected._same(joined))

    # The output field may reuse the name of an input field.
    expected2 = expected.transmute(data=expected['__data'])
    joined_same_name = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('globals')
    self.assertTrue(expected2._same(joined_same_name))

    # Dropping both outputs still leaves one row per joined key group.
    joined_nothing = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('data', 'globals')
    self.assertEqual(joined_nothing._force_count(), 5)
def test_join_mangling(self):
    """A self-join renames colliding row and global fields with a ``_1`` suffix."""
    base = hl.utils.range_table(10)
    t1 = base.annotate_globals(glob1=5).annotate(row1=5)

    j = t1.join(t1, 'inner')

    expected_row = hl.tstruct(idx=hl.tint32, row1=hl.tint32, row1_1=hl.tint32)
    expected_globals = hl.tstruct(glob1=hl.tint32, glob1_1=hl.tint32)
    assert j.row.dtype == expected_row
    assert j.globals.dtype == expected_globals

    j._force_count()
def test_mendel_errors(self):
    """Check schemas, row counts and selected rows of ``hl.mendel_errors`` output.

    The four returned tables are bound to ``men``, ``fam``, ``ind`` and
    ``var``. A second run with a pedigree read from
    ``mendelWithMissingSex.fam`` must agree with the first for sample
    ``Dtr1``.
    """
    Struct = hl.utils.Struct
    mt = hl.import_vcf(resource('mendel.vcf'))
    ped = hl.Pedigree.read(resource('mendel.fam'))
    men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

    # --- schemas -------------------------------------------------------
    self.assertEqual(
        men.key.dtype,
        hl.tstruct(locus=mt.locus.dtype, alleles=hl.tarray(hl.tstr), s=hl.tstr))
    self.assertEqual(
        men.row.dtype,
        hl.tstruct(locus=mt.locus.dtype,
                   alleles=hl.tarray(hl.tstr),
                   s=hl.tstr,
                   fam_id=hl.tstr,
                   mendel_code=hl.tint))
    self.assertEqual(
        fam.key.dtype,
        hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr))
    self.assertEqual(
        fam.row.dtype,
        hl.tstruct(pat_id=hl.tstr,
                   mat_id=hl.tstr,
                   fam_id=hl.tstr,
                   children=hl.tint,
                   errors=hl.tint64,
                   snp_errors=hl.tint64))
    self.assertEqual(ind.key.dtype, hl.tstruct(s=hl.tstr))
    self.assertEqual(
        ind.row.dtype,
        hl.tstruct(s=hl.tstr,
                   fam_id=hl.tstr,
                   errors=hl.tint64,
                   snp_errors=hl.tint64))
    self.assertEqual(
        var.key.dtype,
        hl.tstruct(locus=mt.locus.dtype, alleles=hl.tarray(hl.tstr)))
    self.assertEqual(
        var.row.dtype,
        hl.tstruct(locus=mt.locus.dtype,
                   alleles=hl.tarray(hl.tstr),
                   errors=hl.tint64))

    # --- row counts ----------------------------------------------------
    self.assertEqual(men.count(), 41)
    self.assertEqual(fam.count(), 2)
    self.assertEqual(ind.count(), 7)
    self.assertEqual(var.count(), mt.count_rows())

    # --- per-family and per-individual summaries -----------------------
    fam_rows = set(fam.select('children', 'errors', 'snp_errors').collect())
    self.assertEqual(
        fam_rows,
        {
            Struct(pat_id='Dad1', mat_id='Mom1', children=2, errors=41, snp_errors=39),
            Struct(pat_id='Dad2', mat_id='Mom2', children=1, errors=0, snp_errors=0),
        })

    ind_rows = set(ind.select('errors', 'snp_errors').collect())
    self.assertEqual(
        ind_rows,
        {
            Struct(s='Son1', errors=23, snp_errors=22),
            Struct(s='Dtr1', errors=18, snp_errors=17),
            Struct(s='Dad1', errors=19, snp_errors=18),
            Struct(s='Mom1', errors=22, snp_errors=21),
            Struct(s='Dad2', errors=0, snp_errors=0),
            Struct(s='Mom2', errors=0, snp_errors=0),
            Struct(s='Son2', errors=0, snp_errors=0),
        })

    # --- per-variant error counts for a fixed subset of variants -------
    # (contig, position, alleles, expected error count)
    kept_variants = [
        ("1", 1, ['C', 'CT'], 2),
        ("1", 2, ['C', 'T'], 1),
        ("X", 1, ['C', 'T'], 2),
        ("X", 3, ['C', 'T'], 1),
        ("Y", 1, ['C', 'T'], 1),
        ("Y", 3, ['C', 'T'], 1),
    ]
    to_keep = hl.set([(hl.Locus(contig, pos), alleles)
                      for contig, pos, alleles, _ in kept_variants])
    var_rows = (var.filter(to_keep.contains((var.locus, var.alleles)))
                .order_by('locus')
                .select('locus', 'alleles', 'errors')
                .collect())
    self.assertEqual(
        var_rows,
        [Struct(locus=hl.Locus(contig, pos), alleles=alleles, errors=n_errors)
         for contig, pos, alleles, n_errors in kept_variants])

    # --- second pedigree file must give the same rows for Dtr1 ---------
    ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
    men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)
    dtr_with_missing_sex = men2.filter(men2.s == 'Dtr1')
    dtr_reference = men.filter(men.s == 'Dtr1')
    self.assertTrue(dtr_with_missing_sex._same(dtr_reference))
def test_import_vcf_no_reference_specified(self):
    """Import a VCF with ``reference_genome=None`` and check the locus type.

    The locus is expected to be a plain ``struct{contig: str, position: int32}``
    and the table is expected to contain 735 rows.
    """
    vcf = hl.import_vcf(resource('sample2.vcf'), reference_genome=None)
    # assertEqual (instead of assertTrue on an ``==`` expression) reports
    # both types when the check fails.
    self.assertEqual(vcf.locus.dtype,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
    self.assertEqual(vcf.count_rows(), 735)
@pytest.mark.parametrize(
    "input_regions,expected_output_regions",
    [
        # Three mutually overlapping regions collapse into one.
        (
            hl.literal([
                hl.utils.Struct(start=5, stop=10),
                hl.utils.Struct(start=7, stop=12),
                hl.utils.Struct(start=10, stop=11),
            ]),
            [hl.utils.Struct(start=5, stop=12)],
        ),
        # Four regions expected to merge into two.
        (
            hl.literal([
                hl.utils.Struct(start=5, stop=10),
                hl.utils.Struct(start=11, stop=14),
                hl.utils.Struct(start=17, stop=22),
                hl.utils.Struct(start=22, stop=24),
            ]),
            [
                hl.utils.Struct(start=5, stop=14),
                hl.utils.Struct(start=17, stop=24),
            ],
        ),
        # Empty input yields empty output.
        (hl.empty_array(hl.tstruct(start=hl.tint, stop=hl.tint)), []),
    ],
)
def test_merge_overlapping_regions(input_regions, expected_output_regions):
    """Evaluate ``merge_overlapping_regions`` and compare with the expected list."""
    merged = hl.eval(merge_overlapping_regions(input_regions))
    assert merged == expected_output_regions
def _compute_type(self):
    """Set ``self._type`` to a table type with one ``idx: int32`` field keyed by ``idx``."""
    empty_struct = hl.tstruct()
    idx_struct = hl.tstruct(idx=hl.tint32)
    self._type = hl.ttable(empty_struct, idx_struct, ['idx'])
def value_irs(self):
    """Return a list with one sample instance of many value IR node kinds.

    The shared operands (constants, typed references and aggregator
    signatures) are built first and then reused across the list entries.
    """
    # Shared operands.
    true_ir = ir.TrueIR()
    bool_ref = ir.Ref('c', hl.tbool)
    five = ir.I32(5)
    seven = ir.I32(7)
    hail_str = ir.Str('Hail')
    int_array = ir.Ref('a', hl.tarray(hl.tint32))
    nested_array = ir.Ref('aa', hl.tarray(hl.tarray(hl.tint32)))
    pair_array = ir.Ref('da', hl.tarray(hl.ttuple(hl.tint32, hl.tstr)))
    elem = ir.Ref('v', hl.tint32)
    struct_ref = ir.Ref('s', hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64))
    tuple_ref = ir.Ref('t', hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64))
    call_ref = ir.Ref('call', hl.tcall)

    # Aggregator signatures.
    collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])
    call_stats_sig = ir.AggSignature('CallStats', [], [hl.tint32], [hl.tcall])
    hist_sig = ir.AggSignature(
        'Histogram', [hl.tfloat64, hl.tfloat64, hl.tint32], None, [hl.tfloat64])
    take_by_sig = ir.AggSignature(
        'TakeBy', [hl.tint32], None, [hl.tfloat64, hl.tfloat64])

    return [
        five,
        ir.I64(5),
        ir.F32(3.14),
        ir.F64(3.14),
        struct_ref,
        ir.TrueIR(),
        ir.FalseIR(),
        ir.Void(),
        ir.Cast(five, hl.tfloat64),
        ir.NA(hl.tint32),
        ir.IsNA(five),
        ir.If(true_ir, five, seven),
        ir.Let('v', five, elem),
        ir.Ref('x', hl.tint32),
        ir.ApplyBinaryOp('+', five, seven),
        ir.ApplyUnaryOp('-', five),
        ir.ApplyComparisonOp('EQ', five, seven),
        ir.MakeArray([five, ir.NA(hl.tint32), ir.I32(-3)], hl.tarray(hl.tint32)),
        ir.ArrayRef(int_array, five),
        ir.ArrayLen(int_array),
        ir.ArrayRange(ir.I32(0), ir.I32(5), ir.I32(1)),
        ir.ArraySort(int_array, true_ir, False),
        ir.ToSet(int_array),
        ir.ToDict(pair_array),
        ir.ToArray(int_array),
        ir.LowerBoundOnOrderedCollection(int_array, five, True),
        ir.GroupByKey(pair_array),
        ir.ArrayMap(int_array, 'v', elem),
        ir.ArrayFilter(int_array, 'v', elem),
        ir.ArrayFlatMap(nested_array, 'v', elem),
        ir.ArrayFold(int_array, ir.I32(0), 'x', 'v', elem),
        ir.ArrayScan(int_array, ir.I32(0), 'x', 'v', elem),
        ir.ArrayFor(int_array, 'v', ir.Void()),
        ir.ApplyAggOp(ir.I32(0), [], None, collect_sig),
        ir.ApplyScanOp(ir.I32(0), [], None, collect_sig),
        ir.ApplyAggOp(ir.F64(-2.11),
                      [ir.F64(-5.0), ir.F64(5.0), ir.I32(100)],
                      None,
                      hist_sig),
        ir.ApplyAggOp(call_ref, [], [ir.I32(2)], call_stats_sig),
        ir.ApplyAggOp(ir.F64(-2.11), [ir.I32(10)], None, take_by_sig),
        ir.InitOp(ir.I32(0), [ir.I32(2)], call_stats_sig),
        ir.SeqOp(ir.I32(0), [five], collect_sig),
        ir.SeqOp(ir.I32(0), [ir.F64(-2.11), ir.I32(17)], take_by_sig),
        ir.Begin([ir.Void()]),
        ir.MakeStruct([('x', five)]),
        ir.SelectFields(struct_ref, ['x', 'z']),
        ir.InsertFields(struct_ref, [('x', five)]),
        ir.GetField(struct_ref, 'x'),
        ir.MakeTuple([five, true_ir]),
        ir.GetTupleElement(tuple_ref, 1),
        ir.StringSlice(hail_str, ir.I32(1), ir.I32(2)),
        ir.StringLength(hail_str),
        ir.In(2, hl.tfloat64),
        ir.Die('mumblefoo', hl.tfloat64),
        ir.Apply('&&', true_ir, bool_ref),
        ir.Apply('toFloat64', five),
        ir.Apply('isDefined', struct_ref),
        ir.Uniroot('x', ir.F64(3.14), ir.F64(-5.0), ir.F64(5.0)),
        ir.Literal(hl.tarray(hl.tint32), [1, 2, None]),
    ]
def _compute_type(self):
    """Set ``self._type`` to an unkeyed table type with fields ``i``, ``j`` and ``entry``."""
    entry_row = hl.tstruct(i=hl.tint64, j=hl.tint64, entry=hl.tfloat64)
    self._type = hl.ttable(hl.tstruct(), entry_row, [])
def table_irs(self):
    """Return a list with one sample instance of many table IR node kinds.

    Most entries are built on top of two shared readers: a native table
    reader and a native matrix-table reader, both pointing at checked-in
    resource files.
    """
    b = ir.TrueIR()
    table_read = ir.TableRead(
        ir.TableNativeReader(
            resource('backward_compatability/1.0.0/table/0.ht')),
        False)
    matrix_read = ir.MatrixRead(
        ir.MatrixNativeReader(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
        False, False)

    # NOTE: two locals that were assigned but never read were removed here:
    # a parsed row-type struct, and a `range` variable that also shadowed
    # the builtin.

    table_irs = [
        ir.TableKeyBy(table_read, ['m', 'd'], False),
        ir.TableFilter(table_read, b),
        table_read,
        ir.MatrixColsTable(matrix_read),
        ir.TableAggregateByKey(
            table_read,
            ir.MakeStruct([('a', ir.I32(5))])),
        ir.TableKeyByAndAggregate(
            table_read,
            ir.MakeStruct([('a', ir.I32(5))]),
            ir.MakeStruct([('b', ir.I32(5))]),
            1, 2),
        ir.TableJoin(
            table_read,
            ir.TableRange(100, 10), 'inner', 1),
        ir.MatrixEntriesTable(matrix_read),
        ir.MatrixRowsTable(matrix_read),
        ir.TableParallelize(
            ir.MakeStruct([
                ('rows', ir.Literal(hl.tarray(hl.tstruct(a=hl.tint32)),
                                    [{'a': None}, {'a': 5}, {'a': -3}])),
                ('global', ir.MakeStruct([]))]),
            None),
        ir.TableMapRows(
            ir.TableKeyBy(table_read, []),
            ir.MakeStruct([
                ('a', ir.GetField(ir.Ref('row'), 'f32')),
                ('b', ir.F64(-2.11))])),
        ir.TableMapGlobals(
            table_read,
            ir.MakeStruct([
                ('foo', ir.NA(hl.tarray(hl.tint32)))])),
        ir.TableRange(100, 10),
        ir.TableRepartition(table_read, 10, ir.RepartitionStrategy.COALESCE),
        ir.TableUnion(
            [ir.TableRange(100, 10), ir.TableRange(50, 10)]),
        ir.TableExplode(table_read, ['mset']),
        ir.TableHead(table_read, 10),
        ir.TableOrderBy(ir.TableKeyBy(table_read, []), [('m', 'A'), ('m', 'D')]),
        ir.TableDistinct(table_read),
        ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
        ir.TableRename(table_read, {'idx': 'idx_foo'}, {'global_f32': 'global_foo'}),
        ir.TableMultiWayZipJoin([table_read, table_read], '__data', '__globals'),
        ir.MatrixToTableApply(
            matrix_read,
            {
                'name': 'LinearRegressionRowsSingle',
                'yFields': ['col_m'],
                'xField': 'entry_m',
                'covFields': [],
                'rowBlockSize': 10,
                'passThrough': []
            }),
        ir.TableToTableApply(
            table_read,
            {
                'name': 'TableFilterPartitions',
                'parts': [0],
                'keep': True
            })
    ]
    return table_irs
def test_import_bed_no_reference_specified(self):
    """Import a BED file with ``reference_genome=None`` and check its shape.

    The table is expected to have 3 rows and an interval point type of
    ``struct{contig: str, position: int32}``.
    """
    bed_file = resource('example1.bed')
    t = hl.import_bed(bed_file, reference_genome=None)
    # assertEqual reports the actual count on failure, unlike
    # assertTrue applied to an ``==`` expression.
    self.assertEqual(t.count(), 3)
    self.assertEqual(t.interval.dtype.point_type,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
def info_score(gp) -> StructExpression:
    r"""Compute the IMPUTE information score.

    Examples
    --------
    Calculate the info score per variant:

    >>> gen_mt = hl.import_gen('data/example.gen', sample_file='data/example.sample')
    >>> gen_mt = gen_mt.annotate_rows(info_score = hl.agg.info_score(gen_mt.GP))

    Calculate group-specific info scores per variant:

    >>> gen_mt = hl.import_gen('data/example.gen', sample_file='data/example.sample')
    >>> gen_mt = gen_mt.annotate_cols(is_case = hl.rand_bool(0.5))
    >>> gen_mt = gen_mt.annotate_rows(info_score_case = hl.agg.info_score(hl.agg.filter(gen_mt.is_case, gen_mt.GP)),
    ...                               info_score_ctrl = hl.agg.info_score(hl.agg.filter(~gen_mt.is_case, gen_mt.GP)))

    Notes
    -----
    The result of :func:`.info_score` is a struct with two fields:

     - `score` (``float64``) -- Info score.
     - `n_included` (``int32``) -- Number of non-missing samples included in the
       calculation.

    We implemented the IMPUTE info measure as described in the supplementary
    information from `Marchini & Howie. Genotype imputation for genome-wide
    association studies. Nature Reviews Genetics (2010)
    <http://www.nature.com/nrg/journal/v11/n7/extref/nrg2796-s3.pdf>`__. To
    calculate the info score :math:`I_{A}` for one SNP:

    .. math::

        I_{A} =
        \begin{cases}
        1 - \frac{\sum_{i=1}^{N}(f_{i} - e_{i}^2)}{2N\hat{\theta}(1 - \hat{\theta})} & \text{when } \hat{\theta} \in (0, 1) \\
        1 & \text{when } \hat{\theta} = 0, \hat{\theta} = 1\\
        \end{cases}

    - :math:`N` is the number of samples with imputed genotype probabilities
      [:math:`p_{ik} = P(G_{i} = k)` where :math:`k \in \{0, 1, 2\}`]
    - :math:`e_{i} = p_{i1} + 2p_{i2}` is the expected genotype per sample
    - :math:`f_{i} = p_{i1} + 4p_{i2}`
    - :math:`\hat{\theta} = \frac{\sum_{i=1}^{N}e_{i}}{2N}` is the MLE for the
      population minor allele frequency

    Hail will not generate identical results to `QCTOOL
    <http://www.well.ox.ac.uk/~gav/qctool/#overview>`__ for the following
    reasons:

    - Hail automatically removes genotype probability distributions that do not
      meet certain requirements on data import with :func:`.import_gen` and
      :func:`.import_bgen`.
    - Hail does not use the population frequency to impute genotype
      probabilities when a genotype probability distribution has been set to
      missing.
    - Hail calculates the same statistic for sex chromosomes as autosomes while
      QCTOOL incorporates sex information.
    - The floating point number Hail stores for each genotype probability is
      slightly different than the original data due to rounding and
      normalization of probabilities.

    Warning
    -------
    - The info score Hail reports will be extremely different from QCTOOL when
      a SNP has a high missing rate.
    - The `gp` array must contain 3 elements, and its elements may not be
      missing.
    - If the genotype data was not imported using the :func:`.import_gen` or
      :func:`.import_bgen` functions, then the results for all variants will be
      ``score = NA`` and ``n_included = 0``.
    - It only makes semantic sense to compute the info score per variant. While
      the aggregator will run in any context if its arguments are the right
      type, the results are only meaningful in a narrow context.

    Parameters
    ----------
    gp : :class:`.ArrayNumericExpression`
        Genotype probability array. Must have 3 elements, all of which must be
        defined.

    Returns
    -------
    :class:`.StructExpression`
        Struct with fields `score` and `n_included`.
    """
    return _agg_func(
        'InfoScore',
        gp,
        hl.tstruct(score=hl.tfloat64, n_included=hl.tint32),
    )
def test_import_locus_intervals_no_reference_specified(self):
    """Import a locus interval list with ``reference_genome=None`` and check its shape.

    The table is expected to have 2 rows and an interval point type of
    ``struct{contig: str, position: int32}``.
    """
    interval_file = resource('annotinterall.interval_list')
    t = hl.import_locus_intervals(interval_file, reference_genome=None)
    # assertEqual reports the actual count on failure, unlike
    # assertTrue applied to an ``==`` expression.
    self.assertEqual(t.count(), 2)
    self.assertEqual(t.interval.dtype.point_type,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
def linreg(y, x):
    """Compute linear regression statistics.

    Examples
    --------
    Regress HT against an intercept (1), SEX, and C1:

    >>> table1.aggregate(agg.linreg(table1.HT, [1, table1.SEX == 'F', table1.C1]))
    Struct(
    beta=[88.50000000000014, 81.50000000000057, -10.000000000000068],
    standard_error=[14.430869689661844, 59.70552738231206, 7.000000000000016],
    t_stat=[6.132686518775844, 1.365032746099571, -1.428571428571435],
    p_value=[0.10290201427537926, 0.40250974549499974, 0.3888002244284281],
    n=4)

    Regress blood pressure against an intercept (1), age, height, and height squared:

    >>> ds_ann = ds.annotate_rows(
    ...    linreg = hl.agg.linreg(ds.pheno.blood_pressure,
    ...                           [1, ds.pheno.age, ds.pheno.height, ds.pheno.height ** 2]))

    Notes
    -----
    This aggregator returns a struct expression with five fields:

     - `beta` (:class:`.tarray` of :py:data:`.tfloat64`): Estimated regression
       coefficient for each predictor.
     - `standard_error` (:class:`.tarray` of :py:data:`.tfloat64`): Estimated
       standard error for each predictor.
     - `t_stat` (:class:`.tarray` of :py:data:`.tfloat64`): t statistic for each
       predictor.
     - `p_value` (:class:`.tarray` of :py:data:`.tfloat64`): p-value for each
       predictor.
     - `n` (:py:data:`.tint64`): Number of samples included in the regression. A
       sample is included if and only if `y` and all elements of `x` are
       non-missing.

    The first four fields are missing if n is less than or equal to the number
    of predictors or if the predictors are linearly dependent.

    Parameters
    ----------
    y : :class:`.Float64Expression`
        Response variable.
    x : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        Independent variables.

    Returns
    -------
    :class:`.StructExpression`
        Struct with fields `beta`, `standard_error`, `t_stat`, `p_value`, and `n`.
    """
    predictors = wrap_to_list(x)
    n_predictors = len(predictors)
    if n_predictors == 0:
        raise ValueError("'linreg' requires at least one predictor in `x`")

    float_array = hl.tarray(hl.tfloat64)
    result_type = hl.tstruct(beta=float_array,
                             standard_error=float_array,
                             t_stat=float_array,
                             p_value=float_array,
                             n=hl.tint64)

    predictor_array = hl.array(predictors)
    # The predictor count is passed as an extra argument in the list below.
    return _agg_func('LinearRegression',
                     y,
                     result_type,
                     [hl.int32(n_predictors)],
                     f=lambda expr: predictor_array)
import hail as hl

# Input gVCFs to import.
gvcfs = ['gs://hail-common/test-resources/HG00096.g.vcf.gz',
         'gs://hail-common/test-resources/HG00268.g.vcf.gz']

hl.init(default_reference='GRCh38')

# One closed locus interval on chr20 per (start, end) position pair.
parts_json = [
    {'start': {'locus': {'contig': 'chr20', 'position': first_pos}},
     'end': {'locus': {'contig': 'chr20', 'position': last_pos}},
     'includeStart': True,
     'includeEnd': True}
    for first_pos, last_pos in [
        (17821257, 18708366),
        (18708367, 19776611),
        (19776612, 21144633),
    ]
]

# Convert the JSON description into interval values of the matching Hail type.
interval_array_type = hl.tarray(hl.tinterval(hl.tstruct(locus=hl.tlocus('GRCh38'))))
parts = interval_array_type._convert_from_json(parts_json)

# Force evaluation of every imported matrix table.
for mt in hl.import_gvcfs(gvcfs, parts):
    mt._force_count_rows()
def test_annotate(self):
    """Exercise ``Table.annotate`` with no-op, scalar, array and operator expressions.

    The first row of each annotated table is compared with a hand-written
    dict; the final large operator annotation is only constructed, its
    result is not inspected.
    """
    schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32,
                        e=hl.tstr, f=hl.tarray(hl.tint32))
    rows = [
        {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
        {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
        {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]},
    ]
    kt = hl.Table.parallelize(rows, schema)

    # Annotating with nothing must leave the table unchanged.
    self.assertTrue(kt.annotate()._same(kt))

    # Simple scalar annotations.
    scalar_annotated = kt.annotate(foo=kt.a + 1, foo2=kt.a)
    result1 = convert_struct_to_dict(scalar_annotated.take(1)[0])
    self.assertDictEqual(
        result1,
        {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3],
         'foo': 5, 'foo2': 4})

    # Array-valued annotations, a conditional, and a set literal.
    array_annotated = kt.annotate(
        x1=kt.f.map(lambda x: x * 2),
        x2=kt.f.map(lambda x: [x, x + 1]).flatmap(lambda x: x),
        x3=hl.min(kt.f),
        x4=hl.max(kt.f),
        x5=hl.sum(kt.f),
        x6=hl.product(kt.f),
        x7=kt.f.length(),
        x8=kt.f.filter(lambda x: x == 3),
        x9=kt.f[1:],
        x10=kt.f[:],
        x11=kt.f[1:2],
        x12=kt.f.map(lambda x: [x, x + 1]),
        x13=kt.f.map(lambda x: [[x, x + 1], [x + 2]]).flatmap(lambda x: x),
        x14=hl.cond(kt.a < kt.b, kt.c, hl.null(hl.tint32)),
        x15={1, 2, 3},
    )
    result3 = convert_struct_to_dict(array_annotated.take(1)[0])
    self.assertDictEqual(
        result3,
        {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3],
         'x1': [2, 4, 6],
         'x2': [1, 2, 2, 3, 3, 4],
         'x3': 1,
         'x4': 3,
         'x5': 6,
         'x6': 6,
         'x7': 3,
         'x8': [3],
         'x9': [2, 3],
         'x10': [1, 2, 3],
         'x11': [2],
         'x12': [[1, 2], [2, 3], [3, 4]],
         'x13': [[1, 2], [3], [2, 3], [4], [3, 4], [5]],
         'x14': None,
         'x15': {1, 2, 3}})

    # Arithmetic, comparison and boolean operators: construction only.
    kt.annotate(
        x1=kt.a + 5,
        x2=5 + kt.a,
        x3=kt.a + kt.b,
        x4=kt.a - 5,
        x5=5 - kt.a,
        x6=kt.a - kt.b,
        x7=kt.a * 5,
        x8=5 * kt.a,
        x9=kt.a * kt.b,
        x10=kt.a / 5,
        x11=5 / kt.a,
        x12=kt.a / kt.b,
        x13=-kt.a,
        x14=+kt.a,
        x15=kt.a == kt.b,
        x16=kt.a == 5,
        x17=5 == kt.a,
        x18=kt.a != kt.b,
        x19=kt.a != 5,
        x20=5 != kt.a,
        x21=kt.a > kt.b,
        x22=kt.a > 5,
        x23=5 > kt.a,
        x24=kt.a >= kt.b,
        x25=kt.a >= 5,
        x26=5 >= kt.a,
        x27=kt.a < kt.b,
        x28=kt.a < 5,
        x29=5 < kt.a,
        x30=kt.a <= kt.b,
        x31=kt.a <= 5,
        x32=5 <= kt.a,
        x33=(kt.a == 0) & (kt.b == 5),
        x34=(kt.a == 0) | (kt.b == 5),
        x35=False,
        x36=True,
    )
def generate_datasets(doctest_namespace):
    """Fill ``doctest_namespace`` with the objects the documentation examples use.

    Ensures an ``output/`` directory exists (best effort), removes stale
    ``sample*.vds`` directories, then registers matrix tables, tables and
    assorted expressions under fixed names.
    """
    ns = doctest_namespace
    ns['hl'] = hl

    # Best-effort creation of the output directory; failures are ignored.
    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    # Remove any of these directories that already exist.
    for stale_dir in ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]:
        if os.path.isdir(stale_dir):
            shutil.rmtree(stale_dir)

    # Matrix table and its aliases.
    ds = hl.read_matrix_table('data/example.vds')
    ns['ds'] = ds
    ns['dataset'] = ds
    ns['dataset2'] = ds.annotate_globals(global_field=5)
    for alias in ('dataset_to_union_1', 'dataset_to_union_2'):
        ns[alias] = ds

    # Row- and column-level metadata tables.
    v_metadata = (ds.rows()
                  .annotate_globals(global_field=5)
                  .annotate(consequence='SYN'))
    ns['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    ns['s_metadata'] = s_metadata
    for alias in ('cols_to_keep', 'cols_to_remove'):
        ns[alias] = s_metadata
    for alias in ('rows_to_keep', 'rows_to_remove'):
        ns[alias] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    ns['table1'] = table1
    ns['other_table'] = table1

    ns['table2'] = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')

    table4_types = {
        'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
        'E': hl.tstruct(A=hl.tint32, B=hl.tint32),
    }
    ns['table4'] = hl.import_table('data/kt_example4.tsv',
                                   impute=True,
                                   types=table4_types)

    people_types = {'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)}
    ns['people_table'] = hl.import_table('data/explode_example.tsv',
                                         delimiter='\\s+',
                                         types=people_types,
                                         key='Name')

    # TDT
    ns['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    ns['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    ns.update({
        'names': hl.literal(['Alice', 'Bob', 'Charlie']),
        'a1': hl.literal([0, 1, 2, 3, 4, 5]),
        'a2': hl.literal([1, -1, 1, -1, 1, -1]),
        't': hl.literal(True),
        'f': hl.literal(False),
        'na': hl.null(hl.tbool),
        'call': hl.call(0, 1, phased=False),
        'a': hl.literal([1, 2, 3, 4, 5]),
        'd': hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44}),
        'interval': hl.interval(3, 11),
        'locus_interval': hl.parse_locus_interval("1:53242-90543"),
        'locus': hl.locus('1', 1034245),
        'x': hl.literal(3),
        'y': hl.literal(4.5),
        's1': hl.literal({1, 2, 3}),
        's2': hl.literal({1, 3, 5}),
        's3': hl.literal({'Alice', 'Bob', 'Charlie'}),
        'struct': hl.struct(a=5, b='Foo'),
        'tup': hl.literal(("a", 1, [1, 2, 3])),
        's': hl.literal('The quick brown fox'),
        'interval2': hl.Interval(3, 6),
        'nd': hl._ndarray([[1, 2], [3, 4]]),
    })

    # Overview
    ns['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    ns['mt'] = ds

    gnomad_data = ds.rows()
    ns['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    ns['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
def test_operators(self):
    """Check arithmetic, comparison and boolean operators on table fields.

    Every operator form is annotated onto a small table; the first row is
    then compared field by field with hand-computed values. Float
    expectations are compared approximately, everything else exactly.
    """
    schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32,
                        e=hl.tstr, f=hl.tarray(hl.tint32))
    rows = [
        {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
        {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
        {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]},
    ]
    kt = hl.Table.parallelize(rows, schema)

    annotated = kt.annotate(
        x1=kt.a + 5,
        x2=5 + kt.a,
        x3=kt.a + kt.b,
        x4=kt.a - 5,
        x5=5 - kt.a,
        x6=kt.a - kt.b,
        x7=kt.a * 5,
        x8=5 * kt.a,
        x9=kt.a * kt.b,
        x10=kt.a / 5,
        x11=5 / kt.a,
        x12=kt.a / kt.b,
        x13=-kt.a,
        x14=+kt.a,
        x15=kt.a == kt.b,
        x16=kt.a == 5,
        x17=5 == kt.a,
        x18=kt.a != kt.b,
        x19=kt.a != 5,
        x20=5 != kt.a,
        x21=kt.a > kt.b,
        x22=kt.a > 5,
        x23=5 > kt.a,
        x24=kt.a >= kt.b,
        x25=kt.a >= 5,
        x26=5 >= kt.a,
        x27=kt.a < kt.b,
        x28=kt.a < 5,
        x29=5 < kt.a,
        x30=kt.a <= kt.b,
        x31=kt.a <= 5,
        x32=5 <= kt.a,
        x33=(kt.a == 0) & (kt.b == 5),
        x34=(kt.a == 0) | (kt.b == 5),
        x35=False,
        x36=True,
    )
    result = convert_struct_to_dict(annotated.take(1)[0])

    expected = {
        'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3],
        'x1': 9, 'x2': 9, 'x3': 5,
        'x4': -1, 'x5': 1, 'x6': 3,
        'x7': 20, 'x8': 20, 'x9': 4,
        'x10': 4.0 / 5, 'x11': 5.0 / 4, 'x12': 4,
        'x13': -4, 'x14': 4,
        'x15': False, 'x16': False, 'x17': False,
        'x18': True, 'x19': True, 'x20': True,
        'x21': True, 'x22': False, 'x23': True,
        'x24': True, 'x25': False, 'x26': True,
        'x27': False, 'x28': True, 'x29': False,
        'x30': False, 'x31': True, 'x32': False,
        'x33': False, 'x34': False,
        'x35': False, 'x36': True,
    }

    for field, want in expected.items():
        # Floats are compared approximately; everything else exactly.
        compare = self.assertAlmostEqual if isinstance(want, float) else self.assertEqual
        compare(want, result[field], msg=field)