Ejemplo n.º 1
0
    def test_select(self):
        """Check the row and key field names that ``Table.select`` produces."""
        row_type = hl.tstruct(
            a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
            f=hl.tarray(hl.tint32), g=hl.tstruct(x=hl.tbool, y=hl.tint32))
        data = [
            {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None},
        ]
        kt = hl.Table.parallelize(data, row_type)

        # Positional field references on a table without a key.
        unkeyed = kt.select(kt.a, kt.e)
        self.assertEqual(list(unkeyed.row), ['a', 'e'])
        self.assertEqual(list(unkeyed.key), [])

        # On a keyed table the key field stays in the row, ahead of the selection.
        keyed = kt.key_by('e')
        keyed = keyed.select(keyed.a)
        self.assertEqual(list(keyed.row), ['e', 'a'])
        self.assertEqual(list(keyed.key), ['e'])

        # Named expressions, alone and combined with a splatted struct field.
        with_foo = kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d)
        self.assertEqual(list(with_foo.row), ['a', 'foo'])
        with_foo_and_g = kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d, **kt.g)
        self.assertEqual(list(with_foo_and_g.row), ['a', 'foo', 'x', 'y'])

        # select no fields
        nothing = kt.select()
        self.assertEqual(list(nothing.row), [])
        self.assertEqual(list(nothing.key), [])
Ejemplo n.º 2
0
    def test_annotate(self):
        """Annotate globals, rows, columns and entries, then check the resulting schemas.

        The dataset comes from ``self.get_vds()``; the column schema is compared
        with ``schema_eq`` and the entry schema is checked after ``select_entries``.
        """
        vds = self.get_vds()
        vds = vds.annotate_globals(foo=5)

        self.assertEqual(vds.globals.dtype, hl.tstruct(foo=hl.tint32))

        vds = vds.annotate_rows(x1=agg.count(),
                                x2=agg.fraction(False),
                                x3=agg.count_where(True),
                                x4=vds.info.AC + vds.foo)

        vds = vds.annotate_cols(apple=6)
        vds = vds.annotate_cols(y1=agg.count(),
                                y2=agg.fraction(False),
                                y3=agg.count_where(True),
                                y4=vds.foo + vds.apple)

        expected_schema = hl.tstruct(s=hl.tstr, apple=hl.tint32, y1=hl.tint64, y2=hl.tfloat64, y3=hl.tint64,
                                     y4=hl.tint32)

        # The failure message labels each value correctly: the hand-written
        # schema is "expected", the dataset's column dtype is "actual".
        self.assertTrue(schema_eq(vds.col.dtype, expected_schema),
                        "expected: " + str(expected_schema) + "\nactual: " + str(vds.col.dtype))

        vds = vds.select_entries(z1=vds.x1 + vds.foo,
                                 z2=vds.x1 + vds.y1 + vds.foo)
        self.assertTrue(schema_eq(vds.entry.dtype, hl.tstruct(z1=hl.tint64, z2=hl.tint64)))
Ejemplo n.º 3
0
 def visit_struct(self, node, visited_children):
     """Build an ``hl.tstruct`` from the visited children of a struct node.

     ``visited_children`` must unpack into exactly five items; only the fourth
     (the optional field list) is used. When it is empty an empty struct type
     is returned, otherwise its first element is converted with ``dict`` and
     passed as keyword arguments to ``hl.tstruct``.
     """
     _, _, _, maybe_fields, _ = visited_children
     if maybe_fields:
         return hl.tstruct(**dict(maybe_fields[0]))
     return hl.tstruct()
Ejemplo n.º 4
0
    def test_range_table(self):
        """Check the schema, key and contents of a 26-row ``range_table``."""
        n_rows = 26
        ht = hl.utils.range_table(n_rows, n_partitions=5)

        # Types: no globals, a single int32 ``idx`` row field, and no value fields.
        self.assertEqual(ht.globals.dtype, hl.tstruct())
        self.assertEqual(ht.row.dtype, hl.tstruct(idx=hl.tint32))
        self.assertEqual(ht.row_value.dtype, hl.tstruct())
        self.assertEqual(list(ht.key), ['idx'])

        # Contents: collected indices come back as 0..25 in order.
        collected = [row.idx for row in ht.collect()]
        self.assertEqual(collected, list(range(n_rows)))
Ejemplo n.º 5
0
 def _compute_type(self):
     """Set ``self._type`` to a matrix type derived from the child's type.

     Every field of the child's column value type and entry type is wrapped
     in ``hl.tarray``; the global type, row type and both keys are passed
     through unchanged.
     """
     child_typ = self.child.typ

     def wrap_fields_in_arrays(struct_type):
         # Same field names, each field type wrapped in an array type.
         return hl.tstruct(**{name: hl.tarray(field_type) for name, field_type in struct_type.items()})

     global_type = child_typ.global_type
     col_type = child_typ.col_key_type._concat(wrap_fields_in_arrays(child_typ.col_value_type))
     col_key = child_typ.col_key
     row_type = child_typ.row_type
     row_key = child_typ.row_key
     entry_type = wrap_fields_in_arrays(child_typ.entry_type)

     self._type = hl.tmatrix(global_type, col_type, col_key, row_type, row_key, entry_type)
Ejemplo n.º 6
0
 def test_localize_entries(self):
     """Compare ``_localize_entries`` on an 8x6 range matrix table with a hand-built table."""
     n_rows, n_cols = 8, 6

     # Reference table: one row per row index holding an array of entry structs,
     # plus a global array of column structs.
     expected_rows = [
         {'row_idx': i, '__entries': [{'v': i+j} for j in range(n_cols)]}
         for i in range(n_rows)
     ]
     expected_type = hl.tstruct(row_idx=hl.tint32,
                                __entries=hl.tarray(hl.tstruct(v=hl.tint32)))
     expected = hl.Table.parallelize(expected_rows, expected_type).key_by('row_idx')
     expected = expected.select_globals(__cols=[hl.struct(col_idx=i) for i in range(n_cols)])

     mt = hl.utils.range_matrix_table(n_rows, n_cols)
     mt = mt.annotate_entries(v=mt.row_idx+mt.col_idx)
     localized = mt._localize_entries('__entries', '__cols')

     self.assertTrue(localized._same(expected))
Ejemplo n.º 7
0
 def test_transmute(self):
     """Check the dtypes left after transmuting globals, rows, columns and entries.

     Each axis starts with two int fields; the transmute adds a third computed
     from the second, and the asserted dtype holds the first and the new field.
     """
     mt = hl.utils.range_matrix_table(1, 1)
     mt = mt.annotate_globals(g1=0, g2=0)
     mt = mt.annotate_cols(c1=0, c2=0)
     mt = mt.annotate_rows(r1=0, r2=0)
     mt = mt.annotate_entries(e1=0, e2=0)

     globals_dtype = mt.transmute_globals(g3=mt.g2 + 1).globals.dtype
     self.assertEqual(globals_dtype, hl.tstruct(g1=hl.tint, g3=hl.tint))

     row_value_dtype = mt.transmute_rows(r3=mt.r2 + 1).row_value.dtype
     self.assertEqual(row_value_dtype, hl.tstruct(r1=hl.tint, r3=hl.tint))

     col_value_dtype = mt.transmute_cols(c3=mt.c2 + 1).col_value.dtype
     self.assertEqual(col_value_dtype, hl.tstruct(c1=hl.tint, c3=hl.tint))

     entry_dtype = mt.transmute_entries(e3=mt.e2 + 1).entry.dtype
     self.assertEqual(entry_dtype, hl.tstruct(e1=hl.tint, e3=hl.tint))
Ejemplo n.º 8
0
    def test_reference_genome_liftover(self):
        """Lift loci and locus intervals between GRCh37 and GRCh38 using chain files.

        The test first asserts that neither reference has a liftover to the
        other, registers both chain files, and then checks:

        * a GRCh37 -> GRCh38 -> GRCh37 round trip of the loci in ``sample.vcf``;
        * a table of hand-listed GRCh37 loci against their expected GRCh38 loci
          (missing where no lifted locus is expected);
        * keying by the lifted locus;
        * two hand-listed locus intervals.

        The liftovers are removed in a ``finally`` clause so that a failing
        assertion does not leave them registered on the reference genomes.
        """
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        # Precondition check stays outside the try block: if a liftover is
        # already registered by someone else, this test must not remove it.
        self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))
        try:
            grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
            grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
            self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

            ds = hl.import_vcf(resource('sample.vcf'))
            t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
            self.assertTrue(t.all(t.locus == t.liftover))

            null_locus = hl.null(hl.tlocus('GRCh38'))

            rows = [
                {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
                {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
                {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
                {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
                {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
                {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
                {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
            ]
            schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
            t = hl.Table.parallelize(rows, schema)
            # Where an expected locus is given the lifted locus must equal it;
            # otherwise the lifted locus must be missing.
            self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                          hl.liftover(t.l37, 'GRCh38') == t.l38,
                                          hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

            t = t.filter(hl.is_defined(t.l38))
            self.assertEqual(t.count(), 6)

            t = t.key_by('l38')
            # Result intentionally discarded: the count is run on the re-keyed table.
            t.count()
            self.assertEqual(list(t.key), ['l38'])

            null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
            rows = [
                {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
                {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
                 'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
            ]
            schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
        finally:
            # Only remove what was actually registered, so a failure in
            # add_liftover itself is not masked by a second error here.
            if grch37.has_liftover('GRCh38'):
                grch37.remove_liftover("GRCh38")
            if grch38.has_liftover('GRCh37'):
                grch38.remove_liftover("GRCh37")
Ejemplo n.º 9
0
    def test_range_matrix_table(self):
        """Check the schema, keys and indices of a 13x7 ``range_matrix_table``."""
        n_rows, n_cols = 13, 7
        mt = hl.utils.range_matrix_table(n_rows, n_cols, n_partitions=5)

        # Types: empty globals and entries, one int32 index field per axis.
        self.assertEqual(mt.globals.dtype, hl.tstruct())
        self.assertEqual(mt.row.dtype, hl.tstruct(row_idx=hl.tint32))
        self.assertEqual(mt.col.dtype, hl.tstruct(col_idx=hl.tint32))
        self.assertEqual(mt.entry.dtype, hl.tstruct())

        # Keys: each axis is keyed by its index field.
        self.assertEqual(list(mt.row_key), ['row_idx'])
        self.assertEqual(list(mt.col_key), ['col_idx'])

        # Contents: indices are collected in ascending order on both axes.
        row_indices = [row.row_idx for row in mt.rows().collect()]
        self.assertEqual(row_indices, list(range(n_rows)))
        col_indices = [col.col_idx for col in mt.cols().collect()]
        self.assertEqual(col_indices, list(range(n_cols)))
Ejemplo n.º 10
0
 def test_localize_self_join(self):
     """Outer self-join a localized matrix table and compare with a self-joined reference table."""
     n_rows, n_cols = 8, 6

     # Reference: hand-built localized rows, outer-joined with themselves.
     expected_rows = [
         {'row_idx': i, '__entries': [{'v': i+j} for j in range(n_cols)]}
         for i in range(n_rows)
     ]
     expected_type = hl.tstruct(row_idx=hl.tint32,
                                __entries=hl.tarray(hl.tstruct(v=hl.tint32)))
     expected = hl.Table.parallelize(expected_rows, expected_type).key_by('row_idx')
     expected = expected.join(expected, how='outer')

     # Same construction starting from a range matrix table, with the
     # localized columns field dropped before the join.
     mt = hl.utils.range_matrix_table(n_rows, n_cols)
     mt = mt.annotate_entries(v=mt.row_idx+mt.col_idx)
     localized = mt._localize_entries('__entries', '__cols').drop('__cols')
     localized = localized.join(localized, how='outer')

     self.assertTrue(localized._same(expected))
Ejemplo n.º 11
0
    def test_maximal_independent_set(self):
        """Check ``maximal_independent_set`` with a tie breaker, plus its argument validation."""
        # prefer to remove nodes with higher index
        nodes = hl.utils.range_table(10)
        graph = nodes.select(i=hl.int64(nodes.idx), j=hl.int64(nodes.idx + 10), bad_type=hl.float32(nodes.idx))

        mis_table = hl.maximal_independent_set(graph.i, graph.j, True, lambda l, r: l - r)
        kept = sorted(row['node'] for row in mis_table.collect())
        self.assertEqual(kept, list(range(0, 10)))

        node_struct = hl.tstruct(node=hl.tint64)
        self.assertEqual(mis_table.row.dtype, node_struct)
        self.assertEqual(mis_table.key.dtype, node_struct)

        # Each of the following argument combinations must raise ValueError.
        with self.assertRaises(ValueError):
            hl.maximal_independent_set(graph.i, graph.bad_type, True)
        with self.assertRaises(ValueError):
            hl.maximal_independent_set(graph.i, hl.utils.range_table(10).idx, True)
        with self.assertRaises(ValueError):
            hl.maximal_independent_set(hl.literal(1), hl.literal(2), True)
Ejemplo n.º 12
0
    def test_transmute(self):
        """Check that ``Table.transmute`` adds the new field and drops the fields it used.

        The third input row has ``g`` set to ``None``, and the expected ``h``
        for that row is ``None``.
        """
        row_type = hl.tstruct(
            a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
            f=hl.tarray(hl.tint32), g=hl.tstruct(x=hl.tbool, y=hl.tint32))
        data = [
            {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None},
        ]
        table = hl.Table.parallelize(data, row_type)

        table = table.transmute(h=table.a + table.b + table.c + table.g.y)
        collected = table.select('h').collect()

        self.assertEqual(list(table.row), ['d', 'e', 'f', 'h'])
        expected = [hl.Struct(h=value) for value in [10, 20, None]]
        self.assertEqual(collected, expected)
Ejemplo n.º 13
0
    def test_maximal_independent_set2(self):
        """Run ``maximal_independent_set`` on a 12-edge, 8-node graph without a tie breaker.

        The result must have a single int64 ``node`` row field and empty
        globals, and the returned node set must be one of the four sets
        listed at the end of the test.
        """
        edge_pairs = [(0, 4), (0, 1), (0, 2), (1, 5), (1, 3), (2, 3), (2, 6),
                      (3, 7), (4, 5), (4, 6), (5, 7), (6, 7)]
        edge_rows = [{"i": left, "j": right} for left, right in edge_pairs]

        t = hl.Table.parallelize(edge_rows, hl.tstruct(i=hl.tint64, j=hl.tint64))
        mis_t = hl.maximal_independent_set(t.i, t.j)
        self.assertTrue(mis_t.row.dtype == hl.tstruct(node=hl.tint64))
        self.assertTrue(mis_t.globals.dtype == hl.tstruct())

        found = {row.node for row in mis_t.collect()}
        four_node_sets = [{0, 6, 5, 3}, {1, 4, 7, 2}]
        two_node_sets = [{0, 7}, {6, 1}]
        self.assertTrue(found in two_node_sets or found in four_node_sets)
Ejemplo n.º 14
0
    def test_group_cols_by_aggregate(self):
        """Group columns, chain column and entry aggregations, and compare the entries table.

        The matrix tables come from ``self.get_groupable_matrix2()``. The
        expected table has four rows per group, with ``sum`` increasing by
        one per ``row_idx``.
        """
        mt, mt2 = self.get_groupable_matrix2()

        grouped = mt.group_cols_by(group=mt2.cols()[mt.col_idx].col_idx2 < 2)
        grouped = grouped.aggregate_cols(collect=hl.agg.collect(mt.col_idx))
        grouped = grouped.aggregate_cols(count=hl.agg.count())
        # tests fixed indices
        grouped = grouped.aggregate_entries(
            sum=hl.agg.sum(mt2[mt.row_idx, mt.col_idx].x + mt.glob) + mt.glob - 15 - mt.row_idx)
        grouped = grouped.aggregate_entries(x=5)
        col_result = grouped.result()

        # (group flag, collected column indices, sum at row_idx 0)
        group_specs = [(True, [0, 1], 1), (False, [2, 3], 5)]
        expected_rows = [
            {'group': group, 'row_idx': row_idx, 'sum': first_sum + row_idx,
             'collect': list(col_indices), 'count': 2, 'r1': 3, 'x': 5}
            for group, col_indices, first_sum in group_specs
            for row_idx in range(4)
        ]
        expected_type = hl.tstruct(row_idx=hl.tint32, r1=hl.tint32, group=hl.tbool,
                                   collect=hl.tarray(hl.tint32),
                                   count=hl.tint64, sum=hl.tint64, x=hl.tint32)
        col_expected = hl.Table.parallelize(expected_rows, expected_type)
        col_expected = col_expected.annotate_globals(glob=5).key_by('row_idx', 'group')

        self.assertTrue(col_result.entries()._same(col_expected))
Ejemplo n.º 15
0
    def test_import_gen_no_reference_specified(self):
        """Import a GEN file with ``reference_genome=None`` and check the locus type and row count.

        With no reference genome the ``locus`` field is expected to be a plain
        struct of ``contig`` (str) and ``position`` (int32).
        """
        gen = hl.import_gen(resource('example.gen'),
                            sample_file=resource('example.sample'),
                            reference_genome=None)

        # assertEqual (rather than assertTrue on ==) reports both dtypes on failure.
        self.assertEqual(gen.locus.dtype, hl.tstruct(contig=hl.tstr, position=hl.tint32))
        self.assertEqual(gen.count_rows(), 199)
Ejemplo n.º 16
0
    def test_index_maintains_count(self):
        """Annotating through a keyed lookup must leave the row count unchanged.

        Both tables contain a repeated key (``'bar'``), and the lookup table
        also has a key (``'quam'``) that is absent from the annotated table.
        """
        left_rows = [
            {'a': 'foo', 'b': 1},
            {'a': 'bar', 'b': 2},
            {'a': 'bar', 'b': 2},
        ]
        t1 = hl.Table.parallelize(left_rows, hl.tstruct(a=hl.tstr, b=hl.tint32), key='a')

        right_rows = [
            {'t': 'foo', 'x': 3.14},
            {'t': 'bar', 'x': 2.78},
            {'t': 'bar', 'x': -1},
            {'t': 'quam', 'x': 0},
        ]
        t2 = hl.Table.parallelize(right_rows, hl.tstruct(t=hl.tstr, x=hl.tfloat64), key='t')

        annotated = t1.annotate(f=t2[t1.a].x)
        self.assertEqual(annotated.count(), t1.count())
Ejemplo n.º 17
0
 def _compute_type(self):
     """Set ``self._type`` to a table type derived from the first child's type.

     The global type is a struct with one field, named ``self.global_name``,
     holding an array of the child's global type. The row type is the child's
     key type with an added field, named ``self.data_name``, holding an array
     of the child's value type. The row key is passed through.
     """
     # Touch every child's type first so each one is computed.
     for child in self.children:
         child.typ  # force

     child_typ = self.children[0].typ
     global_type = hl.tstruct(**{self.global_name: hl.tarray(child_typ.global_type)})
     row_type = child_typ.key_type._insert_field(self.data_name, hl.tarray(child_typ.value_type))
     self._type = hl.ttable(global_type, row_type, child_typ.row_key)
Ejemplo n.º 18
0
    def test_import_bgen_dosage_entry(self):
        """Import a BGEN file with only the ``dosage`` entry field and check the type and row count."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding={'01': '1'}, reference_genome='GRCh37')

        bgen = hl.import_bgen(bgen_path, entry_fields=['dosage'])

        self.assertEqual(bgen.entry.dtype, hl.tstruct(dosage=hl.tfloat64))
        self.assertEqual(bgen.count_rows(), 199)
Ejemplo n.º 19
0
    def test_import_bgen_GT_GP_entries(self):
        """Import a BGEN file with ``GT`` and ``GP`` entry fields and check the entry type."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding={'01': '1'}, reference_genome='GRCh37')

        bgen = hl.import_bgen(bgen_path,
                              entry_fields=['GT', 'GP'],
                              sample_file=resource('example.sample'))

        expected_entry = hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64))
        self.assertEqual(bgen.entry.dtype, expected_entry)
Ejemplo n.º 20
0
    def test_import_bgen_no_reference(self):
        """Index a BGEN file with ``reference_genome=None`` and check the locus type and row count."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding={'01': '1'}, reference_genome=None)

        bgen = hl.import_bgen(bgen_path, entry_fields=['GT', 'GP', 'dosage'])

        # Without a reference genome the locus is a plain contig/position struct.
        expected_locus = hl.tstruct(contig=hl.tstr, position=hl.tint32)
        self.assertEqual(bgen.locus.dtype, expected_locus)
        self.assertEqual(bgen.count_rows(), 199)
Ejemplo n.º 21
0
    def test_make_table_empty_entry_field(self):
        """``make_table`` with an entry field named ``''`` yields row fields named after the column keys."""
        mt = hl.utils.range_matrix_table(3, 2)
        # The single entry field is deliberately given the empty string as its name.
        mt = mt.select_entries(**{'': mt.row_idx * mt.col_idx})
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        table = mt.make_table()

        expected_row = hl.tstruct(**{'row_idx': hl.tint32, '0': hl.tint32, '1': hl.tint32})
        self.assertEqual(table.row.dtype, expected_row)
Ejemplo n.º 22
0
    def table_irs(self):
        """Return a list of table IR nodes built on one table read and one matrix read.

        Both reads use resources under ``backward_compatability/1.0.0``. The
        list holds one instance of each table IR constructor exercised here.
        """
        b = ir.TrueIR()
        table_read = ir.TableRead(
            ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht')), False)
        # NOTE(review): this parsed row type is not referenced anywhere else in
        # this method; kept so the dtype string is still parsed as before.
        table_read_row_type = hl.dtype('struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}')

        matrix_read = ir.MatrixRead(
            ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')), False, False)

        table_irs = [
            ir.TableKeyBy(table_read, ['m', 'd'], False),
            ir.TableFilter(table_read, b),
            table_read,
            ir.MatrixColsTable(matrix_read),
            ir.TableAggregateByKey(
                table_read,
                ir.MakeStruct([('a', ir.I32(5))])),
            ir.TableKeyByAndAggregate(
                table_read,
                ir.MakeStruct([('a', ir.I32(5))]),
                ir.MakeStruct([('b', ir.I32(5))]),
                1, 2),
            ir.TableJoin(
                table_read,
                ir.TableRange(100, 10), 'inner', 1),
            ir.MatrixEntriesTable(matrix_read),
            ir.MatrixRowsTable(matrix_read),
            ir.TableParallelize(ir.MakeStruct([
                ('rows', ir.Literal(hl.tarray(hl.tstruct(a=hl.tint32)), [{'a':None}, {'a':5}, {'a':-3}])),
                ('global', ir.MakeStruct([]))]), None),
            ir.TableMapRows(
                ir.TableKeyBy(table_read, []),
                ir.MakeStruct([
                    ('a', ir.GetField(ir.Ref('row'), 'f32')),
                    ('b', ir.F64(-2.11))])),
            ir.TableMapGlobals(
                table_read,
                ir.MakeStruct([
                    ('foo', ir.NA(hl.tarray(hl.tint32)))])),
            ir.TableRange(100, 10),
            ir.TableRepartition(table_read, 10, ir.RepartitionStrategy.COALESCE),
            ir.TableUnion(
                [ir.TableRange(100, 10), ir.TableRange(50, 10)]),
            ir.TableExplode(table_read, ['mset']),
            ir.TableHead(table_read, 10),
            ir.TableOrderBy(ir.TableKeyBy(table_read, []), [('m', 'A'), ('m', 'D')]),
            ir.TableDistinct(table_read),
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            ir.TableRename(table_read, {'idx': 'idx_foo'}, {'global_f32': 'global_foo'}),
            ir.TableMultiWayZipJoin([table_read, table_read], '__data', '__globals'),
            ir.MatrixToTableApply(matrix_read, {'name': 'LinearRegressionRowsSingle', 'yFields': ['col_m'], 'xField': 'entry_m', 'covFields': [], 'rowBlockSize': 10, 'passThrough': []}),
            ir.TableToTableApply(table_read, {'name': 'TableFilterPartitions', 'parts': [0], 'keep': True})
        ]

        return table_irs
Ejemplo n.º 23
0
 def test_aggregate_by_key_partitioning(self):
     """Group a keyed table by its own key and check the per-key mean of ``b``."""
     rows = [
         {'k': 'foo', 'b': 1},
         {'k': 'bar', 'b': 2},
         {'k': 'bar', 'b': 2},
     ]
     ht1 = hl.Table.parallelize(rows, hl.tstruct(k=hl.tstr, b=hl.tint32), key='k')

     aggregated = ht1.group_by('k').aggregate(mean_b=hl.agg.mean(ht1.b))
     observed = set(aggregated.collect())

     expected = {hl.Struct(k='foo', mean_b=1.0), hl.Struct(k='bar', mean_b=2.0)}
     self.assertEqual(observed, expected)
Ejemplo n.º 24
0
    def test_import_bgen_no_entries(self):
        """Import a BGEN file with no entry fields; the entry type must be an empty struct."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding={'01': '1'}, reference_genome='GRCh37')

        bgen = hl.import_bgen(bgen_path,
                              entry_fields=[],
                              sample_file=resource('example.sample'))

        self.assertEqual(bgen.entry.dtype, hl.tstruct())
        # Also run the typecheck exposed by the underlying ``_jvds`` object.
        bgen._jvds.typecheck()
Ejemplo n.º 25
0
    def test_joins_work_correctly(self):
        """Group by columns and by rows using joined expressions and compare the entries tables.

        The matrix tables come from ``self.get_groupable_matrix2()``. In both
        expected tables ``sum`` starts at 1 for the ``True`` group and 5 for
        the ``False`` group, and grows by 2 per index along the other axis.
        """
        mt, mt2 = self.get_groupable_matrix2()

        # (group flag, sum at index 0)
        group_specs = ((True, 1), (False, 5))

        # --- grouping columns ---
        col_grouped = mt.group_cols_by(group=mt2.cols()[mt.col_idx].col_idx2 < 2)
        col_result = col_grouped.aggregate(
            sum=hl.agg.sum(mt2[mt.row_idx, mt.col_idx].x + mt.glob) + mt.glob - 15)
        col_result = col_result.drop('r1')

        col_rows = [
            {'row_idx': row_idx, 'group': group, 'sum': first_sum + 2 * row_idx}
            for row_idx in range(4)
            for group, first_sum in group_specs
        ]
        col_expected = hl.Table.parallelize(
            col_rows,
            hl.tstruct(row_idx=hl.tint32, group=hl.tbool, sum=hl.tint64))
        col_expected = col_expected.annotate_globals(glob=5).key_by('row_idx', 'group')

        self.assertTrue(col_result.entries()._same(col_expected))

        # --- grouping rows ---
        row_grouped = mt.group_rows_by(group=mt2.rows()[mt.row_idx].row_idx2 < 2)
        row_result = row_grouped.aggregate(
            sum=hl.agg.sum(mt2[mt.row_idx, mt.col_idx].x + mt.glob) + mt.glob - 15)
        row_result = row_result.drop('c1')

        row_rows = [
            {'group': group, 'col_idx': col_idx, 'sum': first_sum + 2 * col_idx}
            for group, first_sum in group_specs
            for col_idx in range(4)
        ]
        row_expected = hl.Table.parallelize(
            row_rows,
            hl.tstruct(group=hl.tbool, col_idx=hl.tint32, sum=hl.tint64))
        row_expected = row_expected.annotate_globals(glob=5).key_by('group', 'col_idx')

        self.assertTrue(row_result.entries()._same(row_expected))
Ejemplo n.º 26
0
    def test_maximal_independent_set3(self):
        """Run ``maximal_independent_set`` on struct-valued nodes with a case-preferring tie breaker.

        Every returned node must have ``is_case`` set, and the returned ids
        must form one of the two expected sets.
        """
        case_ids = {"A", "C", "E", "G", "H"}
        id_pairs = [("A", "B"), ("C", "D"), ("E", "F"), ("G", "H")]
        edge_rows = [
            {"i": {"id": left, "is_case": left in case_ids},
             "j": {"id": right, "is_case": right in case_ids}}
            for left, right in id_pairs
        ]

        node_type = hl.tstruct(id=hl.tstr, is_case=hl.tbool)
        t = hl.Table.parallelize(edge_rows, hl.tstruct(i=node_type, j=node_type))

        def tiebreaker(l, r):
            # -1 when only the left node is a case, 1 when only the right is, else 0.
            return (hl.case()
                    .when(l.is_case & (~r.is_case), -1)
                    .when(~(l.is_case) & r.is_case, 1)
                    .default(0))

        mis = hl.maximal_independent_set(t.i, t.j, tie_breaker=tiebreaker)

        expected_sets = [{"A", "C", "E", "G"}, {"A", "C", "E", "H"}]

        self.assertTrue(mis.all(mis.node.is_case))
        found_ids = {row.id for row in mis.select(mis.node.id).collect()}
        self.assertTrue(found_ids in expected_sets)
Ejemplo n.º 27
0
 def test_import_vcf_missing_info_field_elements(self):
     """Import a VCF with ``array_elements_required=False`` and compare the INFO arrays.

     The expected ``FOO`` and ``BAR`` arrays contain ``None`` elements.
     """
     mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37', array_elements_required=False)
     mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)

     expected_rows = [
         {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
          'FOO': [1, None], 'BAR': [2, None, None]},
         {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
          'FOO': [None, 2, None], 'BAR': [None, 1.0, None]},
     ]
     expected_type = hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr),
                                FOO=hl.tarray(hl.tint), BAR=hl.tarray(hl.tfloat64))
     expected = hl.Table.parallelize(expected_rows, expected_type, key=['locus', 'alleles'])

     self.assertTrue(mt.rows()._same(expected))
Ejemplo n.º 28
0
 def test_computed_key_join_1(self):
     """Join a lookup table onto rows through a key computed from the locus position.

     The lookup table maps 0 to ``True`` and 1 to ``False``, so the joined
     value must equal "position is even" on every row.
     """
     ds = self.get_vds()

     lookup_rows = [
         {'key': 0, 'value': True},
         {'key': 1, 'value': False},
     ]
     kt = hl.Table.parallelize(lookup_rows,
                               hl.tstruct(key=hl.tint32, value=hl.tbool),
                               key=['key'])

     ds = ds.annotate_rows(key=ds.locus.position % 2)
     ds = ds.annotate_rows(value=kt[ds['key']]['value'])
     rt = ds.rows()

     position_is_even = (rt.locus.position % 2) == 0
     self.assertTrue(rt.all(position_is_even == rt['value']))
Ejemplo n.º 29
0
    def test_multi_way_zip_join(self):
        """Zip-join three tables keyed by ``id`` and compare with a hand-built result.

        Also checks the join when the data field is named ``data`` (the same
        name as an input field), and the forced row count after dropping both
        the data and globals fields.
        """
        row_type = hl.tstruct(id=hl.tint32, name=hl.tstr, data=hl.tfloat64)
        inputs = [
            [{"id": 0, "name": "a", "data": 0.0},
             {"id": 1, "name": "b", "data": 3.14},
             {"id": 2, "name": "c", "data": 2.78}],
            [{"id": 0, "name": "d", "data": 1.1},
             {"id": 0, "name": "x", "data": 2.2},
             {"id": 2, "name": "v", "data": 7.89}],
            [{"id": 1, "name": "f", "data":  9.99},
             {"id": 2, "name": "g", "data": -1.0},
             {"id": 3, "name": "z", "data":  0.01}],
        ]
        ts = [hl.Table.parallelize(rows, schema=row_type, key='id') for rows in inputs]
        joined = hl.Table._multi_way_zip_join(ts, '__data', '__globals').drop('__globals')

        def value(name, data):
            # One joined element: the non-key fields of an input row.
            return {"name": name, "data": data}

        # One slot per input table; None where that table contributes no row.
        dexpected = [
            {"id": 0, "__data": [value("a", 0.0), value("d", 1.1), None]},
            {"id": 0, "__data": [None, value("x", 2.2), None]},
            {"id": 1, "__data": [value("b", 3.14), None, value("f", 9.99)]},
            {"id": 2, "__data": [value("c", 2.78), value("v", 7.89), value("g", -1.0)]},
            {"id": 3, "__data": [None, None, value("z", 0.01)]},
        ]
        expected = hl.Table.parallelize(
            dexpected,
            schema=hl.tstruct(id=hl.tint32, __data=hl.tarray(hl.tstruct(name=hl.tstr, data=hl.tfloat64))),
            key='id')
        self.assertTrue(expected._same(joined))

        # Same join with the data field named like an existing input field.
        expected_same_name = expected.transmute(data=expected['__data'])
        joined_same_name = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('globals')
        self.assertTrue(expected_same_name._same(joined_same_name))

        # Dropping every non-key field must still leave five rows.
        joined_nothing = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('data', 'globals')
        self.assertEqual(joined_nothing._force_count(), 5)
Ejemplo n.º 30
0
 def test_parses(self):
     """Parse the string form of every IR from ``self.value_irs()`` with the JVM IR parser.

     The parser is given an environment mapping free variable names to the
     ``_jtype`` of the Hail types listed below.
     """
     python_types = {
         'c': hl.tbool,
         'a': hl.tarray(hl.tint32),
         'aa': hl.tarray(hl.tarray(hl.tint32)),
         'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
         'v': hl.tint32,
         's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
         't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
         'call': hl.tcall,
         'x': hl.tint32,
     }
     env = {name: typ._jtype for name, typ in python_types.items()}

     for value_ir in self.value_irs():
         parser = Env.hail().expr.ir.IRParser
         parser.parse_value_ir(str(value_ir), env, {})
Ejemplo n.º 31
0
    def test_select(self):
        """Check the row and key field names that ``Table.select`` produces."""
        g_type = hl.tstruct(x=hl.tbool, y=hl.tint32)
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32,
                            e=hl.tstr, f=hl.tarray(hl.tint32), g=g_type)

        rows = [
            {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None},
        ]

        kt = hl.Table.parallelize(rows, schema)

        # Positional field references on a table without a key.
        picked = kt.select(kt.a, kt.e)
        self.assertEqual(list(picked.row), ['a', 'e'])
        self.assertEqual(list(picked.key), [])

        # On a keyed table the key field stays in the row, ahead of the selection.
        by_e = kt.key_by('e')
        by_e = by_e.select(by_e.a)
        self.assertEqual(list(by_e.row), ['e', 'a'])
        self.assertEqual(list(by_e.key), ['e'])

        # Named expressions, alone and combined with a splatted struct field.
        named = kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d)
        self.assertEqual(list(named.row), ['a', 'foo'])
        named_and_splat = kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d, **kt.g)
        self.assertEqual(list(named_and_splat.row), ['a', 'foo', 'x', 'y'])

        # select no fields
        empty = kt.select()
        self.assertEqual(list(empty.row), [])
        self.assertEqual(list(empty.key), [])
Ejemplo n.º 32
0
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the objects used by the doctests.

    Inputs are read from files under ``data/`` and two datasets are
    checkpointed under ``output/``. Statement order is significant: several
    steps perform I/O or draw random values.

    Parameters
    ----------
    doctest_namespace
        Mapping that receives each object under the name the doctests use.
    """
    doctest_namespace['hl'] = hl

    # Main example MatrixTable: a ~3% sample of the VCF rows with synthetic
    # row, column and global annotations layered on top.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # This replaces the string-valued `gene` row field set above with an array.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row-level and column-level metadata tables derived from `ds`.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden-test dataset: VCF joined with per-sample and per-gene tables.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    # LD score regression: single-phenotype summary statistics as a Table.
    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    # LD score regression: all phenotypes as a MatrixTable. Each entry is read
    # as a "chi_squared,n" string and split into two typed entry fields.
    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Ejemplo n.º 33
0
def main():
    """Build a gene table from Gencode, HGNC and optional MANE Select inputs.

    Command-line arguments:

    ``--gencode VERSION GTF_PATH CANONICAL_TRANSCRIPTS_PATH``
        Required; may be repeated, once per Gencode release to include.
    ``--hgnc PATH``
        Required; HGNC table providing symbols, names and aliases.
    ``--mane-select-transcripts PATH``
        Optional; when given, transcripts are annotated with RefSeq IDs and
        genes with their MANE Select transcript.
    ``--min-partitions N``
        Passed to the Gencode and canonical-transcript imports (default 32).
    ``--output PATH``
        Required; destination of the resulting table (overwritten).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gencode",
        action="append",
        default=[],
        metavar=("version", "gtf_path", "canonical_transcripts_path"),
        nargs=3,
        required=True,
    )
    # The HGNC table is read unconditionally below, so reject a missing
    # --hgnc at parse time instead of failing later inside hl.import_table.
    parser.add_argument("--hgnc", required=True)
    parser.add_argument("--mane-select-transcripts")
    parser.add_argument("--min-partitions", type=int, default=32)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    all_gencode_versions = [version for version, _, _ in args.gencode]

    genes = None
    for gencode_version, gtf_path, canonical_transcripts_path in args.gencode:
        gencode_genes = load_gencode_gene_models(
            gtf_path, min_partitions=args.min_partitions)

        # NOTE(review): this file was described as a two-column TSV (gene ID,
        # transcript ID) with no header row, yet the import relies on columns
        # named `gene_id` and `transcript_id` -- confirm the expected layout.
        canonical_transcripts = hl.import_table(
            canonical_transcripts_path,
            key="gene_id",
            min_partitions=args.min_partitions)
        gencode_genes = gencode_genes.annotate(
            canonical_transcript_id=canonical_transcripts[
                gencode_genes.gene_id].transcript_id)

        # Nest this release's fields under a `v<version>` struct so several
        # releases can coexist in one row.
        gencode_genes = gencode_genes.select(
            **{f"v{gencode_version}": gencode_genes.row_value})

        # Test against None explicitly rather than relying on the truth value
        # of a Hail Table.
        if genes is None:
            genes = gencode_genes
        else:
            genes = genes.join(gencode_genes, "outer")

    genes = genes.select(gencode=genes.row_value)

    hgnc = hl.import_table(args.hgnc, missing="")

    hgnc = hgnc.select(
        hgnc_id=hgnc["HGNC ID"],
        symbol=hgnc["Approved symbol"],
        name=hgnc["Approved name"],
        previous_symbols=hgnc["Previous symbols"],
        alias_symbols=hgnc["Alias symbols"],
        omim_id=hgnc["OMIM ID(supplied by OMIM)"],
        gene_id=hl.or_else(hgnc["Ensembl gene ID"],
                           hgnc["Ensembl ID(supplied by Ensembl)"]),
    )
    hgnc = hgnc.filter(hl.is_defined(hgnc.gene_id)).key_by("gene_id")

    def split_symbols(field):
        """Split a comma-separated symbol list into a stripped string array."""
        # import_table(missing="") turns empty cells into missing values, so
        # the `== ""` test alone yields a missing array; fall back to an
        # empty array in that case.
        return hl.or_else(
            hl.cond(
                field == "",
                hl.empty_array(hl.tstr),
                field.split(",").map(lambda s: s.strip()),
            ),
            hl.empty_array(hl.tstr),
        )

    hgnc = hgnc.annotate(
        previous_symbols=split_symbols(hgnc.previous_symbols),
        alias_symbols=split_symbols(hgnc.alias_symbols),
    )

    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol),
                                                 "hgnc", hl.null(hl.tstr)))

    # If an HGNC gene symbol was not present, use the symbol from Gencode
    for gencode_version in all_gencode_versions:
        gencode_symbol = genes.gencode[f"v{gencode_version}"].gene_symbol
        genes = genes.annotate(
            symbol=hl.or_else(genes.symbol, gencode_symbol),
            symbol_source=hl.cond(
                hl.is_missing(genes.symbol) & hl.is_defined(gencode_symbol),
                f"gencode (v{gencode_version})",
                genes.symbol_source,
            ),
        )

    # Collect all fields that can be used to search by gene name. Genes with
    # no HGNC record have missing symbol arrays; extending with a missing
    # array would make the whole result missing, hence the or_else guards.
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(hl.or_else(genes.previous_symbols,
                               hl.empty_array(hl.tstr)))
            .extend(hl.or_else(genes.alias_symbols,
                               hl.empty_array(hl.tstr)))
        ),
    )
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(search_terms=hl.rbind(
            genes.gencode[f"v{gencode_version}"].gene_symbol,
            lambda symbol_in_gencode: hl.cond(
                hl.is_defined(symbol_in_gencode),
                genes.search_terms.append(symbol_in_gencode),
                genes.search_terms),
        ))

    genes = genes.annotate(
        search_terms=hl.set(genes.search_terms.map(lambda s: s.upper())))

    if args.mane_select_transcripts:
        mane_select_transcripts = hl.import_table(args.mane_select_transcripts,
                                                  force=True)
        # Identifiers look like "<id>.<version>"; split them into two fields.
        mane_select_transcripts = mane_select_transcripts.select(
            gene_id=mane_select_transcripts.Ensembl_Gene.split("\\.")[0],
            matched_gene_version=mane_select_transcripts.Ensembl_Gene.split(
                "\\.")[1],
            ensembl_id=mane_select_transcripts.Ensembl_nuc.split("\\.")[0],
            ensembl_version=mane_select_transcripts.Ensembl_nuc.split(
                "\\.")[1],
            refseq_id=mane_select_transcripts.RefSeq_nuc.split("\\.")[0],
            refseq_version=mane_select_transcripts.RefSeq_nuc.split("\\.")[1],
        )
        mane_select_transcripts = mane_select_transcripts.key_by("gene_id")

        # For GRCh38 (Gencode >= 20) transcripts, use the MANE Select transcripts to annotate transcripts
        # with their matching RefSeq transcript.
        # Lookup shape: ensembl_id -> {ensembl_version -> RefSeq struct}.
        refseq_by_ensembl = {}
        for transcript in mane_select_transcripts.collect():
            refseq_by_ensembl[transcript.ensembl_id] = {
                transcript.ensembl_version:
                hl.Struct(refseq_id=transcript.refseq_id,
                          refseq_version=transcript.refseq_version)
            }

        ensembl_to_refseq_map = hl.literal(refseq_by_ensembl)

        def with_refseq(transcript):
            """Annotate a transcript with its matching RefSeq ID/version."""
            versions = ensembl_to_refseq_map.get(
                transcript.transcript_id,
                hl.empty_dict(
                    hl.tstr,
                    hl.tstruct(refseq_id=hl.tstr, refseq_version=hl.tstr)),
            )
            return transcript.annotate(**versions.get(
                transcript.transcript_version,
                hl.struct(refseq_id=hl.null(hl.tstr),
                          refseq_version=hl.null(hl.tstr)),
            ))

        def without_refseq(transcript):
            """Annotate a transcript with missing RefSeq fields."""
            return transcript.annotate(refseq_id=hl.null(hl.tstr),
                                       refseq_version=hl.null(hl.tstr))

        # Iterate over the releases that were actually loaded; a hard-coded
        # list fails on field lookup whenever the --gencode arguments differ.
        for gencode_version in all_gencode_versions:
            # Versions that are not plain integers get missing RefSeq fields.
            is_grch38 = (gencode_version.isdigit()
                         and int(gencode_version) >= 20)
            transcript_annotation = with_refseq if is_grch38 else without_refseq

            version_field = f"v{gencode_version}"
            genes = genes.annotate(gencode=genes.gencode.annotate(
                **{
                    version_field:
                    genes.gencode[version_field].annotate(
                        transcripts=genes.gencode[version_field].transcripts
                        .map(transcript_annotation))
                }))

        # Annotate genes with their MANE Select transcript
        genes = genes.annotate(
            mane_select_transcript=mane_select_transcripts[genes.gene_id])

    genes.describe()

    genes.write(args.output, overwrite=True)
Ejemplo n.º 34
0
 def _compute_type(self):
     """Set ``self._type`` to the table type of the configured method.

     The method is selected by ``self.config['name']``. Regression methods
     produce the child's row key, the configured pass-through row fields and
     a method-specific result schema; the remaining methods build their own
     global and row types.

     Raises
     ------
     AssertionError
         If the configured name is not one of the handled methods.
     """
     name = self.config['name']
     child_typ = self.child.typ

     regression_methods = ('LinearRegressionRowsChained',
                           'LinearRegressionRowsSingle',
                           'LogisticRegression',
                           'PoissonRegression')

     if name in regression_methods:
         pass_through = self.config['passThrough']
         if name == 'LinearRegressionRowsChained':
             result_schema = hl.dtype(
                 'struct{'
                 'n:array<int32>,'
                 'sum_x:array<float64>,'
                 'y_transpose_x:array<array<float64>>,'
                 'beta:array<array<float64>>,'
                 'standard_error:array<array<float64>>,'
                 't_stat:array<array<float64>>,'
                 'p_value:array<array<float64>>'
                 '}')
         elif name == 'LinearRegressionRowsSingle':
             result_schema = hl.dtype(
                 'struct{'
                 'n:int32,'
                 'sum_x:float64,'
                 'y_transpose_x:array<float64>,'
                 'beta:array<float64>,'
                 'standard_error:array<float64>,'
                 't_stat:array<float64>,'
                 'p_value:array<float64>'
                 '}')
         elif name == 'LogisticRegression':
             result_schema = hl.tstruct(
                 logistic_regression=hl.tarray(regression_test_type(self.config['test'])))
         else:
             result_schema = regression_test_type(self.config['test'])

         # Globals and key come from the child; rows are key + pass-through
         # fields + the method's result fields.
         self._type = hl.ttable(
             child_typ.global_type,
             (child_typ.row_key_type
              ._insert_fields(**{f: child_typ.row_type[f] for f in pass_through})
              ._concat(result_schema)),
             child_typ.row_key)
     elif name == 'Skat':
         key_field = self.config['keyField']
         key_type = child_typ.row_type[key_field]
         skat_type = hl.dtype(f'struct{{id:{key_type},size:int32,q_stat:float64,p_value:float64,fault:int32}}')
         self._type = hl.ttable(
             hl.tstruct(),
             skat_type,
             ['id'])
     elif name == 'PCA':
         # Uses hl.tarray(hl.tfloat64) (same type as 'array<float64>') so this
         # branch depends only on `hl`, like the rest of the method.
         self._type = hl.ttable(
             hl.tstruct(eigenvalues=hl.tarray(hl.tfloat64),
                        scores=hl.tarray(child_typ.col_key_type._insert_field('scores', hl.tarray(hl.tfloat64)))),
             child_typ.row_key_type._insert_field('loadings', hl.tarray(hl.tfloat64)),
             child_typ.row_key)
     else:
         assert name == 'LocalLDPrune', name
         self._type = hl.ttable(
             hl.tstruct(),
             child_typ.row_key_type._insert_fields(mean=hl.tfloat64, centered_length_rec=hl.tfloat64),
             list(child_typ.row_key))
Ejemplo n.º 35
0
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

     - `locus` (``locus``) -- Variant locus.
     - `alleles` (``array<str>``) -- Variant alleles.
     - `id` (``str``) -- Proband sample ID.
     - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
       the computed dataset alternate allele frequency, the
       `pop_frequency_prior` parameter, and the minimum population prior
       ``100 / 3e7``.
     - `proband` (``struct``) -- Proband column fields from `mt`.
     - `father` (``struct``) -- Father column fields from `mt`.
     - `mother` (``struct``) -- Mother column fields from `mt`.
     - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
     - `father_entry` (``struct``) -- Father entry fields from `mt`.
     - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
     - `is_female` (``bool``) -- ``True`` if proband is female.
     - `p_de_novo` (``float64``) -- Unfiltered posterior probability
       that the event is *de novo* rather than a missed heterozygous
       event in a parent.
     - `confidence` (``str``) -- Validation confidence. One of: ``'HIGH'``,
       ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the simplifying
    assumption that when this configuration ``x = (AA, AA, AB)`` of calls
    occurs, exactly one of the following is true:

     - ``d``: a de novo mutation occurred in the proband and all calls are
       accurate.
     - ``m``: at least one parental genotype is actually heterozygous and
       the proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)}

    Applying Bayes rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) +
        \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the literature:

    .. math::

        \mathrm{P}(d) = \frac{1 \text{mutation}}{30,000,000\, \text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x\,|\,d)` and :math:`\mathrm{P}(x\,|\,m)`
    are computed from the PL (genotype likelihood) fields using these
    factorizations:

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big(
        &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} =
        AA) \\ \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\,
        \mathrm{proband} = AB) \Big)

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big( &
        \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} =
        AA) \\ + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\,
        \mathrm{father} = AA) \cdot \mathrm{P}(x_{\mathrm{mother}} = AA
        \,|\, \mathrm{mother} = AB) \Big) \\ \cdot \,
        &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB)

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)

    While this posterior probability is a good metric for grouping putative de
    novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for by
    the phred-scaled genotype likelihoods. To this end, a number of hard filters
    are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the below
    rules, the following variables are used:

     - ``DR`` refers to the ratio of the read depth in the proband to the
       combined read depth in the parents.
     - ``DP`` refers to the read depth (DP field) of the proband.
     - ``AB`` refers to the read allele balance of the proband (number of
       alternate reads divided by total reads).
     - ``AC`` refers to the count of alternate alleles across all individuals
       in the dataset at the site.
     - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
     - ``min_p`` refers to the ``min_p`` function parameter.

    The rules for each variant type are tried in order (HIGH, MEDIUM, LOW) and
    the first one that matches determines the confidence.

    HIGH-quality SNV:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)
            OR
        (p > 0.99) AND (AB > 0.3) AND (DR > 0.2)
            OR
        (p > 0.5) AND (AB > 0.3) AND (AC < 10) AND (DP > 10)

    MEDIUM-quality SNV:

    .. code-block:: text

        (p > 0.5) AND ((AB > 0.3) OR (AC == 1))

    LOW-quality SNV:

    .. code-block:: text

       (AB > 0.2)

    HIGH-quality indel:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)

    MEDIUM-quality indel:

    .. code-block:: text

        (p > 0.5) AND (AB > 0.3) AND (AC <= 5)

    LOW-quality indel:

    .. code-block:: text

       (AB > 0.2)

    When the hemizygous model is used (non-PAR X or Y locus with a male
    proband, or a mitochondrial locus with a female proband), the LOW-quality
    indel rule is ``(AB > 0.3)`` instead.

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the ``min_gq`` parameter, if the proband allele balance is
    lower than the ``min_child_ab`` parameter, if the depth ratio between the
    proband and parents is smaller than the ``min_dp_ratio`` parameter, if
    the allele balance in a parent is above the ``max_parent_ab`` parameter, or
    if the posterior probability `p` is smaller than the `min_p` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.

    Returns
    -------
    :class:`.Table`

    Raises
    ------
    ValueError
        If `mt` lacks any of the entry fields `GT`, `AD`, `DP`, `GQ`, `PL`.
    """
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(
            f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
            f"missing {missing_fields}")

    mt = mt.annotate_rows(__prior=pop_frequency_prior,
                          __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()),
                          __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT)))
    # subtract 1 from __alt_alleles to correct for the observed genotype
    mt = mt.annotate_rows(
        __site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles,
                           mt.__prior,
                           MIN_POP_PRIOR))
    mt = require_biallelic(mt, 'de_novo')

    # FIXME check that __site_freq is between 0 and 1 when possible in expr
    tm = trio_matrix(mt, pedigree, complete_trios=True)

    # Which inheritance model applies at this locus for this proband.
    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    het_hom_hom = (tm.proband_entry.GT.is_het()
                   & tm.father_entry.GT.is_hom_ref()
                   & tm.mother_entry.GT.is_hom_ref())
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD) < min_child_ab

    # Missing struct: entries that receive it are dropped by the final filter.
    failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    def genotype_probabilities(entry):
        # Convert phred-scaled PLs to linear scale and normalize to sum to 1.
        return hl.bind(lambda x: x / hl.sum(x), 10**(-entry.PL / 10))

    kid_pp = genotype_probabilities(kid)
    dad_pp = genotype_probabilities(dad)
    mom_pp = genotype_probabilities(mom)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def result(p_de_novo, confidence):
        return hl.struct(p_de_novo=p_de_novo, confidence=confidence)

    def indel_confidence(p_de_novo, kid_ad_ratio, low_ab):
        # Confidence ladder for indels; `low_ab` is the AB cutoff of the
        # LOW tier, which differs between the two calling models.
        return (hl.case()
                .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                      result(p_de_novo, 'HIGH'))
                .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                      result(p_de_novo, 'MEDIUM'))
                .when(kid_ad_ratio > low_ab,
                      result(p_de_novo, 'LOW'))
                .or_missing())

    def snv_confidence(p_de_novo, kid_ad_ratio):
        # Confidence ladder for SNVs; shared by both calling models.
        return (hl.case()
                .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2))
                      | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1))
                      | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3)
                         & (n_alt_alleles < 10) & (kid.DP > 10)),
                      result(p_de_novo, 'HIGH'))
                .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                      result(p_de_novo, 'MEDIUM'))
                .when(kid_ad_ratio > 0.2,
                      result(p_de_novo, 'LOW'))
                .or_missing())

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        # Model using both parents (see `autosomal` above).
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = ((dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1])
                                   * kid_pp[1] * p_het_in_parent)
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            return (hl.case()
                    .when(kid.GQ < min_gq, failure)
                    .when((dp_ratio < min_dp_ratio)
                          | ~(kid_ad_ratio >= min_child_ab), failure)
                    # Guard the parent allele-balance divisions below.
                    .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
                    .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab)
                          | (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
                    .when(p_de_novo < min_p, failure)
                    .when(~is_snp, indel_confidence(p_de_novo, kid_ad_ratio, 0.2))
                    .default(snv_confidence(p_de_novo, kid_ad_ratio)))

        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        # Model using a single parent (see `hemi_x`, `hemi_y`, `hemi_mt`).
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = ((parent_pp[1] + parent_pp[2])
                                   * kid_pp[2] * p_het_in_parent)
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            return (hl.case()
                    .when(kid.GQ < min_gq, failure)
                    .when((kid.DP / (parent.DP) < min_dp_ratio)
                          | (kid_ad_ratio < min_child_ab), failure)
                    .when((hl.sum(parent.AD) == 0), failure)
                    .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure)
                    .when(p_de_novo < min_p, failure)
                    # NOTE(review): the LOW indel cutoff here is 0.3, whereas
                    # call_auto uses 0.2 -- kept as is; confirm against the
                    # reference caller whether this difference is intended.
                    .when(~is_snp, indel_confidence(p_de_novo, kid_ad_ratio, 0.3))
                    .default(snv_confidence(p_de_novo, kid_ad_ratio)))

        return hl.bind(solve, p_de_novo)

    de_novo_call = (
        hl.case()
        .when(~het_hom_hom | kid_ad_fail, failure)
        .when(autosomal,
              hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio))
        .when(hemi_x | hemi_mt,
              hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio))
        .when(hemi_y,
              hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio))
        .or_missing())

    # Keep only entries with a defined call, then flatten to one row per call.
    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq', 'proband', 'father', 'mother',
                           'proband_entry', 'father_entry', 'mother_entry',
                           'is_female',
                           **entries.__call).rename({'__site_freq': 'prior'}))
Ejemplo n.º 36
0
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If the dataset has no key fields, if the interval point type is
        neither the type of the first key field nor a struct prefix of the
        key, or if `intervals` contains a missing value.

    """

    is_matrix = isinstance(ds, MatrixTable)
    if is_matrix:
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    # An unkeyed dataset has no first key field to compare against; fail with
    # an explicit message instead of an IndexError from ``k_type[0]``.
    if len(k_type) == 0:
        raise TypeError(
            "'filter_intervals' requires a dataset with at least one key field."
        )

    def is_struct_prefix(partial, full):
        # Field names must match positionally, and each shared field must
        # have the same type.
        if list(partial) != list(full)[:len(partial)]:
            return False
        return not any(full[k] != v for k, v in partial.items())

    if point_type == k_type[0]:
        # Bare points are wrapped in a one-field struct ('foo' is only a
        # placeholder field name) so the key type sent on is always a struct.
        needs_wrapper = True
        point_type = hl.tstruct(foo=point_type)
    elif isinstance(point_type, tstruct) and is_struct_prefix(
            point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'."
            )
        if needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end), interval.includes_start,
                            interval.includes_end)
        return interval

    # Evaluate the expression to Python values, then serialize them using the
    # (possibly wrapped) point type.
    interval_values = hl.eval(intervals)
    intervals_json = hl.tarray(hl.tinterval(point_type))._convert_to_json(
        [wrap_input(i) for i in interval_values])

    # Both dataset kinds take the same configuration apart from the name.
    config = {
        'name': 'MatrixFilterIntervals' if is_matrix else 'TableFilterIntervals',
        'keyType': point_type._parsable_string(),
        'intervals': intervals_json,
        'keep': keep
    }
    if is_matrix:
        return MatrixTable(MatrixToMatrixApply(ds._mir, config))
    return Table(TableToTableApply(ds._tir, config))
Ejemplo n.º 37
0
 def test_update(self):
     """Selecting DP and GQ under new names yields an entry schema of dp/gq."""
     mt = self.get_vds()
     renamed = mt.select_entries(dp=mt.DP, gq=mt.GQ)
     expected_entry_type = hl.tstruct(dp=hl.tint32, gq=hl.tint32)
     self.assertTrue(schema_eq(renamed.entry.dtype, expected_entry_type))
Ejemplo n.º 38
0
 def test_str_annotation_regression(self):
     """Annotating a row with an element of a string array must evaluate cleanly."""
     row_type = hl.tstruct(alleles=hl.tarray(hl.tstr))
     table = hl.Table.parallelize([{'alleles': ['A', 'T']}], row_type)
     annotated = table.annotate(ref=table.alleles[0])
     # Forcing a count executes the pipeline; the test passes if nothing raises.
     annotated._force_count()
Ejemplo n.º 39
0
    def matrix_irs(self):
        """Build and return a list of matrix-table IR nodes.

        The list holds one instance of each matrix IR constructor exercised
        here, built on top of a native matrix-table read, a native table read
        and a small range matrix table.

        Returns
        -------
        list
            The constructed matrix IR nodes, in declaration order.
        """
        # Index the BGEN resource first; presumably required by the
        # MatrixBGENReader entry below, which reads the same file -- TODO confirm.
        hl.index_bgen(resource('example.8bits.bgen'),
                      reference_genome=hl.get_reference('GRCh37'),
                      contig_recoding={'01': '1'})

        # Aggregation expression reused by both *AggregateByKey nodes.
        collect = ir.MakeStruct([('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])

        # Shared inputs for the nodes below.
        matrix_read = ir.MatrixRead(
            ir.MatrixNativeReader(
                resource('backward_compatability/1.0.0/matrix_table/0.hmt'), None, False),
            False, False)
        table_read = ir.TableRead(
            ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht'), None, False), False)

        matrix_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))
        matrix_irs = [
            ir.MatrixRepartition(matrix_range, 100, ir.RepartitionStrategy.SHUFFLE),
            ir.MatrixUnionRows(matrix_range, matrix_range),
            ir.MatrixDistinctByRow(matrix_range),
            ir.MatrixRowsHead(matrix_read, 5),
            ir.MatrixColsHead(matrix_read, 5),
            # Round trip: matrix -> table -> matrix using the same field names.
            ir.CastTableToMatrix(
                ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
                '__entries',
                '__cols',
                []),
            ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
            ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
            matrix_read,
            matrix_range,
            ir.MatrixRead(ir.MatrixVCFReader(resource('sample.vcf'), ['GT'], hl.tfloat64, None, None, None, None,
                                             False, True, False, True, None, None, None)),
            ir.MatrixRead(ir.MatrixBGENReader(resource('example.8bits.bgen'), None, {}, 10, 1, None)),
            ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
            ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
            ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
            ir.MatrixChooseCols(matrix_read, [1, 0]),
            ir.MatrixMapCols(matrix_read, ir.MakeStruct([('x', ir.I64(20))]), ['x']),
            ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
            # The row key is cleared before mapping rows to a struct that does
            # not contain the original key fields.
            ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []), ir.MakeStruct([('x', ir.I64(20))])),
            ir.MatrixMapEntries(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
            ir.MatrixMapGlobals(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
            ir.MatrixCollectColsByKey(matrix_read),
            ir.MatrixExplodeRows(matrix_read, ['row_aset']),
            ir.MatrixExplodeCols(matrix_read, ['col_aset']),
            ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo'),
            ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
            ir.MatrixToMatrixApply(matrix_read, {'name': 'MatrixFilterPartitions', 'parts': [0], 'keep': True}),
            ir.MatrixRename(matrix_read, {'global_f32': 'global_foo'}, {'col_f32': 'col_foo'}, {'row_aset': 'row_aset2'}, {'entry_f32': 'entry_foo'}),
            ir.MatrixFilterIntervals(matrix_read, [hl.utils.Interval(hl.utils.Struct(row_idx=0), hl.utils.Struct(row_idx=10))], hl.tstruct(row_idx=hl.tint32), keep=False),
        ]

        return matrix_irs
Ejemplo n.º 40
0
    def test_aggregate2(self):
        """Group a two-row table by ``status`` and check each aggregator's result."""
        row_type = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)
        data = [
            {'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
            {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13},
        ]
        table = hl.Table.parallelize(data, row_type)

        def collect_exploded(collection):
            # Explode the collection and collect every element.
            return agg.explode(lambda elt: agg.collect(elt), collection)

        pheno = table.qPheno
        aggregated = table.group_by(status=table.status).aggregate(
            x1=agg.collect(pheno * 2),
            x2=collect_exploded([pheno, pheno + 1]),
            x3=agg.min(pheno),
            x4=agg.max(pheno),
            x5=agg.sum(pheno),
            x6=agg.product(hl.int64(pheno)),
            x7=agg.count(),
            x8=agg.count_where(pheno == 3),
            x9=agg.fraction(pheno == 1),
            x10=agg.stats(hl.float64(pheno)),
            x11=agg.hardy_weinberg_test(table.GT),
            x13=agg.inbreeding(table.GT, 0.1),
            x14=agg.call_stats(table.GT, ["A", "T"]),
            x15=agg.collect(
                hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
            x16=agg.collect(
                hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
            x17=collect_exploded(hl.null(hl.tarray(hl.tint32))),
            x18=collect_exploded(hl.null(hl.tset(hl.tint32))),
            x19=agg.take(table.GT, 1, ordering=-pheno))

        # Both rows share status 0, so the first group holds every aggregate.
        result = convert_struct_to_dict(aggregated.take(1)[0])

        expected = {
            'status': 0,
            'x1': [6, 26],
            'x2': [3, 4, 13, 14],
            'x3': 3,
            'x4': 13,
            'x5': 16,
            'x6': 39,
            'x7': 2,
            'x8': 1,
            'x9': 0.0,
            'x10': {
                'min': 3.0,
                'max': 13.0,
                'sum': 16.0,
                'stdev': 5.0,
                'n': 2,
                'mean': 8.0,
            },
            'x11': {'het_freq_hwe': 0.5, 'p_value': 0.5},
            'x13': {
                'n_called': 2,
                'expected_homs': 1.64,
                'f_stat': -1.777777777777777,
                'observed_homs': 1,
            },
            'x14': {
                'AC': [3, 1],
                'AF': [0.75, 0.25],
                'AN': 4,
                'homozygote_count': [1, 0],
            },
            'x15': {'a': 5, 'b': 'foo', 'c': {'banana': 'apple'}},
            'x16': 'apple',
            'x17': [],
            'x18': [],
            'x19': [hl.Call([0, 1])],
        }

        # Show the full dictionary diff on failure.
        self.maxDiff = None

        self.assertDictEqual(result, expected)
Ejemplo n.º 41
0
    def test_locus_windows(self):
        """Check ``locus_windows`` results and its error reporting."""
        def assert_eq(actual, expected):
            self.assertTrue(np.array_equal(actual, np.array(expected)))

        def assert_windows(windows, expected_starts, expected_stops):
            starts, stops = windows
            assert_eq(starts, expected_starts)
            assert_eq(stops, expected_stops)

        def assert_fails(exc_type, fragment, thunk):
            # The call is deferred so it runs inside the assertRaises block.
            with self.assertRaises(exc_type) as caught:
                thunk()
            self.assertTrue(fragment in str(caught.exception))

        centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

        mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
        mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

        # Windows by position, by an annotated coordinate, and by a computed one.
        assert_windows(hl.linalg.utils.locus_windows(mt.locus, 2),
                       [0, 0, 0, 1, 2], [3, 4, 5, 5, 5])
        assert_windows(
            hl.linalg.utils.locus_windows(mt.locus, 0.5, coord_expr=mt.cm),
            [0, 1, 1, 1, 3], [1, 4, 4, 5, 5])
        assert_windows(
            hl.linalg.utils.locus_windows(
                mt.locus, 1.0,
                coord_expr=2 * centimorgans[hl.int32(mt.row_idx)]),
            [0, 1, 1, 1, 3], [1, 4, 4, 5, 5])

        row_type = hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64)
        rows = [
            {'locus': hl.Locus(contig, position), 'cm': coord}
            for contig, position, coord in [
                ('1', 1, 1.0),
                ('1', 2, 3.0),
                ('1', 4, 4.0),
                ('2', 1, 2.0),
                ('2', 1, 2.0),
                ('3', 3, 5.0),
            ]
        ]
        ht = hl.Table.parallelize(rows, row_type, key=['locus'])

        assert_windows(hl.linalg.utils.locus_windows(ht.locus, 1),
                       [0, 0, 2, 3, 3, 5], [2, 2, 3, 5, 5, 6])
        assert_windows(
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm),
            [0, 1, 1, 3, 3, 5], [1, 3, 3, 5, 5, 6])

        assert_fails(
            ValueError, 'ascending order',
            lambda: hl.linalg.utils.locus_windows(
                ht.order_by(ht.cm).locus, 1.0))
        assert_fails(
            ExpressionException, 'different source',
            lambda: hl.linalg.utils.locus_windows(
                ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx))
        assert_fails(
            ExpressionException, "no source",
            lambda: hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0))
        assert_fails(
            ExpressionException, "no source",
            lambda: hl.linalg.utils.locus_windows(
                ht.locus, 1.0, coord_expr=0.0))

        # Global fields are not row-indexed and must be rejected.
        ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)
        assert_fails(
            ExpressionException, "row-indexed",
            lambda: hl.linalg.utils.locus_windows(ht.x, 1.0))
        assert_fails(
            ExpressionException, "row-indexed",
            lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y))

        # A missing locus is reported with and without a coordinate expression.
        ht = hl.Table.parallelize(
            [{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
            row_type,
            key=['locus'])
        assert_fails(
            ValueError, "missing value for 'locus_expr'",
            lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0))
        assert_fails(
            ValueError, "missing value for 'locus_expr'",
            lambda: hl.linalg.utils.locus_windows(
                ht.locus, 1.0, coord_expr=ht.cm))

        # A missing coordinate is reported against 'coord_expr'.
        ht = hl.Table.parallelize(
            [{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
            row_type,
            key=['locus'])
        assert_fails(
            ValueError, "missing value for 'coord_expr'",
            lambda: hl.linalg.utils.locus_windows(
                ht.locus, 1.0, coord_expr=ht.cm))
Ejemplo n.º 42
0
    def test_reference_genome_liftover(self):
        """Register GRCh37<->GRCh38 chain files and check locus/interval liftover.

        The liftovers are added to the reference genomes returned by
        ``hl.get_reference``, so their removal is registered with
        ``addCleanup`` right after each one is added. They are then removed
        even when an assertion in the middle of the test fails, rather than
        being left behind for later tests.
        """
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        self.assertFalse(grch37.has_liftover('GRCh38'))
        self.assertFalse(grch38.has_liftover('GRCh37'))

        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'),
                            'GRCh38')
        self.addCleanup(grch37.remove_liftover, 'GRCh38')
        grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'),
                            'GRCh37')
        self.addCleanup(grch38.remove_liftover, 'GRCh37')

        self.assertTrue(grch37.has_liftover('GRCh38'))
        self.assertTrue(grch38.has_liftover('GRCh37'))

        # Lifting to GRCh38 and back must reproduce the original locus.
        ds = hl.import_vcf(resource('sample.vcf'))
        t = ds.annotate_rows(liftover=hl.liftover(
            hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
        self.assertTrue(t.all(t.locus == t.liftover))

        null_locus = hl.null(hl.tlocus('GRCh38'))

        # GRCh37 loci paired with the expected GRCh38 locus (missing when the
        # liftover is expected to produce no result).
        rows = [
            {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60001, 'GRCh37'),
             'l38': hl.locus('chr20', 79360, 'GRCh38')},
            {'l37': hl.locus('20', 278686, 'GRCh37'),
             'l38': hl.locus('chr20', 298045, 'GRCh38')},
            {'l37': hl.locus('20', 278687, 'GRCh37'),
             'l38': hl.locus('chr20', 298046, 'GRCh38')},
            {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278691, 'GRCh37'),
             'l38': hl.locus('chr20', 298047, 'GRCh38')},
            {'l37': hl.locus('20', 37007586, 'GRCh37'),
             'l38': hl.locus('chr12', 32563117, 'GRCh38')},
            {'l37': hl.locus('20', 62965520, 'GRCh37'),
             'l38': hl.locus('chr20', 64334167, 'GRCh38')},
            {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus},
        ]
        schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(
            t.all(
                hl.cond(hl.is_defined(t.l38),
                        hl.liftover(t.l37, 'GRCh38') == t.l38,
                        hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

        t = t.filter(hl.is_defined(t.l38))
        self.assertEqual(t.count(), 6)

        # Keying by the lifted locus must evaluate and keep the key.
        t = t.key_by('l38')
        t.count()
        self.assertEqual(list(t.key), ['l38'])

        null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
        rows = [
            {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'),
             'i38': null_locus_interval},
            {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
             'i38': hl.locus_interval('chr20', 79360, 101815, True, True,
                                      'GRCh38')},
        ]
        schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)),
                            i38=hl.tinterval(hl.tlocus(grch38)))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
Ejemplo n.º 43
0
 def test_transmute_globals(self):
     """Transmuting globals drops the referenced field ``a`` and adds ``c``."""
     table = hl.utils.range_table(1).annotate_globals(a=5, b=10)
     transmuted = table.transmute_globals(c=table.a + 5)
     expected_globals_type = hl.tstruct(b=hl.tint, c=hl.tint)
     self.assertEqual(transmuted.globals.dtype, expected_globals_type)
Ejemplo n.º 44
0
    def test_multi_way_zip_join(self):
        """Zip-join three keyed tables and compare against hand-built rows."""
        def record(id_, name, data):
            return {"id": id_, "name": name, "data": data}

        def cell(name, data):
            # Value contributed by one input table to a joined row.
            return {"name": name, "data": data}

        inputs = [
            [record(0, "a", 0.0), record(1, "b", 3.14), record(2, "c", 2.78)],
            [record(0, "d", 1.1), record(0, "x", 2.2), record(2, "v", 7.89)],
            [record(1, "f", 9.99), record(2, "g", -1.0), record(3, "z", 0.01)],
        ]
        row_type = hl.tstruct(id=hl.tint32, name=hl.tstr, data=hl.tfloat64)
        tables = [
            hl.Table.parallelize(rows, schema=row_type, key='id')
            for rows in inputs
        ]

        joined = hl.Table._multi_way_zip_join(tables, '__data', '__globals')
        joined = joined.drop('__globals')

        # Each "__data" array has one slot per input table; None marks a
        # table with no row for that slot.
        expected_rows = [
            {"id": 0, "__data": [cell("a", 0.0), cell("d", 1.1), None]},
            {"id": 0, "__data": [None, cell("x", 2.2), None]},
            {"id": 1, "__data": [cell("b", 3.14), None, cell("f", 9.99)]},
            {"id": 2,
             "__data": [cell("c", 2.78), cell("v", 7.89), cell("g", -1.0)]},
            {"id": 3, "__data": [None, None, cell("z", 0.01)]},
        ]
        expected = hl.Table.parallelize(
            expected_rows,
            schema=hl.tstruct(
                id=hl.tint32,
                __data=hl.tarray(hl.tstruct(name=hl.tstr, data=hl.tfloat64))),
            key='id')
        self.assertTrue(expected._same(joined))

        # Output field names that coincide with input field names also work.
        expected2 = expected.transmute(data=expected['__data'])
        joined_same_name = hl.Table._multi_way_zip_join(tables, 'data', 'globals')
        joined_same_name = joined_same_name.drop('globals')
        self.assertTrue(expected2._same(joined_same_name))

        # Dropping both joined fields still leaves the five joined rows.
        joined_nothing = hl.Table._multi_way_zip_join(tables, 'data', 'globals')
        joined_nothing = joined_nothing.drop('data', 'globals')
        self.assertEqual(joined_nothing._force_count(), 5)
Ejemplo n.º 45
0
 def test_join_mangling(self):
     """Self-joining a table renames clashing row and global fields with ``_1``."""
     base = hl.utils.range_table(10).annotate_globals(glob1=5).annotate(row1=5)
     joined = base.join(base, 'inner')

     expected_row_type = hl.tstruct(idx=hl.tint32, row1=hl.tint32, row1_1=hl.tint32)
     expected_globals_type = hl.tstruct(glob1=hl.tint32, glob1_1=hl.tint32)

     assert joined.row.dtype == expected_row_type
     assert joined.globals.dtype == expected_globals_type
     # Force execution so the join is also checked at run time.
     joined._force_count()
Ejemplo n.º 46
0
    def test_mendel_errors(self):
        """Check schemas, counts and selected rows of the four ``mendel_errors`` tables."""
        Struct = hl.utils.Struct

        mt = hl.import_vcf(resource('mendel.vcf'))
        pedigree = hl.Pedigree.read(resource('mendel.fam'))

        men, fam, ind, var = hl.mendel_errors(mt['GT'], pedigree)

        locus_t = mt.locus.dtype
        alleles_t = hl.tarray(hl.tstr)

        # (actual dtype, expected dtype) for the key and row of each table.
        schema_checks = [
            (men.key.dtype,
             hl.tstruct(locus=locus_t, alleles=alleles_t, s=hl.tstr)),
            (men.row.dtype,
             hl.tstruct(locus=locus_t, alleles=alleles_t, s=hl.tstr,
                        fam_id=hl.tstr, mendel_code=hl.tint)),
            (fam.key.dtype,
             hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr)),
            (fam.row.dtype,
             hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr, fam_id=hl.tstr,
                        children=hl.tint, errors=hl.tint64,
                        snp_errors=hl.tint64)),
            (ind.key.dtype,
             hl.tstruct(s=hl.tstr)),
            (ind.row.dtype,
             hl.tstruct(s=hl.tstr, fam_id=hl.tstr, errors=hl.tint64,
                        snp_errors=hl.tint64)),
            (var.key.dtype,
             hl.tstruct(locus=locus_t, alleles=alleles_t)),
            (var.row.dtype,
             hl.tstruct(locus=locus_t, alleles=alleles_t, errors=hl.tint64)),
        ]
        for actual_type, expected_type in schema_checks:
            self.assertEqual(actual_type, expected_type)

        self.assertEqual(men.count(), 41)
        self.assertEqual(fam.count(), 2)
        self.assertEqual(ind.count(), 7)
        self.assertEqual(var.count(), mt.count_rows())

        family_rows = set(
            fam.select('children', 'errors', 'snp_errors').collect())
        self.assertEqual(family_rows, {
            Struct(pat_id='Dad1', mat_id='Mom1', children=2, errors=41,
                   snp_errors=39),
            Struct(pat_id='Dad2', mat_id='Mom2', children=1, errors=0,
                   snp_errors=0),
        })

        individual_rows = set(ind.select('errors', 'snp_errors').collect())
        self.assertEqual(individual_rows, {
            Struct(s='Son1', errors=23, snp_errors=22),
            Struct(s='Dtr1', errors=18, snp_errors=17),
            Struct(s='Dad1', errors=19, snp_errors=18),
            Struct(s='Mom1', errors=22, snp_errors=21),
            Struct(s='Dad2', errors=0, snp_errors=0),
            Struct(s='Mom2', errors=0, snp_errors=0),
            Struct(s='Son2', errors=0, snp_errors=0),
        })

        # (contig, position, alleles, expected error count) for the variants
        # inspected individually, in the order produced by order_by('locus').
        inspected = [
            ("1", 1, ['C', 'CT'], 2),
            ("1", 2, ['C', 'T'], 1),
            ("X", 1, ['C', 'T'], 2),
            ("X", 3, ['C', 'T'], 1),
            ("Y", 1, ['C', 'T'], 1),
            ("Y", 3, ['C', 'T'], 1),
        ]
        to_keep = hl.set([(hl.Locus(contig, position), alleles)
                          for contig, position, alleles, _ in inspected])
        kept_variants = var.filter(to_keep.contains((var.locus, var.alleles)))
        self.assertEqual(
            kept_variants.order_by('locus').select(
                'locus', 'alleles', 'errors').collect(),
            [Struct(locus=hl.Locus(contig, position), alleles=alleles,
                    errors=n_errors)
             for contig, position, alleles, n_errors in inspected])

        # A pedigree with missing sex must give the same errors for 'Dtr1'.
        ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
        men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

        self.assertTrue(
            men2.filter(men2.s == 'Dtr1')._same(men.filter(men.s == 'Dtr1')))
Ejemplo n.º 47
0
 def test_import_vcf_no_reference_specified(self):
     """Without a reference genome the locus is a plain contig/position struct."""
     mt = hl.import_vcf(resource('sample2.vcf'), reference_genome=None)
     struct_locus_type = hl.tstruct(contig=hl.tstr, position=hl.tint32)
     self.assertTrue(mt.locus.dtype == struct_locus_type)
     self.assertEqual(mt.count_rows(), 735)
Ejemplo n.º 48
0
@pytest.mark.parametrize(
    "input_regions,expected_output_regions",
    [
        # Three mutually overlapping regions collapse into a single region.
        (
            hl.literal([
                hl.utils.Struct(start=5, stop=10),
                hl.utils.Struct(start=7, stop=12),
                hl.utils.Struct(start=10, stop=11),
            ]),
            [hl.utils.Struct(start=5, stop=12)],
        ),
        # Regions are merged pairwise into two runs: 5-14 and 17-24.
        (
            hl.literal([
                hl.utils.Struct(start=5, stop=10),
                hl.utils.Struct(start=11, stop=14),
                hl.utils.Struct(start=17, stop=22),
                hl.utils.Struct(start=22, stop=24),
            ]),
            [
                hl.utils.Struct(start=5, stop=14),
                hl.utils.Struct(start=17, stop=24),
            ],
        ),
        # An empty input yields an empty result.
        (
            hl.empty_array(hl.tstruct(start=hl.tint, stop=hl.tint)),
            [],
        ),
    ],
)
def test_merge_overlapping_regions(input_regions, expected_output_regions):
    """Evaluate ``merge_overlapping_regions`` and compare with the expected list."""
    merged = hl.eval(merge_overlapping_regions(input_regions))
    assert merged == expected_output_regions
Ejemplo n.º 49
0
 def _compute_type(self):
     """Set ``self._type`` to a table type with one int32 ``idx`` field, keyed by it."""
     empty_struct = hl.tstruct()
     idx_struct = hl.tstruct(idx=hl.tint32)
     self._type = hl.ttable(empty_struct, idx_struct, ['idx'])
Ejemplo n.º 50
0
    def value_irs(self):
        """Build and return a list of value IR nodes.

        A handful of literal and reference nodes plus four aggregator
        signatures are created first and then reused as operands of the
        nodes collected in the returned list.

        Returns
        -------
        list
            The constructed value IR nodes, in declaration order.
        """
        # Literal and reference operands shared by the entries below.
        b = ir.TrueIR()
        c = ir.Ref('c', hl.tbool)
        i = ir.I32(5)
        j = ir.I32(7)
        st = ir.Str('Hail')
        a = ir.Ref('a', hl.tarray(hl.tint32))
        aa = ir.Ref('aa', hl.tarray(hl.tarray(hl.tint32)))
        da = ir.Ref('da', hl.tarray(hl.ttuple(hl.tint32, hl.tstr)))
        v = ir.Ref('v', hl.tint32)
        s = ir.Ref('s', hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64))
        t = ir.Ref('t', hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64))
        call = ir.Ref('call', hl.tcall)

        # Aggregator signatures used by the ApplyAggOp / ApplyScanOp / InitOp /
        # SeqOp entries below.
        collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])

        call_stats_sig = ir.AggSignature('CallStats', [], [hl.tint32],
                                         [hl.tcall])

        hist_sig = ir.AggSignature('Histogram',
                                   [hl.tfloat64, hl.tfloat64, hl.tint32], None,
                                   [hl.tfloat64])

        take_by_sig = ir.AggSignature('TakeBy', [hl.tint32], None,
                                      [hl.tfloat64, hl.tfloat64])

        value_irs = [
            i,
            ir.I64(5),
            ir.F32(3.14),
            ir.F64(3.14),
            s,
            ir.TrueIR(),
            ir.FalseIR(),
            ir.Void(),
            ir.Cast(i, hl.tfloat64),
            ir.NA(hl.tint32),
            ir.IsNA(i),
            ir.If(b, i, j),
            ir.Let('v', i, v),
            ir.Ref('x', hl.tint32),
            ir.ApplyBinaryOp('+', i, j),
            ir.ApplyUnaryOp('-', i),
            ir.ApplyComparisonOp('EQ', i, j),
            ir.MakeArray([i, ir.NA(hl.tint32), ir.I32(-3)],
                         hl.tarray(hl.tint32)),
            ir.ArrayRef(a, i),
            ir.ArrayLen(a),
            ir.ArrayRange(ir.I32(0), ir.I32(5), ir.I32(1)),
            ir.ArraySort(a, b, False),
            ir.ToSet(a),
            ir.ToDict(da),
            ir.ToArray(a),
            ir.LowerBoundOnOrderedCollection(a, i, True),
            ir.GroupByKey(da),
            ir.ArrayMap(a, 'v', v),
            ir.ArrayFilter(a, 'v', v),
            ir.ArrayFlatMap(aa, 'v', v),
            ir.ArrayFold(a, ir.I32(0), 'x', 'v', v),
            ir.ArrayScan(a, ir.I32(0), 'x', 'v', v),
            ir.ArrayFor(a, 'v', ir.Void()),
            ir.ApplyAggOp(ir.I32(0), [], None, collect_sig),
            ir.ApplyScanOp(ir.I32(0), [], None, collect_sig),
            ir.ApplyAggOp(ir.F64(-2.11),
                          [ir.F64(-5.0),
                           ir.F64(5.0), ir.I32(100)], None, hist_sig),
            ir.ApplyAggOp(call, [], [ir.I32(2)], call_stats_sig),
            ir.ApplyAggOp(ir.F64(-2.11), [ir.I32(10)], None, take_by_sig),
            ir.InitOp(ir.I32(0), [ir.I32(2)], call_stats_sig),
            ir.SeqOp(ir.I32(0), [i], collect_sig),
            ir.SeqOp(ir.I32(0), [ir.F64(-2.11), ir.I32(17)], take_by_sig),
            ir.Begin([ir.Void()]),
            ir.MakeStruct([('x', i)]),
            ir.SelectFields(s, ['x', 'z']),
            ir.InsertFields(s, [('x', i)]),
            ir.GetField(s, 'x'),
            ir.MakeTuple([i, b]),
            ir.GetTupleElement(t, 1),
            ir.StringSlice(st, ir.I32(1), ir.I32(2)),
            ir.StringLength(st),
            ir.In(2, hl.tfloat64),
            ir.Die('mumblefoo', hl.tfloat64),
            ir.Apply('&&', b, c),
            ir.Apply('toFloat64', i),
            ir.Apply('isDefined', s),
            ir.Uniroot('x', ir.F64(3.14), ir.F64(-5.0), ir.F64(5.0)),
            ir.Literal(hl.tarray(hl.tint32), [1, 2, None]),
        ]

        return value_irs
Ejemplo n.º 51
0
 def _compute_type(self):
     """Set ``self._type`` to an unkeyed table type with fields ``i``, ``j`` and ``entry``."""
     empty_struct = hl.tstruct()
     entry_row_struct = hl.tstruct(i=hl.tint64, j=hl.tint64, entry=hl.tfloat64)
     self._type = hl.ttable(empty_struct, entry_row_struct, [])
Ejemplo n.º 52
0
    def table_irs(self):
        """Build and return a list of table IR nodes.

        The nodes are built on top of a native table read and a native
        matrix-table read of the backward-compatibility resources.

        Returns
        -------
        list
            The constructed table IR nodes, in declaration order.
        """
        b = ir.TrueIR()
        table_read = ir.TableRead(
            ir.TableNativeReader(
                resource('backward_compatability/1.0.0/table/0.ht')), False)
        # NOTE(review): not referenced below; presumably the row type of
        # `table_read`. Kept because parsing it still exercises `hl.dtype` --
        # confirm whether it can be dropped.
        table_read_row_type = hl.dtype(
            'struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}'
        )

        matrix_read = ir.MatrixRead(
            ir.MatrixNativeReader(
                resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
            False, False)

        table_irs = [
            ir.TableKeyBy(table_read, ['m', 'd'], False),
            ir.TableFilter(table_read, b), table_read,
            ir.MatrixColsTable(matrix_read),
            ir.TableAggregateByKey(table_read,
                                   ir.MakeStruct([('a', ir.I32(5))])),
            ir.TableKeyByAndAggregate(table_read,
                                      ir.MakeStruct([('a', ir.I32(5))]),
                                      ir.MakeStruct([('b', ir.I32(5))]), 1, 2),
            ir.TableJoin(table_read, ir.TableRange(100, 10), 'inner', 1),
            ir.MatrixEntriesTable(matrix_read),
            ir.MatrixRowsTable(matrix_read),
            ir.TableParallelize(
                ir.MakeStruct([('rows',
                                ir.Literal(hl.tarray(hl.tstruct(a=hl.tint32)),
                                           [{
                                               'a': None
                                           }, {
                                               'a': 5
                                           }, {
                                               'a': -3
                                           }])),
                               ('global', ir.MakeStruct([]))]), None),
            # The key is cleared before mapping rows to a struct that does not
            # contain the original key fields.
            ir.TableMapRows(
                ir.TableKeyBy(table_read, []),
                ir.MakeStruct([('a', ir.GetField(ir.Ref('row'), 'f32')),
                               ('b', ir.F64(-2.11))])),
            ir.TableMapGlobals(
                table_read,
                ir.MakeStruct([('foo', ir.NA(hl.tarray(hl.tint32)))])),
            ir.TableRange(100, 10),
            ir.TableRepartition(table_read, 10,
                                ir.RepartitionStrategy.COALESCE),
            ir.TableUnion([ir.TableRange(100, 10),
                           ir.TableRange(50, 10)]),
            ir.TableExplode(table_read, ['mset']),
            ir.TableHead(table_read, 10),
            ir.TableOrderBy(ir.TableKeyBy(table_read, []), [('m', 'A'),
                                                            ('m', 'D')]),
            ir.TableDistinct(table_read),
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            ir.TableRename(table_read, {'idx': 'idx_foo'},
                           {'global_f32': 'global_foo'}),
            ir.TableMultiWayZipJoin([table_read, table_read], '__data',
                                    '__globals'),
            ir.MatrixToTableApply(
                matrix_read, {
                    'name': 'LinearRegressionRowsSingle',
                    'yFields': ['col_m'],
                    'xField': 'entry_m',
                    'covFields': [],
                    'rowBlockSize': 10,
                    'passThrough': []
                }),
            ir.TableToTableApply(table_read, {
                'name': 'TableFilterPartitions',
                'parts': [0],
                'keep': True
            })
        ]

        return table_irs
Ejemplo n.º 53
0
 def test_import_bed_no_reference_specified(self):
     """Without a reference genome, BED interval points are contig/position structs."""
     intervals_table = hl.import_bed(resource('example1.bed'),
                                     reference_genome=None)
     self.assertTrue(intervals_table.count() == 3)
     expected_point_type = hl.tstruct(contig=hl.tstr, position=hl.tint32)
     self.assertEqual(intervals_table.interval.dtype.point_type,
                      expected_point_type)
Ejemplo n.º 54
0
def info_score(gp) -> StructExpression:
    r"""Compute the IMPUTE information score.

    Examples
    --------
    Calculate the info score per variant:

    >>> gen_mt = hl.import_gen('data/example.gen', sample_file='data/example.sample')
    >>> gen_mt = gen_mt.annotate_rows(info_score = hl.agg.info_score(gen_mt.GP))

    Calculate group-specific info scores per variant:

    >>> gen_mt = hl.import_gen('data/example.gen', sample_file='data/example.sample')
    >>> gen_mt = gen_mt.annotate_cols(is_case = hl.rand_bool(0.5))
    >>> gen_mt = gen_mt.annotate_rows(info_score_case = hl.agg.info_score(hl.agg.filter(gen_mt.is_case, gen_mt.GP)),
    ...                               info_score_ctrl = hl.agg.info_score(hl.agg.filter(~gen_mt.is_case, gen_mt.GP)))

    Notes
    -----
    :func:`.info_score` returns a struct with two fields:

        - `score` (``float64``) -- Info score.
        - `n_included` (``int32``) -- Number of non-missing samples included in the
          calculation.

    The IMPUTE info measure is implemented as described in the supplementary
    information from `Marchini & Howie. Genotype imputation for genome-wide
    association studies. Nature Reviews Genetics (2010)
    <http://www.nature.com/nrg/journal/v11/n7/extref/nrg2796-s3.pdf>`__. The
    info score :math:`I_{A}` for one SNP is calculated as:

    .. math::

        I_{A} =
        \begin{cases}
        1 - \frac{\sum_{i=1}^{N}(f_{i} - e_{i}^2)}{2N\hat{\theta}(1 - \hat{\theta})} & \text{when } \hat{\theta} \in (0, 1) \\
        1 & \text{when } \hat{\theta} = 0, \hat{\theta} = 1\\
        \end{cases}

    - :math:`N` is the number of samples with imputed genotype probabilities
      [:math:`p_{ik} = P(G_{i} = k)` where :math:`k \in \{0, 1, 2\}`]
    - :math:`e_{i} = p_{i1} + 2p_{i2}` is the expected genotype per sample
    - :math:`f_{i} = p_{i1} + 4p_{i2}`
    - :math:`\hat{\theta} = \frac{\sum_{i=1}^{N}e_{i}}{2N}` is the MLE for the
      population minor allele frequency

    Hail will not generate identical results to `QCTOOL
    <http://www.well.ox.ac.uk/~gav/qctool/#overview>`__ for the following
    reasons:

    - Hail automatically removes genotype probability distributions that do not
      meet certain requirements on data import with :func:`.import_gen` and
      :func:`.import_bgen`.
    - Hail does not use the population frequency to impute genotype
      probabilities when a genotype probability distribution has been set to
      missing.
    - Hail calculates the same statistic for sex chromosomes as autosomes while
      QCTOOL incorporates sex information.
    - The floating point number Hail stores for each genotype probability is
      slightly different than the original data due to rounding and
      normalization of probabilities.

    Warning
    -------
    - The info score Hail reports will be extremely different from QCTOOL when
      a SNP has a high missing rate.
    - The `gp` array must contain 3 elements, and its elements may not be
      missing.
    - If the genotype data was not imported using the :func:`.import_gen` or
      :func:`.import_bgen` functions, then the results for all variants will be
      ``score = NA`` and ``n_included = 0``.
    - It only makes semantic sense to compute the info score per variant. While
      the aggregator will run in any context if its arguments are the right
      type, the results are only meaningful in a narrow context.

    Parameters
    ----------
    gp : :class:`.ArrayNumericExpression`
        Genotype probability array. Must have 3 elements, all of which must be
        defined.

    Returns
    -------
    :class:`.StructExpression`
        Struct with fields `score` and `n_included`.
    """
    # Result type of the 'InfoScore' aggregator: the score plus the sample count.
    result_type = hl.tstruct(score=hl.tfloat64, n_included=hl.tint32)
    return _agg_func('InfoScore', gp, result_type)
Ejemplo n.º 55
0
 def test_import_locus_intervals_no_reference_specified(self):
     """Import a locus interval list with ``reference_genome=None``.

     Asserts that the imported table has 2 rows and that the point type of
     its ``interval`` field is a struct of ``contig`` (str) and ``position``
     (int32).
     """
     interval_file = resource('annotinterall.interval_list')
     t = hl.import_locus_intervals(interval_file, reference_genome=None)
     # assertEqual (instead of assertTrue on a comparison) reports the
     # actual row count when the check fails.
     self.assertEqual(t.count(), 2)
     self.assertEqual(t.interval.dtype.point_type,
                      hl.tstruct(contig=hl.tstr, position=hl.tint32))
Ejemplo n.º 56
0
def linreg(y, x):
    """Compute linear regression statistics.

    Examples
    --------
    Regress HT against an intercept (1) , SEX, and C1:

    >>> table1.aggregate(agg.linreg(table1.HT, [1, table1.SEX == 'F', table1.C1]))
    Struct(
    beta=[88.50000000000014, 81.50000000000057, -10.000000000000068],
    standard_error=[14.430869689661844, 59.70552738231206, 7.000000000000016],
    t_stat=[6.132686518775844, 1.365032746099571, -1.428571428571435],
    p_value=[0.10290201427537926, 0.40250974549499974, 0.3888002244284281],
    n=4)

    Regress blood pressure against an intercept (1), age, height, and height squared:

    >>> ds_ann = ds.annotate_rows(
    ...    linreg = hl.agg.linreg(ds.pheno.blood_pressure,
    ...                           [1, ds.pheno.age, ds.pheno.height, ds.pheno.height ** 2]))

    Notes
    -----
    This aggregator returns a struct expression with five fields:

     - `beta` (:class:`.tarray` of :py:data:`.tfloat64`): Estimated regression coefficient
       for each predictor.
     - `standard_error` (:class:`.tarray` of :py:data:`.tfloat64`): Estimated standard error
       for each predictor.
     - `t_stat` (:class:`.tarray` of :py:data:`.tfloat64`): t statistic for each predictor.
     - `p_value` (:class:`.tarray` of :py:data:`.tfloat64`): p-value for each predictor.
     - `n` (:py:data:`.tint64`): Number of samples included in the regression. A sample is
       included if and only if `y` and all elements of `x` are non-missing.

    The first four fields are missing if n is less than or equal to the number of predictors
    or if the predictors are linearly dependent.

    Parameters
    ----------
    y : :class:`.Float64Expression`
        Response variable.
    x : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        Independent variables.

    Returns
    -------
    :class:`.StructExpression`
        Struct with fields `beta`, `standard_error`, `t_stat`, `p_value`, and `n`.

    Raises
    ------
    ValueError
        If `x` contains no predictors.
    """
    # Normalise a single predictor into a list so it can be counted.
    predictors = wrap_to_list(x)
    n_predictors = len(predictors)
    if not n_predictors:
        raise ValueError("'linreg' requires at least one predictor in `x`")

    result_type = hl.tstruct(beta=hl.tarray(hl.tfloat64),
                             standard_error=hl.tarray(hl.tfloat64),
                             t_stat=hl.tarray(hl.tfloat64),
                             p_value=hl.tarray(hl.tfloat64),
                             n=hl.tint64)

    predictor_array = hl.array(predictors)

    # The predictor count is passed as a constructor argument; the predictor
    # array itself is supplied through the ``f`` callback.
    return _agg_func('LinearRegression', y, result_type, [hl.int32(n_predictors)],
                     f=lambda expr: predictor_array)
Ejemplo n.º 57
0
import hail as hl

# gVCF inputs, read from the hail-common bucket.
gvcfs = [
    'gs://hail-common/test-resources/HG00096.g.vcf.gz',
    'gs://hail-common/test-resources/HG00268.g.vcf.gz',
]
hl.init(default_reference='GRCh38')

# (first, last) chr20 positions of each import partition; both ends inclusive.
partition_bounds = [
    (17821257, 18708366),
    (18708367, 19776611),
    (19776612, 21144633),
]
parts_json = [
    {'start': {'locus': {'contig': 'chr20', 'position': first}},
     'end': {'locus': {'contig': 'chr20', 'position': last}},
     'includeStart': True,
     'includeEnd': True}
    for first, last in partition_bounds
]

# Decode the JSON description into an array of locus-keyed intervals.
interval_array_type = hl.tarray(hl.tinterval(hl.tstruct(locus=hl.tlocus('GRCh38'))))
parts = interval_array_type._convert_from_json(parts_json)

# Force a row count on every matrix table returned by the import.
for mt in hl.import_gvcfs(gvcfs, parts):
    mt._force_count_rows()
Ejemplo n.º 58
0
    def test_annotate(self):
        """Check ``Table.annotate`` with scalar, array and operator expressions."""
        row_type = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32))

        data = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
                {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
                {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]}]

        kt = hl.Table.parallelize(data, row_type)

        # Annotating with no fields must give back an equivalent table.
        self.assertTrue(kt.annotate()._same(kt))

        # Original fields of the row that take(1) is expected to return; the
        # expectations below extend this with the annotated fields.
        base_row = {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]}

        with_scalars = kt.annotate(foo=kt.a + 1,
                                   foo2=kt.a)
        scalar_result = convert_struct_to_dict(with_scalars.take(1)[0])
        self.assertDictEqual(scalar_result, dict(base_row, foo=5, foo2=4))

        with_arrays = kt.annotate(
            x1=kt.f.map(lambda x: x * 2),
            x2=kt.f.map(lambda x: [x, x + 1]).flatmap(lambda x: x),
            x3=hl.min(kt.f),
            x4=hl.max(kt.f),
            x5=hl.sum(kt.f),
            x6=hl.product(kt.f),
            x7=kt.f.length(),
            x8=kt.f.filter(lambda x: x == 3),
            x9=kt.f[1:],
            x10=kt.f[:],
            x11=kt.f[1:2],
            x12=kt.f.map(lambda x: [x, x + 1]),
            x13=kt.f.map(lambda x: [[x, x + 1], [x + 2]]).flatmap(lambda x: x),
            x14=hl.cond(kt.a < kt.b, kt.c, hl.null(hl.tint32)),
            x15={1, 2, 3}
        )
        array_result = convert_struct_to_dict(with_arrays.take(1)[0])

        expected_arrays = dict(
            base_row,
            x1=[2, 4, 6],
            x2=[1, 2, 2, 3, 3, 4],
            x3=1,
            x4=3,
            x5=6,
            x6=6,
            x7=3,
            x8=[3],
            x9=[2, 3],
            x10=[1, 2, 3],
            x11=[2],
            x12=[[1, 2], [2, 3], [3, 4]],
            x13=[[1, 2], [3], [2, 3], [4], [3, 4], [5]],
            x14=None,
            x15={1, 2, 3},
        )
        self.assertDictEqual(array_result, expected_arrays)

        # Build (but do not evaluate) an annotation using every arithmetic,
        # comparison and boolean operator; the resulting table is discarded.
        kt.annotate(
            x1=kt.a + 5,
            x2=5 + kt.a,
            x3=kt.a + kt.b,
            x4=kt.a - 5,
            x5=5 - kt.a,
            x6=kt.a - kt.b,
            x7=kt.a * 5,
            x8=5 * kt.a,
            x9=kt.a * kt.b,
            x10=kt.a / 5,
            x11=5 / kt.a,
            x12=kt.a / kt.b,
            x13=-kt.a,
            x14=+kt.a,
            x15=kt.a == kt.b,
            x16=kt.a == 5,
            x17=5 == kt.a,
            x18=kt.a != kt.b,
            x19=kt.a != 5,
            x20=5 != kt.a,
            x21=kt.a > kt.b,
            x22=kt.a > 5,
            x23=5 > kt.a,
            x24=kt.a >= kt.b,
            x25=kt.a >= 5,
            x26=5 >= kt.a,
            x27=kt.a < kt.b,
            x28=kt.a < 5,
            x29=5 < kt.a,
            x30=kt.a <= kt.b,
            x31=kt.a <= 5,
            x32=5 <= kt.a,
            x33=(kt.a == 0) & (kt.b == 5),
            x34=(kt.a == 0) | (kt.b == 5),
            x35=False,
            x36=True
        )
Ejemplo n.º 59
0
def generate_datasets(doctest_namespace):
    """Fill ``doctest_namespace`` with the objects used by the doctests.

    Makes sure an ``output/`` directory exists (best effort), removes any
    leftover ``sample*.vds`` directories, then stores datasets, tables and
    example expressions in ``doctest_namespace`` under fixed names.

    Parameters
    ----------
    doctest_namespace
        Mapping that receives the named objects; it is modified in place.
    """
    ns = doctest_namespace
    ns['hl'] = hl

    # Best-effort creation: an OSError from mkdir is deliberately ignored.
    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    # Remove directories that may be left over from an earlier run.
    for stale_dir in ("sample.vds", "sample.qc.vds", "sample.filtered.vds"):
        if os.path.isdir(stale_dir):
            shutil.rmtree(stale_dir)

    # MatrixTable: the same example dataset is published under several names.
    ds = hl.read_matrix_table('data/example.vds')
    ns['ds'] = ns['dataset'] = ds
    ns['dataset2'] = ds.annotate_globals(global_field=5)
    ns['dataset_to_union_1'] = ns['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    ns['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    for name in ('s_metadata', 'cols_to_keep', 'cols_to_remove'):
        ns[name] = s_metadata
    for name in ('rows_to_keep', 'rows_to_remove'):
        ns[name] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    ns['table1'] = ns['other_table'] = table1

    ns['table2'] = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')

    table4_types = {
        'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
        'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
    }
    ns['table4'] = hl.import_table('data/kt_example4.tsv',
                                   impute=True,
                                   types=table4_types)

    people_types = {
        'Age': hl.tint32,
        'Children': hl.tarray(hl.tstr)
    }
    ns['people_table'] = hl.import_table('data/explode_example.tsv',
                                         delimiter='\\s+',
                                         types=people_types,
                                         key='Name')

    # TDT
    ns['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    ns['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    ns['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    ns['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    ns['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    ns['t'] = hl.literal(True)
    ns['f'] = hl.literal(False)
    ns['na'] = hl.null(hl.tbool)
    ns['call'] = hl.call(0, 1, phased=False)
    ns['a'] = hl.literal([1, 2, 3, 4, 5])
    ns['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    ns['interval'] = hl.interval(3, 11)
    ns['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    ns['locus'] = hl.locus('1', 1034245)
    ns['x'] = hl.literal(3)
    ns['y'] = hl.literal(4.5)
    ns['s1'] = hl.literal({1, 2, 3})
    ns['s2'] = hl.literal({1, 3, 5})
    ns['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    ns['struct'] = hl.struct(a=5, b='Foo')
    ns['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    ns['s'] = hl.literal('The quick brown fox')
    ns['interval2'] = hl.Interval(3, 6)
    ns['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    ns['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    ns['mt'] = ds

    gnomad_data = ds.rows()
    ns['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    ns['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
Ejemplo n.º 60
0
    def test_operators(self):
        """Evaluate every arithmetic, comparison and boolean operator on a row.

        Annotates a small table with one field per operator form (expression
        on the left, literal on the left, expression on both sides) and
        compares the first row's values with hand-computed expectations.
        """
        row_type = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32))

        data = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
                {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
                {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]}]

        kt = hl.Table.parallelize(data, row_type)

        annotated = kt.annotate(
            x1=kt.a + 5,
            x2=5 + kt.a,
            x3=kt.a + kt.b,
            x4=kt.a - 5,
            x5=5 - kt.a,
            x6=kt.a - kt.b,
            x7=kt.a * 5,
            x8=5 * kt.a,
            x9=kt.a * kt.b,
            x10=kt.a / 5,
            x11=5 / kt.a,
            x12=kt.a / kt.b,
            x13=-kt.a,
            x14=+kt.a,
            x15=kt.a == kt.b,
            x16=kt.a == 5,
            x17=5 == kt.a,
            x18=kt.a != kt.b,
            x19=kt.a != 5,
            x20=5 != kt.a,
            x21=kt.a > kt.b,
            x22=kt.a > 5,
            x23=5 > kt.a,
            x24=kt.a >= kt.b,
            x25=kt.a >= 5,
            x26=5 >= kt.a,
            x27=kt.a < kt.b,
            x28=kt.a < 5,
            x29=5 < kt.a,
            x30=kt.a <= kt.b,
            x31=kt.a <= 5,
            x32=5 <= kt.a,
            x33=(kt.a == 0) & (kt.b == 5),
            x34=(kt.a == 0) | (kt.b == 5),
            x35=False,
            x36=True
        )
        result = convert_struct_to_dict(annotated.take(1)[0])

        expected = {
            # original fields
            'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3],
            # addition, subtraction, multiplication
            'x1': 9, 'x2': 9, 'x3': 5,
            'x4': -1, 'x5': 1, 'x6': 3,
            'x7': 20, 'x8': 20, 'x9': 4,
            # division and unary operators
            'x10': 4.0 / 5, 'x11': 5.0 / 4, 'x12': 4, 'x13': -4, 'x14': 4,
            # ==, !=
            'x15': False, 'x16': False, 'x17': False,
            'x18': True, 'x19': True, 'x20': True,
            # >, >=
            'x21': True, 'x22': False, 'x23': True,
            'x24': True, 'x25': False, 'x26': True,
            # <, <=
            'x27': False, 'x28': True, 'x29': False,
            'x30': False, 'x31': True, 'x32': False,
            # &, |, boolean literals
            'x33': False, 'x34': False, 'x35': False, 'x36': True,
        }

        # Floats are compared approximately; everything else must match exactly.
        for k, v in expected.items():
            check = self.assertAlmostEqual if isinstance(v, float) else self.assertEqual
            check(v, result[k], msg=k)