Example #1
 def _compute_type(self):
     for c in self.children:
         c.typ  # force
     child_typ = self.children[0].typ
     self._type = hl.ttable(
         hl.tstruct(**{self.global_name: hl.tarray(child_typ.global_type)}),
         child_typ.key_type._insert_field(self.data_name, hl.tarray(child_typ.value_type)),
         child_typ.row_key)
Example #2
    def table_irs(self):
        b = ir.TrueIR()
        table_read = ir.TableRead(
            ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht')), False)
        table_read_row_type = hl.dtype('struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}')

        matrix_read = ir.MatrixRead(
            ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')), False, False)

        range = ir.TableRange(10, 4)
        table_irs = [
            ir.TableKeyBy(table_read, ['m', 'd'], False),
            ir.TableFilter(table_read, b),
            table_read,
            ir.MatrixColsTable(matrix_read),
            ir.TableAggregateByKey(
                table_read,
                ir.MakeStruct([('a', ir.I32(5))])),
            ir.TableKeyByAndAggregate(
                table_read,
                ir.MakeStruct([('a', ir.I32(5))]),
                ir.MakeStruct([('b', ir.I32(5))]),
                1, 2),
            ir.TableJoin(
                table_read,
                ir.TableRange(100, 10), 'inner', 1),
            ir.MatrixEntriesTable(matrix_read),
            ir.MatrixRowsTable(matrix_read),
            ir.TableParallelize(ir.MakeStruct([
                ('rows', ir.Literal(hl.tarray(hl.tstruct(a=hl.tint32)), [{'a':None}, {'a':5}, {'a':-3}])),
                ('global', ir.MakeStruct([]))]), None),
            ir.TableMapRows(
                ir.TableKeyBy(table_read, []),
                ir.MakeStruct([
                    ('a', ir.GetField(ir.Ref('row'), 'f32')),
                    ('b', ir.F64(-2.11))])),
            ir.TableMapGlobals(
                table_read,
                ir.MakeStruct([
                    ('foo', ir.NA(hl.tarray(hl.tint32)))])),
            ir.TableRange(100, 10),
            ir.TableRepartition(table_read, 10, ir.RepartitionStrategy.COALESCE),
            ir.TableUnion(
                [ir.TableRange(100, 10), ir.TableRange(50, 10)]),
            ir.TableExplode(table_read, ['mset']),
            ir.TableHead(table_read, 10),
            ir.TableOrderBy(ir.TableKeyBy(table_read, []), [('m', 'A'), ('m', 'D')]),
            ir.TableDistinct(table_read),
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            ir.TableRename(table_read, {'idx': 'idx_foo'}, {'global_f32': 'global_foo'}),
            ir.TableMultiWayZipJoin([table_read, table_read], '__data', '__globals'),
            ir.MatrixToTableApply(matrix_read, {'name': 'LinearRegressionRowsSingle', 'yFields': ['col_m'], 'xField': 'entry_m', 'covFields': [], 'rowBlockSize': 10, 'passThrough': []}),
            ir.TableToTableApply(table_read, {'name': 'TableFilterPartitions', 'parts': [0], 'keep': True})
        ]

        return table_irs
Example #3
 def _compute_type(self):
     child_typ = self.child.typ
     self._type = hl.tmatrix(
         child_typ.global_type,
         child_typ.col_key_type._concat(
             hl.tstruct(**{f: hl.tarray(t) for f, t in child_typ.col_value_type.items()})),
         child_typ.col_key,
         child_typ.row_type,
         child_typ.row_key,
         hl.tstruct(**{f: hl.tarray(t) for f, t in child_typ.entry_type.items()}))
Example #4
 def _compute_type(self):
     child_typ = self.child.typ
     self._type = hl.tmatrix(
         child_typ.global_type,
         child_typ.col_key_type._concat(
             hl.tstruct(**{f: hl.tarray(t) for f, t in child_typ.col_value_type.items()})),
         child_typ.col_key,
         child_typ.row_type,
         child_typ.row_key,
         hl.tstruct(**{f: hl.tarray(t) for f, t in child_typ.entry_type.items()}))
Example #5
    def table_irs(self):
        b = ir.TrueIR()
        table_read = ir.TableRead(
            'src/test/resources/backward_compatability/1.0.0/table/0.ht',
            False, None)
        table_read_row_type = hl.dtype(
            'struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}'
        )

        matrix_read = ir.MatrixRead(
            'src/test/resources/backward_compatability/1.0.0/matrix_table/0.hmt',
            False, False)

        range = ir.TableRange(10, 4)
        table_irs = [
            ir.TableUnkey(table_read),
            ir.TableKeyBy(table_read, ['m', 'd'], 1, True),
            ir.TableFilter(table_read, b),
            table_read,
            ir.MatrixColsTable(matrix_read),
            ir.TableAggregateByKey(table_read,
                                   ir.MakeStruct([('a', ir.I32(5))])),
            ir.TableJoin(table_read, ir.TableRange(100, 10), 'inner'),
            ir.MatrixEntriesTable(matrix_read),
            ir.MatrixRowsTable(matrix_read),
            ir.TableParallelize(
                'Table{global:Struct{},key:None,row:Struct{a:Int32}}',
                ir.Value(hl.tarray(hl.tstruct(a=hl.tint32)), [{
                    'a': None
                }, {
                    'a': 5
                }, {
                    'a': -3
                }]), None),
            ir.TableMapRows(
                table_read,
                ir.MakeStruct([('a',
                                ir.GetField(ir.Ref('row', table_read_row_type),
                                            'f32')), ('b', ir.F64(-2.11))]),
                None, None),
            ir.TableMapGlobals(
                table_read,
                ir.MakeStruct([('foo', ir.NA(hl.tarray(hl.tint32)))]),
                ir.Value(hl.tstruct(), {})),
            ir.TableRange(100, 10),
            ir.TableRepartition(table_read, 10, False),
            ir.TableUnion([ir.TableRange(100, 10),
                           ir.TableRange(50, 10)]),
            ir.TableExplode(table_read, 'mset'),
            ir.TableOrderBy(ir.TableUnkey(table_read), [('m', 'A'),
                                                        ('m', 'D')]),
            ir.TableDistinct(table_read),
        ]

        return table_irs
Example #6
    def table_irs(self):
        b = ir.TrueIR()
        table_read = ir.TableRead(
            resource('backward_compatability/1.0.0/table/0.ht'), False, None)
        table_read_row_type = hl.dtype('struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}')

        matrix_read = ir.MatrixRead(
            ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')), False, False)

        range = ir.TableRange(10, 4)
        table_irs = [
            ir.TableKeyBy(table_read, ['m', 'd'], False),
            ir.TableFilter(table_read, b),
            table_read,
            ir.MatrixColsTable(matrix_read),
            ir.TableAggregateByKey(
                table_read,
                ir.MakeStruct([('a', ir.I32(5))])),
            ir.TableKeyByAndAggregate(
                table_read,
                ir.MakeStruct([('a', ir.I32(5))]),
                ir.MakeStruct([('b', ir.I32(5))]),
                1, 2),
            ir.TableJoin(
                table_read,
                ir.TableRange(100, 10), 'inner', 1),
            ir.MatrixEntriesTable(matrix_read),
            ir.MatrixRowsTable(matrix_read),
            ir.TableParallelize(ir.MakeStruct([
                ('rows', ir.Literal(hl.tarray(hl.tstruct(a=hl.tint32)), [{'a':None}, {'a':5}, {'a':-3}])),
                ('global', ir.MakeStruct([]))]), None),
            ir.TableMapRows(
                ir.TableKeyBy(table_read, []),
                ir.MakeStruct([
                    ('a', ir.GetField(ir.Ref('row'), 'f32')),
                    ('b', ir.F64(-2.11))])),
            ir.TableMapGlobals(
                table_read,
                ir.MakeStruct([
                    ('foo', ir.NA(hl.tarray(hl.tint32)))])),
            ir.TableRange(100, 10),
            ir.TableRepartition(table_read, 10, False),
            ir.TableUnion(
                [ir.TableRange(100, 10), ir.TableRange(50, 10)]),
            ir.TableExplode(table_read, 'mset'),
            ir.TableHead(table_read, 10),
            ir.TableOrderBy(ir.TableKeyBy(table_read, []), [('m', 'A'), ('m', 'D')]),
            ir.TableDistinct(table_read),
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            ir.TableRename(table_read, {'idx': 'idx_foo'}, {'global_f32': 'global_foo'}),
            ir.TableMultiWayZipJoin([table_read, table_read], '__data', '__globals'),
        ]

        return table_irs
Example #7
 def test_import_vcf_missing_info_field_elements(self):
     mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37', array_elements_required=False)
     mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)
     expected = hl.Table.parallelize([{'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                                       'FOO': [1, None], 'BAR': [2, None, None]},
                                      {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
                                       'FOO': [None, 2, None], 'BAR': [None, 1.0, None]}],
                                     hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr),
                                                FOO=hl.tarray(hl.tint), BAR=hl.tarray(hl.tfloat64)),
                                     key=['locus', 'alleles'])
     self.assertTrue(mt.rows()._same(expected))
Example #8
    def blockmatrix_irs(self):
        scalar_ir = ir.F64(2)
        vector_ir = ir.MakeArray([ir.F64(3), ir.F64(2)], hl.tarray(hl.tfloat64))

        read = ir.BlockMatrixRead(ir.BlockMatrixNativeReader(resource('blockmatrix_example/0')))
        add_two_bms = ir.BlockMatrixMap2(read, read, 'l', 'r', ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r')), "Union")
        negate_bm = ir.BlockMatrixMap(read, 'element', ir.ApplyUnaryPrimOp('-', ir.Ref('element')), False)
        sqrt_bm = ir.BlockMatrixMap(read, 'element', hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir, False)
        persisted = ir.BlockMatrixRead(ir.BlockMatrixPersistReader('x', read))

        scalar_to_bm = ir.ValueToBlockMatrix(scalar_ir, [1, 1], 1)
        col_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [2, 1], 1)
        row_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [1, 2], 1)
        broadcast_scalar = ir.BlockMatrixBroadcast(scalar_to_bm, [], [2, 2], 256)
        broadcast_col = ir.BlockMatrixBroadcast(col_vector_to_bm, [0], [2, 2], 256)
        broadcast_row = ir.BlockMatrixBroadcast(row_vector_to_bm, [1], [2, 2], 256)
        transpose = ir.BlockMatrixBroadcast(broadcast_scalar, [1, 0], [2, 2], 256)
        matmul = ir.BlockMatrixDot(broadcast_scalar, transpose)

        rectangle = ir.Literal(hl.tarray(hl.tint64), [0, 1, 5, 6])
        band = ir.Literal(hl.ttuple(hl.tint64, hl.tint64), (-1, 1))
        intervals = ir.Literal(hl.ttuple(hl.tarray(hl.tint64), hl.tarray(hl.tint64)), ([0, 1, 5, 6], [5, 6, 8, 9]))

        sparsify1 = ir.BlockMatrixSparsify(read, rectangle, ir.RectangleSparsifier)
        sparsify2 = ir.BlockMatrixSparsify(read, band, ir.BandSparsifier(True))
        sparsify3 = ir.BlockMatrixSparsify(read, intervals, ir.RowIntervalSparsifier(True))

        densify = ir.BlockMatrixDensify(read)

        pow_ir = (construct_expr(ir.Ref('l'), hl.tfloat64) ** construct_expr(ir.Ref('r'), hl.tfloat64))._ir
        squared_bm = ir.BlockMatrixMap2(scalar_to_bm, scalar_to_bm, 'l', 'r', pow_ir, "NeedsDense")
        slice_bm = ir.BlockMatrixSlice(matmul, [slice(0, 2, 1), slice(0, 1, 1)])

        return [
            read,
            persisted,
            add_two_bms,
            negate_bm,
            sqrt_bm,
            scalar_to_bm,
            col_vector_to_bm,
            row_vector_to_bm,
            broadcast_scalar,
            broadcast_col,
            broadcast_row,
            squared_bm,
            transpose,
            sparsify1,
            sparsify2,
            sparsify3,
            densify,
            matmul,
            slice_bm
        ]
Example #9
 def test_parses(self):
     env = {'c': hl.tbool,
            'a': hl.tarray(hl.tint32),
            'aa': hl.tarray(hl.tarray(hl.tint32)),
            'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
            'v': hl.tint32,
            's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
            't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
            'call': hl.tcall,
            'x': hl.tint32}
     env = {name: t._jtype for name, t in env.items()}
     for x in self.value_irs():
         Env.hail().expr.ir.IRParser.parse_value_ir(str(x), env, {})
Example #10
 def test_parses(self):
     env = {'c': hl.tbool,
            'a': hl.tarray(hl.tint32),
            'aa': hl.tarray(hl.tarray(hl.tint32)),
            'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
            'v': hl.tint32,
            's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
            't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
            'call': hl.tcall,
            'x': hl.tint32}
     env = {name: t._parsable_string() for name, t in env.items()}
     for x in self.value_irs():
         Env.hail().expr.ir.IRParser.parse_value_ir(str(x), env, {})
Example #11
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Example #12
 def _compute_type(self):
     name = self.config['name']
     child_typ = self.child.typ
     if name == 'MatrixFilterPartitions':
         self._type = child_typ
     else:
         assert name == 'WindowByLocus', name
         self._type = hl.tmatrix(
             child_typ.global_type,
             child_typ.col_type,
             child_typ.col_key,
             child_typ.row_type._insert_field('prev_rows', hl.tarray(child_typ.row_type)),
             child_typ.row_key,
             child_typ.entry_type._insert_field('prev_entries', hl.tarray(child_typ.entry_type)))
Example #13
def read_variants_ht(path: str) -> hl.Table:
    variants_ht = hl.read_table(path)

    # Make sure that types match
    assert (isinstance(variants_ht.key[0], hl.expr.LocusExpression) &
            (variants_ht.key[1].dtype == hl.tarray(hl.tstr))
            & isinstance(variants_ht.key[2], hl.expr.LocusExpression) &
            (variants_ht.key[3].dtype == hl.tarray(hl.tstr)))

    variants_ht = variants_ht.key_by(**get_sorted_variants_expr(
        variants_ht.key[0], variants_ht.key[1], variants_ht.key[2],
        variants_ht.key[3])).persist()

    return variants_ht
Example #14
 def test_parses(self):
     env = {'c': hl.tbool,
            'a': hl.tarray(hl.tint32),
            'st': hl.tstream(hl.tint32),
            'aa': hl.tarray(hl.tarray(hl.tint32)),
            'sta': hl.tstream(hl.tarray(hl.tint32)),
            'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
            'nd': hl.tndarray(hl.tfloat64, 1),
            'v': hl.tint32,
            's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
            't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
            'call': hl.tcall,
            'x': hl.tint32}
     for x in self.value_irs():
         Env.spark_backend('ValueIRTests.test_parses')._parse_value_ir(str(x), env)
Example #15
 def _compute_type(self):
     name = self.config['name']
     child_typ = self.child.typ
     if (name == 'MatrixFilterPartitions'
             or name == 'MatrixFilterIntervals'):
         self._type = child_typ
     else:
         assert name == 'WindowByLocus', name
         self._type = hl.tmatrix(
             child_typ.global_type,
             child_typ.col_type,
             child_typ.col_key,
             child_typ.row_type._insert_field('prev_rows', hl.tarray(child_typ.row_type)),
             child_typ.row_key,
             child_typ.entry_type._insert_field('prev_entries', hl.tarray(child_typ.entry_type)))
Example #16
    def __init__(self, schema, paths, key, intervals):
        assert (key is None) == (intervals is None)
        self.schema = schema
        self.paths = paths
        self.key = key

        if intervals is not None:
            t = hl.expr.impute_type(intervals)
            if not isinstance(t, hl.tarray) or not isinstance(
                    t.element_type, hl.tinterval):
                raise TypeError("'intervals' must be an array of tintervals")
            pt = t.element_type.point_type
            if isinstance(pt, hl.tstruct):
                self._interval_type = t
            else:
                self._interval_type = hl.tarray(
                    hl.tinterval(hl.tstruct(__point=pt)))

        if intervals is not None and t != self._interval_type:
            self.intervals = [
                hl.Interval(hl.Struct(__point=i.start),
                            hl.Struct(__point=i.end), i.includes_start,
                            i.includes_end) for i in intervals
            ]
        else:
            self.intervals = intervals
Example #17
def downsample(x, y, label=None, n_divisions=500) -> ArrayExpression:
    """Downsample (x, y) coordinate datapoints.

    Parameters
    ----------
    x : :class:`.NumericExpression`
        X-values to be downsampled.
    y : :class:`.NumericExpression`
        Y-values to be downsampled.
    label : :class:`.StringExpression` or :class:`.ArrayExpression`
        Additional data for each (x, y) coordinate. Can pass in multiple fields in an :class:`.ArrayExpression`.
    n_divisions : :obj:`int`
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`.ArrayExpression`
        Expression for downsampled coordinate points (x, y). The element type of the array is
        :py:data:`.ttuple` of :py:data:`.tfloat64`, :py:data:`.tfloat64`, and :py:data:`.tarray` of :py:data:`.tstr`.
    """
    if label is None:
        label = hl.null(hl.tarray(hl.tstr))
    elif isinstance(label, StringExpression):
        label = hl.array([label])
    return _agg_func('downsample', [x, y, label], tarray(ttuple(tfloat64, tfloat64, tarray(tstr))),
                     constructor_args=[n_divisions])
Example #18
def downsample(x, y, label=None, n_divisions=500) -> ArrayExpression:
    """Downsample (x, y) coordinate datapoints.

    Parameters
    ----------
    x : :class:`.NumericExpression`
        X-values to be downsampled.
    y : :class:`.NumericExpression`
        Y-values to be downsampled.
    label : :class:`.StringExpression` or :class:`.ArrayExpression`
        Additional data for each (x, y) coordinate. Can pass in multiple fields in an :class:`.ArrayExpression`.
    n_divisions : :obj:`int`
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`.ArrayExpression`
        Expression for downsampled coordinate points (x, y). The element type of the array is
        :py:data:`.ttuple` of :py:data:`.tfloat64`, :py:data:`.tfloat64`, and :py:data:`.tarray` of :py:data:`.tstr`.
    """
    if label is None:
        label = hl.null(hl.tarray(hl.tstr))
    elif isinstance(label, StringExpression):
        label = hl.array([label])
    return _agg_func('downsample',
                     _to_agg(x),
                     tarray(ttuple(tfloat64, tfloat64, tarray(tstr))),
                     constructor_args=[n_divisions],
                     seq_op_args=[lambda x: x, y, label])
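The docstring above describes the aggregator's interface. As a minimal usage sketch (not part of the original source), it can be called through hl.agg.downsample on a table with numeric x and y fields; the table and field names here are assumptions for illustration:

import hail as hl

ht = hl.utils.range_table(1000)
ht = ht.annotate(x=hl.float64(ht.idx), y=hl.float64(ht.idx) ** 2)
# Each element of the result is a (float64, float64, array<str>) tuple.
pts = ht.aggregate(hl.agg.downsample(ht.x, ht.y, label=hl.str(ht.idx), n_divisions=100))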
Example #19
    def _get_train_counts(ht: hl.Table) -> Tuple[int, int]:
        """
        Determine the number of TP and FP variants in the input Table and report some stats on Ti, Tv, indels.

        :param ht: Input Table
        :return: Counts of TP and FP variants in the table
        """
        train_stats = hl.struct(n=hl.agg.count())

        if "alleles" in ht.row and ht.row.alleles.dtype == hl.tarray(hl.tstr):
            train_stats = train_stats.annotate(
                ti=hl.agg.count_where(
                    hl.expr.is_transition(ht.alleles[0], ht.alleles[1])),
                tv=hl.agg.count_where(
                    hl.expr.is_transversion(ht.alleles[0], ht.alleles[1])),
                indel=hl.agg.count_where(
                    hl.expr.is_indel(ht.alleles[0], ht.alleles[1])),
            )

        # Sample training examples
        pd_stats = (ht.group_by(**{
            "contig": ht.locus.contig,
            "tp": ht._tp,
            "fp": ht._fp
        }).aggregate(**train_stats).to_pandas())

        logger.info(pformat(pd_stats))
        pd_stats = pd_stats.fillna(False)

        # Number of true positive and false positive variants to be sampled for the training set
        n_tp = pd_stats[pd_stats["tp"] & ~pd_stats["fp"]]["n"].sum()
        n_fp = pd_stats[~pd_stats["tp"] & pd_stats["fp"]]["n"].sum()

        return n_tp, n_fp
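For context, a hypothetical input table shaped the way _get_train_counts expects (a locus, an alleles array, and boolean _tp/_fp labels) could be built as below; the values are illustrative only and not from the original source:

import hail as hl

ht = hl.utils.range_table(100)
ht = ht.annotate(locus=hl.locus('1', ht.idx + 1),   # contig is used by the group_by
                 alleles=hl.literal(['A', 'T']),    # array<str>, enables the ti/tv/indel counts
                 _tp=ht.idx % 2 == 0,               # illustrative truth labels
                 _fp=ht.idx % 2 == 1)
# n_tp, n_fp = _get_train_counts(ht)  (assuming the helper above is in scope)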
Example #20
    def test_group_cols_by_aggregate(self):
        mt, mt2 = self.get_groupable_matrix2()

        col_result = (mt.group_cols_by(group=mt2.cols()[mt.col_idx].col_idx2 < 2)
                      .aggregate_cols(collect=hl.agg.collect(mt.col_idx))
                      .aggregate_cols(count=hl.agg.count())
                      .aggregate_entries(sum=hl.agg.sum(mt2[mt.row_idx, mt.col_idx].x + mt.glob) + mt.glob - 15 - mt.row_idx) # tests fixed indices
                      .aggregate_entries(x=5)
                      .result())

        col_expected = (
            hl.Table.parallelize(
                [{'group': True, 'row_idx': 0, 'sum': 1, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': True, 'row_idx': 1, 'sum': 2, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': True, 'row_idx': 2, 'sum': 3, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': True, 'row_idx': 3, 'sum': 4, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': False, 'row_idx': 0, 'sum': 5, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': False, 'row_idx': 1, 'sum': 6, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': False, 'row_idx': 2, 'sum': 7, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': False, 'row_idx': 3, 'sum': 8, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5}],
                hl.tstruct(row_idx=hl.tint32, r1=hl.tint32, group=hl.tbool, collect=hl.tarray(hl.tint32),
                           count=hl.tint64, sum=hl.tint64, x=hl.tint32)
            ).annotate_globals(glob=5).key_by('row_idx', 'group')
        )

        self.assertTrue(col_result.entries()._same(col_expected))
Example #21
 def test_uniqueness(self):
     db = hl.experimental.DB(config=AnnotationDBTests.db_json)
     t = hl.utils.range_table(10)
     t = t.annotate(locus=hl.locus('1', t.idx + 1))
     t = db.annotate_rows_db(t, 'unique_dataset', 'nonunique_dataset')
     assert t.unique_dataset.dtype == hl.tstruct(annotation=hl.tstr)
     assert t.nonunique_dataset.dtype == hl.tstruct(annotation=hl.tarray(hl.tstr))
Example #22
def validate_variant_results_table(ds):
    assert ds.key.dtype.fields == ("locus", "alleles"), "Table must be keyed by locus and alleles"
    assert ds.locus.dtype in (hl.tlocus("GRCh37"), hl.tlocus("GRCh38")), "'locus' must be a locus type"
    assert ds.alleles.dtype == hl.tarray(hl.tstr), "'alleles' must be an array of strings"

    required_fields = {
        "gene_id": hl.tstr,
        "consequence": hl.tstr,
        "hgvsc": hl.tstr,
        "hgvsp": hl.tstr,
    }
    for field, typ in required_fields.items():
        assert field in ds.row_value.dtype.fields, f"Missing required field '{field}'"
        assert ds[field].dtype == typ, f"{field} should be type {typ}"

    assert "group_results" in ds.row_value.dtype.fields, "Table must have a 'group_results' field"
    assert isinstance(ds.group_results.dtype, hl.tdict), "'group_results' must be a dict"
    assert ds.group_results.dtype.key_type == hl.tstr, "'group_results' keys must be strings"
    assert isinstance(ds.group_results.dtype.value_type, hl.tstruct), "'group_results' value must be a struct"

    for typ in ds.group_results.dtype.value_type.types:
        assert (
            typ in ALLOWED_RESULT_TYPES
        ), f"'group_results' fields may only be one of {', '.join(map(str, ALLOWED_RESULT_TYPES))}"

    assert isinstance(ds.info.dtype, hl.tstruct), "'info' must be a struct"
    for typ in ds.info.dtype.types:
        assert (
            typ in ALLOWED_RESULT_TYPES
        ), f"'info' fields may only be one of {', '.join(map(str, ALLOWED_RESULT_TYPES))}"
Example #23
def impute_sex_aggregator(call,
                          aaf,
                          aaf_threshold=0.0,
                          include_par=False,
                          female_threshold=0.4,
                          male_threshold=0.8) -> hl.Table:
    """:func:`.impute_sex` as an aggregator."""
    mt = call._indices.source
    rg = mt.locus.dtype.reference_genome
    x_contigs = hl.literal(
        hl.eval(
            hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg),
                   rg.x_contigs)))
    inbreeding = hl.agg.inbreeding(call, aaf)
    is_female = hl.if_else(
        inbreeding.f_stat < female_threshold, True,
        hl.if_else(inbreeding.f_stat > male_threshold, False,
                   hl.missing(hl.tbool)))
    expression = hl.struct(is_female=is_female, **inbreeding)
    if not include_par:
        interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
        par_intervals = hl.literal(rg.par, interval_type)
        expression = hl.agg.filter(
            ~par_intervals.any(
                lambda par_interval: par_interval.contains(mt.locus)),
            expression)
    expression = hl.agg.filter(
        (aaf > aaf_threshold) & (aaf < (1 - aaf_threshold)), expression)
    expression = hl.agg.filter(
        x_contigs.any(lambda contig: contig.contains(mt.locus)), expression)

    return expression
Example #24
    def test_annotate_globals(self):
        mt = hl.utils.range_matrix_table(1, 1)
        ht = hl.utils.range_table(1, 1)
        data = [(5, hl.tint, operator.eq),
                (float('nan'), hl.tfloat32, lambda x, y: str(x) == str(y)),
                (float('inf'), hl.tfloat64, lambda x, y: str(x) == str(y)),
                (float('-inf'), hl.tfloat64, lambda x, y: str(x) == str(y)),
                (1.111, hl.tfloat64, operator.eq),
                ([
                    hl.Struct(**{
                        'a': None,
                        'b': 5
                    }),
                    hl.Struct(**{
                        'a': 'hello',
                        'b': 10
                    })
                ], hl.tarray(hl.tstruct(a=hl.tstr, b=hl.tint)), operator.eq)]

        for x, t, f in data:
            self.assertTrue(
                f(mt.annotate_globals(foo=hl.literal(x, t)).foo.value, x),
                f"{x}, {t}")
            self.assertTrue(
                f(ht.annotate_globals(foo=hl.literal(x, t)).foo.value, x),
                f"{x}, {t}")
Example #25
 def _compute_type(self):
     name = self.config['name']
     child_typ = self.child.typ
     pass_through = self.config['passThrough']
     if name == 'LinearRegressionRowsChained':
         chained_schema = hl.dtype(
             'struct{n:array<int32>,sum_x:array<float64>,y_transpose_x:array<array<float64>>,beta:array<array<float64>>,standard_error:array<array<float64>>,t_stat:array<array<float64>>,p_value:array<array<float64>>}'
         )
         self._type = hl.ttable(
             child_typ.global_type, (child_typ.row_key_type._insert_fields(
                 **{f: child_typ.row_type[f]
                    for f in pass_through})._concat(chained_schema)),
             child_typ.row_key)
     elif name == 'LinearRegressionRowsSingle':
         chained_schema = hl.dtype(
             'struct{n:int32,sum_x:float64,y_transpose_x:array<float64>,beta:array<float64>,standard_error:array<float64>,t_stat:array<float64>,p_value:array<float64>}'
         )
         self._type = hl.ttable(
             child_typ.global_type, (child_typ.row_key_type._insert_fields(
                 **{f: child_typ.row_type[f]
                    for f in pass_through})._concat(chained_schema)),
             child_typ.row_key)
     else:
         assert name == 'LogisticRegression', name
         pass_through = self.config['passThrough']
         logreg_type = hl.tstruct(logistic_regression=hl.tarray(
             regression_test_type(self.config['test'])))
         self._type = hl.ttable(
             child_typ.global_type, (child_typ.row_key_type._insert_fields(
                 **{f: child_typ.row_type[f]
                    for f in pass_through})._concat(logreg_type)),
             child_typ.row_key)
Example #26
def test_ndarray_shape():
    np_e = np.array(3)
    np_row = np.array([1, 2, 3])
    np_col = np.array([[1], [2], [3]])
    np_m = np.array([[1, 2], [3, 4]])
    np_nd = np.arange(30).reshape((2, 5, 3))

    e = hl._ndarray(np_e)
    row = hl._ndarray(np_row)
    col = hl._ndarray(np_col)
    m = hl._ndarray(np_m)
    nd = hl._ndarray(np_nd)
    missing = hl._ndarray(hl.null(hl.tarray(hl.tint32)))

    assert_all_eval_to(
        (e.shape, np_e.shape),
        (row.shape, np_row.shape),
        (col.shape, np_col.shape),
        (m.shape, np_m.shape),
        (nd.shape, np_nd.shape),
        ((row + nd).shape, (np_row + np_nd).shape),
        ((row + col).shape, (np_row + np_col).shape),
        (m.transpose().shape, np_m.transpose().shape),
        (missing.shape, None)
    )
Example #27
def test_ndarray_ref():

    scalar = 5.0
    np_scalar = np.array(scalar)
    h_scalar = hl.nd.array(scalar)
    h_np_scalar = hl.nd.array(np_scalar)

    assert_evals_to(h_scalar[()], 5.0)
    assert_evals_to(h_np_scalar[()], 5.0)

    cube = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]
    h_cube = hl.nd.array(cube)
    h_np_cube = hl.nd.array(np.array(cube))
    missing = hl.nd.array(hl.null(hl.tarray(hl.tint32)))

    assert_all_eval_to(
        (h_cube[0, 0, 1], 1), (h_cube[1, 1, 0], 6), (h_np_cube[0, 0, 1], 1),
        (h_np_cube[1, 1, 0], 6), (hl.nd.array([[[[1]]]])[0, 0, 0, 0], 1),
        (hl.nd.array([[[1, 2]], [[3, 4]]])[1, 0, 0], 3), (missing[1], None),
        (hl.nd.array([1, 2, 3])[hl.null(hl.tint32)], None),
        (h_cube[0, 0, hl.null(hl.tint32)], None))

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.nd.array([1, 2, 3])[4])
    assert "Index out of bounds" in str(exc)
Example #28
def test_ndarray_eval():
    data_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    nd_expr = hl._ndarray(data_list)
    evaled = hl.eval(nd_expr)
    np_equiv = np.array(data_list, dtype=np.int32)
    assert(np.array_equal(evaled, np_equiv))
    assert(evaled.strides == np_equiv.strides)

    assert hl.eval(hl._ndarray([[], []])).strides == (8, 8)
    assert np.array_equal(hl.eval(hl._ndarray([])), np.array([]))

    zero_array = np.zeros((10, 10), dtype=np.int64)
    evaled_zero_array = hl.eval(hl.literal(zero_array))

    assert np.array_equal(evaled_zero_array, zero_array)
    assert zero_array.dtype == evaled_zero_array.dtype

    # Testing from hail arrays
    assert np.array_equal(hl.eval(hl._ndarray(hl.range(6))), np.arange(6))
    assert np.array_equal(hl.eval(hl._ndarray(hl.int64(4))), np.array(4))

    # Testing missing data
    assert hl.eval(hl._ndarray(hl.null(hl.tarray(hl.tint32)))) is None

    with pytest.raises(ValueError) as exc:
        hl._ndarray([[4], [1, 2, 3], 5])
    assert "inner dimensions do not match" in str(exc.value)
Example #29
    def test_group_cols_by_aggregate(self):
        mt, mt2 = self.get_groupable_matrix2()

        col_result = (mt.group_cols_by(group=mt2.cols()[mt.col_idx].col_idx2 < 2)
                      .aggregate_cols(collect=hl.agg.collect(mt.col_idx))
                      .aggregate_cols(count=hl.agg.count())
                      .aggregate_entries(sum=hl.agg.sum(mt2[mt.row_idx, mt.col_idx].x + mt.glob) + mt.glob - 15 - mt.row_idx) # tests fixed indices
                      .aggregate_entries(x=5)
                      .result())

        col_expected = (
            hl.Table.parallelize(
                [{'group': True, 'row_idx': 0, 'sum': 1, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': True, 'row_idx': 1, 'sum': 2, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': True, 'row_idx': 2, 'sum': 3, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': True, 'row_idx': 3, 'sum': 4, 'collect': [0, 1], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': False, 'row_idx': 0, 'sum': 5, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': False, 'row_idx': 1, 'sum': 6, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': False, 'row_idx': 2, 'sum': 7, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5},
                 {'group': False, 'row_idx': 3, 'sum': 8, 'collect': [2, 3], 'count': 2, 'r1': 3, 'x': 5}],
                hl.tstruct(row_idx=hl.tint32, r1=hl.tint32, group=hl.tbool, collect=hl.tarray(hl.tint32),
                           count=hl.tint64, sum=hl.tint64, x=hl.tint32)
            ).annotate_globals(glob=5).key_by('row_idx', 'group')
        )

        self.assertTrue(col_result.entries()._same(col_expected))
Example #30
    def test_import_vcf_missing_format_field_elements(self):
        mt = hl.import_vcf(resource('missingFormatArray.vcf'), reference_genome='GRCh37', array_elements_required=False)
        mt = mt.select_rows().select_entries('AD', 'PL')

        expected = hl.Table.parallelize([{'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'], 's': 'C1046::HG02024',
                                          'AD': [None, None], 'PL': [0, None, 180]},
                                         {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'], 's': 'C1046::HG02025',
                                          'AD': [None, 6], 'PL': [70, None]},
                                         {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'], 's': 'C1046::HG02024',
                                          'AD': [0, 0, None], 'PL': [396, None, None, 33, None, 0]},
                                         {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'], 's': 'C1046::HG02025',
                                          'AD': [0, 0, 9], 'PL': [None, None, None]}],
                                        hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr), s=hl.tstr,
                                                   AD=hl.tarray(hl.tint), PL=hl.tarray(hl.tint)),
                                        key=['locus', 'alleles', 's'])

        self.assertTrue(mt.entries()._same(expected))
Example #31
    def test_matrix_ir_parses(self):
        hl.index_bgen(resource('example.8bits.bgen'),
                      reference_genome=hail.get_reference('GRCh37'),
                      contig_recoding={'01': '1'})

        collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])
        collect = ir.MakeStruct([('x',
                                  ir.ApplyAggOp([ir.I32(0)], [], None,
                                                collect_sig,
                                                hl.tarray(hl.tint32)))])

        matrix_read = ir.MatrixRead(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt'), False,
            False)
        table_read = ir.TableRead(
            resource('backward_compatability/1.0.0/table/0.ht'), False, None)

        matrix_irs = [
            ir.MatrixUnionRows(ir.MatrixRange(5, 5, 1),
                               ir.MatrixRange(5, 5, 1)),
            ir.UnlocalizeEntries(ir.LocalizeEntries(matrix_read, '__entries'),
                                 ir.MatrixColsTable(matrix_read), '__entries'),
            ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
            ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
            ir.MatrixRange(1, 1, 10),
            ir.MatrixImportVCF([resource('sample.vcf')], False, False,
                               None, None, False, ['GT'],
                               hail.get_reference('GRCh37'), {}, True, False),
            ir.MatrixImportBGEN([resource('example.8bits.bgen')], ['GP'],
                                resource('example.sample'), {}, 10, 1,
                                ['varid'], None),
            ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
            ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
            ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
            ir.MatrixChooseCols(matrix_read, [1, 0]),
            ir.MatrixMapCols(matrix_read, ir.MakeStruct([('x', ir.I64(20))]),
                             ['x']),
            ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
            ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []),
                             ir.MakeStruct([('x', ir.I64(20))])),
            ir.MatrixMapEntries(matrix_read,
                                ir.MakeStruct([('x', ir.I64(20))])),
            ir.MatrixMapGlobals(matrix_read,
                                ir.MakeStruct([('x', ir.I64(20))])),
            ir.TableToMatrixTable(table_read, ['f32', 'i64'], ['m', 'astruct'],
                                  ['aset'], ['mset'], 100),
            ir.MatrixCollectColsByKey(matrix_read),
            ir.MatrixExplodeRows(matrix_read, ['row_aset']),
            ir.MatrixExplodeCols(matrix_read, ['col_aset']),
            ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo', None),
            ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
        ]

        for x in matrix_irs:
            try:
                Env.hail().expr.Parser.parse_matrix_ir(str(x))
            except Exception as e:
                raise ValueError(str(x)) from e
Example #32
    def test_import_bgen_GT_GP_entries(self):
        hl.index_bgen(resource('example.8bits.bgen'),
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        bgen = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['GT', 'GP'],
                              sample_file=resource('example.sample'))
        self.assertEqual(bgen.entry.dtype, hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
Example #33
    def test_import_bgen_row_fields(self):
        default_row_fields = hl.import_bgen(resource('example.8bits.bgen'),
                                            entry_fields=['dosage'],
                                            contig_recoding={'01': '1'},
                                            reference_genome='GRCh37')
        self.assertEqual(
            default_row_fields.row.dtype,
            hl.tstruct(locus=hl.tlocus('GRCh37'),
                       alleles=hl.tarray(hl.tstr),
                       rsid=hl.tstr,
                       varid=hl.tstr))
        no_row_fields = hl.import_bgen(resource('example.8bits.bgen'),
                                       entry_fields=['dosage'],
                                       contig_recoding={'01': '1'},
                                       reference_genome='GRCh37',
                                       _row_fields=[])
        self.assertEqual(
            no_row_fields.row.dtype,
            hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr)))
        varid_only = hl.import_bgen(resource('example.8bits.bgen'),
                                    entry_fields=['dosage'],
                                    contig_recoding={'01': '1'},
                                    reference_genome='GRCh37',
                                    _row_fields=['varid'])
        self.assertEqual(
            varid_only.row.dtype,
            hl.tstruct(locus=hl.tlocus('GRCh37'),
                       alleles=hl.tarray(hl.tstr),
                       varid=hl.tstr))
        rsid_only = hl.import_bgen(resource('example.8bits.bgen'),
                                   entry_fields=['dosage'],
                                   contig_recoding={'01': '1'},
                                   reference_genome='GRCh37',
                                   _row_fields=['rsid'])
        self.assertEqual(
            rsid_only.row.dtype,
            hl.tstruct(locus=hl.tlocus('GRCh37'),
                       alleles=hl.tarray(hl.tstr),
                       rsid=hl.tstr))

        self.assertTrue(default_row_fields.drop('varid')._same(rsid_only))
        self.assertTrue(default_row_fields.drop('rsid')._same(varid_only))
        self.assertTrue(
            default_row_fields.drop('varid', 'rsid')._same(no_row_fields))
Example #34
def test_ndarray_eval():
    data_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    mishapen_data_list1 = [[4], [1, 2, 3]]
    mishapen_data_list2 = [[[1], [2, 3]]]
    mishapen_data_list3 = [[4], [1, 2, 3], 5]

    nd_expr = hl.nd.array(data_list)
    evaled = hl.eval(nd_expr)
    np_equiv = np.array(data_list, dtype=np.int32)
    np_equiv_fortran_style = np.asfortranarray(np_equiv)
    np_equiv_extra_dimension = np_equiv.reshape((3, 1, 3))
    assert (np.array_equal(evaled, np_equiv))
    assert (evaled.strides == np_equiv.strides)

    assert hl.eval(hl.nd.array([[], []])).strides == (8, 8)
    assert np.array_equal(hl.eval(hl.nd.array([])), np.array([]))

    zero_array = np.zeros((10, 10), dtype=np.int64)
    evaled_zero_array = hl.eval(hl.literal(zero_array))

    assert np.array_equal(evaled_zero_array, zero_array)
    assert zero_array.dtype == evaled_zero_array.dtype

    # Testing correct interpretation of numpy strides
    assert np.array_equal(hl.eval(hl.literal(np_equiv_fortran_style)),
                          np_equiv_fortran_style)
    assert np.array_equal(hl.eval(hl.literal(np_equiv_extra_dimension)),
                          np_equiv_extra_dimension)

    # Testing from hail arrays
    assert np.array_equal(hl.eval(hl.nd.array(hl.range(6))), np.arange(6))
    assert np.array_equal(hl.eval(hl.nd.array(hl.int64(4))), np.array(4))

    # Testing from nested hail arrays
    assert np.array_equal(
        hl.eval(hl.nd.array(hl.array([hl.array(x) for x in data_list]))),
        np.arange(9).reshape((3, 3)) + 1)

    # Testing missing data
    assert hl.eval(hl.nd.array(hl.null(hl.tarray(hl.tint32)))) is None

    with pytest.raises(ValueError) as exc:
        hl.nd.array(mishapen_data_list1)
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.nd.array(hl.array(mishapen_data_list1)))
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.nd.array(hl.array(mishapen_data_list2)))
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(ValueError) as exc:
        hl.nd.array(mishapen_data_list3)
    assert "inner dimensions do not match" in str(exc.value)
Example #35
 def test_agg_explode(self):
     t = hl.Table.parallelize([
         hl.struct(a=[1, 2]),
         hl.struct(a=hl.empty_array(hl.tint32)),
         hl.struct(a=hl.null(hl.tarray(hl.tint32))),
         hl.struct(a=[3]),
         hl.struct(a=[hl.null(hl.tint32)])
     ])
     self.assertCountEqual(t.aggregate(hl.agg.collect(hl.agg.explode(t.a))),
                           [1, 2, None, 3])
Example #36
    def test_import_bgen_GT_GP_entries(self):
        hl.index_bgen(resource('example.8bits.bgen'),
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        bgen = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['GT', 'GP'],
                              sample_file=resource('example.sample'))
        self.assertEqual(bgen.entry.dtype,
                         hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
Example #37
 def test_agg_explode(self):
     t = hl.Table.parallelize([
         hl.struct(a=[1, 2]),
         hl.struct(a=hl.empty_array(hl.tint32)),
         hl.struct(a=hl.null(hl.tarray(hl.tint32))),
         hl.struct(a=[3]),
         hl.struct(a=[hl.null(hl.tint32)])
     ])
     self.assertCountEqual(t.aggregate(hl.agg.explode(lambda elt: hl.agg.collect(elt), t.a)),
                           [1, 2, None, 3])
Example #38
 def _compute_type(self):
     child_typ = self.child.typ
     if self.product:
         value_type = hl.tarray(self.table.typ.value_type)
     else:
         value_type = self.table.typ.value_type
     self._type = hl.tmatrix(
         child_typ.global_type, child_typ.col_type, child_typ.col_key,
         child_typ.row_type._insert_field(self.root, value_type),
         child_typ.row_key, child_typ.entry_type)
Example #39
 def array_floating_point_divide(arg_type, ret_type):
     register_function("/", (
         arg_type,
         hl.tarray(arg_type),
     ), hl.tarray(ret_type))
     register_function("/", (hl.tarray(arg_type), arg_type),
                       hl.tarray(ret_type))
     register_function("/", (hl.tarray(arg_type), hl.tarray(arg_type)),
                       hl.tarray(ret_type))
Example #40
 def _compute_type(self):
     left_typ = self.left.typ
     right_typ = self.right.typ
     if self.product:
         right_val_typ = left_typ.row_type._insert_field(
             self.root, hl.tarray(right_typ.value_type))
     else:
         right_val_typ = left_typ.row_type._insert_field(
             self.root, right_typ.value_type)
     self._type = hl.ttable(left_typ.global_type, right_val_typ,
                            left_typ.row_key)
Example #41
    def test_loop_memory(self):
        def foo(recur, arr, idx):
            return hl.if_else(idx > 10, arr,
                              recur(arr.append(hl.str(idx)), idx + 1))

        assert hl.eval(
            hl.experimental.loop(foo, hl.tarray(hl.tstr), hl.literal(['foo']),
                                 1)) == [
                                     'foo', '1', '2', '3', '4', '5', '6', '7',
                                     '8', '9', '10'
                                 ]
Example #42
 def test_localize_entries(self):
     ref_schema = hl.tstruct(row_idx=hl.tint32,
                             __entries=hl.tarray(hl.tstruct(v=hl.tint32)))
     ref_data = [{'row_idx': i, '__entries': [{'v': i+j} for j in range(6)]}
                 for i in range(8)]
     ref_tab = hl.Table.parallelize(ref_data, ref_schema).key_by('row_idx')
     ref_tab = ref_tab.select_globals(__cols=[hl.struct(col_idx=i) for i in range(6)])
     mt = hl.utils.range_matrix_table(8, 6)
     mt = mt.annotate_entries(v=mt.row_idx+mt.col_idx)
     t = mt._localize_entries('__entries', '__cols')
     self.assertTrue(t._same(ref_tab))
Example #43
 def test_localize_self_join(self):
     ref_schema = hl.tstruct(row_idx=hl.tint32,
                             __entries=hl.tarray(hl.tstruct(v=hl.tint32)))
     ref_data = [{'row_idx': i, '__entries': [{'v': i+j} for j in range(6)]}
                 for i in range(8)]
     ref_tab = hl.Table.parallelize(ref_data, ref_schema).key_by('row_idx')
     ref_tab = ref_tab.join(ref_tab, how='outer')
     mt = hl.utils.range_matrix_table(8, 6)
     mt = mt.annotate_entries(v=mt.row_idx+mt.col_idx)
     t = mt._localize_entries('__entries', '__cols').drop('__cols')
     t = t.join(t, how='outer')
     self.assertTrue(t._same(ref_tab))
Example #44
def _linreg(y, x, nested_dim):
    k = len(x)
    k0 = nested_dim
    if k0 < 0 or k0 > k:
        raise ValueError("linreg: `nested_dim` must be between 0 and the number "
                         f"of covariates ({k}), inclusive")

    t = hl.tstruct(beta=hl.tarray(hl.tfloat64),
                   standard_error=hl.tarray(hl.tfloat64),
                   t_stat=hl.tarray(hl.tfloat64),
                   p_value=hl.tarray(hl.tfloat64),
                   multiple_standard_error=hl.tfloat64,
                   multiple_r_squared=hl.tfloat64,
                   adjusted_r_squared=hl.tfloat64,
                   f_stat=hl.tfloat64,
                   multiple_p_value=hl.tfloat64,
                   n=hl.tint64)

    x = hl.array(x)
    k = hl.int32(k)
    k0 = hl.int32(k0)

    return _agg_func('LinearRegression', [y, x], t, [k, k0])
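The struct defined above is the result type of the linear-regression aggregator. A small usage sketch (not from the original source) through the public hl.agg.linreg wrapper; the table and field names are assumptions for illustration:

import hail as hl

ht = hl.utils.range_table(100)
ht = ht.annotate(y=2.0 * ht.idx + hl.rand_norm(0, 1))
# x is a list of covariates; the constant 1.0 acts as the intercept term.
fit = ht.aggregate(hl.agg.linreg(ht.y, [1.0, hl.float64(ht.idx)]))
# fit.beta, fit.standard_error, fit.p_value, ... are arrays with one entry per covariate.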
Example #45
    def test_import_bgen_row_fields(self):
        hl.index_bgen(resource('example.8bits.bgen'),
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        default_row_fields = hl.import_bgen(resource('example.8bits.bgen'),
                                            entry_fields=['dosage'])
        self.assertEqual(default_row_fields.row.dtype,
                         hl.tstruct(locus=hl.tlocus('GRCh37'),
                                    alleles=hl.tarray(hl.tstr),
                                    rsid=hl.tstr,
                                    varid=hl.tstr))
        no_row_fields = hl.import_bgen(resource('example.8bits.bgen'),
                                       entry_fields=['dosage'],
                                       _row_fields=[])
        self.assertEqual(no_row_fields.row.dtype,
                         hl.tstruct(locus=hl.tlocus('GRCh37'),
                                    alleles=hl.tarray(hl.tstr)))
        varid_only = hl.import_bgen(resource('example.8bits.bgen'),
                                    entry_fields=['dosage'],
                                    _row_fields=['varid'])
        self.assertEqual(varid_only.row.dtype,
                         hl.tstruct(locus=hl.tlocus('GRCh37'),
                                    alleles=hl.tarray(hl.tstr),
                                    varid=hl.tstr))
        rsid_only = hl.import_bgen(resource('example.8bits.bgen'),
                                   entry_fields=['dosage'],
                                   _row_fields=['rsid'])
        self.assertEqual(rsid_only.row.dtype,
                         hl.tstruct(locus=hl.tlocus('GRCh37'),
                                    alleles=hl.tarray(hl.tstr),
                                    rsid=hl.tstr))

        self.assertTrue(default_row_fields.drop('varid')._same(rsid_only))
        self.assertTrue(default_row_fields.drop('rsid')._same(varid_only))
        self.assertTrue(
            default_row_fields.drop('varid', 'rsid')._same(no_row_fields))
Example #46
    def test_annotate_globals(self):
        mt = hl.utils.range_matrix_table(1, 1)
        ht = hl.utils.range_table(1, 1)
        data = [
            (5, hl.tint, operator.eq),
            (float('nan'), hl.tfloat32, lambda x, y: str(x) == str(y)),
            (float('inf'), hl.tfloat64, lambda x, y: str(x) == str(y)),
            (float('-inf'), hl.tfloat64, lambda x, y: str(x) == str(y)),
            (1.111, hl.tfloat64, operator.eq),
            ([hl.Struct(**{'a': None, 'b': 5}),
              hl.Struct(**{'a': 'hello', 'b': 10})], hl.tarray(hl.tstruct(a=hl.tstr, b=hl.tint)), operator.eq)
        ]

        for x, t, f in data:
            self.assertTrue(f(hl.eval(mt.annotate_globals(foo=hl.literal(x, t)).foo), x), f"{x}, {t}")
            self.assertTrue(f(hl.eval(ht.annotate_globals(foo=hl.literal(x, t)).foo), x), f"{x}, {t}")
Example #47
    def test_aggregate2(self):
        schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)

        rows = [{'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
                {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(
            kt.group_by(status=kt.status)
                .aggregate(
                x1=agg.collect(kt.qPheno * 2),
                x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
                x3=agg.min(kt.qPheno),
                x4=agg.max(kt.qPheno),
                x5=agg.sum(kt.qPheno),
                x6=agg.product(hl.int64(kt.qPheno)),
                x7=agg.count(),
                x8=agg.count_where(kt.qPheno == 3),
                x9=agg.fraction(kt.qPheno == 1),
                x10=agg.stats(hl.float64(kt.qPheno)),
                x11=agg.hardy_weinberg_test(kt.GT),
                x13=agg.inbreeding(kt.GT, 0.1),
                x14=agg.call_stats(kt.GT, ["A", "T"]),
                x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
                x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
                x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
                x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
                x19=agg.take(kt.GT, 1, ordering=-kt.qPheno)
            ).take(1)[0])

        expected = {u'status': 0,
                    u'x13': {u'n_called': 2, u'expected_homs': 1.64, u'f_stat': -1.777777777777777,
                             u'observed_homs': 1},
                    u'x14': {u'AC': [3, 1], u'AF': [0.75, 0.25], u'AN': 4, u'homozygote_count': [1, 0]},
                    u'x15': {u'a': 5, u'c': {u'banana': u'apple'}, u'b': u'foo'},
                    u'x10': {u'min': 3.0, u'max': 13.0, u'sum': 16.0, u'stdev': 5.0, u'n': 2, u'mean': 8.0},
                    u'x8': 1, u'x9': 0.0, u'x16': u'apple',
                    u'x11': {u'het_freq_hwe': 0.5, u'p_value': 0.5},
                    u'x2': [3, 4, 13, 14], u'x3': 3, u'x1': [6, 26], u'x6': 39, u'x7': 2, u'x4': 13, u'x5': 16,
                    u'x17': [],
                    u'x18': [],
                    u'x19': [hl.Call([0, 1])]}

        self.maxDiff = None

        self.assertDictEqual(result, expected)
Example #48
0
 def values(self):
     values = [
         (hl.tbool, True),
         (hl.tint32, 0),
         (hl.tint64, 0),
         (hl.tfloat32, 0.5),
         (hl.tfloat64, 0.5),
         (hl.tstr, "foo"),
         (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
         (hl.tarray(hl.tint32), [0, 1, 4]),
         (hl.tset(hl.tint32), {0, 1, 4}),
         (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
         (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
         (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
         (hl.tcall, hl.Call([0, 1]))
     ]
     return values
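
A hypothetical consumer of such (type, value) pairs, checking that each value survives a literal round-trip; the subset below is illustrative and assumes an initialized Hail session:

import hail as hl

pairs = [(hl.tint32, 0),
         (hl.tstr, "foo"),
         (hl.tarray(hl.tint32), [0, 1, 4])]
for t, v in pairs:
    # hl.literal builds a typed expression; hl.eval runs it back to a Python value.
    assert hl.eval(hl.literal(v, t)) == v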
Example #49
0
    def test_multi_way_zip_join(self):
        d1 = [{"id": 0, "name": "a", "data": 0.0},
              {"id": 1, "name": "b", "data": 3.14},
              {"id": 2, "name": "c", "data": 2.78}]
        d2 = [{"id": 0, "name": "d", "data": 1.1},
              {"id": 0, "name": "x", "data": 2.2},
              {"id": 2, "name": "v", "data": 7.89}]
        d3 = [{"id": 1, "name": "f", "data":  9.99},
              {"id": 2, "name": "g", "data": -1.0},
              {"id": 3, "name": "z", "data":  0.01}]
        s = hl.tstruct(id=hl.tint32, name=hl.tstr, data=hl.tfloat64)
        ts = [hl.Table.parallelize(r, schema=s, key='id') for r in [d1, d2, d3]]
        joined = hl.Table._multi_way_zip_join(ts, '__data', '__globals').drop('__globals')
        dexpected = [{"id": 0, "__data": [{"name": "a", "data": 0.0},
                                          {"name": "d", "data": 1.1},
                                          None]},
                     {"id": 0, "__data": [None,
                                          {"name": "x", "data": 2.2},
                                          None]},
                     {"id": 1, "__data": [{"name": "b", "data": 3.14},
                                          None,
                                          {"name": "f", "data":  9.99}]},
                     {"id": 2, "__data": [{"name": "c", "data": 2.78},
                                          {"name": "v", "data": 7.89},
                                          {"name": "g", "data": -1.0}]},
                     {"id": 3, "__data": [None,
                                          None,
                                          {"name": "z", "data":  0.01}]}]
        expected = hl.Table.parallelize(
            dexpected,
            schema=hl.tstruct(id=hl.tint32, __data=hl.tarray(hl.tstruct(name=hl.tstr, data=hl.tfloat64))),
            key='id')
        self.assertTrue(expected._same(joined))

        expected2 = expected.transmute(data=expected['__data'])
        joined_same_name = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('globals')
        self.assertTrue(expected2._same(joined_same_name))

        joined_nothing = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('data', 'globals')
        self.assertEqual(joined_nothing._force_count(), 5)
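
A condensed sketch of the same call on two hypothetical single-key tables (field names are illustrative; note that _multi_way_zip_join is an internal method and may change):

import hail as hl

t1 = hl.Table.parallelize([{'id': 0, 'v': 1}, {'id': 1, 'v': 2}],
                          schema=hl.tstruct(id=hl.tint32, v=hl.tint32), key='id')
t2 = hl.Table.parallelize([{'id': 1, 'v': 3}],
                          schema=hl.tstruct(id=hl.tint32, v=hl.tint32), key='id')
zipped = hl.Table._multi_way_zip_join([t1, t2], 'data', 'globals')
# Each output row is keyed by 'id'; 'data' holds one element per input table,
# missing wherever that table has no row for the key.
zipped.show()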
Example #50
0
    def blockmatrix_irs(self):
        scalar_ir = ir.F64(2)
        vector_ir = ir.MakeArray([ir.F64(3), ir.F64(2)], hl.tarray(hl.tfloat64))

        read = ir.BlockMatrixRead(ir.BlockMatrixNativeReader(resource('blockmatrix_example/0')))
        add_two_bms = ir.BlockMatrixMap2(read, read, ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r')))
        negate_bm = ir.BlockMatrixMap(read, ir.ApplyUnaryPrimOp('-', ir.Ref('element')))
        sqrt_bm = ir.BlockMatrixMap(read, hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir)

        scalar_to_bm = ir.ValueToBlockMatrix(scalar_ir, [1, 1], 1)
        col_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [2, 1], 1)
        row_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [1, 2], 1)
        broadcast_scalar = ir.BlockMatrixBroadcast(scalar_to_bm, [], [2, 2], 256)
        broadcast_col = ir.BlockMatrixBroadcast(col_vector_to_bm, [0], [2, 2], 256)
        broadcast_row = ir.BlockMatrixBroadcast(row_vector_to_bm, [1], [2, 2], 256)
        transpose = ir.BlockMatrixBroadcast(broadcast_scalar, [1, 0], [2, 2], 256)
        matmul = ir.BlockMatrixDot(broadcast_scalar, transpose)

        pow_ir = (construct_expr(ir.Ref('l'), hl.tfloat64) ** construct_expr(ir.Ref('r'), hl.tfloat64))._ir
        squared_bm = ir.BlockMatrixMap2(scalar_to_bm, scalar_to_bm, pow_ir)

        return [
            read,
            add_two_bms,
            negate_bm,
            sqrt_bm,
            scalar_to_bm,
            col_vector_to_bm,
            row_vector_to_bm,
            broadcast_scalar,
            broadcast_col,
            broadcast_row,
            squared_bm,
            transpose,
            matmul
        ]
Example #51
0
 def test_str_annotation_regression(self):
     t = hl.Table.parallelize([{'alleles': ['A', 'T']}],
                              hl.tstruct(alleles=hl.tarray(hl.tstr)))
     t = t.annotate(ref=t.alleles[0])
     t._force_count()
Example #52
0
    def test_annotate(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
                {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
                {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]}]

        kt = hl.Table.parallelize(rows, schema)

        self.assertTrue(kt.annotate()._same(kt))

        result1 = convert_struct_to_dict(kt.annotate(foo=kt.a + 1,
                                                     foo2=kt.a).take(1)[0])

        self.assertDictEqual(result1, {'a': 4,
                                       'b': 1,
                                       'c': 3,
                                       'd': 5,
                                       'e': "hello",
                                       'f': [1, 2, 3],
                                       'foo': 5,
                                       'foo2': 4})

        result3 = convert_struct_to_dict(kt.annotate(
            x1=kt.f.map(lambda x: x * 2),
            x2=kt.f.map(lambda x: [x, x + 1]).flatmap(lambda x: x),
            x3=hl.min(kt.f),
            x4=hl.max(kt.f),
            x5=hl.sum(kt.f),
            x6=hl.product(kt.f),
            x7=kt.f.length(),
            x8=kt.f.filter(lambda x: x == 3),
            x9=kt.f[1:],
            x10=kt.f[:],
            x11=kt.f[1:2],
            x12=kt.f.map(lambda x: [x, x + 1]),
            x13=kt.f.map(lambda x: [[x, x + 1], [x + 2]]).flatmap(lambda x: x),
            x14=hl.cond(kt.a < kt.b, kt.c, hl.null(hl.tint32)),
            x15={1, 2, 3}
        ).take(1)[0])

        self.assertDictEqual(result3, {'a': 4,
                                       'b': 1,
                                       'c': 3,
                                       'd': 5,
                                       'e': "hello",
                                       'f': [1, 2, 3],
                                       'x1': [2, 4, 6], 'x2': [1, 2, 2, 3, 3, 4],
                                       'x3': 1, 'x4': 3, 'x5': 6, 'x6': 6, 'x7': 3, 'x8': [3],
                                       'x9': [2, 3], 'x10': [1, 2, 3], 'x11': [2],
                                       'x12': [[1, 2], [2, 3], [3, 4]],
                                       'x13': [[1, 2], [3], [2, 3], [4], [3, 4], [5]],
                                       'x14': None, 'x15': set([1, 2, 3])})
        kt.annotate(
            x1=kt.a + 5,
            x2=5 + kt.a,
            x3=kt.a + kt.b,
            x4=kt.a - 5,
            x5=5 - kt.a,
            x6=kt.a - kt.b,
            x7=kt.a * 5,
            x8=5 * kt.a,
            x9=kt.a * kt.b,
            x10=kt.a / 5,
            x11=5 / kt.a,
            x12=kt.a / kt.b,
            x13=-kt.a,
            x14=+kt.a,
            x15=kt.a == kt.b,
            x16=kt.a == 5,
            x17=5 == kt.a,
            x18=kt.a != kt.b,
            x19=kt.a != 5,
            x20=5 != kt.a,
            x21=kt.a > kt.b,
            x22=kt.a > 5,
            x23=5 > kt.a,
            x24=kt.a >= kt.b,
            x25=kt.a >= 5,
            x26=5 >= kt.a,
            x27=kt.a < kt.b,
            x28=kt.a < 5,
            x29=5 < kt.a,
            x30=kt.a <= kt.b,
            x31=kt.a <= 5,
            x32=5 <= kt.a,
            x33=(kt.a == 0) & (kt.b == 5),
            x34=(kt.a == 0) | (kt.b == 5),
            x35=False,
            x36=True
        )
Example #53
0
    def test_filter(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
                {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
                {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]}]

        kt = hl.Table.parallelize(rows, schema)

        self.assertEqual(kt.filter(kt.a == 4).count(), 2)
        self.assertEqual(kt.filter((kt.d == -1) | (kt.c == 20) | (kt.e == "hello")).count(), 3)
        self.assertEqual(kt.filter((kt.c != 20) & (kt.a == 4)).count(), 1)
        self.assertEqual(kt.filter(True).count(), 3)
Example #54
0
    def test_transmute(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32),
                            g=hl.tstruct(x=hl.tbool, y=hl.tint32))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}},
                {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}},
                {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None}]
        df = hl.Table.parallelize(rows, schema)

        df = df.transmute(h=df.a + df.b + df.c + df.g.y)
        r = df.select('h').collect()

        self.assertEqual(list(df.row), ['d', 'e', 'f', 'h'])
        self.assertEqual(r, [hl.Struct(h=x) for x in [10, 20, None]])
Example #55
0
    def test_select(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32),
                            g=hl.tstruct(x=hl.tbool, y=hl.tint32))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}},
                {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}},
                {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None}]

        kt = hl.Table.parallelize(rows, schema)

        t1 = kt.select(kt.a, kt.e)
        self.assertEqual(list(t1.row), ['a', 'e'])
        self.assertEqual(list(t1.key), [])

        t2 = kt.key_by('e')
        t2 = t2.select(t2.a)
        self.assertEqual(list(t2.row), ['e', 'a'])
        self.assertEqual(list(t2.key), ['e'])

        self.assertEqual(list(kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d).row), ['a', 'foo'])
        self.assertEqual(list(kt.select(kt.a, foo=kt.a + kt.b - kt.c - kt.d, **kt.g).row), ['a', 'foo', 'x', 'y'])

        # select no fields
        s = kt.select()
        self.assertEqual(list(s.row), [])
        self.assertEqual(list(s.key), [])
Example #56
0
 def floating_point_divide(arg_type, ret_type):
     register_function("/", (arg_type, hl.tarray(arg_type),), hl.tarray(ret_type))
     register_function("/", (hl.tarray(arg_type),arg_type), hl.tarray(ret_type))
     register_function("/", (hl.tarray(arg_type),hl.tarray(arg_type)), hl.tarray(ret_type))
Example #57
0
    def value_irs(self):
        b = ir.TrueIR()
        c = ir.Ref('c')
        i = ir.I32(5)
        j = ir.I32(7)
        st = ir.Str('Hail')
        a = ir.Ref('a')
        aa = ir.Ref('aa')
        da = ir.Ref('da')
        v = ir.Ref('v')
        s = ir.Ref('s')
        t = ir.Ref('t')
        call = ir.Ref('call')

        table = ir.TableRange(5, 3)

        collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])

        call_stats_sig = ir.AggSignature('CallStats', [], [hl.tint32], [hl.tcall])

        hist_sig = ir.AggSignature(
            'Histogram', [hl.tfloat64, hl.tfloat64, hl.tint32], None, [hl.tfloat64])

        take_by_sig = ir.AggSignature('TakeBy', [hl.tint32], None, [hl.tfloat64, hl.tfloat64])

        table = ir.TableRange(10, 4)

        value_irs = [
            i, ir.I64(5), ir.F32(3.14), ir.F64(3.14), s, ir.TrueIR(), ir.FalseIR(), ir.Void(),
            ir.Cast(i, hl.tfloat64),
            ir.NA(hl.tint32),
            ir.IsNA(i),
            ir.If(b, i, j),
            ir.Let('v', i, v),
            ir.Ref('x'),
            ir.ApplyBinaryOp('+', i, j),
            ir.ApplyUnaryOp('-', i),
            ir.ApplyComparisonOp('EQ', i, j),
            ir.MakeArray([i, ir.NA(hl.tint32), ir.I32(-3)], hl.tarray(hl.tint32)),
            ir.ArrayRef(a, i),
            ir.ArrayLen(a),
            ir.ArrayRange(ir.I32(0), ir.I32(5), ir.I32(1)),
            ir.ArraySort(a, b, False),
            ir.ToSet(a),
            ir.ToDict(da),
            ir.ToArray(a),
            ir.LowerBoundOnOrderedCollection(a, i, True),
            ir.GroupByKey(da),
            ir.ArrayMap(a, 'v', v),
            ir.ArrayFilter(a, 'v', v),
            ir.ArrayFlatMap(aa, 'v', v),
            ir.ArrayFold(a, ir.I32(0), 'x', 'v', v),
            ir.ArrayScan(a, ir.I32(0), 'x', 'v', v),
            ir.ArrayFor(a, 'v', ir.Void()),
            ir.AggFilter(ir.TrueIR(), ir.I32(0)),
            ir.AggExplode(ir.ArrayRange(ir.I32(0), ir.I32(2), ir.I32(1)), 'x', ir.I32(0)),
            ir.AggGroupBy(ir.TrueIR(), ir.I32(0)),
            ir.ApplyAggOp([], None, [ir.I32(0)], collect_sig),
            ir.ApplyScanOp([], None, [ir.I32(0)], collect_sig),
            ir.ApplyAggOp([ir.F64(-5.0), ir.F64(5.0), ir.I32(100)], None, [ir.F64(-2.11)], hist_sig),
            ir.ApplyAggOp([], [ir.I32(2)], [call], call_stats_sig),
            ir.ApplyAggOp([ir.I32(10)], None, [ir.F64(-2.11), ir.F64(-2.11)], take_by_sig),
            ir.InitOp(ir.I32(0), [ir.I32(2)], call_stats_sig),
            ir.SeqOp(ir.I32(0), [i], collect_sig),
            ir.SeqOp(ir.I32(0), [ir.F64(-2.11), ir.I32(17)], take_by_sig),
            ir.Begin([ir.Void()]),
            ir.MakeStruct([('x', i)]),
            ir.SelectFields(s, ['x', 'z']),
            ir.InsertFields(s, [('x', i)]),
            ir.GetField(s, 'x'),
            ir.MakeTuple([i, b]),
            ir.GetTupleElement(t, 1),
            ir.StringSlice(st, ir.I32(1), ir.I32(2)),
            ir.StringLength(st),
            ir.In(2, hl.tfloat64),
            ir.Die('mumblefoo', hl.tfloat64),
            ir.Apply('&&', b, c),
            ir.Apply('toFloat64', i),
            ir.Uniroot('x', ir.F64(3.14), ir.F64(-5.0), ir.F64(5.0)),
            ir.Literal(hl.tarray(hl.tint32), [1, 2, None]),
            ir.TableCount(table),
            ir.TableAggregate(table, ir.MakeStruct([('foo', ir.ApplyAggOp([], None, [ir.I32(0)], collect_sig))])),
            ir.TableWrite(table, new_temp_file(), False, True, "fake_codec_spec$$"),
        ]

        return value_irs
Example #58
0
    def test_mendel_errors(self):
        mt = hl.import_vcf(resource('mendel.vcf'))
        ped = hl.Pedigree.read(resource('mendel.fam'))

        men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

        self.assertEqual(men.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   s=hl.tstr))
        self.assertEqual(men.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   s=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   mendel_code=hl.tint))
        self.assertEqual(fam.key.dtype, hl.tstruct(pat_id=hl.tstr,
                                                   mat_id=hl.tstr))
        self.assertEqual(fam.row.dtype, hl.tstruct(pat_id=hl.tstr,
                                                   mat_id=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   children=hl.tint,
                                                   errors=hl.tint64,
                                                   snp_errors=hl.tint64))
        self.assertEqual(ind.key.dtype, hl.tstruct(s=hl.tstr))
        self.assertEqual(ind.row.dtype, hl.tstruct(s=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   errors=hl.tint64,
                                                   snp_errors=hl.tint64))
        self.assertEqual(var.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr)))
        self.assertEqual(var.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   errors=hl.tint64))

        self.assertEqual(men.count(), 41)
        self.assertEqual(fam.count(), 2)
        self.assertEqual(ind.count(), 7)
        self.assertEqual(var.count(), mt.count_rows())

        self.assertEqual(set(fam.select('children', 'errors', 'snp_errors').collect()),
                         {
                             hl.utils.Struct(pat_id='Dad1', mat_id='Mom1', children=2,
                                             errors=41, snp_errors=39),
                             hl.utils.Struct(pat_id='Dad2', mat_id='Mom2', children=1,
                                             errors=0, snp_errors=0)
                         })

        self.assertEqual(set(ind.select('errors', 'snp_errors').collect()),
                         {
                             hl.utils.Struct(s='Son1', errors=23, snp_errors=22),
                             hl.utils.Struct(s='Dtr1', errors=18, snp_errors=17),
                             hl.utils.Struct(s='Dad1', errors=19, snp_errors=18),
                             hl.utils.Struct(s='Mom1', errors=22, snp_errors=21),
                             hl.utils.Struct(s='Dad2', errors=0, snp_errors=0),
                             hl.utils.Struct(s='Mom2', errors=0, snp_errors=0),
                             hl.utils.Struct(s='Son2', errors=0, snp_errors=0)
                         })

        to_keep = hl.set([
            (hl.Locus("1", 1), ['C', 'CT']),
            (hl.Locus("1", 2), ['C', 'T']),
            (hl.Locus("X", 1), ['C', 'T']),
            (hl.Locus("X", 3), ['C', 'T']),
            (hl.Locus("Y", 1), ['C', 'T']),
            (hl.Locus("Y", 3), ['C', 'T'])
        ])
        self.assertEqual(var.filter(to_keep.contains((var.locus, var.alleles)))
                         .order_by('locus')
                         .select('locus', 'alleles', 'errors').collect(),
                         [
                             hl.utils.Struct(locus=hl.Locus("1", 1), alleles=['C', 'CT'], errors=2),
                             hl.utils.Struct(locus=hl.Locus("1", 2), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("X", 1), alleles=['C', 'T'], errors=2),
                             hl.utils.Struct(locus=hl.Locus("X", 3), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("Y", 1), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("Y", 3), alleles=['C', 'T'], errors=1),
                         ])

        ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
        men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

        self.assertTrue(men2.filter(men2.s == 'Dtr1')._same(men.filter(men.s == 'Dtr1')))
Example #59
0
 def _compute_type(self):
     child_typ = self.child.typ
     self._type = hl.ttable(child_typ.global_type._insert_field(self.cols_field_name, hl.tarray(child_typ.col_type)),
                            child_typ.row_type._insert_field(self.entries_field_name,
                                                             hl.tarray(child_typ.entry_type)),
                            child_typ.row_key)
Example #60
0
 def _compute_type(self):
     name = self.config['name']
     child_typ = self.child.typ
     if name == 'LinearRegressionRowsChained':
         pass_through = self.config['passThrough']
         chained_schema = hl.dtype(
             'struct{n:array<int32>,sum_x:array<float64>,y_transpose_x:array<array<float64>>,beta:array<array<float64>>,standard_error:array<array<float64>>,t_stat:array<array<float64>>,p_value:array<array<float64>>}')
         self._type = hl.ttable(
             child_typ.global_type,
             (child_typ.row_key_type
              ._insert_fields(**{f: child_typ.row_type[f] for f in pass_through})
              ._concat(chained_schema)),
             child_typ.row_key)
     elif name == 'LinearRegressionRowsSingle':
         pass_through = self.config['passThrough']
         chained_schema = hl.dtype(
             'struct{n:int32,sum_x:float64,y_transpose_x:array<float64>,beta:array<float64>,standard_error:array<float64>,t_stat:array<float64>,p_value:array<float64>}')
         self._type = hl.ttable(
             child_typ.global_type,
             (child_typ.row_key_type
              ._insert_fields(**{f: child_typ.row_type[f] for f in pass_through})
              ._concat(chained_schema)),
             child_typ.row_key)
     elif name == 'LogisticRegression':
         pass_through = self.config['passThrough']
         logreg_type = hl.tstruct(logistic_regression=hl.tarray(regression_test_type(self.config['test'])))
         self._type = hl.ttable(
             child_typ.global_type,
             (child_typ.row_key_type
              ._insert_fields(**{f: child_typ.row_type[f] for f in pass_through})
              ._concat(logreg_type)),
             child_typ.row_key)
     elif name == 'PoissonRegression':
         pass_through = self.config['passThrough']
         poisreg_type = regression_test_type(self.config['test'])
         self._type = hl.ttable(
             child_typ.global_type,
             (child_typ.row_key_type
              ._insert_fields(**{f: child_typ.row_type[f] for f in pass_through})
              ._concat(poisreg_type)),
             child_typ.row_key)
     elif name == 'Skat':
         key_field = self.config['keyField']
         key_type = child_typ.row_type[key_field]
         skat_type = hl.dtype(f'struct{{id:{key_type},size:int32,q_stat:float64,p_value:float64,fault:int32}}')
         self._type = hl.ttable(
             hl.tstruct(),
             skat_type,
             ['id'])
     elif name == 'PCA':
         self._type = hl.ttable(
             hl.tstruct(eigenvalues=hl.tarray(hl.tfloat64),
                        scores=hl.tarray(child_typ.col_key_type._insert_field('scores', hl.tarray(hl.tfloat64)))),
             child_typ.row_key_type._insert_field('loadings', hl.dtype('array<float64>')),
             child_typ.row_key)
     else:
         assert name == 'LocalLDPrune', name
         self._type = hl.ttable(
             hl.tstruct(),
             child_typ.row_key_type._insert_fields(mean=hl.tfloat64, centered_length_rec=hl.tfloat64),
             list(child_typ.row_key))