Beispiel #1
0
 def _compute_type(self):
     """Work out the table type this node produces and cache it in ``self._type``.

     The globals and the key are taken from the child unchanged. The row type
     is the child's row key type, extended first with the row fields named in
     ``self.config['passThrough']`` and then with the result schema of the
     regression selected by ``self.config['name']``.
     """
     method_name = self.config['name']
     child_typ = self.child.typ
     carried_fields = self.config['passThrough']

     # The supported methods differ only in the schema of their result fields.
     if method_name == 'LinearRegressionRowsChained':
         result_schema = hl.dtype(
             'struct{n:array<int32>,sum_x:array<float64>,y_transpose_x:array<array<float64>>,beta:array<array<float64>>,standard_error:array<array<float64>>,t_stat:array<array<float64>>,p_value:array<array<float64>>}'
         )
     elif method_name == 'LinearRegressionRowsSingle':
         result_schema = hl.dtype(
             'struct{n:int32,sum_x:float64,y_transpose_x:array<float64>,beta:array<float64>,standard_error:array<float64>,t_stat:array<float64>,p_value:array<float64>}'
         )
     else:
         # Any other name is treated as a programming error.
         assert method_name == 'LogisticRegression', method_name
         per_phenotype = regression_test_type(self.config['test'])
         result_schema = hl.tstruct(logistic_regression=hl.tarray(per_phenotype))

     carried_types = {
         field: child_typ.row_type[field] for field in carried_fields
     }
     keyed_row = child_typ.row_key_type._insert_fields(**carried_types)
     self._type = hl.ttable(child_typ.global_type,
                            keyed_row._concat(result_schema),
                            child_typ.row_key)
Beispiel #2
0
 def _compute_type(self):
     """Work out the table type this node produces and cache it in ``self._type``.

     ``self.config['name']`` selects the method. The regression methods keep
     the child's globals and row key and emit the row key fields, the
     ``passThrough`` row fields and the method's result schema. ``Skat``,
     ``PCA`` and ``LocalLDPrune`` each build their own globals/row/key layout.
     Any other name trips the assertion in the final branch.
     """
     method = self.config['name']
     child_typ = self.child.typ
     regression_methods = ('LinearRegressionRowsChained',
                           'LinearRegressionRowsSingle',
                           'LogisticRegression',
                           'PoissonRegression')

     if method in regression_methods:
         # 'passThrough' is only read for the regression methods.
         carried_fields = self.config['passThrough']
         if method == 'LinearRegressionRowsChained':
             result_schema = hl.dtype('struct{n:array<int32>,sum_x:array<float64>,y_transpose_x:array<array<float64>>,beta:array<array<float64>>,standard_error:array<array<float64>>,t_stat:array<array<float64>>,p_value:array<array<float64>>}')
         elif method == 'LinearRegressionRowsSingle':
             result_schema = hl.dtype('struct{n:int32,sum_x:float64,y_transpose_x:array<float64>,beta:array<float64>,standard_error:array<float64>,t_stat:array<float64>,p_value:array<float64>}')
         elif method == 'LogisticRegression':
             result_schema = hl.tstruct(
                 logistic_regression=hl.tarray(regression_test_type(self.config['test'])))
         else:
             # 'PoissonRegression': the test type is used directly, unwrapped.
             result_schema = regression_test_type(self.config['test'])

         carried_types = {f: child_typ.row_type[f] for f in carried_fields}
         keyed_row = child_typ.row_key_type._insert_fields(**carried_types)
         self._type = hl.ttable(child_typ.global_type,
                                keyed_row._concat(result_schema),
                                child_typ.row_key)
     elif method == 'Skat':
         # The output is keyed by 'id', typed like the configured key field.
         key_type = child_typ.row_type[self.config['keyField']]
         row_schema = hl.dtype(f'struct{{id:{key_type},size:int32,q_stat:float64,p_value:float64,fault:int32}}')
         self._type = hl.ttable(hl.tstruct(), row_schema, ['id'])
     elif method == 'PCA':
         score_entry = child_typ.col_key_type._insert_field(
             'scores', hl.tarray(hl.tfloat64))
         global_schema = hl.tstruct(eigenvalues=hl.tarray(hl.tfloat64),
                                    scores=hl.tarray(score_entry))
         row_schema = child_typ.row_key_type._insert_field(
             'loadings', dtype('array<float64>'))
         self._type = hl.ttable(global_schema, row_schema, child_typ.row_key)
     else:
         assert method == 'LocalLDPrune', method
         row_schema = child_typ.row_key_type._insert_fields(
             mean=hl.tfloat64, centered_length_rec=hl.tfloat64)
         self._type = hl.ttable(hl.tstruct(), row_schema,
                                list(child_typ.row_key))
Beispiel #3
0
 def test_transmute_key(self):
     """Check the row type after ``transmute`` on a keyed and an unkeyed table.

     With the range table's key in place the result keeps ``idx``; after
     ``key_by()`` with no fields only the new field ``y`` is expected.
     """
     keyed = hl.utils.range_table(10)
     keyed_row_type = keyed.transmute(y=keyed.idx + 2).row.dtype
     self.assertEqual(keyed_row_type,
                      hl.dtype('struct{idx: int32, y: int32}'))

     unkeyed = keyed.key_by()
     unkeyed_row_type = unkeyed.transmute(y=unkeyed.idx + 2).row.dtype
     self.assertEqual(unkeyed_row_type, hl.dtype('struct{y: int32}'))
Beispiel #4
0
 def test_uniqueness(self):
     """Check the annotation types added for the two test datasets.

     ``unique_dataset`` is expected to be annotated as a single struct and
     ``nonunique_dataset`` as an array of that struct.
     """
     annotation_db = hl.experimental.DB(region='us',
                                        cloud='gcp',
                                        config=AnnotationDBTests.db_json)

     # Key a small range table by locus so it can be annotated.
     base = hl.utils.range_table(10)
     keyed = base.key_by(locus=hl.locus('1', base.idx + 1))
     annotated = annotation_db.annotate_rows_db(keyed, 'unique_dataset',
                                                'nonunique_dataset')

     single_type = hl.dtype('struct{idx: int32, annotation: str}')
     assert annotated.unique_dataset.dtype == single_type
     array_type = hl.dtype('array<struct{idx: int32, annotation: str}>')
     assert annotated.nonunique_dataset.dtype == array_type
Beispiel #5
0
def generate_interval_list_ht(genome_ref: str = 'GRCh38') -> hl.Table:
    """
    Build the union of all capture interval tables.

    :param genome_ref: Reference genome name; stored in the globals and used
        to check the type of the resulting ``interval`` key.
    :return: A joint table (union) of intervals, keyed by ``interval``, whose
        globals are replaced by the annotations assembled here
    """
    source_tables = [
        get_ssv2_intervals_ht(),
        get_ssv3_intervals_ht(),
        get_ssv4_intervals_ht(),
        get_ssv5_intervals_ht(),
        get_idt_xgen_intervals_ht(),
    ]

    # Read the per-table annotations up front: the tables are reduced to
    # their key and the union's globals are cleared further down.
    sources = [tbl.source.collect()[0] for tbl in source_tables]
    platform_labels = [
        tbl.platform_label.collect()[0] for tbl in source_tables
    ]
    global_values = (current_date(), sources, genome_ref, platform_labels)
    global_ann_expr = {
        field: value
        for field, value in zip(GLOBAL_ANNOTATION_FIELDS, global_values)
    }

    # Keep only the interval <key> field of every table before the union.
    key_only_tables = [
        tbl.key_by('interval').select() for tbl in source_tables
    ]
    ht_interval = hl.Table.union(*key_only_tables)
    ht_interval = ht_interval.select_globals()
    ht_interval = ht_interval.annotate_globals(**global_ann_expr)

    assert ht_interval.key.interval.dtype == hl.dtype(
        f'interval<locus<{genome_ref}>>')

    return ht_interval
Beispiel #6
0
def calculate_phenotypes(mt,
                         genotype,
                         beta,
                         h2,
                         popstrat=None,
                         popstrat_var=None):
    """Calculates phenotypes by multiplying genotypes and betas.

    The noise-free phenotype ``y_no_noise`` is the per-column sum of
    ``beta * norm_gt``. ``y`` adds noise drawn with
    ``hl.rand_norm(0, hl.sqrt(1 - h2))``. When `popstrat` is given,
    ``y_w_popstrat`` additionally adds the (optionally rescaled)
    stratification term.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression`
        Entry field of genotypes.
    beta : :class:`.Expression`
        Row field of SNP effects. If its type is ``array<float64>`` the
        multi-trait path is taken and `h2` is treated as a list.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`
        SNP-based heritability (:math:`h^2`) of simulated trait. A scalar is
        wrapped in a one-element list on the multi-trait path.
        NOTE(review): ``1 - h2`` is computed in both branches, so ``None``
        does not look usable here -- confirm against callers.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term. When given, the term is
        scaled by ``sqrt(popstrat_var) / stdev(popstrat)``.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.

    Raises
    ------
    AssertionError
        If `popstrat_var` is negative.
    """
    assert popstrat_var is None or (popstrat_var >=
                                    0), 'popstrat_var must be non-negative'
    # "temporary id" -- random string to identify temporary intermediate
    # fields generated by this method
    tid = ''.join(
        random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5))
    mt = annotate_all(
        mt=mt,
        row_exprs={'beta_' + tid: beta},
        col_exprs={} if popstrat is None else {'popstrat_' + tid: popstrat},
        entry_exprs={'gt_' + tid: genotype})
    mt = normalize_genotypes(mt['gt_' + tid])
    if mt['beta_' + tid].dtype == dtype('array<float64>'):  # if >1 traits
        # isinstance rather than an exact type check, so that list
        # subclasses are used as-is instead of being wrapped in another list.
        h2 = h2 if isinstance(h2, list) else [h2]
        mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
            lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' + tid]))
        mt = mt.annotate_cols(
            y=mt.y_no_noise +
            hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + tid] *
                                                    mt['norm_gt']))
        mt = mt.annotate_cols(y=mt.y_no_noise +
                              hl.rand_norm(0, hl.sqrt(1 - h2)))
    if popstrat is not None:
        # Rescale the stratification term to the requested variance, or use
        # it unscaled when no variance was requested.
        var_factor = 1 if popstrat_var is None else (popstrat_var**(
            1 / 2)) / mt.aggregate_cols(hl.agg.stats(mt['popstrat_' +
                                                        tid])).stdev
        mt = mt.annotate_cols(y_w_popstrat=mt.y +
                              mt['popstrat_' + tid] * var_factor)
    # presumably removes the temporary '<name>_<tid>' fields -- helper not
    # visible here
    mt = _clean_fields(mt, tid)
    return mt
Beispiel #7
0
def normalize_genotypes(genotype):
    r"""Normalizes genotypes to have mean 0 and variance 1 at each SNP

    Each entry becomes ``(gt - mean) / stdev``, where ``mean`` and ``stdev``
    come from ``hl.agg.stats`` computed per row. Call-typed input is first
    converted with ``n_alt_alleles()``.

    Parameters
    ----------
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field of genotypes.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with normalized genotypes in entry field
        ``norm_gt``.
    """
    # tid = "temporary id" -- random string to identify temporary intermediate fields generated by this method
    tid = _get_tid()
    mt = genotype._indices.source
    # Compare types by equality: an identity test only works if the type
    # parser happens to hand back the very same object on every call.
    is_call = genotype.dtype == dtype('call')
    mt = mt.annotate_entries(
        **{'gt_' + tid: genotype.n_alt_alleles() if is_call else genotype})
    mt = mt.annotate_rows(**{'gt_stats_' + tid: hl.agg.stats(mt['gt_' + tid])})
    # TODO: Add MAF filter to remove invariant SNPs?
    mt = mt.annotate_entries(
        norm_gt=(mt['gt_' + tid] - mt['gt_stats_' + tid].mean) /
        mt['gt_stats_' + tid].stdev)
    # presumably drops the temporary '<name>_<tid>' fields -- helper not
    # visible here
    mt = _clean_fields(mt, tid)
    return mt
Beispiel #8
0
def normalize_genotypes(genotype):
    r"""Normalizes genotypes to have mean 0 and variance 1 at each SNP

    Each entry becomes ``(gt - mean) / stdev``, where ``mean`` and ``stdev``
    come from ``hl.agg.stats`` computed per row. Call-typed input is first
    converted with ``n_alt_alleles()``.

    Parameters
    ----------
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field of genotypes.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with normalized genotypes in entry field
        ``norm_gt``.
    """
    # "temporary id" -- random string to identify temporary intermediate
    # fields generated by this method
    tid = ''.join(
        random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5))
    mt = genotype._indices.source
    # Compare types by equality: an identity test only works if the type
    # parser happens to hand back the very same object on every call.
    is_call = genotype.dtype == dtype('call')
    mt = mt.annotate_entries(
        **{'gt_' + tid: genotype.n_alt_alleles() if is_call else genotype})
    mt = mt.annotate_rows(**{'gt_stats_' + tid: hl.agg.stats(mt['gt_' + tid])})
    mt = mt.annotate_entries(
        norm_gt=(mt['gt_' + tid] - mt['gt_stats_' + tid].mean) /
        mt['gt_stats_' + tid].stdev)
    # presumably drops the temporary '<name>_<tid>' fields -- helper not
    # visible here
    mt = _clean_fields(mt, tid)
    return mt
Beispiel #9
0
def annotate_phen(tb, phen, sex, phen_tb_dict, filter_to_phen=True):
    r'''
    Annotates `tb` with phenotype `phen` as field ``phen`` and, when
    `filter_to_phen` is true, filters to individuals with phenotype defined.
    Uses sex-specific IRNT phenotypes.

    tb: hail Table (annotated/filtered by row) or MatrixTable (annotated/
        filtered by column); indexed into the phenotype table through ``tb.s``.
    phen: phenotype code; a field of the phenotype table and a key of the
        module-level ``phen_dict`` used for the progress message.
    sex: key into `phen_tb_dict`. sex options: female, male, both_sexes
    phen_tb_dict: mapping from sex to phenotype table.
    filter_to_phen: if False, no individuals are removed.

    Returns the annotated table; ``phen`` is bool if the phenotype column is
    bool, otherwise float64.
    Raises TypeError if `tb` is neither a Table nor a MatrixTable.
    '''
    print(
        f'\n... Reading UKB phenotype "{phen_dict[phen][0]}" for {sex} (code: {phen}) ...'
    )

    phen_tb0 = phen_tb_dict[sex]
    phen_tb = phen_tb0.select(phen).rename({phen: 'phen'})

    if isinstance(tb, hl.table.Table):
        annotate_fn = hl.Table.annotate
        filter_fn = hl.Table.filter
    elif isinstance(tb, hl.matrixtable.MatrixTable):
        annotate_fn = hl.MatrixTable.annotate_cols
        filter_fn = hl.MatrixTable.filter_cols
    else:
        # Previously this fell through and failed later with an unbound
        # local; fail here with a message that names the problem.
        raise TypeError(
            f'tb must be a hail Table or MatrixTable, found {type(tb).__name__}'
        )

    # Go through a string with the quote characters stripped, so the value
    # can be tested for emptiness and then parsed below.
    tb0 = annotate_fn(self=tb,
                      phen_str=hl.str(phen_tb[tb.s]['phen']).replace('\"', ''))

    if filter_to_phen:  # filter to individuals with phenotype data defined
        tb1 = filter_fn(self=tb0, expr=tb0.phen_str == '', keep=False)
    else:
        # Without filtering, continue with the annotated table as-is
        # (tb1 used to be undefined on this path).
        tb1 = tb0

    if phen_tb.phen.dtype == hl.dtype('bool'):
        phen_expr = hl.bool(tb1.phen_str)
    else:
        phen_expr = hl.float64(tb1.phen_str)
    tb2 = annotate_fn(self=tb1, phen=phen_expr).drop('phen_str')

    return tb2
Beispiel #10
0
def normalize_genotypes(genotype):
    r"""Normalizes genotypes to have mean 0 and variance 1 at each SNP

    Each entry becomes ``(gt - mean) / stdev``, where ``mean`` and ``stdev``
    come from ``hl.agg.stats`` computed per row. Call-typed input is first
    converted with ``n_alt_alleles()``.

    Parameters
    ----------
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field of genotypes.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with normalized genotypes in entry field
        ``norm_gt``.
    """
    # Suffix for the temporary intermediate fields created below.
    uid = Env.get_uid(base=100)
    mt = genotype._indices.source
    # Compare types by equality: an identity test only works if the type
    # parser happens to hand back the very same object on every call.
    is_call = genotype.dtype == hl.dtype('call')
    mt = mt.annotate_entries(
        **{'gt_' + uid: genotype.n_alt_alleles() if is_call else genotype})
    mt = mt.annotate_rows(**{'gt_stats_' + uid: hl.agg.stats(mt['gt_' + uid])})
    # TODO: Add MAF filter to remove invariant SNPs?
    mt = mt.annotate_entries(
        norm_gt=(mt['gt_' + uid] - mt['gt_stats_' + uid].mean) /
        mt['gt_stats_' + uid].stdev)
    # presumably drops the temporary '<name>_<uid>' fields -- helper not
    # visible here
    mt = _clean_fields(mt, uid)
    return mt
Beispiel #11
0
    def table_irs(self):
        """Build the list of table IR nodes used as fixtures.

        The nodes are built over the 1.0.0 backward-compatibility table and
        matrix table resources, or over small ``TableRange`` inputs.
        """
        true_ir = ir.TrueIR()
        src_table = ir.TableRead(
            'src/test/resources/backward_compatability/1.0.0/table/0.ht',
            False, None)
        src_row_type = hl.dtype(
            'struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}'
        )
        src_matrix = ir.MatrixRead(
            'src/test/resources/backward_compatability/1.0.0/matrix_table/0.hmt',
            False, False)

        # Constructed but not part of the returned list.
        spare_range = ir.TableRange(10, 4)

        def const_struct(field):
            """Return a fresh one-field struct IR holding the int32 literal 5."""
            return ir.MakeStruct([(field, ir.I32(5))])

        parallelized_rows = [{'a': None}, {'a': 5}, {'a': -3}]

        return [
            ir.TableUnkey(src_table),
            ir.TableKeyBy(src_table, ['m', 'd'], 1, True),
            ir.TableFilter(src_table, true_ir),
            src_table,
            ir.MatrixColsTable(src_matrix),
            ir.TableAggregateByKey(src_table, const_struct('a')),
            ir.TableKeyByAndAggregate(
                src_table, const_struct('a'), const_struct('b'), 1, 2),
            ir.TableJoin(src_table, ir.TableRange(100, 10), 'inner'),
            ir.MatrixEntriesTable(src_matrix),
            ir.MatrixRowsTable(src_matrix),
            ir.TableParallelize(
                'Table{global:Struct{},key:None,row:Struct{a:Int32}}',
                ir.Value(hl.tarray(hl.tstruct(a=hl.tint32)),
                         parallelized_rows),
                None),
            ir.TableMapRows(
                src_table,
                ir.MakeStruct([
                    ('a', ir.GetField(ir.Ref('row', src_row_type), 'f32')),
                    ('b', ir.F64(-2.11)),
                ]),
                None, None),
            ir.TableMapGlobals(
                src_table,
                ir.MakeStruct([('foo', ir.NA(hl.tarray(hl.tint32)))]),
                ir.Value(hl.tstruct(), {})),
            ir.TableRange(100, 10),
            ir.TableRepartition(src_table, 10, False),
            ir.TableUnion([ir.TableRange(100, 10), ir.TableRange(50, 10)]),
            ir.TableExplode(src_table, 'mset'),
            ir.TableHead(src_table, 10),
            ir.TableOrderBy(ir.TableUnkey(src_table),
                            [('m', 'A'), ('m', 'D')]),
            ir.TableDistinct(src_table),
            ir.LocalizeEntries(src_matrix, '__entries'),
        ]
Beispiel #12
0
def blocking_execute(code):
    """Parse ``code`` as a value IR, execute it and report type and result.

    Returns a dict with the string form of the IR's type under ``'type'``
    and the value returned by the Spark backend's ``executeJSON`` under
    ``'result'``.
    """
    parsed_ir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    # Round-trip the JVM type through its string form to get a Python type.
    result_type = hl.dtype(parsed_ir.typ().toString())
    outcome = Env.hail().backend.spark.SparkBackend.executeJSON(parsed_ir)
    return dict(type=str(result_type), result=outcome)
Beispiel #13
0
def blocking_execute(code):
    """Parse ``code`` as a value IR, execute it and report type and result.

    Returns a dict with the string form of the IR's type under ``'type'``
    and the value returned by the context backend's ``executeJSON`` under
    ``'result'``.
    """
    parsed_ir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    # Round-trip the JVM type through its string form to get a Python type.
    result_type = hl.dtype(parsed_ir.typ().toString())
    backend = Env.hc()._jhc.backend()
    outcome = backend.executeJSON(parsed_ir)
    return dict(type=str(result_type), result=outcome)
Beispiel #14
0
 def build_mt(a):
     """Build a matrix table with row key ``v`` and column key ``s``.

     The input records all have ``v == 0`` and ``s`` in ``0, 1, 2``; the
     entry field ``x`` holds ``a[0]``, ``a[1]`` and ``a[2]``. Columns are
     reordered so that ``s`` runs 0, 1, 2.
     """
     sample_ids = range(3)
     records = [{'v': 0, 's': sid, 'x': a[sid]} for sid in sample_ids]
     record_type = hl.dtype('struct{v: int32, s: int32, x: float64}')

     table = hl.Table.parallelize(records, record_type)
     matrix = table.to_matrix_table(['v'], ['s'])

     # Look up where each sample id ended up and select the columns in order.
     collected_ids = matrix.key_cols_by()['s'].collect()
     column_order = [collected_ids.index(sid) for sid in sample_ids]
     return matrix.choose_cols(column_order)
Beispiel #15
0
    def table_irs(self):
        """Build the list of table IR nodes used as fixtures.

        The nodes are built over the 1.0.0 backward-compatibility table and
        matrix table resources, or over small ``TableRange`` inputs.
        """
        true_ir = ir.TrueIR()
        src_table = ir.TableRead(
            ir.TableNativeReader(
                resource('backward_compatability/1.0.0/table/0.ht'),
                None, False),
            False)
        # Parsed but not referenced by any node below.
        src_row_type = hl.dtype('struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}')

        src_matrix = ir.MatrixRead(
            ir.MatrixNativeReader(
                resource('backward_compatability/1.0.0/matrix_table/0.hmt'),
                None, False),
            False, False)

        # Constructed but not part of the returned list.
        spare_range = ir.TableRange(10, 4)

        def const_struct(field):
            """Return a fresh one-field struct IR holding the int32 literal 5."""
            return ir.MakeStruct([(field, ir.I32(5))])

        def unkeyed_table():
            """Return a fresh IR that re-keys the source table by no fields."""
            return ir.TableKeyBy(src_table, [])

        linreg_config = {'name': 'LinearRegressionRowsSingle', 'yFields': ['col_m'], 'xField': 'entry_m', 'covFields': [], 'rowBlockSize': 10, 'passThrough': []}
        filter_parts_config = {'name': 'TableFilterPartitions', 'parts': [0], 'keep': True}

        return [
            ir.TableKeyBy(src_table, ['m', 'd'], False),
            ir.TableFilter(src_table, true_ir),
            src_table,
            ir.MatrixColsTable(src_matrix),
            ir.TableAggregateByKey(src_table, const_struct('a')),
            ir.TableKeyByAndAggregate(
                src_table, const_struct('a'), const_struct('b'), 1, 2),
            ir.TableJoin(src_table, ir.TableRange(100, 10), 'inner', 1),
            ir.MatrixEntriesTable(src_matrix),
            ir.MatrixRowsTable(src_matrix),
            ir.TableParallelize(
                ir.MakeStruct([
                    ('rows', ir.Literal(hl.tarray(hl.tstruct(a=hl.tint32)),
                                        [{'a': None}, {'a': 5}, {'a': -3}])),
                    ('global', ir.MakeStruct([])),
                ]),
                None),
            ir.TableMapRows(
                unkeyed_table(),
                ir.MakeStruct([
                    ('a', ir.GetField(ir.Ref('row'), 'f32')),
                    ('b', ir.F64(-2.11)),
                ])),
            ir.TableMapGlobals(
                src_table,
                ir.MakeStruct([('foo', ir.NA(hl.tarray(hl.tint32)))])),
            ir.TableRange(100, 10),
            ir.TableRepartition(src_table, 10,
                                ir.RepartitionStrategy.COALESCE),
            ir.TableUnion([ir.TableRange(100, 10), ir.TableRange(50, 10)]),
            ir.TableExplode(src_table, ['mset']),
            ir.TableHead(src_table, 10),
            ir.TableOrderBy(unkeyed_table(), [('m', 'A'), ('m', 'D')]),
            ir.TableDistinct(src_table),
            ir.CastMatrixToTable(src_matrix, '__entries', '__cols'),
            ir.TableRename(src_table, {'idx': 'idx_foo'},
                           {'global_f32': 'global_foo'}),
            ir.TableMultiWayZipJoin([src_table, src_table],
                                    '__data', '__globals'),
            ir.MatrixToTableApply(src_matrix, linreg_config),
            ir.TableToTableApply(src_table, filter_parts_config),
            ir.TableFilterIntervals(
                src_table,
                [hl.utils.Interval(hl.utils.Struct(row_idx=0),
                                   hl.utils.Struct(row_idx=10))],
                hl.tstruct(row_idx=hl.tint32),
                keep=False),
        ]
Beispiel #16
0
    def test_realistic(self):
        dtype = hl.dtype('''struct{
locus: locus<GRCh37>,
alleles: array<str>,
rsid: str,
qual: float64,
filters: set<str>,
info: struct{
  NEGATIVE_TRAIN_SITE: bool,
  HWP: float64,
  AC: array<int32>},
empty_struct: struct{
},
variant_qc: struct{
  dp_stats: struct{
    mean: float64,
    stdev: float64,
    min: float64,
    max: float64},
  gq_stats: struct{
    mean: float64,
    stdev: float64,
    min: float64,
    max: float64},
  AC: array<int32>,
  AF: array<float64>,
  AN: int32,
  homozygote_count: array<int32>,
  call_rate: float64}}''')
        tree = PlacementTree.from_named_type('row', dtype)
        grid = tree.to_grid()
        assert len(grid) == 4

        row1 = grid[1]
        assert len(row1) == 8
        for i in range(5):
            assert row1[i] == (None, 1)
        assert row1[5] == (None, 3)
        assert row1[7] == ('variant_qc', 13)

        row2 = grid[2]
        assert len(row2) == 14
        for i in range(5):
            assert row2[i] == (None, 1)
        assert row2[5] == ('info', 3)
        assert row2[7] == ('dp_stats', 4)
        assert row2[8] == ('gq_stats', 4)
        for i in range(9, 13):
            assert row2[i] == (None, 1)

        row3 = grid[3]
        assert row3 == [('locus', 1), ('alleles', 1), ('rsid', 1), ('qual', 1),
                        ('filters', 1), ('NEGATIVE_TRAIN_SITE', 1), ('HWP', 1),
                        ('AC', 1), ('mean', 1), ('stdev', 1), ('min', 1),
                        ('max', 1), ('mean', 1), ('stdev', 1), ('min', 1),
                        ('max', 1), ('AC', 1), ('AF', 1), ('AN', 1),
                        ('homozygote_count', 1), ('call_rate', 1)]
Beispiel #17
0
    def test_variant_qc(self):
        """Run ``hl.variant_qc`` on a two-variant, four-sample dataset.

        The first variant is biallelic with one missing genotype; the second
        is multiallelic with every genotype called.
        """
        columns = ('v', 's', 'GT', 'GQ', 'DP')
        records = [
            ('1:1:A:T', '1', hl.Call([0, 0]), 10, 0),
            ('1:1:A:T', '2', hl.Call([1, 1]), 10, 5),
            ('1:1:A:T', '3', hl.Call([0, 1]), 11, 100),
            ('1:1:A:T', '4', None, None, 100),
            ('1:2:A:T,C', '1', hl.Call([1, 2]), 10, 5),
            ('1:2:A:T,C', '2', hl.Call([2, 2]), 10, 5),
            ('1:2:A:T,C', '3', hl.Call([0, 1]), 10, 5),
            ('1:2:A:T,C', '4', hl.Call([1, 1]), 10, 5),
        ]
        data = [dict(zip(columns, record)) for record in records]

        table = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
        table = table.transmute(**hl.parse_variant(table.v))
        matrix = table.to_matrix_table(['locus', 'alleles'], ['s'])
        matrix = hl.variant_qc(matrix, 'vqc')
        collected = matrix.rows().collect()

        biallelic = collected[0].vqc
        self.assertEqual(biallelic.AF, [0.5, 0.5])
        self.assertEqual(biallelic.AC, [3, 3])
        self.assertEqual(biallelic.AN, 6)
        self.assertEqual(biallelic.homozygote_count, [1, 1])
        self.assertEqual(biallelic.n_called, 3)
        self.assertEqual(biallelic.n_not_called, 1)
        self.assertEqual(biallelic.call_rate, 0.75)
        self.assertEqual(biallelic.n_het, 1)
        self.assertEqual(biallelic.n_non_ref, 2)
        self.assertEqual(biallelic.het_freq_hwe, 0.6)
        self.assertEqual(biallelic.p_value_hwe, 0.7)
        self.assertEqual(biallelic.dp_stats.min, 0)
        self.assertEqual(biallelic.dp_stats.max, 100)
        self.assertEqual(biallelic.dp_stats.mean, 51.25)
        self.assertAlmostEqual(biallelic.dp_stats.stdev, 48.782040752719645)
        self.assertEqual(biallelic.gq_stats.min, 10)
        self.assertEqual(biallelic.gq_stats.max, 11)
        self.assertAlmostEqual(biallelic.gq_stats.mean, 10.333333333333334)
        self.assertAlmostEqual(biallelic.gq_stats.stdev, 0.47140452079103168)

        multiallelic = collected[1].vqc
        self.assertEqual(multiallelic.AF, [0.125, 0.5, 0.375])
        self.assertEqual(multiallelic.AC, [1, 4, 3])
        self.assertEqual(multiallelic.AN, 8)
        self.assertEqual(multiallelic.homozygote_count, [0, 1, 1])
        self.assertEqual(multiallelic.n_called, 4)
        self.assertEqual(multiallelic.n_not_called, 0)
        self.assertEqual(multiallelic.call_rate, 1.0)
        self.assertEqual(multiallelic.n_het, 2)
        self.assertEqual(multiallelic.n_non_ref, 4)
        self.assertEqual(multiallelic.p_value_hwe, None)
        self.assertEqual(multiallelic.het_freq_hwe, None)
        self.assertEqual(multiallelic.dp_stats.min, 5)
        self.assertEqual(multiallelic.dp_stats.max, 5)
        self.assertEqual(multiallelic.dp_stats.mean, 5)
        self.assertEqual(multiallelic.dp_stats.stdev, 0.0)
        self.assertEqual(multiallelic.gq_stats.min, 10)
        self.assertEqual(multiallelic.gq_stats.max, 10)
        self.assertEqual(multiallelic.gq_stats.mean, 10)
        self.assertEqual(multiallelic.gq_stats.stdev, 0)
Beispiel #18
0
    def test_variant_qc(self):
        """Check every ``hl.variant_qc`` statistic on a small fixed dataset.

        Two variants across four samples: ``1:1:A:T`` has one missing
        genotype, ``1:2:A:T,C`` has two alternate alleles and no missing data.
        """
        input_type = hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}')
        input_rows = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:1:A:T', 's': '2', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
            {'v': '1:1:A:T', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 11, 'DP': 100},
            {'v': '1:1:A:T', 's': '4', 'GT': None, 'GQ': None, 'DP': 100},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1, 2]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '2', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '4', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
        ]

        ht = hl.Table.parallelize(input_rows, input_type)
        ht = ht.transmute(**hl.parse_variant(ht.v))
        qc_mt = hl.variant_qc(
            ht.to_matrix_table(['locus', 'alleles'], ['s']), 'vqc')
        qc_rows = qc_mt.rows().collect()

        # First variant: three called genotypes out of four.
        qc = qc_rows[0].vqc
        self.assertEqual(qc.AF, [0.5, 0.5])
        self.assertEqual(qc.AC, [3, 3])
        self.assertEqual(qc.AN, 6)
        self.assertEqual(qc.homozygote_count, [1, 1])
        self.assertEqual(qc.n_called, 3)
        self.assertEqual(qc.n_not_called, 1)
        self.assertEqual(qc.call_rate, 0.75)
        self.assertEqual(qc.n_het, 1)
        self.assertEqual(qc.n_non_ref, 2)
        self.assertEqual(qc.het_freq_hwe, 0.6)
        self.assertEqual(qc.p_value_hwe, 0.7)
        depth = qc.dp_stats
        self.assertEqual(depth.min, 0)
        self.assertEqual(depth.max, 100)
        self.assertEqual(depth.mean, 51.25)
        self.assertAlmostEqual(depth.stdev, 48.782040752719645)
        quality = qc.gq_stats
        self.assertEqual(quality.min, 10)
        self.assertEqual(quality.max, 11)
        self.assertAlmostEqual(quality.mean, 10.333333333333334)
        self.assertAlmostEqual(quality.stdev, 0.47140452079103168)

        # Second variant: multiallelic, all four genotypes called.
        qc = qc_rows[1].vqc
        self.assertEqual(qc.AF, [0.125, 0.5, 0.375])
        self.assertEqual(qc.AC, [1, 4, 3])
        self.assertEqual(qc.AN, 8)
        self.assertEqual(qc.homozygote_count, [0, 1, 1])
        self.assertEqual(qc.n_called, 4)
        self.assertEqual(qc.n_not_called, 0)
        self.assertEqual(qc.call_rate, 1.0)
        self.assertEqual(qc.n_het, 2)
        self.assertEqual(qc.n_non_ref, 4)
        self.assertEqual(qc.p_value_hwe, None)
        self.assertEqual(qc.het_freq_hwe, None)
        depth = qc.dp_stats
        self.assertEqual(depth.min, 5)
        self.assertEqual(depth.max, 5)
        self.assertEqual(depth.mean, 5)
        self.assertEqual(depth.stdev, 0.0)
        quality = qc.gq_stats
        self.assertEqual(quality.min, 10)
        self.assertEqual(quality.max, 10)
        self.assertEqual(quality.mean, 10)
        self.assertEqual(quality.stdev, 0)
Beispiel #19
0
    def table_irs(self):
        """Build the list of table IR nodes used as fixtures.

        The nodes are built over the 1.0.0 backward-compatibility table and
        matrix table resources, or over small ``TableRange`` inputs.
        """
        true_ir = ir.TrueIR()
        table_reader = ir.TableNativeReader(
            resource('backward_compatability/1.0.0/table/0.ht'))
        src_table = ir.TableRead(table_reader, False)
        # Parsed but not referenced by any node below.
        src_row_type = hl.dtype('struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}')

        matrix_reader = ir.MatrixNativeReader(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt'))
        src_matrix = ir.MatrixRead(matrix_reader, False, False)

        # Constructed but not part of the returned list.
        spare_range = ir.TableRange(10, 4)

        def const_struct(field):
            """Return a fresh one-field struct IR holding the int32 literal 5."""
            return ir.MakeStruct([(field, ir.I32(5))])

        def unkeyed_table():
            """Return a fresh IR that re-keys the source table by no fields."""
            return ir.TableKeyBy(src_table, [])

        linreg_config = {'name': 'LinearRegressionRowsSingle', 'yFields': ['col_m'], 'xField': 'entry_m', 'covFields': [], 'rowBlockSize': 10, 'passThrough': []}
        filter_parts_config = {'name': 'TableFilterPartitions', 'parts': [0], 'keep': True}

        return [
            ir.TableKeyBy(src_table, ['m', 'd'], False),
            ir.TableFilter(src_table, true_ir),
            src_table,
            ir.MatrixColsTable(src_matrix),
            ir.TableAggregateByKey(src_table, const_struct('a')),
            ir.TableKeyByAndAggregate(
                src_table, const_struct('a'), const_struct('b'), 1, 2),
            ir.TableJoin(src_table, ir.TableRange(100, 10), 'inner', 1),
            ir.MatrixEntriesTable(src_matrix),
            ir.MatrixRowsTable(src_matrix),
            ir.TableParallelize(
                ir.MakeStruct([
                    ('rows', ir.Literal(hl.tarray(hl.tstruct(a=hl.tint32)),
                                        [{'a': None}, {'a': 5}, {'a': -3}])),
                    ('global', ir.MakeStruct([])),
                ]),
                None),
            ir.TableMapRows(
                unkeyed_table(),
                ir.MakeStruct([
                    ('a', ir.GetField(ir.Ref('row'), 'f32')),
                    ('b', ir.F64(-2.11)),
                ])),
            ir.TableMapGlobals(
                src_table,
                ir.MakeStruct([('foo', ir.NA(hl.tarray(hl.tint32)))])),
            ir.TableRange(100, 10),
            ir.TableRepartition(src_table, 10,
                                ir.RepartitionStrategy.COALESCE),
            ir.TableUnion([ir.TableRange(100, 10), ir.TableRange(50, 10)]),
            ir.TableExplode(src_table, ['mset']),
            ir.TableHead(src_table, 10),
            ir.TableOrderBy(unkeyed_table(), [('m', 'A'), ('m', 'D')]),
            ir.TableDistinct(src_table),
            ir.CastMatrixToTable(src_matrix, '__entries', '__cols'),
            ir.TableRename(src_table, {'idx': 'idx_foo'},
                           {'global_f32': 'global_foo'}),
            ir.TableMultiWayZipJoin([src_table, src_table],
                                    '__data', '__globals'),
            ir.MatrixToTableApply(src_matrix, linreg_config),
            ir.TableToTableApply(src_table, filter_parts_config),
        ]
Beispiel #20
0
def require_row_key_variant_w_struct_locus(dataset, method):
    """Raise ``ValueError`` unless ``dataset`` is row-keyed by a variant.

    The row key must be exactly the two fields 'locus' and 'alleles', where
    'alleles' has type ``array<str>`` and 'locus' is either a ``tlocus`` or
    a ``struct{contig: str, position: int32}``.

    ``method`` is the method name interpolated into the error message.
    """

    def _has_variant_row_key():
        # The checks run in order and stop at the first failure, so the
        # 'alleles' and 'locus' fields are only looked up after the key
        # names are known to match.
        if list(dataset.row_key) != ['locus', 'alleles']:
            return False
        if not dataset['alleles'].dtype == tarray(tstr):
            return False
        if isinstance(dataset['locus'].dtype, tlocus):
            return True
        if dataset['locus'].dtype != hl.dtype('struct{contig: str, position: int32}'):
            return False
        return True

    if _has_variant_row_key():
        return

    # One indented "'<field>': <type>" line per actual row key field.
    found = ''.join(
        "\n    '{}': {}".format(key, str(dataset[key].dtype))
        for key in dataset.row_key)
    raise ValueError("Method '{}' requires row key to be two fields 'locus'"
                     " (type 'locus<any>' or 'struct{{contig: str, position: int32}}') and "
                     "'alleles' (type 'array<str>')\n"
                     "  Found:{}".format(method, found))
Beispiel #21
0
def require_row_key_variant_w_struct_locus(dataset, method):
    """Validate that ``dataset`` has a ('locus', 'alleles') row key.

    'alleles' must be ``array<str>``; 'locus' must be a ``tlocus`` or a
    ``struct{contig: str, position: int32}``. Otherwise a ``ValueError``
    naming ``method`` and listing the row key fields that were found is
    raised.
    """
    # Positive form of the requirement; `and`/`or` short-circuit, so the
    # field lookups only happen once the key names match.
    key_is_variant = (
        list(dataset.row_key) == ['locus', 'alleles']
        and dataset['alleles'].dtype == tarray(tstr)
        and (isinstance(dataset['locus'].dtype, tlocus)
             or dataset['locus'].dtype == hl.dtype('struct{contig: str, position: int32}'))
    )
    if not key_is_variant:
        found_fields = [
            "\n    '{}': {}".format(k, str(dataset[k].dtype))
            for k in dataset.row_key
        ]
        message = ("Method '{}' requires row key to be two fields 'locus'"
                   " (type 'locus<any>' or 'struct{{contig: str, position: int32}}') and "
                   "'alleles' (type 'array<str>')\n"
                   "  Found:{}")
        raise ValueError(message.format(method, ''.join(found_fields)))
Beispiel #22
0
def annotate_fields(mt, gencode_release, gencode_path):
    """Build the unkeyed rows table carrying genotype and annotation fields.

    Steps, in order:

    1. Collect one struct per entry (sample id, GQ, RD_CN and the alt-allele
       count, or -1 when GT is missing) into a ``genotypes`` row field.
    2. Apply every ``CORE_FIELDS`` function to the rows table.
    3. Add ``sortedTranscriptConsequences`` (built from the
       ``PROTEIN_CODING__*`` info fields of type ``array<str>``) and
       ``sv_type`` (derived from the second allele).
    4. Apply every ``DERIVED_FIELDS`` function, rename ``rsid`` to
       ``variantId`` and select ``FIELDS`` from the unkeyed table.

    Side effect: a ``'filters'`` entry is written into the module-level
    ``DERIVED_FIELDS`` dict on every call.
    """
    num_alt = hl.if_else(hl.is_defined(mt.GT), mt.GT.n_alt_alleles(), -1)
    per_sample = hl.struct(sample_id=mt.s, gq=mt.GQ, cn=mt.RD_CN, num_alt=num_alt)
    rows = mt.annotate_rows(genotypes=hl.agg.collect(per_sample)).rows()

    core_annotations = {name: build(rows) for name, build in CORE_FIELDS.items()}
    rows = rows.annotate(**core_annotations)

    gene_id_mapping = hl.literal(
        load_gencode(gencode_release, download_path=gencode_path))

    # Info fields that list protein-coding genes for one predicted consequence.
    protein_coding_cols = [
        name for name in rows.info
        if name.startswith('PROTEIN_CODING__')
        and rows.info[name].dtype == hl.dtype('array<str>')
    ]

    def _consequences(col):
        # One struct per gene; the consequence is the part of the column
        # name after the last '__'.
        return rows.info[col].map(lambda gene: hl.struct(
            gene_symbol=gene,
            gene_id=gene_id_mapping[gene],
            predicted_consequence=col.split('__')[-1]))

    per_column = [_consequences(col) for col in protein_coding_cols]
    defined_only = hl.filter(lambda x: hl.is_defined(x), per_column)

    rows = rows.annotate(
        sortedTranscriptConsequences=hl.flatmap(lambda x: x, defined_only),
        sv_type=rows.alleles[1].replace('[<>]', '').split(':', 2),
    )

    def _filters_or_missing(rows):
        # Empty filter arrays are replaced by a missing array<str>.
        return hl.if_else(
            hl.len(rows.filters) > 0, rows.filters,
            hl.missing(hl.dtype('array<str>')))

    DERIVED_FIELDS.update({'filters': _filters_or_missing})
    derived_annotations = {name: build(rows) for name, build in DERIVED_FIELDS.items()}
    rows = rows.annotate(**derived_annotations)

    rows = rows.rename({'rsid': 'variantId'})

    return rows.key_by().select(*FIELDS)
Beispiel #23
0
def execute():
    """Execute the IR posted as the request's JSON body on the Spark backend.

    Returns a JSON response with the result's ``type`` (as a string) and
    ``value``. If a ``FatalError`` is raised, responds with its first
    argument as ``message`` and HTTP status 400.
    """
    ir_source = flask.request.json
    info(f'execute: {ir_source}')
    try:
        parsed_ir = Env.hail().expr.ir.IRParser.parse_value_ir(ir_source, {}, {})
        # The type is resolved before execution, as the dict is built in order.
        result = {
            'type': str(hl.dtype(parsed_ir.typ().toString())),
            'value': Env.hail().backend.spark.SparkBackend.executeJSON(parsed_ir),
        }
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as error:
        failure = {'message': error.args[0]}
        return flask.jsonify(failure), 400
Beispiel #24
0
def _require_row_variant_w_struct_locus(mt: MatrixTable) -> None:
    """
    Similar to hail.methods.misc.require_row_key_variant_w_struct_locus, but not necessarily as keys

    Checks that the rows of ``mt`` contain the fields 'locus' and 'alleles',
    that 'alleles' has type ``array<str>`` and that 'locus' is either a
    ``tlocus`` or a ``struct{contig: str, position: int32}``.

    Returns ``None`` when the requirements hold (hence the ``None`` return
    annotation rather than ``NoReturn``, which would declare that the
    function never returns).

    Raises
    ------
    ValueError
        If any of the requirements above is not met.
    """
    assert check_argument_types()

    # `or` short-circuits: the field lookups only happen once both field
    # names are known to be present in the row.
    if (not {'locus', 'alleles'}.issubset(mt.rows().row) or
            not mt['alleles'].dtype == tarray(tstr) or
            (not isinstance(mt['locus'].dtype, tlocus) and
             mt['locus'].dtype != hl.dtype('struct{contig: str, position: int32}'))):
        raise ValueError("'hail.from_matrix_table' requires row to contain two fields 'locus'"
                         " (type 'locus<any>' or 'struct{{contig: str, position: int32}}') and "
                         "'alleles' (type 'array<str>')")
Beispiel #25
0
    def test_linear_mixed_model_function(self):
        """Compare ``hl.linear_mixed_model`` with direct ``LinearMixedModel`` builds.

        Covers the kinship path, the random-effects path with the default
        standardization, and the random-effects path with
        ``standardize=False``.
        """
        # Row count of y/x/z, column count of x (unused), column count of z.
        n_rows, _x_cols, z_cols = 4, 2, 3
        y = np.array([0.0, 1.0, 8.0, 9.0])
        x = np.array([[1.0, 0.0],
                      [1.0, 2.0],
                      [1.0, 1.0],
                      [1.0, 4.0]])
        z = np.array([[0.0, 0.0, 1.0],
                      [0.0, 1.0, 2.0],
                      [1.0, 2.0, 0.0],
                      [2.0, 0.0, 1.0]])

        p_path = utils.new_temp_file()

        # Call for each value occurring in z; any other value maps to None.
        call_for_value = {
            0.0: hl.Call([0, 0]),
            1.0: hl.Call([0, 1]),
            2.0: hl.Call([1, 1]),
        }

        records = [
            {'v': col, 's': row, 'y': y[row], 'x1': x[row, 1],
             'zt': call_for_value.get(z[row, col])}
            for row in range(n_rows) for col in range(z_cols)
        ]
        ht = hl.Table.parallelize(
            records,
            hl.dtype('struct{v: int32, s: int32, y: float64, x1: float64, zt: tcall}'))
        mt = ht.to_matrix_table(row_key=['v'], col_key=['s'], col_fields=['x1', 'y'])
        # Reorder the columns so that they are sorted by s.
        sorted_col_order = np.argsort(mt.key_cols_by().s.collect()).tolist()
        mt = mt.choose_cols(sorted_col_order)

        rrm = hl.realized_relationship_matrix(mt.zt).to_numpy()

        # kinship path agrees with from_kinship
        actual_model, actual_p = hl.linear_mixed_model(
            mt.y, [1, mt.x1], k=rrm, p_path=p_path, overwrite=True)
        expected_model, expected_p = LinearMixedModel.from_kinship(
            y, x, rrm, p_path, overwrite=True)
        assert expected_model._same(actual_model)
        assert np.allclose(expected_p, actual_p)

        # random effects path with standardize=True agrees with low-rank rrm
        eigenvalues, eigenvectors = np.linalg.eigh(rrm)
        top_eigenvalues = np.flip(eigenvalues, axis=0)[:z_cols]
        expected_p = np.fliplr(eigenvectors).T[:z_cols, :]
        actual_model, actual_p = hl.linear_mixed_model(
            mt.y, [1, mt.x1], z_t=mt.zt.n_alt_alleles(), p_path=p_path, overwrite=True)
        expected_model = LinearMixedModel(
            expected_p @ y, expected_p @ x, top_eigenvalues, y, x, p_path=p_path)
        assert expected_model._same(actual_model)

        # random effects path with standardize=False agrees with from_random_effects
        expected_model, expected_p = LinearMixedModel.from_random_effects(
            y, x, z, p_path, overwrite=True)
        actual_model, actual_p = hl.linear_mixed_model(
            mt.y, [1, mt.x1], z_t=mt.zt.n_alt_alleles(), p_path=p_path,
            overwrite=True, standardize=False)
        assert expected_model._same(actual_model)
        assert np.allclose(expected_p, actual_p.to_numpy())
Beispiel #26
0
def execute():
    """Interpret the IR posted as the request's JSON body.

    Returns a JSON response holding the result's ``type`` (as a string)
    and its ``value``.
    """
    ir_source = flask.request.json
    info(f'execute: {ir_source}')

    parsed_ir = Env.hail().expr.ir.IRParser.parse_value_ir(ir_source, {}, {})

    # Keyword arguments are evaluated left to right: type first, then value.
    result = dict(
        type=str(hl.dtype(parsed_ir.typ().toString())),
        value=Env.hail().expr.ir.Interpret.interpretJSON(parsed_ir),
    )
    info(f'result: {result}')

    return flask.jsonify(result)
Beispiel #27
0
def execute():
    """Interpret the IR posted as the request's JSON body via ``interpretPyIR``.

    The IR text is parsed once to obtain the result type; the value is
    produced by handing the original IR text to the interpreter. Returns a
    JSON response with ``type`` (as a string) and ``value``.
    """
    payload = flask.request.json
    info(f'execute: {payload}')

    # Parsed only to learn the result type.
    parsed_ir = Env.hail().expr.ir.IRParser.parse_value_ir(payload, {}, {})
    type_string = parsed_ir.typ().toString()
    result_type = hl.dtype(type_string)

    interpreted = Env.hail().expr.ir.Interpret.interpretPyIR(payload, {}, {})

    result = {'type': str(result_type), 'value': interpreted}
    info(f'result: {result}')

    return flask.jsonify(result)
Beispiel #28
0
def get_ldsim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=None):
    """Build a small random-genotype matrix table.

    One record is generated for every (variant, sample, contig) combination,
    each with a constant ``cm`` of 0.1 and a call whose two alleles are drawn
    from ``{0, 1}`` using a ``RandomState`` seeded with ``seed``. The records
    are keyed by the parsed variant (rows) and ``s`` (columns) and passed
    through ``add_default_plink_fields``.
    """
    rs = np.random.RandomState(seed)

    def _random_call():
        # Two draws per record, first allele before second.
        first_allele = rs.randint(0, 2)
        second_allele = rs.randint(0, 2)
        return hl.Call([first_allele, second_allele])

    # Same nesting order as an explicit variant/sample/contig loop, so the
    # random draws are consumed in the same sequence.
    records = [
        {
            'v': f'{c+1}:{v+1}:A:C',
            's': f's{s+1:09d}',
            'cm': .1,
            'GT': _random_call(),
        }
        for v in range(n_variants)
        for s in range(n_samples)
        for c in range(n_contigs)
    ]

    row_type = hl.dtype('struct{v: str, s: str, cm: float64, GT: call}')
    ht = hl.Table.parallelize(records, row_type)
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                            col_key=['s'],
                            row_fields=['cm'])
    return add_default_plink_fields(mt)
Beispiel #29
0
    def test_sample_qc(self):
        """Check every ``hl.sample_qc`` statistic for a single-sample dataset.

        The six rows cover hom-ref, haploid, multi-allelic, het, insertion
        and missing-genotype records, plus one missing GQ and one missing DP.
        """
        data = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1]), 'GQ': 15, 'DP': 5},
            {'v': '1:3:A:G,C', 's': '1', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 4},
            {'v': '1:4:G:A', 's': '1', 'GT': hl.Call([0, 1]), 'GQ': None, 'DP': 5},
            {'v': '1:5:C:CG', 's': '1', 'GT': hl.Call([1, 1]), 'GQ': 20, 'DP': 3},
            {'v': '1:6:C:A', 's': '1', 'GT': None, 'GQ': 0, 'DP': None},
        ]

        ht = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
        ht = ht.transmute(**hl.parse_variant(ht.v))
        mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
        mt = hl.sample_qc(mt, 'sqc')
        r = mt.cols().select('sqc').collect()

        # Only one sample exists, so all statistics live on the first column.
        sqc = r[0].sqc

        self.assertAlmostEqual(sqc.gq_stats.mean, 11)
        self.assertAlmostEqual(sqc.gq_stats.stdev, 6.6332495807)
        self.assertAlmostEqual(sqc.gq_stats.min, 0)
        self.assertAlmostEqual(sqc.gq_stats.max, 20)
        self.assertAlmostEqual(sqc.dp_stats.mean, 3.399999999)
        self.assertAlmostEqual(sqc.dp_stats.stdev, 1.8547236990)
        self.assertAlmostEqual(sqc.dp_stats.min, 0)
        self.assertAlmostEqual(sqc.dp_stats.max, 5)
        self.assertAlmostEqual(sqc.call_rate, 0.8333333333)
        self.assertEqual(sqc.n_called, 5)
        self.assertEqual(sqc.n_not_called, 1)
        self.assertEqual(sqc.n_hom_ref, 1)
        self.assertEqual(sqc.n_het, 1)
        self.assertEqual(sqc.n_hom_var, 3)
        self.assertEqual(sqc.n_insertion, 2)
        self.assertEqual(sqc.n_deletion, 0)
        self.assertEqual(sqc.n_singleton, 3)
        self.assertEqual(sqc.n_transition, 1)
        self.assertEqual(sqc.n_transversion, 3)
        self.assertEqual(sqc.n_star, 0)
        self.assertEqual(sqc.n_non_ref, 4)
        self.assertAlmostEqual(sqc.r_ti_tv, 0.333333333)
        self.assertAlmostEqual(sqc.r_het_hom_var, 0.3333333333)
        # There are no deletions, so the ratio is expected to be missing.
        # assertIsNone reports a regular test failure for a non-None value,
        # whereas assertAlmostEqual(value, None) would raise a TypeError.
        self.assertIsNone(sqc.r_insertion_deletion)
Beispiel #30
0
    def test_sample_qc(self):
        """Verify the statistics ``hl.sample_qc`` reports for one sample.

        Builds a six-variant, one-sample matrix table (including a missing
        genotype, a missing GQ and a missing DP) and compares each ``sqc``
        field with its expected value.
        """
        data = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1]), 'GQ': 15, 'DP': 5},
            {'v': '1:3:A:G,C', 's': '1', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 4},
            {'v': '1:4:G:A', 's': '1', 'GT': hl.Call([0, 1]), 'GQ': None, 'DP': 5},
            {'v': '1:5:C:CG', 's': '1', 'GT': hl.Call([1, 1]), 'GQ': 20, 'DP': 3},
            {'v': '1:6:C:A', 's': '1', 'GT': None, 'GQ': 0, 'DP': None},
        ]

        ht = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
        ht = ht.transmute(**hl.parse_variant(ht.v))
        mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
        mt = hl.sample_qc(mt, 'sqc')
        r = mt.cols().select('sqc').collect()

        # The dataset has a single column; read its QC struct once.
        stats = r[0].sqc

        # Genotype-quality and depth summaries.
        self.assertAlmostEqual(stats.gq_stats.mean, 11)
        self.assertAlmostEqual(stats.gq_stats.stdev, 6.6332495807)
        self.assertAlmostEqual(stats.gq_stats.min, 0)
        self.assertAlmostEqual(stats.gq_stats.max, 20)
        self.assertAlmostEqual(stats.dp_stats.mean, 3.399999999)
        self.assertAlmostEqual(stats.dp_stats.stdev, 1.8547236990)
        self.assertAlmostEqual(stats.dp_stats.min, 0)
        self.assertAlmostEqual(stats.dp_stats.max, 5)

        # Call counts.
        self.assertAlmostEqual(stats.call_rate, 0.8333333333)
        self.assertEqual(stats.n_called, 5)
        self.assertEqual(stats.n_not_called, 1)
        self.assertEqual(stats.n_hom_ref, 1)
        self.assertEqual(stats.n_het, 1)
        self.assertEqual(stats.n_hom_var, 3)
        self.assertEqual(stats.n_insertion, 2)
        self.assertEqual(stats.n_deletion, 0)
        self.assertEqual(stats.n_singleton, 3)
        self.assertEqual(stats.n_transition, 1)
        self.assertEqual(stats.n_transversion, 3)
        self.assertEqual(stats.n_star, 0)
        self.assertEqual(stats.n_non_ref, 4)

        # Ratios.
        self.assertAlmostEqual(stats.r_ti_tv, 0.333333333)
        self.assertAlmostEqual(stats.r_het_hom_var, 0.3333333333)
        # n_deletion is 0, so this ratio is expected to be missing. Using
        # assertIsNone gives a proper assertion failure for a non-None value
        # instead of the TypeError assertAlmostEqual(value, None) would raise.
        self.assertIsNone(stats.r_insertion_deletion)
Beispiel #31
0
def get_plink_sim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=0):
    """Build a small random-genotype matrix table with PLINK default fields.

    Each variant is assigned the contig given by ``dividx(n_variants,
    n_contigs)``. One record is generated per (variant, sample) pair, with a
    constant ``cm`` of 0.1 and a call whose two alleles are drawn from
    ``{0, 1}`` using a ``RandomState`` seeded with ``seed``.
    """
    rs = np.random.RandomState(seed)

    contig_index = dividx(n_variants, n_contigs)
    assert contig_index.ndim == 1
    assert contig_index.size == n_variants

    def _records():
        # Variant-major, then sample: the random draws are consumed in this
        # order, two per record.
        for variant in range(n_variants):
            contig = contig_index[variant]
            for sample in range(n_samples):
                yield {
                    "v": f"{contig+1}:{variant+1}:A:C",
                    "s": f"S{sample+1:07d}",
                    "cm": 0.1,
                    "GT": hl.Call([rs.randint(0, 2),
                                   rs.randint(0, 2)]),
                }

    data = list(_records())
    row_type = hl.dtype("struct{v: str, s: str, cm: float64, GT: call}")
    ht = hl.Table.parallelize(data, row_type)
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(row_key=["locus", "alleles"],
                            col_key=["s"],
                            row_fields=["cm"])
    return add_default_plink_fields(mt)
Beispiel #32
0
    def value_irs(self):
        """Return a list of value IR nodes, one instance per construct listed.

        The list mixes literals, references, array/ndarray operations,
        aggregations, struct/tuple operations and table / matrix /
        block-matrix actions (counts, aggregates, writes). Temporary output
        paths come from ``new_temp_file()``.
        """
        # Small reusable leaf nodes shared by the entries below.
        b = ir.TrueIR()
        c = ir.Ref('c')
        i = ir.I32(5)
        j = ir.I32(7)
        # NOTE(review): `st` is not referenced anywhere else in this method.
        st = ir.Str('Hail')
        # Named references; nothing in this method binds them.
        a = ir.Ref('a')
        aa = ir.Ref('aa')
        da = ir.Ref('da')
        nd = ir.Ref('nd')
        v = ir.Ref('v')
        s = ir.Ref('s')
        t = ir.Ref('t')
        call = ir.Ref('call')

        # Table, matrix table and block matrix inputs for the relational nodes.
        table = ir.TableRange(5, 3)

        matrix_read = ir.MatrixRead(
            ir.MatrixNativeReader(
                resource('backward_compatability/1.0.0/matrix_table/0.hmt'),
                None, False), False, False)

        block_matrix_read = ir.BlockMatrixRead(
            ir.BlockMatrixNativeReader('fake_file_path'))

        value_irs = [
            i,
            ir.I64(5),
            ir.F32(3.14),
            ir.F64(3.14), s,
            ir.TrueIR(),
            ir.FalseIR(),
            ir.Void(),
            ir.Cast(i, hl.tfloat64),
            ir.NA(hl.tint32),
            ir.IsNA(i),
            ir.If(b, i, j),
            ir.Coalesce(i, j),
            ir.Let('v', i, v),
            ir.Ref('x'),
            ir.ApplyBinaryPrimOp('+', i, j),
            ir.ApplyUnaryPrimOp('-', i),
            ir.ApplyComparisonOp('EQ', i, j),
            ir.MakeArray([i, ir.NA(hl.tint32), ir.I32(-3)],
                         hl.tarray(hl.tint32)),
            ir.ArrayRef(a, i, ir.Str('foo')),
            ir.ArrayLen(a),
            ir.ArrayRange(ir.I32(0), ir.I32(5), ir.I32(1)),
            ir.ArraySort(a, 'l', 'r',
                         ir.ApplyComparisonOp("LT", ir.Ref('l'), ir.Ref('r'))),
            ir.ToSet(a),
            ir.ToDict(da),
            ir.ToArray(a),
            ir.MakeNDArray(
                ir.MakeArray([ir.F64(-1.0), ir.F64(1.0)],
                             hl.tarray(hl.tfloat64)),
                ir.MakeTuple([ir.I64(1), ir.I64(2)]), ir.TrueIR()),
            ir.NDArrayShape(nd),
            ir.NDArrayReshape(nd, ir.MakeTuple([ir.I64(5)])),
            ir.NDArrayRef(nd, [ir.I64(1), ir.I64(2)]),
            ir.NDArrayMap(nd, 'v', v),
            ir.NDArrayMatMul(nd, nd),
            ir.LowerBoundOnOrderedCollection(a, i, True),
            ir.GroupByKey(da),
            ir.ArrayMap(a, 'v', v),
            ir.ArrayZip([a, a], ['a', 'b'], ir.TrueIR(), 'ExtendNA'),
            ir.ArrayFilter(a, 'v', v),
            ir.ArrayFlatMap(aa, 'v', v),
            ir.ArrayFold(a, ir.I32(0), 'x', 'v', v),
            ir.ArrayScan(a, ir.I32(0), 'x', 'v', v),
            ir.ArrayLeftJoinDistinct(a, a, 'l', 'r', ir.I32(0), ir.I32(1)),
            ir.ArrayFor(a, 'v', ir.Void()),
            ir.AggFilter(ir.TrueIR(), ir.I32(0), False),
            ir.AggExplode(ir.ArrayRange(ir.I32(0), ir.I32(2), ir.I32(1)), 'x',
                          ir.I32(0), False),
            ir.AggGroupBy(ir.TrueIR(), ir.I32(0), False),
            ir.AggArrayPerElement(
                ir.ArrayRange(ir.I32(0), ir.I32(2), ir.I32(1)), 'x', 'y',
                ir.I32(0), False),
            ir.ApplyAggOp('Collect', [], [ir.I32(0)]),
            ir.ApplyScanOp('Collect', [], [ir.I32(0)]),
            ir.ApplyAggOp('CallStats', [ir.I32(2)], [call]),
            ir.ApplyAggOp('TakeBy', [ir.I32(10)],
                          [ir.F64(-2.11), ir.F64(-2.11)]),
            ir.Begin([ir.Void()]),
            ir.MakeStruct([('x', i)]),
            ir.SelectFields(s, ['x', 'z']),
            ir.InsertFields(s, [('x', i)], None),
            ir.GetField(s, 'x'),
            ir.MakeTuple([i, b]),
            ir.GetTupleElement(t, 1),
            ir.Die(ir.Str('mumblefoo'), hl.tfloat64),
            ir.Apply('&&', hl.tbool, b, c),
            ir.Apply('toFloat64', hl.tfloat64, i),
            ir.Literal(hl.tarray(hl.tint32), [1, 2, None]),
            ir.TableCount(table),
            ir.TableGetGlobals(table),
            ir.TableCollect(ir.TableKeyBy(table, [], False)),
            ir.TableToValueApply(table, {'name': 'ForceCountTable'}),
            ir.MatrixToValueApply(matrix_read,
                                  {'name': 'ForceCountMatrixTable'}),
            ir.TableAggregate(
                table,
                ir.MakeStruct([('foo', ir.ApplyAggOp('Collect', [],
                                                     [ir.I32(0)]))])),
            ir.TableWrite(
                table,
                ir.TableNativeWriter(new_temp_file(), False, True,
                                     "fake_codec_spec$$")),
            ir.TableWrite(
                table, ir.TableTextWriter(new_temp_file(), None, True, 0,
                                          ",")),
            ir.MatrixAggregate(
                matrix_read,
                ir.MakeStruct([('foo', ir.ApplyAggOp('Collect', [],
                                                     [ir.I32(0)]))])),
            ir.MatrixWrite(
                matrix_read,
                ir.MatrixNativeWriter(new_temp_file(), False, False, "", None,
                                      None)),
            # Same native writer, this time with explicit partition intervals
            # given as a JSON string plus the matching interval array type.
            ir.MatrixWrite(
                matrix_read,
                ir.MatrixNativeWriter(
                    new_temp_file(), False, False, "",
                    '[{"start":{"row_idx":0},"end":{"row_idx": 10},"includeStart":true,"includeEnd":false}]',
                    hl.dtype('array<interval<struct{row_idx:int32}>>'))),
            ir.MatrixWrite(
                matrix_read,
                ir.MatrixVCFWriter(new_temp_file(), None, False, None)),
            ir.MatrixWrite(matrix_read, ir.MatrixGENWriter(new_temp_file(),
                                                           4)),
            ir.MatrixWrite(matrix_read, ir.MatrixPLINKWriter(new_temp_file())),
            ir.MatrixMultiWrite([matrix_read, matrix_read],
                                ir.MatrixNativeMultiWriter(
                                    new_temp_file(), False, False)),
            ir.BlockMatrixWrite(
                block_matrix_read,
                ir.BlockMatrixNativeWriter('fake_file_path', False, False,
                                           False)),
            ir.LiftMeOut(ir.I32(1))
        ]

        return value_irs
Beispiel #33
0
 def _from_java(cls, jtype):
     """Parse ``jtype.toString()`` into a Python type, attach ``jtype`` to it and return it."""
     python_type = hl.dtype(jtype.toString())
     # Record the originating Java object on the parsed type.
     python_type._add_jtype(jtype)
     return python_type
Beispiel #34
0
    lambda rows: rows.info.N_HOMALT,
    'gnomad_svs_ID':
    lambda rows: rows.info.gnomAD_V2_SVID,
    'gnomad_svs_AF':
    lambda rows: rows.info.gnomAD_V2_AF,
    'pos':
    lambda rows: rows.locus.position,
    'filters':
    lambda rows: hl.array(rows.filters.filter(lambda x: x != 'PASS')),
    'xpos':
    lambda rows: get_xpos(rows.locus.contig, rows.locus.position),
    'cpx_intervals':
    lambda rows: hl.if_else(
        hl.is_defined(rows.info.CPX_INTERVALS),
        rows.info.CPX_INTERVALS.map(lambda x: get_cpx_interval(x)),
        hl.missing(hl.dtype(INTERVAL_TYPE))),
}

DERIVED_FIELDS = {
    'xstart':
    lambda rows: rows.xpos,
    'xstop':
    lambda rows: hl.if_else(hl.is_defined(rows.info.END2),
                            get_xpos(rows.info.CHR2, rows.info.END2),
                            get_xpos(rows.locus.contig, rows.info.END)),
    'svType':
    lambda rows: rows.sv_type[0],
    'transcriptConsequenceTerms':
    lambda rows: [rows.sv_type[0]],
    'sv_type_detail':
    lambda rows: hl.if_else(
Beispiel #35
0
 def test_transmute_key(self):
     """``transmute`` keeps ``idx`` while it is the key and drops it once the table is unkeyed."""
     keyed = hl.utils.range_table(10)
     keyed_row_type = keyed.transmute(y=keyed.idx + 2).row.dtype
     self.assertEqual(keyed_row_type, hl.dtype('struct{idx: int32, y: int32}'))

     unkeyed = keyed.key_by()
     unkeyed_row_type = unkeyed.transmute(y=unkeyed.idx + 2).row.dtype
     self.assertEqual(unkeyed_row_type, hl.dtype('struct{y: int32}'))
Beispiel #36
0
def calculate_phenotypes(mt,
                         genotype,
                         beta,
                         h2,
                         popstrat=None,
                         popstrat_var=None,
                         exact_h2=False):
    r"""Calculates phenotypes by multiplying genotypes and betas.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field of genotypes. Call-typed genotypes are converted to the
        number of alternate alleles.
    beta : :class:`.Expression`
        Row field of SNP effects.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability (:math:`h^2`) of simulated trait. Can only be
        ``None`` if running annotation-informed model.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term.
    exact_h2: :obj:`bool`
        Whether to exactly simulate ratio of variance of genetic component of
        phenotype to variance of phenotype to be h2. If `False`, ratio will be
        h2 in expectation. Observed h2 in the simulation will be close to
        expected h2 for large-scale simulations.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.

    Raises
    ------
    ValueError
        If `exact_h2` is ``True`` and `beta` has type ``array<float64>``
        (more than one trait).
    """
    print('calculating phenotype')
    # Normalize h2 to a list: ndarray -> list, scalar -> one-element list.
    h2 = h2.tolist() if type(h2) is np.ndarray else (
        [h2] if type(h2) is not list else h2)
    assert popstrat_var is None or (popstrat_var >=
                                    0), 'popstrat_var must be non-negative'
    # Temporary fields are suffixed with a unique id and removed at the end.
    uid = Env.get_uid(base=100)
    mt = annotate_all(
        mt=mt,
        row_exprs={'beta_' + uid: beta},
        col_exprs={} if popstrat is None else {'popstrat_' + uid: popstrat},
        entry_exprs={
            # Types are compared by equality (as for the beta dtype below)
            # rather than identity, which would depend on type objects
            # being singletons.
            'gt_' + uid:
            genotype.n_alt_alleles()
            if genotype.dtype == hl.dtype('call') else genotype
        })
    # Drop rows whose genotypes have zero standard deviation.
    mt = mt.filter_rows(hl.agg.stats(mt['gt_' + uid]).stdev > 0)
    mt = normalize_genotypes(mt['gt_' + uid])
    if mt['beta_' + uid].dtype == hl.dtype('array<float64>'):  # if >1 traits
        if exact_h2:
            raise ValueError(
                'exact_h2=True not supported for multitrait simulations')
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
                lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' +
                                                                  uid]))
            mt = mt.annotate_cols(
                y=mt.y_no_noise +
                hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        # The exact path is skipped when h2 is 0 or 1.
        if exact_h2 and min([h2[0], 1 - h2[0]]) != 0:
            print('exact h2')
            mt = mt.annotate_cols(**{
                'y_no_noise_' + uid:
                hl.agg.sum(mt['beta_' + uid] * mt['norm_gt'])
            })
            y_no_noise_stdev = mt.aggregate_cols(
                hl.agg.stats(mt['y_no_noise_' + uid]).stdev)
            mt = mt.annotate_cols(
                y_no_noise=hl.sqrt(h2[0]) * mt['y_no_noise_' + uid] /
                y_no_noise_stdev
            )  # normalize genetic component of phenotype to have variance of exactly h2
            mt = mt.annotate_cols(
                **{'noise_' + uid: hl.rand_norm(0, hl.sqrt(1 - h2[0]))})
            noise_stdev = mt.aggregate_cols(
                hl.agg.stats(mt['noise_' + uid]).stdev)
            # Rescale the noise by its observed standard deviation.
            mt = mt.annotate_cols(noise=hl.sqrt(1 - h2[0]) *
                                  mt['noise_' + uid] / noise_stdev)
            mt = mt.annotate_cols(
                y=mt.y_no_noise +
                hl.sqrt(1 - h2[0]) * mt['noise_' + uid] / noise_stdev)
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + uid] *
                                                        mt['norm_gt']))
            mt = mt.annotate_cols(y=mt.y_no_noise +
                                  hl.rand_norm(0, hl.sqrt(1 - h2[0])))
    if popstrat is not None:
        # Scale the stratification term to the requested variance, if any.
        var_factor = 1 if popstrat_var is None else (popstrat_var**(
            1 / 2)) / mt.aggregate_cols(hl.agg.stats(mt['popstrat_' +
                                                        uid])).stdev
        mt = mt.rename({'y': 'y_no_popstrat'})
        mt = mt.annotate_cols(y=mt.y_no_popstrat +
                              mt['popstrat_' + uid] * var_factor)
    mt = _clean_fields(mt, uid)
    return mt
Beispiel #37
0
def simulate_phenotypes(mt,
                        genotype,
                        h2,
                        pi=None,
                        rg=None,
                        annot=None,
                        popstrat=None,
                        popstrat_var=None,
                        exact_h2=False):
    r"""Simulate phenotypes for testing LD score regression.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed. Optionally adds
    population stratification.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field containing genotypes of individuals to be used for the
        simulation. Call-typed genotypes are converted to the number of
        alternate alleles.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability of simulated trait.
    pi : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Probability of SNP being causal when simulating under the spike & slab
        model.
    rg : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Genetic correlation between traits.
    annot : :class:`.Expression`, optional
        Row field to use as our aggregated annotations.
    popstrat: :class:`.Expression`, optional
        Column field to use as our aggregated covariates for adding population
        stratification.
    popstrat_var : :obj:`float` or :obj:`int`, optional
        Variance of population stratification term; passed through to
        ``calculate_phenotypes``.
    exact_h2: :obj:`bool`, optional
        Whether to exactly simulate ratio of variance of genetic component of
        phenotype to variance of phenotype to be h2. If `False`, ratio will be
        h2 in expectation. Observed h2 in the simulation will be close to
        expected h2 for large-scale simulations.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated betas and phenotypes, simulated according
        to specified model. The simulation parameters are recorded in the
        ``ldscsim`` global field.
    """
    # Normalize h2 to a list: ndarray -> list, scalar -> one-element list.
    h2 = h2.tolist() if type(h2) is np.ndarray else (
        [h2] if type(h2) is not list else h2)
    pi = pi.tolist() if type(pi) is np.ndarray else pi
    # Temporary fields are suffixed with a unique id and removed at the end.
    uid = Env.get_uid(base=100)
    mt = annotate_all(
        mt=mt,
        row_exprs={} if annot is None else {'annot_' + uid: annot},
        col_exprs={} if popstrat is None else {'popstrat_' + uid: popstrat},
        entry_exprs={
            # Types are compared by equality rather than identity, which
            # would depend on type objects being singletons.
            'gt_' + uid:
            genotype.n_alt_alleles()
            if genotype.dtype == hl.dtype('call') else genotype
        })
    mt, pi, rg = make_betas(mt=mt,
                            h2=h2,
                            pi=pi,
                            annot=None if annot is None else mt['annot_' +
                                                                uid],
                            rg=rg)
    mt = calculate_phenotypes(
        mt=mt,
        genotype=mt['gt_' + uid],
        beta=mt['beta'],
        h2=h2,
        popstrat=None if popstrat is None else mt['popstrat_' + uid],
        popstrat_var=popstrat_var,
        exact_h2=exact_h2)
    # Record the simulation parameters; optional ones are only included
    # when they were supplied. Field order follows the dict below.
    mt = annotate_all(mt=mt,
                      global_exprs={
                          'ldscsim':
                          hl.struct(
                              **{
                                  'h2':
                                  h2[0] if len(h2) == 1 else h2,
                                  **({} if pi == [None] else {
                                         'pi': pi
                                     }),
                                  **({} if rg == [None] else {
                                         'rg': rg[0] if len(rg) == 1 else rg
                                     }),
                                  **({} if annot is None else {
                                         'is_annot_inf': True
                                     }),
                                  **({} if popstrat is None else {
                                         'is_popstrat_inf': True
                                     }),
                                  **({} if popstrat_var is None else {
                                         'popstrat_var': popstrat_var
                                     }), 'exact_h2':
                                  exact_h2
                              })
                      })
    mt = _clean_fields(mt, uid)
    return mt
Beispiel #38
0
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Reshape a matrix table so that each column is a trio rather than a sample.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----

    The result has one column per trio of `pedigree` (after the pedigree has
    been restricted to the samples present in `dataset`). When
    `complete_trios` is ``True``, only trios that satisfy
    :meth:`.Trio.is_complete` are kept. Columns are keyed by `id`, the sample
    ID of the trio proband.

    Column fields of the result:

    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    Entry fields of the result:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Input matrix table; its first column key field supplies the sample IDs.
    pedigree : :class:`.Pedigree`
    complete_trios : :obj:`bool`
        If ``True``, use ``pedigree.complete_trios()`` instead of
        ``pedigree.trios``.

    Returns
    -------
    :class:`.MatrixTable`
    """
    mt = dataset
    require_col_key_str(mt, "trio_matrix")

    key_name = mt.col_key.dtype.fields[0]
    sample_ids = mt[key_name].collect()

    pedigree = pedigree.filter_to(sample_ids)
    if complete_trios:
        selected_trios = pedigree.complete_trios()
    else:
        selected_trios = pedigree.trios
    n_trios = len(selected_trios)

    # Column position of every sample, so trios can refer to columns by index.
    position_of = {sample: pos for pos, sample in enumerate(sample_ids)}

    def parent_position(parent_id):
        # A trio may lack a parent; that is encoded as a missing index.
        if parent_id is None:
            return None
        return position_of[parent_id]

    trio_records = [
        hl.Struct(id=position_of[trio.s],
                  pat_id=parent_position(trio.pat_id),
                  mat_id=parent_position(trio.mat_id),
                  is_female=trio.is_female,
                  fam_id=trio.fam_id)
        for trio in selected_trios
    ]
    trios_type = hl.dtype(
        'array<struct{id:int,pat_id:int,mat_id:int,is_female:bool,fam_id:str}>'
    )

    trios_sym = Env.get_uid()
    entries_sym = Env.get_uid()
    cols_sym = Env.get_uid()

    # Ship the trio index table as a global, then move entries and columns
    # into array fields so they can be re-indexed per trio.
    with_trios = mt.annotate_globals(
        **{trios_sym: hl.literal(trio_records, trios_type)})
    localized = with_trios._localize_entries(entries_sym, cols_sym)

    def per_trio(table, build):
        # Apply `build` to each trio record stored in the globals of `table`.
        return hl.map(lambda i: hl.bind(build, table[trios_sym][i]),
                      hl.range(0, n_trios))

    def trio_columns(trio):
        cols = localized[cols_sym]
        return hl.struct(id=cols[trio.id][key_name],
                         proband=cols[trio.id],
                         father=cols[trio.pat_id],
                         mother=cols[trio.mat_id],
                         is_female=trio.is_female,
                         fam_id=trio.fam_id)

    trio_cols = localized.annotate_globals(
        **{cols_sym: per_trio(localized, trio_columns)})

    def trio_entries(trio):
        entries = trio_cols[entries_sym]
        return hl.struct(proband_entry=entries[trio.id],
                         father_entry=entries[trio.pat_id],
                         mother_entry=entries[trio.mat_id])

    trio_rows = trio_cols.annotate(
        **{entries_sym: per_trio(trio_cols, trio_entries)})
    trio_rows = trio_rows.drop(trios_sym)

    return trio_rows._unlocalize_entries(entries_sym, cols_sym, ['id'])
Beispiel #39
0
def ht_to_vcf_mt(
    info_ht: hl.Table,
    pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS,
) -> hl.MatrixTable:
    """
    Build a column-less MatrixTable from an info HT so that it can be exported to VCF.

    The following conversions are applied to the fields of ``info_ht.info``:

    - Fields of type int64 or array<int64> are converted to int32 / array<int32>,
      with values larger than the int32 maximum replaced by that maximum
    - Fields named in `pipe_delimited_annotations` (when present) are converted
      from arrays to pipe-delimited strings with a leading "|"
    - `SB`, when present and not already an array of numbers, is flattened by
      extending its first element with its second
    - `AS_SB_TABLE`, when present, becomes a pipe-delimited string of
      comma-delimited groups

    .. note::

        The MT returned has no cols.

    :param info_ht: Input HT
    :param pipe_delimited_annotations: List of info fields (they must be fields of the ht.info Struct)
    :return: MatrixTable ready for VCF export
    """
    int32_max = 2**31 - 1

    def cap_to_int32(value):
        # Clamp from above before narrowing to int32.
        return hl.int32(hl.min(int32_max, value))

    def get_pipe_expr(
            array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
        # Missing elements are rendered as empty strings between the pipes.
        as_strings = array_expr.map(lambda x: hl.or_else(hl.str(x), ""))
        return hl.delimit(as_strings, "|")

    # Make sure the HT is keyed by locus, alleles
    info_ht = info_ht.key_by("locus", "alleles")

    # Convert int64 fields to int32 (int64 isn't supported by VCF)
    for name, field_type in info_ht.info.dtype.items():
        if field_type == hl.dtype("int64"):
            logger.warning(
                f"Coercing field info.{name} from int64 to int32 for VCF output. "
                "Value will be capped at int32 max value."
            )
            capped = cap_to_int32(info_ht.info[name])
        elif field_type == hl.dtype("array<int64>"):
            logger.warning(
                f"Coercing field info.{name} from array<int64> to array<int32> for VCF output. "
                "Array values will be capped at int32 max value."
            )
            capped = info_ht.info[name].map(cap_to_int32)
        else:
            continue
        info_ht = info_ht.annotate(
            info=info_ht.info.annotate(**{name: capped}))

    # Make sure to pipe-delimit fields that need to.
    # Note: the expr needs to be prefixed by "|" because GATK expects one value for the ref (always empty)
    # Note2: this doesn't produce the correct annotation for AS_SB_TABLE, but it is overwritten below
    info_expr = {
        name: "|" + get_pipe_expr(info_ht.info[name])
        for name in pipe_delimited_annotations
        if name in info_ht.info
    }

    # Flatten SB if it is an array of arrays
    sb_needs_flattening = "SB" in info_ht.info and not isinstance(
        info_ht.info.SB, hl.expr.ArrayNumericExpression)
    if sb_needs_flattening:
        strand_counts = info_ht.info.SB
        info_expr["SB"] = strand_counts[0].extend(strand_counts[1])

    if "AS_SB_TABLE" in info_ht.info:
        comma_joined = info_ht.info.AS_SB_TABLE.map(
            lambda x: hl.delimit(x, ","))
        info_expr["AS_SB_TABLE"] = get_pipe_expr(comma_joined)

    # Annotate with new expression and add 's' empty string field required to cast HT to MT
    updated_info = info_ht.info.annotate(**info_expr)
    info_ht = info_ht.annotate(info=updated_info, s=hl.null(hl.tstr))

    # Create an MT with no cols so that we can export to VCF
    info_mt = info_ht.to_matrix_table_row_major(columns=["s"],
                                                entry_field_name="s")
    return info_mt.filter_cols(False)
Beispiel #40
0
 def _compute_type(self):
     """Set ``self._type`` to the table type of the method named in the config.

     The method name is read from ``self.config['name']`` and the input type
     from ``self.child.typ``. Supported names:

     - ``LinearRegressionRowsChained``, ``LinearRegressionRowsSingle``,
       ``LogisticRegression``, ``PoissonRegression``: globals and key of the
       child are kept; rows are the child's row key, plus the fields listed
       in ``self.config['passThrough']``, plus the method's result fields.
     - ``Skat``: no globals; rows keyed by ``id``, whose type is that of the
       child row field named by ``self.config['keyField']``.
     - ``PCA``: globals hold ``eigenvalues`` and ``scores``; rows are the
       child's row key plus ``loadings``.
     - ``LocalLDPrune``: no globals; rows are the child's row key plus
       ``mean`` and ``centered_length_rec``.

     Any other name fails the final assertion.
     """
     name = self.config['name']
     child_typ = self.child.typ

     # These methods share one output shape and differ only in the result
     # fields appended after the key and pass-through fields.
     row_regressions = (
         'LinearRegressionRowsChained',
         'LinearRegressionRowsSingle',
         'LogisticRegression',
         'PoissonRegression',
     )
     if name in row_regressions:
         pass_through = self.config['passThrough']
         if name == 'LinearRegressionRowsChained':
             result_fields = hl.dtype(
                 'struct{n:array<int32>,sum_x:array<float64>,y_transpose_x:array<array<float64>>,beta:array<array<float64>>,standard_error:array<array<float64>>,t_stat:array<array<float64>>,p_value:array<array<float64>>}')
         elif name == 'LinearRegressionRowsSingle':
             result_fields = hl.dtype(
                 'struct{n:int32,sum_x:float64,y_transpose_x:array<float64>,beta:array<float64>,standard_error:array<float64>,t_stat:array<float64>,p_value:array<float64>}')
         elif name == 'LogisticRegression':
             per_test = regression_test_type(self.config['test'])
             result_fields = hl.tstruct(logistic_regression=hl.tarray(per_test))
         else:
             # Only 'PoissonRegression' remains in this group.
             result_fields = regression_test_type(self.config['test'])

         carried = {field: child_typ.row_type[field] for field in pass_through}
         key_and_carried = child_typ.row_key_type._insert_fields(**carried)
         self._type = hl.ttable(
             child_typ.global_type,
             key_and_carried._concat(result_fields),
             child_typ.row_key)
         return

     if name == 'Skat':
         key_field = self.config['keyField']
         key_type = child_typ.row_type[key_field]
         skat_type = hl.dtype(f'struct{{id:{key_type},size:int32,q_stat:float64,p_value:float64,fault:int32}}')
         self._type = hl.ttable(hl.tstruct(), skat_type, ['id'])
         return

     if name == 'PCA':
         float_array = hl.tarray(hl.tfloat64)
         score_row = child_typ.col_key_type._insert_field('scores', hl.tarray(hl.tfloat64))
         pca_globals = hl.tstruct(eigenvalues=float_array,
                                  scores=hl.tarray(score_row))
         pca_rows = child_typ.row_key_type._insert_field('loadings', dtype('array<float64>'))
         self._type = hl.ttable(pca_globals, pca_rows, child_typ.row_key)
         return

     assert name == 'LocalLDPrune', name
     pruned_rows = child_typ.row_key_type._insert_fields(mean=hl.tfloat64, centered_length_rec=hl.tfloat64)
     self._type = hl.ttable(hl.tstruct(), pruned_rows, list(child_typ.row_key))