Example #1
def make_betas(mt,
               h2=None,
               pi=1,
               is_annot_inf=False,
               annot_coef_dict=None,
               annot_regex=None,
               h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed'''
    check_beta_args(h2=h2,
                    pi=pi,
                    is_annot_inf=is_annot_inf,
                    annot_coef_dict=annot_coef_dict,
                    annot_regex=annot_regex,
                    h2_normalize=h2_normalize)
    M = mt.count_rows()
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
            'h2-normalized' if h2_normalize else '', '(default coef: 1)'
            if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt, coef_dict=annot_coef_dict, regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        return mt1.annotate_rows(
            __beta=hl.rand_norm(
                0,
                hl.sqrt(mt1.__agg_annot *
                        (h2 / annot_sum if h2_normalize else 1)))
        )  # if h2_normalize: scale variance of betas to be h2, else: keep unscaled variance
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(
            ('infinitesimal' if pi == 1 else 'spike & slab'), h2))
        mt1 = mt.annotate_globals(__h2=none_to_null(h2), __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) *
                                 hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
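A minimal usage sketch, not part of the original listing: it assumes the ldscsim helpers above are importable and uses Hail's Balding-Nichols simulator as a toy dataset (all names and numbers are illustrative).

import hail as hl

mt = hl.balding_nichols_model(n_populations=1, n_samples=100, n_variants=1000)
mt_ss = make_betas(mt, h2=0.5, pi=0.1)   # spike & slab: ~10% of variants causal
mt_inf = make_betas(mt, h2=0.5)          # infinitesimal model (pi defaults to 1)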
Example #2
File: ldscsim.py Project: zscu/hail
def calculate_phenotypes(mt,
                         genotype,
                         beta,
                         h2,
                         popstrat=None,
                         popstrat_var=None):
    """Calculates phenotypes by multiplying genotypes and betas.
    
    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression`
        Entry field of genotypes. 
    beta : :class:`.Expression`
        Row field of SNP effects.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`
        SNP-based heritability (:math:`h^2`) of simulated trait. Can only be 
        ``None`` if running annotation-informed model.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term.
        
    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.
    """
    assert popstrat_var is None or (popstrat_var >=
                                    0), 'popstrat_var must be non-negative'
    tid = ''.join(
        random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5)
    )  # "temporary id" -- random string to identify temporary intermediate fields generated by this method
    mt = annotate_all(
        mt=mt,
        row_exprs={'beta_' + tid: beta},
        col_exprs={} if popstrat is None else {'popstrat_' + tid: popstrat},
        entry_exprs={'gt_' + tid: genotype})
    mt = normalize_genotypes(mt['gt_' + tid])
    if mt['beta_' + tid].dtype == dtype('array<float64>'):  # if simulating >1 trait
        h2 = h2 if type(h2) is list else [h2]
        mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
            lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' + tid]))
        mt = mt.annotate_cols(
            y=mt.y_no_noise +
            hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + tid] *
                                                    mt['norm_gt']))
        mt = mt.annotate_cols(y=mt.y_no_noise +
                              hl.rand_norm(0, hl.sqrt(1 - h2)))
    if popstrat is not None:
        var_factor = 1 if popstrat_var is None else (popstrat_var**(
            1 / 2)) / mt.aggregate_cols(hl.agg.stats(mt['popstrat_' +
                                                        tid])).stdev
        mt = mt.annotate_cols(y_w_popstrat=mt.y +
                              mt['popstrat_' + tid] * var_factor)
    mt = _clean_fields(mt, tid)
    return mt
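A hedged usage sketch for this version (assumes `mt` carries a `GT` call field and a row field of simulated effects, here called `beta`; both names are assumptions):

mt = calculate_phenotypes(mt,
                          genotype=mt.GT.n_alt_alleles(),
                          beta=mt.beta,
                          h2=0.5)
mt.cols().select('y').show()  # simulated phenotype per sample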
Example #3
def sim_phenotypes(mt, h2, popstrat=None, popstrat_c=None):
    mt1 = mt.annotate_cols(__y_no_noise = hl.agg.sum(mt.__beta * mt.__norm_gt))
    mt2 = mt1.annotate_cols(__y = mt1.__y_no_noise + hl.rand_norm(0,hl.sqrt(1-h2)))
    if popstrat is not None:
        return add_pop_strat(mt2, popstrat, popstrat_c)
    else:
        return mt2
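The sqrt(1 - h2) noise scale recurs throughout these examples: the genetic component __y_no_noise is constructed to have variance h2, so adding environmental noise with variance 1 - h2 yields a unit-variance trait. A quick pure-Python check of that decomposition (made-up sample size):

import numpy as np

rng = np.random.default_rng(0)
h2 = 0.5
g = rng.normal(0.0, np.sqrt(h2), size=100_000)      # genetic component, Var ~ h2
e = rng.normal(0.0, np.sqrt(1 - h2), size=100_000)  # environmental noise, Var ~ 1 - h2
assert abs(np.var(g + e) - 1.0) < 0.02              # total variance ~ 1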
Example #4
def calculate_phenotypes(mt, genotype, h2, beta, is_popstrat=False, cov_coef_dict=None,
                         cov_regex=None):
    '''Calculates phenotypes given betas and genotypes. Adding population stratification is optional'''
    check_mt_sources(mt,genotype,beta)
    check_popstrat_args(is_popstrat=is_popstrat,cov_coef_dict=cov_coef_dict,cov_regex=cov_regex)
    mt1 = mt._annotate_all(row_exprs={'__beta':beta},
                           entry_exprs={'__gt':genotype},
                           global_exprs={'__is_popstrat':is_popstrat,
                                         '__cov_coef_dict':none_to_null(cov_coef_dict),
                                         '__cov_regex':none_to_null(cov_regex)})
    mt2 = normalize_genotypes(mt1.__gt)
    print('\rCalculating phenotypes{}...'.format(' w/ population stratification' if is_popstrat else '').ljust(81))
    mt3 = mt2.annotate_cols(__y_no_noise = hl.agg.sum(mt2.__beta * mt2.__norm_gt))
    if h2 is None:
        h2 = mt3.aggregate_cols(hl.agg.stats(mt3.__y_no_noise)).stdev**2
        if h2 > 1:
            print(f'WARNING: Total SNP-based h2 = {h2} (>1)')
            print('Not adding environmental noise')
            h2=1
    mt4 = mt3.annotate_cols(__y = mt3.__y_no_noise + hl.rand_norm(0,hl.sqrt(1-h2)))            
    if is_popstrat:
        return add_popstrat(mt4, 
                             y=mt4.__y, 
                             cov_coef_dict=cov_coef_dict,
                             cov_regex=cov_regex)
    else:
        return mt4
Example #5
def make_betas(mt, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None, annot_regex=None, h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed'''  
    check_beta_args(h2=h2,pi=pi,is_annot_inf=is_annot_inf,annot_coef_dict=annot_coef_dict,
                    annot_regex=annot_regex,h2_normalize=h2_normalize)
    M = mt.count_rows()
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
                'h2-normalized' if h2_normalize else '',
                '(default coef: 1)' if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt,coef_dict=annot_coef_dict,regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        return mt1.annotate_rows(__beta = hl.rand_norm(0, hl.sqrt(mt1.__agg_annot*(h2/annot_sum if h2_normalize else 1)))) # if h2_normalize: scale variance of betas to be h2, else: keep unscaled variance
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(('infinitesimal' if pi == 1 else 'spike & slab'),h2))
        mt1 = mt.annotate_globals(__h2 = none_to_null(h2), __pi = pi)
        return mt1.annotate_rows(__beta = hl.rand_bool(pi)*hl.rand_norm(0,hl.sqrt(h2/(M*pi))))
Example #6
def pc_project(
    mt: hl.MatrixTable,
    loadings_ht: hl.Table,
    loading_location: str = "loadings",
    af_location: str = "pca_af",
) -> hl.Table:
    """
    Project samples in `mt` on pre-computed PCs.

    :param mt: MT containing the samples to project
    :param loadings_ht: HT containing the PCA loadings and allele frequencies used for the PCA
    :param loading_location: Location of expression for loadings in `loadings_ht`
    :param af_location: Location of expression for allele frequency in `loadings_ht`
    :return: Table with scores calculated from loadings in column `scores`
    """
    n_variants = loadings_ht.count()

    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key][loading_location],
        pca_af=loadings_ht[mt.row_key][af_location],
    )

    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings)
        & hl.is_defined(mt.pca_af)
        & (mt.pca_af > 0)
        & (mt.pca_af < 1))

    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt(
        n_variants * 2 * mt.pca_af * (1 - mt.pca_af))

    mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm))

    return mt.cols().select("scores")
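A hedged sketch of preparing `loadings_ht` and projecting. As in Example #13's docstring, the allele frequency must be attached to the loadings table; `ref_mt` and `new_mt` are placeholder names, and the field names match this function's defaults:

_, _, loadings_ht = hl.hwe_normalized_pca(ref_mt.GT, k=10, compute_loadings=True)
ref_mt = ref_mt.annotate_rows(pca_af=hl.agg.mean(ref_mt.GT.n_alt_alleles()) / 2)
loadings_ht = loadings_ht.annotate(pca_af=ref_mt.rows()[loadings_ht.key].pca_af)
scores_ht = pc_project(new_mt, loadings_ht)  # project the new samples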
Example #7
def pc_hwe_gt(
    mt: hl.MatrixTable,
    loadings_ht: hl.Table,
    loading_location: str = "loadings",
    af_location: str = "pca_af",
) -> hl.MatrixTable:
    n_variants = loadings_ht.count()

    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key][loading_location],
        pca_af=loadings_ht[mt.row_key][af_location],
    )

    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings)
        & hl.is_defined(mt.pca_af)
        & (mt.pca_af > 0)
        & (mt.pca_af < 1)
    )

    # Attach normalized entries to be used in projection
    mt = mt.annotate_entries(
        GTN=(mt.GT.n_alt_alleles() - 2 * mt.pca_af)
        / hl.sqrt(n_variants * 2 * mt.pca_af * (1 - mt.pca_af))
    )

    return mt
Example #8
def sim_phenotypes(mt, genotype, h2, beta, popstrat=None, popstrat_s2=1):
    '''Simulate phenotypes given betas and genotypes. Adding population stratification is optional'''
    print('\rCalculating phenotypes{}...'.format(
        '' if popstrat is None else ' w/ population stratification').ljust(81))
    if popstrat is None:
        mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                               entry_exprs={'__gt': genotype})
    else:
        mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                               col_exprs={'__popstrat': popstrat},
                               entry_exprs={'__gt': genotype},
                               global_exprs={'__popstrat_s2': popstrat_s2})

    mt2 = normalize_genotypes(mt1, mt1.__gt)
    mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta *
                                                    mt2.__norm_gt))
    mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise +
                            hl.rand_norm(0, hl.sqrt(1 - h2)))

    if popstrat is None:
        return mt4
    else:
        return add_pop_strat(mt4,
                             y=mt4.__y,
                             popstrat=mt4.__popstrat,
                             popstrat_s2=hl.eval(mt4.__popstrat_s2))
Example #9
def make_betas(mt, h2, pi=1, annot=None):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed'''
    M = mt.count_rows()
    if annot is not None:
        print('\rSimulating annotation-informed betas w/ h2 = {}'.format(h2))
        mt1 = mt._annotate_all(row_exprs={'__annot': annot},
                               global_exprs={'__h2': h2})
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__annot))
        return mt1.annotate_rows(
            __beta=hl.rand_norm(0, hl.sqrt(mt1.__annot / annot_sum * h2)))
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(
            ('infinitesimal' if pi == 1 else 'spike & slab'), h2))
        mt1 = mt.annotate_globals(__h2=h2, __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) *
                                 hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
Example #10
def hwe_normalize(call_expr):
    mt = matrix_table_source('hwe_normalize/call_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                          __n_called=agg.count_where(hl.is_defined(mt.__gt)))
    mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

    n_variants = mt.count_rows()
    if n_variants == 0:
        raise FatalError(
            "hwe_normalize: found 0 variants after filtering out monomorphic sites."
        )
    info(
        f"hwe_normalize: found {n_variants} variants after filtering out monomorphic sites."
    )

    mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
    mt = mt.annotate_rows(__hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt *
                                                       (2 - mt.__mean_gt) *
                                                       n_variants / 2))
    mt = mt.unfilter_entries()

    normalized_gt = hl.or_else(
        (mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0)
    return normalized_gt
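Why that scale: with alternate-allele frequency p = __mean_gt / 2, the HWE genotype variance is 2p(1 - p) = __mean_gt * (2 - __mean_gt) / 2, so dividing by __hwe_scaled_std_dev leaves entries with variance 1 / n_variants and the GRM (X @ X.T) has unit diagonal in expectation. A small numeric check of the identity:

import numpy as np

p = 0.3           # made-up alternate-allele frequency
mean_gt = 2 * p
assert np.isclose(2 * p * (1 - p), mean_gt * (2 - mean_gt) / 2)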
Example #11
def make_betas(mt, h2, pi=1, annot=None):
    M = mt.count_rows()
    if annot is not None:
        annot_stats = mt.aggregate_rows(hl.agg.stats(mt.__annot), _localize=True)
        return mt.annotate_rows(__beta = hl.rand_norm(0, (mt.__annot - annot_stats.mean) / annot_stats.stdev * hl.sqrt(h2 / M)))
    else:
        return mt.annotate_rows(__beta = hl.rand_bool(pi)*hl.rand_norm(0,hl.sqrt(h2/(M*pi))))
Example #12
    def blockmatrix_irs(self):
        scalar_ir = ir.F64(2)
        vector_ir = ir.MakeArray([ir.F64(3), ir.F64(2)],
                                 hl.tarray(hl.tfloat64))

        read = ir.BlockMatrixRead(
            ir.BlockMatrixNativeReader(resource('blockmatrix_example/0')))
        add_two_bms = ir.BlockMatrixMap2(
            read, read, 'l', 'r',
            ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r')), "Union")
        negate_bm = ir.BlockMatrixMap(
            read, 'element', ir.ApplyUnaryPrimOp('-', ir.Ref('element')),
            False)
        sqrt_bm = ir.BlockMatrixMap(
            read, 'element',
            hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir, False)
        persisted = ir.BlockMatrixRead(ir.BlockMatrixPersistReader('x', read))

        scalar_to_bm = ir.ValueToBlockMatrix(scalar_ir, [1, 1], 1)
        col_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [2, 1], 1)
        row_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [1, 2], 1)
        broadcast_scalar = ir.BlockMatrixBroadcast(scalar_to_bm, [], [2, 2],
                                                   256)
        broadcast_col = ir.BlockMatrixBroadcast(col_vector_to_bm, [0], [2, 2],
                                                256)
        broadcast_row = ir.BlockMatrixBroadcast(row_vector_to_bm, [1], [2, 2],
                                                256)
        transpose = ir.BlockMatrixBroadcast(broadcast_scalar, [1, 0], [2, 2],
                                            256)
        matmul = ir.BlockMatrixDot(broadcast_scalar, transpose)

        rectangle = ir.Literal(hl.tarray(hl.tint64), [0, 1, 5, 6])
        band = ir.Literal(hl.ttuple(hl.tint64, hl.tint64), (-1, 1))
        intervals = ir.Literal(
            hl.ttuple(hl.tarray(hl.tint64), hl.tarray(hl.tint64)),
            ([0, 1, 5, 6], [5, 6, 8, 9]))

        sparsify1 = ir.BlockMatrixSparsify(read, rectangle,
                                           ir.RectangleSparsifier)
        sparsify2 = ir.BlockMatrixSparsify(read, band, ir.BandSparsifier(True))
        sparsify3 = ir.BlockMatrixSparsify(read, intervals,
                                           ir.RowIntervalSparsifier(True))

        densify = ir.BlockMatrixDensify(read)

        pow_ir = (construct_expr(ir.Ref('l'), hl.tfloat64)**construct_expr(
            ir.Ref('r'), hl.tfloat64))._ir
        squared_bm = ir.BlockMatrixMap2(scalar_to_bm, scalar_to_bm, 'l', 'r',
                                        pow_ir, "NeedsDense")
        slice_bm = ir.BlockMatrixSlice(
            matmul, [slice(0, 2, 1), slice(0, 1, 1)])

        return [
            read, persisted, add_two_bms, negate_bm, sqrt_bm, scalar_to_bm,
            col_vector_to_bm, row_vector_to_bm, broadcast_scalar,
            broadcast_col, broadcast_row, squared_bm, transpose, sparsify1,
            sparsify2, sparsify3, densify, matmul, slice_bm
        ]
Example #13
def pc_project(call_expr, loadings_expr, af_expr):
    """Projects genotypes onto pre-computed PCs. Requires loadings and
    allele-frequency from a reference dataset (see example). Note that
    `loadings_expr` must have no missing data and reflect the rows
    from the original PCA run for this method to be accurate.

    Example
    -------
    >>> # Compute loadings and allele frequency for reference dataset
    >>> _, _, loadings_ht = hl.hwe_normalized_pca(mt.GT, k=10, compute_loadings=True)   # doctest: +SKIP
    >>> mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)                # doctest: +SKIP
    >>> loadings_ht = loadings_ht.annotate(af=mt.rows()[loadings_ht.key].af)            # doctest: +SKIP
    >>> # Project new genotypes onto loadings
    >>> ht = pc_project(mt_to_project.GT, loadings_ht.loadings, loadings_ht.af)         # doctest: +SKIP

    Parameters
    ----------
    call_expr : :class:`.CallExpression`
        Entry-indexed call expression for genotypes
        to project onto loadings.
    loadings_expr : :class:`.ArrayNumericExpression`
        Location of expression for loadings
    af_expr : :class:`.Float64Expression`
        Location of expression for allele frequency

    Returns
    -------
    :class:`.Table`
        Table with scores calculated from loadings in column `scores`
    """
    check_entry_indexed('pc_project', call_expr)
    check_row_indexed('pc_project', loadings_expr)
    check_row_indexed('pc_project', af_expr)

    gt_source = call_expr._indices.source
    loadings_source = loadings_expr._indices.source
    af_source = af_expr._indices.source

    loadings_expr = _get_expr_or_join(loadings_expr, loadings_source, gt_source, '_loadings')
    af_expr = _get_expr_or_join(af_expr, af_source, gt_source, '_af')

    mt = gt_source._annotate_all(row_exprs={'_loadings': loadings_expr, '_af': af_expr},
                                 entry_exprs={'_call': call_expr})

    if isinstance(loadings_source, hl.MatrixTable):
        n_variants = loadings_source.count_rows()
    else:
        n_variants = loadings_source.count()

    mt = mt.filter_rows(hl.is_defined(mt._loadings) & hl.is_defined(mt._af) & (mt._af > 0) & (mt._af < 1))

    gt_norm = (mt._call.n_alt_alleles() - 2 * mt._af) / hl.sqrt(n_variants * 2 * mt._af * (1 - mt._af))

    return mt.select_cols(scores=hl.agg.array_sum(mt._loadings * gt_norm)).cols()
Example #14
 def make_random_function(self, mt):
     from functools import reduce
     #check that row key of annotations matches row key of mt
     mt = mt.add_row_index()
     rows = [rf for rf in self.a_ht.row]
     self.a_ht = self.a_ht.annotate(__a__=reduce(
         self.f, map(lambda x: self.a_ht[rows[x]], range(len(rows)))))
     std = self.a_ht.aggregate(hl.agg.stats(self.a_ht.__a__)).stdev
     self.a_ht = self.a_ht.annotate(__a__=self.a_ht.__a__ *
                                    hl.sqrt(self.h2 / std))
     return mt.annotate_rows(beta=hl.literal(
         self.a_ht.__a__.take(mt.count_rows()))[hl.int32(mt.row_idx)])
Example #15
 def spectral_moments(self, num_moments, R):
     eigval_powers = hl.nd.vstack([
         self.S.map(lambda x: x**(2 * i))
         for i in range(1, num_moments + 1)
     ])
     moments = eigval_powers @ (
         self.V1t[:, :self.k] @ R).map(lambda x: x**2)
     means = moments.sum(1) / self.k
     variances = (moments - means.reshape(
         -1, 1)).map(lambda x: x**2).sum(1) / (self.k - 1)
     stdevs = variances.map(lambda x: hl.sqrt(x))
     return hl.struct(moments=means, stdevs=stdevs)
Example #16
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Example #17
def pc_project(
        # reference: https://github.com/macarthur-lab/gnomad_hail/blob/master/utils/generic.py#L131
        mt: hl.MatrixTable,
        loadings_ht: hl.Table,
        loading_location: str = "loadings",
        af_location: str = "pca_af") -> hl.Table:
    n_variants = loadings_ht.count()
    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key][loading_location],
        pca_af=loadings_ht[mt.row_key][af_location])
    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af)
        & (mt.pca_af > 0) & (mt.pca_af < 1))
    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt(
        n_variants * 2 * mt.pca_af * (1 - mt.pca_af))
    mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm))
    return mt.cols().select('scores')
Example #18
def sim_corr_phenotypes(mt, cov_array):
    h2_ls = np.diag(cov_array)
    n_phens = len(h2_ls)
    for i in range(n_phens):
        mt = mt._annotate_all(
            col_exprs={
                f'__y_no_noise_{i}': hl.agg.sum(mt[f'__beta_{i}'] *
                                                mt.__norm_gt)
            })
    for i in range(n_phens):
        mt = mt._annotate_all(
            col_exprs={
                f'__y_{i}':
                mt[f'__y_no_noise_{i}'] +
                hl.rand_norm(0, hl.sqrt(1 - h2_ls[i]))
            })
    return mt
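A hedged usage sketch: `cov_array` is a genetic covariance matrix whose diagonal holds each trait's h2, and the MT is assumed to already carry __beta_0, __beta_1, ... and __norm_gt fields (e.g. from a multi-trait make_betas plus normalize_genotypes):

import numpy as np

cov_array = np.array([[0.4, 0.1],
                      [0.1, 0.6]])   # two traits with h2 = 0.4 and 0.6
mt = sim_corr_phenotypes(mt, cov_array)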
Example #19
def metaanalyze_gwas(subsets, gwas_ht_list, sim_name, param_suffix, wd):

    if len(gwas_ht_list) == 1:  # if list is single GWAS, don't meta-analyze
        return gwas_ht_list[0]

    sample_ct_dict = {}

    for subset_idx, tmp_gwas_ht in enumerate(gwas_ht_list, 1):
        sample_ct = subsets.filter(subsets.subset_idx == subset_idx).count()
        sample_ct_dict[subset_idx] = sample_ct
        print(
            f'\n\nmeta-analysis sample count subset {subset_idx}: {sample_ct}\n\n'
        )

    comb_gwas_ht = gwas_ht_list[0].annotate(subset_idx=1, n=sample_ct_dict[1])
    union_args = [
        ht.annotate(subset_idx=subset_idx, n=sample_ct_dict[subset_idx])
        for subset_idx, ht in enumerate(gwas_ht_list[1:], 2)
    ]  # list of gwas_ht's to join
    comb_gwas_ht = comb_gwas_ht.union(*union_args)

    comb_gwas_ht = comb_gwas_ht.annotate(w=1 /
                                         (comb_gwas_ht['standard_error']**2))

    agg_expr = {
        'meta_se':
        hl.sqrt(1 / (hl.agg.sum(comb_gwas_ht.w))),
        'meta_beta':
        hl.agg.sum(comb_gwas_ht['beta'] * comb_gwas_ht.w) /
        hl.agg.sum(comb_gwas_ht.w),
        'meta_EAF':
        hl.agg.sum(comb_gwas_ht['EAF'] * comb_gwas_ht['n']) /
        hl.agg.sum(comb_gwas_ht['n'])
    }

    comb_gwas_ht = comb_gwas_ht.group_by('locus',
                                         'alleles').aggregate(**agg_expr)

    comb_gwas_ht = comb_gwas_ht.annotate(
        meta_pval=2 *
        hl.pnorm(-hl.abs(comb_gwas_ht.meta_beta / comb_gwas_ht.meta_se)))

    meta_gwas_path = f'{wd}/gwas.logreg.{sim_name}.{param_suffix}.tsv.gz'
    comb_gwas_ht.export(meta_gwas_path)
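The aggregation implements standard inverse-variance meta-analysis: each study's weight is 1/SE^2, the meta beta is the weighted mean of the study betas, and the meta SE is sqrt(1 / sum of weights). A pure-Python miniature with made-up numbers:

betas, ses = [0.12, 0.08], [0.05, 0.04]
w = [1 / se ** 2 for se in ses]
meta_beta = sum(b * wi for b, wi in zip(betas, w)) / sum(w)
meta_se = (1 / sum(w)) ** 0.5
print(meta_beta, meta_se)  # meta_se is smaller than either input SE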
Example #20
def calculate_phenotypes(mt,
                         genotype,
                         h2,
                         beta,
                         is_popstrat=False,
                         cov_coef_dict=None,
                         cov_regex=None,
                         normalize_gt=True):
    '''Simulate phenotypes given betas and genotypes. Adding population stratification is optional'''
    check_mt_sources(mt, genotype, beta)
    check_popstrat_args(is_popstrat=is_popstrat,
                        cov_coef_dict=cov_coef_dict,
                        cov_regex=cov_regex)
    mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                           entry_exprs={'__gt': genotype},
                           global_exprs={
                               '__is_popstrat': is_popstrat,
                               '__cov_coef_dict': none_to_null(cov_coef_dict),
                               '__cov_regex': none_to_null(cov_regex)
                           })
    if normalize_gt:
        mt2 = normalize_genotypes(mt1.__gt)
    else:
        mt2 = mt1.annotate_entries(__norm_gt=mt1.__gt)
    print('\rCalculating phenotypes{}...'.format(
        ' w/ population stratification' if is_popstrat else '').ljust(81))
    mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta *
                                                    mt2.__norm_gt))
    if h2 is None:
        h2 = mt3.aggregate_cols(hl.agg.stats(mt3.__y_no_noise)).stdev**2
        if h2 > 1:
            print(f'WARNING: Total SNP-based h2 = {h2} (>1)')
            print('Not adding environmental noise')
            h2 = 1
    mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise +
                            hl.rand_norm(0, hl.sqrt(1 - h2)))
    if is_popstrat:
        return add_popstrat(mt4,
                            y=mt4.__y,
                            cov_coef_dict=cov_coef_dict,
                            cov_regex=cov_regex)
    else:
        return mt4
Example #21
    def blockmatrix_irs(self):
        scalar_ir = ir.F64(2)
        vector_ir = ir.MakeArray([ir.F64(3), ir.F64(2)],
                                 hl.tarray(hl.tfloat64))

        read = ir.BlockMatrixRead(
            ir.BlockMatrixNativeReader(resource('blockmatrix_example/0')))
        add_two_bms = ir.BlockMatrixMap2(
            read, read, 'l', 'r',
            ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r')))
        negate_bm = ir.BlockMatrixMap(
            read, 'element', ir.ApplyUnaryPrimOp('-', ir.Ref('element')))
        sqrt_bm = ir.BlockMatrixMap(
            read, 'element',
            hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir)

        scalar_to_bm = ir.ValueToBlockMatrix(scalar_ir, [1, 1], 1)
        col_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [2, 1], 1)
        row_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [1, 2], 1)
        broadcast_scalar = ir.BlockMatrixBroadcast(scalar_to_bm, [], [2, 2],
                                                   256)
        broadcast_col = ir.BlockMatrixBroadcast(col_vector_to_bm, [0], [2, 2],
                                                256)
        broadcast_row = ir.BlockMatrixBroadcast(row_vector_to_bm, [1], [2, 2],
                                                256)
        transpose = ir.BlockMatrixBroadcast(broadcast_scalar, [1, 0], [2, 2],
                                            256)
        matmul = ir.BlockMatrixDot(broadcast_scalar, transpose)

        pow_ir = (construct_expr(ir.Ref('l'), hl.tfloat64)**construct_expr(
            ir.Ref('r'), hl.tfloat64))._ir
        squared_bm = ir.BlockMatrixMap2(scalar_to_bm, scalar_to_bm, 'l', 'r',
                                        pow_ir)
        slice_bm = ir.BlockMatrixSlice(
            matmul, [slice(0, 2, 1), slice(0, 1, 1)])

        return [
            read, add_two_bms, negate_bm, sqrt_bm, scalar_to_bm,
            col_vector_to_bm, row_vector_to_bm, broadcast_scalar,
            broadcast_col, broadcast_row, squared_bm, transpose, matmul,
            slice_bm
        ]
Example #22
def _make_tsm_from_call(call_expr,
                        block_size,
                        mean_center=False,
                        hwe_normalize=False):
    mt = matrix_table_source('_make_tsm/entry_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    if mean_center or hwe_normalize:
        mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                              __n_called=agg.count_where(hl.is_defined(
                                  mt.__gt)))
        mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

        n_variants = mt.count_rows()
        if n_variants == 0:
            raise FatalError(
                "_make_tsm: found 0 variants after filtering out monomorphic sites."
            )
        info(
            f"_make_tsm: found {n_variants} variants after filtering out monomorphic sites."
        )

        mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
        mt = mt.unfilter_entries()

        mt = mt.select_entries(__x=hl.or_else(mt.__gt - mt.__mean_gt, 0.0))

        if hwe_normalize:
            mt = mt.annotate_rows(
                __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt *
                                             (2 - mt.__mean_gt) * n_variants /
                                             2))
            mt = mt.select_entries(__x=mt.__x / mt.__hwe_scaled_std_dev)
    else:
        mt = mt.select_entries(__x=mt.__gt)

    A, ht = mt_to_table_of_ndarray(mt.__x,
                                   block_size,
                                   return_checkpointed_table_also=True)
    A = A.persist()
    return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
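A hedged usage sketch (this private helper feeds Hail's randomized PCA internals; the call field and block size are illustrative):

tsm = _make_tsm_from_call(mt.GT, block_size=128, hwe_normalize=True)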
Example #23
def pc_project(mt: hl.MatrixTable,
               pc_loadings: hl.Table,
               loading_location: str = "loadings",
               af_location: str = "pca_af") -> hl.MatrixTable:
    """
    Projects samples in `mt` onto pre-computed PCs.
    :param MatrixTable mt: MT containing the samples to project
    :param Table pc_loadings: Table containing the PC loadings for the variants
    :param str loading_location: Location of expression for loadings in `pc_loadings`
    :param str af_location: Location of expression for allele frequency in `pc_loadings`
    :return: MT with scores calculated from loadings
    """
    n_variants = mt.count_rows()

    mt = mt.annotate_rows(**pc_loadings[mt.locus, mt.alleles])
    mt = mt.filter_rows(
        hl.is_defined(mt[loading_location]) & hl.is_defined(mt[af_location])
        & (mt[af_location] > 0) & (mt[af_location] < 1))

    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt[af_location]) / hl.sqrt(
        n_variants * 2 * mt[af_location] * (1 - mt[af_location]))
    return mt.annotate_cols(pca_scores=hl.agg.array_sum(mt[loading_location] *
                                                        gt_norm))
Example #24
    def blockmatrix_irs(self):
        scalar_ir = ir.F64(2)
        vector_ir = ir.MakeArray([ir.F64(3), ir.F64(2)], hl.tarray(hl.tfloat64))

        read = ir.BlockMatrixRead(ir.BlockMatrixNativeReader(resource('blockmatrix_example/0')))
        add_two_bms = ir.BlockMatrixMap2(read, read, ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r')))
        negate_bm = ir.BlockMatrixMap(read, ir.ApplyUnaryPrimOp('-', ir.Ref('element')))
        sqrt_bm = ir.BlockMatrixMap(read, hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir)

        scalar_to_bm = ir.ValueToBlockMatrix(scalar_ir, [1, 1], 1)
        col_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [2, 1], 1)
        row_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [1, 2], 1)
        broadcast_scalar = ir.BlockMatrixBroadcast(scalar_to_bm, [], [2, 2], 256)
        broadcast_col = ir.BlockMatrixBroadcast(col_vector_to_bm, [0], [2, 2], 256)
        broadcast_row = ir.BlockMatrixBroadcast(row_vector_to_bm, [1], [2, 2], 256)
        transpose = ir.BlockMatrixBroadcast(broadcast_scalar, [1, 0], [2, 2], 256)
        matmul = ir.BlockMatrixDot(broadcast_scalar, transpose)

        pow_ir = (construct_expr(ir.Ref('l'), hl.tfloat64) ** construct_expr(ir.Ref('r'), hl.tfloat64))._ir
        squared_bm = ir.BlockMatrixMap2(scalar_to_bm, scalar_to_bm, pow_ir)

        return [
            read,
            add_two_bms,
            negate_bm,
            sqrt_bm,
            scalar_to_bm,
            col_vector_to_bm,
            row_vector_to_bm,
            broadcast_scalar,
            broadcast_col,
            broadcast_row,
            squared_bm,
            transpose,
            matmul
        ]
Example #25
def pc_project(mt,
               loadings_ht,
               loading_location="loadings",
               af_location="pca_af"):
    """
    Projects samples in `mt` on pre-computed PCs.
    :param MatrixTable mt: MT containing the samples to project into previously calculated PCs
    :param Table loadings_ht: HT containing the PCA loadings and allele frequencies used for the PCA
    :param str loading_location: Location of expression for loadings in `loadings_ht`
    :param str af_location: Location of expression for allele frequency in `loadings_ht`
    :return: Hail Table with scores calculated from loadings in column `scores`
    :rtype: Table

    From Konrad Karczewski
    """
    n_variants = loadings_ht.count()

    # Annotate matrix table with pca loadings and af from other dataset which pcs were calculated from
    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key][loading_location],
        pca_af=loadings_ht[mt.row_key][af_location])

    # Filter to rows where pca_loadings and af are defined, and af > 0 and < 1
    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af)
        & (mt.pca_af > 0) & (mt.pca_af < 1))

    # Calculate genotype normalization constant
    # Basically, mean centers and normalizes the genotypes under the binomial distribution so that they can be
    # multiplied by the PC loadings to get the projected principal components
    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt(
        n_variants * 2 * mt.pca_af * (1 - mt.pca_af))

    mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm))

    return mt.cols().select('scores')
Example #26
def merge_stats_counters_expr(
    stats: hl.expr.ArrayExpression,
) -> hl.expr.StructExpression:
    """
    Merges multiple stats counters, assuming that they were computed on non-overlapping data.

    Examples:

    - Merge stats computed on indel and snv separately
    - Merge stats computed on bi-allelic and multi-allelic variants separately
    - Merge stats computed on autosomes and sex chromosomes separately

    :param stats: An array of stats counters to merge
    :return: Merged stats Struct
    """

    def add_stats(
        i: hl.expr.StructExpression, j: hl.expr.StructExpression
    ) -> hl.expr.StructExpression:
        """
        This merges two stats counters together. It assumes that all stats counter fields are present in the struct.

        :param i: accumulator: struct with mean, n and variance
        :param j: new element: stats_struct -- needs to contain mean, n and variance
        :return: Accumulation over all elements: struct with mean, n and variance
        """
        delta = j.mean - i.mean
        n_tot = i.n + j.n
        return hl.struct(
            min=hl.min(i.min, j.min),
            max=hl.max(i.max, j.max),
            mean=(i.mean * i.n + j.mean * j.n) / n_tot,
            variance=i.variance + j.variance + (delta * delta * i.n * j.n) / n_tot,
            n=n_tot,
            sum=i.sum + j.sum,
        )

    # Gather all metrics present in all stats counters
    metrics = set(stats[0])
    dropped_metrics = set()
    for stat_expr in stats[1:]:
        stat_expr_metrics = set(stat_expr)
        dropped_metrics = dropped_metrics.union(stat_expr_metrics.difference(metrics))
        metrics = metrics.intersection(stat_expr_metrics)
    if dropped_metrics:
        logger.warning(
            f"The following metrics will be dropped during stats counter merging as they do not appear in all counters: {', '.join(dropped_metrics)}"
        )

    # Because merging standard deviation requires having the mean and n,
    # check that they are also present if `stdev` is. Otherwise remove stdev
    if "stdev" in metrics:
        missing_fields = [x for x in ["n", "mean"] if x not in metrics]
        if missing_fields:
            logger.warning(
                f'Cannot merge `stdev` from given stats counters since they are missing the following fields: {",".join(missing_fields)}'
            )
            metrics.remove("stdev")

    # Create a struct with all possible stats for merging.
    # This step helps when folding because we can rely on the struct schema
    # Note that for intermediate merging, we compute the variance rather than the stdev
    all_stats = hl.array(stats).map(
        lambda x: hl.struct(
            min=x.min if "min" in metrics else hl.null(hl.tfloat64),
            max=x.max if "max" in metrics else hl.null(hl.tfloat64),
            mean=x.mean if "mean" in metrics else hl.null(hl.tfloat64),
            variance=x.stdev * x.stdev if "stdev" in metrics else hl.null(hl.tfloat64),
            n=x.n if "n" in metrics else hl.null(hl.tfloat64),
            sum=x.sum if "sum" in metrics else hl.null(hl.tfloat64),
        )
    )

    # Merge the stats
    agg_stats = all_stats[1:].fold(add_stats, all_stats[0])

    # Return only the metrics that were present in all independent stats counters
    # If `stdev` is present, then compute it from the variance
    return agg_stats.select(
        **{
            metric: agg_stats[metric]
            if metric != "stdev"
            else hl.sqrt(agg_stats.variance)
            for metric in metrics
        }
    )
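The add_stats update is the standard pairwise merge for a sum of squared deviations M2: M2_tot = M2_i + M2_j + delta^2 * n_i * n_j / n_tot. A pure-Python check of that identity with made-up data (note the function feeds in stdev ** 2; the identity is exact when the merged quantity is M2 = n * variance):

xs, ys = [1.0, 2.0, 4.0], [3.0, 5.0]

def m2(v):
    m = sum(v) / len(v)
    return sum((x - m) ** 2 for x in v)

delta = sum(ys) / len(ys) - sum(xs) / len(xs)
n_tot = len(xs) + len(ys)
merged = m2(xs) + m2(ys) + delta ** 2 * len(xs) * len(ys) / n_tot
assert abs(merged - m2(xs + ys)) < 1e-9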
Example #27
def linreg(y, x, nested_dim=1, weight=None) -> StructExpression:
    """Compute multivariate linear regression statistics.

    Examples
    --------
    Regress HT against an intercept (1), SEX, and C1:

    >>> table1.aggregate(agg.linreg(table1.HT, [1, table1.SEX == 'F', table1.C1]))
    Struct(beta=[88.50000000000014, 81.50000000000057, -10.000000000000068],
           standard_error=[14.430869689661844, 59.70552738231206, 7.000000000000016],
           t_stat=[6.132686518775844, 1.365032746099571, -1.428571428571435],
           p_value=[0.10290201427537926, 0.40250974549499974, 0.3888002244284281],
           multiple_standard_error=4.949747468305833,
           multiple_r_squared=0.7175792507204611,
           adjusted_r_squared=0.1527377521613834,
           f_stat=1.2704081632653061,
           multiple_p_value=0.5314327326007864,
           n=4)

    Regress blood pressure against an intercept (1), genotype, age, and
    the interaction of genotype and age:

    >>> ds_ann = ds.annotate_rows(linreg = 
    ...     hl.agg.linreg(ds.pheno.blood_pressure,
    ...                   [1,
    ...                    ds.GT.n_alt_alleles(),
    ...                    ds.pheno.age,
    ...                    ds.GT.n_alt_alleles() * ds.pheno.age]))

    Warning
    -------
    As in the example, the intercept covariate ``1`` must be included
    **explicitly** if desired.

    Notes
    -----
    In relation to
    `lm.summary <https://stat.ethz.ch/R-manual/R-devel/library/stats/html/summary.lm.html>`__
    in R, ``linreg(y, x = [1, mt.x1, mt.x2])`` computes
    ``summary(lm(y ~ x1 + x2))`` and
    ``linreg(y, x = [mt.x1, mt.x2], nested_dim=0)`` computes
    ``summary(lm(y ~ x1 + x2 - 1))``.

    More generally, `nested_dim` defines the number of effects to fit in the
    nested (null) model, with the effects on the remaining covariates fixed
    to zero.

    The returned struct has ten fields:
     - `beta` (:class:`.tarray` of :py:data:`.tfloat64`):
       Estimated regression coefficient for each covariate.
     - `standard_error` (:class:`.tarray` of :py:data:`.tfloat64`):
       Estimated standard error for each covariate.
     - `t_stat` (:class:`.tarray` of :py:data:`.tfloat64`):
       t-statistic for each covariate.
     - `p_value` (:class:`.tarray` of :py:data:`.tfloat64`):
       p-value for each covariate.
     - `multiple_standard_error` (:py:data:`.tfloat64`):
       Estimated standard deviation of the random error.
     - `multiple_r_squared` (:py:data:`.tfloat64`):
       Coefficient of determination for nested models.
     - `adjusted_r_squared` (:py:data:`.tfloat64`):
       Adjusted `multiple_r_squared` taking into account degrees of
       freedom.
     - `f_stat` (:py:data:`.tfloat64`):
       F-statistic for nested models.
     - `multiple_p_value` (:py:data:`.tfloat64`):
       p-value for the
       `F-test <https://en.wikipedia.org/wiki/F-test#Regression_problems>`__ of
       nested models.
     - `n` (:py:data:`.tint64`):
       Number of samples included in the regression. A sample is included if and
       only if `y`, all elements of `x`, and `weight` (if set) are non-missing.

    All but the last field are missing if `n` is less than or equal to the
    number of covariates or if the covariates are linearly dependent.

    If set, the `weight` parameter generalizes the model to `weighted least
    squares <https://en.wikipedia.org/wiki/Weighted_least_squares>`__, useful
    for heteroscedastic (diagonal but non-constant) variance.

    Warning
    -------
    If any weight is negative, the resulting statistics will be ``nan``.

    Parameters
    ----------
    y : :class:`.Float64Expression`
        Response (dependent variable).
    x : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        Covariates (independent variables).
    nested_dim : :obj:`int`
        The null model includes the first `nested_dim` covariates.
        Must be between 0 and `k` (the length of `x`).
    weight : :class:`.Float64Expression`, optional
        Non-negative weight for weighted least squares.

    Returns
    -------
    :class:`.StructExpression`
        Struct of regression results.
    """
    x = wrap_to_list(x)
    if len(x) == 0:
        raise ValueError("linreg: must have at least one covariate in `x`")

    hl.methods.statgen._warn_if_no_intercept('linreg', x)

    if weight is None:
        return _linreg(y, x, nested_dim)
    else:
        return _linreg(hl.sqrt(weight) * y,
                       [hl.sqrt(weight) * xi for xi in x],
                       nested_dim)
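The final branch relies on the identity that weighted least squares equals ordinary least squares after scaling both the response and the covariates by sqrt(weight). A numpy check with made-up data:

import numpy as np

rng = np.random.default_rng(0)
X = np.column_stack([np.ones(20), rng.normal(size=20)])
y = rng.normal(size=20)
w = rng.uniform(0.5, 2.0, size=20)
beta_wls = np.linalg.solve(X.T @ np.diag(w) @ X, X.T @ np.diag(w) @ y)
sw = np.sqrt(w)
beta_ols = np.linalg.lstsq(X * sw[:, None], y * sw, rcond=None)[0]
assert np.allclose(beta_wls, beta_ols)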
Example #28
def _get_info_agg_expr(
    mt: hl.MatrixTable,
    sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_SUM_AGG_FIELDS,
    int32_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_INT32_SUM_AGG_FIELDS,
    median_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_MEDIAN_AGG_FIELDS,
    array_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.ArrayNumericExpression]
    ] = INFO_ARRAY_SUM_AGG_FIELDS,
    prefix: str = "",
) -> Dict[str, hl.expr.Aggregation]:
    """
    Helper function containing code to create aggregators for both site and allele-specific (AS) info expression aggregations.

    Notes:

    1. If `SB` is specified in array_sum_agg_fields, it will be aggregated as `AS_SB_TABLE`, according to GATK standard nomenclature.
    2. If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for the `MQ` calculation and then dropped according to GATK recommendation.
    3. If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation and then dropped according to GATK recommendation.
    4. If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, `median_agg_fields`) are passed as a list of str,
       then they should correspond to entry fields in `mt` or in `mt.gvcf_info`.
       Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in case of a name clash.

    :param mt: Input MT
    :param sum_agg_fields: Fields to aggregate using sum.
    :param int32_sum_agg_fields: Fields to aggregate using sum using int32.
    :param median_agg_fields: Fields to aggregate using (approximate) median.
    :param array_sum_agg_fields: Fields to aggregate using element-wise summing over an array.
    :param prefix: Optional prefix for the fields. Used for adding 'AS_' in the AS case.

    :return: Dictionary of expression names and their corresponding aggregation Expression
    """

    def _agg_list_to_dict(
        mt: hl.MatrixTable, fields: List[str]
    ) -> Dict[str, hl.expr.NumericExpression]:
        out_fields = {}
        if "gvcf_info" in mt.entry:
            out_fields = {f: mt.gvcf_info[f] for f in fields if f in mt.gvcf_info}

        out_fields.update({f: mt[f] for f in fields if f in mt.entry})

        # Check that all fields were found
        missing_fields = [f for f in fields if f not in out_fields]
        if missing_fields:
            raise ValueError(
                "Could not find the following field(s)in the MT entry schema (or nested under mt.gvcf_info: {}".format(
                    ",".join(missing_fields)
                )
            )

        return out_fields

    # Map str to expressions where needed
    if isinstance(sum_agg_fields, list):
        sum_agg_fields = _agg_list_to_dict(mt, sum_agg_fields)

    if isinstance(int32_sum_agg_fields, list):
        int32_sum_agg_fields = _agg_list_to_dict(mt, int32_sum_agg_fields)

    if isinstance(median_agg_fields, list):
        median_agg_fields = _agg_list_to_dict(mt, median_agg_fields)

    if isinstance(array_sum_agg_fields, list):
        array_sum_agg_fields = _agg_list_to_dict(mt, array_sum_agg_fields)

    # Create aggregators
    agg_expr = {}

    agg_expr.update(
        {
            f"{prefix}{k}": hl.agg.approx_quantiles(expr, 0.5)
            for k, expr in median_agg_fields.items()
        }
    )
    agg_expr.update(
        {f"{prefix}{k}": hl.agg.sum(expr) for k, expr in sum_agg_fields.items()}
    )
    agg_expr.update(
        {
            f"{prefix}{k}": hl.int32(hl.agg.sum(expr))
            for k, expr in int32_sum_agg_fields.items()
        }
    )
    agg_expr.update(
        {
            f"{prefix}{k}": hl.agg.array_agg(lambda x: hl.agg.sum(x), expr)
            for k, expr in array_sum_agg_fields.items()
        }
    )

    # Handle annotations combinations and casting for specific annotations

    # If RAW_MQandDP is in agg_expr or if both MQ_DP and RAW_MQ are, compute MQ instead
    mq_tuple = None
    if f"{prefix}RAW_MQandDP" in agg_expr:
        logger.info(
            f"Computing {prefix}MQ as sqrt({prefix}RAW_MQandDP[0]/{prefix}RAW_MQandDP[1]). "
            f"Note that {prefix}MQ will be set to 0 if {prefix}RAW_MQandDP[1] == 0."
        )
        mq_tuple = agg_expr.pop(f"{prefix}RAW_MQandDP")
    elif f"{prefix}RAW_MQ" in agg_expr and f"{prefix}MQ_DP" in agg_expr:
        logger.info(
            f"Computing {prefix}MQ as sqrt({prefix}MQ_DP/{prefix}RAW_MQ). "
            f"Note that MQ will be set to 0 if {prefix}RAW_MQ == 0."
        )
        mq_tuple = (agg_expr.pop(f"{prefix}MQ_DP"), agg_expr.pop(f"{prefix}RAW_MQ"))

    if mq_tuple is not None:
        agg_expr[f"{prefix}MQ"] = hl.cond(
            mq_tuple[1] > 0, hl.sqrt(mq_tuple[0] / mq_tuple[1]), 0
        )

    # If both VarDP and QUALapprox are present, also compute QD.
    if f"{prefix}VarDP" in agg_expr and f"{prefix}QUALapprox" in agg_expr:
        logger.info(
            f"Computing {prefix}QD as {prefix}QUALapprox/{prefix}VarDP. "
            f"Note that {prefix}QD will be set to 0 if {prefix}VarDP == 0."
        )
        var_dp = hl.int32(hl.agg.sum(int32_sum_agg_fields["VarDP"]))
        agg_expr[f"{prefix}QD"] = hl.cond(
            var_dp > 0, agg_expr[f"{prefix}QUALapprox"] / var_dp, 0
        )

    # SB needs to be cast to int32 for FS down the line
    if f"{prefix}SB" in agg_expr:
        agg_expr[f"{prefix}SB"] = agg_expr[f"{prefix}SB"].map(lambda x: hl.int32(x))

    return agg_expr
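A hedged usage sketch: the returned dictionary of aggregators is meant to be splatted into a row annotation (which fields actually exist depends on your MT's entry/gvcf_info schema):

info_agg = _get_info_agg_expr(mt)                  # site-level aggregators
mt = mt.annotate_rows(info=hl.struct(**info_agg))
# prefix="AS_" yields allele-specific names instead, e.g. AS_MQ, AS_QD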
Example #29
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1)), \
        'n_samples_exprs must have length 1 or the same length as chi_sq_exprs'
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial,
                                                        1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if n_reference_panel_variants is None:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)
    
    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                               mt.__step1_betas[1] *
                                               mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 /
                       n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0/(mt.__w_initial_floor *
                 2.0 * (mt.__step2_betas[0] +
                        mt.__step2_betas[1] *
                        mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n)/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt.__w_initial_floor *
                               2.0 * (mt.__initial_betas[0] +
                                      mt.__initial_betas[1] *
                                      mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)
    
    return ht
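
The standard errors reported by ld_score_regression come from a delete-one-block jackknife over the regression coefficients. A minimal numpy sketch of that estimator, shown here for the mean of a sequence (illustrative only; the function above applies the same pseudovalue formula to the regression betas):

import numpy as np

def block_jackknife_se(values, n_blocks=200):
    """Delete-one-block jackknife standard error of the mean of `values`."""
    blocks = np.array_split(np.asarray(values, dtype=float), n_blocks)
    full_estimate = np.mean(np.concatenate(blocks))
    # estimate with block b deleted
    delete_one = np.array([
        np.mean(np.concatenate(blocks[:b] + blocks[b + 1:]))
        for b in range(n_blocks)
    ])
    # bias-corrected pseudovalues, as in __step1_block_betas_bias_corrected
    pseudovalues = n_blocks * full_estimate - (n_blocks - 1) * delete_one
    variance = (np.sum(pseudovalues**2) -
                np.sum(pseudovalues)**2 / n_blocks) / (n_blocks - 1) / n_blocks
    return np.sqrt(variance)

rng = np.random.default_rng(1)
print(block_jackknife_se(rng.normal(size=10_000)))  # ~0.01 for N(0,1) data
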
Ejemplo n.º 31
0
def calculate_phenotypes(mt,
                         genotype,
                         beta,
                         h2,
                         popstrat=None,
                         popstrat_var=None,
                         exact_h2=False):
    r"""Calculates phenotypes by multiplying genotypes and betas.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field of genotypes.
    beta : :class:`.Expression`
        Row field of SNP effects.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability (:math:`h^2`) of simulated trait. Can only be
        ``None`` if running annotation-informed model.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term.
    exact_h2: :obj:`bool`
        Whether to exactly simulate ratio of variance of genetic component of
        phenotype to variance of phenotype to be h2. If `False`, ratio will be
        h2 in expectation. Observed h2 in the simulation will be close to
        expected h2 for large-scale simulations.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.
    """
    print('calculating phenotype')
    h2 = h2.tolist() if type(h2) is np.ndarray else (
        [h2] if type(h2) is not list else h2)
    assert popstrat_var is None or (popstrat_var >=
                                    0), 'popstrat_var must be non-negative'
    uid = Env.get_uid(base=100)
    mt = annotate_all(
        mt=mt,
        row_exprs={'beta_' + uid: beta},
        col_exprs={} if popstrat is None else {'popstrat_' + uid: popstrat},
        entry_exprs={
            'gt_' + uid:
            genotype.n_alt_alleles()
            if genotype.dtype is hl.dtype('call') else genotype
        })
    mt = mt.filter_rows(hl.agg.stats(mt['gt_' + uid]).stdev > 0)
    mt = normalize_genotypes(mt['gt_' + uid])
    if mt['beta_' + uid].dtype == hl.dtype('array<float64>'):  # if >1 traits
        if exact_h2:
            raise ValueError(
                'exact_h2=True not supported for multitrait simulations')
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
                lambda beta: hl.agg.sum(beta * mt['norm_gt']),
                mt['beta_' + uid]))
            mt = mt.annotate_cols(
                y=mt.y_no_noise +
                hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        if exact_h2 and min([h2[0], 1 - h2[0]]) != 0:
            print('exact h2')
            mt = mt.annotate_cols(**{
                'y_no_noise_' + uid:
                hl.agg.sum(mt['beta_' + uid] * mt['norm_gt'])
            })
            y_no_noise_stdev = mt.aggregate_cols(
                hl.agg.stats(mt['y_no_noise_' + uid]).stdev)
            mt = mt.annotate_cols(
                y_no_noise=hl.sqrt(h2[0]) * mt['y_no_noise_' + uid] /
                y_no_noise_stdev
            )  # normalize genetic component of phenotype to have variance of exactly h2
            mt = mt.annotate_cols(
                **{'noise_' + uid: hl.rand_norm(0, hl.sqrt(1 - h2[0]))})
            noise_stdev = mt.aggregate_cols(
                hl.agg.stats(mt['noise_' + uid]).stdev)
            # normalize noise component to have variance of exactly 1 - h2
            mt = mt.annotate_cols(noise=hl.sqrt(1 - h2[0]) *
                                  mt['noise_' + uid] / noise_stdev)
            mt = mt.annotate_cols(y=mt.y_no_noise + mt.noise)
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + uid] *
                                                        mt['norm_gt']))
            mt = mt.annotate_cols(y=mt.y_no_noise +
                                  hl.rand_norm(0, hl.sqrt(1 - h2[0])))
    if popstrat is not None:
        var_factor = 1 if popstrat_var is None else (popstrat_var**(
            1 / 2)) / mt.aggregate_cols(hl.agg.stats(mt['popstrat_' +
                                                        uid])).stdev
        mt = mt.rename({'y': 'y_no_popstrat'})
        mt = mt.annotate_cols(y=mt.y_no_popstrat +
                              mt['popstrat_' + uid] * var_factor)
    mt = _clean_fields(mt, uid)
    return mt
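
A hypothetical invocation, assuming a MatrixTable `mt` with genotype calls in an entry field `GT` and simulated effects in a row field `beta` (both names illustrative, e.g. as produced by make_betas below):

sim = calculate_phenotypes(mt, genotype=mt.GT, beta=mt.beta, h2=0.5)
sim.cols().select('y').show()  # simulated phenotype per sample
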
Ejemplo n.º 32
0
def make_betas(mt, h2, pi=None, annot=None, rg=None):
    r"""Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`, optional
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Genetic correlation between traits.

    Returns
    -------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according to specified model.
    pi : :obj:`list`
        Probability of a SNP being causal for different traits, possibly altered
        from input `pi` if covariance matrix for multitrait simulation was not
        positive semi-definite.
    rg : :obj:`list`
        Genetic correlation between traits, possibly altered from input `rg` if
        covariance matrix for multitrait simulation was not positive semi-definite.

    """
    h2 = h2.tolist() if type(h2) is np.ndarray else (
        [h2] if type(h2) is not list else h2)
    pi = pi.tolist() if type(pi) is np.ndarray else (
        [pi] if type(pi) is not list else pi)
    rg = rg.tolist() if type(rg) is np.ndarray else (
        [rg] if type(rg) is not list else rg)
    assert (all(x >= 0 and x <= 1
                for x in h2)), 'h2 values must be between 0 and 1'
    assert (pi == [None]) or all(
        x >= 0 and x <= 1
        for x in pi), 'pi values for spike & slab must be between 0 and 1'
    assert (rg == [None]
            or all(x >= -1 and x <= 1
                   for x in rg)), 'rg values must be between -1 and 1 or None'
    if annot is not None:  # multi-trait annotation-informed
        assert rg == [
            None
        ], 'Correlated traits not supported for annotation-informed model'
        annot_sum = mt.aggregate_rows(hl.agg.sum(annot))
        # scale each SNP's variance by its annotation so variances sum to h2
        mt = mt.annotate_rows(beta=hl.literal(h2).map(
            lambda x: hl.rand_norm(0, hl.sqrt(annot * x / annot_sum))))
    elif len(h2) > 1 and (pi == [None] or pi == [1]):
        # multi-trait correlated infinitesimal
        mt, rg = multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1 and len(rg) == 1:
        # two-trait correlated spike & slab
        print('multitrait ss')
        mt, pi, rg = multitrait_ss(mt=mt,
                                   h2=h2,
                                   rg=0 if rg == [None] else rg[0],
                                   pi=pi)
    elif len(h2) == 1 and len(pi) == 1:
        # single-trait infinitesimal / spike & slab
        M = mt.count_rows()
        pi_temp = 1 if pi == [None] else pi[0]
        mt = mt.annotate_rows(beta=hl.rand_bool(pi_temp) *
                              hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi_temp))))
    else:
        raise ValueError('Parameters passed do not match any models.')
    return mt, pi, rg
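
For example, a single-trait spike & slab simulation in which 1% of SNPs are causal and the trait has h2 = 0.3 could look like this (illustrative call; `mt` is assumed to be a MatrixTable of genotypes):

mt_beta, pi_out, rg_out = make_betas(mt, h2=0.3, pi=0.01)
# each causal SNP draws its effect from N(0, h2 / (M * pi)),
# so the total genetic variance is h2 in expectation
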
Ejemplo n.º 33
0
def default_generate_gene_lof_summary(
    mt: hl.MatrixTable,
    collapse_indels: bool = False,
    tx: bool = False,
    lof_csq_set: Set[str] = LOF_CSQ_SET,
    meta_root: str = "meta",
    pop_field: str = "pop",
    filter_loftee: bool = False,
) -> hl.Table:
    """
    Generate summary counts for loss-of-function (LoF), missense, and synonymous variants.

    Also calculates p, the proportion of haplotypes carrying a putative LoF (pLoF) variant,
    and the observed/expected (OE) ratio of samples with homozygous pLoF variant calls.

    Summary counts are (all per gene):
        - Number of samples with no pLoF variants.
        - Number of samples with heterozygous pLoF variants.
        - Number of samples with homozygous pLoF variants.
        - Total number of sites with genotype calls.
        - All of the above stats grouped by population.

    Assumes MT was created using `default_generate_gene_lof_matrix`.

    .. note::
        Assumes LoF variants in MT were filtered (LOFTEE pass and no LoF flag only).
        If LoF variants have not been filtered and `filter_loftee` is True,
        expects MT has the row annotation `vep`.

    :param mt: Input MatrixTable.
    :param collapse_indels: Whether to collapse indels. Default is False.
    :param tx: Whether input MT has transcript expression data. Default is False.
    :param lof_csq_set: Set containing LoF transcript consequence strings. Default is LOF_CSQ_SET.
    :param meta_root: String indicating top level name for sample metadata. Default is 'meta'.
    :param pop_field: String indicating field with sample population assignment information. Default is 'pop'.
    :param filter_loftee: Whether to filter to LOFTEE pass variants (with no LoF flags) only. Default is False.
    :return: Table with het/hom summary counts.
    """
    if collapse_indels:
        grouping = ["gene_id", "gene", "most_severe_consequence"]
        if tx:
            grouping.append("expressed")
        else:
            grouping.extend(["transcript_id", "canonical"])
        mt = (mt.group_rows_by(*grouping).aggregate_rows(
            n_sites=hl.agg.sum(mt.n_sites),
            n_sites_array=hl.agg.array_sum(mt.n_sites_array),
            classic_caf=hl.agg.sum(mt.classic_caf),
            max_af=hl.agg.max(mt.max_af),
            classic_caf_array=hl.agg.array_sum(mt.classic_caf_array),
        ).aggregate_entries(
            num_homs=hl.agg.sum(mt.num_homs),
            num_hets=hl.agg.sum(mt.num_hets),
            defined_sites=hl.agg.sum(mt.defined_sites),
        ).result())

    if filter_loftee:
        lof_ht = get_most_severe_consequence_for_summary(mt.rows())
        mt = mt.filter_rows(
            hl.is_defined(lof_ht[mt.row_key].lof)
            & (lof_ht[mt.row_key].lof == "HC")
            & (lof_ht[mt.row_key].no_lof_flags))

    ht = mt.annotate_rows(
        lof=hl.struct(
            **get_het_hom_summary_dict(
                csq_set=lof_csq_set,
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ), ),
        missense=hl.struct(
            **get_het_hom_summary_dict(
                csq_set={"missense_variant"},
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ), ),
        synonymous=hl.struct(
            **get_het_hom_summary_dict(
                csq_set={"synonymous_variant"},
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ), ),
    ).rows()
    ht = ht.annotate(
        p=(1 - hl.sqrt(hl.float64(ht.lof.no_alt_calls) / ht.lof.defined)),
        pop_p=hl.dict(
            hl.array(ht.lof.pop_defined).map(lambda x: (
                x[0],
                1 - hl.sqrt(
                    hl.float64(ht.lof.pop_no_alt_calls.get(x[0])) / x[1]),
            ))),
    )
    ht = ht.annotate(exp_hom_lof=ht.lof.defined * ht.p * ht.p)
    return ht.annotate(oe=ht.lof.obs_hom / ht.exp_hom_lof)
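
The `p` computed above follows Hardy-Weinberg logic: if a fraction p of haplotypes carry a pLoF allele, a sample carries no pLoF alternate calls with probability (1 - p)^2, so p = 1 - sqrt(no_alt_calls / defined) and the expected homozygote count is defined * p^2. A toy check with made-up counts:

import math

defined = 10_000      # samples with defined genotypes at pLoF sites
no_alt_calls = 9_801  # samples with no pLoF alternate calls
obs_hom = 2           # observed homozygous pLoF carriers

p = 1 - math.sqrt(no_alt_calls / defined)  # 1 - sqrt(0.9801) = 0.01
exp_hom_lof = defined * p * p              # 10_000 * 1e-4 = 1.0
oe = obs_hom / exp_hom_lof                 # 2.0
print(p, exp_hom_lof, oe)
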
Ejemplo n.º 34
0
def main(args):
    ########################################################################
    ### initialize
    phenos = [
        'height', 'bmi', 'sbp', 'dbp', 'wbc', 'monocyte', 'neutrophil',
        'eosinophil', 'basophil', 'lymphocyte', 'rbc', 'mch', 'mcv', 'mchc',
        'hb', 'ht', 'plt'
    ]
    phenos.sort()
    phenotype = 'ALL17'

    if args.clump_basename is None:
        clumps = args.dirname + args.basename + '_ALL17.clumped'
        prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '_' + args.basename + '-pt-sumstats-locus-allele-keyed.kt'
        contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype + '_' + args.basename
    else:
        clumps = args.dirname + args.clump_basename + '_ALL17.clumped'
        prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '_' + args.basename_out + '-pt-sumstats-locus-allele-keyed.kt'
        contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype + '_' + args.basename_out

        # clumps = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '.clumped'
        # ss_filename = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '.tsv.gz'
        # out_base = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '_gwas_PRS'

    clump_table_location = clumps.replace('.clumped', '.kt')

    contigs = {'0{}'.format(x): str(x) for x in range(1, 10)}

    bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'

    start = time.time()
    # large block size because we read very little data (due to filtering & ignoring genotypes)
    hl.init(branching_factor=10, min_block_size=2000)
    # set min_block_size only in import_bgen

    ################################################################################
    ### set up the sumstats table (chr, bp for union SNPs)
    # if (args.generate_prs_loci_table):
    #     t = hl.import_table(sumstats_text_file,
    #                         delimiter='\s+',
    #                         impute=True)
    #     t = t.select(locus = hl.locus(hl.str(t.CHR), t.BP))
    #     t = t.key_by('locus')
    #     t.write(prs_loci_table_location, overwrite=True)
    #
    # ss = hl.read_table(prs_loci_table_location)

    if args.read_clumps:
        clump_file = hl.import_table(clumps, delimiter='\s+', impute=True)
        clump_file = clump_file.select(
            locus=hl.locus(hl.str(clump_file.CHR), clump_file.BP))
        clump_file = clump_file.key_by('locus')
        clump_file.write(clump_table_location, overwrite=True)

    clump_file = hl.read_table(clump_table_location)

    # ################################################################################
    # ### determine the indices of the prs variants in bgen
    # if (args.generate_contig_row_dict):
    #     mt = hl.methods.import_bgen(bgen_files,
    #                                 [],
    #                                 contig_recoding=contigs,
    #                                 _row_fields=['file_row_idx'])
    #     prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    #     print('about to collect')
    #     # remove all unnecessary data, dropping keys and other irrelevant fields
    #     prs_rows = prs_rows.key_by()
    #     prs_rows = prs_rows.select(contig=prs_rows.locus.contig,
    #                                file_row_idx=prs_rows.file_row_idx)
    #     contig_row_list = prs_rows.collect()
    #     print('finished collecting')
    #     contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    #     print('reformed')
    #     from collections import defaultdict
    #     contig_row_dict = defaultdict(list)
    #     for k, v in contig_reformed:
    #         contig_row_dict[k].append(v)
    #     print('dictionary created')
    #
    #     with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
    #         pickle.dump(contig_row_dict, f)
    # else:
    #     with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
    #         contig_row_dict = pickle.load(f)

    ################################################################################
    ### Get true phenotypes from UKBB
    if args.pheno_table:
        # phenotypes = hl.import_table('gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
        #                                key='userId', quote='"', impute=True, types={'userId': hl.tstr}, missing='')
        phenotypes = hl.import_table(
            'gs://armartin/disparities/ukbb/UKB_phenos_ALL17.txt.bgz',
            key='eid',
            impute=True,
            types={'eid': hl.tstr})

        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            key='s',
            impute=True,
            types={'s': hl.tstr})

        samples = covariates.annotate(**phenotypes[covariates.s])

        # Write pheno/covar/sample info table
        for pheno in phenos:
            #sampleids = hl.import_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')
            gwas_holdout = hl.import_table(
                'gs://armartin/mama/ukb31063.gwas_samples.gwas_vs_holdout.txt',
                delimiter='\s+').key_by('s')

            samples = samples.annotate(**{
                pheno + '_holdout':
                gwas_holdout[samples.s].in_gwas == 'FALSE'
            })

        samples.write(args.dirname + args.basename + '_holdout_gwas_phenos.ht',
                      True)

    if args.ss_tables:
        # Write ss info
        for pheno in phenos:
            print(pheno)
            # change sumstats to bgz
            #ss = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.txt.gz',
            ss = hl.import_table(args.dirname + pheno + '_' + args.basename +
                                 '.*.bgz',
                                 delimiter='\s+',
                                 impute=True,
                                 types={
                                     'MAMA_BETA': hl.tfloat,
                                     'MAMA_PVAL': hl.tfloat,
                                     'BP': hl.tint
                                 })
            #, 'N': hl.tint})
            ss = ss.key_by(
                locus=hl.locus(hl.str(ss.CHR), hl.int(ss.BP))).repartition(200)

            ss.write(args.dirname + pheno + '_' + args.basename + '.ht', True)

    ################################################################################
    ### Run the PRS using phenotype-specific clump variants
    if args.write_bgen:
        mt_all = hl.import_bgen(
            bgen_files, ['dosage'],
            sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
            variants=clump_file.locus)
        # contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
        # mt_all = hl.methods.import_bgen(bgen_files,
        #                             ['dosage'],
        #                             sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
        #                             contig_recoding=contigs,
        #                             _variants_per_file=contig_row_dict2,
        #                             _row_fields=[])

        #samples.write(args.dirname + args.basename + '_holdout_gwas_phenos.ht', True)
        samples = hl.read_table(args.dirname + args.basename +
                                '_holdout_gwas_phenos.ht')
        mt_all = mt_all.annotate_cols(**samples[
            mt_all.s])  # ok that phenos keyed on userId not s?

        #
        if args.clump_basename is None:
            mt_all.repartition(5000, shuffle=False).write(
                args.dirname + args.basename + '_ALL17.mt', True)
        else:
            mt_all.repartition(5000, shuffle=False).write(
                args.dirname + args.basename_out + '_ALL17.mt', True)

    mt_all = hl.read_matrix_table(args.dirname + args.basename + '_ALL17.mt')

    for pheno in phenos:  #[6:len(phenos)]:
        print(pheno)
        ss = hl.read_table(args.dirname + pheno + '_' + args.basename + '.ht')
        """
        To add:
        - Filter only to samples in holdout GWAS
        - Filter to rows in phenotype-specific clump file
        - Build PRS for 10 p-value thresholds
        - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
        """
        # filter to only samples held out from GWAS
        mt = mt_all.filter_cols(mt_all[pheno + '_holdout'])

        mt = mt.annotate_rows(ss=ss[mt.locus])
        mt = annotate_beta(mt, mt.ss)

        p_max = {
            's1': 5e-8,
            's2': 1e-6,
            's3': 1e-4,
            's4': 1e-3,
            's5': 1e-2,
            's6': .05,
            's7': .1,
            's8': .2,
            's9': .5,
            's10': 1
        }

        if args.clump_basename is None:
            pheno_clump = specific_clumps(args.dirname + pheno + '_' +
                                          args.basename + '.clumped')
        else:
            pheno_clump = specific_clumps(args.dirname + pheno + '_' +
                                          args.clump_basename + '.clumped')

        mt = mt.filter_rows(pheno_clump.get(mt.locus, False))
        print(mt.count())

        # divide by sqrt(2pq), the sd of allele counts at frequency p, to get
        # standardized betas back to the allelic scale (MAMA betas only, not METAL)
        if args.betas_are_standardized:
            annot_expr = {
                k: hl.agg.sum(mt.beta / hl.sqrt(2 * hl.float(mt.ss.FRQ) *
                                                (1 - hl.float(mt.ss.FRQ))) *
                              mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
                for k, v in p_max.items()
            }
        else:
            annot_expr = {
                k:
                hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
                for k, v in p_max.items()
            }

        mt = mt.annotate_cols(**annot_expr)

        if args.clump_basename is None:
            mt.cols().write(args.dirname + 'UKB_' + pheno + '_' +
                            args.basename + '_PRS.ht',
                            stage_locally=True,
                            overwrite=True)
            ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_' +
                               args.basename + '_PRS.ht')
        else:
            mt.cols().write(args.dirname + 'UKB_' + pheno + '_' +
                            args.basename_out + '_PRS.ht',
                            stage_locally=True,
                            overwrite=True)
            ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_' +
                               args.basename_out + '_PRS.ht')
        ht_out = ht.drop(*[x for x in list(ht.row) if 'holdout' in x],
                         *[x for x in phenos if pheno not in x])

        if args.clump_basename is None:
            output_location = args.dirname + 'UKB_' + pheno + '_' + args.basename + '_PRS.txt.bgz'
        else:
            output_location = args.dirname + 'UKB_' + pheno + '_' + args.basename_out + '_PRS.txt.bgz'
        ht_out.export(output_location)
    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))
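
Per sample and per threshold, the PRS aggregation above reduces to a dot product over the clumped variants whose p-value clears the threshold. A toy numpy sketch of that reduction (made-up data, not the UKB pipeline):

import numpy as np

rng = np.random.default_rng(2)
m, n = 100, 5                        # variants x samples
beta = rng.normal(size=m)            # clumped effect sizes
dosage = rng.uniform(0, 2, size=(m, n))
pval = rng.uniform(size=m)

p_max = {'s1': 5e-8, 's5': 1e-2, 's10': 1.0}
prs = {k: (beta * (pval < v)) @ dosage for k, v in p_max.items()}
print({k: np.round(v, 3) for k, v in prs.items()})
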
Ejemplo n.º 35
0
def run_meta_split(i):
    print('####################')
    print('Starting split ' + str(i))
    print('####################')
    starttime = datetime.datetime.now()
    pi = ['A'] * int(n_chunks / 2) + ['B'] * int(n_chunks / 2)
    seed_id = int(batch +
                  str(i).zfill(4))  #create a seed_id unique to every split
    randstate = np.random.RandomState(seed_id)  #seed with seed_id

    randstate.shuffle(pi)
    gmt_shuf = gmt.annotate_cols(label=hl.literal(pi)[hl.int32(gmt.col_idx)])

    mt = gmt_shuf.group_cols_by(gmt_shuf.label).aggregate(
        unnorm_meta_beta=hl.agg.sum(gmt_shuf.beta /
                                    gmt_shuf.standard_error**2),
        inv_se2=hl.agg.sum(1 / gmt_shuf.standard_error**2)).key_rows_by('SNP')

    ht = mt.make_table()

    ht = ht.annotate(A_Z=ht['A.unnorm_meta_beta'] / hl.sqrt(ht['A.inv_se2']),
                     B_Z=ht['B.unnorm_meta_beta'] / hl.sqrt(ht['B.inv_se2']))

    ht = ht.drop('A.unnorm_meta_beta', 'B.unnorm_meta_beta', 'A.inv_se2',
                 'B.inv_se2')

    variants = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
                               types={'N': hl.tint64})
    variants = variants.key_by('SNP')
    #    mt_all = hl.read_matrix_table('gs://nbaya/split/ukb31063.hm3_variants.gwas_samples_'+phen+'_grouped'+str(n_chunks)+'_batch_'+batch+'.mt') #matrix table containing individual samples. OUTDATED
    ht_all = hl.read_table(
        'gs://nbaya/split/ukb31063.hm3_variants.gwas_samples_' + phen +
        '_grouped' + str(n_chunks) + '_batch_' + batch +
        '.ht')  #hail table containing individual samples
    variants = variants.annotate(N=hl.int32(ht_all.count() / 2))
    variants.show()

    metaA = variants.annotate(Z=ht[variants.SNP].A_Z)
    metaB = variants.annotate(Z=ht[variants.SNP].B_Z)

    #    metaA_path = 'gs://nbaya/rg_sex/'+phen+'_meta_A_n'+str(n_chunks)+'_batch_'+batch+'_s'+str(i)+'.tsv.bgz'
    #    metaB_path = 'gs://nbaya/rg_sex/'+phen+'_meta_B_n'+str(n_chunks)+'_batch_'+batch+'_s'+str(i)+'.tsv.bgz'
    metaA_path = 'gs://nbaya/rg_sex/' + variant_set + '_' + phen + '_meta_A_n' + str(
        n_chunks) + '_batch_' + batch + '_s' + str(
            i
        ) + '.tsv.bgz'  #only used by qc_pos variant set and later hm3 phens
    metaB_path = 'gs://nbaya/rg_sex/' + variant_set + '_' + phen + '_meta_B_n' + str(
        n_chunks) + '_batch_' + batch + '_s' + str(
            i
        ) + '.tsv.bgz'  #only used by qc_pos variant set and later hm3 phens
    metaA.export(metaA_path)
    metaB.export(metaB_path)

    endtime = datetime.datetime.now()
    elapsed = endtime - starttime
    print('####################')
    print('Completed iteration ' + str(i))
    print('Files written to:')
    print(metaA_path + '\t' + metaB_path)
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('Iteration time: ' + str(round(elapsed.seconds / 60, 2)) +
          ' minutes')
    print('####################')
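
The A/B halves are combined by fixed-effects inverse-variance meta-analysis: the meta-analyzed Z-score computed above is sum(beta / se^2) / sqrt(sum(1 / se^2)). A toy check with made-up numbers:

import numpy as np

beta = np.array([0.10, 0.12, 0.08])  # per-chunk effect estimates
se = np.array([0.05, 0.06, 0.04])    # per-chunk standard errors

unnorm_meta_beta = np.sum(beta / se**2)
inv_se2 = np.sum(1 / se**2)

meta_beta = unnorm_meta_beta / inv_se2   # inverse-variance-weighted effect
meta_se = np.sqrt(1 / inv_se2)           # its standard error
z = unnorm_meta_beta / np.sqrt(inv_se2)  # matches A_Z / B_Z above
assert np.isclose(z, meta_beta / meta_se)
print(meta_beta, meta_se, z)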