Exemple #1
0
def test_pcrelate_issue_5263():
    """Regression test for issue 5263.

    ``pc_relate`` run on a renamed copy of the genotypes (``GT2``) must match
    the result on the original ``GT`` field, even though the dataset also
    carries an unrelated, random entry field that is now called ``GT``.
    """
    base = hl.balding_nichols_model(3, 50, 100)
    expected = hl.pc_relate(base.GT, 0.10, k=2, statistics='all')

    # Move the real calls to GT2 and put random calls under the old name.
    random_call = hl.call(hl.rand_bool(0.5), hl.rand_bool(0.5))
    renamed = base.select_entries(GT2=base.GT, GT=random_call)

    actual = hl.pc_relate(renamed.GT2, 0.10, k=2, statistics='all')
    assert expected._same(actual, tolerance=1e-4)
Exemple #2
0
 def test_plot_roc_curve(self):
     """Smoke-test ``hl.experimental.plot_roc_curve`` on three random scores.

     Builds a 100-row table with random scores and random ``tp``/``fp``
     labels; the only check is that the call runs and returns a 2-tuple.
     """
     scores = hl.utils.range_table(100)
     scores = scores.annotate(score1=hl.rand_norm(), score2=hl.rand_norm())

     # A row can only be a true positive when score1 is positive; score3 is
     # score1 plus fresh noise.
     is_tp = hl.cond(scores.score1 > 0, hl.rand_bool(0.7), False)
     noisy_score1 = scores.score1 + hl.rand_norm()
     scores = scores.annotate(tp=is_tp, score3=noisy_score1)

     # False positives are drawn only among rows that are not true positives.
     is_fp = hl.cond(~scores.tp, hl.rand_bool(0.2), False)
     ht = scores.annotate(fp=is_fp)

     score_names = ['score1', 'score2', 'score3']
     _, aucs = hl.experimental.plot_roc_curve(ht, score_names)
def kyle_sex_specific_qc(mt_path):
    """Compute sex-aware call rate, AC and AN per row and force evaluation.

    Each column is given a random sex, per-row het / hom-var / called counts
    are aggregated separately for males and females, and the final metrics
    use different formulas for Y non-PAR, X non-PAR and all other loci.

    :param mt_path: Path of the MatrixTable to read.
    """
    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_cols(sex=hl.cond(hl.rand_bool(0.5), 'Male', 'Female'))

    sex_counts = (hl.agg.count_where(mt.sex == 'Male'),
                  hl.agg.count_where(mt.sex == 'Female'))
    num_males, num_females = mt.aggregate_cols(sex_counts)

    # Per-row genotype counts, split by sex.
    is_male = mt.sex == 'Male'
    is_female = mt.sex == 'Female'
    is_het = mt.GT.is_het()
    is_hom_var = mt.GT.is_hom_var()
    is_called = hl.is_defined(mt.GT)
    mt = mt.annotate_rows(
        male_hets=hl.agg.count_where(is_het & is_male),
        male_homvars=hl.agg.count_where(is_hom_var & is_male),
        male_calls=hl.agg.count_where(is_called & is_male),
        female_hets=hl.agg.count_where(is_het & is_female),
        female_homvars=hl.agg.count_where(is_hom_var & is_female),
        female_calls=hl.agg.count_where(is_called & is_female),
    )

    on_y = mt.locus.in_y_nonpar()
    on_x = mt.locus.in_x_nonpar()

    # Y non-PAR: males only. X non-PAR: males count once, females twice.
    call_rate = (
        hl.case()
        .when(on_y, (mt.male_calls / num_males))
        .when(on_x, (mt.male_calls + 2 * mt.female_calls) / (num_males + 2 * num_females))
        .default((mt.male_calls + mt.female_calls) / (num_males + num_females))
    )
    allele_count = (
        hl.case()
        .when(on_y, mt.male_homvars)
        .when(on_x, mt.male_homvars + mt.female_hets + 2 * mt.female_homvars)
        .default(mt.male_hets + 2 * mt.male_homvars + mt.female_hets + 2 * mt.female_homvars)
    )
    allele_number = (
        hl.case()
        .when(on_y, mt.male_calls)
        .when(on_x, mt.male_calls + 2 * mt.female_calls)
        .default(2 * mt.male_calls + 2 * mt.female_calls)
    )
    mt = mt.annotate_rows(call_rate=call_rate, AC=allele_count, AN=allele_number)

    mt.rows()._force_count()
Exemple #4
0
def make_betas(mt,
               h2=None,
               pi=1,
               is_annot_inf=False,
               annot_coef_dict=None,
               annot_regex=None,
               h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed

    :param mt: MatrixTable whose rows receive the simulated ``__beta`` field.
    :param h2: Heritability used to scale the variance of the betas.
    :param pi: Probability that a row gets a non-zero beta (spike & slab);
        ``pi == 1`` is reported as the infinitesimal model.
    :param is_annot_inf: If True, use the annotation-informed model.
    :param annot_coef_dict: Forwarded to ``agg_fields`` as ``coef_dict``.
    :param annot_regex: Forwarded to ``agg_fields`` as ``regex``.
    :param h2_normalize: Annotation-informed model only. If True the variance
        is scaled by ``h2 / sum(__agg_annot)``, otherwise it is left unscaled.
    :return: MatrixTable with a ``__beta`` row field.
    '''
    check_beta_args(h2=h2,
                    pi=pi,
                    is_annot_inf=is_annot_inf,
                    annot_coef_dict=annot_coef_dict,
                    annot_regex=annot_regex,
                    h2_normalize=h2_normalize)
    M = mt.count_rows()
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
            'h2-normalized' if h2_normalize else '', '(default coef: 1)'
            if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt, coef_dict=annot_coef_dict, regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        # With h2_normalize the variances are rescaled by h2 / annot_sum;
        # otherwise the aggregated annotation is used as the variance as-is.
        variance_scale = h2 / annot_sum if h2_normalize else 1
        return mt1.annotate_rows(
            __beta=hl.rand_norm(0, hl.sqrt(mt1.__agg_annot * variance_scale)))
    else:
        # Compare by value: `pi is 1` tests object identity, which is False
        # for 1.0 and raises a SyntaxWarning on Python 3.8+.
        print('Simulating betas using {} model w/ h2 = {}'.format(
            ('infinitesimal' if pi == 1 else 'spike & slab'), h2))
        mt1 = mt.annotate_globals(__h2=none_to_null(h2), __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) *
                                 hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
Exemple #5
0
def make_betas(mt, h2, pi=1, annot=None):
    """Annotate rows of ``mt`` with a simulated ``__beta`` field.

    When ``annot`` is given, the row field ``mt.__annot`` is standardised
    (mean / stdev over all rows) and used to scale the standard deviation of
    each beta; ``annot`` itself is only used as a switch. Otherwise each beta
    is a Bernoulli(``pi``) draw times a normal draw with variance
    ``h2 / (M * pi)``, where ``M`` is the number of rows.
    """
    M = mt.count_rows()

    if annot is None:
        is_causal = hl.rand_bool(pi)
        effect = hl.rand_norm(0, hl.sqrt(h2 / (M * pi)))
        return mt.annotate_rows(__beta=is_causal * effect)

    annot_stats = mt.aggregate_rows(hl.agg.stats(mt.__annot), _localize=True)
    standardised = (mt.__annot - annot_stats.mean) / annot_stats.stdev
    return mt.annotate_rows(__beta=hl.rand_norm(0, standardised * hl.sqrt(h2 / M)))
Exemple #6
0
def run_mendel_errors() -> hl.Table:
    """Run ``hl.mendel_errors`` on chr20 for the real trios plus 100 fake ones.

    The fake pedigree is built from a ~1% random sample of samples that have
    no QC-metric filters and no hard filters (a missing ``hard_filters`` is
    treated as failing). Only the first table returned by
    ``hl.mendel_errors`` is returned.
    """
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    passes_filters = (hl.len(meta_ht.qc_metrics_filters) == 0) & hl.or_else(
        hl.len(meta_ht.hard_filters) == 0, False)
    sampled_ids = meta_ht.aggregate(
        hl.agg.filter(hl.rand_bool(0.01) & passes_filters,
                      hl.agg.collect_as_set(meta_ht.s)))
    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(sampled_ids),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    # Every sample that appears in any trio, as child or parent.
    trio_members = {
        member
        for trio in merged_ped.trios
        for member in (trio.s, trio.pat_id, trio.mat_id)
    }
    ped_samples = hl.literal(trio_members)

    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))

    chr20 = hl.parse_locus_interval("chr20", reference_genome='GRCh38')
    mt = hl.filter_intervals(mt, [chr20])

    # Split multi-allelics on the sparse representation, then densify.
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
Exemple #7
0
def test_king_filtered_entries_no_error():
    """Check that ``hl.king`` runs when a random half of the entries is filtered."""
    prefix = resource('balding-nichols-1024-variants-4-samples-3-populations')
    plink_files = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    mt = hl.import_plink(**plink_files)

    keep_entry = hl.rand_bool(0.5)
    mt = mt.filter_entries(keep_entry)

    # No assertion: the test passes if forcing the result does not raise.
    hl.king(mt.GT)._force_count_rows()
Exemple #8
0
    def test_aggregate(self):
        """Exercise row, column and entry aggregations on the fixture dataset."""
        ds = self.get_vds()

        ds = ds.annotate_globals(foo=5)
        ds = ds.annotate_rows(x1=agg.count())
        ds = ds.annotate_cols(y1=agg.count())
        ds = ds.annotate_entries(z1=ds.DP)

        n_rows = ds.aggregate_rows(agg.count())
        n_cols = ds.aggregate_cols(agg.count())
        n_entries = ds.aggregate_entries(agg.count())

        first_sample = ds.aggregate_entries(hl.agg.take(ds.s, 1)[0])
        self.assertIsNotNone(first_sample)

        self.assertEqual(n_rows, 346)
        self.assertEqual(n_cols, 100)
        self.assertEqual(n_entries, n_rows * n_cols)

        # The struct-valued aggregations below are only run, not checked.
        row_query = hl.Struct(x=agg.collect(ds.locus.contig),
                              y=agg.collect(ds.x1))
        qvs = ds.aggregate_rows(row_query)

        col_query = hl.Struct(x=agg.collect(ds.s), y=agg.collect(ds.y1))
        qss = ds.aggregate_cols(col_query)

        entry_query = hl.Struct(
            x=agg.collect(agg.filter(False, ds.y1)),
            y=agg.collect(agg.filter(hl.rand_bool(0.1), ds.GT)))
        qgs = ds.aggregate_entries(entry_query)
Exemple #9
0
def _spectral_moments(A,
                      num_moments,
                      p=None,
                      moment_samples=500,
                      block_size=128):
    """Estimate spectral moments of ``A`` from a random-sign Krylov factorization.

    ``A`` is converted to a ``TallSkinnyMatrix`` first if it is not one
    already. ``p`` defaults to ``min(num_moments // 2, 10)``.

    Returns a ``(moments, stdevs)`` pair taken from the evaluated result of
    ``fact.spectral_moments``.
    """
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    n_cols = A.ncols
    p = min(num_moments // 2, 10) if p is None else p

    # TODO: When moment_samples > n, we should just do a TSQR on A, and compute
    # the spectrum of R.
    assert moment_samples < n_cols, '_spectral_moments: moment_samples must be smaller than num cols of A'

    # Start from an (n_cols x moment_samples) matrix of random -1/+1 entries.
    random_signs = hl.nd.zeros((n_cols, moment_samples)).map(
        lambda _: hl.if_else(hl.rand_bool(0.5), -1, 1))
    Q1, R1 = hl.nd.qr(random_signs)._persist()

    fact = _krylov_factorization(A, Q1, p, compute_U=False)
    result = hl.eval(fact.spectral_moments(num_moments, R1))
    return result.moments, result.stdevs
Exemple #10
0
    def test_aggregate(self):
        """Exercise row, column and entry aggregations on the fixture dataset."""
        dataset = self.get_vds()

        dataset = dataset.annotate_globals(foo=5)
        dataset = dataset.annotate_rows(x1=agg.count())
        dataset = dataset.annotate_cols(y1=agg.count())
        dataset = dataset.annotate_entries(z1=dataset.DP)

        row_count = dataset.aggregate_rows(agg.count())
        col_count = dataset.aggregate_cols(agg.count())
        entry_count = dataset.aggregate_entries(agg.count())

        one_sample = dataset.aggregate_entries(hl.agg.take(dataset.s, 1)[0])
        self.assertIsNotNone(one_sample)

        self.assertEqual(row_count, 346)
        self.assertEqual(col_count, 100)
        self.assertEqual(entry_count, row_count * col_count)

        # The struct-valued aggregations below are only run, not checked.
        per_row = hl.Struct(x=agg.collect(dataset.locus.contig),
                            y=agg.collect(dataset.x1))
        qvs = dataset.aggregate_rows(per_row)

        per_col = hl.Struct(x=agg.collect(dataset.s),
                            y=agg.collect(dataset.y1))
        qss = dataset.aggregate_cols(per_col)

        per_entry = hl.Struct(
            x=agg.filter(False, agg.collect(dataset.y1)),
            y=agg.filter(hl.rand_bool(0.1), agg.collect(dataset.GT)))
        qgs = dataset.aggregate_entries(per_entry)
Exemple #11
0
    def test_to_matrix_table(self):
        """Round-trip a matrix table through ``entries().to_matrix_table``.

        The rebuilt table is compared with the original after its columns are
        put back into the original ``col_idx`` order.
        """
        n_rows, n_cols = 50, 50
        mt = hl.utils.range_matrix_table(n_rows, n_cols)
        mt = mt.key_cols_by(s='Col' + hl.str(n_cols - mt.col_idx))
        mt = mt.annotate_cols(c1=hl.rand_bool(0.5))
        mt = mt.annotate_rows(r1=hl.rand_bool(0.5))
        mt = mt.annotate_entries(e1=hl.rand_bool(0.5))

        entries = mt.entries()
        re_mt = entries.to_matrix_table(['row_idx'], ['s'], ['r1'],
                                        ['col_idx', 'c1'])

        # For each original column index (ascending), find its position in
        # the rebuilt table.
        new_col_order = re_mt.col_idx.collect()
        by_old_index = sorted(
            (old, new) for new, old in enumerate(new_col_order))
        mapping = [new for _, new in by_old_index]

        restored = re_mt.choose_cols(mapping).drop('col_idx')
        assert restored._same(mt.drop('col_idx'))
Exemple #12
0
    def test(self):
        """Annotate a one-row table with a broad set of Hail expression functions.

        The first annotated row is taken and passed through
        ``convert_struct_to_dict`` (presumably yielding a plain dict; helper
        not visible here). No assertions on ``result`` are visible in this
        excerpt.
        """
        # Row schema: four ints, a string, an int array, an array of structs,
        # two structs and a boolean.
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        # A single row matching the schema above.
        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        # One annotation per function under test; the keyword is the name of
        # the resulting field. Several are random draws (pcoin, rnorm, rpois,
        # runif), so their values differ between runs.
        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Exemple #13
0
def generate_random_gen():
    """Export a 30 x 10 random dataset in GEN format under the name ``'random'``.

    Rows are biallelic A/G variants at positions 1..30 of contig '20';
    columns are keyed by the stringified column index.
    """
    mt = hl.utils.range_matrix_table(30, 10)

    mt = mt.annotate_rows(locus=hl.locus('20', mt.row_idx + 1),
                          alleles=['A', 'G'])
    mt = mt.key_rows_by('locus', 'alleles')

    mt = mt.annotate_cols(s=hl.str(mt.col_idx))
    mt = mt.key_cols_by('s')

    # Genotype probabilities are built from integers out of 255 instead of
    # arbitrary floats: fully random values can be rounded differently, so
    # identical GEN values end up with different GT calls between
    # import_gen and import_bgen.
    first = hl.int32(hl.rand_unif(0.0, 255.0))
    mt = mt.annotate_entries(a=first)
    second = hl.int32(hl.rand_unif(0.0, 255.0 - mt.a))
    mt = mt.annotate_entries(b=second)
    remainder = 255.0 - mt.a - mt.b
    mt = mt.transmute_entries(GP=hl.array([mt.a, mt.b, remainder]) / 255.0)

    # Keep each entry with probability 0.8, i.e. about 20% missing.
    keep_entry = hl.rand_bool(0.8)
    mt = mt.filter_entries(keep_entry)

    hl.export_gen(mt, 'random', precision=4)
Exemple #14
0
def make_betas(mt, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None, annot_regex=None, h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed

    :param mt: MatrixTable whose rows receive the simulated ``__beta`` field.
    :param h2: Heritability used to scale the variance of the betas.
    :param pi: Probability that a row gets a non-zero beta (spike & slab);
        ``pi == 1`` is reported as the infinitesimal model.
    :param is_annot_inf: If True, use the annotation-informed model.
    :param annot_coef_dict: Forwarded to ``agg_fields`` as ``coef_dict``.
    :param annot_regex: Forwarded to ``agg_fields`` as ``regex``.
    :param h2_normalize: Annotation-informed model only. If True the variance
        is scaled by ``h2 / sum(__agg_annot)``, otherwise it is left unscaled.
    :return: MatrixTable with a ``__beta`` row field.
    '''
    check_beta_args(h2=h2,pi=pi,is_annot_inf=is_annot_inf,annot_coef_dict=annot_coef_dict,
                    annot_regex=annot_regex,h2_normalize=h2_normalize)
    M = mt.count_rows()
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
                'h2-normalized' if h2_normalize else '',
                '(default coef: 1)' if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt,coef_dict=annot_coef_dict,regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        # With h2_normalize the variances are rescaled by h2 / annot_sum;
        # otherwise the aggregated annotation is used as the variance as-is.
        variance_scale = h2/annot_sum if h2_normalize else 1
        return mt1.annotate_rows(__beta = hl.rand_norm(0, hl.sqrt(mt1.__agg_annot*variance_scale)))
    else:
        # Compare by value: `pi is 1` tests object identity, which is False
        # for 1.0 and raises a SyntaxWarning on Python 3.8+.
        print('Simulating betas using {} model w/ h2 = {}'.format(('infinitesimal' if pi == 1 else 'spike & slab'),h2))
        mt1 = mt.annotate_globals(__h2 = none_to_null(h2), __pi = pi)
        return mt1.annotate_rows(__beta = hl.rand_bool(pi)*hl.rand_norm(0,hl.sqrt(h2/(M*pi))))
Exemple #15
0
def generate_random_gen():
    """Export a 30 x 10 random dataset in GEN format under the name ``'random'``.

    Rows are biallelic A/G variants at positions 1..30 of contig '20';
    columns are keyed by the stringified column index.
    """
    mt = hl.utils.range_matrix_table(30, 10)

    row_fields = {'locus': hl.locus('20', mt.row_idx + 1), 'alleles': ['A', 'G']}
    mt = mt.annotate_rows(**row_fields).key_rows_by('locus', 'alleles')
    mt = mt.annotate_cols(s=hl.str(mt.col_idx)).key_cols_by('s')

    # Genotype probabilities are built from integers out of 255 instead of
    # arbitrary floats: fully random values can be rounded differently, so
    # identical GEN values end up with different GT calls between
    # import_gen and import_bgen.
    total = 255.0
    mt = mt.annotate_entries(a=hl.int32(hl.rand_unif(0.0, total)))
    mt = mt.annotate_entries(b=hl.int32(hl.rand_unif(0.0, total - mt.a)))
    probabilities = hl.array([mt.a, mt.b, total - mt.a - mt.b]) / total
    mt = mt.transmute_entries(GP=probabilities)

    # Keep each entry with probability 0.8, i.e. about 20% missing.
    mt = mt.filter_entries(hl.rand_bool(0.8))

    hl.export_gen(mt, 'random', precision=4)
Exemple #16
0
def make_betas(mt, h2, pi=1, annot=None):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed

    :param mt: MatrixTable whose rows receive the simulated ``__beta`` field.
    :param h2: Heritability; also stored as the global ``__h2``.
    :param pi: Probability that a row gets a non-zero beta; ``pi == 1`` is
        reported as the infinitesimal model. Ignored when ``annot`` is given.
    :param annot: Row expression of annotation values. When given, each
        beta has variance ``annot / sum(annot) * h2``.
    :return: MatrixTable with a ``__beta`` row field.
    '''
    M = mt.count_rows()
    if annot is not None:
        print('\rSimulating annotation-informed betas w/ h2 = {}'.format(h2))
        mt1 = mt._annotate_all(row_exprs={'__annot': annot},
                               global_exprs={'__h2': h2})
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__annot))
        return mt1.annotate_rows(
            __beta=hl.rand_norm(0, hl.sqrt(mt1.__annot / annot_sum * h2)))
    else:
        # Compare by value: `pi is 1` tests object identity, which is False
        # for 1.0 and raises a SyntaxWarning on Python 3.8+.
        print('Simulating betas using {} model w/ h2 = {}'.format(
            ('infinitesimal' if pi == 1 else 'spike & slab'), h2))
        mt1 = mt.annotate_globals(__h2=h2, __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) *
                                 hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
Exemple #17
0
def logistic_regression_rows_wald_nd(mt_path):
    """Run ``hl._logistic_regression_rows_nd`` (Wald test) and force evaluation.

    Uses the first 2000 rows of the MatrixTable at ``mt_path`` with 5 seeded
    random boolean phenotypes and 2 seeded random uniform covariates.
    """
    num_phenos = 5
    num_covs = 2

    mt = hl.read_matrix_table(mt_path).head(2000)

    # Seeds are the loop index, so the generated columns are reproducible.
    pheno_dict = {}
    for i in range(num_phenos):
        pheno_dict[f"pheno_{i}"] = hl.rand_bool(.5, seed=i)
    cov_dict = {}
    for i in range(num_covs):
        cov_dict[f"cov_{i}"] = hl.rand_unif(0, 1, seed=i)

    mt = mt.annotate_cols(**pheno_dict)
    mt = mt.annotate_cols(**cov_dict)

    phenotypes = [mt[name] for name in pheno_dict]
    covariates = [mt[name] for name in cov_dict]
    res = hl._logistic_regression_rows_nd(test='wald',
                                          y=phenotypes,
                                          x=mt.x,
                                          covariates=covariates)
    res._force_count()
Exemple #18
0
 def make_random_function(self, mt):  #pi is slab prob
     """Return a spike & slab random expression for one variant's effect.

     The expression is a Bernoulli(``self.pi``) draw times a normal draw with
     mean 0 and variance ``self.h2 / (M * self.pi)``, where ``M`` is the
     number of rows of ``mt``.
     """
     M = mt.count_rows()  # number of variants
     # hl.rand_norm takes a standard deviation, not a variance, so the
     # per-variant variance h2 / (M * pi) has to go through hl.sqrt.
     per_variant_sd = hl.sqrt(self.h2 / (M * self.pi))
     return hl.rand_bool(self.pi) * hl.rand_norm(0, per_variant_sd)
Exemple #19
0
def sample_training_examples(
    ht: hl.Table,
    tp_expr: hl.BooleanExpression,
    fp_expr: hl.BooleanExpression,
    fp_to_tp: float = 1.0,
    test_expr: Optional[hl.expr.BooleanExpression] = None,
) -> hl.Table:
    """
    Returns a Table of all positive and negative training examples in `ht` with an annotation indicating those that
    should be used for training given a true positive (TP) to false positive (FP) ratio.

    The returned Table has the following annotations:
        - train: indicates if the variant should be used for training. A row is given False for the annotation if True
          for `test_expr`, True for both `tp_expr and fp_expr`, or it is pruned out to obtain the desired `fp_to_tp` ratio.
        - label: indicates if a variant is a 'TP' or 'FP' and will also be labeled as such for variants defined by `test_expr`.
          Rows that are True for both `tp_expr` and `fp_expr` get a missing label.

    .. note::

        - This function does not support multi-allelic variants.
        - The function will give some stats about the TPs/FPs provided (Ti, Tv, indels).
        - Pruning to the desired ratio is random (`hl.rand_bool`), so the realized ratio is approximate.

    :param ht: Input Table.
    :param tp_expr: Expression for TP examples.
    :param fp_expr: Expression for FP examples.
    :param fp_to_tp: FP to TP ratio. If set to <= 0, all training examples are used.
    :param test_expr: Optional expression to exclude a set of variants from training set. Still contains TP/FP label annotation.
    :return: Table subset with corresponding TP and FP examples with desired FP to TP ratio.
    """

    # Missing TP/FP values are treated as False; rows that are neither TP nor FP are dropped.
    ht = ht.select(
        _tp=hl.or_else(tp_expr, False),
        _fp=hl.or_else(fp_expr, False),
        _exclude=False if test_expr is None else test_expr,
    )
    ht = ht.filter(ht._tp | ht._fp).persist()

    # Get stats about TP / FP sets
    def _get_train_counts(ht: hl.Table) -> Tuple[int, int]:
        """
        Determine the number of TP and FP variants in the input Table and report some stats on Ti, Tv, indels.

        :param ht: Input Table
        :return: Counts of TP and FP variants in the table
        """
        train_stats = hl.struct(n=hl.agg.count())

        # Ti / Tv / indel counts are only added when the table has a string-array `alleles` field.
        if "alleles" in ht.row and ht.row.alleles.dtype == hl.tarray(hl.tstr):
            train_stats = train_stats.annotate(
                ti=hl.agg.count_where(
                    hl.expr.is_transition(ht.alleles[0], ht.alleles[1])
                ),
                tv=hl.agg.count_where(
                    hl.expr.is_transversion(ht.alleles[0], ht.alleles[1])
                ),
                indel=hl.agg.count_where(
                    hl.expr.is_indel(ht.alleles[0], ht.alleles[1])
                ),
            )

        # Aggregate the stats per (contig, tp, fp) combination and bring them back as a pandas DataFrame
        pd_stats = (
            ht.group_by(**{"contig": ht.locus.contig, "tp": ht._tp, "fp": ht._fp})
            .aggregate(**train_stats)
            .to_pandas()
        )

        logger.info(pformat(pd_stats))
        pd_stats = pd_stats.fillna(False)

        # Number of true positive and false positive variants to be sampled for the training set
        # (rows flagged as both TP and FP are counted in neither)
        n_tp = pd_stats[pd_stats["tp"] & ~pd_stats["fp"]]["n"].sum()
        n_fp = pd_stats[~pd_stats["tp"] & pd_stats["fp"]]["n"].sum()

        return n_tp, n_fp

    # Counts are computed without the rows excluded by `test_expr`.
    n_tp, n_fp = _get_train_counts(ht.filter(~ht._exclude))

    prob_tp = prob_fp = 1.0
    if fp_to_tp > 0:
        # Downsample whichever class is over-represented relative to the requested ratio.
        desired_fp = fp_to_tp * n_tp
        if desired_fp < n_fp:
            prob_fp = desired_fp / n_fp
        else:
            # NOTE(review): if n_tp and n_fp are both 0 this divides 0 by 0 — confirm callers never reach this.
            prob_tp = n_fp / desired_fp

        logger.info(
            f"Training examples sampling: tp={prob_tp}*{n_tp}, fp={prob_fp}*{n_fp}"
        )

        # Rows that are both TP and FP match neither branch and fall through to False.
        train_expr = (
            hl.case(missing_false=True)
            .when(ht._fp & hl.or_else(~ht._tp, True), hl.rand_bool(prob_fp))
            .when(ht._tp & hl.or_else(~ht._fp, True), hl.rand_bool(prob_tp))
            .default(False)
        )
    else:
        # No ratio requested: train on everything except rows flagged as both TP and FP.
        train_expr = ~(ht._tp & ht._fp)
        logger.info(f"Using all {n_tp} TP and {n_fp} FP training examples.")

    # Rows that are both TP and FP match neither branch and get a missing label.
    label_expr = (
        hl.case(missing_false=True)
        .when(ht._tp & hl.or_else(~ht._fp, True), "TP")
        .when(ht._fp & hl.or_else(~ht._tp, True), "FP")
        .default(hl.null(hl.tstr))
    )

    # Excluded (test) rows keep their label but are never used for training.
    return ht.select(train=train_expr & ~ht._exclude, label=label_expr)
Exemple #20
0
import hail as hl
# Fixed global seed so the exported dataset is reproducible.
hl.set_global_seed(0)

# 3 populations, 1024 variants, 4 samples.
mt = hl.balding_nichols_model(n_populations=3, n_variants=1024, n_samples=4)

# Sample IDs are 's' followed by the sample index.
sample_id = 's' + hl.str(mt.sample_idx)
mt = mt.key_cols_by(s=sample_id)

# Keep each genotype with probability 0.99; the rest become missing.
keep_call = hl.rand_bool(0.99)
mt = mt.annotate_entries(GT=hl.or_missing(keep_call, mt.GT))

# Family IDs are 'f' followed by the sample ID.
output_prefix = 'balding-nichols-1024-variants-4-samples-3-populations'
hl.export_plink(mt, output_prefix, fam_id='f' + mt.s)
Exemple #21
0
def make_betas(mt, h2, pi=1, annot=None, rg=None):
    """Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list`
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list` or None
        Genetic correlation between traits. Must be None for the
        annotation-informed model. For two-trait spike & slab, None is
        treated as 0 and only the first value of a list is used.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according to specified model.

    Raises
    ------
    AssertionError
        If any `h2`, `pi` or `rg` value is outside [0, 1], or if `rg` is
        given together with `annot`.
    ValueError
        If the combination of `h2` and `pi` lengths matches no model.
    """
    # Normalise scalars to one-element lists so every branch can index them.
    h2 = h2 if isinstance(h2, list) else [h2]
    pi = pi if isinstance(pi, list) else [pi]
    rg = rg if isinstance(rg, list) else [rg]
    assert all(0 <= x <= 1 for x in h2), 'h2 values must be between 0 and 1'
    assert all(
        0 <= x <= 1
        for x in pi), 'pi values for spike & slab must be between 0 and 1'
    assert (rg == [None]
            or all(0 <= x <= 1
                   for x in rg)), 'rg values must be between 0 and 1 or None'
    if annot is not None:  #multi-trait annotation-informed
        assert rg == [
            None
        ], 'Correlated traits not supported for annotation-informed model'
        M = mt.count_rows()
        annot_var = mt.aggregate_rows(hl.agg.stats(annot)).stdev**2
        # One beta per trait; each has variance annot * h2 / (annot_var * M).
        mt = mt.annotate_rows(
            beta=hl.literal(h2).map(lambda x: hl.rand_norm(
                0, hl.sqrt(annot * x / (annot_var * M)))))
        return mt
    elif len(h2) > 1 and pi == [1]:  #multi-trait correlated infinitesimal
        return multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1:  #two trait correlated spike & slab
        # Compare by value: `rg is [None]` is an identity test against a new
        # list and is always False, so the default of 0 was never applied.
        return multitrait_ss(mt=mt,
                             h2=h2,
                             rg=0 if rg == [None] else rg[0],
                             pi=pi)
    elif len(h2) == 1 and len(
            pi) == 1:  #single trait infinitesimal/spike & slab
        M = mt.count_rows()
        return mt.annotate_rows(beta=hl.rand_bool(pi[0]) *
                                hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi[0]))))
    else:
        raise ValueError('Insufficient parameters')
Exemple #22
0
def test_suite(mt, genotype, popstrat):
    '''Testing suite for simulation framework

    Runs ``simulate`` under seven model configurations, prints each result's
    description and compares the requested heritabilities with the variance
    of the ``__y_no_noise`` column field of each simulation.
    '''
    mt = mt._annotate_all(
        row_exprs={'a1': hl.rand_norm(), 'a2': hl.rand_bool(0.1)},
        col_exprs={'popstrat': popstrat},
        entry_exprs={'gt': genotype},
    )
    mt = mt.annotate_rows(annot=mt.a1 + mt.a2)

    n_sim = 7  #number of simulations
    sim_h2_ls = np.round(np.random.uniform(low=0, high=1, size=n_sim), 4)

    # Extra keyword arguments for each simulation, in run order.
    scenarios = [
        # Infinitesimal
        {'h2': sim_h2_ls[0]},
        # Spike & slab
        {'h2': sim_h2_ls[1], 'pi': 0.1},
        # Annotation-informed
        #has same h2 as previous spike and slab to check if sims match
        {'h2': sim_h2_ls[1], 'annot': mt.annot},
        # Infinitesimal + population stratification, popstrat_s2 = 0.5
        {'h2': sim_h2_ls[3], 'popstrat': mt.popstrat, 'popstrat_s2': 0.5},
        # Infinitesimal + population stratification, popstrat_s2 = 0.25
        {'h2': sim_h2_ls[3], 'popstrat': mt.popstrat, 'popstrat_s2': 0.25},
        # Spike & slab + population stratification
        {'h2': sim_h2_ls[5], 'pi': 0.1, 'popstrat': mt.popstrat},
        # Annotation-informed + population stratification
        {'h2': sim_h2_ls[6], 'annot': mt.annot, 'popstrat': mt.popstrat},
    ]
    sim_mt_ls = [
        simulate(mt=mt, genotype=mt.gt, **extra) for extra in scenarios
    ]

    for sim_mt in sim_mt_ls:
        print(sim_mt.describe())

    # Observed h2 is the variance of the noise-free phenotype.
    obs_h2_ls = [
        np.round(
            sim_mt.aggregate_cols(
                hl.agg.stats(sim_mt['__y_no_noise']).stdev**2), 4)
        for sim_mt in sim_mt_ls
    ]
    print('\nExpected h2s: {} \nObserved h2s: {}'.format(sim_h2_ls, obs_h2_ls))
Exemple #23
0
#! /usr/bin/python

import sys
import hail as hl

# Command line: <n_samples> <n_variants> <output path prefix>
n_samples, n_variants = int(sys.argv[1]), int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)

# Key columns by the stringified sample index.
sample_id = hl.str(mt.sample_idx)
mt = mt.key_cols_by(s=sample_id)

# Overwrite the simulated genotypes with a coin flip between genotype
# index 0 and genotype index 2.
gt_index = hl.rand_bool(0.5) * 2
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(gt_index))

hl.export_vcf(mt, f"{path}.vcf")
hl.export_plink(mt, path)
Exemple #24
0
import os, shutil
import hail as hl

# Create the output directory if needed. exist_ok avoids the race between a
# separate existence check and the mkdir call.
os.makedirs("output/", exist_ok=True)

# Remove these dataset directories if they already exist.
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
for f in files:
    if os.path.isdir(f):
        shutil.rmtree(f)

# Import the sample VCF and keep a random ~3% of its rows.
ds = hl.import_vcf('data/sample.vcf.bgz').sample_rows(0.03)

# Placeholder row annotations; only use_as_marker is random.
row_annotations = {
    'use_as_marker': hl.rand_bool(0.5),
    'panel_maf': 0.1,
    'anno1': 5,
    'anno2': 0,
    'consequence': "LOF",
    'gene': "A",
    'score': 5.0,
}
ds = ds.annotate_rows(**row_annotations)
ds = ds.annotate_rows(a_index=1)

# Variant QC first, then sample QC on its result.
ds = hl.variant_qc(ds)
ds = hl.sample_qc(ds)
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
Exemple #25
0
#! /usr/bin/python

import sys
import hail as hl

# Command line: <n_samples> <n_variants> <output path prefix>
n_samples, n_variants = int(sys.argv[1]), int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))

# Overwrite the simulated genotypes with a coin flip between genotype
# index 0 and genotype index 2.
gt_index = hl.rand_bool(0.5) * 2
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(gt_index))

hl.export_vcf(mt, f"{path}.vcf")
hl.export_plink(mt, path)

# Build a one-sample "chimera": sample "0" for positions below the midpoint,
# sample "1" (relabelled as "0") for positions at or above it.
midpoint = n_variants / 2

chimera0 = mt.filter_rows(mt.locus.position < midpoint)
chimera0 = chimera0.filter_cols(chimera0.s == "0")

chimera1 = mt.filter_rows(mt.locus.position >= midpoint)
chimera1 = chimera1.filter_cols(chimera1.s == "1")
chimera1 = chimera1.key_cols_by(s="0")

mt2 = chimera0.union_rows(chimera1)
hl.export_vcf(mt2, f"{path}-chimera.vcf")
hl.export_plink(mt2, f"{path}-chimera")
Exemple #26
0
def generate_datasets(doctest_namespace, output_dir):
    """Populate ``doctest_namespace`` with the Hail objects used by doctests.

    Imports the example files under ``data/``, derives annotated datasets,
    tables and literal expressions from them, and stores each one under a
    fixed key in ``doctest_namespace``. Two datasets are checkpointed under
    ``output_dir.name``. The function has no ``return`` statement; its result
    is the mutated namespace.

    :param doctest_namespace: Mutable mapping; entries are assigned by key.
    :param output_dir: Object whose ``name`` attribute is used as the
        directory prefix for checkpoints (presumably a
        ``tempfile.TemporaryDirectory`` -- TODO confirm against the caller).
    """
    doctest_namespace['hl'] = hl

    # Remove dataset directories left in the current working directory.
    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    # Main example dataset: downsampled rows of the sample VCF, annotated
    # with constant placeholder values and random fields.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # Replaces the string-valued `gene` row field set above with an array.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)
    # Several doctest names refer to the same checkpointed dataset.
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row-level metadata table derived from the dataset's rows.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    # Column-level metadata table; also reused as the keep/remove filters.
    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    # Columns B, D and E get explicit struct types; the rest are imputed.
    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    # Whitespace-delimited file (regex delimiter), keyed by 'Name'.
    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    # Dataset whose only selected row field is AF taken from variant_qc.
    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Dataset built from example_burden.vcf, joined with a per-sample table
    # (keyed by 'Sample') and a locus-interval table, then checkpointed.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")
Exemple #27
0
def make_betas(mt, h2, pi=None, annot=None, rg=None):
    r"""Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`, optional
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Genetic correlation between traits.

    Returns
    -------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according to specified model.
    pi : :obj:`list`
        Probability of a SNP being causal for different traits, possibly altered
        from input `pi` if covariance matrix for multitrait simulation was not
        positive semi-definite.
    rg : :obj:`list`
        Genetic correlation between traits, possibly altered from input `rg` if
        covariance matrix for multitrait simulation was not positive semi-definite.

    Raises
    ------
    AssertionError
        If any `h2` or `pi` value lies outside [0, 1], any `rg` value lies
        outside [-1, 1], or `rg` is given together with `annot`.
    ValueError
        If the combination of parameters does not match any supported model.

    """

    def _as_list(value):
        # ndarray -> plain list; list is kept; anything else (scalar or None)
        # is wrapped so that "not given" is uniformly represented as [None].
        if isinstance(value, np.ndarray):
            return value.tolist()
        return value if isinstance(value, list) else [value]

    h2, pi, rg = _as_list(h2), _as_list(pi), _as_list(rg)

    assert all(0 <= x <= 1
               for x in h2), 'h2 values must be between 0 and 1'
    # Compare by value: `pi is not [None]` is always True (a fresh list is
    # never identical to `pi`), which silently disabled this range check.
    assert pi == [None] or all(
        0 <= x <= 1
        for x in pi), 'pi values for spike & slab must be between 0 and 1'
    assert (rg == [None]
            or all(-1 <= x <= 1
                   for x in rg)), 'rg values must be between -1 and 1 or None'

    if annot is not None:  # multi-trait annotation-informed
        assert rg == [
            None
        ], 'Correlated traits not supported for annotation-informed model'
        # M must be computed here; it was previously only assigned in the
        # single-trait branch, so this branch failed with UnboundLocalError.
        M = mt.count_rows()
        annot_sum = mt.aggregate_rows(hl.agg.sum(annot))
        mt = mt.annotate_rows(beta=hl.literal(h2).map(
            lambda x: hl.rand_norm(0, hl.sqrt(annot * x / (annot_sum * M)))))
    elif len(h2) > 1 and (pi == [None] or pi
                          == [1]):  # multi-trait correlated infinitesimal
        mt, rg = multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1 and len(
            rg) == 1:  # two trait correlated spike & slab
        print('multitrait ss')
        # `rg == [None]` (not `is`): an unspecified rg defaults to 0 instead
        # of passing None through.
        mt, pi, rg = multitrait_ss(mt=mt,
                                   h2=h2,
                                   rg=0 if rg == [None] else rg[0],
                                   pi=pi)
    elif len(h2) == 1 and len(
            pi) == 1:  # single trait infinitesimal/spike & slab
        M = mt.count_rows()
        # pi unspecified -> every SNP is causal (infinitesimal model).
        pi_temp = 1 if pi == [None] else pi[0]
        mt = mt.annotate_rows(beta=hl.rand_bool(pi_temp) *
                              hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi_temp))))
    else:
        raise ValueError('Parameters passed do not match any models.')
    return mt, pi, rg
Exemple #28
0
def run_rf_test(
        mt: hl.MatrixTable,
        output: str = "/tmp"
) -> Tuple[pyspark.ml.PipelineModel, hl.MatrixTable]:
    """
    Runs a dummy test RF on a given MT.

    1. Creates row annotations and labels to run model on
    2. Trains a RF pipeline model (including median imputation of missing values in created annotations)
    3. Saves the RF pipeline model
    4. Applies the model to the MT and prints features importance

    The model is written to ``output + "/rf.model"`` (overwriting any existing
    model) and read back from there before being applied.

    NOTE(review): the model is applied to ``mt.rows()``, so the second element
    of the result is whatever ``apply_rf_model`` returns for that rows table;
    confirm this matches the ``hl.MatrixTable`` return annotation.

    :param mt: Input MT
    :param output: Output files prefix to save the RF model
    :return: RF model and MatrixTable after applying RF model
    """

    # Three synthetic features; feature3 is missing for roughly half the rows
    # so that the imputation step has something to fill in.
    mt = mt.annotate_rows(
        feature1=hl.rand_bool(0.1),
        feature2=hl.rand_norm(0.0, 1.0),
        feature3=hl.or_missing(hl.rand_bool(0.5), hl.rand_norm(0.0, 1.0)),
    )

    mt = mt.annotate_rows(label=hl.cond(mt["feature1"]
                                        & (mt["feature2"] > 0), "TP", "FP"))
    ht = mt.rows()

    def f3stats(ht):
        # Number of defined feature3 values and their median.
        return ht.aggregate(
            hl.struct(
                n=hl.agg.count_where(hl.is_defined(ht["feature3"])),
                med=hl.median(hl.agg.collect(ht["feature3"])),
            ))

    f3_before_imputation = f3stats(ht)
    logger.info("Feature3 defined values before imputation: {}".format(
        f3_before_imputation.n))
    logger.info("Feature3 median: {}".format(f3_before_imputation.med))

    features_to_impute = ["feature3"]
    quantiles = get_columns_quantiles(ht, features_to_impute, [0.5])
    # Only one quantile (0.5) was requested per feature; unwrap it.
    quantiles = {k: v[0] for k, v in quantiles.items()}

    # Join the per-feature lines: formatting the bare generator would log its
    # repr ("<generator object ...>") instead of the medians.
    logger.info("Features median:\n{}".format(
        "".join(f"{k}: {v}\n" for k, v in quantiles.items())))
    ht = ht.annotate(
        **{f: hl.or_else(ht[f], quantiles[f])
           for f in features_to_impute})
    ht = ht.annotate_globals(medians=quantiles)

    f3_after_imputation = f3stats(ht)
    logger.info("Feature3 defined values after imputation: {}".format(
        f3_after_imputation.n))
    logger.info("Feature3 median: {}".format(f3_after_imputation.med))

    ht = ht.select("label", "feature1", "feature2", "feature3")

    label = "label"
    features = ["feature1", "feature2", "feature3"]

    model_path = output + "/rf.model"
    rf_model = train_rf(ht, features, label)
    save_model(rf_model, out_path=model_path, overwrite=True)
    # Reload from disk so the save/load round trip is exercised as well.
    rf_model = load_model(model_path)

    return rf_model, apply_rf_model(ht, rf_model, features, label)
                     reference_genome='GRCh37')

# Append the PLINK data for chromosomes 2..22 (range(2, 23)) to `mt`.
# `mt` itself is created above this snippet.
for chrom in range(2, 23):
    mtT = hl.import_plink(bed='gs://.../ukb_imp_chr%s_v3.bed' % chrom,
                          bim='gs://.../ukb_imp_chr%s_v3.bim' % chrom,
                          fam='gs://.../ukb_imp_chr%s_v3.fam' % chrom,
                          reference_genome='GRCh37')
    mt = mt.union_rows(mtT)

# Keep only unrelated (~361k samples)
tb2 = hl.import_table('gs://.../unrelated_samples.txt', impute=True)
# Key the sample list by the stringified `s` field before the semi-join.
tb2 = tb2.annotate(s_str=hl.str(tb2.s)).key_by('s_str')
mt = mt.semi_join_cols(tb2).add_col_index()
# Re-key columns by their positional index (`col_idx` from add_col_index).
mt = mt.annotate_cols(s_index=mt.col_idx).key_cols_by('s_index')

# Extract 350k samples + basic QC
# Fixed seed so the same column positions are drawn on every run.
random.seed(123)
indices = random.sample(range(mt.count_cols()), 350000)
mt = mt.choose_cols(list(indices))
mt = hl.variant_qc(mt)
# Keep rows whose AF[1] lies within [0.05, 0.95].
mt = mt.filter_rows((mt.variant_qc.AF[1] >= 0.05)
                    & (mt.variant_qc.AF[1] <= 0.95))

# Randomly assign sex (1 F, 0 M)
mt = mt.annotate_cols(sex=hl.cond(hl.rand_bool(0.5, seed=123), 1, 0))

# Simulate phenotypes and save MatrixTable
# NOTE(review): `h2`, `rg` and `out` are not defined in the visible part of
# this script; presumably they are set above -- confirm.
sim = hl.experimental.ldscsim.simulate_phenotypes(mt, mt.GT, h2=h2, rg=rg)
print(sim.describe())
sim.write(out, overwrite=True)
Exemple #30
0
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the Hail objects used by doctests.

    Imports the example files under ``data/``, derives annotated datasets,
    tables and literal expressions from them, and stores each one under a
    fixed key in ``doctest_namespace``. Three datasets are checkpointed under
    ``output/``. The function has no ``return`` statement; its result is the
    mutated namespace.

    :param doctest_namespace: Mutable mapping; entries are assigned by key.
    """
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    # Main example dataset: downsampled rows of the sample VCF, annotated
    # with constant placeholder values and random fields.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # Replaces the string-valued `gene` row field set above with an array.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)

    # Several doctest names refer to the same checkpointed dataset.
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row-level metadata table derived from the dataset's rows.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    # Column-level metadata table; also reused as the keep/remove filters.
    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Small simulated dataset from balding_nichols_model(3, 4, 4).
    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    # Columns B, D and E get explicit struct types; the rest are imputed.
    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    # Whitespace-delimited file (regex delimiter), keyed by 'Name'.
    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    # Dataset whose only selected row field is AF taken from variant_qc.
    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Dataset built from example_burden.vcf, joined with a per-sample table
    # (keyed by 'Sample') and a locus-interval table, then checkpointed.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    # Summary-statistics table for one phenotype, with explicit column types
    # and keyed by (locus, alleles).
    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    # Summary statistics for all phenotypes, imported as a matrix of strings
    # and then parsed into typed keys and entry fields.
    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    # Each entry string is "a,b": element 0 becomes chi_squared (float64),
    # element 1 becomes n (int32).
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Exemple #31
0
          "info_DP").write("gs://gnomad-qingbowang/gen_chr20_all_siteDP.ht",
                           overwrite=True)
print(et.count())  #sample size
#agg = et.group_by('agrees_PID').aggregate(n=hl.agg.stats(et.info_DP))
#agg.write("gs://gnomad-qingbowang/phased_has_higher_cov_ex_chr20_all.ht")
#agg.write("gs://gnomad-qingbowang/phased_has_higher_cov_gen_chr20_all_siteDP.ht")

exomes = get_gnomad_data("exomes",
                         release_samples=True,
                         adj=True,
                         release_annotations=True)  # original note: "actually this is the genome data, though" -- NOTE(review): the call passes "exomes"; confirm which dataset is intended
#ex20 = hl.filter_intervals(exomes.select_rows("allele_info").select_cols(), [hl.parse_locus_interval("20:start-2M")]) #first 2Mb
#ex20 = hl.filter_intervals(exomes.select_rows("info_DP").select_cols(), [hl.parse_locus_interval("20")])
# Keep only the info_DP row field and no column fields.
ex20 = exomes.select_rows("info_DP").select_cols()
ex20 = ex20.filter_cols(
    hl.rand_bool(0.01))  #~=1500 samples for small test, exome
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=1500 samples for small test, genome
# Let's run this over all samples.
ex20 = hl.filter_alleles(ex20, lambda allele, i: hl.is_snp(
    ex20.alleles[0], allele))  # currently take only SNP
ex20 = ex20.filter_entries(
    ex20.GT.is_het())  # throw away unwanted entries (non alt)
ex20_pair = hl.window_by_locus(ex20, 2)  #just look at nearby pairs for now
# Keep entries with a defined GT and at least one previous entry in the window ...
ex20_pair = ex20_pair.filter_entries(
    (hl.is_defined(ex20_pair.GT) & (ex20_pair.prev_entries.length() > 0)))
# ... and at least one previous entry that is itself heterozygous.
ex20_pair = ex20_pair.filter_entries(
    ex20_pair.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)
et = ex20_pair.entries()
# One output row per previous row in the window: enumerate positions, then explode.
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
et = et.explode('indices')
et = et.transmute(prev_row=et.prev_rows[et.indices],
Exemple #32
0
import os, shutil
import hail as hl

# Make sure the output directory exists before anything is written to it.
if not os.path.isdir("output/"):
    os.mkdir("output/")

# Remove dataset directories left in the current working directory, if present.
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
for f in files:
    if os.path.isdir(f):
        shutil.rmtree(f)

# Load the example VCF and downsample its rows via sample_rows(0.03).
ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)
# Row annotations: one random boolean flag plus constant placeholder values.
ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                      panel_maf=0.1,
                      anno1=5,
                      anno2=0,
                      consequence="LOF",
                      gene="A",
                      score=5.0)
ds = ds.annotate_rows(a_index=1)
# variant_qc is applied first; sample_qc then runs on its result.
ds = hl.sample_qc(hl.variant_qc(ds))
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
Exemple #33
0
def _pca_and_moments(A,
                     k=10,
                     num_moments=5,
                     compute_loadings=False,
                     q_iterations=2,
                     oversampling_param=2,
                     block_size=128,
                     moment_samples=100):
    """Compute PCA scores/eigenvalues for ``A`` plus spectral-moment estimates.

    NOTE(review): the numerical method is only described as far as this body
    shows it. ``_krylov_factorization``, ``reduced_svd`` and
    ``spectral_moments`` are defined elsewhere -- confirm details there.

    Parameters
    ----------
    A
        A ``TallSkinnyMatrix``, or an expression that is validated with
        ``check_entry_indexed`` and converted via ``_make_tsm_from_call``.
    k
        Passed to ``fact.reduced_svd``.
    num_moments
        Number of moments requested from ``spectral_moments``; also the
        number of exact terms added back.
    compute_loadings
        When true, a loadings table is built from ``U``; otherwise the
        loadings result is ``None``.
    q_iterations
        Passed to the first ``_krylov_factorization`` call.
    oversampling_param
        Added to ``k`` to get the column count of the random start matrix.
    block_size
        Only used when ``A`` has to be converted to a ``TallSkinnyMatrix``.
    moment_samples
        Column count of the random +/-1 matrix used for moment estimation.

    Returns
    -------
    tuple
        ``(eigens, st, lt, moments, stdevs)``: the evaluated ``S * S``, a
        table of column structs annotated with ``scores`` keyed by
        ``A.col_key``, the loadings table or ``None``, and the evaluated
        moments and standard deviations.
    """
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    # Set Parameters
    q = q_iterations
    L = k + oversampling_param
    n = A.ncols

    # Generate random matrix G
    # (n x L, standard-normal entries; the lambda's `n` shadows the outer `n`
    # and is unused. Only element [0] of the QR result is kept.)
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_loadings)
    info("_reduced_svd: Computing local SVD")
    U, S, V = fact.reduced_svd(k)

    # Argument for the second factorization: half the moment count, capped at 10.
    p = min(num_moments // 2, 10)

    # Generate random matrix G2 for moment estimation
    # (n x moment_samples, each entry -1 or 1 with probability 0.5)
    G2 = hl.nd.zeros(
        (n,
         moment_samples)).map(lambda n: hl.if_else(hl.rand_bool(0.5), -1, 1))
    # Project out components in subspace fact.V, which we can compute exactly
    G2 = G2 - fact.V @ (fact.V.T @ G2)
    Q1, R1 = hl.nd.qr(G2)._persist()
    fact2 = _krylov_factorization(A, Q1, p, compute_U=False)
    moments_and_stdevs = fact2.spectral_moments(num_moments, R1)
    # Add back exact moments
    # (sum of fact.S ** (2 * i) for i = 1..num_moments)
    moments = moments_and_stdevs.moments + hl.nd.array([
        fact.S.map(lambda x: x**(2 * i)).sum()
        for i in range(1, num_moments + 1)
    ])
    # Evaluate moments and stdevs together in a single hl.eval call.
    moments_and_stdevs = hl.eval(
        hl.struct(moments=moments, stdevs=moments_and_stdevs.stdevs))
    moments = moments_and_stdevs.moments
    stdevs = moments_and_stdevs.stdevs

    scores = V * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    # Pair each column struct with its row of scores and build the scores table.
    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.source_table.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=A.col_key)

    if compute_loadings:
        # Attach U as a global, index the rows, and read each row's loadings
        # out of U by that index; the temporary index field is dropped after.
        lt = A.source_table.select()
        lt = lt.annotate_globals(U=U)
        idx_name = '_tmp_pca_loading_index'
        lt = lt.add_index(idx_name)
        lt = lt.annotate(
            loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
        lt = lt.drop(lt[idx_name])
    else:
        lt = None

    return eigens, st, lt, moments, stdevs
Exemple #34
0
import hail as hl
import hail.expr.aggregators as agg
from hail.stats import *
from hail.utils.java import warn

# NOTE(review): `os` and `shutil` are used below, but no explicit import of
# either is visible in this snippet -- verify they are in scope.
# Make sure the output directory exists before anything is written to it.
if not os.path.isdir("output/"):
    os.mkdir("output/")

# Remove dataset directories left in the current working directory, if present.
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
for f in files:
    if os.path.isdir(f):
        shutil.rmtree(f)

# Load the example VCF and downsample its rows via sample_rows(0.03).
ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)
# Row annotations: one random boolean flag plus constant placeholder values.
ds = ds.annotate_rows(use_in_kinship=hl.rand_bool(0.9),
                      panel_maf=0.1,
                      anno1=5,
                      anno2=0,
                      consequence="LOF",
                      gene="A",
                      score=5.0)
ds = ds.annotate_rows(a_index=1)
# variant_qc is applied first; sample_qc then runs on its result.
ds = hl.sample_qc(hl.variant_qc(ds))
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
Exemple #35
0
 def test_plot_roc_curve(self):
     """Run ``hl.experimental.plot_roc_curve`` on a 100-row table of random scores."""
     scored = hl.utils.range_table(100).annotate(score1=hl.rand_norm(), score2=hl.rand_norm())
     # True positives are drawn only where score1 is positive; score3 is score1 plus noise.
     scored = scored.annotate(
         tp=hl.cond(scored.score1 > 0, hl.rand_bool(0.7), False),
         score3=scored.score1 + hl.rand_norm(),
     )
     # False positives are drawn only among rows that are not true positives.
     labelled = scored.annotate(fp=hl.cond(~scored.tp, hl.rand_bool(0.2), False))
     score_fields = ['score1', 'score2', 'score3']
     _, aucs = hl.experimental.plot_roc_curve(labelled, score_fields)