def get_test_genotypes_bm(chrom, genotype_bm_path):

    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())

    if chrom == 'all':
        mt = get_filtered_mt_with_x()
    else:
        mt = get_filtered_mt(chrom=chrom, entry_fields=('dosage', ))

    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))

    controls = hl.read_table(f'{scratch_dir}/genotype_samples_n10.ht')
    cases = hl.read_table(f'{scratch_dir}/genotype_samples_n10_cases.ht')
    samples_ht = cases.union(controls)
    mt = mt.filter_cols(hl.is_defined(samples_ht[hl.int32(mt.s)]))

    mt = mt.key_cols_by(userId=hl.int32(mt.s))
    print(mt.count())

    mt = mt.select_cols().select_rows()
    mt = mt.repartition(1000)
    BlockMatrix.write_from_entry_expr(mt.dosage,
                                      genotype_bm_path,
                                      overwrite=True)
def get_test_genotypes_mt(chrom, genotype_samples_ht_path, genotype_mt_path,
                          cases_only):
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    if chrom == 'all':
        mt = get_filtered_mt_with_x()
    else:
        mt = get_filtered_mt(chrom=chrom, entry_fields=('dosage', ))

    if cases_only:
        t2d_ht = hl.read_table(
            'gs://ukbb-diverse-temp-30day/nb-scratch/t2d.ht/')
        t2d_ht = t2d_ht.filter(t2d_ht.both_sexes == 1)
        t2d_ht = t2d_ht.key_by('userId')
        mt = mt.filter_cols(hl.is_defined(t2d_ht[hl.int32(mt.s)]))

    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))

    if not hl.hadoop_is_file(f'{genotype_samples_ht_path}/_SUCCESS'):
        samples = mt.s.take(10)
        mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
        mt = mt.key_cols_by(userId=hl.int32(mt.s))
        mt.cols().add_index().write(genotype_samples_ht_path, overwrite=True)
    else:
        samples_ht = hl.read_table(genotype_samples_ht_path)
        samples = samples_ht.s.collect()
        mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
        mt = mt.key_cols_by(userId=hl.int32(mt.s))

    mt = mt.select_entries('dosage')
    mt = mt.select_rows()
    mt = mt.select_cols()
    mt = mt.repartition(10)
    mt.write(genotype_mt_path)
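
A hedged usage sketch (hypothetical paths; `scratch_dir` and the `get_filtered_mt*` helpers come from the surrounding module):

    get_test_genotypes_mt(chrom='22',
                          genotype_samples_ht_path=f'{scratch_dir}/genotype_samples_n10.ht',
                          genotype_mt_path=f'{scratch_dir}/genotype_n10.mt',
                          cases_only=False)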
Example #3
def _linreg(y, x, nested_dim):
    k = len(x)
    k0 = nested_dim
    if k0 < 0 or k0 > k:
        raise ValueError(
            "linreg: `nested_dim` must be between 0 and the number "
            f"of covariates ({k}), inclusive")

    t = hl.tstruct(beta=hl.tarray(hl.tfloat64),
                   standard_error=hl.tarray(hl.tfloat64),
                   t_stat=hl.tarray(hl.tfloat64),
                   p_value=hl.tarray(hl.tfloat64),
                   multiple_standard_error=hl.tfloat64,
                   multiple_r_squared=hl.tfloat64,
                   adjusted_r_squared=hl.tfloat64,
                   f_stat=hl.tfloat64,
                   multiple_p_value=hl.tfloat64,
                   n=hl.tint64)

    x = hl.array(x)
    k = hl.int32(k)
    k0 = hl.int32(k0)

    return _agg_func('LinearRegression',
                     _to_agg(y),
                     t, [k, k0],
                     seq_op_args=[lambda y: y, x])
Example #4
def default_compute_info(mt: hl.MatrixTable,
                         site_annotations: bool = False,
                         n_partitions: int = 5000) -> hl.Table:
    """
    Computes an HT with the typical GATK allele-specific (AS) info fields
    as well as ACs and lowqual fields.
    Note that this table doesn't split multi-allelic sites.

    :param mt: Input MatrixTable. Note that this table should be filtered to nonref sites.
    :param site_annotations: Whether to also generate site level info fields. Default is False.
    :param n_partitions: Number of desired partitions for output Table. Default is 5000.
    :return: Table with info fields
    :rtype: Table
    """
    # Move gvcf info entries out from nested struct
    mt = mt.transmute_entries(**mt.gvcf_info)

    # Compute AS info expr
    info_expr = get_as_info_expr(mt)

    if site_annotations:
        info_expr = info_expr.annotate(**get_site_info_expr(mt))

    # Add AC and AC_raw:
    # First compute ACs for each non-ref allele, grouped by adj
    grp_ac_expr = hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai),
            hl.agg.group_by(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.agg.sum(
                    mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[
                        mt.LA.index(ai)]),
            ),
        ),
        hl.range(1, hl.len(mt.alleles)),
    )

    # Then, for each non-ref allele, compute
    # AC as the adj group
    # AC_raw as the sum of adj and non-adj groups
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))),
    )

    info_ht = mt.select_rows(info=info_expr).rows()

    # Add AS lowqual flag
    info_ht = info_ht.annotate(AS_lowqual=get_lowqual_expr(
        info_ht.alleles, info_ht.info.AS_QUALapprox))

    if site_annotations:
        # Add lowqual flag
        info_ht = info_ht.annotate(
            lowqual=get_lowqual_expr(info_ht.alleles, info_ht.info.QUALapprox))

    return info_ht.naive_coalesce(n_partitions)
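
A minimal usage sketch, assuming the helper expressions used above (`get_as_info_expr`, `get_site_info_expr`, `get_adj_expr`, `get_lowqual_expr`) are imported from the same gnomAD utility module:

    info_ht = default_compute_info(mt, site_annotations=True, n_partitions=5000)
    info_ht.write('gs://my-bucket/info.ht', overwrite=True)  # hypothetical output path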
Example #5
def get_cpx_interval(x):
    # an example format of CPX_INTERVALS is "DUP_chr1:1499897-1499974"
    type_chr = x.split('_chr')
    chr_pos = type_chr[1].split(':')
    pos = chr_pos[1].split('-')
    return hl.struct(type=type_chr[0],
                     chrom=chr_pos[0],
                     start=hl.int32(pos[0]),
                     end=hl.int32(pos[1]))
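
A quick, hypothetical check of the parser against the format quoted in the comment above:

    import hail as hl
    hl.eval(get_cpx_interval(hl.str('DUP_chr1:1499897-1499974')))
    # Struct(type='DUP', chrom='1', start=1499897, end=1499974)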
Example #6
File: ldscsim.py Project: zscu/hail
def ascertainment_bias(mt, y, P):
    """Adds ascertainment bias to a binary phenotype such that it was sample 
    prevalence of `P` = cases/(cases+controls).
    
    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` containing binary phenotype to be used.
    y : :class:`.Expression`
        Column field of binary phenotype.
    P : :obj:`int` or :obj:`float`
        Desired "sample prevalence" of phenotype.
        
    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` containing binary phenotype with prevalence of approx. P
    """
    assert P >= 0 and P <= 1, 'P must be in [0,1]'
    tid = ''.join(
        random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5)
    )  # "temporary id" -- random string to identify temporary intermediate fields generated by this method
    mt = mt.annotate_cols(y_w_asc_bias=y)
    y_stats = mt.aggregate_cols(hl.agg.stats(mt.y_w_asc_bias))
    K = y_stats.mean
    n = y_stats.n
    assert abs(
        P - K
    ) < 1, 'Specified sample prevalence is incompatible with population prevalence.'
    if P < K:
        p = (1 - K) * P / (K * (1 - P))
        con = mt.filter_cols(mt.y_w_asc_bias == 0)
        cas = mt.filter_cols(mt.y_w_asc_bias == 1).add_col_index(
            name='col_idx_' + tid)
        keep = round(p * n * K) * [1] + round((1 - p) * n * K) * [0]
        cas = cas.annotate_cols(
            **
            {'keep_' + tid: hl.literal(keep)[hl.int32(cas['col_idx_' + tid])]})
        cas = cas.filter_cols(cas['keep_' + tid] == 1)
        cas = _clean_fields(cas, tid)
        mt = cas.union_cols(con)
    elif P > K:
        p = K * (1 - P) / ((1 - K) * P)
        cas = mt.filter_cols(mt.y_w_asc_bias == 1)
        con = mt.filter_cols(mt.y_w_asc_bias == 0).add_col_index(
            name='col_idx_' + tid)
        keep = round(p * n * (1 - K)) * [1] + round(
            (1 - p) * n * (1 - K)) * [0]
        con = con.annotate_cols(
            **
            {'keep_' + tid: hl.literal(keep)[hl.int32(con['col_idx_' + tid])]})
        con = con.filter_cols(con['keep_' + tid] == 1)
        con = _clean_fields(con, tid)
        mt = con.union_cols(cas)
    return mt
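
A hedged usage sketch (hypothetical; assumes `sim_mt` carries a binary column field `y_bin`, e.g. produced by an upstream binarization step in the same module):

    biased_mt = ascertainment_bias(sim_mt, y=sim_mt.y_bin, P=0.5)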
Example #7
def ascertainment_bias(mt, y, P):
    r"""Adds ascertainment bias to a binary phenotype to give it a sample
    prevalence of `P` = cases/(cases+controls).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` containing binary phenotype to be used.
    y : :class:`.Expression`
        Column field of binary phenotype.
    P : :obj:`int` or :obj:`float`
        Desired "sample prevalence" of phenotype.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` containing binary phenotype with prevalence of approx. P
    """
    assert P >= 0 and P <= 1, 'P must be in [0,1]'
    uid = Env.get_uid(base=100)
    mt = mt.annotate_cols(y_w_asc_bias=y)
    y_stats = mt.aggregate_cols(hl.agg.stats(mt.y_w_asc_bias))
    K = y_stats.mean
    n = y_stats.n
    assert abs(
        P - K
    ) < 1, 'Specified sample prevalence is incompatible with population prevalence.'
    if P < K:
        p = (1 - K) * P / (K * (1 - P))
        con = mt.filter_cols(mt.y_w_asc_bias == 0)
        cas = mt.filter_cols(mt.y_w_asc_bias == 1).add_col_index(
            name='col_idx_' + uid)
        keep = round(p * n * K) * [1] + round((1 - p) * n * K) * [0]
        cas = cas.annotate_cols(
            **
            {'keep_' + uid: hl.literal(keep)[hl.int32(cas['col_idx_' + uid])]})
        cas = cas.filter_cols(cas['keep_' + uid] == 1)
        cas = _clean_fields(cas, uid)
        mt = cas.union_cols(con)
    elif P > K:
        p = K * (1 - P) / ((1 - K) * P)
        cas = mt.filter_cols(mt.y_w_asc_bias == 1)
        con = mt.filter_cols(mt.y_w_asc_bias == 0).add_col_index(
            name='col_idx_' + uid)
        keep = round(p * n * (1 - K)) * [1] + round(
            (1 - p) * n * (1 - K)) * [0]
        con = con.annotate_cols(
            **
            {'keep_' + uid: hl.literal(keep)[hl.int32(con['col_idx_' + uid])]})
        con = con.filter_cols(con['keep_' + uid] == 1)
        con = _clean_fields(con, uid)
        mt = con.union_cols(cas)
    return mt
Example #8
def generate_random_gen():
    mt = hl.utils.range_matrix_table(30, 10)
    mt = (mt.annotate_rows(locus=hl.locus('20', mt.row_idx + 1),
                           alleles=['A', 'G']).key_rows_by('locus', 'alleles'))
    mt = (mt.annotate_cols(s=hl.str(mt.col_idx)).key_cols_by('s'))
    # using totally random values leads to rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a=hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b=hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    mt = mt.transmute_entries(GP=hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b]) /
                              255.0)
    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))
    hl.export_gen(mt, 'random', precision=4)
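
As a hedged follow-up, the export can be round-tripped (export_gen writes 'random.gen' plus a matching 'random.sample'):

    mt2 = hl.import_gen('random.gen', sample_file='random.sample')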
Example #9
File: nd.py Project: saponas/hail
def full(shape, value, dtype=None):
    """Creates a hail :class:`.NDArrayNumericExpression` full of the specified value.

    Examples
    --------

    Create a 5 by 7 NDArray of type :py:data:`.tfloat64` 9s.

    >>> hl.nd.full((5, 7), 9)

    It is possible to specify a type other than :py:data:`.tfloat64` with the `dtype` argument.

    >>> hl.nd.full((5, 7), 9, dtype=hl.tint32)

    Parameters
    ----------
    shape : `tuple` or :class:`.TupleExpression`
            Desired shape.
    value : :class:`.Expression` or python value
            Value to fill ndarray with.
    dtype : :class:`.HailType`
            Desired hail type.

    Returns
    -------
    :class:`.NDArrayNumericExpression`
        An ndarray of the specified shape filled with the specified value.
    """
    if isinstance(shape, Int64Expression):
        shape_product = shape
    else:
        shape_product = reduce(lambda a, b: a * b, shape)
    return arange(hl.int32(shape_product)).map(
        lambda x: cast_expr(value, dtype)).reshape(shape)
Example #10
def full(shape, value):
    if isinstance(shape, Int64Expression):
        shape_product = shape
    else:
        shape_product = reduce(lambda a, b: a * b, shape)
    return array(hl.range(
        hl.int32(shape_product)).map(lambda x: value)).reshape(shape)
Example #11
def full(shape, value, dtype=None):
    if isinstance(shape, Int64Expression):
        shape_product = shape
    else:
        shape_product = reduce(lambda a, b: a * b, shape)
    return arange(hl.int32(shape_product)).map(
        lambda x: cast_expr(value, dtype)).reshape(shape)
Example #12
def make_corr_betas(mt, h2=None, rg=None, cov_array=None, seed=None):
    '''Make correlated betas for multi-trait simulations'''
    seed = seed if seed is not None else int.from_bytes(os.urandom(4),
                                                        byteorder="big")
    M = mt.count_rows()
    if cov_array is not None:
        n_phens = cov_array.shape[0]
    else:
        n_phens = len(h2)
    if rg is None and cov_array is None:
        print(f'Assuming rg=0 for all {n_phens} traits')
        rg = [0] * int((n_phens**2 - n_phens) / 2)
    if cov_array is None:
        cov_array = create_cov_array(h2, rg)
    cov_array = (1 / M) * cov_array
    randstate = np.random.RandomState(
        int(seed))  #seed random state for replicability
    betas = randstate.multivariate_normal(mean=np.zeros(n_phens),
                                          cov=cov_array,
                                          size=[
                                              M,
                                          ])
    df = pd.DataFrame([0] * M, columns=['__beta'])
    tb = hl.Table.from_pandas(df)
    tb = tb.add_index().key_by('idx')
    tb = tb.annotate(__beta=hl.literal(betas.tolist())[hl.int32(tb.idx)])
    mt = mt.add_row_index()
    mt = mt.annotate_rows(__beta=tb[mt.row_idx]['__beta'])
    return mt, betas
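
A minimal usage sketch with hypothetical parameters (two traits with heritabilities 0.3 and 0.4 and a genetic correlation of 0.5; `create_cov_array` is assumed to come from the same simulation module):

    mt, betas = make_corr_betas(mt, h2=[0.3, 0.4], rg=[0.5], seed=42)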
Example #13
def generate_random_gen():
    mt = hl.utils.range_matrix_table(30, 10)
    mt = (mt.annotate_rows(locus = hl.locus('20', mt.row_idx + 1),
                           alleles = ['A', 'G'])
          .key_rows_by('locus', 'alleles'))
    mt = (mt.annotate_cols(s = hl.str(mt.col_idx))
          .key_cols_by('s'))
    # using totally random values leads to rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a = hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b = hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    mt = mt.transmute_entries(GP = hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b]) / 255.0)
    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))
    hl.export_gen(mt, 'random', precision=4)
Example #14
    def test_import_bgen_variant_filtering(self):
        desired_variant_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]
        actual = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                contig_recoding={'01': '1'},
                                reference_genome=None,
                                n_partitions=10,
                                _row_fields=['file_row_idx'],
                                _variants_per_file={
                                    resource('example.8bits.bgen'):
                                    desired_variant_indexes
                                })
        # doing the expected import_bgen second catches the case where the
        # hadoop configuration is polluted with old data from the
        # _variants_per_file
        everything = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                    contig_recoding={'01': '1'},
                                    reference_genome=None,
                                    _row_fields=['file_row_idx'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(
            hl.set(desired_variant_indexes).contains(
                hl.int32(everything.file_row_idx)))

        self.assertTrue(expected._same(actual))
        self.assertEqual(
            (hl.str(actual.locus.contig) + ":" +
             hl.str(actual.locus.position)).collect(), [
                 '1:3000', '1:4000', '1:5000', '1:7000', '1:9000', '1:11000',
                 '1:13000', '1:15000', '1:19000', '1:100001'
             ])
Example #15
File: ldscsim.py Project: zscu/hail
def multitrait_ss(mt, h2, pi, rg=0, seed=None):
    """Generates spike & slab betas for simulation of two correlated phenotypes.
    
    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` for simulated phenotype.
    h2 : :obj:`list`
        Desired SNP-based heritability of simulated traits.
    pi : :obj:`list`
        List of proportions of SNPs: :math:`p_{TT}`, :math:`p_{TF}`, :math:`p_{FT}`.
        :math:`p_{TT}` is the proportion of SNPs that are causal for both traits,
        :math:`p_{TF}` is the proportion of SNPs that are causal for trait 1 but not trait 2,
        :math:`p_{FT}` is the proportion of SNPs that are causal for trait 2 but not trait 1.
        :math:`p_{FF}` is the remaining proportion of SNPs, i.e. the proportion
        causal for neither trait.
    rg : :obj:`float` or :obj:`int`
        Genetic correlation between traits.
    seed : :obj:`int`, optional
        Seed for random number generator. If `seed` is ``None``, `seed` is set randomly.
    
    Warning
    -------
    May give inaccurate results if the chosen parameters make the covariance
    matrix not positive semi-definite.
    
    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated SNP effects as a row field of arrays.
    """
    seed = seed if seed is not None else int(str(Env.next_seed())[:8])
    ptt, ptf, pft, pff = pi[0], pi[1], pi[2], 1 - sum(pi)
    cov_matrix = np.asarray([[1 / (ptt + ptf), rg / ptt],
                             [rg / ptt, 1 / (ptt + pft)]])
    M = mt.count_cols()
    randstate = np.random.RandomState(
        int(seed))  #seed random state for replicability
    beta = randstate.multivariate_normal(mean=np.zeros(2),
                                         cov=cov_matrix,
                                         size=[
                                             int(M),
                                         ])
    zeros = np.zeros(shape=int(M)).T
    beta_matrix = np.stack(
        (np.asarray([zeros, zeros]).T, np.asarray(
            [zeros, beta[:, 1]]).T, np.asarray([beta[:, 0], zeros]).T, beta),
        axis=1)
    idx = np.random.choice([0, 1, 2, 3], p=[pff, pft, ptf, ptt], size=int(M))
    betas = beta_matrix[range(int(M)), idx, :]
    betas[:, 0] *= (h2[0] / M)**(1 / 2)
    betas[:, 1] *= (h2[1] / M)**(1 / 2)
    df = pd.DataFrame([0] * M, columns=['beta'])
    tb = hl.Table.from_pandas(df)
    tb = tb.add_index().key_by('idx')
    tb = tb.annotate(beta=hl.literal(betas.tolist())[hl.int32(tb.idx)])
    mt = mt.add_row_index()
    mt = mt.annotate_rows(beta=tb[mt.row_idx]['beta'])
    return mt
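
A minimal usage sketch with hypothetical parameters (two traits; 10% of SNPs causal for both, 10% for each trait alone, 70% for neither):

    mt = multitrait_ss(mt, h2=[0.3, 0.4], pi=[0.1, 0.1, 0.1], rg=0.5, seed=42)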
Example #16
def eye(N, M=None, dtype=hl.tfloat64):
    """
    Construct a 2-D :class:`.NDArrayExpression` with ones on the *main* diagonal
    and zeros elsewhere.

    Parameters
    ----------
    N : :class:`.NumericExpression` or Python number
      Number of rows in the output.
    M : :class:`.NumericExpression` or Python number, optional
      Number of columns in the output. If None, defaults to `N`.
    dtype : numeric :class:`.HailType`, optional
      Element type of the returned array. Defaults to :py:data:`.tfloat64`

    Returns
    -------
    I : :class:`.NDArrayExpression` representing a Hail ndarray of shape (N,M)
      An ndarray whose elements are equal to one on the main diagonal, zeroes elsewhere.

    See Also
    --------
    :func:`.identity`
    :func:`.diagonal`

    Examples
    --------
    >>> hl.eval(hl.nd.eye(3))
    array([[1., 0., 0.],
           [0., 1., 0.],
           [0., 0., 1.]])
    >>> hl.eval(hl.nd.eye(2, 5, dtype=hl.tint32))
    array([[1, 0, 0, 0, 0],
           [0, 1, 0, 0, 0]], dtype=int32)
    """

    n_row = hl.int32(N)
    if M is None:
        n_col = n_row
    else:
        n_col = hl.int32(M)

    return hl.nd.array(hl.range(0, n_row * n_col).map(
        lambda i: hl.if_else((i // n_col) == (i % n_col),
                             hl.literal(1, dtype),
                             hl.literal(0, dtype))
    )).reshape((n_row, n_col))
Example #17
 def _promote_scalar(self, typ):
     if typ == tint32:
         return hail.int32(self)
     elif typ == tint64:
         return hail.int64(self)
     elif typ == tfloat32:
         return hail.float32(self)
     else:
         assert typ == tfloat64
         return hail.float64(self)
Example #18
def compute_fisher_exact(tb: hl.Table,
                         n_cases_col: str,
                         n_control_col: str,
                         total_cases_col: str,
                         total_controls_col: str,
                         correct_total_counts: bool,
                         root_col_name: str,
                         extra_fields: dict) -> hl.Table:
    """
    Perform a two-sided Fisher's exact test and add extra annotations (if any).

    :param tb: Hail Table
    :param n_cases_col: field name with number of (affected) cases
    :param n_control_col: field name with number of (affected) controls
    :param total_cases_col: field name with total number of cases
    :param total_controls_col: field name with total number of controls
    :param correct_total_counts: should the total numbers (case/control) be corrected to avoid duplicated counting?
    :param root_col_name: field to be annotated with test results
    :param extra_fields: Extra fields (must be a dict) to be annotated
    :return: Hail Table with Fisher Exact test results.
    """
    # compute fisher exact
    if correct_total_counts:
        fet = hl.fisher_exact_test(c1=hl.int32(tb[n_cases_col]),
                                   c2=hl.int32(tb[n_control_col]),
                                   c3=hl.int32(tb[total_cases_col]) - hl.int32(tb[n_cases_col]),
                                   c4=hl.int32(tb[total_controls_col]) - hl.int32(tb[n_control_col]))
    else:
        fet = hl.fisher_exact_test(c1=hl.int32(tb[n_cases_col]),
                                   c2=hl.int32(tb[n_control_col]),
                                   c3=hl.int32(tb[total_cases_col]),
                                   c4=hl.int32(tb[total_controls_col]))

    tb = (tb
          .annotate(**{root_col_name: fet})
          .flatten()
          )

    if len(extra_fields) == 0:
        return tb
    else:
        return tb.annotate(**extra_fields)
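
A hedged usage sketch; the column names below are hypothetical placeholders for whatever count fields the input Table actually carries:

    ht = compute_fisher_exact(tb=ht,
                              n_cases_col='n_cases',
                              n_control_col='n_controls',
                              total_cases_col='total_cases',
                              total_controls_col='total_controls',
                              correct_total_counts=True,
                              root_col_name='fet',
                              extra_fields={})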
Example #19
def test_ndarray_full():
    assert_ndarrays_eq(
        (hl.nd.zeros(4), np.zeros(4)),
        (hl.nd.zeros((3, 4, 5)), np.zeros((3, 4, 5))),
        (hl.nd.ones(6), np.ones(6)),
        (hl.nd.ones((6, 6, 6)), np.ones((6, 6, 6))),
        (hl.nd.full(7, 9), np.full(7, 9)),
        (hl.nd.full((3, 4, 5), 9), np.full((3, 4, 5), 9)))

    assert hl.eval(hl.nd.zeros((5, 5), dtype=hl.tfloat32)).dtype == np.float32
    assert hl.eval(hl.nd.ones(3, dtype=hl.tint64)).dtype == np.int64
    assert hl.eval(hl.nd.full((5, 6, 7), hl.int32(3), dtype=hl.tfloat64)).dtype == np.float64
Example #20
def parse_as_ranksum(string, has_non_ref):
    typ = hl.ttuple(hl.tfloat64, hl.tint32)
    items = string.split(r'\|')
    items = hl.cond(has_non_ref, items[:-1], items)
    return items.map(lambda s: hl.cond(
        (hl.len(s) == 0) | (s == '.'),
        hl.null(typ),
        hl.rbind(s.split(','), lambda ss: hl.cond(
            hl.len(ss) != 2,  # bad field, possibly 'NaN', just set it null
            hl.null(hl.ttuple(hl.tfloat64, hl.tint32)),
            hl.tuple([hl.float64(ss[0]), hl.int32(ss[1])])))))
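
For illustration (hypothetical input), a pipe-delimited rank-sum string parses to an array of (float64, int32) tuples, with the trailing non-ref slot dropped when `has_non_ref` is true:

    hl.eval(parse_as_ranksum(hl.str('0.25,1|-0.5,2|.'), hl.bool(True)))
    # [(0.25, 1), (-0.5, 2)]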
Example #21
 def make_random_function(self, mt):
     from functools import reduce
     #check that row key of annotations matches row key of mt
     mt = mt.add_row_index()
     rows = [rf for rf in self.a_ht.row]
     self.a_ht = self.a_ht.annotate(__a__=reduce(
         self.f, map(lambda x: self.a_ht[rows[x]], range(len(rows)))))
     std = self.a_ht.aggregate(hl.agg.stats(self.a_ht.__a__)).stdev
     self.a_ht = self.a_ht.annotate(__a__=self.a_ht.__a__ *
                                    hl.sqrt(self.h2 / std))
     return mt.annotate_rows(beta=hl.literal(
         self.a_ht.__a__.take(mt.count_rows()))[hl.int32(mt.row_idx)])
Example #22
def _linreg(y, x, nested_dim):
    k = len(x)
    k0 = nested_dim
    if k0 < 0 or k0 > k:
        raise ValueError("linreg: `nested_dim` must be between 0 and the number "
                         f"of covariates ({k}), inclusive")

    t = hl.tstruct(beta=hl.tarray(hl.tfloat64),
                   standard_error=hl.tarray(hl.tfloat64),
                   t_stat=hl.tarray(hl.tfloat64),
                   p_value=hl.tarray(hl.tfloat64),
                   multiple_standard_error=hl.tfloat64,
                   multiple_r_squared=hl.tfloat64,
                   adjusted_r_squared=hl.tfloat64,
                   f_stat=hl.tfloat64,
                   multiple_p_value=hl.tfloat64,
                   n=hl.tint64)

    x = hl.array(x)
    k = hl.int32(k)
    k0 = hl.int32(k0)

    return _agg_func('LinearRegression', [y, x], t, [k, k0])
Example #23
File: nd.py Project: saponas/hail
def diagonal(nd):
    """Gets the diagonal of a 2 dimensional NDArray.

    Examples
    --------

    >>> hl.eval(hl.nd.diagonal(hl.nd.array([[1, 2], [3, 4]])))
    array([1, 4], dtype=int32)

    :param nd: A 2 dimensional NDArray, shape(M, N).
    :return: A 1-dimensional NDArray of length min(M, N), containing the diagonal of `nd`.
    """
    assert nd.ndim == 2, "diagonal requires 2 dimensional ndarray"
    shape_min = hl.min(nd.shape[0], nd.shape[1])
    return hl.nd.array(hl.range(hl.int32(shape_min)).map(lambda i: nd[i, i]))
Example #24
def get_phased_gnomad_ht(ht: hl.Table,
                         em: bool = True,
                         lr: bool = True,
                         shr: bool = True) -> hl.Table:
    expr_fun = []

    if em:
        expr_fun.append(get_em_expressions)

    if lr:
        expr_fun.append(get_lr_expressions)

    if shr:
        expr_fun.append(get_single_het_expressions)

    if not expr_fun:
        raise Exception("No expressions to annotate")

    # Support for both exploded or dict versions of gt_counts
    # dict
    if isinstance(ht.gt_counts, hl.expr.DictExpression):
        ht = ht.select(
            phase_info=ht.gt_counts.map_values(lambda pop_count: hl.bind(
                lambda x: hl.struct(
                    gt_counts=x,
                    **{k: v
                       for f in expr_fun for k, v in f(x).items()}),
                hl.struct(raw=pop_count.raw.map(lambda y: hl.int32(y)),
                          adj=pop_count.adj.map(lambda z: hl.int32(z))))))
    # exploded
    else:
        ht = ht.annotate(
            **{k: v
               for f in expr_fun for k, v in f(ht.gt_counts).items()})

    return ht
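
A hedged usage sketch, assuming the `get_*_expressions` helpers referenced above are importable from the same module:

    phase_ht = get_phased_gnomad_ht(ht, em=True, lr=False, shr=False)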
Example #25
    def _get(self, var_df, samples, field):
        # array with samples in HAIL
        if isinstance(samples, str):
            samples = [samples]

        # create table with vars in HAIL
        ht = hl.Table.from_pandas(var_df)

        # create table with samples
        df = pd.DataFrame({'s': samples})
        ht_samples = hl.Table.from_pandas(df)

        ht = ht.join(ht_samples)
        ht = ht.annotate(pos=hl.int32(ht.pos))
        ht = ht.add_index()
        ht = ht.key_by(locus=hl.struct(contig=ht.chrom, position=ht.pos),
                       alleles=hl.array([ht.ref, ht.alt]),
                       s=ht.s)

        # all variants per sample
        res_table = None

        ht_paths = self._get_Tables_paths(samples)
        # iterate through ht_vcfs with samples
        for ht_path in ht_paths:
            ht_vcf = hl.read_table(ht_path)

            ht_n = ht.join(ht_vcf, how='left')

            if res_table is None:
                res_table = ht_n
                res_table = res_table.checkpoint('db/checkpoint/ht1.ht',
                                                 overwrite=True)
            else:
                res_table = res_table.union(ht_n)

        # all variants per sample
        res_table = res_table.annotate(GT=hl.coalesce(res_table.GT, 0))
        res_table = res_table.annotate(DP=hl.coalesce(res_table.DP, 0))
        res_table = res_table.annotate(GQ=hl.coalesce(res_table.GQ, 0))
        res_table = res_table.order_by(res_table.idx)
        res_table = res_table.checkpoint('db/checkpoint/ht2.ht',
                                         overwrite=True)
        return np.column_stack([
            np.array(res_table.filter(
                res_table.s == sample)[field].collect()).reshape(-1, 1)
            for sample in samples
        ])
Example #26
def get_phen_files(nsamples, min_id, max_id, parsplit, paridx):
    nsamples = str(int(nsamples / 1000))
    print(
        f'\r#########\nGetting phen files for {nsamples}k samples\n#########')
    mt0 = hl.read_matrix_table('gs://nbaya/ldscsim/hm3.50_sim_h2_0.08.mt/')
    ht0 = mt0.select_cols(mt0.nonsim_phen).cols()
    ht1 = ht0.rename({'s': 'IID', 'nonsim_phen': 'y'})
    ht1 = ht1.annotate(FID='0')
    ht1 = ht1.key_by(ht1.FID)
    ht1 = ht1.select(ht1.IID, ht1.y)
    ht1 = ht1.key_by(ht1.IID)
    ids = hl.import_table(f'gs://nbaya/split/gcta/gcta_{nsamples}k.grm.id',
                          no_header=True)  #GRM ids
    ids = ids.rename({'f0': 'FID', 'f1': 'IID'})
    ids = set(ids.IID.take(ids.count()))
    ht2 = ht1.filter(hl.literal(ids).contains(ht1['IID']))
    n = ht2.count()
    rep_ids = range(
        min_id + paridx - 1, max_id + 1, parsplit
    )  #replicate "IDs", which were used as seeds to generate the random split
    for rep_id in rep_ids:
        try:
            is_complete = subprocess.check_output([
                'gsutil', 'ls',
                f'gs://nbaya/split/gcta/gcta_{nsamples}k.s{rep_id}.phen'
            ]) is not None
        except subprocess.CalledProcessError:
            is_complete = False
        if not is_complete:
            start = datetime.now()
            pi = [1] * int(n / 2) + [0] * int(n / 2)
            randstate = np.random.RandomState(rep_id)
            randstate.shuffle(pi)
            ht = ht2.add_index()
            ht = ht.annotate(label=hl.literal(pi)[hl.int32(ht.idx)])
            ht = ht.annotate(y1=hl.cond(ht.label == 1, ht.y, hl.null('float')))
            ht = ht.annotate(y2=hl.cond(ht.label == 0, ht.y, hl.null('float')))
            ht = ht.drop(ht.idx, ht.label, ht.y)
            ht = ht.order_by(ht.y1)
            ht.show()
            ht.export(f'gs://nbaya/split/gcta/gcta_{nsamples}k.s{rep_id}.phen')
            runtime = datetime.now() - start
            print(
                f'######\nRuntime for generating phenfile of rep {rep_id}: {round((runtime.total_seconds())/60, 4)} min'
            )
        else:
            print(f'###### Already completed phenfile for replicate #{rep_id}')
Example #27
def unify_saige_ht_variant_schema(ht):
    shared = ('markerID', 'AC', 'AF', 'N', 'BETA', 'SE', 'Tstat', 'varT',
              'varTstar')
    new_floats = ('AF.Cases', 'AF.Controls')
    new_ints = ('N.Cases', 'N.Controls')
    shared_end = ('Pvalue', 'gene', 'annotation')
    if 'AF.Cases' not in list(ht.row):
        ht = ht.select(*shared,
                       **{field: hl.null(hl.tfloat64)
                          for field in new_floats},
                       **{field: hl.null(hl.tint32)
                          for field in new_ints},
                       **{field: ht[field]
                          for field in shared_end})
    else:
        ht = ht.select(*shared, *new_floats, *new_ints, *shared_end)
    return ht.annotate(SE=hl.float64(ht.SE), AC=hl.int32(ht.AC))
Example #28
    def get_coverage_expr(mt):
        cov_arrays = hl.literal({
            x: [1, 1, 1, 1, 1, 1, 1, 1, 0]
            if x >= 50 else [1, 1, 1, 1, 1, 1, 1, 0, 0] if x >= 30 else
            ([1] * (i + 2)) + ([0] * (7 - i))
            for i, x in enumerate(range(5, 100, 5))
        })

        return hl.bind(
            lambda array_expr: hl.struct(
                **{
                    f'over_{x}': hl.int32(array_expr[i])
                    for i, x in enumerate([1, 5, 10, 15, 20, 25, 30, 50, 100])
                }),
            hl.agg.array_sum(hl.case().when(
                mt.x >= 100, [1, 1, 1, 1, 1, 1, 1, 1, 1]).when(
                    mt.x >= 5, cov_arrays[mt.x - (mt.x % 5)]).when(
                        mt.x >= 1, [1, 0, 0, 0, 0, 0, 0, 0, 0]).default(
                            [0, 0, 0, 0, 0, 0, 0, 0, 0])))
Example #29
    def hailBlanczos(A, G, k, q):

        h_list = []
        G_i = hl.nd.qr(G)[0]

        for j in range(0, q):
            info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}")
            temp = A.annotate(H_i=A.ndarray @ G_i)
            temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
            result = temp.aggregate(hl.struct(
                Hi_chunks=hl.agg.collect(temp.H_i),
                G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
                                    _localize=False)._persist()
            localized_H_i = hl.nd.vstack(result.Hi_chunks)
            h_list.append(localized_H_i)
            G_i = hl.nd.qr(result.G_i)[0]

        info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        result = temp.aggregate(hl.agg.collect(temp.H_i),
                                _localize=False)._persist()
        info("blanczos_pca: Iterations complete. Computing local QR")
        localized_H_i = hl.nd.vstack(result)
        h_list.append(localized_H_i)
        H = hl.nd.hstack(h_list)
        Q = hl.nd.qr(H)[0]._persist()
        A = A.annotate(part_size=A.ndarray.shape[0])
        A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size)))
        A = A.annotate_globals(Qt=Q.T)
        T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding +
                                    A.part_size] @ A.ndarray)
        arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

        info("blanczos_pca: QR Complete. Computing local SVD")
        U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

        V = Q @ U

        truncV = V[:, :k]
        truncS = S[:k]
        truncW = W[:k, :]

        return truncV, truncS, truncW
Example #30
def count(expr=None):
    """Count the number of records.

    Examples
    --------
    Group by the `SEX` field and count the number of rows in each category:

    .. doctest::

        >>> (table1.group_by(table1.SEX)
        ...        .aggregate(n=agg.count())
        ...        .show())
        +-----+-------+
        | SEX |     n |
        +-----+-------+
        | str | int64 |
        +-----+-------+
        | M   |     2 |
        | F   |     2 |
        +-----+-------+

    Notes
    -----
    If `expr` is not provided, then this method will count the number of
    records aggregated. If `expr` is provided, then the result should
    make use of :meth:`filter` or :meth:`explode` so that the number of
    records aggregated changes.

    Parameters
    ----------
    expr : :class:`.Expression`, or :obj:`None`
        Expression to count.

    Returns
    -------
    :class:`.Expression` of type :py:data:`.tint64`
        Total number of records.
    """
    if expr is not None:
        return _agg_func('count', expr, tint64)
    else:
        return _agg_func('count', _to_agg(hl.int32(0)), tint64)
Example #31
 def fet_expr(het_count_exp: hl.expr.Int64Expression,
              hom_count_expr: hl.expr.Int64Expression):
     return hl.bind(
         lambda x: hl.struct(
             counts=x,
             dominant=hl.fisher_exact_test(x[0][0], x[0][1] + x[0][2],
                                           x[1][0], x[1][1] + x[1][2]),
             recessive=hl.fisher_exact_test(x[0][0] + x[0][1], x[0][
                 2], x[1][0] + x[1][1], x[1][2])),
         hl.bind(
             lambda x: [
                 [hl.int32(hl.cond(x.contains(False), x[False].get(0, 0), 0)),
                  hl.int32(hl.cond(x.contains(False), x[False].get(1, 0), 0)),
                  hl.int32(hl.cond(x.contains(False), x[False].get(2, 0), 0))],
                 [hl.int32(hl.cond(x.contains(True), x[True].get(0, 0), 0)),
                  hl.int32(hl.cond(x.contains(True), x[True].get(1, 0), 0)),
                  hl.int32(hl.cond(x.contains(True), x[True].get(2, 0), 0))],
             ],
             hl.agg.group_by(
                 mt.is_case,
                 hl.agg.counter(
                     hl.min(2, het_count_exp + 2 * hom_count_expr)))))
Example #32
def compute_coverage_stats(
    mt: hl.MatrixTable,
    reference_ht: hl.Table,
    coverage_over_x_bins: List[int] = [1, 5, 10, 15, 20, 25, 30, 50, 100],
) -> hl.Table:
    """
    Computes the following coverage statistics for every base of the `reference_ht` provided:
        - mean
        - median
        - total DP
        - fraction of samples with coverage above X, for each x in `coverage_over_x_bins`

    The `reference_ht` is a table that contains a row for each locus that coverage should be computed on.
    It needs to be keyed with the same keys as `mt`, typically either `locus` or `locus, alleles`.
    The `reference_ht` can e.g. be created using `get_reference_ht`.

    :param mt: Input sparse MT
    :param reference_ht: Input reference HT
    :param coverage_over_x_bins: List of boundaries for computing samples over X
    :return: Table with per-base coverage stats
    """

    n_samples = mt.count_cols()
    print(f"Computing coverage stats on {n_samples} samples.")

    # Create an outer join with the reference Table
    mt = mt.select_entries("END", "DP").select_cols().select_rows()
    col_key_fields = list(mt.col_key)
    t = mt._localize_entries("__entries", "__cols")
    t = t.join(reference_ht.key_by(*mt.row_key).select(_in_ref=True), how="outer")
    t = t.annotate(
        __entries=hl.or_else(
            t.__entries,
            hl.range(n_samples).map(lambda x: hl.null(t.__entries.dtype.element_type)),
        )
    )
    mt = t._unlocalize_entries("__entries", "__cols", col_key_fields)

    # Densify
    mt = hl.experimental.densify(mt)

    # Filter rows where the reference is missing
    mt = mt.filter_rows(mt._in_ref)

    # Unfilter entries so that entries with no ref block overlap aren't null
    mt = mt.unfilter_entries()

    # Compute coverage stats
    coverage_over_x_bins = sorted(coverage_over_x_bins)
    max_coverage_bin = coverage_over_x_bins[-1]
    hl_coverage_over_x_bins = hl.array(coverage_over_x_bins)

    # This expression creates a counter DP -> number of samples for DP between 0 and max_coverage_bin
    coverage_counter_expr = hl.agg.counter(
        hl.min(max_coverage_bin, hl.or_else(mt.DP, 0))
    )

    # This expression aggregates the DP counter in reverse order of the coverage_over_x_bins
    # and computes the cumulative sum over them.
    #  It needs to be in reverse order because we want the sum over samples covered by > X.
    count_array_expr = hl.cumulative_sum(
        hl.array(
            [
                hl.int32(coverage_counter_expr.get(max_coverage_bin, 0))
            ]  # The coverage was already floored to the max_coverage_bin, so no more aggregation is needed for the max bin
        ).extend(  # For each of the other bins, coverage needs to be summed between the boundaries
            hl.range(hl.len(hl_coverage_over_x_bins) - 1, 0, step=-1).map(
                lambda i: hl.sum(
                    hl.range(
                        hl_coverage_over_x_bins[i - 1], hl_coverage_over_x_bins[i]
                    ).map(lambda j: hl.int32(coverage_counter_expr.get(j, 0)))
                )
            )
        )
    )
    mean_expr = hl.agg.mean(hl.or_else(mt.DP, 0))

    # Annotate rows now
    return mt.select_rows(
        mean=hl.cond(hl.is_nan(mean_expr), 0, mean_expr),
        median_approx=hl.or_else(hl.agg.approx_median(hl.or_else(mt.DP, 0)), 0),
        total_DP=hl.agg.sum(mt.DP),
        **{
            f"over_{x}": count_array_expr[i] / n_samples
            for i, x in zip(
                range(
                    len(coverage_over_x_bins) - 1, -1, -1
                ),  # Reverse the bin index as count_array_expr has the reverse order
                coverage_over_x_bins,
            )
        },
    ).rows()
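
A hedged usage sketch (hypothetical names; per the docstring, `ref_ht` could be built with `get_reference_ht` and must share its key with the sparse MT):

    coverage_ht = compute_coverage_stats(sparse_mt, ref_ht)
    coverage_ht.write('gs://my-bucket/coverage.ht', overwrite=True)  # illustrative path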
Example #33
    def test_locus_windows(self):
        def assert_eq(a, b):
            self.assertTrue(np.array_equal(a, np.array(b)))

        centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

        mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
        mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

        starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
        assert_eq(starts, [0, 0, 0, 1, 2])
        assert_eq(stops, [3, 4, 5, 5, 5])

        starts, stops = hl.linalg.utils.locus_windows(mt.locus, 0.5, coord_expr=mt.cm)
        assert_eq(starts, [0, 1, 1, 1, 3])
        assert_eq(stops, [1, 4, 4, 5, 5])

        starts, stops = hl.linalg.utils.locus_windows(mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
        assert_eq(starts, [0, 1, 1, 1, 3])
        assert_eq(stops, [1, 4, 4, 5, 5])

        rows = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
                {'locus': hl.Locus('1', 2), 'cm': 3.0},
                {'locus': hl.Locus('1', 4), 'cm': 4.0},
                {'locus': hl.Locus('2', 1), 'cm': 2.0},
                {'locus': hl.Locus('2', 1), 'cm': 2.0},
                {'locus': hl.Locus('3', 3), 'cm': 5.0}]

        ht = hl.Table.parallelize(rows,
                                  hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                                  key=['locus'])

        starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
        assert_eq(starts, [0, 0, 2, 3, 3, 5])
        assert_eq(stops, [2, 2, 3, 5, 5, 6])

        starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        assert_eq(starts, [0, 1, 1, 3, 3, 5])
        assert_eq(stops, [1, 3, 3, 5, 5, 6])

        with self.assertRaises(ValueError) as cm:
            hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0)
        self.assertTrue('ascending order' in str(cm.exception))

        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx)
        self.assertTrue('different source' in str(cm.exception))

        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0)
        self.assertTrue("no source" in str(cm.exception))

        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0)
        self.assertTrue("no source" in str(cm.exception))

        ht = ht.annotate_globals(x = hl.locus('1', 1), y = 1.0)
        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(ht.x, 1.0)
        self.assertTrue("row-indexed" in str(cm.exception))
        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y)
        self.assertTrue("row-indexed" in str(cm.exception))

        ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                                  hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64), key=['locus'])
        with self.assertRaises(ValueError) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0)
        self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))
        with self.assertRaises(ValueError) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))

        ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                                  hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64), key=['locus'])
        with self.assertRaises(ValueError) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        self.assertTrue("missing value for 'coord_expr'" in str(cm.exception))