Example #1
def generate_cross_pop_ld_scores_from_ld_matrices(pop1, pop2, data_type, pop_data, min_frequency=0.01, call_rate_cutoff=0.8,
                                                  adj: bool = False, radius: int = 1000000, overwrite=False,
                                                  temp_bucket='gs://gnomad-tmp/ld'):
    n1 = pop_data.pop[pop1]
    n2 = pop_data.pop[pop2]
    ht1 = hl.read_table(ld_resources._ld_index_path(data_type, pop1, adj=adj))
    ht1 = ht1.filter((ht1.pop_freq.AF >= min_frequency) &
                     (ht1.pop_freq.AF <= 1 - min_frequency) &
                     (ht1.pop_freq.AN / n1 >= 2 * call_rate_cutoff))

    ht2 = hl.read_table(ld_resources._ld_index_path(data_type, pop2, adj=adj))
    ht2 = ht2.filter((ht2.pop_freq.AF >= min_frequency) &
                     (ht2.pop_freq.AF <= 1 - min_frequency) &
                     (ht2.pop_freq.AN / n2 >= 2 * call_rate_cutoff))

    ht1 = ht1.filter(hl.is_defined(ht2[ht1.key])).add_index(name='new_idx').checkpoint(f'{temp_bucket}/{pop1}_{pop2}.ht', overwrite=overwrite, _read_if_exists=not overwrite)
    ht2 = ht2.filter(hl.is_defined(ht1[ht2.key])).add_index(name='new_idx').checkpoint(f'{temp_bucket}/{pop2}_{pop1}.ht', overwrite=overwrite, _read_if_exists=not overwrite)
    indices1 = ht1.idx.collect()
    indices2 = ht2.idx.collect()
    assert len(indices1) == len(indices2)

    r1 = BlockMatrix.read(ld_resources._ld_matrix_path(data_type, pop1, min_frequency >= COMMON_FREQ, adj=adj)).filter(indices1, indices1)
    r2 = BlockMatrix.read(ld_resources._ld_matrix_path(data_type, pop2, min_frequency >= COMMON_FREQ, adj=adj)).filter(indices2, indices2)
    r_bm = r1 * r2

    # TODO: is a bias adjustment needed?
    # r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    out_name = ld_resources._cross_pop_ld_scores_path(data_type, pop1, pop2, adj)
    compute_and_annotate_ld_score(ht1, r_bm, radius, out_name, overwrite)
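A minimal sketch of the core operation above, on toy matrices (names here are illustrative, not part of the original pipeline): on BlockMatrix, `*` is elementwise multiplication, so `r1 * r2` combines the two per-population correlation matrices entry by entry, whereas `@` would be matrix multiplication.

import numpy as np
from hail.linalg import BlockMatrix

r1 = BlockMatrix.from_numpy(np.array([[1.0, 0.5], [0.5, 1.0]]))
r2 = BlockMatrix.from_numpy(np.array([[1.0, 0.2], [0.2, 1.0]]))
cross = r1 * r2            # elementwise product, not matmul
print(cross.to_numpy())    # [[1.0, 0.1], [0.1, 1.0]]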
Example #2
def plot_correlation_matrices(chr_list):
    """
    Plot combined correlation matrices for genotype-correlation and
    sumstats-correlation matrices.
    """
    for ch in chr_list:
        ss_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_ss_correlation_chr{}.bm/'.format(ch))
        gt_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_gt_correlation_chr{}.bm/'.format(ch))
        M_max = int(1e4)  # max number of variants taken from the block matrices (suggested: 2e4)
        M = ss_ch.shape[0]  # dimension of block matrix
        for idx in range(int(M / M_max) + 1):  # index of the disjoint window within the block matrix
            M0 = M_max * idx  # start variant index for block matrix filtering
            M1 = min(M_max * (idx + 1), M)  # stop variant index for block matrix filtering
            ss_np = ss_ch[M0:M1, M0:M1].to_numpy()
            gt_np = gt_ch[M0:M1, M0:M1].to_numpy()
            print('\nStarting variant window: [' + str(M0) + ',' + str(M1) +
                  ']')
            w = int(5e3)  # window width of variants for correlation matrix (suggested: 2e3)
            for i in range(int((M1 - M0 - 1) / w) + 1):
                w0 = w * i  # start variant index for window of correlation matrix
                w1 = min(w * (i + 1), M1 - M0)  # stop variant index for window of correlation matrix
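                # Overlay the two matrices: sumstats correlations fill the upper
                # triangle and transposed genotype correlations the lower
                # (assuming both block matrices were stored upper-triangular)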
                full = (ss_np[w0:w1, w0:w1] + gt_np[w0:w1, w0:w1].T)
                np.fill_diagonal(full, 1)
                fig, ax = plt.subplots()
                ax.imshow(full, cmap='bwr')
                ax.plot([0, w], [0, w], 'k--', alpha=0.5, lw=2)
                plt.xlim([0, w])
                plt.ylim([w, 0])
                ax.text(w * 0.83, w * 0.1, "SS", fontsize=60, alpha=0.5)
                ax.text(w * 0.02, w * 0.97, "GT", fontsize=60, alpha=0.5)
                plt.title('chr' + str(ch) + ' ' + variant_set + ' variants (' +
                          str(M0 + w0) + '-' + str(M0 + w1) + ')')
                fig = plt.gcf()
                fig.set_size_inches(10, 10)
                path = ('gs://nbaya/sumstats_corr/plots/chr' + str(ch) + '_' +
                        variant_set + '_' + str(M0 + w0).zfill(len(str(M))) +
                        '-' + str(M0 + w1).zfill(len(str(M))) + '.png')
                with hl.hadoop_open(path, 'wb') as f:
                    fig.savefig(f, dpi=600)
                plt.close()
            print('\nFinished variant window: [' + str(M0) + ',' + str(M1) +
                  ']')
Example #3
def tree_matmul_tree_matsum(bm1,
                            bm2,
                            mul_splits: int,
                            sum_splits: int = None,
                            path_prefix: str = None,
                            read_if_exists=False):
    r'''
    Version of tree_matmul() that allows for intermediate sums of matrix
    multiplication. `sum_splits` must be a divisor of `mul_splits`.
    '''
    # TODO: Make a private function that acts recursively to ensure that the
    # matrix sums never include more than a maximum number of matrices
    assert mul_splits % sum_splits == 0, '`sum_splits` must be a divisor of `mul_splits`'

    if not read_if_exists:
        print(bm1._n_block_cols)
        print(mul_splits)
        inner_brange_size = int(math.ceil(bm1._n_block_cols / mul_splits))
        print(f'inner_brange_size: {inner_brange_size}')
        split_points = list(range(0, bm1._n_block_cols,
                                  inner_brange_size)) + [bm1._n_block_cols]
        print(split_points)
        inner_ranges = list(zip(split_points[:-1], split_points[1:]))
        print(f'len(inner_ranges): {len(inner_ranges)}')
        blocks_to_multiply = [(bm1._select_blocks((0, bm1._n_block_rows),
                                                  (start, stop)),
                               bm2._select_blocks((start, stop),
                                                  (0, bm2._n_block_cols)))
                              for start, stop in inner_ranges]

        intermediate_multiply_exprs = [
            b1 @ b2 for b1, b2 in blocks_to_multiply
        ]
        print(len(intermediate_multiply_exprs))
        print(f'Writing {mul_splits} intermediate matrices to {path_prefix}')
        hl.experimental.write_block_matrices(intermediate_multiply_exprs,
                                             path_prefix)

    read_intermediates = [
        BlockMatrix.read(f"{path_prefix}_{i}") for i in range(0, mul_splits)
    ]

    tracked_partial_sums = []

    sum_block_size = math.ceil(mul_splits / sum_splits)
    for i in range(sum_splits):
        partial_sum_path = f"{path_prefix}-partial-{i}"
        partial_sum = sum(read_intermediates[i * sum_block_size:(i + 1) * sum_block_size])
        partial_sum.write(partial_sum_path, overwrite=True)
        tracked_partial_sums.append(BlockMatrix.read(partial_sum_path))

    return sum(tracked_partial_sums)
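A hedged usage sketch for the helper above (the bucket path and toy sizes are assumptions for illustration): multiplying two 8x8 block matrices in 4 column strips whose partial products are summed in 2 groups.

import numpy as np
from hail.linalg import BlockMatrix

bm1 = BlockMatrix.from_numpy(np.random.rand(8, 8), block_size=2)
bm2 = BlockMatrix.from_numpy(np.random.rand(8, 8), block_size=2)
# writes 4 intermediate products under the prefix, then 2 partial sums
result = tree_matmul_tree_matsum(bm1, bm2, mul_splits=4, sum_splits=2,
                                 path_prefix='gs://my-tmp/tree_matmul')  # hypothetical bucket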
Example #4
    def test_stage_locally(self):
        nd = np.arange(0, 80, dtype=float).reshape(8, 10)
        bm_uri = new_temp_file()
        BlockMatrix.from_numpy(nd, block_size=3).write(bm_uri, stage_locally=True)

        bm = BlockMatrix.read(bm_uri)
        self._assert_eq(nd, bm)
Example #6
    def test_stage_locally(self):
        nd = np.arange(0, 80, dtype=float).reshape(8, 10)
        with hl.TemporaryDirectory(ensure_exists=False) as bm_uri:
            BlockMatrix.from_numpy(nd, block_size=3).write(bm_uri, stage_locally=True)

            bm = BlockMatrix.read(bm_uri)
            self._assert_eq(nd, bm)
Example #7
def generate_ld_scores_from_ld_matrix(pop_data,
                                      data_type,
                                      min_frequency=0.01,
                                      call_rate_cutoff=0.8,
                                      adj: bool = False,
                                      radius: int = 1000000,
                                      overwrite=False):
    # This function required a decent number of high-mem machines (with SSDs for good measure) to complete the AFR population.
    # For the rest, on 20 n1-standard-8's: 1h15m to export the block matrix, 15 min to compute LD scores per population (~$150 total).
    for label, pops in dict(pop_data).items():
        for pop, n in pops.items():
            ht = hl.read_table(
                ld_resources._ld_index_path(data_type, pop, adj=adj))
            ht = ht.filter((ht.pop_freq.AF >= min_frequency)
                           & (ht.pop_freq.AF <= 1 - min_frequency)
                           & (ht.pop_freq.AN / n >= 2 *
                              call_rate_cutoff)).add_index(name='new_idx')

            indices = ht.idx.collect()

            r2 = BlockMatrix.read(
                ld_resources._ld_matrix_path(data_type,
                                             pop,
                                             min_frequency >= COMMON_FREQ,
                                             adj=adj))
            r2 = r2.filter(indices, indices)**2
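            # Sample-size bias adjustment: ((n-1)/(n-2)) * r2 - 1/(n-2) is algebraically
            # r2 - (1 - r2)/(n - 2), the standard adjusted estimator of a squared
            # correlation used in LD score regression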
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            out_name = ld_resources._ld_scores_path(data_type, pop, adj)
            compute_and_annotate_ld_score(ht, r2_adj, radius, out_name,
                                          overwrite)
Example #8
    def bm(self) -> BlockMatrix:
        """
        Read and return the Hail BlockMatrix resource.

        :return: Hail BlockMatrix resource
        """
        return BlockMatrix.read(self.path)
Example #9
    def test_write_overwrite(self):
        with hl.TemporaryDirectory(ensure_exists=False) as path:
            bm = BlockMatrix.from_numpy(np.array([[0]]))
            bm.write(path)
            self.assertRaises(FatalError, lambda: bm.write(path))

            bm2 = BlockMatrix.from_numpy(np.array([[1]]))
            bm2.write(path, overwrite=True)
            self._assert_eq(BlockMatrix.read(path), bm2)
Example #10
def generate_ld_scores_from_ld_matrix(pop_data,
                                      data_type,
                                      min_frequency=0.01,
                                      call_rate_cutoff=0.8,
                                      adj: bool = False,
                                      radius: int = 1000000,
                                      overwrite=False):
    # This function required a decent number of high-mem machines (with SSDs for good measure) to complete the AFR population.
    # For the rest, on 20 n1-standard-8's: 1h15m to export the block matrix, 15 min to compute LD scores per population (~$150 total).
    for label, pops in dict(pop_data).items():
        for pop, n in pops.items():
            if pop in ('nfe', 'fin', 'asj'): continue
            ht = hl.read_table(ld_index_path(data_type, pop, adj=adj))
            ht = ht.filter((ht.pop_freq.AF >= min_frequency)
                           & (ht.pop_freq.AF <= 1 - min_frequency)
                           & (ht.pop_freq.AN / n >= 2 *
                              call_rate_cutoff)).add_index(name='new_idx')

            indices = ht.idx.collect()

            r2 = BlockMatrix.read(
                ld_matrix_path(data_type,
                               pop,
                               min_frequency >= COMMON_FREQ,
                               adj=adj))
            r2 = r2.filter(indices, indices)**2
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            starts_and_stops = hl.linalg.utils.locus_windows(ht.locus,
                                                             radius,
                                                             _localize=False)

            # Lifted directly from https://github.com/hail-is/hail/blob/555e02d6c792263db2c3ed97db8002b489e2dacb/hail/python/hail/methods/statgen.py#L2595
            # for the time being, until efficient BlockMatrix filtering gets an easier interface
            r2_adj = BlockMatrix._from_java(
                r2_adj._jbm.filterRowIntervalsIR(
                    Env.backend()._to_java_ir(starts_and_stops._ir), False))

            l2row = r2_adj.sum(axis=0).T
            l2col = r2_adj.sum(axis=1)
            l2 = l2row + l2col + 1

            l2_bm_tmp = new_temp_file()
            l2_tsv_tmp = new_temp_file()
            l2.write(l2_bm_tmp, force_row_major=True)
            BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

            ht_scores = hl.import_table(l2_tsv_tmp,
                                        no_header=True,
                                        impute=True)
            ht_scores = ht_scores.add_index().rename({'f0': 'ld_score'})
            ht_scores = ht_scores.key_by('idx')

            ht = ht.annotate(**ht_scores[ht.new_idx]).select_globals()
            ht.filter(hl.is_defined(ht.ld_score)).write(
                ld_scores_path(data_type, pop, adj), overwrite)
Example #11
    def test_write_from_entry_expr_overwrite(self):
        mt = hl.balding_nichols_model(1, 1, 1)
        mt = mt.select_entries(x=mt.GT.n_alt_alleles())
        bm = BlockMatrix.from_entry_expr(mt.x)

        path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x, path)
        self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x, path))

        BlockMatrix.write_from_entry_expr(mt.x, path, overwrite=True)
        self._assert_eq(BlockMatrix.read(path), bm)

        # non-field expressions currently take a separate code path
        path2 = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x + 1, path2)
        self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x + 1, path2))

        BlockMatrix.write_from_entry_expr(mt.x + 2, path2, overwrite=True)
        self._assert_eq(BlockMatrix.read(path2), bm + 2)
Example #13
    def test_write_overwrite(self):
        path = new_temp_file()

        bm = BlockMatrix.from_numpy(np.array([[0]]))
        bm.write(path)
        self.assertRaises(FatalError, lambda: bm.write(path))

        bm2 = BlockMatrix.from_numpy(np.array([[1]]))
        bm2.write(path, overwrite=True)
        self._assert_eq(BlockMatrix.read(path), bm2)
Example #14
    def test_write_from_entry_expr_overwrite(self):
        mt = hl.balding_nichols_model(1, 1, 1)
        mt = mt.select_entries(x=mt.GT.n_alt_alleles())
        bm = BlockMatrix.from_entry_expr(mt.x)

        with hl.TemporaryDirectory(ensure_exists=False) as path:
            BlockMatrix.write_from_entry_expr(mt.x, path)
            self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x, path))

            BlockMatrix.write_from_entry_expr(mt.x, path, overwrite=True)
            self._assert_eq(BlockMatrix.read(path), bm)

        with hl.TemporaryDirectory(ensure_exists=False) as path:
            # non-field expressions currently take a separate code path
            BlockMatrix.write_from_entry_expr(mt.x + 1, path)
            self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x + 1, path))

            BlockMatrix.write_from_entry_expr(mt.x + 2, path, overwrite=True)
            self._assert_eq(BlockMatrix.read(path), bm + 2)
Example #16
    def test_from_entry_expr(self):
        mt = get_dataset()
        mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

        a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
        a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
        a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

        self._assert_eq(a1, a2)
        self._assert_eq(a1, a3)

        path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
        a4 = BlockMatrix.read(path).to_numpy()
        self._assert_eq(a1, a4)
Example #18
    def test_from_entry_expr(self):
        mt = get_dataset()
        mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

        a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
        a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
        a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

        self._assert_eq(a1, a2)
        self._assert_eq(a1, a3)

        with hl.TemporaryDirectory(ensure_exists=False) as path:
            BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
            a4 = BlockMatrix.read(path).to_numpy()
            self._assert_eq(a1, a4)
Example #19
def export_snv_sv_ld_matrix(pop_data, data_type, common_only: bool = True, adj: bool = False, overwrite: bool = False):
    for label, pops in dict(pop_data).items():
        for pop in pops:
            if pop not in SNV_SV_POPS: continue
            bm = BlockMatrix.read(ld_resources._ld_matrix_path(data_type, pop, common_only, adj))
            ld_index = hl.read_table(ld_resources._ld_index_path(data_type, pop, common_only, adj))
            snvs = ld_index.filter(ld_index.alleles[0] != "N")
            svs = ld_index.filter(ld_index.alleles[0] == "N")
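            # SVs are encoded in the index with a placeholder "N" reference allele,
            # which is what distinguishes them from SNVs in the two filters above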
            snv_indices = snvs.idx.collect()
            sv_indices = svs.idx.collect()
            ht = bm.filter(snv_indices, sv_indices).entries(keyed=False)
            ht.filter(ht.entry != 0).write(ld_resources._ld_snv_sv_path(pop), overwrite)

            hl.read_table(ld_resources._ld_snv_sv_path(pop)).export(ld_resources._ld_snv_sv_path(pop).replace('.ht', '.txt.bgz'))
            snvs = snvs.add_index().key_by()
            svs = svs.add_index().key_by()
            snvs.select(chrom=snvs.locus.contig, pos=snvs.locus.position, ref=snvs.alleles[0], alt=snvs.alleles[1],
                        i=snvs.idx).export(ld_resources._ld_snv_sv_index_path(pop, 'snv'))
            svs.select(chrom=svs.locus.contig, pos=svs.locus.position, ref=svs.alleles[0], alt=svs.alleles[1],
                       j=svs.idx).export(ld_resources._ld_snv_sv_index_path(pop, 'sv'))
Example #20
def get_ref_X(ref_panel, overwrite=False):
    r'''
    Returns N_ref x M dim matrix of column-standardized genotypes of LD ref panel
    '''
    X_bm_path = f'{bucket}/{ref_panel}.X.bm'

    if overwrite or not hl.hadoop_is_file(f'{X_bm_path}/_SUCCESS'):
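        # a _SUCCESS marker is written on a completed BlockMatrix write, so its
        # absence means the standardized genotype matrix must be (re)generated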
        mt = hl.import_plink(bed=f'{bucket}/{ref_panel}.bed',
                             bim=f'{bucket}/{ref_panel}.bim',
                             fam=f'{bucket}/{ref_panel}.fam')

        mt = mt.annotate_rows(stats=hl.agg.stats(mt.GT.n_alt_alleles()))
        mt = mt.annotate_entries(X=(mt.GT.n_alt_alleles() - mt.stats.mean) /
                                 mt.stats.stdev)

        X = BlockMatrix.from_entry_expr(mt.X)
        X = X.T

        X.write(X_bm_path, overwrite=True)

    X = BlockMatrix.read(X_bm_path)

    return X
Example #21
    def from_random_effects(cls, y, x, z,
                            p_path=None,
                            overwrite=False,
                            max_condition_number=1e-10,
                            complexity_bound=8192):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> z = np.array([[0.0, 0.0, 1.0],
        ...               [0.0, 1.0, 2.0],
        ...               [1.0, 2.0, 4.0],
        ...               [2.0, 4.0, 8.0]])
        >>> model, p = LinearMixedModel.from_random_effects(y, x, z)
        >>> model.fit()
        >>> model.h_sq
        0.38205307244271675

        Notes
        -----
        If :math:`n \leq m`, the returned model is full rank.

        If :math:`n > m`, the returned model is low rank. In this case only,
        eigenvalues less than or equal to `max_condition_number` times the top
        eigenvalue are dropped from :math:`S`, with the corresponding
        eigenvectors dropped from :math:`P`. This guards against precision
        loss on left eigenvectors computed via the right gramian :math:`Z^T Z`
        in :meth:`BlockMatrix.svd`.

        In either case, one can truncate to a rank :math:`r` model as follows.
        If `p` is an ndarray:

        >>> p_r = p[:r, :]     # doctest: +SKIP
        >>> s_r = model.s[:r]  # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

        If `p` is a block matrix:

        >>> p[:r, :].write(p_r_path)          # doctest: +SKIP
        >>> p_r = BlockMatrix.read(p_r_path)  # doctest: +SKIP
        >>> s_r = model.s[:r]                 # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path)  # doctest: +SKIP

        This method applies no standardization to `z`.

        Warning
        -------
        If `z` is a block matrix, then ideally `z` should be the result of
        directly reading from disk (and possibly a transpose). This is most
        critical if :math:`n > m`, because in this case multiplication by `z`
        will result in all preceding transformations being repeated
        ``n / block_size`` times, as explained in :class:`.BlockMatrix`.

        At least one dimension must be less than or equal to 46300.
        See the warning in :meth:`.BlockMatrix.svd` for performance
        considerations.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations :math:`y`.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects :math:`X`.
        z: :class:`ndarray` or :class:`BlockMatrix`
            :math:`n \times m` matrix of random effects :math:`Z`.
        p_path: :obj:`str`, optional
            Path at which to write :math:`P` as a block matrix.
            Required if `z` is a block matrix.
        overwrite: :obj:`bool`
            If ``True``, overwrite an existing file at `p_path`.
        max_condition_number: :obj:`float`
            Maximum condition number. Must be greater than 1e-16.
        complexity_bound: :obj:`int`
            Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block
            matrix.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
        p: :class:`ndarray` or :class:`.BlockMatrix`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
            The type is block matrix if `z` is a block matrix and
            :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix.
        """
        z_is_bm = isinstance(z, BlockMatrix)

        if z_is_bm and p_path is None:
            raise ValueError("from_random_effects: 'p_path' required when 'z'"
                             "is a block matrix.")

        if max_condition_number < 1e-16:
            raise ValueError("from_random_effects: 'max_condition_number' must "
                             f"be at least 1e-16, found {max_condition_number}")

        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(z, "z", 2)

        n, m = z.shape

        if y.shape[0] != n:
            raise ValueError("from_random_effects: 'y' and 'z' must have the "
                             "same number of rows")
        if x.shape[0] != n:
            raise ValueError("from_random_effects: 'x' and 'z' must have the "
                             "same number of rows")

        if z_is_bm:
            u, s0, _ = z.svd(complexity_bound=complexity_bound)
            p = u.T
            p_is_bm = isinstance(p, BlockMatrix)
        else:
            u, s0, _ = hl.linalg._svd(z, full_matrices=False)
            p = u.T
            p_is_bm = False

        s = s0 ** 2

        low_rank = n > m

        if low_rank:
            assert np.all(np.isfinite(s))
            r = np.searchsorted(-s, -max_condition_number * s[0])
            if r < m:
                info(f'from_random_effects: model rank reduced from {m} to {r} '
                     f'due to ill-condition.'
                     f'\n    Largest dropped eigenvalue was {s[r]}.')
            s = s[:r]
            p = p[:r, :]

        if p_path is not None:
            if p_is_bm:
                p.write(p_path, overwrite=overwrite)
                p = BlockMatrix.read(p_path)
            else:
                BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)
        if p_is_bm:
            py, px = (p @ y).to_numpy(), (p @ x).to_numpy()
        else:
            py, px = p @ y, p @ x

        if low_rank:
            model = LinearMixedModel(py, px, s, y, x, p_path)
        else:
            model = LinearMixedModel(py, px, s, p_path=p_path)

        return model, p
Example #22
def main(args):
    ht_snp = hl.import_table(args.snp, impute=True)
    ht_snp = ht_snp.annotate(variant=hl.delimit(
        [ht_snp.chromosome, hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2],
        delimiter=':'))
    ht_snp = ht_snp.annotate(
        **hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
    )
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract most severe
    ht_snp = ht_snp.annotate(
        vep=(hl.case()
             .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
                   ht_snp.vep.worst_csq_for_variant_canonical)
             .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant),
                   ht_snp.vep.worst_csq_for_variant)
             .or_missing()),
        is_canonical_vep=hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical))
    ht_snp = ht_snp.annotate(
        most_severe=hl.if_else(hl.is_defined(ht_snp.vep),
                               ht_snp.vep.most_severe_consequence,
                               'intergenic_variant'),
        gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(
        **annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())

    df = ht_snp.key_by().drop('locus', 'alleles', 'variant',
                              'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht'
        )
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')
        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())

        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm'
        )
        bm = bm.filter(idx, idx)
        # re-densify triangular matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        bm = bm.filter_rows(
            np.where(np.array(idx) == lead_idx[0])[0].tolist())**2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        df[f'gnomad_lead_r2_{pop}'] = np.nan
        df[f'gnomad_lead_r2_{pop}'].iloc[idx_snp] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open

    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
Example #23
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr]
                        + [mt == x._indices.source
                           for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
               .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
Example #24
    def __init__(self, py, px, s, y=None, x=None, p_path=None):
        if y is None and x is None:
            low_rank = False
        elif y is not None and x is not None:
            low_rank = True
        else:
            raise ValueError('for low-rank, set both y and x; for full-rank, do not set y or x.')

        _check_dims(py, 'py', 1)
        _check_dims(px, 'px', 2)
        _check_dims(s, 's', 1)

        r = s.size
        f = px.shape[1]

        if py.size != r:
            raise ValueError("py and s must have the same size")
        if px.shape[0] != r:
            raise ValueError("px must have the same number of rows as the size of s")
        if low_rank:
            _check_dims(y, 'y', 1)
            _check_dims(x, 'x', 2)
            n = y.size
            if n <= r:
                raise ValueError("size of y must be larger than the size of s")
            if x.shape[0] != n:
                raise ValueError("x must have the same number of rows as the size of y")
            if x.shape[1] != f:
                raise ValueError("px and x must have the same number columns")
        else:
            n = r

        if p_path is not None:
            n_rows, n_cols = BlockMatrix.read(p_path).shape
            if n_cols != n:
                raise ValueError("LinearMixedModel: Number of columns in the block "
                                 f"matrix at 'p_path' ({n_cols}) must equal "
                                 f"the size of 'y' ({n})")
            if n_rows != r:
                raise ValueError("LinearMixedModel: Number of rows in the block "
                                 f"matrix at 'p_path' ({n_rows}) must equal "
                                 f"the size of 'py' ({r})")

        self.low_rank = low_rank
        self.n = n
        self.f = f
        self.r = r
        self.py = py
        self.px = px
        self.s = s
        self.y = y
        self.x = x
        self.p_path = p_path

        self._check_dof()

        self.beta = None
        self.sigma_sq = None
        self.tau_sq = None
        self.gamma = None
        self.log_gamma = None
        self.h_sq = None
        self.h_sq_standard_error = None
        self.optimize_result = None

        self._fitted = False

        if low_rank:
            self._yty = y @ y
            self._xty = x.T @ y
            self._xtx = x.T @ x

        self._dof = n - f
        self._d = None
        self._ydy = None
        self._xdy = None
        self._xdx = None

        self._dof_alt = n - (f + 1)
        self._d_alt = None
        self._ydy_alt = None
        self._xdy_alt = np.zeros(f + 1)
        self._xdx_alt = np.zeros((f + 1, f + 1))

        self._residual_sq = None

        self._scala_model = None
Example #25
    def _test_linear_mixed_model_low_rank(self):
        seed = 0
        n_populations = 8
        fst = n_populations * [.9]
        n_samples = 500
        n_variants = 200
        n_orig_markers = 100
        n_culprits = 10
        n_covariates = 3
        sigma_sq = 1
        tau_sq = 1

        from numpy.random import RandomState
        prng = RandomState(seed)

        x = np.hstack((np.ones(shape=(n_samples, 1)),
                       prng.normal(size=(n_samples, n_covariates - 1))))

        mt = hl.balding_nichols_model(n_populations=n_populations,
                                      n_samples=n_samples,
                                      n_variants=n_variants,
                                      fst=fst,
                                      af_dist=hl.rand_unif(0.1, 0.9, seed=seed),
                                      seed=seed)

        pa_t_path = utils.new_temp_file(suffix='bm')
        a_t_path = utils.new_temp_file(suffix='bm')

        BlockMatrix.write_from_entry_expr(mt.GT.n_alt_alleles(), a_t_path)

        a = BlockMatrix.read(a_t_path).T.to_numpy()
        g = a[:, -n_orig_markers:]
        g_std = self._filter_and_standardize_cols(g)

        n_markers = g_std.shape[1]

        k = (g_std @ g_std.T) * n_samples / n_markers

        beta = np.arange(n_covariates)
        beta_stars = np.array([1] * n_culprits)

        y = prng.multivariate_normal(
            np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
            sigma_sq * k + tau_sq * np.eye(n_samples))

        # low rank computation of S, P
        l = g_std.T @ g_std
        sl, v = np.linalg.eigh(l)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n_samples / n_markers)
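        # map right eigenvectors v of l = g_std.T @ g_std to left eigenvectors of
        # g_std @ g_std.T via u = g_std @ v / sqrt(sl); rows of p are those eigenvectors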
        p = (g_std @ (v / np.sqrt(sl))).T

        # compare with full rank S, P
        sk0, uk = np.linalg.eigh(k)
        sk = sk0[-n_eigenvectors:]
        pk = uk[:, -n_eigenvectors:].T
        assert np.allclose(sk, s)
        assert np.allclose(np.abs(pk), np.abs(p))

        # build and fit model
        py = p @ y
        px = p @ x
        pa = p @ a

        model = LinearMixedModel(py, px, s, y, x)
        assert model.n == n_samples
        assert model.f == n_covariates
        assert model.r == n_eigenvectors
        assert model.low_rank

        model.fit()

        # check effect sizes tend to be near 1 for first n_marker alternative models
        BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
        df_lmm = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

        # compare NumPy and Hail LMM per alternative
        df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()
        assert np.min(df_numpy['chi_sq']) > 0

        na_numpy = df_numpy.isna().any(axis=1)
        na_lmm = df_lmm.isna().any(axis=1)

        assert na_numpy.sum() <= 10
        assert na_lmm.sum() <= 10
        assert np.logical_xor(na_numpy, na_lmm).sum() <= 5

        mask = ~(na_numpy | na_lmm)

        lmm_vs_numpy_p_value = np.sort(np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

        assert lmm_vs_numpy_p_value[10] < 1e-12  # the ~10 smallest p-value differences are near zero
        assert lmm_vs_numpy_p_value[-1] < 1e-8   # all p-value differences are small
Example #26
mt = mt.filter_cols(mt.super_population == 'EUR')
mt = hl.variant_qc(mt)
mt = mt.filter_rows((mt.variant_qc.AF[0] > 0.001)
                    & (mt.variant_qc.AF[1] > 0.001))

BlockMatrix.write_from_entry_expr(
    entry_expr=mt.GT.n_alt_alleles(),
    path='gs://hail-datasets-hail-data/1000_Genomes_phase3_European_autosomes_maf_gt_001.bm',
    mean_impute=True,
    center=False,
    normalize=False,
    block_size=4096,
    overwrite=True)

bm = BlockMatrix.read(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_European_autosomes_maf_gt_001.bm'
)

metadata = hl.struct(name='1000_Genomes_phase3_European_autosomes_maf_gt_001',
                     reference_genome='GRCh37',
                     n_rows=bm.n_rows,
                     n_cols=bm.n_cols,
                     block_size=bm.block_size)

hl.experimental.write_expression(
    metadata,
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_European_autosomes_maf_gt_001.metadata.he',
    overwrite=True)
Example #27
    def from_random_effects(cls, y, x, z,
                            p_path=None,
                            overwrite=False,
                            max_condition_number=1e-10,
                            complexity_bound=8192):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> z = np.array([[0.0, 0.0, 1.0],
        ...               [0.0, 1.0, 2.0],
        ...               [1.0, 2.0, 4.0],
        ...               [2.0, 4.0, 8.0]])
        >>> model, p = LinearMixedModel.from_random_effects(y, x, z)
        >>> model.fit()
        >>> model.h_sq
        0.38205307244271675

        Notes
        -----
        If :math:`n \leq m`, the returned model is full rank.

        If :math:`n > m`, the returned model is low rank. In this case only,
        eigenvalues less than or equal to `max_condition_number` times the top
        eigenvalue are dropped from :math:`S`, with the corresponding
        eigenvectors dropped from :math:`P`. This guards against precision
        loss on left eigenvectors computed via the right gramian :math:`Z^T Z`
        in :meth:`BlockMatrix.svd`.

        In either case, one can truncate to a rank :math:`r` model as follows.
        If `p` is an ndarray:

        >>> p_r = p[:r, :]     # doctest: +SKIP
        >>> s_r = model.s[:r]  # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

        If `p` is a block matrix:

        >>> p[:r, :].write(p_r_path)          # doctest: +SKIP
        >>> p_r = BlockMatrix.read(p_r_path)  # doctest: +SKIP
        >>> s_r = model.s[:r]                 # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path)  # doctest: +SKIP

        This method applies no standardization to `z`.

        Warning
        -------
        If `z` is a block matrix, then ideally `z` should be the result of
        directly reading from disk (and possibly a transpose). This is most
        critical if :math:`n > m`, because in this case multiplication by `z`
        will result in all preceding transformations being repeated
        ``n / block_size`` times, as explained in :class:`.BlockMatrix`.

        At least one dimension must be less than or equal to 46300.
        See the warning in :meth:`.BlockMatrix.svd` for performance
        considerations.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations :math:`y`.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects :math:`X`.
        z: :class:`ndarray` or :class:`BlockMatrix`
            :math:`n \times m` matrix of random effects :math:`Z`.
        p_path: :obj:`str`, optional
            Path at which to write :math:`P` as a block matrix.
            Required if `z` is a block matrix.
        overwrite: :obj:`bool`
            If ``True``, overwrite an existing file at `p_path`.
        max_condition_number: :obj:`float`
            Maximum condition number. Must be greater than 1e-16.
        complexity_bound: :obj:`int`
            Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block
            matrix.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
        p: :class:`ndarray` or :class:`.BlockMatrix`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
            The type is block matrix if `z` is a block matrix and
            :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix.
        """
        z_is_bm = isinstance(z, BlockMatrix)

        if z_is_bm and p_path is None:
            raise ValueError("from_random_effects: 'p_path' required when 'z'"
                             "is a block matrix.")

        if max_condition_number < 1e-16:
            raise ValueError("from_random_effects: 'max_condition_number' must "
                             f"be at least 1e-16, found {max_condition_number}")

        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(z, "z", 2)

        n, m = z.shape

        if y.shape[0] != n:
            raise ValueError("from_random_effects: 'y' and 'z' must have the "
                             "same number of rows")
        if x.shape[0] != n:
            raise ValueError("from_random_effects: 'x' and 'z' must have the "
                             "same number of rows")

        if z_is_bm:
            u, s0, _ = z.svd(complexity_bound=complexity_bound)
            p = u.T
            p_is_bm = isinstance(p, BlockMatrix)
        else:
            u, s0, _ = hl.linalg._svd(z, full_matrices=False)
            p = u.T
            p_is_bm = False

        s = s0 ** 2

        low_rank = n > m

        if low_rank:
            assert np.all(np.isfinite(s))
            r = np.searchsorted(-s, -max_condition_number * s[0])
            if r < m:
                info(f'from_random_effects: model rank reduced from {m} to {r} '
                     f'due to ill-condition.'
                     f'\n    Largest dropped eigenvalue was {s[r]}.')
            s = s[:r]
            p = p[:r, :]

        if p_path is not None:
            if p_is_bm:
                p.write(p_path, overwrite=overwrite)
                p = BlockMatrix.read(p_path)
            else:
                BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)
        if p_is_bm:
            py, px = (p @ y.reshape(n, 1)).to_numpy().flatten(), (p @ x).to_numpy()
        else:
            py, px = p @ y, p @ x

        if low_rank:
            model = LinearMixedModel(py, px, s, y, x, p_path)
        else:
            model = LinearMixedModel(py, px, s, p_path=p_path)

        return model, p
Example #29
import hail as hl
from hail.linalg import BlockMatrix
from os import path
import sys
import pandas as pd

chr_id, group_id = sys.argv[1], sys.argv[2]
print(chr_id + ' ' + group_id)
idx_comb = pd.read_csv("mapLDref.tsv.gz", sep="\t")
idx_comb['chr'] = idx_comb['chr'].astype(str)
idx_comb['group'] = idx_comb['group'].astype(str)

idx_comb_chr = idx_comb[idx_comb.chr == chr_id]
chridx = idx_comb_chr[idx_comb_chr.group == group_id].idx.tolist()

ext = 'chr' + chr_id + '.' + group_id
## Load data
bm = BlockMatrix.read('s3a://pan-ukb-us-east-1/ld_release/UKBB.EUR.ldadj.bm')

bmchr = bm.filter(chridx, chridx)
bmchr.write(ext + '.bm', force_row_major=True)
BlockMatrix.export(ext + '.bm', ext + '.csv.bgz', delimiter='\t')
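A hedged invocation sketch for the script above (the script file name is an assumption): it takes a chromosome and an LD-block group id as positional arguments.

# python extract_ukb_ld_block.py 1 12
# -> writes chr1.12.bm and chr1.12.csv.bgz for the variants mapped in mapLDref.tsv.gz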
Example #30
def get_ld_matrix(pop: str):
    return BlockMatrix.read(ld_matrix_path('genomes', pop))
Example #31
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
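    # correct the upward finite-sample bias of the sample r**2; algebraically
    # this is ((n - 1) * r2 - 1) / (n - 2), the adjustment used in LD score
    # regression (Bulik-Sullivan et al., 2015)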
    r2_adj = ((n-1.0) / (n-2.0)) * r2 - (1.0 / (n-2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

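    # manual checkpoint: write the sparsified matrix and read it back so it is
    # materialized once rather than recomputed by each downstream use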
    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
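        # LD score = row sum of windowed, bias-adjusted r**2 (univariate annotation)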
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
Example #32
0
def compute_test_prs_bm(genotype_bm_path, prs_bm_path, args):
    sumstats_bm = BlockMatrix.read(
        get_clump_sumstats_bm_path(args.high_quality))
    genotype_bm = BlockMatrix.read(genotype_bm_path)
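    # (samples x variants) @ (variants x score columns): one polygenic score
    # per sample per phenotype/threshold column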
    prs_bm: BlockMatrix = genotype_bm.T @ sumstats_bm
    prs_bm.write(prs_bm_path, args.overwrite)
Example #33
0
import hail as hl
import numpy as np
from hail.linalg import BlockMatrix

# get_filtered_mt, normalize_bm, checkpoint_tmp, and the various *_path
# helpers come from the surrounding pan-ancestry codebase


def main(args):
    pop = args.pop
    num_pcs = 10
    basic_covars = ['sex', 'age', 'age2', 'age_sex', 'age2_sex']
    covariates = basic_covars + [f'PC{x}' for x in range(1, num_pcs + 1)]

    tmp_mt_path = f'{temp_bucket_7day}/{pop}.mt'
    tmp_bm_path = f'{temp_bucket_7day}/{pop}.bm'

    if args.write_mt:
        mt = get_filtered_mt(chrom='all',
                             pop=pop,
                             entry_fields=['dosage'],
                             min_mac=19,
                             filter_mac_instead_of_ac=True)
        mt_x = get_filtered_mt(chrom='X',
                               pop=pop,
                               entry_fields=['dosage'],
                               min_mac=19,
                               filter_mac_instead_of_ac=True)
        mt = mt.union_rows(mt_x)
        mt = mt.annotate_rows(AF=hl.agg.mean(mt.dosage) / 2)
        mt = mt.checkpoint(tmp_mt_path, overwrite=args.overwrite)
        n = mt.count()[1]

        # write variant indexes
        ht = mt.rows().select().add_index()
        ht = ht.annotate_globals(n_samples=n, pop=pop)
        ht.write(get_ld_variant_index_path(pop), overwrite=args.overwrite)
    else:
        mt = hl.read_matrix_table(tmp_mt_path)
        n = mt.count()[1]

    if args.write_bm:
        # convert mt to bm
        BlockMatrix.write_from_entry_expr(mt.dosage,
                                          tmp_bm_path,
                                          mean_impute=True,
                                          center=False,
                                          normalize=False,
                                          overwrite=args.overwrite)
    bm = BlockMatrix.read(tmp_bm_path)

    if args.compute_ld_matrix:
        print(f'BlockMatrix shape: {bm.shape}')

        # mean-center and normalize bm
        bm_norm = normalize_bm(bm)
        bm_norm = checkpoint_tmp(bm_norm)

        # take covariates (with intercept), make hat bms for FWL projection
        cov = mt.cols().select(*covariates).to_pandas().drop(['s'], axis=1)
        cov['Intercept'] = 1.0
        hat1 = cov.values
        hat2 = np.dot(np.linalg.inv(np.dot(cov.transpose(), cov)),
                      cov.transpose())
        bm_hat1 = checkpoint_tmp(BlockMatrix.from_numpy(hat1))
        bm_hat2 = checkpoint_tmp(BlockMatrix.from_numpy(hat2))

        # covariate adjustment (FWL projection); done in three steps because the matrix operations are huge
        bm_Z = checkpoint_tmp(bm_norm @ bm_hat1)
        bm_Z = checkpoint_tmp(bm_Z @ bm_hat2)
        bm_Z = checkpoint_tmp(bm_norm - bm_Z)

        # compute ld matrix with a specified radius
        bm_ldadj = (bm_Z @ bm_Z.T) / n
        starts_and_stops = hl.linalg.utils.locus_windows(mt.locus,
                                                         radius=args.radius,
                                                         _localize=False)
        bm_ldadj = bm_ldadj._sparsify_row_intervals_expr(starts_and_stops,
                                                         blocks_only=False)

        # sparsify to a triangular matrix
        bm_ldadj = bm_ldadj.sparsify_triangle()
        bm_ldadj = bm_ldadj.checkpoint(get_ld_matrix_path(pop),
                                       overwrite=args.overwrite,
                                       force_row_major=True)
    else:
        bm_ldadj = BlockMatrix.read(get_ld_matrix_path(pop))

    if args.write_ldsc_hm3_snplist:
        # Note: currently, this writes snplists for all the populations at once
        write_ldsc_hm3_snplist(overwrite=args.overwrite)

    if args.compute_ldscore:
        ht_ldscore = copmute_ldscore(mt.rows(),
                                     bm_ldadj,
                                     n,
                                     radius=args.ld_score_radius,
                                     out_name=get_ld_score_ht_path(pop),
                                     overwrite=args.overwrite)
        export_ldscore(ht_ldscore, pop)
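The covariate adjustment above is a Frisch-Waugh-Lovell style projection: with normalized dosages X (variants x samples) and covariates C (samples x covariates, intercept included), it computes Z = X - X C (C'C)^-1 C', the residuals of each variant after regressing out the covariates, and then the LD matrix Z Z' / n. A minimal NumPy sketch of the same algebra on toy data (all names below are illustrative):

import numpy as np

rng = np.random.default_rng(0)
n_samples, n_variants, n_covs = 100, 5, 3
X = rng.normal(size=(n_variants, n_samples))  # normalized dosages
C = rng.normal(size=(n_samples, n_covs))      # covariates incl. intercept

hat1 = C                                      # plays the role of bm_hat1
hat2 = np.linalg.inv(C.T @ C) @ C.T           # plays the role of bm_hat2
Z = X - (X @ hat1) @ hat2                     # residualize on the covariates

# every variant row of Z is orthogonal to every covariate column
assert np.allclose(Z @ C, 0.0)

ld = (Z @ Z.T) / n_samples                    # covariate-adjusted LD matrix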
Example #34
0
def main(args):
    hl.init(default_reference='GRCh37', log='/prs.log',
            spark_conf={'spark.hadoop.fs.gs.requester.pays.mode': 'AUTO', 'spark.hadoop.fs.gs.requester.pays.project.id': 'ukbb-diversepops-neale'})

    if args.prepare_sumstats_matrix:
        # get meta mt and separate by pop combo
        meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
        meta_mt = separate_results_mt_by_pop(meta_mt, 'meta_analysis_data', 'meta_analysis')
        meta_mt = meta_mt.annotate_cols(clump_pops=meta_mt.meta_analysis_data.pop)
        meta_mt = meta_mt.key_cols_by('clump_pops', *meta_mt.col_key)
        
        # get sumstats mt and separate by pop combo
        ss_mt = get_final_sumstats_mt_for_export()
        ss_mt = separate_results_mt_by_pop(ss_mt, 'pheno_data', 'summary_stats')
        ss_mt = ss_mt.annotate_cols(clump_pops=hl.array([ss_mt.pheno_data.pop]))
        ss_mt = ss_mt.key_cols_by(*meta_mt.col_key)
        
        # join meta results and sumstats mt
        # NOTE: union_cols() requires the same entry fields schema
        meta_mt = meta_mt.select_entries(BETA = meta_mt.meta_analysis.BETA,
                                         Pvalue = meta_mt.meta_analysis.Pvalue).select_cols().select_rows()
        ss_mt = ss_mt.select_entries(BETA = ss_mt.summary_stats.BETA,
                                     Pvalue = ss_mt.summary_stats.Pvalue).select_cols().select_rows()
        mt = meta_mt.union_cols(ss_mt)
        
        # filter to distinct cols
        # NOTE: distinct_by_col() does not allow a col key of type `list`
        mt = (mt.annotate_cols(clump_pops_str=hl.delimit(mt.clump_pops))
                .key_cols_by('clump_pops_str', *[k for k in mt.col_key if k != 'clump_pops'])
                .distinct_by_col())
        
        # ensure that betas are not missing
        ss_mt = ss_mt.annotate_cols(clump_pops_str = hl.delimit(ss_mt.clump_pops)).key_cols_by('clump_pops_str', *[k for k in ss_mt.col_key if k!='clump_pops'])
        mt = mt.annotate_entries(BETA = hl.or_else(mt.BETA, ss_mt[mt.row_key, mt.col_key].BETA),
                                 Pvalue = hl.or_else(mt.Pvalue, ss_mt[mt.row_key, mt.col_key].Pvalue))
        
        # read clump mt and separate by pop combo
        clump_mt = hl.read_matrix_table(get_clumping_results_path(high_quality=args.high_quality, 
                                                                  max_pops=args.max_pops))
        if args.max_pops:
            # if max_pops=True, the clump_mt is already separated by pop
            # these steps are necessary to make downstream code usable for both max_pops=True/False
            clump_mt = clump_mt.annotate_entries(plink_clump = hl.struct(TOTAL = clump_mt.TOTAL))
            clump_mt = clump_mt.annotate_cols(pop_index = 0)
        else:
            clump_mt = separate_results_mt_by_pop(clump_mt, 'clump_pops', 'plink_clump', skip_drop=True)
        
        clump_mt = clump_mt.annotate_cols(clump_pops_str = hl.delimit(clump_mt.clump_pops))
        clump_mt = clump_mt.drop('clump_pops').key_cols_by(*mt.col_key)
        
        # join sumstats/meta-analysis with clump mt
        mt = all_axis_join(mt, clump_mt)
        
        mt = mt.filter_cols(hl.is_defined(mt.pop_index))
        
        print(f'\n\nMatrix dimensions (before explode by p-threshold): {mt.count()}\n')
        mt = explode_by_p_threshold(mt).unfilter_entries()
        # Write pheno data for later use
        mt.add_col_index('idx').key_cols_by('idx').cols().write(
            get_clump_sumstats_col_ht_path(high_quality=args.high_quality,
                                           max_pops=args.max_pops), 
            args.overwrite)
        BlockMatrix.write_from_entry_expr(
            hl.or_else(mt.BETA * hl.is_defined(mt.plink_clump.TOTAL) * hl.int(mt.Pvalue < mt.p_threshold), 0.0),
            get_clump_sumstats_bm_path(high_quality=args.high_quality,
                                        max_pops=args.max_pops), 
            args.overwrite)
        # 2020-06-25 01:49:32 Hail: INFO: Wrote all 7078 blocks of 28987534 x 3530 matrix with block size 4096.
        # If clump_mt is significantly smaller than meta_mt, consider putting that on the left of the join,
        # then filter the genotype matrix to only those SNPs (pilot would go from 28.9M -> 21.2M)

    if args.prepare_genotype_matrix:
        meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
        mt = get_filtered_mt_with_x()
        mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
        # Write sample data for later use
        mt = mt.key_cols_by(userId=hl.int32(mt.s))
        mt.cols().add_index().write(genotype_samples_ht_path, args.overwrite)
        BlockMatrix.write_from_entry_expr(mt.dosage, genotype_bm_path, args.overwrite)
        # 2020-06-25 19:18:14 Hail: INFO: Wrote all 764424 blocks of 28987534 x 441345 matrix with block size 4096.

    if args.compute_prs:
        sumstats_bm = BlockMatrix.read(get_clump_sumstats_bm_path(high_quality=args.high_quality, 
                                                                  max_pops=args.max_pops))
        genotype_bm = BlockMatrix.read(genotype_bm_path)
        mul_splits = 197  # sumstats_bm.shape[1] // 10000 * 10
        sum_splits = 20  # int(mul_splits / 10)
        assert mul_splits > 10  # if not more than 10, sum_splits is not necessary
        prs_bm = tree_matmul_tree_matsum(genotype_bm.T, sumstats_bm, mul_splits=mul_splits, 
                                         sum_splits=sum_splits, path_prefix = f'{temp_bucket}/prs/tree_matmul{"_max_pops" if args.max_pops else ""}',
                                         read_if_exists = True)
        prs_bm.write(get_prs_bm_path(high_quality=args.high_quality,
                                     max_pops=args.max_pops), args.overwrite)

    if args.create_prs_mt:
        prs_bm = BlockMatrix.read(get_prs_bm_path(high_quality=args.high_quality,
                                                  max_pops=args.max_pops))
        pheno_ht = hl.read_table(get_clump_sumstats_col_ht_path(high_quality=args.high_quality,
                                                                max_pops=args.max_pops)).key_by('idx')
        samples_ht = hl.read_table(genotype_samples_ht_path).key_by('idx')
        # 10k partitions for 370 GB table (441k x 108k) = 37 MB/partition
        # 5014 partitions for 240 GB table (441k x 72k) = 48 MB/partition (max_pops)
        n_partitions = 15000 #int(1000*(pheno_ht.count()/72*5)//1000) # or hard code
        mt = BlockMatrix.to_matrix_table_row_major(prs_bm, n_partitions=n_partitions).rename({'element': 'score'}) 
        mt = mt.annotate_cols(**pheno_ht[mt.col_key]).key_cols_by(*PHENO_KEY_FIELDS)
        mt = mt.annotate_rows(**samples_ht[mt.row_key]).key_rows_by('userId')
        mt.write(get_prs_mt_path(high_quality=args.high_quality, 
                                 max_pops=args.max_pops), 
                 args.overwrite)

    if args.assess_prs:
        prs_mt = hl.read_matrix_table(get_prs_mt_path(high_quality=args.high_quality, 
                                                      max_pops=args.max_pops))
        pheno_mt = get_ukb_pheno_mt()  # TODO: fix all phenos to new keying scheme
        pheno_mt = pheno_mt.key_cols_by(
            **pheno_mt.col_key.annotate(modifier=hl.if_else(pheno_mt.trait_type == "biomarkers", "irnt", pheno_mt.modifier)))
        mt = prs_mt.annotate_entries(**pheno_mt[prs_mt.row_key, prs_mt.col_key])
        mt = mt.annotate_cols(description = pheno_mt.cols()[mt.col_key].description)
        for pop in POPS:
            mt_pop = mt.filter_rows(mt.pop==pop)
            mt_pop = mt_pop.annotate_cols(prs_corr=hl.agg.linreg(mt_pop.both_sexes, [1.0, mt_pop.score]))
            cols = mt_pop.cols()
            cols.select('description', 
                        'p_threshold',
                        clump_pops_str=hl.delimit(cols.clump_pops,'-'),
                        prs_corr_r2=cols.prs_corr.multiple_r_squared, 
                        prs_corr_pval=cols.prs_corr.p_value[1], 
                        prs_corr_n=cols.prs_corr.n).export(f'gs://ukbb-diverse-temp-30day/prs/assess_prs{"_max_pops" if args.max_pops else ""}.{pop}.tsv.gz')
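In the compute_prs step above, tree_matmul_tree_matsum is a helper from the surrounding codebase; judging from its arguments, it splits the very large product genotype_bm.T @ sumstats_bm into mul_splits partial multiplications and then sums the pieces in sum_splits stages, staging intermediates under path_prefix. Hail's built-in BlockMatrix.tree_matmul covers the inner-dimension splitting part; a hedged sketch of the core idea (the helper's exact semantics are an assumption):

# split the shared variant dimension into chunks and stage partial products
prs_sketch = genotype_bm.T.tree_matmul(sumstats_bm,
                                       splits=mul_splits,
                                       path_prefix=f'{temp_bucket}/prs/tree_matmul_sketch')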
Example #35
0
def checkpoint_bm(bm, path, read_if_exists=True):
    # (re)write unless a finished copy exists and read_if_exists allows reuse
    if not (read_if_exists and hl.hadoop_is_file(f'{path}/_SUCCESS')):
        bm.write(path, overwrite=True)
    bm = BlockMatrix.read(path)
    return bm
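A typical call, with a hypothetical bucket path:

bm = checkpoint_bm(bm, 'gs://my-temp-bucket/ld/adjusted.bm')  # path is illustrative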