Ejemplo n.º 1
0
    def test_matrix_ops(self):
        """Check transpose, matrix product and diagonal against NumPy.

        A 2x3 matrix and a 1x3 row, both built with ``block_size=2``, are
        compared with the NumPy results of the same operations. Products are
        tested with a BlockMatrix and with a NumPy matrix on the right-hand
        side.
        """
        nm = np.matrix([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
        m = BlockMatrix.from_numpy(nm, block_size=2)

        nrow = np.matrix([[7.0, 8.0, 9.0]])
        row = BlockMatrix.from_numpy(nrow, block_size=2)

        # Transpose (this case used to be asserted twice in a row).
        self._assert_eq(m.T, nm.T)
        self._assert_eq(row.T, nrow.T)

        # X @ X.T
        self._assert_eq(m @ m.T, nm @ nm.T)
        self._assert_eq(m @ nm.T, nm @ nm.T)
        self._assert_eq(row @ row.T, nrow @ nrow.T)
        self._assert_eq(row @ nrow.T, nrow @ nrow.T)

        # X.T @ X
        self._assert_eq(m.T @ m, nm.T @ nm)
        self._assert_eq(m.T @ nm, nm.T @ nm)
        self._assert_eq(row.T @ row, nrow.T @ nrow)
        self._assert_eq(row.T @ nrow, nrow.T @ nrow)

        # Inner dimensions do not agree (2x3 @ 2x3).
        self.assertRaises(ValueError, lambda: m @ m)
        self.assertRaises(ValueError, lambda: m @ nm)

        self._assert_eq(m.diagonal(), np.array([1.0, 5.0]))
        self._assert_eq(m.T.diagonal(), np.array([1.0, 5.0]))
        self._assert_eq((m @ m.T).diagonal(), np.array([14.0, 77.0]))
Ejemplo n.º 2
0
    def test_stage_locally(self):
        """Write a block matrix with ``stage_locally=True`` and read it back."""
        expected = np.arange(0, 80, dtype=float).reshape(8, 10)
        uri = new_temp_file()

        written = BlockMatrix.from_numpy(expected, block_size=3)
        written.write(uri, stage_locally=True)

        round_tripped = BlockMatrix.read(uri)
        self._assert_eq(expected, round_tripped)
Ejemplo n.º 3
0
    def test_from_entry_expr_options(self):
        """Exercise the mean_impute / center / normalize flags of from_entry_expr."""
        def build_mt(a):
            # One row key (v=0) and three column keys (s=0..2); entry x comes from `a`.
            records = [{'v': 0, 's': s, 'x': a[s]} for s in range(3)]
            schema = hl.dtype('struct{v: int32, s: int32, x: float64}')
            mt = hl.Table.parallelize(records, schema).to_matrix_table(['v'], ['s'])
            ids = mt.key_cols_by()['s'].collect()
            # Pick the columns in the order s = 0, 1, 2.
            return mt.choose_cols([ids.index(s) for s in range(3)])

        def check(expr, mean_impute, center, normalize, expected):
            bm = BlockMatrix.from_entry_expr(expr,
                                             mean_impute=mean_impute,
                                             center=center,
                                             normalize=normalize)
            assert np.allclose(np.squeeze(bm.to_numpy()), expected)

        a = np.array([0.0, 1.0, 2.0])

        # (center, normalize, expected values) for every flag combination.
        cases = [
            (False, False, a),
            (True, False, a - 1.0),
            (False, True, a / np.sqrt(5)),
            (True, True, (a - 1.0) / np.sqrt(2)),
        ]

        mt = build_mt(a)
        for center, normalize, expected in cases:
            check(mt.x, False, center, normalize, expected)
        check(mt.x + 1 - 1, False, False, False, a)

        # With the middle entry missing, mean imputation must give the same results.
        mt = build_mt([0.0, hl.null('float64'), 2.0])
        for center, normalize, expected in cases:
            check(mt.x, True, center, normalize, expected)

        # Without mean_impute the missing entry must be rejected.
        with self.assertRaises(Exception):
            BlockMatrix.from_entry_expr(mt.x)
Ejemplo n.º 4
0
    def test_sum(self):
        """Compare total, per-column and per-row sums with NumPy for dense and sparsified matrices."""
        def sums_agree(bm, nd):
            self.assertAlmostEqual(bm.sum(), np.sum(nd))
            for axis in (0, 1):
                self._assert_close(bm.sum(axis=axis),
                                   np.sum(nd, axis=axis, keepdims=True))

        dense_nd = np.random.normal(size=(11, 13))
        dense_bm = BlockMatrix.from_numpy(dense_nd, block_size=3)

        # 5x7 zeros with a 2x2 patch of non-zero values at rows 2-3, columns 4-5.
        patch_nd = np.zeros(shape=(5, 7))
        patch_nd[2:4, 4:6] = [[1.0, 2.0], [3.0, 4.0]]

        def sparsified(rectangles):
            return BlockMatrix.from_numpy(patch_nd, block_size=2).sparsify_rectangles(rectangles)

        patch_only = sparsified([[2, 4, 4, 6]])
        patch_and_column = sparsified([[2, 4, 4, 6], [0, 5, 0, 1]])
        patch_and_row = sparsified([[2, 4, 4, 6], [0, 1, 0, 7]])

        zero_nd = np.zeros(shape=(5, 7))
        zero_bm = BlockMatrix.fill(5, 7, value=0.0, block_size=2).sparsify_rectangles([])

        for bm, nd in [(dense_bm, dense_nd),
                       (patch_only, patch_nd),
                       (patch_and_column, patch_nd),
                       (patch_and_row, patch_nd),
                       (zero_bm, zero_nd)]:
            sums_agree(bm, nd)
Ejemplo n.º 5
0
    def test_sparsify_band(self):
        """Check sparsify_band on a 4x4 matrix and, for several bands, on an 8x10 matrix."""
        small = np.array([[ 1.0,  2.0,  3.0,  4.0],
                          [ 5.0,  6.0,  7.0,  8.0],
                          [ 9.0, 10.0, 11.0, 12.0],
                          [13.0, 14.0, 15.0, 16.0]])
        small_bm = BlockMatrix.from_numpy(small, block_size=2)

        band_expected = np.array([[ 1.,  2.,  3.,  0.],
                                  [ 5.,  6.,  7.,  8.],
                                  [ 0., 10., 11., 12.],
                                  [ 0.,  0., 15., 16.]])
        self._assert_eq(small_bm.sparsify_band(lower=-1, upper=2), band_expected)

        # With blocks_only=True the expected result keeps whole 2x2 blocks.
        blocks_expected = np.array([[ 1.,  2.,  0.,  0.],
                                    [ 5.,  6.,  0.,  0.],
                                    [ 0.,  0., 11., 12.],
                                    [ 0.,  0., 15., 16.]])
        self._assert_eq(small_bm.sparsify_band(lower=0, upper=0, blocks_only=True),
                        blocks_expected)

        large = np.arange(0, 80, dtype=float).reshape(8, 10)
        large_bm = BlockMatrix.from_numpy(large, block_size=3)

        for lower, upper in [[0, 0], [1, 1], [2, 2], [-5, 5], [-7, 0], [0, 9], [-100, 100]]:
            actual = large_bm.sparsify_band(lower, upper, blocks_only=False).to_numpy()

            def in_band(i, j):
                # 1 where lower <= (column - row) <= upper, else 0
                return (lower <= j - i) * (j - i <= upper)

            mask = np.fromfunction(in_band, (8, 10))
            self._assert_eq(actual, large * mask)
Ejemplo n.º 6
0
    def test_fill(self):
        """``BlockMatrix.fill`` builds a constant matrix and honours ``block_size``."""
        nd = np.ones((3, 5))
        bm = BlockMatrix.fill(3, 5, 1.0)
        bm2 = BlockMatrix.fill(3, 5, 1.0, block_size=2)

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(bm.block_size, BlockMatrix.default_block_size())
        self.assertEqual(bm2.block_size, 2)
        self._assert_eq(bm, nd)
        self._assert_eq(bm2, nd)
Ejemplo n.º 7
0
    def test_svd(self):
        """Compare ``BlockMatrix.svd`` with ``numpy.linalg.svd`` on small inputs.

        Singular vectors are only compared up to sign. The calls that pass
        ``complexity_bound=0`` are the ones labelled ``_svd_gramian`` below.
        """
        def assert_same_columns_up_to_sign(a, b):
            for j in range(a.shape[1]):
                assert np.allclose(a[:, j], b[:, j]) or np.allclose(-a[:, j], b[:, j])

        x0 = np.array([[-2.0, 0.0, 3.0],
                       [-1.0, 2.0, 4.0]])
        u0, s0, vt0 = np.linalg.svd(x0, full_matrices=False)

        x = BlockMatrix.from_numpy(x0)

        # _svd
        u, s, vt = x.svd()
        assert_same_columns_up_to_sign(u, u0)
        assert np.allclose(s, s0)
        assert_same_columns_up_to_sign(vt.T, vt0.T)

        s = x.svd(compute_uv=False)
        assert np.allclose(s, s0)

        # left _svd_gramian
        u, s, vt = x.svd(complexity_bound=0)
        assert_same_columns_up_to_sign(u, u0)
        assert np.allclose(s, s0)
        assert_same_columns_up_to_sign(vt.to_numpy().T, vt0.T)

        s = x.svd(compute_uv=False, complexity_bound=0)
        assert np.allclose(s, s0)

        # right _svd_gramian
        x = BlockMatrix.from_numpy(x0.T)
        u, s, vt = x.svd(complexity_bound=0)
        assert_same_columns_up_to_sign(u.to_numpy(), vt0.T)
        assert np.allclose(s, s0)
        assert_same_columns_up_to_sign(vt.T, u0)

        s = x.svd(compute_uv=False, complexity_bound=0)
        assert np.allclose(s, s0)

        # left _svd_gramian when dimensions agree
        x = BlockMatrix.from_numpy(x0[:, :2])
        u, s, vt = x.svd(complexity_bound=0)
        assert isinstance(u, np.ndarray)
        assert isinstance(vt, BlockMatrix)

        # rank-deficient X sets negative eigenvalues to 0.0
        # (an eigh() call whose result was never used has been removed here)
        a = np.array([[0.0, 1.0, np.e, np.pi, 10.0, 25.0]])
        x0 = a.T @ a  # rank 1

        x = BlockMatrix.from_numpy(x0)
        _, s, _ = x.svd(complexity_bound=0)
        assert np.all(s >= 0.0)

        s = x.svd(compute_uv=False, complexity_bound=0)
        assert np.all(s >= 0)
Ejemplo n.º 8
0
    def test_svd(self):
        """Check ``BlockMatrix.svd`` against ``numpy.linalg.svd``.

        Covers the default call, ``compute_uv=False``, and the
        ``complexity_bound=0`` calls (labelled ``_svd_gramian`` below) on a
        wide, a tall, a square and a rank-1 input. Singular vectors are
        compared column by column, up to sign.
        """
        def assert_same_columns_up_to_sign(a, b):
            for j in range(a.shape[1]):
                assert np.allclose(a[:, j], b[:, j]) or np.allclose(-a[:, j], b[:, j])

        x0 = np.array([[-2.0, 0.0, 3.0],
                       [-1.0, 2.0, 4.0]])
        u0, s0, vt0 = np.linalg.svd(x0, full_matrices=False)

        x = BlockMatrix.from_numpy(x0)

        # _svd
        u, s, vt = x.svd()
        assert_same_columns_up_to_sign(u, u0)
        assert np.allclose(s, s0)
        assert_same_columns_up_to_sign(vt.T, vt0.T)

        s = x.svd(compute_uv=False)
        assert np.allclose(s, s0)

        # left _svd_gramian
        u, s, vt = x.svd(complexity_bound=0)
        assert_same_columns_up_to_sign(u, u0)
        assert np.allclose(s, s0)
        assert_same_columns_up_to_sign(vt.to_numpy().T, vt0.T)

        s = x.svd(compute_uv=False, complexity_bound=0)
        assert np.allclose(s, s0)

        # right _svd_gramian
        x = BlockMatrix.from_numpy(x0.T)
        u, s, vt = x.svd(complexity_bound=0)
        assert_same_columns_up_to_sign(u.to_numpy(), vt0.T)
        assert np.allclose(s, s0)
        assert_same_columns_up_to_sign(vt.T, u0)

        s = x.svd(compute_uv=False, complexity_bound=0)
        assert np.allclose(s, s0)

        # left _svd_gramian when dimensions agree
        x = BlockMatrix.from_numpy(x0[:, :2])
        u, s, vt = x.svd(complexity_bound=0)
        assert isinstance(u, np.ndarray)
        assert isinstance(vt, BlockMatrix)

        # rank-deficient X sets negative eigenvalues to 0.0
        # (the unused reference eigendecomposition was dropped)
        a = np.array([[0.0, 1.0, np.e, np.pi, 10.0, 25.0]])
        x0 = a.T @ a  # rank 1

        x = BlockMatrix.from_numpy(x0)
        _, s, _ = x.svd(complexity_bound=0)
        assert np.all(s >= 0.0)

        s = x.svd(compute_uv=False, complexity_bound=0)
        assert np.all(s >= 0)
Ejemplo n.º 9
0
    def test_write_overwrite(self):
        """A second write to the same path raises FatalError unless ``overwrite=True``."""
        target = new_temp_file()

        first = BlockMatrix.from_numpy(np.array([[0]]))
        first.write(target)
        with self.assertRaises(FatalError):
            first.write(target)

        replacement = BlockMatrix.from_numpy(np.array([[1]]))
        replacement.write(target, overwrite=True)
        self._assert_eq(BlockMatrix.read(target), replacement)
Ejemplo n.º 10
0
    def test_export_blocks(self):
        """Export blocks in binary form and rebuild the matrix with rectangles_to_numpy."""
        expected = np.ones(shape=(8, 10))
        block_matrix = BlockMatrix.from_numpy(expected, block_size=20)

        # The export takes a URI while rectangles_to_numpy is given the local path.
        local_dir = new_local_temp_dir()
        block_matrix.export_blocks(local_path_uri(local_dir), binary=True)

        rebuilt = BlockMatrix.rectangles_to_numpy(local_dir, binary=True)
        self._assert_eq(expected, rebuilt)
Ejemplo n.º 11
0
    def test_export_blocks(self):
        """Round-trip an 8x10 matrix of ones through a binary block export."""
        ones = np.ones(shape=(8, 10))
        source = BlockMatrix.from_numpy(ones, block_size=20)

        export_dir = new_local_temp_dir()
        export_uri = local_path_uri(export_dir)
        source.export_blocks(export_uri, binary=True)

        # Read the exported files back from the local directory.
        self._assert_eq(ones, BlockMatrix.rectangles_to_numpy(export_dir, binary=True))
Ejemplo n.º 12
0
    def test_write_overwrite(self):
        """Re-writing an existing path needs ``overwrite=True``; otherwise FatalError is raised."""
        out_path = new_temp_file()

        original = BlockMatrix.from_numpy(np.array([[0]]))
        original.write(out_path)

        def write_again():
            return original.write(out_path)

        self.assertRaises(FatalError, write_again)

        updated = BlockMatrix.from_numpy(np.array([[1]]))
        updated.write(out_path, overwrite=True)

        reread = BlockMatrix.read(out_path)
        self._assert_eq(reread, updated)
Ejemplo n.º 13
0
def plot_correlation_matrices(chr_list):
    """
    Plot combined correlation matrices for genotype-correlation and
    sumstats-correlation matrices

    For every chromosome in `chr_list` the two block matrices are read and
    cut into disjoint diagonal windows of at most `M_max` variants. Each
    window is converted to NumPy and plotted in square tiles of at most `w`
    variants. A tile is the sum of the sumstats tile and the transposed
    genotype tile with its diagonal set to 1, and is saved as a PNG under
    ``gs://nbaya/sumstats_corr/plots/``.

    Relies on the module-level name `variant_set` to build the input and
    output paths.

    :param chr_list: iterable of chromosome labels used in the paths
    """
    M_max = int(1e4)  # max number of variants to be taken from the block matrices (suggested: 2e4)
    w = int(5e3)  # window width of variants for correlation matrix (suggested: 2e3)

    for ch in chr_list:
        ss_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_ss_correlation_chr{}.bm/'.format(ch))
        gt_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_gt_correlation_chr{}.bm/'.format(ch))
        M = ss_ch.shape[0]  # dimension of block matrix
        pad = len(str(M))  # zero-pad variant indices in file names to a common width

        # Step over window starts directly. Counting windows with
        # int(M / M_max) + 1 produced an empty trailing window [M, M) whenever
        # M was an exact multiple of M_max.
        for M0 in range(0, M, M_max):  # start variant index for block matrix filtering
            M1 = min(M0 + M_max, M)  # stop variant index for block matrix filtering
            ss_np = ss_ch[M0:M1, M0:M1].to_numpy()
            gt_np = gt_ch[M0:M1, M0:M1].to_numpy()
            print('\nStarting variant window: [' + str(M0) + ',' + str(M1) +
                  ']')
            n_window = M1 - M0
            for w0 in range(0, n_window, w):  # start variant index for window of correlation matrix
                w1 = min(w0 + w, n_window)  # stop variant index for window of correlation matrix
                full = (ss_np[w0:w1, w0:w1] + gt_np[w0:w1, w0:w1].T)
                np.fill_diagonal(full, 1)
                fig, ax = plt.subplots()
                ax.imshow(full, cmap='bwr')
                ax.plot([0, w], [0, w], 'k--', alpha=0.5, lw=2)
                plt.xlim([0, w])
                plt.ylim([w, 0])
                ax.text(w * 0.83, w * 0.1, "SS", fontsize=60, alpha=0.5)
                ax.text(w * 0.02, w * 0.97, "GT", fontsize=60, alpha=0.5)
                plt.title('chr' + str(ch) + ' ' + variant_set + ' variants (' +
                          str(M0 + w0) + '-' + str(M0 + w1) + ')')
                fig = plt.gcf()
                fig.set_size_inches(10, 10)
                path = ('gs://nbaya/sumstats_corr/plots/chr' + str(ch) + '_' +
                        variant_set + '_' + str(M0 + w0).zfill(pad) +
                        '-' + str(M0 + w1).zfill(pad) + '.png')
                with hl.hadoop_open(path, 'wb') as f:
                    fig.savefig(f, dpi=600)
                plt.close()
            print('\nFinished variant window: [' + str(M0) + ',' + str(M1) +
                  ']')
Ejemplo n.º 14
0
    def test_to_table_maximum_cache_memory_in_bytes_limits(self):
        """A 15-byte cache limit must fail with the expected message; 16 bytes must work."""
        def new_bm():
            return BlockMatrix._create(5, 2, [float(i) for i in range(10)], 2)

        expected_fragment = 'BlockMatrixCachedPartFile must be able to hold at least one row of every block in memory'

        too_small = new_bm()
        raised = None
        try:
            too_small.to_table_row_major(2, maximum_cache_memory_in_bytes=15)._force_count()
        except Exception as exc:
            raised = exc

        # The call above has to fail, and with the expected explanation.
        assert raised is not None
        assert expected_fragment in raised.args[0]

        large_enough = new_bm()
        large_enough.to_table_row_major(2, maximum_cache_memory_in_bytes=16)._force_count()
Ejemplo n.º 15
0
def tree_matmul_tree_matsum(bm1,
                            bm2,
                            mul_splits: int,
                            sum_splits: int = None,
                            path_prefix: str = None,
                            read_if_exists=False):
    r'''
    Version of tree_matmul() that allows for intermediate sums of matrix
    multiplication. `sum_splits` must be a divisor of `mul_splits`

    The inner (block-column) dimension of `bm1` is cut into at most
    `mul_splits` ranges. The product for each range is written to
    ``{path_prefix}_{i}``; the products are then summed in up to `sum_splits`
    groups, each written to ``{path_prefix}-partial-{i}``, and the sum of
    those partial sums is returned.

    :param bm1: left block matrix
    :param bm2: right block matrix
    :param mul_splits: requested number of intermediate products
    :param sum_splits: number of partial sums; must be given despite the
        ``None`` default
    :param path_prefix: prefix of every path that is written and read
    :param read_if_exists: if true, skip computing and writing the
        intermediate products and only read them from `path_prefix`
    :return: sum of the partial sums
    :raises TypeError: if `sum_splits` is None
    '''
    # TODO: Make a private function that acts recursively to ensure that the
    # matrix sums never include more than a maximum number of matrices
    if sum_splits is None:
        # Previously this surfaced as an opaque "unsupported operand" TypeError.
        raise TypeError('`sum_splits` must be an integer divisor of `mul_splits`')
    assert mul_splits % sum_splits == 0, '`sum_splits` must be a divisor of `mul_splits`'

    # Rounding the range size up can yield fewer than `mul_splits` ranges, so
    # the number of intermediates is derived from the ranges in both modes
    # instead of assuming `mul_splits` files exist.
    inner_brange_size = int(math.ceil(bm1._n_block_cols / mul_splits))
    split_points = list(range(0, bm1._n_block_cols,
                              inner_brange_size)) + [bm1._n_block_cols]
    inner_ranges = list(zip(split_points[:-1], split_points[1:]))

    if not read_if_exists:
        print(bm1._n_block_cols)
        print(mul_splits)
        print(f'inner_brange_size: {inner_brange_size}')
        print(split_points)
        print(f'len(inner_ranges): {len(inner_ranges)}')
        blocks_to_multiply = [(bm1._select_blocks((0, bm1._n_block_rows),
                                                  (start, stop)),
                               bm2._select_blocks((start, stop),
                                                  (0, bm2._n_block_cols)))
                              for start, stop in inner_ranges]

        intermediate_multiply_exprs = [
            b1 @ b2 for b1, b2 in blocks_to_multiply
        ]
        print(len(intermediate_multiply_exprs))
        print(f'Writing {len(intermediate_multiply_exprs)} intermediate matrices to {path_prefix}')
        hl.experimental.write_block_matrices(intermediate_multiply_exprs,
                                             path_prefix)

    read_intermediates = [
        BlockMatrix.read(f"{path_prefix}_{i}") for i in range(len(inner_ranges))
    ]

    tracked_partial_sums = []

    sum_block_size = math.ceil(mul_splits / sum_splits)
    for i in range(sum_splits):
        group = read_intermediates[i * sum_block_size:(i + 1) * sum_block_size]
        if not group:
            # Fewer intermediates than `mul_splits`: the remaining groups are empty.
            break
        partial_sum_path = f"{path_prefix}-partial-{i}"
        sum(group).write(partial_sum_path, overwrite=True)
        tracked_partial_sums.append(BlockMatrix.read(partial_sum_path))

    return sum(tracked_partial_sums)
Ejemplo n.º 16
0
def generate_cross_pop_ld_scores_from_ld_matrices(
        pop1,
        pop2,
        data_type,
        pop_data,
        min_frequency=0.01,
        call_rate_cutoff=0.8,
        adj: bool = False,
        radius: int = 1000000,
        overwrite=False,
        temp_bucket='gs://gnomad-tmp/ld'):
    """
    Compute cross-population LD scores from the LD matrices of two populations.

    The LD index table of each population is filtered on allele frequency and
    call rate, both tables are restricted to the keys they share, and the two
    LD matrices are filtered to those variants and multiplied element-wise.
    The product is passed to `compute_and_annotate_ld_score`.

    :param pop1: first population
    :param pop2: second population
    :param data_type: data type passed to the `ld_resources` path helpers
    :param pop_data: object whose ``pop`` mapping gives the size used in the
        call-rate filter for each population
    :param min_frequency: minimum AF; ``1 - min_frequency`` is the maximum
    :param call_rate_cutoff: keep variants with ``AN / n >= 2 * call_rate_cutoff``
    :param adj: passed through to the `ld_resources` path helpers
    :param radius: passed to `compute_and_annotate_ld_score`
    :param overwrite: overwrite checkpoints and output; when false, existing
        checkpoints are read instead
    :param temp_bucket: location of the intermediate checkpoints
    """
    def load_filtered_index(pop, n):
        ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj))
        in_freq_range = ((ht.pop_freq.AF >= min_frequency)
                         & (ht.pop_freq.AF <= 1 - min_frequency))
        well_called = ht.pop_freq.AN / n >= 2 * call_rate_cutoff
        return ht.filter(in_freq_range & well_called)

    def restrict_to_shared(ht, other, checkpoint_name):
        # Keep rows whose key is also in `other`, re-index them and checkpoint.
        shared = ht.filter(hl.is_defined(other[ht.key])).add_index(name='new_idx')
        return shared.checkpoint(f'{temp_bucket}/{checkpoint_name}.ht',
                                 overwrite=overwrite,
                                 _read_if_exists=not overwrite)

    def load_ld_matrix(pop, indices):
        matrix_path = ld_resources._ld_matrix_path(data_type,
                                                   pop,
                                                   min_frequency >= COMMON_FREQ,
                                                   adj=adj)
        return BlockMatrix.read(matrix_path).filter(indices, indices)

    n1 = pop_data.pop[pop1]
    n2 = pop_data.pop[pop2]
    ht1 = load_filtered_index(pop1, n1)
    ht2 = load_filtered_index(pop2, n2)

    # ht2 is restricted against the already restricted ht1, as before.
    ht1 = restrict_to_shared(ht1, ht2, f'{pop1}_{pop2}')
    ht2 = restrict_to_shared(ht2, ht1, f'{pop2}_{pop1}')

    indices1 = ht1.idx.collect()
    indices2 = ht2.idx.collect()
    assert len(indices1) == len(indices2)

    r_bm = load_ld_matrix(pop1, indices1) * load_ld_matrix(pop2, indices2)

    # TODO: is a bias adjustment needed?
    # r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    out_name = ld_resources._cross_pop_ld_scores_path(data_type, pop1, pop2,
                                                      adj)
    compute_and_annotate_ld_score(ht1, r_bm, radius, out_name, overwrite)
Ejemplo n.º 17
0
    def test_export_rectangles(self):
        """Export rectangles of a written block matrix as text and as binary.

        For several rectangle lists and block sizes, the matrix is sparsified
        to the rectangles, written row-major, and exported. Every exported
        file is compared with the matching slice of the NumPy array. Finally,
        exporting a rectangle that lies outside the sparsified region must
        raise a FatalError that names the missing block.
        """
        nd = np.arange(0, 80, dtype=float).reshape(8, 10)

        rects1 = [[0, 1, 0, 1], [4, 5, 7, 8]]

        rects2 = [[4, 5, 0, 10], [0, 8, 4, 5]]

        rects3 = [[0, 1, 0, 1], [1, 2, 1, 2], [2, 3, 2, 3], [3, 5, 3, 6],
                  [3, 6, 3, 7], [3, 7, 3, 8], [4, 5, 0, 10], [0, 8, 4, 5],
                  [0, 8, 0, 10]]

        for rects in [rects1, rects2, rects3]:
            for block_size in [3, 4, 10]:
                bm_uri = new_temp_file()

                rect_path = new_local_temp_dir()
                rect_uri = local_path_uri(rect_path)

                (BlockMatrix.from_numpy(
                    nd,
                    block_size=block_size).sparsify_rectangles(rects).write(
                        bm_uri, force_row_major=True))

                BlockMatrix.export_rectangles(bm_uri, rect_uri, rects)

                for (i, r) in enumerate(rects):
                    # File name: rect-<index>_<bounds joined by '-'>
                    rect_file = rect_path + '/rect-' + str(i) + '_' + '-'.join(
                        map(str, r))
                    expected = nd[r[0]:r[1], r[2]:r[3]]
                    actual = np.loadtxt(rect_file, ndmin=2)
                    self._assert_eq(expected, actual)

                rect_path_bytes = new_local_temp_dir()
                rect_uri_bytes = local_path_uri(rect_path_bytes)

                BlockMatrix.export_rectangles(bm_uri,
                                              rect_uri_bytes,
                                              rects,
                                              binary=True)

                for (i, r) in enumerate(rects):
                    rect_file = rect_path_bytes + '/rect-' + str(
                        i) + '_' + '-'.join(map(str, r))
                    expected = nd[r[0]:r[1], r[2]:r[3]]
                    # The binary file is read as a flat array and reshaped to the rectangle.
                    actual = np.reshape(np.fromfile(rect_file),
                                        (r[1] - r[0], r[3] - r[2]))
                    self._assert_eq(expected, actual)

        bm_uri = new_temp_file()
        rect_uri = new_temp_file()

        (BlockMatrix.from_numpy(nd, block_size=5).sparsify_rectangles(
            [[0, 1, 0, 1]]).write(bm_uri, force_row_major=True))

        with self.assertRaises(FatalError) as e:
            BlockMatrix.export_rectangles(bm_uri, rect_uri, [[5, 6, 5, 6]])
        # The message check used to sit inside the `with` block after the
        # raising call, where it could never run. It also used the removed
        # `assertEquals` alias and `e.msg` instead of `e.exception`.
        self.assertIn(
            'block (1, 1) missing for rectangle 0 with bounds [5, 6, 5, 6]',
            str(e.exception))
Ejemplo n.º 18
0
    def test_export_rectangles(self):
        """Export rectangles as text and as binary for several block sizes."""
        nd = np.arange(0, 80, dtype=float).reshape(8, 10)

        # Three rectangle lists; each rectangle is a list of four bounds.
        rectangle_sets = [
            [[0, 1, 0, 1], [4, 5, 7, 8]],
            [[4, 5, 0, 10], [0, 8, 4, 5]],
            [[0, 1, 0, 1], [1, 2, 1, 2], [2, 3, 2, 3], [3, 5, 3, 6],
             [3, 6, 3, 7], [3, 7, 3, 8], [4, 5, 0, 10], [0, 8, 4, 5],
             [0, 8, 0, 10]],
        ]

        for rects in rectangle_sets:
            for block_size in [3, 4, 10]:
                text_dir = new_local_temp_dir()
                text_uri = local_path_uri(text_dir)

                bm = BlockMatrix.from_numpy(nd, block_size=block_size)

                # Text export
                bm.export_rectangles(text_uri, rects)
                self._assert_rectangles_eq(nd, text_dir, rects)

                # Binary export into a separate directory
                binary_dir = new_local_temp_dir()
                binary_uri = local_path_uri(binary_dir)

                bm.export_rectangles(binary_uri, rects, binary=True)
                self._assert_rectangles_eq(nd, binary_dir, rects, binary=True)
Ejemplo n.º 19
0
def generate_ld_scores_from_ld_matrix(pop_data,
                                      data_type,
                                      min_frequency=0.01,
                                      call_rate_cutoff=0.8,
                                      adj: bool = False,
                                      radius: int = 1000000,
                                      overwrite=False):
    """
    Compute LD scores for every population from its LD matrix.

    For each population the LD index table is filtered on allele frequency
    and call rate, the LD matrix is filtered to the remaining variants and
    squared element-wise, the adjustment
    ``(n - 1) / (n - 2) * r2 - 1 / (n - 2)`` is applied, and the result is
    passed to `compute_and_annotate_ld_score`.

    :param pop_data: convertible to a dict whose values map population to
        the size ``n`` used in the call-rate filter and the adjustment
    :param data_type: data type passed to the `ld_resources` path helpers
    :param min_frequency: minimum AF; ``1 - min_frequency`` is the maximum
    :param call_rate_cutoff: keep variants with ``AN / n >= 2 * call_rate_cutoff``
    :param adj: passed through to the `ld_resources` path helpers
    :param radius: passed to `compute_and_annotate_ld_score`
    :param overwrite: passed to `compute_and_annotate_ld_score`
    """
    # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR
    # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 mins to compute LD scores per population (~$150 total)
    # The outer keys were never used, so only the values are iterated.
    for pops in dict(pop_data).values():
        for pop, n in pops.items():
            ht = hl.read_table(
                ld_resources._ld_index_path(data_type, pop, adj=adj))
            ht = ht.filter((ht.pop_freq.AF >= min_frequency)
                           & (ht.pop_freq.AF <= 1 - min_frequency)
                           & (ht.pop_freq.AN / n >= 2 *
                              call_rate_cutoff)).add_index(name='new_idx')

            indices = ht.idx.collect()

            r2 = BlockMatrix.read(
                ld_resources._ld_matrix_path(data_type,
                                             pop,
                                             min_frequency >= COMMON_FREQ,
                                             adj=adj))
            r2 = r2.filter(indices, indices)**2
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            out_name = ld_resources._ld_scores_path(data_type, pop, adj)
            compute_and_annotate_ld_score(ht, r2_adj, radius, out_name,
                                          overwrite)
Ejemplo n.º 20
0
    def test_sparsify_triangle(self):
        """Check sparsify_triangle for the upper, lower and blocks-only cases."""
        dense = np.array([[ 1.0,  2.0,  3.0,  4.0],
                          [ 5.0,  6.0,  7.0,  8.0],
                          [ 9.0, 10.0, 11.0, 12.0],
                          [13.0, 14.0, 15.0, 16.0]])
        bm = BlockMatrix.from_numpy(dense, block_size=2)

        self.assertFalse(bm.is_sparse)
        self.assertTrue(bm.sparsify_triangle().is_sparse)

        # Default call: entries below the diagonal are expected to be zero.
        self._assert_eq(bm.sparsify_triangle(), np.triu(dense))

        # lower=True: entries above the diagonal are expected to be zero.
        self._assert_eq(bm.sparsify_triangle(lower=True), np.tril(dense))

        # blocks_only=True: only the lower-left 2x2 block is expected to be zero.
        blocks_only_expected = np.array([[ 1.,  2.,  3.,  4.],
                                         [ 5.,  6.,  7.,  8.],
                                         [ 0.,  0., 11., 12.],
                                         [ 0.,  0., 15., 16.]])
        self._assert_eq(bm.sparsify_triangle(blocks_only=True), blocks_only_expected)
Ejemplo n.º 21
0
    def test_to_matrix_table(self):
        """Convert a block matrix to a matrix table and check a round trip."""
        n_partitions = 2
        rows, cols = 2, 5
        values = [float(i) for i in range(10)]
        actual = BlockMatrix._create(rows, cols, values).to_matrix_table_row_major(n_partitions)

        # Expected table: entry (r, c) holds r * cols + c, with int64 row and column keys.
        base = hl.utils.range_matrix_table(rows, cols)
        with_entries = base.annotate_entries(
            element=hl.float64(base.row_idx * cols + base.col_idx))
        col_keyed = with_entries.key_cols_by(col_idx=hl.int64(with_entries.col_idx))
        expected = col_keyed.key_rows_by(row_idx=hl.int64(col_keyed.row_idx))
        assert expected._same(actual)

        # Matrix table -> block matrix -> matrix table must reproduce the table.
        random_bm = BlockMatrix.random(50, 100, block_size=25, seed=0)
        mt = random_bm.to_matrix_table_row_major(n_partitions)
        mt_round_trip = BlockMatrix.from_entry_expr(mt.element).to_matrix_table_row_major()
        assert mt._same(mt_round_trip)
Ejemplo n.º 22
0
    def test_sparsify_triangle(self):
        """sparsify_triangle yields a sparse matrix with the expected zero pattern."""
        source = np.array([[ 1.0,  2.0,  3.0,  4.0],
                           [ 5.0,  6.0,  7.0,  8.0],
                           [ 9.0, 10.0, 11.0, 12.0],
                           [13.0, 14.0, 15.0, 16.0]])
        block_matrix = BlockMatrix.from_numpy(source, block_size=2)

        self.assertFalse(block_matrix.is_sparse)
        self.assertTrue(block_matrix.sparsify_triangle().is_sparse)

        upper_expected = np.array([[ 1.,  2.,  3.,  4.],
                                   [ 0.,  6.,  7.,  8.],
                                   [ 0.,  0., 11., 12.],
                                   [ 0.,  0.,  0., 16.]])
        self._assert_eq(block_matrix.sparsify_triangle(), upper_expected)

        lower_expected = np.array([[ 1.,  0.,  0.,  0.],
                                   [ 5.,  6.,  0.,  0.],
                                   [ 9., 10., 11.,  0.],
                                   [13., 14., 15., 16.]])
        self._assert_eq(block_matrix.sparsify_triangle(lower=True), lower_expected)

        # With blocks_only=True the expected result keeps whole 2x2 blocks.
        upper_blocks_expected = np.array([[ 1.,  2.,  3.,  4.],
                                          [ 5.,  6.,  7.,  8.],
                                          [ 0.,  0., 11., 12.],
                                          [ 0.,  0., 15., 16.]])
        self._assert_eq(block_matrix.sparsify_triangle(blocks_only=True),
                        upper_blocks_expected)
Ejemplo n.º 23
0
    def test_random_uniform(self):
        """Every entry of a non-gaussian random block matrix must be positive."""
        entries = BlockMatrix.random(10, 10, gaussian=False).to_numpy()

        # Element-wise comparison over the whole array instead of a nested loop.
        assert (entries > 0).all()
Ejemplo n.º 24
0
    def test_export_rectangles(self):
        """Check text and binary rectangle exports against the source array."""
        source = np.arange(0, 80, dtype=float).reshape(8, 10)

        two_cells = [[0, 1, 0, 1], [4, 5, 7, 8]]
        row_and_column = [[4, 5, 0, 10], [0, 8, 4, 5]]
        many_rects = [[0, 1, 0, 1], [1, 2, 1, 2], [2, 3, 2, 3],
                      [3, 5, 3, 6], [3, 6, 3, 7], [3, 7, 3, 8],
                      [4, 5, 0, 10], [0, 8, 4, 5], [0, 8, 0, 10]]

        for rects in (two_cells, row_and_column, many_rects):
            for block_size in (3, 4, 10):
                out_dir = new_local_temp_dir()
                out_uri = local_path_uri(out_dir)

                block_matrix = BlockMatrix.from_numpy(source, block_size=block_size)
                block_matrix.export_rectangles(out_uri, rects)

                self._assert_rectangles_eq(source, out_dir, rects)

                # Same export again in binary form, into a fresh directory.
                out_dir_bytes = new_local_temp_dir()
                out_uri_bytes = local_path_uri(out_dir_bytes)

                block_matrix.export_rectangles(out_uri_bytes, rects, binary=True)
                self._assert_rectangles_eq(source, out_dir_bytes, rects, binary=True)
Ejemplo n.º 25
0
    def test_random_uniform(self):
        """All entries drawn with ``gaussian=False`` must be strictly positive."""
        block_matrix = BlockMatrix.random(10, 10, gaussian=False)
        samples = block_matrix.to_numpy()

        # Walk the entries in row-major order, as the nested loop did.
        for value in samples.flat:
            assert value > 0
Ejemplo n.º 26
0
    def bm(self) -> BlockMatrix:
        """
        Read and return the Hail BlockMatrix stored at ``self.path``.

        :return: BlockMatrix read from ``self.path``
        """
        return BlockMatrix.read(self.path)
Ejemplo n.º 27
0
 def check(expr, mean_impute, center, normalize, expected):
     """Assert that from_entry_expr with the given flags is close to `expected`."""
     block_matrix = BlockMatrix.from_entry_expr(expr,
                                                mean_impute=mean_impute,
                                                center=center,
                                                normalize=normalize)
     # Drop length-one axes before comparing.
     result = np.squeeze(block_matrix.to_numpy())
     assert np.allclose(result, expected)
Ejemplo n.º 28
0
    def test_to_matrix_table(self):
        """to_matrix_table_row_major matches a hand-built table and round-trips."""
        n_partitions = 2
        rows, cols = 2, 5

        source = BlockMatrix._create(rows, cols, [float(i) for i in range(10)])
        actual = source.to_matrix_table_row_major(n_partitions)

        # Build the reference table step by step: entries first, then int64 keys.
        reference = hl.utils.range_matrix_table(rows, cols)
        element = hl.float64(reference.row_idx * cols + reference.col_idx)
        reference = reference.annotate_entries(element=element)
        reference = reference.key_cols_by(col_idx=hl.int64(reference.col_idx))
        reference = reference.key_rows_by(row_idx=hl.int64(reference.row_idx))
        assert reference._same(actual)

        # A random matrix must survive table -> block matrix -> table unchanged.
        mt = BlockMatrix.random(50, 100, block_size=25, seed=0).to_matrix_table_row_major(n_partitions)
        rebuilt = BlockMatrix.from_entry_expr(mt.element)
        assert mt._same(rebuilt.to_matrix_table_row_major())
Ejemplo n.º 29
0
    def test_to_table(self):
        """to_table_row_major matches a hand-built table for several partition counts and block sizes."""
        schema = hl.tstruct(row_idx=hl.tint64, entries=hl.tarray(hl.tfloat64))
        # Row i holds the two consecutive values 2i and 2i + 1.
        rows = [{'row_idx': i, 'entries': [float(2 * i), float(2 * i + 1)]}
                for i in range(5)]

        for n_partitions in (1, 2, 3):
            for block_size in (1, 2, 5):
                expected = hl.Table.parallelize(rows, schema, 'row_idx',
                                                n_partitions)
                values = [float(i) for i in range(10)]
                actual = BlockMatrix._create(5, 2, values, block_size).to_table_row_major(n_partitions)
                self.assertTrue(expected._same(actual))
Ejemplo n.º 30
0
    def test_slicing(self):
        """Check BlockMatrix indexing and slicing against NumPy on an 8x10 matrix."""
        nd = np.array(np.arange(0, 80, dtype=float)).reshape(8, 10)
        bm = BlockMatrix.from_numpy(nd, block_size=3)

        # Single elements, including negative indices.
        for element in [(0, 0), (5, 7), (-3, 9), (-8, -10)]:
            self._assert_eq(bm[element], nd[element])

        # One fixed row with a column slice: NumPy drops the indexed axis,
        # so it is restored before comparing.
        window = slice(3, 4)
        for row in [0, 1, -8, -1]:
            self._assert_eq(bm[row, window], np.expand_dims(nd[row, window], 0))

        # One fixed column with a row slice.
        for col in [0, 1, -8, -1]:
            self._assert_eq(bm[window, col], np.expand_dims(nd[window, col], 1))

        # Slices on both axes.
        slice_pairs = [
            (slice(0, 8), slice(0, 10)),
            (slice(0, 8, 2), slice(0, 10, 2)),
            (slice(2, 4), slice(5, 7)),
            (slice(-8, -1), slice(-10, -1)),
            (slice(-8, -1, 2), slice(-10, -1, 2)),
            (slice(None, 4, 1), slice(None, 4, 1)),
            (slice(4, None), slice(4, None)),
            (slice(None, None), slice(None, None)),
        ]
        for pair in slice_pairs:
            self._assert_eq(bm[pair], nd[pair])

        # Every access below must raise ValueError.
        invalid_accesses = [
            # a single index
            lambda: bm[0, ],
            # element index out of range
            lambda: bm[9, 0],
            lambda: bm[-9, 0],
            lambda: bm[0, 11],
            lambda: bm[0, -11],
            # negative step
            lambda: bm[::-1, 0],
            lambda: bm[0, ::-1],
            # empty slice
            lambda: bm[:0, 0],
            lambda: bm[0, :0],
            # row slice bound out of range
            lambda: bm[0:9, 0],
            lambda: bm[-9:, 0],
            lambda: bm[:-9, 0],
            # column slice bound out of range
            lambda: bm[0, :11],
            lambda: bm[0, -11:],
            lambda: bm[0, :-11],
        ]
        for access in invalid_accesses:
            self.assertRaises(ValueError, access)

        # Row intervals: [0, 2) for the first row, [0, 0) for all others.
        starts = [0] * 8
        stops = [2] + [0] * 7
        sparse = bm.sparsify_row_intervals(starts, stops)
        self.assertEqual(sparse[0, 1], 1.0)
        self.assertEqual(sparse[0, 2], 0.0)
        self.assertEqual(sparse[0, 9], 0.0)

        sparse_nd = np.zeros(shape=(8, 10))
        sparse_nd[0, 1] = 1.0
        self._assert_eq(sparse[:, :], sparse_nd)

        self._assert_eq(sparse[:, 1], sparse_nd[:, 1:2])
        self._assert_eq(sparse[1, :], sparse_nd[1:2, :])
        self._assert_eq(sparse[0:5, 0:5], sparse_nd[0:5, 0:5])
Ejemplo n.º 31
0
def get_Z(N_r):
    r'''
    Draws an `N_r` x 1 standard normal random vector as a Hail Block Matrix
    '''
    # A single column with N_r rows; gaussian=True asks for normal draws.
    return BlockMatrix.random(n_rows=N_r, n_cols=1, gaussian=True)
Ejemplo n.º 32
0
    def test_special_elementwise_ops(self):
        """Compare power, sqrt, log and abs of a BlockMatrix with NumPy."""
        reference = np.array([[1.0, 2.0, 3.0],
                              [4.0, 5.0, 6.0]])
        block = BlockMatrix.from_numpy(reference)

        cubed = block ** 3
        self._assert_close(cubed, reference ** 3)

        roots = block.sqrt()
        self._assert_close(roots, np.sqrt(reference))

        logs = block.log()
        self._assert_close(logs, np.log(reference))

        # Shifting by 4 gives abs() negative, zero and positive entries.
        shifted_abs = (block - 4).abs()
        self._assert_close(shifted_abs, np.abs(reference - 4))
Ejemplo n.º 33
0
    def test_block_matrix_from_numpy(self):
        """Round-trip a 3 x 5 matrix through BlockMatrix for several block sizes."""
        source = np.matrix(
            [[0, 1, 2, 3, 4],
             [5, 6, 7, 8, 9],
             [10, 11, 12, 13, 14]],
            dtype=np.float64)
        expected_rows, expected_cols = 3, 5

        for size in (1, 2, 5, 1024):
            converted = BlockMatrix.from_numpy(source, size)
            assert converted.n_rows == expected_rows
            assert converted.n_cols == expected_cols
            round_trip = converted.to_numpy()
            assert (round_trip == source).all()
Ejemplo n.º 34
0
    def test_from_entry_expr_options(self):
        """Exercise the mean_impute/center/normalize flags of from_entry_expr."""

        def build_mt(a):
            # One variant (v=0) and three samples (s=0..2) with entry x=a[s].
            records = [{'v': 0, 's': sample, 'x': a[sample]}
                       for sample in range(3)]
            row_type = hl.dtype('struct{v: int32, s: int32, x: float64}')
            table = hl.Table.parallelize(records, row_type)
            matrix_table = table.to_matrix_table(['v'], ['s'])
            # Reorder the columns so the sample ids appear as 0, 1, 2.
            sample_ids = matrix_table.key_cols_by()['s'].collect()
            column_order = [sample_ids.index(sample) for sample in range(3)]
            return matrix_table.choose_cols(column_order)

        def check(expr, mean_impute, center, normalize, expected):
            block_matrix = BlockMatrix.from_entry_expr(
                expr,
                mean_impute=mean_impute,
                center=center,
                normalize=normalize)
            actual = np.squeeze(block_matrix.to_numpy())
            assert np.allclose(actual, expected)

        a = np.array([0.0, 1.0, 2.0])

        # (center, normalize, expected), listed in the order they are checked.
        flag_cases = [
            (False, False, a),
            (True, False, a - 1.0),
            (False, True, a / np.sqrt(5)),
            (True, True, (a - 1.0) / np.sqrt(2)),
        ]

        mt = build_mt(a)
        for center, normalize, expected in flag_cases:
            check(mt.x, False, center, normalize, expected)
        check(mt.x + 1 - 1, False, False, False, a)

        # Same vector with the middle entry missing: with mean_impute=True the
        # expected results are unchanged, without it the call must fail.
        mt = build_mt([0.0, hl.null('float64'), 2.0])
        for center, normalize, expected in flag_cases:
            check(mt.x, True, center, normalize, expected)
        with self.assertRaises(Exception):
            BlockMatrix.from_entry_expr(mt.x)
Ejemplo n.º 35
0
    def test_sparsify_row_intervals(self):
        """Check sparsify_row_intervals on a fixed 4 x 4 case and on random data."""
        small = np.array([[1.0, 2.0, 3.0, 4.0],
                          [5.0, 6.0, 7.0, 8.0],
                          [9.0, 10.0, 11.0, 12.0],
                          [13.0, 14.0, 15.0, 16.0]])
        small_bm = BlockMatrix.from_numpy(small, block_size=2)

        # Default mode: only entries inside each row's [start, stop) survive.
        entrywise = small_bm.sparsify_row_intervals(
            starts=[1, 0, 2, 2],
            stops=[2, 0, 3, 4])
        self._assert_eq(
            entrywise,
            np.array([[0., 2., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 11., 0.],
                      [0., 0., 15., 16.]]))

        # blocks_only=True: whole 2 x 2 blocks are kept or dropped.
        blockwise = small_bm.sparsify_row_intervals(
            starts=[1, 0, 2, 2],
            stops=[2, 0, 3, 4],
            blocks_only=True)
        self._assert_eq(
            blockwise,
            np.array([[1., 2., 0., 0.],
                      [5., 6., 0., 0.],
                      [0., 0., 11., 12.],
                      [0., 0., 15., 16.]]))

        dense = np.random.normal(size=(8, 10))
        dense_bm = BlockMatrix.from_numpy(dense, block_size=3)

        interval_sets = [
            [[0, 1, 2, 3, 4, 5, 6, 7],
             [1, 2, 3, 4, 5, 6, 7, 8]],
            [[0, 0, 5, 3, 4, 5, 8, 2],
             [9, 0, 5, 3, 4, 5, 9, 5]],
            [[0, 5, 10, 8, 7, 6, 5, 4],
             [0, 5, 10, 9, 8, 7, 6, 5]],
        ]
        for row_starts, row_stops in interval_sets:
            actual = dense_bm.sparsify_row_intervals(
                row_starts, row_stops, blocks_only=False).to_numpy()
            expected = dense.copy()
            for row, (first, last) in enumerate(zip(row_starts, row_stops)):
                # Zero the columns left of the interval, then those from stop on.
                expected[row, :first] = 0.0
                expected[row, last:] = 0.0
            self._assert_eq(actual, expected)
Ejemplo n.º 36
0
    def test_sparsify_row_intervals(self):
        """Verify row-interval sparsification entrywise, blockwise and on random input."""
        nd = np.array([[1.0, 2.0, 3.0, 4.0],
                       [5.0, 6.0, 7.0, 8.0],
                       [9.0, 10.0, 11.0, 12.0],
                       [13.0, 14.0, 15.0, 16.0]])
        bm = BlockMatrix.from_numpy(nd, block_size=2)

        kept_entries = np.array([[0., 2., 0., 0.],
                                 [0., 0., 0., 0.],
                                 [0., 0., 11., 0.],
                                 [0., 0., 15., 16.]])
        self._assert_eq(
            bm.sparsify_row_intervals(starts=[1, 0, 2, 2], stops=[2, 0, 3, 4]),
            kept_entries)

        kept_blocks = np.array([[1., 2., 0., 0.],
                                [5., 6., 0., 0.],
                                [0., 0., 11., 12.],
                                [0., 0., 15., 16.]])
        self._assert_eq(
            bm.sparsify_row_intervals(starts=[1, 0, 2, 2],
                                      stops=[2, 0, 3, 4],
                                      blocks_only=True),
            kept_blocks)

        nd2 = np.random.normal(size=(8, 10))
        bm2 = BlockMatrix.from_numpy(nd2, block_size=3)
        columns = np.arange(10)

        for starts, stops in [[[0, 1, 2, 3, 4, 5, 6, 7],
                               [1, 2, 3, 4, 5, 6, 7, 8]],
                              [[0, 0, 5, 3, 4, 5, 8, 2],
                               [9, 0, 5, 3, 4, 5, 9, 5]],
                              [[0, 5, 10, 8, 7, 6, 5, 4],
                               [0, 5, 10, 9, 8, 7, 6, 5]]]:
            actual = bm2.sparsify_row_intervals(starts, stops, blocks_only=False).to_numpy()
            # Entry (i, j) is kept exactly when starts[i] <= j < stops[i].
            lower = np.asarray(starts)[:, np.newaxis]
            upper = np.asarray(stops)[:, np.newaxis]
            inside = (columns >= lower) & (columns < upper)
            expected = np.where(inside, nd2, 0.0)
            self._assert_eq(actual, expected)
Ejemplo n.º 37
0
    def test_special_elementwise_ops(self):
        """Check elementwise power, sqrt, ceil, floor, log and abs against NumPy."""
        values = np.array([[1.0, 2.0, 3.0, 3.14],
                           [4.0, 5.0, 6.0, 12.12]])
        bm = BlockMatrix.from_numpy(values)
        check = self._assert_close

        check(bm ** 3, values ** 3)
        check(bm.sqrt(), np.sqrt(values))
        # 3.14 and 12.12 are the non-integer entries that make rounding visible.
        check(bm.ceil(), np.ceil(values))
        check(bm.floor(), np.floor(values))
        check(bm.log(), np.log(values))
        check((bm - 4).abs(), np.abs(values - 4))
Ejemplo n.º 38
0
    def test_sum_with_sparsify(self):
        """Sums of rectangle-sparsified block matrices must agree with NumPy."""
        dense = np.zeros(shape=(5, 7))
        dense[2:4, 4:6] = [[1.0, 2.0],
                           [3.0, 4.0]]

        def sparsified(rectangles):
            return BlockMatrix.from_numpy(dense, block_size=2).sparsify_rectangles(rectangles)

        # [2, 4, 4, 6] covers the four non-zero entries; the second rectangle
        # in the other two cases only spans zeros.
        only_data = sparsified([[2, 4, 4, 6]])
        data_and_column = sparsified([[2, 4, 4, 6], [0, 5, 0, 1]])
        data_and_row = sparsified([[2, 4, 4, 6], [0, 1, 0, 7]])

        all_zero = np.zeros(shape=(5, 7))
        nothing_kept = BlockMatrix.fill(5, 7, value=0.0, block_size=2).sparsify_rectangles([])

        for block_matrix in (only_data, data_and_column, data_and_row):
            self.assert_sums_agree(block_matrix, dense)
        self.assert_sums_agree(nothing_kept, all_zero)
Ejemplo n.º 39
0
    def test_matrix_ops(self):
        """Check transpose, matrix product, diagonal and axis sums against NumPy."""
        wide_np = np.array([[1.0, 2.0, 3.0],
                            [4.0, 5.0, 6.0]])
        wide = BlockMatrix.from_numpy(wide_np, block_size=2)
        square_np = np.array([[1.0, 2.0, 3.0],
                              [4.0, 5.0, 6.0],
                              [7.0, 8.0, 9.0]])
        square = BlockMatrix.from_numpy(square_np, block_size=2)

        row_np = np.array([[7.0, 8.0, 9.0]])
        row = BlockMatrix.from_numpy(row_np, block_size=2)

        eq = self._assert_eq

        # Transposes (the 2 x 3 case is asserted twice).
        eq(wide.T, wide_np.T)
        eq(wide.T, wide_np.T)
        eq(row.T, row_np.T)

        # A @ A.T, with the right operand given as a BlockMatrix and as an ndarray.
        eq(wide @ wide.T, wide_np @ wide_np.T)
        eq(wide @ wide_np.T, wide_np @ wide_np.T)
        eq(row @ row.T, row_np @ row_np.T)
        eq(row @ row_np.T, row_np @ row_np.T)

        # A.T @ A, again with both kinds of right operand.
        eq(wide.T @ wide, wide_np.T @ wide_np)
        eq(wide.T @ wide_np, wide_np.T @ wide_np)
        eq(row.T @ row, row_np.T @ row_np)
        eq(row.T @ row_np, row_np.T @ row_np)

        # 2 x 3 times 2 x 3: the inner dimensions do not match.
        with self.assertRaises(ValueError):
            wide @ wide
        with self.assertRaises(ValueError):
            wide @ wide_np

        # diagonal() is compared against a 1 x k array.
        eq(wide.diagonal(), np.array([[1.0, 5.0]]))
        eq(wide.T.diagonal(), np.array([[1.0, 5.0]]))
        eq((wide @ wide.T).diagonal(), np.array([[14.0, 77.0]]))

        # Axis sums, alone and combined with other operands.
        eq(wide.sum(axis=0).T, np.array([[5.0], [7.0], [9.0]]))
        eq(wide.sum(axis=1).T, np.array([[6.0, 15.0]]))
        eq(wide.sum(axis=0).T + row,
           np.array([[12.0, 13.0, 14.0],
                     [14.0, 15.0, 16.0],
                     [16.0, 17.0, 18.0]]))
        eq(wide.sum(axis=0) + row.T,
           np.array([[12.0, 14.0, 16.0],
                     [13.0, 15.0, 17.0],
                     [14.0, 16.0, 18.0]]))
        eq(square.sum(axis=0).T + square.sum(axis=1),
           np.array([[18.0], [30.0], [42.0]]))
Ejemplo n.º 40
0
    def test_sum_with_sparsify(self):
        """Compare sums of sparsified block matrices with their dense NumPy source."""
        nd = np.zeros(shape=(5, 7))
        for (row, col), value in [((2, 4), 1.0),
                                  ((2, 5), 2.0),
                                  ((3, 4), 3.0),
                                  ((3, 5), 4.0)]:
            nd[row, col] = value

        rectangle_sets = [
            [[2, 4, 4, 6]],
            [[2, 4, 4, 6], [0, 5, 0, 1]],
            [[2, 4, 4, 6], [0, 1, 0, 7]],
        ]
        # A fresh BlockMatrix is built from nd for every rectangle set.
        sparse_matrices = [
            BlockMatrix.from_numpy(nd, block_size=2).sparsify_rectangles(rects)
            for rects in rectangle_sets
        ]

        nd4 = np.zeros(shape=(5, 7))
        empty = BlockMatrix.fill(5, 7, value=0.0, block_size=2).sparsify_rectangles([])

        for sparse in sparse_matrices:
            self.assert_sums_agree(sparse, nd)
        self.assert_sums_agree(empty, nd4)
Ejemplo n.º 41
0
    def test_slicing(self):
        """Compare BlockMatrix indexing and slicing with NumPy on an 8 x 10 matrix."""
        nd = np.array(np.arange(0, 80, dtype=float)).reshape(8, 10)
        bm = BlockMatrix.from_numpy(nd, block_size=3)
        eq = self._assert_eq

        # Scalar lookups, including negative indices.
        for key in [(0, 0), (5, 7), (-3, 9), (-8, -10)]:
            eq(bm[key], nd[key])

        # Integer row with a column slice: NumPy drops the row axis, so it is
        # added back before comparing. Also check arithmetic with the full matrix.
        for row in (0, 1, -8, -1):
            key = (row, slice(3, 4))
            eq(bm[key], np.expand_dims(nd[key], 0))
            eq(bm[key] - bm, nd[key] - nd)
            eq(bm - bm[key], nd - nd[key])

        # Row slice with an integer column: same idea along the other axis.
        for col in (0, 1, -8, -1):
            key = (slice(3, 4), col)
            eq(bm[key], np.expand_dims(nd[key], 1))
            eq(bm[key] - bm, nd[key] - nd)
            eq(bm - bm[key], nd - nd[key])

        # Two-dimensional slices, each sliced once more afterwards.
        for key in [np.s_[0:8, 0:10],
                    np.s_[0:8:2, 0:10:2],
                    np.s_[2:4, 5:7],
                    np.s_[-8:-1, -10:-1],
                    np.s_[-8:-1:2, -10:-1:2],
                    np.s_[:4:1, :4:1],
                    np.s_[4:, 4:],
                    np.s_[:, :]]:
            eq(bm[key], nd[key])
            eq(bm[key][:, :2], nd[key][:, :2])
            eq(bm[key][:2, :], nd[key][:2, :])

        # Every key below must be rejected with ValueError.
        rejected_keys = [
            np.s_[0, ],                              # one-element tuple
            np.s_[9, 0], np.s_[-9, 0],               # row index outside the 8 rows
            np.s_[0, 11], np.s_[0, -11],             # column index outside the 10 columns
            np.s_[::-1, 0], np.s_[0, ::-1],          # negative step
            np.s_[:0, 0], np.s_[0, :0],              # empty slice
            np.s_[0:9, 0], np.s_[-9:, 0], np.s_[:-9, 0],    # row slice bounds past 8
            np.s_[0, :11], np.s_[0, -11:], np.s_[0, :-11],  # column slice bounds past 10
        ]
        for key in rejected_keys:
            with self.assertRaises(ValueError):
                bm[key]
Ejemplo n.º 42
0
    def test_sparsify_blocks(self):
        """Compare _sparsify_blocks with the sparsify_numpy reference helper."""

        def sparsify_both(side, block_size, block_list):
            # Build a side x side matrix holding 0..side*side-1, sparsify it
            # both ways, check they agree and hand back the NumPy result.
            dense = np.arange(side * side, dtype=np.float64).reshape((side, side))
            block_matrix = BlockMatrix.from_numpy(dense, block_size=block_size)
            block_matrix = block_matrix._sparsify_blocks(block_list)
            reference = sparsify_numpy(dense, block_size, block_list)
            assert np.array_equal(block_matrix.to_numpy(), reference)
            return reference

        small_reference = sparsify_both(4, 2, [1, 2])
        # For the small case the reference itself is pinned to a literal.
        assert np.array_equal(
            small_reference,
            np.array([[0, 0, 2, 3],
                      [0, 0, 6, 7],
                      [8, 9, 0, 0],
                      [12, 13, 0, 0]]))

        sparsify_both(15, 4, [4, 8, 10, 12, 13, 14])
Ejemplo n.º 43
0
def get_toy_R(M, n_blocks, identity=False):
    r'''
    Builds a "toy" LD matrix for testing, as a list of `n_blocks` Hail Block
    Matrices. Block sizes come from splitting `M` SNP indices with
    `np.array_split`. With `identity` set, every block is an identity matrix;
    otherwise each block is a random outer product whose diagonal is set to 1.
    '''
    def toy_block(size):
        if identity:
            return BlockMatrix.from_numpy(np.identity(n=size))
        # An odd exponent keeps the sign of each draw while shrinking its
        # magnitude, which avoids highly correlated SNPs.
        loadings = np.random.uniform(low=-1, high=1, size=(size, 1)) ** 11
        block = loadings @ loadings.T
        np.fill_diagonal(block, 1)
        return BlockMatrix.from_numpy(block)

    index_groups = np.array_split(range(M), n_blocks)
    return [toy_block(len(group)) for group in index_groups]
Ejemplo n.º 44
0
    def test_sparsify_rectangles(self):
        """Check sparsify_rectangles with several rectangles and with none."""
        dense = np.array([[1.0, 2.0, 3.0, 4.0],
                          [5.0, 6.0, 7.0, 8.0],
                          [9.0, 10.0, 11.0, 12.0],
                          [13.0, 14.0, 15.0, 16.0]])
        block_matrix = BlockMatrix.from_numpy(dense, block_size=2)

        rectangles = [[0, 1, 0, 1], [0, 3, 0, 2], [1, 2, 0, 4]]
        # Only the bottom-right 2 x 2 block is expected to come back as zeros.
        expected = np.array([[1., 2., 3., 4.],
                             [5., 6., 7., 8.],
                             [9., 10., 0., 0.],
                             [13., 14., 0., 0.]])
        self._assert_eq(block_matrix.sparsify_rectangles(rectangles), expected)

        # No rectangles at all: everything is dropped.
        self._assert_eq(block_matrix.sparsify_rectangles([]), np.zeros(shape=(4, 4)))
Ejemplo n.º 45
0
    def test_export_rectangles(self):
        """Export rectangles of a written block matrix as text and as binary.

        For several rectangle lists and block sizes, every exported file must
        hold the matching slice ``nd[r[0]:r[1], r[2]:r[3]]``. Exporting a
        rectangle that lies in a block dropped by ``sparsify_rectangles`` must
        raise ``FatalError`` with a message naming the missing block.
        """
        nd = np.arange(0, 80, dtype=float).reshape(8, 10)

        rects1 = [[0, 1, 0, 1], [4, 5, 7, 8]]

        rects2 = [[4, 5, 0, 10], [0, 8, 4, 5]]

        rects3 = [[0, 1, 0, 1], [1, 2, 1, 2], [2, 3, 2, 3],
                  [3, 5, 3, 6], [3, 6, 3, 7], [3, 7, 3, 8],
                  [4, 5, 0, 10], [0, 8, 4, 5], [0, 8, 0, 10]]

        for rects in [rects1, rects2, rects3]:
            for block_size in [3, 4, 10]:
                bm_uri = new_temp_file()

                rect_path = new_local_temp_dir()
                rect_uri = local_path_uri(rect_path)

                (BlockMatrix.from_numpy(nd, block_size=block_size)
                    .sparsify_rectangles(rects)
                    .write(bm_uri, force_row_major=True))

                # Text export: one file per rectangle, parsed with loadtxt.
                BlockMatrix.export_rectangles(bm_uri, rect_uri, rects)

                for (i, r) in enumerate(rects):
                    rect_file = rect_path + '/rect-' + str(i) + '_' + '-'.join(map(str, r))
                    expected = nd[r[0]:r[1], r[2]:r[3]]
                    # ndmin=2 keeps single-row/column rectangles two-dimensional.
                    actual = np.loadtxt(rect_file, ndmin=2)
                    self._assert_eq(expected, actual)

                rect_path_bytes = new_local_temp_dir()
                rect_uri_bytes = local_path_uri(rect_path_bytes)

                # Binary export: raw values, reshaped to the rectangle's shape.
                BlockMatrix.export_rectangles(bm_uri, rect_uri_bytes, rects, binary=True)

                for (i, r) in enumerate(rects):
                    rect_file = rect_path_bytes + '/rect-' + str(i) + '_' + '-'.join(map(str, r))
                    expected = nd[r[0]:r[1], r[2]:r[3]]
                    actual = np.reshape(np.fromfile(rect_file), (r[1] - r[0], r[3] - r[2]))
                    self._assert_eq(expected, actual)

        bm_uri = new_temp_file()
        rect_uri = new_temp_file()

        # With block_size=5 only block (0, 0) is kept, so a rectangle inside
        # block (1, 1) cannot be exported.
        (BlockMatrix.from_numpy(nd, block_size=5)
            .sparsify_rectangles([[0, 1, 0, 1]])
            .write(bm_uri, force_row_major=True))

        with self.assertRaises(FatalError) as e:
            BlockMatrix.export_rectangles(bm_uri, rect_uri, [[5, 6, 5, 6]])
        # The message is checked after the ``with`` block: a statement placed
        # after the raising call inside the block would never execute. The
        # raised exception is available as ``e.exception``.
        # NOTE(review): confirm this wording against the message the backend emits.
        self.assertIn('block (1, 1) missing for rectangle 0 with bounds [5, 6, 5, 6]',
                      str(e.exception))
Ejemplo n.º 46
0
    def test_promote(self):
        """Check which operand pairs _promote accepts and which it rejects."""
        nx = np.matrix([[2.0]])                              # 1 x 1
        nc = np.matrix([[1.0], [2.0]])                       # 2 x 1 column
        nr = np.matrix([[1.0, 2.0, 3.0]])                    # 1 x 3 row
        nm = np.matrix([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])   # 2 x 3

        e = 2  # plain Python number
        x, c, r, m = (BlockMatrix.from_numpy(matrix) for matrix in (nx, nc, nr, nm))

        nct, nrt, nmt = nc.T, nr.T, nm.T
        ct, rt, mt = c.T, r.T, m.T

        # Pairs that must promote without raising: BlockMatrix operands first,
        # then the same shapes given as NumPy matrices.
        accepted_blocks = [
            (x, x), (x, c), (x, r), (x, m), (x, e),
            (c, x), (c, c), (c, m), (c, e),
            (r, x), (r, r), (r, m), (r, e),
            (m, x), (m, c), (m, r), (m, m), (m, e),
        ]
        accepted_numpy = [
            (x, nx), (x, nc), (x, nr), (x, nm),
            (c, nx), (c, nc), (c, nm),
            (r, nx), (r, nr), (r, nm),
            (m, nx), (m, nc), (m, nr), (m, nm),
        ]

        # Pairs that must raise ValueError, in the same two groups.
        rejected_blocks = [
            (c, r), (r, c), (c, ct), (r, rt),
            (c, rt), (c, mt), (ct, r), (ct, m),
            (r, ct), (r, mt), (rt, c), (rt, m),
            (m, ct), (m, rt), (m, mt), (mt, c), (mt, r), (mt, m),
        ]
        rejected_numpy = [
            (c, nr), (r, nc), (c, nct), (r, nrt),
            (c, nrt), (c, nmt), (ct, nr), (ct, nm),
            (r, nct), (r, nmt), (rt, nc), (rt, nm),
            (m, nct), (m, nrt), (m, nmt), (mt, nc), (mt, nr), (mt, nm),
        ]

        for left, right in accepted_blocks + accepted_numpy:
            left._promote(right, '')

        for left, right in rejected_blocks + rejected_numpy:
            with self.assertRaises(ValueError):
                left._promote(right, '')
Ejemplo n.º 47
0
    def test_export_rectangles_filtered(self):
        """Export rectangles from a sliced block matrix and compare with the slice."""
        out_dir = new_local_temp_dir()
        out_uri = local_path_uri(out_dir)
        full = np.array([[1.0, 2.0, 3.0, 4.0],
                         [5.0, 6.0, 7.0, 8.0],
                         [9.0, 10.0, 11.0, 12.0],
                         [13.0, 14.0, 15.0, 16.0]])
        # Keep only the central 2 x 2 window; the rectangles index into it.
        window = BlockMatrix.from_numpy(full)[1:3, 1:3]
        rectangles = [[0, 1, 0, 2], [1, 2, 0, 2]]
        window.export_rectangles(out_uri, rectangles)

        expected = np.array([[6.0, 7.0],
                             [10.0, 11.0]])

        self._assert_rectangles_eq(expected, out_dir, rectangles)
Ejemplo n.º 48
0
    def test_promote(self):
        """_promote must accept compatible operands and raise ValueError otherwise."""
        nx = np.matrix([[2.0]])
        nc = np.matrix([[1.0], [2.0]])
        nr = np.matrix([[1.0, 2.0, 3.0]])
        nm = np.matrix([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])

        e = 2
        x = BlockMatrix.from_numpy(nx)
        c = BlockMatrix.from_numpy(nc)
        r = BlockMatrix.from_numpy(nr)
        m = BlockMatrix.from_numpy(nm)

        # Transposed variants, used only in the rejected pairs.
        nct, nrt, nmt = nc.T, nr.T, nm.T
        ct, rt, mt = c.T, r.T, m.T

        # Each left operand with the right operands it must accept.
        compatible = [
            (x, [x, c, r, m, e]),
            (c, [x, c, m, e]),
            (r, [x, r, m, e]),
            (m, [x, c, r, m, e]),
            (x, [nx, nc, nr, nm]),
            (c, [nx, nc, nm]),
            (r, [nx, nr, nm]),
            (m, [nx, nc, nr, nm]),
        ]
        for lhs, candidates in compatible:
            for rhs in candidates:
                lhs._promote(rhs, '')

        incompatible = [
            (c, r), (r, c), (c, ct), (r, rt),
            (c, rt), (c, mt), (ct, r), (ct, m),
            (r, ct), (r, mt), (rt, c), (rt, m),
            (m, ct), (m, rt), (m, mt), (mt, c), (mt, r), (mt, m),
            (c, nr), (r, nc), (c, nct), (r, nrt),
            (c, nrt), (c, nmt), (ct, nr), (ct, nm),
            (r, nct), (r, nmt), (rt, nc), (rt, nm),
            (m, nct), (m, nrt), (m, nmt), (mt, nc), (mt, nr), (mt, nm),
        ]
        for lhs, rhs in incompatible:
            with self.assertRaises(ValueError):
                lhs._promote(rhs, '')
Ejemplo n.º 49
0
    def test_to_table(self):
        """to_table_row_major must match a parallelized table for every setting."""
        n_rows, n_cols = 5, 2
        schema = hl.tstruct(row_idx=hl.tint64, entries=hl.tarray(hl.tfloat64))
        # Row r holds the values n_cols * r .. n_cols * r + n_cols - 1 as floats.
        rows = [{'row_idx': r,
                 'entries': [float(n_cols * r + c) for c in range(n_cols)]}
                for r in range(n_rows)]

        for n_partitions in (1, 2, 3):
            for block_size in (1, 2, 5):
                expected = hl.Table.parallelize(rows, schema, 'row_idx', n_partitions)
                values = [float(i) for i in range(n_rows * n_cols)]
                bm = BlockMatrix._create(n_rows, n_cols, values, block_size)
                actual = bm.to_table_row_major(n_partitions)
                self.assertTrue(expected._same(actual))
Ejemplo n.º 50
0
    def test_slicing(self):
        """Check BlockMatrix element access, slicing and rejected keys against NumPy."""
        nd = np.array(np.arange(0, 80, dtype=float)).reshape(8, 10)
        bm = BlockMatrix.from_numpy(nd, block_size=3)

        # Single elements, with positive and negative indices.
        element_keys = [(0, 0), (5, 7), (-3, 9), (-8, -10)]
        for key in element_keys:
            self._assert_eq(bm[key], nd[key])

        # An integer on one axis and a slice on the other: the NumPy result
        # gets the dropped axis re-inserted before comparing.
        column_window = slice(3, 4)
        for row in (0, 1, -8, -1):
            key = (row, column_window)
            self._assert_eq(bm[key], np.expand_dims(nd[key], 0))

        row_window = slice(3, 4)
        for col in (0, 1, -8, -1):
            key = (row_window, col)
            self._assert_eq(bm[key], np.expand_dims(nd[key], 1))

        # Slices on both axes.
        slice_keys = [
            (slice(0, 8), slice(0, 10)),
            (slice(0, 8, 2), slice(0, 10, 2)),
            (slice(2, 4), slice(5, 7)),
            (slice(-8, -1), slice(-10, -1)),
            (slice(-8, -1, 2), slice(-10, -1, 2)),
            (slice(None, 4, 1), slice(None, 4, 1)),
            (slice(4, None), slice(4, None)),
            (slice(None, None), slice(None, None)),
        ]
        for key in slice_keys:
            self._assert_eq(bm[key], nd[key])

        # Keys that must raise ValueError, grouped by what is wrong with them.
        rejected_keys = [
            # one-element tuple
            (0,),
            # integer index out of range
            (9, 0), (-9, 0), (0, 11), (0, -11),
            # negative step
            (slice(None, None, -1), 0), (0, slice(None, None, -1)),
            # empty slice
            (slice(None, 0), 0), (0, slice(None, 0)),
            # row slice bound out of range
            (slice(0, 9), 0), (slice(-9, None), 0), (slice(None, -9), 0),
            # column slice bound out of range
            (0, slice(None, 11)), (0, slice(-11, None)), (0, slice(None, -11)),
        ]
        for key in rejected_keys:
            with self.assertRaises(ValueError):
                bm[key]
Ejemplo n.º 51
0
    def test_slices_with_sparsify(self):
        """Index and slice a matrix sparsified down to two entries of row 0."""
        n_rows, n_cols = 8, 10
        dense = np.array(np.arange(0, 80, dtype=float)).reshape(n_rows, n_cols)
        bm = BlockMatrix.from_numpy(dense, block_size=3)

        # Row 0 keeps columns [0, 2); every other row gets the empty interval [0, 0).
        starts = [0] * n_rows
        stops = [2] + [0] * (n_rows - 1)
        sparse = bm.sparsify_row_intervals(starts, stops)

        for col, value in [(1, 1.0), (2, 0.0), (9, 0.0)]:
            self.assertEqual(sparse[0, col], value)

        # The kept entries are dense[0, 0] == 0.0 and dense[0, 1] == 1.0.
        expected = np.zeros(shape=(n_rows, n_cols))
        expected[0, 1] = 1.0
        self._assert_eq(sparse[:, :], expected)

        self._assert_eq(sparse[:, 1], expected[:, 1:2])
        self._assert_eq(sparse[1, :], expected[1:2, :])
        self._assert_eq(sparse[0:5, 0:5], expected[0:5, 0:5])
Ejemplo n.º 52
0
    def test_sparsify_rectangles(self):
        """sparsify_rectangles keeps the listed regions; an empty list keeps nothing."""
        source = np.array([[1.0, 2.0, 3.0, 4.0],
                           [5.0, 6.0, 7.0, 8.0],
                           [9.0, 10.0, 11.0, 12.0],
                           [13.0, 14.0, 15.0, 16.0]])
        bm = BlockMatrix.from_numpy(source, block_size=2)

        kept = bm.sparsify_rectangles([[0, 1, 0, 1], [0, 3, 0, 2], [1, 2, 0, 4]])
        self._assert_eq(
            kept,
            np.array([[1., 2., 3., 4.],
                      [5., 6., 7., 8.],
                      [9., 10., 0., 0.],
                      [13., 14., 0., 0.]]))

        nothing = bm.sparsify_rectangles([])
        self._assert_eq(nothing, np.zeros(shape=(4, 4)))
Ejemplo n.º 53
0
    def test_export_rectangles_filtered(self):
        """Rectangles exported from bm[1:3, 1:3] must match that 2 x 2 sub-matrix."""
        target_dir = new_local_temp_dir()
        target_uri = local_path_uri(target_dir)
        values = np.array([[1.0, 2.0, 3.0, 4.0],
                           [5.0, 6.0, 7.0, 8.0],
                           [9.0, 10.0, 11.0, 12.0],
                           [13.0, 14.0, 15.0, 16.0]])
        whole = BlockMatrix.from_numpy(values)
        inner = whole[1:3, 1:3]
        # One rectangle per row of the 2 x 2 slice.
        rects = [[0, 1, 0, 2], [1, 2, 0, 2]]
        inner.export_rectangles(target_uri, rects)

        expected = np.array([[6.0, 7.0], [10.0, 11.0]])

        self._assert_rectangles_eq(expected, target_dir, rects)
Ejemplo n.º 54
0
    def test_block_matrix_entries(self):
        """entries() must yield one (i, j, entry) row per matrix element."""
        n_rows, n_cols = 5, 3
        # Entry (i, j) holds float(i + j); rows are listed in row-major order.
        rows = [{'i': i, 'j': j, 'entry': float(i + j)}
                for i in range(n_rows)
                for j in range(n_cols)]
        schema = hl.tstruct(i=hl.tint32, j=hl.tint32, entry=hl.tfloat64)
        structs = [hl.struct(i=row['i'], j=row['j'], entry=row['entry']) for row in rows]
        table = hl.Table.parallelize(structs, schema)
        # The table is built with int32 indices, then re-annotated as int64 and keyed.
        table = table.annotate(i=hl.int64(table.i), j=hl.int64(table.j))
        table = table.key_by('i', 'j')

        ndarray = np.reshape([row['entry'] for row in rows], (n_rows, n_cols))

        for block_size in (1, 2, 1024):
            entries_table = BlockMatrix.from_numpy(ndarray, block_size).entries()
            self.assertEqual(entries_table.count(), n_cols * n_rows)
            self.assertEqual(len(entries_table.row), 3)
            self.assertTrue(table._same(entries_table))
Ejemplo n.º 55
0
    def test_export_rectangles_sparse(self):
        """Export rectangles from a sparsified block matrix, including a dropped region."""
        out_dir = new_local_temp_dir()
        out_uri = local_path_uri(out_dir)
        dense = np.array([[1.0, 2.0, 3.0, 4.0],
                          [5.0, 6.0, 7.0, 8.0],
                          [9.0, 10.0, 11.0, 12.0],
                          [13.0, 14.0, 15.0, 16.0]])
        block_matrix = BlockMatrix.from_numpy(dense, block_size=2)

        kept = [[0, 1, 0, 1], [0, 3, 0, 2], [1, 2, 0, 4]]
        # The exported list adds [2, 4, 2, 4], where the expected values are all zero.
        exported = [[0, 1, 0, 1], [0, 3, 0, 2], [1, 2, 0, 4], [2, 4, 2, 4]]
        sparse = block_matrix.sparsify_rectangles(kept)
        sparse.export_rectangles(out_uri, exported)

        expected = np.array([[1.0, 2.0, 3.0, 4.0],
                             [5.0, 6.0, 7.0, 8.0],
                             [9.0, 10.0, 0.0, 0.0],
                             [13.0, 14.0, 0.0, 0.0]])

        self._assert_rectangles_eq(expected, out_dir, exported)
Ejemplo n.º 56
0
    def test_to_from_numpy(self):
        """Round-trip a matrix and its transpose through NumPy arrays and binary files."""
        n_rows = 10
        n_cols = 11
        flat = np.random.rand(n_rows * n_cols)

        bm = BlockMatrix._create(n_rows, n_cols, flat.tolist(), block_size=4)
        a = flat.reshape((n_rows, n_cols))

        with tempfile.NamedTemporaryFile() as bm_f, tempfile.NamedTemporaryFile() as a_f:
            bm.tofile(bm_f.name)
            a.tofile(a_f.name)

            # Each route must reproduce `a`: direct conversion, from_numpy,
            # and both file formats read by the other library.
            round_trips = [
                bm.to_numpy(),
                BlockMatrix.from_numpy(a, block_size=5).to_numpy(),
                np.fromfile(bm_f.name).reshape((n_rows, n_cols)),
                BlockMatrix.fromfile(a_f.name, n_rows, n_cols, block_size=3).to_numpy(),
                BlockMatrix.fromfile(bm_f.name, n_rows, n_cols).to_numpy(),
            ]
            for result in round_trips:
                self._assert_eq(result, a)

        bmt = bm.T
        at = a.T

        with tempfile.NamedTemporaryFile() as bmt_f, tempfile.NamedTemporaryFile() as at_f:
            bmt.tofile(bmt_f.name)
            at.tofile(at_f.name)

            # Same routes for the transposes, with the dimensions swapped.
            transposed_round_trips = [
                bmt.to_numpy(),
                BlockMatrix.from_numpy(at).to_numpy(),
                np.fromfile(bmt_f.name).reshape((n_cols, n_rows)),
                BlockMatrix.fromfile(at_f.name, n_cols, n_rows).to_numpy(),
                BlockMatrix.fromfile(bmt_f.name, n_cols, n_rows).to_numpy(),
            ]
            for result in transposed_round_trips:
                self._assert_eq(result, at)

        self._assert_eq(bm.to_numpy(_force_blocking=True), a)
Ejemplo n.º 57
0
    def test_from_entry_expr(self):
        """from_entry_expr must give the same matrix for equivalent entry expressions."""
        dataset = get_dataset()
        dataset = dataset.annotate_entries(
            x=hl.or_else(dataset.GT.n_alt_alleles(), 0)).cache()

        def to_array(entry_expr):
            return BlockMatrix.from_entry_expr(entry_expr, block_size=32).to_numpy()

        # The same values as an inline expression, as the annotated field,
        # and as that field cast to float64.
        from_inline = to_array(hl.or_else(dataset.GT.n_alt_alleles(), 0))
        from_field = to_array(dataset.x)
        from_float = to_array(hl.float64(dataset.x))

        self._assert_eq(from_inline, from_field)
        self._assert_eq(from_inline, from_float)

        # Writing straight to disk and reading back must agree as well.
        path = new_temp_file()
        BlockMatrix.write_from_entry_expr(dataset.x, path, block_size=32)
        from_disk = BlockMatrix.read(path).to_numpy()
        self._assert_eq(from_inline, from_disk)
Ejemplo n.º 58
0
    def test_rectangles_to_numpy(self):
        """rectangles_to_numpy must rebuild exported text and binary rectangles."""
        dense = np.array([[1.0, 2.0, 3.0],
                          [4.0, 5.0, 6.0],
                          [7.0, 8.0, 9.0]])

        # Column 0 of all three rows, plus the first two columns of row 1.
        rectangles = [[0, 3, 0, 1], [1, 2, 0, 2]]

        text_dir = new_local_temp_dir()
        text_uri = local_path_uri(text_dir)
        BlockMatrix.from_numpy(dense).export_rectangles(text_uri, rectangles)

        binary_dir = new_local_temp_dir()
        binary_uri = local_path_uri(binary_dir)
        BlockMatrix.from_numpy(dense).export_rectangles(binary_uri, rectangles, binary=True)

        # Entries outside the exported rectangles are expected to be zero.
        expected = np.array([[1.0, 0.0],
                             [4.0, 5.0],
                             [7.0, 0.0]])
        from_text = BlockMatrix.rectangles_to_numpy(text_dir)
        self._assert_eq(expected, from_text)
        from_binary = BlockMatrix.rectangles_to_numpy(binary_dir, binary=True)
        self._assert_eq(expected, from_binary)
Ejemplo n.º 59
0
    def test_linear_mixed_model_fastlmm(self):
        """Compare ``LinearMixedModel`` against reference values from FaST-LMM.

        For chromosome 1 the test checks the fitted ``h_sq``, its standard
        error, the normalized likelihood over ``h_sq``, and the per-variant
        ``beta`` / ``p_value`` for five chromosome 3 variants, in both the
        full-rank and low-rank formulations and through both
        ``fit_alternatives_numpy`` and ``fit_alternatives``. For chromosome 3
        only ``h_sq``, its standard error and the normalized likelihood are
        checked.
        """
        # FastLMM Test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
        #
        # Data is filtered to chromosome 1,3 and samples 0-124,375-499 (2000 variants and 250 samples)
        #
        # Results are computed with single_snp (with LOCO) as in:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

        n, m = 250, 1000  # per chromosome

        # Covariate and phenotype tables, keyed by their first column so they
        # can be joined onto the columns of the genotype MatrixTable.
        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

        # Design matrix: a column of ones (intercept) next to the covariate.
        x = np.array([np.ones(n), mt.key_cols_by()['x'].collect()]).T
        y = np.array(mt.key_cols_by()['y'].collect())

        mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
        mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

        # testing chrom 1 for h2, betas, p-values
        h2_fastlmm = 0.14276125
        beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

        # FastLMM p-values do not agree to high precision because FastLMM regresses
        # out x from each SNP first and does an F(1, dof)-test on (beta / se)^2
        # (t-test), whereas Hail does likelihood ratio test.
        # We verify below that Hail's p-values remain fixed going forward.
        # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
        pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

        # gamma corresponding to the reference h2; its log is passed to
        # fit(log_gamma=...) below.
        gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)

        # Transposed so that rows are samples and columns are variants.
        g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
        # Helper is defined elsewhere in the test class; presumably drops
        # constant columns and standardizes the rest -- confirm there.
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        # k is an n x n sample-by-sample matrix (presumably the kinship /
        # realized relationship matrix); its eigenvectors give the rotation p.
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.13770773  # hard coded having checked against plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        # The first and last entries are dropped, so index i of the slice is
        # index i + 1 of the full array; hence the "+ 1" when comparing the
        # argmax with int(100 * h2). NOTE(review): this looks like a grid over
        # h2 in steps of 0.01 -- confirm against h_sq_normalized_lkhd.
        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        mt3_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
        a = BlockMatrix.from_entry_expr(mt3_chr3_5var.GT.n_alt_alleles()).to_numpy().T

        # FastLMM standardizes each variant to have mean 0 and variance 1.
        a = self._filter_and_standardize_cols(a) * np.sqrt(n)
        pa = p @ a

        # Refit with gamma pinned to the FaST-LMM value before testing the
        # alternative variants.
        model.fit(log_gamma=np.log(gamma_fastlmm))

        # In-memory (numpy) path.
        res = model.fit_alternatives_numpy(pa, return_pandas=True)

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # Same check through the path-based API: the rotated variants are
        # written transposed as a row-major BlockMatrix and passed by path.
        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # low rank
        # Eigendecompose the variant-by-variant matrix instead and keep only
        # eigenvalues above 1e-10. np.linalg.eigh returns eigenvalues in
        # ascending order, so the trailing slices keep the largest ones.
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        # The low-rank model is additionally given the unrotated y and x.
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        model.fit(log_gamma=np.log(gamma_fastlmm))

        # Low-rank alternatives need both the rotated (pa) and unrotated (a)
        # variants.
        pa = p @ a
        res = model.fit_alternatives_numpy(pa, a, return_pandas=True)

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        a_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # testing chrom 3 for h2
        h2_fastlmm = 0.36733240

        g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.17409641  # hard coded having checked against plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        # Same likelihood-grid checks as for chromosome 1 above.
        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        # low rank
        # Same construction as the chromosome 1 low-rank case (there the
        # matrix is named `ld`).
        l = g_std.T @ g_std
        sl, v = np.linalg.eigh(l)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)
Ejemplo n.º 60
0
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.

    The squared correlations are adjusted using the number of columns
    ``n`` of that MatrixTable; the adjustment divides by ``n - 2``, so
    the MatrixTable needs more than two columns.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.
        Used for both the correlation matrix and the annotation matrix.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_exprs`. The function will always return LD scores for
        the univariate (all SNPs) annotation.

    Raises
    ------
    ValueError
        If the expressions do not all come from the same MatrixTable."""

    mt = entry_expr._indices.source

    # Normalize once: a single expression becomes a one-element list, and
    # None / an empty list both mean "univariate only".
    annotations = wrap_to_list(annotation_exprs) if annotation_exprs else []

    # Every supplied expression must be indexed by the MatrixTable that
    # entry_expr comes from.
    sources = [locus_expr._indices.source]
    if coord_expr is not None:
        sources.append(coord_expr._indices.source)
    sources.extend(x._indices.source for x in annotations)

    if not all(mt == source for source in sources):
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    # Squared row correlation, then the sample-size adjustment in n.
    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    # Keep only the entries inside each row's window.
    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    # Write the sparsified matrix out and read it back so later steps start
    # from the stored result.
    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotations:
        cols = ['univariate']
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*annotations).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        # Reshape the annotation columns into long (name, value) form and
        # pivot them into a variants-by-annotations MatrixTable.
        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()

        # The annotation matrix is multiplied with r2_adj_sparse below, so it
        # is written with the same block_size that was given to
        # row_correlation instead of silently falling back to the default.
        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp,
                                          block_size=block_size)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    # Round-trip the scores through a TSV export to get them into a Table.
    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    # Imported without a header, the columns are named f0, f1, ...; rename
    # them to the annotation names in the same order.
    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): name
                                  for i, name in enumerate(cols)})

    # Join the scores back to the loci by row index.
    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht