def test_matrix_ops(self):
    """Transpose, matrix products, and diagonal on BlockMatrix agree with numpy.

    Fix: the original asserted ``m.T == nm.T`` twice in a row; the duplicate
    assertion has been removed.
    """
    nm = np.matrix([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    m = BlockMatrix.from_numpy(nm, block_size=2)
    nrow = np.matrix([[7.0, 8.0, 9.0]])
    row = BlockMatrix.from_numpy(nrow, block_size=2)

    # Transpose of a rectangular matrix and of a row vector.
    self._assert_eq(m.T, nm.T)
    self._assert_eq(row.T, nrow.T)

    # Products with BlockMatrix and with numpy operands on either side.
    self._assert_eq(m @ m.T, nm @ nm.T)
    self._assert_eq(m @ nm.T, nm @ nm.T)
    self._assert_eq(row @ row.T, nrow @ nrow.T)
    self._assert_eq(row @ nrow.T, nrow @ nrow.T)
    self._assert_eq(m.T @ m, nm.T @ nm)
    self._assert_eq(m.T @ nm, nm.T @ nm)
    self._assert_eq(row.T @ row, nrow.T @ nrow)
    self._assert_eq(row.T @ nrow, nrow.T @ nrow)

    # Inner dimensions must match, otherwise ValueError.
    self.assertRaises(ValueError, lambda: m @ m)
    self.assertRaises(ValueError, lambda: m @ nm)

    # Diagonal of a 2x3 matrix has min(2, 3) == 2 entries.
    self._assert_eq(m.diagonal(), np.array([1.0, 5.0]))
    self._assert_eq(m.T.diagonal(), np.array([1.0, 5.0]))
    self._assert_eq((m @ m.T).diagonal(), np.array([14.0, 77.0]))
def test_stage_locally(self):
    """Writing with stage_locally=True must round-trip through read unchanged."""
    expected = np.arange(0, 80, dtype=float).reshape(8, 10)
    path = new_temp_file()
    block_matrix = BlockMatrix.from_numpy(expected, block_size=3)
    block_matrix.write(path, stage_locally=True)
    self._assert_eq(expected, BlockMatrix.read(path))
def test_from_entry_expr_options(self):
    """Check from_entry_expr's mean_impute/center/normalize options against numpy."""

    def build_mt(a):
        # Single-row matrix table with three columns holding the entries of `a`,
        # reordered back to s = 0, 1, 2 via choose_cols.
        data = [{'v': 0, 's': 0, 'x': a[0]},
                {'v': 0, 's': 1, 'x': a[1]},
                {'v': 0, 's': 2, 'x': a[2]}]
        ht = hl.Table.parallelize(data, hl.dtype('struct{v: int32, s: int32, x: float64}'))
        mt = ht.to_matrix_table(['v'], ['s'])
        ids = mt.key_cols_by()['s'].collect()
        return mt.choose_cols([ids.index(0), ids.index(1), ids.index(2)])

    def check(expr, mean_impute, center, normalize, expected):
        # Squeeze the 1 x 3 BlockMatrix down to a vector for comparison.
        actual = np.squeeze(BlockMatrix.from_entry_expr(expr,
                                                        mean_impute=mean_impute,
                                                        center=center,
                                                        normalize=normalize).to_numpy())
        assert np.allclose(actual, expected)

    a = np.array([0.0, 1.0, 2.0])
    mt = build_mt(a)

    # No missing data: option combinations match direct numpy arithmetic.
    check(mt.x, False, False, False, a)
    check(mt.x, False, True, False, a - 1.0)
    check(mt.x, False, False, True, a / np.sqrt(5))
    check(mt.x, False, True, True, (a - 1.0) / np.sqrt(2))
    check(mt.x + 1 - 1, False, False, False, a)

    # Missing middle entry: mean imputation restores the original values.
    mt = build_mt([0.0, hl.null('float64'), 2.0])
    check(mt.x, True, False, False, a)
    check(mt.x, True, True, False, a - 1.0)
    check(mt.x, True, False, True, a / np.sqrt(5))
    check(mt.x, True, True, True, (a - 1.0) / np.sqrt(2))

    # Missing data without mean_impute must raise.
    with self.assertRaises(Exception):
        BlockMatrix.from_entry_expr(mt.x)
def test_sum(self):
    """Full and per-axis sums agree with numpy, including on sparsified matrices."""

    def sums_agree(bm, nd):
        # Per-axis sums use keepdims=True to match BlockMatrix's 2-D row/col vectors.
        self.assertAlmostEqual(bm.sum(), np.sum(nd))
        self._assert_close(bm.sum(axis=0), np.sum(nd, axis=0, keepdims=True))
        self._assert_close(bm.sum(axis=1), np.sum(nd, axis=1, keepdims=True))

    nd = np.random.normal(size=(11, 13))
    bm = BlockMatrix.from_numpy(nd, block_size=3)

    # All nonzeros lie inside rectangle [2,4) x [4,6), so each sparsification
    # below keeps them and the sums still match the dense array.
    nd2 = np.zeros(shape=(5, 7))
    nd2[2, 4] = 1.0
    nd2[2, 5] = 2.0
    nd2[3, 4] = 3.0
    nd2[3, 5] = 4.0
    bm2 = BlockMatrix.from_numpy(nd2, block_size=2).sparsify_rectangles([[2, 4, 4, 6]])
    bm3 = BlockMatrix.from_numpy(nd2, block_size=2).sparsify_rectangles([[2, 4, 4, 6], [0, 5, 0, 1]])
    bm4 = BlockMatrix.from_numpy(nd2, block_size=2).sparsify_rectangles([[2, 4, 4, 6], [0, 1, 0, 7]])

    # Fully sparse matrix (no blocks kept) of zeros.
    nd5 = np.zeros(shape=(5, 7))
    bm5 = BlockMatrix.fill(5, 7, value=0.0, block_size=2).sparsify_rectangles([])

    sums_agree(bm, nd)
    sums_agree(bm2, nd2)
    sums_agree(bm3, nd2)
    sums_agree(bm4, nd2)
    sums_agree(bm5, nd5)
def test_sparsify_band(self):
    """sparsify_band zeroes entries (or whole blocks) outside a diagonal band."""
    nd = np.array([[ 1.0,  2.0,  3.0,  4.0],
                   [ 5.0,  6.0,  7.0,  8.0],
                   [ 9.0, 10.0, 11.0, 12.0],
                   [13.0, 14.0, 15.0, 16.0]])
    bm = BlockMatrix.from_numpy(nd, block_size=2)

    # Element-level band: keep entries with -1 <= j - i <= 2.
    self._assert_eq(
        bm.sparsify_band(lower=-1, upper=2),
        np.array([[ 1.,  2.,  3.,  0.],
                  [ 5.,  6.,  7.,  8.],
                  [ 0., 10., 11., 12.],
                  [ 0.,  0., 15., 16.]]))

    # blocks_only=True keeps every entry of any block intersecting the band.
    self._assert_eq(
        bm.sparsify_band(lower=0, upper=0, blocks_only=True),
        np.array([[ 1.,  2.,  0.,  0.],
                  [ 5.,  6.,  0.,  0.],
                  [ 0.,  0., 11., 12.],
                  [ 0.,  0., 15., 16.]]))

    # Compare against a numpy band mask for a range of (lower, upper) bounds.
    nd2 = np.arange(0, 80, dtype=float).reshape(8, 10)
    bm2 = BlockMatrix.from_numpy(nd2, block_size=3)

    for bounds in [[0, 0], [1, 1], [2, 2], [-5, 5], [-7, 0], [0, 9], [-100, 100]]:
        lower, upper = bounds
        actual = bm2.sparsify_band(lower, upper, blocks_only=False).to_numpy()
        # Mask is 1 where lower <= j - i <= upper (product of two boolean arrays).
        mask = np.fromfunction(lambda i, j: (lower <= j - i) * (j - i <= upper), (8, 10))
        self._assert_eq(actual, nd2 * mask)
def test_fill(self):
    """BlockMatrix.fill produces an all-ones matrix with the requested block size."""
    expected = np.ones((3, 5))
    default_bm = BlockMatrix.fill(3, 5, 1.0)
    small_block_bm = BlockMatrix.fill(3, 5, 1.0, block_size=2)

    self.assertTrue(default_bm.block_size == BlockMatrix.default_block_size())
    self.assertTrue(small_block_bm.block_size == 2)
    self._assert_eq(default_bm, expected)
    self._assert_eq(small_block_bm, expected)
def test_svd(self):
    """Check BlockMatrix.svd against numpy, including both gramian code paths."""

    def assert_same_columns_up_to_sign(a, b):
        # Singular vectors are only determined up to a per-column sign flip.
        for j in range(a.shape[1]):
            assert np.allclose(a[:, j], b[:, j]) or np.allclose(-a[:, j], b[:, j])

    x0 = np.array([[-2.0, 0.0, 3.0],
                   [-1.0, 2.0, 4.0]])
    u0, s0, vt0 = np.linalg.svd(x0, full_matrices=False)

    x = BlockMatrix.from_numpy(x0)

    # _svd
    u, s, vt = x.svd()
    assert_same_columns_up_to_sign(u, u0)
    assert np.allclose(s, s0)
    assert_same_columns_up_to_sign(vt.T, vt0.T)

    s = x.svd(compute_uv=False)
    assert np.allclose(s, s0)

    # left _svd_gramian (complexity_bound=0 forces the gramian strategy)
    u, s, vt = x.svd(complexity_bound=0)
    assert_same_columns_up_to_sign(u, u0)
    assert np.allclose(s, s0)
    assert_same_columns_up_to_sign(vt.to_numpy().T, vt0.T)

    s = x.svd(compute_uv=False, complexity_bound=0)
    assert np.allclose(s, s0)

    # right _svd_gramian: transposed input swaps the roles of u and vt.
    x = BlockMatrix.from_numpy(x0.T)
    u, s, vt = x.svd(complexity_bound=0)
    assert_same_columns_up_to_sign(u.to_numpy(), vt0.T)
    assert np.allclose(s, s0)
    assert_same_columns_up_to_sign(vt.T, u0)

    s = x.svd(compute_uv=False, complexity_bound=0)
    assert np.allclose(s, s0)

    # left _svd_gramian when dimensions agree: u is numpy, vt stays distributed.
    x = BlockMatrix.from_numpy(x0[:, :2])
    u, s, vt = x.svd(complexity_bound=0)
    assert isinstance(u, np.ndarray)
    assert isinstance(vt, BlockMatrix)

    # rank-deficient X sets negative eigenvalues to 0.0
    a = np.array([[0.0, 1.0, np.e, np.pi, 10.0, 25.0]])
    x0 = a.T @ a  # rank 1
    e, _ = np.linalg.eigh(x0 @ x0.T)

    x = BlockMatrix.from_numpy(x0)
    _, s, _ = x.svd(complexity_bound=0)
    assert np.all(s >= 0.0)

    s = x.svd(compute_uv=False, complexity_bound=0)
    assert np.all(s >= 0)
def test_write_overwrite(self):
    """A second write to the same path fails unless overwrite=True is passed."""
    path = new_temp_file()

    first = BlockMatrix.from_numpy(np.array([[0]]))
    first.write(path)
    # Re-writing the same path without overwrite must fail.
    self.assertRaises(FatalError, lambda: first.write(path))

    second = BlockMatrix.from_numpy(np.array([[1]]))
    second.write(path, overwrite=True)
    self._assert_eq(BlockMatrix.read(path), second)
def test_export_blocks(self):
    """export_blocks followed by rectangles_to_numpy reconstructs the matrix."""
    expected = np.ones(shape=(8, 10))
    # block_size exceeds both dimensions, so the matrix is a single block.
    bm = BlockMatrix.from_numpy(expected, block_size=20)

    local_dir = new_local_temp_dir()
    bm.export_blocks(local_path_uri(local_dir), binary=True)
    reconstructed = BlockMatrix.rectangles_to_numpy(local_dir, binary=True)
    self._assert_eq(expected, reconstructed)
def plot_correlation_matrices(chr_list):
    """
    Plot combined correlation matrices for genotype-correlation and
    sumstats-correlation matrices.

    For each chromosome in `chr_list`, reads the sumstats ("SS") and genotype
    ("GT") correlation BlockMatrices from GCS, combines one matrix's triangle
    with the other's (presumably SS upper / GT lower, per the plot labels —
    TODO confirm), and writes the plots back to GCS.

    NOTE(review): relies on module-level `variant_set`, `plt`, and `hl`, which
    are not parameters of this function.
    """
    for ch in chr_list:
        ss_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_ss_correlation_chr{}.bm/'.format(ch))
        gt_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_gt_correlation_chr{}.bm/'.format(ch))
        M_max = int(1e4)  # max number of variants to be taken from the block matrices (suggested: 2e4)
        M = ss_ch.shape[0]  # dimension of block matrix
        # for idx in range(int(M/M_max)+1): #index of which disjoint window we are looking at in the block matrix
        for idx in range(0, int(M / M_max) + 1):  # index of which disjoint window we are looking at in the block matrix
            M0 = M_max * (idx)  # start variant index for block matrix filtering
            M1 = min(M_max * (idx + 1), M)  # stop variant index for block matrix filtering
            ss_np = ss_ch[M0:M1, M0:M1].to_numpy()
            gt_np = gt_ch[M0:M1, M0:M1].to_numpy()
            print('\nStarting variant window: [' + str(M0) + ',' + str(M1) + ']')
            w = int(5e3)  # window width of variants for correlation matrix (suggested: 2e3)
            for i in range(int((M1 - M0 - 1) / w) + 1):
                w0 = w * i  # start variant index for window of correlation matrix
                w1 = min(w * (i + 1), M1 - M0)  # stop variant index for window of correlation matrix
                # Combine the two matrices: one triangle from SS, the other from
                # GT's transpose; diagonal forced to 1 (it is a correlation matrix).
                full = (ss_np[w0:w1, w0:w1] + gt_np[w0:w1, w0:w1].T)
                np.fill_diagonal(full, 1)
                fig, ax = plt.subplots()
                ax.imshow(full, cmap='bwr')
                ax.plot([0, w], [0, w], 'k--', alpha=0.5, lw=2)
                plt.xlim([0, w])
                plt.ylim([w, 0])  # inverted y-axis: matrix-style orientation
                ax.text(w * 0.83, w * 0.1, "SS", fontsize=60, alpha=0.5)
                ax.text(w * 0.02, w * 0.97, "GT", fontsize=60, alpha=0.5)
                plt.title('chr' + str(ch) + ' ' + variant_set + ' variants (' +
                          str(M0 + w0) + '-' + str(M0 + w1) + ')')
                fig = plt.gcf()
                fig.set_size_inches(10, 10)
                # Zero-pad indices in the filename so plots sort lexicographically.
                path = ('gs://nbaya/sumstats_corr/plots/chr' + str(ch) + '_' +
                        variant_set + '_' + str(M0 + w0).zfill(len(str(M))) +
                        '-' + str(M0 + w1).zfill(len(str(M))) + '.png')
                with hl.hadoop_open(path, 'wb') as f:
                    fig.savefig(f, dpi=600)
                plt.close()
            print('\nFinished variant window: [' + str(M0) + ',' + str(M1) + ']')
def test_to_table_maximum_cache_memory_in_bytes_limits(self):
    """to_table_row_major enforces its maximum_cache_memory_in_bytes lower bound."""
    # 5x2 matrix with block_size 2: a 15-byte cache cannot hold one row of
    # every block, so materializing the table must fail.
    bm = BlockMatrix._create(5, 2, [float(i) for i in range(10)], 2)
    try:
        bm.to_table_row_major(2, maximum_cache_memory_in_bytes=15)._force_count()
    except Exception as exc:
        assert 'BlockMatrixCachedPartFile must be able to hold at least one row of every block in memory' in exc.args[0]
    else:
        # No exception means the limit was not enforced.
        assert False

    # 16 bytes (presumably one row of two float64 entries — TODO confirm) is
    # exactly enough, so this succeeds.
    bm = BlockMatrix._create(5, 2, [float(i) for i in range(10)], 2)
    bm.to_table_row_major(2, maximum_cache_memory_in_bytes=16)._force_count()
def tree_matmul_tree_matsum(bm1, bm2, mul_splits: int, sum_splits: int = None,
                            path_prefix: str = None, read_if_exists=False):
    r'''Version of tree_matmul() that allows for intermediate sums of matrix
    multiplication.

    Splits the inner dimension of ``bm1 @ bm2`` into `mul_splits` block-column
    ranges, writes each partial product to ``{path_prefix}_{i}``, then sums
    the partial products in `sum_splits` groups (each group checkpointed to
    ``{path_prefix}-partial-{i}``) and returns the grand total.

    `sum_splits` must be a divisor of `mul_splits`. When `read_if_exists` is
    true, the intermediate products are assumed to already exist under
    `path_prefix` and are not recomputed.
    '''
    # TODO: Make a private function that acts recursively to ensure that the
    # matrix sums never include more than a maximum number of matrices

    # Fix: the original message had an unbalanced backtick around mul_splits.
    assert mul_splits % sum_splits == 0, '`sum_splits` must be a divisor of `mul_splits`'

    if not read_if_exists:
        print(bm1._n_block_cols)
        print(mul_splits)
        # Number of block columns handled per partial product.
        inner_brange_size = int(math.ceil(bm1._n_block_cols / mul_splits))
        print(f'inner_brange_size: {inner_brange_size}')
        split_points = list(range(0, bm1._n_block_cols, inner_brange_size)) + [bm1._n_block_cols]
        print(split_points)
        inner_ranges = list(zip(split_points[:-1], split_points[1:]))
        print(f'len(inner_ranges): {len(inner_ranges)}')
        # Pair bm1's block-column slice with bm2's matching block-row slice.
        blocks_to_multiply = [(bm1._select_blocks((0, bm1._n_block_rows), (start, stop)),
                               bm2._select_blocks((start, stop), (0, bm2._n_block_cols)))
                              for start, stop in inner_ranges]
        intermediate_multiply_exprs = [b1 @ b2 for b1, b2 in blocks_to_multiply]
        print(len(intermediate_multiply_exprs))
        print(f'Writing {mul_splits} intermediate matrices to {path_prefix}')
        hl.experimental.write_block_matrices(intermediate_multiply_exprs, path_prefix)

    read_intermediates = [BlockMatrix.read(f"{path_prefix}_{i}") for i in range(0, mul_splits)]

    # Sum the partial products in groups, checkpointing each group so the
    # final sum never composes more than `sum_splits` matrices at once.
    tracked_partial_sums = []
    sum_block_size = math.ceil(mul_splits / sum_splits)
    for i in range(sum_splits):
        partial_sum_path = f"{path_prefix}-partial-{i}"
        sum(read_intermediates[i * sum_block_size:(i + 1) * sum_block_size]).write(
            partial_sum_path, overwrite=True)
        tracked_partial_sums.append(BlockMatrix.read(partial_sum_path))

    return sum(tracked_partial_sums)
def generate_cross_pop_ld_scores_from_ld_matrices(
        pop1, pop2, data_type, pop_data, min_frequency=0.01,
        call_rate_cutoff=0.8, adj: bool = False, radius: int = 1000000,
        overwrite=False, temp_bucket='gs://gnomad-tmp/ld'):
    """Compute cross-population LD scores from two per-population LD matrices.

    Filters each population's LD index to variants passing frequency and
    call-rate cutoffs, intersects the two variant sets, multiplies the two
    filtered LD matrices elementwise, and annotates/writes LD scores.
    """
    n1 = pop_data.pop[pop1]
    n2 = pop_data.pop[pop2]

    # Per-population frequency and call-rate filters (AN / n is presumably
    # twice the call rate for diploid samples — TODO confirm).
    ht1 = hl.read_table(ld_resources._ld_index_path(data_type, pop1, adj=adj))
    ht1 = ht1.filter((ht1.pop_freq.AF >= min_frequency) &
                     (ht1.pop_freq.AF <= 1 - min_frequency) &
                     (ht1.pop_freq.AN / n1 >= 2 * call_rate_cutoff))

    ht2 = hl.read_table(ld_resources._ld_index_path(data_type, pop2, adj=adj))
    ht2 = ht2.filter((ht2.pop_freq.AF >= min_frequency) &
                     (ht2.pop_freq.AF <= 1 - min_frequency) &
                     (ht2.pop_freq.AN / n2 >= 2 * call_rate_cutoff))

    # Keep only variants present in both tables; checkpoint each filtered
    # table to temp_bucket (re-read instead of recomputed when not overwriting).
    ht1 = ht1.filter(hl.is_defined(ht2[ht1.key])).add_index(
        name='new_idx').checkpoint(f'{temp_bucket}/{pop1}_{pop2}.ht',
                                   overwrite=overwrite,
                                   _read_if_exists=not overwrite)
    ht2 = ht2.filter(hl.is_defined(ht1[ht2.key])).add_index(
        name='new_idx').checkpoint(f'{temp_bucket}/{pop2}_{pop1}.ht',
                                   overwrite=overwrite,
                                   _read_if_exists=not overwrite)

    indices1 = ht1.idx.collect()
    indices2 = ht2.idx.collect()
    # Both populations must contribute the same set of variants.
    assert len(indices1) == len(indices2)

    r1 = BlockMatrix.read(
        ld_resources._ld_matrix_path(data_type, pop1, min_frequency >= COMMON_FREQ,
                                     adj=adj)).filter(indices1, indices1)
    r2 = BlockMatrix.read(
        ld_resources._ld_matrix_path(data_type, pop2, min_frequency >= COMMON_FREQ,
                                     adj=adj)).filter(indices2, indices2)

    # Elementwise product of the two populations' correlation matrices.
    r_bm = r1 * r2

    # TODO: is a bias adjustment needed?
    # r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    out_name = ld_resources._cross_pop_ld_scores_path(data_type, pop1, pop2, adj)
    compute_and_annotate_ld_score(ht1, r_bm, radius, out_name, overwrite)
def test_export_rectangles(self):
    """export_rectangles writes each rectangle to its own file, text or binary."""
    nd = np.arange(0, 80, dtype=float).reshape(8, 10)

    rects1 = [[0, 1, 0, 1], [4, 5, 7, 8]]
    rects2 = [[4, 5, 0, 10], [0, 8, 4, 5]]
    rects3 = [[0, 1, 0, 1], [1, 2, 1, 2], [2, 3, 2, 3], [3, 5, 3, 6],
              [3, 6, 3, 7], [3, 7, 3, 8], [4, 5, 0, 10], [0, 8, 4, 5],
              [0, 8, 0, 10]]

    for rects in [rects1, rects2, rects3]:
        for block_size in [3, 4, 10]:
            bm_uri = new_temp_file()
            rect_path = new_local_temp_dir()
            rect_uri = local_path_uri(rect_path)

            # Persist a matrix sparsified to the rectangles being exported.
            (BlockMatrix.from_numpy(
                nd, block_size=block_size).sparsify_rectangles(rects).write(
                    bm_uri, force_row_major=True))

            BlockMatrix.export_rectangles(bm_uri, rect_uri, rects)

            # Text export: one file per rectangle named rect-<i>_<bounds>.
            for (i, r) in enumerate(rects):
                file = rect_path + '/rect-' + str(i) + '_' + '-'.join(
                    map(str, r))
                expected = nd[r[0]:r[1], r[2]:r[3]]
                actual = np.loadtxt(file, ndmin=2)
                self._assert_eq(expected, actual)

            rect_path_bytes = new_local_temp_dir()
            rect_uri_bytes = local_path_uri(rect_path_bytes)

            BlockMatrix.export_rectangles(bm_uri, rect_uri_bytes, rects, binary=True)

            # Binary export: raw float64 values reshaped to the rectangle.
            for (i, r) in enumerate(rects):
                file = rect_path_bytes + '/rect-' + str(
                    i) + '_' + '-'.join(map(str, r))
                expected = nd[r[0]:r[1], r[2]:r[3]]
                actual = np.reshape(np.fromfile(file), (r[1] - r[0], r[3] - r[2]))
                self._assert_eq(expected, actual)

    # Exporting a rectangle whose block was dropped by sparsification fails.
    bm_uri = new_temp_file()
    rect_uri = new_temp_file()

    (BlockMatrix.from_numpy(nd, block_size=5).sparsify_rectangles(
        [[0, 1, 0, 1]]).write(bm_uri, force_row_major=True))

    with self.assertRaises(FatalError) as e:
        BlockMatrix.export_rectangles(bm_uri, rect_uri, [[5, 6, 5, 6]])
    # NOTE(review): unittest's assertRaises context exposes the error as
    # e.exception, not e.msg — confirm e.msg resolves here; also assertEquals
    # is a deprecated alias of assertEqual.
    self.assertEquals(
        e.msg,
        'block (1, 1) missing for rectangle 0 with bounds [5, 6, 5, 6]'
    )
def test_export_rectangles(self):
    """export_rectangles round-trips rectangles in both text and binary form."""
    dense = np.arange(0, 80, dtype=float).reshape(8, 10)

    rect_groups = [
        [[0, 1, 0, 1], [4, 5, 7, 8]],
        [[4, 5, 0, 10], [0, 8, 4, 5]],
        [[0, 1, 0, 1], [1, 2, 1, 2], [2, 3, 2, 3], [3, 5, 3, 6],
         [3, 6, 3, 7], [3, 7, 3, 8], [4, 5, 0, 10], [0, 8, 4, 5],
         [0, 8, 0, 10]],
    ]

    for rects in rect_groups:
        for block_size in (3, 4, 10):
            bm = BlockMatrix.from_numpy(dense, block_size=block_size)

            # Text export.
            text_dir = new_local_temp_dir()
            bm.export_rectangles(local_path_uri(text_dir), rects)
            self._assert_rectangles_eq(dense, text_dir, rects)

            # Binary export.
            binary_dir = new_local_temp_dir()
            bm.export_rectangles(local_path_uri(binary_dir), rects, binary=True)
            self._assert_rectangles_eq(dense, binary_dir, rects, binary=True)
def generate_ld_scores_from_ld_matrix(pop_data, data_type, min_frequency=0.01,
                                      call_rate_cutoff=0.8, adj: bool = False,
                                      radius: int = 1000000, overwrite=False):
    """Compute per-population LD scores from pre-computed LD matrices.

    For each population, filters the LD index by frequency and call rate,
    squares the filtered LD matrix, applies a sample-size adjustment, and
    writes annotated LD scores.
    """
    # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR
    # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 mins to compute LD scores per population (~$150 total)
    for label, pops in dict(pop_data).items():
        for pop, n in pops.items():
            # Frequency/call-rate filter; new_idx re-indexes the surviving rows.
            ht = hl.read_table(
                ld_resources._ld_index_path(data_type, pop, adj=adj))
            ht = ht.filter((ht.pop_freq.AF >= min_frequency) &
                           (ht.pop_freq.AF <= 1 - min_frequency) &
                           (ht.pop_freq.AN / n >= 2 * call_rate_cutoff)).add_index(name='new_idx')

            indices = ht.idx.collect()

            r2 = BlockMatrix.read(
                ld_resources._ld_matrix_path(data_type, pop,
                                             min_frequency >= COMMON_FREQ,
                                             adj=adj))
            # Restrict to the filtered variants, then square (r -> r^2).
            r2 = r2.filter(indices, indices)**2
            # Presumably the standard finite-sample bias adjustment for r^2 —
            # TODO confirm against the LD-score reference.
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            out_name = ld_resources._ld_scores_path(data_type, pop, adj)
            compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite)
def test_sparsify_triangle(self):
    """sparsify_triangle keeps the upper (or lower) triangle, optionally by block."""
    nd = np.array([[ 1.0,  2.0,  3.0,  4.0],
                   [ 5.0,  6.0,  7.0,  8.0],
                   [ 9.0, 10.0, 11.0, 12.0],
                   [13.0, 14.0, 15.0, 16.0]])
    bm = BlockMatrix.from_numpy(nd, block_size=2)

    # Sparsifying flips the is_sparse flag.
    self.assertFalse(bm.is_sparse)
    self.assertTrue(bm.sparsify_triangle().is_sparse)

    # Default keeps the upper triangle (j >= i).
    self._assert_eq(
        bm.sparsify_triangle(),
        np.array([[ 1.,  2.,  3.,  4.],
                  [ 0.,  6.,  7.,  8.],
                  [ 0.,  0., 11., 12.],
                  [ 0.,  0.,  0., 16.]]))

    # lower=True keeps the lower triangle instead.
    self._assert_eq(
        bm.sparsify_triangle(lower=True),
        np.array([[ 1.,  0.,  0.,  0.],
                  [ 5.,  6.,  0.,  0.],
                  [ 9., 10., 11.,  0.],
                  [13., 14., 15., 16.]]))

    # blocks_only=True keeps every entry of any block touching the triangle.
    self._assert_eq(
        bm.sparsify_triangle(blocks_only=True),
        np.array([[ 1.,  2.,  3.,  4.],
                  [ 5.,  6.,  7.,  8.],
                  [ 0.,  0., 11., 12.],
                  [ 0.,  0., 15., 16.]]))
def test_to_matrix_table(self):
    """to_matrix_table_row_major matches a range_matrix_table and round-trips."""
    n_partitions = 2
    rows, cols = 2, 5
    bm = BlockMatrix._create(rows, cols, [float(i) for i in range(10)])
    actual = bm.to_matrix_table_row_major(n_partitions)

    # Expected: entries are consecutive floats in row-major order, keys are int64.
    expected = hl.utils.range_matrix_table(rows, cols)
    expected = expected.annotate_entries(element=hl.float64(expected.row_idx * cols + expected.col_idx))
    expected = expected.key_cols_by(col_idx=hl.int64(expected.col_idx))
    expected = expected.key_rows_by(row_idx=hl.int64(expected.row_idx))
    assert expected._same(actual)

    # Round trip: matrix table -> from_entry_expr -> matrix table is identity.
    bm = BlockMatrix.random(50, 100, block_size=25, seed=0)
    mt = bm.to_matrix_table_row_major(n_partitions)
    mt_round_trip = BlockMatrix.from_entry_expr(mt.element).to_matrix_table_row_major()
    assert mt._same(mt_round_trip)
def test_random_uniform(self):
    """gaussian=False produces strictly positive (uniform) entries."""
    sampled = BlockMatrix.random(10, 10, gaussian=False).to_numpy()
    for row in sampled:
        assert all(value > 0 for value in row)
def bm(self) -> BlockMatrix:
    """
    Read and return the Hail BlockMatrix resource at ``self.path``.

    :return: Hail BlockMatrix resource
    """
    return BlockMatrix.read(self.path)
def check(expr, mean_impute, center, normalize, expected):
    """Assert that from_entry_expr with the given options yields `expected`."""
    bm = BlockMatrix.from_entry_expr(expr,
                                     mean_impute=mean_impute,
                                     center=center,
                                     normalize=normalize)
    actual = np.squeeze(bm.to_numpy())
    assert np.allclose(actual, expected)
def test_to_table(self):
    """to_table_row_major yields (row_idx, entries) rows matching the matrix."""
    schema = hl.tstruct(row_idx=hl.tint64, entries=hl.tarray(hl.tfloat64))
    rows = [{'row_idx': 0, 'entries': [0.0, 1.0]},
            {'row_idx': 1, 'entries': [2.0, 3.0]},
            {'row_idx': 2, 'entries': [4.0, 5.0]},
            {'row_idx': 3, 'entries': [6.0, 7.0]},
            {'row_idx': 4, 'entries': [8.0, 9.0]}]

    # The table must be independent of partitioning and block size.
    for n_partitions in [1, 2, 3]:
        for block_size in [1, 2, 5]:
            expected = hl.Table.parallelize(rows, schema, 'row_idx', n_partitions)
            bm = BlockMatrix._create(5, 2, [float(i) for i in range(10)], block_size)
            actual = bm.to_table_row_major(n_partitions)
            self.assertTrue(expected._same(actual))
def test_slicing(self):
    """Indexing/slicing agrees with numpy; invalid indices raise ValueError."""
    nd = np.array(np.arange(0, 80, dtype=float)).reshape(8, 10)
    bm = BlockMatrix.from_numpy(nd, block_size=3)

    # Scalar indexing, including negative indices.
    for indices in [(0, 0), (5, 7), (-3, 9), (-8, -10)]:
        self._assert_eq(bm[indices], nd[indices])

    # Row selection with a column slice returns a 1 x k matrix.
    for indices in [(0, slice(3, 4)), (1, slice(3, 4)), (-8, slice(3, 4)),
                    (-1, slice(3, 4))]:
        self._assert_eq(bm[indices], np.expand_dims(nd[indices], 0))

    # Column selection with a row slice returns a k x 1 matrix.
    for indices in [(slice(3, 4), 0), (slice(3, 4), 1), (slice(3, 4), -8),
                    (slice(3, 4), -1)]:
        self._assert_eq(bm[indices], np.expand_dims(nd[indices], 1))

    # Two-sided slices with steps, negative bounds, and open ends.
    for indices in [(slice(0, 8), slice(0, 10)),
                    (slice(0, 8, 2), slice(0, 10, 2)),
                    (slice(2, 4), slice(5, 7)),
                    (slice(-8, -1), slice(-10, -1)),
                    (slice(-8, -1, 2), slice(-10, -1, 2)),
                    (slice(None, 4, 1), slice(None, 4, 1)),
                    (slice(4, None), slice(4, None)),
                    (slice(None, None), slice(None, None))]:
        self._assert_eq(bm[indices], nd[indices])

    # Malformed or out-of-bounds indexing must raise ValueError.
    self.assertRaises(ValueError, lambda: bm[0, ])

    self.assertRaises(ValueError, lambda: bm[9, 0])
    self.assertRaises(ValueError, lambda: bm[-9, 0])
    self.assertRaises(ValueError, lambda: bm[0, 11])
    self.assertRaises(ValueError, lambda: bm[0, -11])

    self.assertRaises(ValueError, lambda: bm[::-1, 0])
    self.assertRaises(ValueError, lambda: bm[0, ::-1])

    self.assertRaises(ValueError, lambda: bm[:0, 0])
    self.assertRaises(ValueError, lambda: bm[0, :0])

    self.assertRaises(ValueError, lambda: bm[0:9, 0])
    self.assertRaises(ValueError, lambda: bm[-9:, 0])
    self.assertRaises(ValueError, lambda: bm[:-9, 0])

    self.assertRaises(ValueError, lambda: bm[0, :11])
    self.assertRaises(ValueError, lambda: bm[0, -11:])
    self.assertRaises(ValueError, lambda: bm[0, :-11])

    # Indexing into a sparsified matrix returns 0.0 for dropped entries.
    bm2 = bm.sparsify_row_intervals([0, 0, 0, 0, 0, 0, 0, 0],
                                    [2, 0, 0, 0, 0, 0, 0, 0])
    self.assertEqual(bm2[0, 1], 1.0)
    self.assertEqual(bm2[0, 2], 0.0)
    self.assertEqual(bm2[0, 9], 0.0)

    nd2 = np.zeros(shape=(8, 10))
    nd2[0, 1] = 1.0
    self._assert_eq(bm2[:, :], nd2)

    self._assert_eq(bm2[:, 1], nd2[:, 1:2])
    self._assert_eq(bm2[1, :], nd2[1:2, :])
    self._assert_eq(bm2[0:5, 0:5], nd2[0:5, 0:5])
def get_Z(N_r):
    r'''Return an `N_r`-dimensional standard normal random column vector.'''
    # N_r x 1 BlockMatrix of independent standard normal draws.
    return BlockMatrix.random(n_rows=N_r, n_cols=1, gaussian=True)
def test_special_elementwise_ops(self):
    """Elementwise power, sqrt, log, and abs agree with numpy."""
    dense = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    bm = BlockMatrix.from_numpy(dense)

    self._assert_close(bm ** 3, dense ** 3)
    self._assert_close(bm.sqrt(), np.sqrt(dense))
    self._assert_close(bm.log(), np.log(dense))
    self._assert_close((bm - 4).abs(), np.abs(dense - 4))
def test_block_matrix_from_numpy(self):
    """from_numpy preserves shape and values across a range of block sizes."""
    source = np.matrix([[0, 1, 2, 3, 4],
                        [5, 6, 7, 8, 9],
                        [10, 11, 12, 13, 14]], dtype=np.float64)

    for block_size in (1, 2, 5, 1024):
        bm = BlockMatrix.from_numpy(source, block_size)
        assert (bm.n_rows == 3)
        assert (bm.n_cols == 5)
        assert (bm.to_numpy() == source).all()
def test_from_entry_expr_options(self):
    """Check from_entry_expr's mean_impute/center/normalize options against numpy."""

    def build_mt(a):
        # One-row matrix table whose three columns hold the entries of `a`,
        # reordered back to s = 0, 1, 2 via choose_cols.
        data = [{
            'v': 0,
            's': 0,
            'x': a[0]
        }, {
            'v': 0,
            's': 1,
            'x': a[1]
        }, {
            'v': 0,
            's': 2,
            'x': a[2]
        }]
        ht = hl.Table.parallelize(
            data, hl.dtype('struct{v: int32, s: int32, x: float64}'))
        mt = ht.to_matrix_table(['v'], ['s'])
        ids = mt.key_cols_by()['s'].collect()
        return mt.choose_cols([ids.index(0), ids.index(1), ids.index(2)])

    def check(expr, mean_impute, center, normalize, expected):
        # Squeeze the 1 x 3 BlockMatrix down to a vector for comparison.
        actual = np.squeeze(
            BlockMatrix.from_entry_expr(expr,
                                        mean_impute=mean_impute,
                                        center=center,
                                        normalize=normalize).to_numpy())
        assert np.allclose(actual, expected)

    a = np.array([0.0, 1.0, 2.0])
    mt = build_mt(a)

    # No missing data: option combinations match direct numpy arithmetic.
    check(mt.x, False, False, False, a)
    check(mt.x, False, True, False, a - 1.0)
    check(mt.x, False, False, True, a / np.sqrt(5))
    check(mt.x, False, True, True, (a - 1.0) / np.sqrt(2))
    check(mt.x + 1 - 1, False, False, False, a)

    # Missing middle entry: mean imputation restores the original values.
    mt = build_mt([0.0, hl.null('float64'), 2.0])
    check(mt.x, True, False, False, a)
    check(mt.x, True, True, False, a - 1.0)
    check(mt.x, True, False, True, a / np.sqrt(5))
    check(mt.x, True, True, True, (a - 1.0) / np.sqrt(2))

    # Missing data without mean_impute must raise.
    with self.assertRaises(Exception):
        BlockMatrix.from_entry_expr(mt.x)
def test_sparsify_row_intervals(self):
    """sparsify_row_intervals keeps per-row [start, stop) column intervals."""
    nd = np.array([[ 1.0,  2.0,  3.0,  4.0],
                   [ 5.0,  6.0,  7.0,  8.0],
                   [ 9.0, 10.0, 11.0, 12.0],
                   [13.0, 14.0, 15.0, 16.0]])
    bm = BlockMatrix.from_numpy(nd, block_size=2)

    # Element-level: row i keeps only columns in [starts[i], stops[i]).
    self._assert_eq(
        bm.sparsify_row_intervals(
            starts=[1, 0, 2, 2],
            stops= [2, 0, 3, 4]),
        np.array([[ 0.,  2.,  0.,  0.],
                  [ 0.,  0.,  0.,  0.],
                  [ 0.,  0., 11.,  0.],
                  [ 0.,  0., 15., 16.]]))

    # blocks_only=True keeps whole blocks intersecting any kept interval.
    self._assert_eq(
        bm.sparsify_row_intervals(
            starts=[1, 0, 2, 2],
            stops= [2, 0, 3, 4],
            blocks_only=True),
        np.array([[ 1.,  2.,  0.,  0.],
                  [ 5.,  6.,  0.,  0.],
                  [ 0.,  0., 11., 12.],
                  [ 0.,  0., 15., 16.]]))

    # Randomized comparison against an explicitly zeroed numpy copy.
    nd2 = np.random.normal(size=(8, 10))
    bm2 = BlockMatrix.from_numpy(nd2, block_size=3)

    for bounds in [[[0, 1, 2, 3, 4, 5, 6, 7],
                    [1, 2, 3, 4, 5, 6, 7, 8]],
                   [[0, 0, 5, 3, 4, 5, 8, 2],
                    [9, 0, 5, 3, 4, 5, 9, 5]],
                   [[0, 5, 10, 8, 7, 6, 5, 4],
                    [0, 5, 10, 9, 8, 7, 6, 5]]]:
        starts, stops = bounds
        actual = bm2.sparsify_row_intervals(starts, stops, blocks_only=False).to_numpy()
        expected = nd2.copy()
        # Zero everything outside [starts[i], stops[i]) in each row.
        for i in range(0, 8):
            for j in range(0, starts[i]):
                expected[i, j] = 0.0
            for j in range(stops[i], 10):
                expected[i, j] = 0.0
        self._assert_eq(actual, expected)
def test_special_elementwise_ops(self):
    """Elementwise power, sqrt, ceil, floor, log, and abs agree with numpy."""
    dense = np.array([[1.0, 2.0, 3.0, 3.14], [4.0, 5.0, 6.0, 12.12]])
    bm = BlockMatrix.from_numpy(dense)

    self._assert_close(bm ** 3, dense ** 3)
    self._assert_close(bm.sqrt(), np.sqrt(dense))
    self._assert_close(bm.ceil(), np.ceil(dense))
    self._assert_close(bm.floor(), np.floor(dense))
    self._assert_close(bm.log(), np.log(dense))
    self._assert_close((bm - 4).abs(), np.abs(dense - 4))
def test_sum_with_sparsify(self):
    """Sums remain correct after sparsifying to various rectangle sets."""
    dense = np.zeros(shape=(5, 7))
    dense[2, 4] = 1.0
    dense[2, 5] = 2.0
    dense[3, 4] = 3.0
    dense[3, 5] = 4.0

    def sparsified(rectangles):
        # Fresh BlockMatrix restricted to the given rectangles.
        return BlockMatrix.from_numpy(dense, block_size=2).sparsify_rectangles(rectangles)

    # All nonzeros live in [2,4) x [4,6), so every rectangle set below keeps them.
    self.assert_sums_agree(sparsified([[2, 4, 4, 6]]), dense)
    self.assert_sums_agree(sparsified([[2, 4, 4, 6], [0, 5, 0, 1]]), dense)
    self.assert_sums_agree(sparsified([[2, 4, 4, 6], [0, 1, 0, 7]]), dense)

    # Fully dropped zero matrix.
    empty = np.zeros(shape=(5, 7))
    all_dropped = BlockMatrix.fill(5, 7, value=0.0, block_size=2).sparsify_rectangles([])
    self.assert_sums_agree(all_dropped, empty)
def test_matrix_ops(self):
    """Transpose, products, diagonal, axis sums, and broadcasting vs numpy.

    Fix: the original asserted ``m.T == nm.T`` twice in a row; the duplicate
    assertion has been removed.
    """
    nm = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    m = BlockMatrix.from_numpy(nm, block_size=2)

    nsquare = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    square = BlockMatrix.from_numpy(nsquare, block_size=2)

    nrow = np.array([[7.0, 8.0, 9.0]])
    row = BlockMatrix.from_numpy(nrow, block_size=2)

    # Transpose of a rectangular matrix and of a row vector.
    self._assert_eq(m.T, nm.T)
    self._assert_eq(row.T, nrow.T)

    # Products with BlockMatrix and with numpy operands on either side.
    self._assert_eq(m @ m.T, nm @ nm.T)
    self._assert_eq(m @ nm.T, nm @ nm.T)
    self._assert_eq(row @ row.T, nrow @ nrow.T)
    self._assert_eq(row @ nrow.T, nrow @ nrow.T)
    self._assert_eq(m.T @ m, nm.T @ nm)
    self._assert_eq(m.T @ nm, nm.T @ nm)
    self._assert_eq(row.T @ row, nrow.T @ nrow)
    self._assert_eq(row.T @ nrow, nrow.T @ nrow)

    # Inner dimensions must match, otherwise ValueError.
    self.assertRaises(ValueError, lambda: m @ m)
    self.assertRaises(ValueError, lambda: m @ nm)

    # diagonal() returns a 1 x k row matrix.
    self._assert_eq(m.diagonal(), np.array([[1.0, 5.0]]))
    self._assert_eq(m.T.diagonal(), np.array([[1.0, 5.0]]))
    self._assert_eq((m @ m.T).diagonal(), np.array([[14.0, 77.0]]))

    # Axis sums and broadcasting of row/column vectors.
    self._assert_eq(m.sum(axis=0).T, np.array([[5.0], [7.0], [9.0]]))
    self._assert_eq(m.sum(axis=1).T, np.array([[6.0, 15.0]]))
    self._assert_eq(
        m.sum(axis=0).T + row,
        np.array([[12.0, 13.0, 14.0],
                  [14.0, 15.0, 16.0],
                  [16.0, 17.0, 18.0]]))
    self._assert_eq(
        m.sum(axis=0) + row.T,
        np.array([[12.0, 14.0, 16.0],
                  [13.0, 15.0, 17.0],
                  [14.0, 16.0, 18.0]]))
    self._assert_eq(
        square.sum(axis=0).T + square.sum(axis=1),
        np.array([[18.0], [30.0], [42.0]]))
def test_slicing(self):
    """Slicing agrees with numpy, including broadcasting of sliced rows/columns."""
    nd = np.array(np.arange(0, 80, dtype=float)).reshape(8, 10)
    bm = BlockMatrix.from_numpy(nd, block_size=3)

    # Scalar indexing, including negative indices.
    for indices in [(0, 0), (5, 7), (-3, 9), (-8, -10)]:
        self._assert_eq(bm[indices], nd[indices])

    # Single row: result is 1 x k and broadcasts against the full matrix.
    for indices in [(0, slice(3, 4)), (1, slice(3, 4)), (-8, slice(3, 4)),
                    (-1, slice(3, 4))]:
        self._assert_eq(bm[indices], np.expand_dims(nd[indices], 0))
        self._assert_eq(bm[indices] - bm, nd[indices] - nd)
        self._assert_eq(bm - bm[indices], nd - nd[indices])

    # Single column: result is k x 1 and broadcasts against the full matrix.
    for indices in [(slice(3, 4), 0), (slice(3, 4), 1), (slice(3, 4), -8),
                    (slice(3, 4), -1)]:
        self._assert_eq(bm[indices], np.expand_dims(nd[indices], 1))
        self._assert_eq(bm[indices] - bm, nd[indices] - nd)
        self._assert_eq(bm - bm[indices], nd - nd[indices])

    # Two-sided slices, then slices of the sliced result.
    for indices in [(slice(0, 8), slice(0, 10)),
                    (slice(0, 8, 2), slice(0, 10, 2)),
                    (slice(2, 4), slice(5, 7)),
                    (slice(-8, -1), slice(-10, -1)),
                    (slice(-8, -1, 2), slice(-10, -1, 2)),
                    (slice(None, 4, 1), slice(None, 4, 1)),
                    (slice(4, None), slice(4, None)),
                    (slice(None, None), slice(None, None))]:
        self._assert_eq(bm[indices], nd[indices])
        self._assert_eq(bm[indices][:, :2], nd[indices][:, :2])
        self._assert_eq(bm[indices][:2, :], nd[indices][:2, :])

    # Malformed or out-of-bounds indexing must raise ValueError.
    self.assertRaises(ValueError, lambda: bm[0, ])

    self.assertRaises(ValueError, lambda: bm[9, 0])
    self.assertRaises(ValueError, lambda: bm[-9, 0])
    self.assertRaises(ValueError, lambda: bm[0, 11])
    self.assertRaises(ValueError, lambda: bm[0, -11])

    self.assertRaises(ValueError, lambda: bm[::-1, 0])
    self.assertRaises(ValueError, lambda: bm[0, ::-1])

    self.assertRaises(ValueError, lambda: bm[:0, 0])
    self.assertRaises(ValueError, lambda: bm[0, :0])

    self.assertRaises(ValueError, lambda: bm[0:9, 0])
    self.assertRaises(ValueError, lambda: bm[-9:, 0])
    self.assertRaises(ValueError, lambda: bm[:-9, 0])

    self.assertRaises(ValueError, lambda: bm[0, :11])
    self.assertRaises(ValueError, lambda: bm[0, -11:])
    self.assertRaises(ValueError, lambda: bm[0, :-11])
def test_sparsify_blocks(self):
    """_sparsify_blocks keeps only the listed blocks, agreeing with sparsify_numpy."""
    # 4x4 matrix with 2x2 blocks: keep blocks 1 and 2.
    square = np.arange(16, dtype=np.float64).reshape((4, 4))
    kept = [1, 2]
    sparse_bm = BlockMatrix.from_numpy(square, block_size=2)._sparsify_blocks(kept)
    reference = sparsify_numpy(square, 2, kept)
    assert np.array_equal(sparse_bm.to_numpy(), reference)
    assert np.array_equal(
        reference,
        np.array([[0, 0, 2, 3],
                  [0, 0, 6, 7],
                  [8, 9, 0, 0],
                  [12, 13, 0, 0]]))

    # Larger matrix whose final block row/column is ragged (15 not divisible by 4).
    square = np.arange(225, dtype=np.float64).reshape((15, 15))
    kept = [4, 8, 10, 12, 13, 14]
    sparse_bm = BlockMatrix.from_numpy(square, block_size=4)._sparsify_blocks(kept)
    assert np.array_equal(sparse_bm.to_numpy(), sparsify_numpy(square, 4, kept))
def get_toy_R(M, n_blocks, identity=False):
    r'''Creates "toy" LD matrix as a list of Hail Block Matrices for testing
    purposes. The list has length=`n_blocks`.

    `M` SNP indices are split as evenly as possible across `n_blocks` blocks.
    With `identity=True` each block is an identity matrix; otherwise each block
    is a random rank-1 covariance matrix with unit diagonal.
    '''
    R = []
    block_snp_idxs = np.array_split(range(M), n_blocks)
    block_sizes = [len(block) for block in block_snp_idxs]
    for block_size in block_sizes:
        if identity:
            R_block = BlockMatrix.from_numpy(np.identity(n=block_size))
        else:
            A = np.random.uniform(low=-1, high=1, size=(block_size, 1))**11  # exponentiate to number (odd to preserve negative sign) to avoid highly correlated SNPs
            # Rank-1 outer product, then force the diagonal to 1 (correlation matrix).
            cov = A @ A.T
            np.fill_diagonal(cov, 1)
            R_block = BlockMatrix.from_numpy(cov)
        R.append(R_block)
    return R
def test_sparsify_rectangles(self):
    """sparsify_rectangles keeps blocks intersecting any rectangle; [] drops all."""
    dense = np.array([[1.0, 2.0, 3.0, 4.0],
                      [5.0, 6.0, 7.0, 8.0],
                      [9.0, 10.0, 11.0, 12.0],
                      [13.0, 14.0, 15.0, 16.0]])
    bm = BlockMatrix.from_numpy(dense, block_size=2)

    expected = np.array([[1., 2., 3., 4.],
                         [5., 6., 7., 8.],
                         [9., 10., 0., 0.],
                         [13., 14., 0., 0.]])
    kept = bm.sparsify_rectangles([[0, 1, 0, 1], [0, 3, 0, 2], [1, 2, 0, 4]])
    self._assert_eq(kept, expected)

    # An empty rectangle list drops every block.
    self._assert_eq(bm.sparsify_rectangles([]), np.zeros(shape=(4, 4)))
def test_export_rectangles(self):
    """export_rectangles writes each rectangle to its own file, text or binary."""
    nd = np.arange(0, 80, dtype=float).reshape(8, 10)

    rects1 = [[0, 1, 0, 1], [4, 5, 7, 8]]
    rects2 = [[4, 5, 0, 10], [0, 8, 4, 5]]
    rects3 = [[0, 1, 0, 1], [1, 2, 1, 2], [2, 3, 2, 3], [3, 5, 3, 6],
              [3, 6, 3, 7], [3, 7, 3, 8], [4, 5, 0, 10], [0, 8, 4, 5],
              [0, 8, 0, 10]]

    for rects in [rects1, rects2, rects3]:
        for block_size in [3, 4, 10]:
            bm_uri = new_temp_file()
            rect_path = new_local_temp_dir()
            rect_uri = local_path_uri(rect_path)

            # Persist a matrix sparsified to the rectangles being exported.
            (BlockMatrix.from_numpy(nd, block_size=block_size)
             .sparsify_rectangles(rects)
             .write(bm_uri, force_row_major=True))

            BlockMatrix.export_rectangles(bm_uri, rect_uri, rects)

            # Text export: one file per rectangle named rect-<i>_<bounds>.
            for (i, r) in enumerate(rects):
                file = rect_path + '/rect-' + str(i) + '_' + '-'.join(map(str, r))
                expected = nd[r[0]:r[1], r[2]:r[3]]
                actual = np.loadtxt(file, ndmin = 2)
                self._assert_eq(expected, actual)

            rect_path_bytes = new_local_temp_dir()
            rect_uri_bytes = local_path_uri(rect_path_bytes)

            BlockMatrix.export_rectangles(bm_uri, rect_uri_bytes, rects, binary=True)

            # Binary export: raw float64 values reshaped to the rectangle.
            for (i, r) in enumerate(rects):
                file = rect_path_bytes + '/rect-' + str(i) + '_' + '-'.join(map(str, r))
                expected = nd[r[0]:r[1], r[2]:r[3]]
                actual = np.reshape(np.fromfile(file), (r[1] - r[0], r[3] - r[2]))
                self._assert_eq(expected, actual)

    # Exporting a rectangle whose block was dropped by sparsification fails.
    bm_uri = new_temp_file()
    rect_uri = new_temp_file()

    (BlockMatrix.from_numpy(nd, block_size=5)
     .sparsify_rectangles([[0, 1, 0, 1]])
     .write(bm_uri, force_row_major=True))

    with self.assertRaises(FatalError) as e:
        BlockMatrix.export_rectangles(bm_uri, rect_uri, [[5, 6, 5, 6]])
    # NOTE(review): unittest's assertRaises context exposes the error as
    # e.exception, not e.msg — confirm e.msg resolves here; also assertEquals
    # is a deprecated alias of assertEqual.
    self.assertEquals(e.msg, 'block (1, 1) missing for rectangle 0 with bounds [5, 6, 5, 6]')
def test_promote(self):
    """_promote accepts shape-compatible operand pairs and rejects the rest."""
    # nx: 1x1, nc: column (2x1), nr: row (1x3), nm: full matrix (2x3).
    nx = np.matrix([[2.0]])
    nc = np.matrix([[1.0], [2.0]])
    nr = np.matrix([[1.0, 2.0, 3.0]])
    nm = np.matrix([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])

    e = 2  # Python scalar operand.
    x = BlockMatrix.from_numpy(nx)
    c = BlockMatrix.from_numpy(nc)
    r = BlockMatrix.from_numpy(nr)
    m = BlockMatrix.from_numpy(nm)

    nct, nrt, nmt = nc.T, nr.T, nm.T
    ct, rt, mt = c.T, r.T, m.T

    # Pairs that broadcast: scalars/1x1 with anything, row/column against a
    # matching matrix, and numpy operands on the right-hand side.
    good = [(x, x), (x, c), (x, r), (x, m), (x, e),
            (c, x), (c, c), (c, m), (c, e),
            (r, x), (r, r), (r, m), (r, e),
            (m, x), (m, c), (m, r), (m, m), (m, e),
            (x, nx), (x, nc), (x, nr), (x, nm),
            (c, nx), (c, nc), (c, nm),
            (r, nx), (r, nr), (r, nm),
            (m, nx), (m, nc), (m, nr), (m, nm)]

    # Pairs with incompatible shapes (row vs column, transposed mismatches).
    bad = [(c, r), (r, c), (c, ct), (r, rt),
           (c, rt), (c, mt), (ct, r), (ct, m),
           (r, ct), (r, mt), (rt, c), (rt, m),
           (m, ct), (m, rt), (m, mt),
           (mt, c), (mt, r), (mt, m),
           (c, nr), (r, nc), (c, nct), (r, nrt),
           (c, nrt), (c, nmt), (ct, nr), (ct, nm),
           (r, nct), (r, nmt), (rt, nc), (rt, nm),
           (m, nct), (m, nrt), (m, nmt),
           (mt, nc), (mt, nr), (mt, nm)]

    for (a, b) in good:
        a._promote(b, '')

    for (a, b) in bad:
        self.assertRaises(ValueError, lambda: a._promote(b, ''))
def test_export_rectangles_filtered(self):
    """Exporting rectangles from a sliced (filtered) block matrix: rectangle
    bounds are relative to the slice, not the original matrix."""
    out_dir = new_local_temp_dir()
    out_uri = local_path_uri(out_dir)
    nd = np.arange(1.0, 17.0).reshape(4, 4)
    # keep only the central 2x2 window [[6, 7], [10, 11]]
    bm = BlockMatrix.from_numpy(nd)[1:3, 1:3]
    export_rects = [[0, 1, 0, 2], [1, 2, 0, 2]]
    bm.export_rectangles(out_uri, export_rects)
    expected = np.array([[6.0, 7.0],
                         [10.0, 11.0]])
    self._assert_rectangles_eq(expected, out_dir, export_rects)
def test_to_table(self):
    """`to_table_row_major` yields one row per matrix row, with the entries array,
    across partition counts and block sizes."""
    schema = hl.tstruct(row_idx=hl.tint64, entries=hl.tarray(hl.tfloat64))
    rows = [{'row_idx': 0, 'entries': [0.0, 1.0]},
            {'row_idx': 1, 'entries': [2.0, 3.0]},
            {'row_idx': 2, 'entries': [4.0, 5.0]},
            {'row_idx': 3, 'entries': [6.0, 7.0]},
            {'row_idx': 4, 'entries': [8.0, 9.0]}]
    for n_partitions in [1, 2, 3]:
        # `expected` depends only on n_partitions, so build it once per outer
        # iteration instead of once per block size (was rebuilt in the inner loop).
        expected = hl.Table.parallelize(rows, schema, 'row_idx', n_partitions)
        for block_size in [1, 2, 5]:
            bm = BlockMatrix._create(5, 2, [float(i) for i in range(10)], block_size)
            actual = bm.to_table_row_major(n_partitions)
            self.assertTrue(expected._same(actual))
def test_slicing(self):
    """Element, row/column, and 2-d slice indexing agree with numpy; invalid
    indices and unsupported slice forms raise ValueError."""
    nd = np.array(np.arange(0, 80, dtype=float)).reshape(8, 10)
    bm = BlockMatrix.from_numpy(nd, block_size=3)

    # single-element indexing, including negative indices
    for indices in [(0, 0), (5, 7), (-3, 9), (-8, -10)]:
        self._assert_eq(bm[indices], nd[indices])

    # int row + slice col: BlockMatrix keeps a 2-d result, numpy drops the axis,
    # so the numpy side is re-expanded for comparison
    for indices in [(0, slice(3, 4)), (1, slice(3, 4)), (-8, slice(3, 4)), (-1, slice(3, 4))]:
        self._assert_eq(bm[indices], np.expand_dims(nd[indices], 0))

    # slice row + int col: same, expanded along axis 1
    for indices in [(slice(3, 4), 0), (slice(3, 4), 1), (slice(3, 4), -8), (slice(3, 4), -1)]:
        self._assert_eq(bm[indices], np.expand_dims(nd[indices], 1))

    # 2-d slices: full, strided, interior, negative bounds, open-ended
    for indices in [(slice(0, 8), slice(0, 10)),
                    (slice(0, 8, 2), slice(0, 10, 2)),
                    (slice(2, 4), slice(5, 7)),
                    (slice(-8, -1), slice(-10, -1)),
                    (slice(-8, -1, 2), slice(-10, -1, 2)),
                    (slice(None, 4, 1), slice(None, 4, 1)),
                    (slice(4, None), slice(4, None)),
                    (slice(None, None), slice(None, None))]:
        self._assert_eq(bm[indices], nd[indices])

    # wrong arity, out-of-bounds integers, negative steps, and empty or
    # out-of-range slices must all raise ValueError
    self.assertRaises(ValueError, lambda: bm[0, ])
    self.assertRaises(ValueError, lambda: bm[9, 0])
    self.assertRaises(ValueError, lambda: bm[-9, 0])
    self.assertRaises(ValueError, lambda: bm[0, 11])
    self.assertRaises(ValueError, lambda: bm[0, -11])
    self.assertRaises(ValueError, lambda: bm[::-1, 0])
    self.assertRaises(ValueError, lambda: bm[0, ::-1])
    self.assertRaises(ValueError, lambda: bm[:0, 0])
    self.assertRaises(ValueError, lambda: bm[0, :0])
    self.assertRaises(ValueError, lambda: bm[0:9, 0])
    self.assertRaises(ValueError, lambda: bm[-9:, 0])
    self.assertRaises(ValueError, lambda: bm[:-9, 0])
    self.assertRaises(ValueError, lambda: bm[0, :11])
    self.assertRaises(ValueError, lambda: bm[0, -11:])
    self.assertRaises(ValueError, lambda: bm[0, :-11])
def test_slices_with_sparsify(self):
    """Indexing and slicing a row-interval-sparsified matrix reads zeros
    everywhere outside the kept intervals."""
    nd = np.arange(0, 80, dtype=float).reshape(8, 10)
    bm = BlockMatrix.from_numpy(nd, block_size=3)
    # keep only columns [0, 2) of row 0; all other entries become zero
    bm2 = bm.sparsify_row_intervals([0] * 8, [2, 0, 0, 0, 0, 0, 0, 0])

    self.assertEqual(bm2[0, 1], 1.0)
    self.assertEqual(bm2[0, 2], 0.0)
    self.assertEqual(bm2[0, 9], 0.0)

    # dense reference: zeros except the single surviving entry
    nd2 = np.zeros(shape=(8, 10))
    nd2[0, 1] = 1.0
    self._assert_eq(bm2[:, :], nd2)
    self._assert_eq(bm2[:, 1], nd2[:, 1:2])
    self._assert_eq(bm2[1, :], nd2[1:2, :])
    self._assert_eq(bm2[0:5, 0:5], nd2[0:5, 0:5])
def test_sparsify_rectangles(self):
    """`sparsify_rectangles` keeps every block touched by some rectangle and
    zeroes the rest; an empty rectangle list zeroes everything."""
    nd = np.arange(1.0, 17.0).reshape(4, 4)
    bm = BlockMatrix.from_numpy(nd, block_size=2)

    # rectangles cover the top two block-rows of the left half and all of row 1,
    # so only the bottom-right block is dropped
    kept = np.array([[1., 2., 3., 4.],
                     [5., 6., 7., 8.],
                     [9., 10., 0., 0.],
                     [13., 14., 0., 0.]])
    self._assert_eq(
        bm.sparsify_rectangles([[0, 1, 0, 1], [0, 3, 0, 2], [1, 2, 0, 4]]),
        kept)
    self._assert_eq(bm.sparsify_rectangles([]), np.zeros(shape=(4, 4)))
def test_block_matrix_entries(self):
    """`entries()` produces an (i, j, entry) table matching the source ndarray,
    independent of block size."""
    n_rows, n_cols = 5, 3
    rows = [{'i': i, 'j': j, 'entry': float(i + j)}
            for i in range(n_rows) for j in range(n_cols)]
    schema = hl.tstruct(i=hl.tint32, j=hl.tint32, entry=hl.tfloat64)

    # reference table keyed by (i, j) as int64, like entries() output
    table = hl.Table.parallelize([hl.struct(**row) for row in rows], schema)
    table = table.annotate(i=hl.int64(table.i), j=hl.int64(table.j)).key_by('i', 'j')

    ndarray = np.reshape([row['entry'] for row in rows], (n_rows, n_cols))

    for block_size in [1, 2, 1024]:
        block_matrix = BlockMatrix.from_numpy(ndarray, block_size)
        entries_table = block_matrix.entries()
        self.assertEqual(entries_table.count(), n_cols * n_rows)
        self.assertEqual(len(entries_table.row), 3)
        self.assertTrue(table._same(entries_table))
def test_export_rectangles_sparse(self):
    """Rectangles exported from a sparsified matrix read zeros for entries
    whose blocks were dropped but are still covered by an export rectangle."""
    out_dir = new_local_temp_dir()
    out_uri = local_path_uri(out_dir)
    nd = np.arange(1.0, 17.0).reshape(4, 4)
    bm = BlockMatrix.from_numpy(nd, block_size=2)

    sparsify_rects = [[0, 1, 0, 1], [0, 3, 0, 2], [1, 2, 0, 4]]
    # the final export rectangle lies entirely in a dropped block
    export_rects = [[0, 1, 0, 1], [0, 3, 0, 2], [1, 2, 0, 4], [2, 4, 2, 4]]
    bm.sparsify_rectangles(sparsify_rects).export_rectangles(out_uri, export_rects)

    expected = np.array([[1.0, 2.0, 3.0, 4.0],
                         [5.0, 6.0, 7.0, 8.0],
                         [9.0, 10.0, 0.0, 0.0],
                         [13.0, 14.0, 0.0, 0.0]])
    self._assert_rectangles_eq(expected, out_dir, export_rects)
def test_to_from_numpy(self):
    """Round-trip between BlockMatrix and numpy via `to_numpy`/`from_numpy` and
    raw binary files via `tofile`/`fromfile`, for the matrix and its transpose.

    The two file formats are asserted to be interchangeable: a file written by
    BlockMatrix.tofile is readable with np.fromfile and vice versa.
    """
    n_rows = 10
    n_cols = 11
    data = np.random.rand(n_rows * n_cols)

    bm = BlockMatrix._create(n_rows, n_cols, data.tolist(), block_size=4)
    a = data.reshape((n_rows, n_cols))

    with tempfile.NamedTemporaryFile() as bm_f:
        with tempfile.NamedTemporaryFile() as a_f:
            bm.tofile(bm_f.name)
            a.tofile(a_f.name)

            # five equivalent reconstructions of `a`, mixing both readers over
            # both writers and several block sizes
            a1 = bm.to_numpy()
            a2 = BlockMatrix.from_numpy(a, block_size=5).to_numpy()
            a3 = np.fromfile(bm_f.name).reshape((n_rows, n_cols))
            a4 = BlockMatrix.fromfile(a_f.name, n_rows, n_cols, block_size=3).to_numpy()
            a5 = BlockMatrix.fromfile(bm_f.name, n_rows, n_cols).to_numpy()

            self._assert_eq(a1, a)
            self._assert_eq(a2, a)
            self._assert_eq(a3, a)
            self._assert_eq(a4, a)
            self._assert_eq(a5, a)

    # repeat the same matrix/file cross-checks on the transpose
    bmt = bm.T
    at = a.T

    with tempfile.NamedTemporaryFile() as bmt_f:
        with tempfile.NamedTemporaryFile() as at_f:
            bmt.tofile(bmt_f.name)
            at.tofile(at_f.name)

            at1 = bmt.to_numpy()
            at2 = BlockMatrix.from_numpy(at).to_numpy()
            at3 = np.fromfile(bmt_f.name).reshape((n_cols, n_rows))
            at4 = BlockMatrix.fromfile(at_f.name, n_cols, n_rows).to_numpy()
            at5 = BlockMatrix.fromfile(bmt_f.name, n_cols, n_rows).to_numpy()

            self._assert_eq(at1, at)
            self._assert_eq(at2, at)
            self._assert_eq(at3, at)
            self._assert_eq(at4, at)
            self._assert_eq(at5, at)

    # blocking-forced collection path must agree with the default path
    self._assert_eq(bm.to_numpy(_force_blocking=True), a)
def test_from_entry_expr(self):
    """Equivalent entry expressions (raw call, cached field, float cast) yield
    the same matrix, and `write_from_entry_expr` round-trips through `read`."""
    mt = get_dataset()
    mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

    variants = [hl.or_else(mt.GT.n_alt_alleles(), 0),
                mt.x,
                hl.float64(mt.x)]
    a1, a2, a3 = (BlockMatrix.from_entry_expr(expr, block_size=32).to_numpy()
                  for expr in variants)
    self._assert_eq(a1, a2)
    self._assert_eq(a1, a3)

    # writing directly from the entry expression must match the in-memory path
    path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
    a4 = BlockMatrix.read(path).to_numpy()
    self._assert_eq(a1, a4)
def test_rectangles_to_numpy(self):
    """`rectangles_to_numpy` assembles exported rectangles (text and binary)
    back into a dense array, zero-filling uncovered entries."""
    nd = np.arange(1.0, 10.0).reshape(3, 3)
    rects = [[0, 3, 0, 1], [1, 2, 0, 2]]

    txt_dir = new_local_temp_dir()
    BlockMatrix.from_numpy(nd).export_rectangles(local_path_uri(txt_dir), rects)

    bin_dir = new_local_temp_dir()
    BlockMatrix.from_numpy(nd).export_rectangles(local_path_uri(bin_dir), rects, binary=True)

    # rectangles cover column 0 fully and (1, 1); everything else is zero
    expected = np.array([[1.0, 0.0],
                         [4.0, 5.0],
                         [7.0, 0.0]])
    self._assert_eq(expected, BlockMatrix.rectangles_to_numpy(txt_dir))
    self._assert_eq(expected, BlockMatrix.rectangles_to_numpy(bin_dir, binary=True))
def test_linear_mixed_model_fastlmm(self):
    """Validate LinearMixedModel against FastLMM reference results (h^2, betas)
    on synthetic data, for both full-rank and low-rank model forms.
    """
    # FastLMM Test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
    #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
    #
    # Data is filtered to chromosome 1,3 and samples 0-124,375-499 (2000 variants and 250 samples)
    #
    # Results are computed with single_snp (with LOCO) as in:
    #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

    n, m = 250, 1000  # per chromosome

    x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

    # covariate design (intercept + covariate) and phenotype vector
    x = np.array([np.ones(n), mt.key_cols_by()['x'].collect()]).T
    y = np.array(mt.key_cols_by()['y'].collect())

    mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
    mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

    # testing chrom 1 for h2, betas, p-values
    h2_fastlmm = 0.14276125
    beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

    # FastLMM p-values do not agree to high precision because FastLMM regresses
    # out x from each SNP first and does an F(1, dof)-test on (beta / se)^2
    # (t-test), whereas Hail does likelihood ratio test.
    # We verify below that Hail's p-values remain fixed going forward.
    # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
    pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

    gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)

    g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
    g_std = self._filter_and_standardize_cols(g)

    # full rank: kinship from standardized genotypes, eigendecomposed to rotate
    # the model into independent coordinates
    k = (g_std @ g_std.T) * (n / m)
    s, u = np.linalg.eigh(k)
    p = u.T
    model = LinearMixedModel(p @ y, p @ x, s)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)

    h2_std_error = 0.13770773  # hard coded having checked against plot
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    # normalized likelihood over h^2 grid: mode near h2_fastlmm, sums to 1
    h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
    argmax = int(100 * h2_fastlmm)
    assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
    assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

    mt3_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
    a = BlockMatrix.from_entry_expr(mt3_chr3_5var.GT.n_alt_alleles()).to_numpy().T

    # FastLMM standardizes each variant to have mean 0 and variance 1.
    a = self._filter_and_standardize_cols(a) * np.sqrt(n)
    pa = p @ a

    model.fit(log_gamma=np.log(gamma_fastlmm))

    res = model.fit_alternatives_numpy(pa, return_pandas=True)

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    # same alternatives fit through the on-disk BlockMatrix path
    pa_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

    res = model.fit_alternatives(pa_t_path).to_pandas()

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    # low rank: eigendecompose the smaller m x m Gram matrix and keep only
    # eigenvectors with non-negligible eigenvalues
    ld = g_std.T @ g_std
    sl, v = np.linalg.eigh(ld)
    n_eigenvectors = int(np.sum(sl > 1e-10))
    assert n_eigenvectors < n
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n / m)
    p = (g_std @ (v / np.sqrt(sl))).T
    # low-rank constructor also takes the unrotated y and x
    model = LinearMixedModel(p @ y, p @ x, s, y, x)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    model.fit(log_gamma=np.log(gamma_fastlmm))

    pa = p @ a
    res = model.fit_alternatives_numpy(pa, a, return_pandas=True)

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    a_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

    pa_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

    res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    # testing chrom 3 for h2
    h2_fastlmm = 0.36733240

    g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
    g_std = self._filter_and_standardize_cols(g)

    # full rank
    k = (g_std @ g_std.T) * (n / m)
    s, u = np.linalg.eigh(k)
    p = u.T
    model = LinearMixedModel(p @ y, p @ x, s)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)

    h2_std_error = 0.17409641  # hard coded having checked against plot
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
    argmax = int(100 * h2_fastlmm)
    assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
    assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

    # low rank
    l = g_std.T @ g_std
    sl, v = np.linalg.eigh(l)
    n_eigenvectors = int(np.sum(sl > 1e-10))
    assert n_eigenvectors < n
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n / m)
    p = (g_std @ (v / np.sqrt(sl))).T
    model = LinearMixedModel(p @ y, p @ x, s, y, x)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)
    assert np.isclose(model.h_sq_standard_error, h2_std_error)
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+

    Warning
    -------
    :func:`.ld_score` will fail if ``entry_expr`` results in any missing
    values. The special float value ``nan`` is not considered a
    missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----
    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant
        and `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    # all supplied expressions must be rooted in the same MatrixTable
    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr (if specified), and annotation_exprs (if specified) must come from same MatrixTable.""")

    n = mt.count_cols()

    # squared sample correlation, adjusted for finite sample size
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n-1.0) / (n-2.0)) * r2 - (1.0 / (n-2.0))

    # restrict correlations to variants within `radius` of each row variant
    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    # checkpoint the sparsified matrix so downstream reads don't recompute it
    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        # univariate scores only: row sums of the windowed r2 matrix
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        # build a variants x annotations matrix (plus the constant 'univariate'
        # column) and compute partitioned scores as r2_adj_sparse @ annotations
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        # stack the annotation columns into (name, value) rows, then pivot to a
        # MatrixTable with one column per annotation name
        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
               .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    # export the score matrix to TSV and re-import as a Table
    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    # imported columns are f0, f1, ...; rename them to annotation names
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    # join scores back to the variants by row index, then re-key by locus
    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row
                     if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht