def plsa_fit(
    X,
    k,
    n_row_blocks=8,
    n_col_blocks=8,
    init="random",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Fit a pLSA model to ``X`` with ``k`` topics using a blockwise EM solver.

    The data matrix is padded and partitioned into an
    ``n_row_blocks`` x ``n_col_blocks`` grid of sparse blocks, which are handed
    to ``plsa_fit_inner_blockwise`` for the actual EM iterations.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.
    k: int
        The number of topics for pLSA to fit with.
    n_row_blocks: int (optional, default=8)
        Number of blocks to split the rows (documents) into.
    n_col_blocks: int (optional, default=8)
        Number of blocks to split the columns (words) into.
    init: string or tuple (optional, default="random")
        The initialization method passed to ``plsa_init``.
    n_iter: int
        The maximum number of EM iterations to perform.
    n_iter_per_test: int
        The number of iterations between log-likelihood convergence tests.
    tolerance: float
        The relative log-likelihood improvement threshold for early stopping.
    e_step_thresh: float (optional, default=1e-32)
        If P(w|z)P(z|d) in the E step falls below this, write zero for P(z|w,d).
    random_state: int, RandomState instance or None (optional, default: None)
        Seed or generator used for initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words)
        The resulting model values of P(z|d) and P(w|z)
    """
    rng = check_random_state(random_state)
    p_z_given_d_init, p_w_given_z_init = plsa_init(X, k, init=init, rng=rng)

    A = X.tocsr().astype(np.float32)
    n = A.shape[0]
    m = A.shape[1]

    # uint32 rather than uint16: uint16 silently wraps for block sizes above
    # 65535 (e.g. more than ~524k rows with the default 8 row blocks).
    block_row_size = np.uint32(np.ceil(A.shape[0] / n_row_blocks))
    block_col_size = np.uint32(np.ceil(A.shape[1] / n_col_blocks))

    # Zero-pad P(z|d) to a multiple of the block size, then view it as one
    # (block_row_size, k) slab per row block.
    p_z_given_d = np.zeros((block_row_size * n_row_blocks, k), dtype=np.float32)
    p_z_given_d[: p_z_given_d_init.shape[0]] = p_z_given_d_init
    p_z_given_d = p_z_given_d.reshape(n_row_blocks, block_row_size, k)

    # Zero-pad P(w|z) columns, then split into n_col_blocks slabs of shape
    # (k, block_col_size). The split count was previously hard-coded to 10,
    # which fails (or mis-blocks) for any other n_col_blocks value.
    p_w_given_z = np.zeros((k, block_col_size * n_col_blocks), dtype=np.float32)
    p_w_given_z[:, : p_w_given_z_init.shape[1]] = p_w_given_z_init
    p_w_given_z = np.stack(np.hsplit(p_w_given_z, n_col_blocks))

    # Extract each sparse sub-block in COO form and track the largest nnz so
    # all blocks can be stored in one dense ndarray (padded with -1 / 0).
    A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)]
    max_nnz_per_block = 0
    for i in range(n_row_blocks):
        row_start = block_row_size * i
        row_end = min(row_start + block_row_size, n)
        for j in range(n_col_blocks):
            col_start = block_col_size * j
            col_end = min(col_start + block_col_size, m)
            A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo()
            if A_blocks[i][j].nnz > max_nnz_per_block:
                max_nnz_per_block = A_blocks[i][j].nnz

    block_rows_ndarray = np.full(
        (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32
    )
    block_cols_ndarray = np.full(
        (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32
    )
    block_vals_ndarray = np.zeros(
        (n_row_blocks, n_col_blocks, max_nnz_per_block), dtype=np.float32
    )
    for i in range(n_row_blocks):
        for j in range(n_col_blocks):
            nnz = A_blocks[i][j].nnz
            block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row
            block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col
            block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data

    p_z_given_d, p_w_given_z = plsa_fit_inner_blockwise(
        block_rows_ndarray,
        block_cols_ndarray,
        block_vals_ndarray,
        p_w_given_z,
        p_z_given_d,
        block_row_size,
        block_col_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
    )

    # Re-assemble the blocked factors and trim off the zero padding.
    p_z_given_d = np.vstack(p_z_given_d)[:n, :]
    p_w_given_z = np.hstack(p_w_given_z)[:, :m]

    return p_z_given_d, p_w_given_z
def plsa_fit(
    data,
    k,
    n_row_blocks=8,
    n_col_blocks=8,
    init="random",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Fit a pLSA model to ``data`` with ``k`` topics using CUDA kernels.

    The data matrix is padded and partitioned into an
    ``n_row_blocks`` x ``n_col_blocks`` grid of sparse blocks which are
    transferred to the GPU; EM iterations run via the ``plsa_e_step``,
    ``plsa_partial_m_step`` and ``normalize_m_step_*`` kernels. Convergence is
    tested on the host every ``n_iter_per_test`` iterations.

    Parameters
    ----------
    data: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.
    k: int
        The number of topics for pLSA to fit with.
    n_row_blocks: int (optional, default=8)
        Number of blocks to split the rows (documents) into.
    n_col_blocks: int (optional, default=8)
        Number of blocks to split the columns (words) into.
    init: string or tuple (optional, default="random")
        The initialization method passed to ``plsa_init``.
    n_iter: int
        The maximum number of EM iterations to perform.
    n_iter_per_test: int
        The number of iterations between log-likelihood convergence tests.
    tolerance: float
        The relative log-likelihood improvement threshold for early stopping.
    e_step_thresh: float (optional, default=1e-32)
        If P(w|z)P(z|d) in the E step falls below this, write zero for P(z|w,d).
    random_state: int, RandomState instance or None (optional, default: None)
        Seed or generator used for initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words)
        The resulting model values of P(z|d) and P(w|z)
    """
    rng = check_random_state(random_state)
    p_z_given_d_init, p_w_given_z_init = plsa_init(data, k, init=init, rng=rng)

    A = data.tocsr().astype(np.float32)
    n = A.shape[0]
    m = A.shape[1]

    # uint32 rather than uint16: uint16 silently wraps for block sizes above
    # 65535 (e.g. more than ~524k rows with the default 8 row blocks).
    block_row_size = np.uint32(np.ceil(A.shape[0] / n_row_blocks))
    block_col_size = np.uint32(np.ceil(A.shape[1] / n_col_blocks))

    # Zero-pad P(z|d) to a multiple of the block size, one slab per row block.
    p_z_given_d = np.zeros((block_row_size * n_row_blocks, k), dtype=np.float32)
    p_z_given_d[: p_z_given_d_init.shape[0]] = p_z_given_d_init
    p_z_given_d = p_z_given_d.reshape(n_row_blocks, block_row_size, k)

    # Zero-pad P(w|z) columns and reshape to (n_col_blocks, k, block_col_size).
    p_w_given_z = np.zeros((k, block_col_size * n_col_blocks), dtype=np.float32)
    p_w_given_z[:, : p_w_given_z_init.shape[1]] = p_w_given_z_init
    p_w_given_z = np.transpose(
        p_w_given_z.T.reshape(n_col_blocks, block_col_size, k), axes=[0, 2, 1]
    ).astype(np.float32, order="C")

    # Extract each sparse sub-block in COO form and track the largest nnz so
    # all blocks can be stored in one dense ndarray (padded with -1 / 0).
    A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)]
    max_nnz_per_block = 0
    for i in range(n_row_blocks):
        row_start = block_row_size * i
        row_end = min(row_start + block_row_size, n)
        for j in range(n_col_blocks):
            col_start = block_col_size * j
            col_end = min(col_start + block_col_size, m)
            A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo()
            if A_blocks[i][j].nnz > max_nnz_per_block:
                max_nnz_per_block = A_blocks[i][j].nnz

    block_rows_ndarray = np.full(
        (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32
    )
    block_cols_ndarray = np.full(
        (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32
    )
    block_vals_ndarray = np.zeros(
        (n_row_blocks, n_col_blocks, max_nnz_per_block), dtype=np.float32
    )
    for i in range(n_row_blocks):
        for j in range(n_col_blocks):
            nnz = A_blocks[i][j].nnz
            block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row
            block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col
            block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data

    n_d_blocks = block_rows_ndarray.shape[0]
    n_w_blocks = block_rows_ndarray.shape[1]
    block_size = block_rows_ndarray.shape[2]

    # Host-side scratch arrays mirrored on the device for the M-step partials.
    p_z_given_wd_block = np.zeros(
        (n_d_blocks, n_w_blocks, block_size, k), dtype=np.float32
    )
    blocked_next_p_w_given_z = np.zeros(
        (
            np.int64(n_d_blocks),
            np.int64(n_w_blocks),
            np.int64(k),
            np.int64(block_col_size),
        ),
        dtype=np.float32,
    )
    blocked_next_p_z_given_d = np.zeros(
        (
            np.int64(n_w_blocks),
            np.int64(n_d_blocks),
            np.int64(block_row_size),
            np.int64(k),
        ),
        dtype=np.float32,
    )
    norms_pwz = np.zeros((n_d_blocks, n_w_blocks, k), dtype=np.float64)

    previous_log_likelihood = log_likelihood_by_blocks(
        block_rows_ndarray,
        block_cols_ndarray,
        block_vals_ndarray,
        p_w_given_z,
        p_z_given_d,
    )

    d_block_rows_ndarray = cuda.to_device(block_rows_ndarray)
    d_block_cols_ndarray = cuda.to_device(block_cols_ndarray)
    d_block_vals_ndarray = cuda.to_device(block_vals_ndarray)
    d_blocked_next_p_w_given_z = cuda.to_device(blocked_next_p_w_given_z)
    d_blocked_next_p_z_given_d = cuda.to_device(blocked_next_p_z_given_d)
    d_p_z_given_wd_block = cuda.to_device(p_z_given_wd_block)
    d_p_w_given_z = cuda.to_device(p_w_given_z)
    d_p_z_given_d = cuda.to_device(p_z_given_d)
    d_norms_pwz = cuda.to_device(norms_pwz)

    converged = False
    for i in range(n_iter // n_iter_per_test):
        for j in range(n_iter_per_test):
            plsa_e_step[(n_d_blocks, n_w_blocks), 256](
                d_block_rows_ndarray,
                d_block_cols_ndarray,
                d_p_w_given_z,
                d_p_z_given_d,
                d_p_z_given_wd_block,
                e_step_thresh,
            )
            cuda.synchronize()
            plsa_partial_m_step[(n_d_blocks, n_w_blocks), k](
                d_block_rows_ndarray,
                d_block_cols_ndarray,
                d_block_vals_ndarray,
                d_p_w_given_z,
                d_p_z_given_d,
                d_blocked_next_p_w_given_z,
                d_blocked_next_p_z_given_d,
                d_p_z_given_wd_block,
                d_norms_pwz,
            )
            cuda.synchronize()
            normalize_m_step_p_z_given_d[n_d_blocks, 256](
                d_blocked_next_p_z_given_d, d_p_z_given_d
            )
            normalize_m_step_p_w_given_z[n_w_blocks, 256](
                d_blocked_next_p_w_given_z, d_p_w_given_z, d_norms_pwz
            )
            cuda.synchronize()

        p_z_given_d = d_p_z_given_d.copy_to_host()
        p_w_given_z = d_p_w_given_z.copy_to_host()
        current_log_likelihood = log_likelihood_by_blocks(
            block_rows_ndarray,
            block_cols_ndarray,
            block_vals_ndarray,
            p_w_given_z,
            p_z_given_d,
        )
        change = np.abs(current_log_likelihood - previous_log_likelihood)
        if change / np.abs(current_log_likelihood) < tolerance:
            converged = True
            break
        else:
            previous_log_likelihood = current_log_likelihood

    # Run the leftover n_iter % n_iter_per_test iterations, but only if we did
    # not already stop early: previously these always executed after a
    # convergence break, wasting GPU work past the requested tolerance.
    if not converged:
        for i in range(n_iter % n_iter_per_test):
            plsa_e_step[(n_d_blocks, n_w_blocks), 256](
                d_block_rows_ndarray,
                d_block_cols_ndarray,
                d_p_w_given_z,
                d_p_z_given_d,
                d_p_z_given_wd_block,
                e_step_thresh,
            )
            cuda.synchronize()
            plsa_partial_m_step[(n_d_blocks, n_w_blocks), k](
                d_block_rows_ndarray,
                d_block_cols_ndarray,
                d_block_vals_ndarray,
                d_p_w_given_z,
                d_p_z_given_d,
                d_blocked_next_p_w_given_z,
                d_blocked_next_p_z_given_d,
                d_p_z_given_wd_block,
                d_norms_pwz,
            )
            cuda.synchronize()
            normalize_m_step_p_z_given_d[n_d_blocks, 256](
                d_blocked_next_p_z_given_d, d_p_z_given_d
            )
            normalize_m_step_p_w_given_z[n_w_blocks, 256](
                d_blocked_next_p_w_given_z, d_p_w_given_z, d_norms_pwz
            )
            cuda.synchronize()

    p_z_given_d = d_p_z_given_d.copy_to_host()
    p_w_given_z = d_p_w_given_z.copy_to_host()

    # Re-assemble the blocked factors and trim off the zero padding.
    p_z_given_d = np.vstack(p_z_given_d)[:n, :]
    p_w_given_z = np.hstack(p_w_given_z)[:, :m]

    return p_z_given_d, p_w_given_z
def plsa_fit(
    X,
    k,
    sample_weight,
    init="random",
    block_size=65536,
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Fit a ``k``-topic pLSA model to the sparse matrix ``X``.

    After initializing P(z|d) and P(w|z) via ``plsa_init``, runs EM through
    ``plsa_fit_inner_blockwise`` for at most ``n_iter`` iterations, testing
    relative log-likelihood improvement every ``n_iter_per_test`` iterations
    and stopping early once the improvement drops below ``tolerance``.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.
    k: int
        The number of topics for pLSA to fit with.
    sample_weight: array of shape (n_docs,)
        Input document weights.
    init: string or tuple (optional, default="random")
        The intialization method to use. This should be one of:
        * ``"random"``
        * ``"nndsvd"``
        * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and
        (n_topics, n_words).
    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger
        this value the faster the compute may go, but at higher memory cost.
    n_iter: int
        The maximum number iterations of EM to perform
    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.
    tolerance: float
        The threshold of relative improvement in log-likelihood required to
        continue iterations.
    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step
        falls below threshold then write a zero for P(z|w,d).
    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in in initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words)
        The resulting model values of P(z|d) and P(w|z)
    """
    rng = check_random_state(random_state)

    init_p_z_given_d, init_p_w_given_z = plsa_init(X, k, init=init, rng=rng)
    p_z_given_d = init_p_z_given_d.astype(np.float32, order="C")
    p_w_given_z = init_p_w_given_z.astype(np.float32, order="C")

    # Only engage per-entry weighting when some document weight differs from 1.
    use_sample_weights = np.any(sample_weight != 1.0)

    coo = X.tocoo().astype(np.float32)

    p_z_given_d, p_w_given_z = plsa_fit_inner_blockwise(
        coo.row,
        coo.col,
        coo.data,
        p_w_given_z,
        p_z_given_d,
        sample_weight,
        block_size=block_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
        use_sample_weights=use_sample_weights,
    )

    return p_z_given_d, p_w_given_z
def plsa_fit(
    X,
    k,
    n_row_blocks=8,
    n_col_blocks=8,
    init="random",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Fit a ``k``-topic pLSA model to ``X`` via the dask blockwise solver.

    The sparse matrix is carved into an ``n_row_blocks`` x ``n_col_blocks``
    grid of COO blocks, padded into dense dask arrays (one chunk per block),
    and handed to ``plsa_fit_inner_dask`` for the EM iterations.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.
    k: int
        The number of topics for pLSA to fit with.
    n_row_blocks: int (optional, default=8)
        Number of blocks to split the rows (documents) into.
    n_col_blocks: int (optional, default=8)
        Number of blocks to split the columns (words) into.
    init: string or tuple (optional, default="random")
        The initialization method passed to ``plsa_init``.
    n_iter: int
        The maximum number of EM iterations to perform.
    n_iter_per_test: int
        The number of iterations between log-likelihood convergence tests.
    tolerance: float
        The relative log-likelihood improvement threshold for early stopping.
    e_step_thresh: float (optional, default=1e-32)
        If P(w|z)P(z|d) in the E step falls below this, write zero for P(z|w,d).
    random_state: int, RandomState instance or None (optional, default: None)
        Seed or generator used for initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z:
        The resulting model values of P(z|d) and P(w|z)
    """
    rng = check_random_state(random_state)

    p_z_given_d, p_w_given_z = plsa_init(X, k, init=init, rng=rng)
    p_z_given_d = p_z_given_d.astype(np.float32, order="C")
    p_w_given_z = p_w_given_z.astype(np.float32, order="C")

    csr = X.tocsr().astype(np.float32)
    n = csr.shape[0]
    m = csr.shape[1]

    row_chunk = np.uint32(np.ceil(csr.shape[0] / n_row_blocks))
    col_chunk = np.uint32(np.ceil(csr.shape[1] / n_col_blocks))

    # Carve X into a grid of COO blocks, remembering the largest nnz so every
    # block fits into one common padded dense layout.
    coo_grid = [[0] * n_col_blocks for _ in range(n_row_blocks)]
    widest_nnz = 0
    for bi in range(n_row_blocks):
        r_lo = row_chunk * bi
        r_hi = min(r_lo + row_chunk, n)
        for bj in range(n_col_blocks):
            c_lo = col_chunk * bj
            c_hi = min(c_lo + col_chunk, m)
            coo_grid[bi][bj] = csr[r_lo:r_hi, c_lo:c_hi].tocoo()
            widest_nnz = max(widest_nnz, coo_grid[bi][bj].nnz)
    del csr

    # Dense padded storage: unused row/col slots hold -1, unused values hold 0.
    padded_shape = (n_row_blocks, n_col_blocks, widest_nnz)
    block_rows_ndarray = np.full(padded_shape, -1, dtype=np.int32)
    block_cols_ndarray = np.full(padded_shape, -1, dtype=np.int32)
    block_vals_ndarray = np.zeros(padded_shape, dtype=np.float32)
    for bi, grid_row in enumerate(coo_grid):
        for bj, blk in enumerate(grid_row):
            block_rows_ndarray[bi, bj, : blk.nnz] = blk.row
            block_cols_ndarray[bi, bj, : blk.nnz] = blk.col
            block_vals_ndarray[bi, bj, : blk.nnz] = blk.data
    del coo_grid

    # One dask chunk per sparse block.
    chunk_spec = (1, 1, widest_nnz)
    block_rows_ndarray = da.from_array(block_rows_ndarray, chunks=chunk_spec)
    block_cols_ndarray = da.from_array(block_cols_ndarray, chunks=chunk_spec)
    block_vals_ndarray = da.from_array(block_vals_ndarray, chunks=chunk_spec)

    p_z_given_d, p_w_given_z = plsa_fit_inner_dask(
        block_rows_ndarray,
        block_cols_ndarray,
        block_vals_ndarray,
        p_w_given_z,
        p_z_given_d,
        row_chunk,
        col_chunk,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
    )

    return p_z_given_d, p_w_given_z