def __init__(self, corpus, vocab_embeddings, vocab, num_tables, alpha, log=None, save_path=None,
             show_topics=None, cholesky_decomp=False, num_words_for_formatting=None):
    """
    :param corpus: list of documents, each given as a list of int word IDs into the vocabulary
    :param vocab_embeddings: numpy array of word embeddings, one row per vocabulary word
    :param vocab: the vocabulary itself, used for outputting topics
    :param num_tables: number of tables (topics), K
    :param alpha: Dirichlet hyperparameter
    :param log: logger to use; if not given, a default "GLDA" logger is created
    :param save_path:
    :param show_topics:
    :param cholesky_decomp: if True, maintain a Cholesky lower-triangular decomposition of each table's
        covariance matrix
    :param num_words_for_formatting: By default, each topic is formatted by computing the probability of
        every word in the vocabulary under that topic. This can take a long time for a large vocabulary.
        If given, this limits the number considered to the first N in the vocabulary (which makes sense
        if the vocabulary is ordered with most common words first).
    """
    if log is None:
        log = get_logger("GLDA")
    self.log = log
    # Vocab is used for outputting topics
    self.vocab = vocab
    self.cholesky_decomp = cholesky_decomp
    self.show_topics = show_topics
    self.save_path = save_path

    # Dirichlet hyperparameter
    self.alpha = alpha

    # Data vectors
    self.vocab_embeddings = vocab_embeddings
    self.embedding_size = vocab_embeddings.shape[1]
    # List of lists of ints
    self.corpus = corpus
    # K, num tables
    self.num_tables = num_tables
    # N, num docs
    self.num_documents = len(corpus)
    # Number of customers at each table in the current iteration. Table ids start from 0
    self.table_counts = np.zeros(self.num_tables, dtype=np.int32)
    # K x N array: table_counts_per_doc[i][j] gives how many words of document j are assigned to topic i
    self.table_counts_per_doc = np.zeros((self.num_tables, self.num_documents), dtype=np.int32)
    # Stores the table (topic) assignment of each customer in each iteration
    # table_assignments[i][j] gives the table assignment of customer j of the ith document
    self.table_assignments = []

    # The following parameters are arrays rather than dicts because the K tables are numbered
    # contiguously from 0 to K-1, so they can be indexed directly
    # Mean vector associated with each table in the current iteration.
    # This is the Bayesian mean (i.e. it includes the prior)
    self.table_means = np.zeros((self.num_tables, self.embedding_size), dtype=np.float64)
    # Inverse of the covariance matrix associated with each table in the current iteration.
    # The covariance matrix is scaled by \frac{k_N + 1}{k_N (v_N - D + 1)} before taking the inverse
    # (because the t-distribution takes the scaled matrix as input)
    self.table_inverse_covariances = np.zeros(
        (self.num_tables, self.embedding_size, self.embedding_size), dtype=np.float64)
    # Log-determinant of the covariance matrix for each table.
    # Since 0.5 * logDet is what's needed (see logMultivariateTDensity), that value is kept here
    self.log_determinants = np.zeros(self.num_tables, dtype=np.float64)
    # Stores the sum of the vectors of customers at a given table
    self.sum_table_customers = np.zeros((self.num_tables, self.embedding_size), dtype=np.float64)
    # Stores the squared sum of the vectors of customers at a given table
    self.sum_squared_table_customers = np.zeros(
        (self.num_tables, self.embedding_size, self.embedding_size), dtype=np.float64)

    if self.cholesky_decomp:
        # Cholesky lower-triangular decomposition of the covariance matrix associated with each table
        self.table_cholesky_ltriangular_mat = np.zeros(
            (self.num_tables, self.embedding_size, self.embedding_size), dtype=np.float64)
    else:
        self.table_cholesky_ltriangular_mat = None

    # Normal-inverse-Wishart prior
    self.prior = Wishart(self.vocab_embeddings)

    # Cache k_0\mu_0\mu_0^T: only compute it once
    # Used in calculate_table_params()
    self.k0mu0mu0T = self.prior.kappa * np.outer(self.prior.mu, self.prior.mu)

    self.num_words_for_formatting = num_words_for_formatting

    self.log.info("Initializing assignments")
    self.initialize()
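# A minimal usage sketch for the constructor above (a hedged illustration, not taken from this
# module): the class name `GaussianLDATrainer`, the random embeddings and the toy corpus are all
# assumptions made for the example.
#
#     import numpy as np
#     embeddings = np.random.randn(1000, 50)        # one embedding row per vocabulary word
#     vocab = ["word%d" % i for i in range(1000)]
#     corpus = [[0, 4, 17, 4], [2, 9, 9]]           # documents as lists of word IDs
#     trainer = GaussianLDATrainer(corpus, embeddings, vocab,
#                                  num_tables=10, alpha=0.1, cholesky_decomp=True)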
def __init__(self, corpus, vocab_embeddings, vocab, num_tables, alpha=None, kappa=0.1, log=None,
             save_path=None, show_topics=None, mh_steps=2, num_words_for_formatting=None,
             das_normalization=True, show_progress=True):
    """
    :param corpus: list of documents, each given as a list of int word IDs into the vocabulary
    :param vocab_embeddings: numpy array of word embeddings, one row per vocabulary word
    :param vocab: the vocabulary itself, used for outputting topics
    :param num_tables: number of tables (topics), K
    :param alpha: Dirichlet concentration. Defaults to 1/num_tables
    :param kappa: concentration parameter (kappa_0) of the normal-inverse-Wishart prior
    :param log: logger to use; if not given, a default "GLDA" logger is created
    :param save_path:
    :param show_topics:
    :param mh_steps: number of Metropolis-Hastings sampling steps
    :param num_words_for_formatting: By default, each topic is formatted by computing the probability of
        every word in the vocabulary under that topic. This can take a long time for a large vocabulary.
        If given, this limits the number considered to the first N in the vocabulary (which makes sense
        if the vocabulary is ordered with most common words first).
    :param das_normalization: Use the normalization of probability distributions used by Das, Zaheer and
        Dyer's original implementation when computing the sampling probability that chooses between the
        document posterior and the language-model part of the topic posterior. If False, do not normalize
        in this way, but use an alternative, which looks to me to be more correct mathematically.
    """
    if log is None:
        log = get_logger("GLDA")
    self.log = log
    self.show_progress = show_progress
    # Vocab is used for outputting topics
    self.vocab = vocab
    self.show_topics = show_topics
    self.save_path = save_path

    # MH sampling steps
    self.mh_steps = mh_steps

    # Dirichlet hyperparameter
    if alpha is None:
        alpha = 1. / num_tables
    self.alpha = alpha

    self.das_normalization = das_normalization

    # Data vectors
    self.vocab_embeddings = vocab_embeddings
    self.embedding_size = vocab_embeddings.shape[1]
    self.num_terms = vocab_embeddings.shape[0]
    # List of lists of ints
    self.corpus = corpus
    # K, num tables
    self.num_tables = num_tables
    # N, num docs
    self.num_documents = len(corpus)
    # Number of customers at each table in the current iteration. Table ids start from 0
    # Use shared memory
    self.table_counts = SharedArray.create(self.num_tables, "int")
    # K x N array: table_counts_per_doc[i][j] gives how many words of document j are assigned to topic i
    self.table_counts_per_doc = np.zeros((self.num_tables, self.num_documents), dtype=np.int32)
    # Stores the table (topic) assignment of each customer in each iteration
    # table_assignments[i][j] gives the table assignment of customer j of the ith document
    self.table_assignments = []

    # The following parameters are arrays rather than dicts because the K tables are numbered
    # contiguously from 0 to K-1, so they can be indexed directly
    # Mean vector associated with each table in the current iteration.
    # This is the Bayesian mean (i.e. it includes the prior)
    # Use shared memory
    self.table_means = SharedArray.create((self.num_tables, self.embedding_size), "float")
    # Log-determinant of the covariance matrix for each table.
    # Since 0.5 * logDet is what's needed (see logMultivariateTDensity), that value is kept here
    # Use shared memory
    self.log_determinants = SharedArray.create(self.num_tables, "float")
    # Stores the squared sum of the vectors of customers at a given table
    self.sum_squared_table_customers = np.zeros(
        (self.num_tables, self.embedding_size, self.embedding_size), dtype=np.float64)

    # Cholesky lower-triangular decomposition of the covariance matrix associated with each table
    # Use shared memory
    self.table_cholesky_ltriangular_mat = SharedArray.create(
        (self.num_tables, self.embedding_size, self.embedding_size), "float")

    # Normal-inverse-Wishart prior
    self.prior = Wishart(self.vocab_embeddings, kappa=kappa)

    # Cache k_0\mu_0\mu_0^T: only compute it once
    # Used in calculate_table_params()
    self.k0mu0mu0T = self.prior.kappa * np.outer(self.prior.mu, self.prior.mu)

    self.num_words_for_formatting = num_words_for_formatting

    self.aliases = VoseAliases.create(self.num_terms, self.num_tables)

    self.log.info("Initializing assignments")
    self.initialize()
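# A minimal usage sketch for the alias-sampler constructor above (a hedged illustration; the class
# name `GaussianLDAAliasTrainer` and the toy data are assumptions made for the example). alpha may
# be omitted, in which case it defaults to 1/num_tables; kappa sets the normal-inverse-Wishart
# prior concentration.
#
#     import numpy as np
#     embeddings = np.random.randn(1000, 50)        # one embedding row per vocabulary word
#     vocab = ["word%d" % i for i in range(1000)]
#     corpus = [[0, 4, 17, 4], [2, 9, 9]]           # documents as lists of word IDs
#     trainer = GaussianLDAAliasTrainer(corpus, embeddings, vocab,
#                                       num_tables=10, kappa=0.1, mh_steps=2)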