def _loop_until_convergence(
    model: ARTM,
    start_iter: int,
    logger: logging.Logger,
    batch_vectorizer: BatchVectorizer,
    converge_thresh: float,
    max_iter: int,
    doctopic_eps: float,
    quiet: bool,
) -> Tuple[ARTM, float, int]:
    converged = False
    num_iter = 0
    prev_score = np.inf
    while not converged and num_iter < max_iter:
        num_iter += 1
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1,
                          reset_nwt=False)
        scores = compute_scores(model, batch_vectorizer, doctopic_eps)
        score = scores["distinctness"]
        converged = abs(score - prev_score) / prev_score < converge_thresh
        if not quiet and not ((start_iter + num_iter) % 25):
            print_scores(logger, scores, "\tIteration %d" % (start_iter + num_iter))
        prev_score = score
    return model, score, start_iter + num_iter
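# A minimal, self-contained sketch of the relative-change stopping rule used in
# _loop_until_convergence, run on a hypothetical score sequence (the scores and
# threshold below are made up for illustration; no ARTM calls are involved).
def _demo_relative_convergence(scores: List[float],
                               converge_thresh: float = 1e-2) -> int:
    prev_score = float("inf")
    for num_iter, score in enumerate(scores, start=1):
        # inf / inf is nan and nan < thresh is False, so the first pass never
        # triggers convergence, mirroring the loop above.
        if abs(score - prev_score) / prev_score < converge_thresh:
            return num_iter
        prev_score = score
    return len(scores)

# _demo_relative_convergence([10.0, 12.0, 12.05]) returns 3: the last relative
# change, |12.05 - 12| / 12 ~= 0.004, falls below the 1e-2 threshold.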
def _save_model(
    model: ARTM,
    logger: logging.Logger,
    batch_vectorizer: BatchVectorizer,
    num_docs: int,
    batch_vectorizer_train: BatchVectorizer,
    num_docs_train: int,
    doctopic_output_path: str,
    wordtopic_output_path: str,
    consolidate: bool,
) -> None:
    if consolidate:
        check_doctopic(
            logger,
            model.transform_sparse(batch_vectorizer_train)[0].todense().T,
            num_docs_train,
        )
    doctopic = model.transform_sparse(batch_vectorizer)[0].todense().T
    if check_doctopic(logger, doctopic, num_docs):
        if os.path.exists(doctopic_output_path):
            logger.info("Removing previous document-topic matrix ...")
            os.remove(doctopic_output_path)
        if os.path.exists(wordtopic_output_path):
            logger.info("Removing previous word-topic matrix ...")
            os.remove(wordtopic_output_path)
        logger.info("Saving topics per document ...")
        np.save(doctopic_output_path, doctopic)
        logger.info("Saved topics per document in '%s'.", doctopic_output_path)
        logger.info("Saving word/topic distribution ...")
        np.save(wordtopic_output_path, model.get_phi_dense()[0].T)
        logger.info("Saved word/topic distribution in '%s'.\n", wordtopic_output_path)
    else:
        logger.info("Document-topic matrix is corrupted, no saving.\n")
def load_topic_model(topic_model: artm.ARTM, file_name: str) -> Union[artm.ARTM, None]:
    if (not os.path.isfile(file_name + '.p_wt')
            or not os.path.isfile(file_name + '.n_wt')):
        return None
    topic_model.load(file_name + '.p_wt', 'p_wt')
    topic_model.load(file_name + '.n_wt', 'n_wt')
    return topic_model
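# Usage sketch for load_topic_model (the topic count and path are hypothetical):
# build a fresh ARTM shell with the same configuration the saved model had,
# then try to restore its p_wt / n_wt matrices, falling back to None.
def _demo_restore_model(num_topics: int, file_name: str) -> Union[artm.ARTM, None]:
    fresh_model = artm.ARTM(num_topics=num_topics)
    return load_topic_model(fresh_model, file_name)  # None if files are missing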
def compute_scores(model: ARTM, batch_vectorizer: BatchVectorizer,
                   doctopic_eps: float) -> Dict[str, float]:
    scores: Dict[str, float] = {}
    scores["topic sparsity"] = model.score_tracker["topic sparsity"].last_value
    wordtopic, words, topics = model.get_phi_dense()
    doctopic = model.transform_sparse(batch_vectorizer)[0].todense().T
    # Fraction of document-topic entries that fall below the epsilon cutoff.
    scores["doc sparsity"] = np.sum(doctopic < doctopic_eps) / (
        doctopic.shape[0] * doctopic.shape[1])
    # Mean pairwise distinctness of each topic against the other topics.
    scores["distinctness"] = np.mean(
        np.sum(compute_distinctness(wordtopic.T, len(topics), len(words)), axis=1)
        / (len(topics) - 1))
    return scores
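# Toy check of the "doc sparsity" formula from compute_scores: the fraction of
# document-topic entries below the epsilon cutoff. The matrix and epsilon are
# made up for illustration; only numpy is exercised, no model is needed.
def _demo_doc_sparsity() -> float:
    toy_doctopic = np.array([[0.90, 0.05, 0.05],
                             [0.00, 1.00, 0.00]])
    eps = 0.01
    # Two of the six entries are below eps, so the returned sparsity is 1/3.
    return np.sum(toy_doctopic < eps) / (toy_doctopic.shape[0] * toy_doctopic.shape[1])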
def _safe_copy_phi(model: artm.ARTM, phi: pd.DataFrame, dataset: Dataset,
                   small_num_fit_iterations: int = 3) -> np.ndarray:
    if small_num_fit_iterations == 0:
        phi_ref = _copy_phi(model, phi)
        return phi_ref
    phi_ref = None
    # TODO: small_num_fit_iterations bigger than 1 seems not working for big matrices
    for _ in range(small_num_fit_iterations):
        phi_ref = _copy_phi(model, phi, phi_ref=phi_ref)
        model.fit_offline(dataset.get_batch_vectorizer(), 1)
    return phi_ref
def select_keywords_from_topic_model(self, topic_model: artm.ARTM) -> List[str]:
    # phi rows are vocabulary entries, columns are topics, cells are p(word | topic).
    phi = topic_model.get_phi()
    set_of_keywords = set()
    for topic_name in phi.columns:
        for word, probability in phi[topic_name].items():
            if probability >= self.probability_threshold:
                set_of_keywords.add(word.replace('_', ' '))
    return sorted(set_of_keywords)
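# Illustrative run of the keyword selection on a tiny hand-made phi matrix.
# The words, probabilities, and the 0.3 threshold are all assumptions chosen
# for the example; a real phi comes from topic_model.get_phi().
def _demo_select_keywords() -> List[str]:
    phi = pd.DataFrame(
        {"topic_0": [0.7, 0.2, 0.1], "topic_1": [0.1, 0.5, 0.4]},
        index=["neural_network", "parse_tree", "tokenizer"],
    )
    threshold = 0.3
    keywords = {
        word.replace("_", " ")
        for topic_name in phi.columns
        for word, probability in phi[topic_name].items()
        if probability >= threshold
    }
    return sorted(keywords)  # ['neural network', 'parse tree', 'tokenizer']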
def train_artm(
    bow_name: str,
    exp_name: str,
    force: bool,
    batch_size: int,
    max_topic: int,
    converge_thresh: float,
    max_iter: int,
    sparse_word_coeff: float,
    sparse_doc_coeff: float,
    decor_coeff: float,
    select_coeff: float,
    doctopic_eps: float,
    wordtopic_eps: float,
    min_prob: float,
    min_docs_abs: Optional[int],
    min_docs_rel: Optional[float],
    quiet: bool,
    consolidate: bool,
    log_level: str,
) -> None:
    """Train ARTM model from the input BoW."""
    logger = create_logger(log_level, __name__)
    input_dir = os.path.join(BOW_DIR, bow_name)
    check_file_exists(os.path.join(input_dir, VOCAB_FILENAME))
    docword_input_path = os.path.join(input_dir, DOCWORD_FILENAME)
    check_file_exists(docword_input_path)
    if consolidate:
        check_file_exists(os.path.join(input_dir, VOCAB_CONCAT_FILENAME))
        docword_concat_input_path = os.path.join(input_dir, DOCWORD_CONCAT_FILENAME)
        check_file_exists(docword_concat_input_path)
    output_dir = os.path.join(TOPICS_DIR, bow_name, exp_name)
    doctopic_output_path = os.path.join(output_dir, DOCTOPIC_FILENAME)
    check_remove(doctopic_output_path, logger, force)
    wordtopic_output_path = os.path.join(output_dir, WORDTOPIC_FILENAME)
    check_remove(wordtopic_output_path, logger, force)
    create_directory(output_dir, logger)

    logger.info("Creating batch vectorizer from bags of words ...")
    batch_vectorizer, num_docs = create_artm_batch_vectorizer(
        "bow_tm", input_dir, batch_size, docword_input_path, logger)
    if consolidate:
        logger.info("Creating batch vectorizer from consolidated bags of words ...")
        batch_vectorizer_train, num_docs_train = create_artm_batch_vectorizer(
            "bow_concat_tm", input_dir, batch_size, docword_concat_input_path, logger)
    else:
        batch_vectorizer_train, num_docs_train = batch_vectorizer, num_docs

    if min_docs_rel is None:
        min_docs = min_docs_abs
    else:
        check_range(min_docs_rel, "min-docs-rel")
        min_docs = int(num_docs_train * min_docs_rel)

    model_artm = ARTM(
        cache_theta=True,
        reuse_theta=True,
        theta_name="theta",
        dictionary=batch_vectorizer_train.dictionary,
        num_document_passes=1,
        num_topics=max_topic,
        scores=[SparsityPhiScore(name="topic sparsity", eps=wordtopic_eps)],
        regularizers=[
            SmoothSparsePhiRegularizer(name="Sparse Topic", tau=0),
            SmoothSparseThetaRegularizer(name="Sparse Doc", tau=0),
            DecorrelatorPhiRegularizer(name="Decorrelator", tau=decor_coeff),
            TopicSelectionThetaRegularizer(name="Selector", tau=0),
        ],
    )
    num_iter = 0

    def loop_until_convergence(model: ARTM, n_iter: int) -> Tuple[ARTM, float, int]:
        return _loop_until_convergence(
            model,
            n_iter,
            logger,
            batch_vectorizer_train,
            converge_thresh,
            max_iter,
            doctopic_eps,
            quiet,
        )

    def save_model(model: ARTM) -> None:
        _save_model(
            model,
            logger,
            batch_vectorizer,
            num_docs,
            batch_vectorizer_train,
            num_docs_train,
            doctopic_output_path,
            wordtopic_output_path,
            consolidate,
        )

    logger.info("Starting training ...")
    logger.info("Decorrelating topics ...")
    model_artm, _, num_iter = loop_until_convergence(model_artm, num_iter)
    check_doctopic(
        logger,
        model_artm.transform_sparse(batch_vectorizer_train)[0].todense().T,
        num_docs_train,
    )
    logger.info("Finished first phase at iteration %d", num_iter)
    print_scores(logger,
                 compute_scores(model_artm, batch_vectorizer, doctopic_eps),
                 "Scores:")

    logger.info("Applying selection regularization on topics ...")
    model_artm.regularizers["Sparse Topic"].tau = 0
    model_artm.regularizers["Sparse Doc"].tau = 0
    model_artm.regularizers["Decorrelator"].tau = 0
    model_artm.regularizers["Selector"].tau = select_coeff
    model_artm, score_1, num_iter = loop_until_convergence(model_artm, num_iter)
    logger.info("Finished second phase at iteration %d", num_iter)
    print_scores(
        logger,
        compute_scores(model_artm, batch_vectorizer, doctopic_eps),
        "Scores before topic removal:",
    )

    logger.info(
        "Removing topics with fewer than %d documents with probability over %.2f.",
        min_docs,
        min_prob,
    )
    doctopic = model_artm.transform_sparse(batch_vectorizer_train)[0].todense().T
    valid_topics = np.sum(doctopic > min_prob, axis=0) > min_docs
    topic_names = [
        topic_name
        for topic_ind, topic_name in enumerate(model_artm.topic_names)
        if valid_topics[0, topic_ind]
    ]
    model_artm.reshape_topics(topic_names)
    if topic_names:
        logger.info("New number of topics: %d", len(topic_names))
    else:
        raise RuntimeError(
            "Removed all topics, please soften your selection criteria (aborting).")
    print_scores(
        logger,
        compute_scores(model_artm, batch_vectorizer, doctopic_eps),
        "Scores after topic removal:",
    )
    save_model(model_artm)

    logger.info("Inducing sparsity ...")
    model_artm.regularizers["Selector"].tau = 0
    model_artm.regularizers["Decorrelator"].tau = decor_coeff
    model_artm.regularizers["Sparse Topic"].tau = -sparse_word_coeff
    model_artm.regularizers["Sparse Doc"].tau = -sparse_doc_coeff
    model_artm, score_2, num_iter = loop_until_convergence(model_artm, num_iter)
    logger.info("Finished last phase of training at iteration %d", num_iter)
    print_scores(logger,
                 compute_scores(model_artm, batch_vectorizer, doctopic_eps),
                 "Scores:")
    if score_1 < score_2:
        save_model(model_artm)
    else:
        logger.info("Sparsity worsened the model, no saving.")
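# Hypothetical invocation of the training pipeline above: every value is an
# assumption chosen for illustration, not a recommended configuration, and the
# BoW / experiment names must match directories produced by earlier stages.
def _demo_train_artm() -> None:
    train_artm(
        bow_name="bow", exp_name="exp1", force=False, batch_size=1000,
        max_topic=200, converge_thresh=1e-3, max_iter=500,
        sparse_word_coeff=1.0, sparse_doc_coeff=1.0, decor_coeff=1e5,
        select_coeff=0.25, doctopic_eps=1e-4, wordtopic_eps=1e-4,
        min_prob=0.5, min_docs_abs=10, min_docs_rel=None,
        quiet=False, consolidate=False, log_level="INFO",
    )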
def save_topic_model(topic_model: artm.ARTM, file_name: str) -> None:
    topic_model.save(file_name + '.p_wt', 'p_wt')
    topic_model.save(file_name + '.n_wt', 'n_wt')
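# Round-trip sketch pairing save_topic_model with load_topic_model (the path is
# hypothetical): persist a trained model's matrices, then restore them into a
# fresh instance configured with the same number of topics.
def _demo_model_roundtrip(trained_model: artm.ARTM,
                          num_topics: int) -> Union[artm.ARTM, None]:
    file_name = "/tmp/topic_model"  # hypothetical location
    save_topic_model(trained_model, file_name)
    restored_model = artm.ARTM(num_topics=num_topics)
    return load_topic_model(restored_model, file_name)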