def test_infer_topics_data(method):
    """Smoke-test `tm.predict_topics` output for the given training method.

    Skips when gensim is unavailable; otherwise trains a small model via
    `create_model_data` and checks the shape/content of the inferred data.
    """
    pytest.importorskip("gensim")

    probability_threshold: float = 0.001
    token_count: int = 5

    train_corpus, inferred_model = create_model_data(method)

    topics_data: tm.InferredTopicsData = tm.predict_topics(
        topic_model=inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        minimum_probability=probability_threshold,
        n_tokens=token_count,
    )

    assert topics_data is not None

    # Every payload attribute must be a pandas DataFrame.
    for frame in (
        topics_data.document_index,
        topics_data.dictionary,
        topics_data.topic_token_weights,
        topics_data.topic_token_overview,
        topics_data.document_topic_weights,
    ):
        assert isinstance(frame, pd.DataFrame)

    expected_topic_ids = {0, 1, 2, 3}
    assert topics_data.year_period == (2019, 2020)
    assert set(topics_data.topic_ids) == expected_topic_ids
    assert len(topics_data.document_index) == 5
    assert list(topics_data.topic_token_weights.topic_id.unique()) == [0, 1, 2, 3]
    assert list(topics_data.topic_token_overview.index) == [0, 1, 2, 3]
    assert set(topics_data.document_topic_weights.topic_id.unique()) == expected_topic_ids
def predict(
    self: TopicModelMixinProtocol,
    *,
    inferred_model: tm.InferredModel,
    id2token: dict,
    corpus: corpora.Sparse2Corpus | pc.VectorizedCorpus,
    document_index: pd.DataFrame,
    target_folder: str,
    n_tokens: int,
    minimum_probability: float,
    **kwargs,
) -> tm.InferredTopicsData:
    """Predict document topics for `corpus` and persist the result.

    Args:
        inferred_model: Trained model whose `topic_model` is used for prediction.
        id2token: Mapping from token id to token string.
        corpus: Sparse or vectorized corpus to predict topics for; any other
            corpus type is converted to a vectorized corpus from the instream.
        document_index: Document metadata frame (overridden by a
            VectorizedCorpus's own index).
        target_folder: Folder where model options and topics data are stored.
        n_tokens: Number of tokens to keep per topic.
        minimum_probability: Threshold below which topic weights are dropped.
        **kwargs: Passed through to `tm.predict_topics`.

    Returns:
        The predicted (and stored) `tm.InferredTopicsData`.
    """
    if not isinstance(corpus, (corpora.Sparse2Corpus, pc.VectorizedCorpus)):
        # Unsupported corpus type: fall back to vectorizing the instream.
        corpus = self.instream_to_vectorized_corpus(token2id=id2token2token2id(id2token))

    if isinstance(corpus, pc.VectorizedCorpus):
        # A vectorized corpus carries its own metadata — prefer it over the
        # caller-supplied document index and vocabulary.
        document_index = corpus.document_index
        id2token = corpus.id2token

    data: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=corpus,
        id2token=id2token,
        document_index=document_index,
        n_tokens=n_tokens,
        minimum_probability=minimum_probability,
        **kwargs,
    )

    inferred_model.store_options(target_folder)
    data.store(target_folder)

    return data
def compute_topic_model_handler(*_):
    """Train a topic model from the GUI widget settings, predict topics, and
    display a topic/token overview inside the widget output area.

    On failure the error is logged, `inferred_topics` state is cleared, and
    the exception is re-raised; the busy indicator is always reset.
    """
    self.model_widgets.output.clear_output()
    buzy(True)
    gensim_logger.setLevel(logging.INFO if self.model_widgets.show_trace.value else logging.WARNING)
    with self.model_widgets.output:
        try:
            # Unique per-run model name / output folder.
            name: str = str(uuid.uuid1())
            # FIXME: Move code block out of GUI (to workflows)
            target_folder = os.path.join(self.data_folder, name)
            # BUGFIX: the target folder must exist before model and corpus
            # files are written (the sibling compute() runner creates it
            # with makedirs before saving; this handler did not).
            os.makedirs(target_folder, exist_ok=True)
            vectorizer_args = dict(apply_idf=self.model_widgets.apply_idf.value)
            topic_modeller_args = dict(
                n_topics=self.model_widgets.n_topics.value,
                max_iter=self.model_widgets.max_iter.value,
                learning_method='online',
                n_jobs=1,
            )
            method = self.model_widgets.method.value
            train_corpus = tm.TrainingCorpus(
                corpus=list(self.get_corpus_terms(corpus)),
                document_index=self.document_index,
                vectorizer_args=vectorizer_args,
            )
            trained_model: tm.InferredModel = tm.train_model(
                train_corpus=train_corpus, method=method, engine_args=topic_modeller_args
            )
            trained_model.topic_model.save(os.path.join(target_folder, 'gensim.model'))
            trained_model.store(folder=target_folder, store_compressed=True)
            train_corpus.store(folder=target_folder)
            inferred_topics: tm.InferredTopicsData = tm.predict_topics(
                topic_model=trained_model.topic_model,
                corpus=train_corpus.corpus,
                id2token=train_corpus.id2token,
                document_index=train_corpus.document_index,
                n_tokens=self.n_tokens,
                minimum_probability=self.minimum_probability,
            )
            inferred_topics.store(target_folder=target_folder, pickled=False)
            self.state.update(trained_model=trained_model, inferred_topics=inferred_topics, train_corpus=train_corpus)
            # NOTE(review): `corpus` above is a free variable from the
            # enclosing scope — confirm it is bound before this handler runs.
            topics: pd.DataFrame = get_topics_unstacked(
                self.state.topic_model,
                n_tokens=100,
                id2term=self.inferred_topics.id2term,
                topic_ids=self.inferred_topics.topic_ids,
            )
            display(topics)
        except Exception as ex:
            logger.error(ex)
            self.state.update(inferred_topics=None)
            raise
        finally:
            buzy(False)
def compute(
    name: str = None,
    corpus_folder: str = None,
    corpus_source: str = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    filename_field: str = None,
    minimum_probability: float = 0.001,
    n_tokens: int = 200,
    store_corpus: bool = False,
    compressed: bool = True,
):
    """Train a topic model over a text corpus and persist model + topics data.

    Args:
        name: Target model name; output goes to `<corpus_folder>/<name>`.
        corpus_folder: Output root; derived from `corpus_source` when None.
        corpus_source: Source corpus file/folder of `*.txt` documents.
        engine: Topic-modelling engine (must be in SUPPORTED_ENGINES).
        engine_args: Engine-specific training options.
        filename_field: Filename metadata field specification(s).
        minimum_probability: Threshold below which topic weights are dropped.
        n_tokens: Number of tokens to keep per topic.
        store_corpus: If True, also store the training corpus.
        compressed: If True, store the model compressed.

    Raises:
        ValueError: On unsupported engine or missing name/source/fields.
    """
    if engine not in SUPPORTED_ENGINES:
        raise ValueError(f"Engine {engine} not supported or deprecated")
    # BUGFIX: a missing name previously surfaced as an opaque TypeError
    # inside os.path.join; fail fast like the other argument guards.
    if not name:
        raise ValueError("target model name not specified")
    if corpus_source is None and corpus_folder is None:
        raise ValueError("corpus filename")
    if len(filename_field or []) == 0:
        raise ValueError("corpus filename fields")
    if corpus_folder is None:
        corpus_folder, _ = os.path.split(os.path.abspath(corpus_source))

    target_folder = os.path.join(corpus_folder, name)
    os.makedirs(target_folder, exist_ok=True)

    reader_opts = TextReaderOpts(
        filename_pattern="*.txt",
        filename_filter=None,
        filename_fields=filename_field,
    )
    transform_opts = TextTransformOpts(fix_whitespaces=False, fix_hyphenation=True)
    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=transform_opts,
        reader_opts=reader_opts,
    )

    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader, transform_opts=None)
    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        corpus_options=dict(
            reader_opts=reader_opts.props,
            transform_opts=transform_opts.props,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    inferred_model.topic_model.save(os.path.join(target_folder, 'gensim.model.gz'))
    inferred_model.store(target_folder, store_compressed=compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        minimum_probability=minimum_probability,
        n_tokens=n_tokens,
    )

    inferred_topics.store(target_folder)
def compute(
    *,
    target_name: str = None,
    corpus_source: str = None,
    target_folder: str = None,
    reader_opts: TextReaderOpts = None,
    text_transform_opts: TextTransformOpts = None,
    transform_opts: TokensTransformOpts = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    store_corpus: bool = False,
    store_compressed: bool = True,
    n_tokens: int = 200,
    minimum_probability: float = 0.001,
):
    """Topic-modelling runner: tokenize `corpus_source`, train a model with
    `engine`, then persist the model, the (optional) training corpus, and
    the predicted topics data into `target_folder`.

    Returns:
        dict with the output `folder` and the model `tag` (target_name).
    """
    reader = TextTokenizer(
        source=corpus_source,
        transform_opts=text_transform_opts,
        reader_opts=reader_opts,
    )
    tokenized: TokenizedCorpus = TokenizedCorpus(reader=reader, transform_opts=transform_opts)

    training_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=tokenized,
        document_index=tokenized.document_index,
        token2id=tokenized.token2id,
        corpus_options=dict(
            reader_opts=reader_opts.props,
            transform_opts=transform_opts.props,
        ),
    )

    trained: tm.InferredModel = tm.train_model(
        train_corpus=training_corpus,
        method=engine,
        engine_args=engine_args,
    )

    # Persist the raw gensim model plus the project's own model artefacts.
    trained.topic_model.save(jj(target_folder, 'gensim.model.gz'))
    trained.store(target_folder, store_compressed=store_compressed)
    if store_corpus:
        training_corpus.store(target_folder)

    topics_data: tm.InferredTopicsData = tm.predict_topics(
        trained.topic_model,
        corpus=training_corpus.corpus,
        id2token=training_corpus.id2token,
        document_index=training_corpus.document_index,
        n_tokens=n_tokens,
        minimum_probability=minimum_probability,
    )
    topics_data.store(target_folder)

    return dict(folder=target_folder, tag=target_name)