Example #1
def _create_inferred_model(method: str, train_corpus: tm.TrainingCorpus) -> tm.InferredModel:
    """Train a small topic model for tests, using fixed engine arguments and a unique work folder."""
    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=method,
        engine_args={
            'n_topics': 4,
            'passes': 1,
            'random_seed': 42,
            'workers': 1,
            'max_iter': 100,
            'work_folder': f'./tests/output/{uuid.uuid4()}',
        },
    )

    return inferred_model
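
A hedged usage sketch for the test helper above: the import path and the pytest-style test are assumptions, while the 'gensim_lda-multicore' engine name is taken from Example #4.

from penelope import topic_modelling as tm  # assumed import path; not shown in the excerpts


def test_create_inferred_model(train_corpus: tm.TrainingCorpus):  # hypothetical fixture-provided corpus
    inferred_model = _create_inferred_model('gensim_lda-multicore', train_corpus)
    assert inferred_model.topic_model is not None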
Example #2
    def train(self: TopicModelMixinProtocol, train_corpus: tm.TrainingCorpus) -> tm.InferredModel:
        """Train a topic model and persist the model, topics data and (optionally) the training corpus."""
        inferred_model: tm.InferredModel = tm.train_model(
            train_corpus=train_corpus, method=self.engine, engine_args=self.engine_args
        )

        os.makedirs(self.target_subfolder, exist_ok=True)

        # jj: path-join shorthand (os.path.join) used in the source project
        inferred_model.topic_model.save(jj(self.target_subfolder, 'gensim.model.gz'))

        inferred_model.store(
            folder=self.target_subfolder,
            store_compressed=self.store_compressed,
        )

        if self.store_corpus:
            train_corpus.store(self.target_subfolder)

        return inferred_model
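
The train() method above only reads a handful of attributes from self; a minimal sketch of the protocol those accesses imply (the real TopicModelMixinProtocol may declare more):

from typing import Protocol


class TopicModelMixinProtocol(Protocol):
    engine: str              # topic-modelling method name passed to tm.train_model
    engine_args: dict        # engine options such as n_topics, passes, workers
    target_subfolder: str    # folder where the model, topics data and corpus are stored
    store_corpus: bool       # whether the training corpus is persisted alongside the model
    store_compressed: bool   # whether stored artefacts are compressed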
Example #3
        def compute_topic_model_handler(*_):

            self.model_widgets.output.clear_output()

            buzy(True)

            gensim_logger.setLevel(
                logging.INFO if self.model_widgets.show_trace.value else logging.WARNING
            )

            with self.model_widgets.output:

                try:

                    name: str = str(uuid.uuid1())

                    # FIXME: Move code block out of GUI (to workflows)
                    target_folder = os.path.join(self.data_folder, name)
                    os.makedirs(target_folder, exist_ok=True)  # ensure the folder exists before models are saved below

                    vectorizer_args = dict(
                        apply_idf=self.model_widgets.apply_idf.value)

                    topic_modeller_args = dict(
                        n_topics=self.model_widgets.n_topics.value,
                        max_iter=self.model_widgets.max_iter.value,
                        learning_method='online',
                        n_jobs=1,
                    )

                    method = self.model_widgets.method.value

                    train_corpus = tm.TrainingCorpus(
                        corpus=list(self.get_corpus_terms(corpus)),
                        document_index=self.document_index,
                        vectorizer_args=vectorizer_args,
                    )

                    trained_model: tm.InferredModel = tm.train_model(
                        train_corpus=train_corpus,
                        method=method,
                        engine_args=topic_modeller_args)

                    trained_model.topic_model.save(
                        os.path.join(target_folder, 'gensim.model'))
                    trained_model.store(folder=target_folder,
                                        store_compressed=True)
                    train_corpus.store(folder=target_folder)

                    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
                        topic_model=trained_model.topic_model,
                        corpus=train_corpus.corpus,
                        id2token=train_corpus.id2token,
                        document_index=train_corpus.document_index,
                        n_tokens=self.n_tokens,
                        minimum_probability=self.minimum_probability,
                    )

                    inferred_topics.store(target_folder=target_folder,
                                          pickled=False)

                    self.state.update(trained_model=trained_model,
                                      inferred_topics=inferred_topics,
                                      train_corpus=train_corpus)

                    topics: pd.DataFrame = get_topics_unstacked(
                        self.state.topic_model,
                        n_tokens=100,
                        id2term=self.inferred_topics.id2term,
                        topic_ids=self.inferred_topics.topic_ids,
                    )

                    display(topics)

                except Exception as ex:
                    logger.error(ex)
                    self.state.update(inferred_topics=None)
                    raise
                finally:
                    buzy(False)
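
The FIXME above suggests lifting the training block out of the GUI handler. A minimal sketch of such an extracted workflow, reusing only the calls that already appear in the snippet (the function name and parameter list are assumptions):

def train_and_store(corpus_terms, document_index, target_folder, method, engine_args,
                    vectorizer_args, n_tokens=200, minimum_probability=0.001):
    """Train a topic model, predict document topics and persist everything to target_folder."""
    os.makedirs(target_folder, exist_ok=True)

    train_corpus = tm.TrainingCorpus(
        corpus=corpus_terms,
        document_index=document_index,
        vectorizer_args=vectorizer_args,
    )

    trained_model = tm.train_model(train_corpus=train_corpus, method=method, engine_args=engine_args)

    trained_model.topic_model.save(os.path.join(target_folder, 'gensim.model'))
    trained_model.store(folder=target_folder, store_compressed=True)
    train_corpus.store(folder=target_folder)

    inferred_topics = tm.predict_topics(
        topic_model=trained_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        n_tokens=n_tokens,
        minimum_probability=minimum_probability,
    )
    inferred_topics.store(target_folder=target_folder, pickled=False)

    return trained_model, inferred_topics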
Example #4
def compute(
    name: str = None,
    corpus_folder: str = None,
    corpus_source: str = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    filename_field: str = None,
    minimum_probability: float = 0.001,
    n_tokens: int = 200,
    store_corpus: bool = False,
    compressed: bool = True,
):

    if engine not in SUPPORTED_ENGINES:
        raise ValueError(f"Engine {engine} not supported or deprecated")

    if corpus_source is None and corpus_folder is None:
        raise ValueError("either corpus_source or corpus_folder must be specified")

    if len(filename_field or []) == 0:
        raise ValueError("at least one filename_field must be specified")

    if corpus_folder is None:
        corpus_folder, _ = os.path.split(os.path.abspath(corpus_source))

    target_folder = os.path.join(corpus_folder, name)

    os.makedirs(target_folder, exist_ok=True)

    reader_opts = TextReaderOpts(
        filename_pattern="*.txt",
        filename_filter=None,
        filename_fields=filename_field,
    )

    transform_opts = TextTransformOpts(fix_whitespaces=False,
                                       fix_hyphenation=True)

    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=transform_opts,
        reader_opts=reader_opts,
    )

    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader,
                                              transform_opts=None)

    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        corpus_options=dict(
            reader_opts=reader_opts.props,
            transform_opts=transform_opts.props,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    inferred_model.topic_model.save(
        os.path.join(target_folder, 'gensim.model.gz'))

    inferred_model.store(target_folder, store_compressed=compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        minimum_probability=minimum_probability,
        n_tokens=n_tokens,
    )

    inferred_topics.store(target_folder)
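
A hedged invocation of the compute() function above; the paths and the filename-field specification are placeholders, and the engine_args keys mirror those used in Example #1.

compute(
    name='my_topic_model',
    corpus_source='./data/corpus.zip',
    filename_field=['year:_:1'],  # placeholder filename-field specification
    engine='gensim_lda-multicore',
    engine_args={'n_topics': 50, 'passes': 3, 'workers': 2, 'random_seed': 42},
    store_corpus=True,
    compressed=True,
)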
Example #5
def compute(
    *,
    target_name: str = None,
    corpus_source: str = None,
    target_folder: str = None,
    reader_opts: TextReaderOpts = None,
    text_transform_opts: TextTransformOpts = None,
    transform_opts: TokensTransformOpts = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    store_corpus: bool = False,
    store_compressed: bool = True,
    n_tokens: int = 200,
    minimum_probability: float = 0.001,
):
    """ runner """

    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=text_transform_opts,
        reader_opts=reader_opts,
    )

    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader,
                                              transform_opts=transform_opts)

    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        document_index=corpus.document_index,
        token2id=corpus.token2id,
        corpus_options=dict(
            reader_opts=reader_opts.props,
            transform_opts=transform_opts.props,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    inferred_model.topic_model.save(jj(target_folder, 'gensim.model.gz'))

    inferred_model.store(target_folder, store_compressed=store_compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        n_tokens=n_tokens,
        minimum_probability=minimum_probability,
    )

    inferred_topics.store(target_folder)

    return dict(folder=target_folder, tag=target_name)
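
A hedged invocation of the keyword-only compute() runner above; option values mirror Example #4, and the assumption is that TokensTransformOpts can be constructed with defaults.

import os

# The runner saves directly into target_folder, so make sure it exists first.
os.makedirs('./data/speeches_lda_50', exist_ok=True)

result = compute(
    target_name='speeches_lda_50',
    corpus_source='./data/corpus.zip',
    target_folder='./data/speeches_lda_50',
    reader_opts=TextReaderOpts(filename_pattern='*.txt', filename_fields=['year:_:1']),
    text_transform_opts=TextTransformOpts(fix_whitespaces=False, fix_hyphenation=True),
    transform_opts=TokensTransformOpts(),
    engine='gensim_lda-multicore',
    engine_args={'n_topics': 50, 'passes': 3, 'workers': 2},
    store_corpus=True,
)
print(result['folder'], result['tag'])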