def get_theta(phi: np.ndarray, dataset: Dataset) -> pd.DataFrame:
    artm_model = artm.ARTM(num_topics=phi.shape[1])
    artm_model.initialize(dataset.get_dictionary())

    # One offline pass so that the model has a Phi matrix to attach to
    artm_model.fit_offline(dataset.get_batch_vectorizer(), 1)

    # Attach to the model's internal Phi and overwrite it with the given matrix
    (_,
     phi_ref) = artm_model.master.attach_model(model=artm_model.model_pwt)

    np.copyto(phi_ref, phi)

    return artm_model.transform(dataset.get_batch_vectorizer())
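A minimal usage sketch for get_theta, assuming a prepared TopicNet Dataset and a Phi matrix shaped vocabulary size x number of topics; the CSV path, vocabulary size, and random Phi below are illustrative assumptions, not taken from the original code:

import numpy as np

dataset = Dataset('data/my_corpus.csv')  # hypothetical path to a TopicNet-compatible CSV

vocabulary_size = 1000  # assumption: must equal the size of the dataset's dictionary
num_topics = 10
rng = np.random.default_rng(seed=0)

phi = rng.random((vocabulary_size, num_topics))
phi = phi / phi.sum(axis=0)  # each topic column becomes a distribution over words

theta = get_theta(phi, dataset)  # document-topic distributions as a DataFrame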
Code Example #2
def regularization_train_func(dataset: Dataset,
                              model_number: int,
                              num_topics: int,
                              num_fit_iterations: int,
                              scores: List[BaseScore] = None,
                              decorrelating_tau: float = 10**5,
                              smoothing_tau: float = 1e-5,
                              sparsing_tau: float = -0.01,
                              **kwargs) -> TopicModel:

    topic_model = _get_topic_model(
        dataset,
        num_topics=num_topics,
        seed=model_number,
        **kwargs,
    )

    topic_model._model.regularizers.add(
        artm.regularizers.DecorrelatorPhiRegularizer(tau=decorrelating_tau))

    for topic_name in list(topic_model.get_phi().columns):
        topic_model._model.regularizers.add(
            artm.regularizers.SmoothSparsePhiRegularizer(
                tau=smoothing_tau, topic_names=topic_name))

    num_fit_iterations_with_scores = 1
    first_num_fit_iterations = int(
        0.75 * (num_fit_iterations - num_fit_iterations_with_scores))
    second_num_fit_iterations = (num_fit_iterations -
                                 num_fit_iterations_with_scores -
                                 first_num_fit_iterations)

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=first_num_fit_iterations)

    for regularizer_name in topic_model._model.regularizers.data:
        topic_model._model.regularizers[regularizer_name].tau = 0

    topic_model._model.regularizers.add(
        artm.regularizers.SmoothSparsePhiRegularizer(tau=sparsing_tau))

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=max(
                         0, second_num_fit_iterations -
                         num_fit_iterations_with_scores))
    _fit_model_with_scores(topic_model,
                           dataset,
                           scores,
                           num_fit_iterations=num_fit_iterations_with_scores)

    return topic_model
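A hedged call sketch for regularization_train_func; the dataset path and counts are hypothetical, and the regularizer strengths stay at the defaults from the signature. The function decorrelates and smooths Phi for roughly 75% of the iterations, then zeroes all regularizers and applies a weak sparsing regularizer, reserving the final pass for score computation.

dataset = Dataset('data/my_corpus.csv')  # hypothetical path

topic_model = regularization_train_func(
    dataset,
    model_number=0,         # also used as the random seed inside _get_topic_model
    num_topics=20,
    num_fit_iterations=20,  # split across the decorrelation, sparsing, and scored phases
)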
Code Example #3
def magic_clutch():
    test_dataset = None

    try:
        # Just some dataset, whatever
        test_dataset = Dataset(
            data_path=os.path.join(os.path.dirname(topicnet.__file__), 'tests',
                                   'test_data', 'test_dataset.csv'),
            internals_folder_path=tempfile.mkdtemp(prefix='magic_clutch__'))

        # If a new score is not initialized at least once in the notebook,
        # it won't be possible to load it
        _ = HoldoutPerplexityScore(
            '',
            test_dataset,
        )
        _ = MeanLiftScore('', test_dataset, [])
        _ = UniformThetaDivergenceScore('', test_dataset, [])

        _ = build_every_score(test_dataset, test_dataset, {"word": "@word"})

        _ = IntratextCoherenceScore("jbi", test_dataset)
        _ = SophisticatedTopTokensCoherenceScore("sds", test_dataset)

    finally:
        if test_dataset is not None and os.path.isdir(
                test_dataset._internals_folder_path):
            shutil.rmtree(test_dataset._internals_folder_path)
Code Example #4
def specific_initial_phi_train_func(dataset: Dataset,
                                    model_number: int,
                                    num_topics: int,
                                    num_fit_iterations: int,
                                    scores: List[BaseScore] = None,
                                    initialize_phi_func: Callable[
                                        [Dataset, int, int],
                                        pd.DataFrame] = None,
                                    **kwargs) -> TopicModel:

    topic_model = _get_topic_model(
        dataset,
        num_topics=num_topics,
        seed=model_number,
        **kwargs,
    )

    if initialize_phi_func is None:
        initialize_phi_func = initialize_phi_funcs.initialize_randomly

    initial_phi = initialize_phi_func(dataset, model_number, num_topics)
    init_phi_utils._copy_phi(topic_model._model, initial_phi)

    num_fit_iterations_with_scores = 1

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=max(
                         0,
                         num_fit_iterations - num_fit_iterations_with_scores))
    _fit_model_with_scores(topic_model,
                           dataset,
                           scores,
                           num_fit_iterations=num_fit_iterations_with_scores)

    return topic_model
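A sketch of a custom initialize_phi_func for specific_initial_phi_train_func, matching the expected (Dataset, model_number, num_topics) -> pd.DataFrame signature; the dataset path and the renormalization scheme are illustrative assumptions, not recommendations:

dataset = Dataset('data/my_corpus.csv')  # hypothetical path


def initialize_with_renormalization(dataset: Dataset,
                                    model_number: int,
                                    num_topics: int) -> pd.DataFrame:
    # Reuse the library's random initializer, then make each topic column sum to one
    phi = initialize_phi_funcs.initialize_randomly(dataset, model_number, num_topics)
    return phi / phi.sum(axis=0)


topic_model = specific_initial_phi_train_func(
    dataset,
    model_number=0,
    num_topics=10,
    num_fit_iterations=20,
    initialize_phi_func=initialize_with_renormalization,
)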
Code Example #5
def default_train_func(dataset: Dataset,
                       model_number: int,
                       num_topics: int,
                       num_fit_iterations: int,
                       scores: List[BaseScore] = None,
                       **kwargs) -> TopicModel:
    """

    Additional Parameters
    ---------------------
    kwargs
        Some params for `_get_topic_model`, such as `cache_theta` and `num_processors`
    """

    topic_model = _get_topic_model(
        dataset,
        num_topics=num_topics,
        seed=model_number,
        **kwargs,
    )

    num_fit_iterations_with_scores = 1

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=max(
                         0,
                         num_fit_iterations - num_fit_iterations_with_scores))
    _fit_model_with_scores(topic_model,
                           dataset,
                           scores,
                           num_fit_iterations=num_fit_iterations_with_scores)

    return topic_model
Code Example #6
    @classmethod
    def load(cls, path: str):
        with open(path, 'rb') as f:
            score = dill.load(f)

        # The dataset itself is not pickled, so it is re-created from the stored paths
        score._dataset = Dataset(
            score._dataset_file_path,
            internals_folder_path=score._dataset_internals_folder_path,
            keep_in_memory=score._keep_dataset_in_memory,
        )

        return score
Code Example #7
def _fit_model_with_scores(topic_model: TopicModel,
                           dataset: Dataset,
                           scores: List[BaseScore] = None,
                           num_fit_iterations: int = 1):

    if scores is not None:
        for score in scores:
            score._attach(topic_model)

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=num_fit_iterations)
Code Example #8
    def setup_class(cls):
        cls.model = _MockModel(cls.create_phi())

        document_words = cls.create_documents()
        dataset_table = cls.create_dataset_table(document_words)

        cls.data_folder_path = tempfile.mkdtemp()
        cls.dataset_file_path = os.path.join(
            cls.data_folder_path,
            DATASET_FILE_NAME,
        )

        dataset_table.to_csv(cls.dataset_file_path, index=False)

        cls.dataset = Dataset(cls.dataset_file_path)
Code Example #9
def split_into_train_test(dataset: Dataset,
                          config: dict,
                          save_folder: str = None):
    # TODO: no need for `config` here, just `batches_prefix`

    documents = list(dataset._data.index)
    dn = config['batches_prefix']

    random = np.random.RandomState(seed=123)

    random.shuffle(documents)

    test_size = 0.2

    train_documents = documents[:int((1.0 - test_size) * len(documents))]
    test_documents = documents[len(train_documents):]

    assert len(train_documents) + len(test_documents) == len(documents)

    # TODO: test with keep_in_memory = False just in case
    train_data = dataset._data.loc[train_documents]
    test_data = dataset._data.loc[test_documents]
    train_data['id'] = train_data.index
    test_data['id'] = test_data.index

    to_csv_kwargs = dict()

    if not dataset._small_data:
        to_csv_kwargs['single_file'] = True

    if save_folder is None:
        save_folder = '.'
    elif not os.path.isdir(save_folder):
        os.mkdir(save_folder)

    train_dataset_path = os.path.join(save_folder, f'{dn}_train.csv')
    test_dataset_path = os.path.join(save_folder, f'{dn}_test.csv')
    train_data.to_csv(train_dataset_path, index=False, **to_csv_kwargs)
    test_data.to_csv(test_dataset_path, index=False, **to_csv_kwargs)

    train_dataset = Dataset(
        train_dataset_path,
        batch_vectorizer_path=f'{dn}_train_internals',
        keep_in_memory=dataset._small_data,
    )
    test_dataset = Dataset(
        test_dataset_path,
        batch_vectorizer_path=f'{dn}_test_internals',
        keep_in_memory=dataset._small_data,
    )

    # TODO: quick hack, not sure what it's for
    test_dataset._to_dataset = lambda: test_dataset
    train_dataset._to_dataset = lambda: train_dataset

    return train_dataset, test_dataset
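A short usage sketch for split_into_train_test; the CSV path, batches_prefix value, and save folder are hypothetical:

dataset = Dataset('data/my_corpus.csv')  # hypothetical path
config = {'batches_prefix': 'my_corpus'}  # the only key the function actually reads

train_dataset, test_dataset = split_into_train_test(
    dataset,
    config,
    save_folder='data/splits',
)
# The split CSVs are written as data/splits/my_corpus_train.csv and my_corpus_test.csv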
Code Example #10
def _get_topic_model(dataset: Dataset,
                     phi: pd.DataFrame = None,
                     num_topics: int = None,
                     seed: int = None,
                     scores: List[BaseScore] = None,
                     num_safe_fit_iterations: int = 3,
                     num_processors: int = 3,
                     cache_theta: bool = False) -> TopicModel:

    dictionary = dataset.get_dictionary()

    if num_topics is not None and phi is not None:
        assert num_topics >= phi.shape[1]
    elif num_topics is None and phi is not None:
        num_topics = phi.shape[1]
    elif num_topics is None and phi is None:
        raise ValueError('Either `num_topics` or `phi` should be specified')

    topic_names = [f'topic_{i}' for i in range(num_topics)]

    if seed is None:
        artm_model = artm.ARTM(topic_names=topic_names)
    else:
        artm_model = artm.ARTM(topic_names=topic_names, seed=seed)

    artm_model.num_processors = num_processors
    artm_model.initialize(dictionary)

    if phi is None:
        pass
    elif num_safe_fit_iterations is not None and num_safe_fit_iterations > 0:
        init_phi_utils._safe_copy_phi(artm_model, phi, dataset,
                                      num_safe_fit_iterations)
    else:
        init_phi_utils._copy_phi(artm_model, phi)

    topic_model = TopicModel(artm_model=artm_model,
                             model_id='0',
                             cache_theta=cache_theta,
                             theta_columns_naming='title')

    if scores is not None:
        for score in scores:
            score._attach(topic_model)

    return topic_model
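A minimal sketch of calling _get_topic_model without an initial Phi, so the Phi-copying branches are skipped; the path and parameter values are illustrative assumptions:

dataset = Dataset('data/my_corpus.csv')  # hypothetical path

topic_model = _get_topic_model(
    dataset,
    num_topics=10,
    seed=42,
    cache_theta=True,  # forwarded to the TopicModel constructor
)
topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=10)
phi = topic_model.get_phi()  # words x topics DataFrame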
Code Example #11
    def test_call_toptokens_small_big_data(self, keep_in_memory) -> None:
        dataset = Dataset(self.dataset_file_path, keep_in_memory=keep_in_memory)
        score = _TopTokensCoherenceScore(dataset)

        self._check_call(score)
Code Example #12
    def test_call_intratext_small_big_data(self, keep_in_memory) -> None:
        dataset = Dataset(self.dataset_file_path, keep_in_memory=keep_in_memory)
        score = _IntratextCoherenceScore(dataset)

        self._check_call(score)
Code Example #13
def background_topics_train_func(dataset: Dataset,
                                 model_number: int,
                                 num_topics: int,
                                 num_fit_iterations: int,
                                 scores: List[BaseScore] = None,
                                 num_background_topics: int = 2,
                                 smoothing_tau: float = 0.01,
                                 **kwargs) -> TopicModel:

    topic_model = _get_topic_model(
        dataset,
        num_topics=num_topics + num_background_topics,
        seed=model_number,
        **kwargs,
    )

    for background_topic_name in list(
            topic_model.get_phi().columns)[-num_background_topics:]:
        topic_model._model.regularizers.add(
            artm.regularizers.SmoothSparsePhiRegularizer(
                tau=smoothing_tau,
                topic_names=background_topic_name  # TODO: why not list?
            ))

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=num_fit_iterations)

    specific_topics_phi = topic_model.get_phi(
    ).iloc[:, :-num_background_topics]

    del topic_model

    topic_model = _get_topic_model(
        dataset,
        num_topics=num_topics,
        seed=model_number,
    )

    num_fit_iterations_with_scores = 1
    num_fit_iterations_without_scores = num_fit_iterations - num_fit_iterations_with_scores

    phi_ref = None

    for fit_iteration in range(num_fit_iterations_without_scores):
        phi_ref = init_phi_utils._copy_phi(topic_model._model,
                                           specific_topics_phi,
                                           phi_ref=phi_ref)
        topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=1)

    phi_ref = init_phi_utils._copy_phi(topic_model._model,
                                       specific_topics_phi,
                                       phi_ref=phi_ref)
    _fit_model_with_scores(topic_model,
                           dataset,
                           scores,
                           num_fit_iterations=num_fit_iterations_with_scores)

    # TODO: not very safe here? (if cache_theta is True, Theta is not updated here)
    init_phi_utils._copy_phi(topic_model._model,
                             specific_topics_phi,
                             phi_ref=phi_ref)

    return topic_model
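background_topics_train_func first trains a model with extra smoothed background topics, then rebuilds a model with only the specific topics, copying the specific part of Phi back in before every pass. A hedged call sketch with hypothetical inputs:

dataset = Dataset('data/my_corpus.csv')  # hypothetical path

topic_model = background_topics_train_func(
    dataset,
    model_number=1,
    num_topics=20,            # specific topics kept in the returned model
    num_fit_iterations=20,
    num_background_topics=2,  # trained in the first stage, then discarded
)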