Example #1
    def init_hierarchical_model(class_ids):
        score = [artm.PerplexityScore(name='perplexity_words', class_ids=['body']),
                 artm.PerplexityScore(name='perplexity_bigrams', class_ids=['bigrams'])]

        top_tokens = [artm.TopTokensScore(name='top_words', num_tokens=15, class_id='body'),
                      artm.TopTokensScore(name='top_bigrams', num_tokens=10, class_id='bigrams')]

        sparsity = [artm.SparsityThetaScore(name='sparsity_theta', eps=1e-6),
                    artm.SparsityPhiScore(name='sparsity_phi_words', class_id='body', eps=1e-6),
                    artm.SparsityPhiScore(name='sparsity_phi_bigrams', class_id='bigrams', eps=1e-6)]

        regularizers = [artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['body'], name='decorr_words'),
                        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['bigrams'], name='decorr_bigrams'),
                        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['categories'], name='decorr_categories'),
                        artm.SmoothSparseThetaRegularizer(tau=0, name='sparsity_theta'),
                        artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['body'], name='sparsity_words'),
                        artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['bigrams'], name='sparsity_bigrams')]

        hmodel = artm.hARTM(class_ids=class_ids,
                            cache_theta=True,
                            reuse_theta=True,
                            scores=score + top_tokens + sparsity,
                            regularizers=regularizers,
                            theta_columns_naming='title')
        return hmodel
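A minimal usage sketch for the factory above (the modality weights, level sizes and batch vectorizer are illustrative assumptions, not part of the original):

# Hypothetical driver; batch_vectorizer must be built from the collection first.
hmodel = init_hierarchical_model(class_ids={'body': 1.0, 'bigrams': 0.5, 'categories': 1.0})
hmodel.add_level(num_topics=20)                         # root level
hmodel.add_level(num_topics=80, parent_level_weight=1)  # child level
hmodel.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)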
Example #2
def test_perplexity_strategy_mul(experiment_enviroment):
    """ """
    tm, dataset, experiment, dictionary = experiment_enviroment

    regularizer_parameters = {
        "regularizer":
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        class_ids=MAIN_MODALITY),
        "tau_grid": []
    }

    cube = RegularizersModifierCube(
        num_iter=20,
        regularizer_parameters=regularizer_parameters,
        strategy=PerplexityStrategy(0.001, 10, 25, threshold=1.0),
        tracked_score_function='PerplexityScore',
        reg_search='mul',
        use_relative_coefficients=False,
        verbose=True)

    with pytest.warns(UserWarning,
                      match="Perplexity is too high for threshold"):
        tmodels = cube(tm, dataset)

    visited_taus = extract_visited_taus(tmodels)
    expected_taus = [0, 0.001, 0.01, 0.1, 1.0, 10.0]
    assert visited_taus == expected_taus

    SCORES = [3.756, 3.75, 3.72, 6.043]
    real_scores = extract_strategic_scores(cube)
    if real_scores != SCORES:
        warnings.warn(f"real_scores == {real_scores}" f"expected == {SCORES}")

    assert cube.strategy.best_point[0][2] == 1.0
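For intuition, the visited taus above are a geometric progression seeded by the strategy arguments; a back-of-envelope sketch, not the library's implementation (here the search stopped after five points because the perplexity threshold fired):

start, factor = 0.001, 10
taus = [0] + [start * factor ** k for k in range(5)]
print(taus)  # [0, 0.001, 0.01, 0.1, 1.0, 10.0] (up to float rounding)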
Example #3
    def init_model(self, dictionary_path=None):
        """dictionary_path: optional, used with pretrained model"""
        self.dictionary = artm.Dictionary()
        if dictionary_path is None:
            self.dictionary.gather(data_path=self.batches_path)
            self.dictionary.filter(min_tf=10, max_df_rate=0.1)
            self.dictionary.save_text(
                f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")
        else:
            self.dictionary.load_text(dictionary_path)

        self.model = artm.ARTM(
            num_topics=self.n_topics,
            dictionary=self.dictionary,
            show_progress_bars=True,
        )

        # scores
        self.model.scores.add(
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=self.dictionary))
        self.model.scores.add(
            artm.SparsityThetaScore(name="SparsityThetaScore"))
        self.model.scores.add(artm.SparsityPhiScore(name="SparsityPhiScore"))

        # regularizers
        self.model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1))
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5))
        self.model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5))
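Example #4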
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Define the ARTM model.
    :param n_topics: number of topics.
    :param dictionary: batch vectorizer dictionary.
    :param sparse_theta: sparse theta parameter.
    :param sparse_phi: sparse phi parameter.
    :param decorrelator_phi: decorrelator phi parameter.
    :return: ARTM model.
    """
    print("Defining the model.")
    topic_names = ["topic_{}".format(i) for i in range(1, n_topics + 1)]
    model_artm = artm.ARTM(
        topic_names=topic_names,
        cache_theta=True,
        scores=[
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name="SparsityPhiScore"),
            artm.SparsityThetaScore(name="SparsityThetaScore"),
            artm.TopicKernelScore(name="TopicKernelScore",
                                  probability_mass_threshold=0.3),
            artm.TopTokensScore(name="TopTokensScore", num_tokens=15)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name="SparseTheta",
                                              tau=sparse_theta),
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                            tau=decorrelator_phi)
        ])
    return model_artm
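A hedged usage sketch for define_model (the data path, topic count and tau values are illustrative assumptions):

# Hypothetical driver for define_model.
batch_vectorizer = artm.BatchVectorizer(data_path='data.vw', data_format='vowpal_wabbit',
                                        target_folder='batches')
model = define_model(n_topics=30, dictionary=batch_vectorizer.dictionary,
                     sparse_theta=-0.5, sparse_phi=-0.1, decorrelator_phi=1e5)
model.initialize(dictionary=batch_vectorizer.dictionary)
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)
print(model.score_tracker['PerplexityScore'].last_value)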
Example #5
def test_perplexity_strategy_grid(experiment_enviroment, thread_flag):
    """ """
    tm, dataset, experiment, dictionary = experiment_enviroment

    tau_grid = [0.1, 0.5, 1, 5, 50]
    regularizer_parameters = {
        "regularizer":
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        class_ids=MAIN_MODALITY),
        "tau_grid":
        tau_grid
    }

    cube = RegularizersModifierCube(
        num_iter=3,
        regularizer_parameters=regularizer_parameters,
        strategy=PerplexityStrategy(1, 5),
        tracked_score_function='PerplexityScore',
        reg_search="grid",
        use_relative_coefficients=False,
        separate_thread=thread_flag)
    with pytest.warns(UserWarning, match='Grid would be used instead'):
        dummies = cube(tm, dataset)
        tmodels = [dummy.restore() for dummy in dummies]

    visited_taus = extract_visited_taus(tmodels)
    expected_taus = [0] + tau_grid
    assert visited_taus == expected_taus

    SCORES = [3.756, 3.756, 3.753, 3.75, 3.72, 2.887]
    real_scores = extract_strategic_scores(cube)
    if real_scores != SCORES:
        warnings.warn(f"real_scores == {real_scores}" f"expected == {SCORES}")

    assert cube.strategy.best_point[0][2] == 50
Example #6
def test_perplexity_strategy_add(experiment_enviroment, thread_flag):
    """ """
    tm, dataset, experiment, dictionary = experiment_enviroment

    regularizer_parameters = {
        "regularizer":
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        class_ids=MAIN_MODALITY),
        "tau_grid": []
    }

    cube = RegularizersModifierCube(
        num_iter=3,
        regularizer_parameters=regularizer_parameters,
        strategy=PerplexityStrategy(1, 1, max_len=5),
        tracked_score_function='PerplexityScore',
        reg_search='add',
        use_relative_coefficients=False,
        verbose=True,
        separate_thread=thread_flag)
    with pytest.warns(UserWarning, match="Max progression length"):
        dummies = cube(tm, dataset)
        tmodels = [dummy.restore() for dummy in dummies]

    visited_taus = extract_visited_taus(tmodels)
    expected_taus = [0, 1, 2, 3, 4, 5]
    assert visited_taus == expected_taus

    SCORES = [3.756, 3.75, 3.743, 3.736, 3.728, 3.72]
    real_scores = extract_strategic_scores(cube)
    if real_scores != SCORES:
        warnings.warn(f"real_scores == {real_scores}" f"expected == {SCORES}")

    assert cube.strategy.best_point[0][2] == 5
Example #7
def create_model_fn_4(n_iteration):
    tmp_model = cmh.create_model(current_dictionary=dictionary,
                                 n_topics=100,
                                 n_doc_passes=5,
                                 seed_value=100 + n_iteration,
                                 n_top_tokens=15,
                                 p_mass_threshold=0.25)
    tmp_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
    tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
    tmp_model.regularizers['ss_phi_regularizer'].tau = -2
    tmp_model = cmh.fit_one_model(
        plot_maker,
        batch_vectorizer,
        models_file,
        config,
        tmp_model,
        _n_iterations=20,
        _model_name='model_20_m4_iter_{}'.format(n_iteration))
    return tmp_model
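Example #8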
    def test_experiment_prune(cls):
        """ """
        cls.topic_model.experiment = None
        experiment_run = Experiment(
            cls.topic_model,
            experiment_id="run_experiment",
            save_path=cls.experiment_path,
        )
        test_cube = RegularizersModifierCube(
            num_iter=5,
            regularizer_parameters={
                'regularizer':
                artm.DecorrelatorPhiRegularizer(name='decorrelation_phi',
                                                tau=1),
                'tau_grid': [],
            },
            strategy=PerplexityStrategy(0.001, 10, 25, threshold=1.0),
            tracked_score_function='PerplexityScore@all',
            reg_search='mul',
            use_relative_coefficients=False,
            verbose=True)

        test_cube(cls.topic_model, cls.dataset)
        experiment_run.set_criteria(1, 'some_criterion')

        new_seed = experiment_run.get_models_by_depth(level=1)[0]
        experiment = Experiment(
            topic_model=new_seed,
            experiment_id="prune_experiment",
            save_path=cls.experiment_path,
            save_model_history=True,
        )
        assert len(experiment.models) == 1
Example #9
    def decor_train(self):
        if self.model is None:
            print('Initialise the model first')
            return

        self.model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='decorr',
                                            topic_names=self.specific,
                                            tau=self.decor))
Example #10
def create_and_learn_ARTM_decorPhi_modal(name="",
                                         topic_number=750,
                                         num_collection_passes=1,
                                         weights=[1., 1., 1., 1.],
                                         decorTau=1.0):

    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' +
                                                  name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weights[0],
                          '@first': weights[1],
                          '@second': weights[2],
                          '@third': weights[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))

    model.initialize(dictionary=dictionary)

    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1

    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)

    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)

    return model, theta_train
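The '@text', '@first', '@second' and '@third' class_ids above name modality blocks in the Vowpal Wabbit input file; a hypothetical input line (all tokens invented) would look like:

doc_1 |@text word_a:2 word_b |@first tag_x |@second tag_y |@third tag_z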
Example #11
def create_model_fn_20_complex_reg_1(n_iteration):
    n_topics = 20
    common_topics = [u'topic_0', u'topic_1']
    subject_topics = list(
        set([u'topic_{}'.format(idx)
             for idx in range(2, 20)]) - set(common_topics))
    tmp_model = create_model_complex(current_dictionary=dictionary,
                                     n_topics=n_topics,
                                     n_doc_passes=5,
                                     seed_value=100 + n_iteration,
                                     n_top_tokens=15,
                                     p_mass_threshold=0.25,
                                     common_topics=common_topics,
                                     subject_topics=subject_topics)
    # subject topics
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer_subject',
                                          topic_names=subject_topics))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer_subject',
                                        topic_names=subject_topics,
                                        class_ids=['@default_class']))
    tmp_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='decorrelator_phi_regularizer_subject',
            topic_names=subject_topics,
            class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer_subject'].tau = -0.5
    tmp_model.regularizers['ss_phi_regularizer_subject'].tau = -0.5
    tmp_model.regularizers['decorrelator_phi_regularizer_subject'].tau = -10

    # common topics
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer_common',
                                          topic_names=common_topics))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer_common',
                                        topic_names=common_topics,
                                        class_ids=['@default_class']))
    #     tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer_common',
    #         topic_names=subject_topics, class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer_common'].tau = 0.5
    tmp_model.regularizers['ss_phi_regularizer_common'].tau = 0.5
    #     tmp_model.regularizers['decorrelator_phi_regularizer_common'].tau = -10

    tmp_model = fit_one_model_complex(
        plot_maker,
        batch_vectorizer,
        models_file,
        config,
        tmp_model,
        _n_iterations=20,
        _model_name='model_20_complex_reg_1_iter_{}'.format(n_iteration))
    return tmp_model
Example #12
def generate_decorrelators(
        specific_topic_names_lvl1, background_topic_names_lvl1,
        words_class_ids=MAIN_MODALITY,
        class_ids_for_bcg_decorrelation=MAIN_MODALITY,
        ngramms_modalities_for_decor=NGRAM_MODALITY):
    """
    Creates an array of pre-configured regularizers
    using specified coefficients
    """
    decorrelator_tau_ngramms = 5*1e-3
    decorrelator_tau_words_specific = 5*1e-2
    decorrelator_tau_words_bcg = 5*1e-3

    regularizers = [
        artm.DecorrelatorPhiRegularizer(
            gamma=0,
            tau=decorrelator_tau_words_specific,
            name='decorrelation',
            topic_names=specific_topic_names_lvl1,
            class_ids=words_class_ids,
        ),
        artm.DecorrelatorPhiRegularizer(
            tau=decorrelator_tau_words_bcg,
            name='decorrelation_background',
            topic_names=background_topic_names_lvl1,
            class_ids=words_class_ids,
        ),
        artm.DecorrelatorPhiRegularizer(
            tau=decorrelator_tau_ngramms,
            name='decorrelation_ngramms',
            topic_names=specific_topic_names_lvl1,
            class_ids=ngramms_modalities_for_decor
        ),
        artm.DecorrelatorPhiRegularizer(
            tau=decorrelator_tau_ngramms,
            name='decorrelation_ngramms_background',
            topic_names=background_topic_names_lvl1,
            class_ids=class_ids_for_bcg_decorrelation
        )
    ]
    return regularizers
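The factory returns plain regularizer objects, so they can be attached to any ARTM model; a minimal sketch (the model and the topic-name lists are assumptions):

# Hypothetical usage of generate_decorrelators.
for reg in generate_decorrelators(specific_topic_names_lvl1=['topic_0', 'topic_1'],
                                  background_topic_names_lvl1=['bcg_0']):
    model.regularizers.add(reg)

Example #13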
    def create_topic_model(self, topic_model_name: str,
                           batch_vectorizer: artm.BatchVectorizer,
                           dictionary: artm.Dictionary) -> artm.ARTM:
        topic_model = artm.ARTM(num_topics=self.number_of_topics,
                                dictionary=dictionary,
                                cache_theta=False)
        topic_model.scores.add(
            artm.PerplexityScore(name='perplexity_score',
                                 dictionary=dictionary))
        topic_model.scores.add(
            artm.SparsityPhiScore(name='sparsity_phi_score'))
        topic_model.scores.add(
            artm.SparsityThetaScore(name='sparsity_theta_score'))
        topic_model.num_document_passes = 5
        topic_model.num_processors = max(1, os.cpu_count() - 1)
        topic_model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
        topic_model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
        topic_model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(
                name='decorrelator_phi_regularizer'))
        topic_model.regularizers['sparse_phi_regularizer'].tau = -1.0
        topic_model.regularizers['sparse_theta_regularizer'].tau = -0.5
        topic_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+5
        best_score = None
        keyword_extraction_logger.info(
            'epoch  perplexity_score  sparsity_phi_score  sparsity_theta_score'
        )
        for restart_index in range(10):
            topic_model.fit_offline(batch_vectorizer=batch_vectorizer,
                                    num_collection_passes=3)
            if best_score is None:
                best_score = topic_model.score_tracker[
                    'perplexity_score'].last_value
            else:
                if best_score > topic_model.score_tracker[
                        'perplexity_score'].last_value:
                    best_score = topic_model.score_tracker[
                        'perplexity_score'].last_value
                    self.save_topic_model(topic_model, topic_model_name)
            keyword_extraction_logger.info(
                '{0:5}  {1:16.9}  {2:18.9}  {3:20.9}'.format(
                    (restart_index + 1) * 3,
                    topic_model.score_tracker['perplexity_score'].last_value,
                    topic_model.score_tracker['sparsity_phi_score'].last_value,
                    topic_model.score_tracker['sparsity_theta_score'].last_value))
        del topic_model
        return self.load_topic_model(
            artm.ARTM(num_topics=self.number_of_topics,
                      dictionary=dictionary,
                      cache_theta=False), topic_model_name)
Example #14
    def _get_corpus_model(self,
                          corpus_vector_spaced,
                          clustering_method='artm'):
        if 'gensim' == clustering_method:
            return self._get_model_LSI(corpus_vector_spaced)
        elif 'sklearn' == clustering_method:
            return self._get_model_LDA(corpus_vector_spaced)
        elif 'artm' == clustering_method:
            batch_vectorizer = corpus_vector_spaced['batch_vectorizer']
            dictionary = corpus_vector_spaced['dictionary']

            topic_names = [
                'topic_{}'.format(i) for i in range(self.num_of_clusters)
            ]

            model_artm = artm.ARTM(
                topic_names=topic_names,
                cache_theta=True,
                scores=[
                    artm.PerplexityScore(name='PerplexityScore',
                                         dictionary=dictionary)
                ],
                regularizers=[
                    artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                      tau=-0.15)
                ])

            model_artm.scores.add(
                artm.SparsityPhiScore(name='SparsityPhiScore'))
            model_artm.scores.add(
                artm.SparsityThetaScore(name='SparsityThetaScore'))
            model_artm.scores.add(
                artm.TopicKernelScore(name='TopicKernelScore',
                                      probability_mass_threshold=0.3))
            model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                                      num_tokens=10),
                                  overwrite=True)

            model_artm.regularizers.add(
                artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
            model_artm.regularizers['SparseTheta'].tau = -0.2
            model_artm.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                                tau=1.5e+5))

            model_artm.num_document_passes = 1

            model_artm.initialize(dictionary)
            model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                                   num_collection_passes=30)

            return model_artm.transform(batch_vectorizer=batch_vectorizer).T
Example #15
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename, data_format='vowpal_wabbit',
                                            target_folder='batches')

    dictionary = batch_vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
Example #16
def create_thematic_model(checked_list, num_topics, num_tokens, phi_tau,
                          theta_tau, decorr_tau):
    """ Create a thematic model """
    gluing_bag_of_words(checked_list)

    batch_vectorizer = artm.BatchVectorizer(data_path=COLLECTION_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER,
                                            batch_size=len(checked_list))
    dictionary = artm.Dictionary(data_path=TARGET_FOLDER)
    model = artm.ARTM(
        num_topics=num_topics,
        num_document_passes=len(checked_list),
        dictionary=dictionary,
        regularizers=[
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                            tau=phi_tau),
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                              tau=theta_tau),
            artm.DecorrelatorPhiRegularizer(
                name='decorrelator_phi_regularizer', tau=decorr_tau),
        ],
        scores=[
            artm.PerplexityScore(name='perplexity_score',
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            artm.TopTokensScore(name='top_tokens_score', num_tokens=num_tokens)
        ])

    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=len(checked_list))

    top_tokens = model.score_tracker['top_tokens_score']

    topic_dictionary = OrderedDict()

    for topic_name in model.topic_names:
        list_name = []
        for (token, weight) in zip(top_tokens.last_tokens[topic_name],
                                   top_tokens.last_weights[topic_name]):
            list_name.append(token + '-' + str(round(weight, 3)))
        topic_dictionary[str(topic_name)] = list_name

    return model.score_tracker[
        'perplexity_score'].last_value, model.score_tracker[
            'sparsity_phi_score'].last_value, model.score_tracker[
                'sparsity_theta_score'].last_value, topic_dictionary
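Example #17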
def topic_model(class_ids, dictionary, num_of_topics, num_back, tau, tf):

    names_of_topics = [str(x) for x in range(num_of_topics)]
    dictionary.filter(min_tf=tf, class_id='subjects')
    dictionary.filter(min_tf=tf, class_id='objects')
    dictionary.filter(min_tf=tf, class_id='pairs')

    model = artm.ARTM(
        num_topics=num_of_topics,
        #reuse_theta=True,
        cache_theta=True,
        topic_names=names_of_topics,
        class_ids=class_ids,
        #regularizers=regularizers_artm,
        dictionary=dictionary)

    model.scores.add(
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))

    model.scores.add(
        artm.SparsityPhiScore(name='SparsityPhiScore',
                              topic_names=model.topic_names[:-num_back]))

    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SparsePhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=-tau))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SmoothPhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[-num_back:],
            tau=tau))

    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=tau))
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SparseThetaRegularizer',
            topic_names=model.topic_names[-num_back:],
            tau=tau))
    return model
Example #18
    def train(self, batch_vectorizer):
        if self.model is None:
            print('Initialise the model first!')
            return

        self.model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='decorr',
                                            topic_names=self.specific,
                                            tau=self.decor))
        #         self.model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorr_2',
        #                                                               topic_names=self.back, tau=self.decor_2))
        self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=self.n1)

        #         if ((self.n2 != 0) and (self.B != 0)):
        if (self.B != 0):
            self.model.regularizers.add(
                artm.SmoothSparsePhiRegularizer(name='SmoothPhi',
                                                topic_names=self.back,
                                                tau=self.spb))
            self.model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name='SmoothTheta',
                                                  topic_names=self.back,
                                                  tau=self.stb))
            self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                                   num_collection_passes=self.n2)

        self.model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                            topic_names=self.specific,
                                            tau=self.sp1))
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                              topic_names=self.specific,
                                              tau=self.st1))
        self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=self.n3)

        #         if (self.n4 != 0):
        #             self.model.regularizers['SparsePhi'].tau = self.sp2
        #             self.model.regularizers['SparseTheta'].tau = self.st2
        #             self.model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=self.n4)

        print('Training is complete')
Example #19
def pipeline_plsa_bigartm(lines,
                          TOPIC_NUMBER,
                          ngram_range,
                          topnwords,
                          LOGS_DATA_PATH="plsa.txt",
                          TARGET_FOLDER="plsa"):

    make_file(lines, ngram_range, LOGS_DATA_PATH)

    batch_vectorizer = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER)

    model_artm = artm.ARTM(num_topics=TOPIC_NUMBER, cache_theta=True)
    model_artm.initialize(dictionary=batch_vectorizer.dictionary)

    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=0.05))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
    model_artm.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.01))

    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                              num_tokens=topnwords),
                          overwrite=True)
    model_artm.scores.add(
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary=batch_vectorizer.dictionary))

    model_artm.num_document_passes = 2
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=15)

    topic_names = {}
    for topic_name in model_artm.topic_names:
        topic_names[topic_name] = model_artm.score_tracker[
            'TopTokensScore'].last_tokens[topic_name]

    #return label_after_bigarm(model_artm),  topic_names
    return "nothing, sorry", topic_names
Example #20
    def build_model(self, d_dir, n_document_passes=1):
        batch_vectorizer_train = artm.BatchVectorizer(data_path=os.path.join(
            d_dir, 'data_batches_train'),
                                                      data_format="batches")

        batch_vectorizer_test = artm.BatchVectorizer(data_path=os.path.join(
            d_dir, 'data_batches_test'),
                                                     data_format="batches")

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=os.path.join(d_dir, 'for_dict'))

        model = artm.ARTM(num_topics=self.n_topics,
                          dictionary=dictionary,
                          cache_theta=True,
                          reuse_theta=True)

        # Sparsity p(c|t)
        model.scores.add(
            artm.SparsityPhiScore(eps=EPS,
                                  name='SparsityPhiScoreC',
                                  class_id=self.c))

        # Sparsity p(w|t)
        model.scores.add(
            artm.SparsityPhiScore(eps=EPS,
                                  name='SparsityPhiScoreGram3',
                                  class_id=self.gram3))

        #Regularization of sparsity p(gram3|t)
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhiGram3Regularizer',
                                            class_ids=[self.gram3]))

        #Regularization of decorr p(gram3|t)
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(
                name='DecorrelatorPhiGram3Regularizer',
                class_ids=[self.gram3]))

        model.num_document_passes = n_document_passes
        return (model, batch_vectorizer_train, batch_vectorizer_test)
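Example #21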
def init_baseline_artm(
    dataset,
    modalities_to_use,
    main_modality,
    num_topics,
    bcg_topics,
    model_params: dict = None,
):
    """
    Creates a simple ARTM model with standard scores.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str
    main_modality : str
    num_topics : int
    bcg_topics : int
    model_params : dict

    Returns
    -------
    model: artm.ARTM() instance
    """
    if model_params is None:
        model_params = dict()

    model = init_bcg_sparse_model(dataset, modalities_to_use, main_modality,
                                  num_topics, bcg_topics, model_params)
    specific_topic_names = model.topic_names[:-bcg_topics]

    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            gamma=0,
            tau=model_params.get('decorrelation_tau', 0.01),
            name='decorrelation',
            topic_names=specific_topic_names,
            class_ids=modalities_to_use,
        ))

    return model
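Example #22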
def init_decorrelated_plsa(dataset,
                           modalities_to_use,
                           main_modality,
                           num_topics,
                           model_params: dict = None):
    """
    Creates a simple ARTM model with standard scores.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str
    main_modality : str
    num_topics : int
    model_params : dict

    Returns
    -------
    model: artm.ARTM() instance
    """
    if model_params is None:
        model_params = dict()

    model = init_plsa(dataset, modalities_to_use, main_modality, num_topics)
    tau = model_params.get('decorrelation_tau', 0.01)

    specific_topic_names = model.topic_names  # let's decorrelate everything
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            gamma=0,
            tau=tau,
            name='decorrelation',
            topic_names=specific_topic_names,
            class_ids=modalities_to_use,
        ))

    return model
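A hypothetical call (the dataset object and modality name are assumptions; init_plsa comes from the surrounding module):

model = init_decorrelated_plsa(dataset=dataset,
                               modalities_to_use=['@lemmatized'],
                               main_modality='@lemmatized',
                               num_topics=20,
                               model_params={'decorrelation_tau': 0.05})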
Example #23
    topics_names = ["subject_" + str(i) for i in range(topic_num)] + \
                   ["background_" + str(i) for i in range(background_topic_num)]  # назначаем имена темам

    subj_topics = topics_names[:topic_num]
    bgr_topics = topics_names[topic_num:]

    model = artm.ARTM(
        num_document_passes=document_passes_num,
        num_topics=topic_num + background_topic_num,
        topic_names=topics_names,
        seed=100,  # helps to get stable results
        num_processors=processors_num)

    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='Decorrelator',
                                        tau=10**4))  # standard decorrelator
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SmoothTheta', topic_names=bgr_topics,
            tau=0.3))  # smooth Theta for background topics
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SparseTheta', topic_names=subj_topics,
            tau=-0.3))  # sparsify Theta for the "good" (subject) topics
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SmoothPhi',
            topic_names=bgr_topics,
            class_ids=["text"],
            tau=0.1))  # smooth Phi for background topics
    model.regularizers.add(
Example #24
model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

model_artm.regularizers['sparse_phi_regularizer'].tau = 0.01
model_artm.regularizers['sparse_theta_regularizer'].tau = -1.06
# model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+5

model_plsa.initialize(dictionary=dictionary)
model_artm.initialize(dictionary=dictionary)
model_lda.initialize(dictionary=dictionary)

passes = 10
model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)

Example #25
dictionary.load(dictionary_path=(filename + '/dictionary.dict'))

model_artm.initialize(dictionary=dictionary)

model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
model_artm.scores.add(
    artm.TopicKernelScore(name='TopicKernelScore',
                          probability_mass_threshold=0.3))

model_artm.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
model_artm.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
model_artm.regularizers.add(
    artm.TopicSelectionThetaRegularizer(name='TopicSelection', tau=0.25))

model_artm.regularizers['SparsePhi'].tau = -0.5
model_artm.regularizers['SparseTheta'].tau = -0.5
model_artm.regularizers['DecorrelatorPhi'].tau = 1e+5

model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                          num_tokens=10))

model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                       num_collection_passes=40)

ex = model_artm.get_theta()
for i, el in enumerate(ex.sum(axis=1)):
Example #26
    def __init__(self,
                 dictionary,
                 class_ids,
                 tmp_files_path='',
                 theta_columns_naming='title',
                 cache_theta=True,
                 num_levels=None,
                 level_names=None,
                 num_topics=None,
                 topic_names=None,
                 num_backgrounds=None,
                 background_names=None,
                 smooth_background_tau=None,
                 decorrelate_phi_tau=None,
                 parent_topics_proportion=None,
                 spars_psi_tau=None,
                 smooth_theta_fit=1.0,
                 num_collection_passes=1,
                 num_tokens=10):

        self.model = artm.hARTM(dictionary=dictionary,
                                class_ids=class_ids,
                                theta_columns_naming=theta_columns_naming,
                                tmp_files_path=tmp_files_path,
                                cache_theta=cache_theta)

        self.level_names = _generate_names(num_levels, level_names, 'level')

        topic_names = _generate_names_levels(len(self.level_names), num_topics, topic_names, 'topic')
        background_names = _generate_names_levels(len(self.level_names), num_backgrounds, background_names, 'background')

        for topic_names_level, background_names_level in zip(topic_names, background_names):
            topic_names_level = topic_names_level + background_names_level
            level = self.model.add_level(num_topics=len(topic_names_level), topic_names=topic_names_level)

        if smooth_background_tau is not None:
            for level, background_names_level in zip(self.model, background_names):
                level.regularizers.add(artm.SmoothSparsePhiRegularizer('SPhi_back',
                                                                       tau=smooth_background_tau,
                                                                       gamma=0,
                                                                       topic_names=background_names_level))

        if decorrelate_phi_tau is not None:
            for level in self.model:
                level.regularizers.add(artm.DecorrelatorPhiRegularizer('DPhi', tau=decorrelate_phi_tau, gamma=0))

        if (parent_topics_proportion is not None) and (spars_psi_tau is not None):
            for level, parent_topics_proportion_level in zip(self.model[1:], parent_topics_proportion):
                for topic_name, parent_topic_proportion in parent_topics_proportion_level.items():
                    level.regularizers.add(artm.HierarchySparsingThetaRegularizer(name=f'HSTheta_{topic_name}',
                                                                                  topic_names=topic_name,
                                                                                  tau=spars_psi_tau,
                                                                                  parent_topic_proportion=parent_topic_proportion))

        self.smooth_theta_fit = smooth_theta_fit
        self.num_collection_passes = num_collection_passes

        for level in self.model:
            for class_id, weight in class_ids.items():
                if weight > 0:
                    level.scores.add(artm.TopTokensScore(name=f'TT_{class_id}', class_id=class_id, num_tokens=num_tokens))
Example #27
def test_func():
    # constants
    num_tokens = 11
    probability_mass_threshold = 0.9
    sp_reg_tau = -0.1
    decor_tau = 1.5e+5
    decor_rel_tau = 0.3
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    sp_zero_eps = 0.001
    sparsity_phi_value = [
        0.034, 0.064, 0.093, 0.120, 0.145, 0.170, 0.194, 0.220, 0.246, 0.277,
        0.312, 0.351, 0.390, 0.428, 0.464
    ]

    sparsity_phi_rel_value = [
        0.442, 0.444, 0.444, 0.446, 0.448, 0.449, 0.458, 0.468, 0.476, 0.488,
        0.501, 0.522, 0.574, 0.609, 0.670
    ]

    sparsity_theta_value = [0.0] * num_collection_passes

    perp_zero_eps = 2.0
    perplexity_value = [
        6873, 2590, 2685, 2578, 2603, 2552, 2536, 2481, 2419, 2331, 2235, 2140,
        2065, 2009, 1964
    ]

    perplexity_rel_value = [
        6873, 2667, 2458, 2323, 2150, 2265, 2015, 1967, 1807, 1747, 1713, 1607,
        1632, 1542, 1469
    ]

    top_zero_eps = 0.0001
    top_tokens_num_tokens = [num_tokens * num_topics] * num_collection_passes
    top_tokens_topic_0_tokens = [
        u'party', u'state', u'campaign', u'tax', u'political', u'republican',
        u'senate', u'candidate', u'democratic', u'court', u'president'
    ]
    top_tokens_topic_0_weights = [
        0.0209, 0.0104, 0.0094, 0.0084, 0.0068, 0.0067, 0.0065, 0.0058, 0.0053,
        0.0053, 0.0051
    ]

    ker_zero_eps = 0.02
    topic_kernel_topic_0_contrast = 0.96
    topic_kernel_topic_0_purity = 0.014
    topic_kernel_topic_0_size = 18.0
    topic_kernel_average_size = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13, 0.6, 1.6, 3.53, 7.15, 12.6,
        20.4, 29.06
    ]
    topic_kernel_average_contrast = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12, 0.31, 0.7, 0.96, 0.96, 0.96,
        0.96, 0.97
    ]
    topic_kernel_average_purity = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.015, 0.017, 0.02,
        0.03, 0.04, 0.05
    ]

    len_last_document_ids = 10

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        model = artm.ARTM(
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary=dictionary.name,
            cache_theta=True)

        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=sp_reg_tau))
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_tau))

        model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))
        model.scores.add(
            artm.TopicKernelScore(
                name='TopicKernelScore',
                probability_mass_threshold=probability_mass_threshold))
        model.scores.add(artm.ThetaSnippetScore(name='ThetaSnippetScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityThetaScore'].value[i] -
                       sparsity_theta_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perp_zero_eps

        for i in range(num_collection_passes):
            assert model.score_tracker['TopTokensScore'].num_tokens[
                i] == top_tokens_num_tokens[i]

        for i in range(num_tokens):
            assert model.score_tracker['TopTokensScore'].last_tokens[
                model.topic_names[0]][i] == top_tokens_topic_0_tokens[i]
            assert abs(model.score_tracker['TopTokensScore'].last_weights[
                model.topic_names[0]][i] -
                       top_tokens_topic_0_weights[i]) < top_zero_eps

        assert len(model.score_tracker['TopicKernelScore'].last_tokens[
            model.topic_names[0]]) > 0

        assert abs(topic_kernel_topic_0_contrast -
                   model.score_tracker['TopicKernelScore'].last_contrast[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_purity -
                   model.score_tracker['TopicKernelScore'].last_purity[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_size -
                   model.score_tracker['TopicKernelScore'].last_size[
                       model.topic_names[0]]) < ker_zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model.score_tracker['TopicKernelScore'].average_size[i] -
                topic_kernel_average_size[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_contrast[i] -
                topic_kernel_average_contrast[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_purity[i] -
                topic_kernel_average_purity[i]) < ker_zero_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)

        info = model.info
        assert info is not None
        assert len(info.config.topic_name) == num_topics
        assert len(info.score) >= len(model.score_tracker)
        assert len(info.regularizer) == len(model.regularizers.data)
        assert len(info.cache_entry) > 0

        temp = model.score_tracker['ThetaSnippetScore'].last_document_ids
        assert len_last_document_ids == len(temp)
        assert len(model.score_tracker['ThetaSnippetScore'].last_snippet[
            temp[0]]) == num_topics

        phi = model.get_phi()
        assert phi.shape == (vocab_size, num_topics)
        theta = model.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model.library_version.count('.') == 2  # major.minor.patch

        # test relative coefficients for Phi matrix regularizers
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary.name,
                          cache_theta=False)

        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_rel_tau))
        model.regularizers['DecorrelatorPhi'].gamma = 0.0

        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_rel_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_rel_value[i]) < perp_zero_eps
    finally:
        shutil.rmtree(batches_folder)
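Example #28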
def test_func():
    num_topics = 5
    tolerance = 0.05
    batches_folder = tempfile.mkdtemp()

    try:
        with open(os.path.join(batches_folder, 'temp.vw.txt'), 'w') as fout:
            fout.write('title_0 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_1 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_2 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_3 aaa:1 bbb:2 ccc:3\n')

        batch_vectorizer = artm.BatchVectorizer(data_path=os.path.join(
            batches_folder, 'temp.vw.txt'),
                                                data_format='vowpal_wabbit',
                                                target_folder=batches_folder)
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=batch_vectorizer.dictionary,
                          num_document_passes=1)

        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DPR', tau=1))
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        phi = model.get_phi()
        real_values = [
            [0.32, 0.95, 0.2, 0.55, 0.32],
            [0.33, 0.0, 0.68, 0.35, 0.63],
            [0.35, 0.05, 0.11, 0.1, 0.05],
        ]

        for elems, values in zip(phi.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < tolerance

        model.regularizers['DPR'].topic_names = [
            model.topic_names[0], model.topic_names[1]
        ]
        model.regularizers['DPR'].topic_pairs = {
            model.topic_names[0]: {
                model.topic_names[1]: 100.0,
                model.topic_names[2]: 100.0
            }
        }
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        phi = model.get_phi()
        real_values = [
            [0.0, 0.94, 0.22, 0.58, 0.35],
            [0.0, 0.0, 0.63, 0.3, 0.58],
            [0.0, 0.06, 0.14, 0.12, 0.07],
        ]

        for elems, values in zip(phi.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < tolerance

        model.regularizers['DPR'].topic_pairs = {
            model.topic_names[1]: {
                model.topic_names[0]: 10000.0
            }
        }
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        phi = model.get_phi()
        real_values = [
            [0.0, 0.91, 0.21, 0.54, 0.35],
            [0.0, 0.0, 0.55, 0.26, 0.53],
            [0.0, 0.08, 0.24, 0.20, 0.12],
        ]

        for elems, values in zip(phi.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < tolerance
    finally:
        shutil.rmtree(batches_folder)
Example #29
import os
import shutil
import tempfile

import pandas as pd

import artm


def test_func():
    num_topics = 5
    tolerance = 0.01
    batches_folder = tempfile.mkdtemp()

    try:
        with open(os.path.join(batches_folder, 'temp.vw.txt'), 'w') as fout:
            fout.write('title_0 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_1 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_2 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_3 aaa:1 bbb:2 ccc:3\n')

        batch_vectorizer = artm.BatchVectorizer(
            data_path=os.path.join(batches_folder, 'temp.vw.txt'),
            data_format='vowpal_wabbit',
            target_folder=batches_folder)
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=batch_vectorizer.dictionary,
                          num_document_passes=1)

        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DPR', tau=1))
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

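        # get_phi() indexes rows by (modality, token) pairs;
        # this helper builds such keys for the default modality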
        def _f(w):
            return ('@default_class', w)

        phi = model.get_phi()
        real_topics = pd.DataFrame(
            columns=['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4'],
            index=[_f('ccc'), _f('bbb'), _f('aaa')],
            data=[[0.32, 0.95, 0.2, 0.55, 0.32], [0.33, 0.0, 0.68, 0.35, 0.63],
                  [0.35, 0.05, 0.12, 0.1, 0.05]])

        assert (phi - real_topics).abs().values.max() < tolerance

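        # reconfigure the regularizer: first two topics only, explicit pair weights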
        model.regularizers['DPR'].topic_names = [
            model.topic_names[0], model.topic_names[1]
        ]
        model.regularizers['DPR'].topic_pairs = {
            model.topic_names[0]: {
                model.topic_names[1]: 100.0,
                model.topic_names[2]: 100.0
            }
        }
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        phi = model.get_phi()
        real_topics = pd.DataFrame(
            columns=['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4'],
            index=[_f('ccc'), _f('bbb'), _f('aaa')],
            data=[[0.0, 0.94, 0.22, 0.58, 0.35], [0.0, 0.0, 0.63, 0.3, 0.58],
                  [0.0, 0.06, 0.15, 0.12, 0.07]])

        assert (phi - real_topics).abs().values.max() < tolerance

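        # strengthen a single asymmetric pair and refit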
        model.regularizers['DPR'].topic_pairs = {
            model.topic_names[1]: {
                model.topic_names[0]: 10000.0
            }
        }
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        phi = model.get_phi()
        real_topics = pd.DataFrame(
            columns=['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4'],
            index=[_f('ccc'), _f('bbb'), _f('aaa')],
            data=[[0.0, 0.91, 0.21, 0.54, 0.35], [0.0, 0.0, 0.55, 0.26, 0.53],
                  [0.0, 0.09, 0.24, 0.20, 0.12]])

        assert (phi - real_topics).abs().values.max() < tolerance
    finally:
        shutil.rmtree(batches_folder)
Example #30
import json
import os
import shutil
import tempfile

import numpy

import artm

# NOTE: the _assert_* helpers called below are defined elsewhere in the
# original test module and are assumed to be available alongside this test


def test_func():
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    dump_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

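        # two models with the same seed; model_1 caches and reuses theta
        # (and names the theta matrix), model_2 does not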
        model_1 = artm.ARTM(num_processors=7,
                            cache_theta=True,
                            num_document_passes=5,
                            reuse_theta=True,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            theta_name='THETA',
                            dictionary=batch_vectorizer.dictionary)

        model_2 = artm.ARTM(num_processors=7,
                            cache_theta=False,
                            num_document_passes=5,
                            reuse_theta=False,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            dictionary=batch_vectorizer.dictionary)

        for model in [model_1, model_2]:
            model.scores.add(
                artm.PerplexityScore(name='perp',
                                     dictionary=batch_vectorizer.dictionary))
            model.scores.add(artm.SparsityThetaScore(name='sp_theta', eps=0.1))
            model.scores.add(artm.TopTokensScore(name='top_tok',
                                                 num_tokens=10))
            model.scores.add(
                artm.SparsityPhiScore(name='sp_nwt',
                                      model_name=model.model_nwt))
            model.scores.add(
                artm.TopicKernelScore(name='kernel',
                                      topic_names=model.topic_names[0:5],
                                      probability_mass_threshold=0.4))

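            # draw a random integer tau coefficient for every ordered topic pair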
            topic_pairs = {}
            for topic_name_1 in model.topic_names:
                topic_pairs[topic_name_1] = {}
                for topic_name_2 in model.topic_names:
                    topic_pairs[topic_name_1][topic_name_2] = \
                        numpy.random.randint(0, 3)

            model.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='decor',
                                                tau=100000.0,
                                                topic_pairs=topic_pairs))
            model.regularizers.add(
                artm.SmoothSparsePhiRegularizer(
                    name='smsp_phi',
                    tau=-0.5,
                    gamma=0.3,
                    dictionary=batch_vectorizer.dictionary))
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(
                    name='smsp_theta',
                    tau=0.1,
                    doc_topic_coef=[2.0] * model.num_topics))
            model.regularizers.add(
                artm.SmoothPtdwRegularizer(name='sm_ptdw', tau=0.1))

            # train the model and dump it to disk
            model.fit_offline(batch_vectorizer, num_collection_passes=10)
            model.fit_online(batch_vectorizer, update_every=1)

            model.dump_artm_model(os.path.join(dump_folder, 'target'))

            with open(os.path.join(dump_folder, 'target', 'parameters.json'),
                      'r') as fin:
                params = json.load(fin)
            _assert_json_params(params)

            # restore a new model from the dump and check the results match
            model_new = artm.load_artm_model(
                os.path.join(dump_folder, 'target'))

            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)

            # continue learning of both models
            model.fit_offline(batch_vectorizer, num_collection_passes=3)
            model.fit_online(batch_vectorizer, update_every=1)

            model_new.fit_offline(batch_vectorizer, num_collection_passes=3)
            model_new.fit_online(batch_vectorizer, update_every=1)

            # check new results are also equal
            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)

            shutil.rmtree(os.path.join(dump_folder, 'target'))
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(dump_folder)