Code example #1
def generate_sparse_regularizers(
        specific_topic_names, background_topic_names,
        class_ids_for_bcg_smoothing=MAIN_MODALITY,
        specific_words_classes=MAIN_MODALITY):
    """
    Creates an array of pre-configured regularizers
    using specified coefficients
    """
    gimel_smooth_specific = 3e-10
    gimel_smooth_bcg = 0.3
    regularizers = [
        artm.SmoothSparsePhiRegularizer(
            tau=gimel_smooth_specific,
            name='smooth_phi_specific',
            topic_names=specific_topic_names,
            class_ids=specific_words_classes
        ),
        artm.SmoothSparseThetaRegularizer(
            tau=gimel_smooth_specific,
            name='smooth_theta_specific',
            topic_names=specific_topic_names
        ),
        artm.SmoothSparseThetaRegularizer(
            tau=gimel_smooth_bcg,
            name='smooth_theta_background',
            topic_names=background_topic_names
        ),
        artm.SmoothSparsePhiRegularizer(
            tau=gimel_smooth_bcg,
            name='smooth_phi_background',
            topic_names=background_topic_names,
            class_ids=class_ids_for_bcg_smoothing
        ),
    ]
    return regularizers
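For reference, a minimal way to wire these into a model (a sketch only: MAIN_MODALITY is assumed to resolve to the usual '@default_class' marker, and the topic name lists are made up):

import artm

MAIN_MODALITY = '@default_class'  # assumed value of the constant used above

specific_topics = ['topic_{}'.format(i) for i in range(18)]
background_topics = ['bcg_0', 'bcg_1']

model = artm.ARTM(topic_names=specific_topics + background_topics)
for regularizer in generate_sparse_regularizers(specific_topics, background_topics):
    model.regularizers.add(regularizer)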
Code example #2
def create_model_fn_20_complex_reg_1(n_iteration):
    n_topics = 20
    common_topics = [u'topic_0', u'topic_1']
    subject_topics = list(
        set([u'topic_{}'.format(idx)
             for idx in range(2, 20)]) - set(common_topics))
    tmp_model = create_model_complex(current_dictionary=dictionary,
                                     n_topics=n_topics,
                                     n_doc_passes=5,
                                     seed_value=100 + n_iteration,
                                     n_top_tokens=15,
                                     p_mass_threshold=0.25,
                                     common_topics=common_topics,
                                     subject_topics=subject_topics)
    # subject topics
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer_subject',
                                          topic_names=subject_topics))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer_subject',
                                        topic_names=subject_topics,
                                        class_ids=['@default_class']))
    tmp_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='decorrelator_phi_regularizer_subject',
            topic_names=subject_topics,
            class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer_subject'].tau = -0.5
    tmp_model.regularizers['ss_phi_regularizer_subject'].tau = -0.5
    tmp_model.regularizers['decorrelator_phi_regularizer_subject'].tau = -10

    # common topics
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer_common',
                                          topic_names=common_topics))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer_common',
                                        topic_names=common_topics,
                                        class_ids=['@default_class']))
    #     tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer_common',
    #         topic_names=subject_topics, class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer_common'].tau = 0.5
    tmp_model.regularizers['ss_phi_regularizer_common'].tau = 0.5
    #     tmp_model.regularizers['decorrelator_phi_regularizer_common'].tau = -10

    tmp_model = fit_one_model_complex(
        plot_maker,
        batch_vectorizer,
        models_file,
        config,
        tmp_model,
        _n_iterations=20,
        _model_name='model_20_complex_reg_1_iter_{}'.format(n_iteration))
    return tmp_model
Code example #3
def create_model_fn_4(n_iteration):
    tmp_model = cmh.create_model(current_dictionary=dictionary,
                                 n_topics=100,
                                 n_doc_passes=5,
                                 seed_value=100 + n_iteration,
                                 n_top_tokens=15,
                                 p_mass_threshold=0.25)
    tmp_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
    tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
    tmp_model.regularizers['ss_phi_regularizer'].tau = -2
    tmp_model = cmh.fit_one_model(
        plot_maker,
        batch_vectorizer,
        models_file,
        config,
        tmp_model,
        _n_iterations=20,
        _model_name='model_20_m4_iter_{}'.format(n_iteration))
    return tmp_model
Code example #4
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Define the ARTM model.
    :param n_topics: number of topics.
    :param dictionary: batch vectorizer dictionary.
    :param sparse_theta: sparse theta parameter.
    :param sparse_phi: sparse phi parameter.
    :param decorrelator_phi: decorrelator phi parameter.
    :return: ARTM model.
    """
    print("Defining the model.")
    topic_names = ["topic_{}".format(i) for i in range(1, n_topics + 1)]
    model_artm = artm.ARTM(
        topic_names=topic_names,
        cache_theta=True,
        scores=[
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name="SparsityPhiScore"),
            artm.SparsityThetaScore(name="SparsityThetaScore"),
            artm.TopicKernelScore(name="TopicKernelScore",
                                  probability_mass_threshold=0.3),
            artm.TopTokensScore(name="TopTokensScore", num_tokens=15)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name="SparseTheta",
                                              tau=sparse_theta),
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                            tau=decorrelator_phi)
        ])
    return model_artm
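A possible way to drive define_model end to end (paths and tau values here are placeholders, not from the original project; note that the model still has to be initialized before fitting):

batch_vectorizer = artm.BatchVectorizer(data_path='collection.vw',
                                        data_format='vowpal_wabbit',
                                        target_folder='batches')
model = define_model(n_topics=20,
                     dictionary=batch_vectorizer.dictionary,
                     sparse_theta=-0.5,
                     sparse_phi=-0.5,
                     decorrelator_phi=1e+5)
model.initialize(dictionary=batch_vectorizer.dictionary)
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)
print(model.score_tracker['PerplexityScore'].last_value)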
Code example #5
    def init_model(self, dictionary_path=None):
        """dictionary_path: optional, used with pretrained model"""
        self.dictionary = artm.Dictionary()
        if dictionary_path is None:
            self.dictionary.gather(data_path=self.batches_path)
            self.dictionary.filter(min_tf=10, max_df_rate=0.1)
            self.dictionary.save_text(
                f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")
        else:
            self.dictionary.load_text(dictionary_path)

        self.model = artm.ARTM(
            num_topics=self.n_topics,
            dictionary=self.dictionary,
            show_progress_bars=True,
        )

        # scores
        self.model.scores.add(
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=self.dictionary))
        self.model.scores.add(
            artm.SparsityThetaScore(name="SparsityThetaScore"))
        self.model.scores.add(artm.SparsityPhiScore(name="SparsityPhiScore"))

        # regularizers
        self.model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1))
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5))
        self.model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5))
Code example #6
File: test_topic_model.py Project: xtonev/TopicNet
def test_fancy_fit_is_ok(experiment_enviroment):
    tm, dataset, experiment, dictionary = experiment_enviroment
    model_artm = artm.ARTM(
        num_topics=5,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
        theta_columns_naming='title',
        class_ids={
            MAIN_MODALITY: 1,
            NGRAM_MODALITY: 1,
            EXTRA_MODALITY: 1,
            '@psyduck': 42
        },
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='smooth_theta', tau=10.0),
        ])
    custom_scores = {'mean_kernel_size': ScoreExample()}

    tm = TopicModel(model_artm,
                    model_id='absolutely_new_id',
                    custom_scores=custom_scores)

    num_iterations = 10
    tm._fit(dataset.get_batch_vectorizer(), num_iterations)
    params = tm.get_jsonable_from_parameters()
    assert "smooth_theta" in params["regularizers"]
    PATH = "tests/experiments/save_standalone/"
    tm.save(PATH)
    tm2 = TopicModel.load(PATH)
    assert (tm.get_phi() == tm2.get_phi()).all().all()
Code example #7
    def init_hierarchical_model(class_ids):
        score = [artm.PerplexityScore(name='perplexity_words', class_ids=['body']),
                 artm.PerplexityScore(name='perplexity_bigrams', class_ids=['bigrams'])]

        top_tokens = [artm.TopTokensScore(name='top_words', num_tokens=15, class_id='body'),
                      artm.TopTokensScore(name='top_bigrams', num_tokens=10, class_id='bigrams')]

        sparsity = [artm.SparsityThetaScore(name='sparsity_theta', eps=1e-6),
                    artm.SparsityPhiScore(name='sparsity_phi_words', class_id='words', eps=1e-6),
                    artm.SparsityPhiScore(name='sparsity_phi_bigrams', class_id='bigrams', eps=1e-6)]

        regularizers = [artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['body'], name='decorr_words'),
                        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['bigram'], name='decorr_bigrams'),
                        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['categories'], name='decorr_categories'),
                        artm.SmoothSparseThetaRegularizer(tau=0, name='sparsity_theta'),
                        artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['body'], name='sparsity_words'),
                        artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['bigram'], name='sparsity_bigrams')]

        hmodel = artm.hARTM(class_ids=class_ids,
                            cache_theta=True,
                            reuse_theta=True,
                            scores=score + top_tokens + sparsity,
                            regularizers=regularizers,
                            theta_columns_naming='title')
        return hmodel
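Being an hARTM instance, the returned model still needs levels before it can be fitted. A sketch under the assumption that a batch_vectorizer already exists and the modality weights are illustrative:

hmodel = init_hierarchical_model({'body': 1.0, 'bigrams': 0.5, 'categories': 1.0})
level_0 = hmodel.add_level(num_topics=10)          # coarse parent topics
level_1 = hmodel.add_level(num_topics=50,
                           parent_level_weight=1)  # finer child topics
hmodel.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)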
Code example #8
def fit():
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)

    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']

    batch = artm.messages.Batch()
    batch.id = batch_id
    term_to_id = {}
    all_terms = []

    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)

    for t in all_terms:
        batch.token.append(t)

    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())

    app.logger.info("batch %s is created", batch_id)

    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)

    model_artm = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in range(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)

    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")

    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    app.logger.info("mode was fitted")

    model_artm.save(os.path.join(batch_id, "model"))

    return jsonify({"id": batch_id})
Code example #9
    def train(self, batch_vectorizer):
        if self.model is None:
            print('Initialise the model first!')
            return

        self.model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='decorr',
                                            topic_names=self.specific,
                                            tau=self.decor))
        #         self.model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorr_2',
        #                                                               topic_names=self.back, tau=self.decor_2))
        self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=self.n1)

        #         if ((self.n2 != 0) and (self.B != 0)):
        if (self.B != 0):
            self.model.regularizers.add(
                artm.SmoothSparsePhiRegularizer(name='SmoothPhi',
                                                topic_names=self.back,
                                                tau=self.spb))
            self.model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name='SmoothTheta',
                                                  topic_names=self.back,
                                                  tau=self.stb))
            self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                                   num_collection_passes=self.n2)

        self.model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                            topic_names=self.specific,
                                            tau=self.sp1))
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                              topic_names=self.specific,
                                              tau=self.st1))
        self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=self.n3)

        #         if (self.n4 != 0):
        #             self.model.regularizers['SparsePhi'].tau = self.sp2
        #             self.model.regularizers['SparseTheta'].tau = self.st2
        #             self.model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=self.n4)

        print('Training is complete')
Code example #10
File: baseline_compute.py Project: andreitsy/topics
def compute_big_artm(num_topics, tau, dictionary, batch_vectorizer, score_computer):
    artm_model = artm.ARTM(num_topics=num_topics,
                           num_document_passes=5,
                           dictionary=dictionary,
                           scores=[artm.PerplexityScore(name='s1')],
                           regularizers=[artm.SmoothSparseThetaRegularizer(name='r1', tau=tau)], cache_theta=True)
    artm_model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)
    theta_bigartm = artm_model.get_theta()
    bigartm_predicts = get_df_clusters_predicted(theta_bigartm, url_list)
    score = score_computer.compute_score(bigartm_predicts["story_id_predicted"])
    logging.info("num_topics={}, tau={},"
                 "bigARTM score = {}".format(num_topics, tau, score))
Code example #11
 def create_topic_model(self, topic_model_name: str,
                        batch_vectorizer: artm.BatchVectorizer,
                        dictionary: artm.Dictionary) -> artm.ARTM:
     topic_model = artm.ARTM(num_topics=self.number_of_topics,
                             dictionary=dictionary,
                             cache_theta=False)
     topic_model.scores.add(
         artm.PerplexityScore(name='perplexity_score',
                              dictionary=dictionary))
     topic_model.scores.add(
         artm.SparsityPhiScore(name='sparsity_phi_score'))
     topic_model.scores.add(
         artm.SparsityThetaScore(name='sparsity_theta_score'))
     topic_model.num_document_passes = 5
     topic_model.num_processors = max(1, os.cpu_count() - 1)
     topic_model.regularizers.add(
         artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
     topic_model.regularizers.add(
         artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
     topic_model.regularizers.add(
         artm.DecorrelatorPhiRegularizer(
             name='decorrelator_phi_regularizer'))
     topic_model.regularizers['sparse_phi_regularizer'].tau = -1.0
     topic_model.regularizers['sparse_theta_regularizer'].tau = -0.5
     topic_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+5
     best_score = None
     keyword_extraction_logger.info(
         'epoch  perplexity_score  sparsity_phi_score  sparsity_theta_score'
     )
     for restart_index in range(10):
         topic_model.fit_offline(batch_vectorizer=batch_vectorizer,
                                 num_collection_passes=3)
         if best_score is None:
             best_score = topic_model.score_tracker[
                 'perplexity_score'].last_value
         else:
             if best_score > topic_model.score_tracker[
                     'perplexity_score'].last_value:
                 best_score = topic_model.score_tracker[
                     'perplexity_score'].last_value
                 self.save_topic_model(topic_model, topic_model_name)
         keyword_extraction_logger.info(
             '{0:5}  {1:16.9}  {2:18.9}  {3:20.9}'.format(
                 (restart_index + 1) * 3,
                 topic_model.score_tracker['perplexity_score'].last_value,
                 topic_model.score_tracker['sparsity_phi_score'].last_value,
                 topic_model.score_tracker['sparsity_theta_score'].
                 last_value))
     del topic_model
     return self.load_topic_model(
         artm.ARTM(num_topics=self.number_of_topics,
                   dictionary=dictionary,
                   cache_theta=False), topic_model_name)
Code example #12
 def fit_supervised(self, model, X, y):
                                        
     len_y = len(y)
     topic_names = model.topic_names
     
     doc_topic_coef = np.zeros((len_y, model.num_topics))
     doc_topic_coef[range(len_y), [topic_names.index(topic_name) for topic_name in y]] = 1.0
     
     model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SST', 
                                                              tau=self.smooth_theta_fit, 
                                                              doc_titles=y.index, 
                                                              doc_topic_coef=doc_topic_coef.tolist()))
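The doc_topic_coef built above is a one-hot matrix: row d has 1.0 in the column of the topic labelled for document d. A standalone illustration (label values are made up):

import numpy as np
import pandas as pd

y = pd.Series(['topic_0', 'topic_2', 'topic_0'],
              index=['doc_a', 'doc_b', 'doc_c'])   # document title -> topic label
topic_names = ['topic_0', 'topic_1', 'topic_2']

coef = np.zeros((len(y), len(topic_names)))
coef[range(len(y)), [topic_names.index(t) for t in y]] = 1.0
print(coef.tolist())
# [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0]]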
Code example #13
    def _get_corpus_model(self,
                          corpus_vector_spaced,
                          clustering_method='artm'):
        if 'gensim' == clustering_method:
            return self._get_model_LSI(corpus_vector_spaced)
        elif 'sklearn' == clustering_method:
            return self._get_model_LDA(corpus_vector_spaced)
        elif 'artm' == clustering_method:
            batch_vectorizer = corpus_vector_spaced['batch_vectorizer']
            dictionary = corpus_vector_spaced['dictionary']

            topic_names = [
                'topic_{}'.format(i) for i in range(self.num_of_clusters)
            ]

            model_artm = artm.ARTM(
                topic_names=topic_names,
                cache_theta=True,
                scores=[
                    artm.PerplexityScore(name='PerplexityScore',
                                         dictionary=dictionary)
                ],
                regularizers=[
                    artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                      tau=-0.15)
                ])

            model_artm.scores.add(
                artm.SparsityPhiScore(name='SparsityPhiScore'))
            model_artm.scores.add(
                artm.SparsityThetaScore(name='SparsityThetaScore'))
            model_artm.scores.add(
                artm.TopicKernelScore(name='TopicKernelScore',
                                      probability_mass_threshold=0.3))
            model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                                      num_tokens=10),
                                  overwrite=True)

            model_artm.regularizers.add(
                artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
            model_artm.regularizers['SparseTheta'].tau = -0.2
            model_artm.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                                tau=1.5e+5))

            model_artm.num_document_passes = 1

            model_artm.initialize(dictionary)
            model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                                   num_collection_passes=30)

            return model_artm.transform(batch_vectorizer=batch_vectorizer).T
Code example #14
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename, data_format='vowpal_wabbit',
                                            target_folder='batches')

    dictionary = batch_vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
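print_measures is not defined in this snippet; a plausible stand-in that reads the final score values (score names follow the ones registered above; artm.LDA exposes its perplexity through its own property rather than a score tracker):

def print_measures(model_plsa, model_artm, model_lda):
    # hypothetical helper: compare the three models on their last recorded scores
    print('PLSA perplexity:', model_plsa.score_tracker['perplexity_score'].last_value)
    print('ARTM perplexity:', model_artm.score_tracker['perplexity_score'].last_value)
    print('LDA  perplexity:', model_lda.perplexity_last_value)
    print('ARTM sparsity phi:', model_artm.score_tracker['sparsity_phi_score'].last_value)
    print('ARTM sparsity theta:', model_artm.score_tracker['sparsity_theta_score'].last_value)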
Code example #15
File: text.py Project: Temirlan/t-model
def create_thematic_model(checked_list, num_topics, num_tokens, phi_tau,
                          theta_tau, decorr_tau):
    """ Create a thematic model """
    gluing_bag_of_words(checked_list)

    batch_vectorizer = artm.BatchVectorizer(data_path=COLLECTION_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER,
                                            batch_size=len(checked_list))
    dictionary = artm.Dictionary(data_path=TARGET_FOLDER)
    model = artm.ARTM(
        num_topics=num_topics,
        num_document_passes=len(checked_list),
        dictionary=dictionary,
        regularizers=[
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                            tau=phi_tau),
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                              tau=theta_tau),
            artm.DecorrelatorPhiRegularizer(
                name='decorrelator_phi_regularizer', tau=decorr_tau),
        ],
        scores=[
            artm.PerplexityScore(name='perplexity_score',
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            artm.TopTokensScore(name='top_tokens_score', num_tokens=num_tokens)
        ])

    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=len(checked_list))

    top_tokens = model.score_tracker['top_tokens_score']

    topic_dictionary = OrderedDict()

    for topic_name in model.topic_names:
        list_name = []
        for (token, weight) in zip(top_tokens.last_tokens[topic_name],
                                   top_tokens.last_weights[topic_name]):
            list_name.append(token + '-' + str(round(weight, 3)))
        topic_dictionary[str(topic_name)] = list_name

    return model.score_tracker[
        'perplexity_score'].last_value, model.score_tracker[
            'sparsity_phi_score'].last_value, model.score_tracker[
                'sparsity_theta_score'].last_value, topic_dictionary
Code example #16
def topic_model(class_ids, dictionary, num_of_topics, num_back, tau, tf):

    names_of_topics = [str(x) for x in range(num_of_topics)]
    dictionary.filter(min_tf=tf, class_id='subjects')
    dictionary.filter(min_tf=tf, class_id='objects')
    dictionary.filter(min_tf=tf, class_id='pairs')

    model = artm.ARTM(
        num_topics=num_of_topics,
        #reuse_theta=True,
        cache_theta=True,
        topic_names=names_of_topics,
        class_ids=class_ids,
        #regularizers=regularizers_artm,
        dictionary=dictionary)

    model.scores.add(
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))

    model.scores.add(
        artm.SparsityPhiScore(name='SparsityPhiScore',
                              topic_names=model.topic_names[:-num_back]))

    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SparsePhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=-tau))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SmoothPhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[-num_back:],
            tau=tau))

    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=tau))
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SparseThetaRegularizer',
            topic_names=model.topic_names[-num_back:],
            tau=tau))
    return model
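A usage sketch (argument values are illustrative, and dictionary/batch_vectorizer are assumed to exist): the last num_back topics act as background and are smoothed with +tau, while the leading subject topics are sparsified with -tau and decorrelated.

model = topic_model(class_ids={'subjects': 1.0, 'objects': 1.0, 'pairs': 1.0},
                    dictionary=dictionary,
                    num_of_topics=30,
                    num_back=3,
                    tau=0.1,
                    tf=5)
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)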
Code example #17
def create_model_fn_20_reg_1(n_iteration):
    tmp_model = create_model(current_dictionary=dictionary,
                             n_topics=20,
                             n_doc_passes=5,
                             seed_value=100 + n_iteration,
                             n_top_tokens=15,
                             p_mass_threshold=0.25)
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer'].tau = -3
    tmp_model.regularizers['ss_phi_regularizer'].tau = -3
    tmp_model = fit_one_model(
        tmp_model,
        _n_iterations=20,
        _model_name='model_20_reg_1_iter_{}'.format(n_iteration))
    return tmp_model
Code example #18
def pipeline_plsa_bigartm(lines,
                          TOPIC_NUMBER,
                          ngram_range,
                          topnwords,
                          LOGS_DATA_PATH="plsa.txt",
                          TARGET_FOLDER="plsa"):

    make_file(lines, ngram_range, LOGS_DATA_PATH)

    batch_vectorizer = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER)

    model_artm = artm.ARTM(num_topics=TOPIC_NUMBER, cache_theta=True)
    model_artm.initialize(dictionary=batch_vectorizer.dictionary)

    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=0.05))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
    model_artm.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.01))

    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                              num_tokens=topnwords),
                          overwrite=True)
    model_artm.scores.add(
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary=batch_vectorizer.dictionary))

    model_artm.num_document_passes = 2
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=15)

    topic_names = {}
    for topic_name in model_artm.topic_names:
        topic_names[topic_name] = model_artm.score_tracker[
            'TopTokensScore'].last_tokens[topic_name]

    #return label_after_bigarm(model_artm),  topic_names
    return "nothing, sorry", topic_names
Code example #19
def init_simple_default_model(
    dataset,
    modalities_to_use,
    main_modality,
    specific_topics,
    background_topics,
):
    """
    Creates simple artm model with standard scores.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str
    main_modality : str
    specific_topics : list or int
    background_topics : list or int

    Returns
    -------
    model: artm.ARTM() instance
    """
    if isinstance(specific_topics, list):
        specific_topic_names = list(specific_topics)
    else:
        specific_topics = int(specific_topics)
        specific_topic_names = [f'topic_{i}' for i in range(specific_topics)]
    n_specific_topics = len(specific_topic_names)
    if isinstance(background_topics, list):
        background_topic_names = list(background_topics)
    else:
        background_topics = int(background_topics)
        background_topic_names = [
            f'background_{n_specific_topics + i}'
            for i in range(background_topics)
        ]
    n_background_topics = len(background_topic_names)
    dictionary = dataset.get_dictionary()

    baseline_class_ids = {class_id: 1 for class_id in modalities_to_use}
    tokens_data = count_vocab_size(dictionary, modalities_to_use)
    abs_weights = modality_weight_rel2abs(tokens_data, baseline_class_ids,
                                          main_modality)

    model = init_model(
        topic_names=specific_topic_names + background_topic_names,
        class_ids=abs_weights,
    )

    if n_background_topics > 0:
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(
                name='smooth_phi_bcg',
                topic_names=background_topic_names,
                tau=0.0,
                class_ids=[main_modality],
            ), )
        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='smooth_theta_bcg',
                topic_names=background_topic_names,
                tau=0.0,
            ), )

    model.initialize(dictionary)
    add_standard_scores(model,
                        dictionary,
                        main_modality=main_modality,
                        all_modalities=modalities_to_use)

    return model
Code example #20
def test_func():
    num_topics = 5
    batches_folder = tempfile.mkdtemp()

    try:
        with open(os.path.join(batches_folder, 'temp.vw.txt'), 'w') as fout:
            fout.write('title_0 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_1 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_2 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_3 aaa:1 bbb:2 ccc:3\n')

        batch_vectorizer = artm.BatchVectorizer(data_path=os.path.join(
            batches_folder, 'temp.vw.txt'),
                                                data_format='vowpal_wabbit',
                                                target_folder=batches_folder)
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=batch_vectorizer.dictionary,
                          num_document_passes=1,
                          cache_theta=True,
                          theta_columns_naming='title')

        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='SST', tau=-1000.0, doc_titles=['title_0', 'title_2']))
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        theta = model.get_theta()
        real_values = [
            [0.0, 0.14, 0.0, 0.14],
            [0.0, 0.25, 0.0, 0.25],
            [0.0, 0.19, 0.0, 0.19],
            [0.0, 0.21, 0.0, 0.21],
            [0.0, 0.21, 0.0, 0.21],
        ]

        for elems, values in zip(theta.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < 0.01

        model.initialize(dictionary=batch_vectorizer.dictionary)
        model.regularizers['SST'].doc_titles = [
            'title_0', 'title_2', 'title_1'
        ]
        model.regularizers['SST'].doc_topic_coef = [0.0, 1.0, 1.0, 0.0, 0.0]
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        theta = model.get_theta()
        real_values = [
            [0.26, 0.26, 0.26, 0.14],
            [0.0, 0.0, 0.0, 0.25],
            [0.0, 0.0, 0.0, 0.19],
            [0.36, 0.36, 0.36, 0.21],
            [0.38, 0.38, 0.38, 0.21],
        ]

        for elems, values in zip(theta.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < 0.01

        model.initialize(dictionary=batch_vectorizer.dictionary)
        model.regularizers['SST'].doc_titles = ['title_0', 'title_3']
        model.regularizers['SST'].doc_topic_coef = [[
            -1.0, 1.0, 0.0, 0.0, -1.0
        ], [0.0, 1.0, 0.0, -1.0, 0.0]]
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        theta = model.get_theta()
        real_values = [
            [0.499311, 0.146202, 0.146202, 0.000873],
            [0.0, 0.247351, 0.247351, 0.0],
            [0.000556, 0.185883, 0.185883, 0.001110],
            [0.000617, 0.206015, 0.206015, 0.996735],
            [0.499516, 0.214550, 0.214550, 0.001282],
        ]

        for elems, values in zip(theta.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < 0.000001

        model.initialize(dictionary=batch_vectorizer.dictionary)
        model.regularizers['SST'].doc_titles = []
        model.regularizers['SST'].doc_topic_coef = [0.0, 1.0, 1.0, 0.0, 0.0]
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        theta = model.get_theta()
        real_values = [
            [0.26, 0.26, 0.26, 0.26],
            [0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0],
            [0.36, 0.36, 0.36, 0.36],
            [0.38, 0.38, 0.38, 0.38],
        ]

        for elems, values in zip(theta.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < 0.01
    finally:
        shutil.rmtree(batches_folder)
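For quick reference, the two shapes of doc_topic_coef exercised by the test above (reg stands for the 'SST' regularizer instance):

# one flat list of num_topics floats, shared by every document in doc_titles
reg.doc_topic_coef = [0.0, 1.0, 1.0, 0.0, 0.0]

# or one list per document (len(doc_titles) lists of num_topics floats each)
reg.doc_topic_coef = [[-1.0, 1.0, 0.0, 0.0, -1.0],
                      [0.0, 1.0, 0.0, -1.0, 0.0]]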
Code example #21
File: topics.py Project: Serenitas/topic-modeller
model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

model_artm.regularizers['sparse_phi_regularizer'].tau = 0.01
model_artm.regularizers['sparse_theta_regularizer'].tau = -1.06
# model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+5

model_plsa.initialize(dictionary=dictionary)
model_artm.initialize(dictionary=dictionary)
model_lda.initialize(dictionary=dictionary)

passes = 10
model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
Code example #22
                                            data_format='bow_uci',
                                            collection_name=filename,
                                            target_folder=filename)
else:
    batch_vectorizer = artm.BatchVectorizer(data_path=filename,
                                            data_format='batches')

dictionary = artm.Dictionary()

model_artm = artm.ARTM(topic_names=['topic_{}'.format(i) for i in range(15)],
                       scores=[
                           artm.PerplexityScore(name='PerplexityScore',
                                                dictionary=dictionary)
                       ],
                       regularizers=[
                           artm.SmoothSparseThetaRegularizer(
                               name='SparseTheta', tau=-0.15)
                       ],
                       cache_theta=True)

if not os.path.isfile(filename + '/dictionary.dict'):
    dictionary.gather(data_path=batch_vectorizer.data_path)
    dictionary.save(dictionary_path=filename + '/dictionary.dict')

dictionary.load(dictionary_path=(filename + '/dictionary.dict'))

model_artm.initialize(dictionary=dictionary)

model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
model_artm.scores.add(
Code example #23
def make_artm(col):
    """Get artm theta matrixes"""

    collection_train = pd.DataFrame(collection).iloc[train_index].reset_index()
    collection_test = pd.DataFrame(collection).iloc[test_index].reset_index()
    le = LabelEncoder()
    y_transformed = le.fit_transform(df_y)

    arr = []
    for index_number, i, c in zip(collection_train['index'],
                                  collection_train[0], y_transformed):
        arr.append(
            str(index_number) + ' |@default_class ' + str(i) +
            ' |@labels_class ' + str(c))

    arr_test = []
    for index_number, i in zip(collection_test['index'], collection_test[0]):
        arr_test.append(str(index_number) + ' |@default_class ' + str(i))

    pd.DataFrame(arr, index=None).to_csv('leaver_vw_form.txt',
                                         sep='\t',
                                         encoding='UTF-8',
                                         index=False,
                                         header=None)
    pd.DataFrame(arr_test, index=None).to_csv('leaver_vw_form_test.txt',
                                              sep='\t',
                                              encoding='UTF-8',
                                              index=False,
                                              header=None)

    batch_vectorizer = artm.BatchVectorizer(
        data_path="leaver_vw_form.txt",
        data_format="vowpal_wabbit",
        target_folder="leaver_vw_form_train",
        batch_size=100)
    batch_vectorizer_test = artm.BatchVectorizer(
        data_path="leaver_vw_form_test.txt",
        data_format="vowpal_wabbit",
        target_folder="leaver_vw_form_test",
        batch_size=100)

    T = pd.DataFrame(df_y)[u'Процесс'].nunique()
    print("количество тем составляет - {}".format(T))  # количество тем
    topic_names = ["sbj" + str(i) for i in range(T)]

    model_artm = artm.ARTM(num_topics=T,
                           topic_names=topic_names,
                           class_ids={
                               '@default_class': 1,
                               '@labels_class': 700
                           },
                           num_document_passes=10,
                           seed=79,
                           reuse_theta=True,
                           cache_theta=True,
                           scores=[
                               artm.TopTokensScore(name='top_tokens_score',
                                                   num_tokens=30,
                                                   class_id='@default_class')
                           ],
                           regularizers=[
                               artm.SmoothSparseThetaRegularizer(
                                   name='SparseTheta', tau=-0.15)
                           ])

    dictionary = artm.Dictionary(name='dictionary')
    dictionary.gather(batch_vectorizer.data_path)

    model_artm.initialize('dictionary')

    dictionary.filter(min_tf=2, min_df_rate=0.01)

    model_artm.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score',
                              class_id='@labels_class'))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_def',
                                        class_ids=['@default_class']))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_lab',
                                        class_ids=['@labels_class']))

    model_artm.scores.add(
        artm.PerplexityScore(name='PerplexityScore', dictionary='dictionary'))
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=15)

    test_transformed = model_artm.transform(batch_vectorizer_test,
                                            predict_class_id='@labels_class').T
    train_transformed = model_artm.transform(
        batch_vectorizer, predict_class_id='@labels_class').T

    test_transformed = test_transformed.reset_index().sort_values('index')
    test_transformed = test_transformed.reset_index(drop=True)
    del test_transformed['index']
    test_transformed = test_transformed[sorted(test_transformed.columns)]
    train_transformed = train_transformed.reset_index().sort_values('index')
    del train_transformed['index']
    train_transformed = train_transformed[sorted(train_transformed.columns)]
    train_transformed = train_transformed.reset_index(drop=True)
    artm_transformed = pd.concat([train_transformed, test_transformed],
                                 axis=0).reset_index(drop=True)

    return artm_transformed
Code example #24
    subj_topics = topics_names[:topic_num]
    bgr_topics = topics_names[topic_num:]

    model = artm.ARTM(num_document_passes=document_passes_num,
                      num_topics=topic_num + background_topic_num,
                      topic_names=topics_names,
                      num_processors=12,
                      seed=100)
    '''
    The regularizer that ties our theta to the model being trained.
    doc_titles lists the documents that correspond to the rows of theta.
    '''
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='Theta',
            tau=10**3,
            doc_topic_coef=theta,
            doc_titles=[str(i) for i in range(len(data.train_docs))]))

    model.class_ids = {
        "title": 1,
        "text": 1,
        "label": 5,
    }
    model.initialize(dictionary.filter(min_df=10))

    model.fit_offline(
        batch_vectorizer=BatchVectorizer(batches=train_batch,
                                         process_in_memory_model=model),
        num_collection_passes=3
    )  # since theta is given this time, 3 passes are quite enough
Code example #25
File: model_constructor.py Project: yyht/TopicNet
def init_simple_default_model(
    dataset: Dataset,
    modalities_to_use: List[str] or Dict[str, float],
    main_modality: str,
    specific_topics: List[str] or int,
    background_topics: List[str] or int,
) -> artm.ARTM:
    """
    Creates simple `artm.ARTM` model with standard scores.

    Parameters
    ----------
    dataset
        Dataset for model initialization
    modalities_to_use
        What modalities a model should know.
        If `modalities_to_use` is a dictionary,
        all given weights are assumed to be relative to `main_modality`:
        weights will then be recalculated to absolute ones
        using `dataset` and `main_modality`.
        If `modalities_to_use` is a list,
        then all relative weights are set equal to one.

        The result model's `class_ids` field will contain absolute modality weights.
    main_modality
        Modality relative to which all modality weights are considered
    specific_topics
        Specific topic names or their number
    background_topics
        Background topic names or their number

    Returns
    -------
    model : artm.ARTM

    """
    if isinstance(modalities_to_use, dict):
        modalities_weights = modalities_to_use
    else:
        modalities_weights = {class_id: 1 for class_id in modalities_to_use}

    specific_topic_names, background_topic_names = create_default_topics(
        specific_topics, background_topics)
    dictionary = dataset.get_dictionary()

    tokens_data = count_vocab_size(dictionary, modalities_to_use)
    abs_weights = modality_weight_rel2abs(tokens_data, modalities_weights,
                                          main_modality)

    model = init_model(
        topic_names=specific_topic_names + background_topic_names,
        class_ids=abs_weights,
    )

    if len(background_topic_names) > 0:
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(
                name='smooth_phi_bcg',
                topic_names=background_topic_names,
                tau=0.0,
                class_ids=[main_modality],
            ), )
        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='smooth_theta_bcg',
                topic_names=background_topic_names,
                tau=0.0,
            ), )

    model.initialize(dictionary)
    add_standard_scores(model,
                        main_modality=main_modality,
                        all_modalities=modalities_to_use)

    return model
Code example #26
File: test_lda_model.py Project: karthi2016/bigartm
def test_func():
    # constants
    num_tokens = 15
    alpha = 0.01
    beta = 0.02
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        model_artm = artm.ARTM(num_topics=num_topics,
                               dictionary=dictionary,
                               cache_theta=True,
                               reuse_theta=True)

        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=beta))
        model_artm.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=alpha))

        model_artm.scores.add(
            artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 dictionary=dictionary))
        model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))

        model_lda = artm.LDA(num_topics=num_topics,
                             alpha=alpha,
                             beta=beta,
                             dictionary=dictionary,
                             cache_theta=True)
        model_lda.initialize(dictionary=dictionary)

        model_artm.num_document_passes = num_document_passes
        model_lda.num_document_passes = num_document_passes

        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=num_collection_passes)
        model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                              num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['SparsityPhiScore'].value[i] -
                       model_lda.sparsity_phi_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model_artm.score_tracker['SparsityThetaScore'].value[i] -
                model_lda.sparsity_theta_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['PerplexityScore'].value[i] -
                       model_lda.perplexity_value[i]) < zero_eps

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens)
        assert len(lda_tt) == num_topics

        for i in range(num_topics):
            for j in range(num_tokens):
                assert model_artm.score_tracker['TopTokensScore'].last_tokens[
                    model_artm.topic_names[i]][j] == lda_tt[i][j]

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens,
                                          with_weights=True)
        for i in range(num_tokens):
            assert abs(model_artm.score_tracker['TopTokensScore'].last_weights[
                model_artm.topic_names[0]][i] - lda_tt[0][i][1]) < zero_eps

        model_lda.fit_online(batch_vectorizer=batch_vectorizer)

        phi = model_lda.phi_
        assert phi.shape == (vocab_size, num_topics)
        theta = model_lda.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model_lda.library_version.count('.') == 2  # major.minor.patch

        model_lda = artm.LDA(num_topics=num_topics,
                             alpha=alpha,
                             beta=([0.1] * num_topics),
                             dictionary=dictionary,
                             cache_theta=True)
        assert model_lda._internal_model.regularizers.size() == num_topics + 1
    finally:
        shutil.rmtree(batches_folder)
Code example #27
def init_bcg_sparse_model(dataset,
                          modalities_to_use,
                          main_modality,
                          specific_topics,
                          bcg_topics,
                          model_params: dict = None):
    """
    Creates simple artm model with standard scores.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str or dict
    main_modality : str
    specific_topics : int
    bcg_topics : int

    Returns
    -------
    model: artm.ARTM() instance
    """
    if model_params is None:
        model_params = dict()

    model = init_plsa(dataset, modalities_to_use, main_modality,
                      specific_topics, bcg_topics)
    background_topic_names = model.topic_names[-bcg_topics:]
    specific_topic_names = model.topic_names[:-bcg_topics]

    dictionary = dataset.get_dictionary()
    baseline_class_ids = {class_id: 1 for class_id in modalities_to_use}
    data_stats = count_vocab_size(dictionary, baseline_class_ids)

    # all coefficients are relative
    regularizers = [
        artm.SmoothSparsePhiRegularizer(
            name='smooth_phi_bcg',
            topic_names=background_topic_names,
            tau=model_params.get("smooth_bcg_tau", 0.1),
            class_ids=[main_modality],
        ),
        artm.SmoothSparseThetaRegularizer(
            name='smooth_theta_bcg',
            topic_names=background_topic_names,
            tau=model_params.get("smooth_bcg_tau", 0.1),
        ),
        artm.SmoothSparsePhiRegularizer(
            name='sparse_phi_sp',
            topic_names=specific_topic_names,
            tau=model_params.get("sparse_sp_tau", -0.05),
            class_ids=[main_modality],
        ),
        artm.SmoothSparseThetaRegularizer(
            name='sparse_theta_sp',
            topic_names=specific_topic_names,
            tau=model_params.get("sparse_sp_tau", -0.05),
        ),
    ]
    for reg in regularizers:
        model.regularizers.add(
            transform_regularizer(data_stats,
                                  reg,
                                  model.class_ids,
                                  n_topics=len(reg.topic_names)))

    return model
Code example #28
def init_lda(
    dataset: Dataset,
    modalities_to_use: List[str],
    main_modality: str,
    num_topics: int,
    model_params: dict = None,
):
    """
    Creates simple artm model with standard scores.

    Parameters
    ----------
    dataset
    modalities_to_use
    main_modality
    num_topics
    model_params

    Returns
    -------
    model: artm.ARTM() instance
    """
    if model_params is None:
        model_params = dict()

    model = init_plsa(dataset, modalities_to_use, main_modality, num_topics)

    prior = model_params.get('prior', 'symmetric')

    # What GenSim returns by default (everything is 'symmetric')
    # see https://github.com/RaRe-Technologies/gensim/blob/master/gensim/models/ldamodel.py#L521
    # Note that you can specify prior shape for alpha and beta separately,
    # but we do not do that here
    if prior == "symmetric":
        alpha = 1.0 / num_topics
        eta = 1.0 / num_topics
    elif prior == "asymmetric":
        # following the recommendation from
        # http://papers.nips.cc/paper/3854-rethinking-lda-why-priors-matter
        # we will use symmetric prior over Phi and asymmetric over Theta
        eta = 1.0 / num_topics
        num_terms = 0  # isn't used, so let's not compute it
        alpha = _init_dirichlet_prior("alpha", num_topics, num_terms=num_terms)

    elif prior == "double_asymmetric":
        # this stuff is needed for asymmetric Phi initialization:
        artm_dict = dataset.get_dictionary()
        temp_df = artm_dict2df(artm_dict)  # noqa: F821
        num_terms = temp_df.query("class_id in @modalities_to_use").shape[0]
        eta = _init_dirichlet_prior("eta", num_topics, num_terms)
        alpha = _init_dirichlet_prior("alpha", num_topics, num_terms)
        # TODO: turns out, BigARTM does not support tau as a list of floats (or dictionary)
        # so we need to use custom regularizer instead
        # (TopicPrior doesn't work because it provides $beta_t$ instead of $beta_w$)
        raise NotImplementedError

    elif prior == "heuristic":
        # Found in doi.org/10.1007/s10664-015-9379-3 (2016)
        #  "We use the defacto standard heuristics of α=50/K and β=0.01
        #  (Biggers et al. 2014) for our hyperparameter values"
        alpha = 50.0 / num_topics
        eta = 0.01
    else:
        raise ValueError(f"prior type '{prior}' is not supported")

    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='smooth_phi',
            tau=eta,
            class_ids=[main_modality],
        ), )

    if isinstance(alpha, (list, np.ndarray)):
        alpha = [float(a) for a in alpha]

        assert len(alpha) == len(model.topic_names)

        for i, topic in enumerate(model.topic_names):
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name=f'smooth_theta_{i}',
                                                  tau=alpha[i],
                                                  topic_names=topic))
    else:
        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='smooth_theta',
                tau=alpha,
            ), )

    return model
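
The helper _init_dirichlet_prior used above is not defined in this snippet.
A minimal sketch, assuming it mirrors gensim's init_dir_prior formula that the
comments reference (prior_i = 1 / (i + sqrt(K)), normalized to sum to one):

import numpy as np

def _init_dirichlet_prior(name, num_topics, num_terms):
    # topic-indexed vector for alpha, term-indexed for eta (assumed convention)
    shape = num_topics if name == "alpha" else num_terms
    prior = np.fromiter(
        (1.0 / (i + np.sqrt(num_topics)) for i in range(shape)),
        dtype=np.float64,
        count=shape,
    )
    # normalize so the prior weights sum to one, as gensim does
    return prior / prior.sum()
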
Code example #29
0
def test_func():
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    dump_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        model_1 = artm.ARTM(num_processors=7,
                            cache_theta=True,
                            num_document_passes=5,
                            reuse_theta=True,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            theta_name='THETA',
                            dictionary=batch_vectorizer.dictionary)

        model_2 = artm.ARTM(num_processors=7,
                            cache_theta=False,
                            num_document_passes=5,
                            reuse_theta=False,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            dictionary=batch_vectorizer.dictionary)

        for model in [model_1, model_2]:
            model.scores.add(
                artm.PerplexityScore(name='perp',
                                     dictionary=batch_vectorizer.dictionary))
            model.scores.add(artm.SparsityThetaScore(name='sp_theta', eps=0.1))
            model.scores.add(artm.TopTokensScore(name='top_tok',
                                                 num_tokens=10))
            model.scores.add(
                artm.SparsityPhiScore(name='sp_nwt',
                                      model_name=model.model_nwt))
            model.scores.add(
                artm.TopicKernelScore(name='kernel',
                                      topic_names=model.topic_names[0:5],
                                      probability_mass_threshold=0.4))

            topic_pairs = {}
            for topic_name_1 in model.topic_names:
                for topic_name_2 in model.topic_names:
                    if topic_name_1 not in topic_pairs:
                        topic_pairs[topic_name_1] = {}
                    topic_pairs[topic_name_1][
                        topic_name_2] = numpy.random.randint(0, 3)

            model.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='decor',
                                                tau=100000.0,
                                                topic_pairs=topic_pairs))
            model.regularizers.add(
                artm.SmoothSparsePhiRegularizer(
                    name='smsp_phi',
                    tau=-0.5,
                    gamma=0.3,
                    dictionary=batch_vectorizer.dictionary))
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name='smsp_theta',
                                                  tau=0.1,
                                                  doc_topic_coef=[2.0] *
                                                  model.num_topics))
            model.regularizers.add(
                artm.SmoothPtdwRegularizer(name='sm_ptdw', tau=0.1))

            # train the first model and dump it to disk
            model.fit_offline(batch_vectorizer, num_collection_passes=10)
            model.fit_online(batch_vectorizer, update_every=1)

            model.dump_artm_model(os.path.join(dump_folder, 'target'))

            params = {}
            with open(os.path.join(dump_folder, 'target', 'parameters.json'),
                      'r') as fin:
                params = json.load(fin)
            _assert_json_params(params)

            # create a second model from the dump and check that the results are equal
            model_new = artm.load_artm_model(
                os.path.join(dump_folder, 'target'))

            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)

            # continue learning of both models
            model.fit_offline(batch_vectorizer, num_collection_passes=3)
            model.fit_online(batch_vectorizer, update_every=1)

            model_new.fit_offline(batch_vectorizer, num_collection_passes=3)
            model_new.fit_online(batch_vectorizer, update_every=1)

            # check new results are also equal
            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)

            shutil.rmtree(os.path.join(dump_folder, 'target'))
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(dump_folder)
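
For reference, the dump/load round-trip exercised above reduces to two calls
(the target path here is hypothetical):

model.dump_artm_model('/tmp/artm_dump')              # writes matrices plus parameters.json
model_copy = artm.load_artm_model('/tmp/artm_dump')  # restores an equivalent ARTM instance
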
Code example #30
0
    subj_topics = topics_names[:topic_num]
    bgr_topics = topics_names[topic_num:]

    model = artm.ARTM(
        num_document_passes=document_passes_num,
        num_topics=topic_num + background_topic_num,
        topic_names=topics_names,
        seed=100,  # helps to get stable results
        num_processors=processors_num)

    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='Decorrelator',
                                        tau=10**4))  # a standard decorrelator
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SmoothTheta', topic_names=bgr_topics,
            tau=0.3))  # smooth Theta for the background topics
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SparseTheta', topic_names=subj_topics,
            tau=-0.3))  # sparsify Theta for the subject ("good") topics
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SmoothPhi',
            topic_names=bgr_topics,
            class_ids=["text"],
            tau=0.1))  # smooth Phi for the background topics
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SparsePhi',
            topic_names=subj_topics,