Example #1
def two_experiment_enviroments(request):
    """ """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    model_artm_1 = artm.ARTM(
        num_processors=1,
        num_topics=5, cache_theta=True,
        num_document_passes=1, dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore'),
                artm.SparsityPhiScore(name='SparsityPhiScore', class_id=MAIN_MODALITY)]
    )

    model_artm_2 = artm.ARTM(
        num_processors=1,
        num_topics=5, cache_theta=True,
        num_document_passes=1, dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore'),
                artm.SparsityPhiScore(name='SparsityPhiScore', class_id=MAIN_MODALITY)]
    )

    tm_1 = TopicModel(model_artm_1, model_id='new_id_1')
    tm_2 = TopicModel(model_artm_2, model_id='new_id_2')

    experiment_1 = Experiment(
        experiment_id="test_1", save_path="tests/experiments", topic_model=tm_1
    )
    experiment_2 = Experiment(
        experiment_id="test_2", save_path="tests/experiments", topic_model=tm_2
    )

    return tm_1, experiment_1, tm_2, experiment_2, dataset, dictionary
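A sketch of a test consuming this fixture (the test name and body are assumptions; only the unpacking order comes from the fixture's return statement, and the methods used appear in the other examples on this page):

def test_models_train_independently(two_experiment_enviroments):
    tm_1, experiment_1, tm_2, experiment_2, dataset, dictionary = two_experiment_enviroments
    batch_vectorizer = dataset.get_batch_vectorizer()
    # Fitting one model must not affect the other experiment's model.
    tm_1._fit(batch_vectorizer, 3)
    tm_2._fit(batch_vectorizer, 3)
    assert tm_1.get_phi().shape == tm_2.get_phi().shape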
Example #2
    def create_topic_model(self, topic_model_name: str,
                           batch_vectorizer: artm.BatchVectorizer,
                           dictionary: artm.Dictionary) -> artm.ARTM:
        topic_model = artm.ARTM(num_topics=self.number_of_topics,
                                dictionary=dictionary,
                                cache_theta=False)
        topic_model.scores.add(
            artm.PerplexityScore(name='perplexity_score',
                                 dictionary=dictionary))
        topic_model.scores.add(
            artm.SparsityPhiScore(name='sparsity_phi_score'))
        topic_model.scores.add(
            artm.SparsityThetaScore(name='sparsity_theta_score'))
        topic_model.num_document_passes = 5
        topic_model.num_processors = max(1, os.cpu_count() - 1)
        topic_model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
        topic_model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
        topic_model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(
                name='decorrelator_phi_regularizer'))
        topic_model.regularizers['sparse_phi_regularizer'].tau = -1.0
        topic_model.regularizers['sparse_theta_regularizer'].tau = -0.5
        topic_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+5
        best_score = None
        keyword_extraction_logger.info(
            'epoch  perplexity_score  sparsity_phi_score  sparsity_theta_score'
        )
        for restart_index in range(10):
            topic_model.fit_offline(batch_vectorizer=batch_vectorizer,
                                    num_collection_passes=3)
            if best_score is None:
                best_score = topic_model.score_tracker[
                    'perplexity_score'].last_value
                # Save the very first fit as well, so that load_topic_model
                # below always finds a saved model even if the perplexity
                # never improves on later passes.
                self.save_topic_model(topic_model, topic_model_name)
            elif best_score > topic_model.score_tracker[
                    'perplexity_score'].last_value:
                best_score = topic_model.score_tracker[
                    'perplexity_score'].last_value
                self.save_topic_model(topic_model, topic_model_name)
            keyword_extraction_logger.info(
                '{0:5}  {1:16.9}  {2:18.9}  {3:20.9}'.format(
                    (restart_index + 1) * 3,
                    topic_model.score_tracker['perplexity_score'].last_value,
                    topic_model.score_tracker['sparsity_phi_score'].last_value,
                    topic_model.score_tracker['sparsity_theta_score'].last_value))
        del topic_model
        return self.load_topic_model(
            artm.ARTM(num_topics=self.number_of_topics,
                      dictionary=dictionary,
                      cache_theta=False), topic_model_name)
Example #3
File: test_cubes.py Project: yyht/TopicNet
def test_phi_matrix_after_lda_sampled_regularizer(experiment_enviroment):
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset(DATA_PATH)
        dictionary = dataset.get_dictionary()
        batch_vectorizer = dataset.get_batch_vectorizer()

    topic_prior_reg = TopicPriorSampledRegularizer(
        name='topic_prior',
        tau=5,
        num_topics=5,
        beta_prior=[10, 1, 100, 2, 1000])

    model_artm_1 = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={
            MAIN_MODALITY: 1.0,
            NGRAM_MODALITY: 1.0
        },
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
    model_artm_2 = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={
            MAIN_MODALITY: 1.0,
            NGRAM_MODALITY: 1.0
        },
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )

    tm_1 = TopicModel(
        model_artm_1,
        model_id='new_id_1',
        custom_regularizers={topic_prior_reg.name: topic_prior_reg})
    tm_2 = TopicModel(model_artm_2, model_id='new_id_2')

    tm_1._fit(batch_vectorizer, 10)
    tm_2._fit(batch_vectorizer, 10)

    phi_first = tm_1.get_phi()
    phi_second = tm_2.get_phi()

    assert (phi_first != phi_second).values.any(), \
        'Phi matrices are the same after regularization.'
Example #4
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename, data_format='vowpal_wabbit',
                                            target_folder='batches')

    dictionary = batch_vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
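A minimal sketch of invoking this helper (the file name and tau values are assumptions; negative taus sparsify Phi and Theta, positive taus smooth them):

# Hypothetical call: a Vowpal Wabbit collection plus mildly sparsifying taus.
experiment('collection.vw', tau_phi=-0.5, tau_theta=-0.3)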
Example #5
def test_fancy_fit_is_ok(experiment_enviroment):
    tm, dataset, experiment, dictionary = experiment_enviroment
    model_artm = artm.ARTM(
        num_topics=5,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
        theta_columns_naming='title',
        class_ids={
            MAIN_MODALITY: 1,
            NGRAM_MODALITY: 1,
            EXTRA_MODALITY: 1,
            '@psyduck': 42
        },
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='smooth_theta', tau=10.0),
        ])
    custom_scores = {'mean_kernel_size': ScoreExample()}

    tm = TopicModel(model_artm,
                    model_id='absolutely_new_id',
                    custom_scores=custom_scores)

    num_iterations = 10
    tm._fit(dataset.get_batch_vectorizer(), num_iterations)
    params = tm.get_jsonable_from_parameters()
    assert "smooth_theta" in params["regularizers"]
    PATH = "tests/experiments/save_standalone/"
    tm.save(PATH)
    tm2 = TopicModel.load(PATH)
    assert (tm.get_phi() == tm2.get_phi()).all().all()
Example #6
    def __init__(self,
                 artm_model=None,
                 model_id=None,
                 parent_model_id=None,
                 data_path=None,
                 description=None,
                 experiment=None,
                 custom_scores=dict(),
                 *args,
                 **kwargs):
        """
        Initialize stage, also used for loading previously saved experiments.

        Parameters
        ----------
        artm_model : artm model or None
            model to use, None if you want to create model (Default value = None)
        model_id : str
            model id (Default value = None)
        parent_model_id : str
            model id from which current model was created (Default value = None)
        data_path : str
            path to the data (Default value = None)
        description : list of dict
            description of the model (Default value = None)
        experiment : Experiment
            the experiment to which the model is bound (Default value = None)
        custom_scores : dict
            dictionary with score names as keys and score objects as values
            (instances of classes with functionality like that of BaseScore)

        """
        super().__init__(model_id=model_id,
                         parent_model_id=parent_model_id,
                         experiment=experiment,
                         *args,
                         **kwargs)

        if artm_model is None:
            try:
                self._model = artm.ARTM(**kwargs)
            except ArtmException as e:
                error_message = repr(e)
                raise ValueError(
                    f'Cannot create artm model with parameters {kwargs}.\n'
                    "ARTM failed with following: " + error_message)
        else:
            self._model = artm_model

        self.data_path = data_path
        self.custom_scores = custom_scores

        self._score_caches = None  # returned by model.score, reset by model._fit

        self._description = []
        if description is None and self._model._initialized:
            init_params = self.get_jsonable_from_parameters()
            self._description = [{"action": "init", "params": [init_params]}]
        else:
            self._description = description
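Two ways of constructing this class follow from the branches above: wrap a ready artm.ARTM instance, or pass None and let the constructor build one from **kwargs. A sketch of the intended use (the dictionary variable is an assumption; whether BaseModel tolerates the extra kwargs depends on code not shown here):

# Wrap an existing ARTM instance.
tm = TopicModel(artm.ARTM(num_topics=5, dictionary=dictionary), model_id='wrapped')

# Or let TopicModel create the ARTM model itself; a bad parameter set
# surfaces as the ValueError raised in the constructor above.
tm = TopicModel(artm_model=None, model_id='built', num_topics=5, dictionary=dictionary)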
Example #7
    def __init__(self, uci_dir, dictionary, n_topics):
        bv = artm.BatchVectorizer(data_format='bow_uci', data_path=uci_dir, collection_name='corpus',
                                  target_folder=uci_dir + '/artm_batches')
        bv_dict = bv.dictionary

        logging.info("Fitting the ARTM model")
        model = artm.ARTM(dictionary=bv_dict, num_topics=n_topics)

        model.fit_offline(batch_vectorizer=bv, num_collection_passes=10)

        logging.info("Processing word-topic matrices")

        # Create a new word-topic matrix according to dictionary indices
        self.phi = np.zeros(model.phi_.shape, dtype=np.float64)
        for word, vec in model.phi_.iterrows():
            idx = dictionary.token2id[word[1]]
            self.phi[idx, :] = vec

        logging.info("Building the index for ARTM")
        corpus = model.transform(bv).T.sort_index()
        corpus = [matutils.full2sparse(row) for index, row in corpus.iterrows()]
        self.index = similarities.MatrixSimilarity(corpus, num_features=n_topics, num_best=self.N_BEST)

        self.model = model
        self.dictionary = dictionary
Example #8
def experiment_enviroment(request):
    """ """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    model_artm = artm.ARTM(
        num_topics=5,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0, EXTRA_MODALITY: 1.0},
        num_document_passes=1, dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
        theta_columns_naming='title',
    )
    custom_scores = {'mean_kernel_size': ScoreExample()}

    tm = TopicModel(model_artm, model_id='new_id', custom_scores=custom_scores)
    experiment = Experiment(experiment_id="test", save_path="tests/experiments", topic_model=tm)

    def resource_teardown():
        """ """
        shutil.rmtree("tests/experiments")
        shutil.rmtree(dataset._internals_folder_path)

    request.addfinalizer(resource_teardown)

    return tm, dataset, experiment, dictionary
Example #9
def fit():
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)

    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']

    batch = artm.messages.Batch()
    batch.id = batch_id
    term_to_id = {}
    all_terms = []

    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)

    for t in all_terms:
        batch.token.append(t)

    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())

    app.logger.info("batch %s is created", batch_id)

    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)

    model_artm = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in range(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)

    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")

    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    app.logger.info("mode was fitted")

    model_artm.save(os.path.join(batch_id, "model"))

    return jsonify({"id": batch_id})
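The handler above reads terms and topics from the request JSON and answers with the id of the created batch; a hypothetical client call (the route path and port are assumptions, they are not visible in the snippet):

import requests

resp = requests.post('http://localhost:5000/fit',
                     json={'terms': [['apple', 'banana'], ['car', 'road']],
                           'topics': 2})
print(resp.json())  # e.g. {"id": "<uuid of the created batch>"}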
Example #10
    def init_model(self, dictionary_path=None):
        """dictionary_path: optional, used with pretrained model"""
        self.dictionary = artm.Dictionary()
        if dictionary_path is None:
            self.dictionary.gather(data_path=self.batches_path)
            self.dictionary.filter(min_tf=10, max_df_rate=0.1)
            self.dictionary.save_text(
                f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")
        else:
            self.dictionary.load_text(dictionary_path)

        self.model = artm.ARTM(
            num_topics=self.n_topics,
            dictionary=self.dictionary,
            show_progress_bars=True,
        )

        # scores
        self.model.scores.add(
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=self.dictionary))
        self.model.scores.add(
            artm.SparsityThetaScore(name="SparsityThetaScore"))
        self.model.scores.add(artm.SparsityPhiScore(name="SparsityPhiScore"))

        # regularizers
        self.model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1))
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5))
        self.model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5))
Example #11
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Define the ARTM model.
    :param n_topics: number of topics.
    :param dictionary: batch vectorizer dictionary.
    :param sparse_theta: sparse theta parameter.
    :param sparse_phi: sparse phi parameter.
    :param decorrelator_phi: decorrelator phi parameter.
    :return: ARTM model.
    """
    print("Defining the model.")
    topic_names = ["topic_{}".format(i) for i in range(1, n_topics + 1)]
    model_artm = artm.ARTM(
        topic_names=topic_names,
        cache_theta=True,
        scores=[
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name="SparsityPhiScore"),
            artm.SparsityThetaScore(name="SparsityThetaScore"),
            artm.TopicKernelScore(name="TopicKernelScore",
                                  probability_mass_threshold=0.3),
            artm.TopTokensScore(name="TopTokensScore", num_tokens=15)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name="SparseTheta",
                                              tau=sparse_theta),
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                            tau=decorrelator_phi)
        ])
    return model_artm
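A sketch of training a model defined this way (the batches path, topic count, and tau values are assumptions; only PerplexityScore is read back):

batch_vectorizer = artm.BatchVectorizer(data_path='batches', data_format='batches')
model = define_model(n_topics=20, dictionary=batch_vectorizer.dictionary,
                     sparse_theta=-0.15, sparse_phi=-0.1, decorrelator_phi=1e5)
model.initialize(dictionary=batch_vectorizer.dictionary)
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)
print(model.score_tracker['PerplexityScore'].last_value)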
Example #12
def experiment_enviroment(request):
    """ """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    ex_score = ScoreExample()
    tm = TopicModel(model_artm,
                    model_id='new_id',
                    custom_scores={'example_score': ex_score})
    # experiment starts without model
    experiment = Experiment(tm,
                            experiment_id="test_cube_creator",
                            save_path="tests/experiments")
    return tm, dataset, experiment, dictionary
Example #13
def get_topic_weights(data_folder, tm_index):
    import artm
    import os

    from dags.bigartm.services.bigartm_utils import load_monkey_patch
    from util.constants import BASE_DAG_DIR

    print("!!!", "Get topic weights")
    batches_folder = os.path.join(data_folder, "batches")
    batch_vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                            data_format='batches')
    model_folder = os.path.join(BASE_DAG_DIR, "bigartm_models")
    model_artm = artm.ARTM(num_topics=tm_index.number_of_topics,
                           class_ids={"text": 1},
                           theta_columns_naming="title",
                           reuse_theta=True,
                           cache_theta=True,
                           num_processors=4)
    model_artm.load = load_monkey_patch
    model_artm.load(model_artm,
                    os.path.join(model_folder, f"model_{tm_index.name}.model"))

    theta = model_artm.transform(batch_vectorizer)

    theta_values = theta.values.transpose().astype(float)
    theta_topics = theta.index.array.to_numpy().astype(str)
    theta_documents = theta.columns.array.to_numpy().astype(str)

    return theta_values, theta_topics, theta_documents
Example #14
File: test_cubes.py Project: yyht/TopicNet
def experiment_enviroment(request):
    """ """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset(DATA_PATH)
        dictionary = dataset.get_dictionary()

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={
            MAIN_MODALITY: 1.0,
            NGRAM_MODALITY: 1.0
        },
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )

    tm = TopicModel(model_artm, model_id='new_id')
    experiment = Experiment(experiment_id="test_cubes",
                            save_path="tests/experiments",
                            topic_model=tm)

    return tm, dataset, experiment, dictionary
Example #15
def create_model_complex(current_dictionary,
                         n_topics,
                         n_doc_passes,
                         seed_value,
                         n_top_tokens,
                         p_mass_threshold,
                         common_topics,
                         subject_topics,
                         class_name='@default_class',
                         _debug_print=False):
    if _debug_print:
        print('[{}] creating model'.format(datetime.now()))
    model = artm.ARTM(num_topics=n_topics,
                      dictionary=current_dictionary,
                      cache_theta=True,
                      seed=seed_value,
                      class_ids={class_name: 1.0})
    model.num_document_passes = n_doc_passes
    add_complex_scores_to_model(model,
                                current_dictionary,
                                n_top_tokens=n_top_tokens,
                                p_mass_threshold=p_mass_threshold,
                                common_topics=common_topics,
                                subject_topics=subject_topics,
                                class_name=class_name)
    return model
Example #16
def bigartm_predict(mess_bigartm, top1_cat):
    try:
        mess_bigartm = ' |text ' + mess_bigartm

        # load the model
        T = 10
        model_artm = artm.ARTM(num_topics=T,
                               topic_names=['sbj' + str(i) for i in range(T)],
                               class_ids={'text': 1})
        model_artm.load(os.path.join(artm_dir, top1_cat + ".dump"))

        # save the text to a file
        with open(os.path.join(currdir, 'flask/test_artm.txt'), 'w') as f:
            f.write(mess_bigartm + '\n')

        batch_vectorizer_test = artm.BatchVectorizer(
            data_path=os.path.join(currdir, 'flask/test_artm.txt'),
            data_format='vowpal_wabbit',
            target_folder=os.path.join(currdir, 'flask/test'),
            batch_size=100)
        theta_test = model_artm.transform(
            batch_vectorizer=batch_vectorizer_test)

    except Exception:
        print('Failed to load the bigartm model')
        theta_test = None  # avoid a NameError on the return below
    return theta_test
Example #17
def init_model(T,
               B,
               batches_dir,
               regularizers_dict,
               num_document_passes=30,
               weights_dict=None,
               min_df=None,
               max_tf=None):
    T = int(T)
    B = int(B)
    main_topics_num = T
    model_artm = artm.ARTM(
        num_topics=T + B,
        topic_names=[
            "topic{}".format(i) if i < main_topics_num else "back{}".format(i)
            for i in range(T + B)
        ],
        cache_theta=True,
        show_progress_bars=True,
        class_ids=weights_dict,
        num_document_passes=num_document_passes)

    topic_names = model_artm.topic_names
    model_artm, my_dictionary = dictionary_initialization(
        model_artm, batches_dir, min_df, max_tf)
    print("Model is initialized!")

    if regularizers_dict:
        model_artm = reset_regularizers(model_artm, regularizers_dict)
    model_artm = init_score_tracker(model_artm, my_dictionary)
    return model_artm
Example #18
    def select_from_corpus(self, list_of_files: List[str],
                           preprocessor: BaseTextPreprocessor,
                           spacy_nlp: Language) -> List[str]:
        topic_model_name = os.path.normpath(self.topic_model_name.strip())
        if len(topic_model_name) == 0:
            raise ValueError('A topic model name is empty!')
        dir_name = os.path.dirname(topic_model_name)
        base_name = os.path.basename(topic_model_name)
        if len(dir_name) == 0:
            dir_name = os.path.curdir
        if len(base_name) == 0:
            raise ValueError(
                '`{0}` is an incorrect name for a topic model! The base name of the file is empty!'
                .format(self.topic_model_name))
        if not os.path.isdir(dir_name):
            raise ValueError(
                '`{0}` is an incorrect name for a topic model! The directory `{1}` does not exist!'
                .format(self.topic_model_name, dir_name))
        collection_name = os.path.normpath(
            os.path.join(dir_name, base_name + '.collection'))
        collection_docword_name = os.path.normpath(
            os.path.join(dir_name, 'docword.' + base_name + '.collection'))
        collection_vocab_name = os.path.normpath(
            os.path.join(dir_name, 'vocab.' + base_name + '.collection'))
        if (not os.path.isfile(collection_docword_name)) or (
                not os.path.isfile(collection_vocab_name)):
            self.create_collection_as_bow_uci(list_of_files, preprocessor,
                                              spacy_nlp,
                                              collection_docword_name,
                                              collection_vocab_name)
        batches_path = os.path.normpath(
            os.path.join(dir_name, base_name + '.data_batches'))
        if os.path.isdir(batches_path):
            batch_vectorizer = artm.BatchVectorizer(data_path=batches_path,
                                                    data_format='batches')
        else:
            batch_vectorizer = artm.BatchVectorizer(
                data_path=dir_name,
                data_format='bow_uci',
                collection_name=collection_name,
                target_folder=batches_path)
        dictionary = artm.Dictionary()
        dictionary_name = os.path.normpath(topic_model_name + '.dictionary')
        if os.path.isfile(dictionary_name):
            dictionary.load(dictionary_name)
        else:
            dictionary.gather(data_path=batches_path)
            dictionary.save(dictionary_name)
        topic_model = self.load_topic_model(
            artm.ARTM(num_topics=self.number_of_topics,
                      dictionary=dictionary,
                      cache_theta=False), topic_model_name)
        if topic_model is None:
            topic_model = self.create_topic_model(topic_model_name,
                                                  batch_vectorizer, dictionary)
            if topic_model is None:
                raise ValueError(
                    'The trained topic model cannot be loaded from the file `{0}`!'
                    .format(topic_model_name))
        return self.select_keywords_from_topic_model(topic_model)
Example #19
    def setup_class(cls):
        """ """
        with warnings.catch_warnings():
            warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
            dataset = Dataset('tests/test_data/test_dataset.csv')
            raw_data = []
            with open('tests/test_data/test_vw.txt', encoding='utf-8') as file:
                for line in file:
                    raw_data += [line.split(' ')]
            dictionary = dataset.get_dictionary()
            batch_vectorizer = dataset.get_batch_vectorizer()

        model_artm = artm.ARTM(
            num_topics=NUM_TOPICS,
            class_ids=dict.fromkeys(CLASS_IDS, 1.0),
            topic_names=TOPIC_NAMES,
            cache_theta=True,
            num_document_passes=NUM_DOCUMENT_PASSES,
            dictionary=dictionary,
            scores=[artm.PerplexityScore(name='PerplexityScore')],
        )

        cls.topic_model = TopicModel(model_artm, model_id='model_id')
        cls.topic_model._fit(batch_vectorizer, num_iterations=NUM_ITERATIONS)
        cls.raw_data = raw_data
Example #20
def cluster_artm(text):
    batch_vectorizer = artm.BatchVectorizer(data_path=text,
                                            data_format='vowpal_wabbit', target_folder='batch_small',
                                            batch_size=20)
    T = 10  # number of topics
    topic_names = ["sbj" + str(i) for i in range(T - 1)] + ["bcg"]

    model_artm = artm.ARTM(num_topics=T, topic_names=topic_names, reuse_theta=True,
                           num_document_passes=1)

    np.random.seed(1)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer.data_path)
    model_artm.initialize(dictionary)

    model_artm.scores.add(artm.TopTokensScore(name='metric1', num_tokens=15))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='smoothing', dictionary=dictionary,
                                                                topic_names='bcg', tau=1e5))

    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=6)
    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='stimulates',
                                                                dictionary=dictionary,
                                                                topic_names=["sbj" + str(i) for i in range(T - 1)],
                                                                tau=-1e5))

    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=6)

    for topic_name in model_artm.topic_names:
        with open('cluster_log_artm.txt', 'a') as f_in:
            f_in.write(topic_name + ':')
            for word in model_artm.score_tracker["metric1"].last_tokens[topic_name]:
                f_in.write(word + ' ')
            f_in.write('\n')
Example #21
File: test_cubes.py Project: yyht/TopicNet
def test_custom_regularizer_cubed_controlled(experiment_enviroment,
                                             thread_flag, by_name):
    """ """
    _, dataset, _, dictionary = experiment_enviroment
    multiplier = 2
    initial_tau = 5

    custom_reg = TopicPriorSampledRegularizer(name='topic_prior',
                                              tau=initial_tau,
                                              num_topics=5,
                                              beta_prior=[10, 1, 100, 2, 1000])

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={
            MAIN_MODALITY: 1.0,
            NGRAM_MODALITY: 1.0
        },
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
    tm = TopicModel(
        model_artm,
        model_id='new_id_1',
        custom_regularizers={custom_reg.name: custom_reg} if by_name else {})
    experiment = Experiment(  # noqa: F841
        tm,
        experiment_id="cubed_controlled_reg",
        save_path="tests/experiments")

    parameters = {
        "score_to_track": None,
        "tau_converter": f"prev_tau * {multiplier}",
        "user_value_grid": [0.3],
        "max_iters": float("inf")
    }

    if by_name:
        parameters["reg_name"] = custom_reg.name
    else:
        parameters["regularizer"] = custom_reg

    num_iter = 10
    cube = RegularizationControllerCube(num_iter=num_iter,
                                        parameters=parameters,
                                        reg_search="grid",
                                        use_relative_coefficients=False,
                                        separate_thread=thread_flag)
    dummies = cube(tm, dataset)

    tmodels = [dummy.restore() for dummy in dummies]

    for one_model in tmodels:
        actual_tau = one_model.all_regularizers[custom_reg.name].tau

        assert actual_tau == initial_tau * (multiplier**num_iter)
Example #22
def _get_topic_model(dataset: Dataset,
                     phi: pd.DataFrame = None,
                     num_topics: int = None,
                     seed: int = None,
                     scores: List[BaseScore] = None,
                     num_safe_fit_iterations: int = 3,
                     num_processors: int = 3,
                     cache_theta: bool = False) -> TopicModel:

    dictionary = dataset.get_dictionary()

    if num_topics is not None and phi is not None:
        assert num_topics >= phi.shape[1]
    elif num_topics is None and phi is not None:
        num_topics = phi.shape[1]
    elif num_topics is None and phi is None:
        raise ValueError('Either num_topics or phi should be provided')

    topic_names = [f'topic_{i}' for i in range(num_topics)]

    if seed is None:
        artm_model = artm.ARTM(topic_names=topic_names)
    else:
        artm_model = artm.ARTM(topic_names=topic_names, seed=seed)

    artm_model.num_processors = num_processors
    artm_model.initialize(dictionary)

    if phi is None:
        pass
    elif num_safe_fit_iterations is not None and num_safe_fit_iterations > 0:
        init_phi_utils._safe_copy_phi(artm_model, phi, dataset,
                                      num_safe_fit_iterations)
    else:
        init_phi_utils._copy_phi(artm_model, phi)

    topic_model = TopicModel(artm_model=artm_model,
                             model_id='0',
                             cache_theta=cache_theta,
                             theta_columns_naming='title')

    if scores is not None:
        for score in scores:
            score._attach(topic_model)

    return topic_model
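A sketch of the typical call (the dataset path is an assumption; either phi or num_topics must be given, as the checks above enforce):

dataset = Dataset('tests/test_data/test_dataset.csv')
topic_model = _get_topic_model(dataset, num_topics=10, seed=42)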
Example #23
    def load(self, path):
        """
        :Description: loads models of already constructed hierarchy

        :param str path: a path where hierarchy was saved by hARTM.save method

        :Notes:
          * Loaded models will overwrite ARTM.topic_names and class_ids fields of each level.
          * All class_ids weights will be set to 1.0; you need to specify them by\
            hand if necessary.
          * The method call will empty ARTM.score_tracker of each level.
          * All regularizers and scores will be forgotten.
          * We strongly recommend that you reset all important parameters of the ARTM\
            models and hARTM that were used earlier.
        """
        info_filename = glob.glob(os.path.join(path, "info.dump"))
        if len(info_filename) != 1:
            raise ValueError("Given path is not hARTM safe")

        with open(info_filename[0], "rb") as fin:
            info = pickle.load(fin)

        model_filenames = glob.glob(os.path.join(path, "*.model"))
        if len(info["parent_level_weight"]) + 1 != len(model_filenames) / 2:
            raise ValueError("Given path is not hARTM safe")

        model_filenames = sorted(model_filenames)

        self._levels = []
        for level_idx, num_topics in enumerate(info["num_level_topics"]):
            if not len(self._levels):
                model = artm.ARTM(num_topics=num_topics,
                                  seed=self._get_seed(level_idx),
                                  **self._common_models_args)
            else:
                parent_level_weight = info["parent_level_weight"][level_idx - 1]
                model = ARTM_Level(parent_model=self._levels[-1],
                                   phi_batch_weight=parent_level_weight,
                                   phi_batch_path=self._tmp_files_path,
                                   num_topics=num_topics,
                                   seed=self._get_seed(level_idx),
                                   **self._common_models_args)

            filename = model_filenames[2 * level_idx + 1]
            model.load(filename, "p_wt")
            filename = model_filenames[2 * level_idx]
            model.load(filename, "n_wt")

            config = model.master._config
            config.opt_for_avx = False

            model.master._lib.ArtmReconfigureMasterModel(
                model.master.master_id, config)
            self._levels.append(model)
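A sketch of the save/load round trip this method completes (assuming hier is an already fitted artm.hARTM hierarchy; as the notes above warn, scores and regularizers are forgotten on load):

hier.save('hierarchy_dump')      # writes info.dump and paired *.model files
restored = artm.hARTM()
restored.load('hierarchy_dump')  # rebuilds each level from the dumps
phi_level0 = restored.get_level(0).get_phi()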
Example #24
    def artifacts(self, *args):
        self.exp_res = ExperimentalResults.create_from_json_file(args[1])
        self._topic_names = self.exp_res.scalars.domain_topics + self.exp_res.scalars.background_topics
        _artm = artm.ARTM(topic_names=self.exp_res.scalars.domain_topics +
                          self.exp_res.scalars.background_topics,
                          dictionary=self.dataset.lexicon,
                          show_progress_bars=False)
        _artm.load(args[0])
        return _artm
Example #25
def experiment_enviroment(request):
    """ """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    model_artm = artm.ARTM(
        num_processors=3,
        num_topics=5,
        cache_theta=True,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')])
    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))

    ex_score = ScoreExample()
    tm = TopicModel(model_artm,
                    model_id='new_id',
                    custom_scores={'example_score': ex_score})

    experiment = Experiment(tm,
                            experiment_id="test_pipeline",
                            save_path="tests/experiments")
    cube_settings = [{
        'CubeCreator': {
            'num_iter': 10,
            'parameters': [
                {
                    'name': 'seed',
                    'values': [82019, 322],
                },
            ],
            'reg_search': 'grid',
            'separate_thread': USE_MULTIPROCESS,
        },
        'selection': [
            'model.seed = 82019 and PerplexityScore -> min COLLECT 2',
        ]
    }, {
        'RegularizersModifierCube': {
            'num_iter': 10,
            'regularizer_parameters': {
                "regularizer": artm.regularizers.SmoothSparsePhiRegularizer(),
                "tau_grid": [0.1, 0.5, 1, 5, 10]
            },
            'reg_search': 'grid',
            'use_relative_coefficients': False,
            'separate_thread': USE_MULTIPROCESS,
        },
        'selection': [
            'PerplexityScore -> max COLLECT 2',
        ]
    }]

    return tm, dataset, experiment, dictionary, cube_settings
Example #26
def create_and_learn_ARTM_decorPhi_modal(name="",
                                         topic_number=750,
                                         num_collection_passes=1,
                                         weights=[1., 1., 1., 1.],
                                         decorTau=1.0):

    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weights[0],
                          '@first': weights[1],
                          '@second': weights[2],
                          '@third': weights[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))

    model.initialize(dictionary=dictionary)

    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1

    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)

    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)

    return model, theta_train
Example #27
    def add_level(self,
                  num_topics=None,
                  topic_names=None,
                  parent_level_weight=1):
        """
        :Description: adds new level to the hierarchy

        :param int num_topics: the number of topics in the level model; will be overwritten if
                               the parameter topic_names is set
        :param topic_names: names of topics in the model
        :type topic_names: list of str
        :param float parent_level_weight: the coefficient of smoothing n_wt by n_wa,
                                          where a enumerates parent topics

        :return: ARTM or derived ARTM_Level instance

        :Notes:
          *  the hierarchy structure assumes that the number of topics on each following level
             is greater than on the previous one
          *  work with the returned value as with a usual ARTM model
          *  to access any level, use [] or the get_level method
          *  Important! You cannot add the next level before the previous one is initialized
             and fit.
        """
        if topic_names is not None:
            num_topics = len(topic_names)

        level_idx = len(self._levels)

        if level_idx:
            if num_topics <= self._levels[-1].num_topics:
                warnings.warn(
                    "Adding level with num_topics = %s less or equal than parent level's num_topics = %s"
                    % (num_topics, self._levels[-1].num_topics))

            self._levels.append(
                ARTM_Level(parent_model=self._levels[-1],
                           phi_batch_weight=parent_level_weight,
                           phi_batch_path=self._tmp_files_path,
                           model_name=self._model_name,
                           num_topics=num_topics,
                           topic_names=topic_names,
                           seed=self._get_seed(level_idx),
                           **self._common_models_args))

        else:
            self._levels.append(
                artm.ARTM(num_topics=num_topics,
                          topic_names=topic_names,
                          seed=self._get_seed(level_idx),
                          **self._common_models_args))

        level = self._levels[-1]
        config = level.master._config
        config.opt_for_avx = False
        level.master._lib.ArtmReconfigureMasterModel(level.master.master_id,
                                                     config)
        return level
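A minimal sketch of growing a hierarchy with this method (the dictionary and batch_vectorizer are assumptions, as is the premise that a dictionary passed to hARTM initializes each level the way it does for artm.ARTM; per the note above, each level must be fit before the next one is added):

hier = artm.hARTM(dictionary=dictionary, cache_theta=True)
level0 = hier.add_level(num_topics=10)
level0.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)
# The second level should be wider than the first, see the warning above.
level1 = hier.add_level(num_topics=50, parent_level_weight=1)
level1.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)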
Example #28
File: test_cubes.py Project: yyht/TopicNet
def test_custom_regularizer_cubed(experiment_enviroment, thread_flag, by_name):
    """ """
    _, dataset, _, dictionary = experiment_enviroment
    tau_grid = [1, 0, -1]

    custom_reg = TopicPriorSampledRegularizer(name='topic_prior',
                                              tau=5,
                                              num_topics=5,
                                              beta_prior=[10, 1, 100, 2, 1000])

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={
            MAIN_MODALITY: 1.0,
            NGRAM_MODALITY: 1.0
        },
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
    tm = TopicModel(
        model_artm,
        model_id='new_id_1',
        custom_regularizers={custom_reg.name: custom_reg} if by_name else {})
    experiment = Experiment(  # noqa: F841
        tm,
        experiment_id="cubed_reg",
        save_path="tests/experiments")

    if by_name:
        regularizer_parameters = {
            "name": custom_reg.name,
            "tau_grid": tau_grid
        }
    else:
        regularizer_parameters = {
            "regularizer": custom_reg,
            "tau_grid": tau_grid
        }

    cube = RegularizersModifierCube(
        num_iter=10,
        regularizer_parameters=regularizer_parameters,
        reg_search="grid",
        use_relative_coefficients=False,
        separate_thread=thread_flag)
    dummies = cube(tm, dataset)

    tmodels = [dummy.restore() for dummy in dummies]

    assert len(tmodels) == len(tau_grid)
    for tau, one_model in zip(tau_grid, tmodels):
        assert one_model.all_regularizers[custom_reg.name].tau == tau
Example #29
def get_phi_index(dataset: Dataset) -> Index:
    artm_model_template = artm.ARTM(num_topics=1, num_processors=1)
    artm_model_template.initialize(dictionary=dataset.get_dictionary())
    model_template = TopicModel(artm_model=artm_model_template)
    phi_template = model_template.get_phi()
    phi_index = phi_template.index

    del model_template
    del artm_model_template

    return phi_index
Example #30
def compute_big_artm(num_topics, tau, dictionary, batch_vectorizer, score_computer):
    artm_model = artm.ARTM(num_topics=num_topics,
                           num_document_passes=5,
                           dictionary=dictionary,
                           scores=[artm.PerplexityScore(name='s1')],
                           regularizers=[artm.SmoothSparseThetaRegularizer(name='r1', tau=tau)], cache_theta=True)
    artm_model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)
    theta_bigartm = artm_model.get_theta()
    bigartm_predicts = get_df_clusters_predicted(theta_bigartm, url_list)
    score = score_computer.compute_score(bigartm_predicts["story_id_predicted"])
    logging.info("num_topics={}, tau={},"
                 "bigARTM score = {}".format(num_topics, tau, score))