def two_experiment_enviroments(request):
    """Build two identically configured topic models, each with its own experiment.

    Returns
    -------
    tuple
        ``(tm_1, experiment_1, tm_2, experiment_2, dataset, dictionary)``.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    def _build_artm():
        # Both models share the same settings and the same dictionary.
        return artm.ARTM(
            num_processors=1,
            num_topics=5,
            cache_theta=True,
            num_document_passes=1,
            dictionary=dictionary,
            scores=[
                artm.PerplexityScore(name='PerplexityScore'),
                artm.SparsityPhiScore(name='SparsityPhiScore',
                                      class_id=MAIN_MODALITY),
            ],
        )

    model_artm_1 = _build_artm()
    model_artm_2 = _build_artm()

    tm_1 = TopicModel(model_artm_1, model_id='new_id_1')
    tm_2 = TopicModel(model_artm_2, model_id='new_id_2')

    experiment_1 = Experiment(experiment_id="test_1",
                              save_path="tests/experiments",
                              topic_model=tm_1)
    experiment_2 = Experiment(experiment_id="test_2",
                              save_path="tests/experiments",
                              topic_model=tm_2)

    return tm_1, experiment_1, tm_2, experiment_2, dataset, dictionary
def create_topic_model(self, topic_model_name: str,
                       batch_vectorizer: artm.BatchVectorizer,
                       dictionary: artm.Dictionary) -> artm.ARTM:
    """Train an ARTM model and return the best saved checkpoint.

    The model is fitted in 10 rounds of 3 collection passes each. After every
    round the perplexity is compared with the best value seen so far, and the
    model is saved through ``self.save_topic_model`` whenever it improves
    (including the very first round, so a checkpoint always exists).

    Parameters
    ----------
    topic_model_name : str
        Name passed to ``self.save_topic_model`` / ``self.load_topic_model``.
    batch_vectorizer : artm.BatchVectorizer
        Batches used for fitting.
    dictionary : artm.Dictionary
        Dictionary used to initialise the model and the perplexity score.

    Returns
    -------
    Whatever ``self.load_topic_model`` returns for a fresh ARTM instance and
    ``topic_model_name``.
    """
    topic_model = artm.ARTM(num_topics=self.number_of_topics,
                            dictionary=dictionary, cache_theta=False)
    topic_model.scores.add(
        artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    topic_model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score'))
    topic_model.scores.add(
        artm.SparsityThetaScore(name='sparsity_theta_score'))
    topic_model.num_document_passes = 5
    # os.cpu_count() returns None when the number of CPUs is undetermined.
    topic_model.num_processors = max(1, (os.cpu_count() or 1) - 1)
    topic_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    topic_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    topic_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
    topic_model.regularizers['sparse_phi_regularizer'].tau = -1.0
    topic_model.regularizers['sparse_theta_regularizer'].tau = -0.5
    topic_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+5
    best_score = None
    keyword_extraction_logger.info(
        'epoch perplexity_score sparsity_phi_score sparsity_theta_score'
    )
    for restart_index in range(10):
        topic_model.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=3)
        perplexity = topic_model.score_tracker['perplexity_score'].last_value
        # Save on the first round too: otherwise nothing is on disk when the
        # first round happens to be the best one.
        if best_score is None or best_score > perplexity:
            best_score = perplexity
            self.save_topic_model(topic_model, topic_model_name)
        keyword_extraction_logger.info(
            '{0:5} {1:16.9} {2:18.9} {3:20.9}'.format(
                (restart_index + 1) * 3,
                perplexity,
                topic_model.score_tracker['sparsity_phi_score'].last_value,
                topic_model.score_tracker['sparsity_theta_score'].last_value))
    del topic_model
    return self.load_topic_model(
        artm.ARTM(num_topics=self.number_of_topics, dictionary=dictionary,
                  cache_theta=False),
        topic_model_name)
def test_phi_matrix_after_lda_sampled_regularizer(experiment_enviroment):
    """Check that a custom topic-prior regularizer changes the fitted Phi matrix.

    Two models with identical ARTM settings are fitted for 10 iterations; only
    the first one has ``TopicPriorSampledRegularizer`` attached. Their Phi
    matrices must differ in at least one cell.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset(DATA_PATH)
        dictionary = dataset.get_dictionary()
        batch_vectorizer = dataset.get_batch_vectorizer()

    topic_prior_reg = TopicPriorSampledRegularizer(
        name='topic_prior', tau=5, num_topics=5,
        beta_prior=[10, 1, 100, 2, 1000])

    model_artm_1 = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )
    model_artm_2 = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )

    tm_1 = TopicModel(
        model_artm_1, model_id='new_id_1',
        custom_regularizers={topic_prior_reg.name: topic_prior_reg})
    tm_2 = TopicModel(model_artm_2, model_id='new_id_2')

    tm_1._fit(batch_vectorizer, 10)
    tm_2._fit(batch_vectorizer, 10)

    phi_first = tm_1.get_phi()
    phi_second = tm_2.get_phi()

    # Assuming get_phi() returns a pandas DataFrame: the builtin any() would
    # iterate over the column labels (always truthy), making the check vacuous.
    # Reduce the element-wise comparison over both axes instead.
    assert (phi_first != phi_second).any().any(), \
        'Phi matrices are the same after regularization.'
def experiment(filename, tau_phi, tau_theta):
    """Train PLSA, ARTM and LDA models on one collection and print their measures.

    :param filename: path to a Vowpal Wabbit file with the collection.
    :param tau_phi: tau of the ARTM Phi smooth/sparse regularizer.
    :param tau_theta: tau of the ARTM Theta smooth/sparse regularizer.
    """
    batch_vectorizer = artm.BatchVectorizer(data_path=filename,
                                            data_format='vowpal_wabbit',
                                            target_folder='batches')
    dictionary = batch_vectorizer.dictionary
    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]

    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary,
                           cache_theta=True)
    model_plsa = artm.ARTM(
        topic_names=topic_names,
        cache_theta=True,
        scores=[artm.PerplexityScore(name='PerplexityScore',
                                     dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    def _score_set(top_tokens_score):
        # Scores common to both ARTM-based models; only the top-tokens score differs.
        return [
            artm.PerplexityScore(name='perplexity_score', dictionary=dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            top_tokens_score,
            artm.TopicKernelScore(name='topic_kernel_score',
                                  probability_mass_threshold=0.3),
            artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'),
            artm.ClassPrecisionScore(name='class_precision_score'),
            artm.TopicMassPhiScore(name='topic_mass_phi_score'),
        ]

    for score in _score_set(artm.TopTokensScore(name='top_tokens_score',
                                                num_tokens=tokens_num)):
        model_artm.scores.add(score)
    for score in _score_set(artm.TopTokensScore(name='top_tokens_score')):
        model_plsa.scores.add(score)

    # Only the ARTM model is regularized.
    for regularizer in (
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'),
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'),
            artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer')):
        model_artm.regularizers.add(regularizer)
    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    for model in (model_plsa, model_artm, model_lda):
        model.initialize(dictionary=dictionary)

    passes = 100
    for model in (model_plsa, model_artm, model_lda):
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
def test_fancy_fit_is_ok(experiment_enviroment):
    """Fit a model with extra modalities and a regularizer, then save and reload it.

    Checks that the regularizer shows up in the jsonable parameters and that
    the Phi matrix of the reloaded model equals the original one.
    """
    _, dataset, _, dictionary = experiment_enviroment

    modality_weights = {
        MAIN_MODALITY: 1,
        NGRAM_MODALITY: 1,
        EXTRA_MODALITY: 1,
        '@psyduck': 42,
    }
    model_artm = artm.ARTM(
        num_topics=5,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
        theta_columns_naming='title',
        class_ids=modality_weights,
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='smooth_theta', tau=10.0),
        ],
    )

    tm = TopicModel(model_artm,
                    model_id='absolutely_new_id',
                    custom_scores={'mean_kernel_size': ScoreExample()})

    num_iterations = 10
    tm._fit(dataset.get_batch_vectorizer(), num_iterations)

    params = tm.get_jsonable_from_parameters()
    assert "smooth_theta" in params["regularizers"]

    save_dir = "tests/experiments/save_standalone/"
    tm.save(save_dir)
    restored = TopicModel.load(save_dir)

    assert (tm.get_phi() == restored.get_phi()).all().all()
def __init__(self, artm_model=None, model_id=None, parent_model_id=None,
             data_path=None, description=None, experiment=None,
             custom_scores=None, *args, **kwargs):
    """
    Initialize stage, also used for loading previously saved experiments.

    Parameters
    ----------
    artm_model : artm model or None
        model to use, None if you want to create model from ``kwargs``
        (Default value = None)
    model_id : str
        model id (Default value = None)
    parent_model_id : str
        model id from which current model was created (Default value = None)
    data_path : str
        path to the data (Default value = None)
    description : list of dict
        description of the model (Default value = None)
    experiment : Experiment
        the experiment to which the model is bound (Default value = None)
    custom_scores : dict
        dictionary with score names as keys and score classes as functions
        (score class with functionality like those of BaseScore);
        None means an empty dictionary (Default value = None)

    Raises
    ------
    ValueError
        if ``artm_model`` is None and ``artm.ARTM(**kwargs)`` fails.

    """
    super().__init__(model_id=model_id, parent_model_id=parent_model_id,
                     experiment=experiment, *args, **kwargs)

    if artm_model is None:
        try:
            self._model = artm.ARTM(**kwargs)
        except ArtmException as e:
            error_message = repr(e)
            # Chain the original exception so the ARTM traceback is kept.
            raise ValueError(
                f'Cannot create artm model with parameters {kwargs}.\n'
                "ARTM failed with following: " + error_message) from e
    else:
        self._model = artm_model

    self.data_path = data_path
    # A fresh dict per instance: a shared mutable default would leak scores
    # between unrelated models.
    self.custom_scores = {} if custom_scores is None else custom_scores
    self._score_caches = None  # returned by model.score, reset by model._fit

    # Keep an empty list (never None) when no description is available.
    self._description = []
    if description is not None:
        self._description = description
    elif self._model._initialized:
        init_params = self.get_jsonable_from_parameters()
        self._description = [{"action": "init", "params": [init_params]}]
def __init__(self, uci_dir, dictionary, n_topics):
    """Fit an ARTM model on a UCI bag-of-words corpus and build a similarity index.

    :param uci_dir: directory with the ``corpus`` collection in ``bow_uci`` format;
        batches are written to ``<uci_dir>/artm_batches``.
    :param dictionary: object with a ``token2id`` mapping used to order the rows
        of ``self.phi``.
    :param n_topics: number of topics of the model.
    """
    batches = artm.BatchVectorizer(data_format='bow_uci',
                                   data_path=uci_dir,
                                   collection_name='corpus',
                                   target_folder=uci_dir + '/artm_batches')
    artm_dictionary = batches.dictionary

    logging.info("Fitting the ARTM model")
    model = artm.ARTM(dictionary=artm_dictionary, num_topics=n_topics)
    model.fit_offline(batch_vectorizer=batches, num_collection_passes=10)

    logging.info("Processing word-topic matrices")
    # Re-order the word-topic matrix rows according to the dictionary indices.
    phi_frame = model.phi_
    self.phi = np.zeros(phi_frame.shape, dtype=np.float64)
    for row_label, topic_vector in phi_frame.iterrows():
        # The token is the second element of the row label.
        row_idx = dictionary.token2id[row_label[1]]
        self.phi[row_idx, :] = topic_vector

    logging.info("Building the index for ARTM")
    doc_topics = model.transform(batches).T.sort_index()
    sparse_docs = [matutils.full2sparse(doc_row)
                   for _, doc_row in doc_topics.iterrows()]
    self.index = similarities.MatrixSimilarity(sparse_docs,
                                               num_features=n_topics,
                                               num_best=self.N_BEST)
    self.model = model
    self.dictionary = dictionary
def experiment_enviroment(request):
    """Create a three-modality topic model with an experiment and register cleanup.

    The finalizer removes ``tests/experiments`` and the dataset's internals
    folder. Returns ``(tm, dataset, experiment, dictionary)``.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    modality_weights = {
        MAIN_MODALITY: 1.0,
        NGRAM_MODALITY: 1.0,
        EXTRA_MODALITY: 1.0,
    }
    model_artm = artm.ARTM(
        num_topics=5,
        class_ids=modality_weights,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
        theta_columns_naming='title',
    )

    tm = TopicModel(model_artm,
                    model_id='new_id',
                    custom_scores={'mean_kernel_size': ScoreExample()})
    experiment = Experiment(experiment_id="test",
                            save_path="tests/experiments",
                            topic_model=tm)

    def resource_teardown():
        """Remove the folders created for the test."""
        for folder in ("tests/experiments", dataset._internals_folder_path):
            shutil.rmtree(folder)

    request.addfinalizer(resource_teardown)

    return tm, dataset, experiment, dictionary
def fit():
    """Fit an ARTM model on the documents of the current request.

    Reads ``terms`` (an iterable of token sequences, one per document) and
    ``topics`` (number of topics) from the request JSON, serializes the
    documents into a single batch inside a new directory named after a
    generated UUID, fits the model for one collection pass and saves it as
    ``<batch_id>/model``.

    Returns a JSON response ``{"id": batch_id}``.
    """
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)

    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']

    term_to_id = {}
    all_terms = []
    batch = artm.messages.Batch()
    batch.id = batch_id

    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            # Every occurrence is stored separately with count 1.
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)

    for t in all_terms:
        batch.token.append(t)

    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())
    app.logger.info("batch %s is created", batch_id)

    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)

    model_artm = artm.ARTM(
        # range() works on both Python 2 and 3; xrange exists only on Python 2.
        topic_names=['topic_{}'.format(i) for i in range(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)

    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")

    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    app.logger.info("mode was fitted")

    model_artm.save(os.path.join(batch_id, "model"))

    return jsonify({"id": batch_id})
def init_model(self, dictionary_path=None):
    """Create ``self.dictionary`` and ``self.model`` with scores and regularizers.

    dictionary_path: optional, used with pretrained model. When omitted, the
    dictionary is gathered from ``self.batches_path``, filtered and saved as text.
    """
    dictionary = artm.Dictionary()
    self.dictionary = dictionary
    if dictionary_path is not None:
        dictionary.load_text(dictionary_path)
    else:
        dictionary.gather(data_path=self.batches_path)
        dictionary.filter(min_tf=10, max_df_rate=0.1)
        dictionary.save_text(
            f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")

    self.model = artm.ARTM(
        num_topics=self.n_topics,
        dictionary=self.dictionary,
        show_progress_bars=True,
    )

    # scores
    for score in (
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=self.dictionary),
            artm.SparsityThetaScore(name="SparsityThetaScore"),
            artm.SparsityPhiScore(name="SparsityPhiScore")):
        self.model.scores.add(score)

    # regularizers
    for regularizer in (
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1),
            artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5),
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5)):
        self.model.regularizers.add(regularizer)
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Build an ARTM model with a fixed set of scores and three regularizers.

    :param n_topics: number of topics; they are named ``topic_1`` .. ``topic_<n_topics>``.
    :param dictionary: dictionary used by the perplexity score.
    :param sparse_theta: tau of the SparseTheta regularizer.
    :param sparse_phi: tau of the SparsePhi regularizer.
    :param decorrelator_phi: tau of the DecorrelatorPhi regularizer.
    :return: ARTM model.
    """
    print("Defining the model.")
    topic_names = list(map("topic_{}".format, range(1, n_topics + 1)))

    scores = [
        artm.PerplexityScore(name="PerplexityScore", dictionary=dictionary),
        artm.SparsityPhiScore(name="SparsityPhiScore"),
        artm.SparsityThetaScore(name="SparsityThetaScore"),
        artm.TopicKernelScore(name="TopicKernelScore",
                              probability_mass_threshold=0.3),
        artm.TopTokensScore(name="TopTokensScore", num_tokens=15),
    ]
    regularizers = [
        artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=sparse_theta),
        artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
        artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                        tau=decorrelator_phi),
    ]

    return artm.ARTM(topic_names=topic_names,
                     cache_theta=True,
                     scores=scores,
                     regularizers=regularizers)
def experiment_enviroment(request):
    """Create a single-processor topic model with a custom score and its experiment.

    Returns ``(tm, dataset, experiment, dictionary)``.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )
    for sparsity_score in (
            artm.SparsityPhiScore(name='SparsityPhiScore'),
            artm.SparsityThetaScore(name='SparsityThetaScore')):
        model_artm.scores.add(sparsity_score)

    ex_score = ScoreExample()
    tm = TopicModel(model_artm,
                    model_id='new_id',
                    custom_scores={'example_score': ex_score})

    experiment = Experiment(tm,
                            experiment_id="test_cube_creator",
                            save_path="tests/experiments")

    return tm, dataset, experiment, dictionary
def get_topic_weights(data_folder, tm_index):
    """Load a saved model and return the Theta matrix of the batches in ``data_folder``.

    :param data_folder: folder containing a ``batches`` sub-folder.
    :param tm_index: object providing ``number_of_topics`` and ``name``;
        the model is read from ``<BASE_DAG_DIR>/bigartm_models/model_<name>.model``.
    :return: ``(theta_values, theta_topics, theta_documents)`` - the transposed
        Theta values as floats, the Theta index as strings and the Theta
        columns as strings.
    """
    import artm
    import os

    from dags.bigartm.services.bigartm_utils import load_monkey_patch
    from util.constants import BASE_DAG_DIR

    print("!!!", "Get topic weights")
    batch_vectorizer = artm.BatchVectorizer(
        data_path=os.path.join(data_folder, "batches"),
        data_format='batches')

    model_artm = artm.ARTM(num_topics=tm_index.number_of_topics,
                           class_ids={"text": 1},
                           theta_columns_naming="title",
                           reuse_theta=True,
                           cache_theta=True,
                           num_processors=4)
    # The patched loader is a plain function stored on the instance, so the
    # model has to be passed explicitly as its first argument.
    model_artm.load = load_monkey_patch
    model_path = os.path.join(os.path.join(BASE_DAG_DIR, "bigartm_models"),
                              f"model_{tm_index.name}.model")
    model_artm.load(model_artm, model_path)

    theta = model_artm.transform(batch_vectorizer)

    return (
        theta.values.transpose().astype(float),
        theta.index.array.to_numpy().astype(str),
        theta.columns.array.to_numpy().astype(str),
    )
def experiment_enviroment(request):
    """Create a two-modality topic model on ``DATA_PATH`` together with its experiment.

    Returns ``(tm, dataset, experiment, dictionary)``.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset(DATA_PATH)
        dictionary = dataset.get_dictionary()

    artm_settings = dict(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )
    model_artm = artm.ARTM(**artm_settings)

    tm = TopicModel(model_artm, model_id='new_id')
    experiment = Experiment(experiment_id="test_cubes",
                            save_path="tests/experiments",
                            topic_model=tm)

    return tm, dataset, experiment, dictionary
def create_model_complex(current_dictionary, n_topics, n_doc_passes, seed_value,
                         n_top_tokens, p_mass_threshold, common_topics,
                         subject_topics, class_name='@default_class',
                         _debug_print=False):
    """Create an ARTM model for one modality and attach the complex score set.

    :param current_dictionary: dictionary used to initialise the model and scores.
    :param n_topics: number of topics.
    :param n_doc_passes: value assigned to ``model.num_document_passes``.
    :param seed_value: random seed of the model.
    :param n_top_tokens: forwarded to ``add_complex_scores_to_model``.
    :param p_mass_threshold: forwarded to ``add_complex_scores_to_model``.
    :param common_topics: forwarded to ``add_complex_scores_to_model``.
    :param subject_topics: forwarded to ``add_complex_scores_to_model``.
    :param class_name: the only modality of the model (weight 1.0).
    :param _debug_print: print a timestamped message before creating the model.
    :return: the configured ARTM model.
    """
    if _debug_print:
        # Single-argument call form: valid and identical on Python 2 and 3.
        print('[{}] creating model'.format(datetime.now()))
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary,
                      cache_theta=True, seed=seed_value,
                      class_ids={class_name: 1.0})
    model.num_document_passes = n_doc_passes
    add_complex_scores_to_model(model, current_dictionary,
                                n_top_tokens=n_top_tokens,
                                p_mass_threshold=p_mass_threshold,
                                common_topics=common_topics,
                                subject_topics=subject_topics,
                                class_name=class_name)
    return model
def bigartm_predict(mess_bigartm, top1_cat):
    """Compute the Theta matrix of one message with a saved per-category model.

    The message is written in Vowpal Wabbit format (modality ``text``) to
    ``flask/test_artm.txt`` under ``currdir``, converted to batches in
    ``flask/test`` and transformed by the model loaded from
    ``<artm_dir>/<top1_cat>.dump``.

    :param mess_bigartm: message text (tokens separated as Vowpal Wabbit expects).
    :param top1_cat: category name selecting the model dump.
    :return: result of ``model_artm.transform`` or ``None`` if anything failed
        (the failure is reported on stdout).
    """
    # Defined up front so that a failure below yields None instead of an
    # UnboundLocalError on the return statement.
    theta_test = None
    try:
        mess_bigartm = ' |text ' + mess_bigartm

        # load the model
        T = 10
        model_artm = artm.ARTM(num_topics=T,
                               topic_names=['sbj' + str(i) for i in range(T)],
                               class_ids={'text': 1})
        model_artm.load(os.path.join(artm_dir, top1_cat + ".dump"))

        # save the text to a file
        test_file = os.path.join(currdir, 'flask/test_artm.txt')
        with open(test_file, 'w') as f:
            f.write(mess_bigartm + '\n')

        batch_vectorizer_test = artm.BatchVectorizer(
            data_path=test_file,
            data_format='vowpal_wabbit',
            target_folder=os.path.join(currdir, 'flask/test'),
            batch_size=100)
        theta_test = model_artm.transform(
            batch_vectorizer=batch_vectorizer_test)
    except Exception:
        # Best-effort: report the problem and fall through with None.
        print('Ошибка загрузки bigartm')
    return theta_test
def init_model(T, B, batches_dir, regularizers_dict, num_document_passes=30,
               weights_dict=None, min_df=None, max_tf=None):
    """Create an ARTM model with ``T`` main and ``B`` background topics.

    :param T: number of main topics (converted with ``int``).
    :param B: number of background topics (converted with ``int``).
    :param batches_dir: forwarded to ``dictionary_initialization``.
    :param regularizers_dict: when truthy, forwarded to ``reset_regularizers``.
    :param num_document_passes: number of document passes of the model.
    :param weights_dict: modality weights passed as ``class_ids``.
    :param min_df: forwarded to ``dictionary_initialization``.
    :param max_tf: forwarded to ``dictionary_initialization``.
    :return: the initialized ARTM model with regularizers and scores attached.
    """
    T = int(T)
    B = int(B)
    main_topics_num = T
    # One name per topic: the first T are main topics, the remaining B are
    # background ones. (Iterating over range(B) only would produce B names
    # for a model declared with T + B topics.)
    topic_names = [
        "topic{}".format(i) if i < main_topics_num else "back{}".format(i)
        for i in range(T + B)
    ]
    model_artm = artm.ARTM(
        num_topics=T + B,
        topic_names=topic_names,
        cache_theta=True,
        show_progress_bars=True,
        class_ids=weights_dict,
        num_document_passes=num_document_passes)
    model_artm, my_dictionary = dictionary_initialization(
        model_artm, batches_dir, min_df, max_tf)
    print("Model is initialized!")
    if regularizers_dict:
        model_artm = reset_regularizers(model_artm, regularizers_dict)
    model_artm = init_score_tracker(model_artm, my_dictionary)
    return model_artm
def select_from_corpus(self, list_of_files: List[str],
                       preprocessor: BaseTextPreprocessor,
                       spacy_nlp: Language) -> List[str]:
    """Select keywords from a corpus using a (possibly cached) ARTM topic model.

    All artefacts are stored next to ``self.topic_model_name``: the UCI
    bag-of-words collection, the batches folder, the dictionary and the model
    itself. Each of them is reused when already present and created otherwise.

    Parameters
    ----------
    list_of_files : List[str]
        Files forwarded to ``self.create_collection_as_bow_uci``.
    preprocessor : BaseTextPreprocessor
        Forwarded to ``self.create_collection_as_bow_uci``.
    spacy_nlp : Language
        Forwarded to ``self.create_collection_as_bow_uci``.

    Returns
    -------
    Result of ``self.select_keywords_from_topic_model`` for the topic model.

    Raises
    ------
    ValueError
        If the topic model name is empty, has an empty base name, points to a
        non-existent directory, or the model can be neither loaded nor trained.
    """
    stripped_name = self.topic_model_name.strip()
    # The check must precede normalization: os.path.normpath('') returns '.',
    # which would let an empty name slip through.
    if len(stripped_name) == 0:
        raise ValueError('A topic model name is empty!')
    topic_model_name = os.path.normpath(stripped_name)
    dir_name = os.path.dirname(topic_model_name)
    base_name = os.path.basename(topic_model_name)
    if len(dir_name) == 0:
        dir_name = os.path.curdir
    if len(base_name) == 0:
        raise ValueError(
            '`{0}` is incorrect name for a topic model! Base name of file is empty!'
            .format(self.topic_model_name))
    if not os.path.isdir(dir_name):
        raise ValueError(
            '`{0}` is incorrect name for a topic model! Directory `{1}` does not exist!'
            .format(self.topic_model_name, dir_name))

    collection_name = os.path.normpath(
        os.path.join(dir_name, base_name + '.collection'))
    collection_docword_name = os.path.normpath(
        os.path.join(dir_name, 'docword.' + base_name + '.collection'))
    collection_vocab_name = os.path.normpath(
        os.path.join(dir_name, 'vocab.' + base_name + '.collection'))
    if (not os.path.isfile(collection_docword_name)) or (
            not os.path.isfile(collection_vocab_name)):
        self.create_collection_as_bow_uci(list_of_files, preprocessor,
                                          spacy_nlp, collection_docword_name,
                                          collection_vocab_name)

    batches_path = os.path.normpath(
        os.path.join(dir_name, base_name + '.data_batches'))
    if os.path.isdir(batches_path):
        batch_vectorizer = artm.BatchVectorizer(data_path=batches_path,
                                                data_format='batches')
    else:
        batch_vectorizer = artm.BatchVectorizer(
            data_path=dir_name,
            data_format='bow_uci',
            collection_name=collection_name,
            target_folder=batches_path)

    dictionary = artm.Dictionary()
    dictionary_name = os.path.normpath(topic_model_name + '.dictionary')
    if os.path.isfile(dictionary_name):
        dictionary.load(dictionary_name)
    else:
        dictionary.gather(data_path=batches_path)
        dictionary.save(dictionary_name)

    topic_model = self.load_topic_model(
        artm.ARTM(num_topics=self.number_of_topics, dictionary=dictionary,
                  cache_theta=False),
        topic_model_name)
    if topic_model is None:
        # No saved model yet: train one.
        topic_model = self.create_topic_model(topic_model_name,
                                              batch_vectorizer, dictionary)
    if topic_model is None:
        raise ValueError(
            'The trained topic model cannot be loaded from the file `{0}`!'
            .format(topic_model_name))
    return self.select_keywords_from_topic_model(topic_model)
def setup_class(cls):
    """Fit the shared topic model once and load the raw Vowpal Wabbit lines.

    Sets ``cls.topic_model`` (fitted for ``NUM_ITERATIONS`` iterations) and
    ``cls.raw_data`` (each line of ``tests/test_data/test_vw.txt`` split on
    single spaces).
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        with open('tests/test_data/test_vw.txt', encoding='utf-8') as vw_file:
            raw_data = [vw_line.split(' ') for vw_line in vw_file]
        dictionary = dataset.get_dictionary()
        batch_vectorizer = dataset.get_batch_vectorizer()

    modality_weights = dict.fromkeys(CLASS_IDS, 1.0)
    model_artm = artm.ARTM(
        num_topics=NUM_TOPICS,
        class_ids=modality_weights,
        topic_names=TOPIC_NAMES,
        cache_theta=True,
        num_document_passes=NUM_DOCUMENT_PASSES,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )

    fitted_model = TopicModel(model_artm, model_id='model_id')
    cls.topic_model = fitted_model
    fitted_model._fit(batch_vectorizer, num_iterations=NUM_ITERATIONS)

    cls.raw_data = raw_data
def cluster_artm(text):
    """Cluster a Vowpal Wabbit collection with ARTM and log the top tokens per topic.

    The model has 9 subject topics (``sbj0`` .. ``sbj8``) and one background
    topic (``bcg``). It is fitted in two stages of 6 collection passes: first
    with smoothing of the background topic only, then additionally with
    sparsing of the subject topics. The top 15 tokens of every topic are
    appended to ``cluster_log_artm.txt``.

    :param text: path to the collection in Vowpal Wabbit format.
    """
    bach_vectorizer = artm.BatchVectorizer(data_path=text,
                                           data_format='vowpal_wabbit',
                                           target_folder='batch_small',
                                           batch_size=20)
    T = 10  # number of topics
    subject_topics = ["sbj" + str(i) for i in range(T - 1)]
    topic_names = subject_topics + ["bcg"]
    model_artm = artm.ARTM(num_topics=T, topic_names=topic_names,
                           reuse_theta=True, num_document_passes=1)

    np.random.seed(1)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=bach_vectorizer.data_path)
    model_artm.initialize(dictionary)
    model_artm.scores.add(artm.TopTokensScore(name='metric1', num_tokens=15))

    # Stage 1: smooth the background topic.
    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='smoothing',
                                        dictionary=dictionary,
                                        topic_names='bcg',
                                        tau=1e5))
    model_artm.fit_offline(batch_vectorizer=bach_vectorizer,
                           num_collection_passes=6)

    # Stage 2: sparse the subject topics. Only the names that exist in this
    # model are listed (previously sbj0..sbj28 were requested for 9 topics).
    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='stimulates',
                                        dictionary=dictionary,
                                        topic_names=subject_topics,
                                        tau=-1e5))
    model_artm.fit_offline(batch_vectorizer=bach_vectorizer,
                           num_collection_passes=6)

    top_tokens = model_artm.score_tracker["metric1"].last_tokens
    # Open the log once instead of re-opening it for every topic.
    with open('cluster_log_artm.txt', 'a') as log_file:
        for topic_name in model_artm.topic_names:
            log_file.write(topic_name + ':')
            for word in top_tokens[topic_name]:
                log_file.write(word + ' ')
            log_file.write('\n')
def test_custom_regularizer_cubed_controlled(experiment_enviroment, thread_flag, by_name):
    """Check that RegularizationControllerCube multiplies a custom regularizer's tau.

    The tau converter doubles tau on every iteration, so after ``num_iter``
    iterations tau must equal ``initial_tau * multiplier ** num_iter``. The
    regularizer is addressed either by name or by object, depending on
    ``by_name``.
    """
    _, dataset, _, dictionary = experiment_enviroment

    multiplier = 2
    initial_tau = 5
    num_iter = 10

    custom_reg = TopicPriorSampledRegularizer(name='topic_prior',
                                              tau=initial_tau,
                                              num_topics=5,
                                              beta_prior=[10, 1, 100, 2, 1000])

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )

    # When addressed by name, the regularizer must already be in the model.
    attached_regularizers = {custom_reg.name: custom_reg} if by_name else {}
    tm = TopicModel(model_artm,
                    model_id='new_id_1',
                    custom_regularizers=attached_regularizers)

    experiment = Experiment(  # noqa: F841
        tm, experiment_id="cubed_controlled_reg", save_path="tests/experiments")

    parameters = {
        "score_to_track": None,
        "tau_converter": f"prev_tau * {multiplier}",
        "user_value_grid": [0.3],
        "max_iters": float("inf"),
    }
    if not by_name:
        parameters["regularizer"] = custom_reg
    else:
        parameters["reg_name"] = custom_reg.name

    cube = RegularizationControllerCube(num_iter=num_iter,
                                        parameters=parameters,
                                        reg_search="grid",
                                        use_relative_coefficients=False,
                                        separate_thread=thread_flag)
    dummies = cube(tm, dataset)
    tmodels = [dummy.restore() for dummy in dummies]

    expected_tau = initial_tau * (multiplier**num_iter)
    for one_model in tmodels:
        actual_tau = one_model.all_regularizers[custom_reg.name].tau
        assert actual_tau == expected_tau
def _get_topic_model(dataset: Dataset,
                     phi: pd.DataFrame = None,
                     num_topics: int = None,
                     seed: int = None,
                     scores: List[BaseScore] = None,
                     num_safe_fit_iterations: int = 3,
                     num_processors: int = 3,
                     cache_theta: bool = False) -> TopicModel:
    """Create a TopicModel, optionally initialized from a given Phi matrix.

    Parameters
    ----------
    dataset
        Provides the dictionary used to initialize the ARTM model.
    phi
        Phi matrix to copy into the model; its number of columns is used as
        the number of topics when ``num_topics`` is not given.
    num_topics
        Number of topics; must not be smaller than ``phi.shape[1]``.
    seed
        Random seed of the ARTM model (left at the library default if None).
    scores
        Scores to attach to the resulting topic model.
    num_safe_fit_iterations
        When positive, Phi is copied with ``init_phi_utils._safe_copy_phi``
        using this many iterations; otherwise ``_copy_phi`` is used.
    num_processors
        Number of processors of the ARTM model.
    cache_theta
        Forwarded to ``TopicModel``.

    Raises
    ------
    ValueError
        If neither ``num_topics`` nor ``phi`` is specified.
    """
    dictionary = dataset.get_dictionary()

    if num_topics is not None and phi is not None:
        assert num_topics >= phi.shape[1], (
            'num_topics should not be less than the number of columns in phi')
    elif num_topics is None and phi is not None:
        num_topics = phi.shape[1]
    elif num_topics is None and phi is None:
        raise ValueError(
            'Either `num_topics` or `phi` should be specified')

    topic_names = [f'topic_{i}' for i in range(num_topics)]

    if seed is None:
        artm_model = artm.ARTM(topic_names=topic_names)
    else:
        artm_model = artm.ARTM(topic_names=topic_names, seed=seed)

    artm_model.num_processors = num_processors
    artm_model.initialize(dictionary)

    if phi is None:
        pass
    elif num_safe_fit_iterations is not None and num_safe_fit_iterations > 0:
        init_phi_utils._safe_copy_phi(artm_model, phi, dataset,
                                      num_safe_fit_iterations)
    else:
        init_phi_utils._copy_phi(artm_model, phi)

    topic_model = TopicModel(artm_model=artm_model,
                             model_id='0',
                             cache_theta=cache_theta,
                             theta_columns_naming='title')

    if scores is not None:
        for score in scores:
            score._attach(topic_model)

    return topic_model
def load(self, path):
    """
    :Description: loads models of already constructed hierarchy

    :param str path: a path where hierarchy was saved by hARTM.save method

    :raises ValueError: if ``path`` does not contain exactly one ``info.dump``\
     file or the number of ``*.model`` files does not match the saved info

    :Notes:
      * Loaded models will overwrite ARTM.topic_names and class_ids fields of each level.
      * All class_ids weights will be set to 1.0, you need to specify them by\
     hand if it's necessary.
      * The method call will empty ARTM.score_tracker of each level.
      * All regularizers and scores will be forgotten.
      * etc.
      * We strongly recommend you to reset all important parameters of the ARTM\
     models and hARTM, used earlier.
    """
    info_filename = glob.glob(os.path.join(path, "info.dump"))
    if len(info_filename) != 1:
        raise ValueError("Given path is not hARTM safe")
    with open(info_filename[0], "rb") as fin:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # hierarchies from trusted paths.
        info = pickle.load(fin)
    model_filenames = glob.glob(os.path.join(path, "*.model"))
    # Two model files are expected per level, and the number of levels is the
    # number of parent weights plus one; the set collapses to one element
    # only when both counts agree.
    if len(
        {len(info["parent_level_weight"]) + 1,
         len(model_filenames) / 2}) > 1:
        raise ValueError("Given path is not hARTM safe")
    # The sorted order determines which file is read for which level below.
    model_filenames = sorted(model_filenames)
    self._levels = []
    for level_idx, num_topics in enumerate(info["num_level_topics"]):
        if not len(self._levels):
            # The first level is a plain ARTM model.
            model = artm.ARTM(num_topics=num_topics,
                              seed=self._get_seed(level_idx),
                              **self._common_models_args)
        else:
            # Every further level is attached to the previously loaded one.
            parent_level_weight = info["parent_level_weight"][level_idx - 1]
            model = ARTM_Level(parent_model=self._levels[-1],
                               phi_batch_weight=parent_level_weight,
                               phi_batch_path=self._tmp_files_path,
                               num_topics=num_topics,
                               seed=self._get_seed(level_idx),
                               **self._common_models_args)
        # Of the two files of this level, the second (in sorted order) is
        # loaded as "p_wt" and the first as "n_wt".
        filename = model_filenames[2 * level_idx + 1]
        model.load(filename, "p_wt")
        filename = model_filenames[2 * level_idx]
        model.load(filename, "n_wt")
        # Switch off the opt_for_avx option and push the changed config to
        # the master model.
        config = model.master._config
        config.opt_for_avx = False
        model.master._lib.ArtmReconfigureMasterModel(
            model.master.master_id, config)
        self._levels.append(model)
def artifacts(self, *args):
    """Load experimental results and the matching saved ARTM model.

    :param args: ``args[0]`` is the path of the saved model, ``args[1]`` the
        path of the experimental results json file.
    :return: the loaded ARTM model.
    """
    self.exp_res = ExperimentalResults.create_from_json_file(args[1])
    scalars = self.exp_res.scalars
    # Domain topics come first, background topics after them.
    self._topic_names = scalars.domain_topics + scalars.background_topics

    loaded_model = artm.ARTM(
        topic_names=scalars.domain_topics + scalars.background_topics,
        dictionary=self.dataset.lexicon,
        show_progress_bars=False)
    loaded_model.load(args[0])
    return loaded_model
def experiment_enviroment(request):
    """Create a topic model, its experiment and a two-stage cube pipeline description.

    Returns ``(tm, dataset, experiment, dictionary, cube_settings)`` where
    ``cube_settings`` lists a ``CubeCreator`` stage over seeds followed by a
    ``RegularizersModifierCube`` stage over tau values.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    model_artm = artm.ARTM(
        num_processors=3,
        num_topics=5,
        cache_theta=True,
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')])
    for sparsity_score in (
            artm.SparsityPhiScore(name='SparsityPhiScore'),
            artm.SparsityThetaScore(name='SparsityThetaScore')):
        model_artm.scores.add(sparsity_score)

    ex_score = ScoreExample()
    tm = TopicModel(model_artm,
                    model_id='new_id',
                    custom_scores={'example_score': ex_score})

    experiment = Experiment(tm,
                            experiment_id="test_pipeline",
                            save_path="tests/experiments")

    seed_stage = {
        'CubeCreator': {
            'num_iter': 10,
            'parameters': [
                {
                    'name': 'seed',
                    'values': [82019, 322],
                },
            ],
            'reg_search': 'grid',
            'separate_thread': USE_MULTIPROCESS,
        },
        'selection': [
            'model.seed = 82019 and PerplexityScore -> min COLLECT 2',
        ],
    }
    regularizer_stage = {
        'RegularizersModifierCube': {
            'num_iter': 10,
            'regularizer_parameters': {
                "regularizer": artm.regularizers.SmoothSparsePhiRegularizer(),
                "tau_grid": [0.1, 0.5, 1, 5, 10],
            },
            'reg_search': 'grid',
            'use_relative_coefficients': False,
            'separate_thread': USE_MULTIPROCESS,
        },
        'selection': [
            'PerplexityScore -> max COLLECT 2',
        ],
    }
    cube_settings = [seed_stage, regularizer_stage]

    return tm, dataset, experiment, dictionary, cube_settings
def create_and_learn_ARTM_decorPhi_modal(name="", topic_number=750,
                                         num_collection_passes=1,
                                         weigths=(1., 1., 1., 1.),
                                         decorTau=1.0):
    """Train a four-modality ARTM model with a decorrelator on the extra modalities.

    :param name: collection name; data is read from ``./<name>`` in Vowpal
        Wabbit format and batches are written to ``folder<name>``.
    :param topic_number: number of topics (named ``topic_0`` ..).
    :param num_collection_passes: number of passes of ``fit_offline``.
    :param weigths: weights of the ``@text``, ``@first``, ``@second`` and
        ``@third`` modalities, in this order (any indexable of four numbers).
    :param decorTau: tau of the Phi decorrelator applied to ``@first``,
        ``@second`` and ``@third``.
    :return: ``(model, theta_train)`` - the fitted model and the result of
        ``model.transform`` on the training batches.
    """
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weigths[0],
                          '@first': weigths[1],
                          '@second': weigths[2],
                          '@third': weigths[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))

    model.initialize(dictionary=dictionary)

    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1
    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)

    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)
    return model, theta_train
def add_level(self, num_topics=None, topic_names=None, parent_level_weight=1):
    """
    :Description: appends one more level to the hierarchy

    :param int num_topics: the number of topics in level model, will be
        overwriten if parameter topic_names is set
    :param topic_names: names of topics in model
    :type topic_names: list of str
    :param float parent_level_weight: the coefficient of smoothing n_wt by n_wa,
        a enumerates parent topics
    :return: ARTM or derived ARTM_Level instance

    :Notes:
      * hierarchy structure assumes the number of topics on each following level
        is greater than on previous one; a warning is issued otherwise
      * work with returned value as with usual ARTM model
      * to access any level, use [] or get_level method
      * Important! You cannot add next level before previous one is
        initialized and fit.
    """
    if topic_names is not None:
        num_topics = len(topic_names)

    level_idx = len(self._levels)
    if not level_idx:
        # The root level is a plain ARTM model.
        new_level = artm.ARTM(num_topics=num_topics,
                              topic_names=topic_names,
                              seed=self._get_seed(level_idx),
                              **self._common_models_args)
    else:
        parent = self._levels[-1]
        if num_topics <= parent.num_topics:
            warnings.warn(
                "Adding level with num_topics = %s less or equal than parent level's num_topics = %s"
                % (num_topics, parent.num_topics))
        new_level = ARTM_Level(parent_model=parent,
                               phi_batch_weight=parent_level_weight,
                               phi_batch_path=self._tmp_files_path,
                               model_name=self._model_name,
                               num_topics=num_topics,
                               topic_names=topic_names,
                               seed=self._get_seed(level_idx),
                               **self._common_models_args)
    self._levels.append(new_level)

    level = self._levels[-1]
    # Switch off the opt_for_avx option and push the changed config to the
    # level's master model.
    master = level.master
    config = master._config
    config.opt_for_avx = False
    master._lib.ArtmReconfigureMasterModel(master.master_id, config)
    return level
def test_custom_regularizer_cubed(experiment_enviroment, thread_flag, by_name):
    """Check that RegularizersModifierCube applies every tau of the grid to a custom regularizer.

    One model per grid value is expected, and each restored model must carry
    the corresponding tau. The regularizer is addressed either by name or by
    object, depending on ``by_name``.
    """
    _, dataset, _, dictionary = experiment_enviroment

    tau_grid = [1, 0, -1]
    custom_reg = TopicPriorSampledRegularizer(name='topic_prior',
                                              tau=5,
                                              num_topics=5,
                                              beta_prior=[10, 1, 100, 2, 1000])

    model_artm = artm.ARTM(
        num_processors=1,
        num_topics=5,
        cache_theta=True,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0},
        num_document_passes=1,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore', )],
    )

    # When addressed by name, the regularizer must already be in the model.
    attached_regularizers = {custom_reg.name: custom_reg} if by_name else {}
    tm = TopicModel(model_artm,
                    model_id='new_id_1',
                    custom_regularizers=attached_regularizers)

    experiment = Experiment(  # noqa: F841
        tm, experiment_id="cubed_reg", save_path="tests/experiments")

    regularizer_parameters = {"tau_grid": tau_grid}
    if not by_name:
        regularizer_parameters["regularizer"] = custom_reg
    else:
        regularizer_parameters["name"] = custom_reg.name

    cube = RegularizersModifierCube(
        num_iter=10,
        regularizer_parameters=regularizer_parameters,
        reg_search="grid",
        use_relative_coefficients=False,
        separate_thread=thread_flag)
    dummies = cube(tm, dataset)
    tmodels = [dummy.restore() for dummy in dummies]

    assert len(tmodels) == len(tau_grid)
    for expected_tau, one_model in zip(tau_grid, tmodels):
        assert one_model.all_regularizers[custom_reg.name].tau == expected_tau
def get_phi_index(dataset: Dataset) -> Index:
    """Return the index of the Phi matrix for the dataset's dictionary.

    A throw-away one-topic model is initialized with the dictionary only to
    read the index of its Phi matrix.
    """
    throwaway_artm = artm.ARTM(num_topics=1, num_processors=1)
    throwaway_artm.initialize(dictionary=dataset.get_dictionary())
    throwaway_model = TopicModel(artm_model=throwaway_artm)

    phi_index = throwaway_model.get_phi().index

    # Drop the references to the temporary models before returning.
    del throwaway_model
    del throwaway_artm

    return phi_index
def compute_big_artm(num_topics, tau, dictionary, batch_vectorizer, score_computer):
    """Fit an ARTM model, predict clusters from its Theta matrix and log the score.

    :param num_topics: number of topics of the model.
    :param tau: tau of the Theta smooth/sparse regularizer.
    :param dictionary: dictionary used to initialize the model.
    :param batch_vectorizer: batches used for fitting (10 collection passes).
    :param score_computer: object whose ``compute_score`` is applied to the
        ``story_id_predicted`` column of the predictions.
    """
    theta_regularizer = artm.SmoothSparseThetaRegularizer(name='r1', tau=tau)
    artm_model = artm.ARTM(num_topics=num_topics,
                           num_document_passes=5,
                           dictionary=dictionary,
                           scores=[artm.PerplexityScore(name='s1')],
                           regularizers=[theta_regularizer],
                           cache_theta=True)
    artm_model.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=10)

    # `url_list` is expected to be available in the enclosing module scope.
    bigartm_predicts = get_df_clusters_predicted(artm_model.get_theta(), url_list)
    score = score_computer.compute_score(bigartm_predicts["story_id_predicted"])

    message = "num_topics={}, tau={},bigARTM score = {}".format(num_topics, tau, score)
    logging.info(message)