def init_hierarchical_model(class_ids):
    """Build an hARTM model pre-wired with perplexity/top-token/sparsity scores
    and decorrelation/smooth-sparse regularizers (all taus start at 0, i.e.
    the regularizers are no-ops until tuned later)."""
    score = [artm.PerplexityScore(name='perplexity_words', class_ids=['body']),
             artm.PerplexityScore(name='perplexity_bigrams', class_ids=['bigrams'])]
    top_tokens = [artm.TopTokensScore(name='top_words', num_tokens=15, class_id='body'),
                  artm.TopTokensScore(name='top_bigrams', num_tokens=10, class_id='bigrams')]
    sparsity = [artm.SparsityThetaScore(name='sparsity_theta', eps=1e-6),
                # NOTE(review): class_id='words' while the scores above use 'body' —
                # confirm this modality name is correct for the dataset.
                artm.SparsityPhiScore(name='sparsity_phi_words', class_id='words', eps=1e-6),
                artm.SparsityPhiScore(name='sparsity_phi_bigrams', class_id='bigrams', eps=1e-6)]
    regularizers = [
        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['body'], name='decorr_words'),
        # NOTE(review): 'bigram' (singular) here vs the 'bigrams' modality used by
        # the scores above — verify this is not a typo (same below for
        # 'sparsity_bigrams').
        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['bigram'], name='decorr_bigrams'),
        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['categories'], name='decorr_categories'),
        artm.SmoothSparseThetaRegularizer(tau=0, name='sparsity_theta'),
        artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['body'], name='sparsity_words'),
        artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['bigram'], name='sparsity_bigrams')]
    hmodel = artm.hARTM(class_ids=class_ids,
                        cache_theta=True,
                        reuse_theta=True,
                        scores=score + top_tokens + sparsity,
                        regularizers=regularizers,
                        theta_columns_naming='title')
    return hmodel
def test_perplexity_strategy_mul(experiment_enviroment):
    """Check PerplexityStrategy with multiplicative ('mul') tau search:
    taus grow geometrically from 0.001 by factor 10 until the perplexity
    threshold warning fires, and the best point is tau == 1.0."""
    tm, dataset, experiment, dictionary = experiment_enviroment
    regularizer_parameters = {
        "regularizer": artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                                       class_ids=MAIN_MODALITY),
        "tau_grid": []  # empty: the strategy generates the taus itself
    }
    # NOTE(review): sibling tests pass `use_relative_coefficients=...`; this one
    # passes `relative_coefficients=...` — confirm the keyword is accepted.
    cube = RegularizersModifierCube(
        num_iter=20,
        regularizer_parameters=regularizer_parameters,
        strategy=PerplexityStrategy(0.001, 10, 25, threshold=1.0),
        tracked_score_function='PerplexityScore',
        reg_search='mul',
        relative_coefficients=False,
        verbose=True)
    # The search must eventually overshoot the threshold and warn.
    with pytest.warns(UserWarning, match="Perplexity is too high for threshold"):
        tmodels = cube(tm, dataset)
    visited_taus = extract_visited_taus(tmodels)
    expected_taus = [0, 0.001, 0.01, 0.1, 1.0, 10.0]
    assert visited_taus == expected_taus
    # Score values depend on the environment; mismatch is reported as a warning,
    # not a failure.
    SCORES = [3.756, 3.75, 3.72, 6.043]
    real_scores = extract_strategic_scores(cube)
    if real_scores != SCORES:
        warnings.warn(f"real_scores == {real_scores}"
                      f"expected == {SCORES}")
    assert cube.strategy.best_point[0][2] == 1.0
def init_model(self, dictionary_path=None):
    """Create the ARTM dictionary and model with default scores/regularizers.

    dictionary_path: optional, used with pretrained model
    """
    self.dictionary = artm.Dictionary()
    if dictionary_path is None:
        # Fresh run: build the dictionary from the batches, prune rare
        # (min_tf) and ubiquitous (max_df_rate) tokens, and persist it
        # for later reuse.
        self.dictionary.gather(data_path=self.batches_path)
        self.dictionary.filter(min_tf=10, max_df_rate=0.1)
        self.dictionary.save_text(
            f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")
    else:
        # Pretrained model: reuse a previously saved dictionary as-is.
        self.dictionary.load_text(dictionary_path)
    self.model = artm.ARTM(
        num_topics=self.n_topics,
        dictionary=self.dictionary,
        show_progress_bars=True,
    )
    # scores
    self.model.scores.add(
        artm.PerplexityScore(name="PerplexityScore",
                             dictionary=self.dictionary))
    self.model.scores.add(
        artm.SparsityThetaScore(name="SparsityThetaScore"))
    self.model.scores.add(artm.SparsityPhiScore(name="SparsityPhiScore"))
    # regularizers: negative tau => sparsing, large positive tau => decorrelation
    self.model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1))
    self.model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5))
    self.model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5))
def define_model(n_topics: int, dictionary: artm.Dictionary, sparse_theta: float,
                 sparse_phi: float, decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Construct an (untrained) ARTM topic model with the standard score set
    and smooth/sparse + decorrelation regularizers.

    :param n_topics: number of topics.
    :param dictionary: batch vectorizer dictionary.
    :param sparse_theta: tau for the theta smooth/sparse regularizer.
    :param sparse_phi: tau for the phi smooth/sparse regularizer.
    :param decorrelator_phi: tau for the phi decorrelator.
    :return: ARTM model.
    """
    print("Defining the model.")
    names = [f"topic_{index}" for index in range(1, n_topics + 1)]
    scores = [
        artm.PerplexityScore(name="PerplexityScore", dictionary=dictionary),
        artm.SparsityPhiScore(name="SparsityPhiScore"),
        artm.SparsityThetaScore(name="SparsityThetaScore"),
        artm.TopicKernelScore(name="TopicKernelScore",
                              probability_mass_threshold=0.3),
        artm.TopTokensScore(name="TopTokensScore", num_tokens=15),
    ]
    regularizers = [
        artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=sparse_theta),
        artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
        artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                        tau=decorrelator_phi),
    ]
    return artm.ARTM(topic_names=names,
                     cache_theta=True,
                     scores=scores,
                     regularizers=regularizers)
def test_perplexity_strategy_grid(experiment_enviroment, thread_flag):
    """Check PerplexityStrategy with reg_search='grid': the explicit tau_grid
    is used (after a warning), and the best point is the last tau (50)."""
    tm, dataset, experiment, dictionary = experiment_enviroment
    tau_grid = [0.1, 0.5, 1, 5, 50]
    regularizer_parameters = {
        "regularizer": artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                                       class_ids=MAIN_MODALITY),
        "tau_grid": tau_grid
    }
    cube = RegularizersModifierCube(
        num_iter=3,
        regularizer_parameters=regularizer_parameters,
        strategy=PerplexityStrategy(1, 5),
        tracked_score_function='PerplexityScore',
        reg_search="grid",
        use_relative_coefficients=False,
        separate_thread=thread_flag)
    # The strategy's own progression is ignored in grid mode — a warning
    # announces that the explicit grid is used instead.
    with pytest.warns(UserWarning, match='Grid would be used instead'):
        dummies = cube(tm, dataset)
    # Cube returns dummy wrappers; restore() materialises the trained models.
    tmodels = [dummy.restore() for dummy in dummies]
    visited_taus = extract_visited_taus(tmodels)
    expected_taus = [0] + tau_grid  # tau=0 baseline is always visited first
    assert visited_taus == expected_taus
    # Score values depend on the environment; mismatch warns, not fails.
    SCORES = [3.756, 3.756, 3.753, 3.75, 3.72, 2.887]
    real_scores = extract_strategic_scores(cube)
    if real_scores != SCORES:
        warnings.warn(f"real_scores == {real_scores}"
                      f"expected == {SCORES}")
    assert cube.strategy.best_point[0][2] == 50
def test_perplexity_strategy_add(experiment_enviroment, thread_flag):
    """Check PerplexityStrategy with additive ('add') tau search: taus grow
    arithmetically (start=1, step=1) until max_len=5 is hit, and the best
    point is the largest visited tau (5)."""
    tm, dataset, experiment, dictionary = experiment_enviroment
    regularizer_parameters = {
        "regularizer": artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                                       class_ids=MAIN_MODALITY),
        "tau_grid": []  # empty: the strategy generates the taus itself
    }
    cube = RegularizersModifierCube(
        num_iter=3,
        regularizer_parameters=regularizer_parameters,
        strategy=PerplexityStrategy(1, 1, max_len=5),
        tracked_score_function='PerplexityScore',
        reg_search='add',
        use_relative_coefficients=False,
        verbose=True,
        separate_thread=thread_flag)
    # The progression is cut off by max_len, which is reported via a warning.
    with pytest.warns(UserWarning, match="Max progression length"):
        dummies = cube(tm, dataset)
    # Cube returns dummy wrappers; restore() materialises the trained models.
    tmodels = [dummy.restore() for dummy in dummies]
    visited_taus = extract_visited_taus(tmodels)
    expected_taus = [0, 1, 2, 3, 4, 5]  # tau=0 baseline + 5 additive steps
    assert visited_taus == expected_taus
    # Score values depend on the environment; mismatch warns, not fails.
    SCORES = [3.756, 3.75, 3.743, 3.736, 3.728, 3.72]
    real_scores = extract_strategic_scores(cube)
    if real_scores != SCORES:
        warnings.warn(f"real_scores == {real_scores}"
                      f"expected == {SCORES}")
    assert cube.strategy.best_point[0][2] == 5
def create_model_fn_4(n_iteration):
    """Model factory for the "m4" experiment series: 100 topics, decorrelation
    plus theta/phi sparsing, seeded per iteration for reproducibility.

    NOTE(review): relies on module-level names (dictionary, plot_maker,
    batch_vectorizer, models_file, config) defined elsewhere in the file.
    """
    tmp_model = cmh.create_model(current_dictionary=dictionary,
                                 n_topics=100,
                                 n_doc_passes=5,
                                 seed_value=100 + n_iteration,
                                 n_top_tokens=15,
                                 p_mass_threshold=0.25)
    tmp_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer',
                                        class_ids=['@default_class']))
    # Taus: decorrelate topics, sparse both theta (-0.5) and phi (-2).
    tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
    tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
    tmp_model.regularizers['ss_phi_regularizer'].tau = -2
    tmp_model = cmh.fit_one_model(
        plot_maker, batch_vectorizer, models_file, config, tmp_model,
        _n_iterations=20,
        _model_name='model_20_m4_iter_{}'.format(n_iteration))
    return tmp_model
def test_experiment_prune(cls):
    """Check that an experiment seeded from a depth-1 model of a previous run
    ("pruning") starts with exactly one model in its tree."""
    # Detach the model from any previous experiment so a new one can own it.
    cls.topic_model.experiment = None
    experiment_run = Experiment(
        cls.topic_model,
        experiment_id="run_experiment",
        save_path=cls.experiment_path,
    )
    # NOTE(review): sibling tests pass `use_relative_coefficients=...`; this one
    # passes `relative_coefficients=...` — confirm the keyword is accepted.
    test_cube = RegularizersModifierCube(
        num_iter=5,
        regularizer_parameters={
            'regularizer': artm.DecorrelatorPhiRegularizer(name='decorrelation_phi', tau=1),
            'tau_grid': [],
        },
        strategy=PerplexityStrategy(0.001, 10, 25, threshold=1.0),
        tracked_score_function='PerplexityScore@all',
        reg_search='mul',
        relative_coefficients=False,
        verbose=True)
    test_cube(cls.topic_model, cls.dataset)
    experiment_run.set_criteria(1, 'some_criterion')
    # Seed a fresh experiment from a model at depth 1 of the first run's tree.
    new_seed = experiment_run.get_models_by_depth(level=1)[0]
    experiment = Experiment(
        topic_model=new_seed,
        experiment_id="prune_experiment",
        save_path=cls.experiment_path,
        save_model_history=True,
    )
    assert len(experiment.models) == 1
def decor_train(self):
    """Attach the 'decorr' decorrelation regularizer (over the specific
    topics, with strength ``self.decor``) to the current model."""
    if self.model is None:
        # Nothing to configure yet — the model must be initialised first.
        print('Initialise the model first')
        return
    decorrelator = artm.DecorrelatorPhiRegularizer(
        name='decorr',
        topic_names=self.specific,
        tau=self.decor,
    )
    self.model.regularizers.add(decorrelator)
def create_and_learn_ARTM_decorPhi_modal(name="", topic_number=750,
                                         num_collection_passes=1,
                                         weigths=[1., 1., 1., 1.],
                                         decorTau=1.0):
    """Build and train a 4-modality ARTM model whose non-text modalities
    ('@first'/'@second'/'@third') are decorrelated; returns the trained
    model and its train-set theta matrix.

    NOTE(review): `weigths` (sic) is a mutable default argument — it is only
    read here, but a `None` default with a fresh list per call would be safer.
    """
    batch_vectorizer_train = None  # NOTE(review): dead assignment, overwritten below
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]
    # Modality weights come positionally from `weigths`.
    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weigths[0],
                          '@first': weigths[1],
                          '@second': weigths[2],
                          '@third': weigths[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    # Decorrelate only the modal (non-text) modalities; added before
    # initialize(), the remaining scores after — keep this ordering.
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))
    model.initialize(dictionary=dictionary)
    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))
    model.num_document_passes = 1
    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)
    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)
    return model, theta_train
def create_model_fn_20_complex_reg_1(n_iteration):
    """Model factory for the "complex_reg_1" series: 20 topics split into
    2 common + 18 subject topics, with opposite-sign smooth/sparse
    regularization for the two groups.

    NOTE(review): relies on module-level names (dictionary, plot_maker,
    batch_vectorizer, models_file, config) defined elsewhere in the file.
    """
    n_topics = 20
    common_topics = [u'topic_0', u'topic_1']
    subject_topics = list(
        set([u'topic_{}'.format(idx) for idx in range(2, 20)]) -
        set(common_topics))
    tmp_model = create_model_complex(current_dictionary=dictionary,
                                     n_topics=n_topics,
                                     n_doc_passes=5,
                                     seed_value=100 + n_iteration,
                                     n_top_tokens=15,
                                     p_mass_threshold=0.25,
                                     common_topics=common_topics,
                                     subject_topics=subject_topics)
    # subject topics: sparse theta/phi and decorrelate
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer_subject',
                                          topic_names=subject_topics))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer_subject',
                                        topic_names=subject_topics,
                                        class_ids=['@default_class']))
    tmp_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='decorrelator_phi_regularizer_subject',
            topic_names=subject_topics,
            class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer_subject'].tau = -0.5
    tmp_model.regularizers['ss_phi_regularizer_subject'].tau = -0.5
    tmp_model.regularizers['decorrelator_phi_regularizer_subject'].tau = -10
    # common topics: smooth theta/phi
    # NOTE(review): both "_common" regularizers are configured with
    # topic_names=subject_topics — `common_topics` was likely intended;
    # confirm before changing, as the positive taus only make sense for
    # the common group.
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer_common',
                                          topic_names=subject_topics))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer_common',
                                        topic_names=subject_topics,
                                        class_ids=['@default_class']))
    # tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer_common',
    #                            topic_names=subject_topics, class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer_common'].tau = 0.5
    tmp_model.regularizers['ss_phi_regularizer_common'].tau = 0.5
    # tmp_model.regularizers['decorrelator_phi_regularizer_common'].tau = -10
    tmp_model = fit_one_model_complex(
        plot_maker, batch_vectorizer, models_file, config, tmp_model,
        _n_iterations=20,
        _model_name='model_20_complex_reg_1_iter_{}'.format(n_iteration))
    return tmp_model
def generate_decorrelators(
        specific_topic_names_lvl1,
        background_topic_names_lvl1,
        words_class_ids=MAIN_MODALITY,
        class_ids_for_bcg_decorrelation=MAIN_MODALITY,
        ngramms_modalities_for_decor=NGRAM_MODALITY):
    """
    Creates an array of pre-configured decorrelation regularizers, one per
    (topic group x modality group) combination, using fixed coefficients.
    """
    # Fixed decorrelation strengths (kept as the original arithmetic
    # expressions so the float values are bit-identical).
    tau_ngramms = 5*1e-3
    tau_words_specific = 5*1e-2
    tau_words_bcg = 5*1e-3
    words_specific = artm.DecorrelatorPhiRegularizer(
        gamma=0,
        tau=tau_words_specific,
        name='decorrelation',
        topic_names=specific_topic_names_lvl1,
        class_ids=words_class_ids,
    )
    words_background = artm.DecorrelatorPhiRegularizer(
        tau=tau_words_bcg,
        name='decorrelation_background',
        topic_names=background_topic_names_lvl1,
        class_ids=words_class_ids,
    )
    ngramms_specific = artm.DecorrelatorPhiRegularizer(
        tau=tau_ngramms,
        name='decorrelation_ngramms',
        topic_names=specific_topic_names_lvl1,
        class_ids=ngramms_modalities_for_decor
    )
    ngramms_background = artm.DecorrelatorPhiRegularizer(
        tau=tau_ngramms,
        name='decorrelation_ngramms_background',
        topic_names=background_topic_names_lvl1,
        class_ids=class_ids_for_bcg_decorrelation
    )
    return [words_specific, words_background,
            ngramms_specific, ngramms_background]
def create_topic_model(self, topic_model_name: str,
                       batch_vectorizer: artm.BatchVectorizer,
                       dictionary: artm.Dictionary) -> artm.ARTM:
    """Train an ARTM model in 10 restarts of 3 passes each, checkpointing the
    state with the lowest perplexity seen so far, then reload and return that
    best checkpoint.

    :param topic_model_name: name under which the best model is saved/loaded.
    :param batch_vectorizer: training batches.
    :param dictionary: ARTM dictionary for model initialisation and perplexity.
    :return: the reloaded best model.
    """
    topic_model = artm.ARTM(num_topics=self.number_of_topics,
                            dictionary=dictionary,
                            cache_theta=False)
    topic_model.scores.add(
        artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    topic_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    topic_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    topic_model.num_document_passes = 5
    # Leave one core free for the rest of the process.
    topic_model.num_processors = max(1, os.cpu_count() - 1)
    topic_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    topic_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    topic_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
    topic_model.regularizers['sparse_phi_regularizer'].tau = -1.0
    topic_model.regularizers['sparse_theta_regularizer'].tau = -0.5
    topic_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+5
    best_score = None
    keyword_extraction_logger.info(
        'epoch perplexity_score sparsity_phi_score sparsity_theta_score'
    )
    for restart_index in range(10):
        topic_model.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=3)
        current_score = topic_model.score_tracker['perplexity_score'].last_value
        # BUGFIX: previously the first restart only recorded best_score and
        # never called save_topic_model(), so if perplexity never improved
        # afterwards, load_topic_model() below would load a checkpoint that
        # was never written. Save on the first restart as well.
        if best_score is None or best_score > current_score:
            best_score = current_score
            self.save_topic_model(topic_model, topic_model_name)
        keyword_extraction_logger.info(
            '{0:5} {1:16.9} {2:18.9} {3:20.9}'.format(
                (restart_index + 1) * 3,
                topic_model.score_tracker['perplexity_score'].last_value,
                topic_model.score_tracker['sparsity_phi_score'].last_value,
                topic_model.score_tracker['sparsity_theta_score'].last_value))
    del topic_model
    # Return a freshly loaded copy of the best checkpoint.
    return self.load_topic_model(
        artm.ARTM(num_topics=self.number_of_topics,
                  dictionary=dictionary,
                  cache_theta=False), topic_model_name)
def _get_corpus_model(self, corpus_vector_spaced, clustering_method='artm'):
    """Dispatch to a topic-modeling backend and return the document-topic
    matrix (ARTM branch) or the backend-specific model result.

    clustering_method: 'gensim' (LSI), 'sklearn' (LDA) or 'artm' (default).
    Note: falls through with no return (None) for unknown methods.
    """
    if 'gensim' == clustering_method:
        return self._get_model_LSI(corpus_vector_spaced)
    elif 'sklearn' == clustering_method:
        return self._get_model_LDA(corpus_vector_spaced)
    elif 'artm' == clustering_method:
        batch_vectorizer = corpus_vector_spaced['batch_vectorizer']
        dictionary = corpus_vector_spaced['dictionary']
        topic_names = [
            'topic_{}'.format(i) for i in range(self.num_of_clusters)
        ]
        model_artm = artm.ARTM(
            topic_names=topic_names,
            cache_theta=True,
            scores=[
                artm.PerplexityScore(name='PerplexityScore',
                                     dictionary=dictionary)
            ],
            regularizers=[
                artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                  tau=-0.15)
            ])
        model_artm.scores.add(
            artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(
            artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(
            artm.TopicKernelScore(name='TopicKernelScore',
                                  probability_mass_threshold=0.3))
        model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                                  num_tokens=10),
                              overwrite=True)
        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
        # Strengthen the theta sparser set at construction time above.
        model_artm.regularizers['SparseTheta'].tau = -0.2
        model_artm.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=1.5e+5))
        model_artm.num_document_passes = 1
        model_artm.initialize(dictionary)
        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=30)
        # Transpose so rows are topics(clusters), columns are documents.
        return model_artm.transform(batch_vectorizer=batch_vectorizer).T
def experiment(filename, tau_phi, tau_theta):
    """Train PLSA, ARTM (with the given phi/theta taus) and LDA side by side
    on a vowpal_wabbit file, then print comparative measures."""
    batch_vectorizer = artm.BatchVectorizer(data_path=filename,
                                            data_format='vowpal_wabbit',
                                            target_folder='batches')
    dictionary = batch_vectorizer.dictionary
    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    # Three models trained on the same data: regularized ARTM, plain PLSA
    # (ARTM without regularizers) and LDA.
    model_artm = artm.ARTM(topic_names=topic_names,
                           dictionary=dictionary,
                           cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names,
                           cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore',
                                                        dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)
    # Identical score sets for ARTM and PLSA so print_measures can compare them.
    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score',
                                              num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score',
                                                probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))
    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score',
                                                probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))
    # Regularizers only on the ARTM model; taus come from the caller except
    # the fixed decorrelation strength.
    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3
    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)
    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=passes)
    print_measures(model_plsa, model_artm, model_lda)
def create_thematic_model(checked_list, num_topics, num_tokens, phi_tau,
                          theta_tau, decorr_tau):
    """ Create a thematic model

    Builds batches from the checked documents, trains an ARTM model with the
    given regularizer taus, and returns the final perplexity, phi/theta
    sparsity values and an OrderedDict of "token-weight" strings per topic.
    """
    gluing_bag_of_words(checked_list)
    batch_vectorizer = artm.BatchVectorizer(data_path=COLLECTION_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER,
                                            batch_size=len(checked_list))
    dictionary = artm.Dictionary(data_path=TARGET_FOLDER)
    model = artm.ARTM(
        # NOTE(review): both num_document_passes and num_collection_passes are
        # tied to len(checked_list) — confirm this scaling is intentional.
        num_topics=num_topics,
        num_document_passes=len(checked_list),
        dictionary=dictionary,
        regularizers=[
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                            tau=phi_tau),
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                              tau=theta_tau),
            artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                            tau=decorr_tau),
        ],
        scores=[
            artm.PerplexityScore(name='perplexity_score',
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            artm.TopTokensScore(name='top_tokens_score',
                                num_tokens=num_tokens)
        ])
    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=len(checked_list))
    top_tokens = model.score_tracker['top_tokens_score']
    topic_dictionary = OrderedDict()
    for topic_name in model.topic_names:
        list_name = []
        # Render each top token as "token-weight" rounded to 3 decimals.
        for (token, weight) in zip(top_tokens.last_tokens[topic_name],
                                   top_tokens.last_weights[topic_name]):
            list_name.append(token + '-' + str(round(weight, 3)))
        topic_dictionary[str(topic_name)] = list_name
    return model.score_tracker[
        'perplexity_score'].last_value, model.score_tracker[
            'sparsity_phi_score'].last_value, model.score_tracker[
                'sparsity_theta_score'].last_value, topic_dictionary
def topic_model(class_ids, dictionary, num_of_topics, num_back, tau, tf):
    """Build an ARTM model whose last ``num_back`` topics are background
    topics (smoothed) while the rest are subject topics (sparsed and
    decorrelated).

    :param class_ids: modality weights for the model and regularizers.
    :param dictionary: artm.Dictionary; filtered in place by min_tf per modality.
    :param num_of_topics: total number of topics.
    :param num_back: number of trailing background topics.
    :param tau: base regularization strength (sign applied per regularizer).
    :param tf: min_tf threshold for dictionary filtering.
    :return: configured (untrained) artm.ARTM model.
    """
    names_of_topics = [str(x) for x in range(num_of_topics)]
    dictionary.filter(min_tf=tf, class_id='subjects')
    dictionary.filter(min_tf=tf, class_id='objects')
    dictionary.filter(min_tf=tf, class_id='pairs')
    model = artm.ARTM(
        num_topics=num_of_topics,
        cache_theta=True,
        topic_names=names_of_topics,
        class_ids=class_ids,
        dictionary=dictionary)
    model.scores.add(
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))
    # Score name keeps the original spelling ('Sparcity') for compatibility
    # with any code that looks it up.
    model.scores.add(
        artm.SparsityPhiScore(name='SparcityPhiScore',
                              topic_names=model.topic_names[:-num_back]))
    # Sparse phi on subject topics, smooth phi on background topics.
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SparsePhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=-tau))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SmoothPhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[-num_back:],
            tau=tau))
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=tau))
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SparseThetaRegularizer',
            # BUGFIX: was model.topic_names[-num_back] (a single topic-name
            # string, i.e. a missing ':'); every parallel regularizer above
            # covers the whole background slice.
            topic_names=model.topic_names[-num_back:],
            tau=tau))
    return model
def train(self, batch_vectorizer):
    """Multi-stage offline training: n1 passes with decorrelation only, then
    (if background topics exist) n2 passes with background smoothing, then
    n3 passes with subject-topic sparsing."""
    if self.model is None:
        print('Initialise the model first!')
        return
    self.model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorr',
                                        topic_names=self.specific,
                                        tau=self.decor))
    # self.model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorr_2',
    #                             topic_names=self.back, tau=self.decor_2))
    self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=self.n1)
    # if ((self.n2 != 0) and (self.B != 0)):
    if (self.B != 0):
        # NOTE(review): 'SmoothPhi' (and 'SparsePhi' below) are created as
        # *Theta* regularizers — confirm SmoothSparsePhiRegularizer was not
        # intended for the Phi-named ones.
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SmoothPhi',
                                              topic_names=self.back,
                                              tau=self.spb))
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SmoothTheta',
                                              topic_names=self.back,
                                              tau=self.stb))
        self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=self.n2)
    self.model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='SparsePhi',
                                          topic_names=self.specific,
                                          tau=self.sp1))
    self.model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                          topic_names=self.specific,
                                          tau=self.st1))
    self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=self.n3)
    # if (self.n4 != 0):
    #     self.model.regularizers['SparsePhi'].tau = self.sp2
    #     self.model.regularizers['SparseTheta'].tau = self.st2
    #     self.model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=self.n4)
    print('Training is complete')
def pipeline_plsa_bigartm(lines, TOPIC_NUMBER, ngram_range, topnwords,
                          LOGS_DATA_PATH="plsa.txt", TARGET_FOLDER="plsa"):
    """End-to-end pipeline: write the lines to a vowpal_wabbit file, train a
    regularized ARTM model and return its per-topic top tokens.

    Returns a (placeholder_string, {topic_name: top_tokens}) tuple — the
    labelling step is currently disabled (see commented return below).
    """
    make_file(lines, ngram_range, LOGS_DATA_PATH)
    batch_vectorizer = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER)
    model_artm = artm.ARTM(num_topics=TOPIC_NUMBER, cache_theta=True)
    model_artm.initialize(dictionary=batch_vectorizer.dictionary)
    # NOTE(review): 'SparsePhi' has a positive tau (smoothing) and
    # 'SparseTheta' a small negative one — confirm the signs are intended.
    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=0.05))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
    model_artm.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.01))
    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                              num_tokens=topnwords),
                          overwrite=True)
    model_artm.scores.add(
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary=batch_vectorizer.dictionary))
    model_artm.num_document_passes = 2
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=15)
    # Collect the final top tokens for every topic.
    topic_names = {}
    for topic_name in model_artm.topic_names:
        topic_names[topic_name] = model_artm.score_tracker[
            'TopTokensScore'].last_tokens[topic_name]
    # return label_after_bigarm(model_artm), topic_names
    return "nothing, sorry", topic_names
def build_model(self, d_dir, n_document_passes=1):
    """Build an (untrained) ARTM model plus train/test batch vectorizers from
    the pre-built batch folders under ``d_dir``.

    Returns (model, batch_vectorizer_train, batch_vectorizer_test).
    """
    batch_vectorizer_train = artm.BatchVectorizer(
        data_path=os.path.join(d_dir, 'data_batches_train'),
        data_format="batches")
    batch_vectorizer_test = artm.BatchVectorizer(
        data_path=os.path.join(d_dir, 'data_batches_test'),
        data_format="batches")
    # Dictionary is gathered from a separate 'for_dict' folder.
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=os.path.join(d_dir, 'for_dict'))
    model = artm.ARTM(num_topics=self.n_topics,
                      dictionary=dictionary,
                      cache_theta=True,
                      reuse_theta=True)
    # Sparsity p(c|t)
    model.scores.add(
        artm.SparsityPhiScore(eps=EPS,
                              name='SparsityPhiScoreC',
                              class_id=self.c))
    # Sparsity p(w|t)
    model.scores.add(
        artm.SparsityPhiScore(eps=EPS,
                              name='SparsityPhiScoreGram3',
                              class_id=self.gram3))
    # Regularization of sparsity p(gram3|t); tau is left at its default and
    # is expected to be tuned by the caller.
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhiGram3Regularizer',
                                        class_ids=[self.gram3]))
    # Regularization of decorr p(gram3|t)
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhiGram3Regularizer',
            class_ids=[self.gram3]))
    model.num_document_passes = n_document_passes
    return (model, batch_vectorizer_train, batch_vectorizer_test)
def init_baseline_artm(
        dataset,
        modalities_to_use,
        main_modality,
        num_topics,
        bcg_topics,
        model_params: dict = None,
):
    """
    Creates simple artm model with standard scores.

    Builds a background-sparse model and decorrelates only its specific
    (non-background) topics.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str
    main_modality : str
    num_topics : int
    bcg_topics : int
        number of trailing background topics (excluded from decorrelation)
    model_params : dict
        optional; 'decorrelation_tau' overrides the default 0.01

    Returns
    -------
    model: artm.ARTM() instance
    """
    if model_params is None:
        model_params = dict()
    model = init_bcg_sparse_model(dataset, modalities_to_use, main_modality,
                                  num_topics, bcg_topics, model_params)
    # The last bcg_topics names are background topics; decorrelate the rest.
    specific_topic_names = model.topic_names[:-bcg_topics]
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            gamma=0,
            tau=model_params.get('decorrelation_tau', 0.01),
            name='decorrelation',
            topic_names=specific_topic_names,
            class_ids=modalities_to_use,
        ))
    return model
def init_decorrelated_plsa(dataset, modalities_to_use, main_modality,
                           num_topics, model_params: dict = None):
    """
    Creates a simple PLSA-based artm model and decorrelates all of its topics.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str
    main_modality : str
    num_topics : int
    model_params : dict
        optional; 'decorrelation_tau' overrides the default 0.01

    Returns
    -------
    model: artm.ARTM() instance
    """
    params = dict() if model_params is None else model_params
    model = init_plsa(dataset, modalities_to_use, main_modality, num_topics)
    decorrelation = artm.DecorrelatorPhiRegularizer(
        gamma=0,
        tau=params.get('decorrelation_tau', 0.01),
        name='decorrelation',
        # let's decorrelate everything
        topic_names=model.topic_names,
        class_ids=modalities_to_use,
    )
    model.regularizers.add(decorrelation)
    return model
topics_names = ["subject_" + str(i) for i in range(topic_num)] + \ ["background_" + str(i) for i in range(background_topic_num)] # назначаем имена темам subj_topics = topics_names[:topic_num] bgr_topics = topics_names[topic_num:] model = artm.ARTM( num_document_passes=document_passes_num, num_topics=topic_num + background_topic_num, topic_names=topics_names, seed=100, # helps to get stable results num_processors=processors_num) model.regularizers.add( artm.DecorrelatorPhiRegularizer(name='Decorrelator', tau=10**4)) # обычный декоррелятор model.regularizers.add( artm.SmoothSparseThetaRegularizer( name='SmoothTheta', topic_names=bgr_topics, tau=0.3)) # сглаживаем Theta для фоновых тем model.regularizers.add( artm.SmoothSparseThetaRegularizer( name='SparseTheta', topic_names=subj_topics, tau=-0.3)) # разреживаем Theta для "хороших" тем model.regularizers.add( artm.SmoothSparsePhiRegularizer( name='SmoothPhi', topic_names=bgr_topics, class_ids=["text"], tau=0.1)) # сглаживаем Theta для фоновых тем model.regularizers.add(
model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score')) model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score')) model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score')) model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary)) model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score')) model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score')) model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score')) model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3)) model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score')) model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score')) model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score')) model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer')) model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer')) model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer')) model_artm.regularizers['sparse_phi_regularizer'].tau = 0.01 model_artm.regularizers['sparse_theta_regularizer'].tau = -1.06 # model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+5 model_plsa.initialize(dictionary=dictionary) model_artm.initialize(dictionary=dictionary) model_lda.initialize(dictionary=dictionary) passes = 10 model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes) model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes) model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
dictionary.load(dictionary_path=(filename + '/dictionary.dict')) dictionary.load(dictionary_path=(filename + '/dictionary.dict')) model_artm.initialize(dictionary=dictionary) model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore')) model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore')) model_artm.scores.add( artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.3)) model_artm.regularizers.add( artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1)) model_artm.regularizers.add( artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5)) model_artm.regularizers.add( artm.TopicSelectionThetaRegularizer(name='TopicSelection', tau=0.25)) model_artm.regularizers['SparsePhi'].tau = -0.5 model_artm.regularizers['SparseTheta'].tau = -0.5 model_artm.regularizers['DecorrelatorPhi'].tau = 1e+5 model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=10)) model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=40) ex = model_artm.get_theta() for i, el in enumerate(ex.sum(axis=1)):
def __init__(self, dictionary, class_ids, tmp_files_path='', theta_columns_naming='title',
             cache_theta = True, num_levels=None, level_names=None, num_topics=None,
             topic_names=None, num_backgrounds=None, background_names=None,
             smooth_background_tau=None, decorrelate_phi_tau=None,
             parent_topics_proportion=None, spars_psi_tau=None, smooth_theta_fit=1.0,
             num_collection_passes=1, num_tokens=10):
    """Build and configure a hierarchical ARTM (hARTM) model.

    One level is added per generated level name; each level contains its
    subject topics followed by its background topics.  Each optional
    regularizer is attached only when its ``*_tau`` argument is given, and
    every modality from ``class_ids`` with a positive weight gets a
    TopTokensScore on every level.
    """
    self.model = artm.hARTM(dictionary=dictionary, class_ids=class_ids,
                            theta_columns_naming=theta_columns_naming,
                            tmp_files_path=tmp_files_path, cache_theta=cache_theta)
    # Resolve level/topic/background names.
    # NOTE(review): _generate_names / _generate_names_levels are defined
    # elsewhere in this file; presumably explicit name lists win over counts
    # and missing names are auto-generated from the given prefix — confirm.
    self.level_names = _generate_names(num_levels, level_names, 'level')
    topic_names = _generate_names_levels(len(self.level_names), num_topics,
                                         topic_names, 'topic')
    background_names = _generate_names_levels(len(self.level_names), num_backgrounds,
                                              background_names, 'background')
    # Every level holds both its subject topics and its background topics.
    for topic_names_level, background_names_level in zip(topic_names, background_names):
        topic_names_level = topic_names_level + background_names_level
        level = self.model.add_level(num_topics=len(topic_names_level),
                                     topic_names=topic_names_level)
    # Smooth only the background topics of each level (gamma=0).
    if smooth_background_tau is not None:
        for level, background_names_level in zip(self.model, background_names):
            level.regularizers.add(artm.SmoothSparsePhiRegularizer(
                'SPhi_back', tau=smooth_background_tau, gamma=0,
                topic_names=background_names_level))
    # Decorrelate Phi columns on every level.
    if decorrelate_phi_tau is not None:
        for level in self.model:
            level.regularizers.add(artm.DecorrelatorPhiRegularizer(
                'DPhi', tau=decorrelate_phi_tau, gamma=0))
    # Hierarchy sparsing: skip the root level (self.model[1:]); one
    # regularizer per child topic that has a known parent-topic proportion.
    if (parent_topics_proportion is not None) and (spars_psi_tau is not None):
        for level, parent_topics_proportion_level in zip(self.model[1:],
                                                         parent_topics_proportion):
            for topic_name, parent_topic_proportion in parent_topics_proportion_level.items():
                level.regularizers.add(artm.HierarchySparsingThetaRegularizer(
                    name=f'HSTheta_{topic_name}', topic_names=topic_name,
                    tau=spars_psi_tau,
                    parent_topic_proportion=parent_topic_proportion))
    # Stored for later use by fit-related methods (not visible in this chunk).
    self.smooth_theta_fit = smooth_theta_fit
    self.num_collection_passes = num_collection_passes
    # Track top tokens for every modality that actually contributes weight.
    for level in self.model:
        for class_id, weight in class_ids.items():
            if weight > 0:
                level.scores.add(artm.TopTokensScore(name=f'TT_{class_id}',
                                                     class_id=class_id,
                                                     num_tokens=num_tokens))
def test_func():
    """End-to-end ARTM test on the UCI 'kos' collection.

    Fits a model offline for 15 passes, compares every tracked score value
    against hard-coded reference trajectories, exercises fit_online /
    model.info / get_phi / get_theta, then re-fits a second model to check
    relative coefficients for Phi-matrix regularizers.
    """
    # constants
    num_tokens = 11
    probability_mass_threshold = 0.9
    sp_reg_tau = -0.1
    decor_tau = 1.5e+5
    decor_rel_tau = 0.3
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    # Reference score trajectories, one value per collection pass.
    sp_zero_eps = 0.001
    sparsity_phi_value = [
        0.034, 0.064, 0.093, 0.120, 0.145, 0.170, 0.194, 0.220, 0.246,
        0.277, 0.312, 0.351, 0.390, 0.428, 0.464
    ]
    sparsity_phi_rel_value = [
        0.442, 0.444, 0.444, 0.446, 0.448, 0.449, 0.458, 0.468, 0.476,
        0.488, 0.501, 0.522, 0.574, 0.609, 0.670
    ]
    sparsity_theta_value = [0.0] * num_collection_passes
    perp_zero_eps = 2.0
    perplexity_value = [
        6873, 2590, 2685, 2578, 2603, 2552, 2536, 2481, 2419, 2331, 2235,
        2140, 2065, 2009, 1964
    ]
    perplexity_rel_value = [
        6873, 2667, 2458, 2323, 2150, 2265, 2015, 1967, 1807, 1747, 1713,
        1607, 1632, 1542, 1469
    ]
    top_zero_eps = 0.0001
    top_tokens_num_tokens = [num_tokens * num_topics] * num_collection_passes
    # Expected top tokens/weights for topic_0 after offline fitting.
    top_tokens_topic_0_tokens = [
        u'party', u'state', u'campaign', u'tax', u'political', u'republican',
        u'senate', u'candidate', u'democratic', u'court', u'president'
    ]
    top_tokens_topic_0_weights = [
        0.0209, 0.0104, 0.0094, 0.0084, 0.0068, 0.0067, 0.0065, 0.0058,
        0.0053, 0.0053, 0.0051
    ]
    ker_zero_eps = 0.02
    topic_kernel_topic_0_contrast = 0.96
    topic_kernel_topic_0_purity = 0.014
    topic_kernel_topic_0_size = 18.0
    topic_kernel_average_size = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13, 0.6, 1.6, 3.53, 7.15,
        12.6, 20.4, 29.06
    ]
    topic_kernel_average_contrast = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12, 0.31, 0.7, 0.96, 0.96,
        0.96, 0.96, 0.97
    ]
    topic_kernel_average_purity = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.015, 0.017, 0.02,
        0.03, 0.04, 0.05
    ]
    len_last_document_ids = 10
    try:
        # Vectorize the collection and build a dictionary from the batches.
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)
        # Model is created with the dictionary referenced by name.
        model = artm.ARTM(
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary=dictionary.name,
            cache_theta=True)
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=sp_reg_tau))
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=decor_tau))
        model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))
        model.scores.add(
            artm.TopicKernelScore(
                name='TopicKernelScore',
                probability_mass_threshold=probability_mass_threshold))
        model.scores.add(artm.ThetaSnippetScore(name='ThetaSnippetScore'))
        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)
        # Per-pass score trajectories must match the reference values.
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityThetaScore'].value[i] -
                       sparsity_theta_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perp_zero_eps
        for i in range(num_collection_passes):
            assert model.score_tracker['TopTokensScore'].num_tokens[
                i] == top_tokens_num_tokens[i]
        # Top tokens and weights of the first topic after the last pass.
        for i in range(num_tokens):
            assert model.score_tracker['TopTokensScore'].last_tokens[
                model.topic_names[0]][i] == top_tokens_topic_0_tokens[i]
            assert abs(model.score_tracker['TopTokensScore'].last_weights[
                model.topic_names[0]][i] -
                top_tokens_topic_0_weights[i]) < top_zero_eps
        # Kernel characteristics of the first topic and the per-pass averages.
        assert len(model.score_tracker['TopicKernelScore'].last_tokens[
            model.topic_names[0]]) > 0
        assert abs(topic_kernel_topic_0_contrast -
                   model.score_tracker['TopicKernelScore'].last_contrast[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_purity -
                   model.score_tracker['TopicKernelScore'].last_purity[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_size -
                   model.score_tracker['TopicKernelScore'].last_size[
                       model.topic_names[0]]) < ker_zero_eps
        for i in range(num_collection_passes):
            assert abs(
                model.score_tracker['TopicKernelScore'].average_size[i] -
                topic_kernel_average_size[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_contrast[i] -
                topic_kernel_average_contrast[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_purity[i] -
                topic_kernel_average_purity[i]) < ker_zero_eps
        # One online pass, then sanity-check the model info structure.
        model.fit_online(batch_vectorizer=batch_vectorizer)
        info = model.info
        assert info is not None
        assert len(info.config.topic_name) == num_topics
        assert len(info.score) >= len(model.score_tracker)
        assert len(info.regularizer) == len(model.regularizers.data)
        assert len(info.cache_entry) > 0
        temp = model.score_tracker['ThetaSnippetScore'].last_document_ids
        assert len_last_document_ids == len(temp)
        assert len(model.score_tracker['ThetaSnippetScore'].last_snippet[
            temp[0]]) == num_topics
        # Matrix shapes: Phi is (vocab x topics), Theta is (topics x docs).
        phi = model.get_phi()
        assert phi.shape == (vocab_size, num_topics)
        theta = model.get_theta()
        assert theta.shape == (num_topics, num_docs)
        assert model.library_version.count('.') == 2  # major.minor.patch
        # test relative coefficients for Phi matrix regularizers
        model = artm.ARTM(num_topics=num_topics, dictionary=dictionary.name,
                          cache_theta=False)
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_rel_tau))
        model.regularizers['DecorrelatorPhi'].gamma = 0.0
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_rel_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_rel_value[i]) < perp_zero_eps
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Check DecorrelatorPhiRegularizer with topic_names / topic_pairs.

    Builds a four-document Vowpal Wabbit collection, then fits the model in
    three stages — unrestricted decorrelation, per-pair coefficients for
    topics 0/1, and a single very large pair coefficient — comparing the
    resulting Phi values to reference rows after each stage.
    """
    topics_count = 5
    eps = 0.05
    work_dir = tempfile.mkdtemp()
    try:
        vw_path = os.path.join(work_dir, 'temp.vw.txt')
        with open(vw_path, 'w') as vw_file:
            vw_file.write('title_0 aaa:1 bbb:2 ccc:3\n')
            vw_file.write('title_1 aaa:1 bbb:2 ccc:3\n')
            vw_file.write('title_2 aaa:1 bbb:2 ccc:3\n')
            vw_file.write('title_3 aaa:1 bbb:2 ccc:3\n')
        vectorizer = artm.BatchVectorizer(data_path=vw_path,
                                          data_format='vowpal_wabbit',
                                          target_folder=work_dir)
        model = artm.ARTM(num_topics=topics_count,
                          dictionary=vectorizer.dictionary,
                          num_document_passes=1)
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DPR', tau=1))

        def _check_phi(expected_rows):
            # Element-wise comparison of the current Phi against references.
            for phi_row, expected_row in zip(model.get_phi().values.tolist(),
                                             expected_rows):
                for actual, expected in zip(phi_row, expected_row):
                    assert abs(actual - expected) < eps

        # Stage 1: plain decorrelation over all topics.
        model.fit_offline(batch_vectorizer=vectorizer,
                          num_collection_passes=1)
        _check_phi([
            [0.32, 0.95, 0.2, 0.55, 0.32],
            [0.33, 0.0, 0.68, 0.35, 0.63],
            [0.35, 0.05, 0.11, 0.1, 0.05],
        ])
        # Stage 2: restrict to the first two topics and set pair weights.
        model.regularizers['DPR'].topic_names = [
            model.topic_names[0], model.topic_names[1]
        ]
        model.regularizers['DPR'].topic_pairs = {
            model.topic_names[0]: {
                model.topic_names[1]: 100.0,
                model.topic_names[2]: 100.0
            }
        }
        model.fit_offline(batch_vectorizer=vectorizer,
                          num_collection_passes=1)
        _check_phi([
            [0.0, 0.94, 0.22, 0.58, 0.35],
            [0.0, 0.0, 0.63, 0.3, 0.58],
            [0.0, 0.06, 0.14, 0.12, 0.07],
        ])
        # Stage 3: a single, very large pair coefficient.
        model.regularizers['DPR'].topic_pairs = {
            model.topic_names[1]: {
                model.topic_names[0]: 10000.0
            }
        }
        model.fit_offline(batch_vectorizer=vectorizer,
                          num_collection_passes=1)
        _check_phi([
            [0.0, 0.91, 0.21, 0.54, 0.35],
            [0.0, 0.0, 0.55, 0.26, 0.53],
            [0.0, 0.08, 0.24, 0.20, 0.12],
        ])
    finally:
        shutil.rmtree(work_dir)
def test_func():
    """Check DecorrelatorPhiRegularizer via pandas DataFrame comparison.

    Same three-stage scenario as the list-based variant: fit on a tiny
    Vowpal Wabbit collection, reconfigure the regularizer's topic_names and
    topic_pairs between fits, and compare Phi against reference frames.
    """
    topics_count = 5
    eps = 0.01
    work_dir = tempfile.mkdtemp()
    try:
        vw_path = os.path.join(work_dir, 'temp.vw.txt')
        with open(vw_path, 'w') as vw_file:
            vw_file.write('title_0 aaa:1 bbb:2 ccc:3\n')
            vw_file.write('title_1 aaa:1 bbb:2 ccc:3\n')
            vw_file.write('title_2 aaa:1 bbb:2 ccc:3\n')
            vw_file.write('title_3 aaa:1 bbb:2 ccc:3\n')
        vectorizer = artm.BatchVectorizer(data_path=vw_path,
                                          data_format='vowpal_wabbit',
                                          target_folder=work_dir)
        model = artm.ARTM(num_topics=topics_count,
                          dictionary=vectorizer.dictionary,
                          num_document_passes=1)
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DPR', tau=1))

        def _tok(word):
            # Phi rows are indexed by (modality, token) pairs.
            return ('@default_class', word)

        def _expected(rows):
            return pd.DataFrame(
                columns=['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4'],
                index=[_tok('ccc'), _tok('bbb'), _tok('aaa')],
                data=rows)

        def _check_phi(rows):
            assert (model.get_phi() - _expected(rows)).abs().values.max() < eps

        # Stage 1: plain decorrelation over all topics.
        model.fit_offline(batch_vectorizer=vectorizer,
                          num_collection_passes=1)
        _check_phi([[0.32, 0.95, 0.2, 0.55, 0.32],
                    [0.33, 0.0, 0.68, 0.35, 0.63],
                    [0.35, 0.05, 0.12, 0.1, 0.05]])
        # Stage 2: restrict to the first two topics and set pair weights.
        model.regularizers['DPR'].topic_names = [
            model.topic_names[0], model.topic_names[1]
        ]
        model.regularizers['DPR'].topic_pairs = {
            model.topic_names[0]: {
                model.topic_names[1]: 100.0,
                model.topic_names[2]: 100.0
            }
        }
        model.fit_offline(batch_vectorizer=vectorizer,
                          num_collection_passes=1)
        _check_phi([[0.0, 0.94, 0.22, 0.58, 0.35],
                    [0.0, 0.0, 0.63, 0.3, 0.58],
                    [0.0, 0.06, 0.15, 0.12, 0.07]])
        # Stage 3: a single, very large pair coefficient.
        model.regularizers['DPR'].topic_pairs = {
            model.topic_names[1]: {
                model.topic_names[0]: 10000.0
            }
        }
        model.fit_offline(batch_vectorizer=vectorizer,
                          num_collection_passes=1)
        _check_phi([[0.0, 0.91, 0.21, 0.54, 0.35],
                    [0.0, 0.0, 0.55, 0.26, 0.53],
                    [0.0, 0.09, 0.24, 0.20, 0.12]])
    finally:
        shutil.rmtree(work_dir)
def test_func():
    """Dump-and-reload round-trip test for two ARTM configurations.

    For each of two differently-configured models: fit, dump to disc, reload
    with load_artm_model, check the restored model matches the original (via
    the _assert_*_equality helpers defined elsewhere in this file), continue
    fitting both, and re-check equality.
    """
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    dump_folder = tempfile.mkdtemp()
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        # Two configurations: theta caching/reuse on vs. off; model_1 also
        # names its theta matrix explicitly.
        model_1 = artm.ARTM(num_processors=7,
                            cache_theta=True,
                            num_document_passes=5,
                            reuse_theta=True,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            theta_name='THETA',
                            dictionary=batch_vectorizer.dictionary)
        model_2 = artm.ARTM(num_processors=7,
                            cache_theta=False,
                            num_document_passes=5,
                            reuse_theta=False,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            dictionary=batch_vectorizer.dictionary)
        for model in [model_1, model_2]:
            # Attach a representative set of scores so the dump carries them.
            model.scores.add(
                artm.PerplexityScore(name='perp',
                                     dictionary=batch_vectorizer.dictionary))
            model.scores.add(artm.SparsityThetaScore(name='sp_theta', eps=0.1))
            model.scores.add(artm.TopTokensScore(name='top_tok', num_tokens=10))
            model.scores.add(
                artm.SparsityPhiScore(name='sp_nwt', model_name=model.model_nwt))
            model.scores.add(
                artm.TopicKernelScore(name='kernel',
                                      topic_names=model.topic_names[0:5],
                                      probability_mass_threshold=0.4))
            # Random integer pair weights in [0, 3) for every topic pair.
            topic_pairs = {}
            for topic_name_1 in model.topic_names:
                for topic_name_2 in model.topic_names:
                    if topic_name_1 not in topic_pairs:
                        topic_pairs[topic_name_1] = {}
                    topic_pairs[topic_name_1][
                        topic_name_2] = numpy.random.randint(0, 3)
            model.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='decor',
                                                tau=100000.0,
                                                topic_pairs=topic_pairs))
            model.regularizers.add(
                artm.SmoothSparsePhiRegularizer(
                    name='smsp_phi',
                    tau=-0.5,
                    gamma=0.3,
                    dictionary=batch_vectorizer.dictionary))
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name='smsp_theta',
                                                  tau=0.1,
                                                  doc_topic_coef=[2.0] *
                                                  model.num_topics))
            model.regularizers.add(
                artm.SmoothPtdwRegularizer(name='sm_ptdw', tau=0.1))
            # learn first model and dump it on disc
            model.fit_offline(batch_vectorizer, num_collection_passes=10)
            model.fit_online(batch_vectorizer, update_every=1)
            model.dump_artm_model(os.path.join(dump_folder, 'target'))
            params = {}
            with open(os.path.join(dump_folder, 'target', 'parameters.json'),
                      'r') as fin:
                params = json.load(fin)
            _assert_json_params(params)
            # create second model from the dump and check the results are equal
            model_new = artm.load_artm_model(
                os.path.join(dump_folder, 'target'))
            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)
            # continue learning of both models
            model.fit_offline(batch_vectorizer, num_collection_passes=3)
            model.fit_online(batch_vectorizer, update_every=1)
            model_new.fit_offline(batch_vectorizer, num_collection_passes=3)
            model_new.fit_online(batch_vectorizer, update_every=1)
            # check new results are also equal
            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)
            # Remove the dump so the next model in the loop can reuse the path.
            shutil.rmtree(os.path.join(dump_folder, 'target'))
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(dump_folder)