def generate_sparse_regularizers(
        specific_topic_names, background_topic_names,
        class_ids_for_bcg_smoothing=MAIN_MODALITY,
        specific_words_classes=MAIN_MODALITY):
    """
    Creates an array of pre-configured regularizers using specified coefficients
    """
    gimel_smooth_specific = 3e-10
    gimel_smooth_bcg = 0.3
    regularizers = [
        artm.SmoothSparsePhiRegularizer(
            tau=gimel_smooth_specific,
            name='smooth_phi_specific',
            topic_names=specific_topic_names,
            class_ids=specific_words_classes
        ),
        artm.SmoothSparseThetaRegularizer(
            tau=gimel_smooth_specific,
            name='smooth_theta_specific',
            topic_names=specific_topic_names
        ),
        artm.SmoothSparseThetaRegularizer(
            tau=gimel_smooth_bcg,
            name='smooth_theta_background',
            topic_names=background_topic_names
        ),
        artm.SmoothSparsePhiRegularizer(
            tau=gimel_smooth_bcg,
            name='smooth_phi_background',
            topic_names=background_topic_names,
            class_ids=class_ids_for_bcg_smoothing
        ),
    ]
    return regularizers
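# Usage sketch (not part of the original snippet): attach the returned
# regularizers to an existing artm.ARTM model. The topic names and the
# `model` object here are assumptions for illustration only.
specific = ['topic_{}'.format(i) for i in range(18)]
background = ['background_18', 'background_19']
for reg in generate_sparse_regularizers(specific, background):
    model.regularizers.add(reg)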
def create_model_fn_20_complex_reg_1(n_iteration):
    n_topics = 20
    common_topics = [u'topic_0', u'topic_1']
    subject_topics = list(
        set([u'topic_{}'.format(idx) for idx in range(2, 20)]) -
        set(common_topics))
    tmp_model = create_model_complex(current_dictionary=dictionary,
                                     n_topics=n_topics, n_doc_passes=5,
                                     seed_value=100 + n_iteration,
                                     n_top_tokens=15, p_mass_threshold=0.25,
                                     common_topics=common_topics,
                                     subject_topics=subject_topics)
    # subject topics: sparse and decorrelate
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer_subject',
                                          topic_names=subject_topics))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer_subject',
                                        topic_names=subject_topics,
                                        class_ids=['@default_class']))
    tmp_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='decorrelator_phi_regularizer_subject',
            topic_names=subject_topics, class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer_subject'].tau = -0.5
    tmp_model.regularizers['ss_phi_regularizer_subject'].tau = -0.5
    tmp_model.regularizers['decorrelator_phi_regularizer_subject'].tau = -10
    # common topics: smooth
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer_common',
                                          topic_names=common_topics))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer_common',
                                        topic_names=common_topics,
                                        class_ids=['@default_class']))
    # tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(
    #     name='decorrelator_phi_regularizer_common',
    #     topic_names=common_topics, class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer_common'].tau = 0.5
    tmp_model.regularizers['ss_phi_regularizer_common'].tau = 0.5
    # tmp_model.regularizers['decorrelator_phi_regularizer_common'].tau = -10
    tmp_model = fit_one_model_complex(
        plot_maker, batch_vectorizer, models_file, config, tmp_model,
        _n_iterations=20,
        _model_name='model_20_complex_reg_1_iter_{}'.format(n_iteration))
    return tmp_model
def create_model_fn_4(n_iteration):
    tmp_model = cmh.create_model(current_dictionary=dictionary, n_topics=100,
                                 n_doc_passes=5, seed_value=100 + n_iteration,
                                 n_top_tokens=15, p_mass_threshold=0.25)
    tmp_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
    tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
    tmp_model.regularizers['ss_phi_regularizer'].tau = -2
    tmp_model = cmh.fit_one_model(
        plot_maker, batch_vectorizer, models_file, config, tmp_model,
        _n_iterations=20,
        _model_name='model_20_m4_iter_{}'.format(n_iteration))
    return tmp_model
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """
    Define the ARTM model.

    :param n_topics: number of topics.
    :param dictionary: batch vectorizer dictionary.
    :param sparse_theta: sparse theta parameter.
    :param sparse_phi: sparse phi parameter.
    :param decorrelator_phi: decorrelator phi parameter.
    :return: ARTM model.
    """
    print("Defining the model.")
    topic_names = ["topic_{}".format(i) for i in range(1, n_topics + 1)]
    model_artm = artm.ARTM(
        topic_names=topic_names,
        cache_theta=True,
        scores=[
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name="SparsityPhiScore"),
            artm.SparsityThetaScore(name="SparsityThetaScore"),
            artm.TopicKernelScore(name="TopicKernelScore",
                                  probability_mass_threshold=0.3),
            artm.TopTokensScore(name="TopTokensScore", num_tokens=15)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name="SparseTheta",
                                              tau=sparse_theta),
            artm.SmoothSparsePhiRegularizer(name="SparsePhi",
                                            tau=sparse_phi),
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                            tau=decorrelator_phi)
        ])
    return model_artm
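# Hypothetical usage of define_model: `dictionary` and `batch_vectorizer`
# are assumed to be built elsewhere (e.g. from a vowpal_wabbit file).
# define_model does not call initialize(), so the caller must do it.
model = define_model(n_topics=20, dictionary=dictionary,
                     sparse_theta=-0.5, sparse_phi=-0.1,
                     decorrelator_phi=1.5e5)
model.initialize(dictionary=dictionary)
model.fit_offline(batch_vectorizer=batch_vectorizer,
                  num_collection_passes=10)
print(model.score_tracker['PerplexityScore'].last_value)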
def init_model(self, dictionary_path=None):
    """dictionary_path: optional, used with pretrained model"""
    self.dictionary = artm.Dictionary()
    if dictionary_path is None:
        self.dictionary.gather(data_path=self.batches_path)
        self.dictionary.filter(min_tf=10, max_df_rate=0.1)
        self.dictionary.save_text(
            f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")
    else:
        self.dictionary.load_text(dictionary_path)

    self.model = artm.ARTM(
        num_topics=self.n_topics,
        dictionary=self.dictionary,
        show_progress_bars=True,
    )
    # scores
    self.model.scores.add(
        artm.PerplexityScore(name="PerplexityScore",
                             dictionary=self.dictionary))
    self.model.scores.add(
        artm.SparsityThetaScore(name="SparsityThetaScore"))
    self.model.scores.add(artm.SparsityPhiScore(name="SparsityPhiScore"))
    # regularizers
    self.model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1))
    self.model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5))
    self.model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5))
def test_fancy_fit_is_ok(experiment_enviroment):
    tm, dataset, experiment, dictionary = experiment_enviroment

    model_artm = artm.ARTM(
        num_topics=5, num_document_passes=1, dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
        theta_columns_naming='title',
        class_ids={
            MAIN_MODALITY: 1, NGRAM_MODALITY: 1,
            EXTRA_MODALITY: 1, '@psyduck': 42
        },
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='smooth_theta', tau=10.0),
        ])
    custom_scores = {'mean_kernel_size': ScoreExample()}
    tm = TopicModel(model_artm, model_id='absolutely_new_id',
                    custom_scores=custom_scores)

    num_iterations = 10
    tm._fit(dataset.get_batch_vectorizer(), num_iterations)
    params = tm.get_jsonable_from_parameters()
    assert "smooth_theta" in params["regularizers"]

    PATH = "tests/experiments/save_standalone/"
    tm.save(PATH)
    tm2 = TopicModel.load(PATH)
    assert (tm.get_phi() == tm2.get_phi()).all().all()
def init_hierarchical_model(class_ids):
    # modalities are 'body', 'bigrams' and 'categories'; the original used
    # inconsistent class ids ('words', 'bigram') in some scores/regularizers
    score = [artm.PerplexityScore(name='perplexity_words',
                                  class_ids=['body']),
             artm.PerplexityScore(name='perplexity_bigrams',
                                  class_ids=['bigrams'])]

    top_tokens = [artm.TopTokensScore(name='top_words', num_tokens=15,
                                      class_id='body'),
                  artm.TopTokensScore(name='top_bigrams', num_tokens=10,
                                      class_id='bigrams')]

    sparsity = [artm.SparsityThetaScore(name='sparsity_theta', eps=1e-6),
                artm.SparsityPhiScore(name='sparsity_phi_words',
                                      class_id='body', eps=1e-6),
                artm.SparsityPhiScore(name='sparsity_phi_bigrams',
                                      class_id='bigrams', eps=1e-6)]

    regularizers = [artm.DecorrelatorPhiRegularizer(tau=0,
                                                    class_ids=['body'],
                                                    name='decorr_words'),
                    artm.DecorrelatorPhiRegularizer(tau=0,
                                                    class_ids=['bigrams'],
                                                    name='decorr_bigrams'),
                    artm.DecorrelatorPhiRegularizer(tau=0,
                                                    class_ids=['categories'],
                                                    name='decorr_categories'),
                    artm.SmoothSparseThetaRegularizer(tau=0,
                                                      name='sparsity_theta'),
                    artm.SmoothSparsePhiRegularizer(tau=0,
                                                    class_ids=['body'],
                                                    name='sparsity_words'),
                    artm.SmoothSparsePhiRegularizer(tau=0,
                                                    class_ids=['bigrams'],
                                                    name='sparsity_bigrams')]

    hmodel = artm.hARTM(class_ids=class_ids,
                        cache_theta=True,
                        reuse_theta=True,
                        scores=score + top_tokens + sparsity,
                        regularizers=regularizers,
                        theta_columns_naming='title')
    return hmodel
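# Sketch of driving the hARTM object returned above. The class weights,
# dictionary and batch_vectorizer are assumptions; only the zeroth level is
# trained here, via the standard artm.hARTM add_level API.
hmodel = init_hierarchical_model({'body': 1.0, 'bigrams': 1.0,
                                  'categories': 5.0})
level0 = hmodel.add_level(num_topics=10)
level0.initialize(dictionary=dictionary)
level0.fit_offline(batch_vectorizer=batch_vectorizer,
                   num_collection_passes=10)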
def fit():
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)
    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']

    term_to_id = {}
    all_terms = []

    # build a BigARTM batch from the incoming documents
    batch = artm.messages.Batch()
    batch.id = batch_id
    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)

    for t in all_terms:
        batch.token.append(t)

    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())
    app.logger.info("batch %s is created", batch_id)

    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)
    model_artm = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in range(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore',
                                 dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)
    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")
    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    app.logger.info("model was fitted")
    model_artm.save(os.path.join(batch_id, "model"))
    return jsonify({"id": batch_id})
def train(self, batch_vectorizer):
    if self.model is None:
        print('Initialise the model first!')
        return

    self.model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorr',
                                        topic_names=self.specific,
                                        tau=self.decor))
    # self.model.regularizers.add(
    #     artm.DecorrelatorPhiRegularizer(name='decorr_2',
    #                                     topic_names=self.back,
    #                                     tau=self.decor_2))
    self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=self.n1)

    # if ((self.n2 != 0) and (self.B != 0)):
    if (self.B != 0):
        # smooth Phi and Theta on the background topics
        self.model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SmoothPhi',
                                            topic_names=self.back,
                                            tau=self.spb))
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SmoothTheta',
                                              topic_names=self.back,
                                              tau=self.stb))
        self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=self.n2)

    # sparse Phi and Theta on the specific topics
    self.model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                        topic_names=self.specific,
                                        tau=self.sp1))
    self.model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                          topic_names=self.specific,
                                          tau=self.st1))
    self.model.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=self.n3)

    # if (self.n4 != 0):
    #     self.model.regularizers['SparsePhi'].tau = self.sp2
    #     self.model.regularizers['SparseTheta'].tau = self.st2
    #     self.model.fit_offline(batch_vectorizer=batch_vectorizer,
    #                            num_collection_passes=self.n4)
    print('Training is complete')
def compute_big_artm(num_topics, tau, dictionary,
                     batch_vectorizer, score_computer):
    artm_model = artm.ARTM(
        num_topics=num_topics,
        num_document_passes=5,
        dictionary=dictionary,
        scores=[artm.PerplexityScore(name='s1')],
        regularizers=[artm.SmoothSparseThetaRegularizer(name='r1', tau=tau)],
        cache_theta=True)
    artm_model.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=10)
    theta_bigartm = artm_model.get_theta()
    bigartm_predicts = get_df_clusters_predicted(theta_bigartm, url_list)
    score = score_computer.compute_score(
        bigartm_predicts["story_id_predicted"])
    logging.info("num_topics={}, tau={}, "
                 "bigARTM score = {}".format(num_topics, tau, score))
def create_topic_model(self, topic_model_name: str,
                       batch_vectorizer: artm.BatchVectorizer,
                       dictionary: artm.Dictionary) -> artm.ARTM:
    topic_model = artm.ARTM(num_topics=self.number_of_topics,
                            dictionary=dictionary, cache_theta=False)
    topic_model.scores.add(
        artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    topic_model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score'))
    topic_model.scores.add(
        artm.SparsityThetaScore(name='sparsity_theta_score'))
    topic_model.num_document_passes = 5
    topic_model.num_processors = max(1, os.cpu_count() - 1)
    topic_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    topic_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    topic_model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
    topic_model.regularizers['sparse_phi_regularizer'].tau = -1.0
    topic_model.regularizers['sparse_theta_regularizer'].tau = -0.5
    topic_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+5
    best_score = None
    keyword_extraction_logger.info(
        'epoch perplexity_score sparsity_phi_score sparsity_theta_score')
    for restart_index in range(10):
        topic_model.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=3)
        if best_score is None:
            best_score = topic_model.score_tracker[
                'perplexity_score'].last_value
        else:
            if best_score > topic_model.score_tracker[
                    'perplexity_score'].last_value:
                best_score = topic_model.score_tracker[
                    'perplexity_score'].last_value
                self.save_topic_model(topic_model, topic_model_name)
        keyword_extraction_logger.info(
            '{0:5} {1:16.9} {2:18.9} {3:20.9}'.format(
                (restart_index + 1) * 3,
                topic_model.score_tracker['perplexity_score'].last_value,
                topic_model.score_tracker['sparsity_phi_score'].last_value,
                topic_model.score_tracker['sparsity_theta_score'].last_value))
    del topic_model
    return self.load_topic_model(
        artm.ARTM(num_topics=self.number_of_topics, dictionary=dictionary,
                  cache_theta=False),
        topic_model_name)
def fit_supervised(self, model, X, y):
    len_y = len(y)
    topic_names = model.topic_names
    # one-hot matrix: each labeled document gets weight 1.0 on its label topic
    doc_topic_coef = np.zeros((len_y, model.num_topics))
    doc_topic_coef[range(len_y),
                   [topic_names.index(topic_name) for topic_name in y]] = 1.0
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SST',
            tau=self.smooth_theta_fit,
            doc_titles=y.index,
            doc_topic_coef=doc_topic_coef.tolist()))
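# Toy illustration of the one-hot doc_topic_coef matrix built above
# (a sketch assuming numpy/pandas and three topics named topic_0..topic_2):
import numpy as np
import pandas as pd

topic_names = ['topic_0', 'topic_1', 'topic_2']
y = pd.Series(['topic_2', 'topic_0'], index=['doc_a', 'doc_b'])
coef = np.zeros((len(y), len(topic_names)))
coef[range(len(y)), [topic_names.index(t) for t in y]] = 1.0
# coef == [[0, 0, 1], [1, 0, 0]]: each labeled document is pinned to its
# label topic, and SmoothSparseThetaRegularizer then smooths exactly those
# (document, topic) cells of Theta.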
def _get_corpus_model(self, corpus_vector_spaced,
                      clustering_method='artm'):
    if 'gensim' == clustering_method:
        return self._get_model_LSI(corpus_vector_spaced)
    elif 'sklearn' == clustering_method:
        return self._get_model_LDA(corpus_vector_spaced)
    elif 'artm' == clustering_method:
        batch_vectorizer = corpus_vector_spaced['batch_vectorizer']
        dictionary = corpus_vector_spaced['dictionary']

        topic_names = [
            'topic_{}'.format(i) for i in range(self.num_of_clusters)
        ]

        model_artm = artm.ARTM(
            topic_names=topic_names,
            cache_theta=True,
            scores=[
                artm.PerplexityScore(name='PerplexityScore',
                                     dictionary=dictionary)
            ],
            regularizers=[
                artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                  tau=-0.15)
            ])

        model_artm.scores.add(
            artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(
            artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(
            artm.TopicKernelScore(name='TopicKernelScore',
                                  probability_mass_threshold=0.3))
        model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                                  num_tokens=10),
                              overwrite=True)

        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
        model_artm.regularizers['SparseTheta'].tau = -0.2
        model_artm.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=1.5e+5))

        model_artm.num_document_passes = 1
        model_artm.initialize(dictionary)
        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=30)

        return model_artm.transform(batch_vectorizer=batch_vectorizer).T
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename,
                                            data_format='vowpal_wabbit',
                                            target_folder='batches')
    dictionary = batch_vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary,
                           cache_theta=True)
    model_plsa = artm.ARTM(
        topic_names=topic_names, cache_theta=True,
        scores=[artm.PerplexityScore(name='PerplexityScore',
                                     dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score',
                                              num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(
        name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(
        name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(
        name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(
        name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(
        name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(
        name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
def create_thematic_model(checked_list, num_topics, num_tokens,
                          phi_tau, theta_tau, decorr_tau):
    """ Create a thematic model """
    gluing_bag_of_words(checked_list)

    batch_vectorizer = artm.BatchVectorizer(data_path=COLLECTION_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER,
                                            batch_size=len(checked_list))
    dictionary = artm.Dictionary(data_path=TARGET_FOLDER)
    model = artm.ARTM(
        num_topics=num_topics,
        num_document_passes=len(checked_list),
        dictionary=dictionary,
        regularizers=[
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                            tau=phi_tau),
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                              tau=theta_tau),
            artm.DecorrelatorPhiRegularizer(
                name='decorrelator_phi_regularizer', tau=decorr_tau),
        ],
        scores=[
            artm.PerplexityScore(name='perplexity_score',
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            artm.TopTokensScore(name='top_tokens_score',
                                num_tokens=num_tokens)
        ])

    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=len(checked_list))

    top_tokens = model.score_tracker['top_tokens_score']
    topic_dictionary = OrderedDict()
    for topic_name in model.topic_names:
        list_name = []
        for (token, weight) in zip(top_tokens.last_tokens[topic_name],
                                   top_tokens.last_weights[topic_name]):
            list_name.append(token + '-' + str(round(weight, 3)))
        topic_dictionary[str(topic_name)] = list_name

    return (model.score_tracker['perplexity_score'].last_value,
            model.score_tracker['sparsity_phi_score'].last_value,
            model.score_tracker['sparsity_theta_score'].last_value,
            topic_dictionary)
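# Hypothetical call of create_thematic_model; COLLECTION_PATH / TARGET_FOLDER
# and the contents of `documents` come from the surrounding module.
perplexity, phi_sparsity, theta_sparsity, topics = create_thematic_model(
    checked_list=documents, num_topics=10, num_tokens=10,
    phi_tau=-0.1, theta_tau=-0.15, decorr_tau=1e5)
print(perplexity, phi_sparsity, theta_sparsity)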
def topic_model(class_ids, dictionary, num_of_topics, num_back, tau, tf):
    names_of_topics = [str(x) for x in range(num_of_topics)]
    dictionary.filter(min_tf=tf, class_id='subjects')
    dictionary.filter(min_tf=tf, class_id='objects')
    dictionary.filter(min_tf=tf, class_id='pairs')
    model = artm.ARTM(
        num_topics=num_of_topics,
        # reuse_theta=True,
        cache_theta=True,
        topic_names=names_of_topics,
        class_ids=class_ids,
        # regularizers=regularizers_artm,
        dictionary=dictionary)
    model.scores.add(
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))
    model.scores.add(
        artm.SparsityPhiScore(name='SparsityPhiScore',
                              topic_names=model.topic_names[:-num_back]))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SparsePhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=-tau))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SmoothPhiRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[-num_back:],
            tau=tau))
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorRegularizer',
            class_ids=class_ids,
            topic_names=model.topic_names[:-num_back],
            tau=tau))
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(
            name='SparseThetaRegularizer',
            topic_names=model.topic_names[-num_back:],
            tau=tau))
    return model
def create_model_fn_20_reg_1(n_iteration):
    tmp_model = create_model(current_dictionary=dictionary, n_topics=20,
                             n_doc_passes=5, seed_value=100 + n_iteration,
                             n_top_tokens=15, p_mass_threshold=0.25)
    tmp_model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
    tmp_model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer',
                                        class_ids=['@default_class']))
    tmp_model.regularizers['ss_theta_regularizer'].tau = -3
    tmp_model.regularizers['ss_phi_regularizer'].tau = -3
    tmp_model = fit_one_model(
        tmp_model, _n_iterations=20,
        _model_name='model_20_reg_1_iter_{}'.format(n_iteration))
    return tmp_model
def pipeline_plsa_bigartm(lines, TOPIC_NUMBER, ngram_range, topnwords,
                          LOGS_DATA_PATH="plsa.txt", TARGET_FOLDER="plsa"):
    make_file(lines, ngram_range, LOGS_DATA_PATH)
    batch_vectorizer = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER)
    model_artm = artm.ARTM(num_topics=TOPIC_NUMBER, cache_theta=True)
    model_artm.initialize(dictionary=batch_vectorizer.dictionary)
    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=0.05))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
    model_artm.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.01))
    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                              num_tokens=topnwords),
                          overwrite=True)
    model_artm.scores.add(
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary=batch_vectorizer.dictionary))
    model_artm.num_document_passes = 2
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=15)
    topic_names = {}
    for topic_name in model_artm.topic_names:
        topic_names[topic_name] = model_artm.score_tracker[
            'TopTokensScore'].last_tokens[topic_name]
    # return label_after_bigarm(model_artm), topic_names
    return "nothing, sorry", topic_names
def init_simple_default_model(
    dataset, modalities_to_use, main_modality,
    specific_topics, background_topics,
):
    """
    Creates a simple artm model with standard scores.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str
    main_modality : str
    specific_topics : list or int
    background_topics : list or int

    Returns
    -------
    model: artm.ARTM() instance
    """
    if isinstance(specific_topics, list):
        specific_topic_names = list(specific_topics)
    else:
        specific_topics = int(specific_topics)
        specific_topic_names = [f'topic_{i}' for i in range(specific_topics)]
    n_specific_topics = len(specific_topic_names)

    if isinstance(background_topics, list):
        background_topic_names = list(background_topics)
    else:
        background_topics = int(background_topics)
        background_topic_names = [
            f'background_{n_specific_topics + i}'
            for i in range(background_topics)
        ]
    n_background_topics = len(background_topic_names)

    dictionary = dataset.get_dictionary()
    baseline_class_ids = {class_id: 1 for class_id in modalities_to_use}
    tokens_data = count_vocab_size(dictionary, modalities_to_use)
    abs_weights = modality_weight_rel2abs(tokens_data, baseline_class_ids,
                                          main_modality)

    model = init_model(
        topic_names=specific_topic_names + background_topic_names,
        class_ids=abs_weights,
    )

    if n_background_topics > 0:
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(
                name='smooth_phi_bcg',
                topic_names=background_topic_names,
                tau=0.0,
                class_ids=[main_modality],
            ),
        )
        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='smooth_theta_bcg',
                topic_names=background_topic_names,
                tau=0.0,
            ),
        )

    model.initialize(dictionary)
    add_standard_scores(model, dictionary, main_modality=main_modality,
                        all_modalities=modalities_to_use)

    return model
def test_func():
    num_topics = 5
    batches_folder = tempfile.mkdtemp()
    try:
        with open(os.path.join(batches_folder, 'temp.vw.txt'), 'w') as fout:
            fout.write('title_0 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_1 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_2 aaa:1 bbb:2 ccc:3\n')
            fout.write('title_3 aaa:1 bbb:2 ccc:3\n')

        batch_vectorizer = artm.BatchVectorizer(
            data_path=os.path.join(batches_folder, 'temp.vw.txt'),
            data_format='vowpal_wabbit',
            target_folder=batches_folder)
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=batch_vectorizer.dictionary,
                          num_document_passes=1,
                          cache_theta=True,
                          theta_columns_naming='title')

        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='SST', tau=-1000.0, doc_titles=['title_0', 'title_2']))
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        theta = model.get_theta()
        real_values = [
            [0.0, 0.14, 0.0, 0.14],
            [0.0, 0.25, 0.0, 0.25],
            [0.0, 0.19, 0.0, 0.19],
            [0.0, 0.21, 0.0, 0.21],
            [0.0, 0.21, 0.0, 0.21],
        ]
        for elems, values in zip(theta.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < 0.01

        model.initialize(dictionary=batch_vectorizer.dictionary)
        model.regularizers['SST'].doc_titles = [
            'title_0', 'title_2', 'title_1'
        ]
        model.regularizers['SST'].doc_topic_coef = [0.0, 1.0, 1.0, 0.0, 0.0]
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        theta = model.get_theta()
        real_values = [
            [0.26, 0.26, 0.26, 0.14],
            [0.0, 0.0, 0.0, 0.25],
            [0.0, 0.0, 0.0, 0.19],
            [0.36, 0.36, 0.36, 0.21],
            [0.38, 0.38, 0.38, 0.21],
        ]
        for elems, values in zip(theta.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < 0.01

        model.initialize(dictionary=batch_vectorizer.dictionary)
        model.regularizers['SST'].doc_titles = ['title_0', 'title_3']
        model.regularizers['SST'].doc_topic_coef = [
            [-1.0, 1.0, 0.0, 0.0, -1.0],
            [0.0, 1.0, 0.0, -1.0, 0.0]]
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        theta = model.get_theta()
        real_values = [
            [0.499311, 0.146202, 0.146202, 0.000873],
            [0.0, 0.247351, 0.247351, 0.0],
            [0.000556, 0.185883, 0.185883, 0.001110],
            [0.000617, 0.206015, 0.206015, 0.996735],
            [0.499516, 0.214550, 0.214550, 0.001282],
        ]
        for elems, values in zip(theta.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < 0.000001

        model.initialize(dictionary=batch_vectorizer.dictionary)
        model.regularizers['SST'].doc_titles = []
        model.regularizers['SST'].doc_topic_coef = [0.0, 1.0, 1.0, 0.0, 0.0]
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=1)

        theta = model.get_theta()
        real_values = [
            [0.26, 0.26, 0.26, 0.26],
            [0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0],
            [0.36, 0.36, 0.36, 0.36],
            [0.38, 0.38, 0.38, 0.38],
        ]
        for elems, values in zip(theta.values.tolist(), real_values):
            for e, v in zip(elems, values):
                assert abs(e - v) < 0.01
    finally:
        shutil.rmtree(batches_folder)
model_artm.scores.add(artm.TopicKernelScore(
    name='topic_kernel_score', probability_mass_threshold=0.3))
model_artm.scores.add(artm.BackgroundTokensRatioScore(
    name='background_tokens_ratio_score'))
model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score',
                                           dictionary=dictionary))
model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
model_plsa.scores.add(artm.TopicKernelScore(
    name='topic_kernel_score', probability_mass_threshold=0.3))
model_plsa.scores.add(artm.BackgroundTokensRatioScore(
    name='background_tokens_ratio_score'))
model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

model_artm.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
model_artm.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_artm.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
model_artm.regularizers['sparse_phi_regularizer'].tau = 0.01
model_artm.regularizers['sparse_theta_regularizer'].tau = -1.06
# model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+5

model_plsa.initialize(dictionary=dictionary)
model_artm.initialize(dictionary=dictionary)
model_lda.initialize(dictionary=dictionary)

passes = 10
model_plsa.fit_offline(batch_vectorizer=batch_vectorizer,
                       num_collection_passes=passes)
model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                       num_collection_passes=passes)
model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=passes)
                                            data_format='bow_uci',
                                            collection_name=filename,
                                            target_folder=filename)
    else:
        batch_vectorizer = artm.BatchVectorizer(data_path=filename,
                                                data_format='batches')
    dictionary = artm.Dictionary()
    model_artm = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in range(15)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore',
                                 dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        cache_theta=True)
    if not os.path.isfile(filename + '/dictionary.dict'):
        dictionary.gather(data_path=batch_vectorizer.data_path)
        dictionary.save(dictionary_path=filename + '/dictionary.dict')
    dictionary.load(dictionary_path=(filename + '/dictionary.dict'))
    model_artm.initialize(dictionary=dictionary)
    model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_artm.scores.add(
def make_artm(col):
    """Get artm theta matrices"""
    collection_train = pd.DataFrame(collection).iloc[train_index].reset_index()
    collection_test = pd.DataFrame(collection).iloc[test_index].reset_index()
    le = LabelEncoder()
    y_transformed = le.fit_transform(df_y)

    # serialize train/test documents in vowpal_wabbit format
    arr = []
    for index_number, i, c in zip(collection_train['index'],
                                  collection_train[0], y_transformed):
        arr.append(
            str(index_number) + ' |@default_class ' + str(i) +
            ' |@labels_class ' + str(c))
    arr_test = []
    for index_number, i in zip(collection_test['index'], collection_test[0]):
        arr_test.append(str(index_number) + ' |@default_class ' + str(i))
    pd.DataFrame(arr, index=None).to_csv('leaver_vw_form.txt', sep='\t',
                                         encoding='UTF-8', index=False,
                                         header=None)
    pd.DataFrame(arr_test, index=None).to_csv('leaver_vw_form_test.txt',
                                              sep='\t', encoding='UTF-8',
                                              index=False, header=None)

    batch_vectorizer = artm.BatchVectorizer(
        data_path="leaver_vw_form.txt",
        data_format="vowpal_wabbit",
        target_folder="leaver_vw_form_train",
        batch_size=100)
    batch_vectorizer_test = artm.BatchVectorizer(
        data_path="leaver_vw_form_test.txt",
        data_format="vowpal_wabbit",
        target_folder="leaver_vw_form_test",
        batch_size=100)

    T = pd.DataFrame(df_y)[u'Процесс'].nunique()
    print("number of topics: {}".format(T))
    topic_names = ["sbj" + str(i) for i in range(T)]
    model_artm = artm.ARTM(num_topics=T,
                           topic_names=topic_names,
                           class_ids={
                               '@default_class': 1,
                               '@labels_class': 700
                           },
                           num_document_passes=10,
                           seed=79,
                           reuse_theta=True,
                           cache_theta=True,
                           scores=[
                               artm.TopTokensScore(
                                   name='top_tokens_score',
                                   num_tokens=30,
                                   class_id='@default_class')
                           ],
                           regularizers=[
                               artm.SmoothSparseThetaRegularizer(
                                   name='SparseTheta', tau=-0.15)
                           ])
    dictionary = artm.Dictionary(name='dictionary')
    dictionary.gather(batch_vectorizer.data_path)
    model_artm.initialize('dictionary')
    dictionary.filter(min_tf=2, min_df_rate=0.01)

    model_artm.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score',
                              class_id='@labels_class'))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_def',
                                        class_ids=['@default_class']))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_lab',
                                        class_ids=['@labels_class']))
    model_artm.scores.add(
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary='dictionary'))
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=15)

    test_transformed = model_artm.transform(
        batch_vectorizer_test, predict_class_id='@labels_class').T
    train_transformed = model_artm.transform(
        batch_vectorizer, predict_class_id='@labels_class').T
    test_transformed = test_transformed.reset_index().sort_values('index')
    test_transformed = test_transformed.reset_index(drop=True)
    del test_transformed['index']
    test_transformed = test_transformed[sorted(test_transformed.columns)]
    train_transformed = train_transformed.reset_index().sort_values('index')
    del train_transformed['index']
    train_transformed = train_transformed[sorted(train_transformed.columns)]
    train_transformed = train_transformed.reset_index(drop=True)
    artm_transformed = pd.concat([train_transformed, test_transformed],
                                 axis=0).reset_index(drop=True)
    return artm_transformed
subj_topics = topics_names[:topic_num]
bgr_topics = topics_names[topic_num:]

model = artm.ARTM(num_document_passes=document_passes_num,
                  num_topics=topic_num + background_topic_num,
                  topic_names=topics_names,
                  num_processors=12,
                  seed=100)

# The regularizer that ties our theta to the model being trained.
# doc_titles lists the documents that correspond to the rows of theta.
model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(
        name='Theta',
        tau=10**3,
        doc_topic_coef=theta,
        doc_titles=[str(i) for i in range(len(data.train_docs))]))

model.class_ids = {
    "title": 1,
    "text": 1,
    "label": 5,
}

dictionary.filter(min_df=10)
model.initialize(dictionary)
model.fit_offline(
    batch_vectorizer=BatchVectorizer(batches=train_batch,
                                     process_in_memory_model=model),
    num_collection_passes=3
)  # since theta is fixed this time, 3 passes are quite enough
def init_simple_default_model(
    dataset: Dataset,
    modalities_to_use: List[str] or Dict[str, float],
    main_modality: str,
    specific_topics: List[str] or int,
    background_topics: List[str] or int,
) -> artm.ARTM:
    """
    Creates a simple `artm.ARTM` model with standard scores.

    Parameters
    ----------
    dataset
        Dataset for model initialization
    modalities_to_use
        Which modalities the model should know.
        If `modalities_to_use` is a dictionary, all given weights are assumed
        to be relative to `main_modality`: the weights will be recalculated
        to absolute ones using `dataset` and `main_modality`.
        If `modalities_to_use` is a list, all relative weights are set to one.
        The resulting model's `class_ids` field will contain absolute
        modality weights.
    main_modality
        Modality relative to which all modality weights are considered
    specific_topics
        Specific topic names or their number
    background_topics
        Background topic names or their number

    Returns
    -------
    model : artm.ARTM
    """
    if isinstance(modalities_to_use, dict):
        modalities_weights = modalities_to_use
    else:
        modalities_weights = {class_id: 1 for class_id in modalities_to_use}

    specific_topic_names, background_topic_names = create_default_topics(
        specific_topics, background_topics)

    dictionary = dataset.get_dictionary()
    tokens_data = count_vocab_size(dictionary, modalities_to_use)
    abs_weights = modality_weight_rel2abs(tokens_data, modalities_weights,
                                          main_modality)

    model = init_model(
        topic_names=specific_topic_names + background_topic_names,
        class_ids=abs_weights,
    )

    if len(background_topic_names) > 0:
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(
                name='smooth_phi_bcg',
                topic_names=background_topic_names,
                tau=0.0,
                class_ids=[main_modality],
            ),
        )
        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='smooth_theta_bcg',
                topic_names=background_topic_names,
                tau=0.0,
            ),
        )

    model.initialize(dictionary)
    add_standard_scores(model, main_modality=main_modality,
                        all_modalities=modalities_to_use)

    return model
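# Usage sketch for init_simple_default_model; the dataset path and the
# modality names are assumptions for illustration.
dataset = Dataset('train_data.csv')
model = init_simple_default_model(
    dataset=dataset,
    modalities_to_use={'@word': 1.0, '@ngram': 0.5},  # relative to '@word'
    main_modality='@word',
    specific_topics=20,
    background_topics=2,
)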
def test_func():
    # constants
    num_tokens = 15
    alpha = 0.01
    beta = 0.02
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        model_artm = artm.ARTM(num_topics=num_topics, dictionary=dictionary,
                               cache_theta=True, reuse_theta=True)
        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=beta))
        model_artm.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=alpha))
        model_artm.scores.add(
            artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 dictionary=dictionary))
        model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))

        model_lda = artm.LDA(num_topics=num_topics, alpha=alpha, beta=beta,
                             dictionary=dictionary, cache_theta=True)
        model_lda.initialize(dictionary=dictionary)

        model_artm.num_document_passes = num_document_passes
        model_lda.num_document_passes = num_document_passes

        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=num_collection_passes)
        model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                              num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['SparsityPhiScore'].value[i] -
                       model_lda.sparsity_phi_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model_artm.score_tracker['SparsityThetaScore'].value[i] -
                model_lda.sparsity_theta_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['PerplexityScore'].value[i] -
                       model_lda.perplexity_value[i]) < zero_eps

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens)
        assert len(lda_tt) == num_topics

        for i in range(num_topics):
            for j in range(num_tokens):
                assert model_artm.score_tracker['TopTokensScore'].last_tokens[
                    model_artm.topic_names[i]][j] == lda_tt[i][j]

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens,
                                          with_weights=True)
        for i in range(num_tokens):
            assert abs(
                model_artm.score_tracker['TopTokensScore'].last_weights[
                    model_artm.topic_names[0]][i] - lda_tt[0][i][1]) < zero_eps

        model_lda.fit_online(batch_vectorizer=batch_vectorizer)

        phi = model_lda.phi_
        assert phi.shape == (vocab_size, num_topics)
        theta = model_lda.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model_lda.library_version.count('.') == 2  # major.minor.patch

        model_lda = artm.LDA(num_topics=num_topics, alpha=alpha,
                             beta=([0.1] * num_topics), dictionary=dictionary,
                             cache_theta=True)
        assert model_lda._internal_model.regularizers.size() == num_topics + 1
    finally:
        shutil.rmtree(batches_folder)
def init_bcg_sparse_model(dataset, modalities_to_use, main_modality,
                          specific_topics, bcg_topics,
                          model_params: dict = None):
    """
    Creates a simple artm model with standard scores.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str or dict
    main_modality : str
    specific_topics : int
    bcg_topics : int

    Returns
    -------
    model: artm.ARTM() instance
    """
    if model_params is None:
        model_params = dict()

    model = init_plsa(dataset, modalities_to_use, main_modality,
                      specific_topics, bcg_topics)
    background_topic_names = model.topic_names[-bcg_topics:]
    specific_topic_names = model.topic_names[:-bcg_topics]

    dictionary = dataset.get_dictionary()
    baseline_class_ids = {class_id: 1 for class_id in modalities_to_use}
    data_stats = count_vocab_size(dictionary, baseline_class_ids)

    # all coefficients are relative
    regularizers = [
        artm.SmoothSparsePhiRegularizer(
            name='smooth_phi_bcg',
            topic_names=background_topic_names,
            tau=model_params.get("smooth_bcg_tau", 0.1),
            class_ids=[main_modality],
        ),
        artm.SmoothSparseThetaRegularizer(
            name='smooth_theta_bcg',
            topic_names=background_topic_names,
            tau=model_params.get("smooth_bcg_tau", 0.1),
        ),
        artm.SmoothSparsePhiRegularizer(
            name='sparse_phi_sp',
            topic_names=specific_topic_names,
            tau=model_params.get("sparse_sp_tau", -0.05),
            class_ids=[main_modality],
        ),
        artm.SmoothSparseThetaRegularizer(
            name='sparse_theta_sp',
            topic_names=specific_topic_names,
            tau=model_params.get("sparse_sp_tau", -0.05),
        ),
    ]
    for reg in regularizers:
        model.regularizers.add(
            transform_regularizer(data_stats, reg, model.class_ids,
                                  n_topics=len(reg.topic_names)))

    return model
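# Sketch of overriding the default relative coefficients through
# model_params; the dataset and modality name are assumptions.
model = init_bcg_sparse_model(
    dataset, ['@word'], '@word',
    specific_topics=20, bcg_topics=3,
    model_params={'smooth_bcg_tau': 0.2, 'sparse_sp_tau': -0.1},
)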
def init_lda(
    dataset: Dataset,
    modalities_to_use: List[str],
    main_modality: str,
    num_topics: int,
    model_params: dict = None,
):
    """
    Creates a simple artm model with standard scores.

    Parameters
    ----------
    dataset
    modalities_to_use
    main_modality
    num_topics
    model_params

    Returns
    -------
    model: artm.ARTM() instance
    """
    if model_params is None:
        model_params = dict()

    model = init_plsa(dataset, modalities_to_use, main_modality, num_topics)

    prior = model_params.get('prior', 'symmetric')

    # What Gensim returns by default (everything is 'symmetric')
    # see https://github.com/RaRe-Technologies/gensim/blob/master/gensim/models/ldamodel.py#L521
    # Note that you can specify prior shape for alpha and beta separately,
    # but we do not do that here
    if prior == "symmetric":
        alpha = 1.0 / num_topics
        eta = 1.0 / num_topics
    elif prior == "asymmetric":
        # following the recommendation from
        # http://papers.nips.cc/paper/3854-rethinking-lda-why-priors-matter
        # we use a symmetric prior over Phi and an asymmetric one over Theta
        eta = 1.0 / num_topics
        num_terms = 0  # isn't used, so let's not compute it
        alpha = _init_dirichlet_prior("alpha", num_topics,
                                      num_terms=num_terms)
    elif prior == "double_asymmetric":
        # this stuff is needed for asymmetric Phi initialization:
        artm_dict = dataset.get_dictionary()
        temp_df = artm_dict2df(artm_dict)  # noqa: F821
        num_terms = temp_df.query("class_id in @modalities_to_use").shape[0]
        eta = _init_dirichlet_prior("eta", num_topics, num_terms)
        alpha = _init_dirichlet_prior("alpha", num_topics, num_terms)
        # TODO: turns out, BigARTM does not support tau as a list of floats
        # (or a dictionary), so we need to use a custom regularizer instead
        # (TopicPrior doesn't work because it provides $beta_t$
        # instead of $beta_w$)
        raise NotImplementedError
    elif prior == "heuristic":
        # Found in doi.org/10.1007/s10664-015-9379-3 (2016):
        # "We use the de facto standard heuristics of α=50/K and β=0.01
        # (Biggers et al. 2014) for our hyperparameter values"
        alpha = 50.0 / num_topics
        eta = 0.01
    else:
        raise TypeError(f"prior type '{prior}' is not supported")

    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='smooth_phi',
            tau=eta,
            class_ids=[main_modality],
        ),
    )

    if isinstance(alpha, (list, np.ndarray)):
        alpha = [float(a) for a in alpha]
        assert len(alpha) == len(model.topic_names)
        # one regularizer per topic gives each topic its own alpha_t
        for i, topic in enumerate(model.topic_names):
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name=f'smooth_theta_{i}',
                                                  tau=alpha[i],
                                                  topic_names=topic))
    else:
        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='smooth_theta',
                tau=alpha,
            ),
        )

    return model
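# Hedged usage of init_lda, building an LDA-like model with the asymmetric
# Theta prior discussed above; `dataset` and '@word' are assumptions.
# For comparison, prior='heuristic' with num_topics=25 would give
# alpha = 50 / 25 = 2.0 and eta = 0.01.
model = init_lda(dataset, ['@word'], '@word', num_topics=25,
                 model_params={'prior': 'asymmetric'})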
def test_func():
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    dump_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        model_1 = artm.ARTM(num_processors=7,
                            cache_theta=True,
                            num_document_passes=5,
                            reuse_theta=True,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            theta_name='THETA',
                            dictionary=batch_vectorizer.dictionary)
        model_2 = artm.ARTM(num_processors=7,
                            cache_theta=False,
                            num_document_passes=5,
                            reuse_theta=False,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            dictionary=batch_vectorizer.dictionary)

        for model in [model_1, model_2]:
            model.scores.add(
                artm.PerplexityScore(name='perp',
                                     dictionary=batch_vectorizer.dictionary))
            model.scores.add(artm.SparsityThetaScore(name='sp_theta',
                                                     eps=0.1))
            model.scores.add(artm.TopTokensScore(name='top_tok',
                                                 num_tokens=10))
            model.scores.add(
                artm.SparsityPhiScore(name='sp_nwt',
                                      model_name=model.model_nwt))
            model.scores.add(
                artm.TopicKernelScore(name='kernel',
                                      topic_names=model.topic_names[0:5],
                                      probability_mass_threshold=0.4))

            topic_pairs = {}
            for topic_name_1 in model.topic_names:
                for topic_name_2 in model.topic_names:
                    if topic_name_1 not in topic_pairs:
                        topic_pairs[topic_name_1] = {}
                    topic_pairs[topic_name_1][topic_name_2] = \
                        numpy.random.randint(0, 3)

            model.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='decor',
                                                tau=100000.0,
                                                topic_pairs=topic_pairs))
            model.regularizers.add(
                artm.SmoothSparsePhiRegularizer(
                    name='smsp_phi', tau=-0.5, gamma=0.3,
                    dictionary=batch_vectorizer.dictionary))
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(
                    name='smsp_theta', tau=0.1,
                    doc_topic_coef=[2.0] * model.num_topics))
            model.regularizers.add(
                artm.SmoothPtdwRegularizer(name='sm_ptdw', tau=0.1))

            # learn the first model and dump it on disc
            model.fit_offline(batch_vectorizer, num_collection_passes=10)
            model.fit_online(batch_vectorizer, update_every=1)

            model.dump_artm_model(os.path.join(dump_folder, 'target'))

            params = {}
            with open(os.path.join(dump_folder, 'target', 'parameters.json'),
                      'r') as fin:
                params = json.load(fin)
            _assert_json_params(params)

            # create a second model from the dump and check that the results
            # are equal
            model_new = artm.load_artm_model(
                os.path.join(dump_folder, 'target'))

            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)

            # continue learning of both models
            model.fit_offline(batch_vectorizer, num_collection_passes=3)
            model.fit_online(batch_vectorizer, update_every=1)
            model_new.fit_offline(batch_vectorizer, num_collection_passes=3)
            model_new.fit_online(batch_vectorizer, update_every=1)

            # check the new results are also equal
            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)

            shutil.rmtree(os.path.join(dump_folder, 'target'))
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(dump_folder)
subj_topics = topics_names[:topic_num]
bgr_topics = topics_names[topic_num:]

model = artm.ARTM(
    num_document_passes=document_passes_num,
    num_topics=topic_num + background_topic_num,
    topic_names=topics_names,
    seed=100,  # helps to get stable results
    num_processors=processors_num)

model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(
        name='Decorrelator', tau=10**4))  # a standard decorrelator
model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(
        name='SmoothTheta', topic_names=bgr_topics,
        tau=0.3))  # smooth Theta for the background topics
model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(
        name='SparseTheta', topic_names=subj_topics,
        tau=-0.3))  # sparsify Theta for the "good" (subject) topics
model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(
        name='SmoothPhi', topic_names=bgr_topics, class_ids=["text"],
        tau=0.1))  # smooth Phi for the background topics
model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(
        name='SparsePhi', topic_names=subj_topics,