def add_complex_scores_to_model(artm_model, n_top_tokens, p_mass_threshold,
                                common_topics, subject_topics, class_name,
                                _debug_print=False, dictionary=None):
    """Attach the standard score set twice: once restricted to the subject
    topics and once to the common topics.

    :param artm_model: artm.ARTM model that receives the scores.
    :param n_top_tokens: number of tokens tracked by each TopTokensScore.
    :param p_mass_threshold: probability mass threshold for TopicKernelScore.
    :param common_topics: topic names for the '*_common' score group.
    :param subject_topics: topic names for the '*_subject' score group.
    :param class_name: modality (class_id) used by the phi-based scores.
    :param _debug_print: when True, print a timestamped progress message.
    :param dictionary: artm.Dictionary for the perplexity scores.  New,
        backward-compatible parameter: the original code silently relied on a
        module-level ``dictionary`` variable, which is still used as fallback.
    """
    if _debug_print:
        # Converted from a Python 2 print statement so the module is valid Python 3.
        print('[{}] adding scores'.format(datetime.now()))
    if dictionary is None:
        # Backward compatibility with the original implicit global lookup.
        dictionary = globals()['dictionary']
    # The same five scores are registered per topic group; only the name
    # suffix and the topic list differ (previously two copy-pasted blocks).
    for suffix, topics in (('subject', subject_topics), ('common', common_topics)):
        artm_model.scores.add(
            artm.PerplexityScore(name='perplexity_score_{}'.format(suffix),
                                 dictionary=dictionary,
                                 topic_names=topics))
        artm_model.scores.add(
            artm.SparsityPhiScore(name='ss_phi_score_{}'.format(suffix),
                                  class_id=class_name,
                                  topic_names=topics))
        artm_model.scores.add(
            artm.SparsityThetaScore(name='ss_theta_score_{}'.format(suffix),
                                    topic_names=topics))
        artm_model.scores.add(
            artm.TopicKernelScore(name='topic_kernel_score_{}'.format(suffix),
                                  class_id=class_name,
                                  topic_names=topics,
                                  probability_mass_threshold=p_mass_threshold))
        artm_model.scores.add(
            artm.TopTokensScore(name='top_tokens_score_{}'.format(suffix),
                                class_id=class_name,
                                topic_names=topics,
                                num_tokens=n_top_tokens))
def experiment(filename, tau_phi, tau_theta):
    """Train PLSA, LDA and a regularized ARTM model on the same collection and
    report their quality measures via ``print_measures``.

    :param filename: path to a Vowpal-Wabbit formatted collection file.
    :param tau_phi: tau for the ARTM sparse-phi regularizer.
    :param tau_theta: tau for the ARTM sparse-theta regularizer.
    """
    # Build batches from the raw VW file; batches are cached in ./batches.
    batch_vectorizer = artm.BatchVectorizer(data_path=filename,
                                            data_format='vowpal_wabbit',
                                            target_folder='batches')
    dictionary = batch_vectorizer.dictionary
    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    # Three models over identical topics/data: regularized ARTM, PLSA and LDA.
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary,
                           cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore',
                                                        dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)
    # Identical score sets on ARTM and PLSA so their trackers are comparable.
    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))
    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))
    # Regularizers are attached to the ARTM model only; taus come from the caller.
    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3
    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)
    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    # print_measures is defined elsewhere in the project — TODO confirm its contract.
    print_measures(model_plsa, model_artm, model_lda)
def add_standard_scores(model, dictionary, main_modality="@lemmatized",
                        all_modalities=("@lemmatized", "@ngramms")):
    """Register the default score set on *model*: overall perplexity and theta
    sparsity, plus per-modality phi sparsity, perplexity and kernel scores.

    ``main_modality`` must be one of ``all_modalities``.
    """
    assert main_modality in all_modalities, "main_modality must be part of all_modalities"
    # Collect every score first, then register them in one pass.
    pending = [
        artm.scores.PerplexityScore(name='PerplexityScore@all',
                                    class_ids=all_modalities),
        artm.scores.SparsityThetaScore(name='SparsityThetaScore'),
    ]
    for modality in all_modalities:
        pending.append(
            artm.scores.SparsityPhiScore(name=f'SparsityPhiScore{modality}',
                                         class_id=modality))
        pending.append(
            artm.scores.PerplexityScore(name=f'PerplexityScore{modality}',
                                        class_ids=[modality]))
        pending.append(
            artm.TopicKernelScore(name=f'TopicKernel{modality}',
                                  probability_mass_threshold=0.3,
                                  class_id=modality))
    for score in pending:
        model.scores.add(score)
def define_model(n_topics: int, dictionary: artm.Dictionary,
                 sparse_theta: float, sparse_phi: float,
                 decorrelator_phi: float) -> artm.artm_model.ARTM:
    """Construct an ARTM model with the standard score set and three
    regularizers (sparse theta, sparse phi, phi decorrelation).

    :param n_topics: number of topics (names run topic_1 .. topic_n).
    :param dictionary: batch vectorizer dictionary (used by the perplexity score).
    :param sparse_theta: tau for the SmoothSparseThetaRegularizer.
    :param sparse_phi: tau for the SmoothSparsePhiRegularizer.
    :param decorrelator_phi: tau for the DecorrelatorPhiRegularizer.
    :return: the configured ARTM model.
    """
    print("Defining the model.")
    names = ["topic_{}".format(idx) for idx in range(1, n_topics + 1)]
    score_list = [
        artm.PerplexityScore(name="PerplexityScore", dictionary=dictionary),
        artm.SparsityPhiScore(name="SparsityPhiScore"),
        artm.SparsityThetaScore(name="SparsityThetaScore"),
        artm.TopicKernelScore(name="TopicKernelScore",
                              probability_mass_threshold=0.3),
        artm.TopTokensScore(name="TopTokensScore", num_tokens=15),
    ]
    regularizer_list = [
        artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=sparse_theta),
        artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=sparse_phi),
        artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi",
                                        tau=decorrelator_phi),
    ]
    return artm.ARTM(topic_names=names,
                     cache_theta=True,
                     scores=score_list,
                     regularizers=regularizer_list)
def set_scores(self):
    """Register the monitoring scores on ``self.model``; sparsity and kernel
    scores are restricted to the ``self.specific`` topic group, all scores use
    the '@default_class' modality."""
    monitors = (
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary=self.dictionary),
        artm.SparsityPhiScore(name='SparsityPhiScore',
                              class_id='@default_class',
                              topic_names=self.specific),
        artm.SparsityThetaScore(name='SparsityThetaScore',
                                topic_names=self.specific),
        # Fraction of background words in the whole collection
        artm.BackgroundTokensRatioScore(name='BackgroundTokensRatioScore',
                                        class_id='@default_class'),
        # Kernel characteristics
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@default_class',
                              topic_names=self.specific,
                              probability_mass_threshold=0.5,
                              dictionary=self.dictionary),
        # Looking at top tokens
        artm.TopTokensScore(name='TopTokensScore',
                            class_id='@default_class',
                            num_tokens=100),
    )
    for monitor in monitors:
        self.model.scores.add(monitor)
def create_and_learn_ARTM_decorPhi_modal(name="", topic_number=750,
                                         num_collection_passes=1,
                                         weigths=(1., 1., 1., 1.),
                                         decorTau=1.0):
    """Build and train a four-modality ARTM model with phi decorrelation on the
    non-text modalities.

    :param name: name of the Vowpal-Wabbit input file (relative to CWD);
        batches are written to 'folder<name>'.
    :param topic_number: number of topics.
    :param num_collection_passes: number of offline training passes.
    :param weigths: per-modality weights for '@text', '@first', '@second',
        '@third'.  (Misspelled name kept for caller compatibility.)
        Default changed from a list to a tuple: a mutable default argument is
        shared across calls and is a classic latent bug.
    :param decorTau: tau for the decorrelator over '@first'/'@second'/'@third'.
    :return: tuple (trained model, theta matrix of the training data).
    """
    # (Removed dead `batch_vectorizer_train = None` pre-assignment.)
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]
    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weigths[0],
                          '@first': weigths[1],
                          '@second': weigths[2],
                          '@third': weigths[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    # Decorrelate only the non-text modalities.
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))
    model.initialize(dictionary=dictionary)
    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))
    model.num_document_passes = 1
    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)
    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)
    return model, theta_train
def _get_corpus_model(self, corpus_vector_spaced, clustering_method='artm'):
    """Build a topic/clustering model over the vector-spaced corpus and return
    its document-topic representation.

    :param corpus_vector_spaced: dict holding the precomputed
        'batch_vectorizer' and 'dictionary' entries (for the 'artm' branch).
    :param clustering_method: one of 'gensim' (LSI), 'sklearn' (LDA), 'artm'.
        NOTE(review): any other value falls through and returns None implicitly.
    """
    if 'gensim' == clustering_method:
        return self._get_model_LSI(corpus_vector_spaced)
    elif 'sklearn' == clustering_method:
        return self._get_model_LDA(corpus_vector_spaced)
    elif 'artm' == clustering_method:
        batch_vectorizer = corpus_vector_spaced['batch_vectorizer']
        dictionary = corpus_vector_spaced['dictionary']
        topic_names = [
            'topic_{}'.format(i) for i in range(self.num_of_clusters)
        ]
        model_artm = artm.ARTM(
            topic_names=topic_names,
            cache_theta=True,
            scores=[
                artm.PerplexityScore(name='PerplexityScore',
                                     dictionary=dictionary)
            ],
            regularizers=[
                artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                  tau=-0.15)
            ])
        model_artm.scores.add(
            artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(
            artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(
            artm.TopicKernelScore(name='TopicKernelScore',
                                  probability_mass_threshold=0.3))
        model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore',
                                                  num_tokens=10),
                              overwrite=True)
        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
        # Overrides the tau that was passed to the constructor above (-0.15).
        model_artm.regularizers['SparseTheta'].tau = -0.2
        model_artm.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=1.5e+5))
        model_artm.num_document_passes = 1
        model_artm.initialize(dictionary)
        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=30)
        # .T — presumably flips topics x documents into documents x topics;
        # verify against the gensim/sklearn branches' orientation.
        return model_artm.transform(batch_vectorizer=batch_vectorizer).T
def add_standard_scores(
        model: artm.ARTM,
        dictionary: artm.Dictionary = None,
        main_modality: str = "@lemmatized",
        all_modalities: List[str] = ("@lemmatized", "@ngramms")
) -> None:
    """Attach the default score set to *model*: an overall perplexity score, a
    theta-sparsity score, and per-modality phi-sparsity, perplexity and
    topic-kernel scores.

    Parameters
    ----------
    model
    dictionary
        Obsolete parameter, not used; passing it only emits a warning.
    main_modality
        Must be contained in ``all_modalities``.
    all_modalities
    """
    assert main_modality in all_modalities, "main_modality must be part of all_modalities"
    if dictionary is not None:
        warnings.warn('Parameter `dictionary` is obsolete:'
                      ' it is not used in the function "add_standard_scores"!')
    register = model.scores.add
    register(
        artm.scores.PerplexityScore(
            name='PerplexityScore@all',
            class_ids=all_modalities,
        ))
    register(artm.scores.SparsityThetaScore(name='SparsityThetaScore'))
    for class_id in all_modalities:
        register(
            artm.scores.SparsityPhiScore(
                name=f'SparsityPhiScore{class_id}',
                class_id=class_id,
            ))
        register(
            artm.scores.PerplexityScore(
                name=f'PerplexityScore{class_id}',
                class_ids=[class_id],
            ))
        register(
            artm.TopicKernelScore(
                name=f'TopicKernel{class_id}',
                probability_mass_threshold=0.3,
                class_id=class_id,
            ))
def add_scores_to_model(current_dictionary, artm_model, n_top_tokens,
                        p_mass_threshold):
    """Attach the five standard monitoring scores to *artm_model*; the
    phi-based scores are restricted to the 'ngramm' modality.

    :param current_dictionary: dictionary for the perplexity score.
    :param artm_model: model that receives the scores.
    :param n_top_tokens: token count for the TopTokensScore.
    :param p_mass_threshold: threshold for the TopicKernelScore.
    """
    for score in (
            artm.PerplexityScore(name='perplexity_score',
                                 use_unigram_document_model=False,
                                 dictionary=current_dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score',
                                  class_id='ngramm'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            artm.TopicKernelScore(name='topic_kernel_score',
                                  class_id='ngramm',
                                  probability_mass_threshold=p_mass_threshold),
            artm.TopTokensScore(name='top_tokens_score',
                                class_id='ngramm',
                                num_tokens=n_top_tokens)):
        artm_model.scores.add(score)
def create_and_learn_PLSA(name="", topic_number=750, num_collection_passes=1):
    """Build and train an unregularized (PLSA-style) multi-modality ARTM model
    from a Vowpal-Wabbit file.

    :param name: name of the VW input file (relative to CWD); batches are
        written to 'folder<name>'.
    :param topic_number: number of topics.
    :param num_collection_passes: number of offline training passes.
    :return: tuple (trained model, theta matrix of the training data).
    """
    # (Removed dead `batch_vectorizer_train = None` pre-assignment.)
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]
    # All four modalities are weighted equally.
    model_plsa = artm.ARTM(topic_names=topic_names,
                           class_ids={
                               '@text': 1.0,
                               '@first': 1.0,
                               '@second': 1.0,
                               '@third': 1.0
                           },
                           cache_theta=True,
                           theta_columns_naming='title',
                           scores=[
                               artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)
                           ])
    model_plsa.initialize(dictionary=dictionary)
    model_plsa.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_plsa.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model_plsa.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model_plsa.num_document_passes = 1
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer_train,
                           num_collection_passes=num_collection_passes)
    theta_train = model_plsa.transform(batch_vectorizer=batch_vectorizer_train)
    return model_plsa, theta_train
def create_model_with_background(dictionary, num_tokens, num_document_passes):
    """Create an initialized (untrained) ARTM model with 19 specific topics
    plus one dedicated "background" topic; specific topics get one phi
    regularizer, the background topic another.

    :param dictionary: artm.Dictionary used for initialization and perplexity.
    :param num_tokens: currently unused — TopTokensScore is fixed at 10 tokens
        because the web version of Palmetto works only with <= 10 tokens.
    :param num_document_passes: value assigned to model.num_document_passes.
    :return: the configured artm.ARTM model.
    """
    sm_phi_tau = 0.0001 * 1e-4   # smoothing tau for the background topic
    sp_phi_tau = -0.0001 * 1e-4  # sparsing tau for the specific topics
    decor_phi_tau = 1            # tau for the disabled decorrelator below
    specific_topics = ['topic {}'.format(i) for i in range(1, 20)]
    topic_names = specific_topics + ["background"]
    scores = [
        artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary),
        artm.TopTokensScore(
            name='TopTokensScore', num_tokens=10, class_id='plain_text'
        ),  # web version of Palmetto works only with <= 10 tokens
        artm.SparsityPhiScore(name='SparsityPhiScore'),
        artm.SparsityThetaScore(name='SparsityThetaScore'),
        artm.TopicKernelScore(name='TopicKernelScore',
                              probability_mass_threshold=0.3,
                              class_id='plain_text')
    ]
    # Reuse topic_names instead of rebuilding the concatenation a second time.
    model = artm.ARTM(topic_names=topic_names,
                      regularizers=[],
                      cache_theta=True,
                      scores=scores,
                      class_ids={'plain_text': 1.0})
    # NOTE(review): sp_phi_tau is already negative, so tau=-sp_phi_tau is a
    # small *positive* value, i.e. this smooths rather than sparses the
    # specific topics.  Kept as-is to preserve behavior; confirm the sign.
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                        tau=-sp_phi_tau,
                                        topic_names=specific_topics))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SmoothPhi',
                                        tau=sm_phi_tau,
                                        topic_names=["background"]))
    # model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=decor_phi_tau))
    model.initialize(dictionary=dictionary)
    model.num_document_passes = num_document_passes
    return model
def add_scores_to_model(artm_model, dictionary, n_top_tokens, p_mass_threshold,
                        class_name, _debug_print=False):
    """Attach the five standard monitoring scores to *artm_model*.

    :param artm_model: model that receives the scores.
    :param dictionary: dictionary for the perplexity score.
    :param n_top_tokens: token count for the TopTokensScore.
    :param p_mass_threshold: threshold for the TopicKernelScore.
    :param class_name: modality (class_id) used by the phi-based scores.
    :param _debug_print: when True, print a timestamped progress message.
    """
    if _debug_print:
        # Converted from a Python 2 print statement so the module is valid Python 3.
        print('[{}] adding scores'.format(datetime.now()))
    artm_model.scores.add(
        artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    artm_model.scores.add(
        artm.SparsityPhiScore(name='ss_phi_score', class_id=class_name))
    artm_model.scores.add(artm.SparsityThetaScore(name='ss_theta_score'))
    artm_model.scores.add(
        artm.TopicKernelScore(name='topic_kernel_score',
                              class_id=class_name,
                              probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(
        artm.TopTokensScore(name='top_tokens_score',
                            class_id=class_name,
                            num_tokens=n_top_tokens))
def init_score_tracker(model_artm, my_dictionary, class_id='text'):
    """Attach (or replace) the tracking scores on *model_artm*.

    :param model_artm: model that receives the scores.
    :param my_dictionary: dictionary for the perplexity score.
    :param class_id: modality used by the phi-based scores.
    """
    trackers = [
        artm.PerplexityScore(name='PerplexityScore',
                             dictionary=my_dictionary),
        artm.SparsityPhiScore(name='SparsityPhiScore', class_id=class_id),
        artm.SparsityThetaScore(name='SparsityThetaScore'),
        artm.TopTokensScore(name="top_words", num_tokens=200,
                            class_id=class_id),
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id=class_id,
                              probability_mass_threshold=0.6),
    ]
    for tracker in trackers:
        # overwrite=True lets this be called repeatedly on the same model.
        model_artm.scores.add(tracker, overwrite=True)
    print('Scores are set!')
# --- script fragment: model setup (assumes `batch_vectorizer` was created earlier) ---
dictionary = batch_vectorizer.dictionary
topic_num = 10
tokens_num = 100
print("ARTM training")
topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
# Three models over the same topics: regularized ARTM, PLSA and LDA.
model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary,
                       cache_theta=True)
model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                       scores=[artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)])
model_lda = artm.LDA(num_topics=topic_num)
# Identical score sets on ARTM and PLSA so their trackers are comparable.
model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))
model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))
# Regularizers go on the ARTM model only (taus are set later in the script).
model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
], cache_theta=True) if not os.path.isfile(filename + '/dictionary.dict'): dictionary.gather(data_path=batch_vectorizer.data_path) dictionary.save(dictionary_path=filename + '/dictionary.dict') dictionary.load(dictionary_path=(filename + '/dictionary.dict')) dictionary.load(dictionary_path=(filename + '/dictionary.dict')) model_artm.initialize(dictionary=dictionary) model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore')) model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore')) model_artm.scores.add( artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.3)) model_artm.regularizers.add( artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1)) model_artm.regularizers.add( artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5)) model_artm.regularizers.add( artm.TopicSelectionThetaRegularizer(name='TopicSelection', tau=0.25)) model_artm.regularizers['SparsePhi'].tau = -0.5 model_artm.regularizers['SparseTheta'].tau = -0.5 model_artm.regularizers['DecorrelatorPhi'].tau = 1e+5 model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=10))
def topic_model_clf(X, y, topic_num=30):
    """Train a two-modality ARTM topic model as a text classifier and evaluate
    it on a stratified 20% hold-out split.

    :param X: pandas Series of document texts; the index is used as doc ids.
    :param y: class labels aligned with X.
    :param topic_num: number of ARTM topics.
    :return: tuple (micro_roc_auc, macro_roc_auc, micro_f1, macro_f1,
        log_loss_score, precision_recall_fscore_support result).
    """
    labels_decreasing_size_order = list(y.value_counts().index)
    (X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2,
                                                          stratify=y,
                                                          random_state=42)
    # Serialize both splits to Vowpal-Wabbit files (write_vw is a project helper).
    file_train = 'temp_files/X_train.txt'
    file_test = 'temp_files/X_test.txt'
    temp_df = pd.DataFrame()
    temp_df['text'] = X_train
    temp_df['class_label'] = y_train
    write_vw(temp_df, X_train.index, file_train)
    temp_df = pd.DataFrame()
    temp_df['text'] = X_test  # the test split is written without labels
    write_vw(temp_df, X_test.index, file_test)
    # Reuse previously built batches if present.
    # NOTE(review): os.path.join with a single argument is a no-op; the glob
    # pattern is matched relative to the CWD — confirm the intended location.
    if len(glob.glob(os.path.join('batches_train*.batch'))) < 1:
        batch_vectorizer_train = artm.BatchVectorizer(
            data_path=file_train,
            data_format='vowpal_wabbit',
            target_folder='batches_train',
            gather_dictionary=True)
    else:
        batch_vectorizer_train = artm.BatchVectorizer(
            data_path='batches_train',
            data_format='batches',
            gather_dictionary=True)
    if len(glob.glob(os.path.join('batches_test' + '*.batch'))) < 1:
        batch_vectorizer_test = artm.BatchVectorizer(
            data_path=file_test,
            data_format='vowpal_wabbit',
            target_folder='batches_test',
            gather_dictionary=True)
    else:
        batch_vectorizer_test = artm.BatchVectorizer(data_path='batches_test',
                                                     data_format='batches',
                                                     gather_dictionary=True)
    # Two modalities: text tokens and the class label (labels weighted higher).
    model = artm.ARTM(num_topics=topic_num,
                      class_ids={
                          '@text': 5.0,
                          '@class_label': 100.0
                      },
                      cache_theta=True,
                      dictionary=batch_vectorizer_train.dictionary,
                      theta_columns_naming='title')
    scores = [
        artm.PerplexityScore(name='Perplexity',
                             dictionary=batch_vectorizer_train.dictionary,
                             class_ids=['@text']),
        artm.SparsityPhiScore(name='SparsityPhiText', class_id='@text'),
        artm.SparsityPhiScore(name='SparsityPhiClasses',
                              class_id='@class_label'),
        artm.SparsityThetaScore(name='SparsityTheta'),
        artm.TopicKernelScore(name='TopicKernelText',
                              probability_mass_threshold=0.1,
                              class_id='@text'),
        artm.TopTokensScore(name='TopTokensText',
                            class_id='@text',
                            num_tokens=20),
        artm.TopTokensScore(name='TopTokensClasses',
                            class_id='@class_label',
                            num_tokens=10)
    ]
    regularizers = [
        artm.DecorrelatorPhiRegularizer(name='DeccorText',
                                        class_ids=['@text'],
                                        tau=10000),
        artm.SmoothSparsePhiRegularizer(name='SmoothPhiText',
                                        class_ids=['@text'],
                                        tau=0),
        artm.SmoothSparsePhiRegularizer(name='SmoothPhiClasses',
                                        class_ids=['@class_label'],
                                        tau=-1),
        # artm.SmoothSparsePhiRegularizer(name='SmoothBackgroundPhi', tau=100, topic_names=['background_topic']),
        artm.SmoothSparseThetaRegularizer(name='SmoothTheta', tau=-1.5),
        # artm.SmoothSparseThetaRegularizer(name='SmoothBackgroundTheta', tau=100, topic_names=['background_topic'])
    ]
    for r in regularizers:
        model.regularizers.add(r)
    for s in scores:
        model.scores.add(s)
    # 35 single-pass offline iterations (lets tqdm show progress).
    for i in tqdm(range(35)):
        model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                          num_collection_passes=1)
    # Class probabilities for the test split, indexed by document title.
    p_cd = model.transform(batch_vectorizer=batch_vectorizer_test,
                           predict_class_id='@class_label')
    # TODO: optimize this part
    y_pred = p_cd.idxmax(axis=0).astype(int)[[str(x)
                                              for x in X_test.index]].values
    # y_pred = p_cd[[str(x) for x in X_test.index]].idxmax(axis=0).values
    # metrics_visualization(target_pred=y_pred, target_true=y_test,
    #                       top_tokens_class=model.score_tracker['TopTokensClasses'],
    #                       top_tokens_text=model.score_tracker['TopTokensText'],
    #                       score_tracker=model.score_tracker,
    #                       scores_names=['Perplexity', 'SparsityPhiClasses',
    #                                     'SparsityPhiText', 'SparsityTheta'])
    print('Accuracy_score: {}'.format(accuracy_score(y_test, y_pred)))
    plt.hist(y_pred, color='g', label='pred')
    plt.hist(y_test, color='b', alpha=0.7, label='true')
    plt.title('Topic Model')
    plt.show()
    # print(classification_report(y_test, y_pred, labels=labels_decreasing_size_order))
    create_confusion_matrix(y_test, y_pred,
                            labels=labels_decreasing_size_order).savefig(
                                '../../reports/topic_model_conf_matrix.png')
    # ROC-AUC over binarized labels; hard-codes 17 classes (0..16) — TODO confirm.
    micro_roc_auc = roc_auc_score(label_binarize(y_test,
                                                 classes=list(range(0, 17))),
                                  p_cd.T,
                                  average='micro')
    macro_roc_auc = roc_auc_score(label_binarize(y_test,
                                                 classes=list(range(0, 17))),
                                  p_cd.T,
                                  average='macro')
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    log_loss_score = log_loss(y_test, p_cd.T)
    return (micro_roc_auc, macro_roc_auc, micro_f1, macro_f1, log_loss_score,
            precision_recall_fscore_support(y_test,
                                            y_pred,
                                            labels=labels_decreasing_size_order))
def calc_coeffs():
    """Coarse-to-fine grid search for the sparse-phi / sparse-theta regularizer
    taus that minimize perplexity on the 'lemmed.txt' collection.

    Three stages: a coarse grid (step 0.5), a refinement around the best point
    (step 0.1), and a final refinement (step 0.01).  The previously
    triple-duplicated loop body is factored into ``_grid_search``.

    :return: dict with the best 'tau_phi' and 'tau_theta' values.
    """
    batch_vectorizer = artm.BatchVectorizer(data_path='lemmed.txt',
                                            data_format='vowpal_wabbit',
                                            target_folder='batches')
    dictionary = batch_vectorizer.dictionary
    topic_num = 10
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary,
                           cache_theta=True)
    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score',
                                                probability_mass_threshold=0.3))
    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    best_tau_phi = -5.0
    best_tau_theta = -5.0
    best_perplexity = 1000000

    def _grid_search(i_values, j_values, scale):
        """Try every (i/scale, j/scale) tau pair and keep the best one.

        NOTE: each fit_offline call continues training the same model, so the
        search is stateful — identical to the original triple-loop code.
        """
        nonlocal best_tau_phi, best_tau_theta, best_perplexity
        for i in i_values:
            for j in j_values:
                model_artm.regularizers['sparse_phi_regularizer'].tau = (i / scale)
                model_artm.regularizers['sparse_theta_regularizer'].tau = (j / scale)
                model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                                       num_collection_passes=100)
                perplexity = model_artm.score_tracker['perplexity_score'].last_value
                if perplexity < best_perplexity:
                    best_perplexity = perplexity
                    best_tau_phi = (i / scale)
                    best_tau_theta = (j / scale)
                    print(best_perplexity, " ", best_tau_phi, " ", best_tau_theta)

    print("Started parameters choosing")
    # Stage 1: coarse grid over [-2.0, 1.5] with step 0.5.
    _grid_search(range(-20, 20, 5), range(-20, 20, 5), 10.0)
    print("RESULT 1 ", best_perplexity, " ", best_tau_phi, " ", best_tau_theta)
    # Stage 2: +/-0.5 around the best point with step 0.1.
    _grid_search(range(int(10 * best_tau_phi) - 5, int(10 * best_tau_phi) + 5, 1),
                 range(int(10 * best_tau_theta) - 5, int(10 * best_tau_theta) + 5, 1),
                 10.0)
    print("RESULT 2 ", best_perplexity, " ", best_tau_phi, " ", best_tau_theta)
    # Stage 3: +/-0.1 around the best point with step 0.01.
    _grid_search(range(int(100 * best_tau_phi) - 10, int(100 * best_tau_phi) + 10, 1),
                 range(int(100 * best_tau_theta) - 10, int(100 * best_tau_theta) + 10, 1),
                 100.0)
    print("RESULT 3 ", best_perplexity, " ", best_tau_phi, " ", best_tau_theta)
    return {"tau_phi": best_tau_phi, "tau_theta": best_tau_theta}
# --- script fragment: train a PLSA baseline on the pre-built POS batches ---
# (assumes `path` and `subd` are defined earlier in the file)
batch_vectorizer = artm.BatchVectorizer(data_path=path + "\\" + subd + "\\" +
                                        "batches_pos",
                                        data_format='batches')
modelPLSA = artm.ARTM(
    # range() replaces Python-2-only xrange(); identical here and also valid
    # under Python 2.
    topic_names=['topic_{}'.format(i) for i in range(100)],
    scores=[
        artm.PerplexityScore(
            name='PerplexityScore',
            use_unigram_document_model=False,
            dictionary=batch_vectorizer.dictionary,
            class_ids=["text"]),
        artm.SparsityPhiScore(name='SparsityPhiScore', class_id="text"),
        artm.SparsityThetaScore(name='SparsityThetaScore'),
        artm.TopicKernelScore(name='TopicKernelScore',
                              probability_mass_threshold=0.3,
                              class_id="text"),
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=100,
                            class_id="text")
    ],
    cache_theta=True)
modelPLSA.initialize(dictionary=batch_vectorizer.dictionary)
modelPLSA.num_document_passes = 5
modelPLSA.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=30)
# print() call replaces the Python 2 print statement (same output either way).
print("===========================PLSA PerplexityScore start====================================")
def test_func():
    """End-to-end dump/restore test: train two ARTM models, dump each to disk,
    reload it, and check the reloaded model matches the original (params,
    scores, regularizers, score values, matrices) both immediately after
    loading and after additional training of both copies.
    """
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    dump_folder = tempfile.mkdtemp()
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        # model_1 caches theta under an explicit name; model_2 does not cache it.
        model_1 = artm.ARTM(num_processors=7,
                            cache_theta=True,
                            num_document_passes=5,
                            reuse_theta=True,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            theta_name='THETA',
                            dictionary=batch_vectorizer.dictionary)
        model_2 = artm.ARTM(num_processors=7,
                            cache_theta=False,
                            num_document_passes=5,
                            reuse_theta=False,
                            seed=10,
                            num_topics=15,
                            class_ids={'@default_class': 1.0},
                            dictionary=batch_vectorizer.dictionary)
        for model in [model_1, model_2]:
            model.scores.add(
                artm.PerplexityScore(name='perp',
                                     dictionary=batch_vectorizer.dictionary))
            model.scores.add(artm.SparsityThetaScore(name='sp_theta', eps=0.1))
            model.scores.add(artm.TopTokensScore(name='top_tok', num_tokens=10))
            model.scores.add(
                artm.SparsityPhiScore(name='sp_nwt',
                                      model_name=model.model_nwt))
            model.scores.add(
                artm.TopicKernelScore(name='kernel',
                                      topic_names=model.topic_names[0:5],
                                      probability_mass_threshold=0.4))
            # Random integer weights for every ordered pair of topics, fed to
            # the decorrelator regularizer below.
            topic_pairs = {}
            for topic_name_1 in model.topic_names:
                for topic_name_2 in model.topic_names:
                    if topic_name_1 not in topic_pairs:
                        topic_pairs[topic_name_1] = {}
                    topic_pairs[topic_name_1][
                        topic_name_2] = numpy.random.randint(0, 3)
            model.regularizers.add(
                artm.DecorrelatorPhiRegularizer(name='decor',
                                                tau=100000.0,
                                                topic_pairs=topic_pairs))
            model.regularizers.add(
                artm.SmoothSparsePhiRegularizer(
                    name='smsp_phi',
                    tau=-0.5,
                    gamma=0.3,
                    dictionary=batch_vectorizer.dictionary))
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name='smsp_theta',
                                                  tau=0.1,
                                                  doc_topic_coef=[2.0] *
                                                  model.num_topics))
            model.regularizers.add(
                artm.SmoothPtdwRegularizer(name='sm_ptdw', tau=0.1))
            # learn first model and dump it on disc
            model.fit_offline(batch_vectorizer, num_collection_passes=10)
            model.fit_online(batch_vectorizer, update_every=1)
            model.dump_artm_model(os.path.join(dump_folder, 'target'))
            params = {}
            with open(os.path.join(dump_folder, 'target', 'parameters.json'),
                      'r') as fin:
                params = json.load(fin)
            _assert_json_params(params)
            # create second model from the dump and check the results are equal
            model_new = artm.load_artm_model(
                os.path.join(dump_folder, 'target'))
            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)
            # continue learning of both models
            model.fit_offline(batch_vectorizer, num_collection_passes=3)
            model.fit_online(batch_vectorizer, update_every=1)
            model_new.fit_offline(batch_vectorizer, num_collection_passes=3)
            model_new.fit_online(batch_vectorizer, update_every=1)
            # check new results are also equal
            _assert_params_equality(model, model_new)
            _assert_scores_equality(model, model_new)
            _assert_regularizers_equality(model, model_new)
            _assert_score_values_equality(model, model_new)
            _assert_matrices_equality(model, model_new)
            # Remove the dump so the next model in the loop can reuse the path.
            shutil.rmtree(os.path.join(dump_folder, 'target'))
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(dump_folder)
def test_func():
    """Regression test: fit an ARTM model on the 'kos' collection and compare
    every tracked score (phi/theta sparsity, perplexity, top tokens, topic
    kernel, theta snippet) plus phi/theta shapes against reference values
    recorded from a known-good run.

    Fixes: ``xrange`` replaced with ``range`` (``xrange`` does not exist on
    Python 3; ``range`` iterates identically on Python 2, matching the sibling
    test below); a duplicate ``data_path`` assignment and a dead
    ``batch_vectorizer = None`` initialization removed.
    """
    # constants
    dictionary_name = 'dictionary'
    num_tokens = 11
    probability_mass_threshold = 0.9
    sp_reg_tau = -0.1
    decor_tau = 1.5e+5
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430
    data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    batches_folder = tempfile.mkdtemp()

    # Reference trajectories (one value per collection pass) and the
    # per-metric absolute tolerances used to compare against them.
    sp_zero_eps = 0.001
    sparsity_phi_value = [
        0.034, 0.064, 0.093, 0.120, 0.145, 0.170, 0.194, 0.220, 0.246, 0.277,
        0.312, 0.351, 0.390, 0.428, 0.464
    ]
    sparsity_theta_value = [0.0] * num_collection_passes
    perp_zero_eps = 2.0
    perplexity_value = [
        6873, 2590, 2685, 2578, 2603, 2552, 2536, 2481, 2419, 2331, 2235,
        2140, 2065, 2009, 1964
    ]
    top_zero_eps = 0.0001
    top_tokens_num_tokens = [num_tokens * num_topics] * num_collection_passes
    top_tokens_topic_0_tokens = [
        u'party', u'state', u'campaign', u'tax', u'political', u'republican',
        u'senate', u'candidate', u'democratic', u'court', u'president'
    ]
    top_tokens_topic_0_weights = [
        0.0209, 0.0104, 0.0094, 0.0084, 0.0068, 0.0067, 0.0065, 0.0058,
        0.0053, 0.0053, 0.0051
    ]
    ker_zero_eps = 0.01
    topic_kernel_topic_0_contrast = 0.96
    topic_kernel_topic_0_purity = 0.014
    topic_kernel_topic_0_size = 18.0
    topic_kernel_average_size = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13, 0.53, 1.6, 3.33, 7.13,
        12.067, 19.53, 27.8
    ]
    topic_kernel_average_contrast = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12, 0.25, 0.7, 0.96, 0.96, 0.96,
        0.96, 0.97
    ]
    topic_kernel_average_purity = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.015, 0.017, 0.02,
        0.03, 0.04, 0.05
    ]
    len_last_document_ids = 10
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        model = artm.ARTM(
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            cache_theta=True)
        # Old-style dictionary API: gather into a named dictionary, then
        # initialize the model from it by name.
        model.gather_dictionary(dictionary_name, batch_vectorizer.data_path)
        model.initialize(dictionary_name=dictionary_name)
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=sp_reg_tau))
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_tau))
        model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary_name=dictionary_name))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))
        model.scores.add(
            artm.TopicKernelScore(
                name='TopicKernelScore',
                probability_mass_threshold=probability_mass_threshold))
        model.scores.add(artm.ThetaSnippetScore(name='ThetaSnippetScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        # Per-pass score trajectories must match the recorded references.
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityThetaScore'].value[i] -
                       sparsity_theta_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perp_zero_eps
        for i in range(num_collection_passes):
            assert model.score_tracker['TopTokensScore'].num_tokens[
                i] == top_tokens_num_tokens[i]

        # Top tokens and weights of topic 0 after the final pass.
        for i in range(num_tokens):
            assert model.score_tracker['TopTokensScore'].last_tokens[
                model.topic_names[0]][i] == top_tokens_topic_0_tokens[i]
            assert abs(model.score_tracker['TopTokensScore'].last_weights[
                model.topic_names[0]][i] -
                top_tokens_topic_0_weights[i]) < top_zero_eps

        # Kernel characteristics of topic 0 and per-pass averages.
        assert len(model.score_tracker['TopicKernelScore'].last_tokens[
            model.topic_names[0]]) > 0
        assert abs(topic_kernel_topic_0_contrast -
                   model.score_tracker['TopicKernelScore'].last_contrast[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_purity -
                   model.score_tracker['TopicKernelScore'].last_purity[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_size -
                   model.score_tracker['TopicKernelScore'].last_size[
                       model.topic_names[0]]) < ker_zero_eps
        for i in range(num_collection_passes):
            assert abs(
                model.score_tracker['TopicKernelScore'].average_size[i] -
                topic_kernel_average_size[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_contrast[i] -
                topic_kernel_average_contrast[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_purity[i] -
                topic_kernel_average_purity[i]) < ker_zero_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)

        # Model info must reflect everything that was attached above.
        info = model.info
        assert info is not None
        assert len(info.config.topic_name) == num_topics
        assert len(info.score) == len(model.score_tracker)
        assert len(info.regularizer) == len(model.regularizers.data)
        assert len(info.cache_entry) > 0

        temp = model.score_tracker['ThetaSnippetScore'].last_document_ids
        assert len_last_document_ids == len(temp)
        assert len(model.score_tracker['ThetaSnippetScore'].last_snippet[
            temp[0]]) == num_topics

        phi = model.get_phi()
        assert phi.shape == (vocab_size, num_topics)
        theta = model.get_theta()
        assert theta.shape == (num_topics, num_docs)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Regression test for the modern (Dictionary-object) ARTM API.

    Fits a model on the 'kos' collection and compares every tracked score
    against reference values from a known-good run, then re-fits with a
    relative-coefficient Decorrelator regularizer (gamma=0.0) and checks its
    own reference trajectories.
    """
    # constants
    num_tokens = 11
    probability_mass_threshold = 0.9
    sp_reg_tau = -0.1
    decor_tau = 1.5e+5
    decor_rel_tau = 0.3  # tau used in relative-coefficient mode (gamma set below)
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906  # expected vocabulary size of the 'kos' collection
    num_docs = 3430  # expected number of documents in 'kos'
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    # Reference trajectories (one value per collection pass) and the
    # per-metric absolute tolerances used below.
    sp_zero_eps = 0.001
    sparsity_phi_value = [
        0.034, 0.064, 0.093, 0.120, 0.145, 0.170, 0.194, 0.220, 0.246, 0.277,
        0.312, 0.351, 0.390, 0.428, 0.464
    ]
    # References for the second (relative-coefficient) training run.
    sparsity_phi_rel_value = [
        0.442, 0.444, 0.444, 0.446, 0.448, 0.449, 0.458, 0.468, 0.476, 0.488,
        0.501, 0.522, 0.574, 0.609, 0.670
    ]
    sparsity_theta_value = [0.0] * num_collection_passes
    perp_zero_eps = 2.0
    perplexity_value = [
        6873, 2590, 2685, 2578, 2603, 2552, 2536, 2481, 2419, 2331, 2235,
        2140, 2065, 2009, 1964
    ]
    perplexity_rel_value = [
        6873, 2667, 2458, 2323, 2150, 2265, 2015, 1967, 1807, 1747, 1713,
        1607, 1632, 1542, 1469
    ]
    top_zero_eps = 0.0001
    top_tokens_num_tokens = [num_tokens * num_topics] * num_collection_passes
    top_tokens_topic_0_tokens = [
        u'party', u'state', u'campaign', u'tax', u'political', u'republican',
        u'senate', u'candidate', u'democratic', u'court', u'president'
    ]
    top_tokens_topic_0_weights = [
        0.0209, 0.0104, 0.0094, 0.0084, 0.0068, 0.0067, 0.0065, 0.0058,
        0.0053, 0.0053, 0.0051
    ]
    ker_zero_eps = 0.02
    topic_kernel_topic_0_contrast = 0.96
    topic_kernel_topic_0_purity = 0.014
    topic_kernel_topic_0_size = 18.0
    topic_kernel_average_size = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13, 0.6, 1.6, 3.53, 7.15, 12.6,
        20.4, 29.06
    ]
    topic_kernel_average_contrast = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12, 0.31, 0.7, 0.96, 0.96, 0.96,
        0.96, 0.97
    ]
    topic_kernel_average_purity = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.015, 0.017, 0.02,
        0.03, 0.04, 0.05
    ]
    len_last_document_ids = 10
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        # New-style API: a Dictionary object gathered from the batches, passed
        # to the model by its name.
        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)
        model = artm.ARTM(
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary=dictionary.name,
            cache_theta=True)
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=sp_reg_tau))
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_tau))
        model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))
        model.scores.add(
            artm.TopicKernelScore(
                name='TopicKernelScore',
                probability_mass_threshold=probability_mass_threshold))
        model.scores.add(artm.ThetaSnippetScore(name='ThetaSnippetScore'))
        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)
        # Per-pass score trajectories must match the recorded references.
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityThetaScore'].value[i] -
                       sparsity_theta_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perp_zero_eps
        for i in range(num_collection_passes):
            assert model.score_tracker['TopTokensScore'].num_tokens[
                i] == top_tokens_num_tokens[i]
        # Top tokens and weights of topic 0 after the final pass.
        for i in range(num_tokens):
            assert model.score_tracker['TopTokensScore'].last_tokens[
                model.topic_names[0]][i] == top_tokens_topic_0_tokens[i]
            assert abs(model.score_tracker['TopTokensScore'].last_weights[
                model.topic_names[0]][i] -
                top_tokens_topic_0_weights[i]) < top_zero_eps
        # Kernel characteristics of topic 0 and the per-pass averages.
        assert len(model.score_tracker['TopicKernelScore'].last_tokens[
            model.topic_names[0]]) > 0
        assert abs(topic_kernel_topic_0_contrast -
                   model.score_tracker['TopicKernelScore'].last_contrast[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_purity -
                   model.score_tracker['TopicKernelScore'].last_purity[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_size -
                   model.score_tracker['TopicKernelScore'].last_size[
                       model.topic_names[0]]) < ker_zero_eps
        for i in range(num_collection_passes):
            assert abs(
                model.score_tracker['TopicKernelScore'].average_size[i] -
                topic_kernel_average_size[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_contrast[i] -
                topic_kernel_average_contrast[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_purity[i] -
                topic_kernel_average_purity[i]) < ker_zero_eps
        model.fit_online(batch_vectorizer=batch_vectorizer)
        # Model info must reflect everything attached above.
        info = model.info
        assert info is not None
        assert len(info.config.topic_name) == num_topics
        assert len(info.score) >= len(model.score_tracker)
        assert len(info.regularizer) == len(model.regularizers.data)
        assert len(info.cache_entry) > 0
        temp = model.score_tracker['ThetaSnippetScore'].last_document_ids
        assert len_last_document_ids == len(temp)
        assert len(model.score_tracker['ThetaSnippetScore'].last_snippet[
            temp[0]]) == num_topics
        phi = model.get_phi()
        assert phi.shape == (vocab_size, num_topics)
        theta = model.get_theta()
        assert theta.shape == (num_topics, num_docs)
        assert model.library_version.count('.') == 2  # major.minor.patch
        # test relative coefficients for Phi matrix regularizers
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary.name,
                          cache_theta=False)
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_rel_tau))
        # gamma=0.0 switches the regularizer to relative-coefficient mode.
        model.regularizers['DecorrelatorPhi'].gamma = 0.0
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_rel_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_rel_value[i]) < perp_zero_eps
    finally:
        shutil.rmtree(batches_folder)
def model_train(batches_folder, models_folder_name, perform_actualize, tm_index,
                regularization_params, name, name_translit, index_tm):
    """Train a BigARTM topic model (or load a previously trained one) and
    publish the resulting topics and quality metrics to Elasticsearch.

    Args:
        batches_folder: folder with pre-built BigARTM batches.
        models_folder_name: subfolder of BASE_DAG_DIR for serialized models.
        perform_actualize: when True, skip training and load the saved model.
        tm_index: index descriptor; provides number_of_topics and meta.id.
        regularization_params: dict of tau values keyed by regularizer class name.
        name: model name; "scopus" models get a cached on-disk dictionary.
        name_translit: optional transliterated name used for the model file.
        index_tm: ES index to write the topic/metric document into.

    Returns:
        tuple: (model_artm, batch_vectorizer)

    Fixed: the check-then-create ``os.path.exists``/``os.mkdir`` pair was
    replaced with ``os.makedirs(..., exist_ok=True)``, which is race-free and
    also creates missing parent directories.
    """
    # Imports are kept local: this function appears to run as a task on a
    # worker (Airflow-style DAG) where module-level imports may be unavailable.
    import artm
    import os
    import datetime
    import numpy as np
    from util.constants import BASE_DAG_DIR
    from nlpmonitor.settings import ES_CLIENT

    print("Initializing vectorizer, model")
    batch_vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                            data_format='batches')
    model_folder = os.path.join(BASE_DAG_DIR, models_folder_name)
    # Serialized-model path; the transliterated name wins when present.
    model_save_path = os.path.join(
        model_folder,
        f"model_{name if not name_translit else name_translit}.model")
    model_artm = artm.ARTM(num_topics=tm_index.number_of_topics,
                           class_ids={"text": 1},
                           theta_columns_naming="title",
                           reuse_theta=True,
                           cache_theta=True,
                           num_processors=4)
    if not perform_actualize:
        dictionary = artm.Dictionary()
        cached_dict_path = os.path.join("/big_data/", "scopus250k.dict")
        # "scopus" dictionaries are expensive to gather, so they are cached
        # on disk and reused between runs.
        if "scopus" in name and os.path.exists(cached_dict_path):
            print("Loading dictionary")
            dictionary.load(cached_dict_path)
        else:
            print("Gathering dictionary")
            dictionary.gather(batch_vectorizer.data_path,
                              symmetric_cooc_values=True)
            print("Filtering dictionary")
            dictionary.filter(max_dictionary_size=250_000)
            if "scopus" in name and not os.path.exists(cached_dict_path):
                print("Saving dictionary")
                dictionary.save(cached_dict_path)
        print("Model - initial settings")
        model_artm.initialize(dictionary)
        # Add scores
        model_artm.scores.add(artm.PerplexityScore(name='PerplexityScore'))
        model_artm.scores.add(
            artm.TopicKernelScore(name='TopicKernelScore',
                                  class_id='text',
                                  probability_mass_threshold=0.3))
        # Regularize
        model_artm.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='SparseTheta',
                tau=regularization_params['SmoothSparseThetaRegularizer']))
        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(
                name='SparsePhi',
                tau=regularization_params['SmoothSparsePhiRegularizer']))
        model_artm.regularizers.add(
            artm.DecorrelatorPhiRegularizer(
                name='DecorrelatorPhi',
                tau=regularization_params['DecorrelatorPhiRegularizer']))
        model_artm.regularizers.add(
            artm.ImproveCoherencePhiRegularizer(
                name='ImproveCoherencePhi',
                tau=regularization_params['ImproveCoherencePhiRegularizer']))
        print("!!!", "Start model train", datetime.datetime.now())
        # Fit model
        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=10)
        # Race-free directory creation (also builds missing parents).
        os.makedirs(model_folder, exist_ok=True)
        model_artm.save(model_save_path)
        print("!!!", "Get topics", datetime.datetime.now())
        # Create topics in ES: for every topic keep the 100 heaviest words
        # above a small probability cutoff.
        topics = []
        phi = model_artm.get_phi()
        for topic in phi:
            phi_filtered = phi[phi[topic] > 0.0001]
            topic_words = [{
                "word": ind[1],
                "weight": float(phi[topic][ind])
            } for ind in phi_filtered[topic].index]
            topic_words = sorted(topic_words,
                                 key=lambda x: x['weight'],
                                 reverse=True)[:100]
            topics.append({
                "id": topic,
                "topic_words": topic_words,
                # Human-readable label from the five heaviest words.
                "name": ", ".join([w['word'] for w in topic_words[:5]])
            })
        # Add metrics (averaged over topics where applicable).
        purity = np.mean(
            model_artm.score_tracker['TopicKernelScore'].last_average_purity)
        contrast = np.mean(
            model_artm.score_tracker['TopicKernelScore'].last_average_contrast)
        coherence = np.mean(
            model_artm.score_tracker['TopicKernelScore'].average_coherence)
        perplexity = model_artm.score_tracker['PerplexityScore'].last_value
        print("!!!", "Write topics", datetime.datetime.now())
        update_body = {
            "topics": topics,
            "purity": purity,
            "contrast": contrast,
            "coherence": coherence,
            "perplexity": perplexity,
            "tau_smooth_sparse_theta":
                regularization_params['SmoothSparseThetaRegularizer'],
            "tau_smooth_sparse_phi":
                regularization_params['SmoothSparsePhiRegularizer'],
            "tau_decorrelator_phi":
                regularization_params['DecorrelatorPhiRegularizer'],
            "tau_coherence_phi":
                regularization_params['ImproveCoherencePhiRegularizer'],
        }
        ES_CLIENT.update(index=index_tm,
                         id=tm_index.meta.id,
                         body={"doc": update_body})
    else:
        print("!!!", "Loading existing model")
        # Monkey patching stupid BigARTM bug
        model_artm.load = load_monkey_patch
        model_artm.load(model_artm, model_save_path)
    return model_artm, batch_vectorizer