def fit():
    """Flask handler: build an ARTM batch from the request JSON, fit a model, save it.

    Expects a JSON body with:
        terms:  list of documents, each a list of string tokens
        topics: number of topics to fit

    Side effects: creates a working directory named by a fresh UUID containing
    the serialized batch and the saved model.
    Returns JSON {"id": <batch_id>}.
    """
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)
    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']
    # Build a single protobuf batch: one item per document, unit counts per token.
    # BUG FIX: the original instantiated artm.messages.Batch() twice; the first
    # object was dead and immediately discarded.
    term_to_id = {}
    all_terms = []
    batch = artm.messages.Batch()
    batch.id = batch_id
    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)
    for t in all_terms:
        batch.token.append(t)
    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())
    app.logger.info("batch %s is created", batch_id)
    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)
    # range() instead of xrange(): identical iteration, py2/py3-compatible.
    model_artm = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in range(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)
    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")
    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    # BUG FIX: log message typo "mode was fitted" -> "model was fitted".
    app.logger.info("model was fitted")
    model_artm.save(os.path.join(batch_id, "model"))
    return jsonify({"id": batch_id})
def cluster_artm(text):
    """Fit a two-stage ARTM model on a vowpal-wabbit file and log top tokens.

    text: path to the vowpal_wabbit-formatted collection.
    Side effect: appends 'topic: word word ...' lines to cluster_log_artm.txt.
    """
    batch_vectorizer = artm.BatchVectorizer(data_path=text,
                                            data_format='vowpal_wabbit',
                                            target_folder='batch_small',
                                            batch_size=20)
    T = 10  # number of topics
    subject_topics = ["sbj" + str(i) for i in range(T - 1)]
    topic_names = subject_topics + ["bcg"]
    model_artm = artm.ARTM(num_topics=T,
                           topic_names=topic_names,
                           reuse_theta=True,
                           num_document_passes=1)
    np.random.seed(1)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer.data_path)
    model_artm.initialize(dictionary)
    model_artm.scores.add(artm.TopTokensScore(name='metric1', num_tokens=15))
    # Stage 1: smooth the single background topic so it soaks up common words.
    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='smoothing',
                                        dictionary=dictionary,
                                        topic_names='bcg',
                                        tau=1e5))
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=6)
    # Stage 2: sparse the subject topics.
    # BUG FIX: the original used range(0, 29), addressing topics sbj9..sbj28
    # that do not exist (only T-1 = 9 subject topics are defined above).
    model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='stimulates',
                                        dictionary=dictionary,
                                        topic_names=subject_topics,
                                        tau=-1e5))
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=6)
    for topic_name in model_artm.topic_names:
        with open('cluster_log_artm.txt', 'a') as f_in:
            f_in.write(topic_name + ':')
            for word in model_artm.score_tracker["metric1"].last_tokens[topic_name]:
                f_in.write(word + ' ')
            f_in.write('\n')
def init_model(self, dictionary_path=None):
    """Build self.dictionary and self.model with standard scores/regularizers.

    dictionary_path: optional path to a saved text dictionary (used with a
    pretrained model). When omitted, the dictionary is gathered from the
    dataset batches, filtered, and saved under the dataset's dicts folder.
    """
    self.dictionary = artm.Dictionary()
    if dictionary_path is not None:
        self.dictionary.load_text(dictionary_path)
    else:
        self.dictionary.gather(data_path=self.batches_path)
        self.dictionary.filter(min_tf=10, max_df_rate=0.1)
        self.dictionary.save_text(
            f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")

    self.model = artm.ARTM(
        num_topics=self.n_topics,
        dictionary=self.dictionary,
        show_progress_bars=True,
    )

    # Track perplexity and phi/theta sparsity during fitting.
    for score in (
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=self.dictionary),
            artm.SparsityThetaScore(name="SparsityThetaScore"),
            artm.SparsityPhiScore(name="SparsityPhiScore"),
    ):
        self.model.scores.add(score)

    # Sparse phi/theta plus decorrelation — a standard ARTM regularizer trio.
    for regularizer in (
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1),
            artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5),
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5),
    ):
        self.model.regularizers.add(regularizer)
def get_dictionary(self) -> artm.Dictionary:
    """
    Gets dataset's dictionary, building and caching it on first use.

    Returns
    -------
    artm.Dictionary
    """
    if self._cached_dict is not None:
        return self._cached_dict

    dictionary = artm.Dictionary()
    same_collection, path_to_collection = self._check_collection()
    if not same_collection:
        # Collection changed: rebuild batches, re-gather, replace the saved file.
        _ = self.get_batch_vectorizer()
        dictionary.gather(data_path=self._batches_folder_path)
        if os.path.isfile(self._dictionary_file_path):
            os.remove(self._dictionary_file_path)
        dictionary.save(dictionary_path=self._dictionary_file_path)
    elif not os.path.isfile(self._dictionary_file_path):
        # Same collection but no saved dictionary yet: gather and persist it.
        dictionary.gather(data_path=self._batches_folder_path)
        dictionary.save(dictionary_path=self._dictionary_file_path)

    dictionary.load(dictionary_path=self._dictionary_file_path)
    self._cached_dict = dictionary
    return self._cached_dict
def dictionary_initialization(model_artm, batches_dir, min_df, max_tf):
    """Gather a dictionary from *batches_dir*, filter it, and initialize the model.

    Returns the (model, dictionary) pair.
    """
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batches_dir)
    dictionary.filter(min_df=min_df, max_tf=max_tf)
    model_artm.initialize(dictionary)
    return model_artm, dictionary
def select_from_corpus(self, list_of_files: List[str],
                       preprocessor: BaseTextPreprocessor,
                       spacy_nlp: Language) -> List[str]:
    """Select keywords from a corpus via a (possibly cached) topic model.

    Validates self.topic_model_name as a file-system path, then lazily creates
    each artifact only if it is missing on disk: the BOW-UCI collection files,
    the ARTM batches, the dictionary, and finally the topic model itself.

    Raises ValueError when the model name/path is invalid or when the topic
    model can neither be loaded nor created.
    """
    topic_model_name = os.path.normpath(self.topic_model_name.strip())
    if len(topic_model_name) == 0:
        raise ValueError('A topic model name is empty!')
    dir_name = os.path.dirname(topic_model_name)
    base_name = os.path.basename(topic_model_name)
    if len(dir_name) == 0:
        dir_name = os.path.curdir
    if len(base_name) == 0:
        raise ValueError(
            '`{0}` is incorrect name for a topic model! Base name of file is empty!'
            .format(self.topic_model_name))
    if not os.path.isdir(dir_name):
        raise ValueError(
            '`{0}` is incorrect name for a topic model! Directory `{1}` does not exist!'
            .format(self.topic_model_name, dir_name))
    # Paths of the BOW-UCI collection artifacts derived from the model name.
    collection_name = os.path.normpath(
        os.path.join(dir_name, base_name + '.collection'))
    collection_docword_name = os.path.normpath(
        os.path.join(dir_name, 'docword.' + base_name + '.collection'))
    collection_vocab_name = os.path.normpath(
        os.path.join(dir_name, 'vocab.' + base_name + '.collection'))
    # (Re)create the docword/vocab files if either one is missing.
    if (not os.path.isfile(collection_docword_name)) or (
            not os.path.isfile(collection_vocab_name)):
        self.create_collection_as_bow_uci(list_of_files, preprocessor,
                                          spacy_nlp, collection_docword_name,
                                          collection_vocab_name)
    batches_path = os.path.normpath(
        os.path.join(dir_name, base_name + '.data_batches'))
    # Reuse existing batches; otherwise vectorize the BOW-UCI collection.
    if os.path.isdir(batches_path):
        batch_vectorizer = artm.BatchVectorizer(data_path=batches_path,
                                                data_format='batches')
    else:
        batch_vectorizer = artm.BatchVectorizer(
            data_path=dir_name,
            data_format='bow_uci',
            collection_name=collection_name,
            target_folder=batches_path)
    # Load the dictionary from disk if saved, otherwise gather and save it.
    dictionary = artm.Dictionary()
    dictionary_name = os.path.normpath(topic_model_name + '.dictionary')
    if os.path.isfile(dictionary_name):
        dictionary.load(dictionary_name)
    else:
        dictionary.gather(data_path=batches_path)
        dictionary.save(dictionary_name)
    # Try to load a pretrained model; fall back to training a fresh one.
    topic_model = self.load_topic_model(
        artm.ARTM(num_topics=self.number_of_topics,
                  dictionary=dictionary,
                  cache_theta=False), topic_model_name)
    if topic_model is None:
        topic_model = self.create_topic_model(topic_model_name,
                                              batch_vectorizer, dictionary)
    if topic_model is None:
        raise ValueError(
            'The trained topic model cannot be loaded from the file `{0}`!'
            .format(topic_model_name))
    return self.select_keywords_from_topic_model(topic_model)
def test_func():
    """hARTM smoke test: fit a two-level hierarchy on the 'kos' collection,
    then check phi shape and the psi support value."""
    # constants
    num_tokens = 15
    parent_level_weight = 1
    num_collection_passes = 15
    num_document_passes = 10
    num_topics_level0 = 15
    num_topics_level1 = 50
    regularizer_tau = 10 ** 5
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    parent_batch_folder = tempfile.mkdtemp()
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        hier = artm.hARTM(dictionary=dictionary,
                          cache_theta=True,
                          num_document_passes=num_document_passes)
        level0 = hier.add_level(num_topics=num_topics_level0)
        level0.initialize(dictionary=dictionary)
        level0.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=num_collection_passes)

        hier.tmp_files_path = parent_batch_folder
        level1 = hier.add_level(num_topics=num_topics_level1,
                                parent_level_weight=parent_level_weight)
        level1.initialize(dictionary=dictionary)
        level1.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name="HierSp",
                                                   tau=regularizer_tau))
        level1.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=num_collection_passes)

        phi = hier.get_level(1).get_phi()
        assert phi.shape == (vocab_size, num_topics_level1)

        # theta = hier.get_level(1).get_theta()
        # assert theta.shape == (num_topics_level1, num_docs)

        psi = hier.get_level(1).get_psi()
        support = psi.values.max(axis=1).min()
        # This test gives different results on python27 and python35. Authors need to investigate.
        # BUG FIX: the closing parenthesis was misplaced — the original computed
        # abs(support - X < eps), i.e. abs() of a boolean, which made the final
        # assertion vacuously true whenever either comparison was False/True.
        on_python_27 = abs(support - 0.0978) < zero_eps
        on_python_35 = abs(support - 0.1522) < zero_eps
        assert on_python_27 or on_python_35
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(parent_batch_folder)
def create_and_learn_ARTM_decorPhi_modal(name="",
                                         topic_number=750,
                                         num_collection_passes=1,
                                         weigths=None,
                                         decorTau=1.0):
    """Train a multimodal ARTM model with phi decorrelation over extra modalities.

    name: collection file name (vowpal_wabbit format) in the current directory.
    topic_number: number of topics.
    num_collection_passes: offline passes over the collection.
    weigths: modality weights for '@text', '@first', '@second', '@third';
        defaults to equal weights [1., 1., 1., 1.]. (Parameter name kept
        as-is, misspelling included, for backward compatibility.)
    decorTau: tau for the decorrelator regularizer on the non-text modalities.

    Returns (model, theta_train).
    """
    # BUG FIX: the original used a mutable list as the default value of
    # `weigths`; such a default is shared across calls. Use a None sentinel.
    if weigths is None:
        weigths = [1., 1., 1., 1.]

    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weigths[0],
                          '@first': weigths[1],
                          '@second': weigths[2],
                          '@third': weigths[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    # Decorrelate topics within the non-text modalities only.
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))
    model.initialize(dictionary=dictionary)

    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1
    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)
    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)
    return model, theta_train
def get_batches(self):
    """Load this dataset's pre-built batches and its text dictionary.

    Returns (batch_vectorizer, dictionary).
    """
    batches_folder = os.path.join(settings.DATA_DIR, "datasets",
                                  self.text_id, "batches")
    vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                      data_format="batches")
    dictionary = artm.Dictionary(name="dictionary")
    dictionary.load_text(os.path.join(batches_folder, "dictionary.txt"))
    return vectorizer, dictionary
def __init__(self, source_file, batches_folder='batches', batch_size=100):
    """Vectorize *source_file* (vowpal_wabbit) into batches and set up a dictionary.

    The dictionary is loaded from batches_folder/dictionary.dict when that file
    exists; otherwise it is gathered from the batches and saved there.
    """
    self.source_file = source_file
    self.batches_folder = batches_folder
    self.batch_vectorizer = artm.BatchVectorizer(
        data_path=self.source_file,
        data_format="vowpal_wabbit",
        target_folder=self.batches_folder,
        batch_size=batch_size)
    self.dictionary = artm.Dictionary()
    dictionary_file = os.path.join(self.batches_folder, "dictionary.dict")
    if os.path.exists(dictionary_file):
        self.dictionary.load(dictionary_file)
    else:
        self.dictionary.gather(batches_folder)
        self.dictionary.save(dictionary_file)
def _compute_lift(
    self,
    phi: pd.DataFrame,
    chosen_words_array: List[pd.Index] = None,
):
    """Compute log-lift of topic-word probabilities vs. corpus frequencies.

    phi: Phi matrix indexed by (modality, token), one column per topic.
    chosen_words_array: optional list of per-topic word indices; when given
        (and non-empty after intersecting with the dictionary), returns a
        pd.Series of summed log-lift per topic, otherwise the full log-lift
        DataFrame.
    """
    # inspired by gist.github.com/jrnold/daa039f02486009a24cf3e83403dabf0
    # BUG FIX: the original crashed with TypeError when chosen_words_array
    # was left at its None default; treat None as "no word selection".
    if chosen_words_array is None:
        chosen_words_array = []
    artm_dict = artm.Dictionary(dictionary_path=self._dict_path)
    dict_df = artm_dict2df(artm_dict).query("class_id in @self.modalities")
    # TODO: this is possible to do using aggregate / join and stuff
    for m in self.modalities:
        subdf = dict_df.query("class_id == @m")
        idx = subdf.index
        # theoretically, token_freq is an unnecessary duplicate of token_value;
        # in practice, we have float32 errors and also user could run dictionary
        # filtering without setting recalculate_value=True
        dict_df.loc[idx, 'token_freq'] = (
            dict_df.loc[idx, 'token_tf'] / subdf.token_tf.sum())
    dict_df.set_index(["class_id", "token"], inplace=True)
    dict_df.index.names = ['modality', 'token']
    dict_df.sort_index(inplace=True)
    phi.sort_index(inplace=True)

    known_chosen_words_array = [
        words.intersection(dict_df.index) for words in chosen_words_array
    ]
    if known_chosen_words_array:
        merged_index = reduce(
            lambda idx1, idx2: idx1.union(idx2), known_chosen_words_array
        )
        chosen_words = merged_index.drop_duplicates()
        dict_df = dict_df.loc[chosen_words]
        phi = phi.loc[chosen_words]

    data = np.log(phi.values) - np.log(dict_df[['token_freq']].values)
    # BUG FIX: `data` is already log(phi) - log(freq), i.e. the log-lift;
    # the original wrapped it in a second np.log(), which produced NaN for
    # every word whose lift was below 1.
    log_lift = pd.DataFrame(data=data, index=phi.index, columns=phi.columns)
    if not known_chosen_words_array:
        return log_lift

    result = []
    for t, words in zip(phi.columns, known_chosen_words_array):
        result.append(log_lift.loc[words, t].sum())
    log_lift_total = pd.Series(data=result, index=phi.columns)
    return log_lift_total
def run(): print 'BigARTM version ', artm.version(), '\n\n\n' preprocessing_for_artm(True) topics = 10 batch_vectorizer = artm.BatchVectorizer( data_path="/home/goncharoff/PythonLab/labs/labs/lab5/result/result.txt", data_format="vowpal_wabbit", target_folder="batch_vectorizer_target_folder", batch_size=10) topic_names = ["topic#1" + str(i) for i in range(topics - 1)] + ["bcg"] dictionary = artm.Dictionary("dictionary") dictionary.gather(batch_vectorizer.data_path) artm_plsa(batch_vectorizer, topics, topic_names, dictionary) artm_lda(batch_vectorizer, topics, dictionary)
def run(): print 'BigARTM version ', artm.version(), '\n\n\n' preprocessing_for_artm(True) topics = 10 batch_vectorizer = artm.BatchVectorizer( data_path="../data/lenta.txt", data_format="vowpal_wabbit", target_folder="batch_vectorizer_target_folder", batch_size=10) topic_names = ["topic#1" + str(i) for i in range(topics - 1)] + ["bcg"] dictionary = artm.Dictionary("dictionary") dictionary.gather(batch_vectorizer.data_path) artm_plsa(batch_vectorizer, topics, topic_names, dictionary) artm_lda(batch_vectorizer, topics, dictionary) subprocess.call(['./clear.sh'])
def test_func():
    """TopicSelection regularizer test on the 'kos' collection: checks the
    number of zeroed topics and the per-pass perplexity trajectory."""
    topic_selection_tau = 0.5
    num_collection_passes = 3
    num_document_passes = 10
    num_topics = 15

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    perplexity_eps = 0.1
    perplexity_value = [
        6676.941798754971, 2534.963709464024, 2463.1544861984794
    ]

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary(data_path=batches_folder)
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary,
                          num_document_passes=num_document_passes)
        model.regularizers.add(
            artm.TopicSelectionThetaRegularizer(name='TopicSelection',
                                                tau=topic_selection_tau))
        model.scores.add(artm.PerplexityScore(name='PerplexityScore'))
        model.scores.add(
            artm.TopicMassPhiScore(name='TopicMass',
                                   model_name=model.model_nwt))
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        # Count topics zeroed out by the TopicSelection regularizer.
        num_zeroed = sum(mass == 0
                         for mass in model.get_score('TopicMass').topic_mass)
        assert 8 == num_zeroed

        # the following assertion fails on travis-ci builds, but passes locally
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perplexity_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)
    finally:
        shutil.rmtree(batches_folder)
def create_thematic_model(checked_list, num_topics, num_tokens, phi_tau,
                          theta_tau, decorr_tau):
    """Create a thematic model over the glued bag-of-words collection.

    Returns a 4-tuple: (last perplexity, last phi sparsity,
    last theta sparsity, OrderedDict of topic -> ['token-weight', ...]).
    """
    gluing_bag_of_words(checked_list)
    n_docs = len(checked_list)

    batch_vectorizer = artm.BatchVectorizer(data_path=COLLECTION_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER,
                                            batch_size=n_docs)
    dictionary = artm.Dictionary(data_path=TARGET_FOLDER)

    regularizers = [
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                        tau=phi_tau),
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                          tau=theta_tau),
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                        tau=decorr_tau),
    ]
    scores = [
        artm.PerplexityScore(name='perplexity_score', dictionary=dictionary),
        artm.SparsityPhiScore(name='sparsity_phi_score'),
        artm.SparsityThetaScore(name='sparsity_theta_score'),
        artm.TopTokensScore(name='top_tokens_score', num_tokens=num_tokens),
    ]
    model = artm.ARTM(num_topics=num_topics,
                      num_document_passes=n_docs,
                      dictionary=dictionary,
                      regularizers=regularizers,
                      scores=scores)
    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=n_docs)

    # Collect 'token-weight' strings per topic from the top-tokens tracker.
    top_tokens = model.score_tracker['top_tokens_score']
    topic_dictionary = OrderedDict()
    for topic_name in model.topic_names:
        labeled = [
            token + '-' + str(round(weight, 3))
            for token, weight in zip(top_tokens.last_tokens[topic_name],
                                     top_tokens.last_weights[topic_name])
        ]
        topic_dictionary[str(topic_name)] = labeled

    tracker = model.score_tracker
    return (tracker['perplexity_score'].last_value,
            tracker['sparsity_phi_score'].last_value,
            tracker['sparsity_theta_score'].last_value,
            topic_dictionary)
def init_model(self, params_string, dict_path):
    """Parse the parameter string, load a text dictionary, and build the ARTM model.

    The model holds self.S specific topics plus self.B background topics.
    """
    self.set_params(params_string)
    self.back = ['back{}'.format(i) for i in range(self.B)]

    self.dictionary = artm.Dictionary()
    self.dictionary.load_text(dictionary_path=dict_path)

    self.model = artm.ARTM(num_topics=self.S + self.B,
                           class_ids=['@default_class'],
                           dictionary=self.dictionary,
                           show_progress_bars=False,
                           topic_names=self.specific + self.back,
                           num_processors=32)
    self.set_scores()
def create_and_learn_PLSA(name="", topic_number=750, num_collection_passes=1):
    """Train a multimodal PLSA-style ARTM model (no regularizers).

    name: collection file name (vowpal_wabbit format) in the current directory.
    Returns (model_plsa, theta_train).
    """
    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)

    model_plsa = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in range(topic_number)],
        class_ids={'@text': 1.0, '@first': 1.0, '@second': 1.0, '@third': 1.0},
        cache_theta=True,
        theta_columns_naming='title',
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ])
    model_plsa.initialize(dictionary=dictionary)

    # Quality scores tracked during fitting.
    for score in (artm.SparsityPhiScore(name='SparsityPhiScore'),
                  artm.SparsityThetaScore(name='SparsityThetaScore'),
                  artm.TopicKernelScore(name='TopicKernelScore',
                                        class_id='@text',
                                        probability_mass_threshold=0.3),
                  artm.TopTokensScore(name='TopTokensScore',
                                      num_tokens=6,
                                      class_id='@text')):
        model_plsa.scores.add(score)

    model_plsa.num_document_passes = 1
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer_train,
                           num_collection_passes=num_collection_passes)
    theta_train = model_plsa.transform(batch_vectorizer=batch_vectorizer_train)
    return model_plsa, theta_train
def main(): print artm.version() config = ConfigPaths('config.cfg') plot_maker = PlotMaker() printer = PrintHelper() print config.models_file_name batch_vectorizer = artm.BatchVectorizer( data_path=config.output_batches_path, data_format='batches') dictionary = artm.Dictionary() dictionary.load(dictionary_path=config.dictionary_path + '.dict') models_file = open(config.models_file_name, 'a') # model = process_one_model(config, batch_vectorizer, models_file, printer, plot_maker, # dictionary, _n_topics=50, _n_doc_passes=5, _seed_value=100, _n_top_tokens=10, _p_mass_threshold=0.25, # _n_iterations=20, _model_name='model1') exp = Experiment( Pool(topics_filter=OptimizationTopicsFilter(eps=10**(-2.5), verbose=False), save_topics=True)) for i in xrange(3): model_artm = process_one_model(config, batch_vectorizer, models_file, printer, plot_maker, dictionary, _n_topics=50, _n_doc_passes=5, _seed_value=100, _n_top_tokens=10, _p_mass_threshold=0.25, _n_iterations=20, _model_name='model_{}'.format(i)) #display_points(model_artm.get_phi()) exp.collect_topics(model_artm.get_phi(), model_artm.get_theta()) vals, bins = exp.topics_pool.topics_filter.plot_hist() save_hist(vals, bins, "data_iter_{}.csv".format(i)) print exp.topics_pool.get_basic_topics_count() # models_file.close()
def build_model(self, d_dir, n_document_passes=1):
    """Create train/test batch vectorizers under *d_dir* and an ARTM model with
    sparsity scores plus gram3 regularizers.

    Returns (model, batch_vectorizer_train, batch_vectorizer_test).
    """
    batch_vectorizer_train = artm.BatchVectorizer(
        data_path=os.path.join(d_dir, 'data_batches_train'),
        data_format="batches")
    batch_vectorizer_test = artm.BatchVectorizer(
        data_path=os.path.join(d_dir, 'data_batches_test'),
        data_format="batches")

    dictionary = artm.Dictionary()
    dictionary.gather(data_path=os.path.join(d_dir, 'for_dict'))

    model = artm.ARTM(num_topics=self.n_topics,
                      dictionary=dictionary,
                      cache_theta=True,
                      reuse_theta=True)

    # Sparsity of p(c|t) and p(w|t) for the two modalities.
    model.scores.add(artm.SparsityPhiScore(eps=EPS,
                                           name='SparsityPhiScoreC',
                                           class_id=self.c))
    model.scores.add(artm.SparsityPhiScore(eps=EPS,
                                           name='SparsityPhiScoreGram3',
                                           class_id=self.gram3))

    # Sparse and decorrelated p(gram3|t).
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhiGram3Regularizer',
                                        class_ids=[self.gram3]))
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhiGram3Regularizer',
                                        class_ids=[self.gram3]))

    model.num_document_passes = n_document_passes
    return (model, batch_vectorizer_train, batch_vectorizer_test)
def read_collection(target_folder, vw_name):
    """Return (batch_vectorizer, dictionary) for a collection, creating batches
    and the cached dictionary file on first use."""
    have_batches = len(glob.glob(os.path.join(target_folder, '*.batch'))) >= 1
    if have_batches:
        batch_vectorizer = artm.BatchVectorizer(data_path=target_folder,
                                                data_format='batches')
    else:
        batch_vectorizer = artm.BatchVectorizer(data_path=vw_name,
                                                data_format='vowpal_wabbit',
                                                target_folder=target_folder)

    dictionary = artm.Dictionary()
    dict_path = os.path.join(target_folder, 'dict.dict')
    if not os.path.isfile(dict_path):
        dictionary.gather(data_path=batch_vectorizer.data_path)
        dictionary.save(dictionary_path=dict_path)
    dictionary.load(dictionary_path=dict_path)
    return batch_vectorizer, dictionary
def get_dictionary(self, batch_vectorizer_path=None):
    """ Get dictionary.

    Builds (or loads) the artm.Dictionary for the batches at
    *batch_vectorizer_path* and caches it on the instance.

    Parameters
    ----------
    batch_vectorizer_path : str (Default value = None)
        Falls back to self._batch_vectorizer_path when omitted.

    Returns
    -------
    dictionary : artm.Dictionary
    """
    if self._cached_dict is not None:
        return self._cached_dict
    if batch_vectorizer_path is None:
        batch_vectorizer_path = self._batch_vectorizer_path

    dictionary = artm.Dictionary()
    dict_path = os.path.join(batch_vectorizer_path, 'dict.dict')
    same_collection, path_to_collection = self._check_collection(
        batch_vectorizer_path
    )
    # CLEANUP: the original duplicated the gather/save/load/cache sequence in
    # both branches; only the "need to (re)build?" decision actually differs.
    if not same_collection:
        # Collection changed: rebuild batches and overwrite the stored dict.
        _ = self.get_batch_vectorizer(batch_vectorizer_path)
        dictionary.gather(data_path=batch_vectorizer_path)
        dictionary.save(dictionary_path=dict_path)
    elif not os.path.isfile(dict_path):
        dictionary.gather(data_path=batch_vectorizer_path)
        dictionary.save(dictionary_path=dict_path)

    dictionary.load(dictionary_path=dict_path)
    self._cached_dict = dictionary
    return self._cached_dict
class DatasetCollection(object):
    # NOTE(review): this class relies on attr.ib fields, which presumes an
    # @attr.s decorator above this definition (not visible here) — confirm.

    # Path to the dataset directory; validated by _id_dir.
    dir_path = attr.ib(init=True, converter=str, validator=_id_dir, repr=True)
    allowed_modality_names = attr.ib(
        init=True, default=['@labels_class', '@ideology_class'])
    # Derived fields: computed from dir_path after construction via attr.Factory.
    name = attr.ib(init=False, default=attr.Factory(
        lambda self: path.basename(self.dir_path), takes_self=True))
    vocab_file = attr.ib(init=False, default=attr.Factory(lambda self: path.join(
        self.dir_path, 'vocab.{}.txt'.format(self.name)), takes_self=True))
    # BigARTM dictionary named after the dataset; populated in __attrs_post_init__.
    lexicon = attr.ib(init=False, default=attr.Factory(
        lambda self: artm.Dictionary(name=self.name), takes_self=True))
    doc_labeling_modality_name = attr.ib(init=False, default='')
    class_names = attr.ib(init=False, default=[], validator=_class_names)
    # nb_docs = attr.ib(init=False, default=attr.Factory(lambda self: _file_len(path.join(self.dir_path, 'vowpal.{}.txt'.format(self.name))), takes_self=True))
    # First 'ppmi_*tf.txt' co-occurrence file found in the dataset directory.
    ppmi_file = attr.ib(init=False, default=attr.Factory(lambda self: self._cooc_tf(),
                                                         takes_self=True))

    def __attrs_post_init__(self):
        # Gather the dictionary from the dataset dir, attaching the vocab and
        # symmetric co-occurrence (ppmi) data.
        self.lexicon.gather(data_path=self.dir_path,
                            cooc_file_path=self.ppmi_file,
                            vocab_file_path=self.vocab_file,
                            symmetric_cooc_values=True)

    def _cooc_tf(self):
        # Locate the tf-scheme ppmi file; required for dictionary gathering.
        c = glob('{}/ppmi_*tf.txt'.format(self.dir_path))
        if not c:
            raise RuntimeError(
                "Did not find any 'ppmi' (computed with simple 'tf' scheme) files in dataset directory '{}'"
                .format(self.dir_path))
        return c[0]
def test_func():
    """Test BitermsPhiRegularizer on a tiny hand-built two-batch collection.

    Builds two protobuf batches over vocabulary {A, B, C, D}, writes a vocab
    file and a co-occurrence file, gathers a dictionary with cooc data, then
    checks that model initialization is unchanged and that one offline pass
    with the (zero-tau) biterms regularizer reproduces the reference phi.
    """
    biterms_tau = 0.0
    num_collection_passes = 1
    num_document_passes = 1
    num_topics = 3
    phi_first_elem = 0.2109  # check that initialization had not changed
    phi_eps = 0.0001
    batches_folder = tempfile.mkdtemp()
    vocab_file_name = os.path.join(batches_folder, 'vocab.txt')
    cooc_file_name = cooc_file_path = os.path.join(batches_folder,
                                                   'cooc_data.txt')
    # Reference phi matrix (4 tokens x 3 topics) after one offline pass.
    phi_values = [[0.380308, 0.659777, 0.429884],
                  [0.330372, 0.012429, 0.081726],
                  [0.277840, 0.020186, 0.334808],
                  [0.011480, 0.307608, 0.153582]]
    try:
        # First batch: tokens A-D, two items with explicit per-token weights.
        batch = artm.messages.Batch()
        batch.token.append('A')
        batch.token.append('B')
        batch.token.append('C')
        batch.token.append('D')

        item = batch.item.add()
        item.token_id.append(0)
        item.token_id.append(2)
        item.token_id.append(3)
        item.token_id.append(0)
        item.token_weight.append(2)
        item.token_weight.append(4)
        item.token_weight.append(1)
        item.token_weight.append(1)

        item = batch.item.add()
        item.token_id.append(1)
        item.token_id.append(2)
        item.token_id.append(0)
        item.token_id.append(3)
        item.token_weight.append(3)
        item.token_weight.append(2)
        item.token_weight.append(4)
        item.token_weight.append(1)

        with open(
                os.path.join(batches_folder, '{}.batch'.format(uuid.uuid4())),
                'wb') as fout:
            fout.write(batch.SerializeToString())

        # Second batch: tokens A, B, D only (token ids are batch-local).
        batch = artm.messages.Batch()
        batch.token.append('A')
        batch.token.append('B')
        batch.token.append('D')

        item = batch.item.add()
        item.token_id.append(0)
        item.token_id.append(1)
        item.token_id.append(2)
        item.token_weight.append(2)
        item.token_weight.append(1)
        item.token_weight.append(1)

        item = batch.item.add()
        item.token_id.append(0)
        item.token_id.append(2)
        item.token_weight.append(6)
        item.token_weight.append(2)

        with open(
                os.path.join(batches_folder, '{}.batch'.format(uuid.uuid4())),
                'wb') as fout:
            fout.write(batch.SerializeToString())

        # Vocabulary file: one token per line.
        with open(vocab_file_name, 'w') as fout:
            for e in ['A', 'B', 'C', 'D']:
                fout.write('{0}\n'.format(e))

        # Co-occurrence file: '<token_idx> <token_idx> <value>' per line.
        with open(cooc_file_name, 'w') as fout:
            fout.write('0 3 5.0\n')
            fout.write('0 1 4.0\n')
            fout.write('0 2 5.0\n')
            fout.write('1 3 2.0\n')
            fout.write('1 2 2.0\n')
            fout.write('2 3 2.0\n')

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batches_folder,
                          vocab_file_path=vocab_file_name,
                          cooc_file_path=cooc_file_name)
        batch_vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                                data_format='batches')
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary,
                          num_document_passes=num_document_passes)
        model.regularizers.add(
            artm.BitermsPhiRegularizer(name='Biterms',
                                       tau=biterms_tau,
                                       dictionary=dictionary))
        # NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0; this
        # test presumes an old pandas — may need .to_numpy() on newer stacks.
        assert abs(model.phi_.as_matrix()[0][0] - phi_first_elem) < phi_eps
        # fit_offline uses its default pass count here; the
        # num_collection_passes constant above is not actually passed in.
        model.fit_offline(batch_vectorizer=batch_vectorizer)
        for i in range(len(phi_values)):
            for j in range(len(phi_values[0])):
                assert abs(model.phi_.as_matrix()[i][j] -
                           phi_values[i][j]) < phi_eps
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Regression test for the core ARTM pipeline on the 'kos' UCI collection.

    Fits a 15-topic model with sparsing and decorrelation regularizers and
    checks every tracked score (sparsity, perplexity, top tokens, topic
    kernel, theta snippet) against pre-recorded reference values, then
    re-fits with a relative-coefficient (gamma) decorrelator and re-checks.
    Requires the BIGARTM_UNITTEST_DATA environment variable to point at the
    UCI bag-of-words data.
    """
    # constants
    num_tokens = 11
    probability_mass_threshold = 0.9
    sp_reg_tau = -0.1
    decor_tau = 1.5e+5
    decor_rel_tau = 0.3  # tau used in the relative-coefficient run below
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906  # size of the kos vocabulary
    num_docs = 3430  # number of kos documents
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    sp_zero_eps = 0.001
    # Reference per-pass score values recorded from a known-good run.
    sparsity_phi_value = [
        0.034, 0.064, 0.093, 0.120, 0.145, 0.170, 0.194, 0.220, 0.246,
        0.277, 0.312, 0.351, 0.390, 0.428, 0.464
    ]
    sparsity_phi_rel_value = [
        0.442, 0.444, 0.444, 0.446, 0.448, 0.449, 0.458, 0.468, 0.476,
        0.488, 0.501, 0.522, 0.574, 0.609, 0.670
    ]
    sparsity_theta_value = [0.0] * num_collection_passes
    perp_zero_eps = 2.0
    perplexity_value = [
        6873, 2590, 2685, 2578, 2603, 2552, 2536, 2481, 2419, 2331, 2235,
        2140, 2065, 2009, 1964
    ]
    perplexity_rel_value = [
        6873, 2667, 2458, 2323, 2150, 2265, 2015, 1967, 1807, 1747, 1713,
        1607, 1632, 1542, 1469
    ]
    top_zero_eps = 0.0001
    top_tokens_num_tokens = [num_tokens * num_topics] * num_collection_passes
    top_tokens_topic_0_tokens = [
        u'party', u'state', u'campaign', u'tax', u'political', u'republican',
        u'senate', u'candidate', u'democratic', u'court', u'president'
    ]
    top_tokens_topic_0_weights = [
        0.0209, 0.0104, 0.0094, 0.0084, 0.0068, 0.0067, 0.0065, 0.0058,
        0.0053, 0.0053, 0.0051
    ]
    ker_zero_eps = 0.02
    topic_kernel_topic_0_contrast = 0.96
    topic_kernel_topic_0_purity = 0.014
    topic_kernel_topic_0_size = 18.0
    # Kernel averages stay 0.0 for the first passes until topics sharpen.
    topic_kernel_average_size = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13, 0.6, 1.6, 3.53, 7.15,
        12.6, 20.4, 29.06
    ]
    topic_kernel_average_contrast = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12, 0.31, 0.7, 0.96, 0.96,
        0.96, 0.96, 0.97
    ]
    topic_kernel_average_purity = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.015, 0.017, 0.02,
        0.03, 0.04, 0.05
    ]
    len_last_document_ids = 10
    try:
        # Convert the UCI bag-of-words data into binary batches.
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)
        model = artm.ARTM(
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary=dictionary.name,
            cache_theta=True)
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=sp_reg_tau))
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_tau))
        model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))
        model.scores.add(
            artm.TopicKernelScore(
                name='TopicKernelScore',
                probability_mass_threshold=probability_mass_threshold))
        model.scores.add(artm.ThetaSnippetScore(name='ThetaSnippetScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        # Per-pass score tracks must match the recorded references.
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityThetaScore'].value[i] -
                       sparsity_theta_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perp_zero_eps

        for i in range(num_collection_passes):
            assert model.score_tracker['TopTokensScore'].num_tokens[
                i] == top_tokens_num_tokens[i]

        # Topic 0 top tokens and their weights after the last pass.
        for i in range(num_tokens):
            assert model.score_tracker['TopTokensScore'].last_tokens[
                model.topic_names[0]][i] == top_tokens_topic_0_tokens[i]
            assert abs(model.score_tracker['TopTokensScore'].last_weights[
                model.topic_names[0]][i] -
                       top_tokens_topic_0_weights[i]) < top_zero_eps

        # Topic kernel metrics for topic 0 and the averaged tracks.
        assert len(model.score_tracker['TopicKernelScore'].last_tokens[
            model.topic_names[0]]) > 0
        assert abs(topic_kernel_topic_0_contrast -
                   model.score_tracker['TopicKernelScore'].last_contrast[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_purity -
                   model.score_tracker['TopicKernelScore'].last_purity[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_size -
                   model.score_tracker['TopicKernelScore'].last_size[
                       model.topic_names[0]]) < ker_zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model.score_tracker['TopicKernelScore'].average_size[i] -
                topic_kernel_average_size[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_contrast[i] -
                topic_kernel_average_contrast[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_purity[i] -
                topic_kernel_average_purity[i]) < ker_zero_eps

        # One online pass, then sanity-check the model info structure.
        model.fit_online(batch_vectorizer=batch_vectorizer)

        info = model.info
        assert info is not None
        assert len(info.config.topic_name) == num_topics
        assert len(info.score) >= len(model.score_tracker)
        assert len(info.regularizer) == len(model.regularizers.data)
        assert len(info.cache_entry) > 0

        temp = model.score_tracker['ThetaSnippetScore'].last_document_ids
        assert len_last_document_ids == len(temp)
        assert len(model.score_tracker['ThetaSnippetScore'].last_snippet[
            temp[0]]) == num_topics

        phi = model.get_phi()
        assert phi.shape == (vocab_size, num_topics)
        theta = model.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model.library_version.count('.') == 2  # major.minor.patch

        # test relative coefficients for Phi matrix regularizers
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary.name,
                          cache_theta=False)
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_rel_tau))
        model.regularizers['DecorrelatorPhi'].gamma = 0.0  # enable relative mode
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_rel_value[i]) < sp_zero_eps
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_rel_value[i]) < perp_zero_eps
    finally:
        shutil.rmtree(batches_folder)
num_working_files = 20 # Vowpal Wabbit batch_vectorizer = None if len(glob.glob(os.path.join(pn_folder, vw_folder, '*.batch'))) < 1: batch_vectorizer = artm.BatchVectorizer( data_path=os.path.join(pn_folder, vw_folder, 'vw.txt'), data_format='vowpal_wabbit', target_folder=os.path.join(pn_folder, vw_folder)) else: batch_vectorizer = artm.BatchVectorizer(data_path=os.path.join( pn_folder, vw_folder), data_format='batches') dictionary = artm.Dictionary() dict_path = os.path.join(pn_folder, vw_folder, 'dict.dict') if not os.path.isfile(dict_path): dictionary.gather(data_path=batch_vectorizer.data_path) dictionary.save(dictionary_path=dict_path) dictionary.load(dictionary_path=dict_path) dictionary.filter(min_df=2, max_df_rate=0.4) N = 1 # model model = create_model(dictionary=dictionary, num_tokens=num_top_tokens,
N_DOCUMENTS = 1000 N_TOPICS = 15 N_PASSES = 15 DATA_DIR = "../Files/DataPreprocessing/{}_documents".format(N_DOCUMENTS) BASE_DIR = "../Files/TopicModeling/{}_documents".format(N_DOCUMENTS) SAVE_DIR = os.path.join(BASE_DIR, "models/lda") BATCHES_DIR = os.path.join(BASE_DIR, "batches") DICTIONARY_FILE = os.path.join(BASE_DIR, "dictionary.dict") COOC_FILE = os.path.join(BASE_DIR, "cooc_tf") VOCAB_FILE = os.path.join(DATA_DIR, "vocab") start = time.time() bv = artm.BatchVectorizer(data_path=BATCHES_DIR, data_format="batches") dictionary = artm.Dictionary() dictionary.load(DICTIONARY_FILE) cooc_dict = artm.Dictionary() cooc_dict.gather(data_path=BATCHES_DIR, cooc_file_path=COOC_FILE, vocab_file_path=VOCAB_FILE, symmetric_cooc_values=True) coherence_score = artm.TopTokensScore(name='TopTokensCoherenceScore', dictionary=cooc_dict, num_tokens=15) model_artm = artm.LDA(num_topics=N_TOPICS) model_artm._internal_model.scores.add(
result += sys.getsizeof(v) return result if __name__ == "__main__": global_time_start = time.time() batches_folder, window_size = __read_params() batches_list = glob.glob(os.path.join(batches_folder, '*.batch')) dictionaries_list = [name for name in glob.glob(os.path.join(batches_folder, '*.dict'))] if len(batches_list) < 1 or len(dictionaries_list) < 1: raise RuntimeError('No batches or dictionaries were found in given folder') else: print('{} batches were found, start processing'.format(len(batches_list))) temp_dict = artm.Dictionary() temp_dict.load(dictionaries_list[0]) file_name = '../cooc_info/{}_temp_dict.txt'.format(time.time()) temp_dict.save_text(file_name) dictionary = {} with codecs.open(file_name, 'r', 'utf-8') as fin: fin.next() fin.next() for index, line in enumerate(fin): dictionary[line.split(' ')[0][0: -1]] = index os.remove(file_name) global_cooc_dictionary = {} for index, filename in enumerate(batches_list): local_time_start = time.time()
def train(self):
    """Fit the two-stage ARTM topic model over the prepared texts.

    Builds vowpal-wabbit batches from either the full documents or only
    their summaries (controlled by ``self.analyze_full_doc``), gathers and
    persists the dictionary, then fits ``self.num_of_topics`` topics in two
    stages: first smoothing a dedicated background topic ('bcg'), then
    sparsing the subject topics ('sbj*').  Sets ``self.training_done`` and
    stores the fitted model in ``self.model_artm``.
    """
    # Idiom fix: plain truthiness instead of `== True`.
    if self.analyze_full_doc:
        vocabulary_file = self._prepare_texts_full()
    else:
        vocabulary_file = self._prepare_texts_from_summary()

    target_folder = self._get_bigARTM_dir()
    batch_vectorizer = artm.BatchVectorizer(data_path=vocabulary_file,
                                            data_format='vowpal_wabbit',
                                            target_folder=target_folder,
                                            batch_size=100)

    # Rebuild the dictionary from scratch: drop any stale saved copy first.
    dict_path = self._get_dictionary_path()
    dict_file = '{}.dict'.format(dict_path)
    if os.path.isfile(dict_file):
        os.remove(dict_file)
    my_dictionary = artm.Dictionary()
    my_dictionary.gather(data_path=target_folder,
                         vocab_file_path=vocabulary_file)
    my_dictionary.save(dictionary_path=dict_path)
    my_dictionary.load(dictionary_path=dict_file)

    T = self.num_of_topics
    # The last topic is a dedicated background topic.
    topic_names = ["sbj" + str(i) for i in range(T - 1)] + ["bcg"]
    self.model_artm = artm.ARTM(num_topics=T,
                                topic_names=topic_names,
                                class_ids={
                                    "text": 1,
                                    "doc_guid": 1
                                },
                                dictionary=my_dictionary,
                                cache_theta=True)
    self.model_artm.initialize(dictionary=my_dictionary)
    self.model_artm.scores.add(
        artm.TopTokensScore(name="text_words", num_tokens=15,
                            class_id="text"))
    self.model_artm.scores.add(
        artm.TopTokensScore(name="doc_guid_words", num_tokens=15,
                            class_id="doc_guid"))

    # Stage 1: strongly smooth the background topic so it absorbs common words.
    self.model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                        tau=1e5,
                                        dictionary=my_dictionary,
                                        class_ids="text",
                                        topic_names="bcg"))
    self.model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=30)

    # Stage 2: sparse the subject topics with a strongly negative tau.
    self.model_artm.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='SparsePhi-1e5',
            tau=-1e5,
            dictionary=my_dictionary,
            class_ids="text",
            topic_names=["sbj" + str(i) for i in range(T - 1)]))
    self.model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=15)
    self.training_done = True
def gather_dictionary(self, custom_vocab=False):
    """Gather the ARTM dictionary from the dataset's batches and index its terms.

    Parses the saved dictionary text file, creating ``Term`` and ``Modality``
    database records and populating ``self.terms_index`` /
    ``self.modalities_count`` / ``self.terms_count``.  When ``custom_vocab``
    is False, also (re)writes ``vocab.txt`` from the gathered dictionary.

    BUG FIX: previously ``vocab.txt`` was opened with mode "w" even when
    ``custom_vocab=True``, truncating the user-supplied vocab file right
    after it was used for gathering, and the handle was never closed in that
    path.  The file is now written only when no custom vocab was supplied,
    via a context manager.
    """
    self.log("Creating ARTM dictionary...")
    dictionary = artm.Dictionary(name="dictionary")
    batches_folder = os.path.join(self.get_folder(), "batches")
    vocab_file_path = os.path.join(self.get_folder(), "vocab.txt")
    if custom_vocab:
        dictionary.gather(batches_folder, vocab_file_path=vocab_file_path)
    else:
        dictionary.gather(batches_folder)

    dictionary_file_name = os.path.join(self.get_folder(), "batches",
                                        "dictionary.txt")
    dictionary.save_text(dictionary_file_name)

    self.log("Saving terms to database...")
    vocab_lines = []  # rebuilt vocab, written only when custom_vocab is False
    term_index_id = -3  # the text dump has header lines; first real term gets id 0
    self.modalities_count = 0
    self.terms_index = dict()
    modalities_index = dict()
    with open(dictionary_file_name, "r", encoding='utf-8') as f:
        for line in f:
            term_index_id += 1
            if term_index_id < 0:
                continue  # skip the header lines
            # Line format: "token, modality, value, tf, df" (comma-separated).
            parsed = line.replace(',', ' ').split()
            term = Term()
            term.dataset = self
            term.text = parsed[0]
            term.index_id = term_index_id
            term.token_value = float(parsed[2])
            # tf/df are dumped as floats ("12.0"); keep only the integer part.
            term.token_tf = int(parsed[3].split('.')[0])
            term.token_df = int(parsed[4].split('.')[0])
            modality_name = parsed[1]
            if modality_name not in modalities_index:
                modality = Modality()
                modality.index_id = self.modalities_count
                self.modalities_count += 1
                modality.name = modality_name
                modality.dataset = self
                modality.save()
                modalities_index[modality_name] = modality
            modality = modalities_index[modality_name]
            term.modality = modality
            modality.terms_count += 1
            term.save()
            if not custom_vocab:
                vocab_lines.append("%s %s\n" % (parsed[0], parsed[1]))
            # Index the term by text, by "text$#modality", and by numeric id.
            self.terms_index[term.text] = term
            self.terms_index[term.text + "$#" + term.modality.name] = term
            self.terms_index[term.index_id] = term
            if term_index_id % 10000 == 0:
                self.log(str(term_index_id))  # progress indicator

    if not custom_vocab:
        with open(vocab_file_path, "w", encoding="utf-8") as vocab_file:
            vocab_file.writelines(vocab_lines)

    self.terms_count = term_index_id + 1

    self.log("Saving modalities...")
    # Treat the largest modality as the "word" modality: it alone gets
    # spectrum/naming weight 1.
    max_modality_size = 0
    word_modality_id = -1
    for key, modality in modalities_index.items():
        if modality.terms_count > max_modality_size:
            word_modality_id = modality.id
            max_modality_size = modality.terms_count
    for key, modality in modalities_index.items():
        if modality.id == word_modality_id:
            modality.weight_spectrum = 1
            modality.weight_naming = 1
        if 'tag' in modality.name:
            modality.is_tag = True
        modality.save()
    self.normalize_modalities_weights()
def test_func():
    """Check that artm.LDA behaves as a thin wrapper over artm.ARTM.

    An ARTM model configured with the LDA-equivalent smoothing regularizers
    (Phi smoothed by beta, Theta by alpha) must reproduce the LDA model's
    per-pass sparsity and perplexity tracks and its top tokens on the 'kos'
    collection.  Also sanity-checks LDA's phi/theta shapes and the per-topic
    beta vector form of the constructor.
    """
    # configuration
    num_tokens = 15
    alpha = 0.01
    beta = 0.02
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)
        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        # ARTM model mimicking LDA via smoothing regularizers.
        artm_model = artm.ARTM(num_topics=num_topics,
                               dictionary=dictionary,
                               cache_theta=True,
                               reuse_theta=True)
        artm_model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=beta))
        artm_model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=alpha))
        artm_model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        artm_model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 dictionary=dictionary))
        artm_model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        artm_model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))

        lda_model = artm.LDA(num_topics=num_topics,
                             alpha=alpha,
                             beta=beta,
                             dictionary=dictionary,
                             cache_theta=True)
        lda_model.initialize(dictionary=dictionary)

        artm_model.num_document_passes = num_document_passes
        lda_model.num_document_passes = num_document_passes

        artm_model.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=num_collection_passes)
        lda_model.fit_offline(batch_vectorizer=batch_vectorizer,
                              num_collection_passes=num_collection_passes)

        # Per-pass score tracks of both models must agree.
        for artm_val, lda_val in zip(
                artm_model.score_tracker['SparsityPhiScore'].value,
                lda_model.sparsity_phi_value):
            assert abs(artm_val - lda_val) < zero_eps

        for artm_val, lda_val in zip(
                artm_model.score_tracker['SparsityThetaScore'].value,
                lda_model.sparsity_theta_value):
            assert abs(artm_val - lda_val) < zero_eps

        for artm_val, lda_val in zip(
                artm_model.score_tracker['PerplexityScore'].value,
                lda_model.perplexity_value):
            assert abs(artm_val - lda_val) < zero_eps

        # Top tokens must coincide, topic by topic.
        lda_tt = lda_model.get_top_tokens(num_tokens=num_tokens)
        assert len(lda_tt) == num_topics
        last_tokens = artm_model.score_tracker['TopTokensScore'].last_tokens
        for topic_idx, lda_topic_tokens in enumerate(lda_tt):
            artm_topic_tokens = last_tokens[artm_model.topic_names[topic_idx]]
            for token_idx, token in enumerate(lda_topic_tokens):
                assert artm_topic_tokens[token_idx] == token

        # ... and so must their weights (checked for topic 0).
        lda_tt = lda_model.get_top_tokens(num_tokens=num_tokens,
                                          with_weights=True)
        last_weights = artm_model.score_tracker['TopTokensScore'].last_weights
        topic_0_weights = last_weights[artm_model.topic_names[0]]
        for token_idx in range(num_tokens):
            assert abs(topic_0_weights[token_idx] -
                       lda_tt[0][token_idx][1]) < zero_eps

        # One online pass, then shape checks on phi/theta.
        lda_model.fit_online(batch_vectorizer=batch_vectorizer)
        phi = lda_model.phi_
        assert phi.shape == (vocab_size, num_topics)
        theta = lda_model.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert lda_model.library_version.count('.') == 2  # major.minor.patch

        # Per-topic beta vector: one Phi regularizer per topic + one Theta.
        lda_model = artm.LDA(num_topics=num_topics,
                             alpha=alpha,
                             beta=([0.1] * num_topics),
                             dictionary=dictionary,
                             cache_theta=True)
        assert lda_model._internal_model.regularizers.size() == num_topics + 1
    finally:
        shutil.rmtree(batches_folder)