def test_training_with_saved_data(data_dir):
    handler = TextHandler(data_dir + "gnews/GoogleNews.txt")
    handler.prepare()  # create vocabulary and training data

    # load BERT data
    with open(data_dir + "gnews/bert_embeddings_gnews", "rb") as filino:
        training_bert = pickle.load(filino)

    training_dataset = CTMDataset(handler.bow, training_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=768, num_epochs=1,
              inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    print(ctm.get_topics(2))
    ctm.get_doc_topic_distribution(training_dataset)

    assert isclose(np.sum(ctm.get_topic_word_distribution()[0]), 1,
                   rel_tol=1e-5, abs_tol=0.0)
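# Sketch (not part of the test suite): one way the cached "bert_embeddings_gnews"
# pickle loaded above could be produced, so the test does not have to recompute
# embeddings. The SBERT model name is an assumption; any model with
# 768-dimensional outputs would match bert_input_size=768.
def save_bert_embeddings(text_file, out_file, sbert_model="bert-base-nli-mean-tokens"):
    embeddings = bert_embeddings_from_file(text_file, sbert_model)
    with open(out_file, "wb") as handle:
        pickle.dump(embeddings, handle)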
def test_training(data_dir):
    handler = TextHandler(data_dir + "sample_text_document")
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_file(
        data_dir + 'sample_text_document', "distiluse-base-multilingual-cased")

    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512, num_epochs=1,
              inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5

    thetas = ctm.get_thetas(training_dataset)
    assert len(thetas) == len(train_bert)
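# The tests in this file take a `data_dir` pytest fixture. A minimal conftest.py
# sketch; the exact path is an assumption about the repository layout, not the
# project's actual fixture.
import pytest

@pytest.fixture
def data_dir():
    return "contextualized_topic_models/data/"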
class TopicModel:
    def __init__(self, train_size: int = TRAIN_SIZE, n_topics: int = NUM_TOPICS,
                 batch_size: int = BATCH_SIZE, shuffle=True):
        # stream the training split of the Twitter data in batches
        data_loader = TwitterDataset("TRAIN", train_size, batch_size=batch_size,
                                     shuffle=shuffle)
        data_loader.init()
        self.data_loader = data_loader

        self.ctm = CTM(input_size=len(self.data_loader.vocab), bert_input_size=512,
                       num_epochs=20, batch_size=MINI_BATCH_SIZE,
                       inference_type="contextual", n_components=n_topics,
                       reduce_on_plateau=True, lr=1e-4, hidden_sizes=HIDDEN_UNITS,
                       num_data_loader_workers=0)

    def train(self):
        # fit the CTM incrementally, one encoded batch of tweets at a time
        for i in range(self.data_loader.n_batches):
            logger.info(f"Starting Batch {i + 1}")
            self.data_loader.encoded_next_batch()
            training_dataset = CTMDataset(self.data_loader.bow,
                                          self.data_loader.bert_embeddings,
                                          self.data_loader.idx2token)
            self.ctm.fit(training_dataset)

            # log the current top-10 words per topic after each batch
            logger.info("\n---------------------------")
            logger.info("--------Topics---------")
            topics = self.ctm.get_topic_lists(10)
            for t in topics:
                logger.info(t)
            logger.info("---------------------------\n")

        return self.ctm.get_topic_lists(10)
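# Minimal usage sketch for TopicModel, assuming the module-level constants
# (TRAIN_SIZE, NUM_TOPICS, BATCH_SIZE, MINI_BATCH_SIZE, HIDDEN_UNITS) and the
# `logger` referenced above are defined elsewhere in this module.
if __name__ == "__main__":
    topic_model = TopicModel(n_topics=20)
    top_words = topic_model.train()  # one list of 10 words per topic
    for words in top_words:
        logger.info(words)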
def test_training_all_classes_ctm(data_dir):
    handler = TextHandler(data_dir + "sample_text_document")
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_file(
        data_dir + 'sample_text_document', "distiluse-base-multilingual-cased")

    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    # CTM with combined inference
    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512, num_epochs=1,
              inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_doc_topic_distribution(training_dataset)
    assert len(thetas) == len(train_bert)

    # ZeroShotTM on the same training data
    ctm = ZeroShotTM(input_size=len(handler.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_doc_topic_distribution(training_dataset)
    assert len(thetas) == len(train_bert)

    # CombinedTM on the same training data
    ctm = CombinedTM(input_size=len(handler.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_doc_topic_distribution(training_dataset)
    assert len(thetas) == len(train_bert)

    # same pipeline, but building the handler and embeddings from an in-memory list
    with open(data_dir + 'sample_text_document') as filino:
        data = filino.readlines()

    handler = TextHandler(sentences=data)
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_list(
        data, "distiluse-base-multilingual-cased")
    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512, num_epochs=1,
              inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_doc_topic_distribution(training_dataset)
    assert len(thetas) == len(train_bert)

    # QuickText preprocessing pipeline
    qt = QuickText("distiluse-base-multilingual-cased",
                   text_for_bow=data,
                   text_for_bert=data)
    dataset = qt.load_dataset()

    ctm = ZeroShotTM(input_size=len(qt.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5

    # QuickText rebuilt from a saved configuration
    qt_from_conf = QuickText("distiluse-base-multilingual-cased", None, None)
    qt_from_conf.load_configuration(qt.bow, qt.data_bert, qt.vocab, qt.idx2token)
    dataset = qt_from_conf.load_dataset()

    ctm = ZeroShotTM(input_size=len(qt.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
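# Optional follow-up sketch (not part of the test above): scoring the learned
# topics with NPMI coherence. This assumes the CoherenceNPMI class exposed by
# the released contextualized-topic-models package; `data` and `ctm` refer to
# the variables from the test above.
from contextualized_topic_models.evaluation.measures import CoherenceNPMI

texts = [doc.split() for doc in data]  # tokenized reference corpus
npmi = CoherenceNPMI(texts=texts, topics=ctm.get_topic_lists(10))
print(npmi.score())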
import os
import numpy as np
import pickle
import torch

from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.datasets.dataset import CTMDatasetTopReg
from contextualized_topic_models.utils.data_preparation import (
    TextHandler, bert_embeddings_from_file, labels_from_file)

handler = TextHandler(
    "contextualized_topic_models/data/wiki/wiki_train_en_prep.txt")
handler.prepare()

train_bert = bert_embeddings_from_file(
    'contextualized_topic_models/data/wiki/wiki_train_en_unprep.txt',
    '../sentence-transformers/sentence_transformers/output/training_wiki_topics_4_xlm-roberta-base-2020-10-24_13-38-14/')

labels, nb_labels = labels_from_file(
    'contextualized_topic_models/data/wiki/topic_full_misc.json')

training_dataset = CTMDatasetTopReg(handler.bow, train_bert, handler.idx2token, labels)

num_topics = 100

# the nb_labels argument turns this from a CTM into a TCCTM
ctm = CTM(input_size=len(handler.vocab), bert_input_size=768, num_epochs=60,
          hidden_sizes=(100, ), inference_type="contextual", n_components=num_topics,
          num_data_loader_workers=0, nb_labels=nb_labels)

ctm.fit(training_dataset)
ctm.save("models/wiki/wiki_xlmr_en_topics_4_topreg")
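# Optional follow-up sketch: inspect the learned topics after training, using
# the same get_topic_lists accessor as in the tests above.
for topic_words in ctm.get_topic_lists(10):
    print(topic_words)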