import pickle
import sys

import numpy as np

# Imports assume the package layout of contextualized-topic-models v1.x,
# which provides TextHandler/QuickText and the CTM model classes.
from contextualized_topic_models.models.ctm import CTM, ZeroShotTM, CombinedTM
from contextualized_topic_models.utils.data_preparation import (
    TextHandler,
    QuickText,
    bert_embeddings_from_file,
    bert_embeddings_from_list,
)
from contextualized_topic_models.datasets.dataset import CTMDataset


def test_training_with_saved_data(data_dir):
    handler = TextHandler(data_dir + "gnews/GoogleNews.txt")
    handler.prepare()  # create vocabulary and training data

    # load pre-computed BERT embeddings for the corpus
    with open(data_dir + "gnews/bert_embeddings_gnews", "rb") as filino:
        training_bert = pickle.load(filino)

    training_dataset = CTMDataset(handler.bow, training_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=768,
              num_epochs=1, inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    print(ctm.get_topics(2))
    ctm.get_thetas(training_dataset)
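# The pickled embeddings loaded above have to be produced beforehand. A
# minimal sketch of that step, assuming bert_embeddings_from_file from this
# package and a 768-dimensional SBERT model to match bert_input_size=768
# (the helper name, model name, and paths are illustrative):
def make_gnews_embeddings(data_dir):
    embeddings = bert_embeddings_from_file(
        data_dir + "gnews/GoogleNews.txt", "bert-base-nli-mean-tokens")
    # serialize so the test can reload the embeddings without re-encoding
    with open(data_dir + "gnews/bert_embeddings_gnews", "wb") as outfile:
        pickle.dump(embeddings, outfile)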
def test_training(data_dir):
    handler = TextHandler(data_dir + "sample_text_document")
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_file(
        data_dir + "sample_text_document", "distiluse-base-multilingual-cased")

    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512,
              num_epochs=1, inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5

    thetas = ctm.get_thetas(training_dataset)
    assert len(thetas) == len(train_bert)
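# A trained model can be persisted and restored; a minimal sketch, assuming
# the package's save(models_dir=...) / load(model_dir, epoch) pair that the
# inference script below relies on. The directory is illustrative, epochs are
# zero-indexed (so epoch 0 is the last checkpoint after num_epochs=1), and
# depending on the package version save may create a run-specific
# subdirectory inside models_dir, in which case that subdirectory is what
# load expects:
def save_then_reload(trained_ctm, handler, model_dir):
    trained_ctm.save(models_dir=model_dir)
    fresh_ctm = CTM(input_size=len(handler.vocab), bert_input_size=512,
                    num_epochs=1, inference_type="combined", n_components=5)
    fresh_ctm.load(model_dir, 0)
    return fresh_ctm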
# handler_en (a TextHandler for the English test corpus) is constructed
# earlier in the script; only the inference part is shown here.
handler_en.prepare()

testing_bert_en = bert_embeddings_from_file(
    "contextualized_topic_models/data/wiki/wiki_test_en_unprep_sub.txt",
    sys.argv[3])

testing_dataset_en = CTMDataset(
    handler_en.bow, testing_bert_en, handler_en.idx2token)

ctm = CTM(input_size=len(handler_en.vocab), inference_type="contextual",
          bert_input_size=768)
# ctm = torch.load(sys.argv[1], map_location="cpu")
ctm.load(sys.argv[1], sys.argv[2])

num_topics = 100

thetas_en = ctm.get_thetas(testing_dataset_en, n_samples=100)

# write the argmax topic of each test document, one per line
with open("temp/topics_en_simple.txt", 'w') as test_out:
    topics = np.squeeze(np.argmax(thetas_en, axis=1).T)
    for topic in topics:
        test_out.write(str(topic) + '\n')

# randomly shuffled en baseline
# np.random.seed(3)
# np.random.shuffle(thetas_en)

# plot topic histogram
# labels, values = zip(*Counter(np.squeeze(np.argmax(thetas_en, axis=1).T)).items())
# indexes = np.arange(len(labels))
# width = 1
# plt.bar(indexes, values, width)
# plt.xticks(indexes + width * 0.5, labels)
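# A runnable version of the commented-out histogram above; a minimal sketch
# that only adds axis labels and a savefig call to the original snippet (the
# function name and output path are illustrative):
def plot_topic_histogram(thetas, out_path="temp/topic_histogram.png"):
    from collections import Counter
    import matplotlib.pyplot as plt

    # count how often each topic is the argmax across documents
    labels, values = zip(
        *Counter(np.squeeze(np.argmax(thetas, axis=1).T)).items())
    indexes = np.arange(len(labels))
    width = 1
    plt.bar(indexes, values, width)
    plt.xticks(indexes + width * 0.5, labels)
    plt.xlabel("topic id")
    plt.ylabel("number of documents")
    plt.savefig(out_path)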
def test_training_all_classes_ctm(data_dir):
    handler = TextHandler(data_dir + "sample_text_document")
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_file(
        data_dir + "sample_text_document", "distiluse-base-multilingual-cased")
    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    # generic CTM with combined inference
    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512,
              num_epochs=1, inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model
    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_thetas(training_dataset)
    assert len(thetas) == len(train_bert)

    # ZeroShotTM (contextual inference only)
    ctm = ZeroShotTM(input_size=len(handler.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(training_dataset)  # run the model
    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_thetas(training_dataset)
    assert len(thetas) == len(train_bert)

    # CombinedTM (BoW + contextual inference)
    ctm = CombinedTM(input_size=len(handler.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(training_dataset)  # run the model
    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_thetas(training_dataset)
    assert len(thetas) == len(train_bert)

    # same pipeline, but building the handler from an in-memory list of sentences
    with open(data_dir + "sample_text_document") as filino:
        data = filino.readlines()

    handler = TextHandler(sentences=data)
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_list(
        data, "distiluse-base-multilingual-cased")
    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512,
              num_epochs=1, inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model
    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_thetas(training_dataset)
    assert len(thetas) == len(train_bert)

    # QuickText builds the vocabulary and dataset in one step
    qt = QuickText("distiluse-base-multilingual-cased",
                   unpreprocessed_sentences=data,
                   preprocessed_sentences=data)
    dataset = qt.load_dataset()

    ctm = ZeroShotTM(input_size=len(qt.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(dataset)  # run the model
    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
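# The tests above receive data_dir as a pytest fixture; a minimal conftest.py
# sketch, assuming the sample corpora live under the package's test data
# directory (the path is illustrative):
import pytest


@pytest.fixture
def data_dir():
    return "contextualized_topic_models/tests/data/"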