def fit(self, X, y=None) -> None:
    """Fit the ProdLDA autoencoder to the document-term count matrix X."""
    documents, features = X.shape
    ds = CountTensorDataset(X.astype(np.float32))
    self.autoencoder = ProdLDA(
        in_dimension=features,
        hidden1_dimension=self.hidden1_dimension,
        hidden2_dimension=self.hidden2_dimension,
        topics=self.topics,
    )
    if self.cuda:
        self.autoencoder.cuda()
    ae_optimizer = Adam(self.autoencoder.parameters(), lr=self.lr, betas=(0.99, 0.999))
    train(
        ds,
        self.autoencoder,
        cuda=self.cuda,
        validation=None,
        epochs=self.epochs,
        batch_size=self.batch_size,
        optimizer=ae_optimizer,
        sampler=WeightedRandomSampler(torch.ones(documents), min(documents, self.samples)),
        silent=True,
        num_workers=0,  # TODO changing this causes a bug on Mac
    )
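# Usage sketch for the fit method above (illustrative only). It assumes the method
# belongs to a scikit-learn-style wrapper, called ProdLDATransformer here, whose
# constructor sets the topics/hidden1_dimension/hidden2_dimension attributes that
# fit() reads from self. The class name, import path, and constructor arguments are
# assumptions; only the fit(X) call pattern comes from the code above.
import numpy as np
from scipy.sparse import csr_matrix
from ptavitm.sklearn_api import ProdLDATransformer  # import path is an assumption

rng = np.random.default_rng(0)
X = csr_matrix(rng.poisson(0.05, size=(1000, 2000)).astype(np.float32))  # fake document-term counts
model = ProdLDATransformer(topics=50, hidden1_dimension=100, hidden2_dimension=100)  # hypothetical signature
model.fit(X)  # trains the ProdLDA autoencoder on the sparse count matrix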
def test_train():
    autoencoder = Mock()
    autoencoder.return_value = [torch.tensor([1, 1], dtype=torch.float)] * 3
    autoencoder.loss.return_value = torch.tensor([1, 1], dtype=torch.float).requires_grad_()
    optimizer = Mock()
    dataset = TensorDataset(torch.zeros(100, 1000))
    train(
        dataset=dataset,
        autoencoder=autoencoder,
        epochs=1,
        batch_size=10,
        optimizer=optimizer,
    )
    autoencoder.train.assert_called_once()
    assert autoencoder.call_count == 10
    assert optimizer.zero_grad.call_count == 10
    assert optimizer.step.call_count == 10
def test_train_validation():
    autoencoder = Mock()
    autoencoder.return_value = [torch.zeros(10, 10).float()] * 3
    autoencoder.loss.return_value = torch.zeros(10, 10).float().requires_grad_()
    optimizer = Mock()
    dataset = TensorDataset(torch.zeros(100, 1000))
    validation_dataset = TensorDataset(torch.zeros(10, 1000))
    train(
        dataset=dataset,
        validation=validation_dataset,
        autoencoder=autoencoder,
        epochs=1,
        batch_size=10,
        optimizer=optimizer,
    )
    assert autoencoder.train.call_count == 2
    assert autoencoder.call_count == 11
    assert optimizer.zero_grad.call_count == 10
    assert optimizer.step.call_count == 10
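# Note on the expected call counts in the two tests above (an inference from the
# test setup, not a statement about train() internals): 100 documents with
# batch_size=10 give 10 training batches per epoch, hence 10 autoencoder calls and
# 10 optimizer.zero_grad()/step() calls. With a 10-document validation set, one
# extra forward pass runs after training (11 autoencoder calls in total) and
# autoencoder.train() is toggled a second time, while the optimizer counts stay at
# 10 because no gradient step is taken on validation data.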
def main(cuda, batch_size, epochs, top_words, testing_mode, verbose_mode):
    print("Loading input data")
    # TODO fix relative paths
    data_train = load_npz("data/train.txt.npz")
    data_val = load_npz("data/test.txt.npz")
    corpus = Sparse2Corpus(data_train, documents_columns=False)
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [reverse_vocab[index] for index in range(len(reverse_vocab))]
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        if verbose_mode:
            decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
            topics = [
                [reverse_vocab[item.item()] for item in topic]
                for topic in decoder_weight.topk(top_words, dim=0)[1].t()
            ]
            cm = CoherenceModel(
                topics=topics,
                corpus=corpus,
                dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
                coherence="u_mass",
            )
            coherence = cm.get_coherence()
            coherences = cm.get_coherence_per_topic()
            for index, topic in enumerate(topics):
                print(str(index) + ":" + str(coherences[index]) + ":" + ",".join(topic))
            print(coherence)
        else:
            coherence = 0
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
                "coherence": coherence,
            },
            global_step=epoch,
        )

    ds_train = CountTensorDataset(data_train)
    ds_val = CountTensorDataset(data_val)
    autoencoder = ProdLDA(in_dimension=len(vocab), hidden1_dimension=100, hidden2_dimension=100, topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.0001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
        sampler=WeightedRandomSampler(torch.ones(data_train.shape[0]), 20000),
        num_workers=4,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [
        [reverse_vocab[item.item()] for item in topic]
        for topic in decoder_weight.topk(top_words, dim=0)[1].t()
    ]
    cm = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
        coherence="u_mass",
    )
    coherence = cm.get_coherence()
    coherences = cm.get_coherence_per_topic()
    for index, topic in enumerate(topics):
        print(str(index) + ":" + str(coherences[index]) + ":" + ",".join(topic))
    print(coherence)
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
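# The training_callback above is passed to train() as update_callback and is
# presumably invoked with (autoencoder, epoch, lr, loss, perplexity) each time it
# fires. A minimal stdout-only alternative to the TensorBoard/coherence callback,
# sketched here for illustration and not part of the original script:
def print_callback(autoencoder, epoch, lr, loss, perplexity):
    # log the per-epoch training statistics without TensorBoard
    print("epoch %d: lr=%.5f, loss=%.4f, perplexity=%.1f" % (epoch, lr, loss, perplexity))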
def main(
    cuda,
    batch_size,
    epochs,
    top_words,
    testing_mode,
):
    print("Loading input data")
    # TODO fix relative paths
    data_train = np.load("data/train.txt.npy", encoding="bytes")
    data_val = np.load("data/test.txt.npy", encoding="bytes")
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [reverse_vocab[index] for index in range(len(reverse_vocab))]
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
            },
            global_step=epoch,
        )
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [
            [reverse_vocab[item.item()] for item in topic]
            for topic in decoder_weight.topk(top_words, dim=0)[1].t()
        ]
        for index, topic in enumerate(topics):
            print(str(index) + ":" + ",".join(topic))

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(in_dimension=len(vocab), hidden1_dimension=100, hidden2_dimension=100, topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [
        [reverse_vocab[item.item()] for item in topic]
        for topic in decoder_weight.topk(top_words, dim=0)[1].t()
    ]
    for topic in topics:
        print(",".join(topic))
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
def main(
    cuda,
    batch_size,
    epochs,
    top_words,
    testing_mode,
):
    print('Loading input data')
    # TODO fix relative paths
    input_train = np.load('data/train.txt.npy', encoding='bytes')
    input_val = np.load('data/test.txt.npy', encoding='bytes')
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [reverse_vocab[index] for index in range(len(reverse_vocab))]
    data_train = np.array([
        np.bincount(doc.astype('int'), minlength=len(vocab))
        for doc in input_train
        if doc.sum() > 0
    ])
    data_val = np.array([
        np.bincount(doc.astype('int'), minlength=len(vocab))
        for doc in input_val
        if doc.sum() > 0
    ])
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'perplexity': perplexity,
        }, global_step=epoch)
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [
            [reverse_vocab[item.item()] for item in topic]
            for topic in decoder_weight.topk(top_words, dim=0)[1].t()
        ]
        for index, topic in enumerate(topics):
            print(str(index) + ':' + ','.join(topic))

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(in_dimension=len(vocab), hidden1_dimension=100, hidden2_dimension=100, topics=50)
    if cuda:
        autoencoder.cuda()
    print('Training stage.')
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(ds_train,
          autoencoder,
          cuda=cuda,
          validation=ds_val,
          epochs=epochs,
          batch_size=batch_size,
          optimizer=ae_optimizer,
          update_callback=training_callback)
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [
        [reverse_vocab[item.item()] for item in topic]
        for topic in decoder_weight.topk(top_words, dim=0)[1].t()
    ]
    for topic in topics:
        print(','.join(topic))
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag='feature_embeddings',
        )
    writer.close()
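# Worked example of the bag-of-words conversion used above (illustrative only):
# each document in train.txt.npy is an array of vocabulary indices, and
# np.bincount with minlength=len(vocab) turns it into a fixed-length count vector.
# For instance, with a 5-word vocabulary:
#   doc = np.array([0, 2, 2, 4])
#   np.bincount(doc.astype('int'), minlength=5)  # -> array([1, 0, 2, 0, 1])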