def test_loss_basic():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5)
    for size in [10, 100, 1000]:
        batch = torch.zeros(size, 10)
        loss = vae.loss(batch, batch, vae.prior_mean, vae.prior_logvar)
        assert loss.shape == (size,)
        assert loss.mean().item() == 0
        assert torch.all(torch.lt(torch.abs(loss), 0)).item() == 0
def test_parameters():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5)
    # encoder
    # two each for the linear units
    assert len(tuple(vae.encoder.parameters())) == 4
    assert (len(
        tuple(param for param in vae.encoder.parameters()
              if param.requires_grad)) == 4)
    # mean and logvar
    # two for the linear, two for the batchnorm
    assert len(tuple(vae.mean.parameters())) == 4
    assert (len(
        tuple(param for param in vae.mean.parameters()
              if param.requires_grad)) == 3)
    assert len(tuple(vae.logvar.parameters())) == 4
    assert (len(
        tuple(param for param in vae.logvar.parameters()
              if param.requires_grad)) == 3)
    # decoder
    # one for the linear, two for the batchnorm
    assert len(tuple(vae.decoder.parameters())) == 3
    # batchnorm has no scale
    assert (len(
        tuple(param for param in vae.decoder.parameters()
              if param.requires_grad)) == 2)
def test_copy_embeddings_model():
    # each lookup entry is a hidden1_dimension-sized vector for a vocab index
    lookup = {9: torch.ones(20), 8: torch.zeros(20).fill_(2)}
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5,
                  word_embeddings=lookup)
    assert vae.encoder.linear1.weight[:, 9].eq(1).all()
    assert vae.encoder.linear1.weight[:, 8].eq(2).all()
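# A hypothetical sketch (not part of the test suite) of building the
# word_embeddings lookup exercised above: a dict mapping vocabulary indices to
# hidden1_dimension-sized vectors. The pretrained_vectors ({word: vector}) and
# vocab ({word: index}) arguments are illustrative assumptions, not part of
# the library.
def build_embedding_lookup(pretrained_vectors, vocab, hidden1_dimension=20):
    lookup = {}
    for word, index in vocab.items():
        if word in pretrained_vectors:
            vector = torch.as_tensor(pretrained_vectors[word],
                                     dtype=torch.float)
            # only keep vectors whose size matches the encoder's first layer
            if vector.shape == (hidden1_dimension,):
                lookup[index] = vector
    return lookup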
def test_forward_dimensions():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5)
    for size in [10, 100, 1000]:
        batch = torch.zeros(size, 10)
        recon, mean, logvar = vae(batch)
        assert recon.shape == batch.shape
        assert mean.shape == (size, 5)
        assert logvar.shape == (size, 5)
def test_not_train_embeddings():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5,
                  train_word_embeddings=False)
    for size in [10, 100, 1000]:
        batch = torch.zeros(size, 10)
        recon, mean, logvar = vae(batch)
        assert recon.shape == batch.shape
        assert mean.shape == (size, 5)
        assert logvar.shape == (size, 5)
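# A minimal single-step training sketch (not part of the test suite), assuming
# the ProdLDA interface exercised above: the forward pass returns
# (recon, mean, logvar) and loss(input, recon, posterior_mean, posterior_logvar)
# returns a per-document loss, as inferred from test_loss_basic and
# test_forward_dimensions. The toy batch values are illustrative.
def example_single_train_step():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5)
    optimizer = torch.optim.Adam(vae.parameters(), lr=0.001, betas=(0.99, 0.999))
    batch = torch.rand(32, 10)  # toy bag-of-words counts
    recon, mean, logvar = vae(batch)
    loss = vae.loss(batch, recon, mean, logvar).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()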
def main(cuda, batch_size, epochs, top_words, testing_mode, verbose_mode):
    print("Loading input data")
    # TODO fix relative paths
    data_train = load_npz("data/train.txt.npz")
    data_val = load_npz("data/test.txt.npz")
    corpus = Sparse2Corpus(data_train, documents_columns=False)
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        if verbose_mode:
            decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
            topics = [[reverse_vocab[item.item()] for item in topic]
                      for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
            cm = CoherenceModel(
                topics=topics,
                corpus=corpus,
                dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
                coherence="u_mass",
            )
            coherence = cm.get_coherence()
            coherences = cm.get_coherence_per_topic()
            for index, topic in enumerate(topics):
                print(
                    str(index) + ":" + str(coherences[index]) + ":" +
                    ",".join(topic))
            print(coherence)
        else:
            coherence = 0
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
                "coherence": coherence,
            },
            global_step=epoch,
        )

    ds_train = CountTensorDataset(data_train)
    ds_val = CountTensorDataset(data_val)
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.0001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
        sampler=WeightedRandomSampler(torch.ones(data_train.shape[0]), 20000),
        num_workers=4,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    cm = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
        coherence="u_mass",
    )
    coherence = cm.get_coherence()
    coherences = cm.get_coherence_per_topic()
    for index, topic in enumerate(topics):
        print(
            str(index) + ":" + str(coherences[index]) + ":" + ",".join(topic))
    print(coherence)
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
def main(
        cuda,
        batch_size,
        epochs,
        top_words,
        testing_mode,
):
    print("Loading input data")
    # TODO fix relative paths
    data_train = np.load("data/train.txt.npy", encoding="bytes")
    data_val = np.load("data/test.txt.npy", encoding="bytes")
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
            },
            global_step=epoch,
        )
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [[reverse_vocab[item.item()] for item in topic]
                  for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
        for index, topic in enumerate(topics):
            print(str(index) + ":" + ",".join(topic))

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    for topic in topics:
        print(",".join(topic))
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
def main(
        cuda,
        batch_size,
        epochs,
        top_words,
        testing_mode,
):
    print('Loading input data')
    # TODO fix relative paths
    input_train = np.load('data/train.txt.npy', encoding='bytes')
    input_val = np.load('data/test.txt.npy', encoding='bytes')
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    data_train = np.array([
        np.bincount(doc.astype('int'), minlength=len(vocab))
        for doc in input_train if doc.sum() > 0
    ])
    data_val = np.array([
        np.bincount(doc.astype('int'), minlength=len(vocab))
        for doc in input_val if doc.sum() > 0
    ])
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'perplexity': perplexity,
        }, global_step=epoch)
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [[reverse_vocab[item.item()] for item in topic]
                  for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
        for index, topic in enumerate(topics):
            print(str(index) + ':' + ','.join(topic))

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print('Training stage.')
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(ds_train,
          autoencoder,
          cuda=cuda,
          validation=ds_val,
          epochs=epochs,
          batch_size=batch_size,
          optimizer=ae_optimizer,
          update_callback=training_callback)
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    for topic in topics:
        print(','.join(topic))
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag='feature_embeddings',
        )
    writer.close()
class ProdLDATransformer(TransformerMixin, BaseEstimator):
    def __init__(self,
                 cuda=None,
                 batch_size=200,
                 epochs=80,
                 hidden1_dimension=100,
                 hidden2_dimension=100,
                 topics=50,
                 lr=0.001,
                 samples=20000,
                 score_num=7,
                 score_type='coherence') -> None:
        self.cuda = torch.cuda.is_available() if cuda is None else cuda
        self.batch_size = batch_size
        self.epochs = epochs
        self.hidden1_dimension = hidden1_dimension
        self.hidden2_dimension = hidden2_dimension
        self.topics = topics
        self.lr = lr
        self.samples = samples
        self.autoencoder = None
        self.score_type = score_type
        self.score_num = score_num
        if self.score_type not in ['coherence']:
            raise ValueError('score_type must be "coherence"')

    def fit(self, X, y=None) -> None:
        documents, features = X.shape
        ds = CountTensorDataset(X.astype(np.float32))
        self.autoencoder = ProdLDA(in_dimension=features,
                                   hidden1_dimension=self.hidden1_dimension,
                                   hidden2_dimension=self.hidden2_dimension,
                                   topics=self.topics)
        if self.cuda:
            self.autoencoder.cuda()
        ae_optimizer = Adam(self.autoencoder.parameters(),
                            lr=self.lr,
                            betas=(0.99, 0.999))
        train(
            ds,
            self.autoencoder,
            cuda=self.cuda,
            validation=None,
            epochs=self.epochs,
            batch_size=self.batch_size,
            optimizer=ae_optimizer,
            sampler=WeightedRandomSampler(torch.ones(documents),
                                          min(documents, self.samples)),
            silent=True,
            num_workers=0  # TODO causes a bug to change this on Mac
        )

    def transform(self, X):
        if self.autoencoder is None:
            raise NotFittedError
        self.autoencoder.eval()
        ds = CountTensorDataset(X.astype(np.float32))
        output = predict(
            ds,
            self.autoencoder,
            encode=True,
            silent=True,
            batch_size=self.batch_size,
            num_workers=0  # TODO causes a bug to change this on Mac
        )
        return output.numpy()

    def score(self, X, y=None, sample_weight=None) -> float:
        # TODO this needs further testing for correctness, WIP
        if self.autoencoder is None:
            raise NotFittedError
        self.autoencoder.eval()
        corpus = Sparse2Corpus(X, documents_columns=False)
        decoder_weight = self.autoencoder.decoder.linear.weight.detach().cpu()
        id2word = {index: str(index) for index in range(X.shape[1])}
        topics = [[str(item.item()) for item in topic]
                  for topic in decoder_weight.topk(
                      min(self.score_num, X.shape[1]), dim=0)[1].t()]
        cm = CoherenceModel(topics=topics,
                            corpus=corpus,
                            dictionary=Dictionary.from_corpus(corpus, id2word),
                            coherence='u_mass')
        return cm.get_coherence()
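# A hypothetical end-to-end usage sketch for ProdLDATransformer; the toy data
# and parameter values here are illustrative assumptions, not part of the
# library. It fits on a sparse document-term count matrix, maps documents into
# the learned topic space, and scores u_mass coherence over the top score_num
# terms per topic.
import numpy as np
from scipy.sparse import csr_matrix


def example_transformer_usage():
    rng = np.random.RandomState(0)
    # toy corpus: 100 documents over a 500-term vocabulary
    counts = csr_matrix(rng.poisson(0.2, size=(100, 500)).astype(np.float32))
    model = ProdLDATransformer(cuda=False,
                               epochs=5,
                               batch_size=50,
                               topics=10,
                               samples=1000)
    model.fit(counts)
    encoded = model.transform(counts)  # one row of topic encodings per document
    print(encoded.shape)
    print(model.score(counts))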