Example #1
    def fit(self, X, y=None) -> None:
        documents, features = X.shape
        ds = CountTensorDataset(X.astype(np.float32))
        self.autoencoder = ProdLDA(in_dimension=features,
                                   hidden1_dimension=self.hidden1_dimension,
                                   hidden2_dimension=self.hidden2_dimension,
                                   topics=self.topics)
        if self.cuda:
            self.autoencoder.cuda()
        ae_optimizer = Adam(self.autoencoder.parameters(),
                            lr=self.lr,
                            betas=(0.99, 0.999))
        train(
            ds,
            self.autoencoder,
            cuda=self.cuda,
            validation=None,
            epochs=self.epochs,
            batch_size=self.batch_size,
            optimizer=ae_optimizer,
            sampler=WeightedRandomSampler(torch.ones(documents),
                                          min(documents, self.samples)),
            silent=True,
            num_workers=0  # TODO causes a bug to change this on Mac
        )
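
A minimal sketch (not from the original code) of what the uniform WeightedRandomSampler in the call above does: it draws min(documents, samples) document indices per epoch, with replacement by default, which is standard torch.utils.data behaviour.

# Sketch only: a uniform WeightedRandomSampler draws a fixed number of
# document indices per epoch (with replacement by default), capping the
# effective epoch size at min(documents, samples).
import torch
from torch.utils.data import WeightedRandomSampler

documents, samples = 10, 4
sampler = WeightedRandomSampler(torch.ones(documents), min(documents, samples))
print(list(sampler))  # e.g. [7, 2, 2, 9]: four indices drawn uniformly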
Example #2
def test_loss_basic():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5)
    for size in [10, 100, 1000]:
        batch = torch.zeros(size, 10)
        loss = vae.loss(batch, batch, vae.prior_mean, vae.prior_logvar)
        assert loss.shape == (size, )
        assert loss.mean().item() == 0
        # with an all-zero batch every element of the loss is exactly zero
        assert torch.all(torch.eq(loss, torch.zeros_like(loss))).item() == 1
Example #3
def test_parameters():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5)
    # encoder
    # two each for the linear units
    assert len(tuple(vae.encoder.parameters())) == 4
    assert (len(
        tuple(param for param in vae.encoder.parameters()
              if param.requires_grad)) == 4)
    # mean and logvar
    # two parameters each for the linear and the batchnorm, but the
    # batchnorm scale is frozen, so only three require gradients
    assert len(tuple(vae.mean.parameters())) == 4
    assert (len(
        tuple(param for param in vae.mean.parameters()
              if param.requires_grad)) == 3)
    assert len(tuple(vae.logvar.parameters())) == 4
    assert (len(
        tuple(param for param in vae.logvar.parameters()
              if param.requires_grad)) == 3)
    # decoder
    # one for the linear, two for the batchnorm
    assert len(tuple(vae.decoder.parameters())) == 3
    # the batchnorm scale is frozen (requires_grad=False), so only two are trainable
    assert (len(
        tuple(param for param in vae.decoder.parameters()
              if param.requires_grad)) == 2)
Example #4
def test_copy_embeddings_model():
    # each lookup entry must be a vector of length hidden1_dimension (20)
    lookup = {9: torch.ones(20), 8: torch.ones(20).fill_(2)}
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5,
                  word_embeddings=lookup)
    assert vae.encoder.linear1.weight[:, 9].eq(1).all()
    assert vae.encoder.linear1.weight[:, 8].eq(2).all()
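
The word_embeddings lookup maps vocabulary indices to vectors of length hidden1_dimension, which end up as columns of encoder.linear1.weight (as the assertions above show). A minimal sketch of building such a lookup from a pretrained embedding matrix; vocab and embedding_matrix are hypothetical names used for illustration, not part of the library.

# Sketch only: turn a (vocab_size, hidden1_dimension) embedding matrix into
# the {index: tensor} lookup that ProdLDA(word_embeddings=...) expects above.
import numpy as np
import torch

hidden1_dimension = 20                         # must match the model
vocab = {"apple": 0, "banana": 1, "pear": 2}   # hypothetical token -> index map
embedding_matrix = np.random.rand(len(vocab), hidden1_dimension).astype(np.float32)

word_embeddings = {
    index: torch.from_numpy(embedding_matrix[index])
    for index in vocab.values()
}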
Example #5
def test_forward_dimensions():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5)
    for size in [10, 100, 1000]:
        batch = torch.zeros(size, 10)
        recon, mean, logvar = vae(batch)
        assert recon.shape == batch.shape
        assert mean.shape == (size, 5)
        assert logvar.shape == (size, 5)
Example #6
def test_not_train_embeddings():
    vae = ProdLDA(in_dimension=10,
                  hidden1_dimension=20,
                  hidden2_dimension=10,
                  topics=5,
                  train_word_embeddings=False)
    for size in [10, 100, 1000]:
        batch = torch.zeros(size, 10)
        recon, mean, logvar = vae(batch)
        assert recon.shape == batch.shape
        assert mean.shape == (size, 5)
        assert logvar.shape == (size, 5)
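
The test above only checks output shapes; whether train_word_embeddings=False actually freezes the embedding layer can be inspected directly. A minimal sketch using only the standard PyTorch named_parameters API; the expectation that linear1 is the embedding layer comes from Example #4, while what the flag freezes is an assumption to verify, not a documented guarantee.

# Sketch only (not part of the original tests): list which encoder parameters
# remain trainable when train_word_embeddings=False.
frozen = ProdLDA(in_dimension=10,
                 hidden1_dimension=20,
                 hidden2_dimension=10,
                 topics=5,
                 train_word_embeddings=False)
for name, parameter in frozen.encoder.named_parameters():
    print(name, parameter.requires_grad)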
Example #7
def main(cuda, batch_size, epochs, top_words, testing_mode, verbose_mode):
    print("Loading input data")
    # TODO fix relative paths
    data_train = load_npz("data/train.txt.npz")
    data_val = load_npz("data/test.txt.npz")
    corpus = Sparse2Corpus(data_train, documents_columns=False)
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        if verbose_mode:
            decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
            topics = [[
                reverse_vocab[item.item()] for item in topic
            ] for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
            cm = CoherenceModel(
                topics=topics,
                corpus=corpus,
                dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
                coherence="u_mass",
            )
            coherence = cm.get_coherence()
            coherences = cm.get_coherence_per_topic()
            for index, topic in enumerate(topics):
                print(
                    str(index) + ":" + str(coherences[index]) + ":" +
                    ",".join(topic))
            print(coherence)
        else:
            coherence = 0
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
                "coherence": coherence,
            },
            global_step=epoch,
        )

    ds_train = CountTensorDataset(data_train)
    ds_val = CountTensorDataset(data_val)
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.0001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
        sampler=WeightedRandomSampler(torch.ones(data_train.shape[0]), 20000),
        num_workers=4,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    cm = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
        coherence="u_mass",
    )
    coherence = cm.get_coherence()
    coherences = cm.get_coherence_per_topic()
    for index, topic in enumerate(topics):
        print(
            str(index) + ":" + str(coherences[index]) + ":" + ",".join(topic))
    print(coherence)
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
Example #8
def main(
    cuda,
    batch_size,
    epochs,
    top_words,
    testing_mode,
):
    print("Loading input data")
    # TODO fix relative paths
    data_train = np.load("data/train.txt.npy", encoding="bytes")
    data_val = np.load("data/test.txt.npy", encoding="bytes")
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]

    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
            },
            global_step=epoch,
        )
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [[reverse_vocab[item.item()] for item in topic]
                  for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
        for index, topic in enumerate(topics):
            print(str(index) + ":" + ",".join(topic))

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    for topic in topics:
        print(",".join(topic))
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
Example #9
def main(
    cuda,
    batch_size,
    epochs,
    top_words,
    testing_mode,
):
    print('Loading input data')
    # TODO fix relative paths
    input_train = np.load('data/train.txt.npy', encoding='bytes')
    input_val = np.load('data/test.txt.npy', encoding='bytes')
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    data_train = np.array([
        np.bincount(doc.astype('int'), minlength=len(vocab))
        for doc in input_train if doc.sum() > 0
    ])
    data_val = np.array([
        np.bincount(doc.astype('int'), minlength=len(vocab))
        for doc in input_val if doc.sum() > 0
    ])

    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        writer.add_scalars(
            'data/autoencoder',
            {
                'lr': lr,
                'loss': loss,
                'perplexity': perplexity,
            },
            global_step=epoch,
        )
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [[reverse_vocab[item.item()] for item in topic]
                  for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
        for index, topic in enumerate(topics):
            print(str(index) + ':' + ','.join(topic))

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print('Training stage.')
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(ds_train,
          autoencoder,
          cuda=cuda,
          validation=ds_val,
          epochs=epochs,
          batch_size=batch_size,
          optimizer=ae_optimizer,
          update_callback=training_callback)
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    for topic in topics:
        print(','.join(topic))
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag='feature_embeddings',
        )
    writer.close()
Example #10
class ProdLDATransformer(TransformerMixin, BaseEstimator):
    def __init__(self,
                 cuda=None,
                 batch_size=200,
                 epochs=80,
                 hidden1_dimension=100,
                 hidden2_dimension=100,
                 topics=50,
                 lr=0.001,
                 samples=20000,
                 score_num=7,
                 score_type='coherence') -> None:
        self.cuda = torch.cuda.is_available() if cuda is None else cuda
        self.batch_size = batch_size
        self.epochs = epochs
        self.hidden1_dimension = hidden1_dimension
        self.hidden2_dimension = hidden2_dimension
        self.topics = topics
        self.lr = lr
        self.samples = samples
        self.autoencoder = None
        self.score_type = score_type
        self.score_num = score_num
        if self.score_type not in ['coherence']:
            raise ValueError('score_type must be "coherence"')

    def fit(self, X, y=None) -> None:
        documents, features = X.shape
        ds = CountTensorDataset(X.astype(np.float32))
        self.autoencoder = ProdLDA(in_dimension=features,
                                   hidden1_dimension=self.hidden1_dimension,
                                   hidden2_dimension=self.hidden2_dimension,
                                   topics=self.topics)
        if self.cuda:
            self.autoencoder.cuda()
        ae_optimizer = Adam(self.autoencoder.parameters(),
                            lr=self.lr,
                            betas=(0.99, 0.999))
        train(
            ds,
            self.autoencoder,
            cuda=self.cuda,
            validation=None,
            epochs=self.epochs,
            batch_size=self.batch_size,
            optimizer=ae_optimizer,
            sampler=WeightedRandomSampler(torch.ones(documents),
                                          min(documents, self.samples)),
            silent=True,
            num_workers=0  # TODO causes a bug to change this on Mac
        )

    def transform(self, X):
        if self.autoencoder is None:
            raise NotFittedError
        self.autoencoder.eval()
        ds = CountTensorDataset(X.astype(np.float32))
        output = predict(
            ds,
            self.autoencoder,
            encode=True,
            silent=True,
            batch_size=self.batch_size,
            num_workers=0  # TODO causes a bug to change this on Mac
        )
        return output.numpy()

    def score(self, X, y=None, sample_weight=None) -> float:
        # TODO this needs further testing for correctness, WIP
        if self.autoencoder is None:
            raise NotFittedError
        self.autoencoder.eval()
        corpus = Sparse2Corpus(X, documents_columns=False)
        decoder_weight = self.autoencoder.decoder.linear.weight.detach().cpu()
        id2word = {index: str(index) for index in range(X.shape[1])}
        topics = [[str(item.item()) for item in topic]
                  for topic in decoder_weight.topk(
                      min(self.score_num, X.shape[1]), dim=0)[1].t()]
        cm = CoherenceModel(topics=topics,
                            corpus=corpus,
                            dictionary=Dictionary.from_corpus(corpus, id2word),
                            coherence='u_mass')
        return cm.get_coherence()
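
For completeness, a minimal end-to-end usage sketch of the transformer above, assuming the class and its dependencies are already imported; the random sparse matrix stands in for a real bag-of-words document-term matrix, and the sizes are illustrative.

# Sketch only: fit on a sparse document-term matrix, then encode documents
# and score topic coherence. All data here is synthetic.
from scipy.sparse import random as sparse_random

X = sparse_random(1000, 5000, density=0.01, format="csr")    # fake term counts
model = ProdLDATransformer(epochs=5, topics=20, samples=2000)
model.fit(X)                       # trains the ProdLDA autoencoder on X
doc_topics = model.transform(X)    # numpy array, one row per document
print(doc_topics.shape)            # width assumed to equal the encoding size
print(model.score(X))              # u_mass coherence of the learned topics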