Example 1
def fit(self, X, y=None) -> None:
    documents, features = X.shape
    ds = CountTensorDataset(X.astype(np.float32))
    self.autoencoder = ProdLDA(in_dimension=features,
                               hidden1_dimension=self.hidden1_dimension,
                               hidden2_dimension=self.hidden2_dimension,
                               topics=self.topics)
    if self.cuda:
        self.autoencoder.cuda()
    ae_optimizer = Adam(self.autoencoder.parameters(),
                        lr=self.lr,
                        betas=(0.99, 0.999))
    train(
        ds,
        self.autoencoder,
        cuda=self.cuda,
        validation=None,
        epochs=self.epochs,
        batch_size=self.batch_size,
        optimizer=ae_optimizer,
        sampler=WeightedRandomSampler(torch.ones(documents),
                                      min(documents, self.samples)),
        silent=True,
        num_workers=0  # TODO changing this on Mac causes a bug
    )
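
A minimal usage sketch for the fit method above, assuming it belongs to a scikit-learn-style wrapper (named ProdLDATransformer here purely for illustration) whose constructor sets the topics, hidden1_dimension, hidden2_dimension, lr, epochs, batch_size, samples and cuda attributes referenced in the body:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["topic models learn word co-occurrence",
        "the cat sat on the mat",
        "dogs chase cats"]
X = CountVectorizer().fit_transform(docs)  # sparse document-term count matrix

# ProdLDATransformer is a hypothetical name for the class that owns fit() above
model = ProdLDATransformer(topics=10, hidden1_dimension=100, hidden2_dimension=100)
model.fit(X)  # builds the ProdLDA autoencoder and runs the training loop shown above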
Example 2
def test_train():
    autoencoder = Mock()
    autoencoder.return_value = [torch.tensor([1, 1], dtype=torch.float)] * 3
    autoencoder.loss.return_value = torch.tensor(
        [1, 1], dtype=torch.float).requires_grad_()
    optimizer = Mock()
    dataset = TensorDataset(torch.zeros(100, 1000))
    train(dataset=dataset,
          autoencoder=autoencoder,
          epochs=1,
          batch_size=10,
          optimizer=optimizer)
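    # 100 samples with batch_size=10 -> 10 batches in the single epoch,
    # so 10 forward passes and 10 optimizer steps are expected below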
    autoencoder.train.assert_called_once()
    assert autoencoder.call_count == 10
    assert optimizer.zero_grad.call_count == 10
    assert optimizer.step.call_count == 10
Example 3
def test_train_validation():
    autoencoder = Mock()
    autoencoder.return_value = [torch.zeros(10, 10).float()] * 3
    autoencoder.loss.return_value = torch.zeros(10,
                                                10).float().requires_grad_()
    optimizer = Mock()
    dataset = TensorDataset(torch.zeros(100, 1000))
    validation_dataset = TensorDataset(torch.zeros(10, 1000))
    train(dataset=dataset,
          validation=validation_dataset,
          autoencoder=autoencoder,
          epochs=1,
          batch_size=10,
          optimizer=optimizer)
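    # one epoch: 10 training batches plus one batch over the 10-document
    # validation set, hence 11 forward passes in total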
    assert autoencoder.train.call_count == 2
    assert autoencoder.call_count == 11
    assert optimizer.zero_grad.call_count == 10
    assert optimizer.step.call_count == 10
Example 4
def main(cuda, batch_size, epochs, top_words, testing_mode, verbose_mode):
    print("Loading input data")
    # TODO fix relative paths
    data_train = load_npz("data/train.txt.npz")
    data_val = load_npz("data/test.txt.npz")
    corpus = Sparse2Corpus(data_train, documents_columns=False)
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    writer = SummaryWriter()  # create the TensorBoard object

    # callback invoked during training; uses the writer from the enclosing scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        if verbose_mode:
            decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
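            # decoder weight has shape (vocabulary, topics); topk along dim 0
            # picks, for each topic column, the indices of the top_words
            # highest-weighted vocabulary terms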
            topics = [[
                reverse_vocab[item.item()] for item in topic
            ] for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
            cm = CoherenceModel(
                topics=topics,
                corpus=corpus,
                dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
                coherence="u_mass",
            )
            coherence = cm.get_coherence()
            coherences = cm.get_coherence_per_topic()
            for index, topic in enumerate(topics):
                print(
                    str(index) + ":" + str(coherences[index]) + ":" +
                    ",".join(topic))
            print(coherence)
        else:
            coherence = 0
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
                "coherence": coherence,
            },
            global_step=epoch,
        )

    ds_train = CountTensorDataset(data_train)
    ds_val = CountTensorDataset(data_val)
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.0001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
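        # uniform weights: draw 20000 document indices per epoch, with replacement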
        sampler=WeightedRandomSampler(torch.ones(data_train.shape[0]), 20000),
        num_workers=4,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    cm = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
        coherence="u_mass",
    )
    coherence = cm.get_coherence()
    coherences = cm.get_coherence_per_topic()
    for index, topic in enumerate(topics):
        print(
            str(index) + ":" + str(coherences[index]) + ":" + ",".join(topic))
    print(coherence)
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
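
The scalars logged by the callback and the final feature embedding can then be inspected with TensorBoard, e.g. by running tensorboard --logdir runs (runs/ is the default log directory created by SummaryWriter).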
Example 5
def main(
    cuda,
    batch_size,
    epochs,
    top_words,
    testing_mode,
):
    print("Loading input data")
    # TODO fix relative paths
    data_train = np.load("data/train.txt.npy", encoding="bytes")
    data_val = np.load("data/test.txt.npy", encoding="bytes")
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]

    writer = SummaryWriter()  # create the TensorBoard object

    # callback invoked during training; uses the writer from the enclosing scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
            },
            global_step=epoch,
        )
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [[reverse_vocab[item.item()] for item in topic]
                  for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
        for index, topic in enumerate(topics):
            print(str(index) + ":" + ",".join(topic))

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    for topic in topics:
        print(",".join(topic))
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
Example 6
def main(
    cuda,
    batch_size,
    epochs,
    top_words,
    testing_mode,
):
    print('Loading input data')
    # TODO fix relative paths
    input_train = np.load('data/train.txt.npy', encoding='bytes')
    input_val = np.load('data/test.txt.npy', encoding='bytes')
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
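    # each document in input_* is an array of token indices; np.bincount turns it
    # into a vocabulary-length count vector, e.g.
    # np.bincount([0, 2, 2], minlength=4) -> [1, 0, 2, 0]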
    data_train = np.array([
        np.bincount(doc.astype('int'), minlength=len(vocab))
        for doc in input_train if doc.sum() > 0
    ])
    data_val = np.array([
        np.bincount(doc.astype('int'), minlength=len(vocab))
        for doc in input_val if doc.sum() > 0
    ])

    writer = SummaryWriter()  # create the TensorBoard object

    # callback invoked during training; uses the writer from the enclosing scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        writer.add_scalars(
            'data/autoencoder',
            {
                'lr': lr,
                'loss': loss,
                'perplexity': perplexity,
            },
            global_step=epoch,
        )
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [[reverse_vocab[item.item()] for item in topic]
                  for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
        for index, topic in enumerate(topics):
            print(str(index) + ':' + ','.join(topic))

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print('Training stage.')
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(ds_train,
          autoencoder,
          cuda=cuda,
          validation=ds_val,
          epochs=epochs,
          batch_size=batch_size,
          optimizer=ae_optimizer,
          update_callback=training_callback)
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    for topic in topics:
        print(','.join(topic))
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag='feature_embeddings',
        )
    writer.close()