Beispiel #1
0
from tmnt.estimator import BowEstimator, CovariateBowEstimator
import numpy as np
import gluonnlp as nlp
import os
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.inference import BowVAEInferencer

# Example: fit a covariate-conditioned topic model on a subset of 20 Newsgroups
# and list the top terms per topic for one covariate (label) value.

n_samples = 2000   # number of documents taken from the (shuffled) corpus
n_features = 1000  # vocabulary size used by the vectorizer

# With return_X_y=True the loader returns a (documents, labels) pair.
data, y = fetch_20newsgroups(shuffle=True,
                             random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)
data_samples = data[:n_samples]
# Keep the labels aligned with the documents that are actually vectorized:
# X below has one row per entry of data_samples, so the label array passed
# to fit() must be sliced the same way.
y_samples = y[:n_samples]
tf_vectorizer = TMNTVectorizer(vocab_size=n_features)
X, _ = tf_vectorizer.fit_transform(data_samples)

# The number of covariate values is taken from the full label array so that
# every label index that can occur is representable.
num_covar_values = int(np.max(y)) + 1  # get the number of possible labels
m_estimator = CovariateBowEstimator(tf_vectorizer.get_vocab(),
                                    num_covar_values)
_ = m_estimator.fit(X, y_samples)  # fit a covariate model using the matching labels
m_inferencer = BowVAEInferencer(m_estimator.model)

## the following returns a list of top 5 words per topic per covariate/label
t_terms = m_inferencer.get_top_k_words_per_topic_per_covariate(5)

## top-5 terms for each topic over label/covariate index = 4
cov_4_topics = t_terms[4]
Beispiel #2
0
                              remove=('headers', 'footers', 'quotes'),
                              return_X_y=True)
# Prepare train/dev splits and BERT-ready datasets for a sequence + bag-of-words
# model. `data` and `y` are defined above this fragment (not visible here).

# First 2000 documents/labels for training, last 2000 for development.
train_data = data[:2000]
dev_data   = data[-2000:]
train_y    = y[:2000]
dev_y      = y[-2000:]
# Pretrained BERT model name and the dataset name it is paired with; both are
# forwarded to get_bert_datasets below.
model_name = 'bert_12_768_12'
dataset = 'book_corpus_wiki_en_uncased'
batch_size = 32
seq_len = 64
# NOTE(review): `pad` is assigned here but never read in the visible code; the
# get_bert_datasets call below passes pad=False explicitly. Confirm which value
# is intended.
pad = True
# Pair each document with its label.
tr_ds = ArrayDataset(train_data, train_y)
dev_ds = ArrayDataset(dev_data, dev_y)

vectorizer = TMNTVectorizer(vocab_size=2000)
# Return value is discarded; presumably this call is made only to fit the
# vectorizer's vocabulary on the training documents - TODO confirm.
vectorizer.fit_transform(train_data)

ctx = mx.cpu() ## or mx.gpu(N) if using GPU device=N

# The fifth returned value is ignored.
tr_dataset, dev_dataset, num_examples, bert_base, _ = get_bert_datasets(None, vectorizer,
                                                                        tr_ds, dev_ds, batch_size, seq_len,
                                                                        bert_model_name=model_name,
                                                                        bert_dataset=dataset,
                                                                        pad=False, ctx=ctx)
# Number of classes derived from the largest label index in the full label array.
num_classes = int(np.max(y) + 1)

estimator = SeqBowEstimator(bert_base, bert_model_name = model_name, bert_data_name = dataset,
                            n_labels = num_classes,
                            bow_vocab = vectorizer.get_vocab(),
                            optimizer='bertadam',
                            batch_size=batch_size, ctx=ctx, log_interval=1,
Beispiel #3
0
"""

from tmnt.estimator import BowEstimator
import numpy as np
import gluonnlp as nlp
import os
import umap
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.inference import BowVAEInferencer

# Example: train a joint topic + classification model on 20 Newsgroups and
# plot UMAP embeddings of the documents against their labels.

# Loader options: shuffled with a fixed seed, metadata stripped, and
# return_X_y=True so a (documents, labels) pair comes back.
newsgroup_options = dict(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
    return_X_y=True,
)
data, y = fetch_20newsgroups(**newsgroup_options)

# Vectorize the whole corpus; only the first returned element is used.
tf_vectorizer = TMNTVectorizer(vocab_size=1000)
X, _ = tf_vectorizer.fit_transform(data)

# Labels are integer indices, so the label count is the largest index plus one.
num_label_values = 1 + int(np.max(y))

## gamma balances the unsupervised and supervised losses:
## total loss = topic_loss + gamma * classification_loss
gamma = 1.0

vocabulary = tf_vectorizer.get_vocab()
l_estimator = BowEstimator(vocabulary, n_labels=num_label_values, gamma=gamma)

# Fit the joint topic + classification model using y, then validate on the
# same documents and labels.
_ = l_estimator.fit(X, y)
v_results = l_estimator.validate(X, y)

# Wrap the trained model, compute UMAP embeddings, and plot them with labels.
l_inferencer = BowVAEInferencer(l_estimator.model)
embeddings = l_inferencer.get_umap_embeddings(X)
l_inferencer.plot_to(embeddings, y, None)