Example #1
import torch.nn as nn
# Deep copies are required so that every layer gets its own parameters
# rather than sharing the prototype modules built below.
from copy import deepcopy as copy


# MultiHeadedAttention, PositionwiseFeedForward, PositionalEncoding, Encoder,
# Decoder, EncoderLayer, DecoderLayer, Embedding, Generator and EncoderDecoder
# are assumed to be defined elsewhere in this project.
def make_model(src_vocab,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    # prototype sub-modules, deep-copied wherever they are reused
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    pe = PositionalEncoding(d_model, dropout)

    # stack of N identical encoder layers
    encoder_layer = EncoderLayer(d_model, copy(attn), copy(ff), dropout)
    encoder = Encoder(encoder_layer, N)

    # stack of N identical decoder layers (self-attention + source attention)
    decoder_layer = DecoderLayer(d_model, copy(attn), copy(attn), copy(ff), dropout)
    decoder = Decoder(decoder_layer, N)

    # token embeddings followed by positional encoding
    src_embed = nn.Sequential(Embedding(src_vocab, d_model), copy(pe))
    tgt_embed = nn.Sequential(Embedding(tgt_vocab, d_model), copy(pe))

    # projects decoder states to target-vocabulary logits
    generator = Generator(d_model, tgt_vocab)

    model = EncoderDecoder(encoder, decoder, src_embed, tgt_embed, generator)

    # Xavier/Glorot initialization for all weight matrices
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
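
A minimal usage sketch of make_model, assuming the Transformer building blocks it references are defined as above in this project; the vocabulary sizes and layer count below are placeholders for a quick smoke test:

# Hypothetical vocabulary sizes; a small 2-layer model is enough to sanity-check the wiring.
tmp_model = make_model(src_vocab=11, tgt_vocab=11, N=2)
print(sum(p.numel() for p in tmp_model.parameters()), "parameters")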
Example #2
    def load_attr_embeddings(self):
        # Load the pre-trained embedding for each configured attribute and cache it.
        for attr_name, attr_embedding_info in self.attr_embedding_map.items():
            print("load attr embedding for : {}".format(attr_name))

            path = attr_embedding_info["path"]
            cache_mod = attr_embedding_info["cache_mod"]
            read_func = attr_embedding_info["read_func"]

            attr_embedding = Embedding(path, cache_mod, read_func)
            attr_embedding.load()

            self.loaded_attr_embedding_map[attr_name] = attr_embedding
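
For reference, a purely hypothetical sketch of what one attr_embedding_map entry could look like: the loop above only relies on the keys "path", "cache_mod" and "read_func" and on the Embedding(path, cache_mod, read_func) constructor, and every value below is an assumption, not taken from the project:

attr_embedding_map = {
    "title": {
        "path": "data/title_embedding.txt",  # hypothetical file location
        "cache_mod": 10000,                  # hypothetical cache setting
        "read_func": read_plain_text,        # hypothetical reader callable
    },
}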
Example #3
    def train(self, learning_rate=0.01, print_step=1000, stop_threshold=0):
        losses = []
        aver_losses = []
        wa_scores = []
        if print_step == 0:
            print_step = self.n_batches

        for epoch in range(self.epochs):
            iteration = 0
            start = time.time()

            with open(self.filename, 'r') as f:
                reader = csv.reader(f)
                for row in reader:
                    # Print step
                    iteration += 1
                    if iteration % print_step == 0:
                        end = time.time()
                        print("Epochs: {}".format(epoch),
                              "Iteration: {}".format(iteration),
                              "Avg. Training loss: {:.4f}".format(np.mean(losses)),
                              "{:.4f} sec/ {} sample".format((end - start), self.batch_size * print_step))
                        aver_losses.append(np.mean(losses))
                        losses = []
                        start = time.time()

                    # Evaluate word analogy every (print_step * 10) iterations
                    if iteration % (print_step * 10) == 0:
                        eval_embedding = Embedding(np.array(self.E), self.int_to_vocab, self.vocab_to_int)
                        wa_score = self.word_analogy.evaluate(eval_embedding, high_level_category=False, restrict_top_words=False)
                        wa_scores.append(wa_score['all'])

                        self.export_model(self.output_dictionary + 'step-{}/'.format(int(iteration)))

                    # Train on one sample: the two word indices stored in this CSV row
                    loss = self._train_one_sample(int(row[0]), int(row[1]), learning_rate)
                    losses.append(loss)

            # End-of-epoch word analogy evaluation
            eval_embedding = Embedding(np.array(self.E), self.int_to_vocab, self.vocab_to_int)
            wa_score = self.word_analogy.evaluate(eval_embedding, high_level_category=False, restrict_top_words=False)
            wa_scores.append(wa_score['all'])
            print('Epochs: {}, WA score: {}'.format(epoch, wa_score['all']))

            # Save a checkpoint every 5 epochs
            if epoch % 5 == 0:
                self.export_model(self.output_dictionary + 'step-{}/'.format(int(epoch)))

        # export losses and word-analogy scores
        utils.save_pkl(aver_losses, self.output_dictionary + config['TRAIN']['loss_file'])
        utils.save_pkl(wa_scores, self.output_dictionary + config['TRAIN']['acc_file'])
Example #4
            # accuracy
            acc = mean(result)

            # result
            print("Category: %-30s, accuracy: %f (all: %d)" %
                  (cat, acc, len(X_cat)))
            predictions[cat] = acc

        # overall: accuracy weighted by the number of questions in each category
        total_count = 0
        acc = 0
        for cat in cat_list:
            cat_count = len(X[cat])
            acc += cat_count * predictions.get(cat)
            total_count += cat_count
        predictions['all'] = acc / total_count
        print("All Category accuracy: %f" % (acc / total_count))

        return predictions


if __name__ == "__main__":
    word_analogy = WordAnalogy()
    word_analogy.set_top_words('../../data/processed data/top_30000_words.txt')
    embedding = Embedding.from_file(
        '../../output/50dim/embedding-e=50-n_sampled=200-epochs=35-batch_size=10000_1.txt'
    )
    result = word_analogy.evaluate(embedding,
                                   high_level_category=False,
                                   restrict_top_words=False)
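
Note that predictions['all'] computed above is a size-weighted average of the per-category accuracies, not a plain mean over categories. A tiny worked example with hypothetical numbers:

# Hypothetical: category A has 30 questions at 0.80 accuracy, category B has 10 at 0.60.
overall = (30 * 0.80 + 10 * 0.60) / (30 + 10)  # 0.75, whereas the unweighted mean would be 0.70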
Example #5
if __name__ == "__main__":
    wordsim = Wordsim()
    word_analogy = WordAnalogy()
    word_analogy.set_top_words('../../data/processed data/top_30000_words.txt')

    suffix = '_1'
    dimension_list = [50, 100, 150, 200]
    wa_list = []
    ws_list = []

    # Evaluate each embedding dimensionality on word analogy and WordSim-353
    for dimension in dimension_list:
        filename = '../../output/convergence_test/3000samples/31epochs/snml/{}dim/embedding-e={}-n_sampled=3000-epochs=31-batch_size=10000.txt'.format(
            dimension, dimension)
        print('Reading: ', filename)
        embedding = Embedding.from_file(filename)

        wa_result = word_analogy.evaluate(embedding,
                                          high_level_category=False,
                                          restrict_top_words=False)
        ws_result = wordsim.evaluate(embedding)

        wa_list.append(wa_result['all'])
        ws_list.append(ws_result['EN-WS-353-ALL'][2])  # index 2 holds the rho score * 100 (see Wordsim.evaluate)

    print('Word analogy: ')
    for wa in wa_list:
        print(wa)
    print('Word sim: ')
    for ws in ws_list:
        print(ws)
Example #6
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F
import torch.optim as optim
import time

from utils.dataset import Dataset
from utils.evaluation import Evaluation
from utils.embedding import Embedding
from utils.utils import stemming, vocabulary
from utils.utils import removeStopwords
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

torch.manual_seed(1)
embedding = Embedding()
data = Dataset()
vocab = vocabulary(data.trainset)
word_to_ix = {word: i for i, word in enumerate(vocab)}


class RNN(nn.Module):
    def __init__(self, embedding_dim, hidden_size, vocab_size):
        super(RNN, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)
        # initialize the embedding weights from pre-trained vectors
        self.embedding.weight.data.copy_(self.loadEmbedding())
        self.t_lstm = nn.LSTM(embedding_dim,
                              hidden_size,
Example #7
    @staticmethod
    def pprint(result):
        from prettytable import PrettyTable
        x = PrettyTable(["Dataset", "Found", "Not Found", "Score (rho)"])
        x.align["Dataset"] = "l"
        for k, v in result.items():
            x.add_row([k, v[0], v[1], v[2]])
        print(x)

    def evaluate(self, embedding):
        result = {}
        for file_name, data in self.dataset.items():
            pred, label, found, notfound = [], [], 0, 0
            for datum in data:
                if embedding.in_vocab(datum[0]) and embedding.in_vocab(datum[1]):
                    found += 1
                    # predicted similarity = cosine between the two word vectors
                    pred.append(umath.cos(embedding.vector(datum[0]), embedding.vector(datum[1])))
                    label.append(datum[2])
                else:
                    notfound += 1
            # (found, not found, rho score * 100)
            result[file_name] = (found, notfound, umath.rho(label, pred) * 100)
        return result


if __name__ == "__main__":
    wordsim = Wordsim()
    embedding = Embedding.from_file('../../output/100dim/embedding-e=100-n_sampled=200-epochs=10-batch_size=10000.txt')
    result = wordsim.evaluate(embedding)
    # wordsim.pprint(result)
    print(result['EN-WS-353-ALL'][2])
Example #8
    def train(self, print_step=1000, stop_threshold=0):
        iteration = 1
        loss = 0
        losses = []
        epoch_sum_loss = 0.
        last_epoch_loss = 999999.
        wa_scores = []
        if print_step == 0:
            print_step = self.n_batches

        try:
            start = time.time()
            while True:
                train_loss, _ = self.sess.run(
                    [self.full_cost, self.full_optimizer])
                # train_loss, _ = self.sess.run([self.cost, self.optimizer])
                loss += train_loss
                epoch_sum_loss += train_loss
                losses.append(train_loss)

                if iteration % print_step == 0:
                    end = time.time()
                    print(
                        "Iteration: {}".format(iteration),
                        "Avg. Training loss: {:.4f}".format(loss / print_step),
                        "{:.4f} sec/ {} sample".format(
                            (end - start), self.batch_size * print_step))
                    loss = 0
                    start = time.time()

                if iteration % self.n_batches == 0:
                    epochs = iteration / self.n_batches
                    epoch_loss = epoch_sum_loss / self.n_batches
                    epoch_sum_loss = 0
                    epoch_loss_diff = np.abs(epoch_loss - last_epoch_loss)
                    print('Epochs {} loss: {}'.format(epochs, epoch_loss))

                    # word analogy score on the current embedding matrix
                    embedding = self.sess.run(self.embedding_g)
                    eval_embedding = Embedding(embedding, self.int_to_vocab,
                                               self.vocab_to_int)
                    wa_score = self.word_analogy.evaluate(
                        eval_embedding,
                        high_level_category=False,
                        restrict_top_words=False)
                    wa_scores.append(wa_score['all'])

                    # stop when the epoch-to-epoch loss change falls below stop_threshold
                    if epoch_loss_diff < stop_threshold:
                        self.epochs = iteration / self.n_batches
                        # output file
                        self.embedding_file = config['TRAIN'][
                            'embedding'].format(self.n_embedding,
                                                self.n_sampled,
                                                int(self.epochs),
                                                self.batch_size)
                        print('Loss diff: {}, stop training.'.format(
                            epoch_loss_diff))
                        print(self.output_dictionary + self.embedding_file)
                        break

                    # Save a checkpoint every 10 epochs
                    if epochs % 10 == 0:
                        self.embedding = self.sess.run(self.embedding_g)
                        self.softmax_w = self.sess.run(self.softmax_w_g)
                        self.export_model(self.output_dictionary +
                                          'step-{}/'.format(int(epochs)))

                    last_epoch_loss = epoch_loss

                iteration += 1
        except tf.errors.OutOfRangeError:
            print("End of dataset")

        # fetch the final embedding matrix and softmax weights from the session
        self.embedding = self.sess.run(self.embedding_g)
        self.softmax_w = self.sess.run(self.softmax_w_g)

        # export losses and word-analogy scores
        utils.save_pkl(losses,
                       self.output_dictionary + config['TRAIN']['loss_file'])
        utils.save_pkl(wa_scores,
                       self.output_dictionary + config['TRAIN']['acc_file'])
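
A minimal usage sketch, assuming a trainer instance (hypothetical name) whose TensorFlow session, dataset pipeline and graph tensors (full_cost, full_optimizer, embedding_g, softmax_w_g) have already been built as in this class:

# Hypothetical instance; training stops once the epoch-to-epoch loss change drops below 1e-3.
trainer.train(print_step=1000, stop_threshold=1e-3)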