def tree2set(t):
    """Collect each node's ConceptNet string via a post-order traversal."""
    result = set()

    def func(node):
        result.add(node.get_str4conceptnet())

    t.postorder_traverse(func)
    return result


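# Standalone script: gather every phrase occurring in the SNLI training
# trees. WordEmbedding, SNLI and printerr are assumed to come from this
# project's own modules; their imports are not part of the snippet.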
word_embedding = WordEmbedding('./data/wordembedding')
snli = SNLI('./data/')

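# Shrink the pretrained table to words seen in the corpus, then add rows
# (presumably freshly initialized) for training words the table lacks.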
printerr("Before trimming, the word embedding has " +
         str(word_embedding.embeddings.size(0)) + " words")
word_embedding.trim_by_counts(snli.word_counts)
printerr("After trimming, the word embedding has " +
         str(word_embedding.embeddings.size(0)) + " words")
word_embedding.extend_by_counts(snli.train_word_counts)
printerr("After adding training words, the word embedding has " +
         str(word_embedding.embeddings.size(0)) + " words")

phrases = set()
print 'Gathering phrases in train data...'
for data in snli.train:
    phrases |= tree2set(data['p_tree'])

# Example 3

import sys

import torch

# load_word_emb, WordEmbedding, Model and process_sentence are assumed to be
# provided by the project's own modules (e.g. via this star import)
from word_mapping import *

filename = 'glove/glove.42B.300d.txt'
agg_checkpoint_name = 'saved_models/agg_predictor.pth'
select_checkpoint_name = 'saved_models/sel_predictor.pth'
cond_checkpoint_name = 'saved_models/cond_predictor.pth'

N_word = 300  # GloVe vector dimensionality, matching the 300d file above
batch_size = 10  # batch_size and n_epochs are training settings, unused below
hidden_dim = 100
n_epochs = 5
table_name = 'EMPLOYEE'

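# load_word_emb presumably parses the GloVe text file into a word -> vector
# dict, which WordEmbedding then wraps for lookup by the three predictors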
word_embed = load_word_emb(filename)

word_emb = WordEmbedding(N_word, word_embed)

model = Model(hidden_dim, N_word, word_emb)
# restore the three trained sub-networks from their checkpoints
model.agg_predictor.load_state_dict(torch.load(agg_checkpoint_name))
model.cond_predictor.load_state_dict(torch.load(cond_checkpoint_name))
model.sel_predictor.load_state_dict(torch.load(select_checkpoint_name))

model.eval()  # switch to inference mode (disables dropout)

sentence = sys.argv[1]
sentence = process_sentence(sentence)

question = [sentence.split(' ')]  # a batch containing one tokenized question

# one table's column headers; further tables could be appended to the batch
columns = [[['id'], ['batch'], ['name'], ['salary']]]
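
# Hypothetical invocation of this inference snippet (file name assumed):
#     python infer.py "show the name and salary of each employee"

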
class Trainer(object):
    def __init__(self, verbose=True):
        self.verbose = verbose
        if self.verbose:
            printerr('Word embedding path: ' + args.word_embedding)
        self.word_embedding = WordEmbedding(args.word_embedding)

        if self.verbose:
            printerr('Dataset prefix: ' + args.dataset_prefix)
        self.data = SNLI(args.dataset_prefix, args.train_size, True, True)

        # trim the word embeddings to contain only words in the dataset
        if self.verbose:
            printerr("Before trimming, the word embedding has " +
                     str(self.word_embedding.embeddings.size(0)) + " words")
        self.word_embedding.trim_by_counts(self.data.word_counts)
        if self.verbose:
            printerr("After trimming, the word embedding has " +
                     str(self.word_embedding.embeddings.size(0)) + " words")
        self.word_embedding.extend_by_counts(self.data.word_counts)
        if self.verbose:
            printerr("After adding training words, the word embedding has " +
                     str(self.word_embedding.embeddings.size(0)) + " words")

        # mark word ids in snli trees
        for _data in self.data.train:
            _data['p_tree'].mark_word_id(self.word_embedding)
            _data['h_tree'].mark_word_id(self.word_embedding)
        for _data in self.data.dev:
            _data['p_tree'].mark_word_id(self.word_embedding)
            _data['h_tree'].mark_word_id(self.word_embedding)

        config = {
            'hidden_dim': args.hidden_dim,
            'relation_num': 3,  # SNLI labels: entailment, neutral, contradiction
            'cuda_flag': args.cuda,
            'drop_p': args.drop_out
        }
        self.model = DoubleAttention_vRNN(self.word_embedding, config)
        self.optimizer = optim.Adadelta(self.model.parameters(),
                                        lr=args.learning_rate)
        # `dump` is the path prefix used when saving checkpoints in train()
        if args.dump is not None:
            self.dump = args.dump + self.model.name
        else:
            self.dump = None

        if args.cuda:
            self.model.cuda()
            printerr('Using GPU %s' % str(args.gpu_id))
        printerr('Training ' + self.model.name)

    def train(self):
        best_dev_acc = 0.0
        profiler = SimpleProfiler()
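        # SimpleProfiler is assumed to provide reset/start/pause and
        # get_time, accumulating wall-clock time per named section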

        for i in xrange(args.epoches):
            if self.verbose:
                printerr("Starting epoch %d" % i)

            profiler.reset('train')
            profiler.start("train")
            train_loss = self.train_step(self.data.train)
            profiler.pause("train")

            print 'epoch:', i, 'train_loss:', train_loss, 'time:', profiler.get_time('train')

            if (i + 1) % args.valid_freq == 0:
                profiler.reset('dev')
                profiler.start('dev')
                dev_acc = self.eval_step(self.data.dev)
                profiler.pause('dev')
                print '\t evaluating at epoch:', i, 'acc:', dev_acc, \
                    'time:', profiler.get_time('dev')

                if dev_acc > best_dev_acc:
                    best_dev_acc = dev_acc

                    if self.dump is not None:
                        file_name = "%s.epoch%d.acc%.4f.pickle" % (
                            self.dump, i, dev_acc)
                        printerr("saving weights to " + file_name)
                        torch.save(self.model.state_dict(), file_name)

    def train_step(self, data):
        total = len(data)
        index = 0
        train_loss = 0.0
        print 'training model'
        for _data in data:
            p_tree = _data['p_tree']
            h_tree = _data['h_tree']
            target = torch.LongTensor([_data['label']])
            if args.cuda:
                target = target.cuda()
            target = Variable(target)
            self.optimizer.zero_grad()
            output = self.model(p_tree, h_tree)
            # presumably releases the Variables cached on the tree nodes
            p_tree.clear_vars()
            h_tree.clear_vars()
            # loss = F.nll_loss(output, target)
            # F.cross_entropy combines log_softmax and nll_loss internally
            loss = F.cross_entropy(output, target)
            loss.backward()
            self.optimizer.step()
            index += 1
            train_loss += loss.data[0]
            # crude progress display: '\r' rewinds to overwrite the same line
            print '\r', index, '/', total, 'loss:', loss.data[0],
        print ''
        return train_loss

    def eval_step(self, data):
        """Return classification accuracy on `data` (no parameter updates)."""
        right_count = 0
        for _data in data:
            p_tree = _data['p_tree']
            h_tree = _data['h_tree']
            output = self.model(p_tree, h_tree)
            p_tree.clear_vars()
            h_tree.clear_vars()
            # index of the highest-scoring relation (old-style Tensor indexing)
            max_v = output.data.max(1)[1][0][0]
            if max_v == _data['label']:
                right_count += 1
        return float(right_count) / float(len(data))
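

# A minimal way to drive the trainer, assuming this file also builds a
# module-level `args` namespace with argparse (as the code above expects):
if __name__ == '__main__':
    Trainer(verbose=True).train()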