def __init__(self, verbose=True):
    self.verbose = verbose
    if self.verbose:
        printerr('Word embedding path: ' + args.word_embedding)
    self.word_embedding = WordEmbedding(args.word_embedding)

    if self.verbose:
        printerr('Dataset prefix: ' + args.dataset_prefix)
    self.data = SNLI(args.dataset_prefix, args.train_size, True, True)

    # trim the word embeddings to contain only words in the dataset
    if self.verbose:
        printerr("Before trim word embedding, " +
                 str(self.word_embedding.embeddings.size(0)) + " words")
    self.word_embedding.trim_by_counts(self.data.word_counts)
    if self.verbose:
        printerr("After trim word embedding, " +
                 str(self.word_embedding.embeddings.size(0)) + " words")
    self.word_embedding.extend_by_counts(self.data.word_counts)
    if self.verbose:
        printerr("After adding training words, " +
                 str(self.word_embedding.embeddings.size(0)) + " words")
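
    # trim_by_counts presumably drops embedding rows for words that never
    # occur in the dataset, while extend_by_counts adds rows (likely with
    # random initialisation) for dataset words missing from the pretrained
    # embedding; hence the word counts printed above first shrink, then grow.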

    # mark word ids in snli trees
    for _data in self.data.train:
        _data['p_tree'].mark_word_id(self.word_embedding)
        _data['h_tree'].mark_word_id(self.word_embedding)
    for _data in self.data.dev:
        _data['p_tree'].mark_word_id(self.word_embedding)
        _data['h_tree'].mark_word_id(self.word_embedding)

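    # relation_num=3 matches SNLI's three labels
    # (entailment, neutral, contradiction).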
    config = {
        'hidden_dim': args.hidden_dim,
        'relation_num': 3,
        'cuda_flag': args.cuda,
        'drop_p': args.drop_out
    }
    self.model = DoubleAttention_vRNN(self.word_embedding, config)
    self.optimizer = optim.Adadelta(self.model.parameters(),
                                    lr=args.learning_rate)
    if args.dump is not None:
        self.dump = args.dump + self.model.name
    else:
        self.dump = None

    if args.cuda:
        self.model.cuda()
        printerr('Using GPU %s' % str(args.gpu_id))
    printerr('Training ' + self.model.name)


def tree2set(t):
    # collect one string per tree node; a plain local set is enough here,
    # so no global state is needed
    _set = set()

    def func(node):
        _set.add(node.get_str4conceptnet())

    t.postorder_traverse(func)
    return _set
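
# Example with a hypothetical tree: for a parse of "a man sleeps", tree2set
# would return something like {'a', 'man', 'sleeps'}, plus the
# get_str4conceptnet() strings of any internal phrase nodes, since the
# postorder traversal visits every node and the set collapses duplicates.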


word_embedding = WordEmbedding('./data/wordembedding')
snli = SNLI('./data/')

printerr("Before trim word embedding, " +
         str(word_embedding.embeddings.size(0)) + " words")
word_embedding.trim_by_counts(snli.word_counts)
printerr("After trim word embedding, " +
         str(word_embedding.embeddings.size(0)) + " words")
word_embedding.extend_by_counts(snli.train_word_counts)
printerr("After adding training words, " +
         str(word_embedding.embeddings.size(0)) + " words")

phrases = set()
print('Gathering phrases in train data...')
for data in snli.train:
    phrases = phrases | tree2set(data['p_tree'])
import sys

import torch

from word_mapping import *

filename = 'glove/glove.42B.300d.txt'
agg_checkpoint_name = 'saved_models/agg_predictor.pth'
select_checkpoint_name = 'saved_models/sel_predictor.pth'
cond_checkpoint_name = 'saved_models/cond_predictor.pth'

N_word = 300
batch_size = 10
hidden_dim = 100
n_epochs = 5
table_name = 'EMPLOYEE'

word_embed = load_word_emb(filename)

word_emb = WordEmbedding(N_word, word_embed)
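
# N_word=300 must match the vector width of glove.42B.300d.txt; WordEmbedding
# presumably wraps the raw lookup returned by load_word_emb.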

model = Model(hidden_dim, N_word, word_emb)
model.agg_predictor.load_state_dict(torch.load(agg_checkpoint_name))
model.cond_predictor.load_state_dict(torch.load(cond_checkpoint_name))
model.sel_predictor.load_state_dict(torch.load(select_checkpoint_name))
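
# The three checkpoints presumably restore SQLNet-style sub-predictors:
# the aggregation operator, the SELECT column, and the WHERE conditions.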

model.eval()
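
# eval() switches the model to inference mode: dropout is disabled and
# batch norm layers use their running statistics.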

sentence = sys.argv[1]
sentence = process_sentence(sentence)

question = [sentence.split(' ')]

columns = [[['id'], ['batch'], ['name'], ['salary']]]
# (a second table's column list could be appended here for a larger batch)
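
# Sanity check of the prepared inputs (a hypothetical addition; the actual
# predictor calls live inside Model and are not shown in this snippet).
# question is a batch of token lists; columns holds, per batch entry, the
# table's column names, each as its own token list.
print('question tokens:', question[0])
print('columns for %s:' % table_name, [' '.join(c) for c in columns[0]])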