#preModel = preTrainModel() #saver = tf.train.Saver({"trained_embedding": embedding}) #Embedding training #trainingSession(preModel) #preTraining = False model = model() total_loss = loss(model) train_op = train(total_loss, lr) init = tf.global_variables_initializer() sess.run(init) print("Starting Training... Please Wait...") best_f1 = -np.inf current_lr = lr best_epoch = 0 for fold in range(0, 5): # load the dataset train_set, valid_set, test_set, dic = load.atisfold(fold) idx2label = dict((k, v) for v, k in dic['labels2idx'].items()) idx2word = dict((k, v) for v, k in dic['words2idx'].items()) train_lex, train_ne, train_y = train_set valid_lex, valid_ne, valid_y = valid_set test_lex, test_ne, test_y = test_set # maxSentenceLength = np.amax([len(i) for i in train_lex+train_ne+train_y]) nsentences = len(train_lex) print("Fold ", fold + 1) best_f1 = trainingSession(model, best_f1)
# NOTE(review): this chunk begins MID-DICT — the opening assignment
# (e.g. `s = {` with the 'fold' entry) was lost in extraction; restore it
# before this fragment can run.
    'lr': 0.0627142536696559,
    'verbose': 1,
    'decay': False,  # decay on the learning rate if improvement stops
    'win': 7,  # number of words in the context window
    'bs': 9,  # number of backprop through time steps
    'nhidden': 100,  # number of hidden units
    'seed': 345,
    'emb_dimension': 100,  # dimension of word embedding
    'nepochs': 50
}

# Output folder named after this script (basename without extension).
folder = os.path.basename(__file__).split('.')[0]
if not os.path.exists(folder):
    os.mkdir(folder)

# load the dataset
train_set, valid_set, test_set, dic = load.atisfold(s['fold'])
# Invert vocabularies: index -> label / index -> word.
idx2label = dict((k, v) for v, k in dic['labels2idx'].items())
idx2word = dict((k, v) for v, k in dic['words2idx'].items())
train_lex, train_ne, train_y = train_set
valid_lex, valid_ne, valid_y = valid_set
test_lex, test_ne, test_y = test_set

# Vocabulary / class counts from the union of all split tokens.
# NOTE(review): `len(set(...))` over concatenated sentences is O(total tokens);
# len(dic['words2idx']) would give the same number directly if the dicts are
# complete — TODO confirm before simplifying.
vocsize = len(set(reduce(\
    lambda x, y: list(x)+list(y),\
    train_lex+valid_lex+test_lex)))
nclasses = len(set(reduce(\
    lambda x, y: list(x)+list(y),\
    train_y+test_y+valid_y)))
def main():
    """Train an Elman RNN slot-filler on one ATIS fold with early stopping.

    Python 2 code (print statements, xrange, iteritems). Relies on module-level
    helpers defined elsewhere in this project: load.atisfold, ElmanRNNModel,
    shuffle, contextwin, minibatch, conlleval, and the LOAD flag.
    Side effects: creates/overwrites files under a folder named after this
    script; saves the best model there; prints progress to stdout.
    """
    settings = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    # Working folder named after this script; holds model dumps + eval files.
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(settings['fold'])
    # Invert vocabularies: index -> label / index -> word.
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())
    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model (seed both RNGs for reproducibility)
    numpy.random.seed(settings['seed'])
    random.seed(settings['seed'])
    if LOAD:
        # Resume from a previously saved model in `folder`.
        print "Loading model from %s..." % folder
        rnn = ElmanRNNModel.load(folder)
    else:
        rnn = ElmanRNNModel(
            hidden_dims=settings['nhidden'],
            num_classes=nclasses,
            vocab_size=vocsize,
            embed_dims=settings['emb_dimension'],
            context_size=settings['win']
        )

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    settings['current_lr'] = settings['lr']
    for e in xrange(settings['nepochs']):
        # shuffle the three aligned lists in lockstep (same seed keeps them aligned)
        shuffle([train_lex, train_ne, train_y], settings['seed'])
        settings['current_epoch'] = e
        tic = time.time()
        for i in xrange(nsentences):
            # Expand sentence i into per-word context windows, then into
            # incremental minibatches of `bs` backprop-through-time steps.
            cwords = contextwin(train_lex[i], settings['win'])
            words = map(
                lambda x: numpy.asarray(x).astype('int32'),
                minibatch(cwords, settings['bs'])
            )
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, settings['current_lr'])
                # normalize() after every step — presumably re-normalizes the
                # embedding rows; confirm against ElmanRNNModel.
                rnn.normalize()
            if settings['verbose']:
                # Trailing commas + '\r' overwrite the same console line.
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [
            map(lambda x: idx2label[x],
                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
            for x in test_lex
        ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [
            map(
                lambda idx: idx2label[idx],
                rnn.classify(
                    numpy.asarray(contextwin(x, settings['win'])).astype('int32'))
            )
            for x in valid_lex
        ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test,
                             words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        # Early-stopping bookkeeping: keep the model with the best valid F1.
        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if settings['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20
            settings['vf1'], settings['vp'], settings['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            settings['tf1'], settings['tp'], settings['tr'] = res_test['f1'], res_test['p'], res_test['r']
            settings['be'] = e
            # NOTE(review): 'mv' shells out — non-portable; os.rename would do.
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        # NOTE(review): settings['be'] is only set after a first improvement;
        # with decay=True and no improvement in epoch 0 this raises KeyError.
        if settings['decay'] and abs(settings['be'] - settings['current_epoch']) >= 10:
            settings['current_lr'] *= 0.5
        if settings['current_lr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', e, 'valid F1', settings['vf1'], 'best test F1', settings['tf1'], 'with the model', folder
#!/usr/bin/env python # -*- coding: utf-8 -*- import data.load as load import numpy as np from functools import reduce train_set, valid_set, test_set, dic = load.atisfold(3) idx2label = dict((v,k) for k,v in dic['labels2idx'].items()) idx2word = dict((v,k) for k,v in dic['words2idx'].items()) train_lex, train_ne, train_y = train_set valid_lex, valid_ne, valid_y = valid_set test_lex, test_ne, test_y = test_set sent = train_lex[0] print(list(map(lambda x: idx2word[x], sent))) vocsize = len(set(reduce(lambda x, y: list(x)+list(y), train_lex+valid_lex+test_lex))) nclasses = len(set(reduce(lambda x, y: list(x)+list(y), train_y+valid_y+test_y))) nsentences = len(train_lex) print("vocsize: %d, # of classes: %d, # of sentences: %d" % (vocsize, nclasses, nsentences)) def context_window(sentence, width=3): """
import random


def random_digit(n):
    """Return a string of n independently chosen random decimal digits."""
    num = list(range(10))
    ans = ''.join([str(random.choice(num)) for _ in range(n)])
    return ans


# Accumulators for the exported corpus (filled in the loop below).
text = []
entities = []
is_train = []
for i in range(5):
    w2ne, w2la = {}, {}
    # NOTE(review): `ml` is not defined in this chunk — presumably the data
    # loader module (elsewhere called `load`); confirm the import.
    train, _, test, dic = ml.atisfold(i)
    w2idx, ne2idx, labels2idx = dic['words2idx'], dic['tables2idx'], dic['labels2idx']
    # Invert the three vocabularies: index -> word / NE / label (Python 2 iteritems).
    idx2w = dict((v, k) for k, v in w2idx.iteritems())
    idx2ne = dict((v, k) for k, v in ne2idx.iteritems())
    idx2la = dict((v, k) for k, v in labels2idx.iteritems())
    test_x, test_ne, test_label = test
    train_x, train_ne, train_label = train
    wlength = 35
    for e in ['train', 'test']:
        # NOTE(review): eval() on constructed variable names is fragile and
        # unsafe in general — a dict {'train': train_x, ...} would be cleaner.
        # NOTE(review): this loop body is TRUNCATED in this chunk.
        for sw, se, sl in zip(eval(e + '_x'), eval(e + '_ne'), eval(e + '_label')):
# Hyper-parameters for the ATIS slot-filling experiment (Python 2 script).
s = {
    'fold': 3,              # which of the 5 folds (0..4) to train on
    'lr': 0.0627142536696559,
    'verbose': 1,
    'decay': False,         # halve the learning rate when progress stalls
    'win': 7,               # context-window width in words
    'bs': 9,                # backprop-through-time minibatch steps
    'nhidden': 100,         # hidden-layer size
    'seed': 345,
    'emb_dimension': 100,   # word-embedding dimensionality
    'nepochs': 50,
}

# Results/checkpoints go into a folder named after this script.
folder = os.path.basename(__file__).split('.')[0]
if not os.path.exists(folder):
    os.mkdir(folder)

# load the dataset
train_set, valid_set, test_set, dic = load.atisfold(s['fold'])
# Invert the vocabularies so indices map back to their strings.
idx2label = {idx: lab for lab, idx in dic['labels2idx'].iteritems()}
idx2word = {idx: word for word, idx in dic['words2idx'].iteritems()}
train_lex, train_ne, train_y = train_set
valid_lex, valid_ne, valid_y = valid_set
test_lex, test_ne, test_y = test_set

# Count distinct word indices and distinct label indices across every split.
_words_seen = set()
for _sentence in train_lex + valid_lex + test_lex:
    _words_seen.update(_sentence)
vocsize = len(_words_seen)

_labels_seen = set()
for _labels in train_y + test_y + valid_y:
    _labels_seen.update(_labels)
nclasses = len(_labels_seen)