# create a folder to store the models
if not os.path.exists(model_folder):
    os.mkdir(model_folder)

# train with early stopping on the validation set
best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
s['clr'] = s['lr']  # current learning rate
for e in xrange(s['nepochs']):
    # shuffle
    shuffle([train_lex, train_y, train_feat], s['seed'])
    s['ce'] = e
    tic = time.time()
    for i in xrange(num_sentences):
        # list of lists of indexes corresponding to the context windows
        # surrounding each word in the sentence
        context_words = contextwin(train_lex[i], s['win'])
        words = map(lambda x: numpy.asarray(x).astype('int32'),
                    minibatch(context_words, s['bs']))
        features = minibatch(train_feat[i], s['bs'])
        labels = train_y[i]
        for word_batch, feature_batch, label_last_word in zip(words, features, labels):
            rnn.train(word_batch, feature_batch, label_last_word, s['clr'])
            rnn.normalize()
        if s['verbose']:
            print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / num_sentences),\
                'completed in %.2f (sec) <<\r' % (time.time() - tic),
            sys.stdout.flush()

    # evaluation // back into the real world : idx -> words
    predictions_test = [map(lambda x: idx2label[x],
                            rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))
                        for x in test_lex]
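# The training fragment above leans on two helpers that are not defined in
# this file: contextwin() and minibatch(). A minimal sketch of both, assuming
# they follow the standard Theano RNN-for-SLU tutorial definitions (-1 is the
# PADDING index, reserved as the last row of the embedding matrix):
def contextwin(l, win):
    """Return a list of len(l) context windows (lists of word indexes),
    padding the sequence with -1 on both sides."""
    assert (win % 2) == 1
    assert win >= 1
    l = list(l)
    lpadded = win // 2 * [-1] + l + win // 2 * [-1]
    out = [lpadded[i:(i + win)] for i in range(len(l))]
    assert len(out) == len(l)
    return out

def minibatch(l, bs):
    """Return one slice per element of l: slice i ends at element i and
    contains at most bs elements, e.g. [0,1,2,3], bs=3 ->
    [[0], [0,1], [0,1,2], [1,2,3]]."""
    out = [l[:i] for i in xrange(1, min(bs, len(l) + 1))]
    out += [l[i - bs:i] for i in xrange(bs, len(l) + 1)]
    assert len(l) == len(out)
    return out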
def play_with_spelling():
    """Play with spelling mistakes"""
    print CONF
    np.random.seed(CONF['seed'])
    random.seed(CONF['seed'])

    print "Calculate output"
    session_files = get_session_files(number_of_files=CONF['number_of_files'],
                                      random_seed=CONF['seed'])
    sentences = get_sentences(session_files)
    print len(sentences)
    labels2idx = char2idx = get_char_to_idx(sentences)

    print "Prepare train, validation and test sets"
    train_valid_sentences, test_sentences = train_test_split(
        sentences, test_size=0.15, random_state=CONF['seed'])
    train_sentences, valid_sentences = train_test_split(
        train_valid_sentences, test_size=0.2, random_state=CONF['seed'])
    print len(train_valid_sentences), len(test_sentences)
    test_lex, test_y = create_tests(test_sentences, CONF['error_probability'],
                                    labels2idx, char2idx)
    valid_lex, valid_y = create_tests(valid_sentences, CONF['error_probability'],
                                      labels2idx, char2idx)
    train_lex = []
    train_y = []
    for error_probability in (CONF['error_probability'],
                              CONF['error_probability'] / 10,
                              CONF['error_probability'] / 100, 0):
        _train_idxes, _train_labels_idxes = create_tests(
            train_sentences, error_probability, labels2idx, char2idx)
        train_lex.extend(_train_idxes)
        train_y.extend(_train_labels_idxes)
    print len(train_lex), len(valid_lex), len(train_y), len(valid_y)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems())  # reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems())  # reverse the dictionary
    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
    windowed_test_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32')
                         for x in test_lex]
    windowed_valid_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32')
                          for x in valid_lex]
    words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
    groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
    words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex)
                          for sublist in lex for item in sublist))
    nclasses = 1 + len(set(item for _y in (train_y, test_y, valid_y)
                           for sublist in _y for item in sublist))
    nsentences = len(train_lex)
    words_lex = []
    for i in xrange(nsentences):
        cwords = contextwin(train_lex[i], CONF['win'])
        words = [np.asarray(x).astype(np.int32)
                 for x in minibatch(cwords, CONF['batch_size'])]
        words_lex.append(words)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Create a Neural Network"
    rnn = regular_elman(nh=CONF['nhidden'], nc=nclasses,
                        ne=vocsize, de=CONF['emb_dimension'], cs=CONF['win'])

    # train with early stopping on validation set
    best_f1 = -np.inf
    CONF['current_learning_rate'] = CONF['learning_rate']
    print "Start training"
    start_time = print_time = time.time()
    for epoch in xrange(CONF['nepochs']):
        # shuffle
        shuffle([words_lex, train_y], CONF['seed'])
        CONF['ce'] = epoch
        tic = time.time()
        # float division: with plain ints this would truncate to 0 in Python 2
        percentage_of_sentences_to_train = float(epoch + 1) / CONF['nepochs']
        number_of_sentences_to_train = int(nsentences * percentage_of_sentences_to_train)
        print "starting an epoch, number_of_sentences_to_train =", number_of_sentences_to_train
        test_size = int(len(windowed_test_lex) * percentage_of_sentences_to_train)
        print "test_size", test_size
        validation_size = int(len(windowed_valid_lex) * percentage_of_sentences_to_train)
        print "validation_size", validation_size
        for _ in xrange(30):  # Trauma!
            print "_", _
            for i in xrange(number_of_sentences_to_train):
                words = words_lex[i]
                labels = train_y[i]
                for word_batch, label_last_word in zip(words, labels):
                    rnn.train(word_batch, label_last_word, CONF['current_learning_rate'])
                    rnn.normalize()
                if CONF['verbose'] and time.time() - print_time > 30:
                    print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / number_of_sentences_to_train),\
                        'completed in %.2f (sec) <<\r' % (time.time() - tic),
                    print_time = time.time()

        # evaluation // back into the real world : idx -> words
        if CONF['verbose']:
            print "Classify test"
        predictions_test = [[idx2label[x] for x in rnn.classify(windowed_test_lex_item)]
                            for windowed_test_lex_item in windowed_test_lex[:test_size]]
        if CONF['verbose']:
            print "Classify validation"
        predictions_valid = [[idx2label[x] for x in rnn.classify(windowed_valid_lex_item)]
                             for windowed_valid_lex_item in windowed_valid_lex[:validation_size]]

        # evaluation // compute the accuracy using conlleval.pl
        if CONF['verbose']:
            print "Evaluate test and validation"
        res_test = conlleval(predictions_test, groundtruth_test[:test_size],
                             words_test[:test_size], folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid[:validation_size],
                              words_valid[:validation_size], folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], \
                'best test F1', res_test['f1'], ' ' * 20
            CONF['vf1'], CONF['vp'], CONF['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            CONF['tf1'], CONF['tp'], CONF['tr'] = res_test['f1'], res_test['p'], res_test['r']
            CONF['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], \
                ' test F1', res_test['f1'], ' ' * 20
            # rnn.load(folder)

        # learning rate decay if no improvement in 10 epochs
        if CONF['decay'] and abs(CONF['be'] - CONF['ce']) >= 10:
            CONF['current_learning_rate'] *= 0.5
        if CONF['current_learning_rate'] < 1e-5:
            break

    print 'BEST RESULT: epoch', CONF['be'], 'valid F1', best_f1, \
        'best test F1', CONF['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
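# The epoch loops in this file call shuffle(list_of_lists, seed), which is not
# defined here. A minimal sketch, assuming the usual tutorial behaviour:
# every list is shuffled in place with the same seed, so inputs and labels
# stay aligned after shuffling.
import random

def shuffle(lol, seed):
    """Shuffle each list in lol in place, all in the same order."""
    for l in lol:
        random.seed(seed)
        random.shuffle(l)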
def main():
    settings = {
        'fold': 3,  # 5 folds: 0, 1, 2, 3, 4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop-through-time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(settings['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(settings['seed'])
    random.seed(settings['seed'])
    if LOAD:
        print "Loading model from %s..." % folder
        rnn = ElmanRNNModel.load(folder)
    else:
        rnn = ElmanRNNModel(
            hidden_dims=settings['nhidden'],
            num_classes=nclasses,
            vocab_size=vocsize,
            embed_dims=settings['emb_dimension'],
            context_size=settings['win']
        )

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    settings['current_lr'] = settings['lr']
    for e in xrange(settings['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], settings['seed'])
        settings['current_epoch'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], settings['win'])
            words = map(
                lambda x: numpy.asarray(x).astype('int32'),
                minibatch(cwords, settings['bs'])
            )
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, settings['current_lr'])
                rnn.normalize()
            if settings['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [
            map(lambda x: idx2label[x],
                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
            for x in test_lex
        ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [
            map(lambda idx: idx2label[idx],
                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
            for x in valid_lex
        ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid,
                              folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if settings['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], \
                    'best test F1', res_test['f1'], ' ' * 20
            settings['vf1'], settings['vp'], settings['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            settings['tf1'], settings['tp'], settings['tr'] = res_test['f1'], res_test['p'], res_test['r']
            settings['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        if settings['decay'] and abs(settings['be'] - settings['current_epoch']) >= 10:
            settings['current_lr'] *= 0.5
        if settings['current_lr'] < 1e-5:
            break

    # report the best epoch, not the last one
    print 'BEST RESULT: epoch', settings['be'], 'valid F1', settings['vf1'], \
        'best test F1', settings['tf1'], 'with the model', folder
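# conlleval() above is a thin wrapper that is not defined in this file. A
# minimal sketch, assuming the usual tutorial approach: write "word gold pred"
# triples in CoNLL format (with BOS/EOS sentinel rows between sentences) and
# score the file with the conlleval.pl Perl script. get_perf, assumed to exist
# alongside it, parses the script's output into the {'p','r','f1'} dict used
# by the training loops.
def conlleval(p, g, w, filename):
    """Dump predictions in CoNLL format and return {'p': .., 'r': .., 'f1': ..}."""
    out = ''
    for sl, sp, sw in zip(g, p, w):
        out += 'BOS O O\n'
        for wl, wp, ww in zip(sl, sp, sw):
            out += ww + ' ' + wl + ' ' + wp + '\n'
        out += 'EOS O O\n\n'
    with open(filename, 'w') as f:
        f.write(out)
    return get_perf(filename)  # assumed helper that runs conlleval.pl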
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = {
        # 'fold': 3,  # 5 folds: 0, 1, 2, 3, 4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True,  # decay the learning rate if improvement stops
        'win': 15,  # number of characters in the context window
        'bs': 5,  # number of back-propagation-through-time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 30,  # dimension of character embedding
        'nepochs': 10
    }
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])

    print "Calculate output"
    # limit the scope to speed things up...
    session_files = get_session_files(number_of_files=number_of_files,
                                      random_seed=conf['seed'])
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(np.fromiter((char2idx[char] for char in sentence_out), dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(
        train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems())  # reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems())  # reverse the dictionary
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex)
                          for sublist in lex for item in sublist))
    nclasses = 2  # only the "O" and "X" labels are used
    nsentences = len(train_lex)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Create a Neural Network"
    rnn = regular_elman(nh=conf['nhidden'], nc=nclasses,
                        ne=vocsize, de=conf['emb_dimension'], cs=conf['win'])

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [
            map(lambda x: idx2label[x],
                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
            for x in test_lex
        ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [
            map(lambda x: idx2label[x],
                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
            for x in valid_lex
        ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid,
                              folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], \
                'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], \
                ' test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, \
        'best test F1', conf['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
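# get_char_to_idx() is used above but not defined in this file; note the
# callers pass it either raw sentences or session-file paths, so the exact
# signature here is a guess. A minimal sketch of the assumed behaviour --
# mapping every distinct character seen in the corpus to an integer index:
def get_char_to_idx(sentences):
    """Map each distinct character in the corpus to a stable integer index."""
    chars = sorted(set(char for sentence in sentences for char in sentence))
    return dict((char, idx) for idx, char in enumerate(chars))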
def prepare_data():
    """Prepare the data"""
    conf = {
        'fold': 3,  # 5 folds: 0, 1, 2, 3, 4
        'lr': 0.0627142536696559,
        'verbose': True,
        'decay': True,  # decay the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of back-propagation-through-time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 300,  # dimension of word embedding
        'nepochs': 50
    }
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    # limit the scope to speed things up...
    session_files = get_session_files(number_of_files=None, random_seed=conf['seed'])
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []

    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]

    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
        # token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(np.fromiter((words2idx.get(token, unknown) for token in token_list),
                                 dtype=np.int32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(
        train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    idx2label = dict((k, v) for v, k in LABELS2IDX.iteritems())  # reverse the dictionary
    idx2word = dict((k, v) for v, k in words2idx.iteritems())  # reverse the dictionary
    vocsize = len(idx2word)
    nclasses = len({label for label_seq in labels_idxes for label in label_seq})
    nsentences = len(train_lex)

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME, binary=True)  # C binary format

    print "Calculate word embeddings"
    # add one row for PADDING at the end
    embeddings = 0.2 * np.random.uniform(
        -1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(theano.config.floatX)
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx]  # keep it random
        embeddings[idx] = embedding
    del word2vec  # it is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'], nc=nclasses,
                    ne=vocsize, de=conf['emb_dimension'], cs=conf['win'],
                    embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                # rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [
            map(lambda x: idx2label[x],
                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
            for x in test_lex
        ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [
            map(lambda x: idx2label[x],
                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
            for x in valid_lex
        ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid,
                              folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], \
                'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], \
                ' test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    # report the best epoch, not the last one
    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, \
        'best test F1', conf['tf1'], 'with the model', folder
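# The decay rule shared by all of the training loops above halves the learning
# rate whenever the best epoch ('be') lags the current epoch ('ce') by 10 or
# more, and training stops once the rate falls below 1e-5. A quick worked
# check (a standalone sketch, not part of the training code) of how many
# halvings that allows from the initial rate used in these scripts:
lr = 0.0627142536696559
halvings = 0
while lr >= 1e-5:
    lr *= 0.5
    halvings += 1
print halvings  # 13: the rate reaches ~7.7e-6 after 13 halvings, ending training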