def test(args): print '\nNEURAL POS TAGGER START\n' print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list) print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim) print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim) print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr) """ load vocab """ print 'Loading vocabularies...\n' vocab_word = io_utils.load_data('vocab_word') vocab_char = io_utils.load_data('vocab_char') vocab_tag = io_utils.load_data('vocab_tag') print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size()) """ load data """ print '\nLoading data set...\n' test_corpus, test_vocab_word, test_vocab_char, test_vocab_tag = io_utils.load_conll(args.dev_data) print '\tTest Sentences: %d' % len(test_corpus) """ converting into ids """ print '\nConverting into IDs...\n' test_x, test_c, test_b, test_y = preprocessor.convert_into_ids(test_corpus, vocab_word, vocab_char, vocab_tag) """ tagger set up """ tagger = io_utils.load_data(args.load) dev_f = theano.function( inputs=tagger.input[:-1], outputs=tagger.result, mode='FAST_RUN' ) """ Prediction """ print '\nPREDICTION START\n' print '\tBatch Index: ', start = time.time() total = 0.0 correct = 0 for index in xrange(len(test_x)): if index % 100 == 0 and index != 0: print index, sys.stdout.flush() if tagger.name == 'char': corrects = dev_f(test_x[index], test_c[index], test_b[index], test_y[index]) else: corrects = dev_f(test_x[index], test_y[index]) total += len(corrects) correct += np.sum(corrects) end = time.time() print '\n\tTime: %f seconds' % (end - start) print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)
def train(args): print '\nNEURAL POS TAGGER START\n' print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list) print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim) print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim) print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr) print '\tMINI-BATCH: %d\n' % args.batch_size """ load data """ print 'Loading data sets...\n' train_corpus, vocab_word, vocab_char, vocab_tag, _ = io_utils.load_conll(args.train_data) """ limit data set """ train_corpus = train_corpus[:args.data_size] train_corpus.sort(key=lambda a: len(a)) dev_corpus = None if args.dev_data: dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, _ = io_utils.load_conll(args.dev_data) for w in dev_vocab_word.i2w: if args.vocab_size is None or vocab_word.size() < args.vocab_size: vocab_word.add_word(w) for c in dev_vocab_char.i2w: vocab_char.add_word(c) for t in dev_vocab_tag.i2w: vocab_tag.add_word(t) if args.save: io_utils.dump_data(vocab_word, 'vocab_word') io_utils.dump_data(vocab_char, 'vocab_char') io_utils.dump_data(vocab_tag, 'vocab_tag') """ load word embeddings """ init_w_emb = None if args.emb_list: print '\tLoading pre-trained word embeddings...\n' init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word) w_emb_dim = init_w_emb.shape[1] else: print '\tUse random-initialized word embeddings...\n' w_emb_dim = args.w_emb_dim """ converting into ids """ print '\nConverting into IDs...\n' tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag) tr_x, tr_y, tr_b = set_minibatch(tr_x, tr_y, args.batch_size) tr_x, tr_y = shared_samples(tr_x, tr_y) dev_x = None dev_y = None if args.dev_data: dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag) dev_x, dev_y, dev_b = set_minibatch(dev_x, dev_y, 1) dev_x, dev_y = shared_samples(dev_x, dev_y) print '\tTrain Sentences: 
%d Dev Sentences: %d' % (len(train_corpus), len(dev_corpus)) else: print '\tTrain Sentences: %d' % len(train_corpus) print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size()) """ set model parameters """ hidden_dim = args.w_hidden_dim output_dim = vocab_tag.size() window = args.window opt = args.opt """ symbol definition """ print '\tCompiling Theano Code...' bos = T.iscalar('bos') eos = T.iscalar('eos') n_words = T.iscalar('n_words') batch_size = T.iscalar('batch_size') x = T.imatrix('x') y = T.ivector('y') lr = T.fscalar('lr') """ tagger set up """ tagger = Model(x=x, y=y, n_words=n_words, batch_size=batch_size, lr=lr, init_emb=init_w_emb, vocab_size=vocab_word.size(), emb_dim=w_emb_dim, hidden_dim=hidden_dim, output_dim=output_dim, opt=opt, window=window) train_f = theano.function( inputs=[bos, eos, n_words, batch_size, lr], outputs=[tagger.nll, tagger.result], updates=tagger.updates, givens={ x: tr_x[bos: eos], y: tr_y[bos: eos] }, mode='FAST_RUN' ) dev_f = theano.function( inputs=[bos, eos, n_words, batch_size], outputs=tagger.result, givens={ x: dev_x[bos: eos], y: dev_y[bos: eos] }, mode='FAST_RUN' ) def _train(): for epoch in xrange(args.epoch): _lr = args.lr / float(epoch+1) indices = range(len(tr_b)) random.shuffle(indices) print '\nEpoch: %d' % (epoch + 1) print '\tBatch Index: ', start = time.time() total = 0.0 correct = 0 losses = 0.0 for i, index in enumerate(indices): if i % 100 == 0 and i != 0: print i, sys.stdout.flush() boundary = tr_b[index] loss, corrects = train_f(boundary[0], boundary[1], boundary[2],boundary[3], _lr) assert math.isnan(loss) is False, i total += len(corrects) correct += np.sum(corrects) losses += loss end = time.time() print '\tTime: %f seconds' % (end - start) print '\tNegative Log Likelihood: %f' % losses print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct) _dev(dev_f) def _dev(model): print '\tBatch Index: ', start = time.time() total = 0.0 correct = 0 for index in 
xrange(len(dev_b)): if index % 100 == 0 and index != 0: print index, sys.stdout.flush() boundary = dev_b[index] corrects = model(boundary[0], boundary[1], boundary[2], boundary[3]) total += len(corrects) correct += np.sum(corrects) end = time.time() print '\tTime: %f seconds' % (end - start) print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct) _train()
def main(argv): print '\nSYSTEM START' print '\nMODE: Training' ################### # PREPROCESS DATA # ################### """ Load initial embedding file """ vocab_word = Vocab() emb = None if argv.init_emb: print '\n\tInitial Embedding Loading...' emb, vocab_word = load_init_emb(init_emb=argv.init_emb) print '\t\tVocabulary Size: %d' % vocab_word.size() """ Load corpora """ print '\n\tLoading Corpora...' tr_corpus, tr_doc_names, vocab_word = load_conll(path=argv.train_data, vocab=vocab_word, data_size=argv.data_size) dev_corpus, dev_doc_names, _ = load_conll(path=argv.dev_data, vocab=vocab_word, data_size=argv.data_size) print '\t\tTrain Documents: %d' % len(tr_corpus) print '\t\tDev Documents: %d' % len(dev_corpus) """ Extract gold mentions CoNLL-2012: Train=155,560, Dev=19,156, Test=19,764 """ # gold_mentions: 1D: n_doc, 2D: n_sents, 3D: n_mentions: elem=(bos, eos) # gold_corefs: 1D: n_doc, 2D: n_sents, 3D: n_mentions: elem=coref_id print '\n\tExtracting Gold Mentions...' print '\t\tTRAIN', tr_gold_ments = get_gold_mentions(tr_corpus, check=argv.check) print '\t\tDEV ', dev_gold_ments = get_gold_mentions(dev_corpus) """ Extract cand mentions """ # cand_mentions: 1D: n_doc, 2D: n_sents, 3D: n_mentions; elem=(bos, eos) print '\n\tExtracting Cand Mentions...' print '\t\tTRAIN', tr_cand_ments = get_cand_mentions(tr_corpus, check=argv.check) print '\t\tDEV ', dev_cand_ments = get_cand_mentions(dev_corpus) """ Convert words into IDs """ print '\n\tConverting Words into IDs...' 
print '\t\tVocabulary Size: %d' % vocab_word.size() tr_word_ids = convert_words_into_ids(corpus=tr_corpus, vocab_word=vocab_word) dev_word_ids = convert_words_into_ids(corpus=dev_corpus, vocab_word=vocab_word) """ Set word ids for mentions """ tr_gold_ments = set_word_id_for_ment(tr_word_ids, tr_gold_ments) tr_cand_ments = set_word_id_for_ment(tr_word_ids, tr_cand_ments) dev_gold_ments = set_word_id_for_ment(dev_word_ids, dev_gold_ments) dev_cand_ments = set_word_id_for_ment(dev_word_ids, dev_cand_ments) """ Set coref ids for cand mentions """ tr_cand_ments = set_cand_ment_coref(tr_gold_ments, tr_cand_ments) dev_cand_ments = set_cand_ment_coref(dev_gold_ments, dev_cand_ments) """ Check the coverage: Coverage 95.0%, Rate 1:3.5 by Berkeley System """ print '\n\tChecking the Coverage of the Candidate Mentions...' check_coverage_of_cand_mentions(tr_gold_ments, tr_cand_ments) check_coverage_of_cand_mentions(dev_gold_ments, dev_cand_ments) """ Extract features """ print '\n\tExtracting features...' 
""" phi = (span, word, ctx, dist, label, position) span : 1D: n_doc, 2D: n_ments, 3D: n_cand_ants, 4D: limit * 2; elem=word id word : 1D: n_doc, 2D: n_ments, 3D: n_cand_ants, 4D: [m_first, m_last, a_first, a_last]; elem=word id ctx : 1D: n_doc, 2D: n_ments, 3D: n_cand_ants, 4D: window * 2 * 2; elem=word id dist : 1D: n_doc, 2D: n_ments, 3D: n_cand_ants; elem=sent dist label : 1D: n_doc, 2D: n_ments; elem=0/1 position: 1D: n_doc, 2D: n_ments, 3D: n_cand_ants; elem=(sent_m_i, span_m, sent_a_i, span_a) """ tr_phi, tr_posit = get_features(tr_cand_ments, False, argv.n_cands) dev_phi, dev_posit = get_features(dev_cand_ments, True, argv.n_cands) """ Count the number of features """ n_tr_phi_total = reduce(lambda a, b: a + reduce(lambda c, d: c + len(d), b, 0), tr_phi, 0) n_tr_phi_t = reduce(lambda a, b: a + reduce(lambda c, d: c + reduce(lambda e, f: e + np.sum(f[-1]), d, 0), b, 0), tr_phi, 0) n_tr_phi_f = n_tr_phi_total - n_tr_phi_t n_dev_phi_total = reduce(lambda a, b: a + reduce(lambda c, d: c + len(d), b, 0), dev_phi, 0) n_dev_phi_t = reduce(lambda a, b: a + reduce(lambda c, d: c + reduce(lambda e, f: e + np.sum(f[-1]), d, 0), b, 0), dev_phi, 0) n_dev_phi_f = n_dev_phi_total - n_dev_phi_t print '\t\tTrain Features Total: %d\tRate: P:N\t%d:%d' % (n_tr_phi_total, n_tr_phi_t, n_tr_phi_f) print '\t\tDev Features Total: %d\tRate: P:N\t%d:%d' % (n_dev_phi_total, n_dev_phi_t, n_dev_phi_f) """ Convert into the Theano format """ print '\n\tConverting features into the Theano Format...' 
""" samples = (span, word, ctx, dist, label) span : 1D: n_doc * n_ments * n_cand_ants, 2D: limit * 2; elem=word id word : 1D: n_doc * n_ments * n_cand_ants, 2D: [m_first, m_last, a_first, a_last]; elem=word id ctx : 1D: n_doc * n_ments * n_cand_ants, 2D: window * 2 * 2; elem=word id dist : 1D: n_doc * n_ments * n_cand_ants; elem=sent dist label : 1D: n_doc * n_ments * n_cand_ants; elem=0/1 indices: 1D: n_doc * n_ments; elem=(bos, eos) """ tr_samples, tr_indices = theano_format(tr_phi) dev_samples, dev_indices = theano_format(dev_phi) ###################### # BUILD ACTUAL MODEL # ###################### print '\nBuilding the model...' model = set_model(argv, vocab_word, emb) bos = T.iscalar('bos') eos = T.iscalar('eos') train_f = theano.function( inputs=[bos, eos], outputs=[model.nll, model.correct, model.correct_t, model.correct_f, model.total_p, model.total_r], updates=model.updates, givens={ model.x_span: tr_samples[0][bos: eos], model.x_word: tr_samples[1][bos: eos], model.x_ctx : tr_samples[2][bos: eos], model.x_dist: tr_samples[3][bos: eos], model.x_slen: tr_samples[4][bos: eos], model.y : tr_samples[5][bos: eos] }, mode='FAST_RUN' ) dev_f = theano.function( inputs=[bos, eos], outputs=[model.y_hat_index, model.p_y_hat, model.correct, model.correct_t, model.correct_f, model.total_p, model.total_r], givens={ model.x_span: dev_samples[0][bos: eos], model.x_word: dev_samples[1][bos: eos], model.x_ctx : dev_samples[2][bos: eos], model.x_dist: dev_samples[3][bos: eos], model.x_slen: dev_samples[4][bos: eos], model.y : dev_samples[5][bos: eos] }, mode='FAST_RUN' ) ############### # TRAIN MODEL # ############### batch_size = argv.batch n_batches = n_tr_phi_total / batch_size indices = range(n_batches) print 'Training START\n' print 'Mini-Batch Samples: %d\n' % n_batches for epoch in xrange(argv.epoch): random.shuffle(indices) print '\nEpoch: %d' % (epoch + 1) print 'TRAIN' print '\tIndex: ', start = time.time() total_loss = 0. 
correct = np.zeros(9, dtype='float32') correct_t = np.zeros(9, dtype='float32') correct_f = np.zeros(9, dtype='float32') total = 0. total_r = np.zeros(9, dtype='float32') total_p = np.zeros(9, dtype='float32') for i, index in enumerate(indices): if i % 1000 == 0 and i != 0: print '%d' % i, sys.stdout.flush() loss, crr, crr_t, crr_f, ttl_p, ttl_r = train_f(index * batch_size, (index+1) * batch_size) assert not math.isnan(loss), 'Index: %d Batch Index: %d' % (i, index) total_loss += loss correct += crr correct_t += crr_t correct_f += crr_f total += batch_size total_p += ttl_p total_r += ttl_r end = time.time() print '\n\tTime: %f seconds' % (end - start) show_results(total, total_p, total_r, correct, correct_t, correct_f, total_loss) predict(epoch, dev_f, dev_corpus, dev_doc_names, dev_indices, dev_posit)
def load_dev_data(args):
    """Load the dev corpus named by ``args.dev_data``, if any.

    Returns:
        Whatever ``io_utils.load_conll`` returns for ``args.dev_data``, or a
        tuple of five ``None`` values when ``args.dev_data`` is empty/unset.
    """
    if not args.dev_data:
        # Placeholder with five slots so callers can still unpack the result.
        return None, None, None, None, None
    return io_utils.load_conll(args.dev_data)
def load_train_data(args):
    """Load the training corpus named by ``args.train_data``.

    Args:
        args: Parsed options; only ``args.train_data`` is read.

    Returns:
        Whatever ``io_utils.load_conll`` returns for ``args.train_data``.

    Raises:
        SystemExit: If ``args.train_data`` is empty/unset (after printing a
            usage hint).
    """
    # Local import keeps this function self-contained.
    import sys

    if args.train_data:
        # Load the *training* file; the dev path must not be used here.
        return io_utils.load_conll(args.train_data)

    print('Input: --train_data path')
    # sys.exit() instead of the interactive-only exit() helper from `site`;
    # both raise SystemExit with the same (default) status.
    sys.exit()
def train(args): print '\nNEURAL POS TAGGER START\n' print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list) print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim) print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim) print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr) print '\tMINI-BATCH: %d\n' % args.batch_size """ load data """ print 'Loading data sets...\n' train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = io_utils.load_conll( args.train_data) """ limit data set """ train_corpus = train_corpus[:args.data_size] train_corpus.sort(key=lambda a: len(a)) dev_corpus = None if args.dev_data: dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, max_char_len_dev = io_utils.load_conll( args.dev_data) for w in dev_vocab_word.i2w: if args.vocab_size is None or vocab_word.size() < args.vocab_size: vocab_word.add_word(w) for c in dev_vocab_char.i2w: vocab_char.add_word(c) for t in dev_vocab_tag.i2w: vocab_tag.add_word(t) if args.save: io_utils.dump_data(vocab_word, 'vocab_word') io_utils.dump_data(vocab_char, 'vocab_char') io_utils.dump_data(vocab_tag, 'vocab_tag') """ load pre-trained embeddings """ init_w_emb = None if args.emb_list: print '\tLoading word embeddings...\n' init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word) w_emb_dim = init_w_emb.shape[1] else: w_emb_dim = args.w_emb_dim """ converting into ids """ print '\nConverting into IDs...\n' tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len) tr_x, tr_c, tr_y, tr_b = set_minibatch(tr_x, tr_c, tr_y, max_char_len, args.batch_size) tr_x, tr_c, tr_y = shared_samples(tr_x, tr_c, tr_y) if args.dev_data: dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag, max_char_len_dev) dev_x, dev_c, dev_y, dev_b = set_minibatch(dev_x, dev_c, dev_y, max_char_len_dev, 1) dev_x, dev_c, dev_y = 
shared_samples(dev_x, dev_c, dev_y) print '\tTrain Sentences: %d Dev Sentences: %d' % (len(train_corpus), len(dev_corpus)) else: print '\tTrain Sentences: %d' % len(train_corpus) print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size()) """ set model parameters """ w_hidden_dim = args.w_hidden_dim c_emb_dim = args.c_emb_dim c_hidden_dim = args.c_hidden_dim output_dim = vocab_tag.size() window = args.window opt = args.opt """ symbol definition """ print '\tCompiling Theano Code...' bos = T.iscalar('bos') eos = T.iscalar('eos') n_words = T.iscalar('n_words') batch_size = T.iscalar('batch_size') x = T.imatrix('x') c = T.itensor4('c') y = T.ivector('y') lr = T.fscalar('lr') """ tagger set up """ tagger = Model(x=x, c=c, y=y, n_words=n_words, batch_size=batch_size, lr=lr, init_emb=init_w_emb, vocab_w_size=vocab_word.size(), w_emb_dim=w_emb_dim, w_hidden_dim=w_hidden_dim, c_emb_dim=c_emb_dim, c_hidden_dim=c_hidden_dim, output_dim=output_dim, vocab_c_size=vocab_char.size(), window=window, opt=opt) train_f = theano.function(inputs=[bos, eos, n_words, batch_size, lr], outputs=[tagger.nll, tagger.result], updates=tagger.updates, givens={ x: tr_x[bos:eos], c: tr_c[bos:eos], y: tr_y[bos:eos] }, mode='FAST_RUN') dev_f = theano.function(inputs=[bos, eos, n_words, batch_size], outputs=tagger.result, givens={ x: dev_x[bos:eos], c: dev_c[bos:eos], y: dev_y[bos:eos] }, mode='FAST_RUN') def _train(): for epoch in xrange(args.epoch): _lr = args.lr / float(epoch + 1) indices = range(len(tr_b)) random.shuffle(indices) print '\nEpoch: %d' % (epoch + 1) print '\tBatch Index: ', start = time.time() total = 0.0 correct = 0 losses = 0.0 for i, index in enumerate(indices): if i % 100 == 0 and i != 0: print i, sys.stdout.flush() boundary = tr_b[index] loss, corrects = train_f(boundary[0], boundary[1], boundary[2], boundary[3], _lr) assert math.isnan(loss) is False, i total += len(corrects) correct += np.sum(corrects) losses += loss end = time.time() print '\tTime: %f 
seconds' % (end - start) print '\tNegative Log Likelihood: %f' % losses print '\tAccuracy:%f Total:%d Correct:%d' % ( (correct / total), total, correct) _dev(dev_f) def _dev(model): print '\tBatch Index: ', start = time.time() total = 0.0 correct = 0 for index in xrange(len(dev_b)): if index % 100 == 0 and index != 0: print index, sys.stdout.flush() boundary = dev_b[index] corrects = model(boundary[0], boundary[1], boundary[2], boundary[3]) total += len(corrects) correct += np.sum(corrects) end = time.time() print '\tTime: %f seconds' % (end - start) print '\tAccuracy:%f Total:%d Correct:%d' % ( (correct / total), total, correct) _train()