def _train():
    print '\nTRAINING START\n'

    for epoch in xrange(args.epoch):
        _lr = args.lr / float(epoch + 1)
        indices = range(len(tr_x))
        random.shuffle(indices)

        print '\nEpoch: %d' % (epoch + 1)
        print '\n\tTrain set'
        print '\t\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0
        losses = 0.0

        for i, index in enumerate(indices):
            if i % 100 == 0 and i != 0:
                print i,
                sys.stdout.flush()

            if args.model == 'char':
                loss, corrects = train_f(tr_x[index], tr_c[index], tr_b[index], tr_y[index], _lr)
            else:
                loss, corrects = train_f(tr_x[index], tr_y[index], _lr)

            assert math.isnan(loss) is False, index

            total += len(corrects)
            correct += np.sum(corrects)
            losses += loss

        end = time.time()
        print '\n\t\tTime: %f seconds' % (end - start)
        print '\t\tNegative Log Likelihood: %f' % losses
        print '\t\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

        if args.save:
            io_utils.dump_data(tagger, 'model-%s.epoch-%d' % (args.model, epoch + 1))

        _dev(dev_f)

def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr)
    print '\tMINI-BATCH: %d\n' % args.batch_size

    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, _ = io_utils.load_conll(args.train_data)

    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))

    dev_corpus = None
    if args.dev_data:
        dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, _ = io_utils.load_conll(args.dev_data)

        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')

    """ load word embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading pre-trained word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        print '\tUse random-initialized word embeddings...\n'
        w_emb_dim = args.w_emb_dim

    """ converting into ids """
    print '\nConverting into IDs...\n'
    tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag)
    tr_x, tr_y, tr_b = set_minibatch(tr_x, tr_y, args.batch_size)
    tr_x, tr_y = shared_samples(tr_x, tr_y)

    dev_x = None
    dev_y = None
    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag)
        dev_x, dev_y, dev_b = set_minibatch(dev_x, dev_y, 1)
        dev_x, dev_y = shared_samples(dev_x, dev_y)
        print '\tTrain Sentences: %d Dev Sentences: %d' % (len(train_corpus), len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ set model parameters """
    hidden_dim = args.w_hidden_dim
    output_dim = vocab_tag.size()
    window = args.window
    opt = args.opt

    """ symbol definition """
    print '\tCompiling Theano Code...'

    bos = T.iscalar('bos')
    eos = T.iscalar('eos')
    n_words = T.iscalar('n_words')
    batch_size = T.iscalar('batch_size')
    x = T.imatrix('x')
    y = T.ivector('y')
    lr = T.fscalar('lr')

    """ tagger set up """
    tagger = Model(x=x, y=y, n_words=n_words, batch_size=batch_size, lr=lr,
                   init_emb=init_w_emb, vocab_size=vocab_word.size(),
                   emb_dim=w_emb_dim, hidden_dim=hidden_dim,
                   output_dim=output_dim, opt=opt, window=window)

    train_f = theano.function(
        inputs=[bos, eos, n_words, batch_size, lr],
        outputs=[tagger.nll, tagger.result],
        updates=tagger.updates,
        givens={
            x: tr_x[bos: eos],
            y: tr_y[bos: eos]
        },
        mode='FAST_RUN'
    )

    dev_f = theano.function(
        inputs=[bos, eos, n_words, batch_size],
        outputs=tagger.result,
        givens={
            x: dev_x[bos: eos],
            y: dev_y[bos: eos]
        },
        mode='FAST_RUN'
    )

    def _train():
        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_b))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                boundary = tr_b[index]
                loss, corrects = train_f(boundary[0], boundary[1], boundary[2], boundary[3], _lr)

                assert math.isnan(loss) is False, i

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\tTime: %f seconds' % (end - start)
            print '\tNegative Log Likelihood: %f' % losses
            print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

            _dev(dev_f)

    def _dev(model):
        print '\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_b)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            boundary = dev_b[index]
            corrects = model(boundary[0], boundary[1], boundary[2], boundary[3])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()
        print '\tTime: %f seconds' % (end - start)
        print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

    _train()

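
# The train(args) function above slices tr_x / tr_y inside `givens`, which only works
# if the batched data live in Theano shared variables on the device. The helper below
# is a minimal sketch of what the two-argument `shared_samples` used here is assumed
# to do; the real helper is not shown in this file, and the int32 cast is chosen to
# match the T.imatrix / T.ivector symbols. It relies on the module-level `theano` and
# `np` imports used elsewhere in this file.
def shared_samples(batched_x, batched_y):
    """Hypothetical sketch: wrap padded mini-batch arrays as Theano shared variables."""
    shared_x = theano.shared(np.asarray(batched_x, dtype='int32'), borrow=True)
    shared_y = theano.shared(np.asarray(batched_y, dtype='int32'), borrow=True)
    return shared_x, shared_y
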
def train(argv):
    print '\nSETTING UP A TRAINING SETTING\n'

    emb = None
    batch_size = argv.batch_size
    window = argv.window
    print 'SETTING: Batch: %d Window: %d Learning Rate: %f' % (batch_size, window, argv.lr)

    ##############
    # LOAD FILES #
    ##############

    """ Load files """
    # corpus: 1D: n_sents, 2D: n_words, 3D: (word, pas_info, pas_id)
    tr_corpus, vocab_word = io_utils.load(argv.train_data)
    print '\nTRAIN CORPUS'
    corpus_statistics(tr_corpus)

    if argv.dev_data:
        dev_corpus, vocab_word = io_utils.load(argv.dev_data, vocab_word)
        print '\nDEV CORPUS'
        corpus_statistics(dev_corpus)

    if argv.test_data:
        test_corpus, vocab_word = io_utils.load(argv.test_data, vocab_word)
        print '\nTEST CORPUS'
        corpus_statistics(test_corpus)

    print '\nVocab: %d' % vocab_word.size()

    if argv.save:
        io_utils.dump_data(vocab_word, 'Vocab.size-%d' % vocab_word.size())

    ##############
    # PREPROCESS #
    ##############

    """ Preprocessing """
    # samples: 1D: n_sents, 2D: [word_ids, tag_ids, prd_indices, contexts]
    train_samples = sample_format(tr_corpus, vocab_word, window)
    n_tr_samples = len(train_samples)

    if argv.dev_data:
        dev_samples = sample_format(dev_corpus, vocab_word, window)
        n_dev_samples = len(dev_samples)

    if argv.test_data:
        test_samples = sample_format(test_corpus, vocab_word, window)
        n_te_samples = len(test_samples)

    # dataset = [x, y, l]
    # x=features: 1D: n_samples * n_words, 2D: window; elem=word id
    # y=labels: 1D: n_samples; elem=scalar
    # l=question length: 1D: n_samples * 2; elem=scalar
    # bb_x=batch indices for x: 1D: n_samples / batch_size + 1; elem=(bob, eob)
    # bb_y=batch indices for y: 1D: n_samples / batch_size + 1; elem=(bob, eob)
    tr_dataset, tr_bb_x, tr_bb_y = theano_format(train_samples, batch_size)

    if argv.dev_data:
        dev_dataset, dev_bb_x, dev_bb_y = theano_format(dev_samples, batch_size)

    if argv.test_data:
        te_dataset, te_bb_x, te_bb_y = theano_format(test_samples, batch_size)

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    """ Set a model """
    print '\n\nBuilding a model...'
    model = set_model(argv=argv, emb=emb, vocab=vocab_word)

    train_f = set_train_f(model, tr_dataset)
    if argv.dev_data:
        dev_f = set_predict_f(model, dev_dataset)
    if argv.test_data:
        test_f = set_predict_f(model, te_dataset)

    ###############
    # TRAIN MODEL #
    ###############

    print '\nTRAINING START\n'

    indices = range(len(tr_bb_y))
    best_dev_acc = -1.
    best_test_acc = -1.

    for epoch in xrange(argv.epoch):
        print '\n\nEPOCH: %d' % (epoch + 1)
        print '\tTRAIN\n\t',

        np.random.shuffle(indices)
        start = time.time()

        ttl_nll = 0.
        ttl_crr = 0.

        for i, b_index in enumerate(indices):
            if (i + 1) % 100 == 0:
                print i + 1,
                sys.stdout.flush()

            bb_x_i = tr_bb_x[b_index]
            bb_y_i = tr_bb_y[b_index]

            crr, nll = train_f(index=b_index,
                               bob_x=bb_x_i[0], eob_x=bb_x_i[1],
                               bob_y=bb_y_i[0], eob_y=bb_y_i[1])
            assert not math.isnan(nll), '\nNLL is nan: %d\n' % i

            ttl_crr += np.sum(crr)
            ttl_nll += nll

        end = time.time()
        print '\n\tTime: %f\tNLL: %f' % ((end - start), ttl_nll)
        print '\tACC: %f CRR: %d TOTAL: %d' % (ttl_crr / n_tr_samples, ttl_crr, n_tr_samples)

        update = False
        if argv.dev_data:
            print '\n\tDEV\n\t',
            dev_acc = predict(dev_f, dev_bb_x, dev_bb_y, n_dev_samples)
            if best_dev_acc < dev_acc:
                best_dev_acc = dev_acc
                update = True

                if argv.save:
                    fn = 'Model.Layer-%d.Sim-%s.Act-%s.Opt-%s.Batch-%d.reg-%f' % \
                         (argv.layer, argv.sim, argv.activation, argv.opt, argv.batch_size, argv.reg)
                    io_utils.dump_data(model, fn)

        if argv.test_data:
            print '\n\tTEST\n\t',
            test_acc = predict(test_f, te_bb_x, te_bb_y, n_te_samples)
            if update:
                best_test_acc = test_acc

        print '\n\tBEST DEV ACC: %f TEST ACC: %f' % (best_dev_acc, best_test_acc)

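
# train(argv) above calls predict(...) for the DEV/TEST passes, but that helper is not
# shown in this file. The sketch below simply mirrors the accounting done in the training
# loop; the keyword interface of `f` (index, bob_x, eob_x, bob_y, eob_y) is an assumption
# carried over from train_f, not the repository's actual implementation.
def predict(f, bb_x, bb_y, n_samples):
    """Hedged sketch: run one evaluation pass and return accuracy."""
    ttl_crr = 0.
    start = time.time()
    for index in xrange(len(bb_y)):
        if (index + 1) % 100 == 0:
            print index + 1,
            sys.stdout.flush()
        bb_x_i = bb_x[index]
        bb_y_i = bb_y[index]
        crr = f(index=index,
                bob_x=bb_x_i[0], eob_x=bb_x_i[1],
                bob_y=bb_y_i[0], eob_y=bb_y_i[1])
        ttl_crr += np.sum(crr)
    end = time.time()
    acc = ttl_crr / n_samples
    print '\n\tTime: %f' % (end - start)
    print '\tACC: %f CRR: %d TOTAL: %d' % (acc, ttl_crr, n_samples)
    return acc
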
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr)

    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = load_train_data(args)
    dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, dev_max_char_len = load_dev_data(args)

    if dev_corpus:
        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')

    """ load word embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading pre-trained word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        print '\tUse random-initialized word embeddings...\n'
        w_emb_dim = args.w_emb_dim

    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))

    """ converting into ids """
    print '\nConverting into IDs...\n'
    tr_x, tr_c, tr_b, tr_y = preprocessor.convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag)

    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = preprocessor.convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag)
        print '\tTrain Sentences: %d Dev Sentences: %d' % (len(train_corpus), len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ tagger set up """
    tagger = set_model(args, init_w_emb, w_emb_dim, vocab_word, vocab_char, vocab_tag)

    train_f = theano.function(
        inputs=tagger.input,
        outputs=[tagger.nll, tagger.result],
        updates=tagger.updates,
        mode='FAST_RUN'
    )

    dev_f = theano.function(
        inputs=tagger.input[:-1],
        outputs=tagger.result,
        mode='FAST_RUN'
    )

    def _train():
        print '\nTRAINING START\n'

        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_x))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\n\tTrain set'
            print '\t\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                if args.model == 'char':
                    loss, corrects = train_f(tr_x[index], tr_c[index], tr_b[index], tr_y[index], _lr)
                else:
                    loss, corrects = train_f(tr_x[index], tr_y[index], _lr)

                assert math.isnan(loss) is False, index

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\n\t\tTime: %f seconds' % (end - start)
            print '\t\tNegative Log Likelihood: %f' % losses
            print '\t\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

            if args.save:
                io_utils.dump_data(tagger, 'model-%s.epoch-%d' % (args.model, epoch + 1))

            _dev(dev_f)

    def _dev(_dev_f):
        print '\n\tDev set'
        print '\t\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_x)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            if args.model == 'char':
                corrects = _dev_f(dev_x[index], dev_c[index], dev_b[index], dev_y[index])
            else:
                corrects = _dev_f(dev_x[index], dev_y[index])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()
        print '\n\t\tTime: %f seconds' % (end - start)
        print '\t\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

    _train()

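
# For reference, the tagger's train(args) above only ever reads the attributes listed
# below from `args`. This argparse block is a hedged reconstruction of a compatible
# command line; the flag names follow the attribute names in the code, but every
# default value and help string is an assumption, not taken from the original script.
import argparse

def parse_tagger_args():
    p = argparse.ArgumentParser(description='Neural POS tagger (word / char models)')
    p.add_argument('--train_data', help='training corpus (CoNLL format)')
    p.add_argument('--dev_data', default=None, help='development corpus')
    p.add_argument('--model', default='word', help='"word" or "char"')
    p.add_argument('--word_list', default=None, help='word list paired with --emb_list')
    p.add_argument('--emb_list', default=None, help='pre-trained word embedding file')
    p.add_argument('--w_emb_dim', type=int, default=100, help='word embedding dim')
    p.add_argument('--w_hidden_dim', type=int, default=100, help='word-level hidden dim')
    p.add_argument('--c_emb_dim', type=int, default=10, help='char embedding dim')
    p.add_argument('--c_hidden_dim', type=int, default=10, help='char-level hidden dim')
    p.add_argument('--vocab_size', type=int, default=None, help='cap on word vocab size')
    p.add_argument('--data_size', type=int, default=100000, help='max training sentences')
    p.add_argument('--opt', default='sgd', help='optimization method')
    p.add_argument('--lr', type=float, default=0.075, help='initial learning rate')
    p.add_argument('--epoch', type=int, default=20, help='number of epochs')
    p.add_argument('--save', action='store_true', help='dump vocabs and per-epoch models')
    return p.parse_args()
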
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr)
    print '\tMINI-BATCH: %d\n' % args.batch_size

    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = io_utils.load_conll(args.train_data)

    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))

    dev_corpus = None
    if args.dev_data:
        dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, max_char_len_dev = io_utils.load_conll(args.dev_data)

        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')

    """ load pre-trained embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        w_emb_dim = args.w_emb_dim

    """ converting into ids """
    print '\nConverting into IDs...\n'
    tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len)
    tr_x, tr_c, tr_y, tr_b = set_minibatch(tr_x, tr_c, tr_y, max_char_len, args.batch_size)
    tr_x, tr_c, tr_y = shared_samples(tr_x, tr_c, tr_y)

    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag, max_char_len_dev)
        dev_x, dev_c, dev_y, dev_b = set_minibatch(dev_x, dev_c, dev_y, max_char_len_dev, 1)
        dev_x, dev_c, dev_y = shared_samples(dev_x, dev_c, dev_y)
        print '\tTrain Sentences: %d Dev Sentences: %d' % (len(train_corpus), len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ set model parameters """
    w_hidden_dim = args.w_hidden_dim
    c_emb_dim = args.c_emb_dim
    c_hidden_dim = args.c_hidden_dim
    output_dim = vocab_tag.size()
    window = args.window
    opt = args.opt

    """ symbol definition """
    print '\tCompiling Theano Code...'

    bos = T.iscalar('bos')
    eos = T.iscalar('eos')
    n_words = T.iscalar('n_words')
    batch_size = T.iscalar('batch_size')
    x = T.imatrix('x')
    c = T.itensor4('c')
    y = T.ivector('y')
    lr = T.fscalar('lr')

    """ tagger set up """
    tagger = Model(x=x, c=c, y=y, n_words=n_words, batch_size=batch_size, lr=lr,
                   init_emb=init_w_emb, vocab_w_size=vocab_word.size(),
                   w_emb_dim=w_emb_dim, w_hidden_dim=w_hidden_dim,
                   c_emb_dim=c_emb_dim, c_hidden_dim=c_hidden_dim,
                   output_dim=output_dim, vocab_c_size=vocab_char.size(),
                   window=window, opt=opt)

    train_f = theano.function(
        inputs=[bos, eos, n_words, batch_size, lr],
        outputs=[tagger.nll, tagger.result],
        updates=tagger.updates,
        givens={
            x: tr_x[bos:eos],
            c: tr_c[bos:eos],
            y: tr_y[bos:eos]
        },
        mode='FAST_RUN'
    )

    dev_f = theano.function(
        inputs=[bos, eos, n_words, batch_size],
        outputs=tagger.result,
        givens={
            x: dev_x[bos:eos],
            c: dev_c[bos:eos],
            y: dev_y[bos:eos]
        },
        mode='FAST_RUN'
    )

    def _train():
        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_b))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                boundary = tr_b[index]
                loss, corrects = train_f(boundary[0], boundary[1], boundary[2], boundary[3], _lr)

                assert math.isnan(loss) is False, i

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\tTime: %f seconds' % (end - start)
            print '\tNegative Log Likelihood: %f' % losses
            print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

            _dev(dev_f)

    def _dev(model):
        print '\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_b)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            boundary = dev_b[index]
            corrects = model(boundary[0], boundary[1], boundary[2], boundary[3])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()
        print '\tTime: %f seconds' % (end - start)
        print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

    _train()

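
# Both mini-batch variants above drive train_f / dev_f with 4-tuples stored in tr_b /
# dev_b and decay the learning rate as lr / (epoch + 1). The snippet below only restates
# that contract as a compact illustration; the tuple layout (bos, eos, n_words,
# batch_size) is inferred from the Theano function signatures above, and the function
# name is hypothetical.
def run_one_epoch(train_f, boundaries, base_lr, epoch):
    """Hedged illustration of the per-epoch driver used by _train()."""
    _lr = base_lr / float(epoch + 1)  # 1/t learning-rate decay
    losses = 0.0
    correct = 0
    total = 0.0
    for bos, eos, n_words, batch_size in boundaries:
        loss, corrects = train_f(bos, eos, n_words, batch_size, _lr)
        losses += loss
        correct += np.sum(corrects)
        total += len(corrects)
    return losses, correct / total
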