Example #1
    def _train():
        print '\nTRAINING START\n'

        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_x))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\n\tTrain set'
            print '\t\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                if args.model == 'char':
                    loss, corrects = train_f(tr_x[index], tr_c[index],
                                             tr_b[index], tr_y[index], _lr)
                else:
                    loss, corrects = train_f(tr_x[index], tr_y[index], _lr)

                assert math.isnan(loss) is False, index

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\n\t\tTime: %f seconds' % (end - start)
            print '\t\tNegative Log Likelihood: %f' % losses
            print '\t\tAccuracy:%f  Total:%d  Correct:%d' % (
                (correct / total), total, correct)

            if args.save:
                io_utils.dump_data(
                    tagger, 'model-%s.epoch-%d' % (args.model, epoch + 1))

            _dev(dev_f)
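
Example #1 decays the learning rate inversely with the epoch number, reshuffles the sentence order every epoch, and accumulates token-level accuracy from the 0/1 `corrects` array returned by the compiled function. Below is a minimal self-contained sketch of that loop structure; `train_step`, the data, and the rates are made up stand-ins, not values from the example.

import random
import numpy as np

# Sketch of the loop structure in Example #1: inverse-epoch learning-rate
# decay, index shuffling, and token-level accuracy accumulation.
# train_step stands in for the compiled train_f; data and rates are made up.
def train_step(sentence, lr):
    loss = 0.5
    corrects = np.random.randint(0, 2, size=len(sentence))  # fake 0/1 per token
    return loss, corrects

base_lr, n_epochs = 0.1, 3                     # illustrative values only
sentences = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]  # fake word-ID sequences

for epoch in range(n_epochs):
    lr = base_lr / float(epoch + 1)            # 0.1, 0.05, 0.0333...
    indices = list(range(len(sentences)))
    random.shuffle(indices)

    total, correct, losses = 0.0, 0, 0.0
    for index in indices:
        loss, corrects = train_step(sentences[index], lr)
        total += len(corrects)                 # tokens seen this epoch
        correct += np.sum(corrects)            # tokens tagged correctly
        losses += loss
    print('epoch %d  lr %.4f  nll %.2f  acc %.3f'
          % (epoch + 1, lr, losses, correct / total))
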
Example #2
    def _train():
        print '\nTRAINING START\n'

        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch+1)
            indices = range(len(tr_x))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\n\tTrain set'
            print '\t\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                if args.model == 'char':
                    loss, corrects = train_f(tr_x[index], tr_c[index], tr_b[index], tr_y[index], _lr)
                else:
                    loss, corrects = train_f(tr_x[index], tr_y[index], _lr)

                assert math.isnan(loss) is False, index

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\n\t\tTime: %f seconds' % (end - start)
            print '\t\tNegative Log Likelihood: %f' % losses
            print '\t\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)

            if args.save:
                io_utils.dump_data(tagger, 'model-%s.epoch-%d' % (args.model, epoch+1))

            _dev(dev_f)
Example #3
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt, args.lr)
    print '\tMINI-BATCH: %d\n' % args.batch_size

    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, _ = io_utils.load_conll(args.train_data)

    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))

    dev_corpus = None
    if args.dev_data:
        dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, _ = io_utils.load_conll(args.dev_data)

        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')

    """ load word embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading pre-trained word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        print '\tUse random-initialized word embeddings...\n'
        w_emb_dim = args.w_emb_dim

    """ converting into ids """
    print '\nConverting into IDs...\n'

    tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag)
    tr_x, tr_y, tr_b = set_minibatch(tr_x, tr_y, args.batch_size)
    tr_x, tr_y = shared_samples(tr_x, tr_y)

    dev_x = None
    dev_y = None
    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag)
        dev_x, dev_y, dev_b = set_minibatch(dev_x, dev_y, 1)
        dev_x, dev_y = shared_samples(dev_x, dev_y)
        print '\tTrain Sentences: %d  Dev Sentences: %d' % (len(train_corpus), len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d  Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ set model parameters """
    hidden_dim = args.w_hidden_dim
    output_dim = vocab_tag.size()
    window = args.window
    opt = args.opt

    """ symbol definition """
    print '\tCompiling Theano Code...'
    bos = T.iscalar('bos')
    eos = T.iscalar('eos')
    n_words = T.iscalar('n_words')
    batch_size = T.iscalar('batch_size')
    x = T.imatrix('x')
    y = T.ivector('y')
    lr = T.fscalar('lr')

    """ tagger set up """
    tagger = Model(x=x, y=y, n_words=n_words, batch_size=batch_size, lr=lr, init_emb=init_w_emb,
                   vocab_size=vocab_word.size(), emb_dim=w_emb_dim, hidden_dim=hidden_dim, output_dim=output_dim,
                   opt=opt, window=window)

    train_f = theano.function(
        inputs=[bos, eos, n_words, batch_size, lr],
        outputs=[tagger.nll, tagger.result],
        updates=tagger.updates,
        givens={
            x: tr_x[bos: eos],
            y: tr_y[bos: eos]
        },
        mode='FAST_RUN'
    )

    dev_f = theano.function(
        inputs=[bos, eos, n_words, batch_size],
        outputs=tagger.result,
        givens={
            x: dev_x[bos: eos],
            y: dev_y[bos: eos]
        },
        mode='FAST_RUN'
    )

    def _train():
        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch+1)
            indices = range(len(tr_b))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                # boundary = (bos, eos, n_words, batch_size) for this batch
                boundary = tr_b[index]
                loss, corrects = train_f(boundary[0], boundary[1], boundary[2], boundary[3], _lr)

                assert math.isnan(loss) is False, i

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\tTime: %f seconds' % (end - start)
            print '\tNegative Log Likelihood: %f' % losses
            print '\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)

            _dev(dev_f)

    def _dev(model):
        print '\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_b)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            boundary = dev_b[index]
            corrects = model(boundary[0], boundary[1], boundary[2], boundary[3])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()
        print '\tTime: %f seconds' % (end - start)
        print '\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)

    _train()
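
The example above keeps the whole training set in Theano shared variables and lets `givens` select a `[bos:eos]` slice for each call, so per-batch data never has to be passed through the function inputs. Below is a minimal sketch of that pattern with toy data and a toy output expression; nothing here is the original `Model`.

import numpy as np
import theano
import theano.tensor as T

# Sketch of the shared-variable + givens pattern: the whole training matrix
# lives in a shared variable and each call selects a [bos:eos] slice.
# The data and the squared-sum "output" are made up for illustration.
data = theano.shared(np.arange(20, dtype='int32').reshape(10, 2), name='data')

bos = T.iscalar('bos')
eos = T.iscalar('eos')
x = T.imatrix('x')

f = theano.function(inputs=[bos, eos],
                    outputs=(x ** 2).sum(),
                    givens={x: data[bos:eos]},
                    mode='FAST_RUN')

print(f(0, 5))   # computed over rows 0-4 of the shared matrix
print(f(5, 10))  # computed over rows 5-9
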
Example #4
def train(argv):
    print '\nSETTING UP A TRAINING SETTING\n'

    emb = None
    batch_size = argv.batch_size
    window = argv.window

    print 'SETTING: Batch: %d  Window: %d  Learning Rate: %f' % (batch_size, window, argv.lr)

    ##############
    # LOAD FILES #
    ##############

    """ Load files """
    # corpus: 1D: n_sents, 2D: n_words, 3D: (word, pas_info, pas_id)
    tr_corpus, vocab_word = io_utils.load(argv.train_data)
    print '\nTRAIN CORPUS'
    corpus_statistics(tr_corpus)

    if argv.dev_data:
        dev_corpus, vocab_word = io_utils.load(argv.dev_data, vocab_word)
        print '\nDEV CORPUS'
        corpus_statistics(dev_corpus)

    if argv.test_data:
        test_corpus, vocab_word = io_utils.load(argv.test_data, vocab_word)
        print '\nTEST CORPUS'
        corpus_statistics(test_corpus)

    print '\nVocab: %d' % vocab_word.size()
    if argv.save:
        io_utils.dump_data(vocab_word, 'Vocab.size-%d' % vocab_word.size())

    ##############
    # PREPROCESS #
    ##############

    """ Preprocessing """
    # samples: 1D: n_sents, 2D: [word_ids, tag_ids, prd_indices, contexts]
    train_samples = sample_format(tr_corpus, vocab_word, window)
    n_tr_samples = len(train_samples)

    if argv.dev_data:
        dev_samples = sample_format(dev_corpus, vocab_word, window)
        n_dev_samples = len(dev_samples)

    if argv.test_data:
        test_samples = sample_format(test_corpus, vocab_word, window)
        n_te_samples = len(test_samples)

    # dataset = [x, y, l]
    # x=features: 1D: n_samples * n_words, 2D: window; elem=word id
    # y=labels: 1D: n_samples; elem=scalar
    # l=question length: 1D: n_samples * 2; elem=scalar
    # bb_x=batch indices for x: 1D: n_samples / batch_size + 1; elem=(bob, eob)
    # bb_y=batch indices for y: 1D: n_samples / batch_size + 1; elem=(bob, eob)
    tr_dataset, tr_bb_x, tr_bb_y = theano_format(train_samples, batch_size)

    if argv.dev_data:
        dev_dataset, dev_bb_x, dev_bb_y = theano_format(dev_samples, batch_size)

    if argv.test_data:
        te_dataset, te_bb_x, te_bb_y = theano_format(test_samples, batch_size)

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    """ Set a model """
    print '\n\nBuilding a model...'
    model = set_model(argv=argv, emb=emb, vocab=vocab_word)
    train_f = set_train_f(model, tr_dataset)

    if argv.dev_data:
        dev_f = set_predict_f(model, dev_dataset)

    if argv.test_data:
        test_f = set_predict_f(model, te_dataset)

    ###############
    # TRAIN MODEL #
    ###############

    print '\nTRAINING START\n'
    indices = range(len(tr_bb_y))
    best_dev_acc = -1.
    best_test_acc = -1.

    for epoch in xrange(argv.epoch):
        print '\n\nEPOCH: %d' % (epoch + 1)
        print '\tTRAIN\n\t',

        np.random.shuffle(indices)
        start = time.time()

        ttl_nll = 0.
        ttl_crr = 0.
        for i, b_index in enumerate(indices):
            if (i + 1) % 100 == 0:
                print i + 1,
                sys.stdout.flush()

            bb_x_i = tr_bb_x[b_index]
            bb_y_i = tr_bb_y[b_index]
            crr, nll = train_f(index=b_index, bob_x=bb_x_i[0], eob_x=bb_x_i[1], bob_y=bb_y_i[0], eob_y=bb_y_i[1])

            assert not math.isnan(nll), '\nNLL is nan: %d\n' % i

            ttl_crr += np.sum(crr)
            ttl_nll += nll

        end = time.time()
        print '\n\tTime: %f\tNLL: %f' % ((end - start), ttl_nll)
        print '\tACC: %f  CRR: %d   TOTAL: %d' % (ttl_crr/n_tr_samples, ttl_crr, n_tr_samples)

        update = False
        if argv.dev_data:
            print '\n\tDEV\n\t',
            dev_acc = predict(dev_f, dev_bb_x, dev_bb_y, n_dev_samples)
            if best_dev_acc < dev_acc:
                best_dev_acc = dev_acc
                update = True
                if argv.save:
                    fn = 'Model.Layer-%d.Sim-%s.Act-%s.Opt-%s.Batch-%d.reg-%f' %\
                         (argv.layer, argv.sim, argv.activation, argv.opt, argv.batch_size, argv.reg)
                    io_utils.dump_data(model, fn)

        if argv.test_data:
            print '\n\tTEST\n\t',
            test_acc = predict(test_f, te_bb_x, te_bb_y, n_te_samples)
            if update:
                best_test_acc = test_acc

        print '\n\tBEST DEV ACC: %f  TEST ACC: %f' % (best_dev_acc, best_test_acc)
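
The epoch loop above saves the model only when dev accuracy improves and reports, next to the best dev score, the test accuracy measured at that same epoch. A small self-contained sketch of that selection logic follows; `evaluate()` is a random stand-in for the compiled predict functions, not code from the example.

import random

# Sketch of the model-selection logic: save only when dev accuracy improves,
# and freeze the test accuracy measured at that epoch as the reported score.
def evaluate():
    return random.random()

best_dev_acc = -1.0
best_test_acc = -1.0

for epoch in range(5):
    dev_acc = evaluate()
    test_acc = evaluate()
    if dev_acc > best_dev_acc:       # dev set decides which epoch "wins"
        best_dev_acc = dev_acc
        best_test_acc = test_acc     # test score frozen at that epoch
        # io_utils.dump_data(model, fn) is called at this point in the original
    print('epoch %d  dev %.3f  best dev %.3f  test@best %.3f'
          % (epoch + 1, dev_acc, best_dev_acc, best_test_acc))
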
Example #5
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim,
                                                       args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim,
                                                          args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt,
                                                                 args.lr)
    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = load_train_data(
        args)
    dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, dev_max_char_len = load_dev_data(
        args)

    if dev_corpus:
        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')
    """ load word embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading pre-trained word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list,
                                            vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        print '\tUse random-initialized word embeddings...\n'
        w_emb_dim = args.w_emb_dim
    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))
    """ converting into ids """
    print '\nConverting into IDs...\n'

    tr_x, tr_c, tr_b, tr_y = preprocessor.convert_into_ids(
        train_corpus, vocab_word, vocab_char, vocab_tag)

    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = preprocessor.convert_into_ids(
            dev_corpus, vocab_word, vocab_char, vocab_tag)
        print '\tTrain Sentences: %d  Dev Sentences: %d' % (len(train_corpus),
                                                            len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d  Char size: %d' % (vocab_word.size(),
                                              vocab_char.size())
    """ tagger set up """
    tagger = set_model(args, init_w_emb, w_emb_dim, vocab_word, vocab_char,
                       vocab_tag)

    train_f = theano.function(inputs=tagger.input,
                              outputs=[tagger.nll, tagger.result],
                              updates=tagger.updates,
                              mode='FAST_RUN')

    dev_f = theano.function(inputs=tagger.input[:-1],
                            outputs=tagger.result,
                            mode='FAST_RUN')

    def _train():
        print '\nTRAINING START\n'

        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_x))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\n\tTrain set'
            print '\t\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                if args.model == 'char':
                    loss, corrects = train_f(tr_x[index], tr_c[index],
                                             tr_b[index], tr_y[index], _lr)
                else:
                    loss, corrects = train_f(tr_x[index], tr_y[index], _lr)

                assert math.isnan(loss) is False, index

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\n\t\tTime: %f seconds' % (end - start)
            print '\t\tNegative Log Likelihood: %f' % losses
            print '\t\tAccuracy:%f  Total:%d  Correct:%d' % (
                (correct / total), total, correct)

            if args.save:
                io_utils.dump_data(
                    tagger, 'model-%s.epoch-%d' % (args.model, epoch + 1))

            _dev(dev_f)

    def _dev(_dev_f):
        print '\n\tDev set'
        print '\t\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_x)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            if args.model == 'char':
                corrects = _dev_f(dev_x[index], dev_c[index], dev_b[index],
                                  dev_y[index])
            else:
                corrects = _dev_f(dev_x[index], dev_y[index])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()

        print '\n\t\tTime: %f seconds' % (end - start)
        print '\t\tAccuracy:%f  Total:%d  Correct:%d' % (
            (correct / total), total, correct)

    _train()
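
Examples #5-#7 build their Theano functions from `tagger.input`, a list of symbolic variables exposed by the model with the learning rate last, so the dev function can simply drop it with `tagger.input[:-1]`. The toy model below is only a sketch of that convention, not the real tagger; its loss and result expressions are made up.

import numpy as np
import theano
import theano.tensor as T

# Sketch of the tagger.input convention: inputs exposed as a list, learning
# rate last, so train_f takes the full list and dev_f drops it with [:-1].
class ToyModel(object):
    def __init__(self):
        x = T.ivector('x')
        y = T.ivector('y')
        lr = T.fscalar('lr')
        self.input = [x, y, lr]               # lr deliberately kept last
        self.result = T.eq(x, y)              # 1 where "prediction" == gold
        self.nll = T.sum((x - y) ** 2) * lr   # toy loss, not the real NLL

model = ToyModel()
train_f = theano.function(inputs=model.input,
                          outputs=[model.nll, model.result])
dev_f = theano.function(inputs=model.input[:-1],   # no learning rate needed
                        outputs=model.result)

x_val = np.asarray([1, 2, 3], dtype='int32')
y_val = np.asarray([1, 0, 3], dtype='int32')
print(train_f(x_val, y_val, np.float32(0.1)))  # [toy loss, per-token matches]
print(dev_f(x_val, y_val))                     # per-token matches only
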
Example #6
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt, args.lr)

    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = load_train_data(args)
    dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, dev_max_char_len = load_dev_data(args)

    if dev_corpus:
        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')

    """ load word embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading pre-trained word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        print '\tUse random-initialized word embeddings...\n'
        w_emb_dim = args.w_emb_dim

    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))

    """ converting into ids """
    print '\nConverting into IDs...\n'

    tr_x, tr_c, tr_b, tr_y = preprocessor.convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag)

    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = preprocessor.convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag)
        print '\tTrain Sentences: %d  Dev Sentences: %d' % (len(train_corpus), len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d  Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ tagger set up """
    tagger = set_model(args, init_w_emb, w_emb_dim, vocab_word, vocab_char, vocab_tag)

    train_f = theano.function(
        inputs=tagger.input,
        outputs=[tagger.nll, tagger.result],
        updates=tagger.updates,
        mode='FAST_RUN'
    )

    dev_f = theano.function(
        inputs=tagger.input[:-1],
        outputs=tagger.result,
        mode='FAST_RUN'
    )

    def _train():
        print '\nTRAINING START\n'

        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch+1)
            indices = range(len(tr_x))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\n\tTrain set'
            print '\t\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                if args.model == 'char':
                    loss, corrects = train_f(tr_x[index], tr_c[index], tr_b[index], tr_y[index], _lr)
                else:
                    loss, corrects = train_f(tr_x[index], tr_y[index], _lr)

                assert math.isnan(loss) is False, index

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\n\t\tTime: %f seconds' % (end - start)
            print '\t\tNegative Log Likelihood: %f' % losses
            print '\t\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)

            if args.save:
                io_utils.dump_data(tagger, 'model-%s.epoch-%d' % (args.model, epoch+1))

            _dev(dev_f)

    def _dev(_dev_f):
        print '\n\tDev set'
        print '\t\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_x)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            if args.model == 'char':
                corrects = _dev_f(dev_x[index], dev_c[index], dev_b[index], dev_y[index])
            else:
                corrects = _dev_f(dev_x[index], dev_y[index])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()

        print '\n\t\tTime: %f seconds' % (end - start)
        print '\t\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)

    _train()
Example #7
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim,
                                                       args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim,
                                                          args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt,
                                                                 args.lr)
    print '\tMINI-BATCH: %d\n' % args.batch_size
    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = io_utils.load_conll(
        args.train_data)
    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))

    dev_corpus = None
    if args.dev_data:
        dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, max_char_len_dev = io_utils.load_conll(
            args.dev_data)

        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')
    """ load pre-trained embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list,
                                            vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        w_emb_dim = args.w_emb_dim
    """ converting into ids """
    print '\nConverting into IDs...\n'
    tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word,
                                              vocab_char, vocab_tag,
                                              max_char_len)
    tr_x, tr_c, tr_y, tr_b = set_minibatch(tr_x, tr_c, tr_y, max_char_len,
                                           args.batch_size)
    tr_x, tr_c, tr_y = shared_samples(tr_x, tr_c, tr_y)

    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word,
                                                      vocab_char, vocab_tag,
                                                      max_char_len_dev)
        dev_x, dev_c, dev_y, dev_b = set_minibatch(dev_x, dev_c, dev_y,
                                                   max_char_len_dev, 1)
        dev_x, dev_c, dev_y = shared_samples(dev_x, dev_c, dev_y)
        print '\tTrain Sentences: %d  Dev Sentences: %d' % (len(train_corpus),
                                                            len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d  Char size: %d' % (vocab_word.size(),
                                              vocab_char.size())
    """ set model parameters """
    w_hidden_dim = args.w_hidden_dim
    c_emb_dim = args.c_emb_dim
    c_hidden_dim = args.c_hidden_dim
    output_dim = vocab_tag.size()
    window = args.window
    opt = args.opt
    """ symbol definition """
    print '\tCompiling Theano Code...'
    bos = T.iscalar('bos')
    eos = T.iscalar('eos')
    n_words = T.iscalar('n_words')
    batch_size = T.iscalar('batch_size')
    x = T.imatrix('x')
    c = T.itensor4('c')
    y = T.ivector('y')
    lr = T.fscalar('lr')
    """ tagger set up """
    tagger = Model(x=x,
                   c=c,
                   y=y,
                   n_words=n_words,
                   batch_size=batch_size,
                   lr=lr,
                   init_emb=init_w_emb,
                   vocab_w_size=vocab_word.size(),
                   w_emb_dim=w_emb_dim,
                   w_hidden_dim=w_hidden_dim,
                   c_emb_dim=c_emb_dim,
                   c_hidden_dim=c_hidden_dim,
                   output_dim=output_dim,
                   vocab_c_size=vocab_char.size(),
                   window=window,
                   opt=opt)

    train_f = theano.function(inputs=[bos, eos, n_words, batch_size, lr],
                              outputs=[tagger.nll, tagger.result],
                              updates=tagger.updates,
                              givens={
                                  x: tr_x[bos:eos],
                                  c: tr_c[bos:eos],
                                  y: tr_y[bos:eos]
                              },
                              mode='FAST_RUN')

    dev_f = theano.function(inputs=[bos, eos, n_words, batch_size],
                            outputs=tagger.result,
                            givens={
                                x: dev_x[bos:eos],
                                c: dev_c[bos:eos],
                                y: dev_y[bos:eos]
                            },
                            mode='FAST_RUN')

    def _train():
        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_b))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                # boundary = (bos, eos, n_words, batch_size) for this batch
                boundary = tr_b[index]
                loss, corrects = train_f(boundary[0], boundary[1], boundary[2],
                                         boundary[3], _lr)

                assert math.isnan(loss) is False, i

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\tTime: %f seconds' % (end - start)
            print '\tNegative Log Likelihood: %f' % losses
            print '\tAccuracy:%f  Total:%d  Correct:%d' % (
                (correct / total), total, correct)

            _dev(dev_f)

    def _dev(model):
        print '\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_b)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            boundary = dev_b[index]
            corrects = model(boundary[0], boundary[1], boundary[2],
                             boundary[3])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()
        print '\tTime: %f seconds' % (end - start)
        print '\tAccuracy:%f  Total:%d  Correct:%d' % (
            (correct / total), total, correct)

    _train()
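
`set_minibatch` itself is not shown in any of these examples; the sketch below is an assumption about what its boundary tuples contain, inferred from how they are consumed: `train_f(boundary[0], boundary[1], boundary[2], boundary[3], _lr)` lines up with the declared inputs `[bos, eos, n_words, batch_size, lr]`, and the corpus is sorted by sentence length beforehand. `make_boundaries` is a hypothetical helper name.

# make_boundaries is a hypothetical helper, not code from these examples: it
# groups length-sorted sentences into batches and records boundary tuples of
# the form consumed above, (bos, eos, n_words, batch_size) per mini-batch.
def make_boundaries(sent_lengths, batch_size):
    boundaries = []
    bos = 0
    i = 0
    while i < len(sent_lengths):
        n_words = sent_lengths[i]
        j = i
        # take up to batch_size consecutive sentences of identical length
        while j < len(sent_lengths) and j - i < batch_size \
                and sent_lengths[j] == n_words:
            j += 1
        eos = bos + n_words * (j - i)          # end offset in the flat data
        boundaries.append((bos, eos, n_words, j - i))
        bos = eos
        i = j
    return boundaries

# the corpus is sorted by length (train_corpus.sort(key=lambda a: len(a)))
print(make_boundaries([2, 2, 2, 3, 3, 4], batch_size=2))
# -> [(0, 4, 2, 2), (4, 6, 2, 1), (6, 12, 3, 2), (12, 16, 4, 1)]
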
Example #8
def train(argv):
    print '\nSETTING UP A TRAINING SETTING\n'

    emb = None
    batch_size = argv.batch_size
    window = argv.window

    print 'SETTING: Batch: %d  Window: %d  Learning Rate: %f' % (
        batch_size, window, argv.lr)

    ##############
    # LOAD FILES #
    ##############
    """ Load files """
    # corpus: 1D: n_sents, 2D: n_words, 3D: (word, pas_info, pas_id)
    tr_corpus, vocab_word = io_utils.load(argv.train_data)
    print '\nTRAIN CORPUS'
    corpus_statistics(tr_corpus)

    if argv.dev_data:
        dev_corpus, vocab_word = io_utils.load(argv.dev_data, vocab_word)
        print '\nDEV CORPUS'
        corpus_statistics(dev_corpus)

    if argv.test_data:
        test_corpus, vocab_word = io_utils.load(argv.test_data, vocab_word)
        print '\nTEST CORPUS'
        corpus_statistics(test_corpus)

    print '\nVocab: %d' % vocab_word.size()
    if argv.save:
        io_utils.dump_data(vocab_word, 'Vocab.size-%d' % vocab_word.size())

    ##############
    # PREPROCESS #
    ##############
    """ Preprocessing """
    # samples: 1D: n_sents, 2D: [word_ids, tag_ids, prd_indices, contexts]
    train_samples = sample_format(tr_corpus, vocab_word, window)
    n_tr_samples = len(train_samples)

    if argv.dev_data:
        dev_samples = sample_format(dev_corpus, vocab_word, window)
        n_dev_samples = len(dev_samples)

    if argv.test_data:
        test_samples = sample_format(test_corpus, vocab_word, window)
        n_te_samples = len(test_samples)

    # dataset = [x, y, l]
    # x=features: 1D: n_samples * n_words, 2D: window; elem=word id
    # y=labels: 1D: n_samples; elem=scalar
    # l=question length: 1D: n_samples * 2; elem=scalar
    # bb_x=batch indices for x: 1D: n_samples / batch_size + 1; elem=(bob, eob)
    # bb_y=batch indices for y: 1D: n_samples / batch_size + 1; elem=(bob, eob)
    tr_dataset, tr_bb_x, tr_bb_y = theano_format(train_samples, batch_size)

    if argv.dev_data:
        dev_dataset, dev_bb_x, dev_bb_y = theano_format(
            dev_samples, batch_size)

    if argv.test_data:
        te_dataset, te_bb_x, te_bb_y = theano_format(test_samples, batch_size)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    """ Set a model """
    print '\n\nBuilding a model...'
    model = set_model(argv=argv, emb=emb, vocab=vocab_word)
    train_f = set_train_f(model, tr_dataset)

    if argv.dev_data:
        dev_f = set_predict_f(model, dev_dataset)

    if argv.test_data:
        test_f = set_predict_f(model, te_dataset)

    ###############
    # TRAIN MODEL #
    ###############

    print '\nTRAINING START\n'
    indices = range(len(tr_bb_y))
    best_dev_acc = -1.
    best_test_acc = -1.

    for epoch in xrange(argv.epoch):
        print '\n\nEPOCH: %d' % (epoch + 1)
        print '\tTRAIN\n\t',

        np.random.shuffle(indices)
        start = time.time()

        ttl_nll = 0.
        ttl_crr = 0.
        for i, b_index in enumerate(indices):
            if (i + 1) % 100 == 0:
                print i + 1,
                sys.stdout.flush()

            bb_x_i = tr_bb_x[b_index]
            bb_y_i = tr_bb_y[b_index]
            crr, nll = train_f(index=b_index,
                               bob_x=bb_x_i[0],
                               eob_x=bb_x_i[1],
                               bob_y=bb_y_i[0],
                               eob_y=bb_y_i[1])

            assert not math.isnan(nll), '\nNLL is nan: %d\n' % i

            ttl_crr += np.sum(crr)
            ttl_nll += nll

        end = time.time()
        print '\n\tTime: %f\tNLL: %f' % ((end - start), ttl_nll)
        print '\tACC: %f  CRR: %d   TOTAL: %d' % (ttl_crr / n_tr_samples,
                                                  ttl_crr, n_tr_samples)

        update = False
        if argv.dev_data:
            print '\n\tDEV\n\t',
            dev_acc = predict(dev_f, dev_bb_x, dev_bb_y, n_dev_samples)
            if best_dev_acc < dev_acc:
                best_dev_acc = dev_acc
                update = True
                if argv.save:
                    fn = 'Model.Layer-%d.Sim-%s.Act-%s.Opt-%s.Batch-%d.reg-%f' %\
                         (argv.layer, argv.sim, argv.activation, argv.opt, argv.batch_size, argv.reg)
                    io_utils.dump_data(model, fn)

        if argv.test_data:
            print '\n\tTEST\n\t',
            test_acc = predict(test_f, te_bb_x, te_bb_y, n_te_samples)
            if update:
                best_test_acc = test_acc

        print '\n\tBEST DEV ACC: %f  TEST ACC: %f' % (best_dev_acc,
                                                      best_test_acc)
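
The `predict()` helper used for the dev and test passes in the last example is not shown either. A plausible reading, given how it is called, is that it walks the `(bob, eob)` boundary lists, runs the compiled function per batch, and returns overall accuracy over `n_samples`; the sketch below assumes exactly that and uses a dummy batch function in place of `dev_f` / `test_f`.

import numpy as np

# Hedged sketch of predict(): iterate batch boundaries, call the compiled
# function per batch, and return accuracy. dummy_f stands in for dev_f/test_f.
def predict(f, bb_x, bb_y, n_samples):
    crr = 0.0
    for b_index in range(len(bb_y)):
        bb_x_i = bb_x[b_index]
        bb_y_i = bb_y[b_index]
        crr += np.sum(f(index=b_index,
                        bob_x=bb_x_i[0], eob_x=bb_x_i[1],
                        bob_y=bb_y_i[0], eob_y=bb_y_i[1]))
    return crr / n_samples

def dummy_f(index, bob_x, eob_x, bob_y, eob_y):
    # pretend every label in the batch range [bob_y, eob_y) is correct
    return np.ones(eob_y - bob_y, dtype='int32')

print(predict(dummy_f, bb_x=[(0, 8), (8, 16)], bb_y=[(0, 4), (4, 8)],
              n_samples=8))  # -> 1.0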