Example #1
def test(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt, args.lr)

    """ load vocab """
    print 'Loading vocabularies...\n'
    vocab_word = io_utils.load_data('vocab_word')
    vocab_char = io_utils.load_data('vocab_char')
    vocab_tag = io_utils.load_data('vocab_tag')
    print '\tWord size: %d  Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ load data """
    print '\nLoading data set...\n'
    test_corpus, test_vocab_word, test_vocab_char, test_vocab_tag, _ = io_utils.load_conll(args.dev_data)
    print '\tTest Sentences: %d' % len(test_corpus)

    """ converting into ids """
    print '\nConverting into IDs...\n'
    test_x, test_c, test_b, test_y = preprocessor.convert_into_ids(test_corpus, vocab_word, vocab_char, vocab_tag)

    """ tagger set up """
    tagger = io_utils.load_data(args.load)

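    # tagger.input[:-1] reuses the model's symbolic input list but drops its
    # last entry (presumably the learning rate, which is only needed for the
    # training updates, not for prediction).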
    dev_f = theano.function(
        inputs=tagger.input[:-1],
        outputs=tagger.result,
        mode='FAST_RUN'
    )

    """ Prediction """
    print '\nPREDICTION START\n'

    print '\tBatch Index: ',
    start = time.time()

    total = 0.0
    correct = 0

    for index in xrange(len(test_x)):
        if index % 100 == 0 and index != 0:
            print index,
            sys.stdout.flush()

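        # The char-based tagger consumes all four arrays produced by
        # convert_into_ids above; the word-only tagger needs just the word IDs
        # and the gold tags.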
        if tagger.name == 'char':
            corrects = dev_f(test_x[index], test_c[index], test_b[index], test_y[index])
        else:
            corrects = dev_f(test_x[index], test_y[index])

        total += len(corrects)
        correct += np.sum(corrects)

    end = time.time()

    print '\n\tTime: %f seconds' % (end - start)
    print '\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt, args.lr)
    print '\tMINI-BATCH: %d\n' % args.batch_size

    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, _ = io_utils.load_conll(args.train_data)

    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
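    # Sorting sentences by length keeps each mini-batch roughly uniform in
    # length, so set_minibatch below can group sentences of similar size.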
    train_corpus.sort(key=lambda a: len(a))

    dev_corpus = None
    if args.dev_data:
        dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, _ = io_utils.load_conll(args.dev_data)

        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')

    """ load word embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading pre-trained word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        print '\tUsing randomly initialized word embeddings...\n'
        w_emb_dim = args.w_emb_dim

    """ converting into ids """
    print '\nConverting into IDs...\n'

    tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag)
    tr_x, tr_y, tr_b = set_minibatch(tr_x, tr_y, args.batch_size)
    tr_x, tr_y = shared_samples(tr_x, tr_y)

    dev_x = None
    dev_y = None
    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag)
        dev_x, dev_y, dev_b = set_minibatch(dev_x, dev_y, 1)
        dev_x, dev_y = shared_samples(dev_x, dev_y)
        print '\tTrain Sentences: %d  Dev Sentences: %d' % (len(train_corpus), len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d  Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ set model parameters """
    hidden_dim = args.w_hidden_dim
    output_dim = vocab_tag.size()
    window = args.window
    opt = args.opt

    """ symbol definition """
    print '\tCompiling Theano Code...'
    bos = T.iscalar('bos')
    eos = T.iscalar('eos')
    n_words = T.iscalar('n_words')
    batch_size = T.iscalar('batch_size')
    x = T.imatrix('x')
    y = T.ivector('y')
    lr = T.fscalar('lr')

    """ tagger set up """
    tagger = Model(x=x, y=y, n_words=n_words, batch_size=batch_size, lr=lr, init_emb=init_w_emb,
                   vocab_size=vocab_word.size(), emb_dim=w_emb_dim, hidden_dim=hidden_dim, output_dim=output_dim,
                   opt=opt, window=window)

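    # The givens slice the shared training arrays with the symbolic boundaries
    # (bos, eos), so each call only passes a few scalars instead of copying the
    # batch itself; dev_f below uses the same pattern on the dev arrays.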
    train_f = theano.function(
        inputs=[bos, eos, n_words, batch_size, lr],
        outputs=[tagger.nll, tagger.result],
        updates=tagger.updates,
        givens={
            x: tr_x[bos: eos],
            y: tr_y[bos: eos]
        },
        mode='FAST_RUN'
    )

    dev_f = theano.function(
        inputs=[bos, eos, n_words, batch_size],
        outputs=tagger.result,
        givens={
            x: dev_x[bos: eos],
            y: dev_y[bos: eos]
        },
        mode='FAST_RUN'
    )

    def _train():
        for epoch in xrange(args.epoch):
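            # Simple 1/t decay: half the learning rate in epoch 2, a third of
            # it in epoch 3, and so on.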
            _lr = args.lr / float(epoch+1)
            indices = range(len(tr_b))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

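                # Each entry of tr_b holds (bos, eos, n_words, batch_size) for
                # one mini-batch, matching the first four inputs of train_f.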
                boundary = tr_b[index]
                loss, corrects = train_f(boundary[0], boundary[1], boundary[2], boundary[3], _lr)

                assert not math.isnan(loss), i

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\tTime: %f seconds' % (end - start)
            print '\tNegative Log Likelihood: %f' % losses
            print '\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)

            _dev(dev_f)

    def _dev(model):
        print '\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_b)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            boundary = dev_b[index]
            corrects = model(boundary[0], boundary[1], boundary[2], boundary[3])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()
        print '\tTime: %f seconds' % (end - start)
        print '\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)

    _train()
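A minimal driver for this example, assuming an argparse interface whose flag names mirror the attributes read by train() and test() above; the --mode switch and all default values are placeholders rather than part of the original code:

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Neural POS tagger (word model)')
    parser.add_argument('--mode', default='train', help='train or test')
    parser.add_argument('--train_data', help='training data in CoNLL format')
    parser.add_argument('--dev_data', help='development/test data in CoNLL format')
    parser.add_argument('--load', help='pickled tagger to evaluate in test mode')
    parser.add_argument('--save', action='store_true', help='dump vocabularies after loading')
    parser.add_argument('--word_list', default=None)
    parser.add_argument('--emb_list', default=None, help='pre-trained word embedding file')
    parser.add_argument('--vocab_size', type=int, default=None)
    parser.add_argument('--data_size', type=int, default=100000)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--w_emb_dim', type=int, default=100)
    parser.add_argument('--w_hidden_dim', type=int, default=100)
    parser.add_argument('--c_emb_dim', type=int, default=10)
    parser.add_argument('--c_hidden_dim', type=int, default=50)
    parser.add_argument('--window', type=int, default=5)
    parser.add_argument('--opt', default='sgd')
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epoch', type=int, default=10)
    args = parser.parse_args()
    if args.mode == 'train':
        train(args)
    else:
        test(args)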
Example #3
def main(argv):
    print '\nSYSTEM START'
    print '\nMODE: Training'

    ###################
    # PREPROCESS DATA #
    ###################

    """ Load initial embedding file """
    vocab_word = Vocab()
    emb = None
    if argv.init_emb:
        print '\n\tInitial Embedding Loading...'
        emb, vocab_word = load_init_emb(init_emb=argv.init_emb)
        print '\t\tVocabulary Size: %d' % vocab_word.size()

    """ Load corpora """
    print '\n\tLoading Corpora...'
    tr_corpus, tr_doc_names, vocab_word = load_conll(path=argv.train_data, vocab=vocab_word, data_size=argv.data_size)
    dev_corpus, dev_doc_names, _ = load_conll(path=argv.dev_data, vocab=vocab_word, data_size=argv.data_size)
    print '\t\tTrain Documents: %d' % len(tr_corpus)
    print '\t\tDev   Documents: %d' % len(dev_corpus)

    """ Extract gold mentions CoNLL-2012: Train=155,560, Dev=19,156, Test=19,764 """
    # gold_mentions: 1D: n_doc, 2D: n_sents, 3D: n_mentions: elem=(bos, eos)
    # gold_corefs: 1D: n_doc, 2D: n_sents, 3D: n_mentions: elem=coref_id
    print '\n\tExtracting Gold Mentions...'
    print '\t\tTRAIN',
    tr_gold_ments = get_gold_mentions(tr_corpus, check=argv.check)
    print '\t\tDEV  ',
    dev_gold_ments = get_gold_mentions(dev_corpus)

    """ Extract cand mentions """
    # cand_mentions: 1D: n_doc, 2D: n_sents, 3D: n_mentions; elem=(bos, eos)
    print '\n\tExtracting Cand Mentions...'
    print '\t\tTRAIN',
    tr_cand_ments = get_cand_mentions(tr_corpus, check=argv.check)
    print '\t\tDEV  ',
    dev_cand_ments = get_cand_mentions(dev_corpus)

    """ Convert words into IDs """
    print '\n\tConverting Words into IDs...'
    print '\t\tVocabulary Size: %d' % vocab_word.size()

    tr_word_ids = convert_words_into_ids(corpus=tr_corpus, vocab_word=vocab_word)
    dev_word_ids = convert_words_into_ids(corpus=dev_corpus, vocab_word=vocab_word)

    """ Set word ids for mentions """
    tr_gold_ments = set_word_id_for_ment(tr_word_ids, tr_gold_ments)
    tr_cand_ments = set_word_id_for_ment(tr_word_ids, tr_cand_ments)
    dev_gold_ments = set_word_id_for_ment(dev_word_ids, dev_gold_ments)
    dev_cand_ments = set_word_id_for_ment(dev_word_ids, dev_cand_ments)

    """ Set coref ids for cand mentions """
    tr_cand_ments = set_cand_ment_coref(tr_gold_ments, tr_cand_ments)
    dev_cand_ments = set_cand_ment_coref(dev_gold_ments, dev_cand_ments)

    """ Check the coverage: Coverage 95.0%, Rate 1:3.5 by Berkeley System """
    print '\n\tChecking the Coverage of the Candidate Mentions...'
    check_coverage_of_cand_mentions(tr_gold_ments, tr_cand_ments)
    check_coverage_of_cand_mentions(dev_gold_ments, dev_cand_ments)

    """ Extract features """
    print '\n\tExtracting features...'

    """
    phi = (span, word, ctx, dist, label, position)
    span    : 1D: n_doc, 2D: n_ments, 3D: n_cand_ants, 4D: limit * 2; elem=word id
    word    : 1D: n_doc, 2D: n_ments, 3D: n_cand_ants, 4D: [m_first, m_last, a_first, a_last]; elem=word id
    ctx     : 1D: n_doc, 2D: n_ments, 3D: n_cand_ants, 4D: window * 2 * 2; elem=word id
    dist    : 1D: n_doc, 2D: n_ments, 3D: n_cand_ants; elem=sent dist
    label   : 1D: n_doc, 2D: n_ments; elem=0/1
    position: 1D: n_doc, 2D: n_ments, 3D: n_cand_ants; elem=(sent_m_i, span_m, sent_a_i, span_a)
    """

    tr_phi, tr_posit = get_features(tr_cand_ments, False, argv.n_cands)
    dev_phi, dev_posit = get_features(dev_cand_ments, True, argv.n_cands)

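    # Bookkeeping below: n_*_phi_total counts the feature tuples across all
    # documents and mentions, n_*_phi_t sums the binary labels stored in each
    # tuple's last slot (the positive pairs), and n_*_phi_f is the remainder.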
    """ Count the number of features """
    n_tr_phi_total = reduce(lambda a, b: a + reduce(lambda c, d: c + len(d), b, 0), tr_phi, 0)
    n_tr_phi_t = reduce(lambda a, b: a + reduce(lambda c, d: c + reduce(lambda e, f: e + np.sum(f[-1]), d, 0), b, 0), tr_phi, 0)
    n_tr_phi_f = n_tr_phi_total - n_tr_phi_t

    n_dev_phi_total = reduce(lambda a, b: a + reduce(lambda c, d: c + len(d), b, 0), dev_phi, 0)
    n_dev_phi_t = reduce(lambda a, b: a + reduce(lambda c, d: c + reduce(lambda e, f: e + np.sum(f[-1]), d, 0), b, 0), dev_phi, 0)
    n_dev_phi_f = n_dev_phi_total - n_dev_phi_t
    print '\t\tTrain Features Total: %d\tRate: P:N\t%d:%d' % (n_tr_phi_total, n_tr_phi_t, n_tr_phi_f)
    print '\t\tDev   Features Total: %d\tRate: P:N\t%d:%d' % (n_dev_phi_total, n_dev_phi_t, n_dev_phi_f)

    """ Convert into the Theano format """
    print '\n\tConverting features into the Theano Format...'

    """
    samples = (span, word, ctx, dist, label)
    span   : 1D: n_doc * n_ments * n_cand_ants, 2D: limit * 2; elem=word id
    word   : 1D: n_doc * n_ments * n_cand_ants, 2D: [m_first, m_last, a_first, a_last]; elem=word id
    ctx    : 1D: n_doc * n_ments * n_cand_ants, 2D: window * 2 * 2; elem=word id
    dist   : 1D: n_doc * n_ments * n_cand_ants; elem=sent dist
    label  : 1D: n_doc * n_ments * n_cand_ants; elem=0/1
    indices: 1D: n_doc * n_ments; elem=(bos, eos)
    """

    tr_samples, tr_indices = theano_format(tr_phi)
    dev_samples, dev_indices = theano_format(dev_phi)

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '\nBuilding the model...'

    model = set_model(argv, vocab_word, emb)

    bos = T.iscalar('bos')
    eos = T.iscalar('eos')

    train_f = theano.function(
        inputs=[bos, eos],
        outputs=[model.nll, model.correct, model.correct_t, model.correct_f, model.total_p, model.total_r],
        updates=model.updates,
        givens={
            model.x_span: tr_samples[0][bos: eos],
            model.x_word: tr_samples[1][bos: eos],
            model.x_ctx : tr_samples[2][bos: eos],
            model.x_dist: tr_samples[3][bos: eos],
            model.x_slen: tr_samples[4][bos: eos],
            model.y     : tr_samples[5][bos: eos]
        },
        mode='FAST_RUN'
    )

    dev_f = theano.function(
        inputs=[bos, eos],
        outputs=[model.y_hat_index, model.p_y_hat,
                 model.correct, model.correct_t, model.correct_f, model.total_p, model.total_r],
        givens={
            model.x_span: dev_samples[0][bos: eos],
            model.x_word: dev_samples[1][bos: eos],
            model.x_ctx : dev_samples[2][bos: eos],
            model.x_dist: dev_samples[3][bos: eos],
            model.x_slen: dev_samples[4][bos: eos],
            model.y     : dev_samples[5][bos: eos]
        },
        mode='FAST_RUN'
    )

    ###############
    # TRAIN MODEL #
    ###############

    batch_size = argv.batch
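    # Python 2 integer division: the number of full mini-batches over all
    # mention/antecedent samples; a trailing partial batch is simply dropped.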
    n_batches = n_tr_phi_total / batch_size
    indices = range(n_batches)

    print 'Training START\n'
    print 'Mini-Batches: %d\n' % n_batches

    for epoch in xrange(argv.epoch):
        random.shuffle(indices)

        print '\nEpoch: %d' % (epoch + 1)
        print 'TRAIN'
        print '\tIndex: ',
        start = time.time()

        total_loss = 0.
        correct = np.zeros(9, dtype='float32')
        correct_t = np.zeros(9, dtype='float32')
        correct_f = np.zeros(9, dtype='float32')
        total = 0.
        total_r = np.zeros(9, dtype='float32')
        total_p = np.zeros(9, dtype='float32')

        for i, index in enumerate(indices):
            if i % 1000 == 0 and i != 0:
                print '%d' % i,
                sys.stdout.flush()

            loss, crr, crr_t, crr_f, ttl_p, ttl_r = train_f(index * batch_size, (index+1) * batch_size)

            assert not math.isnan(loss), 'Index: %d  Batch Index: %d' % (i, index)

            total_loss += loss
            correct += crr
            correct_t += crr_t
            correct_f += crr_f
            total += batch_size
            total_p += ttl_p
            total_r += ttl_r

        end = time.time()
        print '\n\tTime: %f seconds' % (end - start)
        show_results(total, total_p, total_r, correct, correct_t, correct_f, total_loss)

        predict(epoch, dev_f, dev_corpus, dev_doc_names, dev_indices, dev_posit)
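The coreference trainer also reads its settings from an argparse-style object. A minimal sketch of the flags referenced in main() above; set_model() likely consumes further options not shown here, and the default values are placeholders:

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Mention-pair coreference trainer')
    parser.add_argument('--train_data', help='CoNLL-2012 training documents')
    parser.add_argument('--dev_data', help='CoNLL-2012 development documents')
    parser.add_argument('--init_emb', default=None, help='initial word embedding file')
    parser.add_argument('--data_size', type=int, default=None, help='limit on loaded documents')
    parser.add_argument('--check', action='store_true', help='verbose mention-extraction checks')
    parser.add_argument('--n_cands', type=int, default=10, help='candidate antecedents per mention')
    parser.add_argument('--batch', type=int, default=32, help='mini-batch size')
    parser.add_argument('--epoch', type=int, default=10)
    main(parser.parse_args())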
Example #4
def load_dev_data(args):
    if args.dev_data:
        return io_utils.load_conll(args.dev_data)
    else:
        return None, None, None, None, None
Example #5
def load_train_data(args):
    if args.train_data:
        return io_utils.load_conll(args.train_data)
    else:
        print 'Input: --train_data path'
        exit()
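These two helpers (Examples #4 and #5) wrap io_utils.load_conll so that a missing --dev_data is tolerated while a missing --train_data aborts. A typical call site, assuming the five-value return of load_conll seen in the other examples of this listing:

train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = load_train_data(args)
dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, _ = load_dev_data(args)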
Example #8
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim,
                                                       args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim,
                                                          args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt,
                                                                 args.lr)
    print '\tMINI-BATCH: %d\n' % args.batch_size
    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = io_utils.load_conll(
        args.train_data)
    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))

    dev_corpus = None
    if args.dev_data:
        dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, max_char_len_dev = io_utils.load_conll(
            args.dev_data)

        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')
    """ load pre-trained embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list,
                                            vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        w_emb_dim = args.w_emb_dim
    """ converting into ids """
    print '\nConverting into IDs...\n'
    tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word,
                                              vocab_char, vocab_tag,
                                              max_char_len)
    tr_x, tr_c, tr_y, tr_b = set_minibatch(tr_x, tr_c, tr_y, max_char_len,
                                           args.batch_size)
    tr_x, tr_c, tr_y = shared_samples(tr_x, tr_c, tr_y)

    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word,
                                                      vocab_char, vocab_tag,
                                                      max_char_len_dev)
        dev_x, dev_c, dev_y, dev_b = set_minibatch(dev_x, dev_c, dev_y,
                                                   max_char_len_dev, 1)
        dev_x, dev_c, dev_y = shared_samples(dev_x, dev_c, dev_y)
        print '\tTrain Sentences: %d  Dev Sentences: %d' % (len(train_corpus),
                                                            len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d  Char size: %d' % (vocab_word.size(),
                                              vocab_char.size())
    """ set model parameters """
    w_hidden_dim = args.w_hidden_dim
    c_emb_dim = args.c_emb_dim
    c_hidden_dim = args.c_hidden_dim
    output_dim = vocab_tag.size()
    window = args.window
    opt = args.opt
    """ symbol definition """
    print '\tCompiling Theano Code...'
    bos = T.iscalar('bos')
    eos = T.iscalar('eos')
    n_words = T.iscalar('n_words')
    batch_size = T.iscalar('batch_size')
    x = T.imatrix('x')
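    # c is a 4-D integer tensor holding the character IDs that accompany the
    # word matrix x; it is the extra input that distinguishes this char-level
    # model from the word-only tagger in Example #1.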
    c = T.itensor4('c')
    y = T.ivector('y')
    lr = T.fscalar('lr')
    """ tagger set up """
    tagger = Model(x=x,
                   c=c,
                   y=y,
                   n_words=n_words,
                   batch_size=batch_size,
                   lr=lr,
                   init_emb=init_w_emb,
                   vocab_w_size=vocab_word.size(),
                   w_emb_dim=w_emb_dim,
                   w_hidden_dim=w_hidden_dim,
                   c_emb_dim=c_emb_dim,
                   c_hidden_dim=c_hidden_dim,
                   output_dim=output_dim,
                   vocab_c_size=vocab_char.size(),
                   window=window,
                   opt=opt)

    train_f = theano.function(inputs=[bos, eos, n_words, batch_size, lr],
                              outputs=[tagger.nll, tagger.result],
                              updates=tagger.updates,
                              givens={
                                  x: tr_x[bos:eos],
                                  c: tr_c[bos:eos],
                                  y: tr_y[bos:eos]
                              },
                              mode='FAST_RUN')

    dev_f = theano.function(inputs=[bos, eos, n_words, batch_size],
                            outputs=tagger.result,
                            givens={
                                x: dev_x[bos:eos],
                                c: dev_c[bos:eos],
                                y: dev_y[bos:eos]
                            },
                            mode='FAST_RUN')

    def _train():
        for epoch in xrange(args.epoch):
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_b))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                boundary = tr_b[index]
                loss, corrects = train_f(boundary[0], boundary[1], boundary[2],
                                         boundary[3], _lr)

                assert not math.isnan(loss), i

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\tTime: %f seconds' % (end - start)
            print '\tNegative Log Likelihood: %f' % losses
            print '\tAccuracy:%f  Total:%d  Correct:%d' % (
                (correct / total), total, correct)

            _dev(dev_f)

    def _dev(model):
        print '\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_b)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            boundary = dev_b[index]
            corrects = model(boundary[0], boundary[1], boundary[2],
                             boundary[3])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()
        print '\tTime: %f seconds' % (end - start)
        print '\tAccuracy:%f  Total:%d  Correct:%d' % (
            (correct / total), total, correct)

    _train()