Example #1
def main(_):
    np.random.seed(1337)
    random.seed(1337)

    if FLAGS.only_test or FLAGS.train_steps == 0:
        FLAGS.train_steps = 0
        test(FLAGS)
        return

    print "#" * 67
    print "# Loading data from:"
    print "#" * 67
    print "Train:", FLAGS.train_data
    print "Valid:", FLAGS.valid_data
    print "Test: ", FLAGS.test_data
    print "Feature threshold:", FLAGS.feat_thresh

    # Load feature templates
    template = Template(FLAGS.template)

    # pretreatment process: read, split and create vocabularies
    train_set, valid_set, test_set, dicts, max_len = pretreatment(
        FLAGS.train_data, FLAGS.valid_data, FLAGS.test_data,
        threshold=FLAGS.feat_thresh, template=template)

    # Reset the maximum sentence length
    # max_len = max(MAX_LEN, max_len)
    FLAGS.max_len = max_len

    # unfold these corpora
    train_corpus, train_lens = train_set
    valid_corpus, valid_lens = valid_set
    test_corpus, test_lens = test_set
    train_sentcs, train_featvs, train_labels = unfold_corpus(train_corpus)
    valid_sentcs, valid_featvs, valid_labels = unfold_corpus(valid_corpus)
    test_sentcs, test_featvs, test_labels = unfold_corpus(test_corpus)

    # vocabularies
    feats2idx = dicts['feats2idx']
    words2idx = dicts['words2idx']
    label2idx = dicts['label2idx']
    FLAGS.label2idx = label2idx
    FLAGS.words2idx = words2idx
    FLAGS.feats2idx = feats2idx
    FLAGS.feat_size = len(feats2idx)

    print "Lexical word size:     %d" % len(words2idx)
    print "Label size:            %d" % len(label2idx)
    print "Features size:         %d" % len(feats2idx)
    print "-------------------------------------------------------------------"
    print "Training data size:    %d" % len(train_corpus)
    print "Validation data size:  %d" % len(valid_corpus)
    print "Test data size:        %d" % len(test_corpus)
    print "Maximum sentence len:  %d" % FLAGS.max_len

    del train_corpus
    del valid_corpus
    # del test_corpus

    # neural network's output_dim: one class per label, plus one extra
    # (presumably for the 0 id used as post-padding)
    nb_classes = len(label2idx)
    FLAGS.nb_classes = nb_classes + 1

    idx2label = dict((k, v) for v, k in FLAGS.label2idx.iteritems())
    # idx2words = dict((k, v) for v, k in FLAGS.words2idx.iteritems())

    # convert corpus from string seqs to numeric id seqs, post-padded with 0
    print "Preparing training, validation and test data."
    train_X, train_F, train_Y = conv_corpus(
        train_sentcs, train_featvs, train_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    valid_X, valid_F, valid_Y = conv_corpus(
        valid_sentcs, valid_featvs, valid_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    test_X, test_F, test_Y = conv_corpus(
        test_sentcs, test_featvs, test_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)

    del train_sentcs, train_featvs, train_labels
    del valid_sentcs, valid_featvs, valid_labels
    # del test_sentcs, test_featvs, test_labels

    # this CRF variant consumes only the feature and label matrices,
    # so the word-id matrices can be freed right away
    del train_X, valid_X, test_X

    print "#" * 67
    print "Training arguments"
    print "#" * 67
    print "L2 regular:    %f" % FLAGS.l2_reg
    print "nb_classes:    %d" % FLAGS.nb_classes
    print "Batch size:    %d" % FLAGS.batch_size
    # print "Hidden layer:  %d" % FLAGS.hidden_dim
    print "Train epochs:  %d" % FLAGS.train_steps
    print "Learning rate: %f" % FLAGS.lr

    print "#" * 67
    print "Training process starts."
    print "#" * 67

    model = linear_chain_CRF(
        FLAGS.feat_size, FLAGS.nb_classes, FLAGS.max_len,
        FLAGS.batch_size, len(template.template), FLAGS.l2_reg)

    pred_test, test_loss, test_acc = model.run(
        train_F, train_Y, train_lens,
        valid_F, valid_Y, valid_lens,
        test_F, test_Y, test_lens,
        FLAGS)

    print "Test loss: %f, accuracy: %f" % (test_loss, test_acc)
    pred_test = [pred_test[i][:test_lens[i]] for i in xrange(len(pred_test))]
    pred_test_label = convert_id_to_word(pred_test, idx2label)
    if FLAGS.eval_test:
        res_test, pred_test_label = evaluate(pred_test_label, test_labels)
        print "Test F1: %f, P: %f, R: %f" % (res_test['f1'], res_test['p'], res_test['r'])
    original_text = [[item['w'] for item in sent] for sent in test_corpus]
    write_prediction(FLAGS.output_dir + 'prediction.utf8',
                     original_text, pred_test_label)

    print "Saving feature dicts..."
    save_dicts(FLAGS.output_dir, FLAGS.feats2idx,
               FLAGS.words2idx, FLAGS.label2idx)
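
Note on the post-processing above: predictions come back padded to max_len, so they are first trimmed to each sentence's true length and then mapped back to label strings through the inverted label2idx. A minimal, self-contained sketch of that step (the dictionary and lengths below are made up for illustration, not taken from any repo):

# Sketch: trim padded id predictions, then map ids back to labels.
# Assumes id 0 is the post-padding value, as in conv_corpus above.
label2idx = {'B': 1, 'I': 2, 'O': 3}
idx2label = dict((idx, lab) for lab, idx in label2idx.items())

pred = [[1, 2, 3, 0, 0], [3, 3, 0, 0, 0]]   # padded to max_len = 5
test_lens = [3, 2]                          # true sentence lengths

trimmed = [p[:n] for p, n in zip(pred, test_lens)]
labels = [[idx2label[i] for i in sent] for sent in trimmed]
print(labels)   # [['B', 'I', 'O'], ['O', 'O']]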
Example #2
def main(_):
    np.random.seed(1337)
    random.seed(1337)

    if FLAGS.only_test or FLAGS.train_steps == 0:
        FLAGS.train_steps = 0
        test(FLAGS)
        return

    print "#" * 67
    print "# Loading data from:"
    print "#" * 67
    print "Train:", FLAGS.train_data
    print "Valid:", FLAGS.valid_data
    print "Test: ", FLAGS.test_data

    if FLAGS.window == 1:
        win = (0, 0)
    elif FLAGS.window == 3:
        win = (-1, 1)
    elif FLAGS.window == 5:
        win = (-2, 2)
    else:
        raise ValueError('Unsupported window size %d.' % FLAGS.window)
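    # (For any odd window size w, the branches above are equivalent to
    #  win = (-(w // 2), w // 2); the explicit cases simply restrict w
    #  to the supported sizes 1, 3 and 5.)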

    # Choose field templates & feature templates
    template = HybridTemplate(FLAGS.template, win)
    # pretreatment process: read, split and create vocabularies
    train_set, valid_set, test_set, dicts, max_len = pretreatment(
        FLAGS.train_data,
        FLAGS.valid_data,
        FLAGS.test_data,
        threshold=0,
        template=template)

    # Reset the maximum sentence length
    # max_len = max(MAX_LEN, max_len)
    FLAGS.max_len = max_len

    # unfold these corpora
    train_corpus, train_lens = train_set
    valid_corpus, valid_lens = valid_set
    test_corpus, test_lens = test_set
    train_sentcs, train_featvs, train_labels = unfold_corpus(train_corpus)
    valid_sentcs, valid_featvs, valid_labels = unfold_corpus(valid_corpus)
    test_sentcs, test_featvs, test_labels = unfold_corpus(test_corpus)

    # vocabularies
    feats2idx = dicts['feats2idx']
    words2idx = dicts['words2idx']
    label2idx = dicts['label2idx']
    FLAGS.label2idx = label2idx
    FLAGS.words2idx = words2idx
    FLAGS.feats2idx = feats2idx
    FLAGS.feat_size = len(feats2idx)

    print "Lexical word size:     %d" % len(words2idx)
    print "Label size:            %d" % len(label2idx)
    print "Features size:         %d" % len(feats2idx)
    print "-------------------------------------------------------------------"
    print "Training data size:    %d" % len(train_corpus)
    print "Validation data size:  %d" % len(valid_corpus)
    print "Test data size:        %d" % len(test_corpus)
    print "Maximum sentence len:  %d" % FLAGS.max_len

    del train_corpus
    del valid_corpus
    # del test_corpus

    # neural network's output_dim
    nb_classes = len(label2idx)
    FLAGS.nb_classes = nb_classes + 1

    # Embedding layer's input_dim
    nb_words = len(words2idx)
    FLAGS.nb_words = nb_words
    FLAGS.in_dim = FLAGS.nb_words + 1

    # load embeddings from file
    print "#" * 67
    print "# Reading embeddings from file: %s" % (FLAGS.emb_file)
    emb_mat, idx_map = read_emb_from_file(FLAGS.emb_file, words2idx)
    FLAGS.emb_dim = max(emb_mat.shape[1], FLAGS.emb_dim)
    print "embeddings' size:", emb_mat.shape
    if FLAGS.fine_tuning:
        print "The embeddings will be fine-tuned!"

    idx2label = dict((k, v) for v, k in FLAGS.label2idx.iteritems())
    # idx2words = dict((k, v) for v, k in FLAGS.words2idx.iteritems())

    # convert corpus from strings to index seqs, post-padded with 0
    print "Preparing training, validation and test data."
    train_X, train_F, train_Y = conv_corpus(train_sentcs,
                                            train_featvs,
                                            train_labels,
                                            words2idx,
                                            feats2idx,
                                            label2idx,
                                            max_len=max_len)
    valid_X, valid_F, valid_Y = conv_corpus(valid_sentcs,
                                            valid_featvs,
                                            valid_labels,
                                            words2idx,
                                            feats2idx,
                                            label2idx,
                                            max_len=max_len)
    test_X, test_F, test_Y = conv_corpus(test_sentcs,
                                         test_featvs,
                                         test_labels,
                                         words2idx,
                                         feats2idx,
                                         label2idx,
                                         max_len=max_len)

    del train_sentcs, train_featvs, train_labels
    del valid_sentcs, valid_featvs, valid_labels
    # del test_sentcs, test_featvs, test_labels

    print "#" * 67
    print "Training arguments"
    print "#" * 67
    print "L2 regular:    %f" % FLAGS.l2_reg
    print "nb_classes:    %d" % FLAGS.nb_classes
    print "Batch size:    %d" % FLAGS.batch_size
    print "Hidden layer:  %d" % FLAGS.hidden_dim
    print "Train epochs:  %d" % FLAGS.train_steps
    print "Learning rate: %f" % FLAGS.lr

    print "#" * 67
    print "Training process starts."
    print "#" * 67

    # if FLAGS.model == 'LSTM':
    #     Model_type = tagger.LSTM_NER
    # elif FLAGS.model == 'BLSTM':
    #     Model_type = tagger.Bi_LSTM_NER
    # elif FLAGS.model == 'CNNBLSTM':
    #     Model_type = tagger.CNN_Bi_LSTM_NER
    # else:
    #     raise TypeError("Unknown model type %s" % FLAGS.model)

    model = Hybrid_LSTM_tagger(nb_words, FLAGS.emb_dim, emb_mat,
                               FLAGS.feat_size, FLAGS.hidden_dim,
                               FLAGS.nb_classes, FLAGS.max_len,
                               FLAGS.fine_tuning,
                               FLAGS.dropout, FLAGS.batch_size,
                               len(template.template), FLAGS.window,
                               FLAGS.l2_reg)

    pred_test, test_loss, test_acc = model.run(train_X, train_F, train_Y,
                                               train_lens, valid_X, valid_F,
                                               valid_Y, valid_lens, test_X,
                                               test_F, test_Y, test_lens,
                                               FLAGS)

    print "Test loss: %f, accuracy: %f" % (test_loss, test_acc)
    # pred_test = [pred_test[i][:test_lens[i]] for i in xrange(len(pred_test))]
    pred_test_label = convert_id_to_word(pred_test, idx2label)
    if FLAGS.eval_test:
        res_test, pred_test_label = evaluate(pred_test_label, test_labels)
        print "Test F1: %f, P: %f, R: %f" % (res_test['f1'], res_test['p'],
                                             res_test['r'])
    original_text = [[item['w'] for item in sent] for sent in test_corpus]
    write_prediction(FLAGS.output_dir + 'prediction.utf8', original_text,
                     pred_test_label)

    print "Saving feature dicts..."
    save_dicts(FLAGS.output_dir, FLAGS.feats2idx, FLAGS.words2idx,
               FLAGS.label2idx)
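
conv_corpus itself is not shown in these examples; per the comments it maps each token to its vocabulary id and post-pads every sequence with 0 up to max_len. A rough, hypothetical sketch of that behavior (to_padded_ids and its unk handling are illustrative, not the repo's actual implementation):

# Hypothetical sketch of the conversion the comments describe:
# look up each token's id, then post-pad with 0 up to max_len.
def to_padded_ids(sentences, token2idx, max_len, unk=0):
    out = []
    for sent in sentences:
        ids = [token2idx.get(tok, unk) for tok in sent]
        out.append(ids[:max_len] + [0] * max(0, max_len - len(ids)))
    return out

words2idx = {'the': 1, 'cat': 2, 'sat': 3}
print(to_padded_ids([['the', 'cat'], ['sat']], words2idx, max_len=4))
# [[1, 2, 0, 0], [3, 0, 0, 0]]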
Example #3
def main(_):
    np.random.seed(1337)
    random.seed(1337)

    if FLAGS.only_test or FLAGS.train_steps == 0:
        FLAGS.train_steps = 0
        test(FLAGS)
        return

    print "#" * 67
    print "# Loading data from:"
    print "#" * 67
    print "Train:", FLAGS.train_data
    print "Valid:", FLAGS.valid_data
    print "Test: ", FLAGS.test_data

    # Choose field templates & feature templates
    template = Template(FLAGS.template, prefix=False)
    # pretreatment process: read, split and create vocabularies
    train_set, valid_set, test_set, dicts, max_len = pretreatment(
        FLAGS.train_data, FLAGS.valid_data, FLAGS.test_data,
        threshold=0, template=template)

    # Reset the maximum sentence length
    # max_len = max(MAX_LEN, max_len)
    FLAGS.max_len = max_len

    # unfold these corpora
    train_corpus, train_lens = train_set
    valid_corpus, valid_lens = valid_set
    test_corpus, test_lens = test_set
    train_sentcs, train_featvs, train_labels = unfold_corpus(train_corpus)
    valid_sentcs, valid_featvs, valid_labels = unfold_corpus(valid_corpus)
    test_sentcs, test_featvs, test_labels = unfold_corpus(test_corpus)

    # vocabularies
    feats2idx = dicts['feats2idx']
    words2idx = dicts['words2idx']
    label2idx = dicts['label2idx']
    FLAGS.label2idx = label2idx
    FLAGS.words2idx = words2idx
    FLAGS.feats2idx = feats2idx

    print "Lexical word size:     %d" % len(feats2idx)
    print "Label size:            %d" % len(label2idx)
    print "-------------------------------------------------------------------"
    print "Training data size:    %d" % len(train_corpus)
    print "Validation data size:  %d" % len(valid_corpus)
    print "Test data size:        %d" % len(test_corpus)
    print "Maximum sentence len:  %d" % FLAGS.max_len

    del train_corpus
    del valid_corpus
    # del test_corpus

    # neural network's output_dim
    nb_classes = len(label2idx)
    FLAGS.nb_classes = nb_classes + 1

    # Embedding layer's input_dim
    nb_words = len(words2idx)
    FLAGS.nb_words = nb_words
    FLAGS.in_dim = FLAGS.nb_words + 1

    # load embeddings from file
    print "#" * 67
    print "# Reading embeddings from file: %s" % (FLAGS.emb_file)
    emb_mat, idx_map = read_emb_from_file(FLAGS.emb_file, feats2idx)
    FLAGS.emb_dim = max(emb_mat.shape[1], FLAGS.emb_dim)
    print "embeddings' size:", emb_mat.shape
    if FLAGS.fine_tuning:
        print "The embeddings will be fine-tuned!"

    idx2label = dict((k, v) for v, k in FLAGS.label2idx.iteritems())
    # idx2words = dict((k, v) for v, k in FLAGS.words2idx.iteritems())

    # convert corpus from strings to index seqs, post-padded with 0
    print "Preparing training, validation and test data."
    train_X, train_F, train_Y = conv_corpus(
        train_sentcs, train_featvs, train_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    valid_X, valid_F, valid_Y = conv_corpus(
        valid_sentcs, valid_featvs, valid_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    test_X, test_F, test_Y = conv_corpus(
        test_sentcs, test_featvs, test_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)

    # Release memory
    del train_sentcs, train_featvs, train_labels
    del valid_sentcs, valid_featvs, valid_labels
    # del test_sentcs, test_featvs, test_labels

    del train_X, valid_X, test_X

    print "#" * 67
    print "Training arguments"
    print "#" * 67
    print "L2 regular:    %f" % FLAGS.l2_reg
    print "nb_classes:    %d" % FLAGS.nb_classes
    print "Batch size:    %d" % FLAGS.batch_size
    print "Hidden layer:  %d" % FLAGS.hidden_dim
    print "Train epochs:  %d" % FLAGS.train_steps
    print "Learning rate: %f" % FLAGS.lr

    print "#" * 67
    print "Training process starts."
    print "#" * 67

    if FLAGS.model == 'LSTM':
        Model_type = tagger.LSTM_NER
    elif FLAGS.model == 'BLSTM':
        Model_type = tagger.Bi_LSTM_NER
    elif FLAGS.model == 'CNNBLSTM':
        Model_type = tagger.CNN_Bi_LSTM_NER
    else:
        raise TypeError("Unknown model type %s" % FLAGS.model)

    model = Model_type(
        nb_words, FLAGS.emb_dim, emb_mat, FLAGS.hidden_dim,
        FLAGS.nb_classes, FLAGS.dropout, FLAGS.batch_size,
        FLAGS.max_len, len(template.template), FLAGS.l2_reg,
        FLAGS.fine_tuning)

    pred_test, test_loss, test_acc = model.run(
        train_F, train_Y, train_lens,
        valid_F, valid_Y, valid_lens,
        test_F, test_Y, test_lens,
        FLAGS)

    print "Test loss: %f, accuracy: %f" % (test_loss, test_acc)
    pred_test = [pred_test[i][:test_lens[i]] for i in xrange(len(pred_test))]
    pred_test_label = convert_id_to_word(pred_test, idx2label)
    if FLAGS.eval_test:
        res_test, pred_test_label = evaluate(pred_test_label, test_labels)
        print "Test F1: %f, P: %f, R: %f" % (res_test['f1'], res_test['p'], res_test['r'])
    original_text = [[item['w'] for item in sent] for sent in test_corpus]
    write_prediction(FLAGS.output_dir + 'prediction.utf8',
                     original_text, pred_test_label)

    print "Saving feature dicts..."
    save_dicts(FLAGS.output_dir, FLAGS.feats2idx,
               FLAGS.words2idx, FLAGS.label2idx)
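
evaluate() above returns precision, recall and F1 but is not shown here; sequence-labeling scores of this kind are usually computed at the entity/chunk level (conlleval-style). As a rough orientation only, here is a token-level version of the same three numbers (a simplification, not the repo's evaluate):

# Token-level precision/recall/F1 sketch; the real evaluate() likely
# matches whole entities rather than individual tags.
def prf1(gold, pred, outside='O'):
    tp = sum(1 for g, p in zip(gold, pred) if g == p and g != outside)
    n_pred = sum(1 for p in pred if p != outside)
    n_gold = sum(1 for g in gold if g != outside)
    p = tp / float(n_pred) if n_pred else 0.0
    r = tp / float(n_gold) if n_gold else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1

print(prf1(['B', 'I', 'O', 'O'], ['B', 'O', 'O', 'O']))
# (1.0, 0.5, 0.6666666666666666)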