Example #1
 def create_NER_model(self):
     ner_model = Sequential()
     # With keras_contrib 2.0.8 and keras 2.2.5, setting mask_zero=True raises
     # "Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match."
     # Downgrading to keras 2.2.4 resolves the error.
     embedding = Embedding(input_dim=VOCAB_SIZE,
                           output_dim=EMBED_DIM,
                           mask_zero=False,
                           embeddings_initializer=constant(
                               load_word2vec_embedding(config.vocab_size)))
     ner_model.add(embedding)
     ner_model.add(Masking(mask_value=config.src_padding))
     ner_model.add(
         Bidirectional(
             LSTM(BiRNN_UNITS // 2,
                  return_sequences=True,
                  dropout=DROPOUT_RATE)))
     crf = CRF(len(LABEL_DIC), sparse_target=True)
     ner_model.add(crf)
     # Either of the following loss/metric formulations works:
     ner_model.compile(Adam(lr=LEARN_RATE, decay=1e-3),
                       loss=crf_loss,
                       metrics=[crf_accuracy])
     # ner_model.compile(Adam(lr=LEARN_RATE), loss=crf.loss_function, metrics=[crf.accuracy])
     return ner_model
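
The load_word2vec_embedding helper used for the constant initializer above is not shown. A minimal sketch, assuming a gensim-readable word2vec binary and that the last two rows of the matrix are reserved for the unknown and padding ids (the path and embedding size below are placeholders, not the project's values):

import numpy as np
from gensim.models import KeyedVectors

def load_word2vec_embedding(vocab_size, path='word2vec.bin', embed_dim=300):
    # Read pretrained vectors and copy them into a (vocab_size + 2, embed_dim) matrix.
    kv = KeyedVectors.load_word2vec_format(path, binary=True)
    matrix = np.random.uniform(-0.25, 0.25, (vocab_size + 2, embed_dim))
    for i, word in enumerate(kv.index2word[:vocab_size]):
        matrix[i] = kv[word]
    matrix[vocab_size + 1] = 0.0  # keep the padding row all-zero
    return matrix.astype(np.float32)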
Example #2
def main():
    print(args)

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size

    data_dir = args.data_dir
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    """
	vocab_file = os.path.join(data_dir, "vocab.json")
	if not os.path.exists(vocab_file):
		utils.build_vocab([TRAIN_X, TRAIN_Y], vocab_file, n_vocab=80000)
	vocab = json.load(open(vocab_file))
	"""

    embedding_path = '/home/kaiying/coco/embeddings/giga-256d.bin'
    vocab, embeddings = utils.load_word2vec_embedding(embedding_path)
    print(len(vocab), embeddings.shape)

    train_x = BatchManager(load_data(TRAIN_X, vocab, N_TRAIN), BATCH_SIZE)
    train_y = BatchManager(load_data(TRAIN_Y, vocab, N_TRAIN), BATCH_SIZE)

    valid_x = BatchManager(load_data(VALID_X, vocab, N_VALID), BATCH_SIZE)
    valid_y = BatchManager(load_data(VALID_Y, vocab, N_VALID), BATCH_SIZE)

    model = Model(vocab, emb_dim=256, hid_dim=512,
                  embeddings=embeddings).cuda()
    # model.embedding_look_up.to(torch.device("cpu"))

    ckpt_file = args.ckpt_file
    saved_state = {'lr': 0.001, 'epoch': 0}
    if os.path.exists(ckpt_file):
        saved_state = torch.load(ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Loaded model parameters from %s' % ckpt_file)

    optimizer = torch.optim.Adam(model.parameters(), lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=1,
                                                gamma=0.5)
    scheduler.step()

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          saved_state['epoch'], N_EPOCHS)
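
BatchManager and load_data above are project helpers. A minimal sketch of a BatchManager under the assumption that it simply slices a list of id sequences into fixed-size batches (Example #8 passes a vocab as a third argument, so the real signature differs):

class BatchManager:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        self.steps = (len(data) + batch_size - 1) // batch_size  # number of batches per epoch

    def __iter__(self):
        for i in range(self.steps):
            yield self.data[i * self.batch_size:(i + 1) * self.batch_size]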
Example #3
def main():

    N_TEST = args.n_test
    BATCH_SIZE = args.batch_size

    # vocab = json.load(open('sumdata/vocab.json'))

    embedding_path = '/home/kaiying/coco/embeddings/giga-256d.bin'
    vocab, embeddings = utils.load_word2vec_embedding(embedding_path)

    test_x = BatchManager(load_data(args.input_file, vocab, N_TEST),
                          BATCH_SIZE)
    # model = Seq2SeqAttention(len(vocab), EMB_DIM, HID_DIM, BATCH_SIZE, vocab, max_trg_len=25).cuda()
    model = Model(vocab, emb_dim=256, hid_dim=512,
                  embeddings=embeddings).cuda()
    model.eval()

    file = args.ckpt_file
    if os.path.exists(file):
        saved_state = torch.load(file)
        model.load_state_dict(saved_state['state_dict'])
        print('Loaded model parameters from %s' % file)

        my_test(test_x, model)
Example #4
        projection = tf.add(tf.matmul(x_reshape, W), b, name='projection')
        nsteps = tf.shape(outputs)[1]
        # reshape back to [batch, nsteps, TAGS_NUM]; -1 lets TF infer the batch dimension
        self.outputs = tf.reshape(projection, [-1, nsteps, TAGS_NUM], name='output')

        self.log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            self.outputs, self.y, self.seq_length)
        self.transition_params = tf.add(self.transition_params, 0, name='transition_params')
        # Add a training op to tune the parameters.
        self.loss = tf.reduce_mean(-self.log_likelihood)
        self.train_op = tf.train.AdamOptimizer(LEARN_RATE).minimize(self.loss)
        tf.summary.scalar('loss', self.loss)

# Train the neural network

embedding = load_word2vec_embedding(config.vocab_size)
net = NER_net(embedding)


with tf.Session() as sess:
    merged = tf.summary.merge_all()  # merge all summaries (graph, training metrics, etc.)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)  # write the training logs under the log directory
    sess.run(tf.global_variables_initializer())
    print(dataset.get_step())
    for i in range(dataset.get_step()):
        x_train, y_train, x_test, y_test = dataset.next_batch(args.BATCH)

        max_sentence_length = max(map(len, x_train))
        sequence_len = np.asarray([len(x) for x in x_train])
        # pad every sentence with config.src_padding up to the longest sentence in the batch
        x_train = np.asarray([list(x[:]) + (max_sentence_length - len(x)) * [config.src_padding] for x in x_train])
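
The snippet above exports transition_params so it can be reused at prediction time. A hedged sketch (names like unary_scores and decode_batch are illustrative, not the project's) of how the unary scores and the transition matrix could be combined with Viterbi decoding in TF 1.x:

from tensorflow.contrib.crf import viterbi_decode

def decode_batch(unary_scores, sequence_len, trans):
    # unary_scores: [batch, nsteps, TAGS_NUM] array from net.outputs,
    # trans: [TAGS_NUM, TAGS_NUM] transition matrix, sequence_len: true lengths.
    predictions = []
    for score, length in zip(unary_scores, sequence_len):
        viterbi_seq, _ = viterbi_decode(score[:length], trans)
        predictions.append(viterbi_seq)
    return predictions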
Example #5
File: rnn.py Project: zeal4u/NER
    args = parser.parse_args()
    config.FLAGS.model_path = args.modelpath
    config.FLAGS.action = args.action
    config.FLAGS.pred_file = args.data

    action = config.FLAGS.action

    # Get the total number of words (the vocabulary size).
    vocab_size = get_src_vocab_size()
    src_unknown_id = tgt_unknown_id = vocab_size
    src_padding = vocab_size + 1

    src_vocab_table, tgt_vocab_table = create_vocab_tables(src_vocab_file, tgt_vocab_file, src_unknown_id,
                                                           tgt_unknown_id)
    embedding = load_word2vec_embedding(vocab_size)

    if action == 'train':
        iterator = get_iterator(src_vocab_table, tgt_vocab_table, vocab_size, BATCH_SIZE)
    elif action == 'predict':
        BATCH_SIZE = 1
        DROPOUT_RATE = 1.0
        iterator = get_predict_iterator(src_vocab_table, vocab_size, BATCH_SIZE)
    else:
        print("Only 'train' and 'predict' actions are supported.")
        exit(0)

    tag_table = tag_to_id_table()
    net = NER_net("ner", iterator, embedding, BATCH_SIZE)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
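
create_vocab_tables is not shown in this example. A sketch of how it is commonly written with TF 1.x lookup ops, assuming each vocab file has one token per line and unseen tokens map to the unknown ids:

from tensorflow.contrib import lookup

def create_vocab_tables(src_vocab_file, tgt_vocab_file, src_unknown_id, tgt_unknown_id):
    # Map each token to its line index in the vocab file; default_value handles OOV tokens.
    src_table = lookup.index_table_from_file(src_vocab_file, default_value=src_unknown_id)
    tgt_table = lookup.index_table_from_file(tgt_vocab_file, default_value=tgt_unknown_id)
    return src_table, tgt_table

Note that such lookup tables also need sess.run(tf.tables_initializer()) before the iterator is used.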
Example #6
import tensorflow as tf
import numpy as np
from tensorflow.contrib.rnn import static_bidirectional_rnn
from tensorflow.contrib.rnn import DropoutWrapper
import utils


DATA_PATH = '../retokenized_corpus.txt'
# FEATURE_NUM = 64
BATCH_SIZE = 128
EMBEDDING_SIZE = unit_num = 300         # By default the word-vector size equals the number of units in the RNN (per time step) and in the CNN (per column); to avoid confusion the model uses unit_num for it throughout.
MAX_SEQUENCE_SIZE = time_step = 100     # The maximum sentence length equals the number of time steps; to avoid confusion the model uses time_step for it throughout.
DROPOUT_RATE = None
EPOCH = 20000

embeddings = utils.load_word2vec_embedding()
word_to_id_table, id_to_word_table, tag_to_id_table, id_to_tag_table = utils.build_word_tag_tables()
all_sentences, all_tags = \
    utils.get_sentences(word_to_id_table, tag_to_id_table, max_sequence=MAX_SEQUENCE_SIZE)

TAGS_NUM = len(tag_to_id_table)


class NER_net:
    def __init__(self, scope_name, batch_size):
        self.batch_size = batch_size
        with tf.variable_scope(scope_name) as scope:
            self._build_net()

    def _build_net(self):
        self.x = tf.placeholder(tf.float32, [None, time_step, unit_num])
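
Because self.x is a float placeholder of shape [None, time_step, unit_num], the embedding lookup happens outside the graph. A hedged sketch (embed_batch is an illustrative helper, not project code) of turning a batch of word-id sentences into that input:

import numpy as np

def embed_batch(sentences, embeddings, time_step, unit_num):
    # Rows beyond a sentence's length stay zero, which acts as padding up to time_step.
    batch = np.zeros((len(sentences), time_step, unit_num), dtype=np.float32)
    for i, sent in enumerate(sentences):
        for j, word_id in enumerate(sent[:time_step]):
            batch[i, j] = embeddings[word_id]
    return batch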
Example #7
def main():
    parser = argparse.ArgumentParser(
        description='Linear Support Vector Classification Model')
    parser.add_argument('--embedding',
                        default='embedding/glove.6B.50d.subset.oov.vec',
                        help='Path to the embedding')
    parser.add_argument('--train',
                        default='data/train.tsv',
                        help='Path to training data')
    parser.add_argument('--dev',
                        default='data/dev.tsv',
                        help='Path to dev data')
    parser.add_argument('--test',
                        default='data/test.tsv',
                        help='Path to test data')
    parser.add_argument('--predict',
                        default='results/svm-subj-full.result',
                        help='Path to the prediction file')

    args = parser.parse_args()
    embedding, embed_dim = dp.load_word2vec_embedding(args.embedding)

    X_train, y_train = dp.load_data(args.train, textindex=1, labelindex=0)
    X_dev, y_dev = dp.load_data(args.dev, textindex=1, labelindex=0)
    X_test, y_test = dp.load_data(args.test, textindex=1, labelindex=0)

    # Get index-word/label dicts for lookup:
    vocab_dict = dp.get_index_dict(X_train + X_dev + X_test)

    # Replace words / labels in the data by the according index
    vocab_dict_flipped = dict((v, k) for k, v in vocab_dict.items())

    # Get indexed data and labels
    X_train_index = [[vocab_dict_flipped[word] for word in chunk]
                     for chunk in X_train]
    X_dev_index = [[vocab_dict_flipped[word] for word in chunk]
                   for chunk in X_dev]
    X_test_index = [[vocab_dict_flipped[word] for word in chunk]
                    for chunk in X_test]

    # Get embedding matrix:
    embed_matrix = dp.get_embedding_matrix(embedding, vocab_dict)

    # Represent each example by the average of its word vectors:
    X_train_embedded = [
        np.mean([embed_matrix[element] for element in example], axis=0)
        for example in X_train_index
    ]
    X_dev_embedded = [
        np.mean([embed_matrix[element] for element in example], axis=0)
        for example in X_dev_index
    ]
    X_test_embedded = [
        np.mean([embed_matrix[element] for element in example], axis=0)
        for example in X_test_index
    ]

    print("Loaded data.")

    # Tune C on the dev set, test on the test set:
    best_acc = 0.0
    best_c_acc = 0.0

    for c in [0.001, 0.01, 0.1, 1, 2, 4, 8, 16, 32, 64, 128, 256]:
        model_svr = SVC(C=c, kernel='linear', probability=True)
        model_svr.fit(X_train_embedded, y_train)

        # Use dev set to tune our hyperparameters
        pred_svr = model_svr.predict(X_dev_embedded)
        true_svr = y_dev

        acc = accuracy_score(true_svr, pred_svr)

        if acc > best_acc:
            best_acc = acc
            best_c_acc = c

    print("Best dev score: ", best_acc)

    # Test best model on test set
    best_model = SVC(C=best_c_acc, kernel='linear', probability=True)
    best_model.fit(X_train_embedded, y_train)

    best_pred = best_model.predict(X_test_embedded)

    test_acc = accuracy_score(y_test, best_pred)

    outlog = open(args.predict, 'w')
    outlog.write('true\tpred\n')
    for true, pred in zip(y_test, best_pred):
        outlog.write('{}\t{}\n'.format(true, pred))
    outlog.close()

    print("Best C: ", best_c_acc)
    print("Test score: ", test_acc)

    print("Done")
Example #8
def main():
    print(args)

    # local
    """
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    EVAL_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    EVAL_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    """

    # server
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article_01_new.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title_01_new.txt')
    VALID_X = os.path.join(data_dir, 'train/train.article_000_new.txt')
    VALID_Y = os.path.join(data_dir, 'train/train.title_000_new.txt')
    EVAL_X = os.path.join(data_dir, 'train/train.article_001_new.txt')
    EVAL_Y = os.path.join(data_dir, 'train/train.title_001_new.txt')

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    # bert embeddings
    emb_file = 'sumdata/bert-large-uncased.30522.1024d.vec'
    vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 101
    max_tgt_len = 47

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid
    n_eval = args.n_eval

    # vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)
    eval_x = BatchManager(load_data(EVAL_X, max_src_len, n_eval), bs, vocab)
    eval_y = BatchManager(load_data(EVAL_Y, max_tgt_len, n_eval), bs, vocab)
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 6, 300,
    #                     50, 50, 1200, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 300, 50, 50, 1200, False).cuda()
    model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 1024, 50, 50, 1200, False, embeddings=embeddings).cuda()

    # print(model)
    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Loaded model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # with last_epoch=-1, this first step() does not change the lr yet

    # eval_model(valid_x, valid_y, vocab, model)
    # train(train_x, train_y, valid_x, valid_y, model, optimizer, vocab, scheduler, args.n_epochs, saved_state['epoch'])
    myeval(eval_x, eval_y, vocab, model)
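
build_vocab above writes small_vocab.json when it does not exist yet. A hedged sketch of such a helper, assuming it keeps the most frequent tokens; the special tokens below are placeholders rather than the project's actual choices:

import json
from collections import Counter

def build_vocab(files, vocab_file, vocab_size=80000):
    counter = Counter()
    for path in files:
        with open(path) as f:
            for line in f:
                counter.update(line.split())
    # Reserve ids for special tokens, then keep the vocab_size most frequent words.
    words = ['<pad>', '<unk>', '<s>', '</s>'] + [w for w, _ in counter.most_common(vocab_size)]
    vocab = {w: i for i, w in enumerate(words)}
    with open(vocab_file, 'w') as f:
        json.dump(vocab, f)
    return vocab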