# Imports used by this excerpt:
from keras.models import Sequential
from keras.layers import Embedding, Masking, Bidirectional, LSTM
from keras.initializers import constant
from keras.optimizers import Adam
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy


def create_NER_model(self):
    ner_model = Sequential()
    # With keras_contrib 2.0.8 and keras 2.2.5, setting mask_zero=True raises:
    #   Tensors in list passed to 'values' of 'ConcatV2' Op have types
    #   [bool, float32] that don't all match.
    # Downgrading to keras 2.2.4 fixes this.
    embedding = Embedding(input_dim=VOCAB_SIZE,
                          output_dim=EMBED_DIM,
                          mask_zero=False,
                          embeddings_initializer=constant(
                              load_word2vec_embedding(config.vocab_size)))
    ner_model.add(embedding)
    ner_model.add(Masking(mask_value=config.src_padding))
    ner_model.add(
        Bidirectional(
            LSTM(BiRNN_UNITS // 2, return_sequences=True, dropout=DROPOUT_RATE)))
    crf = CRF(len(LABEL_DIC), sparse_target=True)
    ner_model.add(crf)
    # Either of the following loss/metric formulations works:
    ner_model.compile(Adam(lr=LEARN_RATE, decay=1e-3),
                      loss=crf_loss,
                      metrics=[crf_accuracy])
    # ner_model.compile(Adam(lr=LEARN_RATE), loss=crf.loss_function,
    #                   metrics=[crf.accuracy])
    return ner_model
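# load_word2vec_embedding is project code not shown in this excerpt. Below is
# a minimal sketch of what it plausibly does, assuming a text-format word2vec
# file and two extra randomly initialized rows for the unknown and padding ids
# (matching the src_unknown_id = vocab_size and src_padding = vocab_size + 1
# convention used further below); the path, dimension, and fallback scheme are
# assumptions, not the project's actual code.
import numpy as np

def load_word2vec_embedding(vocab_size, embed_dim=300, path='word2vec.vec'):
    # Rows 0..vocab_size-1 hold pretrained vectors; the last two rows
    # (unknown, padding) keep their random initialization.
    embeddings = np.random.uniform(-0.25, 0.25, (vocab_size + 2, embed_dim))
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= vocab_size:
                break
            parts = line.rstrip().split(' ')
            embeddings[i] = np.asarray(parts[1:], dtype=np.float32)
    return embeddings.astype(np.float32)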
def main():
    print(args)
    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size

    data_dir = args.data_dir
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    """
    vocab_file = os.path.join(data_dir, "vocab.json")
    if not os.path.exists(vocab_file):
        utils.build_vocab([TRAIN_X, TRAIN_Y], vocab_file, n_vocab=80000)
    vocab = json.load(open(vocab_file))
    """
    embedding_path = '/home/kaiying/coco/embeddings/giga-256d.bin'
    vocab, embeddings = utils.load_word2vec_embedding(embedding_path)
    print(len(vocab), embeddings.shape)

    train_x = BatchManager(load_data(TRAIN_X, vocab, N_TRAIN), BATCH_SIZE)
    train_y = BatchManager(load_data(TRAIN_Y, vocab, N_TRAIN), BATCH_SIZE)
    valid_x = BatchManager(load_data(VALID_X, vocab, N_VALID), BATCH_SIZE)
    valid_y = BatchManager(load_data(VALID_Y, vocab, N_VALID), BATCH_SIZE)

    model = Model(vocab, emb_dim=256, hid_dim=512, embeddings=embeddings).cuda()
    # model.embedding_look_up.to(torch.device("cpu"))

    ckpt_file = args.ckpt_file
    saved_state = {'lr': 0.001, 'epoch': 0}
    if os.path.exists(ckpt_file):
        saved_state = torch.load(ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % ckpt_file)

    optimizer = torch.optim.Adam(model.parameters(), lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
    scheduler.step()
    train(train_x, train_y, valid_x, valid_y, model, optimizer,
          scheduler, saved_state['epoch'], N_EPOCHS)
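# utils.load_word2vec_embedding is not shown; since giga-256d.bin is a binary
# word2vec file, a plausible sketch uses gensim (an assumption -- the project
# may parse the file by hand). It returns the (vocab, embeddings) pair
# consumed above.
import numpy as np
from gensim.models import KeyedVectors

def load_word2vec_embedding(path):
    kv = KeyedVectors.load_word2vec_format(path, binary=True)
    # gensim < 4.0 exposes index2word; gensim >= 4.0 renamed it index_to_key
    words = getattr(kv, 'index_to_key', None) or kv.index2word
    vocab = {word: idx for idx, word in enumerate(words)}
    return vocab, np.asarray(kv.vectors, dtype=np.float32)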
def main():
    N_TEST = args.n_test
    BATCH_SIZE = args.batch_size

    # vocab = json.load(open('sumdata/vocab.json'))
    embedding_path = '/home/kaiying/coco/embeddings/giga-256d.bin'
    vocab, embeddings = utils.load_word2vec_embedding(embedding_path)
    test_x = BatchManager(load_data(args.input_file, vocab, N_TEST), BATCH_SIZE)

    # model = Seq2SeqAttention(len(vocab), EMB_DIM, HID_DIM, BATCH_SIZE,
    #                          vocab, max_trg_len=25).cuda()
    model = Model(vocab, emb_dim=256, hid_dim=512, embeddings=embeddings).cuda()
    model.eval()

    file = args.ckpt_file
    if os.path.exists(file):
        saved_state = torch.load(file)
        model.load_state_dict(saved_state['state_dict'])
        print('Load model parameters from %s' % file)

    my_test(test_x, model)
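# BatchManager is also project code that is not shown. A minimal sketch
# consistent with its usage in these two snippets (wrap pre-loaded id
# sequences, serve fixed-size padded batches); the pad id, method names, and
# cycling behavior are assumptions, and the last snippet below passes an
# extra vocab argument, so the real class likely differs.
import numpy as np

class BatchManager:
    def __init__(self, data, batch_size, pad_id=0):
        self.data = data
        self.batch_size = batch_size
        self.pad_id = pad_id
        self.steps = (len(data) + batch_size - 1) // batch_size
        self.bid = 0

    def next_batch(self):
        batch = self.data[self.bid * self.batch_size:(self.bid + 1) * self.batch_size]
        self.bid = (self.bid + 1) % self.steps  # cycle through the data
        max_len = max(len(seq) for seq in batch)
        return np.asarray([seq + [self.pad_id] * (max_len - len(seq))
                           for seq in batch])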
projection = tf.add(tf.matmul(x_reshape, W), b, name='projection')
nsteps = tf.shape(outputs)[1]
# -1 recovers the batch dimension; nsteps restores the time-step axis
self.outputs = tf.reshape(projection, [-1, nsteps, TAGS_NUM], name='output')

self.log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
    self.outputs, self.y, self.seq_length)
# Identity add, only to give the transition matrix a stable tensor name.
self.transition_params = tf.add(self.transition_params, 0,
                                name='transition_params')
# Add a training op to tune the parameters.
self.loss = tf.reduce_mean(-self.log_likelihood)
self.train_op = tf.train.AdamOptimizer(LEARN_RATE).minimize(self.loss)
tf.summary.scalar('loss', self.loss)

# Train the network.
embedding = load_word2vec_embedding(config.vocab_size)
net = NER_net(embedding)
with tf.Session() as sess:
    merged = tf.summary.merge_all()  # merge graph and training summaries
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)  # write training logs to the logs folder
    sess.run(tf.global_variables_initializer())
    print(dataset.get_step())
    for i in range(dataset.get_step()):
        x_train, y_train, x_test, y_test = dataset.next_batch(args.BATCH)

        max_sentence_length = max(map(len, x_train))
        sequence_len = np.asarray([len(x) for x in x_train])
        # padding
        x_train = np.asarray([list(x[:]) + (max_sentence_length - len(x)) * [config.src_padding]
                              for x in x_train])
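# Decoding is not part of this excerpt. At inference time the learned
# transition_params are typically combined with the per-step scores via
# Viterbi decoding; a sketch (the placeholder names net.x and net.seq_length
# are assumptions):
import tensorflow as tf

def decode(sess, net, x_batch, seq_lengths):
    scores, trans = sess.run(
        [net.outputs, net.transition_params],
        feed_dict={net.x: x_batch, net.seq_length: seq_lengths})
    paths = []
    for score, length in zip(scores, seq_lengths):
        # viterbi_decode runs on numpy arrays, outside the graph
        path, _ = tf.contrib.crf.viterbi_decode(score[:length], trans)
        paths.append(path)
    return paths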
args = parser.parse_args()
config.FLAGS.model_path = args.modelpath
config.FLAGS.action = args.action
config.FLAGS.pred_file = args.data
action = config.FLAGS.action

# Get the total number of words in the vocabulary.
vocab_size = get_src_vocab_size()
src_unknown_id = tgt_unknown_id = vocab_size
src_padding = vocab_size + 1

src_vocab_table, tgt_vocab_table = create_vocab_tables(
    src_vocab_file, tgt_vocab_file, src_unknown_id, tgt_unknown_id)
embedding = load_word2vec_embedding(vocab_size)

if action == 'train':
    iterator = get_iterator(src_vocab_table, tgt_vocab_table, vocab_size, BATCH_SIZE)
elif action == 'predict':
    BATCH_SIZE = 1
    DROPOUT_RATE = 1.0
    iterator = get_predict_iterator(src_vocab_table, vocab_size, BATCH_SIZE)
else:
    print('Only the train and predict actions are supported.')
    exit(0)

tag_table = tag_to_id_table()
net = NER_net("ner", iterator, embedding, BATCH_SIZE)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
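# create_vocab_tables is not shown; given the unknown-id arguments, it
# plausibly wraps TensorFlow's lookup-table helper. A sketch (the exact body
# is an assumption):
import tensorflow as tf

def create_vocab_tables(src_vocab_file, tgt_vocab_file,
                        src_unknown_id, tgt_unknown_id):
    # Out-of-vocabulary words map to the unknown id via default_value.
    src_vocab_table = tf.contrib.lookup.index_table_from_file(
        src_vocab_file, default_value=src_unknown_id)
    tgt_vocab_table = tf.contrib.lookup.index_table_from_file(
        tgt_vocab_file, default_value=tgt_unknown_id)
    return src_vocab_table, tgt_vocab_table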
import tensorflow as tf
import numpy as np
from tensorflow.contrib.rnn import static_bidirectional_rnn
from tensorflow.contrib.rnn import DropoutWrapper
import utils

DATA_PATH = '../retokenized_corpus.txt'
# FEATURE_NUM = 64
BATCH_SIZE = 128
# By default the word-vector size equals the number of units in the RNN
# (per time step) and in the CNN (per column); to avoid confusion the model
# refers to it as unit_num throughout.
EMBEDDING_SIZE = unit_num = 300
# The maximum sentence length equals the number of time steps; to avoid
# confusion the model refers to it as time_step throughout.
MAX_SEQUENCE_SIZE = time_step = 100
DROPOUT_RATE = None
EPOCH = 20000

embeddings = utils.load_word2vec_embedding()
word_to_id_table, id_to_word_table, tag_to_id_table, id_to_tag_table = \
    utils.build_word_tag_tables()
all_sentences, all_tags = \
    utils.get_sentences(word_to_id_table, tag_to_id_table,
                        max_sequence=MAX_SEQUENCE_SIZE)
TAGS_NUM = len(tag_to_id_table)


class NER_net:
    def __init__(self, scope_name, batch_size):
        self.batch_size = batch_size
        with tf.variable_scope(scope_name) as scope:
            self._build_net()

    def _build_net(self):
        self.x = tf.placeholder(tf.float32, [None, time_step, unit_num])
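# self.x takes pre-embedded input of shape [batch, time_step, unit_num], so
# word ids are presumably mapped to vectors and padded outside the graph. A
# sketch of that preprocessing step (the function name and zero-padding
# scheme are assumptions; defaults mirror time_step=100 and unit_num=300):
import numpy as np

def embed_and_pad(sentence_ids, embeddings, max_len=100, dim=300):
    # Zero vectors double as padding for positions past the sentence end.
    batch = np.zeros((len(sentence_ids), max_len, dim), dtype=np.float32)
    for i, ids in enumerate(sentence_ids):
        for t, word_id in enumerate(ids[:max_len]):
            batch[i, t] = embeddings[word_id]
    return batch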
def main():
    parser = argparse.ArgumentParser(
        description='Linear Support Vector Classification Model')
    parser.add_argument('--embedding',
                        default='embedding/glove.6B.50d.subset.oov.vec',
                        help='Path to the embedding')
    parser.add_argument('--train', default='data/train.tsv',
                        help='Path to training data')
    parser.add_argument('--dev', default='data/dev.tsv',
                        help='Path to dev data')
    parser.add_argument('--test', default='data/test.tsv',
                        help='Path to test data')
    parser.add_argument('--predict', default='results/svm-subj-full.result',
                        help='Path to the prediction file')
    args = parser.parse_args()

    embedding, embed_dim = dp.load_word2vec_embedding(args.embedding)
    X_train, y_train = dp.load_data(args.train, textindex=1, labelindex=0)
    X_dev, y_dev = dp.load_data(args.dev, textindex=1, labelindex=0)
    X_test, y_test = dp.load_data(args.test, textindex=1, labelindex=0)

    # Get index-word/label dicts for lookup:
    vocab_dict = dp.get_index_dict(X_train + X_dev + X_test)
    # Replace words/labels in the data by the corresponding index:
    vocab_dict_flipped = dict((v, k) for k, v in vocab_dict.items())

    # Get indexed data and labels:
    X_train_index = [[vocab_dict_flipped[word] for word in chunk]
                     for chunk in X_train]
    X_dev_index = [[vocab_dict_flipped[word] for word in chunk]
                   for chunk in X_dev]
    X_test_index = [[vocab_dict_flipped[word] for word in chunk]
                    for chunk in X_test]

    # Get the embedding matrix:
    embed_matrix = dp.get_embedding_matrix(embedding, vocab_dict)

    # Represent each example by the average of its word vectors:
    X_train_embedded = [
        np.mean([embed_matrix[element] for element in example], axis=0)
        for example in X_train_index
    ]
    X_dev_embedded = [
        np.mean([embed_matrix[element] for element in example], axis=0)
        for example in X_dev_index
    ]
    X_test_embedded = [
        np.mean([embed_matrix[element] for element in example], axis=0)
        for example in X_test_index
    ]
    print("Loaded data.")

    # Tune C on the dev set, test on the test set:
    best_acc = 0.0
    best_c_acc = 0.0
    for c in [0.001, 0.01, 0.1, 1, 2, 4, 8, 16, 32, 64, 128, 256]:
        model_svr = SVC(C=c, kernel='linear', probability=True)
        model_svr.fit(X_train_embedded, y_train)
        # Use the dev set to tune the hyperparameter:
        pred_svr = model_svr.predict(X_dev_embedded)
        true_svr = y_dev
        acc = accuracy_score(true_svr, pred_svr)
        if acc > best_acc:
            best_acc = acc
            best_c_acc = c
    print("Best dev score: ", best_acc)

    # Retrain the best model and evaluate on the test set:
    best_model = SVC(C=best_c_acc, kernel='linear', probability=True)
    best_model.fit(X_train_embedded, y_train)
    best_pred = best_model.predict(X_test_embedded)
    test_acc = accuracy_score(y_test, best_pred)

    outlog = open(args.predict, 'w')
    outlog.write('true\tpred\n')
    for true, pred in zip(y_test, best_pred):
        outlog.write('{}\t{}\n'.format(true, pred))
    outlog.close()

    print("Best C: ", best_c_acc)
    print("Test score: ", test_acc)
    print("Done")
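# dp.get_embedding_matrix is not shown; a sketch of what it plausibly does,
# assuming `embedding` is a word -> vector dict and vocab_dict maps
# index -> word (as implied by vocab_dict_flipped above). The random fallback
# for out-of-vocabulary words is an assumption:
import numpy as np

def get_embedding_matrix(embedding, vocab_dict):
    embed_dim = len(next(iter(embedding.values())))
    matrix = np.random.uniform(-0.1, 0.1, (len(vocab_dict), embed_dim))
    for index, word in vocab_dict.items():
        if word in embedding:
            matrix[index] = embedding[word]
    return matrix.astype(np.float32)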
def main():
    print(args)

    # local
    """
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    EVAL_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    EVAL_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    """

    # server
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article_01_new.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title_01_new.txt')
    VALID_X = os.path.join(data_dir, 'train/train.article_000_new.txt')
    VALID_Y = os.path.join(data_dir, 'train/train.title_000_new.txt')
    EVAL_X = os.path.join(data_dir, 'train/train.article_001_new.txt')
    EVAL_Y = os.path.join(data_dir, 'train/train.title_001_new.txt')

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file,
                                  vocab_size=80000)

    # BERT embeddings
    emb_file = 'sumdata/bert-large-uncased.30522.1024d.vec'
    vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 101
    max_tgt_len = 47
    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid
    n_eval = args.n_eval

    # vocab = small_vocab
    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)
    eval_x = BatchManager(load_data(EVAL_X, max_src_len, n_eval), bs, vocab)
    eval_y = BatchManager(load_data(EVAL_Y, max_tgt_len, n_eval), bs, vocab)

    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 6, 300,
    #                     50, 50, 1200, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 300,
    #                                   50, 50, 1200, False).cuda()
    model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 1024,
                                      50, 50, 1200, False,
                                      embeddings=embeddings).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    # With last_epoch=-1, this first step() call does not change the lr.
    scheduler.step()

    # eval_model(valid_x, valid_y, vocab, model)
    # train(train_x, train_y, valid_x, valid_y, model, optimizer, vocab,
    #       scheduler, args.n_epochs, saved_state['epoch'])
    myeval(eval_x, eval_y, vocab, model)
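# build_vocab is not shown; it plausibly counts word frequencies over the
# given files and keeps the vocab_size most frequent entries. A sketch (the
# special tokens are an assumption):
import json
from collections import Counter

def build_vocab(files, vocab_file, vocab_size=80000):
    counter = Counter()
    for path in files:
        with open(path, encoding='utf-8') as f:
            for line in f:
                counter.update(line.split())
    words = ['<pad>', '<unk>', '<s>', '</s>']
    words += [w for w, _ in counter.most_common(vocab_size - len(words))]
    vocab = {w: i for i, w in enumerate(words)}
    with open(vocab_file, 'w') as f:
        json.dump(vocab, f)
    return vocab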