def compute_confuse_matrix(fname, classes):
    """Given a file, compute the confusion matrix of y_true and y_pred."""
    # Gold labels sit in the last tab-separated column of each line.
    y_true = []
    with codecs.open(fname, 'r', 'utf8') as f:
        for line in f:
            line = line.strip().split('\t')[-1]
            y_true.append(line)

    checkpoint_dir = "output/self_attention/multi_attention_0802/"
    pred_path = "tmp/eval_y_self_attention.txt"
    if os.path.exists(checkpoint_dir + 'config.pkl'):
        config = pickle.load(open(checkpoint_dir + 'config.pkl', 'rb'))
    else:
        config = Config()
    config.mode = 'inference'
    word2id, id2word = read_vocab(config.word_vocab_file)
    tag2id, id2tag = read_vocab(config.tag_vocab_file)

    with tf.Session(config=get_config_proto(
            log_device_placement=False)) as sess:
        model = get_model(config.model, config, sess)
        model.build()
        model.restore_model(checkpoint_dir)
        y_pred = infer_file(model, word2id, id2tag, fname, pred_path)

    cmatrix = confusion_matrix(y_true, y_pred, labels=classes)
    print(cmatrix)
    correct = [x == y for x, y in zip(y_true, y_pred)]
    print(correct.count(True) / len(correct))
    return cmatrix
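# A minimal usage sketch, assuming an evaluation file in the TSV format read
# above (gold label in the last tab-separated column). The file path and the
# class list here are hypothetical placeholders, not values from the project.
if __name__ == '__main__':
    classes = ['__label__非事件', '__label__事件']  # hypothetical label set
    compute_confuse_matrix('data/eval.txt', classes)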
def setup_embedding(self):
    with tf.variable_scope("Embedding"), tf.device("/cpu:0"):
        self.word2id, self.id2word = read_vocab(
            self.config.word_vocab_file)
        embedding = load_pretrained_emb_from_txt(
            self.id2word, self.config.pretrained_embedding_file)
        # vocab_size * embedding_size
        self.source_embedding = tf.get_variable(
            "source_embedding",
            dtype=tf.float32,
            initializer=tf.constant(embedding),
            trainable=False)
        # embedding_size * W_dim
        self.W_w = tf.get_variable(
            "W_w",
            shape=[self.config.embedding_size, self.config.W_dim],
            initializer=tf.contrib.layers.xavier_initializer())
        # vocab_size * W_dim
        self.hidden_embedding = tf.tanh(
            tf.matmul(self.source_embedding, self.W_w))
        # batch_size * embedding_size
        self.source_inputs = tf.nn.embedding_lookup(
            self.source_embedding, self.input_x)
        # batch_size * W_dim
        self.hidden_inputs = tf.nn.embedding_lookup(
            self.hidden_embedding, self.input_x)
        # batch_size * window_size * embedding_size
        self.source_context = tf.nn.embedding_lookup(
            self.source_embedding, self.context_x)
        # batch_size * window_size * W_dim
        self.hidden_context = tf.nn.embedding_lookup(
            self.hidden_embedding, self.context_x)
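# A small numpy sketch of the projection above, with toy shapes rather than
# the model's real dimensions: the frozen pretrained embedding E
# [vocab_size, embedding_size] is passed through a learned W_w
# [embedding_size, W_dim] with tanh, and both tables are then indexed by the
# same word ids.
import numpy as np

vocab_size, emb_size, w_dim = 5, 4, 3
E = np.random.randn(vocab_size, emb_size)   # pretrained, frozen
W_w = np.random.randn(emb_size, w_dim)      # learned projection
hidden = np.tanh(E @ W_w)                   # vocab_size x w_dim
ids = np.array([1, 3])                      # a toy batch of word ids
print(E[ids].shape, hidden[ids].shape)      # (2, 4) (2, 3)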
def main():
    checkpoint_dir = "output/self_attention/multi_attention_0802/"
    # Inference currently uses the config defined in the source file, not the
    # config the model was trained with; it should read config.pkl from the
    # checkpoint directory.
    # TODO: read config from checkpoint/config.pkl
    if os.path.exists(checkpoint_dir + 'config.pkl'):
        config = pickle.load(open(checkpoint_dir + 'config.pkl', 'rb'))
    else:
        config = Config()
    config.mode = 'inference'
    # word_vocab and tag_vocab change on every training run, while inference
    # must use the vocabularies built when this model was trained, so the
    # vocabularies should be frozen.
    # TODO: data_utils: fix vocab
    word2id, id2word = read_vocab(config.word_vocab_file)
    tag2id, id2tag = read_vocab(config.tag_vocab_file)

    with tf.Session(config=get_config_proto(
            log_device_placement=False)) as sess:
        model = get_model(config.model, config, sess)
        model.build()
        model.restore_model(checkpoint_dir)
        infer_cmd(model, word2id, id2tag)
def __init__(self,
             vocab_file,
             label_file,
             data_file=None,
             batch_size=None,
             max_len=300,
             min_len=0,
             label_type='multi-class'):
    self.data_file = data_file
    self.batch_size = batch_size
    self.vocab_file = vocab_file
    self.label_file = label_file
    self.max_len = max_len
    self.min_len = min_len
    self.label_type = label_type
    self.w2i, self.i2w = read_vocab(self.vocab_file)
    self.l2i, self.i2l = read_vocab(self.label_file, check_vocab=False)
    self._raw_data = []
    if self.data_file:
        self._preprocess()
def setup_embedding(self):
    with tf.variable_scope("Embedding"), tf.device("/cpu:0"):
        self.word2id, self.id2word = read_vocab(
            self.config.word_vocab_file)
        embedding = load_pretrained_emb_from_txt(
            self.id2word, self.config.pretrained_embedding_file)
        self.source_embedding = tf.get_variable(
            "source_embedding",
            dtype=tf.float32,
            initializer=tf.constant(embedding),
            trainable=False)
        # batch_size * sentence_length * embedding_size
        self.source_inputs = tf.nn.embedding_lookup(
            self.source_embedding, self.input_x)
        # batch_size * sentence_length * embedding_size * 1
        self.source_inputs_expand = tf.expand_dims(self.source_inputs, -1)
def all_data_to_id(vocab_file,
                   all_data_max_sent,
                   num_docs,
                   max_sent_in_doc=30,
                   max_char_in_sent=20):
    # Read the vocabulary, e.g. {'<PAD>': 0, ',': 1, '的': 2, '。': 3, ...}
    _, word_to_index = data_utils.read_vocab(vocab_file)
    doc_to_id = np.zeros([num_docs, max_sent_in_doc, max_char_in_sent],
                         dtype=int)
    for doc_index, doc in enumerate(all_data_max_sent):
        sent_to_id = np.zeros([max_sent_in_doc, max_char_in_sent])
        for sent_index, sent in enumerate(doc):
            if sent_index < max_sent_in_doc:
                word_to_id = np.zeros([max_char_in_sent], dtype=int)
                for char_index, char in enumerate(sent):
                    if char_index < max_char_in_sent:
                        # Map each character to its id, falling back to PAD
                        # for out-of-vocabulary characters.
                        word_to_id[char_index] = word_to_index.get(char, PAD)
                sent_to_id[sent_index] = word_to_id
        doc_to_id[doc_index] = sent_to_id
    return doc_to_id
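# A minimal usage sketch, assuming the character vocabulary written by
# build_vocab below; the toy documents and the vocabulary path are
# placeholders, not project data.
if __name__ == '__main__':
    docs = [['今天下雨。', '记得带伞。'], ['明天晴。']]
    ids = all_data_to_id('../data/char_data/char_vocab.txt', docs,
                         num_docs=len(docs))
    print(ids.shape)  # (2, 30, 20)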
def inference(self, sen, non_event_id=None):
    x_batch, context_batch = zip(*sen)
    feed_dict = {
        self.input_x: x_batch,
        self.context_x: context_batch,
        self.dropout_keep_prob: 1.0
    }
    if non_event_id is None:
        tag2id, _ = read_vocab(self.config.tag_vocab_file)
        non_event_id = tag2id['__label__非事件']  # '非事件' means "non-event"
    prob = self.sess.run(self.softmax, feed_dict=feed_dict)
    prob_max = np.max(prob, axis=1).tolist()
    prob_idx = np.argmax(prob, axis=1).tolist()
    # Pick the most confident prediction that is not the non-event class;
    # fall back to the non-event id if every prediction is non-event.
    max_prob = 0
    event_type_id = non_event_id
    for i in range(len(prob_idx)):
        if prob_idx[i] == non_event_id:
            continue
        if prob_max[i] > max_prob:
            max_prob = prob_max[i]
            event_type_id = prob_idx[i]
    return [event_type_id]
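# A self-contained numpy sketch of the arbitration rule above, assuming a toy
# softmax output and non_event_id = 0: the most confident prediction that is
# not the non-event class wins, otherwise the non-event id is returned.
import numpy as np

prob = np.array([[0.9, 0.1, 0.0],   # predicted non-event
                 [0.2, 0.7, 0.1],   # predicted class 1 with prob 0.7
                 [0.1, 0.1, 0.8]])  # predicted class 2 with prob 0.8
prob_max = np.max(prob, axis=1)
prob_idx = np.argmax(prob, axis=1)
event_type_id, max_prob = 0, 0.0
for p, idx in zip(prob_max, prob_idx):
    if idx != 0 and p > max_prob:
        max_prob, event_type_id = p, idx
print(event_type_id)  # 2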
from collections import Counter
import os

import numpy as np
import tensorflow.contrib.keras as kr

from utils import data_utils


def build_vocab(merge_file, vocab_file, vocab_size):
    """Build a character-level vocabulary from the dataset and save it."""
    _, contents = data_utils.read_label_content(merge_file)
    all_data = []
    for content in contents:
        all_data.extend(content)
    # e.g. {',': 480926, '的': 348828, '。': 194675, '一': 119858}
    counter = Counter(all_data)
    # e.g. [('a', 5), ('b', 4), ('c', 3)]
    count_pairs = counter.most_common(vocab_size - 1)
    print(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    print(len(list(words)))
    # Add a PAD token so all texts can be padded to the same length.
    words = ['PAD'] + list(words)
    with open(vocab_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')


if __name__ == '__main__':
    merge_file = '../data/merge_file.txt'
    vocab_file = '../data/char_data/char_vocab.txt'
    build_vocab(merge_file, vocab_file, 5000)
    words, word_to_id = data_utils.read_vocab(vocab_file)
    print(word_to_id)
def main():
    parser = argparse.ArgumentParser()
    add_argument(parser)
    args = parser.parse_args()

    config = Config()
    train_data = read_data(config.train_data_files, config.model)
    # Try downsampling the negative samples.
    # train_data = sample(train_data)
    eval_data = read_data(config.eval_data_files, config.model)
    # train_data_sen = read_data_sen("data/data_tech.train")
    # eval_data_sen = read_data_sen("data/data_tech.eval")

    # The vocabulary of the pretrained word vectors is used as the model's
    # vocabulary.
    create_vocab_from_pretrained_w2v(config.w2v_path, config.word_vocab_file)
    create_tag_vocab_from_data(train_data, config.tag_vocab_file)
    word2id, id2word = read_vocab(config.word_vocab_file)
    tag2id, id2tag = read_vocab(config.tag_vocab_file)

    # Convert words into ids.
    train_data = convert_dataset(train_data, word2id, tag2id,
                                 config.sentence_length, config.num_classes,
                                 config.model)
    # train_data_sen = convert_dataset_sen(train_data_sen, word2id, tag2id,
    #                                      config.num_classes, one_hot_label=True)
    print(train_data[0])
    eval_data = convert_dataset(eval_data, word2id, tag2id,
                                config.sentence_length, config.num_classes,
                                config.model)
    # eval_data_sen = convert_dataset_sen(eval_data_sen, word2id, tag2id,
    #                                     config.num_classes, one_hot_label=True)
    print("train_data size: {0}".format(len(train_data)))

    if os.path.exists(os.path.join(config.checkpoint_dir, "config.pkl")):
        config = pickle.load(
            open(os.path.join(config.checkpoint_dir, "config.pkl"), 'rb'))
    else:
        pickle.dump(
            config,
            open(os.path.join(config.checkpoint_dir, "config.pkl"), 'wb'))

    with tf.Session(config=get_config_proto(
            log_device_placement=False)) as sess:
        model = get_model(config.model, config, sess)
        model.build()
        model.init()
        batch_manager = Batch_self_attention(train_data, config.batch_size)
        batch_manager_eval = Batch_self_attention(eval_data, config.batch_size)
        # batch_manager = Batch(train_data, config.batch_size)
        # batch_manager_eval = Batch(eval_data, config.batch_size)
        epoches = config.epoch
        max_acc = 0
        for i in range(epoches):
            for batch in batch_manager.next_batch():
                loss, accuracy, global_step = model.train_one_step(
                    *zip(*batch))
            train_accuracy = evaluate(model, batch_manager)
            eval_accuracy = evaluate(model, batch_manager_eval)
            # train_accuracy = evaluate_attention(model, train_data_sen, id2tag)
            # eval_accuracy = evaluate_attention(model, eval_data_sen, id2tag)
            print("epoch - {0} step - {1} loss - {2} train_accuracy - {3} "
                  "eval_accuracy - {4}".format(i, global_step, loss,
                                               train_accuracy, eval_accuracy))
            # Keep only the best checkpoint on the eval set.
            if max_acc < eval_accuracy:
                max_acc = eval_accuracy
                model.save_model()