Example #1
def compute_confuse_matrix(fname, classes):
    """
    Give a file, compute confuse matrix of y_true and y_pred.
    """
    print('im in')
    y_true = []
    with codecs.open(fname, 'r', 'utf8') as f:
        for line in f:
            line = line.strip().split('\t')[-1]
            y_true.append(line)

    checkpoint_dir = "output/self_attention/multi_attention_0802/"
    pred_path = "tmp/eval_y_self_attention.txt"
    if os.path.exists(checkpoint_dir + 'config.pkl'):
        config = pickle.load(open(checkpoint_dir + 'config.pkl', 'rb'))
    else:
        config = Config()

    config.mode = 'inference'

    word2id, id2word = read_vocab(config.word_vocab_file)
    tag2id, id2tag = read_vocab(config.tag_vocab_file)

    with tf.Session(config=get_config_proto(
            log_device_placement=False)) as sess:
        model = get_model(config.model, config, sess)
        model.build()
        model.restore_model(checkpoint_dir)
        y_pred = infer_file(model, word2id, id2tag, fname, pred_path)

    cmatrix = confusion_matrix(y_true, y_pred, classes)
    print(cmatrix)
    correct = [x == y for x, y in zip(y_true, y_pred)]
    print(correct.count(True) / len(correct))
    return cmatrix
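All of these examples lean on a read_vocab helper from data_utils whose source is not shown. A minimal sketch of what it plausibly looks like, assuming the one-token-per-line vocab format that build_vocab in Example #8 writes; note the examples disagree on the return order (word2id, id2word vs. words, word_to_id), so this is an illustration, not the actual helper:

import codecs

def read_vocab_sketch(vocab_file):
    # Hypothetical stand-in for data_utils.read_vocab: map each line's token
    # to its line index, and back.
    with codecs.open(vocab_file, 'r', 'utf8') as f:
        tokens = [line.strip() for line in f if line.strip()]
    token2id = {tok: i for i, tok in enumerate(tokens)}
    id2token = {i: tok for i, tok in enumerate(tokens)}
    return token2id, id2token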
Example #2
 def setup_embedding(self):
     with tf.variable_scope("Embedding"), tf.device("/cpu:0"):
         self.word2id, self.id2word = read_vocab(
             self.config.word_vocab_file)
         embedding = load_pretrained_emb_from_txt(
             self.id2word, self.config.pretrained_embedding_file)
         # vocab_size * embedding_size
         self.source_embedding = tf.get_variable(
             "source_embedding",
             dtype=tf.float32,
             initializer=tf.constant(embedding),
             trainable=False)
         # embedding_size * W_dim
         self.W_w = tf.get_variable(
             "W_w",
             shape=[self.config.embedding_size, self.config.W_dim],
             initializer=tf.contrib.layers.xavier_initializer())
         # vocab_size * W_dim
         self.hidden_embedding = tf.tanh(
             tf.matmul(self.source_embedding, self.W_w))
         # batch_size * embedding_size
         self.source_inputs = tf.nn.embedding_lookup(
             self.source_embedding, self.input_x)
         # batch_size * W_dim
         self.hidden_inputs = tf.nn.embedding_lookup(
             self.hidden_embedding, self.input_x)
         # batch_size * window_size * embedding_size
         self.source_context = tf.nn.embedding_lookup(
             self.source_embedding, self.context_x)
         # batch_size * window_size * W_dim
         self.hidden_context = tf.nn.embedding_lookup(
             self.hidden_embedding, self.context_x)
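The point of hidden_embedding above is that tanh(E @ W_w) is computed once over the whole vocabulary, after which the projected representation of any token is a cheap lookup. A small NumPy sketch (illustration only, not from the repo) of why indexing commutes with the row-wise map:

import numpy as np

vocab_size, embedding_size, W_dim = 5, 4, 3
E = np.random.randn(vocab_size, embedding_size)   # source embedding
W_w = np.random.randn(embedding_size, W_dim)      # projection matrix
ids = np.array([1, 3, 3])                         # a toy batch of token ids

hidden_embedding = np.tanh(E @ W_w)               # vocab_size * W_dim, computed once
assert np.allclose(hidden_embedding[ids], np.tanh(E[ids] @ W_w))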
Example #3
def main():
    checkpoint_dir = "output/self_attention/multi_attention_0802/"
    # Inference currently uses the Config defined in the code, not the config
    # the model was trained with; it should read config.pkl from the checkpoint.
    # TODO: read config from checkpoint/config.pkl
    if os.path.exists(checkpoint_dir + 'config.pkl'):
        config = pickle.load(open(checkpoint_dir+'config.pkl', 'rb'))
    else:
        config = Config()

    config.mode = 'inference'
    
    # word_vocab and tag_vocab change on every training run, while inference
    # must use the vocab the model was trained with, so the vocab should be
    # pinned down.
    # TODO: data_utils::fix vocab
    word2id, id2word = read_vocab(config.word_vocab_file)
    tag2id, id2tag = read_vocab(config.tag_vocab_file)

    with tf.Session(config=get_config_proto(log_device_placement=False)) as sess:
        model = get_model(config.model, config, sess)
        model.build()
        model.restore_model(checkpoint_dir)
        infer_cmd(model, word2id, id2tag)
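The TODO above (duplicated inline in Example #1) could be factored out. A hedged sketch of a helper that prefers the config.pkl saved at training time; load_config is a hypothetical name, not part of the original code:

import os
import pickle

def load_config(checkpoint_dir):
    # Prefer the config pickled at training time; fall back to the defaults
    # defined in code (Config comes from the repo and is assumed importable).
    config_path = os.path.join(checkpoint_dir, 'config.pkl')
    if os.path.exists(config_path):
        with open(config_path, 'rb') as f:
            return pickle.load(f)
    return Config()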
Example #4
    def __init__(self,
                 vocab_file,
                 label_file,
                 data_file=None,
                 batch_size=None,
                 max_len=300,
                 min_len=0,
                 label_type='multi-class'):
        self.data_file = data_file
        self.batch_size = batch_size
        self.vocab_file = vocab_file
        self.label_file = label_file
        self.max_len = max_len
        self.min_len = min_len
        self.label_type = label_type

        self.w2i, self.i2w = read_vocab(self.vocab_file)
        self.l2i, self.i2l = read_vocab(self.label_file, check_vocab=False)
        self._raw_data = []

        if self.data_file:
            self._preprocess()
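A usage sketch for the constructor above. The class name DataSet is assumed (only __init__ is shown), and the label_file path is hypothetical; the vocab and data paths are borrowed from Examples #8 and #9:

dataset = DataSet(                      # hypothetical class name
    vocab_file='data/char_data/char_vocab.txt',
    label_file='data/label_vocab.txt',  # hypothetical path
    data_file='data/data_tech.train',
    batch_size=64,
    max_len=300,
    label_type='multi-class')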
Example #5
 def setup_embedding(self):
     with tf.variable_scope("Embedding"), tf.device("/cpu:0"):
         self.word2id, self.id2word = read_vocab(
             self.config.word_vocab_file)
         embedding = load_pretrained_emb_from_txt(
             self.id2word, self.config.pretrained_embedding_file)
         self.source_embedding = tf.get_variable(
             "source_emebdding",
             dtype=tf.float32,
             initializer=tf.constant(embedding),
             trainable=False)
         # batch_size * sentence_length * embedding_size
         self.source_inputs = tf.nn.embedding_lookup(
             self.source_embedding, self.input_x)
         # batch_size * sentence_length * embedding_size * 1
         self.source_inputs_expand = tf.expand_dims(self.source_inputs, -1)
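The final expand_dims exists because a convolution front end (tf.nn.conv2d) expects a rank-4 batch * height * width * channels tensor, so the embedded batch gets a trailing channel axis of 1. A shape-only NumPy sketch (illustration, not from the repo):

import numpy as np

batch = np.zeros([32, 50, 128])   # batch_size * sentence_length * embedding_size
print(np.expand_dims(batch, -1).shape)   # (32, 50, 128, 1)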
Example #6
def all_data_to_id(vocab_file,
                   all_data_max_sent,
                   num_docs,
                   max_sent_in_doc=30,
                   max_char_in_sent=20):
    _, word_to_index = data_utils.read_vocab(
        vocab_file)  # read the vocab dict, e.g. {'<PAD>': 0, ',': 1, '的': 2, '。': 3, ...}
    doc_to_id = np.zeros([num_docs, max_sent_in_doc, max_char_in_sent],
                         dtype=int)
    for doc_index, doc in enumerate(all_data_max_sent):
        sent_to_id = np.zeros([max_sent_in_doc, max_char_in_sent])
        for sent_index, sent in enumerate(doc):
            if sent_index < max_sent_in_doc:
                word_to_id = np.zeros([max_char_in_sent], dtype=int)
                # iterate the characters directly; wrapping sent in Counter
                # would collapse repeated characters and lose their order
                for char_index, char in enumerate(sent):
                    if char_index < max_char_in_sent:
                        word_to_id[char_index] = word_to_index.get(char, PAD)
                sent_to_id[sent_index] = word_to_id
        doc_to_id[doc_index] = sent_to_id
    return doc_to_id
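A toy run of the truncation/padding logic above, with a stubbed vocab instead of data_utils. Sentences beyond max_sent_in_doc and characters beyond max_char_in_sent are dropped; everything else stays zero (the pad id):

import numpy as np

word_to_index = {'<PAD>': 0, '我': 1, '们': 2, '好': 3}   # stub vocab
doc = [['我', '们'], ['好', '我', '们', '好']]             # one doc, two sentences
max_sent_in_doc, max_char_in_sent = 3, 3
doc_ids = np.zeros([max_sent_in_doc, max_char_in_sent], dtype=int)
for si, sent in enumerate(doc[:max_sent_in_doc]):
    for ci, char in enumerate(sent[:max_char_in_sent]):
        doc_ids[si, ci] = word_to_index.get(char, 0)
print(doc_ids)   # [[1 2 0] [3 1 2] [0 0 0]]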
Example #7
 def inference(self, sen, non_event_id=None):
     x_batch, context_batch = zip(*sen)
     feed_dict = {
         self.input_x: x_batch,
         self.context_x: context_batch,
         self.dropout_keep_prob: 1.0
     }
      if non_event_id is None:
          tag2id, _ = read_vocab(self.config.tag_vocab_file)
          non_event_id = tag2id['__label__非事件']  # "非事件" means non-event
     prob = self.sess.run(self.softmax, feed_dict=feed_dict)
     prob_max = np.max(prob, axis=1).tolist()
     prob_idx = np.argmax(prob, axis=1).tolist()
     max_prob = 0
     event_type_id = non_event_id
     for i in range(len(prob_idx)):
         if prob_idx[i] == non_event_id:
             continue
         if prob_max[i] > max_prob:
             max_prob = prob_max[i]
             event_type_id = prob_idx[i]
     return [event_type_id]
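The loop above implements a simple decision rule: ignore every sentence predicted as non-event, then keep the remaining prediction with the highest probability. A NumPy sketch (illustration only) reproducing it on a toy probability matrix:

import numpy as np

non_event_id = 0
prob = np.array([[0.9, 0.1],    # argmax is non-event: skipped
                 [0.3, 0.7],    # event class 1 with prob 0.7: kept
                 [0.4, 0.6]])   # event class 1 with prob 0.6: lower, ignored
prob_max = prob.max(axis=1).tolist()
prob_idx = prob.argmax(axis=1).tolist()
max_prob, event_type_id = 0, non_event_id
for i in range(len(prob_idx)):
    if prob_idx[i] == non_event_id:
        continue
    if prob_max[i] > max_prob:
        max_prob, event_type_id = prob_max[i], prob_idx[i]
print(event_type_id)   # 1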
Example #8
from utils import data_utils
from collections import Counter
import numpy as np
import tensorflow.contrib.keras as kr
import os


def build_vocab(merge_file, vocab_file, vocab_size):
    """根据字符级数据集构建词汇表,存储"""
    _, contents = data_utils.read_label_content(merge_file)
    all_data = []
    for content in contents:
        all_data.extend(content)
    counter = Counter(
        all_data)  # e.g. {',': 480926, '的': 348828, '。': 194675, '一': 119858}
    count_pairs = counter.most_common(
        vocab_size - 1)  # e.g. [('a', 5), ('b', 4), ('c', 3)]
    words, _ = zip(*count_pairs)
    # prepend a PAD token so all texts can be padded to the same length
    words = ['PAD'] + list(words)
    with open(vocab_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')


if __name__ == '__main__':
    merge_file = '../data/merge_file.txt'
    vocab_file = '../data/char_data/char_vocab.txt'
    build_vocab(merge_file, vocab_file, 5000)
    words, word_to_id = data_utils.read_vocab(vocab_file)
    print(word_to_id)
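A quick sanity check after running the script, assuming read_vocab assigns ids by line order (which matches how build_vocab writes the file): the PAD token should come back as id 0 and the vocab should not exceed the requested size:

words, word_to_id = data_utils.read_vocab('../data/char_data/char_vocab.txt')
assert word_to_id['PAD'] == 0
assert len(words) <= 5000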
Example #9
def main():
    parser = argparse.ArgumentParser()
    add_argument(parser)
    args = parser.parse_args()

    config = Config()

    train_data = read_data(config.train_data_files, config.model)
    # try downsampling the negative samples
    # train_data = sample(train_data)
    eval_data = read_data(config.eval_data_files, config.model)
    # train_data_sen = read_data_sen("data/data_tech.train")
    # eval_data_sen = read_data_sen("data/data_tech.eval")
    # here the vocab of the pretrained word vectors is used as the model's vocab
    create_vocab_from_pretrained_w2v(config.w2v_path, config.word_vocab_file)
    create_tag_vocab_from_data(train_data, config.tag_vocab_file)

    word2id, id2word = read_vocab(config.word_vocab_file)
    tag2id, id2tag = read_vocab(config.tag_vocab_file)

    # convert word into ids
    train_data = convert_dataset(train_data, word2id, tag2id,
                                 config.sentence_length, config.num_classes,
                                 config.model)
    # train_data_sen = convert_dataset_sen(train_data_sen, word2id, tag2id, config.num_classes, one_hot_label=True)
    print(train_data[0])
    eval_data = convert_dataset(eval_data, word2id, tag2id,
                                config.sentence_length, config.num_classes,
                                config.model)
    # eval_data_sen = convert_dataset_sen(eval_data_sen, word2id, tag2id, config.num_classes, one_hot_label=True)
    print("train_data size: {0}".format(len(train_data)))

    if os.path.exists(os.path.join(config.checkpoint_dir, "config.pkl")):
        config = pickle.load(
            open(os.path.join(config.checkpoint_dir, "config.pkl"), 'rb'))
    else:
        pickle.dump(
            config,
            open(os.path.join(config.checkpoint_dir, "config.pkl"), 'wb'))

    with tf.Session(config=get_config_proto(
            log_device_placement=False)) as sess:
        model = get_model(config.model, config, sess)
        model.build()
        model.init()

        batch_manager = Batch_self_attention(train_data, config.batch_size)
        batch_manager_eval = Batch_self_attention(eval_data, config.batch_size)
        # batch_manager = Batch(train_data, config.batch_size)
        # batch_manager_eval = Batch(eval_data, config.batch_size)
        epochs = config.epoch
        max_acc = 0
        for i in range(epochs):
            for batch in batch_manager.next_batch():
                # print(batch)
                loss, accuracy, global_step = model.train_one_step(*zip(
                    *batch))
                # key_shape, query_shape = model.test(*zip(*batch))
                # print(key_shape, query_shape)
                # break
            train_accuracy = evaluate(model, batch_manager)
            eval_accuracy = evaluate(model, batch_manager_eval)
            # train_accuracy = evaluate_attention(model, train_data_sen, id2tag)
            # eval_accuracy = evaluate_attention(model, eval_data_sen, id2tag)
            print("epoch - {0}      step - {1}      loss - {2}      train_accuracy - {3}    eval_accuracy - {4}"\
                .format(i, global_step, loss, train_accuracy, eval_accuracy))

            if max_acc < eval_accuracy:
                max_acc = eval_accuracy
                model.save_model()
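The evaluate helper used in the training loop is not shown. A hedged sketch of what it might look like, assuming the model exposes a per-batch accuracy call (test_one_step is a hypothetical method name, not the repo's API):

def evaluate(model, batch_manager):
    # Weighted average of per-batch accuracy over the whole dataset.
    correct, total = 0, 0
    for batch in batch_manager.next_batch():
        accuracy = model.test_one_step(*zip(*batch))  # hypothetical API
        correct += accuracy * len(batch)
        total += len(batch)
    return correct / total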