def data_initialization(train_file, dev_file, test_file, emb_file):
    data = Data()
    data.number_normalized = True
    data.build_alphabet(train_file)
    data.build_alphabet(dev_file)
    data.build_alphabet(test_file)
    data.fix_alphabet()
    data.generate_instance(train_file, 'train')
    data.generate_instance(dev_file, 'dev')
    data.generate_instance(test_file, 'test')
    data.build_word_pretrain_emb(emb_file)
    return data
import sys


def data_initialization(train_file, dev_file, test_file, emb_file):
    data = Data()
    data.number_normalized = True
    data.build_alphabet(train_file)
    data.build_alphabet(dev_file)
    data.build_alphabet(test_file)
    data.fix_alphabet()
    data.generate_instance(train_file, 'train')
    data.generate_instance(dev_file, 'dev')
    data.generate_instance(test_file, 'test')
    word_emb_norm = False
    data.build_word_pretrain_emb(emb_file, word_emb_norm)
    data.show_data_summary()
    sys.stdout.flush()
    return data
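# A minimal usage sketch for the helpers above (not part of the original code).
# The relative paths are hypothetical placeholders (only the file names appear
# elsewhere in this repository), and it assumes Data exposes train_Ids as in
# the training script further below.
if __name__ == '__main__':
    data = data_initialization('data/demo.train.char',
                               'data/demo.dev.char',
                               'data/demo.test.char',
                               'data/gigaword_chn.all.a2b.uni.ite50.vec')
    print('loaded %d training instances' % len(data.train_Ids))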
parser.add_argument('--config', default="./demo.train.config", help='Configuration File')
args = parser.parse_args()
data = Data()
data.read_config(args.config)
status = data.status.lower()  # train or test
data.HP_gpu = torch.cuda.is_available()
print("Seed num:", seed_num)
if status == 'train':
    print("MODEL: train")
    # data.initial_feature_alphabets()
    data.build_alphabet(data.train_dir)
    # data.build_alphabet(data.dev_dir)
    # data.build_alphabet(data.test_dir)
    data.build_translation_alphabet(data.trans_dir)
    data.fix_alphabet()
    data.build_translation_dict(data.trans_dir)
    data.generate_instance('train')
    data.generate_instance('dev')
    data.generate_instance('test')
    data.build_pretrain_emb()
    # print(data.word_alphabet.instance2index)
    # print(data.char_alphabet.instance2index)
    # print(data.label_alphabet.instance2index)
    # print(data.translation_alphabet.instance2index)
    # print(data.train_Ids)
import os
import random

import numpy as np
import tensorflow as tf

# NOTE: Data and Model_Lattice are project-local classes defined elsewhere in
# this repository.


class Train_Model:
    def __init__(self):
        self.gaz_file = 'D:\\mygit\\NER_MODEL\\data\\data\\ctb.50d.vec'
        self.char_emb = 'D:\\mygit\\NER_MODEL\\data\\data\\gigaword_chn.all.a2b.uni.ite50.vec'
        self.train_file = 'D:\\mygit\\NER_MODEL\\data\\data\\demo.train.char'
        self.dev_file = 'D:\\mygit\\NER_MODEL\\data\\data\\demo.dev.char'
        self.test_file = 'D:\\mygit\\NER_MODEL\\data\\data\\demo.test.char'
        self.model_save_path = 'D:\\mygit\\NER_MODEL\\models\\ckpt'
        self.batch_size = 64
        self.max_char_len = 100
        self.emb_size = 50
        self.max_lexicon_words_num = 5
        self.num_units = 128
        self.num_tags = 18
        self.learning_rate = 0.005
        self.optimizer = 'adam'
        self.epoch = 0
        self.bichar_emb = None

        self.data = Data()
        self.load_data_and_embedding()
        self.model = Model_Lattice(self.max_char_len, self.emb_size,
                                   self.max_lexicon_words_num, self.num_units,
                                   self.num_tags, self.learning_rate)
        self.saver = tf.train.Saver()

    def train(self, epochs=10):
        init = tf.global_variables_initializer()
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.8
        with tf.Session(config=config) as sess:
            sess.run(init)
            for iter in range(epochs):
                loss = []
                print('iter: ', iter)
                random.shuffle(self.data.train_Ids)
                train_num = len(self.data.train_Ids)
                total_batch = train_num // self.batch_size
                for batch_id in range(total_batch):
                    start = batch_id * self.batch_size
                    end = (batch_id + 1) * self.batch_size
                    if end > train_num:
                        end = train_num
                    instance = self.data.train_Ids[start:end]
                    if not instance:
                        continue
                    self.epoch += 1
                    _, char_ids, lexicon_word_ids, word_length_tensor, _, labels = \
                        self.batch_with_label(instance)
                    # run the model
                    feed_dict = {
                        self.model.placeholders["char_ids"]: char_ids,
                        self.model.placeholders["lexicon_word_ids"]: lexicon_word_ids,
                        self.model.placeholders["word_length_tensor"]: word_length_tensor,
                        self.model.placeholders["labels"]: labels,
                    }
                    _, losses, step = sess.run([
                        self.model.train_op, self.model.loss,
                        self.model.global_step
                    ], feed_dict=feed_dict)
                    loss.append(losses)
                    # print(loss)
                    self.ls = sum(loss) / len(loss)
                    if self.epoch % 1 == 0:
                        print('*' * 100)
                        print(self.epoch, 'loss', self.ls)
                        # self.evaluate(sess, data)
                        self.evaluate_line(sess, [
                            '习', '近', '平', '在', '北', '京', '中', '南', '海', '呼',
                            '吁', '美', '国', '加', '强', '合', '作', '共', '创', '美',
                            '好', '生', '活'
                        ])
                        self.saver.save(sess, os.path.join(self.model_save_path, "ner.dat"))

    def batch_with_label(self, input_batch_list, is_train=True):
        """
        input: list of words, chars and labels of various lengths.
            [[words, biwords, chars, gaz, labels], [words, biwords, chars, gaz, labels], ...]
            words: word ids for one sentence. (batch_size, sent_len)
            chars: char ids for one sentence, various length.
                (batch_size, sent_len, each_word_length)
        output:
            char_ids: (batch_size, )
            lexicon_word_ids: (batch_size, )
            word_length_tensor: (batch_size, )
            labels: (batch_size, )
        """
        # batch_size = len(input_batch_list)
        lengths = [
            len(sent[0][0:self.max_char_len]) for sent in input_batch_list
        ]
        chars_ids = [sent[0][0:self.max_char_len] for sent in input_batch_list]
        biwords = [sent[1][0:self.max_char_len] for sent in input_batch_list]
        # chars_ids_split = [sent[2][0:self.max_char_len] for sent in input_batch_list]
        # lexicon_words = [sent[3][0:self.max_char_len] for sent in input_batch_list]
        if is_train:
            target = [
                sent[4][0:self.max_char_len] for sent in input_batch_list
            ]
        # pad every sentence to max_char_len
        chars_ids = list(
            map(lambda l: l + [0] * (self.max_char_len - len(l)), chars_ids))
        # biwords = list(map(lambda l: l + [0] * (self.max_char_len - len(l)), biwords))
        if is_train:
            labels = list(
                map(lambda l: l + [0] * (self.max_char_len - len(l)), target))
        lexicon_word_ids = []
        word_length_tensor = []
        for sent in input_batch_list:
            lexicon_word_ids_sent = []
            word_length_tensor_sent = []
            for word_lexicon in sent[3][0:self.max_char_len]:
                word_lexicon_pad = list(
                    map(
                        lambda l: l + [0] * (self.max_lexicon_words_num - len(l)),
                        word_lexicon))
                lexicon_word_ids_sent.append(
                    word_lexicon_pad[0][0:self.max_lexicon_words_num])  # id
                word_length_tensor_sent.append(
                    word_lexicon_pad[1][0:self.max_lexicon_words_num])  # length
            lexicon_word_ids.append(lexicon_word_ids_sent)
            word_length_tensor.append(word_length_tensor_sent)
        lexicon_word_ids = list(
            map(
                lambda l: l + [[0] * self.max_lexicon_words_num] *
                (self.max_char_len - len(l)), lexicon_word_ids))
        word_length_tensor = list(
            map(
                lambda l: l + [[0] * self.max_lexicon_words_num] *
                (self.max_char_len - len(l)), word_length_tensor))
        if is_train:
            return lengths, chars_ids, lexicon_word_ids, word_length_tensor, target, labels
        return lengths, chars_ids, lexicon_word_ids, word_length_tensor

    def evaluate_line(self, sess, sentence):
        """
        Because LatticeLSTM's internal parameters are tied to batch_size, the
        single input sentence is tiled into a full batch before being fed in.
        :param sess: TensorFlow session
        :param sentence: text to be tagged (list of characters)
        :param self.data: processed dataset, including the lexicon (gazetteer)
        :return: predicted tag sequence
        """
        _, Ids = self.data.generate_sentence_instance_with_gaz(sentence)
        lengths, char_ids, lexicon_word_ids, word_length_tensor = self.batch_with_label(
            Ids, False)
        lengths = lengths * self.batch_size
        char_ids = char_ids * self.batch_size
        lexicon_word_ids = lexicon_word_ids * self.batch_size
        word_length_tensor = word_length_tensor * self.batch_size
        # run the model
        feed_dict = {
            self.model.placeholders["char_ids"]: char_ids,
            self.model.placeholders["lexicon_word_ids"]: lexicon_word_ids,
            self.model.placeholders["word_length_tensor"]: word_length_tensor,
        }
        logits = sess.run(self.model.logits, feed_dict=feed_dict)
        paths = self.decode(logits, lengths, self.model.trans.eval(session=sess))
        tags = [self.data.label_alphabet.get_instance(idx) for idx in paths[0]]
        print("tags: ", tags)
        return tags

    def decode(self, logits, lengths, transition_matrix):
        """
        :param logits: [batch_size, num_steps, num_tags] float32, logits
        :param lengths: [batch_size] int32, real length of each sequence
        :param transition_matrix: transition matrix for inference
        :return: decoded tag id paths
        """
        # infer the final labels with the Viterbi algorithm
        paths = []
        small = -1000.0
        start = np.asarray([[small] * self.num_tags + [0]])
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])
            logits = np.concatenate([score, pad], axis=1)
            logits = np.concatenate([start, logits], axis=0)
            path, _ = tf.contrib.crf.viterbi_decode(logits, transition_matrix)
            paths.append(path[1:])
        return paths

    def load_data_and_embedding(self):
        self.data.HP_use_char = False
        self.data.HP_batch_size = 1
        self.data.use_bigram = False
        self.data.gaz_dropout = 0.5
        self.data.norm_gaz_emb = False
        self.data.HP_fix_gaz_emb = False
        self.data_initialization()
        self.data.generate_instance_with_gaz(self.train_file, 'train')
        self.data.generate_instance_with_gaz(self.dev_file, 'dev')
        self.data.generate_instance_with_gaz(self.test_file, 'test')
        self.data.build_word_pretrain_emb(self.char_emb)
        self.data.build_biword_pretrain_emb(self.bichar_emb)
        self.data.build_gaz_pretrain_emb(self.gaz_file)

    def data_initialization(self):
        self.data.build_alphabet(self.train_file)
        self.data.build_alphabet(self.dev_file)
        self.data.build_alphabet(self.test_file)
        self.data.build_gaz_file(self.gaz_file)
        self.data.build_gaz_alphabet(self.train_file)
        self.data.build_gaz_alphabet(self.dev_file)
        self.data.build_gaz_alphabet(self.test_file)
        self.data.fix_alphabet()
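# A minimal driver for the class above (a sketch, not part of the original
# script): it assumes the corpus and embedding paths hard-coded in __init__
# exist on disk, and only calls the public methods defined above.
if __name__ == '__main__':
    trainer = Train_Model()   # builds alphabets, gazetteer and pretrained embeddings
    trainer.train(epochs=10)  # trains, periodically tags a demo sentence, saves checkpoints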
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Low Resource NER via Cross-lingual Knowledge-Transfer')
    parser = init_parser(parser)
    substring_file_prefix = 'ned'
    args = parser.parse_args()

    data = Data()
    data.read_config(args.config)
    data.read_arg(args)
    data.show_data_summary()

    data.initial_feature_alphabets()
    data.build_alphabet(data.train_dir)
    data.build_alphabet(data.dev_dir)
    data.build_alphabet(data.test_dir)
    # data.build_alphabet_substring(data.substring_dir, substring_file_prefix)
    if data.use_trans:
        data.build_translation_alphabet(data.trans_dir)
    data.fix_alphabet()
    if data.use_trans:
        data.build_translation_dict(data.trans_dir)
    data.generate_instance('train')
    data.generate_instance('dev')
    data.generate_instance('test')
    data.generate_instance_substring(substring_file_prefix)
    data.build_pretrain_emb()
from utils.data import Data

data = Data()
debug_file = "debug.txt"
gaz_file = "data/ctb.50d.vec"
data.build_alphabet(debug_file)
data.build_gaz_file(gaz_file)
data.build_gaz_alphabet(debug_file)
data.fix_alphabet()
data.generate_instance_with_gaz(debug_file, 'train')
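# A quick sanity check (a sketch), assuming, as in the training class above,
# that generate_instance_with_gaz populates data.train_Ids.
print('loaded %d debug sentences' % len(data.train_Ids))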