def map_string_2_id_open(string_list, name):
    string_id_list = []
    alphabet_string = Alphabet(name)
    for strings in string_list:
        ids = []
        for string in strings:
            id = alphabet_string.get_index(string)
            ids.append(id)
        string_id_list.append(ids)
    alphabet_string.close()
    return string_id_list, alphabet_string
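
A minimal usage sketch (hypothetical label data; assumes the Alphabet class imported by this module, as used above):
def _demo_map_string_2_id_open():
    # Each inner list is one sentence's worth of label strings.
    label_strings = [["B-PER", "I-PER", "O"], ["O", "B-LOC"]]
    label_ids, label_alphabet = map_string_2_id_open(label_strings, "label")
    # label_ids is a nested list of integer ids parallel to label_strings;
    # label_alphabet is the (now closed) Alphabet built along the way.
    print(label_ids)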
Example #2
def str_to_id():
    global alphabet_pos, alphabet_chunk, alphabet_tag

    alphabet_pos = Alphabet('pos')
    train_pos_id = map_string_2_id(train_pos, alphabet_pos)
    alphabet_pos.close()
    dev_pos_id = map_string_2_id(dev_pos, alphabet_pos)
    test_pos_id = map_string_2_id(test_pos, alphabet_pos)

    alphabet_chunk = Alphabet('chunk')
    train_chunk_id = map_string_2_id(train_chunk, alphabet_chunk)
    alphabet_chunk.close()
    dev_chunk_id = map_string_2_id(dev_chunk, alphabet_chunk)
    test_chunk_id = map_string_2_id(test_chunk, alphabet_chunk)

    alphabet_tag = Alphabet('tag')
    train_tag_id = map_string_2_id(train_tag, alphabet_tag)
    alphabet_tag.close()
    dev_tag_id = map_string_2_id(dev_tag, alphabet_tag)
    test_tag_id = map_string_2_id(test_tag, alphabet_tag)
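
map_string_2_id itself is not shown in this example; a minimal sketch of what it presumably does, given map_string_2_id_open above (an assumption, not the original implementation):
def map_string_2_id(string_list, alphabet):
    # Map nested strings to ids against an existing (possibly closed) alphabet.
    string_id_list = []
    for strings in string_list:
        string_id_list.append([alphabet.get_index(s) for s in strings])
    return string_id_list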
Example #3
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.feature_name2id = {}


        self.label_alphabet = Alphabet('label',True)
        self.tagScheme = "NoSeg" ## BMES/BIO
        
        self.seg = True

        ### I/O
        self.train_dir = None 
        self.dev_dir = None 
        self.test_dir = None


        self.model_dir = None ## model save  file


        self.word_emb_dir = None 
        self.char_emb_dir = None
        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []


        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []


        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        ###Networks
        self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None
        self.use_crf = True
        self.nbest = None
        
        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"; read by show_data_summary below

        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

        # both
        self.full_data = False
        self.tune_wordemb = False

        # relation
        self.pretrain = None
        self.max_seq_len = 500
        self.pad_idx = 1
        self.sent_window = 3
        self.output = None
        self.unk_ratio = 1
        self.seq_feature_size = 256
        self.max_epoch = 100
        self.feature_extractor = None

        self.re_feature_name = []
        self.re_feature_name2id = {}
        self.re_feature_alphabets = []
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feat_config = None

        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []

        
    def show_data_summary(self):
        print("++"*50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s"%(self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s"%(self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s"%(self.number_normalized))
        print("     Word  alphabet size: %s"%(self.word_alphabet_size))
        print("     Char  alphabet size: %s"%(self.char_alphabet_size))
        print("     Label alphabet size: %s"%(self.label_alphabet_size))
        print("     Word embedding  dir: %s"%(self.word_emb_dir))
        print("     Char embedding  dir: %s"%(self.char_emb_dir))
        print("     Word embedding size: %s"%(self.word_emb_dim))
        print("     Char embedding size: %s"%(self.char_emb_dim))
        print("     Norm   word     emb: %s"%(self.norm_word_emb))
        print("     Norm   char     emb: %s"%(self.norm_char_emb))
        print("     Train  file directory: %s"%(self.train_dir))
        print("     Dev    file directory: %s"%(self.dev_dir))
        print("     Test   file directory: %s"%(self.test_dir))


        print("     Model  file directory: %s"%(self.model_dir))


        print("     Train instance number: %s"%(len(self.train_texts)))
        print("     Dev   instance number: %s"%(len(self.dev_texts)))
        print("     Test  instance number: %s"%(len(self.test_texts)))

        print("     FEATURE num: %s"%(self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding  dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print("         Fe: %s  embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx]))
        # for k, v in self.feat_config.items():
        #     print("         Feature: %s, size %s, norm %s, dir %s"%(k, v['emb_size'], v['emb_norm'], v['emb_dir']))

        print(" "+"++"*20)
        print(" Model Network:")
        print("     Model        use_crf: %s"%(self.use_crf))
        print("     Model word extractor: %s"%(self.word_feature_extractor))
        print("     Model       use_char: %s"%(self.use_char))
        if self.use_char:
            print("     Model char extractor: %s"%(self.char_feature_extractor))
            print("     Model char_hidden_dim: %s"%(self.HP_char_hidden_dim))
        print(" "+"++"*20)
        print(" Training:")
        print("     Optimizer: %s"%(self.optimizer))
        print("     Iteration: %s"%(self.HP_iteration))
        print("     BatchSize: %s"%(self.HP_batch_size))
        print("     Average  batch   loss: %s"%(self.average_batch_loss))

        print(" "+"++"*20)
        print(" Hyperparameters:")
        
        print("     Hyper              lr: %s"%(self.HP_lr))
        print("     Hyper        lr_decay: %s"%(self.HP_lr_decay))
        print("     Hyper         HP_clip: %s"%(self.HP_clip))
        print("     Hyper        momentum: %s"%(self.HP_momentum))
        print("     Hyper              l2: %s"%(self.HP_l2))
        print("     Hyper      hidden_dim: %s"%(self.HP_hidden_dim))
        print("     Hyper         dropout: %s"%(self.HP_dropout))
        print("     Hyper      lstm_layer: %s"%(self.HP_lstm_layer))
        print("     Hyper          bilstm: %s"%(self.HP_bilstm))
        print("     Hyper             GPU: %s"%(self.HP_gpu))
        print("     Hyper             NBEST: %s"%(self.nbest))

        print(" " + "++" * 20)
        print(" Both:")

        print("     full data: %s" % (self.full_data))
        print("     Tune  word embeddings: %s" % (self.tune_wordemb))

        print(" "+"++"*20)
        print(" Relation:")

        print("     Pretrain directory: %s" % (self.pretrain))
        print("     max sequence length: %s" % (self.max_seq_len))
        print("     pad index: %s" % (self.pad_idx))
        print("     sentence window: %s" % (self.sent_window))
        print("     Output directory: %s" % (self.output))
        print("     The ratio using negative instnaces 0~1: %s" % (self.unk_ratio))
        print("     Size of seqeuence feature representation: %s" % (self.seq_feature_size))
        print("     Iteration for relation training: %s" % (self.max_epoch))
        print("     feature_extractor: %s" % (self.feature_extractor))

        print("     RE FEATURE num: %s"%(self.re_feature_num))
        for idx in range(self.re_feature_num):
            print("         Fe: %s  alphabet  size: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding  dir: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_emb_dirs[idx]))
            print("         Fe: %s  embedding size: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s"%(self.re_feature_alphabets[idx].name, self.re_norm_feature_embs[idx]))

        print("     RE Train instance number: %s"%(len(self.re_train_Y)))
        print("     RE Dev   instance number: %s"%(len(self.re_dev_Y)))
        print("     RE Test  instance number: %s"%(len(self.re_test_Y)))

        print("DATA SUMMARY END.")
        print("++"*50)
        sys.stdout.flush()


    def initial_feature_alphabets(self, input_file):
        items = open(input_file,'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            id = 0
            for idx in range(1, total_column-1):
                feature_prefix = items[idx].split(']',1)[0]+"]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                self.feature_name2id[feature_prefix] = id
                id += 1
                print "Find feature: ", feature_prefix 
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None]*self.feature_num
        self.feature_emb_dims = [20]*self.feature_num
        self.feature_emb_dirs = [None]*self.feature_num 
        self.norm_feature_embs = [False]*self.feature_num
        self.feature_alphabet_sizes = [0]*self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm']
        # exit(0)
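    # Expected first line of a training file (an assumption, inferred from the
    # parsing above): column 0 is the word, the last column is the label, and each
    # middle column is "[FEATURE_NAME]value", e.g.
    #     The [POS]DT [Cap]0 O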


    def build_alphabet(self, input_file):
        in_lines = open(input_file,'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet 
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx+1].split(']',1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()
        startS = False
        startB = False
        for label,_ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"


    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close() 
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def initial_re_feature_alphabets(self):
        id = 0
        for k, v in self.re_feat_config.items():
            self.re_feature_alphabets.append(Alphabet(k))
            self.re_feature_name.append(k)
            self.re_feature_name2id[k] = id
            id += 1

        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_pretrain_feature_embeddings = [None]*self.re_feature_num
        self.re_feature_emb_dims = [20]*self.re_feature_num
        self.re_feature_emb_dirs = [None]*self.re_feature_num
        self.re_norm_feature_embs = [False]*self.re_feature_num
        self.re_feature_alphabet_sizes = [0]*self.re_feature_num
        if self.re_feat_config:
            for idx in range(self.re_feature_num):
                if self.re_feature_name[idx] in self.re_feat_config:
                    self.re_feature_emb_dims[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_size']
                    self.re_feature_emb_dirs[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_dir']
                    self.re_norm_feature_embs[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_norm']


    def build_re_feature_alphabets(self, tokens, entities, relations):

        entity_type_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_TYPE]']]
        entity_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY]']]
        relation_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[RELATION]']]
        token_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[TOKEN_NUM]']]
        entity_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_NUM]']]
        position_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[POSITION]']]

        for i, doc_token in enumerate(tokens):

            doc_entity = entities[i]
            doc_relation = relations[i]

            sent_idx = 0
            sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]
            while sentence.shape[0] != 0:

                entities_in_sentence = doc_entity[(doc_entity['sent_idx'] == sent_idx)]
                for _, entity in entities_in_sentence.iterrows():
                    entity_type_alphabet.add(entity['type'])
                    tk_idx = entity['tf_start']
                    while tk_idx <= entity['tf_end']:
                        entity_alphabet.add(
                            my_utils1.normalizeWord(sentence.iloc[tk_idx, 0]))  # assume 'text' is column 0
                        tk_idx += 1

                sent_idx += 1
                sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

            for _, relation in doc_relation.iterrows():
                relation_alphabet.add(relation['type'])


        for i in range(self.max_seq_len):
            token_num_alphabet.add(i)
            entity_num_alphabet.add(i)
            position_alphabet.add(i)
            position_alphabet.add(-i)


        for idx in range(self.re_feature_num):
            self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[idx].size()


    def fix_re_alphabet(self):
        for alphabet in self.re_feature_alphabets:
            alphabet.close()


    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print("Load pretrained feature %s embedding:, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx])

    def build_re_pretrain_emb(self):
        for idx in range(self.re_feature_num):
            if self.re_feature_emb_dirs[idx]:
                print("Load pretrained re feature %s embedding:, norm: %s, dir: %s" % (self.re_feature_name[idx], self.re_norm_feature_embs[idx], self.re_feature_emb_dirs[idx]))
                self.re_pretrain_feature_embeddings[idx], self.re_feature_emb_dims[idx] = build_pretrain_embedding(
                    self.re_feature_emb_dirs[idx], self.re_feature_alphabets[idx], self.re_feature_emb_dims[idx],
                    self.re_norm_feature_embs[idx])

    def generate_instance(self, name, input_file):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name))



    def generate_re_instance(self, name, tokens, entities, relations, names):
        self.fix_re_alphabet()
        if name == "train":
            self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self)
        elif name == "dev":
            self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self)
        elif name == "test":
            self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name))


    def load(self,data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self,save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()
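    # Usage sketch (hypothetical file name): save() pickles the whole __dict__,
    # so a later load() restores every alphabet and setting of this Data object.
    #     data = Data()
    #     data.save('data.dset')
    #     restored = Data()
    #     restored.load('data.dset')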




    def read_config(self,config_file):
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]


        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]


        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]


        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])


        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_feature_extractor = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item] ## feat_config is a dict 






        ## read training setting:
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])


        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config:
            self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.HP_bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = int(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

        # both
        the_item = 'full_data'
        if the_item in config:
            self.full_data = str2bool(config[the_item])

        the_item = 'tune_wordemb'
        if the_item in config:
            self.tune_wordemb = str2bool(config[the_item])

        # relation
        the_item = 'pretrain'
        if the_item in config:
            self.pretrain = config[the_item]

        the_item = 'max_seq_len'
        if the_item in config:
            self.max_seq_len = int(config[the_item])

        the_item = 'pad_idx'
        if the_item in config:
            self.pad_idx = int(config[the_item])

        the_item = 'sent_window'
        if the_item in config:
            self.sent_window = int(config[the_item])

        the_item = 'output'
        if the_item in config:
            self.output = config[the_item]

        the_item = 'unk_ratio'
        if the_item in config:
            self.unk_ratio = float(config[the_item])

        the_item = 'seq_feature_size'
        if the_item in config:
            self.seq_feature_size = int(config[the_item])

        the_item = 'max_epoch'
        if the_item in config:
            self.max_epoch = int(config[the_item])

        the_item = 'feature_extractor'
        if the_item in config:
            self.feature_extractor = config[the_item]

        the_item = 're_feature'
        if the_item in config:
            self.re_feat_config = config[the_item] ## feat_config is a dict
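
A minimal end-to-end usage sketch (hypothetical config path; config_file_to_dict is assumed to return a plain key/value dict, which is how read_config consumes it above):
def _demo_read_config():
    data = Data()
    data.read_config('demo.train.config')  # hypothetical file of key=value lines
    data.show_data_summary()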
Example #4
                        break

        training_instances_fp.close()

    elif opt.whattodo == 4:  # train vsm on candidates

        datapoints_train = load_dataponts(opt.train_file)
        datapoints_test = load_dataponts(opt.test_file)
        # datapoints_train = load_dataponts('training_instances_debug.txt')
        # datapoints_test = load_dataponts('test_instances_debug.txt')

        word_alphabet = Alphabet('word')

        build_alphabet(word_alphabet, datapoints_train)
        build_alphabet(word_alphabet, datapoints_test)
        word_alphabet.close()

        if d.config.get('norm_emb') is not None:
            logging.info("load pretrained word embedding ...")
            pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
                d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim,
                False)
            word_embedding = nn.Embedding(word_alphabet.size(),
                                          word_emb_dim,
                                          padding_idx=0)
            word_embedding.weight.data.copy_(
                torch.from_numpy(pretrain_word_embedding))
            embedding_dim = word_emb_dim
        else:
            logging.info("randomly initialize word embedding ...")
            word_embedding = nn.Embedding(word_alphabet.size(),
Example #5
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None


        self.label_alphabet = Alphabet('label',True)
        self.tagScheme = "NoSeg" ## BMES/BIO
        
        self.seg = True

        ### I/O
        self.train_dir = None 
        self.dev_dir = None 
        self.test_dir = None 
        self.raw_dir = None

        self.decode_dir = None
        self.dset_dir = None ## data vocabulary related file
        self.model_dir = None ## model save  file
        self.load_model_dir = None ## model load file

        self.word_emb_dir = None 
        self.char_emb_dir = None
        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        ###Networks
        self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None
        self.use_crf = True
        self.nbest = None
        
        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"
        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8
        
    def show_data_summary(self):
        print("++"*50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s"%(self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s"%(self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s"%(self.number_normalized))
        print("     Word  alphabet size: %s"%(self.word_alphabet_size))
        print("     Char  alphabet size: %s"%(self.char_alphabet_size))
        print("     Label alphabet size: %s"%(self.label_alphabet_size))
        print("     Word embedding  dir: %s"%(self.word_emb_dir))
        print("     Char embedding  dir: %s"%(self.char_emb_dir))
        print("     Word embedding size: %s"%(self.word_emb_dim))
        print("     Char embedding size: %s"%(self.char_emb_dim))
        print("     Norm   word     emb: %s"%(self.norm_word_emb))
        print("     Norm   char     emb: %s"%(self.norm_char_emb))
        print("     Train  file directory: %s"%(self.train_dir))
        print("     Dev    file directory: %s"%(self.dev_dir))
        print("     Test   file directory: %s"%(self.test_dir))
        print("     Raw    file directory: %s"%(self.raw_dir))
        print("     Dset   file directory: %s"%(self.dset_dir))
        print("     Model  file directory: %s"%(self.model_dir))
        print("     Loadmodel   directory: %s"%(self.load_model_dir))
        print("     Decode file directory: %s"%(self.decode_dir))
        print("     Train instance number: %s"%(len(self.train_texts)))
        print("     Dev   instance number: %s"%(len(self.dev_texts)))
        print("     Test  instance number: %s"%(len(self.test_texts)))
        print("     Raw   instance number: %s"%(len(self.raw_texts)))
        print("     FEATURE num: %s"%(self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding  dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print("         Fe: %s  embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx]))
        print(" "+"++"*20)
        print(" Model Network:")
        print("     Model        use_crf: %s"%(self.use_crf))
        print("     Model word extractor: %s"%(self.word_feature_extractor))
        print("     Model       use_char: %s"%(self.use_char))
        if self.use_char:
            print("     Model char extractor: %s"%(self.char_feature_extractor))
            print("     Model char_hidden_dim: %s"%(self.HP_char_hidden_dim))
        print(" "+"++"*20)
        print(" Training:")
        print("     Optimizer: %s"%(self.optimizer))
        print("     Iteration: %s"%(self.HP_iteration))
        print("     BatchSize: %s"%(self.HP_batch_size))
        print("     Average  batch   loss: %s"%(self.average_batch_loss))

        print(" "+"++"*20)
        print(" Hyperparameters:")
        
        print("     Hyper              lr: %s"%(self.HP_lr))
        print("     Hyper        lr_decay: %s"%(self.HP_lr_decay))
        print("     Hyper         HP_clip: %s"%(self.HP_clip))
        print("     Hyper        momentum: %s"%(self.HP_momentum))
        print("     Hyper              l2: %s"%(self.HP_l2))
        print("     Hyper      hidden_dim: %s"%(self.HP_hidden_dim))
        print("     Hyper         dropout: %s"%(self.HP_dropout))
        print("     Hyper      lstm_layer: %s"%(self.HP_lstm_layer))
        print("     Hyper          bilstm: %s"%(self.HP_bilstm))
        print("     Hyper             GPU: %s"%(self.HP_gpu))   
        print("DATA SUMMARY END.")
        print("++"*50)
        sys.stdout.flush()


    def initial_feature_alphabets(self):
        items = open(self.train_dir,'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column-1):
                feature_prefix = items[idx].split(']',1)[0]+"]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                print "Find feature: ", feature_prefix 
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None]*self.feature_num
        self.feature_emb_dims = [20]*self.feature_num
        self.feature_emb_dirs = [None]*self.feature_num 
        self.norm_feature_embs = [False]*self.feature_num
        self.feature_alphabet_sizes = [0]*self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm']
        # exit(0)


    def build_alphabet(self, input_file):
        in_lines = open(input_file,'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet 
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx+1].split(']',1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()
        startS = False
        startB = False
        for label,_ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"


    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close() 
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()      


    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print("Load pretrained feature %s embedding:, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx])


    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(self.raw_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name))


    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir,'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
           content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert(sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s"%(name, self.decode_dir))


    def load(self,data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self,save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()



    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir,'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
           content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert(sent_num == len(content_list))
        assert(sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f')+" "
            fout.write(score_string.strip() + "\n")

            for idy in range(sent_length):
                label_string = content_list[idx][0][idy].encode('utf-8') + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy]+" "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s"%(name,nbest, self.decode_dir))


    def read_config(self,config_file):
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'dset_dir'
        if the_item in config:
            self.dset_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]

        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]


        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])


        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_feature_extractor = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item] ## feat_config is a dict 






        ## read training setting:
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]

        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config:
            self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.HP_bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])
Example #6
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.label_alphabet = {0: Alphabet('label', True)}
        self.tagScheme = "NoSeg"  ## BMES/BIO

        self.seg = True

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None

        self.decode_dir = None
        self.dset_dir = None  ## data vocabulary related file
        self.model_dir = None  ## model save  file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.char_emb_dir = None
        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_feature_embeddings = []

        #Added for pretraining
        self.PRETRAINED_ALL = "all"
        self.PRETRAINED_LSTMS = "lstms"
        self.pretrained_model = None
        self.pretrained_part = None

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        #self.label_alphabet_size = 0
        self.label_alphabet_sizes = {0: 0}
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        ###Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_feature_extractor = "CNN"  ## "LSTM"/"CNN"/"GRU"/None
        self.use_crf = True
        self.nbest = None

        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"
        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_feature_default_size = 20
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True

        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

        #D: The number of tasks to be solved
        self.HP_tasks = 1
        self.HP_main_tasks = self.HP_tasks
        self.HP_tasks_weights = [1]

        self.optimize_with_evalb = False
        self.optimize_with_las = False
        self.offset = False
        self.choice_of_best_model = "avg"
        self.language = "English"
        #   self.HP_tasks_inputs = [self.LSTMOUT]

        #Policy Gradient
        self.No_samples = 8
        self.pg_variance_reduce = True
        self.variance_reduce_burn_in = 999
        self.pg_valsteps = 1000
        self.entropy_regularisation = True
        self.entropy_reg_coeff = 0.01

        #Hyper-parameters for disjoint training
        self.train_task_ids = []
        self.dev_task_ids = []
        self.test_task_ids = []
        self.raw_task_ids = []
        self.disjoint = True
        self.datasets = {}
        self.tasks_metrics = {}
        self.HP_tasks_weight_decays = [0]

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        for idtask in self.label_alphabet:
            print("     Label alphabet size for task %s: %s" %
                  (idtask, self.label_alphabet_sizes[idtask]))
        #print("     Label alphabet size: %s"%(self.label_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Char embedding  dir: %s" % (self.char_emb_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))
        print("     Raw    file directory: %s" % (self.raw_dir))
        print("     Dset   file directory: %s" % (self.dset_dir))
        print("     Model  file directory: %s" % (self.model_dir))
        print("     Pretrained model     : %s" % (self.pretrained_model))
        print("     Pretrained part      : %s" % (self.pretrained_part))
        print("     Loadmodel   directory: %s" % (self.load_model_dir))
        print("     Decode file directory: %s" % (self.decode_dir))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding  dir: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s" %
                  (self.feature_alphabets[idx].name,
                   self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model        use_crf: %s" % (self.use_crf))
        print("     Model word extractor: %s" % (self.word_feature_extractor))
        print("     Model       use_char: %s" % (self.use_char))
        if self.use_char:
            print("     Model char extractor: %s" %
                  (self.char_feature_extractor))
            print("     Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))
        print(" " + "++" * 20)
        print(" Training:")
        print("     Optimizer: %s" % (self.optimizer))
        print("     Iteration: %s" % (self.HP_iteration))
        print("     BatchSize: %s" % (self.HP_batch_size))
        print("     Average  batch   loss: %s" % (self.average_batch_loss))

        print(" " + "++" * 20)
        print(" Hyperparameters:")

        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper        lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyper         HP_clip: %s" % (self.HP_clip))
        print("     Hyper        momentum: %s" % (self.HP_momentum))
        print("     Hyper              l2: %s" % (self.HP_l2))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper      lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyper          bilstm: %s" % (self.HP_bilstm))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("     Hyper number of tasks: %s" % (self.HP_tasks))

        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):
        for l in open(self.train_dir, 'r').readlines():
            if not l.startswith("#") and not l.startswith("-BOS-"):
                items = l.strip("\n").split()
                break

        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = items[idx].split(']', 1)[0] + "]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                print "Find feature: ", feature_prefix
        self.feature_num = len(self.feature_alphabets)

        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [self.HP_feature_default_size
                                 ] * self.feature_num
        #self.feature_emb_dims = [20]*self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_norm']

    def build_alphabet(self, input_file):
        sample_corpus = None
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if line.upper().startswith(
                    TREEBANK_LINE
            ):  #Check the treebank this sentence comes from
                sample_corpus = "[" + line.upper().replace(TREEBANK_LINE,
                                                           "").strip() + "]"

            elif len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]

                if self.HP_tasks > 1 or not self.disjoint:  #self.task_config[sample_corpus]["nb_tasks"] > 1:
                    label = parse_multitask_label(label)
                else:
                    label = [label]

                if len(label) != len(
                        self.label_alphabet) and not self.disjoint:
                    raise ValueError(
                        "The number of tasks and the number of labels in the output column do not match"
                    )

                init_label_alp_index = 0 if not self.disjoint else self.task_config[
                    sample_corpus]["idstask"]
                for idtask, l in enumerate(label, init_label_alp_index):
                    #for idtask, l in enumerate(label):
                    self.label_alphabet[idtask].add(l)
                self.word_alphabet.add(word)
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)

                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()

        for idtask in self.label_alphabet:
            self.label_alphabet_sizes[idtask] = self.label_alphabet[
                idtask].size()

        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()

        for idtask in self.label_alphabet:
            startS = False
            startB = False

            for label, _ in self.label_alphabet[idtask].iteritems():
                if "S-" in label.upper():
                    startS = True
                elif "B-" in label.upper():
                    startB = True
            if startB:
                if startS:
                    self.tagScheme = "BMES"
                else:
                    self.tagScheme = "BIO"

    def fix_alphabet(self):
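        # Freeze every alphabet so the vocabularies stop growing; later
        # lookups of unseen entries map to the alphabet's default index.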
        self.word_alphabet.close()
        self.char_alphabet.close()

        for idtask in self.label_alphabet:
            self.label_alphabet[idtask].close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" %
                  (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
                self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" %
                  (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
                self.norm_char_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print(
                    "Load pretrained feature %s embedding:, norm: %s, dir: %s"
                    % (self.feature_name[idx], self.norm_feature_embs[idx],
                       self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[
                    idx] = build_pretrain_embedding(
                        self.feature_emb_dirs[idx],
                        self.feature_alphabets[idx],
                        self.feature_emb_dims[idx],
                        self.norm_feature_embs[idx])

    def generate_instance(self, name):
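        # Build the padded text/id instances for the requested split via
        # read_instance(); alphabets are frozen first so indexing stays stable.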
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instances! Illegal input: %s"
                % (name))

    def write_decoded_results(self, predict_results, name, indexes=None):
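        # Write one token per line: the original input columns separated by
        # tabs, then the predictions of every task joined with "{}", with a
        # blank line between sentences.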
        fout = open(self.decode_dir, 'w')
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        for task_predict_results in predict_results:
            sent_num = len(task_predict_results)
            assert (sent_num == len(content_list))

        for idx in range(sent_num):

            if indexes is not None and idx not in indexes:
                continue

            sent_length = len(
                predict_results[0]
                [idx])  #Index 0 to know the length of the input sentence
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                inputs = []
                for id_input in range(len(content_list[idx]) - 2):
                    if content_list[idx][id_input][0] != []:
                        if type(content_list[idx][id_input][idy]) == type([]):
                            for feature in content_list[idx][id_input][idy]:
                                inputs.append(feature.encode('utf-8'))
                        else:
                            inputs.append(content_list[idx][id_input]
                                          [idy].encode('utf-8'))

                outputs = []
                for task in predict_results:
                    outputs.append(task[idx][idy])

                fout.write("\t".join(inputs) + "\t" + "{}".join(outputs) +
                           '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
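        # Like write_decoded_results, but each sentence is preceded by a "#"
        # line of n-best path scores, and every token line carries the n-best
        # labels of each task separated by commas.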
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )

        for idtask_predict_results, task_predict_results in enumerate(
                predict_results):
            sent_num = len(task_predict_results)
            assert (sent_num == len(content_list))

        for idx in range(sent_num):
            score_string = "# "

            for idtask_predict_results, task_predict_results in enumerate(
                    predict_results):
                sent_length = len(task_predict_results[idx][0])
                nbest = len(task_predict_results[0])

                #Printing the probabilities
                for idz in range(nbest):
                    score_string += format(
                        pred_scores[idtask_predict_results][idx][idz],
                        '.4f') + " "
            fout.write(score_string.strip() + "\t")
            fout.write("\n")

            for idy in range(sent_length):

                label_string = content_list[idx][0][idy].encode('utf-8') + "\t"
                for ifeat in range(len(content_list[idx][1][idy])):
                    label_string += content_list[idx][1][idy][ifeat].encode(
                        'utf-8') + "\t"

                for idtask_predict_results, task_predict_results in enumerate(
                        predict_results):
                    for idz in range(nbest):
                        label_string += task_predict_results[idx][idz][
                            idy] + ","
                    label_string = label_string.strip().strip(",") + "{}"
                fout.write(label_string)
                fout.write('\n')
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" %
              (name, nbest, self.decode_dir))

    def read_config(self, config_file):
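        # Parse the configuration file into a dict and copy every recognised
        # option onto the matching attribute; multi-task settings create one
        # label alphabet per task.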
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'dset_dir'
        if the_item in config:
            self.dset_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]
        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_feature_extractor = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        the_item = 'feature_default_size'
        if the_item in config:
            self.HP_feature_default_size = int(config[the_item])

        ## read training setting:
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]

        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config:
            self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.HP_bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

        #Hyperparameters for auxiliary tasks over the same treebank

        the_item = 'disjoint'
        if the_item in config:
            self.disjoint = str2bool(config[the_item])

        if not self.disjoint:

            the_item = 'tasks'
            if the_item in config:
                self.HP_tasks = int(config[the_item])
                if self.HP_tasks > 1:
                    self.label_alphabet = {
                        idtask: Alphabet('label', True)
                        for idtask in range(self.HP_tasks)
                    }
                    self.label_alphabet_sizes = {
                        idtask: self.label_alphabet[idtask].size()
                        for idtask in range(self.HP_tasks)
                    }

            the_item = "main_tasks"
            if the_item in config:
                self.HP_main_tasks = int(config[the_item])
                print(self.HP_main_tasks, self.HP_tasks)
                if self.HP_main_tasks > self.HP_tasks:
                    raise ValueError(
                        "HP_main_tasks cannot be greater than HP_tasks")

            the_item = 'tasks_weights'
            if the_item in config:
                self.HP_tasks_weights = map(float, config[the_item].split("|"))

        else:
            #Hyperparameters for auxiliary tasks over a different treebank
            the_item = 'dataset'
            if the_item in config:
                self.task_config = config[the_item]  ## feat_config is a dict
                self.HP_tasks = sum([
                    self.task_config[idtask]["nb_tasks"]
                    for idtask in self.task_config
                ])

                self.HP_main_tasks = sum([
                    self.task_config[idtask]["nb_tasks"]
                    for idtask in self.task_config
                    if self.task_config[idtask]["main"]
                ])

                self.label_alphabet = {
                    idtask: Alphabet('label', True)
                    for idtask in range(self.HP_tasks)
                }
                self.label_alphabet_sizes = {
                    idtask: self.label_alphabet[idtask].size()
                    for idtask in range(self.HP_tasks)
                }

                self.HP_tasks_weights = []
                self.HP_tasks_weight_decays = []
                for idtask in self.task_config:
                    for weight in self.task_config[idtask]["weight"]:
                        self.HP_tasks_weights.append(weight)

                    if "weight_decay" in self.task_config[idtask]:
                        for weight_decay in self.task_config[idtask][
                                "weight_decay"]:
                            self.HP_tasks_weight_decays.append(weight_decay)
                    else:
                        for j in range(self.task_config[idtask]["nb_tasks"]):
                            self.HP_tasks_weight_decays.append(0)

                self.dataset_ids = {
                    treebank: range(
                        self.task_config[treebank]["idstask"],
                        self.task_config[treebank]["idstask"] +
                        self.task_config[treebank]["nb_tasks"])
                    for id, treebank in enumerate(self.task_config)
                }

                self.ignore_after_epoch = {
                    treebank: self.task_config[treebank]["ignore_after_epoch"]
                    if "ignore_after_epoch" in self.task_config[treebank] else
                    self.HP_iteration + 1
                    for treebank in self.task_config
                }

                self.inv_dataset_ids = {}
                for tb in self.dataset_ids:
                    for subtask in self.dataset_ids[tb]:
                        self.inv_dataset_ids[subtask] = tb

                self.task_metric = {}
                for dataset in self.task_config:
                    for i in range(
                            self.task_config[dataset]["idstask"],
                            self.task_config[dataset]["idstask"] +
                            self.task_config[dataset]["nb_tasks"]):

                        if "metric" in self.task_config[dataset]:
                            self.task_metric[i] = self.task_config[dataset][
                                "metric"]

        the_item = 'evaluate'
        if the_item in config:
            self.evaluate = config[the_item]

        the_item = "gold_dev_trees"
        if the_item in config:
            self.gold_dev_trees = config[the_item]

        the_item = "gold_dev_dep"
        if the_item in config:
            self.gold_dev_dep = config[the_item]

        the_item = "combine_dependency_offset"
        if the_item in config:
            self.offset = str2bool(config[the_item])

        the_item = "pretrained_model"
        if the_item in config:
            self.pretrained_model = config[the_item]

        the_item = "pretrained_part"
        if the_item in config:
            if config[the_item].lower() not in [
                    self.PRETRAINED_ALL, self.PRETRAINED_LSTMS
            ]:
                raise ValueError(
                    "Invalidad value for pretrained_part (must be 'all' or 'lstms' "
                )
            self.pretrained_part = config[the_item]

        the_item = "optimize_with_las"
        if the_item in config:
            self.optimize_with_las = str2bool(config[the_item])

        the_item = "gold_train_trees"
        if the_item in config:
            self.gold_train_trees = config[the_item]
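
# A minimal usage sketch of the Data class above (assumptions: "demo.train.config"
# is a hypothetical config file whose keys follow read_config(), and the
# train/dev/test paths inside it point to real CoNLL-style column files).
data = Data()
data.read_config("demo.train.config")       # sets train_dir/dev_dir/test_dir, HPs, ...
data.initial_feature_alphabets()            # discover extra feature columns
for path in (data.train_dir, data.dev_dir, data.test_dir):
    data.build_alphabet(path)               # grow word/char/label/feature alphabets
data.fix_alphabet()                         # freeze alphabets before indexing
data.build_pretrain_emb()                   # load pre-trained embeddings if configured
for split in ("train", "dev", "test"):
    data.generate_instance(split)           # build padded id instances per split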
Ejemplo n.º 7
def load_dataset_parsing(train_path,
                         dev_path,
                         test_path,
                         word_column=1,
                         pos_column=4,
                         head_column=6,
                         type_column=7,
                         embedding="word2Vec",
                         embedding_path=None):
    """

    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param pos_column: the column index of pos (start from 0)
    :param head_column: the column index of head (start from 0)
    :param type_column: the column index of types (start from 0)
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :return: X_train, POS_train, Head_train, Type_train, mask_train,
             X_dev, POS_dev, Head_dev, Type_dev, mask_dev,
             X_test, POS_test, Head_test, Type_test, mask_test,
             embedd_table, word_alphabet, pos_alphabet, type_alphabet, C_train, C_dev, C_test, char_embedd_table
    """
    def construct_tensor(word_index_sentences, pos_index_sentences,
                         head_sentences, type_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        POS = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Head = np.empty([len(word_index_sentences), max_length],
                        dtype=np.int32)
        Type = np.empty([len(word_index_sentences), max_length],
                        dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length],
                        dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            pos_ids = pos_index_sentences[i]
            heads = head_sentences[i]
            type_ids = type_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                pid = pos_ids[j]
                head = heads[j]
                tid = type_ids[j]
                X[i, j] = wid
                POS[i, j] = pid - 1
                Head[i, j] = head
                Type[i, j] = tid - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            POS[i, length:] = POS[i, length - 1]
            Head[i, length:] = Head[i, length - 1]
            Type[i, length:] = Type[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, POS, Head, Type, mask

    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \
    word_index_sentences_train, pos_index_sentences_train, \
    type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                    pos_column, head_column, type_column)

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \
    word_index_sentences_dev, pos_index_sentences_dev, \
    type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                  pos_column, head_column, type_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \
    word_index_sentences_test, pos_index_sentences_test, \
    type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                   pos_column, head_column, type_column)

    # close alphabets
    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1))
    logger.info("type alphabet size: %d" % (type_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH,
                     max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, word_alphabet, logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" %
                (embedd_dim, caseless))
    # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length])
    X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor(
        word_index_sentences_train, pos_index_sentences_train,
        head_sentences_train, type_index_sentences_train)

    X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor(
        word_index_sentences_dev, pos_index_sentences_dev, head_sentences_dev,
        type_index_sentences_dev)

    X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor(
        word_index_sentences_test, pos_index_sentences_test,
        head_sentences_test, type_index_sentences_test)

    embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim,
                                      caseless)

    C_train, C_dev, C_test, char_embedd_table = generate_character_data(
        word_sentences_train, word_sentences_dev, word_sentences_test,
        max_length)

    return X_train, POS_train, Head_train, Type_train, mask_train, \
           X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \
           X_test, POS_test, Head_test, Type_test, mask_test, \
           embedd_table, word_alphabet, pos_alphabet, type_alphabet, \
           C_train, C_dev, C_test, char_embedd_table
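
# A hedged usage sketch for load_dataset_parsing; the CoNLL paths and the
# embedding file below are placeholders, and the default column layout
# (word=1, pos=4, head=6, type=7) is assumed.
(X_train, POS_train, Head_train, Type_train, mask_train,
 X_dev, POS_dev, Head_dev, Type_dev, mask_dev,
 X_test, POS_test, Head_test, Type_test, mask_test,
 embedd_table, word_alphabet, pos_alphabet, type_alphabet,
 C_train, C_dev, C_test, char_embedd_table) = load_dataset_parsing(
     "data/train.conll", "data/dev.conll", "data/test.conll",
     embedding="word2Vec", embedding_path="data/vectors.bin")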
Ejemplo n.º 8
def generate_character_data(sentences_train,
                            sentences_dev,
                            sentences_test,
                            max_sent_length,
                            char_embedd_dim=80):
    """
    generate data for characters
    :param sentences_train:
    :param sentences_dev:
    :param sentences_test:
    :param max_sent_length:
    :return: C_train, C_dev, C_test, char_embedd_table
    """
    def get_character_indexes(sentences):
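        # Map every word to a list of character ids (words are truncated to
        # MAX_CHAR_LENGTH) and track the length of the longest word seen.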
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)

                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)

                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length],
                     dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)

        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    cid = chars[k]
                    C[i, j, k] = cid
                # fill index of word end after the end of word
                C[i, j, char_length:] = word_end_id
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
        return C

    def build_char_embedd_table():
        logger.info('Dimension of char embedding is ' +
                    str(char_embedd_dim))
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(
            -scale, scale, [char_alphabet.size(), char_embedd_dim]).astype(
                theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)

    index_sentences_train, max_char_length_train = get_character_indexes(
        sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(
        sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(
        sentences_test)

    # close character alphabet
    char_alphabet.close()
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))

    max_char_length = min(
        MAX_CHAR_LENGTH,
        max(max_char_length_train, max_char_length_dev, max_char_length_test))
    logger.info("Maximum character length of training set is %d" %
                max_char_length_train)
    logger.info("Maximum character length of dev set is %d" %
                max_char_length_dev)
    logger.info("Maximum character length of test set is %d" %
                max_char_length_test)
    logger.info("Maximum character length used for training is %d" %
                max_char_length)

    # fill character tensor
    C_train = construct_tensor_char(index_sentences_train)
    C_dev = construct_tensor_char(index_sentences_dev)
    C_test = construct_tensor_char(index_sentences_test)

    return C_train, C_dev, C_test, build_char_embedd_table()
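
# Shape sketch only (assumes the X_train/C_train tensors produced by the
# loaders above and numpy fancy indexing): the character tensor aligns with
# the word tensor on its first two axes, and the embedding table can be
# gathered into per-character vectors.
# X_train:            [n_sent, max_sent_length]                   word ids
# C_train:            [n_sent, max_sent_length, max_char_length]  char ids
# char_embedd_table:  [char_alphabet_size, char_embedd_dim]
assert C_train.shape[:2] == X_train.shape
char_vectors = char_embedd_table[C_train]   # [n_sent, sent_len, char_len, emb_dim]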
Ejemplo n.º 9
def load_dataset_sequence_labeling(train_path, dev_path, test_path, word_column=1, label_column=4,
                                   label_name='pos', oov='embedding', fine_tune=False, embedding="word2Vec",
                                   embedding_path=None,
                                   use_character=False):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param label_column: the column of label (start from 0)
    :param label_name: name of label, such as pos or ner
    :param oov: embedding for oov word, choose from ['random', 'embedding']. If "embedding", then add words in dev and
                test data to alphabet; if "random", not.
    :param fine_tune: if fine tune word embeddings.
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :param use_character: if use character embeddings.
    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
            embedd_table (if fine tune), label_alphabet, C_train, C_dev, C_test, char_embedd_table
    """

    def get_max_length(word_sentences):
        max_len = 0
        for sentence in word_sentences:
            length = len(sentence)
            if length > max_len:
                max_len = length
        return max_len

    def construct_tensor_fine_tune(word_index_sentences, label_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Y = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                label = label_ids[j]
                X[i, j] = wid
                Y[i, j] = label - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, Y, mask

    def build_embedd_table(embedd_dict, embedd_dim, caseless):
        scale = np.sqrt(3.0 / embedd_dim)
        embedd_table = np.empty([word_alphabet.size(), embedd_dim], dtype=theano.config.floatX)
        embedd_table[word_alphabet.default_index, :] = np.random.uniform(-scale, scale, [1, embedd_dim])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedd = embedd_dict[ww] if ww in embedd_dict else np.random.uniform(-scale, scale, [1, embedd_dim])
            embedd_table[index, :] = embedd
        return embedd_table

    def generate_dataset_fine_tune():
        """
        generate data tensor when fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                           logger)
        logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))
        # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(word_index_sentences_train,
                                                                  label_index_sentences_train)
        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(word_index_sentences_dev, label_index_sentences_dev)
        X_test, Y_test, mask_test = construct_tensor_fine_tune(word_index_sentences_test, label_index_sentences_test)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev,
                                                                            word_sentences_test,
                                                                            max_length) if use_character else (
            None, None, None, None)
        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               build_embedd_table(embedd_dict, embedd_dim, caseless), label_alphabet, \
               C_train, C_dev, C_test, char_embedd_table

    def construct_tensor_not_fine_tune(word_sentences, label_index_sentences, unknown_embedd, embedd_dict,
                                       embedd_dim, caseless):
        X = np.empty([len(word_sentences), max_length, embedd_dim], dtype=theano.config.floatX)
        Y = np.empty([len(word_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_sentences), max_length], dtype=theano.config.floatX)

        # bad_dict = dict()
        # bad_num = 0
        for i in range(len(word_sentences)):
            words = word_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(words)
            for j in range(length):
                word = words[j].lower() if caseless else words[j]
                label = label_ids[j]
                embedd = embedd_dict[word] if word in embedd_dict else unknown_embedd
                X[i, j, :] = embedd
                Y[i, j] = label - 1

                # if word not in embedd_dict:
                #     bad_num += 1
                #     if word in bad_dict:
                #         bad_dict[word] += 1
                #     else:
                #         bad_dict[word] = 1

            # Zero out X after the end of the sequence
            X[i, length:] = np.zeros([1, embedd_dim], dtype=theano.config.floatX)
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1

        # for w, c in bad_dict.items():
        #     if c >= 100:
        #         print "%s: %d" % (w, c)
        # print bad_num

        return X, Y, mask

    def generate_dataset_not_fine_tune():
        """
        generate data tensor when not fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                           logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))

        # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(word_sentences_train,
                                                                      label_index_sentences_train, unknown_embedd,
                                                                      embedd_dict, embedd_dim, caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(word_sentences_dev, label_index_sentences_dev,
                                                                unknown_embedd, embedd_dict, embedd_dim, caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(word_sentences_test, label_index_sentences_test,
                                                                   unknown_embedd, embedd_dict, embedd_dim, caseless)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev,
                                                                            word_sentences_test,
                                                                            max_length) if use_character else (
            None, None, None, None)

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               None, label_alphabet, C_train, C_dev, C_test, char_embedd_table

    word_alphabet = Alphabet('word')
    label_alphabet = Alphabet(label_name)

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = read_conll_sequence_labeling(
        train_path, word_alphabet, label_alphabet, word_column, label_column)

    # if oov is "random" and do not fine tune, close word_alphabet
    if oov == "random" and not fine_tune:
        logger.info("Close word alphabet.")
        word_alphabet.close()

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = read_conll_sequence_labeling(
        dev_path, word_alphabet, label_alphabet, word_column, label_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = read_conll_sequence_labeling(
        test_path, word_alphabet, label_alphabet, word_column, label_column)

    # close alphabets
    word_alphabet.close()
    label_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    if fine_tune:
        logger.info("Generating data with fine tuning...")
        return generate_dataset_fine_tune()
    else:
        logger.info("Generating data without fine tuning...")
        return generate_dataset_not_fine_tune()
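
# A hedged usage sketch for load_dataset_sequence_labeling with fine-tuned
# embeddings; file paths and the embedding location are placeholders.
(X_train, Y_train, mask_train,
 X_dev, Y_dev, mask_dev,
 X_test, Y_test, mask_test,
 embedd_table, label_alphabet,
 C_train, C_dev, C_test, char_embedd_table) = load_dataset_sequence_labeling(
     "data/train.conll", "data/dev.conll", "data/test.conll",
     word_column=1, label_column=4, label_name='pos',
     oov='embedding', fine_tune=True,
     embedding="word2Vec", embedding_path="data/vectors.bin",
     use_character=True)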
Ejemplo n.º 11
def loadDataForSequenceLabeling(train_path,
                                dev_path,
                                test_path,
                                char_emb_dim,
                                word_column=0,
                                label_column=3,
                                label_name='pos',
                                oov='embedding',
                                fine_tune=False,
                                embeddingToUse="glove",
                                embedding_path=None,
                                use_character=True):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param label_column: the column of label (start from 0)
    :param label_name: name of label, such as pos or ner
    :param oov: embedding for oov word, choose from ['random', 'embedding'].
                If "embedding", then add words in dev and
                test data to alphabet; if "random", not.
    :param fine_tune: if fine tune word embeddings.
    :param embeddingToUse: embeddings for words (e.g. 'glove', 'word2vec' or 'senna').
    :param embedding_path: path of file storing word embeddings.
    :param use_character: if use character embeddings.
    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test,
             Y_test, mask_test, embedd_table (if fine tune), label_alphabet,
             C_train, C_dev, C_test, char_embedd_table
    """

    def construct_tensor_fine_tune(word_index_sentences,
                                   label_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Y = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length],
                        dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                label = label_ids[j]
                X[i, j] = wid
                Y[i, j] = label - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, Y, mask

    def construct_orth_tensor_fine_tune(orth_word_index_sentences):
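        # Same zero-padding scheme as construct_tensor_fine_tune, but only for
        # the orthographic word ids; no labels or mask are produced.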
        X = np.empty([len(orth_word_index_sentences), max_length],
                     dtype=np.int32)

        for i in range(len(orth_word_index_sentences)):
            orth_word_ids = orth_word_index_sentences[i]
            length = len(orth_word_ids)
            for j in range(length):
                wid = orth_word_ids[j]
                X[i, j] = wid

            # Zero out X after the end of the sequence
            X[i, length:] = 0
        return X

    def generateDatasetFineTune():
        """
        generate data tensor when fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev,
                 X_test, Y_test, mask_test, embedd_table, label_size
        """

        word_emb_dict, word_emb_dim, caseless = utils.loadEmbeddingsFromFile(
            embeddingToUse,
            embedding_path,
            word_alphabet,
            logger)
        # TODO add a cmd line arg for this
        orth_word_emb_dict, orth_word_emb_dim = \
            utils.randomlyInitialiseOrthographicEmbeddings(orth_word_alphabet,
                                                           logger,
                                                           200)
        logger.info("Dimension of embedding is %d, Caseless: %d" %
                    (word_emb_dim, caseless))
        # fill data tensor (X.shape = [#data, max_length],
        #                   Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(
            word_index_sentences_train,
            label_index_sentences_train)
        X_train_orth = construct_orth_tensor_fine_tune(
            orth_word_index_sentences_train)

        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(
            word_index_sentences_dev,
            label_index_sentences_dev)
        X_dev_orth = construct_orth_tensor_fine_tune(
            orth_word_index_sentences_dev)

        X_test, Y_test, mask_test = construct_tensor_fine_tune(
            word_index_sentences_test,
            label_index_sentences_test)
        X_test_orth = construct_orth_tensor_fine_tune(
            orth_word_index_sentences_test)

        C_train, C_dev, C_test, char_emb_table = generate_character_data(
            word_sentences_train,
            word_sentences_dev,
            word_sentences_test,
            max_length,
            "char",
            30) if use_character else \
            (None, None, None, None)
        orth_C_train, orth_C_dev, orth_C_test, orth_char_emb_table = \
            generate_character_data(orth_word_sentences_train,
                                    orth_word_sentences_dev,
                                    orth_word_sentences_test,
                                    max_length,
                                    "orth_char",
                                    30) if use_character else \
            (None, None, None, None)
        word_emb_table = build_embedd_table(word_alphabet,
                                            word_emb_dict,
                                            word_emb_dim,
                                            caseless)
        orth_word_emb_table = build_embedd_table(orth_word_alphabet,
                                                 orth_word_emb_dict,
                                                 orth_word_emb_dim,
                                                 False)
        return X_train, Y_train, mask_train, X_train_orth, \
            X_dev, Y_dev, mask_dev, X_dev_orth, \
            X_test, Y_test, mask_test, X_test_orth, \
            word_emb_table, word_alphabet, orth_word_emb_table, \
            label_alphabet, \
            C_train, C_dev, C_test, char_emb_table, \
            orth_C_train, orth_C_dev, orth_C_test, orth_char_emb_table

    def construct_tensor_not_fine_tune(word_sentences,
                                       label_index_sentences,
                                       unknown_embedd,
                                       word_emb_dict,
                                       word_emb_dim,
                                       caseless):
        X = np.empty([len(word_sentences), max_length, word_emb_dim],
                     dtype=theano.config.floatX)
        Y = np.empty([len(word_sentences), max_length],
                     dtype=np.int32)
        mask = np.zeros([len(word_sentences), max_length],
                        dtype=theano.config.floatX)

        # bad_dict = dict()
        # bad_num = 0
        for i in range(len(word_sentences)):
            words = word_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(words)
            for j in range(length):
                word = words[j].lower() if caseless else words[j]
                label = label_ids[j]
                embedd = word_emb_dict[word] if word in word_emb_dict \
                    else unknown_embedd
                X[i, j, :] = embedd
                Y[i, j] = label - 1

                # if word not in word_emb_dict:
                #     bad_num += 1
                #     if word in bad_dict:
                #         bad_dict[word] += 1
                #     else:
                #         bad_dict[word] = 1

            # Zero out X after the end of the sequence
            X[i, length:] = np.zeros([1, word_emb_dim],
                                     dtype=theano.config.floatX)
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1

        # for w, c in bad_dict.items():
        #     if c >= 100:
        #         print "%s: %d" % (w, c)
        # print bad_num

        return X, Y, mask

    def generateDatasetWithoutFineTune():
        """
        generate data tensors when not fine-tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev,
                 X_test, Y_test, mask_test, None, label_alphabet,
                 C_train, C_dev, C_test, char_embedd_table
        """

        word_emb_dict, word_emb_dim, caseless = \
            utils.loadEmbeddingsFromFile(embeddingToUse,
                                         embedding_path,
                                         word_alphabet,
                                         logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" % (word_emb_dim,
                                                                    caseless))

        # fill data tensor (X.shape = [#data, max_length, embedding_dim],
        #                   Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, word_emb_dim])
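        # This single random vector is shared by every out-of-vocabulary word
        # below, since embeddings are not fine-tuned in this path.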
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(
            word_sentences_train,
            label_index_sentences_train,
            unknown_embedd,
            word_emb_dict,
            word_emb_dim,
            caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(
            word_sentences_dev,
            label_index_sentences_dev,
            unknown_embedd,
            word_emb_dict,
            word_emb_dim,
            caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(
            word_sentences_test,
            label_index_sentences_test,
            unknown_embedd,
            word_emb_dict,
            word_emb_dim,
            caseless)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train,
            word_sentences_dev,
            word_sentences_test,
            max_length) if use_character else (
            None, None, None, None)

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, \
            Y_test, mask_test, None, label_alphabet, C_train, C_dev, \
            C_test, char_embedd_table

    word_alphabet = Alphabet('word')
    label_alphabet = Alphabet(label_name)
    orth_word_alphabet = Alphabet('word_orth')

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, _, word_index_sentences_train, \
        label_index_sentences_train = readDataForSequenceLabeling(
            train_path,
            word_alphabet,
            label_alphabet,
            word_column,
            label_column)
    orth_word_sentences_train, orth_word_index_sentences_train = \
        readDataForSequenceLabelingOrthographic(train_path, orth_word_alphabet)

    # if oov is "random" and do not fine tune, close word_alphabet
    if oov == "random" and not fine_tune:
        logger.info("Close word alphabet.")
        word_alphabet.close()
        orth_word_alphabet.close()  # TODO: What's this for?

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, _, word_index_sentences_dev, \
        label_index_sentences_dev = readDataForSequenceLabeling(
            dev_path,
            word_alphabet,
            label_alphabet,
            word_column,
            label_column)
    orth_word_sentences_dev, orth_word_index_sentences_dev = \
        readDataForSequenceLabelingOrthographic(
            dev_path,
            orth_word_alphabet)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, \
        label_index_sentences_test = readDataForSequenceLabeling(
            test_path,
            word_alphabet,
            label_alphabet,
            word_column,
            label_column)
    orth_word_sentences_test, orth_word_index_sentences_test = \
        readDataForSequenceLabelingOrthographic(
            test_path,
            orth_word_alphabet)

    # close alphabets
    word_alphabet.close()
    label_alphabet.close()
    orth_word_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))
    logger.info("orthographic word alphabet size: %d" %
                (orth_word_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH,
                     max(max_length_train,
                         max_length_dev,
                         max_length_test))
    logger.info("maximum length of training set: %d" % max_length_train)
    logger.info("maximum length of dev set: %d" % max_length_dev)
    logger.info("maximum length of test set: %d" % max_length_test)
    logger.info("maximum length used for training: %d" % max_length)

    if fine_tune:
        logger.info("generating data with fine tuning...")
        return generateDatasetFineTune()
    else:
        logger.info("generating data without fine tuning...")
        return generateDatasetWithoutFineTune()
Ejemplo n.º 12
0
class Data:
    def __init__(self, input_file):
        self.original_data = open(input_file, 'r').readlines()
        self.index_data = []
        self.word_alphabet = Alphabet('word')
        self.gloss_alphabet = Alphabet('gloss')
        self.entity_alphabet = Alphabet('entity')
        self.gaz_alphabet = Alphabet('gaz')
        self.label_alphabet = Alphabet('label')
        self.word_alphabet_size = 0
        self.gloss_alphabet_size = 0
        self.entity_alphabet_size = 0
        self.gaz_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_gaz_hidden_dim = 50
        self.HP_lstm_hidden_dim = 200
        self.HP_dropout = 0.5
        self.gaz_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = False
        self.HP_use_entity = False
        self.HP_use_gloss = True
        self.HP_use_gaz = False
        self.HP_gpu = True
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
        self.HP_iteration = 100
        # embedding hyperparameter
        self.word_emb_dim = 200
        self.entity_emb_dim = 50
        self.gloss_features = "CNN"  #["CNN","LSTM"]
        self.gloss_emb_dim = 200
        self.gloss_hidden_dim = 300
        self.pretrain_word_embedding = np.array([])
        self.pretrain_gaz_embedding = None
        self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  #"NYM_200.txt"
        self.gaz_embed_path = None
        self.gaz_emb_dim = 200
        self.HP_fix_gaz_emb = True

    def build_alphabet(self):
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            for word in words:
                self.word_alphabet.add(word)

            sentence_gloss = line["babel_gloss"]
            for word_gloss in sentence_gloss:
                for phrase_gloss in word_gloss:  # a word can match several phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss:
                            # for definates in phrase_gloss_EN:
                            # for de_word in definates.split():
                            self.gloss_alphabet.add(de_word)

            entitys = line["entity_context"]
            for entity in entitys:
                self.entity_alphabet.add(entity)

            gazs = line["babel_phase"]
            for gaz in gazs:
                for item in gaz:
                    self.gaz_alphabet.add(item)

            labels = line["detection_label"]
            for label in labels:
                self.label_alphabet.add(label)
        print(self.label_alphabet.get_content())
        self.word_alphabet_size = self.word_alphabet.size()
        self.gloss_alphabet_size = self.gloss_alphabet.size()
        self.entity_alphabet_size = self.entity_alphabet.size()
        self.gaz_alphabet_size = self.gaz_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self.word_alphabet.close()
        self.gloss_alphabet.close()
        self.entity_alphabet.close()
        self.gaz_alphabet.close()
        self.label_alphabet.close()

    def generate_instance_Ids(self):  # map each input sentence to its id sequences
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            words_Id = []
            for word in words:
                words_Id.append(self.word_alphabet.get_index(word))

            sentence_gloss = line["babel_gloss"]
            sentence_glosses_Id = []
            for word_gloss in sentence_gloss:
                word_glosses_Id = []
                for phrase_gloss in word_gloss:  # a word can match several phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]  # this is a list
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss:
                            word_glosses_Id.append(
                                self.gloss_alphabet.get_index(de_word))
                sentence_glosses_Id.append(word_glosses_Id)

            entitys = line["entity_context"]
            entitys_Id = []
            for entity in entitys:
                entitys_Id.append(self.entity_alphabet.get_index(entity))

            gazs = line["babel_phase"]
            # gazs_Id = [[[take over, take over of, ...], [2, 3, ...]],
            #            [[legal, legal procedures, ...], [1, 2, ...]],
            #            ...,
            #            [[open the window, open the window please, ...],
            #             [3, 4, ...]]]
            sentence_gazs_Id = []
            for gaz in gazs:
                word_gazs_Id = []
                Ids = []
                Lens = []
                for item in gaz:
                    Ids.append(self.gaz_alphabet.get_index(item))
                    Lens.append(len(item.split()))
                word_gazs_Id = [Ids, Lens]
                sentence_gazs_Id.append(word_gazs_Id)

            labels = line["detection_label"]
            labels_Id = []
            for label in labels:
                labels_Id.append(self.label_alphabet.get_index(label))
            self.index_data.append([
                words_Id, entitys_Id, sentence_gazs_Id, sentence_glosses_Id,
                labels_Id
            ])

    def load_pretrain_emb(self, embedding_path):
        lines = open(embedding_path, 'r', encoding="utf-8").readlines()
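        # Assumed file layout (word2vec-style text format): the first line is
        # "<vocab_size> <dim>" and every following line is
        # "<token> v1 v2 ... v_dim", e.g. (values hypothetical):
        #   400000 200
        #   the 0.0466 0.2132 ... (200 floats)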
        statistic = lines[0].strip()  # header line: vocabulary size and vector dimension
        # print(statistic)
        embedd_dim = int(statistic.split()[1])
        embedd_dict = dict()
        # the padding token's vector is all zeros
        embedd_dict["<pad>"] = [0.0 for i in range(embedd_dim)]
        # print(len(embedd_dict["<pad>"]))
        for line in lines[1:]:
            line = line.strip()
            if len(line) == 0:
                continue
            tokens = line.split()
            if embedd_dim < 0:
                embedd_dim = len(tokens) - 1
            else:
                assert (embedd_dim + 1 == len(tokens))
            embedd_dict[tokens[0]] = [float(i) for i in tokens[1:]]
        return embedd_dict, embedd_dim

    def norm2one(self, vec):
        if np.sum(vec) == 0:
            return vec
        root_sum_square = np.sqrt(np.sum(np.square(vec)))
        return vec / root_sum_square

    def build_pretrain_embedding(self,
                                 embedding_path,
                                 word_alphabet,
                                 embedd_dim=200,
                                 norm=True):
        embedd_dict = dict()
        if embedding_path != None:
            # load the embedding dictionary
            embedd_dict, embedd_dim = self.load_pretrain_emb(embedding_path)
        scale = np.sqrt(3.0 / embedd_dim)
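        # Out-of-vocabulary rows are drawn from Uniform(-sqrt(3/d), sqrt(3/d)),
        # which has variance 1/d, roughly matching the per-component magnitude
        # of L2-normalised pretrained rows when norm=True.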
        # pretrain_emb is the embedding matrix reordered to follow the alphabet indices
        pretrain_emb = np.zeros([word_alphabet.size(), embedd_dim])
        perfect_match = 0
        case_match = 0
        not_match = 0
        for word, index in word_alphabet.get_alphabet().items():
            if word in embedd_dict:
                # print(word,index)
                # print(len(embedd_dict[word]))
                if norm:
                    pretrain_emb[index] = self.norm2one(embedd_dict[word])
                else:
                    pretrain_emb[index] = embedd_dict[word]
                perfect_match += 1
            elif word.lower() in embedd_dict:
                if norm:
                    pretrain_emb[index] = self.norm2one(
                        embedd_dict[word.lower()])
                else:
                    pretrain_emb[index] = embedd_dict[word.lower()]
                case_match += 1
            else:
                pretrain_emb[index] = np.random.uniform(
                    -scale, scale, [1, embedd_dim])
                not_match += 1
        pretrained_size = len(embedd_dict)
        # print("pad's embedding:",pretrain_emb[word_alphabet.get_index(",")])
        print(
            "Embedding:\n  pretrain word:%s, perfect match:%s, case_match:%s, oov:%s, oov%%:%s"
            % (pretrained_size, perfect_match, case_match, not_match,
               (not_match + 0.) / word_alphabet.size()))
        # pretrain_emb is the embedding matrix reordered by alphabet index;
        # embedd_dim is the vector dimension
        return pretrain_emb, embedd_dim

    def generate_embedding(self):
        self.pretrain_word_embedding, self.word_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.word_alphabet)
        self.pretrain_gloss_embedding, self.gloss_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gloss_alphabet)
        self.pretrain_gaz_embedding, self.gaz_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gaz_alphabet)
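# A minimal usage sketch for the Data class above; the input path is
# hypothetical, and the call order follows the methods defined above
# (alphabets are built and closed before ids are generated):
#
#   data = Data("train.json")
#   data.build_alphabet()          # populate and close every alphabet
#   data.generate_instance_Ids()   # fill data.index_data with id sequences
#   data.generate_embedding()      # load pretrained word/gloss/gaz embeddings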
Ejemplo n.º 13
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        # self.punctuation_filter = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        addLogSectionMark("DATA SUMMARY")
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        # print("     Punctuation  filter: %s" % (self.punctuation_filter))
        print("     Use          bigram: %s" % (self.use_bigram))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Biword alphabet size: %s" % (self.biword_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Gaz   alphabet size: %s" % (self.gaz_alphabet.size()))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Biword embedding size: %s" % (self.biword_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Gaz embedding size: %s" % (self.gaz_emb_dim))
        print("     Norm     word   emb: %s" % (self.norm_word_emb))
        print("     Norm     biword emb: %s" % (self.norm_biword_emb))
        print("     Norm     gaz    emb: %s" % (self.norm_gaz_emb))
        print("     Norm   gaz  dropout: %s" % (self.gaz_dropout))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara  iteration: %s" % (self.HP_iteration))
        print("     Hyperpara  batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara          lr: %s" % (self.HP_lr))
        print("     Hyperpara    lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara     HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara    momentum: %s" % (self.HP_momentum))
        print("     Hyperpara  hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara     dropout: %s" % (self.HP_dropout))
        print("     Hyperpara  lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara      bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara         GPU: %s" % (self.HP_gpu))
        print("     Hyperpara     use_gaz: %s" % (self.HP_use_gaz))
        print("     Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print("     Hyperpara    use_char: %s" % (self.HP_use_char))

        logger.info("     Tag          scheme: %s" % (self.tagScheme))
        logger.info("     MAX SENTENCE LENGTH: %s" %
                    (self.MAX_SENTENCE_LENGTH))
        logger.info("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        logger.info("     Number   normalized: %s" % (self.number_normalized))
        logger.info("     Use          bigram: %s" % (self.use_bigram))
        logger.info("     Word  alphabet size: %s" % (self.word_alphabet_size))
        logger.info("     Biword alphabet size: %s" %
                    (self.biword_alphabet_size))
        logger.info("     Char  alphabet size: %s" % (self.char_alphabet_size))
        logger.info("     Gaz   alphabet size: %s" %
                    (self.gaz_alphabet.size()))
        logger.info("     Label alphabet size: %s" %
                    (self.label_alphabet_size))
        logger.info("     Word embedding size: %s" % (self.word_emb_dim))
        logger.info("     Biword embedding size: %s" % (self.biword_emb_dim))
        logger.info("     Char embedding size: %s" % (self.char_emb_dim))
        logger.info("     Gaz embedding size: %s" % (self.gaz_emb_dim))
        logger.info("     Norm     word   emb: %s" % (self.norm_word_emb))
        logger.info("     Norm     biword emb: %s" % (self.norm_biword_emb))
        logger.info("     Norm     gaz    emb: %s" % (self.norm_gaz_emb))
        logger.info("     Norm   gaz  dropout: %s" % (self.gaz_dropout))
        logger.info("     Train instance number: %s" % (len(self.train_texts)))
        logger.info("     Dev   instance number: %s" % (len(self.dev_texts)))
        logger.info("     Test  instance number: %s" % (len(self.test_texts)))
        logger.info("     Raw   instance number: %s" % (len(self.raw_texts)))
        logger.info("     Hyperpara  iteration: %s" % (self.HP_iteration))
        logger.info("     Hyperpara  batch size: %s" % (self.HP_batch_size))
        logger.info("     Hyperpara          lr: %s" % (self.HP_lr))
        logger.info("     Hyperpara    lr_decay: %s" % (self.HP_lr_decay))
        logger.info("     Hyperpara     HP_clip: %s" % (self.HP_clip))
        logger.info("     Hyperpara    momentum: %s" % (self.HP_momentum))
        logger.info("     Hyperpara  hidden_dim: %s" % (self.HP_hidden_dim))
        logger.info("     Hyperpara     dropout: %s" % (self.HP_dropout))
        logger.info("     Hyperpara  lstm_layer: %s" % (self.HP_lstm_layer))
        logger.info("     Hyperpara      bilstm: %s" % (self.HP_bilstm))
        logger.info("     Hyperpara         GPU: %s" % (self.HP_gpu))
        logger.info("     Hyperpara     use_gaz: %s" % (self.HP_use_gaz))
        logger.info("     Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        logger.info("     Hyperpara    use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print("             Char_features: %s" % (self.char_features))
            logger.info("             Char_features: %s" %
                        (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for idx in xrange(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)

                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[
                        idx + 1].strip().split()[0].decode('utf-8')
                else:
                    biword = word + NULLKEY

                self.biword_alphabet.add(biword)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        ## build gaz file,initial read gaz embedding file
        if gaz_file:
            fins = open(gaz_file, 'r').readlines()
            for fin in fins:
                fin = fin.strip().split()[0].decode('utf-8')
                if fin:
                    self.gaz.insert(fin, "one_source")
            print "Load gaz file: ", gaz_file, " total size:", self.gaz.size()
        else:
            print "Gaz file is None, load nothing"

    def build_gaz_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(
                        word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity),self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print "gaz alphabet size:", self.gaz_alphabet.size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print "build word pretrain emb..."
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print "build biword pretrain emb..."
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim,
            self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print "build gaz pretrain emb..."
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)

        elif name == "sentence":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz_text(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')

            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, output_file))

    def write_decoded_results_back(self, predict_results, name):
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )

        assert (sent_num == len(content_list))
        result = []
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                print(content_list[idx][0][idy].encode('utf-8') + " " +
                      predict_results[idx][idy] + '\n')

        for idx in range(sent_num):
            sent_length = len(predict_results[idx])

            data = {'start': '', 'end': "", 'value': '', 'entity': ''}
            value = ''
            for idy in range(sent_length):
                pre_su_item = predict_results[idx][idy].split('-')
                if pre_su_item[0] == 'S':
                    data['start'] = str(idy)
                    data['end'] = str(idy + 1)
                    data['value'] = content_list[idx][0][idy].encode('utf-8')
                    data['entity'] = pre_su_item[1]
                    result.append(data)
                    data = {'start': '', 'end': "", 'value': '', 'entity': ''}
                if pre_su_item[0] == 'B':
                    data['start'] = str(idy)
                    value = value + (content_list[idx][0][idy].encode('utf-8'))
                if pre_su_item[0] == 'E':
                    value = value + (content_list[idx][0][idy].encode('utf-8'))
                    data['end'] = str(idy + 1)
                    data['value'] = value
                    data['entity'] = pre_su_item[1]
                    result.append(data)
                    data = {'start': '', 'end': "", 'value': '', 'entity': ''}
                    value = ''
                if pre_su_item[0] == 'I':
                    value = value + (content_list[idx][0][idy].encode('utf-8'))

        return result
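    # The list returned above holds one span dict per predicted entity, e.g.
    # (values hypothetical):
    #   [{'start': '3', 'end': '5', 'value': '...', 'entity': 'LOC'}, ...]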

    def write_http_data(self, output_file, inputData, name):
        fout = open(output_file, 'w')
        get_num = len(inputData)

        start = 0
        numOfParagram = int(math.ceil(get_num / 5.0))
        num_start_sentence = start
        num_end_sentence = numOfParagram

        if name == "test":
            num_start_sentence = 0
            num_end_sentence = numOfParagram
        elif name == "dev":
            num_start_sentence = numOfParagram
            num_end_sentence = numOfParagram * 2
        elif name == "train":
            num_start_sentence = numOfParagram * 2
            num_end_sentence = get_num

        for idx in range(num_start_sentence, num_end_sentence):
            text = inputData[idx]["text"]
            entities = inputData[idx]["entities"]

            idText = 1
            inWord = False
            tagReady = False
            entity_name = ''
            for Text in text:
                ## content_list[idx] is a list with [word, char, label]
                tagReady = False

                for entity in entities:
                    if not inWord:
                        if entity['start'] + 1 == entity['end'] and entity[
                                'end'] == idText:
                            fout.write(
                                Text.encode('utf-8') + " " + "S-" +
                                entity['entity'].encode('utf-8') + '\n')
                            tagReady = True
                            break
                        if entity['start'] + 1 == idText:
                            fout.write(
                                Text.encode('utf-8') + " " + "B-" +
                                entity['entity'].encode('utf-8') + '\n')
                            tagReady = True
                            inWord = True
                            entity_name = entity['entity'].encode('utf-8')
                            break
                    else:
                        if entity['end'] == idText:
                            fout.write(
                                Text.encode('utf-8') + " " + "E-" +
                                entity_name + '\n')
                            tagReady = True
                            inWord = False
                            break

                if not tagReady:
                    if not inWord:
                        fout.write(Text.encode('utf-8') + " " + "O" + '\n')
                    else:
                        fout.write(
                            Text.encode('utf-8') + " " + "I-" + entity_name +
                            '\n')

                idText = idText + 1
            fout.write('\n')
        fout.close()

        print("Predict input data has been written into file. %s" %
              (output_file))
Ejemplo n.º 14
0
        build_alphabet(enc_word_alphabet, enc_char_alphabet, dec_word_alphabet,
                       dec_char_alphabet, train_datapoints)
        build_alphabet_1(enc_word_alphabet, enc_char_alphabet,
                         dec_word_alphabet, dec_char_alphabet, dev_datapoints)
        if len(test_documents) != 0:
            build_alphabet_1(enc_word_alphabet, enc_char_alphabet,
                             dec_word_alphabet, dec_char_alphabet,
                             test_datapoints)
        if opt.pretraining:
            build_alphabet(enc_word_alphabet, enc_char_alphabet,
                           dec_word_alphabet, dec_char_alphabet,
                           dict_datapoints)

        if opt.method == 'cla':
            enc_word_alphabet.close()
            if opt.use_char:
                enc_char_alphabet.close()
            position_alphabet = None
        else:
            enc_word_alphabet.close()
            dec_word_alphabet.close()
            if opt.use_char:
                enc_char_alphabet.close()
                dec_char_alphabet.close()

            if opt.context == 'sent':
                position_alphabet = Alphabet('position')
                build_position_alphabet(position_alphabet)
                position_alphabet.close()
            else:
Ejemplo n.º 15
0
class Data:
    def __init__(self, args):

        # Alphabet
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)

        # data
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.input_size = 0

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None

        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0

        # hyper parameters
        self.HP_word_emb_dim = args.embedding_size
        self.HP_char_emb_dim = args.char_embedding_size
        self.HP_iteration = args.max_epoch
        self.HP_batch_size = args.batch_size
        self.HP_char_hidden_dim = args.char_hidden_dim
        self.HP_hidden_dim = args.hidden_size
        self.HP_dropout = args.dropout
        self.HP_char_dropout = args.char_dropout
        self.HP_use_char = True if args.char_encoder else False
        self.HP_char_features = args.char_encoder
        self.HP_gpu = torch.cuda.is_available() and args.gpu
        self.HP_lr = args.lr
        self.HP_model_name = args.model_name
        self.HP_encoder_type = args.encoder
        self.HP_optim = args.optim
        self.HP_number_normalized = args.number_normalized
        self.HP_seed = args.seed
        self.HP_l2 = args.l2
        self.HP_kernel_size = args.kernel_size
        self.HP_kernel_num = args.kernel_num

        # self.HP_lr_decay = 0.05
        # self.HP_clip = None
        # self.HP_momentum = 0
        # self.HP_lstm_layer = 1
        # self.HP_bilstm = True

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Word  alphabet size: %s" % self.word_alphabet_size)
        print("     Char  alphabet size: %s" % self.char_alphabet_size)
        print("     Label alphabet size: %s" % self.label_alphabet_size)
        print("     Word embedding size: %s" % self.HP_word_emb_dim)
        print("     Char embedding size: %s" % self.HP_char_emb_dim)
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Hyper       iteration: %s" % self.HP_iteration)
        print("     Hyper      batch size: %s" % self.HP_batch_size)
        print("     Hyper              lr: %s" % self.HP_lr)
        print("     Hyper      hidden_dim: %s" % self.HP_hidden_dim)
        print("     Hyper         dropout: %s" % self.HP_dropout)
        print("     Hyper             GPU: %s" % self.HP_gpu)
        print("     Hyper        use_char: %s" % self.HP_use_char)
        if self.HP_use_char:
            print("             Char_features: %s" % self.HP_char_features)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            line = line.strip()
            if line:
                pairs = line.strip().split()
                label = pairs[0].strip()
                self.label_alphabet.add(label)
                for word in pairs[2:]:
                    if self.HP_number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()

    def extend_word_char_alphabet(self, input_file_list):
        """
        Extend the word/char alphabets with tokens from additional files.
        :param input_file_list: list of extra data files to scan
        :return: None
        """
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            in_lines = open(input_file, 'r').readlines()
            for line in in_lines:
                line = line.strip()
                if line:
                    pairs = line.strip().split()
                    for word in pairs[2:]:
                        if self.HP_number_normalized:
                            word = normalize_word(word)  # digits in the word are normalized to 0
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print("     old word:%s -> new word:%s" %
              (old_word_size, self.word_alphabet_size))
        print("     old char:%s -> new char:%s" %
              (old_char_size, self.char_alphabet_size))
        for input_file in input_file_list:
            print("     from file:%s" % input_file)

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % name)

    def build_word_pretrain_emb(self, emb_path):
        """
        Load pre-trained word embeddings.
        :param emb_path:
        :return:
        """
        self.pretrain_word_embedding, self.HP_word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.HP_word_emb_dim)

    def build_char_pretrain_emb(self, emb_path):
        """
        Load pre-trained character embeddings.
        :param emb_path: path to the character embedding file
        :return: None
        """

        self.pretrain_char_embedding, self.HP_char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.HP_char_emb_dim)
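# A minimal usage sketch for the Data class above; args is the parsed argparse
# namespace expected by the constructor and the file names are hypothetical:
#
#   data = Data(args)
#   data.build_alphabet("train.txt")
#   data.extend_word_char_alphabet(["dev.txt", "test.txt"])
#   data.generate_instance("train.txt", "train")   # fix_alphabet() is called here
#   data.show_data_summary()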
Ejemplo n.º 16
0
class Data:
    def __init__(self, opt):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)

        self.train_texts = None
        self.train_Ids = None
        self.dev_texts = None
        self.dev_Ids = None
        self.test_texts = None
        self.test_Ids = None

        self.pretrain_word_embedding = None
        self.word_emb_dim = opt.word_emb_dim

        self.config = self.read_config(opt.config)
        self.feat_config = None

        the_item = 'ner_feature'
        if the_item in self.config:
            self.feat_config = self.config[the_item]  ## [POS]:{emb_size:20}
            self.feature_alphabets = []
            self.feature_emb_dims = []
            for k, v in self.feat_config.items():
                self.feature_alphabets.append(Alphabet(k))
                self.feature_emb_dims.append(int(v['emb_size']))

    def clear(self):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.train_texts = None
        self.train_Ids = None
        self.dev_texts = None
        self.dev_Ids = None
        self.test_texts = None
        self.test_Ids = None

        self.pretrain_word_embedding = None

    def build_alphabet(self, data):
        for document in data:
            for sentence in document.sentences:
                for token in sentence:
                    word = token['text']
                    if opt.ner_number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    if token.get('label') is not None:
                        self.label_alphabet.add(token['label'])
                    # try:
                    #     self.label_alphabet.add(token['label'])
                    # except Exception, e:
                    #     print("document id {} {} {}".format(document.name))
                    #     exit()
                    if self.feat_config is not None:
                        for alphabet in self.feature_alphabets:
                            if alphabet.name == '[POS]':
                                alphabet.add(token['pos'])
                            elif alphabet.name == '[Cap]':
                                alphabet.add(token['cap'])

                    for char in word:
                        self.char_alphabet.add(char)

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pk.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pk.dump(self.__dict__, f, 2)
        f.close()

    def read_config(self, config_file):

        config = config_file_to_dict(config_file)
        return config
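# A minimal usage sketch for the Data class above; opt and the document lists
# are assumed to come from the surrounding training script, and the file name
# is hypothetical:
#
#   data = Data(opt)
#   data.build_alphabet(train_documents)
#   data.fix_alphabet()
#   data.save("data.pkl")   # pickle the whole object for later reuse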
Ejemplo n.º 17
0
def create_alphabets(alphabet_directory,
                     data_paths,
                     max_vocabulary_size,
                     normalize_digits=True):
    logger = utils.get_logger("Create Alphabets")
    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')
    if not gfile.Exists(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)

        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        vocab = dict()
        for data_path in data_paths:
            logger.info("Processing data: %s" % data_path)
            with gfile.GFile(data_path, mode="r") as file:
                for line in file:
                    line = line.decode('utf-8')
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    word = DIGIT_RE.sub(
                        b"0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
        logger.info("Type Alphabet Size: %d" % type_alphabet.size())

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]
        for word in vocab_list:
            word_alphabet.add(word)

        word_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)

    else:
        word_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    return word_alphabet, pos_alphabet, type_alphabet
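# A minimal usage sketch for create_alphabets; the directory, data paths and
# vocabulary cap are hypothetical:
#
#   word_alphabet, pos_alphabet, type_alphabet = create_alphabets(
#       "data/alphabets/", ["data/train.conll", "data/dev.conll"],
#       max_vocabulary_size=50000)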
Ejemplo n.º 18
0
def generate_character_data(sentences_train,
                            sentences_dev,
                            sentences_test,
                            max_sent_length,
                            char_embedd_dim=30):
    """
    generate data for characters
    :param sentences_train:
    :param sentences_dev:
    :param sentences_test:
    :param max_sent_length:
    :param char_embedd_dim:
    :return: C_train, C_dev, C_test, char_embedd_table
    """

    def get_character_indexes(sentences):
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)

                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)

                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)

        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    cid = chars[k]
                    C[i, j, k] = cid
                # fill index of word end after the end of word
                C[i, j, char_length:] = word_end_id
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
        return C

    def build_char_embedd_table():
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype(
            theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)

    index_sentences_train, max_char_length_train = get_character_indexes(
        sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(
        sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(
        sentences_test)

    # close character alphabet
    char_alphabet.close()
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))

    max_char_length = min(MAX_CHAR_LENGTH,
                          max(max_char_length_train, max_char_length_dev,
                              max_char_length_test))
    logger.info("Maximum character length of training set is %d" %
                max_char_length_train)
    logger.info("Maximum character length of dev set is %d" %
                max_char_length_dev)
    logger.info("Maximum character length of test set is %d" %
                max_char_length_test)
    logger.info("Maximum character length used for training is %d" %
                max_char_length)

    # fill character tensor
    C_train = construct_tensor_char(index_sentences_train)
    C_dev = construct_tensor_char(index_sentences_dev)
    C_test = construct_tensor_char(index_sentences_test)

    return C_train, C_dev, C_test, build_char_embedd_table()
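# A minimal usage sketch for the function above; the sentence lists are lists
# of token lists and max_length is the padded sentence length computed
# elsewhere:
#
#   C_train, C_dev, C_test, char_embedd_table = generate_character_data(
#       word_sentences_train, word_sentences_dev, word_sentences_test,
#       max_length, char_embedd_dim=30)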
Ejemplo n.º 19
0
def pretrain(opt):

    samples_per_epoch = []
    pregenerated_data = Path(opt.instance_dir)
    for i in range(opt.iter):

        epoch_file = pregenerated_data / f"epoch_{i}.json"
        metrics_file = pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.iter})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = opt.iter

    if opt.gpu >= 0 and torch.cuda.is_available():
        if opt.multi_gpu:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device('cuda', opt.gpu)
            n_gpu = 1
    else:
        device = torch.device("cpu")
        n_gpu = 0

    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    if opt.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(opt.gradient_accumulation_steps))

    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    makedir_and_clear(opt.save)

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir,
                                              do_lower_case=opt.do_lower_case)

    total_train_examples = 0
    for i in range(opt.iter):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / opt.batch_size /
                                       opt.gradient_accumulation_steps)

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(opt.norm_dict)
    logging.info("dict concept number {}".format(len(UMLS_dict)))
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, UMLS_dict)
    dict_alphabet.close()

    # Prepare model
    model, _ = BertForPreTraining.from_pretrained(
        opt.bert_dir, num_norm_labels=get_dict_size(dict_alphabet))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
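    # The two groups above follow the usual BERT recipe: weight decay is
    # applied to all parameters except biases and LayerNorm weights.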

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=opt.lr,
                         warmup=opt.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", opt.batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(opt.iter):
        epoch_dataset = PregeneratedDataset(epoch=epoch,
                                            training_path=pregenerated_data,
                                            tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs,
                                            dict_alphabet=dict_alphabet)
        train_sampler = RandomSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=opt.batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        epoch_start = time.time()
        sum_loss = 0
        sum_original_loss = 0
        num_iter = len(train_dataloader)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next, input_ids_ent, input_mask_ent, norm_label_ids = batch
                loss, original_loss = model(input_ids, segment_ids, input_mask,
                                            lm_label_ids, input_ids_ent,
                                            input_mask_ent, is_next,
                                            norm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    original_loss = original_loss.mean()
                if opt.gradient_accumulation_steps > 1:
                    loss = loss / opt.gradient_accumulation_steps
                    original_loss = original_loss / opt.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * opt.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")

                if (step + 1) % opt.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                sum_loss += loss.item()
                sum_original_loss += original_loss.item()

        epoch_finish = time.time()
        logging.info(
            "epoch: %s training finished. Time: %.2fs. loss: %.4f, original_loss %.4f"
            % (epoch, epoch_finish - epoch_start, sum_loss / num_iter,
               sum_original_loss / num_iter))

        # Save a trained model
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(
            opt.save, "pytorch_model_{}.bin".format(str(epoch + 1)))
        torch.save(model_to_save.state_dict(), str(output_model_file))
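
The options consumed by pretrain (instance_dir, iter, gpu, multi_gpu, gradient_accumulation_steps, batch_size, save, bert_dir, do_lower_case, norm_dict, lr, warmup_proportion) are read from an opt object that is never constructed in this snippet. Below is a minimal driver sketch under that assumption; all flag defaults are illustrative placeholders, not the project's actual settings.

import argparse


def build_opt():
    # Hypothetical option parser; the flag names mirror the opt.* attributes
    # accessed inside pretrain(), and the defaults are placeholders only.
    parser = argparse.ArgumentParser()
    parser.add_argument('--instance_dir', default='pregenerated_data')
    parser.add_argument('--iter', type=int, default=10)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--multi_gpu', action='store_true')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--save', default='./pretrain_output')
    parser.add_argument('--bert_dir', default='bert-base-uncased')
    parser.add_argument('--do_lower_case', action='store_true')
    parser.add_argument('--norm_dict', default='umls/MRCONSO.RRF')
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.1)
    return parser.parse_args()


if __name__ == '__main__':
    pretrain(build_opt())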
Ejemplo n.º 20
def load_dataset_sequence_labeling(train_path,
                                   dev_path,
                                   test_path,
                                   word_column=0,
                                   label_column=1,
                                   label_name='senti',
                                   oov='embedding',
                                   fine_tune=False,
                                   embedding="word2Vec",
                                   embedding_path=None,
                                   use_character=False):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of the word (starting from 0)
    :param label_column: the column index of the label (starting from 0)
    :param label_name: name of the label, such as pos or ner
    :param oov: embedding for OOV words, chosen from ['random', 'embedding']. If "embedding", words in the dev and
                test data are added to the alphabet; if "random", they are not.
    :param fine_tune: whether to fine-tune word embeddings.
    :param embedding: embeddings for words, chosen from ['word2vec', 'senna'].
    :param embedding_path: path of the file storing word embeddings.
    :param use_character: whether to use character embeddings.
    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
            embedd_table (if fine tune), label_alphabet, C_train, C_dev, C_test, char_embedd_table
    """
    def construct_tensor_fine_tune(word_index_sentences,
                                   label_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Y = []
        mask = np.zeros([len(word_index_sentences), max_length],
                        dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                X[i, j] = wid

            label = label_ids[0]
            Y.append(label)

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, Y, mask

    def generate_dataset_fine_tune():
        """
        generate data tensor when fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
            embedding, embedding_path, word_alphabet, logger)
        logger.info("Dimension of embedding is %d, Caseless: %d" %
                    (embedd_dim, caseless))
        # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(
            word_index_sentences_train, label_index_sentences_train)
        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(
            word_index_sentences_dev, label_index_sentences_dev)
        X_test, Y_test, mask_test = construct_tensor_fine_tune(
            word_index_sentences_test, label_index_sentences_test)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length) if use_character else (None, None, None, None)
        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless), label_alphabet, \
               C_train, C_dev, C_test, char_embedd_table

    def construct_tensor_not_fine_tune(word_sentences, label_index_sentences,
                                       unknown_embedd, embedd_dict, embedd_dim,
                                       caseless):
        X = np.empty([len(word_sentences), max_length, embedd_dim],
                     dtype=theano.config.floatX)
        Y = np.empty([len(word_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_sentences), max_length],
                        dtype=theano.config.floatX)

        # bad_dict = dict()
        # bad_num = 0
        for i in range(len(word_sentences)):
            words = word_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(words)
            for j in range(length):
                word = words[j].lower() if caseless else words[j]
                label = label_ids[j]
                embedd = embedd_dict[
                    word] if word in embedd_dict else unknown_embedd
                X[i, j, :] = embedd
                Y[i, j] = label - 1

                # if word not in embedd_dict:
                #     bad_num += 1
                #     if word in bad_dict:
                #         bad_dict[word] += 1
                #     else:
                #         bad_dict[word] = 1

            # Zero out X after the end of the sequence
            X[i, length:] = np.zeros([1, embedd_dim],
                                     dtype=theano.config.floatX)
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1

        # for w, c in bad_dict.items():
        #     if c >= 100:
        #         print "%s: %d" % (w, c)
        # print bad_num

        return X, Y, mask

    def generate_dataset_not_fine_tune():
        """
        generate data tensor when not fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
            embedding, embedding_path, word_alphabet, logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" %
                    (embedd_dim, caseless))

        # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(
            word_sentences_train, label_index_sentences_train, unknown_embedd,
            embedd_dict, embedd_dim, caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(
            word_sentences_dev, label_index_sentences_dev, unknown_embedd,
            embedd_dict, embedd_dim, caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(
            word_sentences_test, label_index_sentences_test, unknown_embedd,
            embedd_dict, embedd_dim, caseless)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length) if use_character else (None, None, None, None)

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               None, label_alphabet, C_train, C_dev, C_test, char_embedd_table

    word_alphabet = Alphabet('word')
    label_alphabet = Alphabet(label_name)

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = read_conll_sequence_labeling(
        train_path, word_alphabet, label_alphabet, word_column, label_column)

    # if oov is "random" and do not fine tune, close word_alphabet
    if oov == "random" and not fine_tune:
        logger.info("Close word alphabet.")
        word_alphabet.close()

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = read_conll_sequence_labeling(
        dev_path, word_alphabet, label_alphabet, word_column, label_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = read_conll_sequence_labeling(
        test_path, word_alphabet, label_alphabet, word_column, label_column)

    # close alphabets
    word_alphabet.close()
    label_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH,
                     max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    if fine_tune:
        logger.info("Generating data with fine tuning...")
        return generate_dataset_fine_tune()
    else:
        logger.info("Generating data without fine tuning...")
        return generate_dataset_not_fine_tune()
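
An illustrative call for reference; the CoNLL-style file paths and the word2vec text file below are placeholders, and the unpacking order follows the docstring above.

# Placeholder paths; only the argument names and return order come from the function above.
(X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev,
 X_test, Y_test, mask_test, embedd_table, label_alphabet,
 C_train, C_dev, C_test, char_embedd_table) = load_dataset_sequence_labeling(
    'data/train.conll', 'data/dev.conll', 'data/test.conll',
    word_column=0, label_column=1, label_name='senti',
    oov='embedding', fine_tune=True,
    embedding='word2vec', embedding_path='emb/word2vec.txt',
    use_character=True)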
Ejemplo n.º 21
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 230
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "BMES"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = False
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 50
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        # hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Use          bigram: %s" % (self.use_bigram))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Biword alphabet size: %s" % (self.biword_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Gaz   alphabet size: %s" % (self.gaz_alphabet.size()))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Biword embedding size: %s" % (self.biword_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Gaz embedding size: %s" % (self.gaz_emb_dim))
        print("     Norm     word   emb: %s" % (self.norm_word_emb))
        print("     Norm     biword emb: %s" % (self.norm_biword_emb))
        print("     Norm     gaz    emb: %s" % (self.norm_gaz_emb))
        print("     Norm   gaz  dropout: %s" % (self.gaz_dropout))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara  iteration: %s" % (self.HP_iteration))
        print("     Hyperpara  batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara          lr: %s" % (self.HP_lr))
        print("     Hyperpara    lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara     HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara    momentum: %s" % (self.HP_momentum))
        print("     Hyperpara  hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara     dropout: %s" % (self.HP_dropout))
        print("     Hyperpara  lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara      bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara         GPU: %s" % (self.HP_gpu))
        print("     Hyperpara     use_gaz: %s" % (self.HP_use_gaz))
        print("     Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print("     Hyperpara    use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print("             Char_features: %s" % (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for idx in xrange(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                # get the label
                label = pairs[-1]
                # add labels in order of appearance
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[
                        idx + 1].strip().split()[0].decode('utf-8')
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        # determine which tagging scheme the labels use: BIO, BMES, or BIOES
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                # if S- labels exist, the scheme is BMES (or BIOES)
                self.tagScheme = "BMES"
            else:
                # otherwise it is BIO
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        # build the gaz trie; initially read entries from the gaz embedding file
        if gaz_file:
            fins = open(gaz_file, 'r').readlines()
            for fin in fins:
                fin = fin.strip().split()[0].decode('utf-8')
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: %s, total size: %s" % (gaz_file, self.gaz.size()))
        else:
            print("Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(
                        word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity),self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print("gaz alphabet size: %s" % self.gaz_alphabet.size())

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_radical_pretrain_emb(self, emb_path):
        print("build radical pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_radical_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim,
            self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                # content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')

            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, output_file))
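
A sketch of how this gazetteer-aware Data class appears to be driven before training; the file names below are placeholders for a character-level corpus and embedding files, not resources shipped with this snippet.

# Assumed preparation flow; every path below is a placeholder.
data = Data()
data.build_alphabet('data/train.char.bmes')
data.build_alphabet('data/dev.char.bmes')
data.build_alphabet('data/test.char.bmes')
data.build_gaz_file('data/gaz_embedding.vec')
data.build_gaz_alphabet('data/train.char.bmes')
data.build_gaz_alphabet('data/dev.char.bmes')
data.build_gaz_alphabet('data/test.char.bmes')
data.fix_alphabet()
data.build_word_pretrain_emb('data/char_embedding.vec')
data.build_gaz_pretrain_emb('data/gaz_embedding.vec')
data.generate_instance_with_gaz('data/train.char.bmes', 'train')
data.generate_instance_with_gaz('data/dev.char.bmes', 'dev')
data.generate_instance_with_gaz('data/test.char.bmes', 'test')
data.show_data_summary()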
Ejemplo n.º 22
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
# read training data
logger.info("Reading data from training set...")
word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = dp.read_conll_sequence_labeling(
    train_path,
    word_alphabet,
    label_alphabet,
    word_column,
    label_column,
    out_dir=out_dir)

# if oov is "random" and do not fine tune, close word_alphabet
if oov == "random" and not fine_tune:
    logger.info("Close word alphabet.")
    word_alphabet.close()

# read dev data
logger.info("Reading data from dev set...")
word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = dp.read_conll_sequence_labeling(
    dev_path, word_alphabet, label_alphabet, word_column, label_column)

# close alphabets: by "close" we mean no more words can be added to the word vocabulary.
# TODO: change this to close the alphabets after the train set alone
word_alphabet.close()
label_alphabet.close()

# we subtract 1 because the zero index is unused; I believe this accounts for the unknown word
logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))
# get maximum length: this is mainly for padding.
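
The two comments above rely on how the Alphabet class behaves: index 0 is reserved (hence the -1 when reporting sizes), and close() freezes the vocabulary. A tiny sketch of that assumed behaviour, using only the methods seen in these snippets:

# Assumed Alphabet semantics, inferred from its use across these examples.
toy = Alphabet('word')
toy.get_index('cat')    # while the alphabet is open, unseen strings get new indices
toy.get_index('dog')
toy.close()             # freeze the vocabulary: no further growth
toy.get_index('zebra')  # presumably maps to the unknown entry instead of a new index
print("real vocabulary size: %d" % (toy.size() - 1))  # -1 for the reserved zero index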
Ejemplo n.º 23
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.number_normalized = True
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.feature_name2id = {}

        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "BMES"

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None

        self.word_emb_dir = None

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.pretrain_word_embedding = None

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        self.nbest = None

        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5

        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_l2 = 1e-8

        # both
        self.full_data = False
        self.tune_wordemb = False

        # relation
        self.max_seq_len = 500
        self.pad_idx = 0
        self.sent_window = 3
        # self.output =None
        self.unk_ratio = 1
        self.seq_feature_size = 256

        self.re_feature_name = []
        self.re_feature_name2id = {}
        self.re_feature_alphabets = []
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feat_config = None
        self.re_feature_emb_dims = []
        self.re_feature_alphabet_sizes = []

        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []

        self.patience = 10

        # self.pretrained_model_dir = None

    def copy_alphabet(self, other):
        self.word_alphabet = copy.deepcopy(other.word_alphabet)
        self.char_alphabet = copy.deepcopy(other.char_alphabet)
        for feature_alphabet in other.feature_alphabets:
            self.feature_alphabets.append(copy.deepcopy(feature_alphabet))

        self.label_alphabet = copy.deepcopy(other.label_alphabet)

        self.feature_name = copy.deepcopy(other.feature_name)
        self.feature_alphabets = copy.deepcopy(other.feature_alphabets)
        self.feature_num = len(self.feature_alphabets)
        self.feature_name2id = copy.deepcopy(other.feature_name2id)
        self.feature_alphabet_sizes = copy.deepcopy(
            other.feature_alphabet_sizes)
        self.feature_emb_dims = copy.deepcopy(other.feature_emb_dims)

        for re_feature_alphabet in other.re_feature_alphabets:
            self.re_feature_alphabets.append(
                copy.deepcopy(re_feature_alphabet))

        self.re_feature_name = copy.deepcopy(other.re_feature_name)
        self.re_feature_name2id = copy.deepcopy(other.re_feature_name2id)
        self.re_feature_alphabets = copy.deepcopy(other.re_feature_alphabets)
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feature_emb_dims = copy.deepcopy(other.re_feature_emb_dims)
        self.re_feature_alphabet_sizes = copy.deepcopy(
            other.re_feature_alphabet_sizes)

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))

        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))

        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))

        print("     Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))

        print("     Iteration: %s" % (self.HP_iteration))
        print("     BatchSize: %s" % (self.HP_batch_size))

        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper              l2: %s" % (self.HP_l2))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("     Hyper             NBEST: %s" % (self.nbest))

        print("     full data: %s" % (self.full_data))
        print("     Tune  word embeddings: %s" % (self.tune_wordemb))

        print("     max sequence length: %s" % (self.max_seq_len))
        print("     pad index: %s" % (self.pad_idx))
        print("     patience: %s" % (self.patience))
        print("     sentence window: %s" % (self.sent_window))
        # print("     Output directory: %s" % (self.output))
        print("     The ratio using negative instnaces 0~1: %s" %
              (self.unk_ratio))
        print("     Size of seqeuence feature representation: %s" %
              (self.seq_feature_size))

        print("     RE FEATURE num: %s" % (self.re_feature_num))
        for idx in range(self.re_feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.re_feature_alphabets[idx].name,
                   self.re_feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding size: %s" %
                  (self.re_feature_alphabets[idx].name,
                   self.re_feature_emb_dims[idx]))

        print("     RE Train instance number: %s" % (len(self.re_train_Y)))
        print("     RE Dev   instance number: %s" % (len(self.re_dev_Y)))
        print("     RE Test  instance number: %s" % (len(self.re_test_Y)))

        # print("     pretrained_model_dir: %s" % (self.pretrained_model_dir))

        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):

        feature_prefix = '[Cap]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 0

        feature_prefix = '[POS]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 1

        self.feature_num = len(self.feature_alphabets)
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_size']

    def build_alphabet(self, documents):
        for doc in documents:
            for sentence in doc:
                for token in sentence:
                    word = token['word']
                    if self.number_normalized:
                        word = normalize_word(word)
                    label = token['label']
                    self.label_alphabet.add(label)
                    self.word_alphabet.add(word)
                    ## build feature alphabet
                    self.feature_alphabets[0].add(token['cap'])
                    self.feature_alphabets[1].add(token['pos'])

                    for char in word:
                        self.char_alphabet.add(char)

        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def open_alphabet(self):
        self.word_alphabet.open()
        self.char_alphabet.open()
        # label not open
        # self.label_alphabet.open()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].open()

    def initial_re_feature_alphabets(self):
        id = 0
        for k, v in self.re_feat_config.items():
            self.re_feature_alphabets.append(Alphabet(k))
            self.re_feature_name.append(k)
            self.re_feature_name2id[k] = id
            id += 1

        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feature_emb_dims = [20] * self.re_feature_num
        self.re_feature_alphabet_sizes = [0] * self.re_feature_num
        if self.re_feat_config:
            for idx in range(self.re_feature_num):
                if self.re_feature_name[idx] in self.re_feat_config:
                    self.re_feature_emb_dims[idx] = self.re_feat_config[
                        self.re_feature_name[idx]]['emb_size']

    def build_re_feature_alphabets(self, tokens, entities, relations):

        entity_type_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY_TYPE]']]
        entity_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY]']]
        relation_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[RELATION]']]
        token_num_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[TOKEN_NUM]']]
        entity_num_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY_NUM]']]
        position_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[POSITION]']]

        for i, doc_token in enumerate(tokens):

            doc_entity = entities[i]
            doc_relation = relations[i]

            sent_idx = 0
            sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]
            while sentence.shape[0] != 0:

                entities_in_sentence = doc_entity[(
                    doc_entity['sent_idx'] == sent_idx)]
                for _, entity in entities_in_sentence.iterrows():
                    entity_type_alphabet.add(entity['type'])
                    tk_idx = entity['tf_start']
                    while tk_idx <= entity['tf_end']:
                        entity_alphabet.add(
                            my_utils1.normalizeWord(sentence.iloc[
                                tk_idx, 0]))  # assume 'text' is in column 0
                        tk_idx += 1

                sent_idx += 1
                sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

            for _, relation in doc_relation.iterrows():
                relation_alphabet.add(relation['type'])

        for i in range(self.max_seq_len):
            token_num_alphabet.add(i)
            entity_num_alphabet.add(i)
            position_alphabet.add(i)
            position_alphabet.add(-i)

        for idx in range(self.re_feature_num):
            self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[
                idx].size()

    def fix_re_alphabet(self):
        for alphabet in self.re_feature_alphabets:
            alphabet.close()

    def open_re_alphabet(self):
        for alphabet in self.re_feature_alphabets:
            if alphabet.name == '[RELATION]':  # label not open
                continue
            alphabet.open()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            logging.info("Load pretrained word embedding, dir: %s" %
                         (self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim)

    def generate_instance(self, name, documents):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            logging.info(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def generate_re_instance(self, name, tokens, entities, relations, names):
        self.fix_re_alphabet()
        if name == "train":
            self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        elif name == "dev":
            self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        elif name == "test":
            self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        else:
            logging.info(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def clear_data(self):
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []

        self.pretrain_word_embedding = None

    def read_config(self, config_file, opt):
        config = config_file_to_dict(config_file)
        ## read data:

        self.train_dir = opt.train_dir

        self.dev_dir = opt.dev_dir

        self.test_dir = opt.test_dir

        self.word_emb_dir = opt.word_emb_file

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])

        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = int(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])

        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

        # both
        the_item = 'full_data'
        if the_item in config:
            self.full_data = str2bool(config[the_item])

        the_item = 'tune_wordemb'
        if the_item in config:
            self.tune_wordemb = str2bool(config[the_item])

        the_item = 'max_seq_len'
        if the_item in config:
            self.max_seq_len = int(config[the_item])

        the_item = 'pad_idx'
        if the_item in config:
            self.pad_idx = int(config[the_item])

        the_item = 'sent_window'
        if the_item in config:
            self.sent_window = int(config[the_item])

        # the_item = 'output'
        # if the_item in config:
        #     self.output = config[the_item]

        the_item = 'unk_ratio'
        if the_item in config:
            self.unk_ratio = float(config[the_item])

        the_item = 'seq_feature_size'
        if the_item in config:
            self.seq_feature_size = int(config[the_item])

        the_item = 're_feature'
        if the_item in config:
            self.re_feat_config = config[the_item]  ## re_feat_config is a dict

        the_item = 'patience'
        if the_item in config:
            self.patience = int(config[the_item])
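
A minimal sketch of how this Data class can be exercised; the toy document below only needs the token keys read in build_alphabet ('word', 'label', 'cap', 'pos'), and every value and file name is a placeholder.

# Hypothetical driver; read_instance and the real corpora are not shown here.
train_docs = [[
    [{'word': 'Aspirin', 'label': 'B-Drug', 'cap': 'Cap', 'pos': 'NNP'},
     {'word': 'helps',   'label': 'O',      'cap': 'Low', 'pos': 'VBZ'}],
]]

d = Data()
d.initial_feature_alphabets()          # registers the [Cap] and [POS] features
d.build_alphabet(train_docs)
d.fix_alphabet()
d.build_pretrain_emb()                 # no-op unless word_emb_dir has been set
d.generate_instance('train', train_docs)
d.save('data.pkl')                     # pickles the whole __dict__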
Ejemplo n.º 24
File: data.py Project: NLP1502/NLP
class Data:
    def __init__(self):
        self.substring_names = ['word', 'pos', 'char', 'bpe', 'word-pos']
        self.substring_maxlen = 10

        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}

        self.feature_names = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None

        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO

        self.seg = True
        ###
        self.task_name = None

        ### I/O
        self.data_bin_dir = None
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.middle_dir = None
        self.viterbi_inputs_model_name = None

        self.trans_dir = None

        self.decode_dir = None
        self.model_dir = None  ## model save  file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None
        self.typeinfo_dir = None

        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0

        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100

        ###Classification
        ## Dataset Plus
        self.substring_dir = None
        self.bpe_emb_dir = None
        self.pos_emb_dir = None
        self.pretrain_bpe_embedding = None
        self.pretrain_pos_embedding = None
        self.bpe_emb_dim = 30
        self.pos_emb_dim = 30
        self.bpe_alphabet_size = 0
        self.pos_alphabet_size = 0
        self.norm_bpe_emb = False
        self.norm_pos_emb = False
        self.bpe_texts = []
        self.bpe_Ids = []
        self.pos_texts = []
        self.pos_Ids = []
        self.label_size = 0
        self.substring_train_texts = None
        self.substring_train_Ids = None
        self.substring_dev_texts = None
        self.substring_dev_Ids = None
        self.substring_test_texts = None
        self.substring_test_Ids = None
        self.substring_label_alphabet = Alphabet('substring_label', True)

        ###Networks
        self.word_feature_extractor = "LSTM"  # "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_seq_feature = "CNN"  # "LSTM"/"CNN"/"GRU"/None
        self.use_trans = False
        self.use_crf = True
        self.nbest = None
        self.use_mapping = False
        self.mapping_func = None  # tanh or sigmoid

        # Training
        self.save_model = True
        self.state_training_name = 'default'
        self.average_batch_loss = False
        self.optimizer = "SGD"  # "SGD"/"Adam"
        self.status = "train"
        self.show_loss_per_batch = 100
        # Hyperparameters
        self.seed_num = None
        self.cnn_layer = 4
        self.iteration = 100
        self.batch_size = 10
        self.char_hidden_dim = 50
        self.trans_hidden_dim = 50
        self.hidden_dim = 200
        self.dropout = 0.5
        self.lstm_layer = 1
        self.bilstm = True

        self.gpu = False
        self.lr = 0.015
        self.lr_decay = 0.05
        self.clip = None
        self.momentum = 0
        self.l2 = 1e-8

        # circul
        self.circul_time = 4
        self.circul_deepth = 2
        self.circul_gather_output_mode = "concat"

        # decode prepare
        self.decode_prepare_mode = 'example'

    def init_substring_instance(self):
        len_names = len(self.substring_names)
        self.substring_train_texts = [[[]
                                       for _ in range(self.substring_maxlen)]
                                      for _ in range(len_names)]
        self.substring_train_Ids = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_texts = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_Ids = [[[] for _ in range(self.substring_maxlen)]
                                  for _ in range(len_names)]
        self.substring_test_texts = [[[] for _ in range(self.substring_maxlen)]
                                     for _ in range(len_names)]
        self.substring_test_Ids = [[[] for _ in range(self.substring_maxlen)]
                                   for _ in range(len_names)]

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Trans alphabet size: %s" % (self.trans_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Char embedding  dir: %s" % (self.char_emb_dir))
        print("     Tran embedding  dir: %s" % (self.trans_embed_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Tran embedding size: %s" % (self.trans_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Norm   tran     emb: %s" % (self.norm_trans_emb))
        print("++" * 50)
        print("   task name: %s" % (self.task_name))
        print("++" * 50)
        print("   Data bin file directory: %s" % (self.data_bin_dir))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))
        print("     Raw    file directory: %s" % (self.raw_dir))
        print("     Middle file directory: %s" % (self.middle_dir))
        print(" viterbi inputs model name: %s" %
              (self.viterbi_inputs_model_name))
        if self.typeinfo_dir:
            print("     typeinfo    directory: %s" % (self.typeinfo_dir))
        print("     Model  file directory: %s" % (self.model_dir))
        print("     Loadmodel   directory: %s" % (self.load_model_dir))
        print("     Decode file directory: %s" % (self.decode_dir))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding  dir: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s" %
                  (self.feature_alphabets[idx].name,
                   self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model        use_crf: %s" % (self.use_crf))
        print("     Model word extractor: %s" % (self.word_feature_extractor))
        print("     Model       use_char: %s" % (self.use_char))
        if self.use_char:
            print("     Model char_seq_feature: %s" % (self.char_seq_feature))
            print("     Model char_hidden_dim: %s" % (self.char_hidden_dim))
        if self.use_trans:
            print("     Model trans_hidden_dim: %s" % (self.trans_hidden_dim))
        if self.use_mapping:
            print("     Model mapping function: %s" % (self.mapping_func))
        print(" " + "++" * 20)
        print(" Training:")
        print("     show_loss_per_batch: %s" % (self.show_loss_per_batch))
        print("     save_model: %s" % (self.save_model))
        print("     state_training_name: %s" % (self.state_training_name))
        print("     Optimizer: %s" % (self.optimizer))
        print("     Iteration: %s" % (self.iteration))
        print("     BatchSize: %s" % (self.batch_size))
        print("     Average  batch   loss: %s" % (self.average_batch_loss))

        print(" " + "++" * 20)
        print(" Hyperparameters:")

        print("     Hyper        seed_num: %s" % (self.seed_num))
        print("     Hyper              lr: %s" % (self.lr))
        print("     Hyper        lr_decay: %s" % (self.lr_decay))
        print("     Hyper            clip: %s" % (self.clip))
        print("     Hyper        momentum: %s" % (self.momentum))
        print("     Hyper              l2: %s" % (self.l2))
        print("     Hyper      hidden_dim: %s" % (self.hidden_dim))
        print("     Hyper         dropout: %s" % (self.dropout))
        print("     Hyper      lstm_layer: %s" % (self.lstm_layer))
        print("     Hyper          bilstm: %s" % (self.bilstm))
        print("     Hyper             GPU: %s" % (self.gpu))
        print("DATA SUMMARY END.")
        print("++" * 50)

        print("      substring dir : %s" % (self.substring_dir))
        print("    bpe_emb_dir dir : %s" % (self.bpe_emb_dir))
        print("    pos_emb_dir dir : %s" % (self.pos_emb_dir))
        print("++" * 50)

        print("      circul time   : %s" % (self.circul_time))
        print("      circul deepth : %s" % (self.circul_deepth))
        print(" gather output mode : %s" % (self.circul_gather_output_mode))
        print("++" * 50)

        print(" decode prepare mode : %s" % (self.decode_prepare_mode))
        print("++" * 50)

        sys.stdout.flush()

    def make_substring_label_alphabet(self):
        for label in self.label_alphabet.instances:
            label = label.split('-')[-1]
            self.substring_label_alphabet.add(label)
        self.substring_label_alphabet.close()

    def initial_feature_alphabets(self):
        items = open(self.train_dir, 'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = 'feature_' + str(idx)
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_names.append(feature_prefix)
                print "Find feature: ", feature_prefix
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                self.feature_emb_dims[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_norm']
        # exit(0)
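        # Hedged illustration (data-format assumption, not from the original source):
        # for a 4-column CoNLL-style first line such as
        #   EU NNP B-NP B-ORG
        # total_column is 4, so two feature alphabets ('feature_1' for the POS column,
        # 'feature_2' for the chunk column) are created; column 0 is the word and the
        # last column is the label.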

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('windows-1252')
                # word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_alphabet_substring(self, input_file_dir, substring_file_prefix):
        ## will not read labels
        input_files = os.listdir(input_file_dir)
        print(input_files)
        for input_file in input_files:
            plus_feature = ''
            input_file_name = os.path.split(input_file)[1]
            if input_file_name.split('.')[0] != substring_file_prefix:
                continue
            if 'bpe' in input_file_name:
                plus_feature = 'bpe'
            elif 'word' in input_file_name:
                plus_feature = 'word'
            if plus_feature == '':
                continue
            in_lines = open(input_file_dir + input_file, 'r').readlines()
            for line in in_lines:
                if len(line.strip()) > 0:
                    pairs = line.strip().split('\t')
                    words = pairs[0].decode('windows-1252')
                    # word = pairs[0].decode('utf-8')
                    if self.number_normalized:
                        words = normalize_word(words)
                    labels = pairs[-1]
                    for word in words.split():
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
            self.word_alphabet_size = self.word_alphabet.size()
            self.char_alphabet_size = self.char_alphabet.size()
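        # Hedged illustration (format assumption): each line of a substring file is
        # expected to be tab-separated, e.g.
        #   European Union<TAB>ORG
        # so pairs[0] holds the (space-separated) surface words and pairs[-1] the
        # label, which this method reads but does not use beyond splitting.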

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.translation_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" %
                  (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
                self.norm_word_emb)

            if self.typeinfo_dir:
                type_info_matrix = []
                with codecs.open(self.typeinfo_dir, 'r') as typeinfo_file:
                    type_info_lines = typeinfo_file.readlines()
                    for line in type_info_lines:
                        line = line.rstrip().split()
                        for i, _ in enumerate(line):
                            line[i] = float(line[i])
                        line = np.array(line)
                        type_info_matrix.append(line)

                print(
                    "Calculate type info distribution and concatenate word and type embeddings ..."
                )
                cos_res = []
                for i, word_embed in enumerate(self.pretrain_word_embedding):
                    word_type_info = []
                    if i == 0:
                        word_type_info = np.random.random(
                            size=len(type_info_matrix))
                        cos_res.append(word_type_info)
                    else:
                        for type_info in type_info_matrix:
                            cos_sim = 1 - spatial.distance.cosine(
                                word_embed, type_info)
                            word_type_info.append(cos_sim)
                        cos_res.append(word_type_info)
                cos_res = np.array(cos_res)
                cos_res = sigmoid(cos_res)
                self.pretrain_word_embedding = np.concatenate(
                    [self.pretrain_word_embedding, cos_res], axis=1)
                print "type info length:{}".format(len(type_info_matrix))
                self.word_emb_dim += len(type_info_matrix)
                print "new word dim is :{}".format(self.word_emb_dim)

        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" %
                  (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
                self.norm_char_emb)
        if self.trans_embed_dir:
            print("Load pretrained trans embedding, norm: %s, dir: %s" %
                  (self.norm_trans_emb, self.trans_embed_dir))
            self.pretrain_trans_embedding, self.trans_emb_dim = build_chi_pretrain_embedding(
                self.trans_embed_dir, self.translation_alphabet,
                self.trans_emb_dim, self.norm_trans_emb)

        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print(
                    "Load pretrained feature %s embedding:, norm: %s, dir: %s"
                    % (self.feature_name[idx], self.norm_feature_embs[idx],
                       self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[
                    idx] = build_pretrain_embedding(
                        self.feature_emb_dirs[idx],
                        self.feature_alphabets[idx],
                        self.feature_emb_dims[idx],
                        self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
                % (name))

    def generate_instance_substring(self, substring_file_prefix):
        self.init_substring_instance()
        self.make_substring_label_alphabet()
        input_files = os.listdir(self.substring_dir)
        print(input_files)
        for input_file in input_files:
            input_file_name = os.path.split(input_file)[1]
            input_file_dir = os.path.join(self.substring_dir, input_file_name)
            input_file_name_split = input_file_name.split('.')
            if input_file_name_split[0] != substring_file_prefix:
                continue
            print('processing %s' % (input_file_name))
            name = input_file_name_split[1]
            feature_name = input_file_name_split[2]
            f_l = int(input_file_name_split[-1][3:])  #feature_len

            if feature_name == 'word':
                alphabet = self.word_alphabet
            elif feature_name == 'char':
                alphabet = self.char_alphabet
            elif feature_name == 'pos':
                alphabet = self.feature_alphabets[0]
            elif feature_name == 'bpe':
                alphabet = self.feature_alphabets[1]

            s_f_id = self.substring_names.index(
                feature_name)  #substring_feature_id
            if name == "train":
                self.substring_train_texts[s_f_id][f_l], self.substring_train_Ids[s_f_id][f_l]\
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            elif name == "testa":
                self.substring_dev_texts[s_f_id][f_l], self.substring_dev_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            elif name == "testb":
                self.substring_test_texts[s_f_id][f_l], self.substring_test_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            else:
                print(
                    "Error: you can only generate train/testa/testb instance! Illegal input:%s"
                    % (name))
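        # Hedged illustration (naming assumption): the parsing above implies substring
        # file names of the form
        #   <prefix>.<split>.<feature>.len<N>
        # e.g. a hypothetical "eng.train.word.len3" -> split "train", feature "word",
        # feature length 3; split names follow the CoNLL convention
        # train / testa (dev) / testb (test).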

    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        assert (sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")

            for idy in range(sent_length):
                label_string = content_list[idx][0][idy].encode('utf-8') + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" %
              (name, nbest, self.decode_dir))
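        # Illustration of the file layout produced above (derived from the writes):
        # each sentence starts with a "# score_1 score_2 ..." line holding the n-best
        # path scores, followed by one line per token of the form
        #   word best_label second_best_label ...
        # and a blank line between sentences.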

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        ## task:
        the_item = 'task_name'
        if the_item in config:
            self.task_name = config[the_item]

        ## read data:
        the_item = 'data_bin_dir'
        if the_item in config:
            self.data_bin_dir = config[the_item]
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'trans_dir'
        if the_item in config:
            self.trans_dir = config[the_item]
        the_item = 'middle_dir'
        if the_item in config:
            self.middle_dir = config[the_item]
        the_item = 'viterbi_inputs_model_name'
        if the_item in config:
            self.viterbi_inputs_model_name = config[the_item]

        the_item = 'substring_dir'
        if the_item in config:
            self.substring_dir = config[the_item]
        the_item = 'bpe_emb_dir'
        if the_item in config:
            self.bpe_emb_dir = config[the_item]
        the_item = 'pos_emb_dir'
        if the_item in config:
            self.pos_emb_dir = config[the_item]

        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]

        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]
        the_item = 'trans_embed_dir'
        if the_item in config:
            self.trans_embed_dir = config[the_item]
        the_item = 'typeinfo_dir'
        if the_item in config:
            self.typeinfo_dir = config[the_item]

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])
        the_item = 'trans_emb_dim'
        if the_item in config:
            self.trans_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'use_trans'
        if the_item in config:
            self.use_trans = str2bool(config[the_item])
        the_item = 'use_mapping'
        if the_item in config:
            self.use_mapping = str2bool(config[the_item])
        the_item = 'mapping_func'
        if the_item in config:
            self.mapping_func = config[the_item]
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_seq_feature = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        ## read training setting:
        the_item = 'save_model'
        if the_item in config:
            self.save_model = str2bool(config[the_item])
        the_item = 'state_training_name'
        if the_item in config:
            self.state_training_name = config[the_item]
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]
        the_item = 'show_loss_per_batch'
        if the_item in config:
            self.show_loss_per_batch = int(config[the_item])

        ## read Hyperparameters:
        the_item = 'seed_num'
        if the_item in config:
            if config[the_item] != 'None':
                self.seed_num = int(config[the_item])
        the_item = 'cnn_layer'
        if the_item in config:
            self.cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.char_hidden_dim = int(config[the_item])

        the_item = 'trans_hidden_dim'
        if the_item in config:
            self.trans_hidden_dim = int(config[the_item])

        the_item = 'hidden_dim'
        if the_item in config:
            self.hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            if config[the_item] == 'None':
                self.clip = None
            else:
                self.clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.l2 = float(config[the_item])

        ###base2
        the_item = 'feature_name'
        if the_item in config:
            self.feature_name = config[the_item]
        the_item = 'feature_length'
        if the_item in config:
            self.feature_length = int(config[the_item])
        the_item = 'class_num'
        if the_item in config:
            self.class_num = int(config[the_item])
        the_item = 'feature_ans'
        if the_item in config:
            self.feature_ans = config[the_item]

        ###circul
        the_item = 'circul_time'
        if the_item in config:
            self.circul_time = config[the_item]
        the_item = 'circul_deepth'
        if the_item in config:
            self.circul_deepth = config[the_item]
        the_item = 'circul_gather_output_mode'
        if the_item in config:
            self.circul_gather_output_mode = config[the_item]

        ###decode_prepare
        the_item = 'decode_prepare_mode'
        if the_item in config:
            self.decode_prepare_mode = config[the_item]
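        # Hedged illustration (format assumption): config_file_to_dict is defined
        # elsewhere in this project; read_config only assumes it yields a flat dict of
        # option name -> value, so a matching config file would presumably contain
        # simple key=value lines, e.g.
        #
        #   task_name=ner
        #   train_dir=data/train.bmes
        #   use_crf=True
        #   learning_rate=0.015
        #
        # The exact syntax (separator, comments, the nested 'feature' entry) is
        # determined by config_file_to_dict, not by this method.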

    def read_arg(self, args):
        if args.task_name != None: self.task_name = args.task_name

        if args.data_bin_dir != None: self.data_bin_dir = args.data_bin_dir
        if args.train_dir != None: self.train_dir = args.train_dir
        if args.dev_dir != None: self.dev_dir = args.dev_dir
        if args.test_dir != None: self.test_dir = args.test_dir
        if args.trans_dir != None: self.trans_dir = args.trans_dir
        if args.word_emb_dir != None: self.word_emb_dir = args.word_emb_dir
        if args.trans_embed_dir != None:
            self.trans_embed_dir = args.trans_embed_dir
        if args.middle_dir != None: self.middle_dir = args.middle_dir
        if args.viterbi_inputs_model_name != None:
            self.viterbi_inputs_model_name = args.viterbi_inputs_model_name

        if args.substring_dir != None: self.substring_dir = args.substring_dir
        if args.bpe_emb_dir != None: self.bpe_emb_dir = args.bpe_emb_dir
        if args.pos_emb_dir != None: self.pos_emb_dir = args.pos_emb_dir

        if args.model_dir != None: self.model_dir = args.model_dir
        if args.norm_word_emb != None: self.norm_word_emb = args.norm_word_emb
        if args.norm_char_emb != None: self.norm_char_emb = args.norm_char_emb
        if args.word_emb_dim != None: self.word_emb_dim = args.word_emb_dim
        if args.char_emb_dim != None: self.char_emb_dim = args.char_emb_dim
        if args.trans_emb_dim != None: self.trans_emb_dim = args.trans_emb_dim

        if args.number_normalized != None:
            self.number_normalized = args.number_normalized
        if args.seg != None: self.seg = args.seg

        if args.use_crf != None: self.use_crf = args.use_crf
        if args.use_char != None: self.use_char = args.use_char
        if args.use_trans != None: self.use_trans = args.use_trans

        if args.word_seq_feature != None:
            # read_config stores this option as word_feature_extractor, so keep them consistent
            self.word_feature_extractor = args.word_seq_feature
        if args.char_seq_feature != None:
            self.char_seq_feature = args.char_seq_feature

        if args.nbest != None: self.nbest = args.nbest

        if args.status != None: self.status = args.status
        if args.state_training_name != None:
            self.state_training_name = args.state_training_name
        if args.save_model != None: self.save_model = args.save_model
        if args.optimizer != None: self.optimizer = args.optimizer
        if args.iteration != None: self.iteration = args.iteration
        if args.batch_size != None: self.batch_size = args.batch_size
        if args.ave_batch_loss != None:
            # read_config stores this option as average_batch_loss, so keep them consistent
            self.average_batch_loss = args.ave_batch_loss
        if args.show_loss_per_batch != None:
            self.show_loss_per_batch = args.show_loss_per_batch

        if args.seed_num != None: self.seed_num = args.seed_num
        if args.cnn_layer != None: self.cnn_layer = args.cnn_layer
        if args.char_hidden_dim != None:
            self.char_hidden_dim = args.char_hidden_dim
        if args.trans_hidden_dim != None:
            self.trans_hidden_dim = args.trans_hidden_dim
        if args.hidden_dim != None: self.hidden_dim = args.hidden_dim
        if args.dropout != None: self.dropout = args.dropout
        if args.lstm_layer != None: self.lstm_layer = args.lstm_layer
        if args.bilstm != None: self.bilstm = args.bilstm
        if args.learning_rate != None: self.lr = args.learning_rate  # read_config stores the learning rate as self.lr
        if args.lr_decay != None: self.lr_decay = args.lr_decay
        if args.momentum != None: self.momentum = args.momentum
        if args.l2 != None: self.l2 = args.l2
        if args.gpu != None: self.gpu = args.gpu
        if args.clip != None: self.clip = args.clip

        ###base2
        if args.feature_name != None: self.feature_name = args.feature_name
        if args.feature_length != None:
            self.feature_length = args.feature_length
        if args.class_num != None: self.class_num = args.class_num
        if args.feature_ans != None:
            self.feature_ans = args.feature_ans

        ###circul
        if args.circul_time != None: self.circul_time = args.circul_time
        if args.circul_deepth != None: self.circul_deepth = args.circul_deepth
        if args.circul_gather_output_mode != None:
            self.circul_gather_output_mode = args.circul_gather_output_mode

        ###decode_prepare
        if args.decode_prepare_mode != None:
            self.decode_prepare_mode = args.decode_prepare_mode

    def build_translation_alphabet(self, trans_path):
        print("Creating translation alphabet......")
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":", 1)
                    words = temp[1].split()
                    for word in words:
                        self.translation_alphabet.add(word.strip())
        self.trans_alphabet_size = self.translation_alphabet.size()

    def build_translation_dict(self, trans_path):
        print("Creating Id to Id translation dictionary......")
        translation_id_format_temp = {}
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                ids = []
                if len(line.strip().split(":", 1)) == 2:
                    temp = line.strip().split(":", 1)
                    word_id = self.word_alphabet.get_index(temp[0].strip())
                    translations = temp[1].split()
                    for translation in translations:
                        ids.append(
                            self.translation_alphabet.get_index(
                                translation.strip()))
                    if ids == []:
                        ids = [0]
                    translation_id_format_temp[word_id] = ids

        for word in self.word_alphabet.instances:
            if self.word_alphabet.get_index(
                    word) in translation_id_format_temp.keys():
                self.translation_id_format[self.word_alphabet.get_index(
                    word)] = translation_id_format_temp[
                        self.word_alphabet.get_index(word)]
            else:
                self.translation_id_format[self.word_alphabet.get_index(
                    word)] = [0]
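# Hedged, self-contained sketch (not part of the original project): it only
# illustrates the translation-file layout that build_translation_alphabet and
# build_translation_dict above appear to assume, i.e. one "word: trans1 trans2 ..."
# entry per line. The helper name and the sample line are hypothetical.
def _parse_translation_line(line):
    # Split once on ':' so translations themselves may contain colons.
    parts = line.strip().split(":", 1)
    if len(parts) != 2:
        return None, []
    word = parts[0].strip()
    translations = [t.strip() for t in parts[1].split()]
    return word, translations


if __name__ == "__main__":
    # Prints: ('gato', ['cat', 'kitty'])
    print(_parse_translation_line("gato: cat kitty"))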
Example No. 25
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"  ## "LSTM"/"CNN"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_average_batch_loss = False
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 50
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     Hyper       iteration: %s" % (self.HP_iteration))
        print("     Hyper      batch size: %s" % (self.HP_batch_size))
        print("     Hyper   average batch: %s" % (self.HP_average_batch_loss))
        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper        lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyper         HP_clip: %s" % (self.HP_clip))
        print("     Hyper        momentum: %s" % (self.HP_momentum))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper      lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyper          bilstm: %s" % (self.HP_bilstm))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("     Hyper        use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print("             Char_features: %s" % (self.char_features))

        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def extend_word_char_alphabet(self, input_file_list):
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            in_lines = open(input_file, 'r').readlines()
            for line in in_lines:
                if len(line) > 2:
                    pairs = line.strip().split()
                    word = pairs[0]
                    if self.number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print("     old word:%s -> new word:%s" %
              (old_word_size, self.word_alphabet_size))
        print("     old char:%s -> new char:%s" %
              (old_char_size, self.char_alphabet_size))
        for input_file in input_file_list:
            print("     from file:%s" % (input_file))

    def build_alphabet(self, input_file):
        in_lines_string = open(input_file + ".string.txt", 'r').readlines()
        in_lines_label = open(input_file + ".label.txt", 'r').readlines()
        for line_string, line_label in zip(in_lines_string, in_lines_label):
            print(line_label)
            print(line_string)
            line_label = line_label[:-1].split(',')
            line_string = line_string[:-1]
            assert len(line_label) == len(line_string)
            for i in range(len(line_label)):
                self.label_alphabet.add(line_label[i])
                self.word_alphabet.add(line_string[i])
        self.char_alphabet.add("*")
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_char_pretrain_emb(self, emb_path):
        self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.char_emb_dim,
            self.norm_char_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, output_file))
Example No. 26
0
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse, opt, fold_idx, isMeddra_dict):
    logging.info("train the neural-based normalization model ...")

    logging.info("build alphabet ...")


    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, dictionary)
    dict_alphabet.close()

    train_X = []
    train_Y = []
    for doc in train_data:

        temp_X, temp_Y = generate_instances_ehr(doc.entities, dict_alphabet, dictionary_reverse)
        train_X.extend(temp_X)
        train_Y.extend(temp_Y)


    train_loader = DataLoader(MyDataset(train_X, train_Y), opt.batch_size, shuffle=True, collate_fn=my_collate)

    if opt.gpu >= 0 and torch.cuda.is_available():
        device = torch.device('cuda', opt.gpu)
    else:
        device = torch.device('cpu')

    model = BertForSequenceClassification.from_pretrained(opt.bert_dir,
                                                          target=dict_alphabet)
    model.dict_alphabet = dict_alphabet
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = BertAdam(optimizer_grouped_parameters, lr=opt.lr)

    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0

    logging.info("start training ...")

    for idx in range(opt.iter):
        epoch_start = time.time()

        model.train()

        train_iter = iter(train_loader)
        num_iter = len(train_loader)

        sum_loss = 0

        correct, total = 0, 0

        for i in range(num_iter):

            x, mask, sentences, y, _ = next(train_iter)

            _, y_pred = model.forward(x, sentences, mask)

            l = model.loss(y_pred, y)

            sum_loss += l.item()

            l.backward()
            optimizer.step()
            model.zero_grad()

            total += y.size(0)
            _, pred = torch.max(y_pred, 1)
            correct += (pred == y).sum().item()

        epoch_finish = time.time()
        accuracy = 100.0 * correct / total
        logging.info("epoch: %s training finished. Time: %.2fs. loss: %.4f Accuracy %.2f" % (
        idx, epoch_finish - epoch_start, sum_loss / num_iter, accuracy))

        if opt.dev_file:
            p, r, f = evaluate(dev_data, dictionary, dictionary_reverse, model)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f))

            torch.save(model, os.path.join(opt.output, "norm_neural.pkl"))

            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            bad_counter = 0
        else:
            bad_counter += 1

        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")


    return best_dev_p, best_dev_r, best_dev_f
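# Hedged notes (assumptions about names defined elsewhere): BertAdam and the
# customized BertForSequenceClassification (accepting a `target` alphabet) are
# assumed to come from this project's BERT dependency and wrapper code, and
# evaluate / generate_instances_ehr / init_dict_alphabet / MyDataset / my_collate
# are project helpers not shown in this snippet. The loop saves the model whenever
# the dev f-score improves and stops early after `opt.patience` epochs without
# improvement when a dev file is provided.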
Example No. 27
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 512
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.word_alphabet.add(START)
        self.word_alphabet.add(UNKNOWN)
        self.char_alphabet.add(START)
        self.char_alphabet.add(UNKNOWN)
        self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label')
        self.tagScheme = "NoSeg"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.word_emb_dim = 50
        self.pretrain_word_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_batch_size = 10
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara  batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara          lr: %s" % (self.HP_lr))
        print("     Hyperpara    lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara     HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara    momentum: %s" % (self.HP_momentum))
        print("     Hyperpara  hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara     dropout: %s" % (self.HP_dropout))
        print("     Hyperpara  lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara      bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara    use_char: %s" % (self.HP_use_char))
        print("     Hyperpara         GPU: %s" % (self.HP_gpu))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path, norm=False):
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, norm)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_WORD_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_WORD_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_WORD_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))