Example 1

import copy
import logging
import pickle
import sys

# Project-local dependencies assumed by this listing (not shown here):
# Alphabet, normalize_word, read_instance, build_pretrain_embedding,
# config_file_to_dict, str2bool, my_utils1, relation_extraction.

class Data:
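    """Central container for the alphabets, datasets, pretrained embeddings,
    and hyperparameters shared by the NER and relation-extraction pipelines."""
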
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.number_normalized = True
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.feature_name2id = {}

        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "BMES"

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None

        self.word_emb_dir = None

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.pretrain_word_embedding = None

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        self.nbest = None

        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5

        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_l2 = 1e-8

        # shared by both the NER and relation-extraction models
        self.full_data = False
        self.tune_wordemb = False

        # relation
        self.max_seq_len = 500
        self.pad_idx = 0
        self.sent_window = 3
        # self.output =None
        self.unk_ratio = 1
        self.seq_feature_size = 256

        self.re_feature_name = []
        self.re_feature_name2id = {}
        self.re_feature_alphabets = []
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feat_config = None
        self.re_feature_emb_dims = []
        self.re_feature_alphabet_sizes = []

        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []

        self.patience = 10

        # self.pretrained_model_dir = None

    def copy_alphabet(self, other):
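        """Deep-copy all alphabets and feature metadata from another Data."""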
        self.word_alphabet = copy.deepcopy(other.word_alphabet)
        self.char_alphabet = copy.deepcopy(other.char_alphabet)

        self.label_alphabet = copy.deepcopy(other.label_alphabet)

        self.feature_name = copy.deepcopy(other.feature_name)
        self.feature_alphabets = copy.deepcopy(other.feature_alphabets)
        self.feature_num = len(self.feature_alphabets)
        self.feature_name2id = copy.deepcopy(other.feature_name2id)
        self.feature_alphabet_sizes = copy.deepcopy(
            other.feature_alphabet_sizes)
        self.feature_emb_dims = copy.deepcopy(other.feature_emb_dims)

        self.re_feature_name = copy.deepcopy(other.re_feature_name)
        self.re_feature_name2id = copy.deepcopy(other.re_feature_name2id)
        self.re_feature_alphabets = copy.deepcopy(other.re_feature_alphabets)
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feature_emb_dims = copy.deepcopy(other.re_feature_emb_dims)
        self.re_feature_alphabet_sizes = copy.deepcopy(
            other.re_feature_alphabet_sizes)

    def show_data_summary(self):
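        """Print a human-readable summary of the loaded data and settings."""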
        print("++" * 50)
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))

        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))

        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))

        print("     Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))

        print("     Iteration: %s" % (self.HP_iteration))
        print("     BatchSize: %s" % (self.HP_batch_size))

        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper              l2: %s" % (self.HP_l2))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("     Hyper             NBEST: %s" % (self.nbest))

        print("     full data: %s" % (self.full_data))
        print("     Tune  word embeddings: %s" % (self.tune_wordemb))

        print("     max sequence length: %s" % (self.max_seq_len))
        print("     pad index: %s" % (self.pad_idx))
        print("     patience: %s" % (self.patience))
        print("     sentence window: %s" % (self.sent_window))
        # print("     Output directory: %s" % (self.output))
        print("     The ratio using negative instnaces 0~1: %s" %
              (self.unk_ratio))
        print("     Size of seqeuence feature representation: %s" %
              (self.seq_feature_size))

        print("     RE FEATURE num: %s" % (self.re_feature_num))
        for idx in range(self.re_feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.re_feature_alphabets[idx].name,
                   self.re_feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding size: %s" %
                  (self.re_feature_alphabets[idx].name,
                   self.re_feature_emb_dims[idx]))

        print("     RE Train instance number: %s" % (len(self.re_train_Y)))
        print("     RE Dev   instance number: %s" % (len(self.re_dev_Y)))
        print("     RE Test  instance number: %s" % (len(self.re_test_Y)))

        # print("     pretrained_model_dir: %s" % (self.pretrained_model_dir))

        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):
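        """Register the fixed NER token features ([Cap], [POS]) and size
        their embedding tables from feat_config when available."""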

        feature_prefix = '[Cap]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 0

        feature_prefix = '[POS]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 1

        self.feature_num = len(self.feature_alphabets)
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_size']

    def build_alphabet(self, documents):
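        """Populate the word/char/label/feature alphabets from annotated documents."""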
        for doc in documents:
            for sentence in doc:
                for token in sentence:
                    word = token['word']
                    if self.number_normalized:
                        word = normalize_word(word)
                    label = token['label']
                    self.label_alphabet.add(label)
                    self.word_alphabet.add(word)
                    ## build feature alphabet
                    self.feature_alphabets[0].add(token['cap'])
                    self.feature_alphabets[1].add(token['pos'])

                    for char in word:
                        self.char_alphabet.add(char)

        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()

    def fix_alphabet(self):
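        """Close all NER alphabets to prevent further growth."""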
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def open_alphabet(self):
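        """Re-open the word/char/feature alphabets for growth; labels stay closed."""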
        self.word_alphabet.open()
        self.char_alphabet.open()
        # label not open
        # self.label_alphabet.open()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].open()

    def initial_re_feature_alphabets(self):
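        """Create one alphabet per relation feature listed in re_feat_config."""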
        for feat_id, k in enumerate(self.re_feat_config):
            self.re_feature_alphabets.append(Alphabet(k))
            self.re_feature_name.append(k)
            self.re_feature_name2id[k] = feat_id

        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feature_emb_dims = [20] * self.re_feature_num
        self.re_feature_alphabet_sizes = [0] * self.re_feature_num
        if self.re_feat_config:
            for idx in range(self.re_feature_num):
                if self.re_feature_name[idx] in self.re_feat_config:
                    self.re_feature_emb_dims[idx] = self.re_feat_config[
                        self.re_feature_name[idx]]['emb_size']

    def build_re_feature_alphabets(self, tokens, entities, relations):
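        """Populate the relation feature alphabets from per-document pandas
        DataFrames of tokens, entities, and relations."""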

        entity_type_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY_TYPE]']]
        entity_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY]']]
        relation_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[RELATION]']]
        token_num_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[TOKEN_NUM]']]
        entity_num_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY_NUM]']]
        position_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[POSITION]']]

        for i, doc_token in enumerate(tokens):

            doc_entity = entities[i]
            doc_relation = relations[i]

            sent_idx = 0
            sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]
            while sentence.shape[0] != 0:

                entities_in_sentence = doc_entity[(
                    doc_entity['sent_idx'] == sent_idx)]
                for _, entity in entities_in_sentence.iterrows():
                    entity_type_alphabet.add(entity['type'])
                    tk_idx = entity['tf_start']
                    while tk_idx <= entity['tf_end']:
                        entity_alphabet.add(
                            my_utils1.normalizeWord(sentence.iloc[
                                tk_idx, 0]))  # assumes 'text' is column 0
                        tk_idx += 1

                sent_idx += 1
                sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

            for _, relation in doc_relation.iterrows():
                relation_alphabet.add(relation['type'])

        for i in range(self.max_seq_len):
            token_num_alphabet.add(i)
            entity_num_alphabet.add(i)
            position_alphabet.add(i)
            position_alphabet.add(-i)

        for idx in range(self.re_feature_num):
            self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[
                idx].size()

    def fix_re_alphabet(self):
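        """Close all relation feature alphabets."""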
        for alphabet in self.re_feature_alphabets:
            alphabet.close()

    def open_re_alphabet(self):
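        """Re-open relation feature alphabets; [RELATION] labels stay closed."""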
        for alphabet in self.re_feature_alphabets:
            if alphabet.name == '[RELATION]':  # label not open
                continue
            alphabet.open()

    def build_pretrain_emb(self):
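        """Load pretrained word embeddings from word_emb_dir, if configured."""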
        if self.word_emb_dir:
            logging.info("Load pretrained word embedding, dir: %s" %
                         (self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim)

    def generate_instance(self, name, documents):
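        """Convert documents into NER (text, Id) instances for the given split."""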
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            logging.error(
                "You can only generate train/dev/test instances! Illegal input: %s"
                % name)

    def generate_re_instance(self, name, tokens, entities, relations, names):
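        """Build relation-extraction instances (X, Y) for the given split."""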
        self.fix_re_alphabet()
        if name == "train":
            self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        elif name == "dev":
            self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        elif name == "test":
            self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        else:
            logging.error(
                "You can only generate train/dev/test instances! Illegal input: %s"
                % name)

    def load(self, data_file):
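        """Restore all attributes from a pickle file written by save()."""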
        with open(data_file, 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
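        """Pickle this object's full attribute dict to save_file."""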
        with open(save_file, 'wb') as f:
            pickle.dump(self.__dict__, f, protocol=2)

    def clear_data(self):
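        """Release loaded instances and embeddings; keep alphabets and settings."""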
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []

        self.pretrain_word_embedding = None

    def read_config(self, config_file, opt):
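        """Populate settings from a parsed config file plus command-line options."""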
        config = config_file_to_dict(config_file)
        ## read data:

        self.train_dir = opt.train_dir

        self.dev_dir = opt.dev_dir

        self.test_dir = opt.test_dir

        self.word_emb_dir = opt.word_emb_file

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])

        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = str2bool(config[the_item])  # parsed like the other boolean flags
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])

        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

        # shared by both models
        the_item = 'full_data'
        if the_item in config:
            self.full_data = str2bool(config[the_item])

        the_item = 'tune_wordemb'
        if the_item in config:
            self.tune_wordemb = str2bool(config[the_item])

        the_item = 'max_seq_len'
        if the_item in config:
            self.max_seq_len = int(config[the_item])

        the_item = 'pad_idx'
        if the_item in config:
            self.pad_idx = int(config[the_item])

        the_item = 'sent_window'
        if the_item in config:
            self.sent_window = int(config[the_item])

        # the_item = 'output'
        # if the_item in config:
        #     self.output = config[the_item]

        the_item = 'unk_ratio'
        if the_item in config:
            self.unk_ratio = float(config[the_item])

        the_item = 'seq_feature_size'
        if the_item in config:
            self.seq_feature_size = int(config[the_item])

        the_item = 're_feature'
        if the_item in config:
            self.re_feat_config = config[the_item]  ## re_feat_config is a dict

        the_item = 'patience'
        if the_item in config:
            self.patience = int(config[the_item])
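

# Usage sketch (an addition for illustration, not part of the original
# listing): a minimal save/load round trip. The file name "demo.dset" is
# hypothetical; only the Alphabet class noted above is assumed.
if __name__ == "__main__":
    data = Data()
    data.initial_feature_alphabets()
    data.save("demo.dset")      # pickles the full __dict__

    restored = Data()
    restored.load("demo.dset")  # restores every attribute
    assert restored.feature_num == data.feature_num
    restored.show_data_summary()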