# Example #1
# 0
class ChineseNER:
    def __init__(self, entry="train"):
        """
        Copy the hyper-parameters from the loaded config onto the instance
        and initialise the model.

        :param entry: mode handed to ``main_model`` ("train" or "predict")
        """
        config = load_config()

        # Each hyper-parameter is stored on the instance under the same name
        # as its config key, in the order listed here.
        for key in (
                "embedding_size",
                "hidden_size",
                "batch_size",
                "model_path",
                "dropout",
                "tags",
                "learning_rate",
                "epochs",
                "weight_decay",
                "transfer_learning",
                "lr_decay_step",
                "lr_decay_rate",
                "max_length",
        ):
            setattr(self, key, config.get(key))

        # Model Initialization
        self.main_model(entry)

    def main_model(self, entry):
        """
        Build the model (and, for training, the data managers) for ``entry``.

        :param entry: "predict" rebuilds the model from the saved parameters;
                      "train" loads the train/dev data, saves the vocab and
                      tag map and builds a fresh model. Both modes finish by
                      trying to restore a checkpoint. Any other value leaves
                      the instance untouched.
        """
        # The Testing & Inference Process
        if entry == "predict":
            saved = load_params(path=self.model_path)
            vocab_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("vocab")
            # Dropout is switched off for inference.
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=vocab_size,
                                   dropout=0.0,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   max_length=self.max_length)
            self.restore_model()
            return

        if entry != "train":
            return

        # The Training Process: read the training data from DataManager
        train_manager = DataManager(batch_size=self.batch_size,
                                    data_type='train',
                                    tags=self.tags)
        self.train_manager = train_manager
        self.total_size = len(train_manager.batch_data)

        # Persist the vocab / tag map so that "predict" can rebuild the model
        save_params(
            data={
                "batch_size": train_manager.batch_size,
                "input_size": train_manager.input_size,
                "vocab": train_manager.vocab,
                "tag_map": train_manager.tag_map,
            },
            path=self.model_path)

        # Build BiLSTM-CRF Model
        self.model = BiLSTMCRF(tag_map=train_manager.tag_map,
                               batch_size=self.batch_size,
                               vocab_size=len(train_manager.vocab),
                               dropout=self.dropout,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               max_length=self.max_length)

        # Evaluation data: a throw-away manager reports the dev-set size,
        # which is then used as the batch size of the real dev manager.
        probe = DataManager(batch_size=1, data_type="dev", tags=self.tags)
        self.dev_size = probe.load_data()
        self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                       data_type="dev",
                                       tags=self.tags)
        self.dev_batch = self.dev_manager.iteration()

        # Restore model if it exists
        self.restore_model()

    def restore_model(self):
        """
        Restore the model if there is one
        """
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("Model Successfully Restored!")
        except Exception as error:
            print("Model Failed to restore! {}".format(error))

    def train(self):
        """
        Training stage.

        Optimises the model with AdamW for ``self.epochs`` epochs, saves a
        checkpoint after every batch and runs ``evaluate`` after every epoch.
        """
        model = self.model.to(device=device)

        # Transfer Learning Module: when enabled, only the parameters named
        # below stay trainable; otherwise every parameter is trainable.
        trainable_names = (
            "transitions", "word_embeddings.weight", "hidden2tag.weight",
            "hidden2tag.bias", "linear1.weight", "linear1.bias",
            "linear2.weight", "linear2.bias",
        )
        # Equality (not truthiness) is kept on purpose so that the config
        # value is interpreted exactly as before.
        fine_tune_only = self.transfer_learning == True  # noqa: E712
        for name, value in model.named_parameters():
            value.requires_grad = (name in trainable_names
                                   if fine_tune_only else True)

        # AdamW only receives the parameters that are still trainable
        optimizer = optim.AdamW(
            params=(p for p in model.parameters() if p.requires_grad),
            lr=self.learning_rate,
            weight_decay=self.weight_decay,
            amsgrad=True)

        # NOTE: learning-rate decay (lr_decay_step / lr_decay_rate) is not
        # applied; no scheduler is created here.

        # Print model architecture
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(model)
        print('\n')

        # Print model parameters
        print('\033[1;31mThe model\'s parameters are shown below:\033[0m')
        for name, value in model.named_parameters():
            summary = ("Name: \033[1;31m{0}\033[0m, "
                       "Parameter Size: \033[1;36m{1}\033[0m, "
                       "Gradient: \033[1;35m{2}\033[0m")
            print(summary.format(name, value.size(), value.requires_grad))
        print('\n')

        for epoch in range(1, self.epochs + 1):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                # Clear gradients before training
                self.model.zero_grad()

                # Split the batch into its sentence / tag / length columns
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences,
                                                dtype=torch.long,
                                                device=device)
                tags_tensor = torch.tensor(tags,
                                           dtype=torch.float,
                                           device=device)
                length_tensor = torch.tensor(length,
                                             dtype=torch.int64,
                                             device=device)

                # Negative log-likelihood of the batch, averaged to a scalar
                batch_loss = self.model.neg_log_likelihood(
                    sentences_tensor, tags_tensor, length_tensor)
                loss = batch_loss.mean()

                filled = int(index * 40 / self.total_size)
                progress = ("█" * filled).ljust(40)
                print("epoch [{}] |{}| {}/{}\n\t Training Loss {:.6f}".format(
                    epoch, progress, index, self.total_size, loss))

                loss.backward()
                optimizer.step()

                # Save the model during training
                checkpoint_file = self.model_path + 'params.pkl'
                torch.save(self.model.state_dict(), checkpoint_file)

            self.evaluate()

    def evaluate(self):
        """
        Evaluate the model on the next dev batch and print loss and F1 scores.

        The batch is decoded once; the resulting paths are scored against the
        gold labels with tag-level (macro and micro), tag+label-level and
        label-level F1. Nothing is returned, all results are printed.

        The evaluation loss is the negative log-likelihood of the gold
        ``labels``. It used to be computed against the model's own
        predictions, which does not measure fit to the dev data.
        NOTE(review): this assumes the dev labels are padded to a common
        length like the training tags, so they can be stacked into one
        tensor. Confirm against DataManager.
        """

        def macro_average(scores):
            # An empty score list (no tags configured) averages to 0.0
            # instead of raising ZeroDivisionError.
            return sum(scores) / len(scores) if scores else 0.0

        sentences, labels, length = zip(*next(self.dev_batch))

        # Evaluate without autograd bookkeeping and in eval mode, then put the
        # model back into whatever mode the caller (train) had it in.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                _, pre = self.model(sentences=sentences,
                                    real_length=length,
                                    lengths=None)

                sentences_tensor = torch.tensor(sentences,
                                                dtype=torch.long,
                                                device=device)
                # Same dtype as in train() so the loss is called identically
                tags_tensor = torch.tensor(labels,
                                           dtype=torch.float,
                                           device=device)
                length_tensor = torch.tensor(length,
                                             dtype=torch.int64,
                                             device=device)

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
        finally:
            self.model.train(was_training)

        # Averaged over the batch, like the training loss
        print("\t Evaluation Loss {:.6f}".format(loss.mean().item()))

        tag_map = self.model.tag_map

        print('Start to evaluate on the dev set: ')
        # Tag-level F1 score summary (w.r.t. each tag)
        tag_f1_total = [
            tag_f1(tar_path=labels, pre_path=pre, tag=tag, tag_map=tag_map)[2]
            for tag in self.tags
        ]
        tag_macro_f1 = macro_average(tag_f1_total)
        print(
            'Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % tag_macro_f1)

        # Tag-level Micro-averaged F1 Score
        _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels,
                                          pre_path=pre,
                                          tags=self.tags,
                                          tag_map=tag_map)
        print(
            'Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m'
            % f1_Micro_tag)

        # Tag-level with Label-level F1 score summary: one score per
        # (tag, prefix) pair
        f1_prefix_total = [
            entity_label_f1(tar_path=labels,
                            pre_path=pre,
                            length=length,
                            tag=tag,
                            tag_map=tag_map,
                            prefix=prefix)[2]
            for tag in self.tags
            for prefix in ('B', 'I', 'E', 'S')
        ]
        f1_macro_tag_prefix = macro_average(f1_prefix_total)
        print(
            'Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_tag_prefix)

        # Label-level F1 score summary: one score per prefix, 'O' included
        f1_prefix_total = [
            label_f1(tar_path=labels,
                     pre_path=pre,
                     length=length,
                     tags=self.tags,
                     tag_map=tag_map,
                     prefix=prefix)[2]
            for prefix in ('B', 'I', 'E', 'S', 'O')
        ]
        f1_macro_prefix = macro_average(f1_prefix_total)
        print(
            'Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_prefix)

    def predict(self):
        """
        Prediction & Inference Stage (interactive).

        Repeatedly prompts for a Chinese sentence on stdin, normalises it,
        splits it with ``cut_text`` using ``self.max_length``, tags every
        chunk and prints, per chunk, the entities ordered by their start
        index. A chunk without entities only prints a notice; the remaining
        chunks are still processed and the prompt is shown again.

        Entering an empty line prints a notice and ends the session (this is
        the only regular way out of the loop).

        :return: None
        """
        # Print model architecture
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(self.model)
        print('\n')

        while True:
            input_str = input("Please input a sentence in Chinese: ")

            if len(input_str) == 0:
                print('Invalid input! Please re-input!!\n')
                return

            # Full-width to half-width
            input_str = strQ2B(input_str)
            input_str = re.sub(pattern='。', repl='.', string=input_str)

            cut_out = []
            for cuttext in cut_text(text=input_str, length=self.max_length):
                # Map every character to its vocab index (0 when unknown)
                input_vec = [self.vocab.get(char, 0) for char in cuttext]

                # convert it to tensor and run the model
                sentences = torch.tensor(input_vec).view(1, -1)
                length = torch.tensor([sentences.shape[1]],
                                      dtype=torch.int64,
                                      device=device)

                # Inference only: no autograd graph is needed
                with torch.no_grad():
                    _, paths = self.model(sentences=sentences,
                                          real_length=length,
                                          lengths=None)

                # Get the entities from the model
                entities = []
                for tag in self.tags:
                    tags = get_tags(paths[0], tag, self.tag_map)
                    entities += format_result(tags, cuttext, tag)

                if not entities:
                    # Only this chunk is empty; keep the session and the
                    # results of the other chunks alive.
                    print("There was no entity in this sentence!!")
                    continue

                # Sort by the "start" index only. Sorting the
                # [start, entity] pairs themselves would compare the entity
                # dicts on equal starts and raise TypeError.
                entities.sort(key=lambda entity: entity.get('start'))

                # One (n, 1) object array per chunk, as before
                cut_out.append(
                    np.array([[entity] for entity in entities], dtype=object))

            print(cut_out)
class ChineseNER(object):
    def __init__(self, entry="train"):
        """
        Load the configuration, then build the model for ``entry``.

        :param entry: mode passed to ``__init_model``; the modes handled
                      there are "train", "test" and "predict"
        """
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        """
        Build the data managers and the BiLSTM-CRF model for ``entry``.

        "train" loads the train/dev data, saves the vocab and tag map and
        builds a fresh model. "test" and "predict" rebuild the model from the
        saved parameters; "test" additionally loads the test data and prints
        the GPU flag. Every handled mode ends with a checkpoint restore
        attempt. Any other value does nothing.
        """
        if entry == "train":
            train_manager = DataManager(batch_size=self.batch_size,
                                        tags=self.tags)
            self.train_manager = train_manager
            self.total_size = len(train_manager.batch_data)
            self.save_params({
                "batch_size": train_manager.batch_size,
                "input_size": train_manager.input_size,
                "vocab": train_manager.vocab,
                "tag_map": train_manager.tag_map,
            })
            self.dev_batch = DataManager(batch_size=30,
                                         data_type="dev").iteration()

            self.model = BiLSTMCRF(tag_map=train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                print('True')
                self.model = self.model.cuda()
            else:
                print('False')
            self.restore_model()
            return

        if entry not in ("test", "predict"):
            return

        # "test" and "predict" share everything except the test data manager
        # and the GPU flag print-out.
        is_test = entry == "test"
        if is_test:
            self.dev_manager = DataManager(batch_size=30, data_type="test")

        saved = self.load_params()
        vocab_size = saved.get("input_size")
        self.tag_map = saved.get("tag_map")
        self.vocab = saved.get("vocab")

        self.model = BiLSTMCRF(tag_map=self.tag_map,
                               vocab_size=vocab_size,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               use_gpu=self.use_gpu)
        if self.use_gpu:
            if is_test:
                print('True')
            self.model = self.model.cuda()
        elif is_test:
            print('False')
        self.restore_model()

    def load_config(self):
        """
        Load the hyper-parameters from ``models/config.yml`` onto the instance.

        When the file cannot be read or does not contain a YAML mapping, the
        built-in defaults are used instead. The defaults are written to disk
        only if no config file exists yet, so a config with a typo is never
        overwritten.

        Sets: embedding_size, hidden_size, batch_size, model_path, tags,
        dropout, use_gpu.
        """
        import os

        config_path = "models/config.yml"
        try:
            with open(config_path, encoding="utf-8") as fopen:
                # safe_load only builds plain data. A bare yaml.load(stream)
                # needs an explicit Loader on current PyYAML and would
                # otherwise always end up in the fallback below.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                raise ValueError("config is not a mapping: {!r}".format(config))
        except (OSError, ValueError, yaml.YAMLError) as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["component", "disease&symptom", "people"],  # edit the tags here
                "use_gpu": True
            }
            if not os.path.exists(config_path):
                os.makedirs(os.path.dirname(config_path), exist_ok=True)
                with open(config_path, "w", encoding="utf-8") as fopen:
                    yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")
        self.use_gpu = config.get("use_gpu")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self, epochs=100):
        """
        Train the model with Adam, checkpointing after every batch.

        Every 10th batch ``evaluate`` is run on the next dev batch (before
        the backward pass of the current batch, as before).

        :param epochs: number of passes over the training data
                       (defaults to the previously hard-coded 100)
        """
        optimizer = optim.Adam(self.model.parameters())

        for epoch in range(epochs):
            for index, batch in enumerate(self.train_manager.get_batch(),
                                          start=1):
                self.model.zero_grad()
                # Debug output. It used to index batch[10] as well, which
                # raised IndexError for any batch with fewer than 11 samples.
                print('batch', type(batch), len(batch), len(batch[0]))
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                # original (unpadded) length of every sentence in the batch
                length_tensor = torch.tensor(length, dtype=torch.long)
                if self.use_gpu:
                    sentences_tensor = sentences_tensor.cuda()
                    tags_tensor = tags_tensor.cuda()
                    length_tensor = length_tensor.cuda()
                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                if index % 10 == 0:
                    self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        """
        Decode the next dev batch and call ``f1_score`` once per tag.

        Runs without gradient tracking. Nothing is returned; the return
        values of ``f1_score`` are discarded here.
        """
        with torch.no_grad():
            dev_samples = next(self.dev_batch)
            sentences, labels, length = zip(*dev_samples)
            decoded = self.model(sentences)
            _, paths = decoded
            print("\teval")
            tag_map = None
            for tag in self.tags:
                tag_map = self.model.tag_map
                f1_score(labels, paths, tag, tag_map)

    def predict(self, path):
        """
        Tag every sentence of ``./data/<path>.txt`` and write the entities
        to ``./result/tag_<path>.json``.

        Each input line is stripped and split on the full stop '。'. For every
        resulting piece one JSON object ``{'sentense': ..., 'entities': ...}``
        is appended to the output file; the objects are written back to back
        with no separator, as before.

        :param path: base name (without extension) of the input file
        """
        sentences = []
        with open('./data/' + path + '.txt', 'r', encoding='utf-8') as source:
            for line in source:
                sentences += line.strip().split('。')

        # The output holds non-ASCII text (ensure_ascii=False), so the
        # encoding is fixed instead of depending on the platform default. The
        # context manager closes the file even if tagging fails midway.
        with open('./result/tag_' + path + '.json', 'w',
                  encoding='utf-8') as sink, torch.no_grad():
            for input_str in sentences:
                # Map every character to its vocab index (0 when unknown)
                input_vec = [self.vocab.get(char, 0) for char in input_str]
                # convert to tensor
                sentence_tensor = torch.tensor(input_vec).view(1, -1)
                _, paths = self.model(sentence_tensor)

                entities = []
                for tag in self.tags:
                    tags = get_tags(paths[0], tag, self.tag_map)
                    entities += format_result(tags, input_str, tag)
                dic = {'sentense': input_str, 'entities': entities}
                json.dump(dic, sink, ensure_ascii=False)
#             return entities
#     def testXXX(self):
#         for batch in self.dev_manager.get_batch():
#             print(_)
#             print(_,len(items),len(items[0][0]),len(items[0][1]),items[0][2])
#             break

    def test(self):
        """
        Evaluate on the test data and dump the entities found.

        For every batch of ``self.dev_manager`` the model output is scored
        with ``f1_score`` per tag and the entities of each sentence are
        appended to ``./result/test_tag.json`` as JSON objects (written back
        to back, as before). At the end the per-tag recall, precision and f1,
        averaged over the batches, are printed.
        """
        id2vocab = {index: char for char, index in self.vocab.items()}
        print(len(id2vocab))
        # One row per tag; the columns are recall, precision, f1
        total_matrix = np.zeros([len(self.tags), 3])
        count = 0
        # utf-8 because the JSON is written with ensure_ascii=False. The
        # context manager closes the file even when a batch fails.
        with torch.no_grad(), open('./result/test_tag.json', 'w',
                                   encoding='utf-8') as f:
            for count, batch in enumerate(self.dev_manager.get_batch(),
                                          start=1):
                print(count)
                sentences, labels, _ = zip(*batch)
                # Back from vocab indices to characters
                strs = [[id2vocab[w] for w in s] for s in sentences]
                _, paths = self.model(sentences)

                for row, tag in enumerate(self.tags):
                    recall, precision, f1 = f1_score(labels, paths, tag,
                                                     self.model.tag_map)
                    total_matrix[row][0] += recall
                    total_matrix[row][1] += precision
                    total_matrix[row][2] += f1

                for chars, tag_path in zip(strs, paths):
                    entities = []
                    for tag in self.tags:
                        tags = get_tags(tag_path, tag, self.tag_map)
                        entities += format_result(tags, chars, tag)
                    dic = {'sentense': ''.join(chars), 'entities': entities}
                    json.dump(dic, f, ensure_ascii=False)

        # Average over the batches. With no batch at all the zeros are kept
        # instead of dividing by zero.
        if count:
            total_matrix /= count
        for row, tag in enumerate(self.tags):
            print(
                "{}\tcount\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}"
                .format(count, tag, total_matrix[row][0],
                        total_matrix[row][1], total_matrix[row][2]))
# Example #3
# 0
class ChineseNER(object):
    def __init__(self, entry="train"):
        """
        Load the configuration, then build the model for ``entry``.

        :param entry: mode passed to ``__init_model``; the modes handled
                      there are "train" and "predict"
        """
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        """
        Build the data managers and the BiLSTM-CRF model for ``entry``.

        "predict" rebuilds the model from the saved parameters; "train" loads
        the train/dev data, saves the vocab and tag map and builds a fresh
        model. Both end with a checkpoint restore attempt. Any other value
        does nothing.
        """
        if entry == "predict":
            saved = self.load_params()
            vocab_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("vocab")

            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=vocab_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
            return

        if entry != "train":
            return

        train_manager = DataManager(batch_size=self.batch_size,
                                    tags=self.tags)
        self.train_manager = train_manager
        self.total_size = len(train_manager.batch_data)
        # Persist what "predict" needs to rebuild the model
        self.save_params({
            "batch_size": train_manager.batch_size,
            "input_size": train_manager.input_size,
            "vocab": train_manager.vocab,
            "tag_map": train_manager.tag_map,
        })
        self.dev_batch = DataManager(batch_size=30,
                                     data_type="dev").iteration()

        self.model = BiLSTMCRF(
            tag_map=train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=len(train_manager.vocab),
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()

    def load_config(self):
        """
        Load the hyper-parameters from ``models/config.yml`` onto the instance.

        When the file cannot be read or does not contain a YAML mapping, the
        built-in defaults are used instead. The defaults are written to disk
        only if no config file exists yet, so a config with a typo is never
        overwritten.

        Sets: embedding_size, hidden_size, batch_size, model_path, tags,
        dropout.
        """
        import os

        config_path = "models/config.yml"
        try:
            # Read with the same encoding the defaults are written with
            with open(config_path, encoding="UTF-8") as fopen:
                config = yaml.load(fopen, Loader=yaml.FullLoader)
            if not isinstance(config, dict):
                raise ValueError("config is not a mapping: {!r}".format(config))
        except (OSError, ValueError, yaml.YAMLError) as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                # The key was misspelled "tasg", which left self.tags as None
                # whenever the defaults were used.
                "tags": ["ORG", "PER"]
            }
            if not os.path.exists(config_path):
                os.makedirs(os.path.dirname(config_path), exist_ok=True)
                with open(config_path, "w", encoding='UTF-8') as fopen:
                    yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """
        Train the model with Adam for 100 epochs.

        For every batch: compute the negative log-likelihood, print the
        progress, run ``evaluate`` on the next dev batch, back-propagate,
        step the optimizer and save a checkpoint.
        """
        optimizer = optim.Adam(self.model.parameters())
        separator = "-" * 50

        for epoch in range(100):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                self.model.zero_grad()

                # Split the batch into its sentence / tag / length columns
                sentences, tags, length = zip(*batch)
                sentences_tensor, tags_tensor, length_tensor = [
                    torch.tensor(column, dtype=torch.long)
                    for column in (sentences, tags, length)
                ]

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)

                filled = int(index * 25 / self.total_size)
                progress = ("█" * filled).ljust(25)
                shown_loss = loss.cpu().tolist()[0]
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, shown_loss))

                self.evaluate()
                print(separator)

                loss.backward()
                optimizer.step()

                checkpoint_file = self.model_path + 'params.pkl'
                torch.save(self.model.state_dict(), checkpoint_file)

    def evaluate(self):
        """
        Decode the next dev batch and call ``f1_score`` once per tag.

        Nothing is returned; the return values of ``f1_score`` are discarded
        here.
        """
        sentences, labels, length = zip(*next(self.dev_batch))
        # train() calls this for every batch, between the forward and the
        # backward pass. Evaluation needs no gradients, so no autograd graph
        # is built for it.
        with torch.no_grad():
            _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=""):
        """
        Tag one sentence and return the entities found in it.

        :param input_str: text to tag; when empty the text is read from stdin
        :return: list with the ``format_result`` output of every configured
                 tag, in the order of ``self.tags``
        """
        text = input_str if input_str else input("请输入文本: ")

        # Map every character to its vocab index (0 when unknown) and shape
        # the indices into a single-row tensor.
        char_ids = [self.vocab.get(char, 0) for char in text]
        model_input = torch.tensor(char_ids).view(1, -1)
        _, paths = self.model(model_input)

        found = []
        for tag in self.tags:
            tag_spans = get_tags(paths[0], tag, self.tag_map)
            found += format_result(tag_spans, text, tag)
        return found
# Example #4
# 0
class ChineseNER:
    def __init__(self, entry="train"):
        """Read the hyper-parameters from the config and set up the model.

        :param entry: mode forwarded to ``main_model``
        """
        config = load_config()
        # Each listed key is copied onto the instance under the same name;
        # keys absent from the config become ``None``.
        for key in (
                "model_path",
                "epochs",
                "batch_size",
                "learning_rate",
                "weight_decay",
                "dropout",
                "hidden_size",
                "char_num",
                "char_dim",
                "word_dim",
                "word_num",
                "tags",
                "transfer_learning",
                "lr_decay_step",
                "lr_decay_rate",
        ):
            setattr(self, key, config.get(key))

        # Build (and, when possible, restore) the model for this mode.
        self.main_model(entry)

    def main_model(self, entry):
        """
        Build the data managers and the BiLSTM-CRF model for the given mode

        :param entry: "train" prepares training and dev data plus a fresh
            model; "predict" rebuilds the model from the saved parameters.
            Any other value leaves the instance without a model.
        """
        # The Training Process
        if entry == "train":
            # Training Process: read Training Data from DataManager
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             data_type='train',
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)

            # Load some model parameters
            # NOTE(review): the value returned by load_params is discarded;
            # the call only serves as an "is the file loadable" probe.
            try:
                load_params(path=self.model_path)
                print("Successfully load the data.pkl!!!")

            # NOTE(review): every exception type is treated as "file missing",
            # so an unreadable or corrupt file is overwritten as well.
            except Exception as error:
                print("There was no data.pkl!! Start to save........")
                # Read the corresponding character index (vocab) and other hyper-parameters
                saved_data = {
                    "batch_size": self.train_manager.batch_size,
                    "input_size": self.train_manager.input_size,
                    "char_vocab": self.train_manager.char_vocab,
                    "tag_map": self.train_manager.tag_map,
                }
                save_params(data=saved_data, path=self.model_path)

            # Build BiLSTM-CRF Model
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.char_vocab),
                dropout=self.dropout,
                word_num=self.word_num,
                word_dim=self.word_dim,
                char_num=self.char_num,
                char_dim=self.char_dim,
                hidden_dim=self.hidden_size,
            )

            # Restore model if it exists
            self.restore_model()

            # Evaluation Process: read Dev Data from DataManager
            # The first manager is only used for the value of load_char_data(),
            # which then becomes the batch size of the real dev manager
            # (presumably the dev-set size, i.e. one batch holds the whole
            # dev set - TODO confirm against DataManager).
            self.dev_size = DataManager(batch_size=1,
                                        data_type="dev",
                                        tags=self.tags).load_char_data()
            self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                           data_type="dev")
            self.dev_batch = self.dev_manager.iteration()

        # The Inference Process
        elif entry == "predict":
            # Vocabulary, tag map and input size come from the parameters
            # saved during training.
            data = load_params(path=self.model_path)
            input_size = data.get("input_size")
            self.tag_map = data.get("tag_map")
            self.vocab = data.get("char_vocab")
            # Dropout is switched off for inference.
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=input_size,
                dropout=0.0,
                word_num=self.word_num,
                word_dim=self.word_dim,
                char_num=self.char_num,
                char_dim=self.char_dim,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()

    def train(self):
        # Transfer Learning Module
        if self.transfer_learning == True:
            keep_grad = [
                "transitions", "char_embedding.weight",
                "char_linear_lstm.weight", "char_linear_lstm.bias",
                "word_linear_lstm.weight", "word_linear_lstm.bias",
                "hidden2tag.weight", "hidden2tag.bias"
            ]

            for name, value in self.model.named_parameters():
                if name in keep_grad:
                    value.requires_grad = True
                else:
                    value.requires_grad = False
        else:
            for name, value in self.model.named_parameters():
                value.requires_grad = True

        # Use Adam Optimizer
        optimizer = optim.Adam(params=filter(lambda p: p.requires_grad,
                                             self.model.parameters()),
                               lr=self.learning_rate,
                               weight_decay=self.weight_decay)

        # Learning Rate Decay
        # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=self.lr_decay_step, gamma=self.lr_decay_rate)

        # Print model architecture
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(self.model)
        print('\n')

        # Print model parameters
        print('\033[1;31mThe model\'s parameters are shown below:\033[0m')
        for name, value in self.model.named_parameters():
            print("Name: \033[1;31m{0}\033[0m, "
                  "Parameter Size: \033[1;36m{1}\033[0m, "
                  "Gradient: \033[1;35m{2}\033[0m".format(
                      name, value.size(), value.requires_grad))
        print('\n')

        for epoch in range(1, self.epochs + 1):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                # Clear gradients before training
                self.model.zero_grad()

                ####################################################################################################################################
                # Read sentences and labels from the batch data
                chars, labels, words, len_word, len_char = zip(*batch)
                chars_tensor = torch.tensor(chars,
                                            dtype=torch.long,
                                            device=device)
                labels_tensor = torch.tensor(labels,
                                             dtype=torch.float,
                                             device=device)
                words_tensor = torch.tensor(words,
                                            dtype=torch.float,
                                            device=device)
                len_word_tensor = torch.tensor(len_word,
                                               dtype=torch.int64,
                                               device=device)
                len_char_tensor = torch.tensor(len_char,
                                               dtype=torch.int64,
                                               device=device)

                ####################################################################################################################################
                loss = self.model.neg_log_likelihood(characters=chars_tensor,
                                                     tags=labels_tensor,
                                                     len_char=len_char_tensor,
                                                     words=words_tensor,
                                                     len_word=len_word_tensor)
                progress = ("█" * int(index * 40 / self.total_size)).ljust(40)
                print("epoch [{}] |{}| {}/{}\t Batch Loss {:.6f}".format(
                    epoch, progress, index, self.total_size,
                    loss.tolist()[0]))

                ####################################################################################################################################
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

            self.evaluate()
            # scheduler.step()

    def evaluate(self):
        """
        Evaluation of the performance using the development set

        Takes the next batch from ``self.dev_batch``, decodes it with the
        model and prints the loss followed by the tag-level,
        tag-label-level and label-level F1 summaries. Nothing is returned.
        """

        def macro_average(scores):
            # Mean of the scores; 0.0 when there is nothing to average
            # (e.g. an empty tag list) instead of a ZeroDivisionError.
            return sum(scores) / len(scores) if scores else 0.0

        chars, labels, words, len_words, len_chars = zip(
            *next(self.dev_batch))
        chars_tensor = torch.tensor(chars, dtype=torch.long, device=device)
        words_tensor = torch.tensor(words, dtype=torch.float, device=device)
        len_word_tensor = torch.tensor(len_words,
                                       dtype=torch.int64,
                                       device=device)
        len_char_tensor = torch.tensor(len_chars,
                                       dtype=torch.int64,
                                       device=device)
        tag_map = self.model.tag_map

        # Evaluation never back-propagates, so gradient tracking is disabled
        # to avoid building an autograd graph for the dev batch.
        with torch.no_grad():
            # Run the Forward pass of the model
            _, pre = self.model(characters=chars_tensor,
                                len_char=len_char_tensor,
                                words=words_tensor,
                                len_word=len_word_tensor)
            tags_tensor = torch.tensor(pre, dtype=torch.int, device=device)

            # Loss on the dev set
            # NOTE(review): the loss is computed against the *predicted*
            # path (`pre`), not the gold `labels` - confirm this is intended.
            loss = self.model.neg_log_likelihood(characters=chars_tensor,
                                                 tags=tags_tensor,
                                                 len_char=len_char_tensor,
                                                 words=words_tensor,
                                                 len_word=len_word_tensor)
        print("\t Evaluation Loss on the dev set {:.6f}".format(
            loss.tolist()[0]))

        print('Start to evaluate on the dev set: ')

        # Tag-level F1 score summary (w.r.t. each tag)
        tag_f1_total = []
        for tag in self.tags:
            _, _, f1_tag = tag_f1(tar_path=labels,
                                  pre_path=pre,
                                  tag=tag,
                                  tag_map=tag_map)
            tag_f1_total.append(f1_tag)
        tag_macro_f1 = macro_average(tag_f1_total)
        print(
            'Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % tag_macro_f1)

        # Tag-level Micro-averaged F1 Score
        _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels,
                                          pre_path=pre,
                                          tags=self.tags,
                                          tag_map=tag_map)
        print(
            'Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m'
            % f1_Micro_tag)

        # Tag-level with Label-level F1 score summary: one score for every
        # (tag, prefix) combination.
        f1_prefix_total = []
        for tag in self.tags:
            for prefix in ['B', 'I', 'E', 'S']:
                _, _, f1_prefix = entity_label_f1(tar_path=labels,
                                                  pre_path=pre,
                                                  length=len_chars,
                                                  tag=tag,
                                                  tag_map=tag_map,
                                                  prefix=prefix)
                f1_prefix_total.append(f1_prefix)

        f1_macro_tag_prefix = macro_average(f1_prefix_total)
        print(
            'Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_tag_prefix)

        # Label-level F1 score summary: here the 'O' prefix is included too.
        f1_prefix_total = []
        for prefix in ['B', 'I', 'E', 'S', 'O']:
            _, _, f1_prefix = label_f1(tar_path=labels,
                                       pre_path=pre,
                                       length=len_chars,
                                       tags=self.tags,
                                       tag_map=tag_map,
                                       prefix=prefix)
            f1_prefix_total.append(f1_prefix)

        f1_macro_prefix = macro_average(f1_prefix_total)
        print(
            'Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_prefix)

    def predict(self):
        """
        Prediction & Inference Stage

        Loads the pre-trained word vectors once, then loops forever: reads a
        sentence from standard input, builds the padded character and word
        inputs, runs the model and prints the extracted entities. The loop
        only ends through an exception such as EOF or a keyboard interrupt.
        """
        # Load word vectors
        pre_trained = self.load_word_vector()

        while True:
            input_str = input("Please input a sentence in Chinese: ")
            input_str = stringQ2B(input_str)
            if not input_str:
                # Nothing to tag; ask again instead of feeding the model a
                # zero-length sequence.
                continue

            # Get character embedding
            char_vec = [self.vocab.get(i, 0) for i in input_str]
            # The length is taken before padding and capped at the padded
            # width, because pad_char_data truncates longer inputs.
            len_char = torch.tensor([min(len(char_vec), self.char_num)],
                                    dtype=torch.int64,
                                    device=device)
            char_tensor = torch.tensor(
                np.array(self.pad_char_data(char_vec)).tolist(),
                dtype=torch.long,
                device=device)

            # Get word embedding
            embed_words = []
            for word in jieba.lcut(input_str, HMM=True):
                vec = pre_trained.get(word)
                if vec is None:
                    # Unknown word: fall back to a random vector.
                    vec = np.random.normal(size=self.word_dim).tolist()
                embed_words.append(vec)

            # Keep at most word_num words. The word count has to be taken
            # here, before padding: the padded array always has a leading
            # dimension of 1, which is what the length used to be read from.
            embed_words = embed_words[:self.word_num]
            len_word = torch.tensor([len(embed_words)],
                                    dtype=torch.int64,
                                    device=device)
            word_tensor = torch.tensor(
                np.array(self.pad_word_data(embed_words)).tolist(),
                dtype=torch.float,
                device=device)

            # Run the model and get all the predicted entities
            _, paths = self.model(characters=char_tensor,
                                  len_char=len_char,
                                  words=word_tensor,
                                  len_word=len_word)

            # Format the results
            entities = []
            for tag in self.tags:
                tags = get_tags(path=paths[0], tag=tag, tag_map=self.tag_map)
                entities += format_result(result=tags, text=input_str, tag=tag)
            print(entities)

    def load_word_vector(self):
        """
        Load pre-trained word vectors
        """
        if 'pre_trained' not in globals().keys():
            print("Start to load pre-trained word embeddings!!")
            pre_trained = {}
            for i, line in enumerate(
                    codecs.open(self.model_path + "word_vectors.vec",
                                'r',
                                encoding='utf-8')):
                line = line.rstrip().split()
                if len(line) == self.word_dim + 1:
                    pre_trained[line[0]] = np.array(
                        [float(x) for x in line[1:]]).astype(np.float32)
        else:
            pre_trained = globals().get("pre_trained")
        return pre_trained

    def pad_char_data(self, data: list):
        """
        Pad character data
        """
        c_data = copy.deepcopy(data)
        if np.shape(c_data)[0] < self.char_num:
            c_data = c_data + (self.char_num - np.shape(c_data)[0]) * [0]
        else:
            c_data = c_data[:self.char_num]
        c_data = np.expand_dims(c_data, axis=0)
        return c_data

    def pad_word_data(self, data: list):
        """
        Pad word data
        """
        c_data = copy.deepcopy(data)
        if len(c_data) <= self.word_num:
            c_data = c_data + (self.word_num - len(c_data)) * [[0] *
                                                               self.word_dim]
        else:
            c_data = c_data[:self.word_num, :]
        c_data = np.reshape(c_data,
                            [np.shape(c_data)[0] * np.shape(c_data)[1]])
        c_data = np.expand_dims(c_data, axis=0)
        return c_data

    def restore_model(self):
        """
        Restore and load the model
        """
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("Model Successfully Restored!!")
        except Exception as error:
            print("Model Failed to restore!!")
Beispiel #5
0
class ChineseNER(object):
    def __init__(self, entry="train"):
        """Load the configuration, then set up the model for ``entry``.

        :param entry: mode forwarded to ``__init_model``
        """
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        """Create the data managers and the BiLSTM-CRF model for ``entry``.

        :param entry: "train" builds the training/dev data and a new model;
            "predict" rebuilds the model from the saved parameters. Any other
            value leaves the instance without a model.
        """
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            # Persist what the predict mode needs to rebuild the model.
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            # Dev batches use a fixed batch size of 30.
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            # Continue from an existing checkpoint when one can be loaded.
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")

            # batch_size and dropout are not passed here, so BiLSTMCRF uses
            # whatever defaults it declares for them.
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def parse_argument(self):
        """
        Parse the command line into a ``Configurable`` object

        The config file named by ``-c/--config`` is loaded and the remaining
        options are copied onto it as attributes. An invalid ``--t_data``
        prints the usage and exits.

        :return: the populated config object
        """
        parser = argparse.ArgumentParser(description="NER")
        parser.add_argument("-c", "--config", dest="config_file", type=str,
                            default="./Config/config.cfg", help="config path")
        parser.add_argument("-device", "--device", dest="device", type=str,
                            default="cuda:0",
                            help="device[‘cpu’,‘cuda:0’,‘cuda:1’,......]")
        parser.add_argument("--train", dest="train", action="store_true",
                            default=True, help="train model")
        parser.add_argument("-p", "--process", dest="process",
                            action="store_true", default=True,
                            help="data process")
        parser.add_argument("-t", "--test", dest="test", action="store_true",
                            default=False, help="test model")
        parser.add_argument("--t_model", dest="t_model", type=str,
                            default=None, help="model for test")
        parser.add_argument("--t_data", dest="t_data", type=str, default=None,
                            help="data[train, dev, test, None] for test model")
        parser.add_argument("--predict", dest="predict", action="store_true",
                            default=False, help="predict model")
        args = parser.parse_args()

        config = configurable.Configurable(config_file=args.config_file)
        # Mirror every command-line option onto the config object.
        for option in ("device", "train", "process", "test", "t_model",
                       "t_data", "predict"):
            setattr(config, option, getattr(args, option))

        # Asking for a test run switches training off.
        if config.test is True:
            config.train = False

        if config.t_data not in [None, "train", "dev", "test"]:
            print("\nUsage")
            parser.print_help()
            print("t_data : {}, not in [None, 'train', 'dev', 'test']".format(
                config.t_data))
            exit()

        # Echo the effective settings between two banner lines.
        banner = "***************************************"
        summary = (
            ("Device", config.device),
            ("Data Process", config.process),
            ("Train model", config.train),
            ("Test model", config.test),
            ("t_model", config.t_model),
            ("t_data", config.t_data),
            ("predict", config.predict),
        )
        print(banner)
        for label, value in summary:
            print("{} : {}".format(label, value))
        print(banner)

        return config

    def load_config(self):
        """Load the hyper-parameters from ``models/config.yml``.

        When the file cannot be read or does not hold a mapping, a default
        configuration is used and written back to ``models/config.yml``
        (creating the ``models`` directory if needed). The values end up on
        the instance as ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags`` and ``dropout``.
        """
        import os

        try:
            with open("models/config.yml") as fopen:
                # safe_load only builds plain Python objects; calling
                # yaml.load without a Loader fails on current PyYAML, which
                # would send every run into the fallback below.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                raise ValueError("config.yml does not contain a mapping")
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                # The key must be "tags": it is the one read below.
                "tags": ["ORG", "PER"]
            }
            os.makedirs("models", exist_ok=True)
            with open("models/config.yml", "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(100):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities
class ChineseNER(object):
    def __init__(self, entry="train"):
        """Load the configuration, then set up the model for ``entry``.

        :param entry: mode forwarded to ``__init_model``
        """
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            self.dev_manager = DataManager(batch_size=60, data_type="dev")
            # 验证集
            # self.dev_batch = self.dev_manager.iteration()

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.model = self.model.cuda()
            self.restore_model()
        elif entry == "predict" or "evaluate":
            # python main.py predict
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            print('input_size', input_size)
            print('tag_map', self.tag_map)
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.model = self.model.cuda()
            self.test_manager = DataManager(batch_size=60, data_type="dev")
            self.restore_model()

    # Load the configuration options
    def load_config(self):
        """Load the hyper-parameters from ``models/config.yml``.

        When the file cannot be read or does not hold a mapping, a default
        configuration is used and written back to ``models/config.yml``
        (creating the ``models`` directory if needed). The values end up on
        the instance as ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags`` and ``dropout``.
        """
        import os

        try:
            with open("models/config.yml") as fopen:
                # safe_load only builds plain Python objects; calling
                # yaml.load without a Loader fails on current PyYAML, which
                # would send every run into the fallback below.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                raise ValueError("config.yml does not contain a mapping")
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 300,
                "hidden_size": 128,
                "batch_size": 30,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["TREATMENT", "BODY", "SIGNS", "CHECK", "DISEASE"]
            }
            os.makedirs("models", exist_ok=True)
            with open("models/config.yml", "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    # Restore the trained model weights from the saved checkpoint
    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params_6all.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    # Save the model hyper-parameters
    def save_params(self, data):
        with open("models/data_6all.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    # Load the model hyper-parameters
    def load_params(self):
        with open("models/data_6all.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Train on the GPU with Adam and a plateau-based LR schedule.

        Runs epochs 268 through 400. The weights are saved to
        ``params_6all.pkl`` whenever the average epoch loss improves on the
        best value so far, the average loss is logged to ``writer`` and the
        dev set is evaluated every fifth epoch.
        """
        optimizer = optim.Adam(self.model.parameters(),
                               weight_decay=0.002,
                               lr=0.0000004)
        # Halve the learning rate when the average loss stops improving.
        scheduler_lr = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=2,
            cooldown=5,
            verbose=True,
            min_lr=1e-8,
            eps=1e-8)
        best_loss = 240
        # Loss of each batch position during the previous epoch.
        previous_losses = [0] * self.total_size
        # NOTE(review): the start epoch (268) and the initial best_loss look
        # like values for resuming an earlier run - confirm before reuse.
        for epoch in range(268, 401):
            epoch_losses = []
            epoch_started = time.process_time()
            batches = self.train_manager.get_batch()
            for position, batch in enumerate(batches):
                batch_started = time.process_time()
                index = position + 1
                self.model.zero_grad()

                # Per the original note, `length` holds each sentence's
                # original length and the sentence tensor is
                # (batch_size, longest sentence in the batch).
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences,
                                                dtype=torch.long).cuda()
                tags_tensor = torch.tensor(tags, dtype=torch.long).cuda()
                length_tensor = torch.tensor(length, dtype=torch.long).cuda()

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                epoch_losses.append(loss.cpu().item())
                filled = int(index * 60 / self.total_size)
                progress = ("█" * filled).ljust(60)
                loss.backward()
                optimizer.step()

                elapsed = time.process_time() - batch_started
                report = """epoch [{}] |{}| {}/{}\n\tloss {:.3f}\t\tlast_loss {:.3f}\t\ttime {}\t\tbest_avg_loss {:.3f}"""
                print(
                    report.format(epoch, progress, index, self.total_size,
                                  loss.cpu().tolist()[0],
                                  previous_losses[position], str(elapsed),
                                  best_loss))
                previous_losses[position] = loss.cpu().item()
                print("-" * 90)

            epoch_seconds = time.process_time() - epoch_started
            avg_loss = np.mean(epoch_losses)
            # Keep the best model only.
            if avg_loss < best_loss:
                best_loss = avg_loss
                torch.save(self.model.state_dict(),
                           self.model_path + 'params_6all.pkl')
            writer.add_scalar('BiLstm_CRF:avg_loss-epoch', avg_loss, epoch)
            print('epoch ', epoch, '   avg_loss ', avg_loss, '   total_time ',
                  epoch_seconds)
            if epoch % 5 == 0:
                self.evaluate(epoch / 5, manager=self.dev_manager)
            print("-" * 100)
            scheduler_lr.step(avg_loss)
        writer.close()

    # train: BODY 7507, SIGNS 6355, CHECK 6965, DISEASE 474, TREATMENT 805
    # test:
    # Compute the F1 scores to evaluate the model
    def evaluate(self, epoch, manager, add_scalar=True):
        print('正在开始评估')
        all_origins = all_founds = all_rights = 0
        for tag in self.tags:
            origins = founds = rights = 0
            for batch in manager.get_batch():
                sentences, labels, length = zip(*batch)
                _, paths = self.model(sentences)
                origin, found, right = f1_score(labels, paths, tag,
                                                self.model.tag_map)
                origins += origin
                founds += found
                rights += right
            all_origins += origins
            all_founds += founds
            all_rights += rights
            recall = 0. if origins == 0 else (rights / origins)
            precision = 0. if founds == 0 else (rights / founds)
            f1 = 0. if recall + precision == 0 else (
                2 * precision * recall) / (precision + recall)
            print("\t{}\torigins:{}\t\t\tfounds:{}\t\t\trights:{}".format(
                tag, origins, founds, rights))
            print("\t\t\trecall:{}\tprecision:{}\tf1:{}".format(
                recall, precision, f1))
            if add_scalar:
                tag_epoch = tag + '-5epoch'
                writer.add_scalars(tag_epoch, {
                    'recall': recall,
                    'precision': precision,
                    'f1': f1
                }, epoch)
        all_recall = 0. if all_origins == 0 else (all_rights / all_origins)
        all_precision = 0. if all_founds == 0 else (all_rights / all_founds)
        all_f1 = 0. if all_recall + all_precision == 0 else (
            2 * all_precision * all_recall) / (all_precision + all_recall)
        print("\tall_origins:{}\t\t\tall_founds:{}\t\t\tall_rights:{}".format(
            all_origins, all_founds, all_rights))
        print("\tall_recall:{}\tall_precision:{}\tall_f1:{}".format(
            all_recall, all_precision, all_f1))
        if add_scalar:
            writer.add_scalars(
                "ALL-5epoch", {
                    'all_recall': all_recall,
                    'all_precision': all_precision,
                    'all_f1': all_f1
                }, epoch)
        print('评估结束')
        return all_recall, all_precision, all_f1

    # Prediction method
    def predict(self, input_str=""):
        """Extract the entities in a piece of text, print and return them.

        :param input_str: text to analyse; when empty the user is prompted
        :return: list built from the ``format_result`` output of every tag
        """
        text = input_str or input("请输入文本: ")
        # Vocabulary index of every character; unknown characters map to 0.
        char_ids = [self.vocab.get(char, 0) for char in text]
        # One-sentence batch, moved to the GPU.
        batch = torch.tensor(char_ids, dtype=torch.long).view(1, -1).cuda()
        # `paths` holds the predicted tag indices; only the first entry is
        # read below.
        _, paths = self.model(batch)

        entities = []
        for tag in self.tags:
            spans = get_tags(paths[0], tag, self.tag_map)
            entities.extend(format_result(spans, text, tag))
        # The result is shown twice: as a Python list, then as indented JSON.
        print(entities)
        print(json.dumps(entities, indent=4, ensure_ascii=False))
        return entities
Beispiel #7
0
class BiLSTMCRFEnter(object):

    def __init__(self, entry="train"):
        """Load the configuration, then build the model for the given mode.

        Args:
            entry: "train" prepares the data managers and a trainable model;
                "predict" rebuilds the model from previously saved parameters.
        """
        # Populate the hyper-parameter attributes from the config file.
        self.load_config()
        # ``entry`` selects the mode ("train" by default) that decides how the
        # model and its supporting data are set up.
        self.__init_model(entry)

    def __init_model(self, entry):
        """Create the data managers and the BiLSTM-CRF model for ``entry``.

        Args:
            entry: "train" builds the training and dev data managers, saves
                the vocabulary metadata and creates a trainable model.
                "predict" rebuilds the model from the saved metadata.
                Any other value leaves the instance without a model.
        """
        if entry == "train":
            # Manager for the training data set.
            self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            # Keep the lookup tables on the instance too, so predict() also
            # works on an object that was created in "train" mode.
            self.vocab = self.train_manager.vocab
            self.tag_map = self.train_manager.tag_map
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            # Persist the metadata that "predict" mode reads back later.
            self.save_params(data)
            # Manager for the validation data; evaluate() pulls its batches
            # from this iterator.
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            # BiLSTM encodes the sentence, the CRF layer constrains the tag sequence.
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            # Continue from previously saved weights when they exist.
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=input_size,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size
            )
            self.restore_model()

    def load_config(self):
        """Read the hyper-parameters from ``models/config.yml`` into attributes.

        When the file cannot be read, cannot be parsed, or does not contain a
        mapping, a default configuration is written to that path and used
        instead. Sets ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags``, ``dropout`` and ``epoch``.
        """
        config_path = "models/config.yml"
        defaults = {
            "embedding_size": 100,
            "hidden_size": 128,
            "batch_size": 50,
            "dropout": 0.5,
            "model_path": "models/",
            "tags": ["Medicinal_Name", "Medicinal_Other_Name", "Medicinal_Function", "Medicinal_Taste", "Medicinal_Use_Num"],
            # train() iterates range(self.epoch), so a value must always exist.
            "epoch": 100,
        }
        try:
            with open(config_path) as fopen:
                # safe_load only builds plain data and, unlike yaml.load,
                # does not need an explicit Loader argument.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                # An empty file parses to None; treat it like a broken config.
                raise ValueError("config file does not contain a mapping")
        except (OSError, ValueError, yaml.YAMLError) as error:
            print("Load config failed, using default config {}".format(error))
            config = defaults
            # Rewrite the config file with the default values.
            with open(config_path, "w") as fopen:
                yaml.dump(config, fopen)
        # Dimension of the character embeddings.
        self.embedding_size = config.get("embedding_size")
        # Dimension of the hidden layer.
        self.hidden_size = config.get("hidden_size")
        # Number of samples per batch.
        self.batch_size = config.get("batch_size")
        # Directory prefix where the model weights are stored.
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        # Dropout value handed to the model.
        self.dropout = config.get("dropout")
        # Number of training epochs; falls back to the default when the
        # config file does not define it.
        self.epoch = config.get("epoch", defaults["epoch"])

    # Restore previously saved model weights (called in both "train" and "predict" mode).
    def restore_model(self):
        try:
            # 加载模型字典、
            # 这个load_state_dict函数并没有出现在任何一个文件中,所以这是怎么调用的?
            self.model.load_state_dict(torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    # Persist the data-manager metadata (vocab, tag map, sizes) collected during training setup.
    def save_params(self, data):
        """Pickle ``data`` into ``models/data.pkl``, replacing any previous file."""
        with open("models/data.pkl", "wb") as handle:
            pickle.Pickler(handle).dump(data)
    # Load the pickled metadata from models/data.pkl.
    def load_params(self):
        """Return the object unpickled from ``models/data.pkl``."""
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this project wrote itself.
        with open("models/data.pkl", "rb") as handle:
            return pickle.load(handle)

    def train(self):
        """Train the model for ``self.epoch`` epochs using Adam (lr=0.05).

        For every training batch the method computes the negative
        log-likelihood loss, prints a progress line, calls ``evaluate()``,
        then back-propagates, steps the optimizer and saves the weights to
        ``<model_path>params.pkl``.
        """
        optimizer = optim.Adam(self.model.parameters(), lr=0.05)
        for epoch in range(self.epoch):
            # ``index`` is the 1-based position of the batch within the epoch.
            for index, batch in enumerate(self.train_manager.get_batch(), start=1):
                self.model.zero_grad()

                # Each batch item is a (sentence, tags, length) triple.
                sentences, tags, length = zip(*batch)
                loss = self.model.neg_log_likelihood(
                    torch.tensor(sentences, dtype=torch.long),
                    torch.tensor(tags, dtype=torch.long),
                    torch.tensor(length, dtype=torch.long),
                )

                # 25-character progress bar for the current epoch.
                filled = int(index * 25 / self.total_size)
                progress = ("█" * filled).ljust(25)
                loss_value = loss.cpu().tolist()[0]
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, loss_value))
                self.evaluate()
                print("-" * 50)

                # Back-propagate, update the weights, then checkpoint.
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(), self.model_path + 'params.pkl')

    # Evaluation during training: decode one dev batch and call f1_score for each tag.
    def evaluate(self):
        """Score the model on the next batch of the dev iterator.

        Pulls one batch from ``self.dev_batch``, decodes it with the model and
        calls ``f1_score`` once for every tag in ``self.tags``.
        """
        sentences, labels, _ = zip(*next(self.dev_batch))
        # Evaluate with dropout switched off and without building an autograd
        # graph, then put the model back into whatever mode it was in so an
        # ongoing training loop is unaffected.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                _, paths = self.model(sentences)
        finally:
            self.model.train(was_training)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    # Prediction with the trained model on a single input string.
    def predict(self, input_str=""):
        """Extract the entities contained in a single piece of text.

        Args:
            input_str: Text to analyse. When empty, the text is read
                interactively from standard input.

        Returns:
            The list obtained by concatenating the ``format_result`` output
            for every tag in ``self.tags``; an empty list when no text was
            supplied at all.
        """
        if not input_str:
            input_str = input("请输入文本: ")
        if not input_str:
            # Nothing to tag; an empty index list cannot be fed to the model.
            return []
        # Map every character to its vocabulary index; unknown characters map to 0.
        input_vec = [self.vocab.get(char, 0) for char in input_str]
        # Batch with a single sentence; explicit integer dtype for the indices.
        sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            _, paths = self.model(sentences)
        entities = []
        for tag in self.tags:
            # get_tags extracts the spans labelled with this tag from the
            # decoded path; format_result turns them into entity records.
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)

        return entities

    # Entity prediction for every sentence stored in a file.
    def predict_file(self, f_r_path, f_w_path):
        # 去除重复预测的实体
        duplication = set()
        with open(f_r_path, encoding='utf-8') as f_r:
            with open(f_w_path, 'ab') as f_w:
                for line in f_r.readlines():
                    sent = line.split('\t')[-3].strip()
                    res = self.predict(sent)
                    for i in range(len(res)-1):
                        entity = res[i]['word']
                        tag=res[i]["type"]
                        if entity not in duplication:
                            # print(entity)
                            duplication.add(tag)
                            duplication.add(entity)
                            f_w.write((tag+" : "+entity + '\n').encode())
                        if res[i]["type"]!=res[i+1]["type"]:
                            f_w.write('\n'.encode())
Beispiel #8
0
class ChineseNER(object):
    use_gpu = False

    def __init__(self, entry="train"):
        """Load the configuration, build the model and optionally move it to the GPU.

        Args:
            entry: "train" prepares the data managers and a trainable model;
                "predict" rebuilds the model from previously saved parameters.
        """
        self.load_config()
        # NOTE: GPU use is controlled by the class attribute ``use_gpu``;
        # auto-detection via torch.cuda.is_available() is disabled here.
        self.__init_model(entry)
        print(self.use_gpu)
        if (self.use_gpu):  # GPU acceleration
            self.model = self.model.cuda()

    def __init_model(self, entry):
        """Set up the data managers and the BiLSTM-CRF model for ``entry``.

        "train" builds the training and dev data managers, saves the
        vocabulary metadata and creates a trainable model. "predict" rebuilds
        the model from the saved metadata. Any other value does nothing.
        In both modes previously saved weights are restored when available.
        """
        if entry == "predict":
            saved = self.load_params()
            vocab_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("vocab")

            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=vocab_size,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
            return

        if entry != "train":
            return

        manager = DataManager(batch_size=self.batch_size, tags=self.tags)
        self.train_manager = manager
        self.total_size = len(manager.batch_data)
        # Persist the metadata that "predict" mode reads back later.
        self.save_params({
            "batch_size": manager.batch_size,
            "input_size": manager.input_size,
            "vocab": manager.vocab,
            "tag_map": manager.tag_map,
        })
        # evaluate() pulls its batches from this dev-set iterator.
        self.dev_batch = DataManager(batch_size=30, data_type="dev").iteration()
        self.model = BiLSTMCRF(
            tag_map=manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=len(manager.vocab),
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()

    def load_config(self):
        """Read the hyper-parameters from ``models/config.yml`` into attributes.

        When the file cannot be read, cannot be parsed, or does not contain a
        mapping, a default configuration is written to that path and used
        instead. Sets ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags`` and ``dropout``.
        """
        config_path = "models/config.yml"
        try:
            with open(config_path) as fopen:
                # safe_load only builds plain data and, unlike yaml.load,
                # does not need an explicit Loader argument.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                # An empty file parses to None; treat it like a broken config.
                raise ValueError("config file does not contain a mapping")
        except (OSError, ValueError, yaml.YAMLError) as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                # Must be spelled "tags": that is the key read back below.
                "tags": ["ORG", "PER"],
            }
            with open(config_path, "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        # Also accept the misspelled "tasg" key so a config file that was
        # generated with that spelling keeps working.
        self.tags = config.get("tags") or config.get("tasg")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        """Pickle ``data`` into ``models/data.pkl``, replacing any previous file."""
        with open("models/data.pkl", "wb") as handle:
            pickle.Pickler(handle).dump(data)

    def load_params(self):
        """Return the object unpickled from ``models/data.pkl``."""
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this project wrote itself.
        with open("models/data.pkl", "rb") as handle:
            return pickle.load(handle)

    #@torchsnooper.snoop()
    def train(self, epochs=100):
        """Train the model with Adam (default settings), one step per batch.

        For every training batch the method computes the negative
        log-likelihood loss, prints a progress line, calls ``evaluate()``,
        then back-propagates, steps the optimizer and saves the weights to
        ``<model_path>params.pkl``.

        Args:
            epochs: Number of passes over the training batches.
        """
        optimizer = optim.Adam(self.model.parameters())
        for epoch in range(epochs):
            # ``index`` is the 1-based position of the batch within the epoch.
            for index, batch in enumerate(self.train_manager.get_batch(), start=1):
                self.model.zero_grad()
                # Each batch item is a (sentence, tags, length) triple.
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)
                if self.use_gpu:  # GPU acceleration
                    sentences_tensor = sentences_tensor.cuda()
                    tags_tensor = tags_tensor.cuda()
                    length_tensor = length_tensor.cuda()
                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                # 25-character progress bar for the current epoch.
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                # item() reads the single loss value whether the tensor is
                # 0-dimensional or has shape (1,).
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, loss.item()))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def get_string(self, x):
        """Return the second space-separated token on the second line of ``x``.

        Runs of spaces count as a single separator. Raises ``IndexError``
        when ``x`` has fewer than two lines or that line has fewer than two
        tokens.
        """
        second_line = x.split('\n')[1]
        tokens = [piece for piece in second_line.split(' ') if piece != '']
        return tokens[1]

    def evaluate(self):
        """Score the model on the next batch of the dev iterator.

        Pulls one batch from ``self.dev_batch``, decodes it with the model and
        calls ``f1_score`` once for every tag in ``self.tags``.
        """
        sentences, labels, _ = zip(*next(self.dev_batch))
        if self.use_gpu:
            sentences = torch.tensor(sentences, dtype=torch.long).cuda()
        # Evaluate with dropout switched off and without building an autograd
        # graph, then put the model back into whatever mode it was in so an
        # ongoing training loop is unaffected.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                _, paths = self.model(sentences)
        finally:
            self.model.train(was_training)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str="", input_path=None):
        """Extract entities from one string or from every review in a CSV file.

        Args:
            input_str: Text to analyse when ``input_path`` is not given. When
                empty, the text is read interactively from standard input.
            input_path: Optional path of a CSV file with ``id`` and ``Review``
                columns. When given, every row is processed and one
                ``"<id>,<type>,<word>"`` line per entity is written to
                ``output.txt``, ordered by the entity's ``start`` value.

        Returns:
            The list of entities for ``input_str``, or ``None`` in file mode.
        """
        if input_path is not None:
            tests = pd.read_csv(input_path)
            with open('output.txt', 'w', encoding='utf-8') as o:
                # Read the cell values directly: the printed form of a
                # DataFrame truncates long text and splits on spaces, and the
                # number of rows is not known in advance.
                for row_id, review in zip(tests['id'], tests['Review']):
                    text = "" if pd.isna(review) else str(review)
                    index = int(row_id)
                    entities = sorted(self._extract_entities(text),
                                      key=lambda x: x['start'])
                    for entity in entities:
                        o.write(
                            str(index) + ',' + entity['type'] + ',' +
                            entity['word'] + '\n')
            return None

        if not input_str:
            input_str = input("请输入文本: ")
        return self._extract_entities(input_str)

    def _extract_entities(self, input_str):
        """Decode one string and return its entities for every configured tag."""
        if not input_str:
            # Nothing to tag; an empty index list cannot be fed to the model.
            return []
        # Map every character to its vocabulary index; unknown characters map to 0.
        input_vec = [self.vocab.get(char, 0) for char in input_str]
        # Batch with a single sentence; explicit integer dtype for the indices.
        sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
        if self.use_gpu:  # GPU acceleration
            sentences = sentences.cuda()
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities