Example #1
class ChineseNER:
    def __init__(self, entry="train"):
        # Load hyper-parameters from the config file
        config = load_config()
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.dropout = config.get("dropout")
        self.tags = config.get("tags")
        self.learning_rate = config.get("learning_rate")
        self.epochs = config.get("epochs")
        self.weight_decay = config.get("weight_decay")
        self.transfer_learning = config.get("transfer_learning")
        self.lr_decay_step = config.get("lr_decay_step")
        self.lr_decay_rate = config.get("lr_decay_rate")
        self.max_length = config.get("max_length")

        # Model Initialization
        self.main_model(entry)

    def main_model(self, entry):
        """
        Model Initialization
        """
        # The Training Process
        if entry == "train":
            # Training Process: read Training Data from DataManager
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             data_type='train',
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)

            # Collect the character index (vocab) and other hyper-parameters to save for inference
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }

            save_params(data=data, path=self.model_path)

            # Build BiLSTM-CRF Model
            self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(self.train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   max_length=self.max_length)

            # Evaluation Process: read Dev Data from DataManager
            self.dev_size = DataManager(batch_size=1,
                                        data_type="dev",
                                        tags=self.tags).load_data()
            self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                           data_type="dev",
                                           tags=self.tags)
            self.dev_batch = self.dev_manager.iteration()
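            # Note: the dev split is loaded as a single batch (batch_size == dev_size),
            # so each call to evaluate() scores the whole dev set at once.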

            # Restore model if it exists
            self.restore_model()

        # The Testing & Inference Process
        elif entry == "predict":
            data_map = load_params(path=self.model_path)
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   dropout=0.0,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   max_length=self.max_length)

            self.restore_model()

    def restore_model(self):
        """
        Restore the model if there is one
        """
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("Model Successfully Restored!")
        except Exception as error:
            print("Model Failed to restore! {}".format(error))

    def train(self):
        """
        Training stage
        """
        model = self.model.to(device=device)

        # Transfer Learning Module
        if self.transfer_learning:
            keep_grad = [
                "transitions", "word_embeddings.weight", "hidden2tag.weight",
                "hidden2tag.bias", "linear1.weight", "linear1.bias",
                "linear2.weight", "linear2.bias"
            ]
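            # Only the CRF transitions, the word embeddings and the linear/output
            # layers listed above stay trainable; every other parameter is frozen
            # in the loop below.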

            for name, value in model.named_parameters():
                if name in keep_grad:
                    value.requires_grad = True
                else:
                    value.requires_grad = False
        else:
            for name, value in model.named_parameters():
                value.requires_grad = True

        # Use AdamW optimizer (with AMSGrad)
        optimizer = optim.AdamW(params=filter(lambda p: p.requires_grad,
                                              model.parameters()),
                                lr=self.learning_rate,
                                weight_decay=self.weight_decay,
                                amsgrad=True)

        # Learning Rate Decay
        # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=self.lr_decay_step, gamma=self.lr_decay_rate)

        # Print model architecture
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(model)
        print('\n')

        # Print model parameters
        print('\033[1;31mThe model\'s parameters are shown below:\033[0m')
        for name, value in model.named_parameters():
            print("Name: \033[1;31m{0}\033[0m, "
                  "Parameter Size: \033[1;36m{1}\033[0m, "
                  "Gradient: \033[1;35m{2}\033[0m".format(
                      name, value.size(), value.requires_grad))
        print('\n')

        for epoch in range(1, self.epochs + 1):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1

                # Clear gradients before training
                self.model.zero_grad()

                # Read sentences and tags from the batch data
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences,
                                                dtype=torch.long,
                                                device=device)
                tags_tensor = torch.tensor(tags,
                                           dtype=torch.float,
                                           device=device)
                length_tensor = torch.tensor(length,
                                             dtype=torch.int64,
                                             device=device)

                # Use negative log-likelihood (NLL) as the loss function and run the forward pass
                batch_loss = self.model.neg_log_likelihood(
                    sentences_tensor, tags_tensor, length_tensor)
                loss = batch_loss.mean()
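                # (batch_loss is a tensor over the batch; the mean above reduces it
                # to the scalar used for backpropagation.)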

                progress = ("█" * int(index * 40 / self.total_size)).ljust(40)
                print("epoch [{}] |{}| {}/{}\n\t Training Loss {:.6f}".format(
                    epoch, progress, index, self.total_size, loss))

                loss.backward()
                optimizer.step()

                # Save the model during training
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

            self.evaluate()
            # scheduler.step()

    def evaluate(self):
        """
        Evaluate performance on one batch drawn from the dev dataset
        """
        sentences, labels, length = zip(*next(self.dev_batch))
        _, pre = self.model(sentences=sentences,
                            real_length=length,
                            lengths=None)

        sentences_tensor = torch.tensor(sentences,
                                        dtype=torch.long,
                                        device=device)
        tags_tensor = torch.tensor(pre, dtype=torch.float, device=device)
        length_tensor = torch.tensor(length, dtype=torch.int64, device=device)

        loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor,
                                             length_tensor)
        print("\t Evaluation Loss {:.6f}".format(loss.tolist()[0]))

        ####################################################################################################################################
        print('Start to evaluate on the dev set: ')
        # Tag-level F1 score summary (w.r.t. each tag)
        tag_f1_total = []
        for tag in self.tags:
            _, _, f1_tag = tag_f1(tar_path=labels,
                                  pre_path=pre,
                                  tag=tag,
                                  tag_map=self.model.tag_map)
            tag_f1_total.append(f1_tag)
        tag_macro_f1 = sum(tag_f1_total) / len(tag_f1_total)
        print(
            'Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % tag_macro_f1)

        # Tag-level Micro-averaged F1 Score
        _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels,
                                          pre_path=pre,
                                          tags=self.tags,
                                          tag_map=self.model.tag_map)
        print(
            'Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m'
            % f1_Micro_tag)

        ####################################################################################################################################
        # Tag-level with Label-level F1 score summary
        f1_prefix_total = []
        prefixes = ['B', 'I', 'E', 'S']
        for tag in self.tags:
            for prefix in prefixes:
                _, _, f1_prefix = entity_label_f1(tar_path=labels,
                                                  pre_path=pre,
                                                  length=length,
                                                  tag=tag,
                                                  tag_map=self.model.tag_map,
                                                  prefix=prefix)
                f1_prefix_total.append(f1_prefix)

        f1_macro_tag_prefix = sum(f1_prefix_total) / len(f1_prefix_total)
        print(
            'Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_tag_prefix)

        ####################################################################################################################################
        # Label-level F1 score summary
        f1_prefix_total = []
        prefixes = ['B', 'I', 'E', 'S', 'O']
        for prefix in prefixes:
            _, _, f1_prefix = label_f1(tar_path=labels,
                                       pre_path=pre,
                                       length=length,
                                       tags=self.tags,
                                       tag_map=self.model.tag_map,
                                       prefix=prefix)
            f1_prefix_total.append(f1_prefix)

        f1_macro_prefix = sum(f1_prefix_total) / len(f1_prefix_total)
        print(
            'Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_prefix)

    def predict(self):
        """
        Prediction & inference stage: read a Chinese sentence from stdin
        and print the predicted entities.
        """
        # Print model architecture
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(self.model)
        print('\n')

        # Input one Chinese Sentence
        while True:
            input_str = input("Please input a sentence in Chinese: ")

            if len(input_str) != 0:
                # Full-width to half-width
                input_str = strQ2B(input_str)
                input_str = re.sub(pattern='。', repl='.', string=input_str)

                text = cut_text(text=input_str, length=self.max_length)

                cut_out = []
                for cuttext in text:
                    # Get the embedding vector (Input Vector) from vocab
                    input_vec = [self.vocab.get(i, 0) for i in cuttext]

                    # convert it to tensor and run the model
                    sentences = torch.tensor(input_vec).view(1, -1)

                    length = np.expand_dims(np.shape(sentences)[1], axis=0)
                    length = torch.tensor(length,
                                          dtype=torch.int64,
                                          device=device)

                    _, paths = self.model(sentences=sentences,
                                          real_length=length,
                                          lengths=None)

                    # Get the entities from the model
                    entities = []
                    for tag in self.tags:
                        tags = get_tags(paths[0], tag, self.tag_map)
                        entities += format_result(tags, cuttext, tag)

                    # Get all the entities
                    all_start = []
                    for entity in entities:
                        start = entity.get('start')
                        all_start.append([start, entity])

                    # Sort the results by the "start" index
                    sort_d = sorted(all_start, key=lambda item: item[0])

                    if len(sort_d) == 0:
                        print("There was no entity in this sentence!")
                        continue
                    else:
                        sort_d = np.reshape(
                            np.array(sort_d)[:, 1], [np.shape(sort_d)[0], 1])
                        cut_out.append(sort_d)
                # return cut_out
                print(cut_out)
            else:
                print('Invalid input! Please re-input!\n')
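A minimal usage sketch for this example (assuming load_config, DataManager, BiLSTMCRF and the helper functions are importable from the surrounding project, and that the config file points at valid data and model paths):

if __name__ == "__main__":
    # Train a model on the configured training data, evaluating on the dev set each epoch.
    ner = ChineseNER(entry="train")
    ner.train()

    # Reload the saved parameters and run interactive prediction.
    ner = ChineseNER(entry="predict")
    ner.predict()
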
Example #2
class ChineseNER:
    def __init__(self, entry="train"):
        # Load Hyper-parameters
        config = load_config()
        self.model_path = config.get("model_path")
        self.epochs = config.get("epochs")
        self.batch_size = config.get("batch_size")
        self.learning_rate = config.get("learning_rate")
        self.weight_decay = config.get("weight_decay")
        self.dropout = config.get("dropout")
        self.hidden_size = config.get("hidden_size")
        self.char_num = config.get("char_num")
        self.char_dim = config.get("char_dim")
        self.word_dim = config.get("word_dim")
        self.word_num = config.get("word_num")
        self.tags = config.get("tags")
        self.transfer_learning = config.get("transfer_learning")
        self.lr_decay_step = config.get("lr_decay_step")
        self.lr_decay_rate = config.get("lr_decay_rate")

        # Load main model
        self.main_model(entry)

    def main_model(self, entry):
        # The Training Process
        if entry == "train":
            # Training Process: read Training Data from DataManager
            self.train_manager = DataManager(batch_size=self.batch_size, data_type='train', tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)

            # Collect the character index (vocab) and other hyper-parameters to save for inference
            saved_data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "char_vocab": self.train_manager.char_vocab,
                "tag_map": self.train_manager.tag_map,
            }
            save_params(data=saved_data, path=self.model_path)

            # Evaluation Process: read Dev Data from DataManager
            self.dev_size = DataManager(batch_size=1, data_type="dev", tags=self.tags).load_char_data()
            self.dev_manager = DataManager(batch_size=int(self.dev_size), data_type="dev")
            self.dev_batch = self.dev_manager.iteration()

            # Build BiLSTM-CRF Model
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.char_vocab),
                dropout=self.dropout,
                word_num=self.word_num,
                word_dim=self.word_dim,
                char_num=self.char_num,
                char_dim=self.char_dim,
                hidden_dim=self.hidden_size,
            )

            # Restore model if it exists
            self.restore_model()

        # The Inference Process
        elif entry == "predict":
            data = load_params(path=self.model_path)
            input_size = data.get("input_size")
            self.tag_map = data.get("tag_map")
            self.vocab = data.get("char_vocab")
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=input_size,
                dropout=1.0,
                word_num=self.word_num,
                word_dim=self.word_dim,
                char_num=self.char_num,
                char_dim=self.char_dim,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()

    def restore_model(self):
        """
        Restore and load the model
        """
        try:
            self.model.load_state_dict(torch.load(self.model_path + "params.pkl"))
            print("Model Successfully Restored!!")
        except Exception as error:
            print("Model Failed to restore! {}".format(error))

    def train(self):
        model = self.model.to(device=device)

        # Transfer Learning Module
        if self.transfer_learning:
            keep_grad = [
                "transitions",
                "char_embed.weight",
                "linear_lstm.weight",
                "linear_lstm.bias",
                "linear_cnn.weight",
                "linear_cnn.bias",
                "hidden2tag.weight",
                "hidden2tag.bias"
            ]
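            # Only the CRF transitions, the character embedding and the linear/output
            # layers listed above stay trainable here; all remaining parameters are
            # frozen in the loop below.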

            for name, value in model.named_parameters():
                if name in keep_grad:
                    value.requires_grad = True
                else:
                    value.requires_grad = False
        else:
            for name, value in model.named_parameters():
                value.requires_grad = True

        # Use Adam Optimizer
        optimizer = optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=self.learning_rate, weight_decay=self.weight_decay)

        # Learning Rate Decay
        # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=self.lr_decay_step, gamma=self.lr_decay_rate)

        # Print model architecture
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(model)
        print('\n')

        # Print model parameters
        print('\033[1;31mThe model\'s parameters are shown below:\033[0m')
        for name, value in model.named_parameters():
            print("Name: \033[1;31m{0}\033[0m, "
                  "Parameter Size: \033[1;36m{1}\033[0m, "
                  "Gradient: \033[1;35m{2}\033[0m".format(name, value.size(), value.requires_grad))
        print('\n')

        for epoch in range(1, self.epochs+1):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                # Clear gradients before training
                model.zero_grad()

                # Read sentences and tags from the batch data
                chars, tags, words, len_char = zip(*batch)
                chars_tensor = torch.tensor(chars, dtype=torch.long, device=device)
                tags_tensor = torch.tensor(tags, dtype=torch.float, device=device)
                words_tensor = torch.tensor(words, dtype=torch.float, device=device)
                leng_char = torch.tensor(len_char, dtype=torch.int64, device=device)

                loss = model.neg_log_likelihood(characters=chars_tensor, tags=tags_tensor, length=leng_char, words=words_tensor)
                progress = ("█" * int(index * 40 / self.total_size)).ljust(40)
                print("epoch [{}] |{}| {}/{}\t Batch Loss {:.6f}".format(epoch, progress, index, self.total_size, loss.tolist()[0]))

                loss.backward()
                optimizer.step()
                torch.save(model.state_dict(), self.model_path + 'params.pkl')

            self.evaluate()
            # scheduler.step()

    def evaluate(self):
        """
        Evaluation of the performance using the development set
        """
        model = self.model.to(device)

        chars, labels, words, len_chars = zip(*next(self.dev_batch))
        chars_tensor = torch.tensor(chars, dtype=torch.long, device=device)
        words_tensor = torch.tensor(words, dtype=torch.float, device=device)
        len_char_tensor = torch.tensor(len_chars, dtype=torch.int64, device=device)

        # Run the Forward pass of the model
        _, pre = model(characters=chars_tensor, words=words_tensor, len_char=len_chars)
        pre_tensor = torch.tensor(pre, dtype=torch.int, device=device)

        ####################################################################################################################################
        # Loss on the dev set
        loss = model.neg_log_likelihood(characters=chars_tensor, tags=pre_tensor, length=len_char_tensor, words=words_tensor)
        print("\t Evaluation Loss on the dev set {:.6f}".format(loss.tolist()[0]))

        ####################################################################################################################################
        print('Start to evaluate on the dev set: ')
        # Tag-level F1 score summary (w.r.t. each tag)
        tag_f1_total = []
        for tag in self.tags:
            _, _, f1_tag = tag_f1(tar_path=labels, pre_path=pre, tag=tag, tag_map=self.model.tag_map)
            tag_f1_total.append(f1_tag)
        tag_macro_f1 = sum(tag_f1_total) / len(tag_f1_total)
        print('Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % tag_macro_f1)

        # Tag-level Micro-averaged F1 Score
        _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels, pre_path=pre, tags=self.tags, tag_map=self.model.tag_map)
        print('Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m' % f1_Micro_tag)

        ####################################################################################################################################
        # Tag-level with Label-level F1 score summary
        f1_prefix_total = []
        prefixes = ['B', 'I', 'E', 'S']
        for tag in self.tags:
            for prefix in prefixes:
                _, _, f1_prefix = entity_label_f1(tar_path=labels,
                                                  pre_path=pre,
                                                  length=len_chars,
                                                  tag=tag,
                                                  tag_map=self.model.tag_map,
                                                  prefix=prefix)
                f1_prefix_total.append(f1_prefix)

        f1_macro_tag_prefix = sum(f1_prefix_total) / len(f1_prefix_total)
        print('Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % f1_macro_tag_prefix)

        ####################################################################################################################################
        # Label-level F1 score summary
        f1_prefix_total = []
        prefixes = ['B', 'I', 'E', 'S', 'O']
        for prefix in prefixes:
            _, _, f1_prefix = label_f1(tar_path=labels,
                                       pre_path=pre,
                                       length=len_chars,
                                       tags=self.tags,
                                       tag_map=self.model.tag_map,
                                       prefix=prefix)
            f1_prefix_total.append(f1_prefix)

        f1_macro_prefix = sum(f1_prefix_total) / len(f1_prefix_total)
        print('Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % f1_macro_prefix)

    def load_word_vector(self):
        """
        Load pre-trained word vectors
        """
        if 'pre_trained' not in globals():
            print("Start to load pre-trained word embeddings!!")
            pre_trained = {}
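            # Expected file format (inferred from the length check below):
            # each line is "<token> v_1 ... v_word_dim", i.e. a token followed
            # by word_dim float components.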
            for i, line in enumerate(codecs.open(self.model_path + "word_vectors.vec", 'r', encoding='utf-8')):
                line = line.rstrip().split()
                if len(line) == self.word_dim + 1:
                    pre_trained[line[0]] = np.array([float(x) for x in line[1:]]).astype(np.float32)
        else:
            pre_trained = globals().get("pre_trained")
        return pre_trained
    
    def pad_char_data(self, data: list) -> np.ndarray:
        """
        Pad character data
        """
        c_data = copy.deepcopy(data)
        if np.shape(c_data)[0] < self.char_num:
            c_data = c_data + (self.char_num - np.shape(c_data)[0]) * [0]
        else:
            c_data = c_data[:self.char_num]
        c_data = np.expand_dims(c_data, axis=0)
        return c_data

    def pad_word_data(self, data: list) -> np.ndarray:
        """
        Pad word data
        """
        c_data = copy.deepcopy(data)
        if len(c_data) <= self.word_num:
            c_data = c_data + (self.word_num - len(c_data)) * [[0] * self.word_dim]
        else:
            c_data = c_data[:self.word_num]
        c_data = np.reshape(c_data, [np.shape(c_data)[0] * np.shape(c_data)[1]])
        c_data = np.expand_dims(c_data, axis=0)
        return c_data

    def predict(self):
        """
        Prediction & Inference Stage
        """
        self.pre_trained = self.load_word_vector()

        while True:
            input_str = input("Please input a sentence (in Chinese): ")

            # Get character embedding
            char_vec = [self.vocab.get(i, 0) for i in input_str]
            char_tensor = np.reshape(char_vec, [-1]).tolist()
            len_char = len(char_tensor)
            char_tensor = np.array(self.pad_char_data(char_tensor)).tolist()
            char_tensor = torch.tensor(char_tensor, dtype=torch.long, device=device)

            # Get word embedding
            embed_words = []
            words = jieba.lcut(input_str, HMM=True)
            for i in words:
                vec = self.pre_trained.get(i)
                if vec is not None:
                    embed_words.append(vec)
            word_tensor = np.array(self.pad_word_data(embed_words)).tolist()
            word_tensor = torch.tensor(word_tensor, dtype=torch.float, device=device)

            # Run the model
            _, paths = self.model(characters=char_tensor, words=word_tensor, len_char=len_char)

            # Get the entities and format the results
            entities = []
            for tag in self.tags:
                tags = get_tags(path=paths[0], tag=tag, tag_map=self.tag_map)
                entities += format_result(result=tags, text=input_str, tag=tag)
            print(entities)
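
A small illustration of the padding helpers in this example (hypothetical sizes; char_num, word_num and word_dim come from the loaded config, and restoring a previously saved model is assumed to succeed):

ner = ChineseNER(entry="predict")
chars = ner.pad_char_data([5, 12, 7])               # ndarray of shape (1, char_num)
words = ner.pad_word_data([[0.1] * ner.word_dim])   # ndarray of shape (1, word_num * word_dim)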