Example 1
    def load(self, preprocessor_file, json_file, weights_file, custom_objects=None):
        """load ner application

        Args:
            preprocessor_file: path to load preprocessor
            json_file: path to load model architecture
            weights_file: path to load model weights
            custom_objects: Optional dictionary mapping names (strings) to custom classes or
                            functions to be considered during deserialization. Must be provided
                            when using a custom layer.

        """
        self.preprocessor = NERPreprocessor.load(preprocessor_file)
        logging.info('Load preprocessor from {}'.format(preprocessor_file))

        custom_objects = custom_objects or {}
        custom_objects.update(get_custom_objects())
        with open(json_file, 'r') as reader:
            self.model = model_from_json(reader.read(), custom_objects=custom_objects)
        logging.info('Load model architecture from {}'.format(json_file))

        self.model.load_weights(weights_file)
        logging.info('Load model weight from {}'.format(weights_file))

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.predictor = NERPredictor(self.model, self.preprocessor)
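A minimal usage sketch for this `load()` method (the checkpoint paths below are hypothetical and assume the application was saved earlier with `save()`):

from fancy_nlp.applications import NER

ner = NER(use_pretrained=False)  # skip downloading the pre-trained model
# restore the preprocessor, model architecture and model weights saved by a previous run
ner.load(preprocessor_file='ckpt/ner_preprocessor.pkl',
         json_file='ckpt/ner.json',
         weights_file='ckpt/ner.hdf5')
print(ner.predict('同济大学位于上海市杨浦区'))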
Example 2
    def test_train_no_bert(self):
        preprocessor = NERPreprocessor(self.train_data + self.valid_data,
                                       self.train_labels + self.valid_labels,
                                       use_word=True,
                                       char_embed_type='word2vec')
        ner_model = BiLSTMCNNNER(num_class=preprocessor.num_class,
                                 use_char=True,
                                 char_embeddings=preprocessor.char_embeddings,
                                 char_vocab_size=preprocessor.char_vocab_size,
                                 char_embed_dim=preprocessor.char_embed_dim,
                                 char_embed_trainable=False,
                                 use_word=True,
                                 word_embeddings=preprocessor.word_embeddings,
                                 word_vocab_size=preprocessor.word_vocab_size,
                                 word_embed_dim=preprocessor.word_embed_dim,
                                 word_embed_trainable=False,
                                 max_len=preprocessor.max_len,
                                 use_crf=True).build_model()

        ner_trainer = NERTrainer(ner_model, preprocessor)
        ner_trainer.train(self.train_data,
                          self.train_labels,
                          self.valid_data,
                          self.valid_labels,
                          batch_size=2,
                          epochs=2)
        assert not os.path.exists(self.json_file)
        assert not os.path.exists(self.weights_file)
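Since no checkpoint callback is used here, nothing is written to disk during training. A hedged sketch of persisting the result manually afterwards, mirroring what `NER.save()` does later in this section (the paths are hypothetical):

        # serialize the architecture and weights of the trained Keras model
        with open('ckpt/bilstm_cnn_ner.json', 'w') as writer:
            writer.write(ner_model.to_json())
        ner_model.save_weights('ckpt/bilstm_cnn_ner.hdf5')
        # the preprocessor must be saved as well so inputs can be rebuilt at load time
        preprocessor.save('ckpt/bilstm_cnn_ner_preprocessor.pkl')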
Example 3
    def test_ner_generator(self):
        test_file = os.path.join(os.path.dirname(__file__), '../../../data/ner/msra/example.txt')
        x_train, y_train = load_ner_data_and_labels(test_file)

        preprocessor = NERPreprocessor(x_train, y_train)
        generator = NERGenerator(preprocessor, x_train, batch_size=64)
        assert len(generator) == math.ceil(len(x_train) / 64)
        for i, (features, y) in enumerate(generator):
            if i < len(generator) - 1:
                assert features.shape[0] == 64
                assert y is None
            else:
                assert features.shape[0] == len(x_train) - 64 * (len(generator) - 1)
                assert y is None
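To make the batching contract above concrete, a small worked example with hypothetical numbers:

import math

n_samples, batch_size = 100, 64
n_batches = math.ceil(n_samples / batch_size)           # ceil(100 / 64) == 2 batches
last_batch = n_samples - batch_size * (n_batches - 1)   # 100 - 64 == 36 samples in the last batch
assert (n_batches, last_batch) == (2, 36)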
Example 4
    def setup_class(self):
        x_train, y_train = load_ner_data_and_labels(self.test_file)
        self.preprocessor = NERPreprocessor(
            x_train,
            y_train,
            use_char=True,
            use_bert=True,
            use_word=True,
            bert_vocab_file=self.bert_vocab_file,
            char_embed_type='word2vec',
            word_embed_type='word2vec',
            max_len=16)
        self.num_class = self.preprocessor.num_class
        self.char_embeddings = self.preprocessor.char_embeddings
        self.char_vocab_size = self.preprocessor.char_vocab_size
        self.char_embed_dim = self.preprocessor.char_embed_dim

        self.word_embeddings = self.preprocessor.word_embeddings
        self.word_vocab_size = self.preprocessor.word_vocab_size
        self.word_embed_dim = self.preprocessor.word_embed_dim
        self.checkpoint_dir = os.path.dirname(__file__)
Example 5
    def load(self,
             preprocessor_file: str,
             json_file: str,
             weights_file: str,
             custom_objects: Optional[Dict[str, Any]] = None) -> None:
        """Load ner application from disk.

        There are 3 things in total that we need to load: 1) preprocessor, which stores the
        vocabulary and embedding matrix built during pre-processing and helps us prepare feature
        input for ner model; 2) model architecture, which describes the framework of our ner model;
        3) model weights, which stores the value of ner model's parameters.

        Args:
            preprocessor_file: path to load preprocessor
            json_file: path to load model architecture
            weights_file: path to load model weights
            custom_objects: Optional dictionary mapping names (strings) to custom classes or
                            functions to be considered during deserialization. We will
                            automatically add all the custom layers of this project to
                            custom_objects. So you can ignore this argument in most cases unless
                            you use your own custom layer.

        """
        self.preprocessor = NERPreprocessor.load(preprocessor_file)
        logging.info('Load preprocessor from {}'.format(preprocessor_file))

        custom_objects = custom_objects or {}
        custom_objects.update(get_custom_objects())
        with open(json_file, 'r') as reader:
            self.model = tf.keras.models.model_from_json(
                reader.read(), custom_objects=custom_objects)
        logging.info('Load model architecture from {}'.format(json_file))

        self.model.load_weights(weights_file)
        logging.info('Load model weight from {}'.format(weights_file))

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.predictor = NERPredictor(self.model, self.preprocessor)
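Because `get_custom_objects()` is merged in automatically, `custom_objects` only needs the user's own layers. A hedged sketch, where `MyAttention` is a hypothetical custom layer the model was built with and the paths are illustrative:

import tensorflow as tf

from fancy_nlp.applications import NER


class MyAttention(tf.keras.layers.Layer):
    """Hypothetical custom layer used when the model was built."""
    ...


ner = NER(use_pretrained=False)
ner.load(preprocessor_file='ckpt/ner_preprocessor.pkl',
         json_file='ckpt/ner.json',
         weights_file='ckpt/ner.hdf5',
         custom_objects={'MyAttention': MyAttention})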
Example 6
    def setup_class(self):
        self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
            load_ner_data_and_labels(self.test_file, split=True)
        self.preprocessor = NERPreprocessor(
            self.train_data + self.valid_data,
            self.train_labels + self.valid_labels,
            use_bert=True,
            use_word=True,
            bert_vocab_file=self.bert_vocab_file,
            char_embed_type='word2vec',
            word_embed_type='word2vec',
            max_len=16)
        self.num_class = self.preprocessor.num_class
        self.char_embeddings = self.preprocessor.char_embeddings
        self.char_vocab_size = self.preprocessor.char_vocab_size
        self.char_embed_dim = self.preprocessor.char_embed_dim

        self.word_embeddings = self.preprocessor.word_embeddings
        self.word_vocab_size = self.preprocessor.word_vocab_size
        self.word_embed_dim = self.preprocessor.word_embed_dim
        self.checkpoint_dir = os.path.dirname(__file__)

        self.ner_model = BiLSTMCNNNER(
            num_class=self.num_class,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=True,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            max_len=self.preprocessor.max_len,
            use_crf=True).build_model()

        self.swa_model = BiLSTMCNNNER(
            num_class=self.num_class,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=True,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            max_len=self.preprocessor.max_len,
            use_crf=True).build_model()

        self.ner_trainer = NERTrainer(self.ner_model, self.preprocessor)

        self.json_file = os.path.join(self.checkpoint_dir,
                                      'bilstm_cnn_ner.json')
        self.weights_file = os.path.join(self.checkpoint_dir,
                                         'bilstm_cnn_ner.hdf5')
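A sketch of how a test could then drive this trainer with the SWA callback. The positional argument order follows the `train_generator()` call inside `NER.fit()` later in this section, and the `_swa.hdf5` filename follows its docstring; the epoch and batch counts are illustrative:

    def test_train_swa(self):
        self.ner_trainer.train_generator(
            self.train_data, self.train_labels, self.valid_data, self.valid_labels,
            2, 2, ['swa'], self.checkpoint_dir, 'bilstm_cnn_ner',
            self.swa_model, True)
        # the SWA callback saves averaged weights as '<model_name>_swa.hdf5' under checkpoint_dir
        assert os.path.exists(os.path.join(self.checkpoint_dir, 'bilstm_cnn_ner_swa.hdf5'))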
Example 7
    def fit(self,
            train_data,
            train_labels,
            valid_data=None,
            valid_labels=None,
            ner_model_type='bilstm_cnn',
            use_char=True,
            char_embed_type='word2vec',
            char_embed_dim=300,
            char_embed_trainable=True,
            use_bert=False,
            bert_vocab_file=None,
            bert_config_file=None,
            bert_checkpoint_file=None,
            bert_trainable=False,
            use_word=False,
            external_word_dict=None,
            word_embed_type='word2vec',
            word_embed_dim=300,
            word_embed_trainable=True,
            max_len=None,
            use_crf=True,
            optimizer='adam',
            batch_size=32,
            epochs=50,
            callback_list=None,
            checkpoint_dir=None,
            model_name=None,
            load_swa_model=False,
            **kwargs):
        """Train ner model using provided data

        Args:
            train_data: list of tokenized (in char level) texts for training,
                        like ``[['我', '是', '中', '国', '人']]``
            train_labels: label sequences of train_data
            valid_data: list of tokenized (in char level) texts for evaluation
            valid_labels: label sequences of valid_data
            ner_model_type: str, which ner model to use
            use_char: boolean, whether to use char embedding as input
            char_embed_type: str, can be a pre-trained embedding filename or an embedding training
                             method ('word2vec' or 'fasttext')
            char_embed_dim: int, dimensionality of char embedding
            char_embed_trainable: boolean, whether to update char embedding during training
            use_bert: boolean, whether to use bert embedding as input
            bert_vocab_file: str, path to bert's vocabulary file
            bert_config_file: str, path to bert's configuration file
            bert_checkpoint_file: str, path to bert's checkpoint file
            bert_trainable: boolean, whether to update bert during training
            use_word: boolean, whether to use word as additional input
            external_word_dict: list of words, external word dictionary
            word_embed_dim: similar to 'char_embed_dim'
            word_embed_type: similar to 'char_embed_type'
            word_embed_trainable: similar to 'char_embed_trainable'
            max_len: int, max sequence len. If None, we dynamically use the max length of one batch
                     as max_len. However, max_len must be provided when using bert as input.
            use_crf: boolean, whether to use crf layer
            optimizer: str or instance of `keras.optimizers.Optimizer`, indicating the optimizer to
                       use during training
            batch_size: num of samples per gradient update
            epochs: num of epochs to train the model
            callback_list: list of str, each item indicates the callback to apply during training.
                           Currently, we support using 'modelcheckpoint' for `ModelCheckpoint`
                           callback, 'earlystopping' for `EarlyStopping` callback and 'swa' for
                           `SWA` callback. We will automatically add `NERMetric` callback when
                           valid_data and valid_labels are both provided.
            checkpoint_dir: str, directory to save ner model, must be provided when using
                            `ModelCheckpoint` or `SWA` callback.
            model_name: str, prefix of ner model's weights file, must be provided when using
                        `ModelCheckpoint` or `SWA` callback.
                        For example, if checkpoint_dir is 'ckpt' and model_name is 'model', the
                        weights of ner model saved by `ModelCheckpoint` callback will be
                        'ckpt/model.hdf5' and by `SWA` callback will be 'ckpt/model_swa.hdf5'
            load_swa_model: boolean, whether to load swa model, only applies when using the `SWA` callback.
            **kwargs: other arguments for building ner model, such as "rnn_units", "fc_dim" etc.
        """
        self.preprocessor = NERPreprocessor(train_data=train_data,
                                            train_labels=train_labels,
                                            use_char=use_char,
                                            use_bert=use_bert,
                                            use_word=use_word,
                                            external_word_dict=external_word_dict,
                                            bert_vocab_file=bert_vocab_file,
                                            char_embed_type=char_embed_type,
                                            char_embed_dim=char_embed_dim,
                                            word_embed_type=word_embed_type,
                                            word_embed_dim=word_embed_dim,
                                            max_len=max_len)

        self.model = self.get_ner_model(ner_model_type=ner_model_type,
                                        num_class=self.preprocessor.num_class,
                                        use_char=use_char,
                                        char_embeddings=self.preprocessor.char_embeddings,
                                        char_vocab_size=self.preprocessor.char_vocab_size,
                                        char_embed_dim=self.preprocessor.char_embed_dim,
                                        char_embed_trainable=char_embed_trainable,
                                        use_bert=use_bert,
                                        bert_config_file=bert_config_file,
                                        bert_checkpoint_file=bert_checkpoint_file,
                                        bert_trainable=bert_trainable,
                                        use_word=use_word,
                                        word_embeddings=self.preprocessor.word_embeddings,
                                        word_vocab_size=self.preprocessor.word_vocab_size,
                                        word_embed_dim=self.preprocessor.word_embed_dim,
                                        word_embed_trainable=word_embed_trainable,
                                        max_len=self.preprocessor.max_len,
                                        use_crf=use_crf,
                                        optimizer=optimizer,
                                        **kwargs)

        if callback_list and 'swa' in callback_list:
            swa_model = self.get_ner_model(ner_model_type=ner_model_type,
                                           num_class=self.preprocessor.num_class,
                                           use_char=use_char,
                                           char_embeddings=self.preprocessor.char_embeddings,
                                           char_vocab_size=self.preprocessor.char_vocab_size,
                                           char_embed_dim=self.preprocessor.char_embed_dim,
                                           char_embed_trainable=char_embed_trainable,
                                           use_bert=use_bert,
                                           bert_config_file=bert_config_file,
                                           bert_checkpoint_file=bert_checkpoint_file,
                                           bert_trainable=bert_trainable,
                                           use_word=use_word,
                                           word_embeddings=self.preprocessor.word_embeddings,
                                           word_vocab_size=self.preprocessor.word_vocab_size,
                                           word_embed_dim=self.preprocessor.word_embed_dim,
                                           word_embed_trainable=word_embed_trainable,
                                           max_len=self.preprocessor.max_len,
                                           use_crf=use_crf,
                                           optimizer=optimizer,
                                           **kwargs)
        else:
            swa_model = None

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.trainer.train_generator(train_data, train_labels, valid_data, valid_labels,
                                     batch_size, epochs, callback_list, checkpoint_dir, model_name,
                                     swa_model, load_swa_model)

        self.predictor = NERPredictor(self.model, self.preprocessor)

        if valid_data is not None and valid_labels is not None:
            logging.info('Evaluating on validation data...')
            self.score(valid_data, valid_labels)
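A minimal, hedged example of calling this method to train a char-only BiLSTM-CNN-CRF model (the dataset paths and checkpoint names are hypothetical):

from fancy_nlp.utils import load_ner_data_and_labels
from fancy_nlp.applications import NER

train_data, train_labels = load_ner_data_and_labels('data/ner/msra/train_data', delimiter='\t')
valid_data, valid_labels = load_ner_data_and_labels('data/ner/msra/test_data', delimiter='\t')

ner = NER(use_pretrained=False)
ner.fit(train_data, train_labels, valid_data, valid_labels,
        ner_model_type='bilstm_cnn',
        use_char=True,
        char_embed_type='word2vec',
        use_crf=True,
        batch_size=32,
        epochs=10,
        callback_list=['modelcheckpoint', 'earlystopping'],
        checkpoint_dir='ckpt',
        model_name='msra_ner')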
Example 8
class NER(object):
    """NER application"""

    def __init__(self, use_pretrained=True):
        self.preprocessor = None
        self.model = None
        self.trainer = None
        self.predictor = None

        if use_pretrained:
            self.load_pretrained_model()

    def fit(self,
            train_data,
            train_labels,
            valid_data=None,
            valid_labels=None,
            ner_model_type='bilstm_cnn',
            use_char=True,
            char_embed_type='word2vec',
            char_embed_dim=300,
            char_embed_trainable=True,
            use_bert=False,
            bert_vocab_file=None,
            bert_config_file=None,
            bert_checkpoint_file=None,
            bert_trainable=False,
            use_word=False,
            external_word_dict=None,
            word_embed_type='word2vec',
            word_embed_dim=300,
            word_embed_trainable=True,
            max_len=None,
            use_crf=True,
            optimizer='adam',
            batch_size=32,
            epochs=50,
            callback_list=None,
            checkpoint_dir=None,
            model_name=None,
            load_swa_model=False,
            **kwargs):
        """Train ner model using provided data

        Args:
            train_data: list of tokenized (in char level) texts for training,
                        like ``[['我', '是', '中', '国', '人']]``
            train_labels: label sequences of train_data
            valid_data: list of tokenized (in char level) texts for evaluation
            valid_labels: label sequences of valid_data
            ner_model_type: str, which ner model to use
            use_char: boolean, whether to use char embedding as input
            char_embed_type: str, can be a pre-trained embedding filename or an embedding training
                             method ('word2vec' or 'fasttext')
            char_embed_dim: int, dimensionality of char embedding
            char_embed_trainable: boolean, whether to update char embedding during training
            use_bert: boolean, whether to use bert embedding as input
            bert_vocab_file: str, path to bert's vocabulary file
            bert_config_file: str, path to bert's configuration file
            bert_checkpoint_file: str, path to bert's checkpoint file
            bert_trainable: boolean, whether to update bert during training
            use_word: boolean, whether to use word as additional input
            external_word_dict: list of words, external word dictionary
            word_embed_dim: similar to 'char_embed_dim'
            word_embed_type: similar to 'char_embed_type'
            word_embed_trainable: similar to 'char_embed_trainable'
            max_len: int, max sequence len. If None, we dynamically use the max length of one batch
                     as max_len. However, max_len must be provided when using bert as input.
            use_crf: boolean, whether to use crf layer
            optimizer: str or instance of `keras.optimizers.Optimizer`, indicating the optimizer to
                       use during training
            batch_size: num of samples per gradient update
            epochs: num of epochs to train the model
            callback_list: list of str, each item indicates the callback to apply during training.
                           Currently, we support using 'modelcheckpoint' for `ModelCheckpoint`
                           callback, 'earlystopping' for `EarlyStopping` callback and 'swa' for
                           `SWA` callback. We will automatically add `NERMetric` callback when
                           valid_data and valid_labels are both provided.
            checkpoint_dir: str, directory to save ner model, must be provided when using
                            `ModelCheckpoint` or `SWA` callback.
            model_name: str, prefix of ner model's weights file, must be provided when using
                        `ModelCheckpoint` or `SWA` callback.
                        For example, if checkpoint_dir is 'ckpt' and model_name is 'model', the
                        weights of ner model saved by `ModelCheckpoint` callback will be
                        'ckpt/model.hdf5' and by `SWA` callback will be 'ckpt/model_swa.hdf5'
            load_swa_model: boolean, whether to load swa model, only applies when using the `SWA` callback.
            **kwargs: other arguments for building ner model, such as "rnn_units", "fc_dim" etc.
        """
        self.preprocessor = NERPreprocessor(train_data=train_data,
                                            train_labels=train_labels,
                                            use_char=use_char,
                                            use_bert=use_bert,
                                            use_word=use_word,
                                            external_word_dict=external_word_dict,
                                            bert_vocab_file=bert_vocab_file,
                                            char_embed_type=char_embed_type,
                                            char_embed_dim=char_embed_dim,
                                            word_embed_type=word_embed_type,
                                            word_embed_dim=word_embed_dim,
                                            max_len=max_len)

        self.model = self.get_ner_model(ner_model_type=ner_model_type,
                                        num_class=self.preprocessor.num_class,
                                        use_char=use_char,
                                        char_embeddings=self.preprocessor.char_embeddings,
                                        char_vocab_size=self.preprocessor.char_vocab_size,
                                        char_embed_dim=self.preprocessor.char_embed_dim,
                                        char_embed_trainable=char_embed_trainable,
                                        use_bert=use_bert,
                                        bert_config_file=bert_config_file,
                                        bert_checkpoint_file=bert_checkpoint_file,
                                        bert_trainable=bert_trainable,
                                        use_word=use_word,
                                        word_embeddings=self.preprocessor.word_embeddings,
                                        word_vocab_size=self.preprocessor.word_vocab_size,
                                        word_embed_dim=self.preprocessor.word_embed_dim,
                                        word_embed_trainable=word_embed_trainable,
                                        max_len=self.preprocessor.max_len,
                                        use_crf=use_crf,
                                        optimizer=optimizer,
                                        **kwargs)

        if callback_list and 'swa' in callback_list:
            swa_model = self.get_ner_model(ner_model_type=ner_model_type,
                                           num_class=self.preprocessor.num_class,
                                           use_char=use_char,
                                           char_embeddings=self.preprocessor.char_embeddings,
                                           char_vocab_size=self.preprocessor.char_vocab_size,
                                           char_embed_dim=self.preprocessor.char_embed_dim,
                                           char_embed_trainable=char_embed_trainable,
                                           use_bert=use_bert,
                                           bert_config_file=bert_config_file,
                                           bert_checkpoint_file=bert_checkpoint_file,
                                           bert_trainable=bert_trainable,
                                           use_word=use_word,
                                           word_embeddings=self.preprocessor.word_embeddings,
                                           word_vocab_size=self.preprocessor.word_vocab_size,
                                           word_embed_dim=self.preprocessor.word_embed_dim,
                                           word_embed_trainable=word_embed_trainable,
                                           max_len=self.preprocessor.max_len,
                                           use_crf=use_crf,
                                           optimizer=optimizer,
                                           **kwargs)
        else:
            swa_model = None

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.trainer.train_generator(train_data, train_labels, valid_data, valid_labels,
                                     batch_size, epochs, callback_list, checkpoint_dir, model_name,
                                     swa_model, load_swa_model)

        self.predictor = NERPredictor(self.model, self.preprocessor)

        if valid_data is not None and valid_labels is not None:
            logging.info('Evaluating on validation data...')
            self.score(valid_data, valid_labels)

    def score(self, valid_data, valid_labels):
        """Return the f1 score of the model over validation data

        Args:
            valid_data: list of tokenized texts
            valid_labels: list of label strings

        Returns:
            Float. The f1 score on the validation data.

        """
        if self.trainer:
            return self.trainer.evaluate(valid_data, valid_labels)
        else:
            logging.fatal('Trainer is None! Call fit() or load() to get trainer.')

    def predict(self, test_text):
        """Return prediction of the model for test data

        Args:
            test_text: untokenized text or tokenized (in char level) text

        Returns:

        """
        if self.predictor:
            return self.predictor.tag(test_text)
        else:
            logging.fatal('Predictor is None! Call fit() or load() to get predictor.')

    def predict_batch(self, test_texts):
        """Return predictions of the model for test data

        Args:
            test_texts: list of untokenized texts or tokenized (in char level) texts

        Returns:

        """
        if self.predictor:
            return self.predictor.tag_batch(test_texts)
        else:
            logging.fatal('Predictor is None! Call fit() or load() to get predictor.')

    def analyze(self, text):
        """Analyze text and return pretty format.

        Args:
            text: untokenized text or tokenized (in char level) text
        Returns:

        """
        if self.predictor:
            return self.predictor.pretty_tag(text)
        else:
            logging.fatal('Predictor is None! Call fit() or load() to get predictor.')

    def analyze_batch(self, texts):
        """Analyze batch of texts and return pretty format.

        Args:
            texts: untokenized texts or tokenized (in char level) texts
        Returns:

        """
        if self.predictor:
            return self.predictor.pretty_tag_batch(texts)
        else:
            logging.fatal('Predictor is None! Call fit() or load() to get predictor.')

    def restrict_analyze(self, text, threshold=0.85):
        if self.predictor:
            return self.predictor.restrict_tag(text, threshold)
        else:
            logging.fatal('Predictor is None! Call fit() or load() to get predictor.')

    def restrict_analyze_batch(self, texts, threshold=0.85):
        if self.predictor:
            return self.predictor.restrict_tag_batch(texts, threshold)
        else:
            logging.fatal('Predictor is None! Call fit() or load() to get predictor.')

    def save(self, preprocessor_file, json_file, weights_file=None):
        """save ner application
        
        Args:
            preprocessor_file: path to save preprocessor
            json_file: path to save model architecture
            weights_file: path to save model weights, can be None. When we use `ModelCheckpoint` 
                          or `SWA` callback, model's weights will be saved to disk after training.
                          In that case, we don't need to save it again. We usually set weights_file
                          to be None.
        """
        self.preprocessor.save(preprocessor_file)
        logging.info('Save preprocessor to {}'.format(preprocessor_file))

        model_json = self.model.to_json()
        with open(json_file, 'w') as writer:
            writer.write(model_json)
        logging.info('Save model architecture to {}'.format(json_file))

        if weights_file:
            self.model.save_weights(weights_file)
            logging.info('Save model weights to {}'.format(weights_file))

    def load(self, preprocessor_file, json_file, weights_file, custom_objects=None):
        """load ner application

        Args:
            preprocessor_file: path to load preprocessor
            json_file: path to load model architecture
            weights_file: path to load model weights
            custom_objects: Optional dictionary mapping names (strings) to custom classes or
                            functions to be considered during deserialization. Must be provided
                            when using a custom layer.

        """
        self.preprocessor = NERPreprocessor.load(preprocessor_file)
        logging.info('Load preprocessor from {}'.format(preprocessor_file))

        custom_objects = custom_objects or {}
        custom_objects.update(get_custom_objects())
        with open(json_file, 'r') as reader:
            self.model = model_from_json(reader.read(), custom_objects=custom_objects)
        logging.info('Load model architecture from {}'.format(json_file))

        self.model.load_weights(weights_file)
        logging.info('Load model weight from {}'.format(weights_file))

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.predictor = NERPredictor(self.model, self.preprocessor)

    @staticmethod
    def get_ner_model(ner_model_type, num_class, use_char, char_embeddings, char_vocab_size,
                      char_embed_dim, char_embed_trainable, use_bert, bert_config_file,
                      bert_checkpoint_file, bert_trainable, use_word, word_embeddings,
                      word_vocab_size, word_embed_dim, word_embed_trainable, max_len,
                      use_crf, optimizer, **kwargs):
        if ner_model_type == 'bilstm':
            ner_model = BiLSTMNER(
                num_class=num_class,
                use_char=use_char,
                char_embeddings=char_embeddings,
                char_vocab_size=char_vocab_size,
                char_embed_dim=char_embed_dim,
                char_embed_trainable=char_embed_trainable,
                use_bert=use_bert,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                use_word=use_word,
                word_embeddings=word_embeddings,
                word_vocab_size=word_vocab_size,
                word_embed_dim=word_embed_dim,
                word_embed_trainable=word_embed_trainable,
                max_len=max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs
            )
        elif ner_model_type == 'bilstm_cnn':
            ner_model = BiLSTMCNNNER(
                num_class=num_class,
                use_char=use_char,
                char_embeddings=char_embeddings,
                char_vocab_size=char_vocab_size,
                char_embed_dim=char_embed_dim,
                char_embed_trainable=char_embed_trainable,
                use_bert=use_bert,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                use_word=use_word,
                word_embeddings=word_embeddings,
                word_vocab_size=word_vocab_size,
                word_embed_dim=word_embed_dim,
                word_embed_trainable=word_embed_trainable,
                max_len=max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs
            )
        elif ner_model_type == 'bigru':
            ner_model = BiGRUNER(
                num_class=num_class,
                use_char=use_char,
                char_embeddings=char_embeddings,
                char_vocab_size=char_vocab_size,
                char_embed_dim=char_embed_dim,
                char_embed_trainable=char_embed_trainable,
                use_bert=use_bert,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                use_word=use_word,
                word_embeddings=word_embeddings,
                word_vocab_size=word_vocab_size,
                word_embed_dim=word_embed_dim,
                word_embed_trainable=word_embed_trainable,
                max_len=max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs
            )
        elif ner_model_type == 'bigru_cnn':
            ner_model = BiGRUCNNNER(
                num_class=num_class,
                use_char=use_char,
                char_embeddings=char_embeddings,
                char_vocab_size=char_vocab_size,
                char_embed_dim=char_embed_dim,
                char_embed_trainable=char_embed_trainable,
                use_bert=use_bert,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                use_word=use_word,
                word_embeddings=word_embeddings,
                word_vocab_size=word_vocab_size,
                word_embed_dim=word_embed_dim,
                word_embed_trainable=word_embed_trainable,
                max_len=max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs
            )
        elif ner_model_type == 'bert':
            ner_model = BertNER(
                num_class=num_class,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                max_len=max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs
            )
        else:
            raise ValueError('`ner_model_type` not understood: {}'.format(ner_model_type))

        return ner_model.build_model()

    def load_pretrained_model(self):
        cache_subdir = 'pretrained_models'

        prefix = 'https://fancy-nlp-1253403094.cos.ap-shanghai.myqcloud.com/pretrained_models/'

        preprocessor_file = get_file(fname='msra_ner_bilstm_cnn_crf_preprocessor.pkl',
                                     origin=prefix+'msra_ner_bilstm_cnn_crf_preprocessor.pkl',
                                     cache_subdir=cache_subdir, cache_dir=CACHE_DIR)
        json_file = get_file(fname='msra_ner_bilstm_cnn_crf.json',
                             origin=prefix+'msra_ner_bilstm_cnn_crf.json',
                             cache_subdir=cache_subdir, cache_dir=CACHE_DIR)
        weights_file = get_file(fname='msra_ner_bilstm_cnn_crf.hdf5',
                                origin=prefix+'msra_ner_bilstm_cnn_crf.hdf5',
                                cache_subdir=cache_subdir, cache_dir=CACHE_DIR)

        self.load(preprocessor_file, json_file, weights_file)
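With `use_pretrained=True` (the default), the constructor calls `load_pretrained_model()` above and downloads the MSRA BiLSTM-CNN-CRF model, so the application can tag text immediately. A short hedged sketch (the output shown in the comments is only illustrative):

from fancy_nlp.applications import NER

ner = NER()  # downloads and loads the pre-trained MSRA model on first use
print(ner.predict('同济大学位于上海市杨浦区'))   # a BIO tag sequence, e.g. ['B-ORG', 'I-ORG', ...] (illustrative)
print(ner.analyze('同济大学位于上海市杨浦区'))   # pretty-formatted entities with scores and offsets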
Example 9
class NER(object):
    """NER application. Support training ner model from scratch with provided dataset,
    loading pre-trained ner model as well as evaluating ner model on raw text with detailed and
    pretty-formatted tagging results.

    Examples:

    The following snippet shows how to train a BiLSTM-CNN-CRF ner model that uses character
    embedding and bert embedding as input, and how to save it to disk:

    ```python
        import os

        from fancy_nlp.utils import load_ner_data_and_labels
        from fancy_nlp.applications import NER

        msra_train_file = 'data/ner/msra/train_data'
        msra_dev_file = 'data/ner/msra/test_data'
        bert_vocab_file = 'data/embeddings/chinese_L-12_H-768_A-12/vocab.txt'
        bert_config_file = 'data/embeddings/chinese_L-12_H-768_A-12/bert_config.json'
        bert_checkpoint_file = 'data/embeddings/chinese_L-12_H-768_A-12/bert_model.ckpt'

        checkpoint_dir = 'ner_models'
        model_name = 'bert-bilstm-cnn-crf'
        weights_file = os.path.join(checkpoint_dir, f'{model_name}.hdf5')
        json_file = os.path.join(checkpoint_dir, f'{model_name}.json')
        preprocessor_file = os.path.join(checkpoint_dir, f'{model_name}_preprocessor.pkl')

        # start training
        train_data, train_labels = load_ner_data_and_labels(msra_train_file, delimiter='\t')
        dev_data, dev_labels = load_ner_data_and_labels(msra_dev_file, delimiter='\t')
        ner = NER()
        ner.fit(train_data=train_data,
                train_labels=train_labels,
                valid_data=dev_data,
                valid_labels=dev_labels,
                ner_model_type='bilstm_cnn',
                use_char=True,
                use_bert=True,
                bert_vocab_file=bert_vocab_file,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=True,
                use_crf=True,
                callback_list=['modelcheckpoint', 'earlystopping'],
                checkpoint_dir=checkpoint_dir,
                model_name=model_name)

        # save ner application's preprocessor, model architecture and model weights to disk
        # with `ModelCheckpoint` callback, model weights will be saved to disk after training.
        # In that case, we don't need to save them again. So we pass None to weights_file
        ner.save(preprocessor_file=preprocessor_file, json_file=json_file, weights_file=None)
    ```

    The following snippet shows how to load a pre-trained ner model from disk and evaluate it using
    raw text:

    ```python
        import os

        from fancy_nlp.utils import load_ner_data_and_labels
        from fancy_nlp.applications import NER

        checkpoint_dir = 'ner_models'
        model_name = 'bert-bilstm-cnn-crf'
        weights_file = os.path.join(checkpoint_dir, f'{model_name}.hdf5')
        json_file = os.path.join(checkpoint_dir, f'{model_name}.json')
        preprocessor_file = os.path.join(checkpoint_dir, f'{model_name}_preprocessor.pkl')

        ner = NER()
        # load from disk
        ner.load(preprocessor_file=preprocessor_file, json_file=json_file, weights_file=weights_file)

        # evaluate over development dataset
        msra_dev_file = 'data/ner/msra/test_data'
        dev_data, dev_labels = load_ner_data_and_labels(msra_dev_file, delimiter='\t')
        print(ner.score(valid_data=dev_data, valid_labels=dev_labels))

        # predict tag sequence for given text
        print(ner.predict(text='同济大学位于上海市杨浦区,校长为陈杰'))

        # show detailed tagging result in pretty-formatted for given text
        print(ner.analyze(text='同济大学位于上海市杨浦区,校长为陈杰'))
    ```

    """
    def __init__(self, use_pretrained: bool = True) -> None:
        """

        Args:
            use_pretrained: Boolean. Whether to load a pre-trained ner model that was trained on
                the msra dataset using a BiLSTM+CNN+CRF model.
        """
        # instance of NERPreprocessor, used to process the dataset and prepare model input
        self.preprocessor = None
        # instance of tf.Keras Model, ner model, the core of ner application
        self.model = None
        # instance of NERTrainer, used to train the ner model with dataset
        self.trainer = None
        # instance of NERPredictor, used to predict tagging results with the trained ner model
        self.predictor = None

        if use_pretrained:
            self.load_pretrained_model()

    def fit(self,
            train_data: List[List[str]],
            train_labels: List[List[str]],
            valid_data: Optional[List[List[str]]] = None,
            valid_labels: Optional[List[List[str]]] = None,
            ner_model_type: str = 'bilstm_cnn',
            use_char: bool = True,
            char_embed_type: Optional[str] = 'word2vec',
            char_embed_dim: int = 300,
            char_embed_trainable: bool = True,
            use_bert: bool = False,
            bert_vocab_file: Optional[str] = None,
            bert_config_file: Optional[str] = None,
            bert_checkpoint_file: Optional[str] = None,
            bert_trainable: bool = False,
            use_word: bool = False,
            external_word_dict: Optional[List[str]] = None,
            word_embed_type: Optional[str] = 'word2vec',
            word_embed_dim: int = 300,
            word_embed_trainable: bool = True,
            max_len: Optional[int] = None,
            use_crf: bool = True,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam',
            batch_size: int = 32,
            epochs: int = 50,
            callback_list: Optional[List[str]] = None,
            checkpoint_dir: Optional[str] = None,
            model_name: Optional[str] = None,
            load_swa_model: bool = False,
            **kwargs) -> None:
        """Train ner model with provided dataset.

        We would like to make NER in Fancy-NLP more configurable, so we provide a number of
        arguments for users to configure:
            1. Which type of ner model to use; currently we implement 5 types of ner models:
            'bilstm', 'bilstm_cnn', 'bigru', 'bigru_cnn' and 'bert'.

            2. Which kind of input embedding to use; we support 3 kinds of embedding: char
            embedding, bert embedding and word embedding. We can choose any one of them, or
            combine any two or all of them, to use as input. Note that our ner model only
            supports char-level input, since char-level input has proven effective for the
            Chinese NER task and avoids word-segmentation errors. Therefore, we should use char
            embedding or bert embedding (or both) as the main input. On that basis, we can use
            word embedding as an auxiliary input to provide semantic information.

            3. Whether to use CRF; we can choose whether to add a crf layer as the last layer of
            the ner model.

            4. How to train the model:
            a) which optimizer to use, we support any optimizer that is compatible with
               tf.keras's optimizer:
               https://www.tensorflow.org/api_docs/python/tf/keras/optimizers ;
            b) how many samples to train per batch and how many epochs to train;
            c) which callbacks to use during training, we currently support 3 kinds of callbacks:
               i) 'modelcheckpoint' is used to save the model with best performance:
                   https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint;
               ii) 'earlystopping' is used to stop training when no performance gain is observed:
                   https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
               iii) 'swa' is used to apply a novel weight averaging ensemble mechanism to the
                   ner model we are training: https://arxiv.org/abs/1803.05407

            5. Where to save the model

        Args:
            train_data: List of List of str. List of tokenized (in char level) texts for training,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            train_labels: List of List of str. The labels of train_data, usually in BIO or BIOES
                format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.
            valid_data: Optional List of List of str, can be None. List of tokenized (in char
                level) texts for evaluation.
            valid_labels: Optional List of List of str, can be None. The labels of valid_data.
                We can use the fancy_nlp.utils.load_ner_data_and_labels() function to get training
                or validation data and labels from a raw dataset in CoNLL format.
            ner_model_type: str. Which type of ner model to use, can be one of {'bilstm',
                'bilstm_cnn', 'bigru', 'bigru_cnn', 'bert'}.
            use_char: Boolean. Whether to use character embedding as input.
            char_embed_type: Optional str, can be None. The type of char embedding, can be a
                pre-trained embedding filename used to load a pre-trained embedding,
                or an embedding training method (one of {'word2vec', 'fasttext'}) used to
                train character embeddings on the dataset. If None, do not apply any pre-trained
                embedding, and use a randomly initialized embedding instead.
            char_embed_dim: int. Dimensionality of char embedding.
            char_embed_trainable: Boolean. Whether to update char embedding during training.
            use_bert: Boolean. Whether to use bert embedding as input.
            bert_vocab_file: Optional str, can be None. Path to bert's vocabulary file.
            bert_config_file: Optional str, can be None. Path to bert's configuration file.
            bert_checkpoint_file: Optional str, can be None. Path to bert's checkpoint file.
            bert_trainable: Boolean. Whether to update bert during training.
            use_word: Boolean. Whether to use word as additional input.
            external_word_dict: Optional List of str, can be None. List of words, external word
                dictionary that will be loaded into jieba. It can be regarded as a kind of
                gazetteer that contains a number of correct named entities,
                such as ``['南京市', '长江大桥']``.
            word_embed_dim: similar to 'char_embed_dim'.
            word_embed_type: similar to 'char_embed_type'.
            word_embed_trainable: similar to 'char_embed_trainable'.
            max_len: Optional int, can be None. Max length of one sequence. If None, we dynamically
                use the max length of each batch as max_len. However, max_len must be provided
                when using bert as input.
            use_crf: Boolean. Whether to use crf layer.
            optimizer: str or instance of `tf.keras.optimizers.Optimizer`. Which optimizer to
                use during training.
            batch_size: int. Number of samples per gradient update.
            epochs: int. Number of epochs to train the model
            callback_list: Optional List of str or instance of `keras.callbacks.Callback`,
                can be None. Each item indicates the callback to apply during training. Currently,
                we support using 'modelcheckpoint' for `ModelCheckpoint` callback, 'earlystopping'
                for `EarlyStopping` callback and 'swa' for `SWA` callback. We will automatically add
                `NERMetric` callback when valid_data and valid_labels are both provided.
            checkpoint_dir: Optional str, can be None. Directory to save the ner model. It must be
                provided when using `ModelCheckpoint` or `SWA` callback, since these callbacks need
                to save the ner model after training.
            model_name: Optional str, can be None. Prefix of ner model's weights file. It must be
                provided when using `ModelCheckpoint` or `SWA` callback, since these callbacks need
                to save the ner model after training. For example, if checkpoint_dir is 'ckpt' and
                model_name is 'model', the weights of ner model saved by `ModelCheckpoint` callback
                will be 'ckpt/model.hdf5' and by `SWA` callback will be 'ckpt/model_swa.hdf5'.
            load_swa_model: Boolean. Whether to load the swa model, only applies when using the
                `SWA` callback. We suggest setting it to True when using the `SWA` callback, since
                the swa model performs better than the original model in most cases.
            **kwargs: Other arguments for building the ner model, such as "rnn_units", "fc_dim",
                etc. See models.ner_models.py for these arguments.
        """

        # whether to use traditional bert model for prediction
        use_bert_model = ner_model_type == 'bert'
        # add assertion for checking input
        assert not (use_bert_model and use_word), \
            'when using bert model, `use_word` must be False'
        assert not (use_bert_model and use_char), \
            'when using bert model, `use_char` must be False'
        assert not (use_bert_model and not use_bert), \
            'when using bert model, `use_bert` must be True'

        self.preprocessor = NERPreprocessor(
            train_data=train_data,
            train_labels=train_labels,
            use_char=use_char,
            use_bert=use_bert,
            use_word=use_word,
            external_word_dict=external_word_dict,
            bert_vocab_file=bert_vocab_file,
            char_embed_type=char_embed_type,
            char_embed_dim=char_embed_dim,
            word_embed_type=word_embed_type,
            word_embed_dim=word_embed_dim,
            max_len=max_len)

        self.model = self.get_ner_model(
            ner_model_type=ner_model_type,
            num_class=self.preprocessor.num_class,
            use_char=use_char,
            char_embeddings=self.preprocessor.char_embeddings,
            char_vocab_size=self.preprocessor.char_vocab_size,
            char_embed_dim=self.preprocessor.char_embed_dim,
            char_embed_trainable=char_embed_trainable,
            use_bert=use_bert,
            bert_config_file=bert_config_file,
            bert_checkpoint_file=bert_checkpoint_file,
            bert_trainable=bert_trainable,
            use_word=use_word,
            word_embeddings=self.preprocessor.word_embeddings,
            word_vocab_size=self.preprocessor.word_vocab_size,
            word_embed_dim=self.preprocessor.word_embed_dim,
            word_embed_trainable=word_embed_trainable,
            max_len=self.preprocessor.max_len,
            use_crf=use_crf,
            optimizer=optimizer,
            **kwargs)

        if callback_list and 'swa' in callback_list:
            # initialize swa model when using `SWA` callback
            swa_model = self.get_ner_model(
                ner_model_type=ner_model_type,
                num_class=self.preprocessor.num_class,
                use_char=use_char,
                char_embeddings=self.preprocessor.char_embeddings,
                char_vocab_size=self.preprocessor.char_vocab_size,
                char_embed_dim=self.preprocessor.char_embed_dim,
                char_embed_trainable=char_embed_trainable,
                use_bert=use_bert,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                use_word=use_word,
                word_embeddings=self.preprocessor.word_embeddings,
                word_vocab_size=self.preprocessor.word_vocab_size,
                word_embed_dim=self.preprocessor.word_embed_dim,
                word_embed_trainable=word_embed_trainable,
                max_len=self.preprocessor.max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs)
        else:
            swa_model = None

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.trainer.train_generator(train_data, train_labels, valid_data,
                                     valid_labels, batch_size, epochs,
                                     callback_list, checkpoint_dir, model_name,
                                     swa_model, load_swa_model)

        self.predictor = NERPredictor(self.model, self.preprocessor)

        if valid_data is not None and valid_labels is not None:
            logging.info('Evaluating on validation data...')
            self.score(valid_data, valid_labels)

    def score(self, data: List[List[str]], labels: List[List[str]]) -> float:
        """Evaluate the performance of ner model with given data and labels, return the f1 score.

        Args:
            data: List of List of str. List of tokenized (in char level) texts,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            labels: List of List of str. The corresponding labels, usually in BIO or BIOES
                format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.

        Returns:
            Float. The F1 score.

        """
        if self.trainer:
            return self.trainer.evaluate(data, labels)
        else:
            logging.fatal(
                'Trainer is None! Call fit() or load() to get trainer.')

    def predict(self, text: Union[str, List[str]]) -> List[str]:
        """Return the tag sequence of given text predicted by the ner model

        Args:
            text: str or List of str. Can be an un-tokenized text, like ``'我在上海上学'`` or a
                tokenized (in char level) text sequence, like ``['我', '在', '上', '海', '上', '学']``.

        Returns:
            List of str. The tag sequence, like ``['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O']``

        """
        if self.predictor:
            return self.predictor.tag(text)
        else:
            logging.fatal(
                'Predictor is None! Call fit() or load() to get predictor.')

    def predict_batch(
            self, texts: Union[List[str], List[List[str]]]) -> List[List[str]]:
        """Return the tag sequences of given batch of texts predicted by the ner model

        Args:
            texts: List of str or List of List of str. Can be a batch of un-tokenized texts,
                like ``['我在上海上学', ...]`` or a batch of tokenized (in char level) text sequences,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.

        Returns:
            List of List of str. The tag sequences, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O',
            'O']]``

        """
        if self.predictor:
            return self.predictor.tag_batch(texts)
        else:
            logging.fatal(
                'Predictor is None! Call fit() or load() to get predictor.')

    def analyze(self, text: Union[str, List[str]]) -> Dict[str, Any]:
        """Analyze the tagging result of given text predicted by the ner model and return the
        result in pretty format with detailed information.

        Args:
            text: str or List of str. Can be an un-tokenized text, like ``'我在上海上学'`` or a
                tokenized (in char level) text sequence, like ``['我', '在', '上', '海', '上', '学']``.

        Returns:
            A Dict including the original text and list of recognized entities with detailed
            information (name, type, score, offset). Specifically, it will be like:
            {'text': '我在上海上学',
             'entities': [{'name': '上海',
                           'type': 'LOC',
                           'score': 0.9986118674278259,
                           'beginOffset': 2,
                           'endOffset': 4
                           }]
            }

        Notes:
            The score of an entity is the probability of it being a named entity. It is computed by
            averaging the probabilities of all the tokens within the entity, as predicted by the
            ner model. However, if a crf layer is used as the last layer of the ner model, the
            score will always be 1. This is because the viterbi algorithm used by crf outputs a
            single best path instead of a probability distribution.

        """
        if self.predictor:
            return self.predictor.pretty_tag(text)
        else:
            logging.fatal(
                'Predictor is None! Call fit() or load() to get predictor.')
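A sketch of reading the analysis result described above, with the same assumed `ner_app` instance.

result = ner_app.analyze('我在上海上学')
for entity in result['entities']:
    # each entity dict carries name, type, score and character offsets
    print(entity['name'], entity['type'], entity['score'],
          entity['beginOffset'], entity['endOffset'])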

    def analyze_batch(
            self, texts: Union[List[str],
                               List[List[str]]]) -> List[Dict[str, Any]]:
        """Analyze the tagging results of given batch of text predicted by the ner model and
        return the results in pretty format with detailed information.

        Args:
            texts: List of str or List of List of str. Can be a batch of un-tokenized texts,
                like ``['我在上海上学', ...]`` or a batch of tokenized (in char level) text sequences,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.

        Returns:
            List of Dict. Each Dict contains the tagging results of one text, including the original
            text and list of recognized entities with detailed information (name, type, score,
            offset). Specifically, it will be like:
            [{'text': '我在上海上学',
             'entities': [{'name': '上海',
                           'type': 'LOC',
                           'score': 0.9986118674278259,
                           'beginOffset': 2,
                           'endOffset': 4
                           }]
             }
             ...
            ]

        Notes:
            The score of an entity is the probability of it being a named entity. It is computed by
            averaging the probabilities of all the tokens within the entity, as predicted by the
            ner model. However, if a crf layer is used as the last layer of the ner model, the
            score will always be 1. This is because the viterbi algorithm used by crf outputs a
            single best path instead of a probability distribution.

        """
        if self.predictor:
            return self.predictor.pretty_tag_batch(texts)
        else:
            logging.fatal(
                'Predictor is None! Call fit() or load() to get predictor.')
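A sketch of collecting all LOC entities from a batch of analysis results, again assuming the `ner_app` instance from the earlier sketch.

results = ner_app.analyze_batch(['我在上海上学', '他在北京工作'])
locations = [entity['name']
             for result in results
             for entity in result['entities']
             if entity['type'] == 'LOC']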

    def restrict_analyze(self,
                         text: Union[str, List[str]],
                         threshold: float = 0.85) -> Dict[str, Any]:
        """Analyze the tagging result of given text predicted by the ner model and then remove some
        recognized entities such that 1) all entities's scores are higher than threshold; 2)
        for each entity type, only keep the entity with the highest score. After that, return the
        recognized result in pretty format with detailed information.

        Args:
            text: str or List of str. Can be an un-tokenized text, like ``'我在上海上学'`` or a
                tokenized (in char level) text sequence, like ``['我', '在', '上', '海', '上', '学']``.
            threshold: float. The scores of recognized entities must be higher than this threshold.

        Returns:
            A Dict including the original text and list of recognized entities with detailed
            information (name, type, score, offset). Specifically, it will be like:
            {'text': '我在上海上学',
             'entities': [{'name': '上海',
                           'type': 'LOC',
                           'score': 0.9986118674278259,
                           'beginOffset': 2,
                           'endOffset': 4
                           }]
            }

        Notes:
            The score of an entity is the probability of it being a named entity. It is computed by
            averaging the probabilities of all the tokens within the entity, as predicted by the
            ner model. However, if a crf layer is used as the last layer of the ner model, the
            score will always be 1. This is because the viterbi algorithm used by crf outputs a
            single best path instead of a probability distribution. As a result, we do not
            recommend using this method together with a crf layer.

        """
        if self.predictor:
            return self.predictor.restrict_tag(text, threshold)
        else:
            logging.fatal(
                'Predictor is None! Call fit() or load() to get predictor.')
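The filtering rule described in the docstring can be summarized with the standalone sketch below. This is only an illustration operating on an 'entities' list as returned by analyze(), not the library's internal implementation.

def restrict_entities(entities, threshold=0.85):
    """Keep only entities scoring above `threshold`; for each type keep the highest-scoring one."""
    best = {}
    for entity in entities:
        if entity['score'] <= threshold:
            continue
        kept = best.get(entity['type'])
        if kept is None or entity['score'] > kept['score']:
            best[entity['type']] = entity
    return list(best.values())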

    def restrict_analyze_batch(
            self,
            texts: Union[List[str], List[List[str]]],
            threshold: float = 0.85) -> List[Dict[str, Any]]:
        """Analyze the tagging results of given batch of texts predicted by the ner model and then
        remove some recognized entities such that 1) all entities's scores are higher than
        threshold; 2) for each entity type, only keep the entity with the highest score. After
        that, return the recognized results in pretty format with detailed information.

        Args:
            texts: List of str or List of List of str. Can be a batch of un-tokenized texts,
                like ``['我在上海上学', ...]`` or a batch of tokenized (in char level) text sequences,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            threshold: float. The scores of recognized entities must be higher than this threshold.

        Returns:
            List of Dict. Each Dict contains the tagging results of one text, including the original
            text and list of recognized entities with detailed information (name, type, score,
            offset). Specifically, it will be like:
            [{'text': '我在上海上学',
             'entities': [{'name': '上海',
                           'type': 'LOC',
                           'score': 0.9986118674278259,
                           'beginOffset': 2,
                           'endOffset': 4
                           }]
             }
             ...
            ]

        Notes:
            The score of an entity is the probability of it being a named entity. It is computed by
            averaging the probabilities of all the tokens within the entity, as predicted by the
            ner model. However, if a crf layer is used as the last layer of the ner model, the
            score will always be 1. This is because the viterbi algorithm used by crf outputs a
            single best path instead of a probability distribution. As a result, we do not
            recommend using this method together with a crf layer.

        """
        if self.predictor:
            return self.predictor.restrict_tag_batch(texts, threshold)
        else:
            logging.fatal(
                'Predictor is None! Call fit() or load() to get predictor.')
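Usage sketch for the batch variant (same assumed `ner_app` instance; the threshold value is illustrative).

restricted = ner_app.restrict_analyze_batch(['我在上海上学', '他在北京工作'], threshold=0.9)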

    def save(self,
             preprocessor_file: str,
             json_file: str,
             weights_file: Optional[str] = None) -> None:
        """Save ner application to disk.

        There are 3 things in total that we need to save: 1) preprocessor, which stores the
        vocabulary and embedding matrix built during pre-processing and helps us prepare feature
        input for ner model; 2) model architecture, which describes the framework of our ner model;
        3) model weights, which stores the value of ner model's parameters.
        
        Args:
            preprocessor_file: path to save preprocessor
            json_file: path to save model architecture
            weights_file: path to save model weights, can be None. When we use the `ModelCheckpoint`
                          or `SWA` callback, the model's weights are already saved to disk after
                          training. In that case, we don't need to save them again, so we usually
                          set weights_file to None.
        """
        self.preprocessor.save(preprocessor_file)
        logging.info('Save preprocessor to {}'.format(preprocessor_file))

        model_json = self.model.to_json()
        with open(json_file, 'w') as writer:
            writer.write(model_json)
        logging.info('Save model architecture to {}'.format(json_file))

        if weights_file:
            self.model.save_weights(weights_file)
            logging.info('Save model weights to {}'.format(weights_file))
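Usage sketch for save(); the paths are illustrative, and weights_file is left as None on the assumption that the weights were already written by the `ModelCheckpoint` or `SWA` callback during training.

ner_app.save(preprocessor_file='ckpt/ner_preprocessor.pkl',
             json_file='ckpt/ner.json',
             weights_file=None)  # weights already on disk via ModelCheckpoint/SWA in this scenario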

    def load(self,
             preprocessor_file: str,
             json_file: str,
             weights_file: str,
             custom_objects: Optional[Dict[str, Any]] = None) -> None:
        """Load ner application from disk.

        There are 3 things in total that we need to load: 1) preprocessor, which stores the
        vocabulary and embedding matrix built during pre-processing and helps us prepare feature
        input for ner model; 2) model architecture, which describes the framework of our ner model;
        3) model weights, which stores the value of ner model's parameters.

        Args:
            preprocessor_file: path to load preprocessor
            json_file: path to load model architecture
            weights_file: path to load model weights
            custom_objects: Optional dictionary mapping names (strings) to custom classes or
                            functions to be considered during deserialization. We will
                            automatically add all the custom layers of this project to
                            custom_objects, so you can ignore this argument in most cases unless
                            you use your own custom layer.

        """
        self.preprocessor = NERPreprocessor.load(preprocessor_file)
        logging.info('Load preprocessor from {}'.format(preprocessor_file))

        custom_objects = custom_objects or {}
        custom_objects.update(get_custom_objects())
        with open(json_file, 'r') as reader:
            self.model = tf.keras.models.model_from_json(
                reader.read(), custom_objects=custom_objects)
        logging.info('Load model architecture from {}'.format(json_file))

        self.model.load_weights(weights_file)
        logging.info('Load model weights from {}'.format(weights_file))

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.predictor = NERPredictor(self.model, self.preprocessor)
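Usage sketch for load(); paths are illustrative and the `NER` import is the same assumption as in the earlier sketch. custom_objects is only needed when the model uses your own custom layers.

new_app = NER()
new_app.load('ckpt/ner_preprocessor.pkl', 'ckpt/ner.json', 'ckpt/ner.hdf5')
# pass custom_objects={'YourLayer': YourLayer} only when the model uses your own custom layer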

    @staticmethod
    def get_ner_model(ner_model_type: str, num_class: int, use_char: bool,
                      char_embeddings: Optional[np.ndarray],
                      char_vocab_size: int, char_embed_dim: int,
                      char_embed_trainable: bool, use_bert: bool,
                      bert_config_file: Optional[str],
                      bert_checkpoint_file: Optional[str],
                      bert_trainable: bool, use_word: bool,
                      word_embeddings: Optional[np.ndarray],
                      word_vocab_size: int, word_embed_dim: int,
                      word_embed_trainable: bool, max_len: Optional[int],
                      use_crf: bool,
                      optimizer: Union[str, tf.keras.optimizers.Optimizer],
                      **kwargs) -> tf.keras.models.Model:
        """Build ner model.

        Args:
            ner_model_type: str. Which type of ner model to use, can be one of {'bilstm',
                'bilstm_cnn', 'bigru', 'bigru_cnn', 'bert'}.
            num_class: int. Number of entity types. Usually calculated and passed by ner
                preprocessor.
            use_char: Boolean. Whether to use character embedding as input.
            char_embeddings: Optional np.ndarray. Char embedding matrix, shaped
                [char_vocab_size, char_embed_dim]. Usually pre-trained and passed by ner
                preprocessor. There are 2 cases when char_embeddings is None: 1) use_char is
                False and char embedding is not used as input; 2) the user did not provide a valid
                pre-trained embedding file or any embedding training method.
            char_vocab_size: int. The size of char vocabulary. Usually calculated and passed by ner
                preprocessor.
            char_embed_dim: int. Dimensionality of char embedding.
            char_embed_trainable: Boolean. Whether to update char embedding during training.
            use_bert: Boolean. Whether to use bert embedding as input.
            bert_config_file: Optional str, can be None. Path to bert's configuration file.
            bert_checkpoint_file: Optional str, can be None. Path to bert's checkpoint file.
            bert_trainable: Boolean. Whether to update bert during training.
            use_word: Boolean. Whether to use word as additional input.
            word_embeddings: Optional np.ndarray. Similar to char_embeddings.
            word_vocab_size: int. Similar to char_vocab_size.
            word_embed_dim: int. Similar to char_embed_dim.
            word_embed_trainable: Boolean. Similar to char_embed_trainable.
            max_len: Optional int, can be None. Max length of one sequence.
            use_crf: Boolean. Whether to use crf layer.
            optimizer: str or instance of `tf.keras.optimizers.Optimizer`. Which optimizer to
                use during training.
            **kwargs: Other arguments for building the ner model, such as "rnn_units", "fc_dim",
                etc. See models.ner_models.py for these arguments.

        Returns:
            Instance of `tf.keras.models.Model`. The built ner model.

        Raises:
            ValueError when `ner_model_type` is not one of {'bilstm', 'bilstm_cnn', 'bigru',
            'bigru_cnn', 'bert'}.

        """

        if ner_model_type not in ner_model_dict:
            raise ValueError(
                '`ner_model_type` not understood: {}'.format(ner_model_type))
        else:
            ner_model = ner_model_dict[ner_model_type](
                num_class=num_class,
                use_char=use_char,
                char_embeddings=char_embeddings,
                char_vocab_size=char_vocab_size,
                char_embed_dim=char_embed_dim,
                char_embed_trainable=char_embed_trainable,
                use_bert=use_bert,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                use_word=use_word,
                word_embeddings=word_embeddings,
                word_vocab_size=word_vocab_size,
                word_embed_dim=word_embed_dim,
                word_embed_trainable=word_embed_trainable,
                max_len=max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs)

            return ner_model.build_model()
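A build sketch for the static method above, assuming `preprocessor` is a fitted NERPreprocessor (as used throughout this listing) and that the surrounding application class is `NER`; the hyper-parameter values are illustrative.

model = NER.get_ner_model(
    ner_model_type='bilstm_cnn',
    num_class=preprocessor.num_class,
    use_char=True,
    char_embeddings=preprocessor.char_embeddings,
    char_vocab_size=preprocessor.char_vocab_size,
    char_embed_dim=preprocessor.char_embed_dim,
    char_embed_trainable=False,
    use_bert=False,
    bert_config_file=None,
    bert_checkpoint_file=None,
    bert_trainable=False,
    use_word=False,
    word_embeddings=preprocessor.word_embeddings,
    word_vocab_size=preprocessor.word_vocab_size,
    word_embed_dim=preprocessor.word_embed_dim,
    word_embed_trainable=False,
    max_len=preprocessor.max_len,
    use_crf=True,
    optimizer='adam')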

    def load_pretrained_model(self) -> None:
        """Load a pre-trained ner model that was trained on msra dataset using BiLSTM+CNN+CRF model.

        """
        cache_subdir = 'pretrained_models'

        preprocessor_file = tf.keras.utils.get_file(
            fname='msra_ner_bilstm_cnn_crf_preprocessor.pkl',
            origin=MODEL_STORAGE_PREFIX +
            'msra_ner_bilstm_cnn_crf_preprocessor.pkl',
            cache_subdir=cache_subdir,
            cache_dir=CACHE_DIR)
        json_file = tf.keras.utils.get_file(
            fname='msra_ner_bilstm_cnn_crf.json',
            origin=MODEL_STORAGE_PREFIX + 'msra_ner_bilstm_cnn_crf.json',
            cache_subdir=cache_subdir,
            cache_dir=CACHE_DIR)
        weights_file = tf.keras.utils.get_file(
            fname='msra_ner_bilstm_cnn_crf.hdf5',
            origin=MODEL_STORAGE_PREFIX + 'msra_ner_bilstm_cnn_crf.hdf5',
            cache_subdir=cache_subdir,
            cache_dir=CACHE_DIR)

        self.load(preprocessor_file, json_file, weights_file)
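Usage sketch for load_pretrained_model(), with the same assumed `NER` import as above; the model files are downloaded and cached on first call.

ner_app = NER()
ner_app.load_pretrained_model()  # fetches the MSRA BiLSTM+CNN+CRF model into the cache directory
print(ner_app.predict('我在上海上学'))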
Esempio n. 10
0
    def fit(self,
            train_data: List[List[str]],
            train_labels: List[List[str]],
            valid_data: Optional[List[List[str]]] = None,
            valid_labels: Optional[List[List[str]]] = None,
            ner_model_type: str = 'bilstm_cnn',
            use_char: bool = True,
            char_embed_type: Optional[str] = 'word2vec',
            char_embed_dim: int = 300,
            char_embed_trainable: bool = True,
            use_bert: bool = False,
            bert_vocab_file: Optional[str] = None,
            bert_config_file: Optional[str] = None,
            bert_checkpoint_file: Optional[str] = None,
            bert_trainable: bool = False,
            use_word: bool = False,
            external_word_dict: Optional[List[str]] = None,
            word_embed_type: Optional[str] = 'word2vec',
            word_embed_dim: int = 300,
            word_embed_trainable: bool = True,
            max_len: Optional[int] = None,
            use_crf: bool = True,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam',
            batch_size: int = 32,
            epochs: int = 50,
            callback_list: Optional[List[str]] = None,
            checkpoint_dir: Optional[str] = None,
            model_name: Optional[str] = None,
            load_swa_model: bool = False,
            **kwargs) -> None:
        """Train ner model with provided dataset.

        We would like to make NER in Fancy-NLP more configurable, so we provide a bunch of
        arguments for users to configure:
            1. Which type of ner model to use; currently we implement 5 types of ner models:
            'bilstm', 'bilstm_cnn', 'bigru', 'bigru_cnn' and 'bert'.

            2. Which kind of input embedding to use; we support 3 kinds of embedding: char
            embedding, bert embedding and word embedding. We can choose any one of them, or
            combine any two or all of them, to use as input. Note that our ner model only supports
            char-level input, since char-level input has proven effective for the Chinese NER task
            without introducing word-segmentation errors. Therefore, we should use char embedding
            or bert embedding or both of them as the main input. On that basis, we can use word
            embedding as auxiliary input to provide extra semantic information.

            3. Whether to use CRF; we can choose whether to add a crf layer as the last layer of
            the ner model.

            4. How to train the model:
            a) which optimizer to use; we support any optimizer that is compatible with
               tf.keras optimizers:
               https://www.tensorflow.org/api_docs/python/tf/keras/optimizers ;
            b) how many samples to train per batch and how many epochs to train;
            c) which callbacks to use during training; we currently support 3 kinds of callbacks:
               i) 'modelcheckpoint' is used to save the model with the best performance:
                   https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint ;
               ii) 'earlystopping' is used to stop training when no performance gain is observed:
                   https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping ;
               iii) 'swa' is used to apply a novel weight-averaging ensemble mechanism to the
                   ner model we are training: https://arxiv.org/abs/1803.05407

            5. Where to save the model

        Args:
            train_data: List of List of str. List of tokenized (in char level) texts for training,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            train_labels: List of List of str. The labels of train_data, usually in BIO or BIOES
                format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.
            valid_data: Optional List of List of str, can be None. List of tokenized (in char
                level) texts for evaluation.
            valid_labels: Optional List of List of str, can be None. The labels of valid_data.
                We can use fancy_nlp.utils.load_ner_data_and_labels() function to get training
                or validation data and labels from raw dataset in CoNLL format.
            ner_model_type: str. Which type of ner model to use, can be one of {'bilstm',
                'bilstm-cnn', 'bigru', 'bigru-cnn', 'bert'}.
            use_char: Boolean. Whether to use character embedding as input.
            char_embed_type: Optional str, can be None. The type of char embedding, can be a
                pre-trained embedding filename that is used to load a pre-trained embedding, or an
                embedding training method (one of {'word2vec', 'fasttext'}) that is used to train
                character embedding on the dataset. If None, do not apply any pre-trained
                embedding, and use a randomly initialized embedding instead.
            char_embed_dim: int. Dimensionality of char embedding.
            char_embed_trainable: Boolean. Whether to update char embedding during training.
            use_bert: Boolean. Whether to use bert embedding as input.
            bert_vocab_file: Optional str, can be None. Path to bert's vocabulary file.
            bert_config_file: Optional str, can be None. Path to bert's configuration file.
            bert_checkpoint_file: Optional str, can be None. Path to bert's checkpoint file.
            bert_trainable: Boolean. Whether to update bert during training.
            use_word: Boolean. Whether to use word as additional input.
            external_word_dict: Optional List of str, can be None. List of words, an external word
                dictionary that will be loaded into jieba. It can be regarded as a kind of
                gazetteer that contains a number of correct named entities,
                such as ``['南京市', '长江大桥']``.
            word_embed_type: Similar to 'char_embed_type'.
            word_embed_dim: Similar to 'char_embed_dim'.
            word_embed_trainable: Similar to 'char_embed_trainable'.
            max_len: Optional int, can be None. Max length of one sequence. If None, we dynamically
                use the max length of each batch as max_len. However, max_len must be provided
                when using bert as input.
            use_crf: Boolean. Whether to use crf layer.
            optimizer: str or instance of `tf.keras.optimizers.Optimizer`. Which optimizer to
                use during training.
            batch_size: int. Number of samples per gradient update.
            epochs: int. Number of epochs to train the model.
            callback_list: Optional List of str, can be None. Each item indicates the callback to
                apply during training. Currently, we support 'modelcheckpoint' for the
                `ModelCheckpoint` callback, 'earlystopping' for the `EarlyStopping` callback and
                'swa' for the `SWA` callback. We will automatically add the `NERMetric` callback
                when valid_data and valid_labels are both provided.
            checkpoint_dir: Optional str, can be None. Directory to save the ner model. It must be
                provided when using the `ModelCheckpoint` or `SWA` callback, since these callbacks
                need to save the ner model after training.
            model_name: Optional str, can be None. Prefix of the ner model's weights file. It must
                be provided when using the `ModelCheckpoint` or `SWA` callback, since these
                callbacks need to save the ner model after training. For example, if checkpoint_dir
                is 'ckpt' and model_name is 'model', the weights saved by the `ModelCheckpoint`
                callback will be 'ckpt/model.hdf5' and those saved by `SWA` will be
                'ckpt/model_swa.hdf5'.
            load_swa_model: Boolean. Whether to load the swa model; only applies when using the
                `SWA` callback. We suggest setting it to True when using the `SWA` callback, since
                the swa model performs better than the original model in most cases.
            **kwargs: Other arguments for building the ner model, such as "rnn_units", "fc_dim",
                etc. See models.ner_models.py for these arguments.
        """

        # whether to use traditional bert model for prediction
        use_bert_model = ner_model_type == 'bert'
        # add assertion for checking input
        assert not (use_bert_model and use_word), \
            'when using bert model, `use_word` must be False'
        assert not (use_bert_model and use_char), \
            'when using bert model, `use_char` must be False'
        assert not (use_bert_model and not use_bert), \
            'when using bert model, `use_bert` must be True'

        self.preprocessor = NERPreprocessor(
            train_data=train_data,
            train_labels=train_labels,
            use_char=use_char,
            use_bert=use_bert,
            use_word=use_word,
            external_word_dict=external_word_dict,
            bert_vocab_file=bert_vocab_file,
            char_embed_type=char_embed_type,
            char_embed_dim=char_embed_dim,
            word_embed_type=word_embed_type,
            word_embed_dim=word_embed_dim,
            max_len=max_len)

        self.model = self.get_ner_model(
            ner_model_type=ner_model_type,
            num_class=self.preprocessor.num_class,
            use_char=use_char,
            char_embeddings=self.preprocessor.char_embeddings,
            char_vocab_size=self.preprocessor.char_vocab_size,
            char_embed_dim=self.preprocessor.char_embed_dim,
            char_embed_trainable=char_embed_trainable,
            use_bert=use_bert,
            bert_config_file=bert_config_file,
            bert_checkpoint_file=bert_checkpoint_file,
            bert_trainable=bert_trainable,
            use_word=use_word,
            word_embeddings=self.preprocessor.word_embeddings,
            word_vocab_size=self.preprocessor.word_vocab_size,
            word_embed_dim=self.preprocessor.word_embed_dim,
            word_embed_trainable=word_embed_trainable,
            max_len=self.preprocessor.max_len,
            use_crf=use_crf,
            optimizer=optimizer,
            **kwargs)

        if callback_list and 'swa' in callback_list:
            # initialize swa model when using `SWA` callback
            swa_model = self.get_ner_model(
                ner_model_type=ner_model_type,
                num_class=self.preprocessor.num_class,
                use_char=use_char,
                char_embeddings=self.preprocessor.char_embeddings,
                char_vocab_size=self.preprocessor.char_vocab_size,
                char_embed_dim=self.preprocessor.char_embed_dim,
                char_embed_trainable=char_embed_trainable,
                use_bert=use_bert,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                use_word=use_word,
                word_embeddings=self.preprocessor.word_embeddings,
                word_vocab_size=self.preprocessor.word_vocab_size,
                word_embed_dim=self.preprocessor.word_embed_dim,
                word_embed_trainable=word_embed_trainable,
                max_len=self.preprocessor.max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs)
        else:
            swa_model = None

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.trainer.train_generator(train_data, train_labels, valid_data,
                                     valid_labels, batch_size, epochs,
                                     callback_list, checkpoint_dir, model_name,
                                     swa_model, load_swa_model)

        self.predictor = NERPredictor(self.model, self.preprocessor)

        if valid_data is not None and valid_labels is not None:
            logging.info('Evaluating on validation data...')
            self.score(valid_data, valid_labels)
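A training sketch for fit(), assuming the `NER` application class and the `load_ner_data_and_labels` helper referenced in the docstring above; file paths and hyper-parameter values are illustrative.

from fancy_nlp.applications import NER                # assumed import path
from fancy_nlp.utils import load_ner_data_and_labels  # path as referenced in the docstring

train_data, train_labels = load_ner_data_and_labels('data/ner/msra/train_data')  # illustrative path
valid_data, valid_labels = load_ner_data_and_labels('data/ner/msra/dev_data')    # illustrative path

ner_app = NER()
ner_app.fit(train_data, train_labels, valid_data, valid_labels,
            ner_model_type='bilstm_cnn',
            use_char=True,
            char_embed_type='word2vec',
            use_crf=True,
            batch_size=32,
            epochs=50,
            callback_list=['modelcheckpoint', 'earlystopping', 'swa'],
            checkpoint_dir='ckpt',
            model_name='msra_ner',
            load_swa_model=True)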