Exemple #1
0
    def test_train_no_bert(self):
        preprocessor = NERPreprocessor(self.train_data + self.valid_data,
                                       self.train_labels + self.valid_labels,
                                       use_word=True,
                                       char_embed_type='word2vec')
        ner_model = BiLSTMCNNNER(num_class=preprocessor.num_class,
                                 use_char=True,
                                 char_embeddings=preprocessor.char_embeddings,
                                 char_vocab_size=preprocessor.char_vocab_size,
                                 char_embed_dim=preprocessor.char_embed_dim,
                                 char_embed_trainable=False,
                                 use_word=True,
                                 word_embeddings=preprocessor.word_embeddings,
                                 word_vocab_size=preprocessor.word_vocab_size,
                                 word_embed_dim=preprocessor.word_embed_dim,
                                 word_embed_trainable=False,
                                 max_len=preprocessor.max_len,
                                 use_crf=True).build_model()

        ner_trainer = NERTrainer(ner_model, preprocessor)
        ner_trainer.train(self.train_data,
                          self.train_labels,
                          self.valid_data,
                          self.valid_labels,
                          batch_size=2,
                          epochs=2)
        assert not os.path.exists(self.json_file)
        assert not os.path.exists(self.weights_file)
Exemple #2
0
    def test_train_no_crf(self):
        ner_model = BiLSTMCNNNER(num_class=self.num_class,
                                 use_char=True,
                                 char_embeddings=self.char_embeddings,
                                 char_vocab_size=self.char_vocab_size,
                                 char_embed_dim=self.char_embed_dim,
                                 char_embed_trainable=False,
                                 use_bert=True,
                                 bert_config_file=self.bert_config_file,
                                 bert_checkpoint_file=self.bert_model_file,
                                 use_word=True,
                                 word_embeddings=self.word_embeddings,
                                 word_vocab_size=self.word_vocab_size,
                                 word_embed_dim=self.word_embed_dim,
                                 word_embed_trainable=False,
                                 max_len=self.preprocessor.max_len,
                                 use_crf=False).build_model()

        ner_trainer = NERTrainer(ner_model, self.preprocessor)
        ner_trainer.train(self.train_data,
                          self.train_labels,
                          self.valid_data,
                          self.valid_labels,
                          batch_size=2,
                          epochs=2)
        assert not os.path.exists(self.json_file)
        assert not os.path.exists(self.weights_file)
Exemple #3
0
    def load(self, preprocessor_file, json_file, weights_file, custom_objects=None):
        """load ner application

        Args:
            preprocessor_file: path to load preprocessor
            json_file: path to load model architecture
            weights_file: path to load model weights
            custom_objects: Optional dictionary mapping names (strings) to custom classes or
                            functions to be considered during deserialization. Must provided when
                            using custom layer.

        """
        self.preprocessor = NERPreprocessor.load(preprocessor_file)
        logging.info('Load preprocessor from {}'.format(preprocessor_file))

        custom_objects = custom_objects or {}
        custom_objects.update(get_custom_objects())
        with open(json_file, 'r') as reader:
            self.model = model_from_json(reader.read(), custom_objects=custom_objects)
        logging.info('Load model architecture from {}'.format(json_file))

        self.model.load_weights(weights_file)
        logging.info('Load model weight from {}'.format(weights_file))

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.predictor = NERPredictor(self.model, self.preprocessor)
Exemple #4
0
    def load(self,
             preprocessor_file: str,
             json_file: str,
             weights_file: str,
             custom_objects: Optional[Dict[str, Any]] = None) -> None:
        """Load ner application from disk.

        There are 3 things in total that we need to load: 1) preprocessor, which stores the
        vocabulary and embedding matrix built during pre-processing and helps us prepare feature
        input for ner model; 2) model architecture, which describes the framework of our ner model;
        3) model weights, which stores the value of ner model's parameters.

        Args:
            preprocessor_file: path to load preprocessor
            json_file: path to load model architecture
            weights_file: path to load model weights
            custom_objects: Optional dictionary mapping names (strings) to custom classes or
                            functions to be considered during deserialization. We will
                            automatically add all the custom layers of this project to
                            custom_objects. So you can ignore this argument in most cases unlesss
                            you use your own custom layer.

        """
        self.preprocessor = NERPreprocessor.load(preprocessor_file)
        logging.info('Load preprocessor from {}'.format(preprocessor_file))

        custom_objects = custom_objects or {}
        custom_objects.update(get_custom_objects())
        with open(json_file, 'r') as reader:
            self.model = tf.keras.models.model_from_json(
                reader.read(), custom_objects=custom_objects)
        logging.info('Load model architecture from {}'.format(json_file))

        self.model.load_weights(weights_file)
        logging.info('Load model weight from {}'.format(weights_file))

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.predictor = NERPredictor(self.model, self.preprocessor)
Exemple #5
0
    def setup_class(self):
        self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
            load_ner_data_and_labels(self.test_file, split=True)
        self.preprocessor = NERPreprocessor(
            self.train_data + self.valid_data,
            self.train_labels + self.valid_labels,
            use_bert=True,
            use_word=True,
            bert_vocab_file=self.bert_vocab_file,
            char_embed_type='word2vec',
            word_embed_type='word2vec',
            max_len=16)
        self.num_class = self.preprocessor.num_class
        self.char_embeddings = self.preprocessor.char_embeddings
        self.char_vocab_size = self.preprocessor.char_vocab_size
        self.char_embed_dim = self.preprocessor.char_embed_dim

        self.word_embeddings = self.preprocessor.word_embeddings
        self.word_vocab_size = self.preprocessor.word_vocab_size
        self.word_embed_dim = self.preprocessor.word_embed_dim
        self.checkpoint_dir = os.path.dirname(__file__)

        self.ner_model = BiLSTMCNNNER(
            num_class=self.num_class,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=True,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            max_len=self.preprocessor.max_len,
            use_crf=True).build_model()

        self.swa_model = BiLSTMCNNNER(
            num_class=self.num_class,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=True,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            max_len=self.preprocessor.max_len,
            use_crf=True).build_model()

        self.ner_trainer = NERTrainer(self.ner_model, self.preprocessor)

        self.json_file = os.path.join(self.checkpoint_dir,
                                      'bilstm_cnn_ner.json')
        self.weights_file = os.path.join(self.checkpoint_dir,
                                         'bilstm_cnn_ner.hdf5')
Exemple #6
0
    def fit(self,
            train_data,
            train_labels,
            valid_data=None,
            valid_labels=None,
            ner_model_type='bilstm_cnn',
            use_char=True,
            char_embed_type='word2vec',
            char_embed_dim=300,
            char_embed_trainable=True,
            use_bert=False,
            bert_vocab_file=None,
            bert_config_file=None,
            bert_checkpoint_file=None,
            bert_trainable=False,
            use_word=False,
            external_word_dict=None,
            word_embed_type='word2vec',
            word_embed_dim=300,
            word_embed_trainable=True,
            max_len=None,
            use_crf=True,
            optimizer='adam',
            batch_size=32,
            epochs=50,
            callback_list=None,
            checkpoint_dir=None,
            model_name=None,
            load_swa_model=False,
            **kwargs):
        """Train ner model using provided data

        Args:
            train_data: list of tokenized (in char level) texts for training,
                        like ``[['我', '是', '中', '国', '人']]``
            train_labels: labels string of train_data
            valid_data: list of tokenized (in char level) texts for evaluation
            valid_labels: labels string of valid data
            ner_model_type: str, which ner model to use
            use_char: boolean, whether to use char embedding as input
            char_embed_type: str, can be a pre-trained embedding filename or pre-trained embedding
                             methods (word2vec, fasttext)
            char_embed_dim: int, dimensionality of char embedding
            char_embed_trainable: boolean, whether to update char embedding during training
            use_bert: boolean, whether to use bert embedding as input
            bert_vocab_file: str, path to bert's vocabulary file
            bert_config_file: str, path to bert's configuration file
            bert_checkpoint_file: str, path to bert's checkpoint file
            bert_trainable: boolean, whether to update bert during training
            use_word: boolean, whether to use word as additional input
            external_word_dict: list of words, external word dictionary
            word_embed_dim: similar as 'char_embed_dim'
            word_embed_type: similar as 'char_embed_type'
            word_embed_trainable: similar as 'char_embed_trainable'
            max_len: int, max sequence len. If None, we dynamically use the max length of one batch
                     as max_len. However, max_len must be provided when using bert as input.
            use_crf: boolean, whether to use crf layer
            optimizer: str or instance of `keras.optimizers.Optimizer`, indicating the optimizer to
                       use during training
            batch_size: num of samples per gradient update
            epochs: num of epochs to train the model
            callback_list: list of str, each item indicates the callback to apply during training
                           Currently, we support using 'modelcheckpoint' for `ModelCheckpoint`
                           callback, 'earlystopping` for 'Earlystopping` callback, 'swa' for
                           'SWA' callback. We will automatically add `NERMetric` callback when
                           valid_data and valid_labels are both provided.
            checkpoint_dir: str, directory to save ner model, must be provided when using
                            `ModelCheckpoint` or `SWA` callback.
            model_name: str, prefix of ner model's weights filem must be provided when using
                        `ModelCheckpoint` or `SWA` callback.
                        For example, if checkpoint_dir is 'ckpt' and model_name is 'model', the
                        weights of ner model saved by `ModelCheckpoint` callback will be
                        'ckpt/model.hdf5' and by `SWA` callback will be 'ckpt/model_swa.hdf5'
            load_swa_model: boolean, whether to load swa model, only apply when using SWA Callback.
            **kwargs: other argument for building ner model, such as "rnn_units", "fc_dim" etc.
        """
        self.preprocessor = NERPreprocessor(train_data=train_data,
                                            train_labels=train_labels,
                                            use_char=use_char,
                                            use_bert=use_bert,
                                            use_word=use_word,
                                            external_word_dict=external_word_dict,
                                            bert_vocab_file=bert_vocab_file,
                                            char_embed_type=char_embed_type,
                                            char_embed_dim=char_embed_dim,
                                            word_embed_type=word_embed_type,
                                            word_embed_dim=word_embed_dim,
                                            max_len=max_len)

        self.model = self.get_ner_model(ner_model_type=ner_model_type,
                                        num_class=self.preprocessor.num_class,
                                        use_char=use_char,
                                        char_embeddings=self.preprocessor.char_embeddings,
                                        char_vocab_size=self.preprocessor.char_vocab_size,
                                        char_embed_dim=self.preprocessor.char_embed_dim,
                                        char_embed_trainable=char_embed_trainable,
                                        use_bert=use_bert,
                                        bert_config_file=bert_config_file,
                                        bert_checkpoint_file=bert_checkpoint_file,
                                        bert_trainable=bert_trainable,
                                        use_word=use_word,
                                        word_embeddings=self.preprocessor.word_embeddings,
                                        word_vocab_size=self.preprocessor.word_vocab_size,
                                        word_embed_dim=self.preprocessor.word_embed_dim,
                                        word_embed_trainable=word_embed_trainable,
                                        max_len=self.preprocessor.max_len,
                                        use_crf=use_crf,
                                        optimizer=optimizer,
                                        **kwargs)

        if 'swa' in callback_list:
            swa_model = self.get_ner_model(ner_model_type=ner_model_type,
                                           num_class=self.preprocessor.num_class,
                                           use_char=use_char,
                                           char_embeddings=self.preprocessor.char_embeddings,
                                           char_vocab_size=self.preprocessor.char_vocab_size,
                                           char_embed_dim=self.preprocessor.char_embed_dim,
                                           char_embed_trainable=char_embed_trainable,
                                           use_bert=use_bert,
                                           bert_config_file=bert_config_file,
                                           bert_checkpoint_file=bert_checkpoint_file,
                                           bert_trainable=bert_trainable,
                                           use_word=use_word,
                                           word_embeddings=self.preprocessor.word_embeddings,
                                           word_vocab_size=self.preprocessor.word_vocab_size,
                                           word_embed_dim=self.preprocessor.word_embed_dim,
                                           word_embed_trainable=word_embed_trainable,
                                           max_len=self.preprocessor.max_len,
                                           use_crf=use_crf,
                                           optimizer=optimizer,
                                           **kwargs)
        else:
            swa_model = None

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.trainer.train_generator(train_data, train_labels, valid_data, valid_labels,
                                     batch_size, epochs, callback_list, checkpoint_dir, model_name,
                                     swa_model, load_swa_model)

        self.predictor = NERPredictor(self.model, self.preprocessor)

        if valid_data is not None and valid_labels is not None:
            logging.info('Evaluating on validation data...')
            self.score(valid_data, valid_labels)
Exemple #7
0
    def fit(self,
            train_data: List[List[str]],
            train_labels: List[List[str]],
            valid_data: Optional[List[List[str]]] = None,
            valid_labels: Optional[List[List[str]]] = None,
            ner_model_type: str = 'bilstm_cnn',
            use_char: bool = True,
            char_embed_type: Optional[str] = 'word2vec',
            char_embed_dim: int = 300,
            char_embed_trainable: bool = True,
            use_bert: bool = False,
            bert_vocab_file: Optional[str] = None,
            bert_config_file: Optional[str] = None,
            bert_checkpoint_file: Optional[str] = None,
            bert_trainable: bool = False,
            use_word: bool = False,
            external_word_dict: Optional[List[str]] = None,
            word_embed_type: Optional[str] = 'word2vec',
            word_embed_dim: int = 300,
            word_embed_trainable: bool = True,
            max_len: Optional[int] = None,
            use_crf: bool = True,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam',
            batch_size: int = 32,
            epochs: int = 50,
            callback_list: Optional[List[str]] = None,
            checkpoint_dir: Optional[str] = None,
            model_name: Optional[str] = None,
            load_swa_model: bool = False,
            **kwargs) -> None:
        """Train ner model with provided dataset.

        We would like to make NER in Fancy-NLP more configurable, so we provided a bunch of
        arguments for users to configure:
            1. Which type of ner model to use; Currently we implement 5 types of ner models:
            'bilstm', 'bisltm-cnn', 'bigru', 'bigru-cnn' and 'bert'.

            2. Which kind of input embedding to use; We support 3 kinds of embedding: char
            embedding, bert embedding and word embedding. We can choose any one of them or
            combine any two or all of them to used as input. Note that our ner model only support
            char-level input, for the reason that char-level input haven shown
            effective for Chinese NER task without word-segmentation error. Therefore, we should use
            char embedding or bert embedding or both of them as main input. On that basis, we
            can use word embedding as auxiliary input to provide semantic information.

            3. Whether to use CRF; We can choose whether to add crf layer on the last layer of ner
            model.

            4. How to train the model:
            a) which optimizer to use, we support any optimizer that is compatible with
               tf.keras's optimizer:
               https://www.tensorflow.org/api_docs/python/tf/keras/optimizers ;
            b) how many sample to train per batch, how many epoch to train;
            c) which callbacks to use during training, we currently support 3 kinds of callbacks:
               i) 'modelcheckpoint' is used to save the model with best performance:
                   https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint;
               ii) 'earlystoppoing' is used to stop training when no performance gain observed:
                   https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
               iii) 'swa' is used to apply an novel weight averaging ensemble mechanism to the
                   ner model we are training: https://arxiv.org/abs/1803.05407)

            5. Where to save the model

        Args:
            train_data: List of List of str. List of tokenized (in char level) texts for training,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            train_labels: List of List of str. The labels of train_data, usually in BIO or BIOES
                format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.
            valid_data: Optional List of List of str, can be None. List of tokenized (in char
                level) texts for evaluation.
            valid_labels: Optional List of List of str, can be None. The labels of valid_data.
                We can use fancy_nlp.utils.load_ner_data_and_labels() function to get training
                or validation data and labels from raw dataset in CoNLL format.
            ner_model_type: str. Which type of ner model to use, can be one of {'bilstm',
                'bilstm-cnn', 'bigru', 'bigru-cnn', 'bert'}.
            use_char: Boolean. Whether to use character embedding as input.
            char_embed_type: Optional str, can be None. The type of char embedding, can be a
                pre-trained embedding filename that used to load pre-trained embedding,
                or a embedding training method (one of {'word2vec', 'fasttext'}) that used to
                train character embedding with dataset. If None, do not apply anr pre-trained
                embedding, and use randomly initialized embedding instead.
            char_embed_dim: int. Dimensionality of char embedding.
            char_embed_trainable: Boolean. Whether to update char embedding during training.
            use_bert: Boolean. Whether to use bert embedding as input.
            bert_vocab_file: Optional str, can be None. Path to bert's vocabulary file.
            bert_config_file: Optional str, can be None. Path to bert's configuration file.
            bert_checkpoint_file: Optional str, can be None. Path to bert's checkpoint file.
            bert_trainable: Boolean. Whether to update bert during training.
            use_word: Boolean. Whether to use word as additional input.
            external_word_dict: Optional List of str, can be None. List of words, external word
                dictionary that will be used to loaded in jieba. It can be regarded as one kind
                of gazetter that contain a number of correct named-entities.
                Such as ``['南京市', '长江大桥']``
            word_embed_dim: similar as 'char_embed_dim'.
            word_embed_type: similar as 'char_embed_type'.
            word_embed_trainable: similar as 'char_embed_trainable'.
            max_len: Optional int, can be None. Max length of one sequence. If None, we dynamically
                use the max length of each batch as max_len. However, max_len must be provided
                when using bert as input.
            use_crf: Boolean. Whether to use crf layer.
            optimizer: str or instance of `tf.keras.optimizers.Optimizer`. Which optimizer to
                use during training.
            batch_size: int. Number of samples per gradient update.
            epochs: int. Number of epochs to train the model
            callback_list: Optional List of str or instance of `keras.callbacks.Callback`,
                can be None. Each item indicates the callback to apply during training. Currently,
                we support using 'modelcheckpoint' for `ModelCheckpoint` callback, 'earlystopping`
                for 'Earlystopping` callback, 'swa' for 'SWA' callback. We will automatically add
                `NERMetric` callback when valid_data and valid_labels are both provided.
            checkpoint_dir: Optional str, can be None. Directory to save the ner model. It must be
                provided when using `ModelCheckpoint` or `SWA` callback, since these callbacks needs
                to save ner model after training.
            model_name: Optional str, can be None. Prefix of ner model's weights file. I must be
                provided when using `ModelCheckpoint` or `SWA` callback, since these callbacks needs
                to save ner model after training. For example, if checkpoint_dir is 'ckpt' and
                model_name is 'model', the weights of ner model saved by `ModelCheckpoint` callback
                will be 'ckpt/model.hdf5' and by `SWA` callback will be 'ckpt/model_swa.hdf5'.
            load_swa_model: Boolean. Whether to load swa model, only apply when using `SWA`
                Callback. We suggest set it to True when using `SWA` Callback since swa model
                performs better than the original model at most cases.
            **kwargs: Other argument for building ner model, such as "rnn_units", "fc_dim" etc. See
                models.ner_models.py for there arguments.
        """

        # whether to use traditional bert model for prediction
        use_bert_model = ner_model_type == 'bert'
        # add assertion for checking input
        assert not (use_bert_model and use_word), \
            'when using bert model, `use_word` must be False'
        assert not (use_bert_model and use_char), \
            'when using bert model, `use_char` must be False'
        assert not (use_bert_model and not use_bert), \
            'when using bert model, `use_bert` must be True'

        self.preprocessor = NERPreprocessor(
            train_data=train_data,
            train_labels=train_labels,
            use_char=use_char,
            use_bert=use_bert,
            use_word=use_word,
            external_word_dict=external_word_dict,
            bert_vocab_file=bert_vocab_file,
            char_embed_type=char_embed_type,
            char_embed_dim=char_embed_dim,
            word_embed_type=word_embed_type,
            word_embed_dim=word_embed_dim,
            max_len=max_len)

        self.model = self.get_ner_model(
            ner_model_type=ner_model_type,
            num_class=self.preprocessor.num_class,
            use_char=use_char,
            char_embeddings=self.preprocessor.char_embeddings,
            char_vocab_size=self.preprocessor.char_vocab_size,
            char_embed_dim=self.preprocessor.char_embed_dim,
            char_embed_trainable=char_embed_trainable,
            use_bert=use_bert,
            bert_config_file=bert_config_file,
            bert_checkpoint_file=bert_checkpoint_file,
            bert_trainable=bert_trainable,
            use_word=use_word,
            word_embeddings=self.preprocessor.word_embeddings,
            word_vocab_size=self.preprocessor.word_vocab_size,
            word_embed_dim=self.preprocessor.word_embed_dim,
            word_embed_trainable=word_embed_trainable,
            max_len=self.preprocessor.max_len,
            use_crf=use_crf,
            optimizer=optimizer,
            **kwargs)

        if 'swa' in callback_list:
            # initialize swa model when using `SWA` callback
            swa_model = self.get_ner_model(
                ner_model_type=ner_model_type,
                num_class=self.preprocessor.num_class,
                use_char=use_char,
                char_embeddings=self.preprocessor.char_embeddings,
                char_vocab_size=self.preprocessor.char_vocab_size,
                char_embed_dim=self.preprocessor.char_embed_dim,
                char_embed_trainable=char_embed_trainable,
                use_bert=use_bert,
                bert_config_file=bert_config_file,
                bert_checkpoint_file=bert_checkpoint_file,
                bert_trainable=bert_trainable,
                use_word=use_word,
                word_embeddings=self.preprocessor.word_embeddings,
                word_vocab_size=self.preprocessor.word_vocab_size,
                word_embed_dim=self.preprocessor.word_embed_dim,
                word_embed_trainable=word_embed_trainable,
                max_len=self.preprocessor.max_len,
                use_crf=use_crf,
                optimizer=optimizer,
                **kwargs)
        else:
            swa_model = None

        self.trainer = NERTrainer(self.model, self.preprocessor)
        self.trainer.train_generator(train_data, train_labels, valid_data,
                                     valid_labels, batch_size, epochs,
                                     callback_list, checkpoint_dir, model_name,
                                     swa_model, load_swa_model)

        self.predictor = NERPredictor(self.model, self.preprocessor)

        if valid_data is not None and valid_labels is not None:
            logging.info('Evaluating on validation data...')
            self.score(valid_data, valid_labels)