def test_train_no_bert(self):
    preprocessor = NERPreprocessor(self.train_data + self.valid_data,
                                   self.train_labels + self.valid_labels,
                                   use_word=True,
                                   char_embed_type='word2vec')
    ner_model = BiLSTMCNNNER(num_class=preprocessor.num_class,
                             use_char=True,
                             char_embeddings=preprocessor.char_embeddings,
                             char_vocab_size=preprocessor.char_vocab_size,
                             char_embed_dim=preprocessor.char_embed_dim,
                             char_embed_trainable=False,
                             use_word=True,
                             word_embeddings=preprocessor.word_embeddings,
                             word_vocab_size=preprocessor.word_vocab_size,
                             word_embed_dim=preprocessor.word_embed_dim,
                             word_embed_trainable=False,
                             max_len=preprocessor.max_len,
                             use_crf=True).build_model()
    ner_trainer = NERTrainer(ner_model, preprocessor)
    ner_trainer.train(self.train_data, self.train_labels,
                      self.valid_data, self.valid_labels,
                      batch_size=2, epochs=2)
    assert not os.path.exists(self.json_file)
    assert not os.path.exists(self.weights_file)
def test_train_no_crf(self):
    ner_model = BiLSTMCNNNER(num_class=self.num_class,
                             use_char=True,
                             char_embeddings=self.char_embeddings,
                             char_vocab_size=self.char_vocab_size,
                             char_embed_dim=self.char_embed_dim,
                             char_embed_trainable=False,
                             use_bert=True,
                             bert_config_file=self.bert_config_file,
                             bert_checkpoint_file=self.bert_model_file,
                             use_word=True,
                             word_embeddings=self.word_embeddings,
                             word_vocab_size=self.word_vocab_size,
                             word_embed_dim=self.word_embed_dim,
                             word_embed_trainable=False,
                             max_len=self.preprocessor.max_len,
                             use_crf=False).build_model()
    ner_trainer = NERTrainer(ner_model, self.preprocessor)
    ner_trainer.train(self.train_data, self.train_labels,
                      self.valid_data, self.valid_labels,
                      batch_size=2, epochs=2)
    assert not os.path.exists(self.json_file)
    assert not os.path.exists(self.weights_file)
def load(self, preprocessor_file, json_file, weights_file, custom_objects=None):
    """Load ner application.

    Args:
        preprocessor_file: path to load preprocessor
        json_file: path to load model architecture
        weights_file: path to load model weights
        custom_objects: Optional dictionary mapping names (strings) to custom classes or
            functions to be considered during deserialization. Must be provided when using
            custom layers.

    """
    self.preprocessor = NERPreprocessor.load(preprocessor_file)
    logging.info('Load preprocessor from {}'.format(preprocessor_file))

    custom_objects = custom_objects or {}
    custom_objects.update(get_custom_objects())
    with open(json_file, 'r') as reader:
        self.model = model_from_json(reader.read(), custom_objects=custom_objects)
    logging.info('Load model architecture from {}'.format(json_file))
    self.model.load_weights(weights_file)
    logging.info('Load model weight from {}'.format(weights_file))

    self.trainer = NERTrainer(self.model, self.preprocessor)
    self.predictor = NERPredictor(self.model, self.preprocessor)
def load(self,
         preprocessor_file: str,
         json_file: str,
         weights_file: str,
         custom_objects: Optional[Dict[str, Any]] = None) -> None:
    """Load ner application from disk.

    There are 3 things in total that we need to load:
    1) preprocessor, which stores the vocabulary and embedding matrix built during
       pre-processing and helps us prepare feature input for the ner model;
    2) model architecture, which describes the framework of our ner model;
    3) model weights, which stores the values of the ner model's parameters.

    Args:
        preprocessor_file: path to load preprocessor
        json_file: path to load model architecture
        weights_file: path to load model weights
        custom_objects: Optional dictionary mapping names (strings) to custom classes or
            functions to be considered during deserialization. We will automatically add
            all the custom layers of this project to custom_objects, so you can ignore
            this argument in most cases unless you use your own custom layer.

    """
    self.preprocessor = NERPreprocessor.load(preprocessor_file)
    logging.info('Load preprocessor from {}'.format(preprocessor_file))

    custom_objects = custom_objects or {}
    custom_objects.update(get_custom_objects())
    with open(json_file, 'r') as reader:
        self.model = tf.keras.models.model_from_json(reader.read(),
                                                     custom_objects=custom_objects)
    logging.info('Load model architecture from {}'.format(json_file))
    self.model.load_weights(weights_file)
    logging.info('Load model weight from {}'.format(weights_file))

    self.trainer = NERTrainer(self.model, self.preprocessor)
    self.predictor = NERPredictor(self.model, self.preprocessor)
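# A minimal usage sketch of `load` (hedged: it assumes the enclosing application class is
# exposed as `fancy_nlp.applications.NER` and that the three files below were produced by a
# previous save of the same application; all file paths here are illustrative, not real
# project artifacts).
from fancy_nlp.applications import NER
from fancy_nlp.utils import load_ner_data_and_labels

ner_app = NER()  # constructor arguments may differ; shown only for illustration
ner_app.load(preprocessor_file='ckpt/ner_preprocessor.pkl',  # hypothetical paths
             json_file='ckpt/ner_model.json',
             weights_file='ckpt/ner_model.hdf5')

# after loading, the trainer and predictor are re-created, so evaluation works directly
train_data, train_labels, valid_data, valid_labels = load_ner_data_and_labels(
    'data/example.txt', split=True)  # hypothetical CoNLL-format file
ner_app.score(valid_data, valid_labels)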
def setup_class(self):
    self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
        load_ner_data_and_labels(self.test_file, split=True)
    self.preprocessor = NERPreprocessor(
        self.train_data + self.valid_data,
        self.train_labels + self.valid_labels,
        use_bert=True,
        use_word=True,
        bert_vocab_file=self.bert_vocab_file,
        char_embed_type='word2vec',
        word_embed_type='word2vec',
        max_len=16)
    self.num_class = self.preprocessor.num_class
    self.char_embeddings = self.preprocessor.char_embeddings
    self.char_vocab_size = self.preprocessor.char_vocab_size
    self.char_embed_dim = self.preprocessor.char_embed_dim
    self.word_embeddings = self.preprocessor.word_embeddings
    self.word_vocab_size = self.preprocessor.word_vocab_size
    self.word_embed_dim = self.preprocessor.word_embed_dim
    self.checkpoint_dir = os.path.dirname(__file__)

    self.ner_model = BiLSTMCNNNER(
        num_class=self.num_class,
        use_char=True,
        char_embeddings=self.char_embeddings,
        char_vocab_size=self.char_vocab_size,
        char_embed_dim=self.char_embed_dim,
        char_embed_trainable=False,
        use_bert=True,
        bert_config_file=self.bert_config_file,
        bert_checkpoint_file=self.bert_model_file,
        use_word=True,
        word_embeddings=self.word_embeddings,
        word_vocab_size=self.word_vocab_size,
        word_embed_dim=self.word_embed_dim,
        word_embed_trainable=False,
        max_len=self.preprocessor.max_len,
        use_crf=True).build_model()

    # a second, identically configured model used by the `SWA` callback to hold the
    # averaged weights
    self.swa_model = BiLSTMCNNNER(
        num_class=self.num_class,
        use_char=True,
        char_embeddings=self.char_embeddings,
        char_vocab_size=self.char_vocab_size,
        char_embed_dim=self.char_embed_dim,
        char_embed_trainable=False,
        use_bert=True,
        bert_config_file=self.bert_config_file,
        bert_checkpoint_file=self.bert_model_file,
        use_word=True,
        word_embeddings=self.word_embeddings,
        word_vocab_size=self.word_vocab_size,
        word_embed_dim=self.word_embed_dim,
        word_embed_trainable=False,
        max_len=self.preprocessor.max_len,
        use_crf=True).build_model()

    self.ner_trainer = NERTrainer(self.ner_model, self.preprocessor)

    self.json_file = os.path.join(self.checkpoint_dir, 'bilstm_cnn_ner.json')
    self.weights_file = os.path.join(self.checkpoint_dir, 'bilstm_cnn_ner.hdf5')
def fit(self, train_data, train_labels, valid_data=None, valid_labels=None,
        ner_model_type='bilstm_cnn', use_char=True, char_embed_type='word2vec',
        char_embed_dim=300, char_embed_trainable=True, use_bert=False,
        bert_vocab_file=None, bert_config_file=None, bert_checkpoint_file=None,
        bert_trainable=False, use_word=False, external_word_dict=None,
        word_embed_type='word2vec', word_embed_dim=300, word_embed_trainable=True,
        max_len=None, use_crf=True, optimizer='adam', batch_size=32, epochs=50,
        callback_list=None, checkpoint_dir=None, model_name=None,
        load_swa_model=False, **kwargs):
    """Train ner model using provided data.

    Args:
        train_data: list of tokenized (in char level) texts for training, like
            ``[['我', '是', '中', '国', '人']]``
        train_labels: label strings of train_data
        valid_data: list of tokenized (in char level) texts for evaluation
        valid_labels: label strings of valid_data
        ner_model_type: str, which ner model to use
        use_char: boolean, whether to use char embedding as input
        char_embed_type: str, can be a pre-trained embedding filename or a pre-trained
            embedding method (word2vec, fasttext)
        char_embed_dim: int, dimensionality of char embedding
        char_embed_trainable: boolean, whether to update char embedding during training
        use_bert: boolean, whether to use bert embedding as input
        bert_vocab_file: str, path to bert's vocabulary file
        bert_config_file: str, path to bert's configuration file
        bert_checkpoint_file: str, path to bert's checkpoint file
        bert_trainable: boolean, whether to update bert during training
        use_word: boolean, whether to use word as additional input
        external_word_dict: list of words, external word dictionary
        word_embed_dim: similar to 'char_embed_dim'
        word_embed_type: similar to 'char_embed_type'
        word_embed_trainable: similar to 'char_embed_trainable'
        max_len: int, max sequence length. If None, we dynamically use the max length of
            each batch as max_len. However, max_len must be provided when using bert as
            input.
        use_crf: boolean, whether to use crf layer
        optimizer: str or instance of `keras.optimizers.Optimizer`, indicating the
            optimizer to use during training
        batch_size: num of samples per gradient update
        epochs: num of epochs to train the model
        callback_list: list of str, each item indicates the callback to apply during
            training. Currently, we support using 'modelcheckpoint' for `ModelCheckpoint`
            callback, 'earlystopping' for `EarlyStopping` callback and 'swa' for `SWA`
            callback. We will automatically add `NERMetric` callback when valid_data and
            valid_labels are both provided.
        checkpoint_dir: str, directory to save ner model, must be provided when using
            `ModelCheckpoint` or `SWA` callback.
        model_name: str, prefix of ner model's weights file, must be provided when using
            `ModelCheckpoint` or `SWA` callback. For example, if checkpoint_dir is 'ckpt'
            and model_name is 'model', the weights of the ner model saved by
            `ModelCheckpoint` callback will be 'ckpt/model.hdf5' and by `SWA` callback
            will be 'ckpt/model_swa.hdf5'.
        load_swa_model: boolean, whether to load the swa model, only applies when using
            the `SWA` callback.
        **kwargs: other arguments for building ner model, such as "rnn_units", "fc_dim"
            etc.

    """
    self.preprocessor = NERPreprocessor(train_data=train_data,
                                        train_labels=train_labels,
                                        use_char=use_char,
                                        use_bert=use_bert,
                                        use_word=use_word,
                                        external_word_dict=external_word_dict,
                                        bert_vocab_file=bert_vocab_file,
                                        char_embed_type=char_embed_type,
                                        char_embed_dim=char_embed_dim,
                                        word_embed_type=word_embed_type,
                                        word_embed_dim=word_embed_dim,
                                        max_len=max_len)

    self.model = self.get_ner_model(ner_model_type=ner_model_type,
                                    num_class=self.preprocessor.num_class,
                                    use_char=use_char,
                                    char_embeddings=self.preprocessor.char_embeddings,
                                    char_vocab_size=self.preprocessor.char_vocab_size,
                                    char_embed_dim=self.preprocessor.char_embed_dim,
                                    char_embed_trainable=char_embed_trainable,
                                    use_bert=use_bert,
                                    bert_config_file=bert_config_file,
                                    bert_checkpoint_file=bert_checkpoint_file,
                                    bert_trainable=bert_trainable,
                                    use_word=use_word,
                                    word_embeddings=self.preprocessor.word_embeddings,
                                    word_vocab_size=self.preprocessor.word_vocab_size,
                                    word_embed_dim=self.preprocessor.word_embed_dim,
                                    word_embed_trainable=word_embed_trainable,
                                    max_len=self.preprocessor.max_len,
                                    use_crf=use_crf,
                                    optimizer=optimizer,
                                    **kwargs)

    if callback_list and 'swa' in callback_list:  # guard against callback_list being None
        # a second model instance is needed for the `SWA` callback to hold averaged weights
        swa_model = self.get_ner_model(ner_model_type=ner_model_type,
                                       num_class=self.preprocessor.num_class,
                                       use_char=use_char,
                                       char_embeddings=self.preprocessor.char_embeddings,
                                       char_vocab_size=self.preprocessor.char_vocab_size,
                                       char_embed_dim=self.preprocessor.char_embed_dim,
                                       char_embed_trainable=char_embed_trainable,
                                       use_bert=use_bert,
                                       bert_config_file=bert_config_file,
                                       bert_checkpoint_file=bert_checkpoint_file,
                                       bert_trainable=bert_trainable,
                                       use_word=use_word,
                                       word_embeddings=self.preprocessor.word_embeddings,
                                       word_vocab_size=self.preprocessor.word_vocab_size,
                                       word_embed_dim=self.preprocessor.word_embed_dim,
                                       word_embed_trainable=word_embed_trainable,
                                       max_len=self.preprocessor.max_len,
                                       use_crf=use_crf,
                                       optimizer=optimizer,
                                       **kwargs)
    else:
        swa_model = None

    self.trainer = NERTrainer(self.model, self.preprocessor)
    self.trainer.train_generator(train_data, train_labels, valid_data, valid_labels,
                                 batch_size, epochs, callback_list, checkpoint_dir,
                                 model_name, swa_model, load_swa_model)

    self.predictor = NERPredictor(self.model, self.preprocessor)

    if valid_data is not None and valid_labels is not None:
        logging.info('Evaluating on validation data...')
        self.score(valid_data, valid_labels)
def fit(self,
        train_data: List[List[str]],
        train_labels: List[List[str]],
        valid_data: Optional[List[List[str]]] = None,
        valid_labels: Optional[List[List[str]]] = None,
        ner_model_type: str = 'bilstm_cnn',
        use_char: bool = True,
        char_embed_type: Optional[str] = 'word2vec',
        char_embed_dim: int = 300,
        char_embed_trainable: bool = True,
        use_bert: bool = False,
        bert_vocab_file: Optional[str] = None,
        bert_config_file: Optional[str] = None,
        bert_checkpoint_file: Optional[str] = None,
        bert_trainable: bool = False,
        use_word: bool = False,
        external_word_dict: Optional[List[str]] = None,
        word_embed_type: Optional[str] = 'word2vec',
        word_embed_dim: int = 300,
        word_embed_trainable: bool = True,
        max_len: Optional[int] = None,
        use_crf: bool = True,
        optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam',
        batch_size: int = 32,
        epochs: int = 50,
        callback_list: Optional[List[str]] = None,
        checkpoint_dir: Optional[str] = None,
        model_name: Optional[str] = None,
        load_swa_model: bool = False,
        **kwargs) -> None:
    """Train ner model with provided dataset.

    We would like to make NER in Fancy-NLP more configurable, so we provide a bunch of
    arguments for users to configure:

    1. Which type of ner model to use: currently we implement 5 types of ner models:
       'bilstm', 'bilstm_cnn', 'bigru', 'bigru_cnn' and 'bert'.
    2. Which kind of input embedding to use: we support 3 kinds of embedding: char
       embedding, bert embedding and word embedding. We can choose any one of them, or
       combine any two or all of them, to use as input. Note that our ner model only
       supports char-level input, for the reason that char-level input has been shown to
       be effective for Chinese NER since it does not suffer from word-segmentation
       errors. Therefore, we should use char embedding or bert embedding or both of them
       as the main input. On that basis, we can use word embedding as auxiliary input to
       provide semantic information.
    3. Whether to use CRF: we can choose whether to add a crf layer on top of the ner
       model.
    4. How to train the model:
       a) which optimizer to use; we support any optimizer that is compatible with
          tf.keras's optimizers:
          https://www.tensorflow.org/api_docs/python/tf/keras/optimizers ;
       b) how many samples to train per batch and how many epochs to train;
       c) which callbacks to use during training; we currently support 3 kinds of
          callbacks:
          i) 'modelcheckpoint' is used to save the model with the best performance:
             https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint ;
          ii) 'earlystopping' is used to stop training when no performance gain is
              observed:
              https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping ;
          iii) 'swa' is used to apply a novel weight-averaging ensemble mechanism to the
               ner model we are training: https://arxiv.org/abs/1803.05407 .
    5. Where to save the model.

    Args:
        train_data: List of List of str. List of tokenized (in char level) texts for
            training, like ``[['我', '在', '上', '海', '上', '学'], ...]``.
        train_labels: List of List of str. The labels of train_data, usually in BIO or
            BIOES format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.
        valid_data: Optional List of List of str, can be None. List of tokenized (in char
            level) texts for evaluation.
        valid_labels: Optional List of List of str, can be None. The labels of valid_data.

        We can use the fancy_nlp.utils.load_ner_data_and_labels() function to get training
        or validation data and labels from a raw dataset in CoNLL format.

        ner_model_type: str. Which type of ner model to use, can be one of {'bilstm',
            'bilstm_cnn', 'bigru', 'bigru_cnn', 'bert'}.
        use_char: Boolean. Whether to use character embedding as input.
        char_embed_type: Optional str, can be None. The type of char embedding, can be a
            pre-trained embedding filename that is used to load pre-trained embedding, or
            an embedding training method (one of {'word2vec', 'fasttext'}) that is used to
            train character embedding on the dataset. If None, do not apply any
            pre-trained embedding, and use randomly initialized embedding instead.
        char_embed_dim: int. Dimensionality of char embedding.
        char_embed_trainable: Boolean. Whether to update char embedding during training.
        use_bert: Boolean. Whether to use bert embedding as input.
        bert_vocab_file: Optional str, can be None. Path to bert's vocabulary file.
        bert_config_file: Optional str, can be None. Path to bert's configuration file.
        bert_checkpoint_file: Optional str, can be None. Path to bert's checkpoint file.
        bert_trainable: Boolean. Whether to update bert during training.
        use_word: Boolean. Whether to use word as additional input.
        external_word_dict: Optional List of str, can be None. List of words, an external
            word dictionary that will be loaded into jieba. It can be regarded as one kind
            of gazetteer that contains a number of correct named entities, such as
            ``['南京市', '长江大桥']``.
        word_embed_dim: similar to 'char_embed_dim'.
        word_embed_type: similar to 'char_embed_type'.
        word_embed_trainable: similar to 'char_embed_trainable'.
        max_len: Optional int, can be None. Max length of one sequence. If None, we
            dynamically use the max length of each batch as max_len. However, max_len must
            be provided when using bert as input.
        use_crf: Boolean. Whether to use crf layer.
        optimizer: str or instance of `tf.keras.optimizers.Optimizer`. Which optimizer to
            use during training.
        batch_size: int. Number of samples per gradient update.
        epochs: int. Number of epochs to train the model.
        callback_list: Optional List of str or instance of `keras.callbacks.Callback`, can
            be None. Each item indicates the callback to apply during training. Currently,
            we support using 'modelcheckpoint' for `ModelCheckpoint` callback,
            'earlystopping' for `EarlyStopping` callback and 'swa' for `SWA` callback. We
            will automatically add `NERMetric` callback when valid_data and valid_labels
            are both provided.
        checkpoint_dir: Optional str, can be None. Directory to save the ner model. It
            must be provided when using `ModelCheckpoint` or `SWA` callback, since these
            callbacks need to save the ner model after training.
        model_name: Optional str, can be None. Prefix of ner model's weights file. It must
            be provided when using `ModelCheckpoint` or `SWA` callback, since these
            callbacks need to save the ner model after training. For example, if
            checkpoint_dir is 'ckpt' and model_name is 'model', the weights of the ner
            model saved by `ModelCheckpoint` callback will be 'ckpt/model.hdf5' and by
            `SWA` callback will be 'ckpt/model_swa.hdf5'.
        load_swa_model: Boolean. Whether to load the swa model, only applies when using
            the `SWA` callback. We suggest setting it to True when using the `SWA`
            callback, since the swa model performs better than the original model in most
            cases.
        **kwargs: Other arguments for building ner model, such as "rnn_units", "fc_dim"
            etc. See models.ner_models.py for these arguments.

    """
    # whether to use the traditional bert model for prediction
    use_bert_model = ner_model_type == 'bert'

    # add assertions for checking input
    assert not (use_bert_model and use_word), \
        'when using bert model, `use_word` must be False'
    assert not (use_bert_model and use_char), \
        'when using bert model, `use_char` must be False'
    assert not (use_bert_model and not use_bert), \
        'when using bert model, `use_bert` must be True'

    self.preprocessor = NERPreprocessor(
        train_data=train_data,
        train_labels=train_labels,
        use_char=use_char,
        use_bert=use_bert,
        use_word=use_word,
        external_word_dict=external_word_dict,
        bert_vocab_file=bert_vocab_file,
        char_embed_type=char_embed_type,
        char_embed_dim=char_embed_dim,
        word_embed_type=word_embed_type,
        word_embed_dim=word_embed_dim,
        max_len=max_len)

    self.model = self.get_ner_model(
        ner_model_type=ner_model_type,
        num_class=self.preprocessor.num_class,
        use_char=use_char,
        char_embeddings=self.preprocessor.char_embeddings,
        char_vocab_size=self.preprocessor.char_vocab_size,
        char_embed_dim=self.preprocessor.char_embed_dim,
        char_embed_trainable=char_embed_trainable,
        use_bert=use_bert,
        bert_config_file=bert_config_file,
        bert_checkpoint_file=bert_checkpoint_file,
        bert_trainable=bert_trainable,
        use_word=use_word,
        word_embeddings=self.preprocessor.word_embeddings,
        word_vocab_size=self.preprocessor.word_vocab_size,
        word_embed_dim=self.preprocessor.word_embed_dim,
        word_embed_trainable=word_embed_trainable,
        max_len=self.preprocessor.max_len,
        use_crf=use_crf,
        optimizer=optimizer,
        **kwargs)

    if callback_list and 'swa' in callback_list:  # guard against callback_list being None
        # initialize a second swa model when using the `SWA` callback
        swa_model = self.get_ner_model(
            ner_model_type=ner_model_type,
            num_class=self.preprocessor.num_class,
            use_char=use_char,
            char_embeddings=self.preprocessor.char_embeddings,
            char_vocab_size=self.preprocessor.char_vocab_size,
            char_embed_dim=self.preprocessor.char_embed_dim,
            char_embed_trainable=char_embed_trainable,
            use_bert=use_bert,
            bert_config_file=bert_config_file,
            bert_checkpoint_file=bert_checkpoint_file,
            bert_trainable=bert_trainable,
            use_word=use_word,
            word_embeddings=self.preprocessor.word_embeddings,
            word_vocab_size=self.preprocessor.word_vocab_size,
            word_embed_dim=self.preprocessor.word_embed_dim,
            word_embed_trainable=word_embed_trainable,
            max_len=self.preprocessor.max_len,
            use_crf=use_crf,
            optimizer=optimizer,
            **kwargs)
    else:
        swa_model = None

    self.trainer = NERTrainer(self.model, self.preprocessor)
    self.trainer.train_generator(train_data, train_labels, valid_data, valid_labels,
                                 batch_size, epochs, callback_list, checkpoint_dir,
                                 model_name, swa_model, load_swa_model)

    self.predictor = NERPredictor(self.model, self.preprocessor)

    if valid_data is not None and valid_labels is not None:
        logging.info('Evaluating on validation data...')
        self.score(valid_data, valid_labels)