def test_forward_works_with_mask(numpy_crf):
    """Compare the masked CRF training loss with the NumPy reference.

    A CRF layer with fixed chain and boundary weights is trained on a
    single batch; the loss reported by ``train_on_batch`` must match the
    reference log-likelihood divided by ``-2``.
    """
    # Two sequences, three time steps, five tags.
    emissions = np.array([
        [[0, 0, .5, .5, .2], [0, 0, .3, .3, .1], [0, 0, .9, 10, 1]],
        [[0, 0, .2, .5, .2], [0, 0, 3, .3, .1], [0, 0, .9, 1, 1]],
    ])
    gold_tags = np.array([
        [2, 3, 4],
        [3, 2, 2],
    ])
    # Non-trivial mask: the last step of the second sequence is padding.
    step_mask = np.array([
        [1, 1, 1],
        [1, 1, 0],
    ])

    chain_weights = np.array([
        [0.1, 0.2, 0.3, 0.4, 0.5],
        [0.8, 0.3, 0.1, 0.7, 0.9],
        [-0.3, 2.1, -5.6, 3.4, 4.0],
        [0.2, 0.4, 0.6, -0.3, -0.4],
        [1.0, 1.0, 1.0, 1.0, 1.0],
    ])
    boundary_weights = np.array([0.1, 0.2, 0.3, 0.4, 0.6])

    # CRF layer with fixed transitions; the kernel transform is disabled so
    # the emissions reach the CRF unchanged.
    crf_layer = CRF(
        units=5,
        use_kernel=False,
        chain_initializer=initializers.Constant(chain_weights),
        use_boundary=True,
        boundary_initializer=initializers.Constant(boundary_weights),
        name="crf_layer",
    )
    loss = ConditionalRandomFieldLoss()

    model = Sequential()
    model.add(layers.Input(shape=(3, 5)))
    model.add(MockMasking(mask_shape=(2, 3), mask_value=step_mask))
    model.add(crf_layer)
    model.compile('adam', loss={"crf_layer": loss})

    observed = model.train_on_batch(emissions, gold_tags)

    # The same boundary vector is passed twice to the reference
    # implementation (fourth and fifth arguments).
    reference = numpy_crf(
        emissions, step_mask, chain_weights, boundary_weights, boundary_weights
    )
    wanted = reference.compute_log_likehood(gold_tags) / -2

    assert observed == approx(wanted)
def create_bilstm_crf(vocab_size, EMBED_DIM, BiRNN_UNITS, tags_size):
    """Build an uncompiled Embedding -> BiLSTM -> CRF sequence model.

    Args:
        vocab_size: Input dimension of the embedding layer.
        EMBED_DIM: Output dimension of the embedding layer.
        BiRNN_UNITS: Total width of the bidirectional LSTM; each direction
            gets ``BiRNN_UNITS // 2`` units.
        tags_size: Number of CRF units.

    Returns:
        The ``Sequential`` model. It is NOT compiled; the caller must call
        ``model.compile`` with a CRF loss before training.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, EMBED_DIM, mask_zero=True))
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    model.add(CRF(tags_size, sparse_target=True, name="crf_layer"))

    # `summary()` prints by itself and returns None, so it is called once
    # and not wrapped in `print` (which would emit a stray "None").
    model.summary()

    return model
def test_masking_fixed_length(get_random_data):
    """Train a masked Embedding -> CRF model, check padding, then save/load it.

    Covers fitting with several batch sizes, that predictions at the padded
    positions are 0, and a save / ``load_model`` round trip.
    """
    nb_samples = 2
    timesteps = 10
    embedding_dim = 4
    output_dim = 5
    embedding_num = 12

    crf_loss_instance = ConditionalRandomFieldLoss()

    x, y = get_random_data(nb_samples, timesteps,
                           x_high=embedding_num, y_high=output_dim)
    # right padding; left padding is not supported due to the tf.contrib.crf
    x[0, -4:] = 0

    # test with masking, fix length
    model = Sequential()
    model.add(
        Embedding(embedding_num, embedding_dim, input_length=timesteps,
                  mask_zero=True))
    model.add(CRF(output_dim, name="crf_layer"))
    model.compile(optimizer='adam', loss={"crf_layer": crf_loss_instance})

    model.fit(x, y, epochs=1, batch_size=1)
    model.fit(x, y, epochs=1, batch_size=2)
    model.fit(x, y, epochs=1, batch_size=3)
    model.fit(x, y, epochs=1)

    # check mask
    y_pred = model.predict(x)
    assert (y_pred[0, -4:] == 0).all()  # right padding
    # left padding not working currently due to the tf.contrib.crf.*
    # assert (y_pred[1, :5] == 0).all()

    # test saving and loading model
    MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5'
    try:
        model.save(MODEL_PERSISTENCE_PATH)
        load_model(MODEL_PERSISTENCE_PATH, custom_objects={'CRF': CRF})
    finally:
        # Remove the artifact even when save/load raises, so a failing run
        # does not leave a stale file behind. Removal stays best-effort.
        try:
            os.remove(MODEL_PERSISTENCE_PATH)
        except OSError:
            pass
def test_viterbi_tags(numpy_crf):
    """Compare the tags predicted by the CRF layer with the NumPy decoder.

    The layer uses fixed chain and boundary weights and a mask that hides
    the last step of the second sequence; ``model.predict`` must equal the
    first element returned by the reference ``decode()``.
    """
    # Two sequences, three time steps, five tags.
    emissions = np.array([
        [[0, 0, .5, .5, .2], [0, 0, .3, .3, .1], [0, 0, .9, 10, 1]],
        [[0, 0, .2, .5, .2], [0, 0, 3, .3, .1], [0, 0, .9, 1, 1]],
    ])
    step_mask = np.array([
        [1, 1, 1],
        [1, 1, 0],
    ])

    chain_weights = np.array([
        [0.1, 0.2, 0.3, 0.4, 0.5],
        [0.8, 0.3, 0.1, 0.7, 0.9],
        [-0.3, 2.1, -5.6, 3.4, 4.0],
        [0.2, 0.4, 0.6, -0.3, -0.4],
        [1.0, 1.0, 1.0, 1.0, 1.0],
    ])
    boundary_weights = np.array([0.1, 0.2, 0.3, 0.4, 0.6])

    # CRF layer with fixed transitions; kernel transform disabled.
    crf_layer = CRF(
        units=5,
        use_kernel=False,
        chain_initializer=initializers.Constant(chain_weights),
        use_boundary=True,
        boundary_initializer=initializers.Constant(boundary_weights),
        name="crf_layer",
    )
    loss = ConditionalRandomFieldLoss()

    model = Sequential()
    model.add(layers.Input(shape=(3, 5)))
    model.add(MockMasking(mask_shape=(2, 3), mask_value=step_mask))
    model.add(crf_layer)
    model.compile('adam', loss={"crf_layer": loss})

    predicted = model.predict(emissions)

    # The same boundary vector is passed twice to the reference
    # implementation (fourth and fifth arguments).
    reference = numpy_crf(
        emissions, step_mask, chain_weights, boundary_weights, boundary_weights
    )
    # decode() returns a pair; only its first element (the tags) is compared.
    wanted, _ = reference.decode()

    np.testing.assert_equal(predicted, wanted)
def test_masking_fixed_length(get_random_data):
    """Train an unmasked Embedding -> CRF model, then save and reload it.

    Covers fitting with several batch sizes and a save / ``load_model``
    round trip with the custom objects registered.
    """
    nb_samples = 2
    timesteps = 10
    embedding_dim = 4
    output_dim = 5
    embedding_num = 12

    crf_loss_instance = ConditionalRandomFieldLoss()

    x, y = get_random_data(nb_samples, timesteps,
                           x_high=embedding_num, y_high=output_dim)

    # test with no masking, fix length
    model = Sequential()
    model.add(Embedding(embedding_num, embedding_dim, input_length=timesteps))
    model.add(CRF(output_dim, name="crf_layer"))
    model.compile(optimizer='adam', loss={"crf_layer": crf_loss_instance})

    model.fit(x, y, epochs=1, batch_size=1)
    model.fit(x, y, epochs=1, batch_size=2)
    model.fit(x, y, epochs=1, batch_size=3)
    model.fit(x, y, epochs=1)

    # test saving and loading model
    MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5'
    try:
        model.save(MODEL_PERSISTENCE_PATH)
        load_model(MODEL_PERSISTENCE_PATH, custom_objects={
            'CRF': CRF,
            'crf_loss': crf_loss
        })
    finally:
        # Remove the artifact even when save/load raises, so a failing run
        # does not leave a stale file behind. Removal stays best-effort.
        try:
            os.remove(MODEL_PERSISTENCE_PATH)
        except OSError:
            pass
def test_masking_with_boundary(get_random_data):
    """Smoke-test fitting a masked Embedding -> CRF model with boundaries.

    The test passes when all four ``fit`` calls complete; nothing is
    asserted about the resulting weights or predictions.
    """
    nb_samples, timesteps = 2, 10
    embedding_num, embedding_dim, output_dim = 12, 4, 5

    crf_loss_instance = ConditionalRandomFieldLoss()

    x, y = get_random_data(
        nb_samples, timesteps, x_high=embedding_num, y_high=output_dim
    )
    # Right padding only; left padding is not supported due to the
    # tf.contrib.crf.
    x[0, -4:] = 0

    # Masked, fixed-length model with boundary energies enabled.
    model = Sequential()
    model.add(
        Embedding(
            embedding_num, embedding_dim,
            input_length=timesteps, mask_zero=True,
        )
    )
    model.add(CRF(output_dim, use_boundary=True, name="crf_layer"))
    model.compile(optimizer='adam', loss={"crf_layer": crf_loss_instance})

    # Batch sizes below, equal to and above the sample count ...
    for batch_size in (1, 2, 3):
        model.fit(x, y, epochs=1, batch_size=batch_size)
    # ... and finally the default batch size.
    model.fit(x, y, epochs=1)
def test_crf_viterbi_accuracy(get_random_data):
    """Check ``crf_viterbi_accuracy`` against an accuracy computed in NumPy.

    The metric reported by ``model.evaluate`` must agree (within 1e-4) with
    the mean tag accuracy over positions whose input id is greater than 0.
    """
    nb_samples, timesteps = 2, 10
    embedding_num, embedding_dim, output_dim = 12, 4, 5

    crf_loss_instance = ConditionalRandomFieldLoss()

    x, y = get_random_data(
        nb_samples, timesteps, x_high=embedding_num, y_high=output_dim
    )
    # Right padding only; left padding is not supported due to the
    # tf.contrib.crf.
    x[0, -4:] = 0

    # Masked, fixed-length model.
    model = Sequential()
    model.add(
        Embedding(
            embedding_num, embedding_dim,
            input_length=timesteps, mask_zero=True,
        )
    )
    model.add(CRF(output_dim, name="crf_layer"))
    model.compile(
        optimizer='rmsprop',
        loss={"crf_layer": crf_loss_instance},
        metrics=[crf_viterbi_accuracy],
    )
    model.fit(x, y, epochs=1, batch_size=10)

    # Metric as computed by Keras.
    y_pred = model.predict(x)
    _, v_acc = model.evaluate(x, y)

    # Same quantity computed by hand over the non-padded positions.
    non_padded = x > 0
    hits = y_pred[non_padded] == y[non_padded]
    np_acc = hits.astype('float32').mean()

    print(v_acc, np_acc)
    assert np.abs(v_acc - np_acc) < 1e-4
def main():
    """Train an Embedding -> (BiLSTM stack) -> CRF tagger and export it.

    Everything is driven by the mapping returned by ``read_configure()``.
    Keys read here: ``vocabulary_file``, ``max_sentence_len`` (default 25),
    ``epochs``, ``batch_size``, ``embedding_dim``, ``use_attention_layer``,
    ``bilstm_stack_config``, ``use_batch_normalization_after_embedding``,
    ``use_batch_normalization_after_bilstm``, ``crf_params``,
    ``optimizer_params``, ``summary_log_dir``, ``model_dir``,
    ``h5_model_file``, ``saved_model_dir`` and ``deliverable_model_dir``.

    Side effects: writes TensorBoard logs, per-epoch checkpoints, an HDF5
    model, a SavedModel export and a deliverable model package.
    """
    config = read_configure()

    # ioflow
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    tags_data = generate_tagset(corpus_meta_data["tags"])  # process entity into BIO

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    tag_lookuper = Lookuper({v: i for i, v in enumerate(tags_data)})  # tag index

    vocab_data_file = config.get("vocabulary_file")
    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocess(data, maxlen):
        """Turn offset-annotated samples into right-padded id matrices.

        Returns ``(x, y)``: word ids and tag ids, both padded (value 0) or
        truncated to ``maxlen``. When ``maxlen`` is None the longest input
        sequence defines it.
        """
        raw_x = []
        raw_y = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            words = offset_data.text

            raw_x.append([vocabulary_lookuper.lookup(i) for i in words])
            raw_y.append([tag_lookuper.lookup(i) for i in tags])

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post"
        )  # right padding

        # Tags are padded with 0 as well; the embedding uses mask_zero=True,
        # so the padded positions are masked.
        y = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y, maxlen, value=0, padding="post"
        )

        return x, y

    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    train_x, train_y = preprocess(train_data, MAX_SENTENCE_LEN)
    test_x, test_y = preprocess(eval_data, MAX_SENTENCE_LEN)

    EPOCHS = config["epochs"]
    BATCH_SIZE = config["batch_size"]
    EMBED_DIM = config["embedding_dim"]
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False
    )
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False
    )
    CRF_PARAMS = config.get("crf_params", {})
    OPTIMIZER_PARAMS = config.get("optimizer_params", {})

    vocab_size = vocabulary_lookuper.size()
    tag_size = tag_lookuper.size()

    model = Sequential()

    model.add(
        Embedding(vocab_size, EMBED_DIM, embeddings_initializer='glorot_normal',
                  mask_zero=True, input_length=MAX_SENTENCE_LEN)
    )

    if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
        model.add(BatchNormalization())

    # One Bidirectional(LSTM) layer per entry; each entry holds LSTM kwargs.
    for bilstm_config in BiLSTM_STACK_CONFIG:
        model.add(Bidirectional(LSTM(return_sequences=True, **bilstm_config)))

    if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
        model.add(BatchNormalization())

    if USE_ATTENTION_LAYER:
        model.add(GlobalAttentionLayer())

    model.add(CRF(tag_size, name="crf", **CRF_PARAMS))

    # print model summary
    model.summary()

    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=create_dir_if_needed(config["summary_log_dir"])
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    metrics_list = [SequenceCorrectness(), SequenceSpanAccuracy()]

    loss_func = ConditionalRandomFieldLoss()

    optimizer = optimizers.Adam(**OPTIMIZER_PARAMS)

    model.compile(optimizer=optimizer, loss={"crf": loss_func}, metrics=metrics_list)
    model.fit(
        train_x,
        train_y,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        # Keras documents validation_data as a tuple (x_val, y_val); a list
        # is not handled uniformly across versions.
        validation_data=(test_x, test_y),
        callbacks=callbacks_list,
    )

    # Save the model
    model.save(create_file_dir_if_needed(config["h5_model_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_dir_if_needed(config["saved_model_dir"])
    )

    export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        vocabulary_lookup_table=vocabulary_lookuper,
        tag_lookup_table=tag_lookuper,
        padding_parameter={"maxlen": MAX_SENTENCE_LEN, "value": 0, "padding": "post"},
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
def main():
    """Train a joint NER (CRF) + sentence-classification model and export it.

    A shared Embedding -> (BiLSTM stack) encoder feeds two heads: a CRF
    layer named ``crf`` for tagging and a dense sigmoid layer named ``CLS``
    for classification. Settings come from ``read_configure()``. Keys read
    here: ``vocabulary_file``, ``epochs``, ``batch_size``,
    ``learning_rate``, ``max_sentence_len``, ``embedding_dim``,
    ``use_attention_layer``, ``bilstm_stack_config``,
    ``use_batch_normalization_after_embedding``,
    ``use_batch_normalization_after_bilstm``, ``crf_params``,
    ``model_dir``, ``h5_model_file``, ``h5_weights_file``,
    ``saved_model_dir`` and ``deliverable_model_dir``.

    Side effects: writes TensorBoard logs, per-epoch checkpoints, an HDF5
    model and weights file, a SavedModel export and a deliverable package.
    """
    # get configure
    config = read_configure()

    # get train/test corpus
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    # build the tag / label / vocabulary lookup tables
    ner_tags_data = generate_tagset(corpus_meta_data["tags"])
    cls_tags_data = corpus_meta_data["labels"]

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    ner_tag_lookuper = Lookuper({v: i for i, v in enumerate(ner_tags_data)})
    cls_tag_lookuper = Lookuper({v: i for i, v in enumerate(cls_tags_data)})

    vocab_data_file = config.get("vocabulary_file")

    if not vocab_data_file:
        # load built in vocabulary file
        vocab_data_file = os.path.join(
            os.path.dirname(__file__), "../data/unicode_char_list.txt"
        )

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocess(data, maxlen, **kwargs):
        """Convert samples to padded id matrices plus one-hot class labels.

        Returns ``(x, y_ner, y_cls)``. ``x`` and ``y_ner`` are right-padded
        with 0 (or truncated) to ``maxlen``; ``y_cls`` is one-hot with
        ``kwargs['cls_dims']`` columns (default 81).
        """
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            label = offset_data.label
            words = offset_data.text

            raw_x.append([vocabulary_lookuper.lookup(i) for i in words])
            raw_y_ner.append([ner_tag_lookuper.lookup(i) for i in tags])
            raw_y_cls.append(cls_tag_lookuper.lookup(label))

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post"
        )  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post"
        )

        # Use the tf.keras utility so the standalone `keras` package is not
        # mixed into a tf.keras model (and need not be installed).
        y_cls = np.array(raw_y_cls)[:, np.newaxis]
        y_cls = tf.keras.utils.to_categorical(y_cls, kwargs.get('cls_dims', 81))

        return x, y_ner, y_cls

    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    BATCHSIZE = config.get("batch_size", 32)
    LEARNINGRATE = config.get("learning_rate", 0.001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    # get Parameters (model structure)
    EMBED_DIM = config.get("embedding_dim", 300)
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False)
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False)
    CRF_PARAMS = config.get("crf_params", {})

    # get train/test data for training model
    vocab_size = vocabulary_lookuper.size()
    tag_size = ner_tag_lookuper.size()
    label_size = cls_tag_lookuper.size()

    train_x, train_y_ner, train_y_cls = preprocess(
        train_data, MAX_SENTENCE_LEN, cls_dims=label_size)
    test_x, test_y_ner, test_y_cls = preprocess(
        eval_data, MAX_SENTENCE_LEN, cls_dims=label_size)

    # build model
    input_length = MAX_SENTENCE_LEN

    input_layer = Input(shape=(input_length,), dtype='float', name='input_layer')

    # encoder
    with tf.keras.backend.name_scope("Encoder"):
        embedding_layer = Embedding(vocab_size,
                                    EMBED_DIM,
                                    mask_zero=True,
                                    input_length=input_length,
                                    name='embedding')(input_layer)

    # feature extractor
    with tf.keras.backend.name_scope("biLSTM"):
        if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
            embedding_layer = BatchNormalization()(embedding_layer)

        biLSTM = embedding_layer
        # One Bidirectional(LSTM) per entry; each entry holds LSTM kwargs.
        for bilstm_config in BiLSTM_STACK_CONFIG:
            biLSTM = Bidirectional(
                LSTM(return_sequences=True, **bilstm_config, name='biLSTM')
            )(biLSTM)

        if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
            biLSTM = BatchNormalization()(biLSTM)

        if USE_ATTENTION_LAYER:
            biLSTM = GlobalAttentionLayer()(biLSTM)

    # NER branch
    with tf.keras.backend.name_scope("NER_branch"):
        crf = CRF(tag_size, name="crf", **CRF_PARAMS)(biLSTM)
        loss_func = ConditionalRandomFieldLoss()

    # classification branch
    # NOTE(review): the head is hard-coded to 'lstm_cls'; the 'conv_cls'
    # branch below is currently unreachable.
    chosen = 'lstm_cls'
    with tf.keras.backend.name_scope("CLS_branch"):
        from tensorflow.keras.layers import Dense, Flatten, Dropout

        if chosen == "lstm_cls":
            cls_flat_lstm = Flatten()(biLSTM)
            classification_dense = Dropout(0.2)(cls_flat_lstm)
            classification_dense = SetLearningRate(
                Dense(label_size, activation='sigmoid', name='CLS'),
                lr=0.001, is_ada=True)(classification_dense)

        elif chosen == "conv_cls":
            from tensorflow.keras.layers import Conv1D, MaxPooling1D
            embedding_layer = BatchNormalization()(embedding_layer)
            cls_conv_emb = Conv1D(32, 3, activation='relu', padding='same')(embedding_layer)
            cls_conv_emb = Conv1D(64, 3, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=1, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=2, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=5, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(256, 1, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)

            cls_flat = BatchNormalization()(cls_conv_emb)
            cls_flat = Flatten()(cls_flat)
            classification_dense = Dropout(0.2)(cls_flat)
            classification_dense = Dense(label_size, activation='sigmoid', name='CLS')(classification_dense)

    # merge NER and Classification
    model = Model(inputs=[input_layer], outputs=[crf, classification_dense])

    model.summary()

    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        # Built with os.path.join so the log directory is also valid on
        # non-Windows systems (was the literal '.\\results\\summary_log_dir').
        log_dir=os.path.join(".", "results", "summary_log_dir"),
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    from mtnlpmodel.trainer.loss_func_util import FocalLoss
    adam_optimizer = tf.keras.optimizers.Adam(
        learning_rate=LEARNINGRATE, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer=adam_optimizer,
                  loss={'crf': loss_func, 'CLS': FocalLoss()},
                  loss_weights={'crf': 1., 'CLS': 100},  # set weight of loss
                  metrics={'crf': SequenceCorrectness(), 'CLS': 'categorical_accuracy'})

    model.fit(
        train_x,
        {'crf': train_y_ner, 'CLS': train_y_cls},
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        # Keras documents validation_data as a tuple (x_val, y_val); a list
        # is not handled uniformly across versions.
        validation_data=(test_x, {'crf': test_y_ner, 'CLS': test_y_cls}),
        callbacks=callbacks_list,
    )

    model.save(create_file_dir_if_needed(config["h5_model_file"]))
    model.save_weights(create_file_dir_if_needed(config["h5_weights_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_or_rm_dir_if_needed(config["saved_model_dir"])
    )

    mt_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        converter_for_request=ConverterForRequest(),
        converter_for_response=ConverterForMTResponse(),
        lookup_tables={'vocab_lookup': vocabulary_lookuper,
                       'tag_lookup': ner_tag_lookuper,
                       'label_lookup': cls_tag_lookuper},
        padding_parameter={"maxlen": MAX_SENTENCE_LEN, "value": 0, "padding": "post"},
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
callbacks_list.append(tensorboard_callback) checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"), load_weights_on_restart=True, verbose=1, ) callbacks_list.append(checkpoint_callback) metrics_list = [] metrics_list.append(crf_accuracy) metrics_list.append(SequenceCorrectness()) metrics_list.append(sequence_span_accuracy) loss_func = ConditionalRandomFieldLoss() # loss_func = crf_loss model.compile("adam", loss={"crf": loss_func}, metrics=metrics_list) # model.compile("nadam", loss={"crf": loss_func}, metrics=metrics_list) model.fit( train_x, train_y, epochs=EPOCHS, validation_data=[test_x, test_y], callbacks=callbacks_list, ) # Save the model model.save(create_file_dir_if_needed(config["h5_model_file"]))
def main():
    """Train and evaluate a CRF and a BiLSTM-CRF chunker on CoNLL-2000.

    Both models are fit for ``EPOCHS`` epochs (module-level setting, as are
    ``EMBED_DIM`` and ``BiRNN_UNITS``) and evaluated on the test split,
    restricted to positions whose input id is greater than 0.
    """
    # ------
    # Data
    # -----

    # conll2000 has two different targets; only the IOB-like chunking
    # target is used here as an example.
    train, test, voc = conll2000.load_data()
    (train_x, _, train_y) = train
    (test_x, _, test_y) = test
    (vocab, _, class_labels) = voc

    # --------------
    # 1. Regular CRF
    # --------------

    print('==== training CRF ====')
    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
    crf = CRF(len(class_labels), name="crf_layer")
    model.add(crf)

    crf_loss_instance = ConditionalRandomFieldLoss()

    model.compile('adam', loss={"crf_layer": crf_loss_instance},
                  metrics=[SequenceSpanAccuracy()])
    # Keras documents validation_data as a tuple (x_val, y_val); a list is
    # not handled uniformly across versions.
    model.fit(train_x, train_y, epochs=EPOCHS, validation_data=(test_x, test_y))

    # Only non-padded positions (input id > 0) are scored.
    test_y_pred = model.predict(test_x)[test_x > 0]
    test_y_true = test_y[test_x > 0]

    print('\n---- Result of CRF ----\n')
    classification_report(test_y_true, test_y_pred, class_labels)

    # -------------
    # 2. BiLSTM-CRF
    # -------------

    print('==== training BiLSTM-CRF ====')
    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(class_labels), name="crf_layer")
    model.add(crf)

    crf_loss_instance = ConditionalRandomFieldLoss()

    model.compile('adam', loss={"crf_layer": crf_loss_instance},
                  metrics=[SequenceSpanAccuracy()])
    model.fit(train_x, train_y, epochs=EPOCHS, validation_data=(test_x, test_y))

    predict_result = model.predict(test_x)
    test_y_pred = predict_result[test_x > 0]
    test_y_true = test_y[test_x > 0]

    print('\n---- Result of BiLSTM-CRF ----\n')
    classification_report(test_y_true, test_y_pred, class_labels)
def test_crf_config(get_random_data):
    """Check that ``model.get_config()`` matches the expected full config.

    A masked Embedding -> CRF model is built, compiled and fit for one
    epoch; its serialised configuration is then compared for equality with
    a literal description of both layers.
    """
    nb_samples, timesteps = 2, 10
    embedding_num, embedding_dim, output_dim = 12, 4, 5

    x, y = get_random_data(
        nb_samples, timesteps, x_high=embedding_num, y_high=output_dim
    )
    # Right padding only; left padding is not supported due to the
    # tf.contrib.crf.
    x[0, -4:] = 0

    crf_loss_instance = ConditionalRandomFieldLoss()

    # Masked, fixed-length model.
    model = Sequential()
    model.add(
        Embedding(embedding_num, embedding_dim, input_length=timesteps, mask_zero=True)
    )
    model.add(CRF(output_dim, name="crf_layer"))
    model.compile(optimizer="rmsprop", loss={"crf_layer": crf_loss_instance})
    model.fit(x, y, epochs=1, batch_size=10)

    result = model.get_config()

    # Expected serialised form of the Embedding layer.
    embedding_entry = {
        "class_name": "Embedding",
        "config": {
            "name": "embedding",
            "trainable": True,
            "batch_input_shape": (None, 10),
            "dtype": "float32",
            "input_dim": 12,
            "output_dim": 4,
            "embeddings_initializer": {
                "class_name": "RandomUniform",
                "config": {
                    "minval": -0.05,
                    "maxval": 0.05,
                    "seed": None,
                    "dtype": "float32",
                },
            },
            "embeddings_regularizer": None,
            "activity_regularizer": None,
            "embeddings_constraint": None,
            "mask_zero": True,
            "input_length": 10,
        },
    }

    # Expected serialised form of the CRF layer.
    zeros_initializer = {
        "class_name": "Zeros",
        "config": {"dtype": "float32"},
    }
    crf_entry = {
        "class_name": "CRF",
        "config": {
            "name": "crf_layer",
            "trainable": True,
            "dtype": "float32",
            "units": 5,
            "use_boundary": True,
            "use_bias": True,
            "use_kernel": True,
            "kernel_initializer": {
                "class_name": "GlorotUniform",
                "config": {"seed": None, "dtype": "float32"},
            },
            "chain_initializer": {
                "class_name": "Orthogonal",
                "config": {"gain": 1.0, "seed": None, "dtype": "float32"},
            },
            "boundary_initializer": dict(zeros_initializer),
            "bias_initializer": dict(zeros_initializer),
            "activation": "linear",
            "kernel_regularizer": None,
            "chain_regularizer": None,
            "boundary_regularizer": None,
            "bias_regularizer": None,
            "kernel_constraint": None,
            "chain_constraint": None,
            "boundary_constraint": None,
            "bias_constraint": None,
        },
    }

    expected = {
        "name": "sequential",
        "layers": [embedding_entry, crf_entry],
    }

    assert result == expected
def test_masked_viterbi_decode():
    """Check Viterbi decoding with a right-padding mask.

    All chain transitions are 1, so the emissions alone decide the tags of
    the unmasked steps. The expected output has 0 at the masked (last)
    step of both sequences.
    """
    transitions = np.ones([5, 5])

    logits = np.array([
        [
            # O   B-X  I-X  B-Y  I-Y
            [0., 1., 0., 0., 0.],
            [0., 0., 1., 0., 0.],
            [0., 0., 1., 0., 0.]
        ],
        [
            # O   B-X  I-X  B-Y  I-Y
            [0., 1., 0., 0., 0.],
            [0., 1., 0., 0., 0.],
            [0., 1., 0., 0., 0.]
        ]
    ])

    # TODO: this test case covers right-padding masks only, because the
    # underlying crf function only supports sequence lengths.
    mask = np.array([
        [1, 1, 0],
        [1, 1, 0]
    ])

    crf = CRF(
        units=5,
        use_kernel=False,  # disable kernel transform
        chain_initializer=initializers.Constant(transitions),
        use_boundary=True,
        name="crf_layer"
    )

    crf_loss_instance = ConditionalRandomFieldLoss()

    model = Sequential()
    model.add(layers.Input(shape=(3, 5)))
    model.add(MockMasking(mask_shape=(2, 3), mask_value=mask))
    model.add(crf)
    model.compile('adam', loss={"crf_layer": crf_loss_instance})

    result = model.predict(logits)

    expected = [
        [1, 2, 0],  # B-X  I-X  NA
        [1, 1, 0]   # B-X  B-X  NA
    ]

    # assert_equal raises on mismatch and returns None; wrapping it in
    # `assert` would fail the test even when the arrays are equal.
    np.testing.assert_equal(result, expected)
def test_unmasked_constrained_viterbi_tags(self):
    """Check constrained Viterbi decoding without a mask.

    Chain transitions are all 1, so the transition constraints are what
    separate the expected tags from the raw emission argmax.
    """
    # TODO: using BILUO tag scheme instead of BIO.
    #       So that, transition from tags to end can be tested.

    # Rows are the "from" tag, columns the "to" tag; 1 marks an allowed
    # transition. Indices 5 and 6 are the virtual start and end tags.
    raw_constraints = np.array([
        # O  B-X I-X B-Y I-Y start end
        [1, 1, 0, 1, 0, 0, 1],  # O
        [1, 1, 1, 1, 0, 0, 1],  # B-X
        [1, 1, 1, 1, 0, 0, 1],  # I-X
        [1, 1, 0, 1, 1, 0, 1],  # B-Y
        [1, 1, 0, 1, 1, 0, 1],  # I-Y
        [1, 1, 0, 1, 0, 0, 0],  # start
        [0, 0, 0, 0, 0, 0, 0],  # end
    ])

    # List of [from, to] index pairs for every allowed transition.
    constraints = np.argwhere(raw_constraints > 0).tolist()

    transitions = np.ones([5, 5])

    # NOTE(review): these two locals are not used below; the boundary
    # weights are initialised from self.transitions_from_start /
    # self.transitions_to_end. Confirm which source is intended.
    transitions_from_start = np.ones(5)
    transitions_to_end = np.ones(5)

    logits = np.array([
        [   # constraint transition from start to tags
            # O   B-X  I-X  B-Y  I-Y
            [0., .1, 1., 0., 0.],
            [0., 0., 1., 0., 0.],
            [0., 0., 1., 0., 0.]
        ],
        [   # constraint transition from tags to tags
            # O   B-X  I-X  B-Y  I-Y
            [0., 1., 0., 0., 0.],
            [0., 0., .1, 1., 0.],
            [0., 0., 1., 0., 0.]
        ]
    ])

    crf = CRF(
        units=5,
        use_kernel=False,  # disable kernel transform
        chain_initializer=initializers.Constant(transitions),
        use_boundary=True,
        transition_constraint=constraints,
        name="crf_layer"
    )
    # Boundary weights are attached by hand instead of through the
    # constructor's boundary initializers.
    crf.left_boundary = crf.add_weight(
        shape=(5,),
        name="left_boundary",
        initializer=initializers.Constant(self.transitions_from_start),
    )
    crf.right_boundary = crf.add_weight(
        shape=(5,),
        name="right_boundary",
        initializer=initializers.Constant(self.transitions_to_end),
    )

    crf_loss_instance = ConditionalRandomFieldLoss()

    model = Sequential()
    model.add(layers.Input(shape=(3, 5)))
    model.add(crf)
    model.compile('adam', loss={"crf_layer": crf_loss_instance})

    # Dump layer configuration and weights to ease debugging of failures.
    for layer in model.layers:
        print(layer.get_config())
        print(dict(zip(layer.weights, layer.get_weights())))

    viterbi_tags = model.predict(logits)

    # Now the tags should respect the constraints
    expected_tags = [
        [1, 2, 2],  # B-X  I-X  I-X
        [1, 2, 2]   # B-X  I-X  I-X
    ]

    # assert_equal raises on mismatch and returns None; wrapping it in
    # `assert` would fail the test even when the arrays are equal.
    np.testing.assert_equal(viterbi_tags, expected_tags)
def test_constrained_viterbi_tags(self):
    """Check constrained Viterbi decoding with a mask.

    Reads ``self.transitions``, ``self.transitions_from_start``,
    ``self.transitions_to_end`` and ``self.logits``, which are expected to
    be set on the test instance elsewhere.
    """
    # Allowed (from, to) tag transitions.
    constraints = {(0, 0), (0, 1),
                   (1, 1), (1, 2),
                   (2, 2), (2, 3),
                   (3, 3), (3, 4),
                   (4, 4), (4, 0)}

    # Add the transitions to the end tag (index 6)
    # and from the start tag (index 5).
    for i in range(5):
        constraints.add((5, i))
        constraints.add((i, 6))

    mask = np.array([
        [1, 1, 1],
        [1, 1, 0]
    ])

    crf = CRF(
        units=5,
        use_kernel=False,  # disable kernel transform
        chain_initializer=initializers.Constant(self.transitions),
        use_boundary=True,
        transition_constraint=constraints,
        name="crf_layer"
    )
    # Boundary weights are attached by hand instead of through the
    # constructor's boundary initializers.
    crf.left_boundary = crf.add_weight(
        shape=(5,),
        name="left_boundary",
        initializer=initializers.Constant(self.transitions_from_start),
    )
    crf.right_boundary = crf.add_weight(
        shape=(5,),
        name="right_boundary",
        initializer=initializers.Constant(self.transitions_to_end),
    )

    crf_loss_instance = ConditionalRandomFieldLoss()

    model = Sequential()
    model.add(layers.Input(shape=(3, 5)))
    model.add(MockMasking(mask_shape=(2, 3), mask_value=mask))
    model.add(crf)
    model.compile('adam', loss={"crf_layer": crf_loss_instance})

    # Dump layer configuration and weights to ease debugging of failures.
    for layer in model.layers:
        print(layer.get_config())
        print(dict(zip(layer.weights, layer.get_weights())))

    viterbi_tags = model.predict(self.logits)

    # Now the tags should respect the constraints
    expected_tags = [
        [2, 3, 3],
        [2, 3, 0]
    ]

    # if the constraints did not work it should be:
    # [
    #     [2, 4, 3],
    #     [2, 3, 0]
    # ]

    # assert_equal raises on mismatch and returns None; wrapping it in
    # `assert` would fail the test even when the arrays are equal.
    np.testing.assert_equal(viterbi_tags, expected_tags)
def main():
    """Train the multi-task (NER + classification) model and export it.

    Steps, in order:

    1. Read ``./configure.yaml`` through ``_read_configure``.
    2. Preprocess the corpus with ``input_data_process``.
    3. Build a fresh model, or load weights for fine-tuning when the
       ``finetune`` config flag is set.
    4. Run a classification-only pretraining phase (NER loss weight 0),
       then the joint training phase.
    5. Save the h5 model, the h5 weights, a Keras SavedModel and the
       deliverable model package.
    """
    # get configure
    config = _read_configure("./configure.yaml")

    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    PRETRAIN_EPOCHS = config.get("pretrain_cls", 5)
    BATCHSIZE = config.get("batch_size", 32)
    PRETRAIN_BATCHSIZE = config.get("pretrain_batchsize", 32)
    LEARNINGRATE = config.get("learning_rate", 0.001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)
    LRDECAY = config.get('lr_decay', False)
    EARLYSTOP = config.get('early_stop', False)

    # get Parameters (model select)
    MODEL_CHOICE = config.get("model_choice", "VIRTUAL_EMBEDDING")
    FINETUNE = config.get("finetune", False)

    # get Parameters (model structure)
    CLS2NER_KEYWORD_LEN = config.get("cls2ner_keyword_len", 5)
    EMBED_DIM = config.get("embedding_dim", 128)
    ARCLOSS = config.get("Arcloss", True)
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    CRF_PARAMS = config.get("crf_params", {})

    # get preprocessed input data dict
    from mtnlpmodel.utils.input_process_util import input_data_process

    # To build a fixed training environment, input data should be fixed:
    # the corpus should be de-duplicated and well shuffled outside the
    # trainer before running the program.
    if MODEL_CHOICE == 'VIRTUAL_EMBEDDING' or MODEL_CHOICE == 'CLS2NER_INPUT':
        # different model structures have different input process way
        data_dict = input_data_process(
            config,
            MAX_SENTENCE_LEN=MAX_SENTENCE_LEN,
            CLS2NER_KEYWORD_LEN=CLS2NER_KEYWORD_LEN,
        )
    else:
        # Other structures are preprocessed without a keyword prefix and
        # skip the classification pretraining phase.
        # NOTE(review): `params` and the response converter below still use
        # the configured CLS2NER_KEYWORD_LEN, not 0 - confirm this is intended.
        data_dict = input_data_process(
            config,
            MAX_SENTENCE_LEN=MAX_SENTENCE_LEN,
            CLS2NER_KEYWORD_LEN=0,
        )
        PRETRAIN_EPOCHS = 0

    # get lookupers
    ner_tag_lookuper = data_dict['ner_tag_lookuper']
    cls_label_lookuper = data_dict['cls_label_lookuper']
    vocabulary_lookuper = data_dict['vocabulary_lookuper']

    # get train/test data for training model
    ner_train_x, ner_train_y = data_dict['ner_train_x'], data_dict['ner_train_y']
    ner_test_x, ner_test_y = data_dict['ner_test_x'], data_dict['ner_test_y']
    cls_train_x, cls_train_y = data_dict['cls_train_x'], data_dict['cls_train_y']
    cls_test_x, cls_test_y = data_dict['cls_test_x'], data_dict['cls_test_y']

    # build model or finetuning
    from mtnlpmodel.core import (
        build_model,
        finetune_model,
        finetuning_logger,
        get_freeze_list_for_finetuning,
    )

    params = {
        'EMBED_DIM': EMBED_DIM,
        'PRETRAIN_EPOCHS': PRETRAIN_EPOCHS,
        'BiLSTM_STACK_CONFIG': BiLSTM_STACK_CONFIG,
        'MAX_SENTENCE_LEN': MAX_SENTENCE_LEN,
        'CLS2NER_KEYWORD_LEN': CLS2NER_KEYWORD_LEN,
        'USE_ATTENTION_LAYER': USE_ATTENTION_LAYER,
        'Arcloss': ARCLOSS,
        'ner_tag_lookuper': ner_tag_lookuper,
        'cls_label_lookuper': cls_label_lookuper,
        'vocabulary_lookuper': vocabulary_lookuper,
        'CRF_PARAMS': CRF_PARAMS,
    }

    model_choice = MODEL_CHOICE  # VIRTUAL_EMBEDDING, CLS2NER_INPUT, OTHER
    print("Model structure choosing {}".format(model_choice))

    if FINETUNE:
        # fine-tuning the model, load model by the weights
        # you can modify this list to customize the freeze list
        recommend_freeze_list = get_freeze_list_for_finetuning(model_choice)
        model_weights_path = os.path.abspath(
            './results/h5_weights/weights.h5')  # use weight
        finetuning_logger(model_weights_path, recommend_freeze_list)  # print some log
        model = finetune_model(model_choice, model_weights_path,
                               recommend_freeze_list, **params)
    else:
        # train the model by random initializer (make a fresh start)
        model = build_model(model_choice, **params)

    model.summary()

    # build callbacks list
    callbacks_list = []

    # NOTE(review): the config-driven location (config["summary_log_dir"])
    # was commented out in favour of this fixed path. The path is assembled
    # with os.path.join so it is a real nested directory on every platform
    # (on Windows this is the same string the hard-coded literal produced).
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=os.path.join('.', 'results', 'summary_log_dir'),
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]),
                     "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    # early stop util
    if EARLYSTOP:
        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',  # early stop index
            patience=3,  # early stop delay epoch
            verbose=2,  # display mode
            mode='auto')
        callbacks_list.append(early_stop)

    # learning rate decay util
    if LRDECAY:
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.75,
            patience=3,
            verbose=1,
            mode='auto',
            min_delta=0.0001,  # current name of the deprecated `epsilon`
            cooldown=0,
            min_lr=0.00001)
        callbacks_list.append(reduce_lr)

    # ner_loss_func
    ner_loss_func = ConditionalRandomFieldLoss()

    # set optimizer
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNINGRATE,
                                              beta_1=0.9,
                                              beta_2=0.999,
                                              amsgrad=True)

    # The output names used as loss/metric/target keys differ between the
    # fine-tuned and the freshly built model.
    if FINETUNE:
        NER_out_name = 'crf_'
        CLS_out_name = 'cls_'
    else:
        NER_out_name = 'crf'
        CLS_out_name = 'cls'

    def _compile_and_fit(ner_loss_weight, epochs, batch_size):
        """Compile `model` with the given NER loss weight and run one fit."""
        model.compile(
            optimizer=adam_optimizer,
            loss={
                NER_out_name: ner_loss_func,
                CLS_out_name: 'categorical_crossentropy'
            },
            loss_weights={
                NER_out_name: ner_loss_weight,
                CLS_out_name: 10.
            },  # set weight of loss
            metrics={
                NER_out_name: SequenceCorrectness(),
                CLS_out_name: 'categorical_accuracy'
            })

        model.fit(
            {
                'ner_input': ner_train_x,
                'cls_input': cls_train_x
            },
            {
                NER_out_name: ner_train_y,
                CLS_out_name: cls_train_y
            },
            epochs=epochs,
            batch_size=batch_size,
            class_weight={
                NER_out_name: None,
                CLS_out_name: 'auto'
            },  # cls loss multiply the class weights
            validation_data=[{
                'ner_input': ner_test_x,
                'cls_input': cls_test_x
            }, {
                NER_out_name: ner_test_y,
                CLS_out_name: cls_test_y
            }],
            callbacks=callbacks_list,
        )

    # pretrain model -> train cls branch only (NER loss weighted by 0)
    _compile_and_fit(ner_loss_weight=0.,
                     epochs=PRETRAIN_EPOCHS,
                     batch_size=PRETRAIN_BATCHSIZE)

    # train model -> both branches
    _compile_and_fit(ner_loss_weight=15.,
                     epochs=EPOCHS,
                     batch_size=BATCHSIZE)

    # save model
    model.save(create_file_dir_if_needed(config["h5_model_file"]))
    model.save_weights(create_file_dir_if_needed(config["h5_weights_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_or_rm_dir_if_needed(config["saved_model_dir"]))

    mtinput_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        converter_for_request=ConverterForMTRequest(),
        converter_for_response=ConverterForMTResponse_VirtualPad(
            prepad=CLS2NER_KEYWORD_LEN),
        lookup_tables={
            'vocab_lookup': vocabulary_lookuper,
            'tag_lookup': ner_tag_lookuper,
            'label_lookup': cls_label_lookuper
        },
        padding_parameter={
            "maxlen": MAX_SENTENCE_LEN,
            "value": 0,
            "padding": "post"
        },
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )