Exemple #1
0
def test_forward_works_with_mask(numpy_crf):
    """Compare the CRF training loss with the NumPy reference under a mask.

    ``numpy_crf`` is a fixture giving the reference CRF. It is called with the
    emissions, the mask, the chain transitions and the boundary vector (passed
    twice), and exposes ``compute_log_likehood``.
    """
    emissions = np.array([
        [[0, 0, .5, .5, .2], [0, 0, .3, .3, .1], [0, 0, .9, 10, 1]],
        [[0, 0, .2, .5, .2], [0, 0, 3, .3, .1], [0, 0, .9, 1, 1]],
    ])
    chain = np.array([
        [0.1, 0.2, 0.3, 0.4, 0.5],
        [0.8, 0.3, 0.1, 0.7, 0.9],
        [-0.3, 2.1, -5.6, 3.4, 4.0],
        [0.2, 0.4, 0.6, -0.3, -0.4],
        [1.0, 1.0, 1.0, 1.0, 1.0]
    ])
    boundary = np.array([0.1, 0.2, 0.3, 0.4, 0.6])
    gold_tags = np.array([
        [2, 3, 4],
        [3, 2, 2]
    ])

    # CRF layer whose chain and boundary weights are pinned to the arrays
    # above; the kernel transform is switched off.
    crf_layer = CRF(
        units=5,
        use_kernel=False,
        chain_initializer=initializers.Constant(chain),
        use_boundary=True,
        boundary_initializer=initializers.Constant(boundary),
        name="crf_layer"
    )

    # Non-trivial mask: the last step of the second sequence is masked out.
    step_mask = np.array([
        [1, 1, 1],
        [1, 1, 0]
    ])

    loss = ConditionalRandomFieldLoss()

    model = Sequential()
    model.add(layers.Input(shape=(3, 5)))
    model.add(MockMasking(mask_shape=(2, 3), mask_value=step_mask))
    model.add(crf_layer)
    model.compile('adam', loss={"crf_layer": loss})

    actual_loss = model.train_on_batch(emissions, gold_tags)

    reference = numpy_crf(emissions, step_mask, chain, boundary, boundary)
    # The reference log-likelihood is divided by -2 before comparison; the
    # batch holds two sequences, so presumably the loss is the batch-mean
    # negative log-likelihood (confirm against the loss implementation).
    expected_loss = reference.compute_log_likehood(gold_tags) / -2

    assert actual_loss == approx(expected_loss)
Exemple #2
0
def create_bilstm_crf(vocab_size, EMBED_DIM, BiRNN_UNITS, tags_size):
    """Build an uncompiled Embedding -> BiLSTM -> CRF tagging model.

    Args:
        vocab_size: Input dimension of the embedding layer.
        EMBED_DIM: Output dimension of the embedding layer.
        BiRNN_UNITS: Total BiLSTM width; each direction gets
            ``BiRNN_UNITS // 2`` units.
        tags_size: Number of units of the CRF layer.

    Returns:
        The ``Sequential`` model. It is NOT compiled; the caller must compile
        it with a loss for the ``"crf_layer"`` output.
    """
    model = Sequential()
    # mask_zero=True: index 0 is treated as padding and masked downstream.
    model.add(Embedding(vocab_size, EMBED_DIM, mask_zero=True))
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    model.add(CRF(tags_size, sparse_target=True, name="crf_layer"))

    # summary() prints the table itself and returns None, so it is called
    # once and not wrapped in print() (which would also print "None").
    model.summary()

    return model
def test_masking_fixed_length(get_random_data):
    """Train a masked Embedding -> CRF model on fixed-length input and round-trip it to disk.

    ``get_random_data`` is a fixture returning an ``(x, y)`` pair of random
    integer arrays; the last four steps of the first sample are zeroed so the
    embedding's ``mask_zero=True`` masks them.
    """
    nb_samples = 2
    timesteps = 10
    embedding_dim = 4
    output_dim = 5
    embedding_num = 12

    crf_loss_instance = ConditionalRandomFieldLoss()

    x, y = get_random_data(nb_samples,
                           timesteps,
                           x_high=embedding_num,
                           y_high=output_dim)
    # right padding; left padding is not supported due to the tf.contrib.crf
    x[0, -4:] = 0

    # test with masking, fix length
    model = Sequential()
    model.add(
        Embedding(embedding_num,
                  embedding_dim,
                  input_length=timesteps,
                  mask_zero=True))
    model.add(CRF(output_dim, name="crf_layer"))
    model.compile(optimizer='adam', loss={"crf_layer": crf_loss_instance})

    # Fit with several batch sizes, then with the default batch size.
    for batch_size in (1, 2, 3):
        model.fit(x, y, epochs=1, batch_size=batch_size)
    model.fit(x, y, epochs=1)

    # check mask: predictions on the right-padded steps must be 0
    y_pred = model.predict(x)
    assert (y_pred[0, -4:] == 0).all()  # right padding
    # left padding not working currently due to the tf.contrib.crf.*
    # assert (y_pred[1, :5] == 0).all()

    # test saving and loading model
    MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5'
    try:
        model.save(MODEL_PERSISTENCE_PATH)
        load_model(MODEL_PERSISTENCE_PATH, custom_objects={'CRF': CRF})
    finally:
        # Clean up even when save/load fails, so a failing run does not leave
        # the file behind. Removal itself stays best-effort.
        try:
            os.remove(MODEL_PERSISTENCE_PATH)
        except OSError:
            pass
Exemple #4
0
def test_viterbi_tags(numpy_crf):
    """Check that the CRF layer's predictions equal the NumPy reference decoding.

    ``numpy_crf`` is a fixture giving the reference CRF. It is called with the
    emissions, the mask, the chain transitions and the boundary vector (passed
    twice), and exposes ``decode``.
    """
    emissions = np.array([
        [[0, 0, .5, .5, .2], [0, 0, .3, .3, .1], [0, 0, .9, 10, 1]],
        [[0, 0, .2, .5, .2], [0, 0, 3, .3, .1], [0, 0, .9, 1, 1]],
    ])
    chain = np.array([
        [0.1, 0.2, 0.3, 0.4, 0.5],
        [0.8, 0.3, 0.1, 0.7, 0.9],
        [-0.3, 2.1, -5.6, 3.4, 4.0],
        [0.2, 0.4, 0.6, -0.3, -0.4],
        [1.0, 1.0, 1.0, 1.0, 1.0]
    ])
    boundary = np.array([0.1, 0.2, 0.3, 0.4, 0.6])

    # CRF layer whose chain and boundary weights are pinned to the arrays
    # above; the kernel transform is switched off.
    crf_layer = CRF(
        units=5,
        use_kernel=False,
        chain_initializer=initializers.Constant(chain),
        use_boundary=True,
        boundary_initializer=initializers.Constant(boundary),
        name="crf_layer"
    )

    # The last step of the second sequence is masked out.
    step_mask = np.array([
        [1, 1, 1],
        [1, 1, 0]
    ])

    loss = ConditionalRandomFieldLoss()

    model = Sequential()
    model.add(layers.Input(shape=(3, 5)))
    model.add(MockMasking(mask_shape=(2, 3), mask_value=step_mask))
    model.add(crf_layer)
    model.compile('adam', loss={"crf_layer": loss})

    predicted = model.predict(emissions)

    reference = numpy_crf(emissions, step_mask, chain, boundary, boundary)
    # decode() returns a pair; only its first element is compared here.
    expected_tags, _ = reference.decode()

    np.testing.assert_equal(predicted, expected_tags)
def test_masking_fixed_length(get_random_data):
    """Train an unmasked Embedding -> CRF model on fixed-length input and round-trip it to disk.

    ``get_random_data`` is a fixture returning an ``(x, y)`` pair of random
    integer arrays. Unlike the masked variant, the embedding here is created
    without ``mask_zero``.
    """
    nb_samples = 2
    timesteps = 10
    embedding_dim = 4
    output_dim = 5
    embedding_num = 12

    crf_loss_instance = ConditionalRandomFieldLoss()

    x, y = get_random_data(nb_samples,
                           timesteps,
                           x_high=embedding_num,
                           y_high=output_dim)

    # test with no masking, fix length
    model = Sequential()
    model.add(Embedding(embedding_num, embedding_dim, input_length=timesteps))
    model.add(CRF(output_dim, name="crf_layer"))
    model.compile(optimizer='adam', loss={"crf_layer": crf_loss_instance})

    # Fit with several batch sizes, then with the default batch size.
    for batch_size in (1, 2, 3):
        model.fit(x, y, epochs=1, batch_size=batch_size)
    model.fit(x, y, epochs=1)

    # test saving and loading model
    MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5'
    try:
        model.save(MODEL_PERSISTENCE_PATH)
        load_model(MODEL_PERSISTENCE_PATH,
                   custom_objects={
                       'CRF': CRF,
                       'crf_loss': crf_loss
                   })
    finally:
        # Clean up even when save/load fails, so a failing run does not leave
        # the file behind. Removal itself stays best-effort.
        try:
            os.remove(MODEL_PERSISTENCE_PATH)
        except OSError:
            pass
def test_masking_with_boundary(get_random_data):
    """Smoke-test training of a masked Embedding -> CRF model with ``use_boundary=True``.

    The test only checks that fitting runs for several batch sizes; it makes
    no assertion on the results.
    """
    sample_count, step_count = 2, 10
    embed_width, tag_count, vocab_count = 4, 5, 12

    loss = ConditionalRandomFieldLoss()

    x, y = get_random_data(sample_count, step_count,
                           x_high=vocab_count, y_high=tag_count)
    # right padding; left padding is not supported due to the tf.contrib.crf
    x[0, -4:] = 0

    # Masked embedding followed by a CRF layer with boundary weights enabled.
    model = Sequential()
    model.add(Embedding(vocab_count, embed_width, input_length=step_count,
                        mask_zero=True))
    model.add(CRF(tag_count, use_boundary=True, name="crf_layer"))
    model.compile(optimizer='adam', loss={"crf_layer": loss})

    # Explicit batch sizes first, then the default batch size.
    for batch_size in (1, 2, 3):
        model.fit(x, y, epochs=1, batch_size=batch_size)
    model.fit(x, y, epochs=1)
def test_crf_viterbi_accuracy(get_random_data):
    """Check that the ``crf_viterbi_accuracy`` metric agrees with a NumPy recomputation.

    The NumPy accuracy is computed only over positions where ``x > 0``, i.e.
    excluding the steps zeroed for padding.
    """
    sample_count, step_count = 2, 10
    embed_width, tag_count, vocab_count = 4, 5, 12

    loss = ConditionalRandomFieldLoss()

    x, y = get_random_data(sample_count,
                           step_count,
                           x_high=vocab_count,
                           y_high=tag_count)
    # right padding; left padding is not supported due to the tf.contrib.crf
    x[0, -4:] = 0

    # Masked embedding followed by a CRF layer.
    embedding = Embedding(vocab_count,
                          embed_width,
                          input_length=step_count,
                          mask_zero=True)
    model = Sequential()
    model.add(embedding)
    model.add(CRF(tag_count, name="crf_layer"))
    model.compile(optimizer='rmsprop',
                  loss={"crf_layer": loss},
                  metrics=[crf_viterbi_accuracy])

    model.fit(x, y, epochs=1, batch_size=10)

    # Metric as reported by Keras vs. the same quantity computed by hand.
    y_pred = model.predict(x)
    _, v_acc = model.evaluate(x, y)
    not_padding = x > 0
    hits = y_pred[not_padding] == y[not_padding]
    np_acc = hits.astype('float32').mean()
    print(v_acc, np_acc)
    assert np.abs(v_acc - np_acc) < 1e-4
Exemple #8
0
def main():
    """Train an Embedding -> (BiLSTM stack) -> CRF tagger from the project configuration and export it.

    Steps, in order: read the configuration, load the train/eval corpora,
    convert text and tags to right-padded id matrices, build and fit the
    model, then save it as an h5 file, a Keras SavedModel and a
    "deliverable model" bundle.

    Required configuration keys (a missing one raises ``KeyError``):
    ``epochs``, ``batch_size``, ``embedding_dim``, ``summary_log_dir``,
    ``model_dir``, ``h5_model_file``, ``saved_model_dir``,
    ``deliverable_model_dir``. All other keys are read with defaults.
    """
    config = read_configure()
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()
    tags_data = generate_tagset(corpus_meta_data["tags"])

    # Materialise both generators: the data is iterated in full below.
    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    # tag -> index, in the enumeration order of the generated tag set
    tag_lookuper = Lookuper({v: i for i, v in enumerate(tags_data)})
    vocab_data_file = config.get("vocabulary_file")
    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocess(data, maxlen):
        """Turn corpus items into right-padded ``(x, y)`` id matrices.

        When ``maxlen`` is None, the longest sequence in ``data`` is used.
        """
        raw_x = []
        raw_y = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            words = offset_data.text

            raw_x.append([vocabulary_lookuper.lookup(i) for i in words])
            raw_y.append([tag_lookuper.lookup(i) for i in tags])

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        # Right padding for both inputs and targets. Padded inputs are 0,
        # which the embedding masks (mask_zero=True); targets are padded
        # with 0 as well.
        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post"
        )
        y = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y, maxlen, value=0, padding="post"
        )

        return x, y

    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    train_x, train_y = preprocess(train_data, MAX_SENTENCE_LEN)
    test_x, test_y = preprocess(eval_data, MAX_SENTENCE_LEN)

    EPOCHS = config["epochs"]
    BATCH_SIZE = config["batch_size"]
    EMBED_DIM = config["embedding_dim"]
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False
    )
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False
    )
    CRF_PARAMS = config.get("crf_params", {})
    OPTIMIZER_PARAMS = config.get("optimizer_params", {})

    vocab_size = vocabulary_lookuper.size()
    tag_size = tag_lookuper.size()

    model = Sequential()

    model.add(
        Embedding(vocab_size, EMBED_DIM, embeddings_initializer='glorot_normal',
                  mask_zero=True, input_length=MAX_SENTENCE_LEN)
    )

    if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
        model.add(BatchNormalization())

    # One Bidirectional(LSTM) layer per entry of the stack configuration.
    for bilstm_config in BiLSTM_STACK_CONFIG:
        model.add(Bidirectional(LSTM(return_sequences=True, **bilstm_config)))

    if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
        model.add(BatchNormalization())

    if USE_ATTENTION_LAYER:
        model.add(GlobalAttentionLayer())

    model.add(CRF(tag_size, name="crf", **CRF_PARAMS))

    # print model summary
    model.summary()

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=create_dir_if_needed(config["summary_log_dir"])
    )
    # Checkpoint files are named by epoch number inside model_dir.
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list = [tensorboard_callback, checkpoint_callback]

    metrics_list = [SequenceCorrectness(), SequenceSpanAccuracy()]

    loss_func = ConditionalRandomFieldLoss()
    optimizer = optimizers.Adam(**OPTIMIZER_PARAMS)

    model.compile(optimizer=optimizer, loss={"crf": loss_func}, metrics=metrics_list)

    model.fit(
        train_x,
        train_y,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        # Keras documents validation_data as a tuple (x_val, y_val); some
        # releases treat a list as "inputs only".
        validation_data=(test_x, test_y),
        callbacks=callbacks_list,
    )

    # Save the model
    model.save(create_file_dir_if_needed(config["h5_model_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_dir_if_needed(config["saved_model_dir"]))

    # The padding parameters recorded here mirror the ones used in preprocess().
    export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        vocabulary_lookup_table=vocabulary_lookuper,
        tag_lookup_table=tag_lookuper,
        padding_parameter={"maxlen": MAX_SENTENCE_LEN, "value": 0, "padding": "post"},
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
Exemple #9
0
def main():
    """Train and export a multi-task model: a CRF NER head plus a sentence-classification head.

    Both heads share one Embedding -> (BiLSTM stack) encoder. Steps, in
    order: read the configuration, load the train/eval corpora, convert them
    to padded id matrices and one-hot labels, build and fit the two-output
    model, then save it as h5 (model and weights), a Keras SavedModel and a
    "deliverable model" bundle.

    Required configuration keys (a missing one raises ``KeyError``):
    ``model_dir``, ``h5_model_file``, ``h5_weights_file``,
    ``saved_model_dir``, ``deliverable_model_dir``. All other keys are read
    with defaults.
    """
    # get configure
    config = read_configure()

    # get train/test corpus
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    # tag inventory for the NER head, label inventory for the classifier
    ner_tags_data = generate_tagset(corpus_meta_data["tags"])
    cls_tags_data = corpus_meta_data["labels"]

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    ner_tag_lookuper = Lookuper({v: i for i, v in enumerate(ner_tags_data)})
    cls_tag_lookuper = Lookuper({v: i for i, v in enumerate(cls_tags_data)})

    vocab_data_file = config.get("vocabulary_file")

    if not vocab_data_file:
        # load built in vocabulary file
        vocab_data_file = os.path.join(
            os.path.dirname(__file__), "../data/unicode_char_list.txt"
        )

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocess(data, maxlen, **kwargs):
        """Turn corpus items into ``(x, y_ner, y_cls)`` arrays.

        ``x`` and ``y_ner`` are right-padded id matrices; ``y_cls`` is the
        one-hot encoding of the sentence labels with ``cls_dims`` classes
        (default 81). When ``maxlen`` is None, the longest sequence is used.
        """
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            label = offset_data.label
            words = offset_data.text

            raw_x.append([vocabulary_lookuper.lookup(i) for i in words])
            raw_y_ner.append([ner_tag_lookuper.lookup(i) for i in tags])
            raw_y_cls.append(cls_tag_lookuper.lookup(label))

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post"
        )  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post"
        )

        # Use tf.keras's to_categorical so this function does not depend on
        # the separate standalone `keras` package.
        y_cls = np.array(raw_y_cls)
        y_cls = y_cls[:, np.newaxis]
        y_cls = tf.keras.utils.to_categorical(y_cls, kwargs.get('cls_dims', 81))

        return x, y_ner, y_cls

    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    BATCHSIZE = config.get("batch_size", 32)
    LEARNINGRATE = config.get("learning_rate", 0.001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    # get Parameters (model structure)
    EMBED_DIM = config.get("embedding_dim", 300)
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False)
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False)
    CRF_PARAMS = config.get("crf_params", {})

    # get train/test data for training model
    vocab_size = vocabulary_lookuper.size()
    tag_size = ner_tag_lookuper.size()
    label_size = cls_tag_lookuper.size()

    train_x, train_y_ner, train_y_cls = preprocess(
        train_data, MAX_SENTENCE_LEN, cls_dims=label_size)
    test_x, test_y_ner, test_y_cls = preprocess(
        eval_data, MAX_SENTENCE_LEN, cls_dims=label_size)

    # build model
    input_length = MAX_SENTENCE_LEN
    input_layer = Input(shape=(input_length,), dtype='float', name='input_layer')

    # encoder
    with tf.keras.backend.name_scope("Encoder"):

        embedding_layer = Embedding(vocab_size,
                                    EMBED_DIM,
                                    mask_zero=True,
                                    input_length=input_length,
                                    name='embedding')(input_layer)

    # feature extractor
    with tf.keras.backend.name_scope("biLSTM"):
        if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
            embedding_layer = BatchNormalization()(embedding_layer)

        # With an empty stack configuration the heads sit directly on the
        # embedding output.
        biLSTM = embedding_layer
        for bilstm_config in BiLSTM_STACK_CONFIG:
            biLSTM = Bidirectional(LSTM(return_sequences=True, **bilstm_config, name='biLSTM'))(biLSTM)

    if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
        biLSTM = BatchNormalization()(biLSTM)

    if USE_ATTENTION_LAYER:
        biLSTM = GlobalAttentionLayer()(biLSTM)

    # NER branch
    with tf.keras.backend.name_scope("NER_branch"):
        crf = CRF(tag_size, name="crf", **CRF_PARAMS)(biLSTM)
        loss_func = ConditionalRandomFieldLoss()

    # classification branch

    # NOTE(review): `chosen` is hard-coded, so the "conv_cls" branch below is
    # currently unreachable; it is kept as an alternative head.
    chosen = 'lstm_cls'
    with tf.keras.backend.name_scope("CLS_branch"):
        from tensorflow.keras.layers import Dense, Flatten, Dropout

        if chosen == "lstm_cls":
            cls_flat_lstm = Flatten()(biLSTM)
            classification_dense = Dropout(0.2)(cls_flat_lstm)
            classification_dense = SetLearningRate(Dense(label_size, activation='sigmoid', name='CLS'), lr=0.001, is_ada=True)(classification_dense)

        elif chosen == "conv_cls":
            from tensorflow.keras.layers import Conv1D, MaxPooling1D
            embedding_layer = BatchNormalization()(embedding_layer)
            cls_conv_emb = Conv1D(32, 3, activation='relu', padding='same')(embedding_layer)
            cls_conv_emb = Conv1D(64, 3, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=1, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=2, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=5, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(256, 1, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)

            cls_flat = BatchNormalization()(cls_conv_emb)
            cls_flat = Flatten()(cls_flat)
            classification_dense = Dropout(0.2)(cls_flat)
            classification_dense = Dense(label_size, activation='sigmoid', name='CLS')(classification_dense)

    # merge NER and Classification
    model = Model(inputs=[input_layer], outputs=[crf, classification_dense])

    model.summary()

    callbacks_list = []

    # NOTE(review): the log directory is a hard-coded, Windows-style relative
    # path; the config-driven variant was disabled in the original code.
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir='.\\results\\summary_log_dir',
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    # Checkpoint files are named by epoch number inside model_dir.
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    from mtnlpmodel.trainer.loss_func_util import FocalLoss
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNINGRATE, beta_1=0.9, beta_2=0.999, amsgrad=False)
    # Per-output losses and metrics are keyed by the output layer names
    # ("crf" and "CLS"); the classification loss is weighted 100x.
    model.compile(optimizer=adam_optimizer,
                  loss={'crf': loss_func, 'CLS': FocalLoss()},
                  loss_weights={'crf': 1., 'CLS': 100},  # set weight of loss
                  metrics={'crf': SequenceCorrectness(), 'CLS': 'categorical_accuracy'})

    model.fit(
        train_x,
        {'crf': train_y_ner, 'CLS': train_y_cls},
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        # Keras documents validation_data as a tuple (x_val, y_val); some
        # releases treat a list as "inputs only".
        validation_data=(test_x, {'crf': test_y_ner, 'CLS': test_y_cls}),
        callbacks=callbacks_list,
    )

    model.save(create_file_dir_if_needed(config["h5_model_file"]))
    model.save_weights(create_file_dir_if_needed(config["h5_weights_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_or_rm_dir_if_needed(config["saved_model_dir"])
    )

    # The padding parameters recorded here mirror the ones used in preprocess().
    mt_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        converter_for_request=ConverterForRequest(),
        converter_for_response=ConverterForMTResponse(),
        lookup_tables={'vocab_lookup':vocabulary_lookuper,
                       'tag_lookup':ner_tag_lookuper,
                       'label_lookup':cls_tag_lookuper},
        padding_parameter={"maxlen": MAX_SENTENCE_LEN, "value": 0, "padding": "post"},
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
Exemple #10
0
# Script fragment: register the callbacks, compile the CRF model, train it
# and save it. `callbacks_list`, `tensorboard_callback`, `config`, `model`,
# the data arrays and EPOCHS are defined earlier in the script.
callbacks_list.append(tensorboard_callback)

# Checkpoint files are named by epoch number inside model_dir.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
    load_weights_on_restart=True,
    verbose=1,
)
callbacks_list.append(checkpoint_callback)

metrics_list = [crf_accuracy, SequenceCorrectness(), sequence_span_accuracy]

loss_func = ConditionalRandomFieldLoss()

# The loss is keyed by the name of the output layer ("crf").
model.compile("adam", loss={"crf": loss_func}, metrics=metrics_list)
model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    # Keras documents validation_data as a tuple (x_val, y_val); some
    # releases treat a list as "inputs only".
    validation_data=(test_x, test_y),
    callbacks=callbacks_list,
)

# Save the model
model.save(create_file_dir_if_needed(config["h5_model_file"]))
def _fit_and_report(model, result_title, train_x, train_y, test_x, test_y,
                    class_labels):
    """Compile ``model`` with the CRF loss, train it, and report on the test set.

    The report covers only positions where ``test_x > 0``; index 0 is the
    value masked by the embedding layers (``mask_zero=True``).
    """
    crf_loss_instance = ConditionalRandomFieldLoss()

    # The loss is keyed by the name of the CRF output layer ("crf_layer").
    model.compile('adam',
                  loss={"crf_layer": crf_loss_instance},
                  metrics=[SequenceSpanAccuracy()])
    model.fit(train_x,
              train_y,
              epochs=EPOCHS,
              # Keras documents validation_data as a tuple (x_val, y_val);
              # some releases treat a list as "inputs only".
              validation_data=(test_x, test_y))

    not_padding = test_x > 0
    test_y_pred = model.predict(test_x)[not_padding]
    test_y_true = test_y[not_padding]

    print(result_title)
    classification_report(test_y_true, test_y_pred, class_labels)


def main():
    """Train a plain CRF tagger and a BiLSTM-CRF tagger on conll2000 and report on each.

    Only the third element of each train/test split is used as the target
    (conll2000 provides two targets; this example uses the chunking one).
    """
    # ------
    # Data
    # -----
    train, test, voc = conll2000.load_data()
    (train_x, _, train_y) = train
    (test_x, _, test_y) = test
    (vocab, _, class_labels) = voc

    # --------------
    # 1. Regular CRF
    # --------------

    print('==== training CRF ====')

    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM,
                        mask_zero=True))  # Random embedding
    model.add(CRF(len(class_labels), name="crf_layer"))

    _fit_and_report(model, '\n---- Result of CRF ----\n',
                    train_x, train_y, test_x, test_y, class_labels)

    # -------------
    # 2. BiLSTM-CRF
    # -------------

    print('==== training BiLSTM-CRF ====')

    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM,
                        mask_zero=True))  # Random embedding
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    model.add(CRF(len(class_labels), name="crf_layer"))

    _fit_and_report(model, '\n---- Result of BiLSTM-CRF ----\n',
                    train_x, train_y, test_x, test_y, class_labels)
Exemple #12
0
def test_crf_config(get_random_data):
    """Check ``model.get_config()`` for a masked Embedding -> CRF model after one fit.

    The expected configuration is assembled from one entry per layer and
    compared to the actual result for exact equality.
    """
    sample_count, step_count = 2, 10
    embed_width, tag_count, vocab_count = 4, 5, 12

    x, y = get_random_data(
        sample_count, step_count, x_high=vocab_count, y_high=tag_count
    )
    # right padding; left padding is not supported due to the tf.contrib.crf
    x[0, -4:] = 0

    loss = ConditionalRandomFieldLoss()

    # Masked embedding followed by a CRF layer.
    model = Sequential()
    model.add(
        Embedding(vocab_count, embed_width, input_length=step_count, mask_zero=True)
    )
    model.add(CRF(tag_count, name="crf_layer"))
    model.compile(optimizer="rmsprop", loss={"crf_layer": loss})

    model.fit(x, y, epochs=1, batch_size=10)

    actual_config = model.get_config()

    # Expected entry for the embedding layer.
    embedding_entry = {
        "class_name": "Embedding",
        "config": {
            "name": "embedding",
            "trainable": True,
            "batch_input_shape": (None, 10),
            "dtype": "float32",
            "input_dim": 12,
            "output_dim": 4,
            "embeddings_initializer": {
                "class_name": "RandomUniform",
                "config": {
                    "minval": -0.05,
                    "maxval": 0.05,
                    "seed": None,
                    "dtype": "float32",
                },
            },
            "embeddings_regularizer": None,
            "activity_regularizer": None,
            "embeddings_constraint": None,
            "mask_zero": True,
            "input_length": 10,
        },
    }

    # The boundary and bias initializers are expected to be identical.
    zeros_initializer = {
        "class_name": "Zeros",
        "config": {"dtype": "float32"},
    }

    # Expected entry for the CRF layer.
    crf_entry = {
        "class_name": "CRF",
        "config": {
            "name": "crf_layer",
            "trainable": True,
            "dtype": "float32",
            "units": 5,
            "use_boundary": True,
            "use_bias": True,
            "use_kernel": True,
            "kernel_initializer": {
                "class_name": "GlorotUniform",
                "config": {"seed": None, "dtype": "float32"},
            },
            "chain_initializer": {
                "class_name": "Orthogonal",
                "config": {"gain": 1.0, "seed": None, "dtype": "float32"},
            },
            "boundary_initializer": zeros_initializer,
            "bias_initializer": zeros_initializer,
            "activation": "linear",
            "kernel_regularizer": None,
            "chain_regularizer": None,
            "boundary_regularizer": None,
            "bias_regularizer": None,
            "kernel_constraint": None,
            "chain_constraint": None,
            "boundary_constraint": None,
            "bias_constraint": None,
        },
    }

    expected_config = {
        "name": "sequential",
        "layers": [embedding_entry, crf_entry],
    }

    assert actual_config == expected_config
Exemple #13
0
def test_masked_viterbi_decode():
    """Check Viterbi decoding of a CRF layer behind a right-padded mask.

    Two sequences of three timesteps are decoded with uniform chain
    transitions. The mask hides the last timestep of both sequences, and
    the test expects tag 0 at every masked position.
    """
    transitions = np.ones([5, 5])

    logits = np.array([
        [
        #     O     B-X    I-X    B-Y    I-Y
            [ 0.,    1.,   0.,     0.,   0.],
            [ 0.,    0.,   1.,     0.,   0.],
            [ 0.,    0.,   1.,     0.,   0.]
        ],
        [
        #     O     B-X    I-X    B-Y    I-Y
            [ 0.,    1.,   0.,     0.,   0.],
            [ 0.,    1.,   0.,     0.,   0.],
            [ 0.,    1.,   0.,     0.,   0.]
        ]
    ])

    # TODO: this test case covers right-padding masks only, because the
    #       underlying crf function only supports sequence lengths.
    mask = np.array([
            [1, 1, 0],
            [1, 1, 0]
    ])

    # No boundary initializers are passed here, so the boundary weights keep
    # whatever default the CRF layer gives them.
    crf = CRF(
        units=5,
        use_kernel=False,  # disable kernel transform
        chain_initializer=initializers.Constant(transitions),
        use_boundary=True,
        name="crf_layer"
    )

    crf_loss_instance = ConditionalRandomFieldLoss()

    model = Sequential()
    model.add(layers.Input(shape=(3, 5)))
    model.add(MockMasking(mask_shape=(2, 3), mask_value=mask))
    model.add(crf)
    model.compile('adam', loss={"crf_layer": crf_loss_instance})

    # The prediction is compared directly against tag ids below.
    result = model.predict(logits)

    # Unmasked steps follow the highest logit; the masked last step is
    # expected to come back as tag 0.
    expected = [
        [1, 2, 0],  # B-X  I-X  NA
        [1, 1, 0]   # B-X  B-X  NA
    ]

    # test assert
    np.testing.assert_equal(result, expected)
Exemple #14
0
    def test_unmasked_constrained_viterbi_tags(self):
        """Check that Viterbi decoding honours BIO transition constraints.

        No mask is applied. Chain and boundary weights are all ones, so the
        decoded path is driven only by the logits and by the allowed
        transitions listed in ``raw_constraints``.
        """
        # TODO: using BILUO tag scheme instead of BIO.
        #       So that, transition from tags to end can be tested.

        # Row = source tag, column = target tag; 1 marks an allowed move.
        raw_constraints = np.array([
            #     O     B-X    I-X    B-Y    I-Y  start   end
            [     1,     1,     0,     1,     0,    0,     1],  # O
            [     1,     1,     1,     1,     0,    0,     1],  # B-X
            [     1,     1,     1,     1,     0,    0,     1],  # I-X
            [     1,     1,     0,     1,     1,    0,     1],  # B-Y
            [     1,     1,     0,     1,     1,    0,     1],  # I-Y
            [     1,     1,     0,     1,     0,    0,     0],  # start
            [     0,     0,     0,     0,     0,    0,     0],  # end
        ])

        # List of [from, to] index pairs for every allowed transition.
        constraints = np.argwhere(raw_constraints > 0).tolist()

        # Uniform weights: no transition or boundary is preferred by score.
        transitions = np.ones([5, 5])
        transitions_from_start = np.ones(5)
        transitions_to_end = np.ones(5)

        logits = np.array([
            [
            # constraint transition from start to tags
            #     O     B-X    I-X    B-Y    I-Y
                [ 0.,    .1,   1.,     0.,   0.],
                [ 0.,    0.,   1.,     0.,   0.],
                [ 0.,    0.,   1.,     0.,   0.]
            ],
            [
            # constraint transition from tags to tags
            #     O     B-X    I-X    B-Y    I-Y
                [ 0.,    1.,   0.,     0.,   0.],
                [ 0.,    0.,   .1,     1.,   0.],
                [ 0.,    0.,   1.,     0.,   0.]
            ]
        ])

        crf = CRF(
            units=5,
            use_kernel=False,  # disable kernel transform
            chain_initializer=initializers.Constant(transitions),
            use_boundary=True,
            transition_constraint=constraints,
            name="crf_layer"
        )
        # Install the boundary weights by hand, using the uniform values
        # defined in this test (previously the class fixtures were used and
        # the locals above were dead).
        crf.left_boundary = crf.add_weight(
            shape=(5,),
            name="left_boundary",
            initializer=initializers.Constant(transitions_from_start),
        )
        crf.right_boundary = crf.add_weight(
            shape=(5,),
            name="right_boundary",
            initializer=initializers.Constant(transitions_to_end),
        )

        crf_loss_instance = ConditionalRandomFieldLoss()

        model = Sequential()
        model.add(layers.Input(shape=(3, 5)))
        model.add(crf)
        model.compile('adam', loss={"crf_layer": crf_loss_instance})

        for layer in model.layers:
            print(layer.get_config())
            print(dict(zip(layer.weights, layer.get_weights())))

        # The prediction is compared directly against tag ids below.
        viterbi_tags = model.predict(logits)

        # Now the tags should respect the constraints
        expected_tags = [
            [1, 2, 2],  # B-X  I-X  I-X
            [1, 2, 2]   # B-X  I-X  I-X
        ]

        # NOTE(review): with uniform weights, an unconstrained decode would
        # presumably follow the per-step highest logit instead:
        # [
        #     [2, 2, 2],
        #     [1, 3, 2]
        # ]

        # test assert
        np.testing.assert_equal(viterbi_tags, expected_tags)
Exemple #15
0
    def test_constrained_viterbi_tags(self):
        """Decode through a mask with a restricted set of tag transitions.

        Each of the five tags may only repeat itself or move on to the next
        tag (wrapping from 4 back to 0). Index 5 stands for the start tag and
        index 6 for the end tag; both connect to every real tag.
        """
        allowed = set()
        # Self-loop and "next tag" moves between the real tags.
        for tag in range(5):
            allowed.update([(tag, tag), (tag, (tag + 1) % 5)])
        # Transitions from the start tag and to the end tag.
        for tag in range(5):
            allowed.update([(5, tag), (tag, 6)])

        # The last step of the second sequence is masked out.
        padding_mask = np.array([[1, 1, 1], [1, 1, 0]])

        crf_layer = CRF(
            units=5,
            use_kernel=False,  # disable kernel transform
            chain_initializer=initializers.Constant(self.transitions),
            use_boundary=True,
            transition_constraint=allowed,
            name="crf_layer"
        )
        # Boundary weights are installed by hand from the class fixtures.
        start_init = initializers.Constant(self.transitions_from_start)
        crf_layer.left_boundary = crf_layer.add_weight(
            shape=(5,), name="left_boundary", initializer=start_init)
        end_init = initializers.Constant(self.transitions_to_end)
        crf_layer.right_boundary = crf_layer.add_weight(
            shape=(5,), name="right_boundary", initializer=end_init)

        loss_fn = ConditionalRandomFieldLoss()

        network = Sequential()
        network.add(layers.Input(shape=(3, 5)))
        network.add(MockMasking(mask_shape=(2, 3), mask_value=padding_mask))
        network.add(crf_layer)
        network.compile('adam', loss={"crf_layer": loss_fn})

        # Dump every layer's config and weights to ease debugging.
        for current in network.layers:
            print(current.get_config())
            print(dict(zip(current.weights, current.get_weights())))

        decoded = network.predict(self.logits)

        # Per the original author's note, decoding without the constraints
        # would give [[2, 4, 3], [2, 3, 0]] instead.
        wanted = [
            [2, 3, 3],
            [2, 3, 0]
        ]

        np.testing.assert_equal(decoded, wanted)
Exemple #16
0
def main():
    """Train the joint NER / classification model and export the results.

    Steps, in order:

    1. Read ``./configure.yaml`` and pull out the training parameters.
    2. Preprocess the input data and fetch the lookup tables.
    3. Build a fresh model, or load weights for fine-tuning when the
       ``finetune`` option is set.
    4. Run a first fit with the NER loss weight set to 0 (classification
       branch only), then a second fit with both losses weighted.
    5. Save the model (h5 model, h5 weights, SavedModel) and export the
       deliverable model.
    """
    # get configure
    config = _read_configure("./configure.yaml")

    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    PRETRAIN_EPOCHS = config.get("pretrain_cls", 5)
    BATCHSIZE = config.get("batch_size", 32)
    PRETRAIN_BATCHSIZE = config.get("pretrain_batchsize", 32)
    LEARNINGRATE = config.get("learning_rate", 0.001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)
    LRDECAY = config.get('lr_decay', False)
    EARLYSTOP = config.get('early_stop', False)

    # get Parameters (model select)
    MODEL_CHOICE = config.get("model_choice", "VIRTUAL_EMBEDDING")
    FINETUNE = config.get("finetune", False)

    # get Parameters (model structure)
    CLS2NER_KEYWORD_LEN = config.get("cls2ner_keyword_len", 5)
    EMBED_DIM = config.get("embedding_dim", 128)
    ARCLOSS = config.get("Arcloss", True)
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    CRF_PARAMS = config.get("crf_params", {})

    # get preprocessed input data dict
    from mtnlpmodel.utils.input_process_util import input_data_process
    # to build a fixed training environment, input data should be fixed.
    # input_data should be shuffled and remove duplication outside the trainer before running the program.
    # input data should be corpus(no duplication, shuffle well)

    # different model structures have different input process way: only the
    # two choices below keep the configured keyword length and the
    # pre-training stage; every other choice preprocesses with length 0 and
    # skips pre-training.
    keeps_keyword_len = MODEL_CHOICE in ('VIRTUAL_EMBEDDING', 'CLS2NER_INPUT')
    data_dict = input_data_process(
        config,
        MAX_SENTENCE_LEN=MAX_SENTENCE_LEN,  # preprocess the input_data
        CLS2NER_KEYWORD_LEN=CLS2NER_KEYWORD_LEN if keeps_keyword_len else 0,
    )
    if not keeps_keyword_len:
        PRETRAIN_EPOCHS = 0

    # get lookupers
    ner_tag_lookuper = data_dict['ner_tag_lookuper']
    cls_label_lookuper = data_dict['cls_label_lookuper']
    vocabulary_lookuper = data_dict['vocabulary_lookuper']

    # get train/test data for training model
    ner_train_x, ner_train_y = data_dict['ner_train_x'], data_dict['ner_train_y']
    ner_test_x, ner_test_y = data_dict['ner_test_x'], data_dict['ner_test_y']

    cls_train_x, cls_train_y = data_dict['cls_train_x'], data_dict['cls_train_y']
    cls_test_x, cls_test_y = data_dict['cls_test_x'], data_dict['cls_test_y']

    # build model or finetuning
    from mtnlpmodel.core import build_model, finetune_model, get_freeze_list_for_finetuning
    # NOTE(review): the model is still built with the configured
    # CLS2NER_KEYWORD_LEN even when the data was preprocessed with 0 above
    # -- confirm this is intended.
    params = {
        'EMBED_DIM': EMBED_DIM,
        'PRETRAIN_EPOCHS': PRETRAIN_EPOCHS,
        'BiLSTM_STACK_CONFIG': BiLSTM_STACK_CONFIG,
        'MAX_SENTENCE_LEN': MAX_SENTENCE_LEN,
        'CLS2NER_KEYWORD_LEN': CLS2NER_KEYWORD_LEN,
        'USE_ATTENTION_LAYER': USE_ATTENTION_LAYER,
        'Arcloss': ARCLOSS,
        'ner_tag_lookuper': ner_tag_lookuper,
        'cls_label_lookuper': cls_label_lookuper,
        'vocabulary_lookuper': vocabulary_lookuper,
        'CRF_PARAMS': CRF_PARAMS
    }
    model_choice = MODEL_CHOICE  # VIRTUAL_EMBEDDING, CLS2NER_INPUT, OTHER
    print("Model structure choosing {}".format(model_choice))

    from mtnlpmodel.core import finetuning_logger
    if FINETUNE:  # fine-tuning the model, load model by the weights
        # you can modify this list to customize the freeze list
        recommend_freeze_list = get_freeze_list_for_finetuning(model_choice)
        model_weights_path = os.path.abspath(
            './results/h5_weights/weights.h5')  # use weight
        finetuning_logger(model_weights_path,
                          recommend_freeze_list)  # print some log
        model = finetune_model(model_choice, model_weights_path,
                               recommend_freeze_list, **params)

    else:  # train the model by random initializer(make a fresh start to train a model)
        model = build_model(model_choice, **params)  # to build the model

    model.summary()

    # build callbacks list
    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        #log_dir=create_dir_if_needed(config["summary_log_dir"])
        # Built with os.path.join so the path is valid on every platform
        # (the former literal used Windows-only backslash separators).
        log_dir=os.path.join('.', 'results', 'summary_log_dir'),
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]),
                     "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    # early stop util
    if EARLYSTOP:
        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',  # early stop index
            patience=3,  # early stop delay epoch
            verbose=2,  # display mode
            mode='auto')
        callbacks_list.append(early_stop)

    # learning rate decay util
    if LRDECAY:
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.75,
            patience=3,
            verbose=1,
            mode='auto',
            min_delta=0.0001,  # replaces the deprecated `epsilon` alias
            cooldown=0,
            min_lr=0.00001)
        callbacks_list.append(reduce_lr)

    # ner_loss_func
    ner_loss_func = ConditionalRandomFieldLoss()

    # set optimizer
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNINGRATE,
                                              beta_1=0.9,
                                              beta_2=0.999,
                                              amsgrad=True)

    # Output names used as keys of the loss / metric / target dicts; the
    # fine-tuned model is addressed with a trailing underscore.
    if FINETUNE:
        NER_out_name = 'crf_'
        CLS_out_name = 'cls_'
    else:
        NER_out_name = 'crf'
        CLS_out_name = 'cls'

    def _compile_and_fit(ner_loss_weight, epochs, batch_size):
        """Compile the model with the given NER loss weight, then fit it."""
        model.compile(
            optimizer=adam_optimizer,
            loss={
                NER_out_name: ner_loss_func,
                CLS_out_name: 'categorical_crossentropy'
            },
            loss_weights={
                NER_out_name: ner_loss_weight,
                CLS_out_name: 10.
            },  # set weight of loss
            metrics={
                NER_out_name: SequenceCorrectness(),
                CLS_out_name: 'categorical_accuracy'
            })

        model.fit(
            {
                'ner_input': ner_train_x,
                'cls_input': cls_train_x
            },
            {
                NER_out_name: ner_train_y,
                CLS_out_name: cls_train_y
            },
            epochs=epochs,
            batch_size=batch_size,
            # NOTE(review): 'auto' is not a documented tf.keras class_weight
            # value -- confirm it is handled by the TF version in use.
            class_weight={
                NER_out_name: None,
                CLS_out_name: 'auto'
            },  # cls loss multiply the class weights
            # Passed as a tuple, the documented (x_val, y_val) form.
            validation_data=({
                'ner_input': ner_test_x,
                'cls_input': cls_test_x
            }, {
                NER_out_name: ner_test_y,
                CLS_out_name: cls_test_y
            }),
            callbacks=callbacks_list,
        )

    # pretrain model -> train cls branch (NER loss weight is 0)
    _compile_and_fit(0., PRETRAIN_EPOCHS, PRETRAIN_BATCHSIZE)

    # train model
    _compile_and_fit(15., EPOCHS, BATCHSIZE)

    # save model
    model.save(create_file_dir_if_needed(config["h5_model_file"]))

    model.save_weights(create_file_dir_if_needed(config["h5_weights_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_or_rm_dir_if_needed(config["saved_model_dir"]))

    mtinput_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        converter_for_request=ConverterForMTRequest(),
        converter_for_response=ConverterForMTResponse_VirtualPad(
            prepad=CLS2NER_KEYWORD_LEN),
        lookup_tables={
            'vocab_lookup': vocabulary_lookuper,
            'tag_lookup': ner_tag_lookuper,
            'label_lookup': cls_label_lookuper
        },
        padding_parameter={
            "maxlen": MAX_SENTENCE_LEN,
            "value": 0,
            "padding": "post"
        },
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )