def test_main(datadir):
    workshop_dir = datadir
    # TODO(howl-anderson): add a util to clean workshop
    # clean result dir first
    result_dir = os.path.join(workshop_dir, "./results")
    for target_dir in [
            os.path.join(result_dir, i) for i in [
                "h5_model",
                "model_dir",
                "saved_model",
                "summary_log_dir",
                "deliverable_model_dir",
            ]
    ]:
        create_dir_if_needed(target_dir)
        remove_content_in_dir(target_dir)

    config_file = os.path.join(workshop_dir, "./configure.yaml")

    os.environ["_DEFAULT_CONFIG_FILE"] = config_file

    # set current working directory to file directory
    os.chdir(workshop_dir)

    cli_keras.main()
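
# The test above relies on two small filesystem helpers from the project's utils.
# A minimal self-contained sketch of what they are assumed to do (not the
# project's actual implementation): create_dir_if_needed returns the directory
# path it just ensured, and remove_content_in_dir empties a directory in place.
import os
import shutil


def create_dir_if_needed(dir_path):
    os.makedirs(dir_path, exist_ok=True)
    return dir_path


def remove_content_in_dir(dir_path):
    for name in os.listdir(dir_path):
        path = os.path.join(dir_path, name)
        if os.path.isdir(path):
            shutil.rmtree(path)  # remove sub-directories recursively
        else:
            os.remove(path)  # remove plain files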
Example 2
def main():

    # get configure
    config = read_configure()

    # get train/test corpus
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    # process str data to onehot
    ner_tags_data = generate_tagset(corpus_meta_data["tags"])
    cls_tags_data = corpus_meta_data["labels"]

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    ner_tag_lookuper = Lookuper({v: i for i, v in enumerate(ner_tags_data)})
    cls_tag_lookuper = Lookuper({v: i for i, v in enumerate(cls_tags_data)})

    vocab_data_file = config.get("vocabulary_file")

    if not vocab_data_file:
        # load built in vocabulary file
        vocab_data_file = os.path.join(
            os.path.dirname(__file__), "../data/unicode_char_list.txt"
        )

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocess(data, maxlen, **kwargs):
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            label = offset_data.label
            words = offset_data.text

            tag_ids = [ner_tag_lookuper.lookup(i) for i in tags]
            label_id = cls_tag_lookuper.lookup(label)
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x.append(word_ids)
            raw_y_ner.append(tag_ids)
            raw_y_cls.append(label_id)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post"
        )  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post"
        )

        # one-hot encode the classification labels; cls_dims falls back to 81
        # when the caller does not supply it
        from tensorflow.keras.utils import to_categorical
        y_cls = np.array(raw_y_cls)
        y_cls = y_cls[:, np.newaxis]
        y_cls = to_categorical(y_cls, kwargs.get('cls_dims', 81))

        return x, y_ner, y_cls


    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    BATCHSIZE = config.get("batch_size", 32)
    LEARNINGRATE = config.get("learning_rate", 0.001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    # get Parameters (model structure)
    EMBED_DIM = config.get("embedding_dim", 300)
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False)
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False)
    CRF_PARAMS = config.get("crf_params", {})


    # get train/test data for training model
    vocab_size = vocabulary_lookuper.size()
    tag_size = ner_tag_lookuper.size()
    label_size = cls_tag_lookuper.size()

    train_x, train_y_ner, train_y_cls = preprocess(train_data, MAX_SENTENCE_LEN, cls_dims=label_size)
    test_x, test_y_ner, test_y_cls = preprocess(eval_data, MAX_SENTENCE_LEN, cls_dims=label_size)


    # build model
    input_length = MAX_SENTENCE_LEN
    input_layer = Input(shape=(input_length,), dtype='int32', name='input_layer')  # token id sequence

    # encoder
    with tf.keras.backend.name_scope("Encoder"):

        embedding_layer = Embedding(vocab_size,
                                    EMBED_DIM,
                                    mask_zero=True,
                                    input_length=input_length,
                                    name='embedding')(input_layer)

    # feature extractor
    with tf.keras.backend.name_scope("biLSTM"):
        if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
            embedding_layer = BatchNormalization()(embedding_layer)

        biLSTM = embedding_layer
        for bilstm_config in BiLSTM_STACK_CONFIG:
            biLSTM = Bidirectional(LSTM(return_sequences=True, **bilstm_config, name='biLSTM'))(biLSTM)

    if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
        biLSTM = BatchNormalization()(biLSTM)

    if USE_ATTENTION_LAYER:
        biLSTM = GlobalAttentionLayer()(biLSTM)

    # NER branch
    with tf.keras.backend.name_scope("NER_branch"):
        crf = CRF(tag_size, name="crf", **CRF_PARAMS)(biLSTM)
        loss_func = ConditionalRandomFieldLoss()


    # classification branch

    chosen = 'lstm_cls'
    with tf.keras.backend.name_scope("CLS_branch"):
        from tensorflow.keras.layers import Dense, Flatten, Dropout
        # add paragraph vector
        #paragraph_vector = get_paragraph_vector(embedding_layer)

        if chosen == "lstm_cls":
            cls_flat_lstm = Flatten()(biLSTM)
            #cls_flat_lstm = tf.keras.layers.concatenate([cls_flat_lstm, paragraph_vector])
            classification_dense = Dropout(0.2)(cls_flat_lstm)
            classification_dense = SetLearningRate(Dense(label_size, activation='sigmoid', name='CLS'), lr=0.001, is_ada=True)(classification_dense)

        elif chosen == "conv_cls":
            from tensorflow.keras.layers import Conv1D, MaxPooling1D
            embedding_layer = BatchNormalization()(embedding_layer)
            cls_conv_emb = Conv1D(32, 3, activation='relu', padding='same')(embedding_layer)
            cls_conv_emb = Conv1D(64, 3, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=1, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=2, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=5, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(256, 1, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)

            cls_flat = BatchNormalization()(cls_conv_emb)
            cls_flat = Flatten()(cls_flat)
            classification_dense = Dropout(0.2)(cls_flat)
            classification_dense = Dense(label_size, activation='sigmoid', name='CLS')(classification_dense)



    # merge NER and Classification
    model = Model(inputs=[input_layer], outputs=[crf, classification_dense])


    model.summary()

    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        # log_dir=create_dir_if_needed(config["summary_log_dir"])
        log_dir=os.path.join("results", "summary_log_dir"),  # portable path instead of a Windows-only literal
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    metrics_list = []

    metrics_list.append(crf_accuracy)
    metrics_list.append(SequenceCorrectness())
    metrics_list.append(sequence_span_accuracy)

    # early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',  # early stop index
    #                                               patience=3,          # early stop delay epoch
    #                                               verbose=2,           # display mode
    #                                               mode='auto')
    # callbacks_list.append(early_stop)

    from mtnlpmodel.trainer.loss_func_util import FocalLoss
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNINGRATE, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer=adam_optimizer,
                  #loss={'crf': loss_func, 'CLS': 'sparse_categorical_crossentropy'},
                  loss={'crf': loss_func, 'CLS': FocalLoss()},
                  loss_weights={'crf': 1., 'CLS': 100},  # set weight of loss
                  #metrics={'crf': SequenceCorrectness(), 'CLS': 'sparse_categorical_accuracy'} )
                  metrics={'crf': SequenceCorrectness(), 'CLS': 'categorical_accuracy'})

    model.fit(
        train_x,
        {'crf': train_y_ner, 'CLS': train_y_cls},
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        validation_data=[test_x,  {'crf': test_y_ner, 'CLS': test_y_cls}],
        callbacks=callbacks_list,
    )


    model.save(create_file_dir_if_needed(config["h5_model_file"]))
    model.save_weights(create_file_dir_if_needed(config["h5_weights_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_or_rm_dir_if_needed(config["saved_model_dir"])
    )


    mt_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        converter_for_request=ConverterForRequest(),
        converter_for_response=ConverterForMTResponse(),
        lookup_tables={'vocab_lookup':vocabulary_lookuper,
                       'tag_lookup':ner_tag_lookuper,
                       'label_lookup':cls_tag_lookuper},
        padding_parameter={"maxlen": MAX_SENTENCE_LEN, "value": 0, "padding": "post"},
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
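
# Example 2 (and the later examples) depend on Lookuper / index_table_from_file
# from the project's utilities. A minimal stand-in sketch, inferred only from how
# they are called above (dict construction, .lookup() and .size()); the real
# classes may differ, and the names below are illustrative:
class MinimalLookuper:
    def __init__(self, table):
        self._table = dict(table)

    def lookup(self, key):
        # unknown keys fall back to 0 here; the real Lookuper may handle OOV differently
        return self._table.get(key, 0)

    def size(self):
        return len(self._table)


def minimal_index_table_from_file(vocab_file):
    # one token per line, indexed by line number
    with open(vocab_file, encoding="utf-8") as f:
        tokens = [line.rstrip("\n") for line in f if line.strip()]
    return MinimalLookuper({token: idx for idx, token in enumerate(tokens)})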
Example 3
def main():
    config = read_configure()

    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    tags_data = generate_tagset(corpus_meta_data["tags"])

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    tag_lookuper = Lookuper({v: i for i, v in enumerate(tags_data)})

    vocab_data_file = config.get("vocabulary_file")

    if not vocab_data_file:
        # load built in vocabulary file
        vocab_data_file = os.path.join(os.path.dirname(__file__),
                                       "../data/unicode_char_list.txt")

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocess(data, maxlen):
        raw_x = []
        raw_y = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            words = offset_data.text

            tag_ids = [tag_lookuper.lookup(i) for i in tags]
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x.append(word_ids)
            raw_y.append(tag_ids)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post")  # right padding

        # left padded with -1. Indeed, any integer works as it will be masked
        # y_pos = pad_sequences(y_pos, maxlen, value=-1)
        # y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
        y = tf.keras.preprocessing.sequence.pad_sequences(raw_y,
                                                          maxlen,
                                                          value=0,
                                                          padding="post")

        return x, y

    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    train_x, train_y = preprocess(train_data, MAX_SENTENCE_LEN)
    test_x, test_y = preprocess(eval_data, MAX_SENTENCE_LEN)

    EPOCHS = config["epochs"]
    EMBED_DIM = config["embedding_dim"]
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False)
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False)
    CRF_PARAMS = config.get("crf_params", {})

    vocab_size = vocabulary_lookuper.size()
    tag_size = tag_lookuper.size()

    model = Sequential()

    model.add(
        Embedding(vocab_size,
                  EMBED_DIM,
                  mask_zero=True,
                  input_length=MAX_SENTENCE_LEN))

    if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
        model.add(BatchNormalization())

    for bilstm_config in BiLSTM_STACK_CONFIG:
        model.add(Bidirectional(LSTM(return_sequences=True, **bilstm_config)))

    if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
        model.add(BatchNormalization())

    if USE_ATTENTION_LAYER:
        model.add(GlobalAttentionLayer())

    model.add(CRF(tag_size, name="crf", **CRF_PARAMS))

    # print model summary
    model.summary()

    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=create_dir_if_needed(config["summary_log_dir"]))
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]),
                     "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    metrics_list = []

    metrics_list.append(SequenceCorrectness())
    metrics_list.append(SequenceSpanAccuracy())

    loss_func = ConditionalRandomFieldLoss()
    # loss_func = crf_loss

    model.compile("adam", loss={"crf": loss_func}, metrics=metrics_list)
    model.fit(
        train_x,
        train_y,
        epochs=EPOCHS,
        validation_data=[test_x, test_y],
        callbacks=callbacks_list,
    )

    # Save the model
    model.save(create_file_dir_if_needed(config["h5_model_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_dir_if_needed(config["saved_model_dir"]))

    export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        vocabulary_lookup_table=vocabulary_lookuper,
        tag_lookup_table=tag_lookuper,
        padding_parameter={
            "maxlen": MAX_SENTENCE_LEN,
            "value": 0,
            "padding": "post"
        },
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
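
# For reference, a hedged illustration of the mapping read_configure() is expected
# to return for Example 3. The key names come from the config.get() / config[...]
# accesses above; the values are placeholders (mirroring the results/ layout used
# by test_main), not the project's actual defaults:
example_config = {
    "epochs": 10,
    "embedding_dim": 300,
    "max_sentence_len": 25,
    "vocabulary_file": None,  # falls back to the bundled unicode_char_list.txt
    "use_attention_layer": False,
    "bilstm_stack_config": [{"units": 64}],
    "use_batch_normalization_after_embedding": False,
    "use_batch_normalization_after_bilstm": False,
    "crf_params": {},
    "summary_log_dir": "./results/summary_log_dir",
    "model_dir": "./results/model_dir",
    "h5_model_file": "./results/h5_model/model.h5",
    "saved_model_dir": "./results/saved_model",
    "deliverable_model_dir": "./results/deliverable_model_dir",
}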
Example 4
def train_model(train_inpf, eval_inpf, config, model_fn, model_name):
    estimator_params = copy.deepcopy(config)

    indices = [
        idx for idx, tag in enumerate(config["tags_data"])
        if tag.strip() != "O"
    ]
    num_tags = len(indices) + 1
    estimator_params["_indices"] = indices
    estimator_params["_num_tags"] = num_tags

    cfg = tf.estimator.RunConfig(
        save_checkpoints_secs=config["save_checkpoints_secs"])

    model_specific_name = "{model_name}-{batch_size}-{learning_rate}-{max_steps}-{max_steps_without_increase}".format(
        model_name=model_name,
        batch_size=config["batch_size"],
        learning_rate=config["learning_rate"],
        max_steps=config["max_steps"],
        max_steps_without_increase=config["max_steps_without_increase"],
    )

    instance_model_dir = os.path.join(config["model_dir"], model_specific_name)

    if config["use_tpu"]:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu=config["tpu_name"],
            zone=config["tpu_zone"],
            project=config["gcp_project"],
        )

        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=instance_model_dir,
            session_config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=True),
            tpu_config=tf.contrib.tpu.TPUConfig(),
        )

        tpu_estimator_params = copy.deepcopy(estimator_params)
        # remove reserved keys
        # tpu_estimator_params['train_batch_size'] = tpu_estimator_params['batch_size']
        del tpu_estimator_params["batch_size"]
        # del tpu_estimator_params['context']

        estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=model_fn,
            params=tpu_estimator_params,
            config=run_config,
            use_tpu=True,
            train_batch_size=estimator_params["batch_size"],
            eval_batch_size=estimator_params["batch_size"],
            predict_batch_size=estimator_params["batch_size"],
        )
    else:
        estimator = tf.estimator.Estimator(model_fn, instance_model_dir, cfg,
                                           estimator_params)

    # Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)
    utils.create_dir_if_needed(estimator.eval_dir())

    # hook_params = params['hook']['stop_if_no_increase']
    # hook = tf.contrib.estimator.stop_if_no_increase_hook(
    #     estimator, 'f1',
    #     max_steps_without_increase=hook_params['max_steps_without_increase'],
    #     min_steps=hook_params['min_steps'],
    #     run_every_secs=hook_params['run_every_secs']
    # )

    # build hooks from config
    train_hook = []
    for i in config.get("train_hook", []):
        class_ = class_from_module_path(i["class"])
        params = i["params"]
        if i.get("inject_whole_config", False):
            params["config"] = config
        train_hook.append(class_(**params))

    eval_hook = []
    for i in config.get("eval_hook", []):
        class_ = class_from_module_path(i["class"])
        params = i["params"]
        if i.get("inject_whole_config", False):
            params["config"] = config
        eval_hook.append(class_(**params))

    if eval_inpf:
        train_spec = tf.estimator.TrainSpec(input_fn=train_inpf,
                                            hooks=train_hook,
                                            max_steps=config["max_steps"])
        eval_spec = tf.estimator.EvalSpec(
            input_fn=eval_inpf,
            throttle_secs=config["throttle_secs"],
            hooks=eval_hook)
        evaluate_result, export_results = tf.estimator.train_and_evaluate(
            estimator, train_spec, eval_spec)
    else:
        estimator.train(input_fn=train_inpf,
                        hooks=train_hook,
                        max_steps=config["max_steps"])
        evaluate_result, export_results = {}, None

        # # Write predictions to file
    # def write_predictions(name):
    #     output_file = preds_file(name)
    #     with tf.io.gfile.GFile(output_file, 'w') as f:
    #         test_inpf = functools.partial(input_fn, fwords(name))
    #         golds_gen = generator_fn(fwords(name))
    #         preds_gen = estimator.predict(test_inpf)
    #         for golds, preds in zip(golds_gen, preds_gen):
    #             ((words, _), tags) = golds
    #             preds_tags = [i.decode() for i in preds['tags']]
    #             for word, tag, tag_pred in zip(words, tags, preds_tags):
    #                 # f.write(b' '.join([word, tag, tag_pred]) + b'\n')
    #                 f.write(' '.join([word, tag, tag_pred]) + '\n')
    #             # f.write(b'\n')
    #             f.write('\n')
    #
    # for name in ['train', 'test']:
    #     write_predictions(name)

    # export saved_model
    feature_spec = {
        # 'words': tf.placeholder(tf.int32, [None, None]),
        "words": tf.placeholder(tf.string, [None, None]),
        "words_len": tf.placeholder(tf.int32, [None]),
    }

    if config.get("forced_saved_model_dir"):
        instance_saved_dir = config.get("forced_saved_model_dir")
    else:
        instance_saved_dir = os.path.join(config["saved_model_dir"],
                                          model_specific_name)

    utils.create_dir_if_needed(instance_saved_dir)

    serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
        feature_spec)
    raw_final_saved_model = estimator.export_saved_model(
        instance_saved_dir,
        serving_input_receiver_fn,
        # assets_extra={
        #     'tags.txt': 'data/tags.txt',
        #     'vocab.txt': 'data/unicode_char_list.txt'
        # }
    )

    final_saved_model = raw_final_saved_model.decode("utf-8")

    return evaluate_result, export_results, final_saved_model
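
# A hedged inference sketch for the SavedModel exported by train_model above. It
# assumes the TF 1.x environment implied by tf.placeholder / tf.contrib; the
# export_dir argument stands in for the final_saved_model path that train_model
# returns, and the sample feed values are illustrative only:
from tensorflow.contrib import predictor


def load_and_predict(export_dir, words, words_len):
    predict_fn = predictor.from_saved_model(export_dir)
    # feature names match the feature_spec built in train_model
    return predict_fn({"words": words, "words_len": words_len})


# predictions = load_and_predict(final_saved_model,
#                                words=[["今", "天", "天", "气"]],
#                                words_len=[4])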
Example 5
def main():

    # get configure

    config = read_configure()

    # get train/test corpus
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    # process str data to onehot
    ner_tags_data = generate_tagset(corpus_meta_data["tags"])
    cls_tags_data = corpus_meta_data["labels"]

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    ner_tag_lookuper = Lookuper({v: i for i, v in enumerate(ner_tags_data)})
    cls_tag_lookuper = Lookuper({v: i for i, v in enumerate(cls_tags_data)})

    vocab_data_file = config.get("vocabulary_file")

    if not vocab_data_file:
        # load built in vocabulary file
        vocab_data_file = os.path.join(os.path.dirname(__file__),
                                       "../data/unicode_char_list.txt")

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocess(data, maxlen):
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            label = offset_data.label
            words = offset_data.text

            tag_ids = [ner_tag_lookuper.lookup(i) for i in tags]
            label_id = cls_tag_lookuper.lookup(label)
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x.append(word_ids)
            raw_y_ner.append(tag_ids)
            raw_y_cls.append(label_id)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post")  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(raw_y_ner,
                                                              maxlen,
                                                              value=0,
                                                              padding="post")

        y_cls = np.array(raw_y_cls)
        y_cls = y_cls[:, np.newaxis]

        return x, y_ner, y_cls

    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    BATCHSIZE = config.get("batch_size", 32)
    LEARNINGRATE = config.get("learning_rate", 0.0001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    # get Parameters (model structure)
    EMBED_DIM = config.get("embedding_dim", 300)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])

    # get train/test data for training model
    train_x, train_y_ner, train_y_cls = preprocess(train_data, MAX_SENTENCE_LEN)
    test_x, test_y_ner, test_y_cls = preprocess(eval_data, MAX_SENTENCE_LEN)

    vacab_size = vocabulary_lookuper.size()
    # tag_size = ner_tag_lookuper.size()
    label_size = cls_tag_lookuper.size()

    # fine-tuning related code

    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNINGRATE,
                                              beta_1=0.9,
                                              beta_2=0.999,
                                              amsgrad=False)
    index_dict = {
        'optimizer': adam_optimizer,
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['sparse_categorical_accuracy']
    }

    warm_start_list = ['embedding', 'bidirectional',
                       'batch_normalization']  # layers in this list are frozen

    backbone_model_path = './mtnlpmodel/trainer/fine_tuning_trainer/save_weights/weights.h5'

    output_dims = label_size

    # model structure related code

    # define new_layer for the task
    new_task_output_layer = Dense(
        output_dims, activation='softmax')  # new softmax layer -> output

    input_shape = MAX_SENTENCE_LEN
    input_layer = Input(shape=(input_shape, ),
                        dtype='int32',
                        name='input_layer')  # input

    # backbone_network + transfer_learning reuse the same backbone for a different task:
    # all you need is a new output layer for the new task (new_task_output_layer below).
    # This sample only covers text classification: the backbone output is the biLSTM
    # output, so transfer_learning adds a Flatten layer to connect it to the Dense layer.
    # Modify the structure inside transfer_learning to match your own task.
    base_model = backbone_network(
        BiLSTM_STACK_CONFIG,
        input_layer=input_layer,
        vacab_size=vacab_size,
        EMBED_DIM=EMBED_DIM,
        input_length=MAX_SENTENCE_LEN,
    )

    new_model = transfer_learning(input_shape, input_layer, base_model,
                                  new_task_output_layer, index_dict,
                                  backbone_model_path, warm_start_list)

    # model output related code
    new_model.summary()

    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        # log_dir=create_dir_if_needed(config["summary_log_dir"])
        log_dir=os.path.join("results", "summary_log_dir"),  # portable path instead of a Windows-only literal
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]),
                     "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    metrics_list = []

    metrics_list.append(crf_accuracy)
    metrics_list.append(SequenceCorrectness())
    metrics_list.append(sequence_span_accuracy)

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',  # metric monitored for early stopping
        patience=3,  # number of epochs with no improvement before stopping
        verbose=2,  # verbosity mode
        mode='auto')
    callbacks_list.append(early_stop)

    new_model.fit(
        train_x,
        train_y_cls,
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        validation_data=[test_x, test_y_cls],
        callbacks=callbacks_list,
    )

    new_model.save(create_file_dir_if_needed(config["h5_model_file"]))

    tf.keras.experimental.export_saved_model(
        new_model, create_or_rm_dir_if_needed(config["saved_model_dir"]))

    mt_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        vocabulary_lookup_table=vocabulary_lookuper,
        tag_lookup_table=ner_tag_lookuper,
        label_lookup_table=cls_tag_lookuper,
        padding_parameter={
            "maxlen": MAX_SENTENCE_LEN,
            "value": 0,
            "padding": "post"
        },
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
Example 6
vocab_size = vocabulary_lookuper.size()
tag_size = tag_lookuper.size()

model = Sequential()
model.add(Embedding(vocab_size, EMBED_DIM, mask_zero=True))
model.add(Bidirectional(LSTM(BiRNN_UNITS, return_sequences=True)))
model.add(CRF(tag_size, name='crf'))

# print model summary
model.summary()

callbacks_list = []

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=create_dir_if_needed(config['summary_log_dir']))
callbacks_list.append(tensorboard_callback)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    os.path.join(create_dir_if_needed(config['model_dir']),
                 'cp-{epoch:04d}.ckpt'),
    load_weights_on_restart=True,
    verbose=1)
callbacks_list.append(checkpoint_callback)

metrics_list = []

metrics_list.append(crf_accuracy)
metrics_list.append(SequenceCorrectness())
metrics_list.append(sequence_span_accuracy)
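
# Example 6 stops after assembling metrics_list. A hedged continuation that follows
# the same compile/fit pattern as Example 3 (train_x/train_y, test_x/test_y, EPOCHS
# and callbacks_list are assumed to have been prepared the same way as there):
loss_func = ConditionalRandomFieldLoss()

model.compile("adam", loss={"crf": loss_func}, metrics=metrics_list)
model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    validation_data=[test_x, test_y],
    callbacks=callbacks_list,
)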
Example 7
def train_model(train_inpf, eval_inpf, config, model_fn, model_name):
    # config=kwargs['config']
    # data_dir = kwargs.pop('data_dir', '.')
    # result_dir = kwargs.pop('result_dir', '.')
    # input_fn = kwargs.pop('input_fn', simple_input_fn)
    # generator_fn = kwargs.pop('generator_fn', simple_generator_fn)
    # model = kwargs.pop('model', None)
    # model_name = kwargs.pop('model_name', None)
    # model_fn = kwargs.pop('model_fn') if kwargs.get('model_fn') else getattr(model, 'model_fn')

    # model_fn = getattr(model, 'model_fn')
    # model_name = getattr(model, 'get_model_name')()

    # params = {
    #     'dim': 300,
    #     'dropout': 0.5,
    #     'num_oov_buckets': 1,
    #     'epochs': None,
    #     'batch_size': 20,
    #     'buffer': 15000,
    #     'lstm_size': 100,
    #     'words': utils.join_path(data_dir, './unicode_char_list.txt'),
    #     'lookup': utils.join_path(data_dir, './lookup.txt'),
    #     'chars': utils.join_path(data_dir, 'vocab.chars.txt'),
    #     'tags': utils.join_path(data_dir, './tags.txt'),
    #     'glove': utils.join_path(data_dir, './glove.npz'),
    #
    #     'model_dir': utils.join_path(result_dir, 'model_dir'),
    #     'params_log_file': utils.join_path(result_dir, 'params.json'),
    #
    #     'train': utils.join_path(data_dir, '{}.conllz'.format('train')),
    #     'test': utils.join_path(data_dir, '{}.conllz'.format('test')),
    #
    #     'preds': {
    #         'train': utils.join_path(result_dir, '{}.txt'.format('preds_train')),
    #         'test': utils.join_path(result_dir, '{}.txt'.format('preds_test')),
    #     },
    #
    #     'optimizer_params': {},
    #
    #     'saved_model_dir': utils.join_path(result_dir, 'saved_model'),
    #
    #     'hook': {
    #         'stop_if_no_increase': {
    #             'min_steps': 100,
    #             'run_every_secs': 60,
    #             'max_steps_without_increase': 20
    #         }
    #     },
    #
    #     'train_spec': {
    #         'max_steps': 5000
    #     },
    #     'eval_spec': {
    #         'throttle_secs': 60
    #     },
    #
    #     'estimator': {
    #         'save_checkpoints_secs': 120
    #     },
    #
    #
    #     'embedding': {
    #         'vocabulary_size': 128003
    #     },
    #
    #     'use_tpu': False,
    #     'tpu_config': {
    #         'tpu_name': None,
    #         'zone': None,
    #         'gcp_project': None
    #     }
    # }

    # # update from kwargs
    # params.update(kwargs)
    #
    # train_inpf = params.pop('train_inpf')
    # eval_inpf = params.pop('eval_inpf')

    # with tf.io.gfile.GFile(config['params_log_file'], 'w') as f:
    #     json.dump(params, f, indent=4, sort_keys=True)

    # def fwords(name):
    #     return params[name]
    #
    # def preds_file(name):
    #     return params['preds'][name]

    # # Estimator, train and evaluate
    # if not train_inpf:
    #     train_inpf = functools.partial(input_fn, input_file=fwords('train'),
    #                                    config=params, shuffle_and_repeat=True)
    #
    # if not eval_inpf:
    #     eval_inpf = functools.partial(input_fn, input_file=fwords('test'))

    estimator_params = copy.deepcopy(config)

    # estimator_params = {
    #     'config': config,
    #     'depends': {}
    # }
    # estimator_params.update({
    #     'words_feature_columns': words_feature_columns,
    #     'words_len_feature_columns': words_len_feature_columns
    # })

    indices = [
        idx for idx, tag in enumerate(config['tags_data'])
        if tag.strip() != 'O'
    ]
    num_tags = len(indices) + 1
    estimator_params['_indices'] = indices
    estimator_params['_num_tags'] = num_tags

    cfg = tf.estimator.RunConfig(
        save_checkpoints_secs=config['save_checkpoints_secs'])

    model_specific_name = '{model_name}-{batch_size}-{learning_rate}-{max_steps}-{max_steps_without_increase}'.format(
        model_name=model_name,
        batch_size=config['batch_size'],
        learning_rate=config['learning_rate'],
        max_steps=config['max_steps'],
        max_steps_without_increase=config['max_steps_without_increase'])

    instance_model_dir = os.path.join(config['model_dir'], model_specific_name)

    if config['use_tpu']:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu=config['tpu_name'],
            zone=config['tpu_zone'],
            project=config['gcp_project'])

        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=instance_model_dir,
            session_config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=True),
            tpu_config=tf.contrib.tpu.TPUConfig(),
        )

        tpu_estimator_params = copy.deepcopy(estimator_params)
        # remove reserved keys
        # tpu_estimator_params['train_batch_size'] = tpu_estimator_params['batch_size']
        del tpu_estimator_params['batch_size']
        # del tpu_estimator_params['context']

        estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=model_fn,
            params=tpu_estimator_params,
            config=run_config,
            use_tpu=True,
            train_batch_size=estimator_params['batch_size'],
            eval_batch_size=estimator_params['batch_size'],
            predict_batch_size=estimator_params['batch_size'])
    else:
        estimator = tf.estimator.Estimator(model_fn, instance_model_dir, cfg,
                                           estimator_params)

    # Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)
    utils.create_dir_if_needed(estimator.eval_dir())

    # hook_params = params['hook']['stop_if_no_increase']
    # hook = tf.contrib.estimator.stop_if_no_increase_hook(
    #     estimator, 'f1',
    #     max_steps_without_increase=hook_params['max_steps_without_increase'],
    #     min_steps=hook_params['min_steps'],
    #     run_every_secs=hook_params['run_every_secs']
    # )

    # build hooks from config
    train_hook = []
    for i in config.get('train_hook', []):
        class_ = class_from_module_path(i['class'])
        params = i['params']
        if i.get('inject_whole_config', False):
            params['config'] = config
        train_hook.append(class_(**params))

    eval_hook = []
    for i in config.get('eval_hook', []):
        class_ = class_from_module_path(i['class'])
        params = i['params']
        if i.get('inject_whole_config', False):
            params['config'] = config
        eval_hook.append(class_(**params))

    if eval_inpf:
        train_spec = tf.estimator.TrainSpec(input_fn=train_inpf,
                                            hooks=train_hook,
                                            max_steps=config['max_steps'])
        eval_spec = tf.estimator.EvalSpec(
            input_fn=eval_inpf,
            throttle_secs=config['throttle_secs'],
            hooks=eval_hook)
        evaluate_result, export_results = tf.estimator.train_and_evaluate(
            estimator, train_spec, eval_spec)
    else:
        estimator.train(input_fn=train_inpf,
                        hooks=train_hook,
                        max_steps=config['max_steps'])
        evaluate_result, export_results = {}, None

        # # Write predictions to file
    # def write_predictions(name):
    #     output_file = preds_file(name)
    #     with tf.io.gfile.GFile(output_file, 'w') as f:
    #         test_inpf = functools.partial(input_fn, fwords(name))
    #         golds_gen = generator_fn(fwords(name))
    #         preds_gen = estimator.predict(test_inpf)
    #         for golds, preds in zip(golds_gen, preds_gen):
    #             ((words, _), tags) = golds
    #             preds_tags = [i.decode() for i in preds['tags']]
    #             for word, tag, tag_pred in zip(words, tags, preds_tags):
    #                 # f.write(b' '.join([word, tag, tag_pred]) + b'\n')
    #                 f.write(' '.join([word, tag, tag_pred]) + '\n')
    #             # f.write(b'\n')
    #             f.write('\n')
    #
    # for name in ['train', 'test']:
    #     write_predictions(name)

    # export saved_model
    feature_spec = {
        # 'words': tf.placeholder(tf.int32, [None, None]),
        'words': tf.placeholder(tf.string, [None, None]),
        'words_len': tf.placeholder(tf.int32, [None]),
    }

    if config.get('forced_saved_model_dir'):
        instance_saved_dir = config.get('forced_saved_model_dir')
    else:
        instance_saved_dir = os.path.join(config['saved_model_dir'],
                                          model_specific_name)

    utils.create_dir_if_needed(instance_saved_dir)

    serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
        feature_spec)
    raw_final_saved_model = estimator.export_saved_model(
        instance_saved_dir,
        serving_input_receiver_fn,
        # assets_extra={
        #     'tags.txt': 'data/tags.txt',
        #     'vocab.txt': 'data/unicode_char_list.txt'
        # }
    )

    final_saved_model = raw_final_saved_model.decode('utf-8')

    return evaluate_result, export_results, final_saved_model
Example 8
def main():
    # get configure
    config = _read_configure("./configure.yaml")

    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    PRETRAIN_EPOCHS = config.get("pretrain_cls", 5)
    BATCHSIZE = config.get("batch_size", 32)
    PRETRAIN_BATCHSIZE = config.get("pretrain_batchsize", 32)
    LEARNINGRATE = config.get("learning_rate", 0.001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)
    LRDECAY = config.get('lr_decay', False)
    EARLYSTOP = config.get('early_stop', False)

    # get Parameters (model select)
    MODEL_CHOICE = config.get("model_choice", "VIRTUAL_EMBEDDING")
    FINETUNE = config.get("finetune", False)

    # get Parameters (model structure)
    CLS2NER_KEYWORD_LEN = config.get("cls2ner_keyword_len", 5)
    EMBED_DIM = config.get("embedding_dim", 128)
    ARCLOSS = config.get("Arcloss", True)
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    CRF_PARAMS = config.get("crf_params", {})

    # get preprocessed input data dict
    from mtnlpmodel.utils.input_process_util import input_data_process
    # To keep the training environment reproducible, the input data should be fixed:
    # shuffle the corpus and remove duplicates outside the trainer before running the program.

    # different model structures require different input processing
    if MODEL_CHOICE in ('VIRTUAL_EMBEDDING', 'CLS2NER_INPUT'):
        data_dict = input_data_process(
            config,
            MAX_SENTENCE_LEN=MAX_SENTENCE_LEN,  # preprocess the input data
            CLS2NER_KEYWORD_LEN=CLS2NER_KEYWORD_LEN,
        )
    else:
        data_dict = input_data_process(
            config,
            MAX_SENTENCE_LEN=MAX_SENTENCE_LEN,  # preprocess the input data
            CLS2NER_KEYWORD_LEN=0,
        )
        PRETRAIN_EPOCHS = 0

    # get lookupers
    ner_tag_lookuper = data_dict['ner_tag_lookuper']
    cls_label_lookuper = data_dict['cls_label_lookuper']
    vocabulary_lookuper = data_dict['vocabulary_lookuper']

    # get train/test data for training model
    ner_train_x, ner_train_y = data_dict['ner_train_x'], data_dict[
        'ner_train_y']
    ner_test_x, ner_test_y = data_dict['ner_test_x'], data_dict['ner_test_y']

    cls_train_x, cls_train_y = data_dict['cls_train_x'], data_dict[
        'cls_train_y']
    cls_test_x, cls_test_y = data_dict['cls_test_x'], data_dict['cls_test_y']

    # build model or finetuning
    from mtnlpmodel.core import build_model, finetune_model, get_freeze_list_for_finetuning
    params = {
        'EMBED_DIM': EMBED_DIM,
        'PRETRAIN_EPOCHS': PRETRAIN_EPOCHS,
        'BiLSTM_STACK_CONFIG': BiLSTM_STACK_CONFIG,
        'MAX_SENTENCE_LEN': MAX_SENTENCE_LEN,
        'CLS2NER_KEYWORD_LEN': CLS2NER_KEYWORD_LEN,
        'USE_ATTENTION_LAYER': USE_ATTENTION_LAYER,
        'Arcloss': ARCLOSS,
        'ner_tag_lookuper': ner_tag_lookuper,
        'cls_label_lookuper': cls_label_lookuper,
        'vocabulary_lookuper': vocabulary_lookuper,
        'CRF_PARAMS': CRF_PARAMS
    }
    model_choice = MODEL_CHOICE  # VIRTUAL_EMBEDDING, CLS2NER_INPUT, OTHER
    print("Model structure choosing {}".format(model_choice))

    from mtnlpmodel.core import finetuning_logger
    if FINETUNE:  # fine-tune the model: load it from the saved weights
        recommend_freeze_list = get_freeze_list_for_finetuning(
            model_choice
        )  # modify this list to customize which layers are frozen
        model_weights_path = os.path.abspath(
            './results/h5_weights/weights.h5')  # load from weights
        finetuning_logger(model_weights_path,
                          recommend_freeze_list)  # log the fine-tuning setup
        model = finetune_model(model_choice, model_weights_path,
                               recommend_freeze_list, **params)

    else:  # train from a random initialization (a fresh start)
        model = build_model(model_choice, **params)  # build the model

    model.summary()

    # build callbacks list
    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        # log_dir=create_dir_if_needed(config["summary_log_dir"])
        log_dir=os.path.join("results", "summary_log_dir"),  # portable path instead of a Windows-only literal
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]),
                     "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    # early stop util
    if EARLYSTOP:
        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',  # metric monitored for early stopping
            patience=3,  # number of epochs with no improvement before stopping
            verbose=2,  # verbosity mode
            mode='auto')
        callbacks_list.append(early_stop)

    #learning rate decay util
    if LRDECAY:
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                         factor=0.75,
                                                         patience=3,
                                                         verbose=1,
                                                         mode='auto',
                                                         min_delta=0.0001,
                                                         cooldown=0,
                                                         min_lr=0.00001)
        callbacks_list.append(reduce_lr)

    # ner_loss_func
    ner_loss_func = ConditionalRandomFieldLoss()

    # set optimizer
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNINGRATE,
                                              beta_1=0.9,
                                              beta_2=0.999,
                                              amsgrad=True)

    if FINETUNE:
        NER_out_name = 'crf_'
        CLS_out_name = 'cls_'
    else:
        NER_out_name = 'crf'
        CLS_out_name = 'cls'

    # pretrain model -> train cls branch
    model.compile(
        optimizer=adam_optimizer,
        loss={
            NER_out_name: ner_loss_func,
            CLS_out_name: 'categorical_crossentropy'
        },
        loss_weights={
            NER_out_name: 0.,
            CLS_out_name: 10.
        },  # set weight of loss
        metrics={
            NER_out_name: SequenceCorrectness(),
            CLS_out_name: 'categorical_accuracy'
        })

    model.fit(
        {
            'ner_input': ner_train_x,
            'cls_input': cls_train_x
        },
        {
            NER_out_name: ner_train_y,
            CLS_out_name: cls_train_y
        },
        epochs=PRETRAIN_EPOCHS,
        batch_size=PRETRAIN_BATCHSIZE,
        class_weight={
            NER_out_name: None,
            CLS_out_name: 'auto'
        },  # apply class weights to the cls loss
        validation_data=[{
            'ner_input': ner_test_x,
            'cls_input': cls_test_x
        }, {
            NER_out_name: ner_test_y,
            CLS_out_name: cls_test_y
        }],
        callbacks=callbacks_list,
    )

    # train model
    model.compile(
        optimizer=adam_optimizer,
        loss={
            NER_out_name: ner_loss_func,
            CLS_out_name: 'categorical_crossentropy'
        },
        loss_weights={
            NER_out_name: 15.,
            CLS_out_name: 10.
        },  # set weight of loss
        metrics={
            NER_out_name: SequenceCorrectness(),
            CLS_out_name: 'categorical_accuracy'
        })

    model.fit(
        {
            'ner_input': ner_train_x,
            'cls_input': cls_train_x
        },
        {
            NER_out_name: ner_train_y,
            CLS_out_name: cls_train_y
        },
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        class_weight={
            NER_out_name: None,
            CLS_out_name: 'auto'
        },  # apply class weights to the cls loss
        validation_data=[{
            'ner_input': ner_test_x,
            'cls_input': cls_test_x
        }, {
            NER_out_name: ner_test_y,
            CLS_out_name: cls_test_y
        }],
        callbacks=callbacks_list,
    )

    # save model
    model.save(create_file_dir_if_needed(config["h5_model_file"]))

    model.save_weights(create_file_dir_if_needed(config["h5_weights_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_or_rm_dir_if_needed(config["saved_model_dir"]))

    mtinput_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        converter_for_request=ConverterForMTRequest(),
        converter_for_response=ConverterForMTResponse_VirtualPad(
            prepad=CLS2NER_KEYWORD_LEN),
        lookup_tables={
            'vocab_lookup': vocabulary_lookuper,
            'tag_lookup': ner_tag_lookuper,
            'label_lookup': cls_label_lookuper
        },
        padding_parameter={
            "maxlen": MAX_SENTENCE_LEN,
            "value": 0,
            "padding": "post"
        },
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
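
# A hedged client-side sketch of preparing a single query so that it matches the
# padding_parameter exported above. The lookuper argument stands in for
# vocabulary_lookuper, maxlen for MAX_SENTENCE_LEN; the sample text is illustrative:
import tensorflow as tf


def encode_query(text, lookuper, maxlen):
    ids = [[lookuper.lookup(ch) for ch in text]]
    return tf.keras.preprocessing.sequence.pad_sequences(
        ids, maxlen=maxlen, value=0, padding="post")


# x = encode_query("今天天气不错", vocabulary_lookuper, MAX_SENTENCE_LEN)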