예제 #1
0
    steps = 100

    # 0 for `pad`, 1 for `unk`
    inputs = tf.random.uniform((batch_size, steps),
                               minval=0,
                               maxval=vocab_size + 2)
    """
    import numpy as np
    inputs = np.random.randint(low=0, high=vocab_size+2, size=steps*batch_size)\
        .reshape((batch_size, steps))
    """
    print('inputs', inputs.shape)

    softmax = rnnseg(inputs)
    print('softmax', softmax.shape)

    return inputs


if __name__ == '__main__':
    import os
    from cangjie.utils.config import get_model_dir

    vocab_size = 10

    # Build and compile the segmentation model.
    rnnseg = RNNSeg(vocab_size=vocab_size)
    # NOTE: the loss must be an *instance*. Passing the class itself makes
    # Keras call it as loss(y_true, y_pred), which misroutes the tensors
    # into the constructor's (from_logits, reduction) parameters.
    rnnseg.compile(optimizer=tf.optimizers.Adam(0.001),
                   loss=tf.losses.SparseCategoricalCrossentropy())

    # Run one forward pass so the model's weights/shapes are built.
    test_rnnseg_once(rnnseg=rnnseg)

    # Save an architecture diagram of the built model.
    model_img_path = os.path.join(get_model_dir(), "images", "rnn.png")
    plot_model(rnnseg, to_file=model_img_path, show_shapes=True)
예제 #2
0
                                        line[:-1],
                                        is_use_matching=is_use_matching,
                                        matching_method=matching_method,
                                        max_num_char=max_num_char,
                                        word_dict=word_dict)
            if seg_words is None:
                fw.write("\n")
            else:
                fw.write(" ".join(seg_words) + "\n")

    fw.close()


if __name__ == "__main__":
    data_dir = get_data_dir()
    model_dir = get_model_dir()

    model_path = os.path.join(model_dir, "hmm", "hmm.pkl")
    test_path = os.path.join(data_dir, "msr_test.utf8")
    test_result_path = os.path.join(data_dir, "msr_test_hmm.utf8")
    dict_path = os.path.join(data_dir, "msr.dict")

    word_dict = load_dictionary(dict_path=dict_path)
    print("Total number of words is: %d\n" % (len(word_dict)))

    hmm = HMM()
    hmm.load_model(model_path=model_path, is_training=False)

    seg_res = seg_on_sentence(hmm, sentence='黑夜给了我黑色的眼睛,我却用它寻找光明。')
    print("/".join(seg_res))
    seg_on_file(model=hmm,
예제 #3
0
def train_model():
    """Train the BiRNN-CRF segmentation model on the MSR corpus.

    Builds tf.data pipelines for the train/val splits, compiles the model
    with the CRF layer's own loss/accuracy, and trains with early stopping,
    TensorBoard logging, and best-only weight checkpointing.

    Returns:
        True on completion.
    """
    vocab_size = 3954  # count > min_char_count = 5
    total_num_train = 69000  # num_lines of msr_rnn_train.utf8
    total_num_val = 17300  # num_lines of msr_rnn_val.utf8

    epochs = 100
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    rnn_steps = 30

    embedding_dim = 64
    rnn_units = 32
    pad_index = 0  # pad_index, to mask in loss

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # +1 so the final partial batch is not dropped.
    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === tf.data.Dataset pipelines (already batched and padded).
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=pad_index)

    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=pad_index)

    # === model
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)
    optimizer = tf.keras.optimizers.Adam(0.001)

    # Use the CRF layer's loss/accuracy, which account for the transition
    # parameters and padding.
    crf = model.crf_layer
    model.compile(optimizer=optimizer,
                  loss=crf.loss,
                  metrics=[crf.accuracy])

    # === callbacks
    callbacks = []

    # Stop after 5 epochs without val_loss improvement; keep the best weights.
    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=5,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)

    tensorboard_cb = TensorBoard(
        log_dir=os.path.join(get_log_dir(), "rnn_model"))
    callbacks.append(tensorboard_cb)

    checkpoint_path = os.path.join(get_model_dir(), "rnn_model", "ckpt")
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    # `batch_size` must NOT be passed to fit() when the input is a
    # tf.data.Dataset (Keras raises a ValueError); batching is already
    # handled by the dataset pipeline.
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)

    # summary() prints the table itself; wrapping it in print() would also
    # emit a spurious "None".
    model.summary()

    return True
예제 #4
0
def segmentation():
    """Segment the MSR test set with the trained BiRNN-CRF model.

    Loads the latest checkpoint, predicts per-character labels for each
    test line, joins characters into words on E/S labels, and writes the
    space-separated result file.
    """
    vocab_size = 3954
    embedding_dim = 64
    rnn_units = 32

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_birnn_crf.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Build and compile model.
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)
    optimizer = tf.keras.optimizers.Adam(0.001)

    crf = model.crf_layer
    model.compile(optimizer=optimizer, loss=crf.loss, metrics=[crf.accuracy])

    # === Load weights from the most recent checkpoint.
    checkpoint_dir = os.path.join(get_model_dir(), "rnn_model")
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
    model.load_weights(checkpoint)

    # === Run once, to load weights of checkpoint.
    test_model_once(model=model, vocab_size=vocab_size)

    # Load separator_dict
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    # Context managers guarantee both files are closed even on error/break.
    with open(test_path, 'r', encoding='utf-8') as f, \
            open(seg_path, 'w', encoding='utf-8') as fw:

        line_cnt = 0
        for line in f:
            buf = line[:-1]  # strip the trailing newline

            labels = model_predict(model=model,
                                   char_list=buf,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            # Sanity check: one label per character.
            if len(buf) != len(labels):
                print("Wrong")
                print(buf, '\n', labels)
                print(len(buf), len(labels))
                break

            # {0: pad, 1: B, 2: M, 3: E, 4: S}
            words = []
            word = []
            for char, label in zip(buf, labels):
                word.append(char)
                if label == 3 or label == 4:  # E or S closes the word
                    words.append("".join(word))
                    word = []
            if word:  # flush a trailing unterminated word
                words.append("".join(word))
            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)

        print(line_cnt)
def train_seq2seq():
    """Train the encoder-decoder (seq2seq) segmentation model.

    Runs a manual training loop with per-batch progress logging, tracks the
    best validation loss for early stopping (patience epochs without
    improvement), and checkpoints only the best model.
    """
    vocab_size = 3954  # count > min_char_count = 5
    num_states = 4
    total_num_train = 69000  # num_lines of msr_rnn_train.utf8
    total_num_val = 17300  # num_lines of msr_rnn_val.utf8
    batch_size = 32

    epochs = 100
    shuffle_buffer_size = 1024 * 2
    rnn_steps = 30

    embedding_dim = 64
    rnn_units = 32

    # Early-stopping state: best val loss and the epoch it occurred in.
    min_val_loss = None
    opt_epoch = None
    patience = 5

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # +1 so the final partial batch is not dropped.
    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=0)

    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=0)

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)

    # === Optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)

    start = time.time()
    for epoch in range(epochs):
        epoch_start = time.time()

        # === train
        print('\nTraining...')
        train_loss = 0

        batch_start = time.time()
        # zip() with a range caps the (repeated) dataset at one epoch's
        # worth of batches.
        for batch, (inputs, targets) in zip(range(num_train_batch),
                                            train_dataset):
            cur_loss = train_step(encoder,
                                  decoder,
                                  optimizer,
                                  inputs,
                                  targets,
                                  mask=0)
            train_loss += cur_loss

            if (batch + 1) % 100 == 0:
                print(
                    "Epoch: %d/%d, batch: %d/%d, train_loss: %.4f, cur_loss: %.4f,"
                    % (epoch + 1, epochs, batch + 1, num_train_batch,
                       train_loss / (batch + 1), cur_loss),
                    end=" ")

                batch_end = time.time()
                batch_last = batch_end - batch_start
                print("lasts: %.2fs" % batch_last)

        train_loss /= num_train_batch
        print("Epoch: %d/%d, train_loss: %.4f" %
              (epoch + 1, epochs, train_loss))

        # === validate
        print("\nValidating...")
        val_loss = 0

        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_val_batch), val_dataset):
            # NOTE(review): this reuses train_step for validation, which
            # presumably applies gradient updates on validation data --
            # confirm and switch to a no-update evaluation step if so.
            cur_loss = train_step(encoder,
                                  decoder,
                                  optimizer,
                                  inputs,
                                  targets,
                                  mask=0)
            val_loss += cur_loss

            if (batch + 1) % 100 == 0:
                print(
                    "Epoch: %d/%d, batch: %d/%d, val_loss: %.4f, cur_loss: %.4f, "
                    % (epoch + 1, epochs, batch + 1, num_val_batch, val_loss /
                       (batch + 1), cur_loss),
                    end=" ")

                batch_end = time.time()
                batch_last = batch_end - batch_start
                print("lasts: %.2fs" % batch_last)

        val_loss /= num_val_batch
        print("Epoch: %d/%d, train_loss: %.4f, val_loss: %.4f, " %
              (epoch + 1, epochs, train_loss, val_loss),
              end=" ")

        epoch_end = time.time()
        epoch_last = epoch_end - epoch_start
        print("lasts: %.2fs" % epoch_last)

        # Early stopping: no improvement for more than `patience` epochs.
        if opt_epoch is not None:
            if epoch - opt_epoch > patience:
                # Fix: the original printed the bare format string with no
                # arguments; supply them (1-based, matching other logs).
                print("Stop training, epoch: %d, opt_epoch: %d" %
                      (epoch + 1, opt_epoch + 1))
                break

        if min_val_loss is None or val_loss < min_val_loss:
            min_val_loss = val_loss
            opt_epoch = epoch

            # === Save best model only.
            print("\nSaving...")
            print("Epoch: %d, train_loss: %.4f, val_loss: %.4f" %
                  (epoch + 1, train_loss, val_loss))
            checkpoint.save(file_prefix=checkpoint_prefix)

    print("Training done! min_val_loss=%.4f, opt_epoch=%d" %
          (min_val_loss, opt_epoch),
          end=" ")
    end = time.time()
    last = end - start
    print("Lasts: %.2fs" % last)
예제 #6
0
def segmentation():
    """Segment the MSR test set with the trained seq2seq model.

    Restores the encoder/decoder from the latest checkpoint, runs a random
    smoke test to build the graph, predicts per-character labels for each
    test line, and writes the space-separated words to the result file.
    """
    vocab_size = 3954
    embedding_dim = 64
    num_states = 4
    rnn_units = 32
    rnn_steps = 30

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_seq2seq.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    seq2seq = Seq2Seq(encoder=encoder, decoder=decoder)

    # === Optimizer (required by the checkpoint object's structure).
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Restore latest checkpoint; assert the model objects were matched.
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    status = checkpoint.restore(latest)
    status.assert_existing_objects_matched()

    # === Smoke test on random ids so variables are created/restored.
    batch_size = 2
    inputs = tf.random.uniform((batch_size, rnn_steps),
                               minval=0,
                               maxval=vocab_size + 2,
                               dtype=tf.int32)
    targets = tf.random.uniform((batch_size, rnn_steps),
                                minval=0,
                                maxval=num_states + 1,
                                dtype=tf.int32)
    test_seq2seq_once(encoder=encoder,
                      decoder=decoder,
                      inputs=inputs,
                      targets=targets)

    # === Test

    # Load separator_dict
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    # Context managers guarantee both files are closed even on error/break.
    with open(test_path, 'r', encoding='utf-8') as f, \
            open(seg_path, 'w', encoding='utf-8') as fw:

        line_cnt = 0
        for line in f:
            buf = line[:-1]  # strip the trailing newline

            labels = model_predict(model=seq2seq,
                                   char_list=buf,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            # Sanity check: one label per character.
            if len(buf) != len(labels):
                print("Wrong")
                print(buf, '\n', labels)
                print(len(buf), len(labels))
                break

            # {0: pad, 1: B, 2: M, 3: E, 4: S}
            words = []
            word = []
            for char, label in zip(buf, labels):
                word.append(char)
                if label == 3 or label == 4:  # E or S closes the word
                    words.append("".join(word))
                    word = []
            if word:  # flush a trailing unterminated word
                words.append("".join(word))
            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)