コード例 #1
0
                batch_size=16,
                steps=10,
                pad_index=0,
                char2id_dict=None):
    return dataset_generator(data_path=data_path,
                             epochs=epochs,
                             shuffle_buffer_size=shuffle_buffer_size,
                             batch_size=batch_size,
                             steps=steps,
                             pad_index=pad_index,
                             char2id_dict=char2id_dict)


if __name__ == "__main__":
    # Smoke test for the dataset pipeline: build a small batched dataset
    # from the labeled MSR training corpus and print a couple of batch shapes.
    train_path = os.path.join(get_data_dir(), "msr_training_label.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # char -> integer id mapping used to encode the input characters.
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    train_dataset = get_dataset(data_path=train_path,
                                batch_size=4,
                                steps=10,
                                char2id_dict=char2id_dict)

    # inputs: [batch_size, steps]  \in [0, 1, 2, ..., vocab_size]
    # outputs: [batch_size, steps] \in [0, 1, 2, 3, 4]

    # Take only the first 2 batches: zip() stops pulling from the (possibly
    # epoch-repeated) dataset iterator once range(2) is exhausted.
    for i, (inputs, outputs) in zip(range(2), train_dataset):
        print(i, inputs.shape, outputs.shape)
コード例 #2
0
def train_seq2seq():
    """Train the seq2seq (Encoder/Decoder) word-segmentation model.

    Runs a manual per-batch train/validate loop over the MSR corpus,
    checkpoints the model whenever the validation loss improves, and stops
    early once the validation loss has not improved for `patience` epochs.
    """
    vocab_size = 3954  # count > min_char_count = 5
    num_states = 4  # output label states (pad excluded from this count)
    total_num_train = 69000  # num_lines of msr_rnn_train.utf8
    total_num_val = 17300  # num_lines of msr_rnn_val.utf8
    batch_size = 32

    epochs = 100
    shuffle_buffer_size = 1024 * 2
    rnn_steps = 30

    embedding_dim = 64
    rnn_units = 32

    min_val_loss = None  # lowest validation loss seen so far
    opt_epoch = None  # epoch index that achieved min_val_loss
    patience = 5  # early-stopping patience, in epochs

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # +1 so the final partial batch is also consumed.
    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=0)

    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=0)

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)

    # === Optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)

    start = time.time()
    for epoch in range(epochs):
        epoch_start = time.time()

        # === train
        print('\nTraining...')
        train_loss = 0

        batch_start = time.time()
        # zip() caps the dataset iterator at num_train_batch batches/epoch.
        for batch, (inputs, targets) in zip(range(num_train_batch),
                                            train_dataset):
            cur_loss = train_step(encoder,
                                  decoder,
                                  optimizer,
                                  inputs,
                                  targets,
                                  mask=0)
            train_loss += cur_loss

            if (batch + 1) % 100 == 0:
                print(
                    "Epoch: %d/%d, batch: %d/%d, train_loss: %.4f, cur_loss: %.4f,"
                    % (epoch + 1, epochs, batch + 1, num_train_batch,
                       train_loss / (batch + 1), cur_loss),
                    end=" ")

                batch_end = time.time()
                batch_last = batch_end - batch_start
                print("lasts: %.2fs" % batch_last)

        train_loss /= num_train_batch
        print("Epoch: %d/%d, train_loss: %.4f" %
              (epoch + 1, epochs, train_loss))

        # === validate
        print("\nValidating...")
        val_loss = 0

        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_val_batch), val_dataset):
            # NOTE(review): this reuses train_step, which presumably applies
            # gradient updates — i.e. validation also trains on the val set.
            # An evaluation-only step would be preferable; kept as-is here to
            # preserve existing behavior since train_step's internals are not
            # visible in this file.
            cur_loss = train_step(encoder,
                                  decoder,
                                  optimizer,
                                  inputs,
                                  targets,
                                  mask=0)
            val_loss += cur_loss

            if (batch + 1) % 100 == 0:
                print(
                    "Epoch: %d/%d, batch: %d/%d, val_loss: %.4f, cur_loss: %.4f, "
                    % (epoch + 1, epochs, batch + 1, num_val_batch, val_loss /
                       (batch + 1), cur_loss),
                    end=" ")

                batch_end = time.time()
                batch_last = batch_end - batch_start
                print("lasts: %.2fs" % batch_last)

        val_loss /= num_val_batch
        print("Epoch: %d/%d, train_loss: %.4f, val_loss: %.4f, " %
              (epoch + 1, epochs, train_loss, val_loss),
              end=" ")

        epoch_end = time.time()
        epoch_last = epoch_end - epoch_start
        print("lasts: %.2fs" % epoch_last)

        # Early stopping: give up once no improvement for `patience` epochs.
        if opt_epoch is not None:
            if epoch - opt_epoch > patience:
                # BUG FIX: the original passed no arguments to the format
                # string, so the literal "%d" placeholders were printed.
                print("Stop training, epoch: %d, opt_epoch: %d" %
                      (epoch, opt_epoch))
                break

        if min_val_loss is None or val_loss < min_val_loss:
            min_val_loss = val_loss
            opt_epoch = epoch

            # === Save best model only.
            print("\nSaving...")
            print("Epoch: %d, train_loss: %.4f, val_loss: %.4f" %
                  (epoch + 1, train_loss, val_loss))
            checkpoint.save(file_prefix=checkpoint_prefix)

    print("Training done! min_val_loss=%.4f, opt_epoch=%d" %
          (min_val_loss, opt_epoch),
          end=" ")
    end = time.time()
    last = end - start
    print("Lasts: %.2fs" % last)
コード例 #3
0
def train_model():
    """Train the BiRNN-CRF word-segmentation model via Keras `fit`.

    Builds batched train/validation `tf.data` pipelines, compiles the model
    with the CRF layer's loss and accuracy, and trains with early stopping,
    TensorBoard logging, and best-only weight checkpointing.

    Returns:
        True on completion.
    """
    vocab_size = 3954  # count > min_char_count = 5
    total_num_train = 69000  # num_lines of msr_rnn_train.utf8
    total_num_val = 17300  # num_lines of msr_rnn_val.utf8

    epochs = 100
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    rnn_steps = 30

    embedding_dim = 64
    rnn_units = 32
    pad_index = 0  # pad_index, to mask in loss

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # +1 so the final partial batch is also consumed.
    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === tf.data.Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=pad_index)

    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=pad_index)

    # === model
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)
    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # Use the CRF layer's own loss/accuracy rather than plain cross-entropy.
    crf = model.crf_layer
    model.compile(
        optimizer=optimizer,
        loss=crf.loss,
        metrics=[crf.accuracy])

    # callbacks
    callbacks = []

    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=5,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)

    tensorboard_cb = TensorBoard(
        log_dir=os.path.join(get_log_dir(), "rnn_model"))
    callbacks.append(tensorboard_cb)

    checkpoint_path = os.path.join(get_model_dir(), "rnn_model", "ckpt")
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    # BUG FIX: `batch_size` must not be passed to fit() when the input is a
    # tf.data.Dataset (the dataset is already batched); TF2 raises a
    # ValueError for that combination.
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)

    # BUG FIX: model.summary() prints the summary itself and returns None,
    # so wrapping it in print() emitted a stray "None".
    model.summary()

    return True
コード例 #4
0
def segmentation():
    """Segment the MSR test set with the trained seq2seq model.

    Restores the latest seq2seq checkpoint, runs a quick smoke test on
    random tensors, then predicts BMES-style labels line by line and writes
    the space-separated segmentation to `msr_test_seq2seq.utf8`.
    """
    vocab_size = 3954
    embedding_dim = 64
    num_states = 4
    rnn_units = 32
    rnn_steps = 30

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_seq2seq.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    seq2seq = Seq2Seq(encoder=encoder, decoder=decoder)

    # === Optimizer (tracked by the checkpoint, so it must exist to restore)
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint: restore the most recent saved weights.
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    status = checkpoint.restore(latest)
    status.assert_existing_objects_matched()

    # === Test once on random tensors to exercise the restored model.
    batch_size = 2
    inputs = tf.random.uniform((batch_size, rnn_steps),
                               minval=0,
                               maxval=vocab_size + 2,
                               dtype=tf.int32)
    targets = tf.random.uniform((batch_size, rnn_steps),
                                minval=0,
                                maxval=num_states + 1,
                                dtype=tf.int32)
    test_seq2seq_once(encoder=encoder,
                      decoder=decoder,
                      inputs=inputs,
                      targets=targets)

    # === Test

    # Load separator_dict
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    # BUG FIX: open both files with context managers so they are closed even
    # on an exception or the early `break` below (the original leaked `fw`).
    with open(test_path, 'r', encoding='utf-8') as f, \
            open(seg_path, 'w', encoding='utf-8') as fw:

        line_cnt = 0
        for line in f:
            # BUG FIX: strip only a trailing newline; the original `line[:-1]`
            # would truncate a real character on a final line lacking '\n'.
            buf = line.rstrip('\n')

            labels = model_predict(model=seq2seq,
                                   char_list=buf,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            if len(buf) != len(labels):
                print("Wrong")
                print(buf, '\n', labels)
                print(len(buf), len(labels))
                break

            # {0: pad, 1: B, 2: M, 3: E, 4: S}
            # Close a word on E (end-of-word) or S (single-char word).
            words = []
            word = []
            for ch, label in zip(buf, labels):
                word.append(ch)
                if label == 3 or label == 4:
                    words.append("".join(word))
                    word = []
            if len(word) > 0:
                words.append("".join(word))
            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)