# ===== Example 1: segment the MSR test set with jieba =====
import jieba
from cangjie.utils.config import get_data_dir
import os

if __name__ == '__main__':
    # Segment the MSR test set with jieba and write one space-joined
    # sentence per line (same format as the gold-standard file).
    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_jieba.utf8")

    # Context managers guarantee both handles are closed even if
    # segmentation raises mid-file (the original leaked `fw` on error).
    with open(test_path, 'r', encoding='utf-8') as f, \
            open(seg_path, 'w', encoding='utf-8') as fw:
        for line in f:
            # line[:-1] drops the trailing newline.
            # NOTE(review): this also drops the last character when the
            # final line lacks a newline — confirm inputs end with '\n'.
            words = list(jieba.cut(line[:-1]))
            fw.write(" ".join(words) + '\n')

    print("Write Done!", seg_path)
# ===== Example 2: segment the MSR test set with dictionary matching =====
                sentence=line[:-1])

            fw.write(" ".join(line_seg_result) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)

    fw.close()
    print(line_cnt)

    return True


if __name__ == '__main__':
    # Segment the MSR test set with the dictionary-based bidirectional
    # maximum-matching ("bimm") segmenter and write the result file.
    data_dir = get_data_dir()
    seg_method = "bimm"
    max_word_len = 6  # longest match attempted, in characters

    dict_path = os.path.join(data_dir, "msr.dict")
    test_path = os.path.join(data_dir, "msr_test.utf8")
    result_path = os.path.join(data_dir, "msr_test_" + seg_method + ".utf8")

    word_dict = load_dictionary(dict_path=dict_path)
    print("Total number of words is: %d\n" % (len(word_dict)))

    seg_on_file(word_dict=word_dict,
                test_path=test_path,
                seg_path=result_path,
                method=seg_method,
                max_num_char=max_word_len)
# ===== Example 3: tf.data dataset wrapper and smoke test =====
                shuffle_buffer_size=1024,
                batch_size=16,
                steps=10,
                pad_index=0,
                char2id_dict=None):
    return dataset_generator(data_path=data_path,
                             epochs=epochs,
                             shuffle_buffer_size=shuffle_buffer_size,
                             batch_size=batch_size,
                             steps=steps,
                             pad_index=pad_index,
                             char2id_dict=char2id_dict)


if __name__ == "__main__":
    train_path = os.path.join(get_data_dir(), "msr_training_label.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    train_dataset = get_dataset(data_path=train_path,
                                batch_size=4,
                                steps=10,
                                char2id_dict=char2id_dict)

    # inputs: [batch_size, steps]  \in [0, 1, 2, ..., vocab_size]
    # outputs: [batch_size, steps] \in [0, 1, 2, 3, 4]

    for i, (inputs, outputs) in zip(range(2), train_dataset):
# ===== Example 4: train the BiRNN-CRF model =====
def train_model():
    """Train the BiRNN-CRF segmentation model on the MSR corpus.

    Streams (char-id, BMES-label) batches from the preprocessed
    train/val files, fits the model with early stopping, TensorBoard
    logging and best-only checkpointing.

    Returns:
        True on completion.
    """
    # Data sizes (fixed by the preprocessing scripts).
    vocab_size = 3954  # count > min_char_count = 5
    total_num_train = 69000  # num_lines of msr_rnn_train.utf8
    total_num_val = 17300  # num_lines of msr_rnn_val.utf8

    # Training hyper-parameters.
    epochs = 100
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    rnn_steps = 30  # sequence length per sample

    # Model hyper-parameters.
    embedding_dim = 64
    rnn_units = 32
    pad_index = 0  # pad_index, to mask in loss

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # Ceil-style batch counts so the tail partial batch is included.
    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === tf.data.Dataset pipelines.
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=pad_index)

    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=pad_index)

    # === Model: the CRF layer supplies both the loss and the metric.
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)
    optimizer = tf.keras.optimizers.Adam(0.001)

    crf = model.crf_layer
    model.compile(optimizer=optimizer,
                  loss=crf.loss,
                  metrics=[crf.accuracy])

    # === Callbacks: early stop on val_loss, TensorBoard logging,
    # best-weights-only checkpointing.
    checkpoint_path = os.path.join(get_model_dir(), "rnn_model", "ckpt")
    callbacks = [
        EarlyStopping(monitor='val_loss',
                      patience=5,
                      restore_best_weights=True),
        TensorBoard(log_dir=os.path.join(get_log_dir(), "rnn_model")),
        ModelCheckpoint(filepath=checkpoint_path,
                        save_weights_only=True,
                        save_best_only=True),
    ]

    # === Train.
    model.fit(train_dataset,
              batch_size=batch_size,
              epochs=epochs,
              steps_per_epoch=num_train_batch,
              validation_data=val_dataset,
              validation_steps=num_val_batch,
              callbacks=callbacks)

    # summary() prints itself and returns None; the original
    # `print(model.summary())` printed a spurious trailing "None".
    model.summary()

    return True
# ===== Example 5: split the labeled corpus into train/val =====
                    val_path=None,
                    train_ratio=None):
    fw_train = open(train_path, 'w', encoding='utf-8')
    fw_val = open(val_path, 'w', encoding='utf-8')

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            r = np.random.random()
            if r < train_ratio:
                fw_train.write(line)
            else:
                fw_val.write(line)

    fw_train.close()
    fw_val.close()


if __name__ == '__main__':
    # Randomly split the labeled corpus into train/val files (~80/20).
    data_dir = get_data_dir()
    labeled_path = os.path.join(data_dir, "msr_training_label.utf8")
    out_train_path = os.path.join(data_dir, "msr_rnn_train.utf8")
    out_val_path = os.path.join(data_dir, "msr_rnn_val.utf8")

    split_train_val(input_path=labeled_path,
                    train_path=out_train_path,
                    val_path=out_val_path,
                    train_ratio=0.8)

    print("Write done!", out_train_path)
# ===== Example 6: BiRNN-CRF inference and seq2seq training =====
def segmentation():
    """Segment the MSR test file with the trained BiRNN-CRF model.

    Loads the char-to-id vocabulary and the latest checkpoint, labels
    every character of msr_test.utf8 with {1: B, 2: M, 3: E, 4: S} and
    writes the space-joined words to msr_test_birnn_crf.utf8.
    """
    # Must match the hyper-parameters used at training time.
    vocab_size = 3954
    embedding_dim = 64
    rnn_units = 32

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_birnn_crf.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Build and compile model (CRF supplies loss and metric).
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)
    optimizer = tf.keras.optimizers.Adam(0.001)

    crf = model.crf_layer
    model.compile(optimizer=optimizer, loss=crf.loss, metrics=[crf.accuracy])

    # === Load the latest checkpoint weights.
    checkpoint_dir = os.path.join(get_model_dir(), "rnn_model")
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
    model.load_weights(checkpoint)

    # === Run once so the subclassed model builds its variables and the
    # checkpoint weights are actually materialized.
    test_model_once(model=model, vocab_size=vocab_size)

    # Separator characters are labeled directly via this lookup table.
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    # Context managers close both files even on an exception
    # (the original left `fw` open on error).
    line_cnt = 0
    with open(test_path, 'r', encoding='utf-8') as f, \
            open(seg_path, 'w', encoding='utf-8') as fw:
        for line in f:
            sentence = line[:-1]  # strip trailing newline

            labels = model_predict(model=model,
                                   char_list=sentence,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            # A length mismatch makes the prediction unusable: report
            # the offending line and stop.
            if len(sentence) != len(labels):
                print("Wrong")
                print(sentence, '\n', labels)
                print(len(sentence), len(labels))
                break

            # {0: pad, 1: B, 2: M, 3: E, 4: S}: a word ends on E or S.
            words = []
            word = []
            for char, label in zip(sentence, labels):
                word.append(char)
                if label == 3 or label == 4:
                    words.append("".join(word))
                    word = []
            if word:  # flush a trailing unterminated word
                words.append("".join(word))
            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)

        print(line_cnt)
def train_seq2seq():
    """Train the encoder/decoder (seq2seq) segmentation model on MSR data.

    Runs a manual training loop with per-epoch validation, saves only the
    best checkpoint (lowest validation loss) and stops early once the
    validation loss has not improved for more than `patience` epochs.
    """
    # Data / model sizes (fixed by the preprocessing scripts).
    vocab_size = 3954  # count > min_char_count = 5
    num_states = 4  # B/M/E/S label states
    total_num_train = 69000  # num_lines of msr_rnn_train.utf8
    total_num_val = 17300  # num_lines of msr_rnn_val.utf8
    batch_size = 32

    epochs = 100
    shuffle_buffer_size = 1024 * 2
    rnn_steps = 30

    embedding_dim = 64
    rnn_units = 32

    # Early-stopping state.
    min_val_loss = None
    opt_epoch = None
    patience = 5

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # Ceil-style batch counts so the tail partial batch is included.
    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === Dataset pipelines.
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=0)

    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=0)

    # === Model.
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)

    # === Optimizer.
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint (saved only when validation improves, below).
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)

    start = time.time()
    for epoch in range(epochs):
        epoch_start = time.time()

        # === Train one epoch.
        print('\nTraining...')
        train_loss = 0

        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_train_batch),
                                            train_dataset):
            cur_loss = train_step(encoder,
                                  decoder,
                                  optimizer,
                                  inputs,
                                  targets,
                                  mask=0)
            train_loss += cur_loss

            if (batch + 1) % 100 == 0:
                print(
                    "Epoch: %d/%d, batch: %d/%d, train_loss: %.4f, cur_loss: %.4f,"
                    % (epoch + 1, epochs, batch + 1, num_train_batch,
                       train_loss / (batch + 1), cur_loss),
                    end=" ")
                print("lasts: %.2fs" % (time.time() - batch_start))

        train_loss /= num_train_batch
        print("Epoch: %d/%d, train_loss: %.4f" %
              (epoch + 1, epochs, train_loss))

        # === Validate.
        # NOTE(review): this reuses train_step(), which presumably applies
        # gradient updates — if so the model is also trained on validation
        # data; confirm and switch to a forward-only evaluation step.
        print("\nValidating...")
        val_loss = 0

        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_val_batch), val_dataset):
            cur_loss = train_step(encoder,
                                  decoder,
                                  optimizer,
                                  inputs,
                                  targets,
                                  mask=0)
            val_loss += cur_loss

            if (batch + 1) % 100 == 0:
                print(
                    "Epoch: %d/%d, batch: %d/%d, val_loss: %.4f, cur_loss: %.4f, "
                    % (epoch + 1, epochs, batch + 1, num_val_batch, val_loss /
                       (batch + 1), cur_loss),
                    end=" ")
                print("lasts: %.2fs" % (time.time() - batch_start))

        val_loss /= num_val_batch
        print("Epoch: %d/%d, train_loss: %.4f, val_loss: %.4f, " %
              (epoch + 1, epochs, train_loss, val_loss),
              end=" ")
        print("lasts: %.2fs" % (time.time() - epoch_start))

        # Stop once no improvement for more than `patience` epochs.
        if opt_epoch is not None and epoch - opt_epoch > patience:
            # BUG FIX: the original printed the raw format string with no
            # arguments, emitting literal "%d" placeholders.
            print("Stop training, epoch: %d, opt_epoch: %d" %
                  (epoch, opt_epoch))
            break

        if min_val_loss is None or val_loss < min_val_loss:
            min_val_loss = val_loss
            opt_epoch = epoch

            # === Save best model only.
            print("\nSaving...")
            print("Epoch: %d, train_loss: %.4f, val_loss: %.4f" %
                  (epoch + 1, train_loss, val_loss))
            checkpoint.save(file_prefix=checkpoint_prefix)

    print("Training done! min_val_loss=%.4f, opt_epoch=%d" %
          (min_val_loss, opt_epoch),
          end=" ")
    print("Lasts: %.2fs" % (time.time() - start))
# ===== Example 8: generate per-character BMES labels =====
            labels = []

            for word in buf:
                if len(word) == 0:
                    continue
                elif len(word) == 1:
                    label = ['4']
                else:
                    label = ['2'] * len(word)
                    label[0] = '1'
                    label[-1] = '3'

                chars.extend(word)
                labels.extend(label)

            assert len(chars) == len(labels)
            fw.write("::".join(chars) + '\t' + "".join(labels) + '\n')

        fw.close()
        print("Write Done!", label_path)


if __name__ == '__main__':
    # Input: space-segmented MSR training corpus; output: one labeled
    # line per sentence produced by generate_label().
    data_path = os.path.join(get_data_dir(), "msr_training.utf8")
    label_path = os.path.join(get_data_dir(), "msr_training_label.utf8")
    # NOTE(review): this dict is loaded and its size printed, but it is
    # never passed to generate_label() below — possibly dead code. The
    # filename also differs from the "msr_training_char2id_dict.pkl"
    # used elsewhere in this file — confirm which path is intended.
    char2id_dict_path = os.path.join(get_data_dir(), "msr_training_rnn_dict.pkl")

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # Convert the segmented corpus into per-character BMES label lines.
    generate_label(input_path=data_path, label_path=label_path)
# ===== Example 9: seq2seq inference =====
def segmentation():
    """Segment the MSR test file with the trained seq2seq model.

    Restores the latest encoder/decoder checkpoint, labels every
    character of msr_test.utf8 with {1: B, 2: M, 3: E, 4: S} and writes
    the space-joined words to msr_test_seq2seq.utf8.
    """
    # Must match the hyper-parameters used at training time.
    vocab_size = 3954
    embedding_dim = 64
    num_states = 4
    rnn_units = 32
    rnn_steps = 30  # only used for the warm-up forward pass below

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_seq2seq.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Model.
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    seq2seq = Seq2Seq(encoder=encoder, decoder=decoder)

    # === Optimizer (tracked by the checkpoint even though no training
    # happens here).
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Restore the latest checkpoint; fail loudly on a structural
    # mismatch between the checkpoint and the model objects.
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    status = checkpoint.restore(latest)
    status.assert_existing_objects_matched()

    # === Warm-up forward pass on random ids so all model variables are
    # built and the restored weights are materialized.
    batch_size = 2
    inputs = tf.random.uniform((batch_size, rnn_steps),
                               minval=0,
                               maxval=vocab_size + 2,
                               dtype=tf.int32)
    targets = tf.random.uniform((batch_size, rnn_steps),
                                minval=0,
                                maxval=num_states + 1,
                                dtype=tf.int32)
    test_seq2seq_once(encoder=encoder,
                      decoder=decoder,
                      inputs=inputs,
                      targets=targets)

    # Separator characters are labeled directly via this lookup table.
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    # Context managers close both files even on an exception
    # (the original left `fw` open on error).
    line_cnt = 0
    with open(test_path, 'r', encoding='utf-8') as f, \
            open(seg_path, 'w', encoding='utf-8') as fw:
        for line in f:
            sentence = line[:-1]  # strip trailing newline

            labels = model_predict(model=seq2seq,
                                   char_list=sentence,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            # A length mismatch makes the prediction unusable: report
            # the offending line and stop.
            if len(sentence) != len(labels):
                print("Wrong")
                print(sentence, '\n', labels)
                print(len(sentence), len(labels))
                break

            # {0: pad, 1: B, 2: M, 3: E, 4: S}: a word ends on E or S.
            words = []
            word = []
            for char, label in zip(sentence, labels):
                word.append(char)
                if label == 3 or label == 4:
                    words.append("".join(word))
                    word = []
            if word:  # flush a trailing unterminated word
                words.append("".join(word))
            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)
# ===== Example 10: count word frequencies in the training corpus =====
from cangjie.utils.config import get_data_dir
import os, pickle


def count_word(input_path=None, word_cnt_dict_path=None):
    """Count word frequencies in a pre-segmented corpus.

    input_path: training data of segmentation, one sentence per line,
        words separated by a single space.
    word_cnt_dict_path: destination pickle of the {word: count} dict.
    """
    counts = {}
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # line[:-1] strips the trailing newline before splitting.
            for token in line[:-1].split(' '):
                counts[token] = counts.get(token, 0) + 1

    with open(word_cnt_dict_path, 'wb') as fw:
        pickle.dump(counts, fw)


if __name__ == '__main__':
    # Count word frequencies over the MSR training corpus and pickle
    # the resulting {word: count} dictionary.
    data_dir = get_data_dir()
    training_path = os.path.join(data_dir, "msr_training.utf8")
    cnt_dict_path = os.path.join(data_dir, "msr_training_word_cnt_dict.pkl")

    count_word(input_path=training_path, word_cnt_dict_path=cnt_dict_path)
    print('Write done!', cnt_dict_path)