Code example #1
File: train.py  Project: djk111/pycorrector
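These excerpts show only the train() function; they assume train.py's module-level imports and helpers (numpy as np, os, a module logger, CGEDReader, create_model, callback, sklearn's train_test_split, and the project's save/load and evaluation utilities).
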
def train(train_path=None,
          save_model_path=None,
          batch_size=64,
          epochs=10,
          rnn_hidden_dim=200):
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])

    input_characters = data_reader.read_vocab(input_texts)
    target_characters = data_reader.read_vocab(target_texts)
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_len = max([len(text) for text in input_texts])
    max_decoder_seq_len = max([len(text) for text in target_texts])

    print('num of samples:', len(input_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('num of unique output tokens:', num_decoder_tokens)
    print('max sequence length for inputs:', max_encoder_seq_len)
    print('max sequence length for outputs:', max_decoder_seq_len)

    input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
    target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_len, num_encoder_tokens), dtype='float32')
    decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_len, num_decoder_tokens), dtype='float32')
    decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_len, num_decoder_tokens), dtype='float32')

    # one-hot representation
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # model
    logger.info("Training seq2seq model...")
    model = create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
    # checkpoint the model during training
    callbacks_list = callback(save_model_path, logger)
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks_list)
    logger.info("Training has finished.")

    eval(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim, input_token_index, target_token_index,
         max_decoder_seq_len, encoder_input_data, input_texts)
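
create_model is not shown on this page. Below is a minimal sketch of what it could look like, assuming the standard Keras character-level seq2seq layout (an LSTM encoder whose final states initialize a teacher-forced LSTM decoder); the actual pycorrector implementation may differ. Note that examples #3 and #4 expect create_model to additionally return separate encoder and decoder inference models.

# Hypothetical reconstruction, not taken from the pycorrector source.
from keras.models import Model
from keras.layers import Input, LSTM, Dense

def create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim):
    # Encoder: read the one-hot input sequence, keep only the final LSTM states.
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    _, state_h, state_c = LSTM(rnn_hidden_dim, return_state=True)(encoder_inputs)
    encoder_states = [state_h, state_c]

    # Decoder: teacher-forced on decoder_input_data, initialized with the
    # encoder states, predicting the next character at every timestep.
    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    decoder_lstm = LSTM(rnn_hidden_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    return model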
Code example #2
def train(train_path=None,
          save_model_path=None,
          batch_size=64,
          epochs=10,
          rnn_hidden_dim=200):
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])

    input_characters = data_reader.read_vocab(input_texts)
    target_characters = data_reader.read_vocab(target_texts)
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_len = max([len(text) for text in input_texts])
    max_decoder_seq_len = max([len(text) for text in target_texts])

    print('num of samples:', len(input_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('num of unique output tokens:', num_decoder_tokens)
    print('max sequence length for inputs:', max_encoder_seq_len)
    print('max sequence length for outputs:', max_decoder_seq_len)

    input_token_index = dict([(char, i)
                              for i, char in enumerate(input_characters)])
    target_token_index = dict([(char, i)
                               for i, char in enumerate(target_characters)])

    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_len, num_encoder_tokens),
        dtype='float32')
    decoder_input_data = np.zeros(
        (len(input_texts), max_decoder_seq_len, num_decoder_tokens),
        dtype='float32')
    decoder_target_data = np.zeros(
        (len(input_texts), max_decoder_seq_len, num_decoder_tokens),
        dtype='float32')

    # one-hot representation
    for i, (input_text,
            target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # model
    logger.info("Training seq2seq model...")
    model = create_model(num_encoder_tokens, num_decoder_tokens,
                         rnn_hidden_dim)
    # checkpoint the model during training
    callbacks_list = callback(save_model_path, logger)
    model.fit([encoder_input_data, decoder_input_data],
              decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks_list)
    logger.info("Training has finished.")

    eval(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim,
         input_token_index, target_token_index, max_decoder_seq_len,
         encoder_input_data, input_texts)
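
The callback() helper is likewise not shown. A plausible sketch, assuming it only has to return a list of Keras callbacks that checkpoint the model to save_model_path and report progress through the given logger (LoggerCallback is a hypothetical name):

# Hypothetical reconstruction, not taken from the pycorrector source.
from keras.callbacks import Callback, ModelCheckpoint

class LoggerCallback(Callback):
    # Forward per-epoch metrics to the project logger.
    def __init__(self, logger):
        super(LoggerCallback, self).__init__()
        self.logger = logger

    def on_epoch_end(self, epoch, logs=None):
        self.logger.info('epoch %d: %s' % (epoch + 1, logs))

def callback(save_model_path, logger):
    # Save a checkpoint after every epoch and log metrics as training runs.
    return [ModelCheckpoint(save_model_path, save_best_only=False),
            LoggerCallback(logger)]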
Code example #3
def train(train_path=None,
          save_model_path=None,
          encoder_model_path=None,
          decoder_model_path=None,
          save_input_token_path=None,
          save_target_token_path=None,
          batch_size=64,
          epochs=10,
          rnn_hidden_dim=200):
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])

    input_characters = data_reader.read_vocab(input_texts)
    target_characters = data_reader.read_vocab(target_texts)
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_input_texts_len = max([len(text) for text in input_texts])
    max_target_texts_len = max([len(text) for text in target_texts])

    print('num of samples:', len(input_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('num of unique output tokens:', num_decoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)
    print('max sequence length for outputs:', max_target_texts_len)

    input_token_index = dict([(char, i)
                              for i, char in enumerate(input_characters)])
    target_token_index = dict([(char, i)
                               for i, char in enumerate(target_characters)])

    # save word dict
    save_word_dict(input_token_index, save_input_token_path)
    save_word_dict(target_token_index, save_target_token_path)

    encoder_input_data = np.zeros(
        (len(input_texts), max_input_texts_len, num_encoder_tokens),
        dtype='float32')
    decoder_input_data = np.zeros(
        (len(input_texts), max_target_texts_len, num_decoder_tokens),
        dtype='float32')
    decoder_target_data = np.zeros(
        (len(input_texts), max_target_texts_len, num_decoder_tokens),
        dtype='float32')

    # one-hot representation
    for i, (input_text,
            target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # split into train and validation sets
    (encoder_input_data_train, encoder_input_data_val,
     decoder_input_data_train, decoder_input_data_val,
     decoder_target_data_train, decoder_target_data_val) = train_test_split(
         encoder_input_data, decoder_input_data, decoder_target_data,
         test_size=0.1)

    # model
    logger.info("Training seq2seq model...")
    model, encoder_model, decoder_model = create_model(num_encoder_tokens,
                                                       num_decoder_tokens,
                                                       rnn_hidden_dim)
    # Run training
    callbacks_list = callback(save_model_path, logger)
    model.fit_generator(
        generator=data_generator(encoder_input_data_train,
                                 decoder_input_data_train,
                                 decoder_target_data_train, batch_size),
        steps_per_epoch=(len(encoder_input_data_train) + batch_size - 1) //
        batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=([encoder_input_data_val,
                          decoder_input_data_val], decoder_target_data_val),
        callbacks=callbacks_list)
    encoder_model.save(encoder_model_path)
    decoder_model.save(decoder_model_path)
    logger.info("Model save to " + save_model_path)
    logger.info("Training has finished.")

    evaluate(encoder_model, decoder_model, num_encoder_tokens,
             num_decoder_tokens, rnn_hidden_dim, target_token_index,
             max_target_texts_len, encoder_input_data_val, input_texts)
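
Example #3 drives model.fit_generator with a data_generator that does not appear on this page. A minimal sketch, assuming it only needs to yield ([encoder_batch, decoder_batch], target_batch) tuples indefinitely, which is what fit_generator with the steps_per_epoch computed above requires:

# Hypothetical reconstruction, not taken from the pycorrector source.
import numpy as np

def data_generator(encoder_input_data, decoder_input_data,
                   decoder_target_data, batch_size):
    num_samples = len(encoder_input_data)
    while True:  # Keras generators must loop forever
        # Reshuffle each pass so every epoch sees a new sample order.
        indices = np.random.permutation(num_samples)
        for start in range(0, num_samples, batch_size):
            batch = indices[start:start + batch_size]
            yield ([encoder_input_data[batch], decoder_input_data[batch]],
                   decoder_target_data[batch])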
Code example #4
def train(train_path=None,
          save_model_path=None,
          encoder_model_path=None,
          decoder_model_path=None,
          save_input_token_path=None,
          save_target_token_path=None,
          batch_size=64,
          epochs=10,
          rnn_hidden_dim=200):
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])
    max_input_texts_len = max([len(text) for text in input_texts])
    max_target_texts_len = max([len(text) for text in target_texts])

    print('num of samples:', len(input_texts))
    print('max sequence length for inputs:', max_input_texts_len)
    print('max sequence length for outputs:', max_target_texts_len)

    # load or save word dict
    if os.path.exists(save_input_token_path) and os.path.exists(
            save_target_token_path):
        input_token_index = load_word_dict(save_input_token_path)
        target_token_index = load_word_dict(save_target_token_path)
    else:
        input_characters = data_reader.read_vocab(input_texts)
        target_characters = data_reader.read_vocab(target_texts)
        input_token_index = dict([(char, i)
                                  for i, char in enumerate(input_characters)])
        target_token_index = dict([
            (char, i) for i, char in enumerate(target_characters)
        ])
        save_word_dict(input_token_index, save_input_token_path)
        save_word_dict(target_token_index, save_target_token_path)

    encoder_input_data = np.zeros(
        (len(input_texts), max_input_texts_len, len(input_token_index)),
        dtype='float32')
    decoder_input_data = np.zeros(
        (len(input_texts), max_target_texts_len, len(target_token_index)),
        dtype='float32')
    decoder_target_data = np.zeros(
        (len(input_texts), max_target_texts_len, len(target_token_index)),
        dtype='float32')

    # one-hot representation
    for i, (input_text,
            target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            if char in input_token_index:
                encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            if char in target_token_index:
                decoder_input_data[i, t, target_token_index[char]] = 1.0
                if t > 0:
                    decoder_target_data[i, t - 1,
                                        target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # model
    logger.info("Training seq2seq model...")
    if os.path.exists(save_model_path) and os.path.exists(
            encoder_model_path) and os.path.exists(decoder_model_path):
        model = load_model(save_model_path)
        encoder_model = load_model(encoder_model_path)
        decoder_model = load_model(decoder_model_path)
    else:
        model, encoder_model, decoder_model = create_model(
            len(input_token_index), len(target_token_index), rnn_hidden_dim)
    # Run training
    callbacks_list = callback(save_model_path, logger)
    model.fit(x=[encoder_input_data, decoder_input_data],
              y=decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks_list)
    encoder_model.save(encoder_model_path)
    decoder_model.save(decoder_model_path)
    logger.info("Model save to " + save_model_path)
    logger.info("Training has finished.")

    evaluate(encoder_model, decoder_model, len(input_token_index),
             len(target_token_index), rnn_hidden_dim, target_token_index,
             max_target_texts_len, encoder_input_data, input_texts)
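
Examples #3 and #4 persist the token indices with save_word_dict and load_word_dict, which are also not shown here. A minimal sketch, assuming a plain "char<TAB>index" text format (the real pycorrector helpers may serialize differently):

# Hypothetical reconstruction, not taken from the pycorrector source.
def save_word_dict(token_index, save_path):
    # One "char<TAB>index" entry per line.
    with open(save_path, 'w', encoding='utf-8') as f:
        for char, index in token_index.items():
            f.write('%s\t%d\n' % (char, index))

def load_word_dict(save_path):
    # Inverse of save_word_dict: rebuild the {char: index} mapping.
    token_index = dict()
    with open(save_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) == 2:
                token_index[parts[0]] = int(parts[1])
    return token_index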