Example 1
import os

# NOTE: CGEDReader, load_word_dict, save_word_dict, Seq2seqAttnModel,
# Evaluate, data_generator and get_validation_data are project-local helpers;
# import them from wherever they live in your code base.
def train(train_path='', test_path='', save_vocab_path='', attn_model_path='',
          batch_size=64, epochs=100, maxlen=400, hidden_dim=128, use_gpu=False):
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    test_input_texts, test_target_texts = data_reader.build_dataset(test_path)

    # Load the cached vocabulary if it exists, otherwise build and save it.
    if os.path.exists(save_vocab_path):
        char2id = load_word_dict(save_vocab_path)
        id2char = {int(j): i for i, j in char2id.items()}
        chars = set(char2id.keys())
    else:
        print('Training data...')
        print('input_texts:', input_texts[0])
        print('target_texts:', target_texts[0])
        max_input_texts_len = max(len(text) for text in input_texts)

        print('num of samples:', len(input_texts))
        print('max sequence length for inputs:', max_input_texts_len)

        chars = data_reader.read_vocab(input_texts + target_texts)
        id2char = {i: j for i, j in enumerate(chars)}
        char2id = {j: i for i, j in id2char.items()}
        save_word_dict(char2id, save_vocab_path)

    model = Seq2seqAttnModel(chars,
                             attn_model_path=attn_model_path,
                             hidden_dim=hidden_dim,
                             use_gpu=use_gpu).build_model()
    evaluator = Evaluate(model, attn_model_path, char2id, id2char, maxlen)
    # fit_generator is deprecated in recent Keras; Model.fit also accepts generators.
    model.fit_generator(data_generator(input_texts, target_texts, char2id, batch_size, maxlen),
                        steps_per_epoch=(len(input_texts) + batch_size - 1) // batch_size,
                        epochs=epochs,
                        validation_data=get_validation_data(test_input_texts, test_target_texts, char2id, maxlen),
                        callbacks=[evaluator])
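Both examples cache the character vocabulary on disk through save_word_dict / load_word_dict so that repeated runs skip the vocabulary-building pass. Those helpers are not shown here; the following is a minimal sketch, assuming a plain UTF-8 text file with one char<TAB>id pair per line (the int(j) cast above suggests the ids round-trip through text).

def save_word_dict(char2id, save_path):
    """Persist the char-to-id mapping, one 'char<TAB>id' pair per line."""
    with open(save_path, 'w', encoding='utf-8') as f:
        for char, idx in char2id.items():
            f.write('%s\t%d\n' % (char, idx))


def load_word_dict(save_path):
    """Rebuild the char-to-id mapping written by save_word_dict."""
    char2id = {}
    with open(save_path, 'r', encoding='utf-8') as f:
        for line in f:
            char, idx = line.rstrip('\n').split('\t')
            char2id[char] = int(idx)
    return char2id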
Example 2
import os

from keras.callbacks import EarlyStopping  # tensorflow.keras.callbacks also works

# NOTE: build_dataset, read_vocab, load_word_dict, save_word_dict,
# Seq2seqAttnModel, Evaluate, data_generator and get_validation_data are
# project-local helpers; import them from wherever they live in your code base.
def train(train_path='',
          test_path='',
          save_vocab_path='',
          attn_model_path='',
          batch_size=64,
          epochs=100,
          maxlen=400,
          hidden_dim=128,
          dropout=0.2,
          vocab_max_size=50000,
          vocab_min_count=5,
          gpu_id=0):
    source_texts, target_texts = build_dataset(train_path)
    test_input_texts, test_target_texts = build_dataset(test_path)

    # Load the cached vocabulary if it exists, otherwise build and save it.
    if os.path.exists(save_vocab_path):
        vocab2id = load_word_dict(save_vocab_path)
    else:
        print('Training data...')
        vocab2id = read_vocab(source_texts + target_texts,
                              max_size=vocab_max_size,
                              min_count=vocab_min_count)
        num_encoder_tokens = len(vocab2id)
        max_input_texts_len = max(len(text) for text in source_texts)

        print('input_texts:', source_texts[0])
        print('target_texts:', target_texts[0])
        print('num of samples:', len(source_texts))
        print('num of unique input tokens:', num_encoder_tokens)
        print('max sequence length for inputs:', max_input_texts_len)
        save_word_dict(vocab2id, save_vocab_path)

    id2vocab = {int(j): i for i, j in vocab2id.items()}
    print('The vocabulary file: %s, size: %d' % (save_vocab_path, len(vocab2id)))
    model = Seq2seqAttnModel(len(vocab2id),
                             attn_model_path=attn_model_path,
                             hidden_dim=hidden_dim,
                             dropout=dropout,
                             gpu_id=gpu_id).build_model()
    evaluator = Evaluate(model, attn_model_path, vocab2id, id2vocab, maxlen)
    # Stop once val_loss has failed to improve for 3 consecutive epochs.
    earlystop = EarlyStopping(monitor='val_loss',
                              patience=3,
                              verbose=1,
                              mode='auto')
    # fit_generator is deprecated in recent Keras; Model.fit also accepts generators.
    model.fit_generator(
        data_generator(source_texts, target_texts, vocab2id, batch_size,
                       maxlen),
        steps_per_epoch=(len(source_texts) + batch_size - 1) // batch_size,
        epochs=epochs,
        validation_data=get_validation_data(test_input_texts,
                                            test_target_texts, vocab2id,
                                            maxlen),
        callbacks=[evaluator, earlystop])
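Neither example shows data_generator or get_validation_data. The sketch below is one plausible shape for them, assuming the model consumes two integer-id arrays (source and target, padded/truncated to maxlen), that ids 0 and 1 are reserved for padding and unknown characters, and that the model defines its loss internally (hence the None target, a common pattern in Keras attention examples). The real helpers must match whatever inputs Seq2seqAttnModel.build_model() actually expects.

import numpy as np

PAD_ID, UNK_ID = 0, 1  # assumed reserved ids


def texts_to_ids(texts, vocab2id, maxlen):
    """Map characters to ids, truncating to maxlen and padding with PAD_ID."""
    batch = np.full((len(texts), maxlen), PAD_ID, dtype='int32')
    for row, text in enumerate(texts):
        for col, char in enumerate(text[:maxlen]):
            batch[row, col] = vocab2id.get(char, UNK_ID)
    return batch


def data_generator(source_texts, target_texts, vocab2id, batch_size, maxlen):
    """Yield batches forever; fit_generator bounds an epoch via steps_per_epoch."""
    while True:
        for start in range(0, len(source_texts), batch_size):
            x = texts_to_ids(source_texts[start:start + batch_size], vocab2id, maxlen)
            y = texts_to_ids(target_texts[start:start + batch_size], vocab2id, maxlen)
            yield [x, y], None  # None target: model is assumed to add its own loss


def get_validation_data(source_texts, target_texts, vocab2id, maxlen):
    """Materialize the validation set as one (inputs, target) pair."""
    return ([texts_to_ids(source_texts, vocab2id, maxlen),
             texts_to_ids(target_texts, vocab2id, maxlen)], None)

Streaming batches this way keeps memory flat on large corpora, since only one batch of padded arrays exists at a time.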