def train(train_path='', test_path='', save_vocab_path='', attn_model_path='',
          batch_size=64, epochs=100, maxlen=400, hidden_dim=128, use_gpu=False):
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    test_input_texts, test_target_texts = data_reader.build_dataset(test_path)
    # load the cached word dict if it exists, otherwise build and save it
    if os.path.exists(save_vocab_path):
        char2id = load_word_dict(save_vocab_path)
        id2char = {int(j): i for i, j in char2id.items()}
        chars = set(char2id.keys())
    else:
        print('Training data...')
        print('input_texts:', input_texts[0])
        print('target_texts:', target_texts[0])
        max_input_texts_len = max([len(text) for text in input_texts])
        print('num of samples:', len(input_texts))
        print('max sequence length for inputs:', max_input_texts_len)
        # build a character-level vocabulary from both source and target texts
        chars = data_reader.read_vocab(input_texts + target_texts)
        id2char = {i: j for i, j in enumerate(chars)}
        char2id = {j: i for i, j in id2char.items()}
        save_word_dict(char2id, save_vocab_path)

    model = Seq2seqAttnModel(chars,
                             attn_model_path=attn_model_path,
                             hidden_dim=hidden_dim,
                             use_gpu=use_gpu).build_model()
    evaluator = Evaluate(model, attn_model_path, char2id, id2char, maxlen)
    model.fit_generator(data_generator(input_texts, target_texts, char2id, batch_size, maxlen),
                        steps_per_epoch=(len(input_texts) + batch_size - 1) // batch_size,
                        epochs=epochs,
                        validation_data=get_validation_data(test_input_texts, test_target_texts,
                                                            char2id, maxlen),
                        callbacks=[evaluator])
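# The vocabulary persistence helpers used above (save_word_dict / load_word_dict)
# are not defined in this snippet. The sketch below is an assumption of what they
# could look like, storing the mapping as a plain "token<TAB>id" text file; the
# project's actual implementation may differ.
def save_word_dict(dict_data, save_path):
    """Persist a token -> id mapping, one tab-separated pair per line."""
    with open(save_path, 'w', encoding='utf-8') as f:
        for token, idx in dict_data.items():
            f.write('%s\t%d\n' % (token, idx))


def load_word_dict(save_path):
    """Load a token -> id mapping written by save_word_dict."""
    dict_data = {}
    with open(save_path, 'r', encoding='utf-8') as f:
        for line in f:
            items = line.rstrip('\n').split('\t')
            if len(items) == 2:
                dict_data[items[0]] = int(items[1])
    return dict_data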
def train(train_path='', test_path='', save_vocab_path='', attn_model_path='',
          batch_size=64, epochs=100, maxlen=400, hidden_dim=128, dropout=0.2,
          vocab_max_size=50000, vocab_min_count=5, gpu_id=0):
    source_texts, target_texts = build_dataset(train_path)
    test_input_texts, test_target_texts = build_dataset(test_path)
    # load the cached word dict if it exists, otherwise build and save it
    if os.path.exists(save_vocab_path):
        vocab2id = load_word_dict(save_vocab_path)
    else:
        print('Training data...')
        # build the vocabulary, dropping rare tokens and capping its size
        vocab2id = read_vocab(source_texts + target_texts,
                              max_size=vocab_max_size,
                              min_count=vocab_min_count)
        num_encoder_tokens = len(vocab2id)
        max_input_texts_len = max([len(text) for text in source_texts])
        print('input_texts:', source_texts[0])
        print('target_texts:', target_texts[0])
        print('num of samples:', len(source_texts))
        print('num of unique input tokens:', num_encoder_tokens)
        print('max sequence length for inputs:', max_input_texts_len)
        save_word_dict(vocab2id, save_vocab_path)
    id2vocab = {int(j): i for i, j in vocab2id.items()}
    print('The vocabulary file: %s, size: %s' % (save_vocab_path, len(vocab2id)))

    model = Seq2seqAttnModel(len(vocab2id),
                             attn_model_path=attn_model_path,
                             hidden_dim=hidden_dim,
                             dropout=dropout,
                             gpu_id=gpu_id).build_model()
    evaluator = Evaluate(model, attn_model_path, vocab2id, id2vocab, maxlen)
    # stop training early if validation loss stops improving
    earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')
    model.fit_generator(data_generator(source_texts, target_texts, vocab2id, batch_size, maxlen),
                        steps_per_epoch=(len(source_texts) + batch_size - 1) // batch_size,
                        epochs=epochs,
                        validation_data=get_validation_data(test_input_texts, test_target_texts,
                                                            vocab2id, maxlen),
                        callbacks=[evaluator, earlystop])
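# data_generator and get_validation_data are referenced above but not defined in
# this snippet. The sketch below is an assumption of their shape, not the
# project's actual code: texts are mapped to ids with vocab2id, truncated and
# zero-padded to maxlen, and yielded as ([encoder_ids, decoder_ids], None)
# batches, which assumes Seq2seqAttnModel attaches its loss internally. The
# unk_id=1 default is also an assumption.
import numpy as np


def data_generator(source_texts, target_texts, vocab2id, batch_size, maxlen, unk_id=1):
    """Yield padded id batches endlessly for fit_generator."""
    def to_ids(text):
        ids = [vocab2id.get(token, unk_id) for token in text][:maxlen]
        return ids + [0] * (maxlen - len(ids))  # pad with 0 up to maxlen

    while True:
        batch_x, batch_y = [], []
        for src, tgt in zip(source_texts, target_texts):
            batch_x.append(to_ids(src))
            batch_y.append(to_ids(tgt))
            if len(batch_x) == batch_size:
                yield [np.array(batch_x), np.array(batch_y)], None
                batch_x, batch_y = [], []


def get_validation_data(test_input_texts, test_target_texts, vocab2id, maxlen, unk_id=1):
    """Build one padded validation tuple in the same format as the generator."""
    def to_ids(text):
        ids = [vocab2id.get(token, unk_id) for token in text][:maxlen]
        return ids + [0] * (maxlen - len(ids))

    batch_x = [to_ids(text) for text in test_input_texts]
    batch_y = [to_ids(text) for text in test_target_texts]
    return [np.array(batch_x), np.array(batch_y)], None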