                batch_size=16, steps=10, pad_index=0, char2id_dict=None):
    return dataset_generator(data_path=data_path,
                             epochs=epochs,
                             shuffle_buffer_size=shuffle_buffer_size,
                             batch_size=batch_size,
                             steps=steps,
                             pad_index=pad_index,
                             char2id_dict=char2id_dict)


if __name__ == "__main__":
    train_path = os.path.join(get_data_dir(), "msr_training_label.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    train_dataset = get_dataset(data_path=train_path,
                                batch_size=4,
                                steps=10,
                                char2id_dict=char2id_dict)

    # inputs:  [batch_size, steps], values in [0, 1, 2, ..., vocab_size]
    # outputs: [batch_size, steps], values in [0, 1, 2, 3, 4]
    for i, (inputs, outputs) in zip(range(2), train_dataset):
        print(i, inputs.shape, outputs.shape)
def train_seq2seq():
    vocab_size = 3954  # chars with count > min_char_count = 5
    num_states = 4
    total_num_train = 69000  # num_lines of msr_rnn_train.utf8
    total_num_val = 17300    # num_lines of msr_rnn_val.utf8
    batch_size = 32
    epochs = 100
    shuffle_buffer_size = 1024 * 2
    rnn_steps = 30
    embedding_dim = 64
    rnn_units = 32

    min_val_loss = None
    opt_epoch = None
    patience = 5

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=0)
    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=0)

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)

    # === Optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)

    start = time.time()
    for epoch in range(epochs):
        epoch_start = time.time()

        # === Train
        print('\nTraining...')
        train_loss = 0
        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_train_batch), train_dataset):
            cur_loss = train_step(encoder, decoder, optimizer, inputs, targets, mask=0)
            train_loss += cur_loss
            if (batch + 1) % 100 == 0:
                print("Epoch: %d/%d, batch: %d/%d, train_loss: %.4f, cur_loss: %.4f,"
                      % (epoch + 1, epochs, batch + 1, num_train_batch,
                         train_loss / (batch + 1), cur_loss),
                      end=" ")
                batch_end = time.time()
                batch_last = batch_end - batch_start
                print("lasts: %.2fs" % batch_last)
        train_loss /= num_train_batch
        print("Epoch: %d/%d, train_loss: %.4f" % (epoch + 1, epochs, train_loss))

        # === Validate
        print("\nValidating...")
        val_loss = 0
        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_val_batch), val_dataset):
            # NOTE: this reuses train_step, so gradients are also applied on
            # validation batches; an evaluation-only step would avoid that.
            cur_loss = train_step(encoder, decoder, optimizer, inputs, targets, mask=0)
            val_loss += cur_loss
            if (batch + 1) % 100 == 0:
                print("Epoch: %d/%d, batch: %d/%d, val_loss: %.4f, cur_loss: %.4f, "
                      % (epoch + 1, epochs, batch + 1, num_val_batch,
                         val_loss / (batch + 1), cur_loss),
                      end=" ")
                batch_end = time.time()
                batch_last = batch_end - batch_start
                print("lasts: %.2fs" % batch_last)
        val_loss /= num_val_batch
        print("Epoch: %d/%d, train_loss: %.4f, val_loss: %.4f, "
              % (epoch + 1, epochs, train_loss, val_loss),
              end=" ")

        epoch_end = time.time()
        epoch_last = epoch_end - epoch_start
        print("lasts: %.2fs" % epoch_last)

        # === Early stopping on validation loss.
        if opt_epoch is not None:
            if epoch - opt_epoch > patience:
                print("Stop training, epoch: %d, opt_epoch: %d"
                      % (epoch + 1, opt_epoch + 1))
                break
        if min_val_loss is None or val_loss < min_val_loss:
            min_val_loss = val_loss
            opt_epoch = epoch
            # === Save best model only.
            print("\nSaving...")
            print("Epoch: %d, train_loss: %.4f, val_loss: %.4f"
                  % (epoch + 1, train_loss, val_loss))
            checkpoint.save(file_prefix=checkpoint_prefix)

    print("Training done! min_val_loss=%.4f, opt_epoch=%d"
          % (min_val_loss, opt_epoch), end=" ")
    end = time.time()
    last = end - start
    print("Lasts: %.2fs" % last)
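

# ---------------------------------------------------------------------------
# Hypothetical reference sketch (not part of the original code): the loop in
# train_seq2seq() calls a `train_step` helper that is defined elsewhere.  The
# version below only illustrates the usual pattern - a forward pass under
# tf.GradientTape with a padding-masked loss.  It assumes encoder(inputs)
# returns (enc_output, enc_state) and decoder(targets, enc_state) returns
# per-step logits of shape [batch, steps, num_states + 1]; the real
# signatures and the exact meaning of `mask` may differ.
def train_step_sketch(encoder, decoder, optimizer, inputs, targets, mask=0):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    with tf.GradientTape() as tape:
        enc_output, enc_state = encoder(inputs)       # assumed interface
        logits = decoder(targets, enc_state)          # assumed interface
        per_step_loss = loss_fn(targets, logits)      # [batch, steps]
        # Ignore padded positions (label == mask) when averaging the loss.
        not_pad = tf.cast(tf.not_equal(targets, mask), per_step_loss.dtype)
        loss = tf.reduce_sum(per_step_loss * not_pad) / tf.maximum(
            tf.reduce_sum(not_pad), 1.0)
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss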
def train_model():
    vocab_size = 3954  # chars with count > min_char_count = 5
    num_states = 4
    total_num_train = 69000  # num_lines of msr_rnn_train.utf8
    total_num_val = 17300    # num_lines of msr_rnn_val.utf8
    epochs = 100
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    rnn_steps = 30
    embedding_dim = 64
    rnn_units = 32
    pad_index = 0  # pad_index, to mask in loss

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === tf.data.Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=pad_index)
    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=pad_index)

    # === Model
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)

    # === Optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    crf = model.crf_layer
    model.compile(optimizer=optimizer,
                  # loss=mask_sparse_cross_entropy,
                  loss=crf.loss,
                  # metrics=['acc'],
                  metrics=[crf.accuracy])

    # === Callbacks
    callbacks = []
    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=5,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)

    tensorboard_cb = TensorBoard(
        log_dir=os.path.join(get_log_dir(), "rnn_model"))
    callbacks.append(tensorboard_cb)

    checkpoint_path = os.path.join(get_model_dir(), "rnn_model", "ckpt")
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        batch_size=batch_size,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)
    print(model.summary())

    return True
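

# ---------------------------------------------------------------------------
# Hypothetical reference sketch (not part of the original code): train_model()
# keeps a commented-out alternative loss, `mask_sparse_cross_entropy`, defined
# elsewhere.  A plain padding-masked sparse cross-entropy usually looks like
# the function below; it assumes pad_index == 0 and that y_pred holds
# per-class probabilities (softmax outputs), which may not match the original
# helper exactly.
def mask_sparse_cross_entropy_sketch(y_true, y_pred, pad_index=0):
    per_step = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred)                                  # [batch, steps]
    not_pad = tf.cast(tf.not_equal(y_true, pad_index), per_step.dtype)
    # Average only over non-padded time steps.
    return tf.reduce_sum(per_step * not_pad) / tf.maximum(
        tf.reduce_sum(not_pad), 1.0)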
def segmentation():
    vocab_size = 3954
    embedding_dim = 64
    num_states = 4
    rnn_units = 32
    pad_index = 0  # pad_index, to mask in loss
    rnn_steps = 30

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_seq2seq.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    seq2seq = Seq2Seq(encoder=encoder, decoder=decoder)

    # === Optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    # checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    status = checkpoint.restore(latest)
    status.assert_existing_objects_matched()

    # === Test once with random inputs to build the model and check shapes.
    batch_size = 2
    inputs = tf.random.uniform((batch_size, rnn_steps),
                               minval=0, maxval=vocab_size + 2, dtype=tf.int32)
    targets = tf.random.uniform((batch_size, rnn_steps),
                                minval=0, maxval=num_states + 1, dtype=tf.int32)
    test_seq2seq_once(encoder=encoder, decoder=decoder,
                      inputs=inputs, targets=targets)

    # === Test
    # Load separator_dict
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    fw = open(seg_path, 'w', encoding='utf-8')
    with open(test_path, 'r', encoding='utf-8') as f:
        line_cnt = 0
        for line in f:
            buf = line[:-1]  # strip the trailing newline
            labels = model_predict(model=seq2seq,
                                   char_list=buf,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)
            if len(buf) != len(labels):
                print("Wrong")
                print(buf, '\n', labels)
                print(len(buf), len(labels))
                break

            # {0: pad, 1: B, 2: M, 3: E, 4: S}
            words = []
            word = []
            for i, label in zip(range(len(buf)), labels):
                word.append(buf[i])
                if label == 3 or label == 4:  # E or S ends the current word
                    words.append("".join(word))
                    word = []
            if len(word) > 0:
                words.append("".join(word))

            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)
    fw.close()
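

# ---------------------------------------------------------------------------
# Hypothetical reference sketch (not part of the original code):
# segmentation() calls `model_predict`, which is defined elsewhere.  The
# function below only illustrates one plausible shape of that helper: map
# characters to ids with char2id_dict (unknown characters fall back to an
# assumed unk_id), run the seq2seq model on a [1, len] batch, and take the
# argmax state per position.  It further assumes the model returns per-step
# logits of shape [1, len, num_states + 1] and that separator characters are
# forced to the single-character state S = 4; the real helper and its id
# conventions may differ.
def model_predict_sketch(model, char_list, char2id_dict, separator_dict,
                         unk_id=1):
    ids = [char2id_dict.get(ch, unk_id) for ch in char_list]
    inputs = tf.constant([ids], dtype=tf.int32)      # [1, len(char_list)]
    logits = model(inputs)                           # assumed output shape
    labels = tf.argmax(logits, axis=-1).numpy()[0].tolist()
    # Assumed separator handling: punctuation is always its own word (S).
    for i, ch in enumerate(char_list):
        if ch in separator_dict:
            labels[i] = 4
    return labels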