steps = 100 # 0 for `pad`, 1 for `unk` inputs = tf.random.uniform((batch_size, steps), minval=0, maxval=vocab_size + 2) """ import numpy as np inputs = np.random.randint(low=0, high=vocab_size+2, size=steps*batch_size)\ .reshape((batch_size, steps)) """ print('inputs', inputs.shape) softmax = rnnseg(inputs) print('softmax', softmax.shape) return inputs if __name__ == '__main__': vocab_size = 10 rnnseg = RNNSeg(vocab_size=vocab_size) rnnseg.compile(optimizer=tf.optimizers.Adam(0.001), loss=tf.losses.SparseCategoricalCrossentropy) test_rnnseg_once(rnnseg=rnnseg) from cangjie.utils.config import get_model_dir import os model_img_path = os.path.join(get_model_dir(), "images", "rnn.png") plot_model(rnnseg, to_file=model_img_path, show_shapes=True)
line[:-1], is_use_matching=is_use_matching, matching_method=matching_method, max_num_char=max_num_char, word_dict=word_dict) if seg_words is None: fw.write("\n") else: fw.write(" ".join(seg_words) + "\n") fw.close() if __name__ == "__main__": data_dir = get_data_dir() model_dir = get_model_dir() model_path = os.path.join(model_dir, "hmm", "hmm.pkl") test_path = os.path.join(data_dir, "msr_test.utf8") test_result_path = os.path.join(data_dir, "msr_test_hmm.utf8") dict_path = os.path.join(data_dir, "msr.dict") word_dict = load_dictionary(dict_path=dict_path) print("Total number of words is: %d\n" % (len(word_dict))) hmm = HMM() hmm.load_model(model_path=model_path, is_training=False) seg_res = seg_on_sentence(hmm, sentence='黑夜给了我黑色的眼睛,我却用它寻找光明。') print("/".join(seg_res)) seg_on_file(model=hmm,
def train_model():
    """Train the BiRNN-CRF word-segmentation model on the MSR corpus.

    Builds train/val `tf.data` pipelines, compiles a BiRNNCRF model with the
    CRF layer's own loss/accuracy, and fits with early stopping, TensorBoard
    logging and best-only weight checkpointing.

    Returns:
        True on completion.
    """
    # === Hyper-parameters.
    vocab_size = 3954          # chars with count > min_char_count = 5
    total_num_train = 69000    # num_lines of msr_rnn_train.utf8
    total_num_val = 17300      # num_lines of msr_rnn_val.utf8
    epochs = 100
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    rnn_steps = 30
    embedding_dim = 64
    rnn_units = 32
    pad_index = 0              # pad_index, to mask in loss

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # Ceil division. The previous `total // batch_size + 1` overcounted by
    # one batch whenever total was an exact multiple of batch_size.
    num_train_batch = (total_num_train + batch_size - 1) // batch_size
    num_val_batch = (total_num_val + batch_size - 1) // batch_size

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === tf.data.Dataset pipelines.
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=pad_index)
    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=pad_index)

    # === Model.
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)

    # The CRF layer supplies its own loss and accuracy, replacing the
    # masked sparse cross-entropy used by the plain RNN variant.
    optimizer = tf.keras.optimizers.Adam(0.001)
    crf = model.crf_layer
    model.compile(optimizer=optimizer,
                  loss=crf.loss,
                  metrics=[crf.accuracy])

    # === Callbacks: early stopping, TensorBoard, best-only checkpoints.
    callbacks = []
    early_stopping_cb = EarlyStopping(monitor='val_loss', patience=5,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)
    tensorboard_cb = TensorBoard(
        log_dir=os.path.join(get_log_dir(), "rnn_model"))
    callbacks.append(tensorboard_cb)
    checkpoint_path = os.path.join(get_model_dir(), "rnn_model", "ckpt")
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train.
    model.fit(train_dataset,
              batch_size=batch_size,
              epochs=epochs,
              steps_per_epoch=num_train_batch,
              validation_data=val_dataset,
              validation_steps=num_val_batch,
              callbacks=callbacks)
    # summary() prints the table itself and returns None; wrapping it in
    # print() used to emit a spurious "None" line.
    model.summary()
    return True
def _labels_to_words(chars, labels):
    """Merge a char sequence into words according to BMES-style labels.

    Label set: {0: pad, 1: B, 2: M, 3: E, 4: S}. A word is closed on
    E (3) or S (4); a trailing unfinished word is kept as-is.
    """
    words = []
    word = []
    for char, label in zip(chars, labels):
        word.append(char)
        if label == 3 or label == 4:
            words.append("".join(word))
            word = []
    if len(word) > 0:
        words.append("".join(word))
    return words


def segmentation():
    """Segment msr_test.utf8 with the trained BiRNN-CRF model.

    Restores the latest checkpoint, runs the model once to materialize the
    restored weights, then writes space-separated words to
    msr_test_birnn_crf.utf8, one line per input line.
    """
    # === Hyper-parameters (must match training).
    vocab_size = 3954
    embedding_dim = 64
    rnn_units = 32

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_birnn_crf.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Build and compile model.
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)
    optimizer = tf.keras.optimizers.Adam(0.001)
    crf = model.crf_layer
    model.compile(optimizer=optimizer,
                  loss=crf.loss,
                  metrics=[crf.accuracy])

    # === Load weights from the latest training checkpoint.
    checkpoint_dir = os.path.join(get_model_dir(), "rnn_model")
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
    model.load_weights(checkpoint)

    # === Run once, to load weights of checkpoint.
    test_model_once(model=model, vocab_size=vocab_size)

    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    # `with` guarantees both files are closed even on the early `break` or
    # an exception; the original opened `fw` manually and could leak it.
    line_cnt = 0
    with open(seg_path, 'w', encoding='utf-8') as fw, \
            open(test_path, 'r', encoding='utf-8') as f:
        for line in f:
            buf = line[:-1]  # strip trailing newline
            labels = model_predict(model=model,
                                   char_list=buf,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)
            # A length mismatch means prediction is broken; report and stop.
            if len(buf) != len(labels):
                print("Wrong")
                print(buf, '\n', labels)
                print(len(buf), len(labels))
                break
            fw.write(" ".join(_labels_to_words(buf, labels)) + '\n')
            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)
    print(line_cnt)
def train_seq2seq():
    """Train the seq2seq (encoder-decoder) segmentation model on MSR data.

    Uses a manual training loop with hand-rolled early stopping: the
    checkpoint is saved only when validation loss improves, and training
    stops once `patience` epochs pass without improvement.
    """
    # === Hyper-parameters.
    vocab_size = 3954          # chars with count > min_char_count = 5
    num_states = 4             # decoder target tag set size (B/M/E/S)
    total_num_train = 69000    # num_lines of msr_rnn_train.utf8
    total_num_val = 17300      # num_lines of msr_rnn_val.utf8
    batch_size = 32
    epochs = 100
    shuffle_buffer_size = 1024 * 2
    rnn_steps = 30
    embedding_dim = 64
    rnn_units = 32

    # Early-stopping bookkeeping.
    min_val_loss = None
    opt_epoch = None
    patience = 5

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")

    # Ceil division (consistent with train_model): the old
    # `total // batch_size + 1` overcounted when total divided evenly.
    num_train_batch = (total_num_train + batch_size - 1) // batch_size
    num_val_batch = (total_num_val + batch_size - 1) // batch_size

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=0)
    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=0)

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)

    # === Optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint (manual save of best model only, see loop below).
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)

    start = time.time()
    for epoch in range(epochs):
        epoch_start = time.time()

        # === train
        print('\nTraining...')
        train_loss = 0
        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_train_batch),
                                            train_dataset):
            cur_loss = train_step(encoder, decoder, optimizer,
                                  inputs, targets, mask=0)
            train_loss += cur_loss
            if (batch + 1) % 100 == 0:
                print("Epoch: %d/%d, batch: %d/%d, train_loss: %.4f, "
                      "cur_loss: %.4f,"
                      % (epoch + 1, epochs, batch + 1, num_train_batch,
                         train_loss / (batch + 1), cur_loss), end=" ")
                batch_end = time.time()
                print("lasts: %.2fs" % (batch_end - batch_start))
        train_loss /= num_train_batch
        print("Epoch: %d/%d, train_loss: %.4f"
              % (epoch + 1, epochs, train_loss))

        # === validate
        print("\nValidating...")
        val_loss = 0
        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_val_batch),
                                            val_dataset):
            # NOTE(review): this reuses train_step on validation data, which
            # presumably applies gradient updates -- i.e. validation appears
            # to train on the val set. Needs an evaluation-only step;
            # confirm train_step's semantics before relying on val_loss.
            cur_loss = train_step(encoder, decoder, optimizer,
                                  inputs, targets, mask=0)
            val_loss += cur_loss
            if (batch + 1) % 100 == 0:
                print("Epoch: %d/%d, batch: %d/%d, val_loss: %.4f, "
                      "cur_loss: %.4f, "
                      % (epoch + 1, epochs, batch + 1, num_val_batch,
                         val_loss / (batch + 1), cur_loss), end=" ")
                batch_end = time.time()
                print("lasts: %.2fs" % (batch_end - batch_start))
        val_loss /= num_val_batch
        print("Epoch: %d/%d, train_loss: %.4f, val_loss: %.4f, "
              % (epoch + 1, epochs, train_loss, val_loss), end=" ")
        epoch_end = time.time()
        print("lasts: %.2fs" % (epoch_end - epoch_start))

        # === Early stopping: give up after `patience` epochs with no
        # improvement over the best (opt_epoch) validation loss.
        if opt_epoch is not None and epoch - opt_epoch > patience:
            # BUG FIX: the format string previously had no % arguments and
            # printed the raw "%d" placeholders.
            print("Stop training, epoch: %d, opt_epoch: %d"
                  % (epoch, opt_epoch))
            break

        if min_val_loss is None or val_loss < min_val_loss:
            min_val_loss = val_loss
            opt_epoch = epoch
            # === Save best model only.
            print("\nSaving...")
            print("Epoch: %d, train_loss: %.4f, val_loss: %.4f"
                  % (epoch + 1, train_loss, val_loss))
            checkpoint.save(file_prefix=checkpoint_prefix)

    print("Training done! min_val_loss=%.4f, opt_epoch=%d"
          % (min_val_loss, opt_epoch), end=" ")
    end = time.time()
    print("Lasts: %.2fs" % (end - start))
def _labels_to_words(chars, labels):
    """Merge a char sequence into words according to BMES-style labels.

    Label set: {0: pad, 1: B, 2: M, 3: E, 4: S}. A word is closed on
    E (3) or S (4); a trailing unfinished word is kept as-is.
    """
    words = []
    word = []
    for char, label in zip(chars, labels):
        word.append(char)
        if label == 3 or label == 4:
            words.append("".join(word))
            word = []
    if len(word) > 0:
        words.append("".join(word))
    return words


def segmentation():
    """Segment msr_test.utf8 with the trained seq2seq model.

    Restores the latest encoder/decoder checkpoint, runs one dummy forward
    pass so restored variables are built and matched, then writes
    space-separated words to msr_test_seq2seq.utf8, one line per input line.
    """
    # === Hyper-parameters (must match training).
    vocab_size = 3954
    embedding_dim = 64
    num_states = 4
    rnn_units = 32
    rnn_steps = 30

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_seq2seq.utf8")
    char2id_dict_path = os.path.join(get_data_dir(),
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    seq2seq = Seq2Seq(encoder=encoder, decoder=decoder)

    # === Optimizer (part of the checkpoint's object graph).
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Restore the latest checkpoint.
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    status = checkpoint.restore(latest)
    status.assert_existing_objects_matched()

    # === Test once with random inputs so variables are created and the
    # deferred restore actually happens before real inference.
    batch_size = 2
    inputs = tf.random.uniform((batch_size, rnn_steps),
                               minval=0, maxval=vocab_size + 2,
                               dtype=tf.int32)
    targets = tf.random.uniform((batch_size, rnn_steps),
                                minval=0, maxval=num_states + 1,
                                dtype=tf.int32)
    test_seq2seq_once(encoder=encoder, decoder=decoder,
                      inputs=inputs, targets=targets)

    # === Test
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    # `with` guarantees both files are closed even on the early `break` or
    # an exception; the original opened `fw` manually and could leak it.
    with open(seg_path, 'w', encoding='utf-8') as fw, \
            open(test_path, 'r', encoding='utf-8') as f:
        line_cnt = 0
        for line in f:
            buf = line[:-1]  # strip trailing newline
            labels = model_predict(model=seq2seq,
                                   char_list=buf,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)
            # A length mismatch means prediction is broken; report and stop.
            if len(buf) != len(labels):
                print("Wrong")
                print(buf, '\n', labels)
                print(len(buf), len(labels))
                break
            fw.write(" ".join(_labels_to_words(buf, labels)) + '\n')
            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)