import jieba

from cangjie.utils.config import get_data_dir

import os


if __name__ == '__main__':
    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_jieba.utf8")

    fw = open(seg_path, 'w', encoding='utf-8')

    with open(test_path, 'r', encoding='utf-8') as f:
        for line in f:
            seg = jieba.cut(line[:-1])
            words = [word for word in seg]
            fw.write(" ".join(words) + '\n')

    fw.close()
    print("Write Done!", seg_path)
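# For reference: the baseline above uses jieba's default (accurate) mode.
# A small self-contained sketch of jieba's other cut modes, in case a different
# segmentation granularity is worth comparing; the sample text is illustrative only.
import jieba

sample = "研究生命的起源"
print("/".join(jieba.cut(sample)))                 # accurate mode (default)
print("/".join(jieba.cut(sample, cut_all=True)))   # full mode: all possible words
print("/".join(jieba.cut_for_search(sample)))      # search-engine mode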
                                          sentence=line[:-1])
            fw.write(" ".join(line_seg_result) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)

    fw.close()
    print(line_cnt)

    return True


if __name__ == '__main__':
    data_dir = get_data_dir()
    dict_path = os.path.join(data_dir, "msr.dict")
    test_path = os.path.join(data_dir, "msr_test.utf8")

    method = "bimm"
    max_num_char = 6
    test_result_path = os.path.join(data_dir, "msr_test_" + method + ".utf8")

    word_dict = load_dictionary(dict_path=dict_path)
    print("Total number of words is: %d\n" % (len(word_dict)))

    seg_on_file(word_dict=word_dict,
                test_path=test_path,
                seg_path=test_result_path,
                method=method,
                max_num_char=max_num_char)
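# A minimal sketch of forward maximum matching (FMM), one half of the
# bidirectional matching ("bimm") selected above. This is not the project's
# seg_on_file implementation; it only assumes that word_dict supports
# `word in word_dict` lookups and that max_num_char bounds the window size.
def fmm_segment(sentence, word_dict, max_num_char=6):
    words = []
    i = 0
    while i < len(sentence):
        # Try the longest window first, shrink until a dictionary hit or one char.
        for size in range(min(max_num_char, len(sentence) - i), 0, -1):
            candidate = sentence[i:i + size]
            if size == 1 or candidate in word_dict:
                words.append(candidate)
                i += size
                break
    return words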
def get_dataset(data_path=None, epochs=10,
                shuffle_buffer_size=1024, batch_size=16,
                steps=10, pad_index=0, char2id_dict=None):
    return dataset_generator(data_path=data_path,
                             epochs=epochs,
                             shuffle_buffer_size=shuffle_buffer_size,
                             batch_size=batch_size,
                             steps=steps,
                             pad_index=pad_index,
                             char2id_dict=char2id_dict)


if __name__ == "__main__":
    train_path = os.path.join(get_data_dir(), "msr_training_label.utf8")
    char2id_dict_path = os.path.join(get_data_dir(), "msr_training_char2id_dict.pkl")

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    train_dataset = get_dataset(data_path=train_path,
                                batch_size=4,
                                steps=10,
                                char2id_dict=char2id_dict)

    # inputs: [batch_size, steps] \in [0, 1, 2, ..., vocab_size]
    # outputs: [batch_size, steps] \in [0, 1, 2, 3, 4]
    for i, (inputs, outputs) in zip(range(2), train_dataset):
        print("batch %d:" % i, inputs.shape, outputs.shape)
        print(inputs)
        print(outputs)
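# A minimal sketch of one way a (chars, labels) tf.data pipeline like the
# dataset_generator wrapped above could be built. The real dataset_generator may
# differ (e.g. in how it windows, shuffles, or repeats); the "::"-separated chars
# and '\t'-separated labels follow the label file written elsewhere in this repo,
# while mapping unknown chars to the pad index is an assumption.
import tensorflow as tf

def toy_dataset(data_path, char2id_dict, steps=30, batch_size=16, pad_index=0):
    def gen():
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                chars_str, labels_str = line.rstrip('\n').split('\t')
                chars = chars_str.split("::")
                ids = [char2id_dict.get(c, pad_index) for c in chars][:steps]
                labels = [int(l) for l in labels_str][:steps]
                yield ids, labels

    ds = tf.data.Dataset.from_generator(
        gen,
        output_signature=(tf.TensorSpec(shape=(None,), dtype=tf.int32),
                          tf.TensorSpec(shape=(None,), dtype=tf.int32)))
    # Pad every sequence in a batch up to `steps` with the pad index.
    return ds.padded_batch(batch_size,
                           padded_shapes=([steps], [steps]),
                           padding_values=(pad_index, pad_index))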
def train_model():
    vocab_size = 3954         # count > min_char_count = 5
    num_states = 4
    total_num_train = 69000   # num_lines of msr_rnn_train.utf8
    total_num_val = 17300     # num_lines of msr_rnn_val.utf8

    epochs = 100
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    rnn_steps = 30
    embedding_dim = 64
    rnn_units = 32
    pad_index = 0             # pad_index, to mask in loss

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(), "msr_training_char2id_dict.pkl")

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === tf.data.Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=pad_index)
    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=pad_index)

    # === model
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    crf = model.crf_layer
    model.compile(optimizer=optimizer,
                  # loss=mask_sparse_cross_entropy,
                  loss=crf.loss,
                  # metrics=['acc'])
                  metrics=[crf.accuracy])

    # callbacks
    callbacks = []
    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=5,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)

    tensorboard_cb = TensorBoard(log_dir=os.path.join(get_log_dir(), "rnn_model"))
    callbacks.append(tensorboard_cb)

    checkpoint_path = os.path.join(get_model_dir(), "rnn_model", "ckpt")
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        batch_size=batch_size,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)

    print(model.summary())

    return True
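# The compile() call above keeps a commented-out mask_sparse_cross_entropy as an
# alternative to the CRF loss. Its definition is not shown in this file; the
# following is only a sketch of what a padding-masked sparse cross-entropy could
# look like, assuming label value 0 (pad_index) marks padded time steps to ignore.
import tensorflow as tf

def masked_sparse_cross_entropy_sketch(y_true, y_pred, pad_index=0):
    # y_true: [batch, steps] integer labels; y_pred: [batch, steps, num_classes] logits.
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred,
                                                           from_logits=True)
    mask = tf.cast(tf.not_equal(y_true, pad_index), loss.dtype)
    # Average only over non-padded positions.
    return tf.reduce_sum(loss * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)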
def split_train_val(input_path=None, train_path=None,
                    val_path=None, train_ratio=None):
    fw_train = open(train_path, 'w', encoding='utf-8')
    fw_val = open(val_path, 'w', encoding='utf-8')

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            r = np.random.random()
            if r < train_ratio:
                fw_train.write(line)
            else:
                fw_val.write(line)

    fw_train.close()
    fw_val.close()


if __name__ == '__main__':
    input_path = os.path.join(get_data_dir(), "msr_training_label.utf8")
    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    train_ratio = 0.8

    split_train_val(input_path=input_path,
                    train_path=train_path,
                    val_path=val_path,
                    train_ratio=train_ratio)

    print("Write done!", train_path)
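# The split above is random on every run; if a reproducible train/val split is
# wanted, seeding NumPy's generator before calling split_train_val is one simple
# option (sketch only; the seed value 42 is arbitrary).
import numpy as np

np.random.seed(42)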
def segmentation():
    vocab_size = 3954
    embedding_dim = 64
    rnn_units = 32
    pad_index = 0   # pad_index, to mask in loss
    rnn_steps = 30  # is not needed in test

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_birnn_crf.utf8")
    char2id_dict_path = os.path.join(get_data_dir(), "msr_training_char2id_dict.pkl")

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Build and compile model.
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)

    optimizer = tf.keras.optimizers.Adam(0.001)

    crf = model.crf_layer
    model.compile(optimizer=optimizer, loss=crf.loss, metrics=[crf.accuracy])
    """
    model.compile(optimizer=optimizer,
                  loss=mask_sparse_cross_entropy,
                  metrics=['acc'])
    """

    # === Load weights.
    checkpoint_dir = os.path.join(get_model_dir(), "rnn_model")
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
    model.load_weights(checkpoint)

    # === Run once, to load weights of checkpoint.
    test_model_once(model=model, vocab_size=vocab_size)

    # Load separator_dict
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    fw = open(seg_path, 'w', encoding='utf-8')

    with open(test_path, 'r', encoding='utf-8') as f:
        line_cnt = 0
        for line in f:
            labels = model_predict(model=model,
                                   char_list=line[:-1],
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            if len(line[:-1]) != len(labels):
                print("Wrong")
                print(line[:-1], '\n', labels)
                print(len(line[:-1]), len(labels))
                break

            # {0: pad, 1: B, 2: M, 3: E, 4: S}
            words = []
            word = []
            for i, label in zip(range(len(line) - 1), labels):
                word.append(line[i])
                if label == 3 or label == 4:
                    words.append("".join(word))
                    word = []
            if len(word) > 0:
                words.append("".join(word))

            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)

    print(line_cnt)
    fw.close()
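# test_model_once() above is defined elsewhere in the project. With a subclassed
# Keras model, load_weights() on a fresh model is applied lazily, so one forward
# pass on dummy input is the usual way to build the variables and make the
# restored values take effect. A sketch of what such a warm-up call could look
# like (batch shape and input range are arbitrary):
import tensorflow as tf

def warm_up_model(model, vocab_size, batch_size=2, steps=30):
    dummy = tf.random.uniform((batch_size, steps),
                              minval=0, maxval=vocab_size, dtype=tf.int32)
    _ = model(dummy)  # builds variables so the checkpoint values are applied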
def train_seq2seq():
    vocab_size = 3954         # count > min_char_count = 5
    num_states = 4
    total_num_train = 69000   # num_lines of msr_rnn_train.utf8
    total_num_val = 17300     # num_lines of msr_rnn_val.utf8

    batch_size = 32
    epochs = 100
    shuffle_buffer_size = 1024 * 2
    rnn_steps = 30
    embedding_dim = 64
    rnn_units = 32

    min_val_loss = None
    opt_epoch = None
    patience = 5

    train_path = os.path.join(get_data_dir(), "msr_rnn_train.utf8")
    val_path = os.path.join(get_data_dir(), "msr_rnn_val.utf8")
    char2id_dict_path = os.path.join(get_data_dir(), "msr_training_char2id_dict.pkl")

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    # === Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                steps=rnn_steps,
                                char2id_dict=char2id_dict,
                                pad_index=0)
    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              steps=rnn_steps,
                              char2id_dict=char2id_dict,
                              pad_index=0)

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)

    # === Optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)

    start = time.time()
    for epoch in range(epochs):
        epoch_start = time.time()

        # === train
        print('\nTraining...')
        train_loss = 0
        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_train_batch), train_dataset):
            cur_loss = train_step(encoder, decoder, optimizer, inputs, targets, mask=0)
            train_loss += cur_loss

            if (batch + 1) % 100 == 0:
                print("Epoch: %d/%d, batch: %d/%d, train_loss: %.4f, cur_loss: %.4f,"
                      % (epoch + 1, epochs, batch + 1, num_train_batch,
                         train_loss / (batch + 1), cur_loss),
                      end=" ")
                batch_end = time.time()
                batch_last = batch_end - batch_start
                print("lasts: %.2fs" % batch_last)

        train_loss /= num_train_batch
        print("Epoch: %d/%d, train_loss: %.4f" % (epoch + 1, epochs, train_loss))

        # === validate
        print("\nValidating...")
        val_loss = 0
        batch_start = time.time()
        for batch, (inputs, targets) in zip(range(num_val_batch), val_dataset):
            cur_loss = train_step(encoder, decoder, optimizer, inputs, targets, mask=0)
            val_loss += cur_loss

            if (batch + 1) % 100 == 0:
                print("Epoch: %d/%d, batch: %d/%d, val_loss: %.4f, cur_loss: %.4f, "
                      % (epoch + 1, epochs, batch + 1, num_val_batch,
                         val_loss / (batch + 1), cur_loss),
                      end=" ")
                batch_end = time.time()
                batch_last = batch_end - batch_start
                print("lasts: %.2fs" % batch_last)

        val_loss /= num_val_batch
        print("Epoch: %d/%d, train_loss: %.4f, val_loss: %.4f, "
              % (epoch + 1, epochs, train_loss, val_loss),
              end=" ")

        epoch_end = time.time()
        epoch_last = epoch_end - epoch_start
        print("lasts: %.2fs" % epoch_last)

        if opt_epoch is not None:
            if epoch - opt_epoch > patience:
                print("Stop training, epoch: %d, opt_epoch: %d" % (epoch, opt_epoch))
                break

        if min_val_loss is None or val_loss < min_val_loss:
            min_val_loss = val_loss
            opt_epoch = epoch

            # === Save best model only.
            print("\nSaving...")
            print("Epoch: %d, train_loss: %.4f, val_loss: %.4f"
                  % (epoch + 1, train_loss, val_loss))
            checkpoint.save(file_prefix=checkpoint_prefix)

    print("Training done! min_val_loss=%.4f, opt_epoch=%d"
          % (min_val_loss, opt_epoch), end=" ")
    end = time.time()
    last = end - start
    print("Lasts: %.2fs" % last)
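# train_step() is imported from elsewhere in the project. Purely as a sketch of
# the usual teacher-forcing pattern, and under assumed call signatures
# (encoder(inputs) -> (enc_output, state); decoder(prev_labels, state) ->
# (logits, state)) that may not match the real Encoder/Decoder, one training
# step could look like this; the padding mask uses label value `mask`, as in the
# calls above.
import tensorflow as tf

def train_step_sketch(encoder, decoder, optimizer, inputs, targets, mask=0):
    with tf.GradientTape() as tape:
        _, state = encoder(inputs)
        # Teacher forcing: feed the gold labels shifted right by one step.
        dec_input = tf.concat([tf.zeros_like(targets[:, :1]), targets[:, :-1]], axis=1)
        logits, _ = decoder(dec_input, state)
        loss = tf.keras.losses.sparse_categorical_crossentropy(targets, logits,
                                                               from_logits=True)
        keep = tf.cast(tf.not_equal(targets, mask), loss.dtype)
        loss = tf.reduce_sum(loss * keep) / tf.maximum(tf.reduce_sum(keep), 1.0)
    variables = encoder.trainable_variables + decoder.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
    return loss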
            labels = []
            for word in buf:
                if len(word) == 0:
                    continue
                elif len(word) == 1:
                    label = ['4']
                else:
                    label = ['2'] * len(word)
                    label[0] = '1'
                    label[-1] = '3'

                chars.extend(word)
                labels.extend(label)

            assert len(chars) == len(labels)

            fw.write("::".join(chars) + '\t' + "".join(labels) + '\n')

    fw.close()
    print("Write Done!", label_path)


if __name__ == '__main__':
    data_path = os.path.join(get_data_dir(), "msr_training.utf8")
    label_path = os.path.join(get_data_dir(), "msr_training_label.utf8")
    char2id_dict_path = os.path.join(get_data_dir(), "msr_training_rnn_dict.pkl")

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict = %d" % len(char2id_dict))

    generate_label(input_path=data_path, label_path=label_path)
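# To make the '1'/'2'/'3'/'4' (B/M/E/S) labeling scheme above concrete, a tiny
# self-contained example; the sample words are illustrative only.
def label_words(words):
    labels = []
    for word in words:
        if len(word) == 1:
            labels.append('4')                                      # S: single-char word
        else:
            labels.extend(['1'] + ['2'] * (len(word) - 2) + ['3'])  # B, M..., E
    return "".join(labels)

print(label_words(["研究", "生命", "的", "起源"]))  # -> "1313413"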
def segmentation():
    vocab_size = 3954
    embedding_dim = 64
    num_states = 4
    rnn_units = 32
    pad_index = 0   # pad_index, to mask in loss
    rnn_steps = 30

    test_path = os.path.join(get_data_dir(), "msr_test.utf8")
    seg_path = os.path.join(get_data_dir(), "msr_test_seq2seq.utf8")
    char2id_dict_path = os.path.join(get_data_dir(), "msr_training_char2id_dict.pkl")

    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Model
    encoder = Encoder(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    decoder = Decoder(num_states=num_states,
                      embedding_dim=embedding_dim,
                      rnn_units=rnn_units)
    seq2seq = Seq2Seq(encoder=encoder, decoder=decoder)

    # === Optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # === Checkpoint
    checkpoint_dir = os.path.join(get_model_dir(), "seq2seq")
    # checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder,
                                     decoder=decoder,
                                     optimizer=optimizer)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    status = checkpoint.restore(latest)
    status.assert_existing_objects_matched()

    # === Test once
    batch_size = 2
    inputs = tf.random.uniform((batch_size, rnn_steps),
                               minval=0, maxval=vocab_size + 2, dtype=tf.int32)
    targets = tf.random.uniform((batch_size, rnn_steps),
                                minval=0, maxval=num_states + 1, dtype=tf.int32)
    test_seq2seq_once(encoder=encoder, decoder=decoder,
                      inputs=inputs, targets=targets)

    # === Test
    # Load separator_dict
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    fw = open(seg_path, 'w', encoding='utf-8')

    with open(test_path, 'r', encoding='utf-8') as f:
        line_cnt = 0
        for line in f:
            buf = line[:-1]
            labels = model_predict(model=seq2seq,
                                   char_list=buf,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            if len(buf) != len(labels):
                print("Wrong")
                print(buf, '\n', labels)
                print(len(buf), len(labels))
                break

            # {0: pad, 1: B, 2: M, 3: E, 4: S}
            words = []
            word = []
            for i, label in zip(range(len(buf)), labels):
                word.append(buf[i])
                if label == 3 or label == 4:
                    words.append("".join(word))
                    word = []
            if len(word) > 0:
                words.append("".join(word))

            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)

    fw.close()
from cangjie.utils.config import get_data_dir

import os, pickle


def count_word(input_path=None, word_cnt_dict_path=None):
    """
    input_data: training data of segmentation, split by space.
    word_cnt_dict: {word: count}
    """
    word_cnt_dict = {}

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            buf = line[:-1].split(' ')
            for word in buf:
                if word not in word_cnt_dict:
                    word_cnt_dict[word] = 1
                else:
                    word_cnt_dict[word] += 1

    with open(word_cnt_dict_path, 'wb') as fw:
        pickle.dump(word_cnt_dict, fw)


if __name__ == '__main__':
    train_path = os.path.join(get_data_dir(), "msr_training.utf8")
    word_cnt_dict_path = os.path.join(get_data_dir(), "msr_training_word_cnt_dict.pkl")

    count_word(input_path=train_path, word_cnt_dict_path=word_cnt_dict_path)
    print('Write done!', word_cnt_dict_path)
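# A small follow-up sketch: loading the pickled counts back and inspecting the
# most frequent words, e.g. to pick a frequency cutoff when building the
# dictionary. The top-10 choice is arbitrary.
import os, pickle
from cangjie.utils.config import get_data_dir

word_cnt_dict_path = os.path.join(get_data_dir(), "msr_training_word_cnt_dict.pkl")
with open(word_cnt_dict_path, 'rb') as f:
    word_cnt_dict = pickle.load(f)

for word, cnt in sorted(word_cnt_dict.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(word, cnt)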