def nmt(model,
        corpus_src,
        path_vocab_src=path_vocab_src,
        path_vocab_tgt=path_vocab_tgt,
        src_seq_len_max=src_seq_len_max):
    # Load the vocabularies
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
        path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(
        path_vocab_tgt)
    id2word_vocab_tgt = {
        value: key
        for key, value in word2id_vocab_tgt.items()
    }
    ids = []
    id_extendeds = []
    vocab_extends = []
    # Preprocess the input corpus
    for sentence in corpus_src:
        sent = sentence.strip().split()
        id, id_extended, vocab_extend_raw = nn_lib.sentence2id(
            sent=sent, word2id_vocab=word2id_vocab_src, build_extend_vocab=True)
        ids.append(id)
        id_extendeds.append(id_extended)
        vocab_extend = {
            key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
            for key, value in vocab_extend_raw.items()
        }
        vocab_extends.append(copy.copy(vocab_extend))
    # Pad sequences with zeros and record the size of the extended vocabulary
    ids = nn_lib.pad_sequences(sequences=ids, max_seq_len=src_seq_len_max)
    id_extendeds = nn_lib.pad_sequences(sequences=id_extendeds,
                                        max_seq_len=src_seq_len_max)
    vocab_size_extended = max([len(i) for i in vocab_extends])
    # Arrange the data into arrays
    ids = np.array(ids)
    id_extendeds = np.array(id_extendeds)
    vocab_extends = np.array(vocab_extends).reshape([-1, 1])
    data = [ids, id_extendeds, vocab_extends]
    # Run inference; the model outputs a probability distribution per time step
    tgt_prob_seqs = model.infer(data=data)
    # Convert the predictions back into natural-language sentences
    tgt_seqs = []
    for seq in tgt_prob_seqs:
        seq = np.argmax(seq, axis=1)
        seq = [id2word_vocab_tgt[id] for id in seq]
        seq = np.array(seq).reshape([-1, 1])
        tgt_seqs.append(seq)
    corpus_tgt = np.concatenate(tgt_seqs, axis=1)
    corpus_tgt = [
        ''.join([tmp for tmp in corpus_tgt[i, :] if tmp != '<PAD>'])
        for i in range(corpus_tgt.shape[0])
    ]
    return corpus_tgt
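# Illustrative usage sketch for nmt(); it assumes a trained seq2seq model
# object exposing infer() as used above and that the vocabulary pickles at
# path_vocab_src / path_vocab_tgt already exist. The sample sentences are
# placeholders, not project data.
def demo_nmt(model):
    sample_src = ['this is a test .', 'how are you ?']
    translations = nmt(model=model, corpus_src=sample_src)
    for src, tgt in zip(sample_src, translations):
        print(src, '->', tgt)
    return translations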
def preprocess_data():
    # Load the corpus
    data = read_ner_corpus(path_ner + r'train_data')
    # Load the vocabulary
    word2id, vocab_size = nn_lib.read_word2id_dict(path_ner + r'word2id.pkl')
    # Convert the corpus to ids
    seqs, labels = [], []
    for (sent_, label_) in data:
        sent_id_ = nn_lib.sentence2id(sent_, word2id)
        label_id_ = [label2id[label] for label in label_]
        seqs.append(sent_id_)
        labels.append(label_id_)
    max_seq_len = max([len(x) for x in labels])
    # Pad and arrange the corpus data
    seqs = nn_lib.pad_sequences(seqs, max_seq_len)
    seqs = np.array(seqs)
    labels = nn_lib.pad_sequences(labels, max_seq_len)
    labels = np.array(labels)
    # Build the training, test and validation sets
    x, x_vali, y, y_vali = train_test_split(seqs, labels, test_size=1024)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1024)
    # Save the training, test and validation sets
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'wb') as file:
            pickle.dump(eval(name), file)
    return
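# A minimal sketch of reading the pickled splits back, assuming
# load_processed_corpus() (used in __main__ below) simply mirrors the
# pickle.dump() loop in preprocess_data(); the actual helper may differ.
def demo_load_processed_corpus():
    corpus = {}
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'rb') as file:
            corpus[name] = pickle.load(file)
    return corpus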
def demo():
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
        path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(
        path_vocab_tgt)
    id2word_vocab_src = {
        value: key
        for key, value in word2id_vocab_src.items()
    }
    id2word_vocab_tgt = {
        value: key
        for key, value in word2id_vocab_tgt.items()
    }
    # ' '.join([id2word_vocab_src[i] for i in corpus['x_test'][100]])
    # ''.join([id2word_vocab_tgt[i] for i in corpus['y_test'][100]])
    return
def demo_rebuild_w2v_matrix(path_word2id, path_w2v):
    path_word2id = u'E:\\MachineLearning\\data\\seq2seq_nmt\\vocab_zh.pkl'
    path_w2v = path_wiki + u'45000-samll.txt'
    word2id_vocab, vocab_size = nn_lib.read_word2id_dict(path_word2id)
    w2v_vector = KeyedVectors.load_word2vec_format(path_w2v)
    w2v_matrix = rebuild_w2v_matrix(word2id_vocab, w2v_vector)
    return w2v_matrix
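# A hedged sketch of what rebuild_w2v_matrix() is expected to do: align the
# pretrained word2vec vectors with the word2id vocabulary, falling back to a
# random vector for out-of-vocabulary words. The real nn_lib implementation
# may differ in initialisation and ordering.
def demo_w2v_matrix(word2id_vocab, w2v_vector):
    embd_dim = w2v_vector.vector_size
    w2v_matrix = np.random.uniform(-0.25, 0.25,
                                   size=[len(word2id_vocab), embd_dim])
    for word, idx in word2id_vocab.items():
        if word in w2v_vector:
            w2v_matrix[idx] = w2v_vector[word]
    return w2v_matrix.astype(np.float32)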
    for i in range(len(word_list)):
        corpus_label = []
        for j in range(len(word_list[i])):
            corpus_label.append(
                (word_list[i][j], id2label[label_id_list[i][j]]))
        corpus_labels.append(corpus_label)
    return corpus_labels


if __name__ == '__main__':
    if flag_build_vocab is True:
        build_vocab()
    if flag_process_data is True:
        preprocess_data()
    word2id, vocab_size = nn_lib.read_word2id_dict(path_ner + r'word2id.pkl')
    corpus = load_processed_corpus()
    data = [corpus['x_train'], corpus['y_train']]
    data_test = [corpus['x_test'], corpus['y_test']]
    model = NeuralNetwork(data=data,
                          model_type='bilstm_crf',
                          loss_fun_type='bilstm_crf',
                          model_parameter={
                              'word_embd_pretrain': None,
                              'keep_prob': keep_prob,
                              'vocab_num': vocab_size,
                              'word_embd_dim': word_embd_dim,
                              'label_num': len(label2id),
                              'dim_rnn': dim_rnn,
                              'batch_size': batch_size
                          },
def preprocess_data():
    # Load the vocabularies
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
        path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(
        path_vocab_tgt)
    # Load the corpora
    corpus_src = read_file(path_corpus_src)
    corpus_tgt = read_file(path_corpus_tgt)
    # Convert the corpus text to id values
    src_ids = []
    src_id_extendeds = []
    word2id_vocab_extends = []
    tgt_ids = []
    tgt_id_extendeds = []
    for src, tgt in zip(corpus_src, corpus_tgt):
        # Convert src to ids
        src_id, src_id_extended, vocab_extend_raw = nn_lib.sentence2id(
            sent=src, word2id_vocab=word2id_vocab_src, build_extend_vocab=True)
        src_ids.append(src_id)
        src_id_extendeds.append(src_id_extended)
        vocab_extend = {
            key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
            for key, value in vocab_extend_raw.items()
        }
        word2id_vocab_extends.append(copy.copy(vocab_extend))
        # Convert tgt to ids
        tgt_id = nn_lib.sentence2id(sent=tgt,
                                    word2id_vocab=word2id_vocab_tgt,
                                    build_extend_vocab=False)
        vocab_extend.update(word2id_vocab_tgt)
        tgt_id_extended = nn_lib.sentence2id(sent=tgt,
                                             word2id_vocab=vocab_extend,
                                             build_extend_vocab=False)
        tgt_ids.append(tgt_id)
        tgt_id_extendeds.append(tgt_id_extended)
        del src, src_id, src_id_extended, tgt, tgt_id, tgt_id_extended
    # Pad sequences with zeros and record the size of the extended vocabulary
    src_ids = nn_lib.pad_sequences(sequences=src_ids,
                                   max_seq_len=src_seq_len_max)
    src_id_extendeds = nn_lib.pad_sequences(sequences=src_id_extendeds,
                                            max_seq_len=src_seq_len_max)
    tgt_ids = nn_lib.pad_sequences(sequences=tgt_ids,
                                   max_seq_len=tgt_seq_len_max,
                                   add_sos=True)
    tgt_id_extendeds = nn_lib.pad_sequences(sequences=tgt_id_extendeds,
                                            max_seq_len=tgt_seq_len_max,
                                            add_sos=True)
    vocab_size_extended = max([len(i) for i in word2id_vocab_extends])
    # Arrange the data into arrays
    src_ids = np.array(src_ids)
    src_id_extendeds = np.array(src_id_extendeds)
    tgt_ids = np.array(tgt_ids)
    tgt_id_extendeds = np.array(tgt_id_extendeds)
    word2id_vocab_extends = np.array(word2id_vocab_extends).reshape([-1, 1])
    # Build the training, test and validation sets
    nn_lib.generate_train_test_vali(src_ids,
                                    src_id_extendeds,
                                    tgt_ids,
                                    tgt_id_extendeds,
                                    word2id_vocab_extends,
                                    file_path=path_corpus_processed,
                                    corpus_names=corpus_names,
                                    data_test_size=128 * 2,
                                    data_vali_size=128 * 3)
    # x, x_vali, x_extended, x_extended_vali, y, y_vali, y_extended, y_extended_vali, vocab_extend, vocab_extend_vali \
    #     = train_test_split(src_ids, src_id_extendeds, tgt_ids, tgt_id_extendeds, word2id_vocab_extends, test_size=128*3)
    # x_train, x_test, x_extended_train, x_extended_test, y_train, y_test, y_extended_train, y_extended_test, vocab_extend_train, vocab_extend_test \
    #     = train_test_split(x, x_extended, y, y_extended, vocab_extend, test_size=128*2)
    # del x, x_extended, y, y_extended, vocab_extend
    # # Save the training, test and validation sets
    # for name in processed_corpus_names:
    #     with open(path_corpus_processed + name, 'wb') as file:
    #         pickle.dump(eval(name), file)
    return
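# A small illustration (assumed semantics) of the offset applied above: OOV
# words returned by sentence2id() with build_extend_vocab=True are assumed to
# be numbered upward from len(word2id_vocab_src), so re-basing them onto
# len(word2id_vocab_tgt) lets the decoder's copy mechanism address them right
# after the target vocabulary.
def demo_rebase_extend_vocab(vocab_extend_raw, src_vocab_size, tgt_vocab_size):
    # e.g. {'oov_word': src_vocab_size + 0} -> {'oov_word': tgt_vocab_size + 0}
    return {
        word: idx - src_vocab_size + tgt_vocab_size
        for word, idx in vocab_extend_raw.items()
    }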
if __name__ == "__main__": # 创建字典 if flag_build_vocab is True: word2id_vocab_en, word2id_vocab_zh = build_vocab() # 预处理数据 if flag_process_data is True: preprocess_data() # 预训练词向量 if flag_pretrain_w2v is True: nn_lib.train_word2vec(path_corpus=path_corpus_src, word2vec_dim=word_embd_dim, path_w2v_model=path_seq2seq + 'en_w2v_model', path_w2v_vector=path_seq2seq + 'en_w2v_vector') w2v_vector = nn_lib.load_w2v_vector(path_seq2seq + 'en_w2v_vector') word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict( path_vocab_src) encoder_word_embd_pretrain = nn_lib.rebuild_w2v_matrix( word2id_vocab_src, w2v_vector) with open(path_seq2seq + 'encoder_word_embd_pretrain', 'wb') as file: pickle.dump(encoder_word_embd_pretrain, file) # word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(path_vocab_tgt) corpus = load_processed_corpus() data = [ corpus['x_train'], corpus['x_extended_train'], corpus['vocab_extend_train'], corpus['y_train'], corpus['y_extended_train'] ] data_test = [ corpus['x_test'], corpus['x_extended_test'], corpus['vocab_extend_test'], corpus['y_test'],
def preprocess_data(vocab_size):
    # Load the corpus data
    data_q = []
    data_a = []
    for file_name in corpus_list:
        print('loading file {}'.format(path_corpus + file_name))
        data_q_, data_a_ = load_file(path_corpus + file_name)
        data_q = data_q + data_q_
        data_a = data_a + data_a_
        del data_q_, data_a_
    # Build or load the vocabulary
    if flag_build_vocab:
        word2id_vocab, vocab_size = nn_lib.build_word2id_vocab(
            data=data_q,
            saved_path=path_vocab,
            vocab_size=vocab_size,
            use_seg=True)
    else:
        word2id_vocab, vocab_size = nn_lib.read_word2id_dict(path_vocab)
    # Convert the corpus text to id values
    q_ids = []
    q_id_extendeds = []
    word2id_vocab_extends = []
    a_ids = []
    a_id_extendeds = []
    for q, a in zip(data_q, data_a):
        # Convert question Q to ids
        q_seg = nn_lib.str_segment(q)
        q_id, q_id_extended, vocab_extend = nn_lib.sentence2id(
            sent=q_seg, word2id_vocab=word2id_vocab, build_extend_vocab=True)
        q_ids.append(q_id)
        q_id_extendeds.append(q_id_extended)
        word2id_vocab_extends.append(copy.copy(vocab_extend))
        # Convert answer A to ids
        a_seg = nn_lib.str_segment(a)
        a_id = nn_lib.sentence2id(sent=a_seg,
                                  word2id_vocab=word2id_vocab,
                                  build_extend_vocab=False)
        vocab_extend.update(word2id_vocab)
        a_id_extended = nn_lib.sentence2id(sent=a_seg,
                                           word2id_vocab=vocab_extend,
                                           build_extend_vocab=False)
        a_ids.append(a_id)
        a_id_extendeds.append(a_id_extended)
        del q, q_seg, q_id, q_id_extended, a, a_seg, a_id, a_id_extended
    # Pad sequences with zeros and record the size of the extended vocabulary
    q_ids = nn_lib.pad_sequences(sequences=q_ids, max_seq_len=max_seq_len)
    q_id_extendeds = nn_lib.pad_sequences(sequences=q_id_extendeds,
                                          max_seq_len=max_seq_len)
    a_ids = nn_lib.pad_sequences(sequences=a_ids,
                                 max_seq_len=max_seq_len,
                                 add_sos=True)
    a_id_extendeds = nn_lib.pad_sequences(sequences=a_id_extendeds,
                                          max_seq_len=max_seq_len,
                                          add_sos=True)
    vocab_size_extended = max([len(i) for i in word2id_vocab_extends])
    # Arrange the data into arrays
    q_ids = np.array(q_ids)
    q_id_extendeds = np.array(q_id_extendeds)
    a_ids = np.array(a_ids)
    a_id_extendeds = np.array(a_id_extendeds)
    word2id_vocab_extends = np.array(word2id_vocab_extends).reshape([-1, 1])
    # Build the training, test and validation sets
    x, x_vali, x_extended, x_extended_vali, y, y_vali, y_extended, y_extended_vali, vocab_extend, vocab_extend_vali \
        = train_test_split(q_ids, q_id_extendeds, a_ids, a_id_extendeds, word2id_vocab_extends, test_size=1024*8)
    x_train, x_test, x_extended_train, x_extended_test, y_train, y_test, y_extended_train, y_extended_test, vocab_extend_train, vocab_extend_test \
        = train_test_split(x, x_extended, y, y_extended, vocab_extend, test_size=1024)
    del x, x_extended, y, y_extended, vocab_extend
    # Save the training, test and validation sets
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'wb') as file:
            pickle.dump(eval(name), file)
    return