def preprocess_data():
    # Read the corpus
    data = read_ner_corpus(path_ner + r'train_data')
    # Read the vocabulary
    word2id, vocab_size = nn_lib.read_word2id_dict(path_ner + r'word2id.pkl')
    # Convert the corpus to ids
    seqs, labels = [], []
    for (sent_, label_) in data:
        sent_id_ = nn_lib.sentence2id(sent_, word2id)
        label_id_ = [label2id[label] for label in label_]
        seqs.append(sent_id_)
        labels.append(label_id_)
    max_seq_len = max([len(x) for x in labels])
    # Pad and arrange the corpus data
    seqs = nn_lib.pad_sequences(seqs, max_seq_len)
    seqs = np.array(seqs)
    labels = nn_lib.pad_sequences(labels, max_seq_len)
    labels = np.array(labels)
    # Build the training, test and validation sets
    x, x_vali, y, y_vali = train_test_split(seqs, labels, test_size=1024)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1024)
    # Save the training, test and validation sets
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'wb') as file:
            # eval(name) looks up the local variable whose name matches `name`
            pickle.dump(eval(name), file)
    return
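# Illustrative sketch only: nn_lib.pad_sequences is the project's own helper and its
# implementation is not shown in this section. Judging from how it is called here
# (positional or keyword `sequences` / `max_seq_len`, optional `add_sos`), it presumably
# behaves roughly like the stand-alone version below, which right-pads with a <PAD> id
# and can prepend a start-of-sequence id; the pad_id / sos_id values are assumptions.
def pad_sequences_sketch(sequences, max_seq_len, add_sos=False, pad_id=0, sos_id=2):
    padded = []
    for seq in sequences:
        seq = list(seq)
        if add_sos:
            seq = [sos_id] + seq                         # prepend <SOS> for decoder inputs
        seq = seq[:max_seq_len]                          # truncate overly long sequences
        seq = seq + [pad_id] * (max_seq_len - len(seq))  # right-pad with <PAD>
        padded.append(seq)
    return padded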
def nmt(model,
        corpus_src,
        path_vocab_src=path_vocab_src,
        path_vocab_tgt=path_vocab_tgt,
        src_seq_len_max=src_seq_len_max):
    # Read the vocabularies
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(path_vocab_tgt)
    id2word_vocab_tgt = {value: key for key, value in word2id_vocab_tgt.items()}
    ids = []
    id_extendeds = []
    vocab_extends = []
    # Convert the input corpus to ids
    for sentence in corpus_src:
        sent = sentence.strip().split()
        id, id_extended, vocab_extend_raw = nn_lib.sentence2id(
            sent=sent, word2id_vocab=word2id_vocab_src, build_extend_vocab=True)
        ids.append(id)
        id_extendeds.append(id_extended)
        # Re-base the extended (OOV) ids from the source vocabulary onto the target vocabulary
        vocab_extend = {
            key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
            for key, value in vocab_extend_raw.items()
        }
        vocab_extends.append(copy.copy(vocab_extend))
    # Pad sequences with zeros and record the size of the extended vocabulary
    ids = nn_lib.pad_sequences(sequences=ids, max_seq_len=src_seq_len_max)
    id_extendeds = nn_lib.pad_sequences(sequences=id_extendeds,
                                        max_seq_len=src_seq_len_max)
    vocab_size_extended = max([len(i) for i in vocab_extends])
    # Arrange the data
    ids = np.array(ids)
    id_extendeds = np.array(id_extendeds)
    vocab_extends = np.array(vocab_extends).reshape([-1, 1])
    data = [ids, id_extendeds, vocab_extends]
    # Run inference; the model outputs per-time-step probability distributions
    tgt_prob_seqs = model.infer(data=data)
    # Convert the predictions back to natural-language sentences
    tgt_seqs = []
    for seq in tgt_prob_seqs:
        seq = np.argmax(seq, axis=1)
        seq = [id2word_vocab_tgt[id] for id in seq]
        seq = np.array(seq).reshape([-1, 1])
        tgt_seqs.append(seq)
    corpus_tgt = np.concatenate(tgt_seqs, axis=1)
    corpus_tgt = [
        ''.join([tmp for tmp in corpus_tgt[i, :] if tmp != '<PAD>'])
        for i in range(corpus_tgt.shape[0])
    ]
    return corpus_tgt
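# Worked illustration (not from the original code; names are hypothetical): the dict
# comprehension above shifts each source-side OOV id so that extended ids continue
# directly after the target vocabulary. With a source vocabulary of 5 words and a target
# vocabulary of 7 words, an OOV word that sentence2id placed at source id 5 (i.e. the
# first id after the source vocabulary) is re-based to 5 - 5 + 7 = 7.
def rebase_extend_ids(vocab_extend_raw, src_vocab_size, tgt_vocab_size):
    return {word: idx - src_vocab_size + tgt_vocab_size
            for word, idx in vocab_extend_raw.items()}

assert rebase_extend_ids({'oov_word': 5}, src_vocab_size=5, tgt_vocab_size=7) == {'oov_word': 7}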
def ner_predict(model, x, word2id, label2id, max_len=None, do_word2id=True):
    # Build the reverse mappings
    id2word = {id: word for word, id in word2id.items()}
    id2label = {id: label for label, id in label2id.items()}
    # Determine the maximum sequence length
    if max_len is None:
        max_len = max(map(lambda seq: len(seq), x))
    # Normalize the input text
    if do_word2id:
        seqs = []
        word_list = []
        for seq in x:
            seq = list(seq)
            word_list.append(seq)
            seq = nn_lib.sentence2id(seq, word2id)
            seqs.append(seq)
        seqs = nn_lib.pad_sequences(seqs, max_len)
    else:
        seqs = x
        word_list = []
        for row in x:
            word_list.append(pd.Series(row).map(id2word).tolist())
    seqs = np.array(seqs)
    # Predict the labels
    label_id_list = model.infer([seqs])
    # Zip each token with its predicted label
    corpus_labels = []
    for i in range(len(word_list)):
        corpus_label = []
        for j in range(len(word_list[i])):
            corpus_label.append(
                (word_list[i][j], id2label[label_id_list[i][j]]))
        corpus_labels.append(corpus_label)
    return corpus_labels
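# Optional helper (not in the original module, name is hypothetical): flattens the
# return value of ner_predict() -- a list of (token, label) lists -- into readable
# lines such as "word/B-LOC word/I-LOC ...". It only relies on the return structure
# shown above and assumes nothing about the concrete label set.
def format_ner_result(corpus_labels):
    lines = []
    for corpus_label in corpus_labels:
        lines.append(' '.join('{}/{}'.format(word, label)
                              for word, label in corpus_label))
    return lines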
def preprocess_data():
    # Read the vocabularies
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(path_vocab_tgt)
    # Read the corpora
    corpus_src = read_file(path_corpus_src)
    corpus_tgt = read_file(path_corpus_tgt)
    # Convert the corpus text to id values
    src_ids = []
    src_id_extendeds = []
    word2id_vocab_extends = []
    tgt_ids = []
    tgt_id_extendeds = []
    for src, tgt in zip(corpus_src, corpus_tgt):
        # Convert src to ids
        src_id, src_id_extended, vocab_extend_raw = nn_lib.sentence2id(
            sent=src, word2id_vocab=word2id_vocab_src, build_extend_vocab=True)
        src_ids.append(src_id)
        src_id_extendeds.append(src_id_extended)
        # Re-base the extended (OOV) ids from the source vocabulary onto the target vocabulary
        vocab_extend = {
            key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
            for key, value in vocab_extend_raw.items()
        }
        word2id_vocab_extends.append(copy.copy(vocab_extend))
        # Convert tgt to ids
        tgt_id = nn_lib.sentence2id(sent=tgt,
                                    word2id_vocab=word2id_vocab_tgt,
                                    build_extend_vocab=False)
        vocab_extend.update(word2id_vocab_tgt)
        tgt_id_extended = nn_lib.sentence2id(sent=tgt,
                                             word2id_vocab=vocab_extend,
                                             build_extend_vocab=False)
        tgt_ids.append(tgt_id)
        tgt_id_extendeds.append(tgt_id_extended)
    del src, src_id, src_id_extended, tgt, tgt_id, tgt_id_extended
    # Pad sequences with zeros and record the size of the extended vocabulary
    src_ids = nn_lib.pad_sequences(sequences=src_ids,
                                   max_seq_len=src_seq_len_max)
    src_id_extendeds = nn_lib.pad_sequences(sequences=src_id_extendeds,
                                            max_seq_len=src_seq_len_max)
    tgt_ids = nn_lib.pad_sequences(sequences=tgt_ids,
                                   max_seq_len=tgt_seq_len_max,
                                   add_sos=True)
    tgt_id_extendeds = nn_lib.pad_sequences(sequences=tgt_id_extendeds,
                                            max_seq_len=tgt_seq_len_max,
                                            add_sos=True)
    vocab_size_extended = max([len(i) for i in word2id_vocab_extends])
    # Arrange the data
    src_ids = np.array(src_ids)
    src_id_extendeds = np.array(src_id_extendeds)
    tgt_ids = np.array(tgt_ids)
    tgt_id_extendeds = np.array(tgt_id_extendeds)
    word2id_vocab_extends = np.array(word2id_vocab_extends).reshape([-1, 1])
    # Build and save the training, test and validation sets
    nn_lib.generate_train_test_vali(src_ids,
                                    src_id_extendeds,
                                    tgt_ids,
                                    tgt_id_extendeds,
                                    word2id_vocab_extends,
                                    file_path=path_corpus_processed,
                                    corpus_names=corpus_names,
                                    data_test_size=128 * 2,
                                    data_vali_size=128 * 3)
    return
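# Illustrative sketch only: nn_lib.generate_train_test_vali is the project's own helper
# and its real implementation is not shown in this section. Judging from the call above
# and from the explicit split-and-pickle loop in the chat-bot preprocess_data() below,
# it presumably splits the arrays into train / test / validation subsets and pickles
# each one under `file_path` using the names in `corpus_names`. The split order and the
# mapping of names to arrays in this sketch are assumptions.
import pickle
from sklearn.model_selection import train_test_split

def generate_train_test_vali_sketch(*arrays, file_path, corpus_names,
                                    data_test_size, data_vali_size):
    # First carve off the validation split, then the test split from the remainder.
    split_vali = train_test_split(*arrays, test_size=data_vali_size)
    remainder, vali = split_vali[0::2], split_vali[1::2]
    split_test = train_test_split(*remainder, test_size=data_test_size)
    train, test = split_test[0::2], split_test[1::2]
    # Pickle every subset under its configured name (order assumed: train, test, vali).
    datasets = list(train) + list(test) + list(vali)
    for name, dataset in zip(corpus_names, datasets):
        with open(file_path + name, 'wb') as file:
            pickle.dump(dataset, file)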
def preprocess_data(vocab_size):
    # Read the corpus data
    data_q = []
    data_a = []
    for file_name in corpus_list:
        print('loading file {}'.format(path_corpus + file_name))
        data_q_, data_a_ = load_file(path_corpus + file_name)
        data_q = data_q + data_q_
        data_a = data_a + data_a_
        del data_q_, data_a_
    # Build or read the vocabulary
    if flag_build_vocab:
        word2id_vocab, vocab_size = nn_lib.build_word2id_vocab(
            data=data_q,
            saved_path=path_vocab,
            vocab_size=vocab_size,
            use_seg=True)
    else:
        word2id_vocab, vocab_size = nn_lib.read_word2id_dict(path_vocab)
    # Convert the corpus text to id values
    q_ids = []
    q_id_extendeds = []
    word2id_vocab_extends = []
    a_ids = []
    a_id_extendeds = []
    for q, a in zip(data_q, data_a):
        # Convert question Q to ids
        q_seg = nn_lib.str_segment(q)
        q_id, q_id_extended, vocab_extend = nn_lib.sentence2id(
            sent=q_seg, word2id_vocab=word2id_vocab, build_extend_vocab=True)
        q_ids.append(q_id)
        q_id_extendeds.append(q_id_extended)
        word2id_vocab_extends.append(copy.copy(vocab_extend))
        # Convert answer A to ids
        a_seg = nn_lib.str_segment(a)
        a_id = nn_lib.sentence2id(sent=a_seg,
                                  word2id_vocab=word2id_vocab,
                                  build_extend_vocab=False)
        vocab_extend.update(word2id_vocab)
        a_id_extended = nn_lib.sentence2id(sent=a_seg,
                                           word2id_vocab=vocab_extend,
                                           build_extend_vocab=False)
        a_ids.append(a_id)
        a_id_extendeds.append(a_id_extended)
    del q, q_seg, q_id, q_id_extended, a, a_seg, a_id, a_id_extended
    # Pad sequences with zeros and record the size of the extended vocabulary
    q_ids = nn_lib.pad_sequences(sequences=q_ids, max_seq_len=max_seq_len)
    q_id_extendeds = nn_lib.pad_sequences(sequences=q_id_extendeds,
                                          max_seq_len=max_seq_len)
    a_ids = nn_lib.pad_sequences(sequences=a_ids,
                                 max_seq_len=max_seq_len,
                                 add_sos=True)
    a_id_extendeds = nn_lib.pad_sequences(sequences=a_id_extendeds,
                                          max_seq_len=max_seq_len,
                                          add_sos=True)
    vocab_size_extended = max([len(i) for i in word2id_vocab_extends])
    # Arrange the data
    q_ids = np.array(q_ids)
    q_id_extendeds = np.array(q_id_extendeds)
    a_ids = np.array(a_ids)
    a_id_extendeds = np.array(a_id_extendeds)
    word2id_vocab_extends = np.array(word2id_vocab_extends).reshape([-1, 1])
    # Build the training, test and validation sets
    x, x_vali, x_extended, x_extended_vali, y, y_vali, y_extended, y_extended_vali, vocab_extend, vocab_extend_vali \
        = train_test_split(q_ids, q_id_extendeds, a_ids, a_id_extendeds,
                           word2id_vocab_extends, test_size=1024 * 8)
    x_train, x_test, x_extended_train, x_extended_test, y_train, y_test, y_extended_train, y_extended_test, vocab_extend_train, vocab_extend_test \
        = train_test_split(x, x_extended, y, y_extended, vocab_extend,
                           test_size=1024)
    del x, x_extended, y, y_extended, vocab_extend
    # Save the training, test and validation sets
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'wb') as file:
            # eval(name) looks up the local variable whose name matches `name`
            pickle.dump(eval(name), file)
    return
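# Hypothetical loader (not in the original module): mirrors the pickle loop above and
# reads the saved splits back into a dict keyed by name. It assumes the same
# module-level `path_corpus_processed` and `processed_corpus_names` configuration used
# by preprocess_data(); nothing about their values is added here.
import pickle

def load_processed_corpus(path=None, names=None):
    path = path_corpus_processed if path is None else path
    names = processed_corpus_names if names is None else names
    datasets = {}
    for name in names:
        with open(path + name, 'rb') as file:
            datasets[name] = pickle.load(file)
    return datasets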