def rnn_layer(units: int, input_feature_dim: int, cell_type: str = 'lstm',
              if_bidirectional: bool = True) -> tf.keras.Model:
    """Build one recurrent layer (LSTM or GRU) wrapped as a Keras model.

    :param units: number of units of the recurrent cell
    :param input_feature_dim: size of the feature axis of the input sequence
    :param cell_type: recurrent cell type, 'lstm' or 'gru' (default 'lstm')
    :param if_bidirectional: whether to wrap the layer in ``Bidirectional``
    :return: a model mapping the input sequence to ``[outputs, states]``;
        ``states`` is the slice of ``outputs`` at the last time step, not the
        internal states produced by ``return_state=True``
    :raises ValueError: if ``cell_type`` is neither 'lstm' nor 'gru'
    """
    inputs = tf.keras.Input(shape=(None, input_feature_dim))

    if cell_type == 'lstm':
        rnn = tf.keras.layers.LSTM(units=units, return_sequences=True, return_state=True,
                                   recurrent_initializer='glorot_uniform')
    elif cell_type == 'gru':
        rnn = tf.keras.layers.GRU(units=units, return_sequences=True, return_state=True,
                                  recurrent_initializer='glorot_uniform')
    else:
        print('cell执行了类型执行出错,定位细节参见log')
        utils.log_operator(level=10).info("cell执行了类型执行出错")
        # Fail explicitly: previously execution fell through and crashed with
        # an UnboundLocalError on ``rnn`` a few lines further down.
        raise ValueError(
            "unsupported cell_type {!r}, expected 'lstm' or 'gru'".format(cell_type))

    if if_bidirectional:
        rnn = tf.keras.layers.Bidirectional(rnn)

    rnn_outputs = rnn(inputs)
    # With return_state=True the layer returns a sequence whose first element
    # is the full output sequence; the remaining state tensors are not used.
    outputs = rnn_outputs[0]
    states = outputs[:, -1, :]

    return tf.keras.Model(inputs=inputs, outputs=[outputs, states])
def __init__(self, execute_type: str, checkpoint_dir: str, num_layers: int, units: int,
             d_model: int, num_heads: int, dropout: float, start_sign: str, end_sign: str,
             beam_size: int, vocab_size: int, dict_fn: str, max_length: int):
    """Initialise the Transformer chatter and load the model.

    :param execute_type: execution mode of the dialogue system ("train" or other)
    :param checkpoint_dir: directory where checkpoints are stored
    :param num_layers: number of layers inside the transformer
    :param units: number of units
    :param d_model: embedding dimension
    :param num_heads: number of attention heads
    :param dropout: dropout rate
    :param start_sign: start-of-sentence marker
    :param end_sign: end-of-sentence marker
    :param beam_size: beam size, forwarded to the base-class initializer
    :param vocab_size: vocabulary size
    :param dict_fn: path of the saved dictionary
    :param max_length: maximum length of a single sentence
    :return: None
    """
    super().__init__(checkpoint_dir, beam_size, max_length, dict_fn, start_sign, end_sign)

    self.model = transformer.transformer(
        vocab_size=vocab_size,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
    )

    # Optimizer driven by the project's custom learning-rate schedule.
    self.learning_rate = optimizers.CustomSchedule(d_model)
    self.optimizer = tf.keras.optimizers.Adam(
        self.learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    self.train_loss = tf.keras.metrics.Mean(name='train_loss')
    self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    self.checkpoint = tf.train.Checkpoint(transformer=self.model, optimizer=self.optimizer)

    print('正在检查是否存在检查点')
    # NOTE(review): self.ckpt is not set in this method; presumably the
    # base-class initializer provides it - confirm.
    if self.ckpt:
        print('存在检查点,正在加载检查点')
        newest = tf.train.latest_checkpoint(checkpoint_dir)
        self.checkpoint.restore(newest).expect_partial()
    elif execute_type == "train":
        print('不存在检查点,正在train模式')
    else:
        # Without a checkpoint only training makes sense: stop the program.
        print('不存在检查点,请先执行train模式,再进入chat模式')
        exit(0)

    start_up_template = (
        "启动Transformer聊天器,执行类别为:{},模型参数配置为:num_layers:{},"
        "d_model:{},num_heads:{},units:{},dropout:{},vocab_size:{},"
        "max_length:{}"
    )
    log_operator(level=10).info(start_up_template.format(
        execute_type, num_layers, d_model, num_heads, units, dropout, vocab_size, max_length))
def combine_tokenized_data_single(standby_data: list, combine_data: str, if_remove: bool = True):
    """Merge several single-turn question/answer files into one file.

    Every line of every input file is stripped, has "/" replaced by a space
    and is appended to the combined file.

    :param standby_data: paths of the text files to merge
    :param combine_data: path of the combined output file
    :param if_remove: whether an already existing combined file is deleted first
    :return: None; the program exits with status 0 if an input file is missing
    """
    already_there = os.path.exists(combine_data)
    if already_there and if_remove:
        os.remove(combine_data)

    missing_template = "{}文件不存在,请检查之后再次运行"
    progress_template = "\r数据处理进度:{}"

    line_total = 0
    file_total = 0
    for source_fn in standby_data:
        if not os.path.exists(source_fn):
            print(missing_template.format(source_fn))
            exit(0)

        # The output is opened in append mode for each input so the result
        # accumulates across all input files.
        with open(source_fn, 'r', encoding='utf-8') as source_file, \
                open(combine_data, 'a', encoding='utf-8') as target_file:
            for raw_line in source_file:
                cleaned = raw_line.strip().replace("/", " ")
                target_file.write(cleaned + "\n")
                line_total += 1
                print(progress_template.format(line_total), flush=True, end="")

        file_total += 1

    message = "数据处理完毕,数据信息统计:共处理{}个分词文件,整理出{}条数据".format(file_total, line_total)
    print("\n" + message)
    log_operator(level=10).info(message)
def to_single_turn_dataset(tokenized_data_path: str, qa_data_path: str, remove_tokenized: bool = True):
    """Turn a multi-turn dialogue file into single-turn question/answer pairs.

    The input holds one utterance per line with an empty line between
    dialogues. Each pair of adjacent utterances of a dialogue is written as
    "question<TAB>answer"; the last utterance of a dialogue is never used as
    a question.

    :param tokenized_data_path: path of the multi-turn dialogue file
    :param qa_data_path: path where the single-turn pairs are written
    :param remove_tokenized: not used by this function
    :return: None
    """
    dialogue_total = 0
    pair_total = 0
    longest = 0
    shortest = 10000
    lengths = []
    pending_question = None  # previous utterance of the current dialogue

    with open(tokenized_data_path, encoding="utf-8") as raw_file, \
            open(qa_data_path, 'w', encoding="utf-8") as single_turn_data_file:
        for raw_line in raw_file:
            utterance = raw_line.strip('\n').replace('/', '')

            if utterance == '':
                # An empty line closes the dialogue: drop the pending question.
                pending_question = None
                dialogue_total += 1
                continue

            if pending_question is not None:
                single_turn_data_file.write(pending_question + "\t" + utterance + "\n")
                pair_total += 1
                if pair_total % 10000 == 0:
                    print('已处理:', pair_total, '个问答对')

            # The current utterance is the question for the next line.
            pending_question = utterance

            size = len(utterance)
            if size > longest:
                longest = size
            if size < shortest:
                shortest = size
            lengths.append(size)

    message = "对话数据集转换完毕:共处理{}轮对话数据,整理出{}对" \
              "问答对,语句最大长度:{},语句最短长度{},语句平均长度{:.3f}".format(
                  dialogue_total, pair_total, longest, shortest, np.mean(lengths))
    print(message)
    log_operator(level=10).info(message)
def preprocess_raw_douban_data(raw_data: str, tokenized_data: str, repeat_data: int = 10, if_remove: bool = True):
    """Convert the raw douban data set into the multi-turn dialogue format.

    Only every ``repeat_data``-th line of the raw file is used. Each used line
    is split on tabs, its first and last fields are dropped, and the remaining
    utterances are written one per line followed by an empty line.

    :param raw_data: path of the raw data file
    :param tokenized_data: path where the processed data is appended
    :param repeat_data: number of raw lines per dialogue group; one line out
        of each group is kept
    :param if_remove: forwarded to ``_check_file`` as ``remove_tokenized``
    :return: None
    """
    _check_file(raw_file=raw_data, processed_file=tokenized_data, remove_tokenized=if_remove)

    dialogue_total = 0
    longest = 0
    shortest = 10000
    lengths = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, \
            open(tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for line_index, raw_line in enumerate(raw_file):
            if line_index % repeat_data != 0:
                continue

            record = raw_line.strip('\n').replace('/', '')
            if record == "":
                continue

            # Drop the leading label field and the trailing field.
            utterances = record.split('\t')[1:-1]
            sizes = [len(utterance) for utterance in utterances]
            lengths.extend(sizes)
            longest = max([longest] + sizes)
            shortest = min([shortest] + sizes)

            # One utterance per line, then an empty line to end the dialogue.
            tokenized_file.write("".join(utterance + "\n" for utterance in utterances) + "\n")

            dialogue_total += 1
            if dialogue_total % 10000 == 0:
                print("数据处理进度:{}".format(dialogue_total))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(
                  dialogue_total, longest, shortest, np.mean(lengths))
    print(message)
    log_operator(level=10).info(message)
def preprocess_raw_wei_bo_data(raw_post_data: str, raw_response_data, tokenized_data: str, if_remove: bool = True):
    """Convert the raw weibo post/response files into the dialogue format.

    Posts and responses are read in lockstep; every non-empty pair is written
    as two lines followed by an empty line, with "/" replaced by a space.

    :param raw_post_data: path of the raw weibo post file
    :param raw_response_data: path of the raw weibo response file
    :param tokenized_data: path where the processed data is appended
    :param if_remove: forwarded to ``_check_file`` as ``remove_tokenized``
    :return: None; the program exits with status 0 if the response file is missing
    """
    _check_file(raw_file=raw_post_data, processed_file=tokenized_data, remove_tokenized=if_remove)

    response_available = os.path.exists(raw_response_data)
    if not response_available:
        print('数据集不存在,请添加数据集!')
        exit(0)

    dialogue_total = 0
    longest = 0
    shortest = 10000
    lengths = []

    with open(raw_post_data, 'r', encoding='utf-8') as post_file, \
            open(raw_response_data, 'r', encoding='utf-8') as response_file, \
            open(tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for raw_post, raw_response in zip(post_file, response_file):
            post = raw_post.strip("\n").replace("/", " ")
            response = raw_response.strip("\n").replace("/", " ")
            # Skip the pair when either side is empty.
            if not (post and response):
                continue

            pair_sizes = (len(post), len(response))
            longest = max(longest, *pair_sizes)
            shortest = min(shortest, *pair_sizes)
            lengths.extend(pair_sizes)

            tokenized_file.write("\n".join((post, response)) + "\n\n")

            dialogue_total += 1
            if dialogue_total % 10000 == 0:
                print("已读取:{}轮对话数据".format(dialogue_total))

    message = "数据处理完毕:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(
                  dialogue_total, longest, shortest, np.mean(lengths))
    print(message)
    log_operator(level=10).info(message)
def preprocess_raw_qin_yun_data(raw_data: str, tokenized_data: str, if_remove: bool = True):
    """Convert the raw qingyun data set into the tokenized dialogue format.

    Each non-empty raw line is one dialogue whose sentences are separated by
    "|". Every sentence is tokenized with jieba and written on its own line;
    an empty line ends the dialogue.

    :param raw_data: path of the raw data file
    :param tokenized_data: path where the tokenized data is appended
    :param if_remove: forwarded to ``_check_file`` as ``remove_tokenized``
    :return: None
    """
    _check_file(raw_file=raw_data, processed_file=tokenized_data, remove_tokenized=if_remove)

    dialogue_total = 0
    longest = 0
    shortest = 10000
    lengths = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, \
            open(tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for raw_line in raw_file:
            dialogue = raw_line.strip().replace("/", " ")
            if not dialogue:
                continue

            for piece in dialogue.split("|"):
                sentence = piece.strip()

                # Statistics are taken on the sentence before tokenization.
                size = len(sentence)
                lengths.append(size)
                if size > longest:
                    longest = size
                if size < shortest:
                    shortest = size

                tokens = jieba.cut(sentence)
                tokenized_file.write(" ".join(tokens) + "\n")

            tokenized_file.write("\n")
            dialogue_total += 1
            if dialogue_total % 10000 == 0:
                print("已读取:{}轮对话数据".format(dialogue_total))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(
                  dialogue_total, longest, shortest, np.mean(lengths))
    print(message)
    log_operator(level=10).info(message)
def preprocess_raw_lccc_data(raw_data_path: str, tokenized_data_path: str, remove_tokenized: bool = True):
    """Convert the LCCC data set from JSON to one utterance per line.

    The JSON document is iterated as a sequence of dialogues, each a sequence
    of sentences. Sentences are written unchanged, one per line, and every
    dialogue is followed by an empty line.

    :param raw_data_path: path of the raw JSON file
    :param tokenized_data_path: path where the converted data is appended
    :param remove_tokenized: forwarded to ``_check_file``
    :return: None
    """
    _check_file(raw_file=raw_data_path, processed_file=tokenized_data_path,
                remove_tokenized=remove_tokenized)

    dialogue_total = 0
    longest = 0
    shortest = 10000
    lengths = []

    with open(raw_data_path, 'r', encoding="utf-8") as raw_file, \
            open(tokenized_data_path, 'a', encoding="utf-8") as tokenized_file:
        dialogues = json.load(raw_file)

        for dialogue in dialogues:
            for sentence in dialogue:
                size = len(sentence)
                lengths.append(size)
                if size > longest:
                    longest = size
                if size < shortest:
                    shortest = size
                tokenized_file.write(sentence + "\n")

            # Empty line marks the end of the dialogue.
            tokenized_file.write("\n")
            dialogue_total += 1
            if dialogue_total % 10000 == 0:
                print("已读取:{}轮对话数据".format(dialogue_total))

    message = "数据预处理完毕:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(
                  dialogue_total, longest, shortest, np.mean(lengths))
    print(message)
    log_operator(level=10).info(message)
def preprocess_raw_xiao_huang_ji_data(raw_data: str, tokenized_data: str, if_remove: bool = True):
    """Convert the raw xiaohuangji data set into the tokenized dialogue format.

    Non-empty lines are tokenized with jieba and written one per line; an
    empty raw line is copied as an empty line and counted as the end of a
    dialogue.

    :param raw_data: path of the raw data file
    :param tokenized_data: path where the tokenized data is appended
    :param if_remove: forwarded to ``_check_file`` as ``remove_tokenized``
    :return: None
    """
    _check_file(raw_file=raw_data, processed_file=tokenized_data, remove_tokenized=if_remove)

    # The dialogue counter starts at 1 and grows with every empty line.
    dialogue_total = 1
    longest = 0
    shortest = 10000
    lengths = []

    with open(raw_data, 'r', encoding="utf-8") as raw_file, \
            open(tokenized_data, 'a', encoding="utf-8") as tokenized_file:
        for raw_line in raw_file:
            utterance = raw_line.strip('\n').replace('/', '')

            if utterance:
                size = len(utterance)
                lengths.append(size)
                if size > longest:
                    longest = size
                if size < shortest:
                    shortest = size
                tokens = jieba.cut(utterance)
                tokenized_file.write(" ".join(tokens) + "\n")
            else:
                tokenized_file.write("\n")
                dialogue_total += 1
                if dialogue_total % 10000 == 0:
                    print("已读取:{}轮对话数据".format(dialogue_total))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(
                  dialogue_total, longest, shortest, np.mean(lengths))
    print(message)
    log_operator(level=10).info(message)
def preprocess_raw_cross_woz_data(raw_data: str, tokenized_data: str, if_remove: bool = True):
    """Convert the raw crossWOZ data set into the tokenized dialogue format.

    The JSON document is read as a mapping; for every key the "content" of
    each entry under "messages" is tokenized with jieba and written on its
    own line, and every dialogue is followed by an empty line.

    :param raw_data: path of the raw JSON file
    :param tokenized_data: path where the tokenized data is appended
    :param if_remove: forwarded to ``_check_file`` as ``remove_tokenized``
    :return: None
    """
    _check_file(raw_file=raw_data, processed_file=tokenized_data, remove_tokenized=if_remove)

    dialogue_total = 0
    longest = 0
    shortest = 10000
    lengths = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, \
            open(tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        corpus = json.load(raw_file)

        for dialogue_id in corpus:
            messages = corpus[dialogue_id]["messages"]
            for message_item in messages:
                sentence = message_item["content"]

                size = len(sentence)
                lengths.append(size)
                if size > longest:
                    longest = size
                if size < shortest:
                    shortest = size

                tokens = jieba.cut(sentence)
                tokenized_file.write(" ".join(tokens) + "\n")

            # Empty line marks the end of the dialogue.
            tokenized_file.write("\n")
            dialogue_total += 1
            if dialogue_total % 10000 == 0:
                print("已读取:{}轮对话数据".format(dialogue_total))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(
                  dialogue_total, longest, shortest, np.mean(lengths))
    print(message)
    log_operator(level=10).info(message)
def __init__(self, execute_type: str, checkpoint_dir: str, units: int, embedding_dim: int,
             batch_size: int, start_sign: str, end_sign: str, beam_size: int, vocab_size: int,
             dict_fn: str, max_length: int, encoder_layers: int, decoder_layers: int,
             cell_type: str, if_bidirectional: bool = True):
    """Initialise the Seq2Seq chatter and load the model.

    :param execute_type: execution mode of the dialogue system ("train" or other)
    :param checkpoint_dir: directory where checkpoints are stored
    :param units: number of units
    :param embedding_dim: embedding dimension
    :param batch_size: batch size
    :param start_sign: start-of-sentence marker
    :param end_sign: end-of-sentence marker
    :param beam_size: beam size, forwarded to the base-class initializer
    :param vocab_size: vocabulary size
    :param dict_fn: path of the saved dictionary
    :param max_length: maximum length of a single sentence
    :param encoder_layers: number of RNN layers inside the encoder
    :param decoder_layers: number of RNN layers inside the decoder
    :param cell_type: recurrent cell type, lstm/gru
    :param if_bidirectional: whether the encoder is bidirectional
    :return: None
    """
    super().__init__(checkpoint_dir, beam_size, max_length, dict_fn, start_sign, end_sign)
    self.units = units
    self.batch_size = batch_size
    self.enc_units = units

    # NOTE(review): the encoder gets half of ``units`` while the decoder is
    # told enc_units=units; presumably the two directions of a bidirectional
    # encoder add up to ``units``. With if_bidirectional=False the widths may
    # not match - confirm against seq2seq.encoder/decoder.
    self.encoder = seq2seq.encoder(vocab_size=vocab_size, embedding_dim=embedding_dim,
                                   enc_units=units // 2, num_layers=encoder_layers,
                                   cell_type=cell_type, if_bidirectional=if_bidirectional)
    self.decoder = seq2seq.decoder(vocab_size=vocab_size, embedding_dim=embedding_dim,
                                   enc_units=units, dec_units=units,
                                   num_layers=decoder_layers, cell_type=cell_type)

    self.optimizer = tf.keras.optimizers.Adam()
    self.train_loss = tf.keras.metrics.Mean()
    self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                          encoder=self.encoder, decoder=self.decoder)

    print('正在检查是否存在检查点')
    # NOTE(review): self.ckpt is not set in this method; presumably the
    # base-class initializer provides it - confirm.
    if self.ckpt:
        print('存在检查点,正在加载检查点')
        self.checkpoint.restore(
            tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    elif execute_type == "train":
        print('不存在检查点,从头开始训练')
    else:
        # Without a checkpoint only training makes sense: stop the program.
        print('不存在检查点,请先执行train模式,再进入chat模式')
        exit(0)

    # The log line used to announce the SMN chatter (copy-paste slip); this
    # is the Seq2Seq chatter.
    log_operator(level=10).info(
        "启动Seq2Seq聊天器,执行类别为:{},模型参数配置为:vocab_size:{},"
        "embedding_dim:{},units:{},max_length:{}".format(
            execute_type, vocab_size, embedding_dim, units, max_length))
def __init__(self, units: int, vocab_size: int, execute_type: str, dict_fn: str,
             embedding_dim: int, checkpoint_dir: str, max_utterance: int, max_sentence: int,
             learning_rate: float, database_fn: str, solr_server: str):
    """Initialise the SMN chatter and load the model.

    :param units: number of units
    :param vocab_size: vocabulary size
    :param execute_type: execution mode of the dialogue system ("train" or other)
    :param dict_fn: path of the saved dictionary
    :param embedding_dim: embedding dimension
    :param checkpoint_dir: directory where checkpoints are stored; created if missing
    :param max_utterance: number of utterances per dialogue turn
    :param max_sentence: maximum length of a single sentence
    :param learning_rate: learning rate of the Adam optimizer
    :param database_fn: path of the candidate database
    :param solr_server: URL of the Solr server passed to ``pysolr.Solr``
    :return: None
    """
    self.dict_fn = dict_fn
    self.checkpoint_dir = checkpoint_dir
    self.max_utterance = max_utterance
    self.max_sentence = max_sentence
    self.database_fn = database_fn
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    self.solr = pysolr.Solr(url=solr_server, always_commit=True, timeout=10)
    self.train_loss = tf.keras.metrics.Mean()

    self.model = smn.smn(units=units, vocab_size=vocab_size,
                         embedding_dim=embedding_dim,
                         max_utterance=self.max_utterance,
                         max_sentence=self.max_sentence)

    self.checkpoint = tf.train.Checkpoint(
        model=self.model,
        optimizer=self.optimizer,
    )

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    print('正在检查是否存在检查点')
    # Decide on the presence of an actual checkpoint rather than on the mere
    # existence of the directory: an existing but empty directory used to be
    # reported as "checkpoint found" and let chat mode run an untrained model.
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print('存在检查点,正在加载检查点')
        self.checkpoint.restore(latest_checkpoint).expect_partial()
    elif execute_type == "train":
        print('不存在检查点,正在train模式')
    else:
        # Without a checkpoint only training makes sense: stop the program.
        print('不存在检查点,请先执行train模式,再进入chat模式')
        exit(0)

    logger = utils.log_operator(level=10)
    logger.info("启动SMN聊天器,执行类别为:{},模型参数配置为:embedding_dim:{},"
                "max_sentence:{},max_utterance:{},units:{},vocab_size:{},"
                "learning_rate:{}".format(execute_type, embedding_dim, max_sentence,
                                          max_utterance, units, vocab_size, learning_rate))
def to_single_turn_dataset(tokenized_data_path: str, qa_data_path: str, dict_path: str,
                           vocab_size: int, start_sign: str = "<start>", end_sign: str = "<end>",
                           unk_sign: str = "<unk>", max_data_size: int = 0,
                           remove_tokenized: bool = True):
    """Build single-turn question/answer pairs and the dictionary.

    The input holds one utterance per line with an empty line between
    dialogues. Each pair of adjacent utterances of a dialogue is wrapped in
    the start/end markers and written as "question<TAB>answer". A Keras
    tokenizer is then fitted on all written sentences and saved as JSON.

    :param tokenized_data_path: path of the multi-turn dialogue file
    :param qa_data_path: path where the single-turn pairs are written
    :param dict_path: path where the dictionary (tokenizer JSON) is written
    :param vocab_size: vocabulary size, passed to the tokenizer as ``num_words``
    :param start_sign: start-of-sentence marker
    :param end_sign: end-of-sentence marker
    :param unk_sign: out-of-vocabulary token
    :param max_data_size: maximum number of pairs to produce, 0 for all data
    :param remove_tokenized: not used by this function
    :return: None
    """
    dialogue_total = 0
    pair_total = 0
    longest = 0
    shortest = 10000
    lengths = []
    corpus = []
    pending_question = None  # previous utterance of the current dialogue

    with open(tokenized_data_path, encoding="utf-8") as raw_file, \
            open(qa_data_path, 'w', encoding="utf-8") as single_turn_data_file:
        for raw_line in raw_file:
            utterance = raw_line.strip('\n').replace('/', '')

            if utterance == '':
                # An empty line closes the dialogue: drop the pending question.
                pending_question = None
                dialogue_total += 1
                continue

            if pending_question is not None:
                question = start_sign + " " + pending_question + " " + end_sign
                answer = start_sign + " " + utterance + " " + end_sign
                single_turn_data_file.write(question + "\t" + answer + "\n")
                corpus.extend((question, answer))

                pair_total += 1
                print('\r已处理:{}个问答对'.format(pair_total), flush=True, end="")
                # With the default of 0 this never matches, i.e. no limit.
                if pair_total == max_data_size:
                    break

            # The current utterance is the question for the next line.
            pending_question = utterance

            size = len(utterance)
            if size > longest:
                longest = size
            if size < shortest:
                shortest = size
            lengths.append(size)

    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters="", num_words=vocab_size, oov_token=unk_sign)
    tokenizer.fit_on_texts(corpus)

    with open(dict_path, 'w', encoding='utf-8') as dict_file:
        dict_file.write(tokenizer.to_json())

    message = "对话数据集转换完毕,并保存字典:共处理{}轮对话数据,整理出{}对" \
              "问答对,语句最大长度:{},语句最短长度{},语句平均长度{:.3f}".format(
                  dialogue_total, pair_total, longest, shortest, np.mean(lengths))
    print("\n" + message)
    log_operator(level=10).info(message)