def preprocess_raw_data_qa_single(raw_data: str, qa_data: str, if_remove: bool = True):
    """ Single-turn dialogue preprocessing module.

    Turns an already tokenized multi-turn dialogue corpus into question-answer pairs.

    Args:
        raw_data: path to the raw (tokenized) data
        qa_data: path where the generated QA-pair data is saved
        if_remove: whether to remove the existing output file first
    Returns:
    """
    check_file(raw_file=raw_data, treat_file=qa_data, if_remove=if_remove)

    count = 0
    sentences_count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []
    one_pair = []

    # Pair every utterance with its reply to form question/answer pairs; when a new
    # dialogue starts, the pairing buffer is reset so replies never cross dialogues.
    with open(raw_data, encoding="utf-8") as raw_file, open(
            qa_data, 'w', encoding="utf-8") as tokenized_file:
        for line in raw_file:
            line = line.strip('\n').replace('/', '')
            # line = re.sub(r"[%s]+" % punctuation, "", line)
            # The raw corpus is arranged dialogue by dialogue, so the last utterance of
            # a dialogue must not be used as a question: skip to the next dialogue.
            if line == '':
                one_pair = []
                count += 1
                continue
            elif len(one_pair) == 1:
                one_pair.append(line)
                tokenized_file.write(one_pair[0] + "\t" + one_pair[1] + "\n")
                one_pair = [line]
                sentences_count += 1
                if sentences_count % 10000 == 0:
                    print('Processed:', sentences_count, 'QA pairs')
            else:
                one_pair.append(line)

            length = len(line)
            max_len = max(max_len, length)
            min_len = min(min_len, length)
            sentence_len.append(length)

    message = "Preprocessing finished. Statistics: {} dialogues processed, {} QA pairs " \
              "extracted, max sentence length: {}, min sentence length: {}, mean sentence " \
              "length: {:.3f}".format(count, sentences_count, max_len, min_len, np.mean(sentence_len))
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
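
# Usage sketch (hypothetical file paths, assuming this helper is called from within the
# same module): turn a blank-line-separated, already tokenized multi-turn corpus into
# tab-separated question/answer pairs.
def _example_preprocess_qa_single():
    preprocess_raw_data_qa_single(raw_data="data/tokenized_dialogues.txt",
                                  qa_data="data/qa_pairs.txt", if_remove=True)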
def preprocess_raw_douban_data(raw_data: str, tokenized_data: str, repeat_data: int = 10, if_remove: bool = True):
    """ Processes the Douban corpus into multi-turn dialogues and tokenized text.

    Args:
        raw_data: path to the raw data
        tokenized_data: path where the generated token data is saved
        repeat_data: number of duplicated rows per dialogue in the raw data
        if_remove: whether to remove the existing output file first
    Returns:
    """
    check_file(raw_file=raw_data, treat_file=tokenized_data, if_remove=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        iter_count = -1
        for line in raw_file:
            iter_count += 1
            # Each dialogue is repeated repeat_data times in the raw file,
            # so only every repeat_data-th row is kept.
            if iter_count % repeat_data != 0:
                continue
            line = line.strip('\n').replace('/', '')
            if line == "":
                continue

            # Drop the label at the very front of the row and the incorrect utterance
            # at the very end, keeping only the dialogue utterances in between.
            utterances = line.split('\t')[1:-1]
            for utterance in utterances:
                length = len(utterance)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(utterance + "\n")

            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("Preprocessing progress: {}".format(count))

    message = "Preprocessing finished. Statistics: {} dialogues processed, max sentence " \
              "length: {}, min sentence length: {}, mean sentence length: {:.3f}".format(
                  count, max_len, min_len, np.mean(sentence_len))
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
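
# Usage sketch (hypothetical file paths): per the docstring above, the raw Douban file
# repeats each dialogue across repeat_data consecutive rows, so only every
# repeat_data-th row is kept when writing the tokenized multi-turn output.
def _example_preprocess_douban():
    preprocess_raw_douban_data(raw_data="data/douban_train.txt",
                               tokenized_data="data/douban_tokenized.txt",
                               repeat_data=10, if_remove=True)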
def preprocess_raw_wei_bo_data(raw_post_data: str, raw_response_data: str, tokenized_data: str, if_remove: bool = True):
    """ Processes the Weibo corpus into multi-turn dialogue form and tokenized text.

    Args:
        raw_post_data: path to the raw Weibo post text data
        raw_response_data: path to the raw Weibo response text data
        tokenized_data: path where the generated token data is saved
        if_remove: whether to remove the existing output file first
    Returns:
    """
    check_file(raw_file=raw_post_data, treat_file=tokenized_data, if_remove=if_remove)
    if not os.path.exists(raw_response_data):
        print('Dataset not found, please add the dataset!')
        exit(0)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_post_data, 'r', encoding='utf-8') as post_file, open(
            raw_response_data, 'r', encoding='utf-8') as response_file, \
            open(tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for post_data, response_data in zip(post_file, response_file):
            post_data = post_data.strip("\n").replace("/", " ")
            response_data = response_data.strip("\n").replace("/", " ")
            if post_data == "" or response_data == "":
                continue

            post_len = len(post_data)
            response_len = len(response_data)
            max_len = max(max_len, post_len, response_len)
            min_len = min(min_len, post_len, response_len)
            sentence_len.append(post_len)
            sentence_len.append(response_len)

            tokenized_file.write(post_data + "\n" + response_data + "\n\n")
            count += 1
            if count % 10000 == 0:
                print("Read {} dialogues".format(count))

    message = "Preprocessing finished. Statistics: {} dialogues processed, max sentence " \
              "length: {}, min sentence length: {}, mean sentence length: {:.3f}".format(
                  count, max_len, min_len, np.mean(sentence_len))
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
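
# Usage sketch (hypothetical file paths): the Weibo corpus arrives as two aligned files,
# one post per line and one response per line, which this helper zips into
# post/response dialogue blocks separated by blank lines.
def _example_preprocess_wei_bo():
    preprocess_raw_wei_bo_data(raw_post_data="data/weibo_post.txt",
                               raw_response_data="data/weibo_response.txt",
                               tokenized_data="data/weibo_tokenized.txt", if_remove=True)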
def cell_layer(units: int, input_feature_dim: int, cell_type: str = 'lstm',
               if_bidirectional: bool = True) -> tf.keras.Model:
    """ RNN cell layer with configurable cell type and optional bidirectionality.

    Args:
        units: number of cell units
        input_feature_dim: size of the input feature dimension
        cell_type: cell type, lstm/gru, defaults to lstm
        if_bidirectional: whether to wrap the cell in a bidirectional layer
    Returns:
    """
    inputs = tf.keras.Input(shape=(None, input_feature_dim))
    if cell_type == 'lstm':
        rnn = tf.keras.layers.LSTM(units=units, return_sequences=True, return_state=True,
                                   recurrent_initializer='glorot_uniform')
    elif cell_type == 'gru':
        rnn = tf.keras.layers.GRU(units=units, return_sequences=True, return_state=True,
                                  recurrent_initializer='glorot_uniform')
    else:
        print('Invalid cell type, see the log for details')
        utils.log_operator(level=10).info("cell_layer received an invalid cell type")
        raise ValueError("cell_type must be either 'lstm' or 'gru'")

    if if_bidirectional:
        rnn = tf.keras.layers.Bidirectional(rnn)

    rnn_outputs = rnn(inputs)
    outputs = rnn_outputs[0]
    # The state is taken as the last time step sliced from the sequence outputs.
    states = outputs[:, -1, :]

    return tf.keras.Model(inputs=inputs, outputs=[outputs, states])
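
# Minimal sketch of how the model returned by cell_layer can be used. It relies on the
# module's existing `tensorflow as tf` import, and the batch/time/feature sizes below
# are illustrative only. With a unidirectional GRU the first output keeps the full
# sequence, while the second output is the last time step sliced from that sequence.
def _example_cell_layer():
    layer = cell_layer(units=64, input_feature_dim=128, cell_type='gru', if_bidirectional=False)
    dummy_batch = tf.random.normal(shape=(2, 10, 128))
    outputs, states = layer(dummy_batch)
    # outputs.shape == (2, 10, 64), states.shape == (2, 64)
    return outputs, states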
def preprocess_raw_qin_yun_data(raw_data: str, tokenized_data: str, if_remove: bool = True):
    """ Processes the Qingyun corpus into multi-turn dialogue form and tokenizes it.

    Args:
        raw_data: path to the raw data
        tokenized_data: path where the generated token data is saved
        if_remove: whether to remove the existing output file first
    Returns:
    """
    check_file(raw_file=raw_data, treat_file=tokenized_data, if_remove=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for line in raw_file:
            line = line.strip().strip("\n").replace("/", " ")
            if line == "":
                continue

            # Each row holds one dialogue whose utterances are separated by "|".
            for sentence in line.split("|"):
                sentence = sentence.strip()
                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(" ".join(jieba.cut(sentence)) + "\n")

            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("Read {} dialogues".format(count))

    message = "Preprocessing finished. Statistics: {} dialogues processed, max sentence " \
              "length: {}, min sentence length: {}, mean sentence length: {:.3f}".format(
                  count, max_len, min_len, np.mean(sentence_len))
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
def preprocess_raw_cross_woz_data(raw_data: str, tokenized_data: str, if_remove: bool = True):
    """ Processes the CrossWOZ corpus into multi-turn dialogue form and tokenizes it.

    Args:
        raw_data: path to the raw data
        tokenized_data: path where the generated token data is saved
        if_remove: whether to remove the existing output file first
    Returns:
    """
    check_file(raw_file=raw_data, treat_file=tokenized_data, if_remove=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        raw_data = json.load(raw_file)
        for data in raw_data:
            turn_utterances = raw_data[data]["messages"]
            for content in turn_utterances:
                sentence = content["content"]
                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(" ".join(jieba.cut(sentence)) + "\n")

            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("Read {} dialogues".format(count))

    message = "Preprocessing finished. Statistics: {} dialogues processed, max sentence " \
              "length: {}, min sentence length: {}, mean sentence length: {:.3f}".format(
                  count, max_len, min_len, np.mean(sentence_len))
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
def preprocess_raw_lccc_data(raw_data: str, tokenized_data: str, if_remove: bool = True):
    """ Processes the LCCC corpus into multi-turn dialogue form and tokenized text.

    Args:
        raw_data: path to the raw data
        tokenized_data: path where the generated token data is saved
        if_remove: whether to remove the existing output file first
    Returns:
    """
    check_file(raw_file=raw_data, treat_file=tokenized_data, if_remove=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding="utf-8") as raw_file, open(
            tokenized_data, 'a', encoding="utf-8") as tokenized_file:
        raw_data = json.load(raw_file)
        for data in raw_data:
            for sentence in data:
                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(sentence + "\n")

            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("Read {} dialogues".format(count))

    message = "Preprocessing finished. Statistics: {} dialogues processed, max sentence " \
              "length: {}, min sentence length: {}, mean sentence length: {:.3f}".format(
                  count, max_len, min_len, np.mean(sentence_len))
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
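
# Usage sketch (hypothetical file paths): the LCCC release is loaded as a JSON list of
# dialogues whose sentences are already word-segmented, so no extra jieba cut is applied
# here. The Qingyun and CrossWOZ helpers above follow the same calling pattern.
def _example_preprocess_lccc():
    preprocess_raw_lccc_data(raw_data="data/lccc_train.json",
                             tokenized_data="data/lccc_tokenized.txt", if_remove=True)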
def combine_tokenized_data_single(standby_data: list, combine_data: str, if_remove: bool = True):
    """ *Single-turn dialogue preprocessing module*

    Collects all tokenized QA-pair files into a single file.

    Args:
        standby_data: paths of the tokenized data files
        combine_data: path of the combined output file
        if_remove: whether to remove the existing output file first
    Returns:
    """
    if os.path.exists(combine_data) and if_remove:
        os.remove(combine_data)

    count = 0
    file_count = 0

    for file_fn in standby_data:
        if not os.path.exists(file_fn):
            print("File {} does not exist, please check and run again".format(file_fn))
            exit(0)
        with open(file_fn, 'r', encoding='utf-8') as tokenized_file, open(
                combine_data, 'a', encoding='utf-8') as combine_file:
            for line in tokenized_file:
                line = line.strip().strip("\n").replace("/", " ")
                combine_file.write(line + "\n")
                count += 1
                if count % 10000 == 0:
                    print("Preprocessing progress: {}".format(count))
        file_count += 1

    message = "Preprocessing finished. Statistics: {} tokenized files processed, " \
              "{} lines of data collected".format(file_count, count)
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
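
# Usage sketch (hypothetical file names): merge the per-dataset tokenized files produced
# by the helpers above into a single combined corpus for training.
def _example_combine_tokenized():
    combine_tokenized_data_single(
        standby_data=["data/qa_pairs.txt", "data/lccc_tokenized.txt"],
        combine_data="data/all_tokenized.txt", if_remove=True)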
def __init__(self, units: int, vocab_size: int, execute_type: str, dict_fn: str,
             embedding_dim: int, checkpoint_dir: str, max_utterance: int, max_sentence: int,
             learning_rate: float, database_fn: str, solr_server: str):
    """ Initializes the SMN chatter and loads the model.

    Args:
        units: number of units
        vocab_size: vocabulary size
        execute_type: execution mode
        dict_fn: path of the saved dictionary
        embedding_dim: embedding dimension
        checkpoint_dir: directory where checkpoints are saved
        max_utterance: number of utterances per dialogue turn
        max_sentence: maximum length of a single sentence
        learning_rate: learning rate
        database_fn: path of the candidate database
        solr_server: URL of the Solr server used for candidate retrieval
    Returns:
    """
    self.dict_fn = dict_fn
    self.checkpoint_dir = checkpoint_dir
    self.max_utterance = max_utterance
    self.max_sentence = max_sentence
    self.database_fn = database_fn
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    self.solr = pysolr.Solr(url=solr_server, always_commit=True, timeout=10)
    self.train_loss = tf.keras.metrics.Mean()

    self.model = smn.smn(units=units, vocab_size=vocab_size, embedding_dim=embedding_dim,
                         max_utterance=self.max_utterance, max_sentence=self.max_sentence)

    self.checkpoint = tf.train.Checkpoint(model=self.model, optimizer=self.optimizer)

    ckpt = os.path.exists(checkpoint_dir)
    if not ckpt:
        os.makedirs(checkpoint_dir)

    if execute_type == "chat":
        print('Loading dictionary from "{}"...'.format(self.dict_fn))
        self.token = data_utils.load_token_dict(dict_fn=self.dict_fn)
    print('Checking for existing checkpoints...')
    if ckpt:
        print('Checkpoint found, restoring from "{}"...'.format(checkpoint_dir))
        self.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    else:
        if execute_type == "train":
            print('No checkpoint found, entering train mode...')
        else:
            print('No checkpoint found; run train mode first, then enter chat mode')
            exit(0)

    logger = utils.log_operator(level=10)
    logger.info("SMN chatter started, execute type: {}, model configuration: embedding_dim: {}, "
                "max_sentence: {}, max_utterance: {}, units: {}, vocab_size: {}, "
                "learning_rate: {}".format(execute_type, embedding_dim, max_sentence,
                                           max_utterance, units, vocab_size, learning_rate))
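
# Usage sketch: the class that owns this __init__ is not shown in this section, so
# "SMNChatter" below is a purely hypothetical name, and every path, URL, and
# hyperparameter value is illustrative only.
#
#     chatter = SMNChatter(units=200, vocab_size=20000, execute_type="train",
#                          dict_fn="data/smn_dict.json", embedding_dim=200,
#                          checkpoint_dir="checkpoints/smn", max_utterance=10,
#                          max_sentence=50, learning_rate=0.001,
#                          database_fn="data/candidate_database.json",
#                          solr_server="http://localhost:8983/solr/smn/")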
def __init__(self, execute_type: str, checkpoint_dir: str, units: int, embedding_dim: int,
             batch_size: int, start_sign: str, end_sign: str, beam_size: int, vocab_size: int,
             dict_fn: str, max_length: int, encoder_layers: int, decoder_layers: int,
             cell_type: str, if_bidirectional: bool = True):
    """ Initializes the Seq2Seq chatter and loads the model.

    Args:
        execute_type: execution mode
        checkpoint_dir: directory where checkpoints are saved
        units: number of units
        embedding_dim: embedding dimension
        batch_size: batch size
        start_sign: start token
        end_sign: end token
        beam_size: beam size used in beam search
        vocab_size: vocabulary size
        dict_fn: path of the saved dictionary
        max_length: maximum length of a single sentence
        encoder_layers: number of RNN layers inside the encoder
        decoder_layers: number of RNN layers inside the decoder
        cell_type: cell type, lstm/gru, defaults to lstm
        if_bidirectional: whether the encoder is bidirectional
    Returns:
    """
    super().__init__(checkpoint_dir, beam_size, max_length)
    self.units = units
    self.start_sign = start_sign
    self.end_sign = end_sign
    self.batch_size = batch_size
    self.enc_units = units

    self.encoder = seq2seq.encoder(vocab_size=vocab_size, embedding_dim=embedding_dim,
                                   enc_units=int(units / 2), layer_size=encoder_layers,
                                   cell_type=cell_type, if_bidirectional=if_bidirectional)
    self.decoder = seq2seq.decoder(vocab_size=vocab_size, embedding_dim=embedding_dim,
                                   enc_units=units, dec_units=units,
                                   layer_size=decoder_layers, cell_type=cell_type)

    self.optimizer = tf.keras.optimizers.Adam()
    self.train_loss = tf.keras.metrics.Mean()
    self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                     reduction='none')
    self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, encoder=self.encoder,
                                          decoder=self.decoder)

    if execute_type == "chat":
        print('Loading dictionary from "{}"...'.format(dict_fn))
        self.token = data_utils.load_token_dict(dict_fn=dict_fn)
    print('Checking for existing checkpoints...')
    if self.ckpt:
        print('Checkpoint found, restoring from "{}"...'.format(checkpoint_dir))
        self.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    else:
        if execute_type == "train":
            print('No checkpoint found, entering train mode...')
        else:
            print('No checkpoint found; run train mode first, then enter chat mode')
            exit(0)

    utils.log_operator(level=10).info(
        "Seq2Seq chatter started, execute type: {}, model configuration: vocab_size: {}, "
        "embedding_dim: {}, units: {}, max_length: {}".format(
            execute_type, vocab_size, embedding_dim, units, max_length))
def __init__(self, execute_type: str, checkpoint_dir: str, num_layers: int, units: int,
             d_model: int, num_heads: int, dropout: float, start_sign: str, end_sign: str,
             beam_size: int, vocab_size: int, dict_fn: str, max_length: int):
    """ Initializes the Transformer chatter and loads the model.

    Args:
        execute_type: execution mode
        checkpoint_dir: directory where checkpoints are saved
        num_layers: number of layers inside the transformer
        units: number of units
        d_model: embedding dimension
        num_heads: number of attention heads
        dropout: dropout rate
        start_sign: start token
        end_sign: end token
        beam_size: beam size used in beam search
        vocab_size: vocabulary size
        dict_fn: path of the saved dictionary
        max_length: maximum length of a single sentence
    Returns:
    """
    super().__init__(checkpoint_dir, beam_size, max_length)
    self.start_sign = start_sign
    self.end_sign = end_sign

    self.model = transformer.transformer(vocab_size=vocab_size, num_layers=num_layers,
                                         units=units, d_model=d_model, num_heads=num_heads,
                                         dropout=dropout)

    self.learning_rate = optimizers.CustomSchedule(d_model)
    self.optimizer = tf.keras.optimizers.Adam(self.learning_rate, beta_1=0.9, beta_2=0.98,
                                              epsilon=1e-9)
    self.train_loss = tf.keras.metrics.Mean(name='train_loss')
    self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    self.checkpoint = tf.train.Checkpoint(transformer=self.model, optimizer=self.optimizer)

    if execute_type == "chat":
        print('Loading dictionary from "{}"...'.format(dict_fn))
        self.token = data_utils.load_token_dict(dict_fn=dict_fn)
    print('Checking for existing checkpoints...')
    if self.ckpt:
        print('Checkpoint found, restoring from "{}"...'.format(checkpoint_dir))
        self.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    else:
        if execute_type == "train":
            print('No checkpoint found, entering train mode...')
        else:
            print('No checkpoint found; run train mode first, then enter chat mode')
            exit(0)

    utils.log_operator(level=10).info(
        "Transformer chatter started, execute type: {}, model configuration: num_layers: {}, "
        "d_model: {}, num_heads: {}, units: {}, dropout: {}, vocab_size: {}, "
        "max_length: {}".format(execute_type, num_layers, d_model, num_heads, units,
                                dropout, vocab_size, max_length))