Example #1
def preprocess_raw_data_qa_single(raw_data: str,
                                  qa_data: str,
                                  if_remove: bool = True):
    """
    单轮对话数据集处理模块
    用于处理已经分词好的多轮次数据集的方法,将数据集处理成问答对的形式
    Args:
        raw_data: 原始数据路径
        qa_data: 生成token数据保存路径
        if_remove: 是否移除原有分词文本
    Returns:
    """

    check_file(raw_file=raw_data, treat_file=qa_data, if_remove=if_remove)

    count = 0
    sentences_count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []
    one_pair = []

    # Pair up the utterances within each dialogue round into a question and an answer; when the next round starts, skip ahead
    with open(raw_data, encoding="utf-8") as raw_file, open(
            qa_data, 'w', encoding="utf-8") as tokenized_file:
        for line in raw_file:
            line = line.strip('\n').replace('/', '')
            # line = re.sub(r"[%s]+" % punctuation, "", line)
            # The raw dataset is arranged round by round, so note that once a round
            # ends, its last sentence must not be used as a question; skip to the next round
            if line == '':
                one_pair = []
                count += 1
                continue
            elif len(one_pair) == 1:
                one_pair.append(line)
                tokenized_file.write(one_pair[0] + "\t" + one_pair[1] + "\n")
                one_pair = [line]
                sentences_count += 1
                if sentences_count % 10000 == 0:
                    print('Processed:', sentences_count, 'QA pairs')
            else:
                one_pair.append(line)

            length = len(line)
            max_len = max(max_len, length)
            min_len = min(min_len, length)
            sentence_len.append(length)

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,整理出{}对" \
              "问答对,语句最大长度:{},语句最短长度{},语句平均长度{:.3f}".format(count, sentences_count,
                                                           max_len, min_len, np.mean(sentence_len))
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
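
A minimal usage sketch for the function above. The paths and the tiny sample corpus are made up; the raw file is assumed to hold one tokenized utterance per line with a blank line between dialogue rounds, and check_file/log_operator come from the same module as the function.

# Hypothetical paths and sample data, for illustration only.
raw_path = "data/tokenized_dialogues.txt"
qa_path = "data/single_turn_qa.txt"

with open(raw_path, "w", encoding="utf-8") as sample_file:
    # Two dialogue rounds, separated by a blank line.
    sample_file.write("你 好\n你 好 呀\n今天 天气 怎么样\n\n在 吗\n在 的\n")

preprocess_raw_data_qa_single(raw_data=raw_path, qa_data=qa_path, if_remove=True)
# Expected qa_path content:
# 你 好\t你 好 呀
# 你 好 呀\t今天 天气 怎么样
# 在 吗\t在 的
# (the last utterance of a round is never reused as the question of the next round)
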
Example #2
def preprocess_raw_douban_data(raw_data: str,
                               tokenized_data: str,
                               repeat_data: int = 10,
                               if_remove: bool = True):
    """
    用于处理douban数据集的方法,将douban数据集处理成多轮次对话的形式,并分词
    Args:
        raw_data: 原始数据路径
        tokenized_data: 生成token数据保存路径
        repeat_data: 每轮对话重复数据条数
        if_remove: 是否移除原有分词文本
    Returns:
    """
    check_file(raw_file=raw_data,
               treat_file=tokenized_data,
               if_remove=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        iter_count = -1
        for line in raw_file:
            iter_count += 1
            if iter_count % repeat_data != 0:
                continue
            line = line.strip('\n').replace('/', '')
            if line == "":
                continue

            # The raw dataset is arranged round by round, so once a round ends its last
            # sentence must not be used as a question; skip to the next round. Drop the
            # leading label and the trailing (possibly incorrect) sentence.
            utterances = line.split('\t')[1:-1]
            for utterance in utterances:
                length = len(utterance)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(utterance + "\n")
            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("数据处理进度:{}".format(count))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
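
For reference, a sketch of how a single made-up Douban-style line is parsed by the function above: the leading 0/1 label and the trailing candidate response are dropped, and repeat_data=10 keeps only every tenth line, on the assumption that each context is repeated once per candidate response.

# A made-up Douban-style line: label \t utterance ... \t candidate response.
line = "1\t你 好\t你 好 呀\t最近 怎么样\t挺 好 的"
utterances = line.strip('\n').replace('/', '').split('\t')[1:-1]
print(utterances)  # ['你 好', '你 好 呀', '最近 怎么样'], written one per line with a blank line after the round
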
Example #3
def preprocess_raw_wei_bo_data(raw_post_data: str,
                               raw_response_data: str,
                               tokenized_data: str,
                               if_remove: bool = True):
    """
    用于处理weibo数据集的方法,将weibo数据集处理成多轮次的形式,并分词
    Args:
        raw_post_data: 微博的post原始文本数据中的路径
        raw_response_data: 微博的response原始文本数据中的路径
        tokenized_data: 生成token数据保存路径
        if_remove: 是否移除原有分词文本
    Returns:
    """
    check_file(raw_file=raw_post_data,
               treat_file=tokenized_data,
               if_remove=if_remove)
    if not os.path.exists(raw_response_data):
        print('The dataset does not exist, please add the dataset!')
        exit(0)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_post_data, 'r', encoding='utf-8') as post_file, open(
            raw_response_data, 'r', encoding='utf-8') as response_file, \
            open(tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for post_data, response_data in zip(post_file, response_file):
            post_data = post_data.strip("\n").replace("/", " ")
            response_data = response_data.strip("\n").replace("/", " ")
            if post_data == "" or response_data == "":
                continue

            post_len = len(post_data)
            response_len = len(response_data)
            max_len = max(max_len, post_len, response_len)
            min_len = min(min_len, post_len, response_len)
            sentence_len.append(post_len)
            sentence_len.append(response_len)
            tokenized_file.write(post_data + "\n" + response_data + "\n\n")

            count += 1
            if count % 10000 == 0:
                print("已读取:{}轮对话数据".format(count))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
Example #4
def cell_layer(units: int,
               input_feature_dim: int,
               cell_type: str = 'lstm',
               if_bidirectional: bool = True) -> tf.keras.Model:
    """
    RNNCell层,其中可定义cell类型,是否双向
    Args:
        units: cell单元数
        input_feature_dim: 输入的特征维大小
        cell_type: cell类型,lstm/gru, 默认lstm
        if_bidirectional: 是否双向
    Returns:
    """
    inputs = tf.keras.Input(shape=(None, input_feature_dim))
    if cell_type == 'lstm':
        rnn = tf.keras.layers.LSTM(units=units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    elif cell_type == 'gru':
        rnn = tf.keras.layers.GRU(units=units,
                                  return_sequences=True,
                                  return_state=True,
                                  recurrent_initializer='glorot_uniform')
    else:
        print('Invalid cell type, see the log for details')
        utils.log_operator(level=10).info("Invalid cell type")
        return None  # avoid using the unbound rnn variable below

    if if_bidirectional:
        rnn = tf.keras.layers.Bidirectional(rnn)

    rnn_outputs = rnn(inputs)
    outputs = rnn_outputs[0]  # full output sequence: (batch, time, units), or (batch, time, 2 * units) if bidirectional
    states = outputs[:, -1, :]  # last time step of the output sequence, used as the state

    return tf.keras.Model(inputs=inputs, outputs=[outputs, states])
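
A small usage sketch for the layer above, assuming TensorFlow 2.x; the dimensions are illustrative. With if_bidirectional=True the output feature size doubles to 2 * units.

import tensorflow as tf

model = cell_layer(units=64, input_feature_dim=128, cell_type='gru', if_bidirectional=True)
dummy = tf.random.uniform((8, 20, 128))  # (batch, time, features)
outputs, states = model(dummy)
print(outputs.shape)  # (8, 20, 128): 2 * units because of the Bidirectional wrapper
print(states.shape)   # (8, 128): the last time step of the output sequence
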
Example #5
def preprocess_raw_qin_yun_data(raw_data: str,
                                tokenized_data: str,
                                if_remove: bool = True):
    """
    用于处理青云数据集的方法,将青云数据集处理成多轮次的形式,并分词
    Args:
        raw_data: 原始数据路径
        tokenized_data: 生成token数据保存路径
        if_remove: 是否移除原有分词文本
    Returns:
    """
    check_file(raw_file=raw_data,
               treat_file=tokenized_data,
               if_remove=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for line in raw_file:
            line = line.strip().strip("\n").replace("/", " ")
            if line == "":
                continue

            for sentence in line.split("|"):
                sentence = sentence.strip()

                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(" ".join(jieba.cut(sentence)) + "\n")
            tokenized_file.write("\n")

            count += 1
            if count % 10000 == 0:
                print("已读取:{}轮对话数据".format(count))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
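
A sketch of the tokenization step above: each "|"-separated sentence of a made-up Qingyun-style line is segmented with jieba and written space-separated, one sentence per line, with a blank line after the round.

import jieba

line = "今天天气真不错|是啊,适合出去走走"  # made-up sample round
for sentence in line.split("|"):
    print(" ".join(jieba.cut(sentence.strip())))
# e.g. "今天 天气 真 不错" (the exact segmentation depends on the jieba dictionary)
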
Example #6
def preprocess_raw_cross_woz_data(raw_data: str,
                                  tokenized_data: str,
                                  if_remove: bool = True):
    """
    用于处理crossWOZ数据集的方法,将crossWOZ数据集处理成多轮次对话的形式,并分词
    Args:
        raw_data: 原始数据路径
        tokenized_data: 生成token数据保存路径
        if_remove: 是否移除原有分词文本
    Returns:
    """
    check_file(raw_file=raw_data,
               treat_file=tokenized_data,
               if_remove=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        raw_data = json.load(raw_file)
        for data in raw_data:
            turn_utterances = raw_data[data]["messages"]
            for content in turn_utterances:
                sentence = content["content"]
                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(" ".join(jieba.cut(sentence)) + "\n")
            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("已读取:{}轮对话数据".format(count))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
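
The function above assumes a CrossWOZ-style JSON file: a dict keyed by dialogue id, where each dialogue carries a "messages" list whose items have a "content" field. A minimal made-up example of that shape:

import jieba

# Made-up input illustrating the expected structure (not real CrossWOZ data).
raw_data = {
    "10001": {
        "messages": [
            {"content": "你好,帮我推荐一个景点"},
            {"content": "好的,天坛公园怎么样"}
        ]
    }
}
for data in raw_data:
    for content in raw_data[data]["messages"]:
        print(" ".join(jieba.cut(content["content"])))
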
Example #7
def preprocess_raw_lccc_data(raw_data: str,
                             tokenized_data: str,
                             if_remove: bool = True):
    """
    用于处理LCCC数据集的方法,将LCCC数据集处理成多轮次对话的形式,并分词
    Args:
        raw_data: 原始数据路径
        tokenized_data: 生成token数据保存路径
        if_remove: 是否移除原有分词文本
    Returns:
    """
    check_file(raw_file=raw_data,
               treat_file=tokenized_data,
               if_remove=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding="utf-8") as raw_file, open(
            tokenized_data, 'a', encoding="utf-8") as tokenized_file:
        raw_data = json.load(raw_file)
        for data in raw_data:
            for sentence in data:
                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(sentence + "\n")
            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("已读取:{}轮对话数据".format(count))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
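
The function above assumes an LCCC-style JSON file: a list of dialogues, each a list of utterance strings that are already whitespace-tokenized (which is presumably why no jieba segmentation is applied here). A made-up sample of that shape:

# Made-up sample illustrating the expected structure (not real LCCC data).
raw_data = [
    ["今天 天气 不错", "是 啊 , 出去 走走 吧"],
    ["你 吃饭 了 吗", "刚 吃 完"]
]
for data in raw_data:
    for sentence in data:
        print(sentence)
    print()  # blank line between dialogue rounds, matching the file output above
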
Example #8
def combine_tokenized_data_single(standby_data: list,
                                  combine_data: str,
                                  if_remove: bool = True):
    """
    *单轮对话数据集处理模块*
    将所有已经分词好的问答对集中整合到一个文件中
    Args:
        standby_data: 分词好的数据文本路径
        combine_data: 汇总数据的文本路径
        if_remove: 是否移除原有分词文本
    Returns:
    """
    if os.path.exists(combine_data) and if_remove:
        os.remove(combine_data)

    count = 0
    file_count = 0

    for file_fn in standby_data:
        if not os.path.exists(file_fn):
            print("{}文件不存在,请检查之后再次运行".format(file_fn))
            exit(0)
        with open(file_fn, 'r', encoding='utf-8') as tokenized_file, open(
                combine_data, 'a', encoding='utf-8') as combine_file:
            for line in tokenized_file:
                line = line.strip().strip("\n").replace("/", " ")
                combine_file.write(line + "\n")
                count += 1
                if count % 10000 == 0:
                    print("数据处理进度:{}".format(count))

        file_count += 1

    message = "数据处理完毕,数据信息统计:共处理{}个分词文件,整理出{}条数据".format(file_count, count)
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
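
A usage sketch for the merge helper above; the file names are hypothetical and each listed file is assumed to already contain tokenized question-answer pairs.

# Hypothetical paths to already tokenized single-turn files.
combine_tokenized_data_single(standby_data=["data/xiaohuangji_qa.txt", "data/douban_qa.txt"],
                              combine_data="data/all_single_turn_qa.txt",
                              if_remove=True)
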
Example #9
    def __init__(self, units: int, vocab_size: int, execute_type: str,
                 dict_fn: str, embedding_dim: int, checkpoint_dir: str,
                 max_utterance: int, max_sentence: int, learning_rate: float,
                 database_fn: str, solr_server: str):
        """
        SMN聊天器初始化,用于加载模型
        Args:
            units: 单元数
            vocab_size: 词汇量大小
            execute_type: 对话执行模式
            dict_fn: 保存字典路径
            embedding_dim: 嵌入层维度
            checkpoint_dir: 检查点保存目录路径
            max_utterance: 每轮句子数量
            max_sentence: 单个句子最大长度
            learning_rate: 学习率
            database_fn: 候选数据库路径
        Returns:
        """
        self.dict_fn = dict_fn
        self.checkpoint_dir = checkpoint_dir
        self.max_utterance = max_utterance
        self.max_sentence = max_sentence
        self.database_fn = database_fn
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        self.solr = pysolr.Solr(url=solr_server,
                                always_commit=True,
                                timeout=10)
        self.train_loss = tf.keras.metrics.Mean()

        self.model = smn.smn(units=units,
                             vocab_size=vocab_size,
                             embedding_dim=embedding_dim,
                             max_utterance=self.max_utterance,
                             max_sentence=self.max_sentence)

        self.checkpoint = tf.train.Checkpoint(
            model=self.model,
            optimizer=self.optimizer,
        )

        ckpt = os.path.exists(checkpoint_dir)
        if not ckpt:
            os.makedirs(checkpoint_dir)

        if execute_type == "chat":
            print('正在从“{}”处加载字典...'.format(self.dict_fn))
            self.token = data_utils.load_token_dict(dict_fn=self.dict_fn)
        print('正在检查是否存在检查点...')
        if ckpt:
            print('存在检查点,正在从“{}”中加载检查点...'.format(checkpoint_dir))
            self.checkpoint.restore(
                tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
        else:
            if execute_type == "train":
                print('不存在检查点,正在train模式...')
            else:
                print('不存在检查点,请先执行train模式,再进入chat模式')
                exit(0)

        logger = utils.log_operator(level=10)
        logger.info("启动SMN聊天器,执行类别为:{},模型参数配置为:embedding_dim:{},"
                    "max_sentence:{},max_utterance:{},units:{},vocab_size:{},"
                    "learning_rate:{}".format(execute_type, embedding_dim,
                                              max_sentence, max_utterance,
                                              units, vocab_size,
                                              learning_rate))
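
A hedged construction sketch for the constructor above, assuming the enclosing class is called SMNChatter; every value below is a placeholder (paths, Solr URL and hyper-parameters), and a reachable Solr core is required because the constructor creates a pysolr client.

# Illustrative values only; adapt the class name, paths and hyper-parameters to your setup.
chatter = SMNChatter(units=200,
                     vocab_size=10000,
                     execute_type="train",
                     dict_fn="data/smn_dict.json",
                     embedding_dim=200,
                     checkpoint_dir="checkpoints/smn",
                     max_utterance=10,
                     max_sentence=50,
                     learning_rate=0.001,
                     database_fn="data/candidate_database.json",
                     solr_server="http://localhost:8983/solr/smn/")
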
Example #10
    def __init__(self,
                 execute_type: str,
                 checkpoint_dir: str,
                 units: int,
                 embedding_dim: int,
                 batch_size: int,
                 start_sign: str,
                 end_sign: str,
                 beam_size: int,
                 vocab_size: int,
                 dict_fn: str,
                 max_length: int,
                 encoder_layers: int,
                 decoder_layers: int,
                 cell_type: str,
                 if_bidirectional: bool = True):
        """
        Seq2Seq聊天器初始化,用于加载模型
        Args:
            execute_type: 对话执行模式
            checkpoint_dir: 检查点保存目录路径
            units: 单元数
            embedding_dim: 嵌入层维度
            batch_size: batch大小
            start_sign: 开始标记
            end_sign: 结束标记
            beam_size: batch大小
            vocab_size: 词汇量大小
            dict_fn: 保存字典路径
            max_length: 单个句子最大长度
            encoder_layers: encoder中内部RNN层数
            decoder_layers: decoder中内部RNN层数
            cell_type: cell类型,lstm/gru, 默认lstm
            if_bidirectional: 是否双向
        Returns:
        """
        super().__init__(checkpoint_dir, beam_size, max_length)
        self.units = units
        self.start_sign = start_sign
        self.end_sign = end_sign
        self.batch_size = batch_size
        self.enc_units = units

        self.encoder = seq2seq.encoder(vocab_size=vocab_size,
                                       embedding_dim=embedding_dim,
                                       enc_units=int(units / 2),
                                       layer_size=encoder_layers,
                                       cell_type=cell_type,
                                       if_bidirectional=if_bidirectional)
        self.decoder = seq2seq.decoder(vocab_size=vocab_size,
                                       embedding_dim=embedding_dim,
                                       enc_units=units,
                                       dec_units=units,
                                       layer_size=decoder_layers,
                                       cell_type=cell_type)

        self.optimizer = tf.keras.optimizers.Adam()
        self.train_loss = tf.keras.metrics.Mean()
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')
        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                              encoder=self.encoder,
                                              decoder=self.decoder)

        if execute_type == "chat":
            print('正在从“{}”处加载字典...'.format(dict_fn))
            self.token = data_utils.load_token_dict(dict_fn=dict_fn)
        print('正在检查是否存在检查点...')
        if self.ckpt:
            print('存在检查点,正在从“{}”中加载检查点...'.format(checkpoint_dir))
            self.checkpoint.restore(
                tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
        else:
            if execute_type == "train":
                print('不存在检查点,正在train模式...')
            else:
                print('不存在检查点,请先执行train模式,再进入chat模式')
                exit(0)

        utils.log_operator(level=10).info(
            "Starting the Seq2Seq chatbot, execution type: {}, model parameters: vocab_size: {}, "
            "embedding_dim: {}, units: {}, max_length: {}".format(
                execute_type, vocab_size, embedding_dim, units, max_length))
Example #11
    def __init__(self, execute_type: str, checkpoint_dir: str, num_layers: int,
                 units: int, d_model: int, num_heads: int, dropout: float,
                 start_sign: str, end_sign: str, beam_size: int,
                 vocab_size: int, dict_fn: str, max_length: int):
        """
        Transformer聊天器初始化,用于加载模型
        Args:
            execute_type: 对话执行模式
            checkpoint_dir: 检查点保存目录路径
            num_layers: transformer内部层数
            units: 单元数
            d_model: 嵌入层维度
            num_heads: 注意力头数
            dropout: 采样率
            start_sign: 开始标记
            end_sign: 结束标记
            beam_size: batch大小
            vocab_size: 词汇量大小
            dict_fn: 保存字典路径
            max_length: 单个句子最大长度
        Returns:
        """
        super().__init__(checkpoint_dir, beam_size, max_length)
        self.start_sign = start_sign
        self.end_sign = end_sign

        self.model = transformer.transformer(vocab_size=vocab_size,
                                             num_layers=num_layers,
                                             units=units,
                                             d_model=d_model,
                                             num_heads=num_heads,
                                             dropout=dropout)

        self.learning_rate = optimizers.CustomSchedule(d_model)
        self.optimizer = tf.keras.optimizers.Adam(self.learning_rate,
                                                  beta_1=0.9,
                                                  beta_2=0.98,
                                                  epsilon=1e-9)
        self.train_loss = tf.keras.metrics.Mean(name='train_loss')
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            name='train_accuracy')

        self.checkpoint = tf.train.Checkpoint(transformer=self.model,
                                              optimizer=self.optimizer)

        if execute_type == "chat":
            print('正在从“{}”处加载字典...'.format(dict_fn))
            self.token = data_utils.load_token_dict(dict_fn=dict_fn)
        print('正在检查是否存在检查点...')
        if self.ckpt:
            print('存在检查点,正在从“{}”中加载检查点...'.format(checkpoint_dir))
            self.checkpoint.restore(
                tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
        else:
            if execute_type == "train":
                print('不存在检查点,正在train模式...')
            else:
                print('不存在检查点,请先执行train模式,再进入chat模式')
                exit(0)

        utils.log_operator(level=10).info(
            "Starting the Transformer chatbot, execution type: {}, model parameters: "
            "num_layers: {}, d_model: {}, num_heads: {}, units: {}, dropout: {}, "
            "vocab_size: {}, max_length: {}".format(execute_type, num_layers, d_model,
                                                    num_heads, units, dropout, vocab_size,
                                                    max_length))
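
A hedged construction sketch for the constructor above, assuming the enclosing class is named TransformerChatter; the paths and hyper-parameters below are placeholders. With execute_type="chat" the dictionary is loaded and an existing checkpoint is required, while "train" starts from scratch when no checkpoint is found.

# Illustrative values only; the class name and paths are assumptions.
chatter = TransformerChatter(execute_type="train",
                             checkpoint_dir="checkpoints/transformer",
                             num_layers=4,
                             units=512,
                             d_model=256,
                             num_heads=8,
                             dropout=0.1,
                             start_sign="<start>",
                             end_sign="<end>",
                             beam_size=3,
                             vocab_size=10000,
                             dict_fn="data/transformer_dict.json",
                             max_length=40)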