Ejemplo n.º 1
0
def convertToSimplifiedChinese(traditionalChineseString):
    """
    Convert Traditional Chinese text to Simplified Chinese.

    Delegates directly to ``HanLP.convertToSimplifiedChinese``.

    :param traditionalChineseString: text in Traditional Chinese
    :return: the Simplified Chinese text produced by HanLP
    """
    simplified = HanLP.convertToSimplifiedChinese(traditionalChineseString)
    return simplified
Ejemplo n.º 2
0
def zh_simplify(line, server_model):
    """
    Return ``line`` converted to Simplified Chinese by HanLP.

    :param line: text to convert
    :param server_model: not used by this function
        (NOTE(review): presumably kept for a uniform call signature; confirm with callers)
    :return: result of ``HanLP.convertToSimplifiedChinese(line)``
    """
    simplified_line = HanLP.convertToSimplifiedChinese(line)
    return simplified_line
Ejemplo n.º 3
0
def zh_simplify(line):
    """
    Return ``line`` converted to Simplified Chinese by HanLP.

    :param line: text to convert
    :return: result of ``HanLP.convertToSimplifiedChinese(line)``
    """
    simplified_line = HanLP.convertToSimplifiedChinese(line)
    return simplified_line
Ejemplo n.º 4
0
def data_cleaning(source,
                  del_space=False,
                  del_t=True,
                  del_r=True,
                  del_n=True,
                  simplify=True,
                  half_width=True,
                  upper=False,
                  lower=False,
                  merge_3_chinese=False,
                  merge_3_english=False,
                  merge_3_number=False,
                  merge_3_punctuation=False):
    """
    General-purpose cleaning of natural-language text.

    The enabled steps are applied in the order in which the parameters are
    listed below.

    :param source: <str> raw input string
    :param del_space: <bool> remove spaces, both the ASCII space and the
        full-width space (U+3000): default = False
    :param del_t: <bool> remove tab characters: default = True
    :param del_r: <bool> remove carriage-return characters: default = True
    :param del_n: <bool> remove line-feed characters: default = True
    :param simplify: <bool> convert Traditional Chinese to Simplified Chinese
        with HanLP: default = True
    :param half_width: <bool> convert full-width characters to half-width via
        ``full_width_to_half_width``: default = True
    :param upper: <bool> upper-case the text (overridden by ``lower`` when
        both are set, because lower-casing runs afterwards): default = False
    :param lower: <bool> lower-case the text (wins over ``upper``):
        default = False
    :param merge_3_chinese: <bool> shorten every run of more than 3 identical
        Chinese characters (U+4E00-U+9FA5) to exactly 3: default = False
    :param merge_3_english: <bool> shorten every run of more than 3 identical
        ASCII letters to exactly 3: default = False
    :param merge_3_number: <bool> shorten every run of more than 3 identical
        ASCII digits to exactly 3: default = False
    :param merge_3_punctuation: <bool> for each mark in ``PUNCTUATION_LIST``
        that appears 3 or more times in a row somewhere in the text, rewrite
        every run of 2 or more of that mark to exactly 3; runs of 2 or more
        hyphens are always rewritten to ``---``: default = False
    :return: <str> cleaned string
    """
    if del_space:
        # The full-width space is spelled as an escape so that it cannot be
        # silently normalised into a second ASCII space by an editor or copy.
        source = source.replace(" ", "").replace("\u3000", "")
    if del_t:
        source = source.replace("\t", "")
    if del_r:
        source = source.replace("\r", "")
    if del_n:
        source = source.replace("\n", "")
    if simplify:
        # Traditional -> Simplified Chinese conversion is delegated to HanLP.
        source = HanLP.convertToSimplifiedChinese(source)
    if half_width:
        source = full_width_to_half_width(source)
    if upper:
        source = source.upper()
    if lower:
        # Runs after ``upper`` on purpose: ``lower`` has the higher priority.
        source = source.lower()

    # Each merge below is a single back-reference substitution: a character
    # followed by 3 or more copies of itself collapses to 3 copies. Runs of
    # exactly 3 are left untouched.
    if merge_3_chinese:
        source = re.sub(r"([\u4e00-\u9fa5])\1{3,}", r"\1\1\1", source)
    if merge_3_english:
        source = re.sub(r"([A-Za-z])\1{3,}", r"\1\1\1", source)
    if merge_3_number:
        source = re.sub(r"([0-9])\1{3,}", r"\1\1\1", source)

    if merge_3_punctuation:
        # Escape the marks so that "]", "^", "\\" and "-" are matched
        # literally instead of corrupting the character class.
        escaped_marks = re.escape("".join(PUNCTUATION_LIST))
        if escaped_marks:  # an empty class "[]" is not a valid pattern
            run_pattern = re.compile("([" + escaped_marks + "])\\1+")
            # Marks that occur at least 3 times in a row somewhere in the text.
            triggered = {
                found.group(1)
                for found in run_pattern.finditer(source)
                if len(found.group(0)) >= 3
            }

            def _normalise_run(found):
                # A function replacement avoids backslash-escape processing
                # of the replacement text.
                mark = found.group(1)
                return mark * 3 if mark in triggered else found.group(0)

            source = run_pattern.sub(_normalise_run, source)
        # Hyphens are special-cased: any run of 2 or more becomes "---".
        source = re.sub("-{2,}", "---", source)
    return source
Ejemplo n.º 5
0
 def simplechinese(self, sentence):
     """Return ``sentence`` converted to Simplified Chinese by HanLP."""
     return HanLP.convertToSimplifiedChinese(sentence)
Ejemplo n.º 6
0
 def simplify(self):
     """Convert ``self._data`` to Simplified Chinese in place.

     The conversion is done by ``HanLP.convertToSimplifiedChinese``; the
     instance itself is returned so that calls can be chained.
     """
     converted = HanLP.convertToSimplifiedChinese(self._data)
     self._data = converted
     return self