def main():
    """Clean a text corpus: convert traditional to simplified Chinese and,
    with ``--effect``, keep only well-formed, fully-Chinese sentences.

    Reads ``args.input_file`` (resolved relative to this script's directory)
    and writes one cleaned line per kept sentence to ``args.output_file``.
    """
    args = parse()
    pwd_path = os.path.abspath(os.path.dirname(__file__))
    # os.path.join discards the left part when the right part is absolute,
    # so the original '"/" + args.input_file' silently ignored pwd_path.
    file_path = os.path.join(pwd_path, args.input_file)
    if not os.path.exists(file_path):
        default_logger = get_logger(__file__)
        # lazy %-formatting so the path actually appears in the log record
        default_logger.debug("file not exists: %s", file_path)
        return  # nothing to process; the original fell through and crashed on open()
    # punctuation a kept line may contain besides Chinese characters
    punctuation_list = "。,,、?:;{}[]【】“‘’”《》/!%……()<>@#$~^¥%&*\"\'=+-"
    # acceptable sentence-opening / sentence-closing characters
    open_chars = {'“', '‘', '{', '[', '【', '(', '<', '《'}
    # line[-1] is a single character, so the original two-char '……' entry
    # could never match; '…' is the character actually seen.
    close_chars = {'。', '?', '”', '!', '…', '’', ')'}
    # open the path we just checked (the original opened args.input_file instead)
    with codecs.open(file_path, 'rb', encoding='utf-8') as file_in:
        lines = file_in.readlines()
    with codecs.open(args.output_file, 'w', encoding='utf-8') as file_ou:
        if args.effect:
            for line in tqdm(lines):
                line = line.strip()
                if not line:  # guard: empty line would crash on line[0]
                    continue
                if not is_chinese(line[0]) and line[0] not in open_chars:
                    continue
                if line[-1] not in close_chars:
                    if is_chinese(line[-1]):
                        line += '。'  # terminate an unpunctuated Chinese sentence
                    else:
                        continue
                if len(line) < 5:
                    continue
                # keep only lines made entirely of Chinese chars / allowed punctuation
                if all(char in punctuation_list or is_chinese(char) for char in line):
                    file_ou.write(traditional2simplified(line) + '\n')
        else:
            for line in tqdm(lines):
                file_ou.write(traditional2simplified(line.strip()) + '\n')
def load_same_pinyin(path, sep='\t'):
    """Load homophone (same-pinyin) confusion sets.

    :param path: text file; each line is ``key_char<sep>same-tone chars<sep>diff-tone chars``
    :param sep: column separator
    :return: dict mapping a single char to the set of its homophone chars
    """
    result = dict()
    if not os.path.exists(path):
        # lazy %-formatting: the original had no placeholder, so logging raised
        # a TypeError at format time and the path was never shown
        default_logger.debug("file not exists: %s", path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.split(sep)
            if parts and len(parts) > 2:
                key_char = parts[0]
                # joins ALL columns, so the value set also contains key_char itself
                # (deliberate here, unlike the columns-1-and-2-only variant)
                value = set("".join(parts))
                if len(key_char) > 1 or not value:
                    continue
                result[key_char] = value
    # these pairs are handled by dedicated rules instead of the confusion sets
    result['他'] -= {'她', '它'}
    result['她'] -= {'他', '它'}
    result['它'] -= {'她', '他'}
    result['影'] -= {'音'}
    # copy the set: the original aliased result['扯'], so any later mutation
    # of one entry silently mutated the other
    result['车'] = set(result['扯'])
    return result
def load_same_stroke(path, sep=','):
    """Load similar-stroke (visually confusable) character sets.

    :param path: text file; each line lists mutually confusable chars joined by ``sep``
    :param sep: column separator
    :return: dict mapping a char to the set of its confusable chars
    """
    result = dict()
    if not os.path.exists(path):
        # lazy %-formatting so the missing path actually appears in the log
        default_logger.debug("file not exists: %s", path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = traditional2simplified(line.strip()).split(sep)
            if len(parts) > 1:
                for i, c in enumerate(parts):
                    # union with any set loaded from an earlier line: the
                    # original assignment overwrote it, losing confusable
                    # chars for characters that appear on multiple lines
                    result.setdefault(c, set()).update(parts[:i] + parts[i + 1:])
    return result
def load_same_stroke(path, sep=','):
    """Load similar-stroke (visually confusable) character sets.

    :param path: text file; each line lists mutually confusable chars joined by ``sep``
    :param sep: column separator
    :return: dict mapping a char to the set of its confusable chars
    """
    stroke_map = {}
    if not os.path.exists(path):
        default_logger.debug("file not exists:", path)
        return stroke_map
    with codecs.open(path, 'r', encoding='utf-8') as reader:
        for raw in reader:
            chars = traditional2simplified(raw.strip()).strip().split(sep)
            if not chars or len(chars) <= 1:
                continue
            for idx in range(len(chars)):
                # every char maps to all the *other* chars on its line
                siblings = chars[:idx] + chars[idx + 1:]
                stroke_map[chars[idx]] = set(siblings)
    return stroke_map
def load_same_pinyin(path, sep='\t'):
    """Load homophone (same-pinyin) confusion sets.

    :param path: text file; each line is ``key_char<sep>same-tone chars<sep>diff-tone chars``
    :param sep: column separator
    :return: dict mapping a single char to the set of its homophone chars
    """
    result = dict()
    if not os.path.exists(path):
        # lazy %-formatting: the original had no placeholder, so logging raised
        # a TypeError at format time and the path was never shown
        default_logger.debug("file not exists: %s", path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.split(sep)
            if parts and len(parts) > 2:
                key_char = parts[0]
                if len(key_char) > 1:
                    continue  # keys must be single characters
                # same pronunciation+tone union same pronunciation, other tone
                value = set(parts[1]) | set(parts[2])
                if value:
                    result[key_char] = value
    return result
def load_same_pinyin(path, sep='\t'):
    """Load homophone (same-pinyin) confusion sets.

    :param path: text file; each line is ``key_char<sep>same-tone chars<sep>diff-tone chars``
    :param sep: column separator
    :return: dict mapping a single char to the set of its homophone chars
    """
    pinyin_map = dict()
    if not os.path.exists(path):
        default_logger.debug("file not exists:", path)
        return pinyin_map
    with codecs.open(path, 'r', encoding='utf-8') as reader:
        for raw in reader:
            cols = traditional2simplified(raw.strip()).split(sep)
            if not cols or len(cols) <= 2:
                continue
            head = cols[0]
            # column 1: same pronunciation and tone; column 2: same
            # pronunciation, different tone — merge both confusion pools
            confusions = set(list(cols[1])).union(set(list(cols[2])))
            if len(head) > 1 or not confusions:
                continue
            pinyin_map[head] = confusions
    return pinyin_map
# -*- coding: utf-8 -*- # Author: XuMing <*****@*****.**> # Brief: import enchant from pypinyin import lazy_pinyin from pycorrector.utils.text_utils import traditional2simplified, simplified2traditional from pycorrector.utils.text_utils import tokenize, get_homophones_by_char, get_homophones_by_pinyin traditional_sentence = '憂郁的臺灣烏龜' simplified_sentence = traditional2simplified(traditional_sentence) print(simplified_sentence) simplified_sentence = '忧郁的台湾乌龟' traditional_sentence = simplified2traditional(simplified_sentence) print(traditional_sentence) print(lazy_pinyin('中心')) # 不带音调 print(tokenize('小姑娘蹦蹦跳跳的去了她外公家')) # 判断拼音还是英文 en_dict = enchant.Dict("en_US") print(en_dict.check("hello")) print(en_dict.check("hello boy what is your name")) strs = "hello boy what is your name" flag = False for word in strs: if en_dict.check(word): flag = True else: