import mafan


def make_traditional(text):
    """
    Converts Chinese text to Traditional Chinese.

    :param text: unicode string of Chinese
    :return: unicode string of Traditional Chinese
    """
    if not mafan.is_traditional(text):
        trad = mafan.tradify(text)
    else:
        trad = text
    return trad
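# A quick usage sketch of the helper above, assuming mafan is installed;
# the sample strings are illustrative, not taken from the original snippet.
print(make_traditional(u'这是麻烦啦'))   # simplified input -> 這是麻煩啦
print(make_traditional(u'這是麻煩啦'))   # already traditional -> returned unchanged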
def can_st(page):
    simplified = simplify(page) == page.decode("utf8")
    traditional = tradify(page) == page.decode("utf8")
    # only simplified
    if simplified and not traditional and not config.zh_s:
        return False
    # only traditional
    elif traditional and not simplified and not config.zh_t:
        return False
    else:
        return config.zh_t or config.zh_s
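# The function above relies on a project-level config with boolean zh_s / zh_t
# flags saying which scripts are accepted; a hypothetical stand-in (not the
# project's real config module) could look like this.
from types import SimpleNamespace

config = SimpleNamespace(zh_s=True, zh_t=True)  # accept both Simplified and Traditional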
import pandas as pd
import mafan as mf


def StdNm(nonstd=None):
    ## make a DataFrame indexed by standardized name and
    ## containing columns "FirstName", "OtherNames", "OtherNames1"
    std = pd.read_csv("csv/StandardNames.csv")
    std["FullName"] = std["LastName"] + std["FirstName"]
    std["FullName"].fillna(method="ffill", inplace=True)
    std.set_index("FullName", inplace=True)
    #### Simplified characters sometimes appear in LastName,
    #### so also keep the opposite-script form of each surname
    std["ConvLast"] = [
        sTR if pd.isnull(sTR)
        else mf.simplify(sTR) if mf.is_traditional(sTR)
        else mf.tradify(sTR)
        for sTR in std["LastName"]
    ]
    std["LastOth"] = std["LastName"].fillna(method="ffill") + std['OtherNames']
    std["ConvFst"] = std["ConvLast"].fillna(method="ffill") + std['FirstName']
    std["ConvOth"] = std["ConvLast"].fillna(method="ffill") + std['OtherNames']
    std.drop(["Details", "Studio", "LastName", "ConvLast"], axis=1, inplace=True)

    ## make a dataframe of {key: alternative name, value: standard name} with
    ## unique keys and overlapping values
    map_df = pd.DataFrame()
    for colName in list(std.columns):
        df = pd.DataFrame({"key": std[colName], "value": std.index})
        map_df = map_df.append(df, ignore_index=True)
    map_df.dropna(inplace=True)

    ## standardize names in the given Series
    map_dict = map_df.set_index('key').to_dict()['value']

    def standardize_names(participant):
        if participant in map_dict:
            return map_dict[participant]
        else:
            return participant

    ans = nonstd.map(standardize_names)
    return ans
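# A usage sketch of StdNm, assuming csv/StandardNames.csv exists with the
# columns read above; the participant names below are made up.
participants = pd.Series([u'張三', u'张三', u'李四'])  # hypothetical non-standard names
print(StdNm(participants))  # each name mapped to its standardized form where known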
from mafan import encoding
from mafan import text
from mafan import simplify, tradify
from mafan import split_text
import mafan
from mafan import pinyin

# Convert text in other encodings to UTF-8
# filename = 'test.txt'  # name or path of file as string
# encoding.convert(filename)  # creates a file with name 'ugly_big5_utf-8.txt' in glorious utf-8 encoding

# Conversion between Simplified and Traditional
print('-' * 50)
string = u'这是麻烦啦'
print(tradify(string))            # convert string to traditional
print(simplify(tradify(string)))  # convert back to simplified

# Check for punctuation or Latin characters
print('-' * 50)
flag = text.has_punctuation(u'这是麻烦啦')
print(flag)
flag = text.has_punctuation(u'这是麻烦啦.')
print(flag)
flag = text.has_punctuation(u'这是麻烦啦。')
print(flag)
flag = text.contains_latin(u'这是麻烦啦。')
print(flag)
flag = text.contains_latin(u'You are麻烦啦。')
print(flag)
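# Expected console output of the demo above (values assume current mafan
# behaviour; the two True cases for has_punctuation cover the ASCII '.' and
# the Chinese full stop '。'):
# --------------------------------------------------
# 這是麻煩啦
# 这是麻烦啦
# --------------------------------------------------
# False
# True
# True
# False
# True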
if len(s2t_dict[key]) > 1:
    for t in s2t_dict[key]:
        checklist.append(t)

total = 0
correct = 0
wrong = 0
micro_total = 0
micro_correct = 0
line_count = 0

for line in sim_file:
    line = line.rstrip()
    line = tradify(line)
    tra_line = tra_lines[line_count].rstrip()
    if len(line) == len(tra_line):
        char_count = 0
        for c in line:
            total = total + 1
            if c == tra_line[char_count]:
                correct = correct + 1
            else:
                wrong = wrong + 1
            if tra_line[char_count] in checklist:
                micro_total += 1
                if c == tra_line[char_count]:
                    micro_correct = micro_correct + 1
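# The evaluation fragment above assumes objects built earlier in the script;
# a minimal sketch of what they might look like (the file names, the s2t_dict
# contents, and the enclosing loop over its keys are assumptions).
from mafan import tradify

sim_file = open("simplified.txt", encoding="utf8")                      # hypothetical Simplified input
tra_lines = open("traditional_gold.txt", encoding="utf8").readlines()  # hypothetical gold Traditional lines
s2t_dict = {u'发': [u'發', u'髮'], u'干': [u'乾', u'幹', u'干']}          # simplified char -> candidate traditional forms
checklist = []  # ambiguous traditional characters, filled by looping over s2t_dict as shown above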
def tradify():
    text = request.args.get('text')
    d = {'text': mafan.tradify(text)}
    return jsonify(**d)
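# A sketch of how this view might be wired into a Flask app; the route path,
# the app setup, and the local function name are assumptions.
from flask import Flask, request, jsonify
import mafan

app = Flask(__name__)

@app.route('/tradify')
def tradify_view():
    text = request.args.get('text', u'')
    return jsonify(text=mafan.tradify(text))

# e.g.  GET /tradify?text=这是麻烦啦  ->  {"text": "這是麻煩啦"}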
import mafan

with open("hlm_45_tokenized.conll", "r") as inFile:
    with open("hlm_45_tokenized_trad.conll", "w") as outFile:
        for line in inFile:
            if len(line) > 1:
                line = line.strip().split("\t")
                line[1] = mafan.tradify(line[1].decode("utf8")).encode("utf8")
                print >> outFile, "\t".join(line)
            else:
                print >> outFile, ("\n")

# print(mafan.tradify(line.strip()).encode("utf8"))
# print >> outFile, (mafan.tradify(line.strip()).encode("utf8"))
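# The snippet above is Python 2 (print >> plus explicit decode/encode).
# A rough Python 3 equivalent, assuming the same tab-separated CoNLL layout
# with the surface token in column 2:
import mafan

with open("hlm_45_tokenized.conll", "r", encoding="utf8") as in_file, \
        open("hlm_45_tokenized_trad.conll", "w", encoding="utf8") as out_file:
    for line in in_file:
        if len(line) > 1:
            fields = line.strip().split("\t")
            fields[1] = mafan.tradify(fields[1])  # convert the token column to Traditional
            out_file.write("\t".join(fields) + "\n")
        else:
            out_file.write("\n")  # preserve blank lines between sentences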
def on_chat_message(message):
    message = json.loads(message)
    sid = request.sid[:5]
    for room in all_rooms:
        emit("chat", {"message": sid + ": " + tradify(message["message"])}, room=room)
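# A sketch of the Flask-SocketIO wiring the handler above implies; the event
# names, the all_rooms registry, and the app setup are assumptions.
from flask import Flask
from flask_socketio import SocketIO, join_room

app = Flask(__name__)
socketio = SocketIO(app)
all_rooms = set()  # hypothetical registry of active rooms

@socketio.on("join")
def on_join(room):
    join_room(room)
    all_rooms.add(room)

# on_chat_message would then be registered for its chat event, e.g.:
# socketio.on("chat message")(on_chat_message)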
# PronunciationTransform and PhoneticTransform are defined elsewhere in the
# project; word and t are the input word and its candidate texts.

# Homophone conversion
transform = PronunciationTransform(
    chinese_chars_file='../data/chaizi/中国所有汉字大全 一行一个.txt')
for text in t:
    print(''.join(transform(list(text), 0)))

# Traditional-character conversion
from mafan import simplify, tradify
string = word
print(tradify(string))

# Pinyin substitution
transform = PhoneticTransform()
t2 = word[:2]
trans_t = ''
for text in t2:
    if transform(list(text), 0) is not None:
        trans_t += transform(list(text), 0)[0]
    else:
        trans_t += text
print(trans_t)