Example #1
0
def make_traditional(text):
    """
    Convert a Chinese string to Traditional Chinese.

    Text that is already traditional is returned unchanged.

    :param text: unicode string of Chinese
    :return: unicode string of Traditional Chinese
    """
    return text if mafan.is_traditional(text) else mafan.tradify(text)
Example #2
0
def can_st(page):
	"""
	Decide whether *page* may be served under the configured
	simplified/traditional Chinese settings.

	A page is "simplified" if simplify() leaves it unchanged, and
	"traditional" if tradify() leaves it unchanged.
	"""
	decoded = page.decode("utf8")
	is_simp = simplify(page) == decoded
	is_trad = tradify(page) == decoded
	# page exists only in a variant that the config has disabled
	if is_simp and not is_trad and not config.zh_s:
		return False
	if is_trad and not is_simp and not config.zh_t:
		return False
	return config.zh_t or config.zh_s
Example #3
0
def can_st(page):
    """
    Decide whether *page* may be served under the configured
    simplified/traditional Chinese settings (config.zh_s / config.zh_t).

    NOTE(review): `page.decode("utf8")` implies *page* is a Python 2 byte
    string -- confirm; under Python 3 `str` has no .decode().
    """
    # page unchanged by simplify() => the page is (already) simplified text
    simplified = simplify(page) == page.decode("utf8")
    # page unchanged by tradify() => the page is (already) traditional text
    traditional = tradify(page) == page.decode("utf8")
    # only simplified
    if simplified and not traditional and not config.zh_s:
        return False
    # only traditional
    elif traditional and not simplified and not config.zh_t:
        return False
    else:
        return config.zh_t or config.zh_s
Example #4
0
def StdNm(nonstd=None):
    """
    Standardize person names in *nonstd*.

    Builds a lookup table of alternative name spellings -> standardized full
    name from csv/StandardNames.csv, then maps every entry of *nonstd*
    through it, leaving unmatched names as-is.

    :param nonstd: pandas Series of (possibly non-standard) names
    :return: pandas Series of the same length with standardized names
    """
    ## make a DataFrame indexed by standardized name and
    ## contains columns "FirstName", "OtherNames", "OtherNames1"
    std = pd.read_csv("csv/StandardNames.csv")
    # fillna(method="ffill") is deprecated; Series.ffill() is the modern form
    std["FullName"] = (std["LastName"] + std["FirstName"]).ffill()
    std.set_index("FullName", inplace=True)

    #### Simplified characters for LastName happen sometimes, so also build
    #### the opposite-script form of each surname for matching.
    std["ConvLast"] = [
        sTR if pd.isnull(sTR) else
        mf.simplify(sTR) if mf.is_traditional(sTR) else mf.tradify(sTR)
        for sTR in std["LastName"]
    ]
    std["LastOth"] = std["LastName"].ffill() + std['OtherNames']
    std["ConvFst"] = std["ConvLast"].ffill() + std['FirstName']
    std["ConvOth"] = std["ConvLast"].ffill() + std['OtherNames']
    std.drop(["Details", "Studio", "LastName", "ConvLast"],
             axis=1,
             inplace=True)

    ## make a dataframe of {key: alternative names, value: standard names}
    ## with unique keys and overlapping values.
    ## DataFrame.append was removed in pandas 2.0 (and was O(n^2) in a loop);
    ## collect the per-column frames and concatenate once instead.
    frames = [
        pd.DataFrame({"key": std[colName], "value": std.index})
        for colName in std.columns
    ]
    map_df = pd.concat(frames, ignore_index=True)
    map_df.dropna(inplace=True)

    ## Standardize names in the given Series
    map_dict = map_df.set_index('key').to_dict()['value']

    def standardize_names(participant):
        # fall back to the original spelling when no mapping exists
        return map_dict.get(participant, participant)

    ans = nonstd.map(standardize_names)

    return ans
from mafan import encoding
from mafan import text
from mafan import simplify, tradify
from mafan import split_text
import mafan
from mafan import pinyin

# Convert text containing other encodings to UTF-8
# filename = 'test.txt'  # name or path of file as string
# encoding.convert(filename)  # creates a file with name 'ugly_big5_utf-8.txt' in glorious utf-8 encoding

# Conversion between simplified and traditional Chinese
print('-' * 50)
string = u'这是麻烦啦'
print(tradify(string))  # convert string to traditional
print(simplify(tradify(string)))  # convert back to simplified

# Does the text contain punctuation or Latin characters?
print('-' * 50)
flag = text.has_punctuation(u'这是麻烦啦')
print(flag)
flag = text.has_punctuation(u'这是麻烦啦.')
print(flag)
flag = text.has_punctuation(u'这是麻烦啦。')
print(flag)
flag = text.contains_latin(u'这是麻烦啦。')
print(flag)
flag = text.contains_latin(u'You are麻烦啦。')
print(flag)
    if len(s2t_dict[key]) > 1:
        for t in s2t_dict[key]:
            checklist.append(t)

# Character-level evaluation of simplified->traditional conversion:
# compare tradify() output against reference traditional lines.
total = 0          # characters compared overall
correct = 0        # characters converted correctly
wrong = 0          # characters converted incorrectly
micro_total = 0    # characters whose reference form is in `checklist`
micro_correct = 0  # of those, the ones converted correctly

line_count = 0

for line in sim_file:

    line = line.rstrip()
    line = tradify(line)  # convert the simplified line to traditional
    # reference line; NOTE(review): line_count is never incremented in the
    # visible code, so this always reads tra_lines[0] -- the increment may
    # have been lost in extraction; confirm against the original source
    tra_line = tra_lines[line_count].rstrip()

    # only score lines whose converted length matches the reference,
    # otherwise a per-character comparison would misalign
    if len(line) == len(tra_line):
        char_count = 0
        for c in line:
            total = total + 1
            # NOTE(review): char_count is also never incremented here, so
            # every c compares against tra_line[0] -- likely truncated code
            if c == tra_line[char_count]:
                correct = correct + 1
            else:
                wrong = wrong + 1

            # separately track characters with ambiguous traditional forms
            if tra_line[char_count] in checklist:
                micro_total += 1
                if c == tra_line[char_count]:
                    micro_correct = micro_correct + 1
Example #7
0
def tradify():
    """HTTP endpoint: convert the 'text' query parameter to Traditional
    Chinese and return it as JSON ({"text": ...})."""
    converted = mafan.tradify(request.args.get('text'))
    return jsonify(text=converted)
Example #8
0
import mafan

# Convert the token column of a tokenized CoNLL file from simplified to
# traditional Chinese, writing a parallel output file.
# NOTE(review): this is Python 2 code (`print >>`, str.decode) -- it will
# not run under Python 3.
with open("hlm_45_tokenized.conll", "r") as inFile:
    with open("hlm_45_tokenized_trad.conll", "w") as outFile:
        for line in inFile:
            if len(line) > 1:
                # non-blank line: tab-separated fields, field 1 is the token
                line = line.strip().split("\t")

                line[1] = mafan.tradify(line[1].decode("utf8")).encode("utf8")
                print >> outFile, "\t".join(line)
            else:
                # preserve sentence-separating blank lines
                print >> outFile, ("\n")
#      print(mafan.tradify(line.strip()).encode("utf8"))
#     print >> outFile, (mafan.tradify(line.strip()).encode("utf8"))
def on_chat_message(message):
    """
    Handle an incoming chat message: prefix it with a short sender id and
    broadcast it, converted to Traditional Chinese, to every room.
    """
    payload = json.loads(message)
    sender = request.sid[:5]
    for room in all_rooms:
        emit(
            "chat",
            {"message": sender + ": " + tradify(payload["message"])},
            room=room,
        )
Example #10
0
#     if  a!=None:
#         trans_t += transform(list(text), 0)[0]
        else:
            trans_t += text
    print(trans_t)

    # Homophone (similar-pronunciation) conversion
    transform = PronunciationTransform(
        chinese_chars_file='../data/chaizi/中国所有汉字大全 一行一个.txt')
    for text in t:
        print(''.join(transform(list(text), 0)))

    string = word
    # Simplified -> Traditional Chinese conversion
    from mafan import simplify, tradify
    print(tradify(string))

    # Pinyin (phonetic) replacement
    # PhoneticTransform
    transform = PhoneticTransform()

    #   for text in t:
    #     print(''.join(transform(list(text), 0)))
    t2 = word[:2]  # only transform the first two characters
    trans_t = ''
    for text in t2:
        # print(''.join(transform(list(text), 0)))
        # fall back to the original character when no transform applies
        if transform(list(text), 0) != None:
            trans_t += transform(list(text), 0)[0]
        else:
            trans_t += text