def ckipnlp_cutwords(data: pd.DataFrame, ws, *args: str,
                     **kwargs) -> pd.DataFrame:
    """Segment the ``textOriginal`` column with a CKIP-style word segmenter.

    Args:
        data: Frame containing a ``textOriginal`` column. It is copied, not
            modified.
        ws: Segmenter callable invoked as
            ``ws(sentence_list, recommend_dictionary=...)``; it must return
            one token list per input sentence, in order.
        *args: Word-package names. Each one is read from
            ``頻道列表/<name>.txt`` with ``txt_to_dict`` and merged into the
            recommend dictionary handed to ``ws``.
        **kwargs: Only ``language`` is read. When it is truthy, 0/1 columns
            ``traditional``, ``simplified`` and ``english`` are added from the
            corresponding ``check.has*`` predicates. Omitting it (``None``)
            behaves like ``False``.

    Returns:
        A copy of ``data`` with a ``ckipnlp_cut`` column holding the tokens
        of each row, passed through ``year_month_cut``, and with the index
        reset to a fresh 0..n-1 range.
    """
    wg_dict = {}
    for word_package in args:
        wg_dict.update(txt_to_dict('頻道列表/' + word_package + '.txt'))
    wg_dict = construct_dictionary(wg_dict)

    data2 = data.copy()
    # Coerce every cell to str so non-string values (e.g. NaN) cannot break
    # the segmenter; jieba_cutwords applies the same coercion.
    texts = [str(text) for text in data['textOriginal']]
    # One batched call instead of one call per row: the segmenter already
    # takes a sentence list. Skip the call entirely for an empty frame.
    data2['ckipnlp_cut'] = (
        list(ws(texts, recommend_dictionary=wg_dict)) if texts else []
    )

    # Return value is ignored, so this presumably adds its columns to data2
    # in place -- TODO confirm against year_month_cut.
    year_month_cut(data2)

    if kwargs.get('language'):
        language_checks = (
            ('traditional', check.hasTraditional),
            ('simplified', check.hasSimplified),
            ('english', check.hasEnglish),
        )
        for column, has_script in language_checks:
            data2[column] = [
                1 if has_script(s) else 0 for s in data2['textOriginal']
            ]

    data2.reset_index(inplace=True, drop=True)
    return data2
def jieba_cutwords(data: pd.DataFrame, *args: str, **kwargs) -> pd.DataFrame:
    """Segment the ``textOriginal`` column with jieba.

    Args:
        data: Frame containing a ``textOriginal`` column. It is copied, not
            modified.
        *args: Word-package names. Each ``頻道列表/<name>.txt`` file is
            registered with ``jieba.load_userdict`` before segmenting.
        **kwargs: Only ``language`` is read. When it compares equal to
            ``True``, 0/1 columns ``traditional``, ``simplified`` and
            ``english`` are added from the corresponding ``check.has*``
            predicates.

    Returns:
        A copy of ``data`` with a ``jieba_cut`` column holding the token list
        of each row, passed through ``year_month_cut``, and with the index
        reset to a fresh 0..n-1 range.
    """
    for package_name in args:
        jieba.load_userdict('頻道列表/' + package_name + '.txt')

    result = data.copy()
    # str() keeps non-string cells from breaking jieba.cut.
    result['jieba_cut'] = [
        list(jieba.cut(str(raw))) for raw in data['textOriginal']
    ]
    # Return value is ignored, so this presumably works on `result` in
    # place -- TODO confirm against year_month_cut.
    year_month_cut(result)

    # Deliberately an equality test, not truthiness: only language=True (or a
    # value equal to it, such as 1) switches the flag columns on. A missing
    # key yields None, which compares unequal.
    if kwargs.get('language') == True:  # noqa: E712
        flag_columns = (
            ('traditional', check.hasTraditional),
            ('simplified', check.hasSimplified),
            ('english', check.hasEnglish),
        )
        for column, predicate in flag_columns:
            result[column] = [
                int(bool(predicate(s))) for s in result['textOriginal']
            ]

    result.reset_index(drop=True, inplace=True)
    return result
Example #3
0
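# Check whether a text contains Traditional Chinese, Simplified Chinese, or English
#-------------------------------------#

# Four functions are available:
# check.hasTraditional(): whether the text contains Traditional Chinese
# check.hasSimplified(): whether the text contains Simplified Chinese
# check.hasBoth(): whether the text contains both Traditional and Simplified Chinese
# check.hasEnglish(): whether the text contains English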

# Demo calls of the checkword predicates on sample strings that mix Latin
# letters with Chinese characters. The results are neither assigned nor
# printed here (NOTE(review): presumably run as notebook/REPL cells -- confirm).
import checkword as check
check.hasTraditional('ABC abc 国家 國家')  # Note: 家 is both a Traditional and a Simplified character

# Same mixed sample, checked for Simplified Chinese.
check.hasSimplified('ABC abc 国家 國家')

# This sample omits the '国家' variant present in the two samples above.
check.hasBoth('ABC abc 國家')

check.hasEnglish('ABC abc 國家')