def ckipnlp_cutwords(data: pd.DataFrame, ws, *args: str, **kwargs) -> pd.DataFrame:
    """Segment every comment in ``data['textOriginal']`` with the given ``ws`` callable.

    Args:
        data: Frame holding a ``textOriginal`` column. It is copied, not
            modified.
        ws: Word-segmentation callable. It is invoked as
            ``ws([text], recommend_dictionary=...)`` and the first element of
            its result is stored for each row (presumably a CKIP word
            segmenter -- confirm against the caller).
        *args: Names of word-package files. Each one is loaded from
            ``'頻道列表/<name>.txt'`` through ``txt_to_dict`` and merged into a
            single mapping that is handed to ``construct_dictionary``.
        **kwargs: Only ``language`` is read. When it is truthy, the 0/1 columns
            ``traditional``, ``simplified`` and ``english`` are added. When the
            keyword is missing, ``kwargs.get`` yields ``None`` and the columns
            are skipped.

    Returns:
        A copy of ``data`` with a ``ckipnlp_cut`` column, whatever
        ``year_month_cut`` does to the frame, the optional language flag
        columns, and a fresh 0..n-1 index.
    """
    # Merge all requested word packages; later packages override earlier keys.
    merged_entries = {}
    for package_path in ('頻道列表/' + package_name + '.txt' for package_name in args):
        merged_entries.update(txt_to_dict(package_path))
    recommend = construct_dictionary(merged_entries)

    result = data.copy()

    # Segment one comment per call and echo each token list to stdout.
    segmented = []
    for comment in data['textOriginal']:
        tokens = ws([comment], recommend_dictionary=recommend)[0]
        segmented.append(tokens)
        print(tokens)
    result['ckipnlp_cut'] = segmented

    # The return value is ignored, so this presumably mutates ``result`` in place.
    year_month_cut(result)

    if kwargs.get('language'):
        detectors = (
            ('traditional', check.hasTraditional),
            ('simplified', check.hasSimplified),
            ('english', check.hasEnglish),
        )
        for column, detect in detectors:
            result[column] = [int(bool(detect(s))) for s in result['textOriginal']]

    result.reset_index(inplace=True, drop=True)
    return result
def jieba_cutwords(data: pd.DataFrame, *args: str, **kwargs) -> pd.DataFrame:
    """Segment every comment in ``data['textOriginal']`` with jieba.

    Args:
        data: Frame holding a ``textOriginal`` column. It is copied, not
            modified.
        *args: Names of user-dictionary files. Each one is loaded into jieba
            from ``'頻道列表/<name>.txt'`` before segmentation.
        **kwargs: Only ``language`` is read. When it is truthy, the 0/1 columns
            ``traditional``, ``simplified`` and ``english`` are added. A
            missing or falsy ``language`` skips them. This is the same rule
            ``ckipnlp_cutwords`` applies.

    Returns:
        A copy of ``data`` with a ``jieba_cut`` column (one token list per
        row), whatever ``year_month_cut`` does to the frame, the optional
        language flag columns, and a fresh 0..n-1 index.
    """
    for word_package in args:
        jieba.load_userdict('頻道列表/' + word_package + '.txt')

    data2 = data.copy()
    # str() guards against non-string cells (e.g. NaN) reaching jieba.
    data2['jieba_cut'] = [list(jieba.cut(str(text))) for text in data['textOriginal']]

    # The return value is ignored, so this presumably mutates ``data2`` in place.
    year_month_cut(data2)

    # Plain truthiness test instead of a -1 sentinel plus ``== True``.
    if kwargs.get('language'):
        data2['traditional'] = [
            1 if check.hasTraditional(s) else 0 for s in data2['textOriginal']
        ]
        data2['simplified'] = [
            1 if check.hasSimplified(s) else 0 for s in data2['textOriginal']
        ]
        data2['english'] = [
            1 if check.hasEnglish(s) else 0 for s in data2['textOriginal']
        ]

    data2.reset_index(inplace=True, drop=True)
    return data2
# 判斷文字是否包含繁體中文、簡體中文、英文 #-------------------------------------# # 共有4個Function可使用 # check.hasTradional() : 判斷是否包含繁體中文 # check.hasSimplified(): 判斷是否包含簡體中文 # check.hasBoth():判斷是否同時包含繁體、簡體中文 # check.hasEnglish():判斷是否包含英文 import checkword as check check.hasTraditional('ABC abc 国家 國家') #注意:家同時為繁體、簡體字 check.hasSimplified('ABC abc 国家 國家') check.hasBoth('ABC abc 國家') check.hasEnglish('ABC abc 國家')