def _init(self):
    """Prepare the segmenter options and load the ckiptagger WS model.

    ``ckiptagger`` is imported inside the method, so the package is only
    required once this initializer actually runs.  For every lexicon
    attribute that is set, a dictionary is built with
    ``ckiptagger.construct_dictionary`` and stored in ``self._opts`` under
    the matching option name.  The loaded model is kept in ``self._core``.
    """
    import ckiptagger

    # (key in self._opts, attribute holding the source lexicon)
    option_sources = (
        ('recommend_dictionary', '_recommend_lexicons'),
        ('coerce_dictionary', '_coerce_lexicons'),
    )
    for option_name, attr_name in option_sources:
        lexicons = getattr(self, attr_name)
        if lexicons:
            self._opts[option_name] = ckiptagger.construct_dictionary(lexicons)

    self._core = ckiptagger.WS(
        _get_tagger_data(),
        disable_cuda=self._disable_cuda,
    )
def ckipnlp_cutwords(data: pd.DataFrame, ws, *args: str, **kwargs) -> pd.DataFrame:
    """Segment ``data['textOriginal']`` with ``ws`` and return an annotated copy.

    Args:
        data: Frame providing a ``textOriginal`` column; it is copied, not
            modified.
        ws: Segmenter, called as ``ws([text], recommend_dictionary=...)``;
            the first element of its result is stored per row.
        *args: Word-list names.  Each one is loaded with ``txt_to_dict`` from
            ``頻道列表/<name>.txt`` and merged into the recommend dictionary.
        **kwargs: When ``language`` is present and truthy, the 0/1 columns
            ``traditional``, ``simplified`` and ``english`` are added using
            the ``check`` helpers.  A missing key yields ``None``, which
            counts as false.

    Returns:
        The copy with a ``ckipnlp_cut`` column, passed through
        ``year_month_cut`` and with its index reset.
    """
    merged_words = {}
    for package_name in args:
        merged_words.update(txt_to_dict('頻道列表/' + package_name + '.txt'))
    recommend = construct_dictionary(merged_words)

    result = data.copy()
    segmented = []
    for text in data['textOriginal']:
        tokens = ws([text], recommend_dictionary=recommend)[0]
        segmented.append(tokens)
        print(tokens)
    result['ckipnlp_cut'] = segmented
    year_month_cut(result)

    if kwargs.get('language'):
        # (new column, name of the detector on ``check``), filled in this order.
        language_flags = (
            ('traditional', 'hasTraditional'),
            ('simplified', 'hasSimplified'),
            ('english', 'hasEnglish'),
        )
        for column, detector_name in language_flags:
            flags = []
            for sentence in result['textOriginal']:
                detected = getattr(check, detector_name)(sentence)
                flags.append(1 if detected else 0)
            result[column] = flags

    result.reset_index(inplace=True, drop=True)
    return result
def word_seg(text,
             user_dict_path='C:/ckip-learning/project/Dict/userDict.txt',
             stop_dict_path='C:/ckip-learning/project/Dict/stopDict.txt',
             model_path='C:/ckip-learning/data'):
    """Segment ``text`` and return the kept words joined by commas.

    Args:
        text: The sentence to segment.
        user_dict_path: UTF-8 file with one user word per line.  Entries that
            are exactly one character long are ignored; every other line gets
            weight 1 in the recommend dictionary.
        stop_dict_path: UTF-8 file with one stop word per line.
        model_path: Directory handed to ``WS``.

    Returns:
        A comma-joined string of the segmented words that are neither stop
        words nor reduced to an empty result by ``word_filter``.
    """
    with open(user_dict_path, 'r', encoding='utf-8') as f1:
        stripped_lines = (line.replace('\n', '') for line in f1)
        user_weights = {word: 1 for word in stripped_lines if len(word) != 1}
    dictionary = construct_dictionary(user_weights)

    # A set gives constant-time membership tests in the filter below.
    with open(stop_dict_path, 'r', encoding='utf-8') as s:
        stop_words = {line.replace('\n', '') for line in s}

    ws = WS(model_path)
    words = ws([text], recommend_dictionary=dictionary)

    kept = [
        word for word in words[0]
        if word not in stop_words and len(word_filter(word)) != 0
    ]
    return ','.join(kept)
def __init__(self, ckip_data_path='./data', custom_dict_path='./dict'):
    """Load the WS, POS and NER models and build the custom dictionary.

    Args:
        ckip_data_path: Model directory passed to ``WS``, ``POS`` and ``NER``.
        custom_dict_path: Location passed to ``__load_custom_dict``; its
            result is converted with ``construct_dictionary``.
    """
    # Load model: the three taggers share one data directory.
    for attribute, model_class in (('ws', WS), ('pos', POS), ('ner', NER)):
        setattr(self, attribute, model_class(ckip_data_path))

    custom_words = self.__load_custom_dict(custom_dict_path)
    self.dictionary = construct_dictionary(custom_words)
def create_word_dict(legal_name_file, word_file, output_file):
    """Build a ckiptagger dictionary from two word lists and pickle it.

    Args:
        legal_name_file: Big5-encoded text file, one entry per line.
        word_file: Big5-encoded text file, one entry per line.
        output_file: Path the pickled dictionary is written to.

    Every line of both files receives weight 1.  The output file is opened
    in a ``with`` block so the handle is closed (and the data flushed) even
    if pickling fails.
    """
    with open(legal_name_file, 'r', encoding='big5') as k1, \
            open(word_file, 'r', encoding='big5') as k2:
        words = k1.read().split('\n') + k2.read().split('\n')

    word_to_weight = {word: 1 for word in words}
    dictionary = construct_dictionary(word_to_weight)

    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out)
    print(output_file, ' exported.')
def main():
    """Segment every line of ``./input.txt`` and write the words to ``output.txt``.

    Each input line is treated as one sentence.  The output file gets one
    line per sentence, each word followed by a single space.  Both files are
    handled with ``with`` blocks so they are closed (and the output flushed)
    even when a step in between raises.
    """
    # Download data
    # data_utils.download_data("./")  # needed on the first run only; uncomment then

    # Load model
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")

    # CKIP does not know the name below, so it is taught via the dictionary.
    word_to_weight = {
        "橋本有菜": 1,
    }
    dictionary = construct_dictionary(word_to_weight)

    # Read the input file into the list-of-sentences form CKIP expects.
    with open('./input.txt', "r", encoding="utf-8") as txt:
        sentence_list = [line.strip('\n') for line in txt]
    print(sentence_list)

    # Run WS-POS-NER pipeline.  Call ``ws(sentence_list)`` without the
    # dictionary to segment without the custom word.
    word_sentence_list = ws(sentence_list, recommend_dictionary=dictionary)
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    # Release model (probably unnecessary once this runs in the cloud).
    del ws
    del pos
    del ner

    def write_word_sentence(output, word_sentence, pos_sentence):
        """Write one sentence's words, space separated, then a newline."""
        assert len(word_sentence) == len(pos_sentence)
        output.write("".join(f"{word}" + " " for word in word_sentence))
        output.write('\n')

    # Show results: only the segmented words are written out.
    with open('output.txt', 'w', encoding='utf-8') as output:
        for word_sentence, pos_sentence in zip(word_sentence_list,
                                               pos_sentence_list):
            write_word_sentence(output, word_sentence, pos_sentence)
def main():
    """Run the ckiptagger WS -> POS -> NER demo and print every result."""

    def show_tagged_sentence(words, tags):
        """Print each word with its tag, separated by ideographic spaces."""
        assert len(words) == len(tags)
        for token, tag in zip(words, tags):
            print(f"{token}({tag})", end="\u3000")
        print()

    # Download data
    data_utils.download_data("./")

    # Load model
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")

    # Create custom dictionary (only printed here; see the alternatives below).
    custom_weights = {
        "土地公": 1,
        "土地婆": 1,
        "公有": 2,
        "": 1,
        "來亂的": "啦",
        "緯來體育台": 1,
    }
    dictionary = construct_dictionary(custom_weights)
    print(dictionary)

    # Run WS-POS-NER pipeline
    sentence_list = [
        "傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。",
        "美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。",
        "",
        "土地公有政策??還是土地婆有政策。.",
        "… 你確定嗎… 不要再騙了……",
        "最多容納59,000個人,或5.9萬人,再多就不行了.這是環評的結論.",
        "科長說:1,坪數對人數為1:3。2,可以再增加。",
    ]
    # Alternative calls:
    #   ws(sentence_list, sentence_segmentation=True)
    #   ws(sentence_list, recommend_dictionary=dictionary)
    #   ws(sentence_list, coerce_dictionary=dictionary)
    word_sentence_list = ws(sentence_list)
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    # Release model
    del ws, pos, ner

    # Show results
    per_sentence = zip(sentence_list, word_sentence_list, pos_sentence_list,
                       entity_sentence_list)
    for sentence, words, tags, entities in per_sentence:
        print()
        print(f"'{sentence}'")
        show_tagged_sentence(words, tags)
        for entity in sorted(entities):
            print(entity)
def __init__(self, model_path=None, dict_path=None, coerce_dict=None):
    """Record the model path and build the optional user dictionaries.

    Args:
        model_path: Model location; stored as ``""`` when not given.
        dict_path: When truthy, loaded with ``load_userdict`` and converted
            into the recommend dictionary; otherwise that stays ``{}``.
        coerce_dict: Same treatment, for the coerce dictionary.

    ``self._ws`` and ``self._pos`` start as ``None``; no model is loaded here.
    """
    self._logger = logging.getLogger(__name__)

    self._recommend_dict = (
        construct_dictionary(self.load_userdict(dict_path))
        if dict_path else {}
    )
    self._coerce_dict = (
        construct_dictionary(self.load_userdict(coerce_dict))
        if coerce_dict else {}
    )

    self._model_path = model_path or ""
    self._ws = None
    self._pos = None
def __init__(self, root_dir, lexicon=None, coerce_dictionary=True):
    """Load the WS model and turn ``lexicon`` into a segmentation dictionary.

    Args:
        root_dir: Model directory passed to ``WS`` (CUDA is left enabled).
        lexicon: Iterable of words, each given weight 1.  ``None`` (the
            default) is treated as an empty lexicon instead of raising
            ``TypeError`` when iterated.
        coerce_dictionary: When truthy the dictionary is stored as
            ``self.coerce_dictionary``; otherwise as
            ``self.recommend_dictionary``.  The other attribute stays ``None``.
    """
    self.ws = WS(root_dir, disable_cuda=False)

    # ``lexicon or ()`` makes the documented default usable.
    word_to_weight = {word: 1 for word in (lexicon or ())}

    self.coerce_dictionary = None
    self.recommend_dictionary = None
    self.segment_delimiter_set = {",", "。", ":", "?", "!", ";", "-"}

    dictionary = construct_dictionary(word_to_weight)
    if coerce_dictionary:
        self.coerce_dictionary = dictionary
    else:
        self.recommend_dictionary = dictionary
def __init__(self, component_config: Dict[Text, Any] = None) -> None:
    """Configure the tokenizer from ``component_config`` and load the WS model.

    Keys read from the configuration:
        model_path: Required; passed to ``WS``.
        recommend_dict_path: Optional; loaded with ``load_userdict`` and
            converted into the recommend dictionary.
        coerce_dict_path: Optional; same treatment for the coerce dictionary.

    Raises:
        Exception: When ``model_path`` is missing or empty.
    """
    super(CKIPTokenizer, self).__init__(component_config)
    config = self.component_config

    # 'model_path' is mandatory.
    if not config.get("model_path"):
        raise Exception("model_path must be configured")

    # Each dictionary stays empty unless its path is configured.
    recommend_path = config.get("recommend_dict_path", None)
    if recommend_path:
        self._recommend_dict = construct_dictionary(
            self.load_userdict(recommend_path))
    else:
        self._recommend_dict = {}

    coerce_path = config.get("coerce_dict_path", None)
    if coerce_path:
        self._coerce_dict = construct_dictionary(
            self.load_userdict(coerce_path))
    else:
        self._coerce_dict = {}

    self._ws = WS(config.get("model_path"))
def ckip(keywords):
    """Segment ``keywords`` with the CKIP Lab word segmenter.

    Args:
        keywords: Sentences passed straight to ``WS``.

    Returns:
        The word-segmentation result returned by ``WS``.

    If ``./school_data.csv`` exists, ``school('school_data', True)`` supplies
    the forced (coerce) words; if ``./school_alias.csv`` exists,
    ``school('school_alias')`` supplies the recommended words.  A missing
    file leaves the corresponding dictionary empty.

    Only the WS model is loaded: the POS and NER models were previously
    loaded and deleted again without being used, which only cost load time
    and memory.
    """
    # Point the tagger at the downloaded model directory.
    ws = WS("./data")

    # Custom dictionaries
    if os.path.isfile('./school_data.csv'):  # official school-name list
        print("發現官方學校名稱檔案,將作為強制詞加入字典")
        force_dictionary = construct_dictionary(school('school_data', True))
    else:
        force_dictionary = {}

    if os.path.isfile('./school_alias.csv'):  # aliases, abbreviations, etc.
        print("發現非官方學校名稱檔案,將作為推薦詞加入字典")
        encourage_dictionary = construct_dictionary(school('school_alias'))
    else:
        encourage_dictionary = {}

    # Analyse the text
    return ws(keywords,
              recommend_dictionary=encourage_dictionary,
              coerce_dictionary=force_dictionary)
def ckip_cut_gpu(input_data, data_col, do_NER=False):
    """Segment one column of ``input_data`` on the GPU, one row at a time.

    Args:
        input_data: DataFrame holding the text; NaN values are replaced by
            empty strings before processing.
        data_col: Name of the column to segment.
        do_NER: When true the row is skipped after printing ``"Not yet."``
            (NER is not implemented), so it contributes no output row.

    Returns:
        A DataFrame with the single column ``CKIP_Result``, one cleaned
        string per processed row.

    Rows are collected in a list and the frame is built once at the end;
    growing a frame with ``DataFrame.append`` copies it on every row and the
    method no longer exists in pandas 2.0.
    """
    from ckiptagger import WS, construct_dictionary

    user_dict = {}
    with open("dict2.txt", "r", encoding='utf-8') as dict_file:
        for raw_line in dict_file:
            fields = raw_line.strip().split(" ")
            # "<word> <weight>" per line; a bare word gets weight 10.
            user_dict[fields[0]] = fields[1] if len(fields) > 1 else 10
    # Built but not passed to ``ws`` below; add
    # ``coerce_dictionary=dictionary`` to the call to enable it.
    dictionary = construct_dictionary(user_dict)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    ws = WS("./data", disable_cuda=False)

    input_data = input_data.replace(np.nan, '', regex=True)
    texts = list(input_data[data_col])
    stopwordslist = stopwordlist()

    # Compiled once instead of on every row; applied in the original order.
    cleaners = (
        re.compile(r'[0-9]'),
        re.compile(r'[^\w\s]'),
        re.compile(r'[a-zA-Z]'),
    )

    total = len(texts)
    rows = []
    for counter, things in enumerate(texts, start=1):
        print("Now: ", str(counter), " of ", total)
        ckip_cut = ws(
            [things],
            sentence_segmentation=True,
            segment_delimiter_set={",", "。", ":", "?", "!", ";", "、"},
        )
        if do_NER:
            print("Not yet.")
            continue

        # NOTE(review): ``ckip_cut`` is whatever ``ws`` returns for the
        # one-sentence batch, so this loop walks that batch result, not the
        # individual tokens.  If the elements are per-sentence token lists,
        # the stop-word test never matches a token and ``str(cutted)`` is a
        # list repr that the regexes below happen to clean up.  Behaviour is
        # kept as is -- confirm the intent before changing it.
        text = ''
        for cutted in ckip_cut:
            if cutted not in stopwordslist:
                text = str(cutted) + " " + text
        for pattern in cleaners:
            text = pattern.sub('', text)
        rows.append(text)

    del ws
    return pd.DataFrame({'CKIP_Result': pd.Series(rows, dtype=object)})
def main(): sql1 = "SELECT id,title FROM bingnews2 WHERE title LIKE '%驚呆%'UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%爆氣%' UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網友這麼說%' UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網友這樣說%'UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網驚%'" #將資料表中部份資料抓出來,若需將資料庫中資料全部抓出來:SELECT [欄位] FROM [資料表] cs1.execute(sql1) idc = [] #id title = [] #標題 user = {} str4 = "" alldata = cs1.fetchall() for s in alldata: idc.append(s[0]) title.append(s[1]) #print(len(idc)) # Load model without GPU ws = WS("請上CKipTagger 的github下載模型,網址詳見READ") #斷詞 pos = POS("請上CKipTagger 的github下載模型,網址詳見READ") #詞性標註 ner = NER("請上CKipTagger 的github下載模型,網址詳見READ") #實體辨識 # Create custom dictionary # 用讀CSV的方式讀取前面匯出的txt df_ner_dict = pd.read_csv(r"停用詞文件儲存位置", delimiter="\t", quoting=csv.QUOTE_NONE, header=None, encoding="utf-8") #使用停用詞 # 存到list df_ner_dict.columns = ['NER'] list_ner_dict = list(df_ner_dict['NER']) dict_for_CKIP = dict((el, 1) for el in list_ner_dict) dict_for_CKIP = construct_dictionary(dict_for_CKIP) for i in range(len(title)): sentence_list = '朴敏英進廠「修鼻子」?最新近照曝光 網驚:有點怪怪的' #若修改成sentence_list = title[i],則可以讀取資料表中所有字串 idh = idc[i] word_s = np.ravel(ws(sentence_list, coerce_dictionary=dict_for_CKIP)) #斷詞 word_p = np.ravel(pos(word_s)) #詞性標註 pos_sentence_list = pos(word_s) print(word_s) print(word_p) for key, value in zip(word_s, word_p): #將斷詞結果和對應詞性以鍵值方式存為JSON檔 user[key] = value jsoninfo = json.dumps(user, ensure_ascii=False) print("complete") # Release model del ws del pos del ner
def cut_func(input_data, data_col, name):
    """Segment one column of ``input_data`` row by row and clean the output.

    Args:
        input_data: DataFrame holding the text; NaN values are replaced by
            empty strings before processing.
        data_col: Name of the column to segment.
        name: Label used only in the progress messages.

    Returns:
        A DataFrame with the single column ``CKIP_Result``, one cleaned
        string per input row.

    For every row ``do_NER`` is started on the segmentation result in a
    separate thread and joined before the next row.  Rows are collected in a
    list and the frame is built once at the end; growing a frame with
    ``DataFrame.append`` copies it on every row and the method no longer
    exists in pandas 2.0.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    from ckiptagger import data_utils, construct_dictionary, WS

    user_dict = {}
    with open("dict.txt", "r", encoding='utf-8') as dict_file:
        for raw_line in dict_file:
            fields = raw_line.strip().split(" ")
            # "<word> <weight>" per line; a bare word gets weight 10.
            user_dict[fields[0]] = fields[1] if len(fields) > 1 else 10
    dictionary = construct_dictionary(user_dict)

    ws = WS("./data", disable_cuda=False)
    print(input_data)

    # NOTE(review): a much longer run of punctuation and emoji used to trail
    # these two literals after a ``#``, so it was a comment and never part of
    # the value.  Confirm whether it was meant to be included.
    punctuation = " 的也//,::" "()\n!!?。"
    # One listed character followed by a space.
    re_punctuation = "[{}] ".format(punctuation)

    # Compiled once instead of on every row; applied in the original order.
    cleaners = (
        re.compile(r'[0-9]'),
        re.compile(r'[a-zA-Z]'),
        re.compile(r'[^\w\s]'),
        re.compile(re_punctuation),
    )

    input_data = input_data.replace(np.nan, '', regex=True)
    tmp_fbtext = list(input_data[data_col])
    stopwordslist = stopwordlist()
    print("Total Data to process: ", len(tmp_fbtext), '\n', '----------------')

    rows = []
    for counter, things in enumerate(tmp_fbtext, start=1):
        print("Now processing:", name, " No.", counter)
        ckip_cut = ws(
            [things],
            sentence_segmentation=True,
            segment_delimiter_set={",", "。", ":", "?", "!", ";", "、"},
            coerce_dictionary=dictionary,
        )

        # NER runs alongside the cleanup below.
        ner_thread = threading.Thread(target=do_NER, args=(ckip_cut,))
        ner_thread.start()

        # NOTE(review): ``ckip_cut`` is whatever ``ws`` returns for the
        # one-sentence batch, so this loop walks that batch result, not the
        # individual tokens.  If the elements are per-sentence token lists,
        # the stop-word test never matches a token and ``str(cutted)`` is a
        # list repr that the regexes below happen to clean up.  Behaviour is
        # kept as is -- confirm the intent before changing it.
        text = ''
        for cutted in ckip_cut:
            if cutted not in stopwordslist:
                text = str(cutted) + " " + text
        for pattern in cleaners:
            text = pattern.sub('', text)
        rows.append(text)

        ner_thread.join()

    return pd.DataFrame({'CKIP_Result': pd.Series(rows, dtype=object)})
def WordSegment_and_write2file(give):
    """Segment ``give`` and return both the cleaned and the raw result.

    The recommend dictionary is built from the pickled word weights stored
    in ``WikiDict_plus_allfieldskeywordsDict.pkl``.  The raw segmentation is
    also appended to the module-level ``All``.

    Args:
        give: Input passed unchanged to ``WS``.

    Returns:
        ``(new_final, word_sentence_list)``: ``new_final`` mirrors
        ``word_sentence_list`` with every token run through
        ``remove_punctuation`` and empty results dropped.
    """
    ws = WS("./data", disable_cuda=False)

    with open('WikiDict_plus_allfieldskeywordsDict.pkl', 'rb') as fp:
        WikiDict_plus_allfieldskeywordsDict = pickle.load(fp)

    delimiters = {",", "。", ":", "?", "!", ";", "?", ",", "、", " ", "。", "!", "? ", "NULL","\n","\n3000","(",")","=","/"}
    word_sentence_list = ws(
        give,
        sentence_segmentation=True,
        segment_delimiter_set=delimiters,
        recommend_dictionary=construct_dictionary(
            WikiDict_plus_allfieldskeywordsDict),
    )
    All.append(word_sentence_list)

    new_final = []
    for sentence_tokens in word_sentence_list:
        stripped = (remove_punctuation(token) for token in sentence_tokens)
        new_final.append([token for token in stripped if token != ""])

    return new_final, word_sentence_list
def __init__(self,
             ckip_data_path='./data',
             custom_dict_path=None,
             disable_cuda=True,
             cuda_memory_limit=2048):
    """Load the WS, POS and NER models, optionally capping GPU memory.

    Args:
        ckip_data_path: Model directory passed to ``WS``, ``POS`` and ``NER``.
        custom_dict_path: When not ``None``, loaded with
            ``__load_custom_dict`` and converted with
            ``construct_dictionary``; otherwise ``self.dictionary`` is ``{}``.
        disable_cuda: Forwarded to the three models.  When false, a virtual
            device with ``cuda_memory_limit`` is configured on the first GPU.
        cuda_memory_limit: Memory limit handed to
            ``VirtualDeviceConfiguration``.

    The memory limit is applied only when TensorFlow reports at least one
    GPU; an empty device list used to fail with ``IndexError`` on ``gpus[0]``.
    """
    if not disable_cuda:
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            try:
                tf.config.experimental.set_virtual_device_configuration(
                    gpus[0], [
                        tf.config.experimental.VirtualDeviceConfiguration(
                            cuda_memory_limit)
                    ])
            except RuntimeError as e:
                # Reported but not fatal; model loading continues.
                print(e)

    # Load model
    self.ws = WS(ckip_data_path, disable_cuda=disable_cuda)
    self.pos = POS(ckip_data_path, disable_cuda=disable_cuda)
    self.ner = NER(ckip_data_path, disable_cuda=disable_cuda)

    if custom_dict_path is not None:
        self.dictionary = construct_dictionary(
            self.__load_custom_dict(custom_dict_path))
    else:
        self.dictionary = {}
# Load the corpus; with LIMIT set only the first 1000 entries are kept.
print("read data in...")
data = np.load(FILENAME)
if LIMIT:
    data = data[:1000]

# Count how often each line of the word file occurs; the count is the weight.
print("read WORD_TO_WEIGHT in...")
word_to_weight = {}
with open(WORD_TO_WEIGHT, encoding='utf-8') as f:
    for line in f:
        word = line.split('\n')[0]
        word_to_weight[word] = word_to_weight.get(word, 0) + 1
# Built here but not passed to ``ws`` below.
dictionary = construct_dictionary(word_to_weight)

# Optional arguments left at their defaults in the call:
#   segment_delimiter_set={",", "。", ":", "?", "!", ";"}  (the default set)
#   recommend_dictionary=...  words in it are encouraged
#   coerce_dictionary=...     words in it are forced
print("start segementation...")
word_sentence_list = ws(data, sentence_segmentation=True)  # honour delimiters

print("start POS...")
pos_sentence_list = pos(word_sentence_list)

# Output name: FILENAME without its last four characters, plus "_ws.json".
print("start to save the result...")
savename = "%s_ws.json" % FILENAME[:-4]
import telepot
import datetime
from telethon import TelegramClient, sync
from telethon import events, functions, types
from telethon.tl.types import PeerUser, PeerChat, PeerChannel
from telethon.tl.functions.messages import AddChatUserRequest
from ckiptagger import data_utils, construct_dictionary, WS
# Library imports done
from config import base
from config import weights_dictionary

# Rebinds ``base`` from the imported module to the value returned by
# ``base.base()``; the entries are read by key below.
base = base.base()
ws = WS("./data")
# NOTE(review): the recommend dictionary is built from
# ``coerce_dictionary()``, the same source as the coerce dictionary just
# below -- looks like a copy/paste slip; confirm which accessor was intended.
recommend_dictionary = weights_dictionary.coerce_dictionary()
recommend_dictionary = construct_dictionary(recommend_dictionary)
coerce_dictionary = weights_dictionary.coerce_dictionary()
coerce_dictionary = construct_dictionary(coerce_dictionary)
# Config and resource files loaded
owner = base['owner']
timezone = base['timezone']
bots_len = len(base['tgbots'])
group_name = base['group_name']
channel_id = base['channel_id']
interval_time = base['interval']
# The configured interval is split evenly across the configured bots.
sleep_time = interval_time / bots_len
# Basic definitions
bots = []
for i in range(len(base['tgbots'])):
# Model directory under the user's home folder.  ``path`` already ends with
# "/", so the argument below contains "//data".
path = str(Path.home()) + '/ckip/'
zh_ws = WS(path + '/data')
weight = 1  # weight given to every user word
user_words = list()
user_dict = dict()
# load user words
with open('ckip_ud.txt', 'r', encoding='utf8') as f:
    for l in f.readlines():
        user_words.append(l.strip())
# Remove duplicates (order is not preserved by the set round trip).
user_words = list(set(user_words))
# create user dictionary
for w in user_words:
    user_dict[w] = weight
user_dictionary = construct_dictionary(user_dict)


def show(doc):
    # Reads ``doc['content']`` and ``doc['title']``.
    global add_word
    title = []
    content = []
    c_sentence = doc['content']
    t_sentence = doc['title']
    n = 0
    # Remove occurrences of each ``add_word`` entry from the title one at a
    # time; the index only advances once the current entry no longer occurs.
    while n < len(add_word):
        if add_word[n] in t_sentence:
            t_sentence = t_sentence.replace(add_word[n], '', 1)
        else:
            n += 1
    doc_title = zh_nlp(t_sentence)
def tokenize(news_df):
    """
    Tokenize news articles and extract keywords per article and per month.

    :param news_df: DataFrame read through its ``Date``, ``Title`` and
        ``Content`` columns.  Per the original note it should already be
        sorted by date in ascending order (January => December).
    :return: tuple ``(df, df_month_key)``.  ``df`` has the columns
        ('Date', 'ori_title', 'ori_news', 'tok_title_news',
        'keyWord_algorithm'); ``df_month_key`` has the columns
        ('Month', 'key_word', 'score').

    NOTE(review): the statement nesting below was reconstructed from a
    flattened source -- confirm it against the original file.
    NOTE(review): this relies on ``DataFrame.append`` and
    ``CountVectorizer.get_feature_names``; check that the installed pandas
    and scikit-learn versions still provide them.
    """
    load_start = time.time()
    define_dict_path = "./TokSentLeo/user_dict/company_dict.txt"
    model_path = './TokSentLeo/CKIP_model/'
    ws = WS(model_path)
    pos = POS(model_path)
    ner = NER(model_path)
    word_to_weight = {}
    with open(define_dict_path, "r", encoding='utf8') as file:
        for line in file:
            # Each line must split into exactly two fields; the second field
            # is ignored and every word gets weight 2.
            key, value = line.split()
            word_to_weight[str(key)] = 2
    dictionary = construct_dictionary(word_to_weight)
    all_date_li = news_df.Date.tolist()
    all_news_li = news_df.Content.tolist()
    all_title_li = news_df.Title.tolist()
    all_news_li2 = []
    # Build the text to segment: "title:content", falling back to whichever
    # part is present when the other is a float (i.e. NaN).
    for title, news in zip(all_title_li, all_news_li):
        if type(news) == float:  # news is nan, only title
            all_news_li2.append(title)
        elif type(title) == float:
            all_news_li2.append(news)
        else:
            all_news_li2.append(title + ":" + news)
    load_end = time.time() - load_start
    tokenize_start = time.time()
    print(
        "Model Load Time:",
        '{:02f}:{:02f}:{:02f}'.format(load_end // 3600,
                                      (load_end % 3600 // 60), load_end % 60))
    word_sentence_list = ws(all_news_li2,
                            recommend_dictionary=dictionary,
                            segment_delimiter_set={
                                ",", "。", ":", "?", "!", ";", ",", ":", "?",
                                "!", ";"
                            })
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)
    temp = []
    temp1 = []
    # Markup remnants and symbols dropped from the token lists.
    bad_list = [
        "<br>", "br", "BR", "<BR>", ",", "【", "】", "╱", "▲", "▼",
        "&amp;amp;amp;amp;lt;br", "&amp;amp;amp;amp;gt", "amp", "lt", "br&", "gt", "&", "[", "]"
    ]
    for w_s_l, e_s_l in zip(word_sentence_list, entity_sentence_list):
        # t = []
        # t1 = []
        # for i, x in enumerate(w_s_l):
        #     if x not in bad_list:
        #         t.append(x)
        #         t1.append(e_s_l[i])
        t = [x for x in w_s_l if x not in bad_list]
        temp.append(t)
        # temp1.append(t1)
    word_sentence_list = temp
    # entity_sentence_list = temp1
    tokenize_end = time.time() - tokenize_start
    print(
        "DL Tokenize Time:",
        '{:02f}:{:02f}:{:02f}'.format(tokenize_end // 3600,
                                      (tokenize_end % 3600 // 60),
                                      tokenize_end % 60))
    algo_start = time.time()
    # by news TF
    TF_news_li = []
    for i, news_toks in enumerate(word_sentence_list):
        temp = dict(Counter(news_toks))
        df = pd.DataFrame(list(temp.items()),
                          columns=['Word', 'TF_norm_score'])
        # Min-max scale the raw counts.
        # NOTE(review): if every count in an article is equal the denominator
        # is 0 -- confirm how the resulting values should be treated.
        df.TF_norm_score = (df.TF_norm_score - df.TF_norm_score.min()) / (
            df.TF_norm_score.max() - df.TF_norm_score.min())
        # for item in temp.items():
        #     df.append(pd.Series(list(item)))
        all_words = df.Word.tolist()
        # Penalise tokens shorter than two characters by mean + 3 * std.
        for i, w in enumerate(all_words):
            if len(w) < 2:
                df.iloc[i, 1] = df.iloc[i, 1] - (df.TF_norm_score.mean() +
                                                 3 * df.TF_norm_score.std())
        TF_news_li.append(df)
    # by news NER
    NER_news_li = []
    for i, en_sentence in enumerate(entity_sentence_list):
        df = pd.DataFrame(columns=['Word', 'NER'])
        word = []
        for entity in en_sentence:
            # The last element of an entity is stored as the word and the
            # one before it as the NER value; repeated words are skipped.
            word_ = entity[-1]
            if word_ in word:
                continue
            word.append(word_)
            temp = [word_, entity[-2]]
            temp = pd.Series(temp, index=df.columns)
            df = df.append(temp, ignore_index=True)
        NER_news_li.append(df)
    # by news TFIDF
    TFIDF_df_li = []
    all_corpus = []
    for sentence in word_sentence_list:
        all_corpus.append(" ".join(sentence))
    # print(all_corpus[0])
    vectoerizer = CountVectorizer(min_df=3,
                                  max_df=0.9,
                                  token_pattern='\\b\\w+\\b')
    vectoerizer.fit(all_corpus)
    X = vectoerizer.transform(all_corpus)
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(X.toarray())
    word = vectoerizer.get_feature_names()
    weight = tfidf.toarray()
    # One frame per document holding only the terms with a positive weight.
    for i in range(len(weight)):
        # print("text:",i)
        df = pd.DataFrame(columns=['Word', 'Tfidf'])
        for j in range(len(word)):
            if weight[i][j] <= 0:
                continue
            temp = [word[j], weight[i][j]]
            temp = pd.Series(temp, index=df.columns)
            df = df.append(temp, ignore_index=True)
            # print(word[j],weight[i][j])
        TFIDF_df_li.append(df)
    # by NEWS TR
    TR_df_li = []
    for sentence in all_corpus:
        text_rank_words = keywords.keywords(sentence, split=True)
        all_length = len(text_rank_words)
        df = pd.DataFrame(columns=['Word', 'TR_normScore'])
        for i, words in enumerate(text_rank_words):
            word_li = words.split()
            for word in word_li:
                # Score falls linearly with position in the keyword list:
                # the first entry scores 1.
                score = (all_length - i) / all_length
                temp = [word, score]
                temp = pd.Series(temp, index=df.columns)
                df = df.append(temp, ignore_index=True)
        all_words = df.Word.tolist()
        # Same short-token penalty as in the TF step.
        for i, w in enumerate(all_words):
            if len(w) < 2:
                df.iloc[i, 1] = df.iloc[i, 1] - (df.TR_normScore.mean() +
                                                 3 * df.TR_normScore.std())
        TR_df_li.append(df)
    # combine all
    COM2_df_li = []
    for tf_df, ner_df, tfidf_df, tr_df in zip(TF_news_li, NER_news_li,
                                              TFIDF_df_li, TR_df_li):
        com_df = tf_df.merge(ner_df, how='outer', on='Word')
        com_df = com_df.fillna(0)

        def transform(s):
            # A non-zero NER value (a recognised entity after the fillna
            # above) is replaced by the sum of the document's median TF-IDF
            # and median TextRank scores.
            if s != 0:
                return (tfidf_df.Tfidf.median() + tr_df.TR_normScore.median()
                        )  # /2
            else:
                return 0.0

        com_df.NER = com_df.NER.map(transform)
        com_df['score'] = com_df.TF_norm_score + com_df.NER
        com2_df = com_df.merge(tfidf_df, on='Word', how='outer')
        com2_df = com2_df.merge(tr_df, on='Word', how='outer')
        com2_df = com2_df.fillna(0)
        # Final score = TF + NER bonus + TextRank + TF-IDF.
        com2_df['score'] = com2_df.score + com2_df.TR_normScore + com2_df.Tfidf
        COM2_df_li.append(com2_df)
    # write result
    df = pd.DataFrame(columns=[
        'Date', 'ori_title', 'ori_news', 'tok_title_news', 'keyWord_algorithm'
    ])  # df's columns
    month = []
    word_month = []
    score_month = []
    for day, title_str, news_str, news_tok_li, com_df in zip(
            all_date_li, all_title_li, all_news_li, word_sentence_list,
            COM2_df_li):
        # Per-article keywords: score above mean + 1.65 * std.
        key_words = com_df[com_df.score > com_df.score.mean() +
                           1.65 * com_df.score.std()]  # 2*
        key_words = key_words.Word.tolist()
        # Candidates for the monthly list use the stricter mean + 2 * std.
        key_words_month = com_df[com_df.score > com_df.score.mean() +
                                 2 * com_df.score.std()]
        words_score_month = key_words_month.score.tolist()
        key_words_month = key_words_month.Word.tolist()
        temp = [
            str(day), title_str, news_str, " ".join(news_tok_li),
            "、".join(key_words)
        ]
        temp = pd.Series(temp, index=df.columns)
        for word, score in zip(key_words_month, words_score_month):
            # The month is the second '/'-separated field of the date string.
            month.append(str(day).split('/')[1])
            word_month.append(word)
            score_month.append(score)
        df = df.append(temp, ignore_index=True)
    # Group the monthly candidates by month.  This depends on the rows
    # arriving grouped by month, and ``month[0]`` requires at least one
    # candidate overall.
    current_month = month[0]
    need_dict = {}
    dict_order = []
    month_order = []
    for i, (m, w, s) in enumerate(zip(month, word_month, score_month)):
        if m != current_month:
            # Month changed: finalise the month collected so far.
            month_order.append(current_month)
            current_month = m
            for k, v in need_dict.items():
                if len(v) < 3:  # DF<3 do not take
                    need_dict[k] = 0
                else:
                    need_dict[k] = np.mean(v)
            dict_order.append(need_dict)
            need_dict = {}
            need_dict[w] = list([float(s)])
            if i == len(month) - 1:
                dict_order.append(need_dict)
        else:
            if w not in list(need_dict.keys()):
                need_dict[w] = list([float(s)])
            else:
                temp = need_dict[w]
                temp.append(float(s))
                need_dict[w] = temp
            if i == len(month) - 1:
                # Last row: finalise the month still being collected.
                month_order.append(m)
                for k, v in need_dict.items():
                    if len(v) < 3:  # DF<3 do not take
                        need_dict[k] = 0
                    else:
                        need_dict[k] = np.mean(v)
                dict_order.append(need_dict)
    df_month_key = pd.DataFrame(
        columns=['Month', 'key_word', 'score'])  # overall month key word with score
    for mo, dict_mo in zip(month_order, dict_order):
        for k, v in dict_mo.items():
            # Skips words zeroed above (seen fewer than 3 times) as well as
            # any mean score below 0.1.
            if v < 0.1:
                continue
            temp = [int(mo), str(k), v]
            temp = pd.Series(temp, index=df_month_key.columns)
            df_month_key = df_month_key.append(temp, ignore_index=True)
    algo_end = time.time() - algo_start
    print(
        "KeyWord Algorithm Time:",
        '{:02f}:{:02f}:{:02f}'.format(algo_end // 3600,
                                      (algo_end % 3600 // 60), algo_end % 60))
    return df, df_month_key