def __init__(self, ckip_data_path='./data', custom_dict_path='./dict'):
    # Load model
    self.ws = WS(ckip_data_path)
    self.pos = POS(ckip_data_path)
    self.ner = NER(ckip_data_path)
    self.dictionary = construct_dictionary(
        self.__load_custom_dict(custom_dict_path))
def name_extractor():
    from ckiptagger import WS, POS, NER
    ws = WS(ckipt_data, disable_cuda=not use_gpu)
    pos = POS(ckipt_data, disable_cuda=not use_gpu)
    ner = NER(ckipt_data, disable_cuda=not use_gpu)

    def extract_name(doc, attr='PERSON'):
        start = timeit.default_timer()
        word_s = ws([doc],
                    sentence_segmentation=True,
                    segment_delimiter_set={
                        '?', '?', '!', '!', '。', ',', ',', ';', ':', '、'
                    })
        word_p = pos(word_s)
        word_n = ner(word_s, word_p)
        stop = timeit.default_timer()
        namelist = set([
            e[3] for e in word_n[0]
            if e[2] == attr and len(e[3]) > 1 and '、' not in e[3]
            and e[3][-1] not in '案犯'
        ])
        return namelist, word_s[0], word_p[0], word_n[0], stop - start

    return extract_name
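A hedged usage sketch for the closure above. `ckipt_data` and `use_gpu` are module-level names that the snippet does not define, so the values below are assumptions, and the input sentence is only an example.

```
import timeit  # extract_name() times the pipeline with timeit internally

ckipt_data = "./data"  # assumption: path to the downloaded CkipTagger model
use_gpu = False        # assumption: run on CPU

extract_name = name_extractor()  # loads WS/POS/NER once and returns the extractor
names, words, tags, entities, seconds = extract_name("傅達仁今將執行安樂死。")
print(names)    # set of PERSON entities longer than one character
print(seconds)  # wall-clock time of the WS -> POS -> NER pipeline
```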
def do_NER(ws_result):
    from ckiptagger import POS, NER
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    pos = POS("./data", disable_cuda=False)
    ner = NER("./data", disable_cuda=False)
    pos_result = pos(ws_result)
    ner_result = ner(ws_result, pos_result)
    class_l = []
    ner_l = []
    # print(ner_result)
    for a in range(len(ner_result[0])):
        popobj = ner_result[0].pop()
        class_l.append(popobj[2])
        ner_l.append(popobj[3])
    ner_output = pd.DataFrame({"Class": list(class_l), "NER": list(ner_l)})
    ner_output.drop_duplicates(inplace=True)
    tmp2 = ner_output.loc[ner_output['Class'].isin(
        ['LOC', 'PERSON', 'ORG', 'LAW', 'EVENT', 'GPE', 'NORP'])]
    tmp3 = tmp2[tmp2['NER'].map(len) >= 2]
    del tmp2
    clean_NER = tmp3.sort_values(by=['Class'])
    del tmp3
    if os.path.isfile("pol_NER.csv"):
        clean_NER.to_csv("pol_NER.csv", mode='a+', header=False, index=False)
        return
    else:
        clean_NER.to_csv("pol_NER.csv", mode='a+', header=True, index=False)
        return
def __init__(self):
    print("prepare ws pos ner")
    assert os.path.exists("./ckiptagger_data"), "ckiptagger_data is not in the same directory"
    self.ws = WS("./ckiptagger_data")
    self.pos = POS("./ckiptagger_data")
    self.ner = NER("./ckiptagger_data")
    clear_output()
def __init__(self, component_config: Dict[Text, Any] = None) -> None:
    super(CKIPFeaturizer, self).__init__(component_config)

    # 'model_path' must be configured; otherwise raise an exception
    if not self.component_config.get("model_path"):
        raise Exception("model_path must be configured")

    self._pos = POS(self.component_config.get("model_path"))
def _initialize(self, label_map_path, ngram_model_path, pos_model_path):
    with open(label_map_path, 'r') as file:
        lines = file.readlines()
        for line in lines[1:]:
            self.replacements.append(
                line.split('\t')[-1].replace('\n', ''))
    self.mdl_2 = KneserNey.load(ngram_model_path)
    self.pos = POS(pos_model_path)
def main():
    # Download data
    # data_utils.download_data("./")  # Needed on the first run; remove the leading #

    # Load model
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")
    word_to_weight = {
        "橋本有菜": 1,
    }  # CKIP does not know "橋本有菜", so we have to teach it
    dictionary = construct_dictionary(word_to_weight)

    txt = open('./input.txt', "r", encoding="utf-8")  # Input text file
    sentence_list = []
    for line in txt:
        line = line.strip('\n')  # Read the file and build the list CKIP expects
        sentence_list.append(line)
    print(sentence_list)

    # Run WS-POS-NER pipeline
    '''sentence_list = [
        "傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。",
        "美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。",
        "",
        "土地公有政策??還是土地婆有政策。.",
        "… 你確定嗎… 不要再騙了……",
        "最多容納59,000個人,或5.9萬人,再多就不行了.這是環評的結論.",
        "科長說:1,坪數對人數為1:3。2,可以再增加。",
    ]'''
    # word_sentence_list = ws(sentence_list)
    word_sentence_list = ws(
        sentence_list,
        recommend_dictionary=dictionary)  # Use this line (with the dictionary) to recognize 橋本有菜; use the line above otherwise
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    # Release model
    del ws
    del pos  # Releasing should not be necessary once this runs in the cloud
    del ner

    # Show results
    output = open('output.txt', 'w', encoding='utf-8')  # Output text file

    def print_word_pos_sentence(word_sentence, pos_sentence):
        assert len(word_sentence) == len(pos_sentence)
        for word, pos in zip(word_sentence, pos_sentence):
            # print(f"{word}", end="\u3000")
            output.write(f"{word}" + " ")  # The actual output happens here
        # print()
        output.write('\n')

    for i, sentence in enumerate(sentence_list):
        # print()
        # print(f"'{sentence}'")
        print_word_pos_sentence(word_sentence_list[i], pos_sentence_list[i])
def main(sentence_list):
    # Download data
    # data_utils.download_data("./")

    # Load model
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")

    # Create custom dictionary
    # word_to_weight = {
    #     "土地公": 1,
    #     "土地婆": 1,
    #     "公有": 2,
    #     "": 1,
    #     "來亂的": "啦",
    #     "緯來體育台": 1,
    # }
    # dictionary = construct_dictionary(word_to_weight)
    # print(dictionary)

    # Run WS-POS-NER pipeline
    # sentence_list = [
    #     "傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。",
    #     "美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。",
    #     "",
    #     "土地公有政策??還是土地婆有政策。.",
    #     "… 你確定嗎… 不要再騙了……",
    #     "最多容納59,000個人,或5.9萬人,再多就不行了.這是環評的結論.",
    #     "科長說:1,坪數對人數為1:3。2,可以再增加。",
    # ]
    word_sentence_list = ws(sentence_list)
    # word_sentence_list = ws(sentence_list, sentence_segmentation=True)
    # word_sentence_list = ws(sentence_list, recommend_dictionary=dictionary)
    # word_sentence_list = ws(sentence_list, coerce_dictionary=dictionary)
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    # Release model
    del ws
    del pos
    del ner

    # Show results
    def print_word_pos_sentence(word_sentence, pos_sentence):
        assert len(word_sentence) == len(pos_sentence)
        for word, pos in zip(word_sentence, pos_sentence):
            print(f"{word}({pos})", end="\u3000")
        print()
        return

    for i, sentence in enumerate(sentence_list):
        print()
        print(f"'{sentence}'")
        print_word_pos_sentence(word_sentence_list[i], pos_sentence_list[i])
        for entity in sorted(entity_sentence_list[i]):
            print(entity)
    return
def load_data():
    # To use GPU:
    # 1. Install tensorflow-gpu (see the installation notes)
    # 2. Set the CUDA_VISIBLE_DEVICES environment variable, e.g. os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    # 3. Set disable_cuda=False, e.g. ws = WS("./data", disable_cuda=False)
    # To use CPU:
    ws_ = WS("./core/data")
    pos_ = POS("./core/data")
    ner_ = NER("./core/data")
    return ws_, pos_, ner_
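The comments above describe the GPU path, but the function itself loads on CPU. Here is a minimal sketch of the GPU variant under the same `./core/data` model path; the device index "0" is an assumption.

```
import os
from ckiptagger import WS, POS, NER

def load_data_gpu(model_path="./core/data"):
    # Step 2 from the comments: choose which GPU is visible.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    # Step 3: pass disable_cuda=False so WS/POS/NER run on the GPU.
    ws_ = WS(model_path, disable_cuda=False)
    pos_ = POS(model_path, disable_cuda=False)
    ner_ = NER(model_path, disable_cuda=False)
    return ws_, pos_, ner_
```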
def open_spider(self, spider):
    settings = get_project_settings()
    cli = MongoClient(settings['MONGO_HOST'])
    self.cur = cli[settings['MONGO_DB']][
        spider.custom_settings['COL_NAME']]
    self.tokenizer = Tokenizer(self.TOKEN)
    # self.ws = WS('./ckip_model', disable_cuda=self.DISABLE_CUDA)
    self.pos = POS('./ckip_model', disable_cuda=self.DISABLE_CUDA)
    self.ner = NER('./ckip_model', disable_cuda=self.DISABLE_CUDA)
def __init__(self, ws_model_path, pos_model_path, w2v_model_path, anti_dict_path):
    nltk.download('wordnet')
    nltk.download('omw')
    self.ws = WS(ws_model_path)
    self.pos = POS(pos_model_path)
    self.model = Word2Vec.load(w2v_model_path)
    self.new_anti = self.build_antidict(anti_dict_path)
    self.cc1 = opencc.OpenCC('t2s')
    self.cc2 = opencc.OpenCC('s2t')
def __init__(self, GPU_MEMORY_FRACTION, CUDA_VISIBLE_DEVICES):
    print("set GPU stat...")
    cfg = tf.ConfigProto()
    cfg.gpu_options.per_process_gpu_memory_fraction = GPU_MEMORY_FRACTION  # Set the GPU memory usage
    session = tf.Session(config=cfg)
    os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES  # Select the GPU index

    print("prepare ws pos ner")
    path = "./module/data"
    self.ws = WS(path, disable_cuda=False)
    self.pos = POS(path, disable_cuda=False)
    self.ner = NER(path, disable_cuda=False)
    clear_output()
def generate_predict_data():
    '''
    Convert the sentences to be predicted into part-of-speech sequences.
    The expected input is the ALL.csv produced by the elementary-school math output.
    '''
    df = pd.read_csv('batch_all_2.csv', encoding='utf-8')
    fail_sentence_list = []
    mapping = []
    answers = []
    count = 0
    sp = 0
    good_pattern = []
    for index, row in df.iterrows():
        pattern = row['Matched_Frame_Sequential'].split(':')
        if row['Result'] == 'Good':
            for p in pattern:
                if p not in good_pattern:
                    good_pattern.append(p)
    for index, row in df.iterrows():
        sent = row['Question'].replace("(1)", "").replace("(2)", "")
        pattern = row['Matched_Frame_Sequential'].split(':')
        split_sent = re.split(r'\?|,|。|:|:', sent)
        if split_sent[-1] == '':
            split_sent = split_sent[:-1]
        if len(split_sent) != len(pattern) or sum(
                [0 if p in good_pattern else 1 for p in pattern]) > 0:
            sp += 1
            continue
        if row['Result'] == 'Fail':
            fail_sentence_list += split_sent
            mapping += [count for _ in range(len(split_sent))]
            answers.append(row['Answer'])
            count += 1

    ws = WS("./data")
    pos = POS("./data")
    fail_word_sent_list = ws(fail_sentence_list)
    fail_pos_sent_list = pos(fail_word_sent_list)

    out = {'Question': [], 'Mapping': [], 'Original': []}
    for i in range(len(fail_pos_sent_list)):
        out['Question'].append(','.join(fail_pos_sent_list[i]))
        out['Original'].append(','.join(fail_word_sent_list[i]))
        out['Mapping'].append(mapping[i])
    out_df = pd.DataFrame.from_dict(out)
    out_df.to_csv('test.csv')
    pdb.set_trace()
    with open('answer.txt', 'w') as file:
        for ans in answers:
            file.write(str(ans) + '\n')
def main():
    # Fetch part of the table; to fetch everything: SELECT [columns] FROM [table]
    sql1 = "SELECT id,title FROM bingnews2 WHERE title LIKE '%驚呆%'UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%爆氣%' UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網友這麼說%' UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網友這樣說%'UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網驚%'"
    cs1.execute(sql1)
    idc = []    # id
    title = []  # title
    user = {}
    str4 = ""
    alldata = cs1.fetchall()
    for s in alldata:
        idc.append(s[0])
        title.append(s[1])
    # print(len(idc))

    # Load model without GPU
    ws = WS("path to the model downloaded from the CkipTagger GitHub (see the README)")    # word segmentation
    pos = POS("path to the model downloaded from the CkipTagger GitHub (see the README)")  # POS tagging
    ner = NER("path to the model downloaded from the CkipTagger GitHub (see the README)")  # named entity recognition

    # Create custom dictionary
    # Read the previously exported txt the same way as a CSV
    df_ner_dict = pd.read_csv(r"path to the stop-word file",
                              delimiter="\t",
                              quoting=csv.QUOTE_NONE,
                              header=None,
                              encoding="utf-8")  # use stop words
    # Store it in a list
    df_ner_dict.columns = ['NER']
    list_ner_dict = list(df_ner_dict['NER'])
    dict_for_CKIP = dict((el, 1) for el in list_ner_dict)
    dict_for_CKIP = construct_dictionary(dict_for_CKIP)

    for i in range(len(title)):
        sentence_list = '朴敏英進廠「修鼻子」?最新近照曝光 網驚:有點怪怪的'  # Change this to sentence_list = title[i] to process every string in the table
        idh = idc[i]
        word_s = np.ravel(ws(sentence_list, coerce_dictionary=dict_for_CKIP))  # word segmentation
        word_p = np.ravel(pos(word_s))  # POS tagging
        pos_sentence_list = pos(word_s)
        print(word_s)
        print(word_p)
        for key, value in zip(word_s, word_p):  # Store the segmentation results and their POS tags as key-value pairs for a JSON file
            user[key] = value
        jsoninfo = json.dumps(user, ensure_ascii=False)
        print("complete")

    # Release model
    del ws
    del pos
    del ner
def pos():
    f = open("train.txt", 'r', encoding='utf-8')
    train = f.readlines()
    f.close()
    f = open("test.txt", 'r', encoding='utf-8')
    test = f.readlines()
    f.close()

    from ckiptagger import WS, POS, NER
    # ws = WS("./data")
    pos = POS("./data")
    # ner = NER("./data")

    out = []
    for line in train:
        line = line.replace('\n', '').split(' ')
        out.append(pos([line])[0])
    return out
def check_model_and_load(self):
    # To use GPU:
    # 1. Install tensorflow-gpu (see Installation)
    # 2. Set the CUDA_VISIBLE_DEVICES environment variable, e.g. os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    # 3. Set disable_cuda=False, e.g. ws = WS("./data", disable_cuda=False)

    # Default: disable CUDA (run on CPU)
    disable_cuda = True
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        # A GPU is visible, so enable CUDA
        disable_cuda = False

    if not self._ws or not self._pos:
        self._logger.info("ckiptagger WS/POS: Model Loading...")
        self._ws = WS(self._model_path, disable_cuda=disable_cuda)
        self._pos = POS(self._model_path, disable_cuda=disable_cuda)
        self._logger.info("ckiptagger WS/POS: Model Done...")
def handler(input_text):
    """The hook of the text handler."""
    # Load model
    ws = WS("/home/erica/hololink/data")
    pos = POS("/home/erica/hololink/data")
    ner = NER("/home/erica/hololink/data")

    sentence_list = [input_text]
    word_sentence_list = ws(sentence_list)
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    output = {'Input_text': sentence_list[0]}
    for i, entity in enumerate(entity_sentence_list[0]):
        output[f'Entity_{i}'] = entity
    output_json = json.dumps(output, sort_keys=True, indent=4, ensure_ascii=False)
    # result_text = f'hello world, the text is {input_text}\nentity is {sorted(entity_texts)}'
    return output_json
def nlp(sent): ws = WS("./data") pos = POS("./data") ner = NER("./data") sentence_list=[] sentence_list.append(sent) word_list = ws(sentence_list) pos_list = pos(word_list) entity_list = ner(word_list,pos_list) return_list=[] flat_list=[] for word_sent in word_list: for word in word_sent: flat_list.append(word) word_list=[] word_list=flat_list flat_list=[] for pos_sent in pos_list: for pos in pos_sent: flat_list.append(pos) pos_list=[] pos_list=flat_list words_and_pos=dict(zip(word_list,pos_list)) return_list.append(words_and_pos) return_list.append(entity_list) return return_list
def ckip(keywords):
    """
    CKIP Lab Chinese NLP
    """
    # Point the three tools at the model we downloaded earlier
    # Load model
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")

    # Custom dictionaries
    if os.path.isfile('./school_data.csv'):  # Check for the official school-name list
        print("Found the official school-name file; adding it to the dictionary as coerced words")
        force_dictionary = construct_dictionary(school('school_data', True))
    else:
        force_dictionary = {}
    if os.path.isfile('./school_alias.csv'):  # Aliases, abbreviations, etc.
        print("Found the unofficial school-name file; adding it to the dictionary as recommended words")
        encourage_dictionary = construct_dictionary(school('school_alias'))
    else:
        encourage_dictionary = {}

    # Analyze the text
    ws_results = ws(keywords,
                    recommend_dictionary=encourage_dictionary,
                    coerce_dictionary=force_dictionary)
    # pos_results = pos(ws_results)
    # ner_results = ner(ws_results, pos_results)  # ner(text, POS results)

    # Results
    # print(ws_results)   # word segmentation
    # print(pos_results)  # POS tags
    # for name in ner_results[0]:  # named entity recognition
    #     print(name)

    # Release memory
    del ws
    del pos
    del ner

    return ws_results
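For reference, a self-contained sketch of the recommend/coerce dictionary mechanism used above, with made-up entries; the `school()` helper and the CSV files belong to that project and are not assumed here.

```
from ckiptagger import WS, construct_dictionary

ws = WS("./data")

# Weights bias the segmenter toward keeping these strings as single tokens.
force_dictionary = construct_dictionary({"國立臺灣師範大學": 1})  # coerced: always kept whole
encourage_dictionary = construct_dictionary({"師大": 1})          # recommended: preferred, not forced

ws_results = ws(["他畢業於國立臺灣師範大學,大家都叫它師大。"],
                recommend_dictionary=encourage_dictionary,
                coerce_dictionary=force_dictionary)
print(ws_results[0])
```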
def __init__(self, ckip_data_path='./data', custom_dict_path=None,
             disable_cuda=True, cuda_memory_limit=2048):
    if not disable_cuda:
        gpus = tf.config.experimental.list_physical_devices('GPU')
        try:
            tf.config.experimental.set_virtual_device_configuration(
                gpus[0],
                [tf.config.experimental.VirtualDeviceConfiguration(
                    cuda_memory_limit)])
        except RuntimeError as e:
            print(e)

    # Load model
    self.ws = WS(ckip_data_path, disable_cuda=disable_cuda)
    self.pos = POS(ckip_data_path, disable_cuda=disable_cuda)
    self.ner = NER(ckip_data_path, disable_cuda=disable_cuda)

    if custom_dict_path is not None:
        self.dictionary = construct_dictionary(
            self.__load_custom_dict(custom_dict_path))
    else:
        self.dictionary = {}
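The constructor above caps GPU memory with TensorFlow 2's virtual-device API before loading the models. Below is a standalone sketch of that same pattern; the `./data` path and the 2048 MB cap mirror the defaults above but are assumptions for any given setup.

```
import tensorflow as tf
from ckiptagger import WS, POS, NER

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Cap the first GPU at 2048 MB so the CKIP models do not grab all of its memory.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
    except RuntimeError as e:
        # Virtual devices must be configured before the GPU is initialized.
        print(e)

ws = WS("./data", disable_cuda=False)
pos = POS("./data", disable_cuda=False)
ner = NER("./data", disable_cuda=False)
```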
            if p is not None:
                res.append({"path": url, "matchType": phone_t, "match": p.group(0)})
                found = 1
                break  # once we add one post to the list, we abort
    post_regex = time.time()
    print(
        f"phone,email check finished, found: {found == True}, took {post_regex - pre_regex} seconds..."
    )
    # NER, time-consuming.
    end_time = post_regex
    if not found:
        print("=======names stage=====")
        pre_load_model = time.time()
        ws = WS("./data")
        pos = POS("./data")
        ner = NER("./data")
        post_load_model = time.time()
        print(f"model loaded, took {post_load_model - pre_load_model} seconds...")
        pre_ckip = time.time()
        sentence_list = [title, content]
        # ws, pos, ner in order
        ws_list = ws(sentence_list)
        pos_list = pos(ws_list)
        entity_list = ner(ws_list, pos_list)
        del ws, pos, ner
        post_ckip = time.time()
        print(
            f"ckip-preprocessing(ws,pos,ner) finished, took {post_ckip - pre_ckip} seconds..."
        )
# -*- coding: UTF-8 -*-
import tensorflow as tf
from ckiptagger import WS, POS, NER  # Import CkipTagger

ckip_Data = r"D:\CKIPtagger_data\data"    # Path to the pretrained model data
ws = WS(ckip_Data, disable_cuda=False)    # Point to the model data
pos = POS(ckip_Data, disable_cuda=False)  # Point to the model data
ner = NER(ckip_Data, disable_cuda=False)  # Point to the model data

sentence_list = ["美國政府今天證實,會修改美國企業和中國華為公司商業往來的相關禁令,允許美企和華為合作制定5G標準。路透社引述消息人士報導,這項法規修改獲美國商務部等部門通過,12日已提交聯邦公報(Federal Register),最快16日公布。美國商務部長羅斯(Wilbur Ross)發聲明證實這項舉措:「美國不會放棄在全球創新領域的領導地位,(商務部)會致力保護美國的國家安全和外交政策利益,鼓勵美國產業全面參與和提倡美國技術成為國際標準。」美國商務部今天稍後也公開宣布,美國參與標準制定「會影響5G、自動駕駛汽車、人工智慧和其他尖端技術的未來」。華為暫未對此事發表評論。美國去年將華為列入出口管制實體清單(entity list),以國家安全為由,禁止華為在未經政府許可下,向美國企業採購零組件和技術。產業人士和政府官員表示,此舉不該被解讀為美國對全球最大電信設備製造商華為立場軟化。他們表示,華為被列入實體清單,不利美國參與標準的制定,因為美企無法確定哪些資訊可分享,美國工程師在制定標準的會議中不出聲,反會讓華為取得更大的發言權。資訊科技業協會(Information Technology Industry Council)亞洲政策高階主任威爾遜(Naomi Wilson)說:「2019年5月實體清單的更新引發混亂,無意中使美國公司被排除在一些技術標準對話之外,使美企處於劣勢。」資訊科技業協會所代表的企業包含亞馬遜(Amazon.com Inc)、高通(Qualcomm Inc)和英特爾(Intel Corp.)等大廠。日經亞洲評論(Nikkei Asian Review)指出,華為在5G標準制定上,近年已躍居全球領導者地位。德國的專利統計公司IPlytics一份研究指出,華為在5G標準的研發上排名世界第一,提出的相關專利截至今年1月便有3147項,三星(Samsung)、中興通訊(ZTE)與樂金電子(LG Electronics)分居第2到第4。設於波士頓的顧問企業「策略分析公司」(Strategy Analytics)也有一份類似研究。在分析國際電訊標準訂定組織「第三代合作夥伴計畫」(3GPP)的600個會員企業後,發現華為在制定5G標準的貢獻度上執世界牛耳。日經亞洲評論認為,美國去年5月所頒的華為禁令阻止美企與華為的技術合作,當時就有許多政府官員與科技業者警告,這會傷害到美國參與全球5G標準的制定。"]

word_s = ws(sentence_list,
            sentence_segmentation=True,
            segment_delimiter_set={'?', '?', '!', '!', '。', ',', ',', ';', ':', '、'})
print(word_s)
    'PAUSECATEGORY', 'PERIODCATEGORY', 'QUESTIONCATEGORY', 'SEMICOLONCATEGORY',
    'SPCHANGECATEGORY', 'WHITESPACE', 'Nh', 'Nf', 'T', 'Nep', 'D', 'Neu', 'Di',
    'Caa', 'Cab', 'Cba', 'Cbb'
}


def init_ckip_models():
    from ckiptagger import data_utils
    if not os.path.isdir('data'):
        data_utils.download_data('./')


init_ckip_models()
ws_cls = WS('data')
pos_cls = POS('data')
ner_cls = NER('data')
wordcloud = WordCloud(background_color='white',
                      font_path='C:\\Windows\\Fonts\\msjh.ttc',
                      width=800,
                      height=600)


def find_files_in_dir(directory, file_pattern):
    for root, dir_names, file_names in os.walk(directory):
        for file_name in file_names:
            if fnmatch.fnmatch(file_name, file_pattern):
                yield os.path.join(root, file_name)


def main():
def nlp(sent): ws = WS("./data") pos = POS("./data") ner = NER("./data") sentence_list = [] sentence_list.append(sent) word_list = ws(sentence_list) pos_list = pos(word_list) entity_list = ner(word_list, pos_list) return_list = [] flat_list = [] for word_sent in word_list: for word in word_sent: flat_list.append(word) word_list = [] word_list = flat_list flat_list = [] for pos_sent in pos_list: for pos in pos_sent: flat_list.append(pos) pos_list = flat_list words_and_pos = dict(zip(word_list, pos_list)) noun_list = [] for key, value in words_and_pos.items(): arrow_indicate = "" if "Na" in value: noun_list.append(key) elif "Nb" in value: noun_list.append(key) elif "Ncd" in value: arrow_indicate = "plus" if "上" in key: arrow_indicate = "plus" elif "下" in key: arrow_indicate = "down" elif "前" in key: arrow_indicate = "left" elif "後" or "中" or "內" or "裡" or "外" or "邊" in key: arrow_indicate = "right" print(arrow_indicate + " " + str(len(noun_list))) noun_list.append(arrow_indicate) if len(noun_list) >= 2: temp = "" temp = noun_list[len(noun_list) - 2] noun_list[len(noun_list) - 2] = noun_list[len(noun_list) - 1] noun_list[len(noun_list) - 1] = temp elif "Nc" in value: noun_list.append(key) elif "VC" in value: if "起" in key: arrow_indicate = "up" noun_list.append(arrow_indicate) elif "下" in value: arrow_indicate = "down" noun_list.append(arrow_indicate) return noun_list
def ckiptagger_fun_init():
    global ws, pos, ner
    data_path = "./data"
    ws = WS(data_path, disable_cuda=False)
    pos = POS(data_path, disable_cuda=False)
    ner = NER(data_path, disable_cuda=False)
```

## Segmenting Texts

The initialized word segmenter object, `ws()`, can tokenize any input **character vectors** into a list of **word vectors** of the same size.

```
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER

# Set parameter path
MODEL_PATH = '../../../NTNU/CorpusLinguistics/CorpusLinguistics_bookdown/data/'
# '/Users/Alvin/Dropbox/NTNU/CorpusLinguistics/CorpusLinguistics_bookdown/data/'

## Loading model
# ws = WS('/Users/Alvin/Dropbox/NTNU/CorpusLinguistics/CorpusLinguistics_bookdown/data/')
ws = WS(MODEL_PATH)
# ws = WS('../../../NTNU/CorpusLinguistics/CorpusLinguistics_bookdown/data/')
pos = POS(MODEL_PATH)
ner = NER(MODEL_PATH)

## Raw text corpus
sentence_list = ['傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。',
                 '美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。',
                 '土地公有政策??還是土地婆有政策。',
                 '… 你確定嗎… 不要再騙了……他來亂的啦',
                 '最多容納59,000個人,或5.9萬人,再多就不行了.這是環評的結論.',
                 '科長說:1,坪數對人數為1:3。2,可以再增加。']

## Other parameters
# sentence_segmentation = True,  # To consider delimiters
# segment_delimiter_set = {",", "。", ":", "?", "!", ";"},  # This is the default set of delimiters
# recommend_dictionary = dictionary1,  # Words in this dictionary are encouraged
# coerce_dictionary = dictionary2,  # Words in this dictionary are forced
```
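A minimal sketch of a single `ws()` call with the optional parameters listed above spelled out; `dictionary1` and `dictionary2` are placeholders built here from example entries and are not part of the original text.

```
# Example dictionaries for the two optional parameters.
dictionary1 = construct_dictionary({'緯來體育台': 1})  # recommended (encouraged) words
dictionary2 = construct_dictionary({'土地公': 1})      # coerced (forced) words

word_sentence_list = ws(sentence_list,
                        sentence_segmentation=True,
                        segment_delimiter_set={",", "。", ":", "?", "!", ";"},
                        recommend_dictionary=dictionary1,
                        coerce_dictionary=dictionary2)

print(len(word_sentence_list) == len(sentence_list))  # one word vector per input sentence
print(word_sentence_list[0])
```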
                and word not in ckip_dropped_words \
                and not word.startswith('阿') \
                and not word.startswith('恩') \
                and not word.startswith('齁') \
                and not word.startswith('哈'):
            nes[word] = ckip_type_map[word_type]
    return nes


# Segment articles
print("Segment articles...")
load_nes = util.load_dictionary("../dataset/named_entities.txt")
coerce_words = dict([(k, 1) for k in load_nes])
ws = WS("../ckipdata")  # , disable_cuda=not GPU)
print(" CKIP Pos articles...")
pos = POS("../ckipdata")
print(" CKIP Ner articles...")
ner = NER("../ckipdata")
article_words = ws(articles,
                   coerce_dictionary=util.construct_dictionary(coerce_words))
ckip_pos_words = pos(article_words)
ckip_entity_words = ner(article_words, ckip_pos_words)
ckip_nes = parse_nes(ckip_entity_words)
print("Segment articles done.")

# Recognize name entities
print("Recognize name entities...")
# Write header
with open(fillout_file, "w") as fp:
    fp.write(
ws = None
pos = None

parser = argparse.ArgumentParser(
    description="Export all .html files under a path into a single .vrt file")
parser.add_argument("-c", "--ckip-path",
                    help="Path to the ckiptagger model folder", required=True)
parser.add_argument("--output",
                    help="Full path (including file name) of the .vrt file to write",
                    required=True)
parser.add_argument("--html-path", help="Path containing the .html files",
                    required=True)
parser.add_argument("--use-gpu", help="Pass this flag to use the GPU",
                    action="store_true")
parser.add_argument("--use-mp", help="Pass this flag to use multiprocessing",
                    action="store_true")
args = parser.parse_args()

# Initialize WS and POS according to whether the GPU is requested
if args.use_gpu:
    ws = WS(args.ckip_path, disable_cuda=False)
    pos = POS(args.ckip_path, disable_cuda=False)
else:
    ws = WS(args.ckip_path, disable_cuda=True)
    pos = POS(args.ckip_path, disable_cuda=True)

t1 = datetime.now()

# Whether to use multiprocessing
if args.use_mp:
    # filter() makes sure only .html files are processed
    glob_iter = filter(lambda x: x.endswith(".html"),
                       glob.iglob(f"{args.html_path}/**", recursive=True))
    mp_handler(output_path=args.output, glob_iter=glob_iter)
else:
    with open(args.output, "w") as f:
        for root, _, files in os.walk(args.html_path):
            for file in files:
# GPU_MEMORY_FRACTION = 0.7
FILENAME = sys.argv[1]
WORD_TO_WEIGHT = sys.argv[2]
LIMIT = int(sys.argv[3])
CUDA_VISIBLE_DEVICES = str(sys.argv[4])
GPU_MEMORY_FRACTION = float(sys.argv[4])

print("set GPU stat...")
cfg = tf.ConfigProto()
cfg.gpu_options.per_process_gpu_memory_fraction = GPU_MEMORY_FRACTION  # Set the GPU memory usage
session = tf.Session(config=cfg)
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES  # Select the GPU index

print("prepare ws pos ner")
ws = WS("./data", disable_cuda=False)
pos = POS("./data", disable_cuda=False)
ner = NER("./data", disable_cuda=False)

print("read data in...")
data = np.load(FILENAME)
if LIMIT:
    data = data[:1000]

print("read WORD_TO_WEIGHT in...")
word_to_weight = {}
with open(WORD_TO_WEIGHT, encoding='utf-8') as f:
    for line in f:
        word = line.split('\n')[0]
        if word not in word_to_weight:
            word_to_weight[word] = 1
        else:
# -*- coding: UTF-8 -*-
# WS (word segmentation), POS (part-of-speech tagging), NER (named entity recognition)
from ckiptagger import WS, POS, NER
import pandas as pd

modelPath = 'C:/Users/wmmkslab/Desktop/CKIP/data'
ws = WS(modelPath)
pos = POS(modelPath)
ner = NER(modelPath)
newsPath = 'udnNews/news.xlsx'

if __name__ == '__main__':
    df = pd.read_excel(newsPath, usecols='B:F')
    while 1:
        numInput = input('select one content(1~1000): ')
        if numInput.isdigit():
            numInput = int(numInput)
            numInput -= 1
            break
        else:
            print('please input a number(1~1000)!')

    # Remove '\n', '\r'
    content = ''
    for i in df['Content'][numInput]:
        if i != '\n' and i != '\r':
            content += i

    # WS
    word_s = ws([content],