async def thesaurusInitialization():
    # Dynamically load the thesaurus
    _p = './HolyGrailWar/Config/Goods/Goods.json'
    _content = await Utils.readFileToJSON(_p)
    for _g in _content['goodslist']:
        jieba.add_word(_g['name'])
        for _a in _g['abbreviation']:
            jieba.add_word(_a)
def add_words(self, yanwenzi_dict_list, freq=100, tag='ywz'):
    '''
    Add new words to the segmentation dictionary.
    yanwenzi_dict_list may be a dict or a list; its entries are added to jieba's dictionary.
    '''
    if isinstance(yanwenzi_dict_list, dict):
        for k, v in yanwenzi_dict_list.items():
            jieba.add_word('_{}_'.format(v), freq=freq, tag=tag)
    elif isinstance(yanwenzi_dict_list, list):
        for word in yanwenzi_dict_list:
            jieba.add_word('_{}_'.format(word), freq=freq, tag=tag)
    else:
        raise Exception('yanwenzi_dict_list format must be dict or list.')
    print('jieba add words. length :', len(yanwenzi_dict_list))
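A hedged usage sketch of the method above, taken out of its class: the emoticon mapping is a made-up example, and it assumes the input text has already had each emoticon replaced by its wrapped `_name_` placeholder.

import jieba

yanwenzi = {'(^_^)': 'smile', 'T_T': 'cry'}          # hypothetical emoticon -> name mapping
for v in yanwenzi.values():
    jieba.add_word('_{}_'.format(v), freq=100, tag='ywz')

# After registration the placeholder should survive segmentation as a single token.
print(jieba.lcut('今天心情_smile_真不错'))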
"""Data processing"""
'''
jieba segmentation module
'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import time
import jieba_fast
import jieba_fast.posseg as pseg

jieba_fast.load_userdict('/Users/zhuxinquan/Desktop/mykeyword.dict')
jieba_fast.add_word('烤鸭炉')

# Store the stop words
fid2 = '/Users/zhuxinquan/Desktop/停用词调整_二手.txt'
stopword = {}
fid2 = open(fid2, 'r')
for j in fid2.readlines():
    stopword[j.strip().decode("utf-8")] = 1


def stop_word(line):
    data_line = line.strip()
    wordList = jieba_fast.cut(data_line)  # wordList is a generator
    outStr = ''
    t1 = time.time()
    for word in wordList:
        if word not in stopword:
            outStr += word
            outStr += ' '
    return outStr
import re

try:
    import jieba_fast as jieba  # assumption: the truncated try-body imported a faster drop-in
except Exception as e:
    import jieba

try:
    # model_dir is predefined on the online platform; locally the lookup fails
    print(model_dir)
    test_size = 0.025
    online = True
except:
    model_dir = "pai_model/"
    test_size = 0.05
    online = False

new_words = "支付宝 付款码 二维码 收钱码 转账 退款 退钱 余额宝 运费险 还钱 还款 花呗 借呗 蚂蚁花呗 蚂蚁借呗 蚂蚁森林 小黄车 飞猪 微客 宝卡 芝麻信用 亲密付 淘票票 饿了么 摩拜 滴滴 滴滴出行".split(" ")
for word in new_words:
    jieba.add_word(word)

star = re.compile(r"\*+")

if False:
    stops = [
        "、", "。", "〈", "〉", "《", "》", "一", "一切", "一则", "一方面", "一旦", "一来", "一样",
        "一般", "七", "万一", "三", "上下", "不仅", "不但", "不光", "不单", "不只", "不如", "不怕",
        "不惟", "不成", "不拘", "不比", "不然", "不特", "不独", "不管", "不论", "不过", "不问", "与",
        "与其", "与否", "与此同时", "且", "两者", "个", "临", "为", "为了", "为什么", "为何", "为着",
        "乃", "乃至", "么", "之", "之一", "之所以", "之类", "乌乎", "乎", "乘", "九", "也", "也好",
        "也罢", "了", "二", "于", "于是", "于是乎", "云云", "五", "人家", "什么", "什么样", "从",
        "从而", "他", "他人", "他们", "以", "以便", "以免", "以及", "以至", "以至于", "以致", "们",
        "任", "任何", "任凭", "似的", "但", "但是", "何", "何况", "何处", "何时", "作为", "你",
        "你们", "使得", "例如", "依", "依照", "俺", "俺们", "倘", "倘使", "倘或", "倘然", "倘若",
        "借", "假使", "假如", "假若", "像", "八", "六", "兮", "关于", "其", "其一", "其中", "其二",
        "其他", "其余", "其它", "其次", "具体地说", "具体说来", "再者", "再说", "冒", "冲", "况且",
        # ... (stop-word list truncated in this excerpt)
    ]
def add_jieba(wds, tag):
    for wd in wds:
        jieba.add_word(wd, tag=tag, freq=3000000)
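A hedged usage sketch for add_jieba; the word list and POS tag are illustrative only (the words are borrowed from other snippets in this section).

import jieba

domain_words = ['花呗', '收钱码', '烤鸭炉']
add_jieba(domain_words, tag='n')                  # register as nouns with a very high frequency
print(jieba.lcut('花呗能不能给烤鸭炉的订单退款'))  # the registered words should stay whole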
# -*- coding: utf-8 -*-
"""
@Time : 2020/3/8 11:06
@Auth : joleo
@File : bm_recall.py
"""
# https://blog.csdn.net/byn12345/article/details/81112973
from gensim import corpora, models, similarities
from load_data import read_test, read_context, read_train
from collections import defaultdict
import jieba_fast as jieba
from recall.rank_bm25 import BM25Okapi
import pandas as pd

# jieba.load_userdict(file_name='')
jieba.add_word('复工')
jieba.add_word('稳岗')
jieba.add_word('医保局')
jieba.add_word('暖企')
# jieba.del_word('医保局')

paper_data = read_context(
    'data/NCPPolicies_context_20200301/NCPPolicies_context_20200301.csv')
train_data = read_train(
    './data/NCPPolicies_train_20200301/NCPPolicies_train_20200301.csv')
valid_data = read_test('./data/NCPPolicies_test/NCPPolicies_test.csv')

train_data['question'] = train_data['question'].map(
    lambda x: x.replace('\n', ''))
train_data['answer'] = train_data['answer'].map(lambda x: x.replace('\n', ''))
valid_data['question'] = valid_data['question'].map(
    lambda x: x.replace('\n', ''))
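A hedged sketch of how a BM25 index is typically built on top of this setup using rank_bm25's public API; the context column name 'text' is an assumption about what read_context returns.

tokenized_corpus = [jieba.lcut(doc) for doc in paper_data['text']]   # 'text' column is assumed
bm25 = BM25Okapi(tokenized_corpus)

query_tokens = jieba.lcut(train_data['question'].iloc[0])
scores = bm25.get_scores(query_tokens)   # one BM25 relevance score per context document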
import numpy as np
import jieba
from gensim.models import Word2Vec

mode = 0
char_size = 128
maxlen = 256
min_count = 16

word2vec = Word2Vec.load('../word2vec_baike/word2vec_baike')
id2word = {i + 1: j for i, j in enumerate(word2vec.wv.index2word)}
word2id = {j: i for i, j in id2word.items()}
word2vec = word2vec.wv.syn0
word_size = word2vec.shape[1]
# Prepend a zero vector so id 0 can serve as padding / unknown
word2vec = np.concatenate([np.zeros((1, word_size)), word2vec])

# Register every embedding-vocabulary word that jieba does not already know
for w in word2id:
    if w not in jieba.dt.FREQ:
        jieba.add_word(w)


def tokenize(s):
    return jieba.lcut(s, HMM=False)


def sent2vec(S):
    """S format: [[w1, w2], ...]"""
    V = []
    for s in S:
        V.append([])
        for w in s:
            # Repeat each word id once per character for char-level alignment
            for _ in w:
                V[-1].append(word2id.get(w, 0))
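A hedged illustration of why the loop above force-adds every embedding word: with HMM disabled, jieba only emits words present in its frequency dictionary (jieba.dt.FREQ), so an unregistered vocabulary word would be split into pieces and never map back to its embedding. The example word is illustrative.

import jieba

print(jieba.lcut('蚂蚁森林', HMM=False))   # likely ['蚂蚁', '森林'] while the word is unknown
jieba.add_word('蚂蚁森林')                 # same call as in the loop above
print(jieba.lcut('蚂蚁森林', HMM=False))   # ['蚂蚁森林'] once it is in jieba.dt.FREQ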
import pandas as pd
import jieba
import jieba.posseg as jieba_posseg


def Autotag_gen(sentences_list):
    global existing_ner_df
    global jieba_dict_df

    OOVs_df = OOV_scan_run(sentences_list)
    #print(OOVs_df)
    #print('\n\n')

    #### Post-processing for OOV:
    ### OOV post-processing 1: adding OOVs to existing OOV data:
    existing_ner_df = pd.concat([existing_ner_df, OOVs_df],
                                axis=0,
                                ignore_index=True)

    ### OOV post-processing 2: adding OOVs to jieba data:
    for i in OOVs_df.itertuples():
        OOV = str(i.word)
        OOV_nertag = str(i.ner)
        OOV_postag = ner2pos[OOV_nertag]
        jieba_freq = jieba.suggest_freq(OOV, tune=False)
        jieba.add_word(OOV, jieba_freq, tag=OOV_postag)
        jieba_dict_df = jieba_dict_df.append(
            pd.DataFrame([[OOV, jieba_freq, OOV_postag]],
                         columns=jieba_dict_df.columns))
    del OOVs_df

    #### Run hash-tag generation
    # #### Hash-tag generation step 1: cleaning raw text + split into sentences
    # preprocessing_sentences = text_preprocessing(raw_sentences)
    # preprocessing_sentences_list = DummySentencizer(preprocessing_sentences, split_characters=['。」', '。', ';', '!', '*']).sentences
    # preprocessing_sentences_list = [s for s in preprocessing_sentences_list if len(s) > 1]

    #### Hash-tag generation step 2: tokenize the pre-processed sentences
    #### Hash-tag generation step 2.1: apply jieba to tokenize the pre-processed sentences
    tokens = []
    for preprocessing_sentence in sentences_list:
        tokens += [
            (t, pos) for t, pos in jieba_posseg.cut(preprocessing_sentence, HMM=False)
            if pos in ['nr', 'nt', 'ns', 'n', 'nz', 'x'] and len(t) > 1
        ]
    tokens = list(set(tokens))
    tokens = sorted(tokens, key=lambda x: len(x[0]), reverse=False)

    #### Hash-tag generation step 2.2: clean tokens
    tokens_clean = []
    for index, (token, pos) in enumerate(tokens):
        sub_tokens = tokens[index + 1:]
        Sum_check = 0
        for sub_t, sub_pos in sub_tokens:
            if token in sub_t:
                Sum_check += 1
            else:
                Sum_check += 0
        if Sum_check == 0:
            tokens_clean.append((token, pos))
    del tokens
    # duplicate_check = pd.DataFrame(tokens_duplicate_check, columns=['token', 'pos', 'Sum_check'])
    # duplicate_check.sort_values('Sum_check', ascending=True, inplace=True)
    # duplicate_check.drop_duplicates('token', inplace=True)
    # tokens_clean = list(duplicate_check.to_records(index=False))
    tokens_clean = sorted(tokens_clean, key=lambda x: len(x[0]), reverse=True)

    #### Hash-tag generation step 3: locate hash-tags by NER classes
    raw_autotags = []
    for (token, postag) in tokens_clean:
        ##---- Hash-tag generation step 3.1: convert jieba POS tags into NER classes
        if token in existing_ner_df['word'].values:
            ner_class = existing_ner_df[existing_ner_df['word'] == token]['ner'].values[0]
            raw_autotags.append((token, ner_class))
            # print((token, ner_class))
        elif token in jieba_dict_df['word'].values:
            # print('cannot process: ', token, postag)
            try:
                ner_class = pos2ner[jieba_dict_df[jieba_dict_df['word'] == token]['pos'].values[0]]
                raw_autotags.append((token, ner_class))
            except:
                print(token, postag)
            #print((token, ner_class))
            # print(jieba_dict_df[jieba_dict_df['word']==token]['pos'].values[0])
        else:
            #print((token, postag))
            continue

    ##--- Hash-tag generation step 3.2: filter by NER class
    del tokens_clean
    autotags = []
    for (token, ner_class) in raw_autotags:
        if len(token) >= 5 and ner_class not in ['J']:
            autotags.append((token, ner_class))
        elif len(token) > 2 and len(token) < 5 and ner_class not in ['TITLE', 'J']:
            #print(token, ner_class)
            autotags.append((token, ner_class))
        elif len(token) == 2 and ner_class not in ['PRODUCT', 'TITLE', 'J', 'TERM']:
            #print(token, ner_class)
            autotags.append((token, ner_class))
    del raw_autotags

    autotags_df = pd.DataFrame(autotags, columns=['hashtags', 'type'])
    autotags_df.drop_duplicates('hashtags', inplace=True)
    return autotags_df
def jiebaCustomSetting(self, dict_path, usr_dict_path):
    jieba.set_dictionary(dict_path)
    with open(usr_dict_path, "r", encoding="utf-8") as dic:
        for word in dic:
            jieba.add_word(word.strip("\n"))
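A hedged side note on the method above: when the user dictionary follows jieba's one-entry-per-line `word [freq] [tag]` format, jieba.load_userdict achieves the same effect in one call; both paths below are hypothetical placeholders.

import jieba

jieba.set_dictionary('dict.txt.big')   # hypothetical main-dictionary path
jieba.load_userdict('userdict.txt')    # hypothetical user dictionary: word [freq] [tag] per line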