def segment_pos(dir='rawdata', datetype='all', outdir='nohref_seg'):
    jieba.set_dictionary('dict/dict.txt.big')
    for tag in loadTag():
        jieba.add_word(tag)
    chinese_postagger = StanfordPOSTagger('tagger/chinese-distsim.tagger',
                                          'tagger/stanford-postagger.jar',
                                          encoding='utf-8')
    for file in parseDateType(dir, datetype):
        dirname, filename = os.path.split(file)
        head = filename.split('.')[0]
        outfile = outdir + '/' + head + '.txt'
        if os.path.isfile(outfile):
            print 'pass %s...' % head
            continue
        print 'segment %s ...' % head
        f = open(outfile, 'w')
        dataList = readJson(file)
        p = re.compile("http[s]?://.*\n")
        for data in dataList:
            content = data['content']
            content = re.sub(p, '', content)
            segList = jieba.cut(content)
            wordList, tagList = postagging(chinese_postagger, segList)
            for w, t in zip(wordList, tagList):
                f.write(w.encode('utf-8'))
                f.write(' ')
                f.write(t)
                f.write(' ')
            f.write('\n')
        f.close()
def fetch_data(readfile, writefile):
    f = open('custom.dict', 'r')
    for line in f:
        jieba.add_word(line.strip())
    f.close()
    fp = open(readfile, 'r')
    ID = []
    words = []
    for line in fp:
        test = []
        content = line.split('\t')
        ID.append(content[0])
        final = jieba.cut(content[1])
        a = ' '.join(final)
        test.append(a.strip())
        words.append(test)
    fpp = open(writefile, 'wb')
    i = 0
    for item in words:
        fpp.write(ID[i] + '\t')
        for itemo in item:
            fpp.write(itemo)
        fpp.write('\n')
        i = i + 1
    fp.close()
    fpp.close()
    return
def init_jieba():
    jieba.dt.check_initialized()
    with codecs.open('new_words', 'r', 'utf-8') as f:
        for line in f:
            w = line.strip()
            if w:
                jieba.add_word(w)
def generate_top():
    from collections import defaultdict
    import simplejson as json
    import operator

    from_product = defaultdict(lambda: 0)
    results = defaultdict(lambda: 0)
    for product in cols['product'].find().sort('_id', -1):
        for k in product.keys():
            from_product[k] += 1
    product_keys = dict(from_product)
    for w in list(product_keys.keys()):
        jieba.add_word(w, tag='nz')
    progress = 0
    for comment in cols['mobile_comment'].find(projection={'content': 1}):
        c = comment['content']
        words = jieba.analyse.extract_tags(c, topK=20, withWeight=False,
                                           allowPOS=('ns', 'n', 'nz'))
        for w in words:
            results[w] += 1
        progress += 1
        if progress % 100 == 0:
            print('Current Progress: ', progress)
    sorted_x = reversed(sorted(dict(results).items(), key=operator.itemgetter(1)))
    json.dump(
        list(sorted_x),
        open('sorted_mobile.json', mode='w', encoding='utf-8'),
        ensure_ascii=False,
        indent=2
    )
def cut_news(news, names):
    # Add the person names to jieba as custom words
    for name in names:
        jieba.add_word(name, freq=1000, tag='nr')
    # Initial segmentation with POS tags
    words = [[word.encode('utf-8'), flag.encode('utf-8')]
             for word, flag in pesg.cut(news)]
    # Record the indexes of names and relationship keywords in words
    k = 0
    names_loc = []
    rels_loc = []
    for word, flag in words:
        cur = relationship_keywords.find_one({'keywords': word})
        if cur:
            rel = cur['name'].encode('utf-8')
            if rel in rel_one_way:
                words[k][1] = 'relationwords_one'
            elif rel in rel_two_way_single:
                words[k][1] = 'relationwords_two_single'
            elif rel in rel_two_way_double:
                words[k][1] = 'relationwords_two_double'
            rels_loc.append(k)
        elif word in names:
            words[k][1] = 'namewords'
            names_loc.append(k)
        k += 1
    if (len(names_loc) != len(names)) | (len(rels_loc) < 1):
        return False
    return [words, rels_loc, names_loc]
def main():
    starttime = datetime.datetime.now()
    path = os.path.abspath('.')
    path = path.split('/')
    basepath = "/".join(path[:-2])
    dictpath = os.path.join(basepath, 'data/myDict.txt')
    jieba.load_userdict(dictpath)
    target_rel = u'夫妻'
    train_user_path = os.path.join(basepath, 'data/train_user.txt')
    with open(train_user_path) as f:
        userdata = f.readlines()
    userset = []
    for line in userdata:
        userset.append(line[:-1])
    for user in userset[0:1]:
        tupu_path = os.path.join(basepath, 'data/train/entity_tupu/entity_tupu.%s' % user)
        with open(tupu_path) as f:
            tupu_data = f.readlines()
        entity_pair = []
        for line in tupu_data:
            data = line[:-1].split('\t')
            rel = data[0].decode('utf-8')
            entity1 = data[1].decode('utf-8')
            entity2 = data[2].decode('utf-8')
            if rel == target_rel:
                entity_pair.append([entity1, entity2])
        datapath = os.path.join(basepath, 'data/train/entity_sentence/entity_sentence.%s' % user)
        with open(datapath) as f:
            dataset = f.readlines()
        three_split_set = []
        for line in dataset:
            try:
                data = line[:-1].split('\t')
                entity1 = data[1].decode('utf-8')
                entity2 = data[2].decode('utf-8')
                sentence = data[0].decode('utf-8')
                if [entity1, entity2] in entity_pair or [entity2, entity1] in entity_pair:
                    print sentence, entity1, entity2
                    jieba.add_word(entity1, 1000)
                    jieba.add_word(entity2, 1000)
                    three_split = cut_sentence(sentence, entity1, entity2)
                    if three_split == None:
                        continue
                    three_split_set.append(three_split)
                # if rel in sentence:
                #     print sentence
            except Exception, e:
                print e
def addToDictionary():
    """Grabs list of checked words and adds to the operating dictionary

    Note: If word already exists in dictionary, increments frequency"""
    # request.form.get() returns a single value; getlist() is needed to get
    # every checked word, otherwise the loop iterates over characters.
    wordList = request.form.getlist('segCheckbox')
    for word in wordList:
        jieba.add_word(word, 1)
    flash("You successfully updated the dictionary!")
    return redirect(url_for('model_show_entries'))
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("快递", 10000)
    # Note: without tune=True, suggest_freq only returns the suggested
    # frequency and does not modify the dictionary.
    jieba.suggest_freq(("面", "太厚"))
    jieba.suggest_freq(("价格", "便宜"))
    jieba.suggest_freq(("服务", "周到"))
    jieba.suggest_freq(("速度", "快"))
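# If the intent of the calls above was to actually keep those pairs as separate
# tokens, a minimal sketch with the tuned call would look like this (an
# assumption about the original intent, not the original code):
import jieba

jieba.suggest_freq(("面", "太厚"), tune=True)  # tune=True applies the suggested frequency
print(jieba.lcut("面太厚"))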
def tokenize(sentence, addwords=None):
    if addwords is not None:
        for word in addwords:
            jieba.add_word(word)
    tokens = []
    for term in jieba.tokenize(sentence):
        tokens.append(term[0])
    return tokens
def add_word(self, key, code):
    """Dynamically add a custom word to the jieba dictionary."""
    if code not in MYSELF_ADD_DICT_TYPE:
        logger.info("invalid type for dynamically added word")
        return
    user_weight, user_type = MYSELF_ADD_DICT_TYPE.get(code)
    # add_word(word, freq=None, tag=None) and del_word(word) modify the
    # dictionary at runtime.
    jieba.add_word(key, user_weight, user_type)
    self.ADD_MAP_DATA.setdefault(key, {})
    self.ADD_MAP_DATA[key][user_type] = user_weight
def load_user_library(file):
    '''Load a user dictionary to increase segmentation accuracy.'''
    if isinstance(file, str):
        jieba.load_userdict(file)
    elif isinstance(file, list):
        for value in file:
            jieba.add_word(value.lower())
    else:
        pass
def cn_ci(dir_path):
    for rdf in ci_list:
        jieba.add_word(rdf[0])
    all_text = u""
    for file_name in os.listdir(dir_path):
        if file_name.find(".txt") != -1:
            file_path = "/".join([dir_path, file_name])
            with open(file_path, "r") as f:
                all_text += f.read().decode("utf-8")
    terms = jieba.cut(all_text)
    return [ci for ci in ','.join(terms).split(',') if ci not in [u'', u" "]]
def addDictToJieba():
    # Road name list
    content = open('../data_crawl/finalRoads.txt', 'r').read().strip('\n')
    contentList = content.split('\n')
    print len(contentList)
    # Load the district-name dictionary
    districtNameList = grab('/home/yr/intellicredit/data/' + 'districtNameList0503')
    test_sent = [
        "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿,上海市浦东区\n"
    ]
    # Cut before adding the dictionary
    # print test_sent[0].decode('utf-8')
    words = jieba.cut(test_sent[0].decode('utf-8'))
    # print('/'.join(words))
    # Add the word dictionary to jieba
    for w in districtNameList[:] + contentList:
        # print w
        jieba.add_word(w)
    # Add district names that are missing from the dictionary
    jieba.add_word('浦东区')
    jieba.add_word('浦东新区')
    jieba.del_word('上海市')
    jieba.add_word('兰城路')
    words = jieba.cut(test_sent[0].decode('utf-8'))
def jieba_processing_txt(text):
    for word in userdict_list:
        jieba.add_word(word)

    mywordlist = []
    seg_list = jieba.cut(text, cut_all=False)
    liststr = "/ ".join(seg_list)

    with open(stopwords_path, encoding='utf-8') as f_stop:
        f_stop_text = f_stop.read()
        f_stop_seg_list = f_stop_text.splitlines()

    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordlist.append(myword)
    return ' '.join(mywordlist)
def get_key_words():
    client = MongoClient('localhost', 27017)
    db = client['baidu']
    relationship_keywords = db.relationship_keywords
    file = open('/Users/wutong/workspace/da/baidu/data/condensedata/condensetraindata.txt')
    for line in file.readlines():
        items = line.split()
        names = [items[1], items[2]]
        news = items[3]
        rel = items[0]
        for name in names:
            jieba.add_word(name, freq=None, tag='nr')
        words = pesg.cut(news)
        for word, flag in words:
            if flag == 'n':
                relationship_keywords.update({'name': rel},
                                             {'$addToSet': {'keywords': word}})
def Get_fenci(self):
    # jieba.add_word('石墨烯')  # dynamically add a custom word
    jieba.add_word('凱特琳')
    jieba.del_word('自定义词')
    jieba.add_word("易风化")
    filtered_tokens = []
    test_sent = ""
    for i in range(1, 2):
        Data_path = path + "he" + ".txt"
        test_sent = "".join(open(Data_path, 'rb').read())
        print(test_sent)
        words = jieba.cut(test_sent)
        filtered_tokens.append([each for each in jieba.cut(test_sent)])
    print('-' * 40)
    print(json.dumps(filtered_tokens))
    print("=" * 40)
def handle(data):
    oper = json.loads(data)
    if oper[0] == 'cut':
        return json.dumps(tuple(jieba.cut(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'cut_for_search':
        return json.dumps(tuple(jieba.cut_for_search(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'tokenize':
        return json.dumps(tuple(jieba.tokenize(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
def rich_train_data_by_editor(self, files=[]):
    """Enrich the train data with manually edited rules."""
    # 20140910_1427 No improvement; accuracy actually dropped by one or two
    # percentage points.
    import jieba
    dict_dir = None
    for file1 in files:
        parsed = ReadManualKps.process(dict_dir + file1)
        for node_name1, node_features in parsed.iteritems():
            features2 = [node_name1] + node_features
            # 1. Add features to jieba
            for name2 in features2:
                jieba.add_word(name2, 1000000)  # 1000000 copied from lianhua
            # 2. Add features to tags_tree.name_to_nodes
            nodes_set2 = self.name_to_nodes.get(node_name1, set([]))
            for node3 in nodes_set2:
                node3_feature_max_value = max(node3.features_weight.values() or [0.25])
                for feature4 in features2:
                    node3.features_weight[feature4] = node3_feature_max_value
                    self.feature_to_nodes[feature4].add(node3)
def condense_key_words(rel):
    client = MongoClient('localhost', 27017)
    db = client['baidu']
    relationship_keywords = db.relationship_keywords
    file = open('/Users/wutong/workspace/da/baidu/data/condensedata/condensetraindata.txt')
    file1 = ''
    name_list = []
    for line in file.readlines():
        items = line.split()
        names = [items[1], items[2]]
        news = items[3]
        rela = items[0]
        if rela == rel:
            for name in names:
                name_list.append(name)
                jieba.add_word(name, freq=None, tag='nr')
            file1 = file1 + news + '\n'
    tags = jieba.analyse.extract_tags(file1, topK=20, withWeight=False, allowPOS=())
    for tag in tags:
        if tag.encode('utf-8') not in name_list:
            relationship_keywords.update({'name': rel},
                                         {'$addToSet': {'keywords': tag}})
def handlemsg(data):
    oper = loadsjson(data)
    if oper[0] == 'c2m':
        return dumpsjson(mc.c2m.translate(*oper[1:]))
    elif oper[0] == 'm2c':
        return dumpsjson(mc.m2c.translate(*oper[1:]))
    elif oper[0] == 'c2m.raw':
        return dumpsjson(mc.c2m.rawtranslate(oper[1]))
    elif oper[0] == 'm2c.raw':
        return dumpsjson(mc.m2c.rawtranslate(oper[1]))
    elif oper[0] == 'modelname':
        return dumpsjson(mc.name())
    elif oper[0] == 'cut':
        return dumpsjson(tuple(jieba.cut(*oper[1], **oper[2])))
    elif oper[0] == 'cut_for_search':
        return dumpsjson(tuple(jieba.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'tokenize':
        return dumpsjson(tuple(jieba.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut':
        return dumpsjson(tuple(jiebazhc.cut(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut_for_search':
        return dumpsjson(
            tuple(jiebazhc.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.tokenize':
        return dumpsjson(tuple(jiebazhc.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
    else:
        return dumpsjson('Command not found')
def getCorpusFromFile(self, filename):
    '''Read the text file and build the segmented corpus.'''
    # Strip special symbols
    punctuation = re.compile("[a-zA-Z0-9\s+\.\!\/_,$%^*(+\"\']+|[-+——!,。?、~@#¥%……&*()]+".decode('utf-8'))
    # Raise the weight of the keywords
    with open('keywords.txt', 'r') as parse_file:
        for eachline in parse_file:
            jieba.add_word(eachline.strip())
    # Handle stop words
    jieba.analyse.set_stop_words('stop_words.txt')
    # Read the training data
    with open(filename, "rb") as fp:
        trainData = [line.strip().split("\t") for line in fp]
    # Segment
    self.corpus = [[wordSplit(content[0], punctuation), int(content[1])]
                   for content in trainData]
def add_usr_dict(path, sep=','):
    with open(path, 'r') as f:
        for l in f.xreadlines():
            items = l.split(sep)
            if len(items) == 3:
                jieba.add_word(items[0].rstrip(), int(items[1].rstrip()), items[2].rstrip())
            elif len(items) == 2:
                jieba.add_word(items[0].rstrip(), int(items[1].rstrip()))
            elif len(items) == 1:
                jieba.add_word(items[0].rstrip())
            else:
                raise ValueError('too few fields in word info \'%s\'' % (l.strip()))
def add_word(word_list):
    for items in word_list:
        jieba.add_word(items)
        end_index = end_index + 1
    return sub_contents


def build_new_sub_content(sub_content):
    cut_content_word_list = pseg.cut(sub_content, use_paddle=True)
    new_content_list = []
    for word, flag in cut_content_word_list:
        new_content_list.append(word)
        new_content_list.append(flag)
    return ''.join(new_content_list)


# Words tagged ORG are used for deletion
jieba.add_word('热情', tag='a')
jieba.add_word('位置', tag='n')
jieba.add_word('预定', tag='n')
jieba.add_word('卫生', tag='n')
jieba.add_word('不值', tag='v')
jieba.add_word('齐全', tag='a')
jieba.add_word('也', tag='ORG')
jieba.add_word('服务态度', tag='n')
jieba.add_word('隔音', tag='n')
jieba.add_word('房间隔音', tag='n')
jieba.add_word('安静', tag='a')
jieba.add_word('无语')
jieba.add_word('有味道')
jieba.add_word('极')
jieba.add_word('最')
jieba.add_word('太')
def add_word(text, number):
    strs1 = getNewWordsByNLPIR(text, number)
    for i in strs1:
        jieba.add_word(i)
    for i in my_words_list:
        jieba.add_word(i)
        for name, data in self.data[index].items():
            for alias in set(data['alias']):
                if re.search(ur"[省市县]$", alias):
                    jieba.add_word(alias, 10000000)
                elif re.search(ur"[区]$", alias):
                    jieba.add_word(alias, 1000000)
                else:
                    jieba.add_word(alias, 100000)
                for suffix in u"路镇乡圩河区村":
                    jieba.add_word(u"{}{}".format(alias, suffix), 1000000)

        names = file2iter(file2abspath('region_dict.txt', __file__))
        for name in names:
            jieba.add_word(name.strip(), 1)
        # jieba.del_word(u"广州药业")

    def normalize_region_name(self, name, xtype):
        if not hasattr(self, "normalizeRegion_mapped"):
            setattr(self, "normalizeRegion_mapped", collections.Counter())
        mapped = getattr(self, "normalizeRegion_mapped")
        if len(name) > 2:
            name = re.sub(u"[省市]+$", "", name)
        if name in [u"市辖区"]:
            return name
        if name in ["", u"省市"]:
def __init__(self, url):
    # Crawl the team-introduction page
    res = requests.get(url)
    # Collect the introduction text
    selector = etree.HTML(res.text)
    content = selector.xpath(
        "//div[@class='piece-body p-lg clearfix']/p/text()")
    content = re.sub('[\\\\x|\s|,|)|(|,|:|:|!|!|、|。]', '',
                     ''.join(content)).strip()
    # jieba segmentation
    jieba.add_word('学号')
    jieba.add_word('Python')
    jieba.add_word('MySQL')
    jieba.add_word('JavaScript')
    jieba.add_word('HTML')
    jieba.add_word('Java')
    self.__words = list(jieba.cut(content))
    self.__record_set = set(self.__words)
    self.__record_dict = {}
    for word in self.__record_set:
        self.__record_dict[word] = self.__words.count(word)
    print(self.__record_dict)
seg_list = jieba.cut("网易智造N520除螨吸尘器", cut_all=True) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 seg_list = jieba.cut("宇宙沙盘控温被 薄被", cut_all=False) print("Default Mode: " + "/ ".join(seg_list)) # 精确模式 seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式 print(", ".join(seg_list)) seg_list = jieba.cut_for_search("心想智能胶囊咖啡机") # 搜索引擎模式 for seg in seg_list: print(seg) # 自定义关键词库 jieba.load_userdict('./user_dict.txt') jieba.add_word('石墨烯') jieba.add_word('凱特琳') jieba.del_word('自定义词') test_sent = ("李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n" "例如我输入一个带“韩玉赏鉴”的标题,宇宙沙盘控温被 薄被在自定宇宙沙盘控温被 薄被此词为N类\n" "「台中」正確應該网易智造N520除螨吸尘器不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。") words = jieba.lcut('心想智能胶囊咖啡机', cut_all=False) for word in words: if word in jieba.user_word_tag_tab: print(word) print(jieba.user_word_tag_tab.__contains__('薄被')) # 将所有的产品名进行分词,按词频进行排序并统计,词频较高的作为分类关键词
import numpy as np
import matplotlib.pyplot as plt
# Word cloud generator
from wordcloud import WordCloud, ImageColorGenerator
# Needed to handle Chinese text
import matplotlib.font_manager as fm

# Background image
bg = np.array(Image.open("alice.png"))

# Path of the current project folder
d = path.dirname(__file__)

# Stop-word list
stopwords_path = 'wangfeng.txt'

# Add custom words for segmentation
jieba.add_word("丁")

# Text to analyse
text_path = "wangfeng.txt"
# Read the text to analyse
text = open(path.join(d, text_path), encoding="utf8").read()
# Remove the specified content
text = re.sub(u'.*?::', '', text)


# Helper for segmentation
def jiebaclearText(text):
    # Collect the segments that are not stop words
    mywordList = []
    # Segment
    seg_list = jieba.cut(text, cut_all=False)
def pro(filename, path, choose):
    global xxxxxx
    f = open(path, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    keyWords = re.findall(r'\((.*?)\)', text)
    for word in keyWords:
        jieba.add_word(word)
        xxxxxx.append(word)
    xx = []
    text = jieba.cut(text)
    for i in text:
        xx.append(i)
    text = ' '.join(xx)
    sss = []
    numberOfDelete = 0
    for m in re.finditer(r'\( (.*?) \)|\[ (.*?) \]|\{ (.*?) \}', text):
        # print(m.start(), m.end(), text[m.start():m.end()])
        numberOfDelete += 2
        dic = {}
        beginPos = m.start() + 2 - numberOfDelete
        endPos = m.end() - numberOfDelete - 2
        dic['beginPos'] = str(beginPos)
        dic['endPos'] = str(endPos)
        dic['text'] = m.group(0)[2:-2]
        if (text[m.start()] == '('):
            dic['type'] = 'LOC'
        elif (text[m.start()] == '['):
            dic['type'] = 'PER'
        elif (text[m.start()] == '{'):
            dic['type'] = 'ORG'
        numberOfDelete += 2
        sss.append(dic)
    number = 1
    m = []
    for obj in sss:
        x = []
        x.append('T' + str(number))
        x.append(obj['type'])
        x.append(obj['beginPos'])
        x.append(obj['endPos'])
        x.append(obj['text'])
        m.append('\t'.join(x))
        number += 1
    filename = filename.split('.')[0]
    f = open('../data/conll2003/en/' + choose + '/' + filename + '.ann', 'w', encoding='utf-8')
    f.write('\n'.join(m))
    f.close()
    f1 = open('../data/conll2003/en/' + choose + '/' + filename + '.txt', 'w', encoding='utf-8')
    text = text.replace('( ', '').replace(' )', '')
    text = text.replace('[ ', '').replace(' ]', '')
    text = text.replace('{ ', '').replace(' }', '')
    f1.write(text)
    f1.close()
import re

data = pd.read_excel('Excel/Laptop_data_processed.xls')
title_list = data.raw_title.values.tolist()

new_title_list = []
# Clean each title: keep only Chinese characters, letters and digits
for title in title_list:
    new_title = "".join(re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]', title))
    new_title_list.append(new_title)

import jieba

# Add custom Chinese words
corpus = ['吃鸡', 'ThinkPad', 'MacBookAir', 'IdeaPad']
for word in corpus:
    jieba.add_word(word)

# Segment the titles
title_list = []
for title in new_title_list:
    title_cut = jieba.lcut(title)  # list
    title_list.append(title_cut)

# Build the stop-word list
stop_words = []
with open('Text/Stop words.txt') as f:
    for line in f:
        # Use the iterated line directly; calling f.readline() here would
        # skip every other line.
        word = line.strip()
        stop_words.append(word)
stop_words.append('笔记本电脑')
def init():
    # dic_path = '/Users/li/PycharmProjects/huihongcaihui/src/corpus'
    dic_path = conf.dic_path
    # Dictionary files
    d_path = dic_path + "/程度副词_datatang.txt"
    s_path = dic_path + "/senti.txt"
    f_path = dic_path + "/fou.txt"
    b_path = dic_path + "/but.txt"
    e_path = dic_path + "/eng.txt"
    l_path = dic_path + "/limit.dict"
    a_path = dic_path + "/dic.txt"
    ns_path = dic_path + "/新增_stock"
    n_path = dic_path + "/新增"
    n2_path = dic_path + "/新增2"
    st_path = dic_path + "/stock_words.txt"
    st_new_path = dic_path + "/stock.csv"
    zhi_ne_path = dic_path + "/知网/zhi_neg.txt"
    zhi_po_path = dic_path + "/知网/zhi_pos.txt"
    # Fund-company entity names, e.g. "工银瑞信基金", "华泰柏瑞基金", "东方基金"
    jg_path = dic_path + "/机构"

    # New words for jieba
    word_add = set()
    for d in open(d_path):
        # temp = d.decode("utf-8").split(" ")
        temp = d.split(" ")
        word_arr = temp[1].strip("\n").rstrip(" ").split("、")
        for w in word_arr:
            deg_dict[w] = float(temp[0])
            word_add.add(temp[0])
    for s in open(s_path):
        # temp = s.decode("utf-8").split(" ")
        temp = s.split(" ")
        senti_dict[temp[0]] = float(temp[1])
        word_add.add(temp[0])
    for e in open(e_path):
        temp = e.split(" ")
        eng_dict[temp[0]] = float(temp[1])
        word_add.add(temp[0])
    for f in open(f_path):
        # f = f.decode("utf-8-sig")
        fou_dict.append(f.strip("\n"))
        word_add.add(f.strip("\n"))
    for b in open(b_path):
        but_dict.append(b.strip("\n"))
        word_add.add(b.strip("\n"))
    for l in open(l_path):
        lim_dict.append(l.strip("\n"))
        word_add.add(l.strip("\n"))
    for a in open(a_path):
        new_dict.append(a.strip("\n"))
        word_add.add(a.strip("\n"))
    for st in open(st_path):
        # st = st.decode("utf8")
        code1, st_code = st.split("\t")
        code, stock = st_code.split(",")
        stock_code_dict.append(code.strip("\n"))
        stock_dict.append(stock.strip("\n"))
        word_add.add(code.strip("\n"))
        word_add.add(stock.strip("\n"))

    stocks_df = pd.read_csv(st_new_path, encoding='utf-8')
    stock_df.append(stocks_df.set_index('SESNAME'))
    for index, row in stocks_df.iterrows():
        stock_dict.append(row.SESNAME)

    for z1 in open(zhi_ne_path):
        # z1 = z1.decode("utf8")
        new_dict.append(z1.strip("\n"))
        word_add.add(z1.strip("\n"))
    for z2 in open(zhi_po_path):
        # z2 = z2.decode("utf8")
        z2_data = z2.strip("\n")
        new_dict.append(z2_data)
        word_add.add(z2_data)
    for jg in open(jg_path):
        # jg = jg.decode("utf8")
        jg_data = jg.split("\t")[0].strip("\n")
        new_dict.append(jg_data)
        word_add.add(jg_data)

    '''
    # Convert the stock words in stock_words.txt into the jieba user-dict
    # format and write them into the jieba userdict file.
    for st in open(st_path):
        code1, st_code = st.split("\t")
        code, stock = st_code.split(",")
        stock_dict.append(code + ' ' + '5' + ' ' + 'n')
        stock_dict.append(stock.strip('\n').decode('utf-8') + ' ' + '5' + ' ' + 'n')
    apply_func = codecs.open(n_path, 'w', 'utf-8')
    for i in stock_dict:
        apply_func.write(i + '\n')  # newline separator
    apply_func.close()
    '''

    # Load the user-defined dictionaries
    jieba.load_userdict(ns_path)
    jieba.load_userdict(n_path)
    jieba.load_userdict(jg_path)
    jieba.load_userdict(n2_path)

    # Add the new words
    for w in word_add:
        jieba.add_word(w)

    # A few extra words added directly to jieba
    jieba.add_word("淡定")
    # jieba.add_word("加多宝")
    # jieba.add_word("红罐")
    jieba.add_word("非公开")
    jieba.add_word("不成人形")
    jieba.add_word("中美贸易战")
    logging.logger.info("[Info] jieba added {} custom words in total.".format(len(word_add)))
- Download the traditional Chinese dictionary from [`jieba-tw`](https://raw.githubusercontent.com/ldkrsi/jieba-zh_TW/master/jieba/dict.txt)

  ```
  jieba.set_dictionary(file_path)
  ```

- Add your own project-specific dictionary

  ```
  jieba.load_userdict(file_path)
  ```

- Add ad-hoc words to the dictionary

  ```
  jieba.add_word(word, freq=None, tag=None)
  ```

- Remove words

  ```
  jieba.del_word(word)
  ```

- Chinese stopwords (see [林宏任老師 GitHub](https://github.com/tomlinNTUB/Python/tree/master/%E4%B8%AD%E6%96%87%E5%88%86%E8%A9%9E))
  - `jieba.cut()` does not interact with the stopword list
  - `jieba.analyse.set_stop_words(file_path)`
- Word segmentation
  - `jieba.cut()` returns a `generator` object
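- Filtering stopwords after segmentation — since `jieba.cut()` ignores the stopword list, a common pattern is to filter the tokens yourself (a minimal sketch; the stopword file path and sentence are placeholders):

  ```
  import jieba

  with open("stopwords.txt", encoding="utf-8") as f:
      stopwords = set(line.strip() for line in f)

  # jieba.cut() returns a generator; drop stopwords and whitespace tokens afterwards
  tokens = [w for w in jieba.cut("我来到北京清华大学") if w.strip() and w not in stopwords]
  print(tokens)
  ```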
# entity mention: the recognised entity name
# pos_b: start position of the entity mention in the file (0-based)
# pos_e: end position of the entity mention in the file
# category: the category the entity belongs to
# DICT_NOW.csv:
#   a dictionary built from the language of all annotations
# This part only needs to map each entity mention in ***.txt to its entity
# category and write it out to obtain DICT_NOW.csv (in practice, medical
# entities can be crawled from medical websites or medical encyclopedias).
dics = csv.reader(open("./source_data/DICT_NOW.csv", 'r', encoding='utf8'))

# =======================================================================================
# Feed the proper nouns to jieba as custom words; the entity category is used
# as the part-of-speech tag.
for row in dics:
    if len(row) == 2:
        jieba.add_word(row[0].strip(), tag=row[1].strip())  # force the added word to be one joined token
        jieba.suggest_freq(row[0].strip())
# =======================================================================================
# Read the target files, produce IOB-format labels, and write the dev, train
# and test files.
split_num = 0
start_time = time.time()
for file in os.listdir(c_root):
    if "txtoriginal.txt" in file:
        fp = open(c_root + file, 'r', encoding='utf8')
        for line in fp:
            split_num += 1
            words = pseg.cut(line)  # segmentation with POS tags
            # key: word; value: part of speech
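# The snippet above stops before the actual labelling. Below is a minimal
# sketch (not the original code) of the IOB conversion its comments describe,
# assuming the entity categories from DICT_NOW.csv were used as the POS tags.
# The category set here is a hypothetical example.
entity_categories = {'DIS', 'SYM', 'TES'}  # assumed category tags, for illustration only

def to_iob(pairs):
    """Turn (word, flag) pairs from pseg.cut into per-character IOB labels."""
    labelled = []
    for word, flag in pairs:
        if flag in entity_categories:
            for i, ch in enumerate(word):
                labelled.append((ch, ('B-' if i == 0 else 'I-') + flag))
        else:
            labelled.extend((ch, 'O') for ch in word)
    return labelled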
plt.rcParams['savefig.dpi'] = 200  # dpi for saved figures, affects the figure's size
plt.rcParams['figure.dpi'] = 200   # dpi for displayed figures
w, l, p = plt.pie(values, explode=[0.02 for i in range(16)], labels=labels,
                  pctdistance=0.8, radius=1, rotatelabels=True,
                  autopct=make_autopct(values))
[t.set_rotation(315) for t in p]  # rotate the label text
plt.title('豆瓣 TOP250 电影种类', y=-0.1)
plt.show()

# word cloud
jieba.add_word('久石让')
jieba.add_word('谢耳朵')

# Interjections and other words that carry no meaning
del_words = [
    '的', ' ', '人', '就是', '一个', '被', '电影', '我们', '不是', '也', '最', '了', '才', '给',
    '要', '就', '让', '在', '都', '是', '与', '和', '不', '有', '我', '你', '能', '每个', '不会',
    '中', '没有', '这样', '那么', '不要', '如果', '来', '它', '对', '当', '比', '不能', '却',
    '一种', '而', '不过', '只有', '不得不', '再', '不得不', '比', '一部', '啦', '他', '像', '会',
    '得', '里'
]

all_quotes = ''.join(quote_list)  # join all representative comments into one text
# Strip punctuation
all_quotes = re.sub(
    r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ",
    all_quotes)
words = jieba.lcut(all_quotes)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2020/9/1 18:22
# @Author : way
# @Site :
# @Describe: jieba word segmentation

# import jieba_fast as jieba
import jieba

for i in ('悟空', '沙僧', '沙和尚', '猪八戒', '唐僧', '唐三藏'):
    jieba.add_word(i)

path = r"C:\Users\Administrator\Desktop\西游记.txt"
with open(path, 'r', encoding='utf-8') as f:
    result = jieba.lcut(f.read())

new_result = [(i, result.count(i)) for i in set(result)
              if i in ('悟空', '沙僧', '沙和尚', '猪八戒', '唐僧', '唐三藏')]
new_result.sort(key=lambda x: x[1], reverse=True)
for i in new_result:
    print(i)
# -*- coding: utf-8 -*-
# @Time : 2020/7/5 12:57
# @Author : MYH
# @File : 6.5 jieba库的使用.py
# @Software: PyCharm

# English text can be split into words with the split method, but Chinese text
# has no separators, hence the "word segmentation" problem. The jieba library
# is an important third-party Chinese segmentation library for Python.
# jieba is not part of the standard library, so it must be installed with
# "pip install jieba"; if that is too slow, use a domestic mirror.

# Import jieba
import jieba

# jieba.cut("你你你好")        # precise mode, returns an iterable
# jieba.cut(s, cut_all=True)   # full mode, outputs every possible word in the text
# jieba.cut_for_search(s)      # search engine mode, suited to building a search index
# jieba.lcut(s)                # recommended: precise mode, returns a list
# jieba.lcut(s, cut_all=True)  # recommended: full mode, returns a list
# jieba.lcut_for_search(s)     # recommended: search engine mode, returns a list
# jieba.add_word(w)            # add a new word w to the dictionary

print(jieba.lcut("中国是我的祖国,中华人民共和国是一个伟大的国家"))  # precise mode: complete, non-redundant segmentation
print(jieba.lcut("中国是我的祖国,中华人民共和国是一个伟大的国家", cut_all=True))  # full mode: every possible word, maximum redundancy
print(jieba.lcut_for_search("中国是我的祖国,中华人民共和国是一个伟大的国家"))  # search engine mode: precise mode first, then long words are split again
print(jieba.lcut("习大大在西湖游玩"))
jieba.add_word('习大大')  # add the word
print(jieba.lcut("习大大在西湖游玩"))
              u'郑州', u'武汉', u'长沙', u'厦门', u'苏州', u'南京', u'合肥', u'哈尔滨', u'青岛',
              u'湖州', u'无锡', u'福州', u'阜阳', u'贵阳', u'沈阳', u'浦东新区', u'通化']

stop_words_file_list = [
    "hit_stopword.txt", "baidu_stopword.txt", "sichuan_stopword.txt",
    "chinese_stopword.txt"
]
for file_path in stop_words_file_list:
    with open(os.path.join(os.path.dirname(__file__), file_path)) as f:
        lines = f.readlines()
        for line in lines:
            stop_words.append(line.strip().decode('utf-8'))

# location_dic.txt is a dictionary file, so it should be loaded with
# load_userdict; add_word would only add the literal file name as a token.
jieba.load_userdict("location_dic.txt")

with open(os.path.join(os.path.dirname(__file__), "fensi.txt")) as f:
    lines = f.readlines()
    for line in lines:
        clean_line = line.strip()
        if len(clean_line) > 0:
            seg_list = jieba.cut(clean_line)
            for seg in seg_list:
                if seg not in stop_words:
                    tokens.append(seg)

counter = Counter(tokens)
result = ""
totalnum = 0
for a in counter.most_common(57):
def load_word_dict(self):
    if self.external_word_dict:
        for word in self.external_word_dict:
            jieba.add_word(word, freq=1000000)
def add_word_dict(word, freq=None, tag=None):
    '''Add a new word to the dictionary.'''
    # Forward the caller's freq and tag instead of hard-coding None.
    jieba.add_word(word, freq=freq, tag=tag)
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # gb2312
import random
import numpy as np
from tflearn.data_utils import pad_sequences
# from pypinyin import pinyin, lazy_pinyin
from collections import Counter
import os
import pickle
import csv
import jieba
from data_mining.data_util_tfidf import cos_distance_bag_tfidf, get_tfidf_score_and_save

jieba.add_word('花呗')
jieba.add_word('借呗')

PAD_ID = 0
UNK_ID = 1
_PAD = "_PAD"
_UNK = "UNK"
TRUE_LABEL = '1'
splitter = "&|&"
special_start_token = [u'怎么', u'如何', u'为什么', u'为何']


def load_data(traning_data_path, vocab_word2index, vocab_label2index, sentence_len, name_scope,
        yield questions, answers


def convert_to_str(string):
    segvec = segmentor.jieba_segment(string)
    cleanedvec = string_process.remove_invalid_string(segvec)
    sortedvec = sorted(cleanedvec)
    return (u'|'.join(sortedvec))


if __name__ == '__main__':
    logging.basicConfig(format='[%(asctime)-15s] : %(levelname)s : %(message)s',
                        level=logging.INFO)
    jieba.add_word('做什么')
    jieba.add_word('吃什么')
    jieba.add_word('叫什么')
    jieba.add_word('唱什么')
    jieba.add_word('想什么')
    jieba.add_word('是什么')
    jieba.add_word('有什么')
    jieba.add_word('用什么')
    jieba.add_word('看什么')
    jieba.add_word('玩什么')
    jieba.add_word('爱什么')
    jieba.add_word('干什么')
    jieba.add_word('整什么')
    jieba.add_word('为什么')
    jieba.add_word('什么事')
def __init__(self, strict_mode=True):
    data = file2json(file2abspath('region_data.json', __file__))
    counter = collections.Counter()
    self.strict_mode = strict_mode
    self.data = {
        'items': {},  # raw data, keyed by cityid (may refer to province, city or district level)
        # alias-based indexes used by NER
        'province': {},  # no duplicated names
        'city': {},      # has duplicated names
        'district': {},  # has duplicated names
        'alias': {},     # alias index
        'lookup': collections.defaultdict(set),
    }

    # copy data
    for item in data:
        self.data['items'][item['cityid']] = item

    # process province
    map_province = collections.defaultdict(set)
    for item in data:
        p = item.get('province')
        c = item.get('city')
        d = item.get('district')
        if p and not c:
            # cityid is the ID of the province
            item["type"] = "province"
            item["name"] = p
            map_province[p].add(item['cityid'])
    assert 34 == len(map_province), len(map_province)
    # logging.info(json.dumps(list(map_province.keys()), ensure_ascii=False))
    for p in sorted(list(map_province)):
        alias_list = normalize_province(p)
        pnorm = alias_list[1]
        self.data['province'][p] = {
            'province': self._get_list_province_unique(map_province[p]),
            'cityid_list': list(map_province[p]),
            'alias': [p] + alias_list}
        map_province[p] = pnorm
        if pnorm.startswith(u'安徽'):
            logging.info(json.dumps(alias_list))
        # print json.dumps(list(set([p,pnorm,pnorm2])),ensure_ascii=False)

    # process city
    map_city = collections.defaultdict(set)
    for item in data:
        c = item.get('city')
        d = item.get('district')
        if c in [u"市辖区", u"县", u"省直辖县级行政区划", u"自治区直辖县级行政区划"]:
            continue
        """
        {
            "city": "市辖区",
            "cityid": "310105",
            "district": "长宁区",
            "province": "上海市"
        }
        {
            "city": "南通市",
            "cityid": "320601",
            "district": "市辖区",
            "province": "江苏省"
        },
        """
        if c and not d:
            item["type"] = "city"
            item["name"] = c
            map_city[c].add(item['cityid'])
            if len(map_city[c]) != 1:
                logging.error(json.dumps(item, ensure_ascii=False))
                logging.error(len(map_city[c]))
            assert len(map_city[c]) == 1
    assert 333 == len(map_city), len(map_city)
    # logging.info(json.dumps(list(map_city.keys()), ensure_ascii=False))
    for p in sorted(list(map_city)):
        alias_list = normalize_city(p)
        assert pnorm
        # print p, '-->', pnorm, '-->', pcompact
        self.data['city'][p] = {
            'province': self._get_list_province_unique(map_city[p]),
            'cityid_list': list(map_city[p]),
            'alias': [p] + alias_list}
    assert len(map_city) == len(self.data['city']), len(self.data['city'])

    # process district
    map_district = collections.defaultdict(set)
    for item in data:
        d = item.get('district')
        if d in [u"市辖区"]:
            # see above: 市辖区 is used both as a city value and a district value;
            # simply drop it here since it is already defined at the city level
            continue
        if d:
            item["type"] = "district"
            item["name"] = d
            map_district[d].add(item['cityid'])
    assert 2821 == len(map_district), len(map_district)
    for p in sorted(list(map_district)):
        alias_list = normalize_district(p)
        # print p, '-->', pnorm, '-->', pcompact
        cityid_list = list(map_district[p])
        if len(cityid_list) > 1:
            # logging.info( len(cityid_list) )
            # logging.info( p )
            pass
        self.data['district'][p] = {
            'province': self._get_list_province_unique(map_district[p]),
            'cityid_list': cityid_list,
            'alias': [p] + alias_list}

    # process duplicated names (alias index)
    for index in ['province', 'city', 'district']:
        for name, data in self.data[index].items():
            for alias in set(data['alias']):
                # if alias.startswith(u"清"):
                #     logging.info(alias)
                self.data['lookup'][alias].update(data['cityid_list'])
    for alias, alias_cityid_list in self.data['lookup'].items():
        alias_cityid_list_unique = set(alias_cityid_list)
        if len(alias_cityid_list_unique) > 1:
            # logging.debug(u"{} {}".format(alias, len(alias_cityid_list_unique)))
            # print alias
            for code in alias_cityid_list_unique:
                # print json.dumps(self.data['items'][code], ensure_ascii=False)
                pass

    # place names with a unique province; ambiguous place names are ignored
    for alias, alias_cityid_list in self.data['lookup'].items():
        alias_cityid_list_unique = set(alias_cityid_list)
        province = self._get_list_province_unique(alias_cityid_list_unique)
        if province:
            self.data['alias'][alias] = province

    # with codecs.open(getTheFile('libcity_cn.new.json'),'w',encoding='utf-8') as f:
    #     json.dump(self.data, f, ensure_ascii=False, indent=4)

    # statistics
    for index in self.data:
        counter[index] = len(self.data[index])

    # validation
    for alias, entities in self.data['lookup'].items():
        if len(alias) == 1:
            logging.error(json.dumps(
                entities, ensure_ascii=False, indent=4, sort_keys=True))
            if self.strict_mode:
                exit()
        if alias in [u'自治']:
            logging.error(json.dumps(
                entities, ensure_ascii=False, indent=4, sort_keys=True))
            if self.strict_mode:
                exit()
        if len(entities) > 1:
            counter["one-alias-many-entities"] += 1
            # logging.info(u"{}[{}] {}".format(alias, len(entities), u",".join([x["name"]+x["type"] for x in entities])))

    # prepare for NER
    for index in ['province', 'city', 'district']:
        for name, data in self.data[index].items():
            for alias in set(data['alias']):
                if re.search(ur"[省市县]$", alias):
                    jieba.add_word(alias, 10000000)
                elif re.search(ur"[区]$", alias):
                    jieba.add_word(alias, 1000000)
                else:
                    jieba.add_word(alias, 100000)
                for suffix in u"路镇乡圩河区村":
                    jieba.add_word(u"{}{}".format(alias, suffix), 1000000)
# -*- coding: utf-8 -*-
import jieba

jieba.add_word("台中")
cutted_words = jieba.cut(input_str)
words = [word for word in cutted_words]
print(words)

# Recognise new words
# Enable HMM to recognise new words (HMM is enabled by default, so setting it
# to True explicitly is not required)
input_string = "他来到了网易杭研大厦"
cutted_words = jieba.cut(input_string, HMM=True)
words = [word for word in cutted_words]
print(words)

# Add custom words on top of the dictionary already in use
jieba.load_userdict("jieba/test/userdict.txt")

# Add a word to the dictionary at runtime
jieba.add_word("國立臺灣大學")

# Adjust the word frequency at runtime
jieba.suggest_freq("國立臺灣大學", True)

# Part-of-speech (PoS) tagging
import jieba.posseg as pseg

input_str = "小明碩士畢業於國立臺灣大學,現在在日本東京大學進修深造"
cutted_words = pseg.cut(input_str)
words = [(word, flag) for (word, flag) in cutted_words]
print(words)

# Get token positions
input_str = u'小明碩士畢業於國立臺灣大學,現在在日本東京大學進修深造'  # convert the string to unicode here
import jieba

a = "我想听祖国祖国我们爱你"
b = "我想听祖国我爱你"

jieba.add_word("我想听")
jieba.add_word("请播放")
jieba.add_word("我要听")

# res = list(jieba.cut(a))
# res = list(jieba.cut(a))
res_search = list(jieba.cut_for_search(a))
res = list(jieba.cut_for_search(b))
print(res)
print(res_search)
# 0:我想听 1:祖国 2:我爱你 3:我们 4:爱 5:你
# 01(2,45)
# 01(345,45)
def add_words():
    import jieba
    jieba.add_word('大数据')
    jieba.add_word('深度学习')
    jieba.add_word('机器学习')
    jieba.add_word('数据分析')
print("精准模式: " + "/ ".join(seg_list)) seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #默认是全模式 print("搜索引擎模式: " + "/ ".join(seg_list)) seg_list = jieba.lcut("我来到北京清华大学", cut_all=False) print(seg_list) # 精准模式 import os root_path = '' jieba.set_dictionary(os.path.join(root_path, 'corpus/dict.txt.big')) seg_list = jieba.lcut("今年他不幸得了新冠肺炎住院") print(seg_list) jieba.load_userdict(os.path.join(root_path, 'corpus/medical_term.txt')) seg_list = jieba.lcut("今年他不幸得了COVID-19住院") print(seg_list) jieba.add_word('新冠肺炎') seg_list = jieba.lcut("今年他不幸得了新冠肺炎住院") print(seg_list) seg_list = jieba.lcut("王小二是一个农民") print(seg_list) seg_list = jieba.lcut("发展社会主义的新乡村") print(seg_list) jieba.add_word('新乡村') seg_list = jieba.lcut("发展社会主义的新乡村") print(seg_list) seg_list = jieba.lcut("王军虎去广州了") print(seg_list) seg_list = jieba.cut("王军虎去广州了。", HMM=False)
def add_to_dict(characters_names):
    for name in characters_names:
        jieba.add_word(name)
            idf_score += self.tfidf.idf_freq.get(match_keyword, self.tfidf.median_idf)
            score = (len(match_keywords), idf_score)
            if score > max_score and len(match_keywords) == len(and_keywords):
                max_score = score
        return max_score


if __name__ == '__main__':
    clf = KeywordClassifier()
    attrs = {
        "pos": {
            "target": "了解情况:1",
            "keywords": ["嗯", "可以", "好的"]
        },
        "neg": {
            "target": "挽回:1",
            "keywords": ["不需要", "不用了", "没有", "物业费,价格"],
            "patterns": ["不.*要"]
        },
        "spe": {
            "target": "挽回:2",
            "keywords": ["你是谁", "干什么", "楼盘,价格"]
        }
    }
    jieba.add_word("你是谁")
    print(clf.predict(attrs, '干什么'))
    print(clf.predict(attrs, '物业费价格'))
def __init__():
    global FULL_DICT
    temp_full_dict = {
        # u'test': [],
        u'question': [],
        u'assessment-factor': [],
        # u'application': [],
        u'school': [],
        u'faculty': [],
        u'program': [],
        u'country': [],
        u'location': [],
        u'rank-source': [],
        u'keyword': [],
        u'faculty-type': [],
        u'location-field': [],
        u'school-field': [],
        u'faculty-field': [],
        u'program-field': [],
    }
    global STOP_WORDS

    # dict init
    for theme in temp_full_dict:
        print(theme)
        dict_reader = csv.reader(open('resource/sym_dict/' + theme + '.csv', 'rb'))
        rule_list = []
        for origin_rule in dict_reader:
            if len(origin_rule) < 2:
                continue
            rule_dict = {}
            try:
                rule_dict['value'] = int(origin_rule[0])
            except ValueError:
                rule_dict['value'] = smart_unicode(origin_rule[0])
            for rule_type in ['keyword', 'regex', 'entity']:
                rule_dict[rule_type] = list(map(
                    lambda x: smart_unicode(x)[len(rule_type) + 1:],
                    filter(
                        lambda x: x.find(rule_type) is 0,
                        origin_rule[1:]
                    )
                ))
            for rule_type in ['keyword']:
                rule_dict[rule_type] = list(map(
                    lambda x: x.lower(),
                    rule_dict[rule_type]
                ))
            for rule_type in ['keyword', 'entity']:
                rule_dict[rule_type] = set(rule_dict[rule_type])
            rule_list.append(rule_dict)
        temp_full_dict[theme] = rule_list

    # cut dict
    for theme in FULL_DICT:
        for rule_dict in FULL_DICT[theme]:
            for keyword in rule_dict['keyword']:
                jieba.add_word(keyword)
    jieba.load_userdict('resource/jieba_dict/other_dict.txt')

    temp_stop_words = set(
        map(
            lambda x: x.strip().decode('utf-8'),
            open('resource/jieba_dict/stop_word_dict.txt').readlines()
        ) + [u' ']
    )
    print('Dicts are successfully loaded!')
    FULL_DICT = temp_full_dict
    STOP_WORDS = temp_stop_words
        del tf_counter[term]

adjusted_size = len(tf_counter)
df_counter = tf_counter & df_counter
print('Origin Size', origin_size)
print('Adjusted Size', adjusted_size)
print('----------------------------------')
print('\n')

for term in tf_counter:
    jieba.add_word(term, tf_counter[term] * 100000)

the_term_sets = []
for i in range(ws_max_row):
    the_row_idx = i + 1
    if the_row_idx == 1:
        continue
    the_title = ws['D' + str(the_row_idx)].value
    the_content = ws['E' + str(the_row_idx)].value
"历史回顾", "方针思想", "人才培育", "法制建设", "环境保护", "产业结构", "海外投资", "企业改革", "土地改革", "服务业", "私营经济", "科技研究" ] for word in label: jieba.add_word(word, freq=10) train_data = [] train_target= [] with io.open('newkeywords.txt', 'r', encoding='utf8') as f: # sourceInLines = f.readlines() #按行读出文件内容 line = "" for lines in f: if lines.strip().split(" ")[0] in label: if line == "": continue lables_in_sentence = lines.encode('utf-8').strip().split(" ")
from collections import Counter
import jieba
import read_data
import json

counter = Counter()
data = read_data.collection.find()
all_list = []

jieba.add_word("大数据")
jieba.add_word("大数据挖掘")

stop_words = [
    '(', ')', ':', '-', '(', ')', '/', '-', '+', ' ', '丨', 'k', '6300'
]

for item in data:
    list = jieba.lcut(item['pname'])
    # print(item['pname'])
    for i in list:
        if i not in stop_words:
            counter[i] += 1
            # all_list.append(i)

counter = counter.most_common(250)
# print(c + " : " + str(counter[c]))
counter = dict(counter)
print(counter)
# print(str(counter)c)
# print(str(json.dumps(counter, ensure_ascii=False)))
# common functions in jieba
import jieba

print(jieba.lcut("中国是一个伟大的国家"))                # precision mode
print(jieba.lcut("中国是一个伟大的国家", cut_all=True))  # full mode
print(jieba.lcut_for_search("中国人民共和国是伟大的"))   # search mode

jieba.add_word("蟒蛇语言")  # add a word to the dict
# coding:gbk
'''
Author: 王冠超
Goal: extract the character appearance frequencies and character
relationships from the novel《黎明破晓的街道》.
'''
import re
import jieba
from collections import Counter

# Main characters of the novel
characters = ['渡部', '秋叶', '有美子', '仲西达彦', '新谷', '妙子', '园美', '芦原', '钉宫真纪子', '本条']
for character in characters:
    jieba.add_word(character)  # add to the jieba dictionary

# Containers for the names found per line and the character relationships
line_names = list()
relationships = {i: dict() for i in characters}
character_attention = list()

# Preprocess the text and count character frequencies as node weights
with open('黎明破晓的街道.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        # Find the character names appearing in each line
        tmp_line_names = [word for word in jieba.cut(line) if word in characters]
        character_attention += tmp_line_names
        if len(tmp_line_names) > 1:
            line_names.append(tmp_line_names)

# Character appearance frequencies
character_counter = Counter(character_attention)

# Character relationships (co-occurrence counts per line)
for line in line_names:
    for character1 in line:
        for character2 in line:
            if character1 == character2:
# coding=utf-8
'''
Created on 2015-05-11

@author: BFD474
'''
from __future__ import print_function, unicode_literals
import sys

import jieba
import jieba.posseg as pseg

sys.path.append("../")

jieba.load_userdict("userdict.txt")

jieba.add_word('石墨烯')
jieba.add_word('凱特琳')
jieba.del_word('自定义词')

test_sent = (
    "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
    "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
    "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
)

words = jieba.cut(test_sent)
print('/'.join(words))
print("=" * 40)

result = pseg.cut(test_sent)
def add_words(words):
    if words:
        for word in words:
            jieba.add_word(word)
    else:
        jieba.add_word('')
MONGO_CONCEPT_COLLECTION = MONGO_CONCEPT_DB['conceptlist']

XML_SOURCE_FILE = '/home/daoming/data/*.xml'
XML_DEST_DIR = '/home/daoming/xml/'


def jiebaToList(data):
    wordList = []
    for tk in data:
        wordList.append({'word': tk[0], 'pos': tk[1]})
    return wordList


jieba.initialize()
jieba.set_dictionary('dict.txt.big')

reader = MONGO_WORD_COLLECTION.find({'user': 1, 'cut': 1})
for word in reader:
    jieba.add_word(word['name'], word['weight'], word['tag'])

dictConceptList = {}
reader = MONGO_CONCEPT_COLLECTION.find()
for word in reader:
    jieba.add_word(word['conceptname'], 999, 'NCI')
    dictConceptList[word['conceptname']] = word['conceptid']

filenameList = glob.glob(XML_SOURCE_FILE)
fileList = []
i = 0
for filename in filenameList:
    basename = os.path.basename(filename)
    reader = open(filename)
    xmldict = xmltodict.parse(reader.read())
    reader.close()
# -*- coding: utf-8 -*-
import jieba
from jieba import del_word, add_word

# Segmentation modes:
# - Precise mode: cuts the sentence as precisely as possible, suitable for text analysis.
# - Full mode: scans out every possible word in the sentence; very fast, but cannot resolve ambiguity.
# - Search engine mode: on top of precise mode, long words are cut again to improve recall; suitable for search engines.
# - Paddle mode: uses the PaddlePaddle deep-learning framework and a trained
#   sequence-labelling (bidirectional GRU) model; also supports POS tagging.
seg_list = jieba.cut('我来到北京清华大学', cut_all=True)
print('全模式分词结果: ' + '/ '.join(seg_list))

seg_list = jieba.cut('我来到北京清华大学', cut_all=False)
print('精确模式分词结果: ' + '/ '.join(seg_list))

seg_list = jieba.cut('他来到了网易杭研大厦')
print(', '.join(seg_list))

seg_list = jieba.cut_for_search('小明硕士毕业于中国科学院计算所,后在日本京都大学深造')
print('搜索引擎模式分词结果: ' + ', '.join(seg_list))

# Load a user dictionary. File format: one word per line, each line holding up
# to three space-separated fields in this order: word, frequency (optional),
# POS tag (optional). If the frequency is omitted, an automatically computed
# value that guarantees the word can be segmented is used.
jieba.load_userdict('vocabularity.txt')

# add_word(word, freq=None, tag=None) and del_word(word) modify the dictionary at runtime.
# del_word('杭研大厦')
add_word('杭研大厦')
# In domains such as healthcare or travel, some terms are not segmented as one
# word by default; raising their weight makes them come out as single tokens.
seg_list = jieba.cut('他来到了网易杭研大厦', cut_all=False)
print('载入词典分词结果: ' + '/ '.join(seg_list))
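# A minimal, self-contained sketch (not from the original project) showing the
# user-dictionary format described above: one word per line, optionally
# followed by a frequency and a POS tag. The file name and entries are
# illustrative assumptions, not the contents of vocabularity.txt.
import jieba

sample_dict = """云计算 5 n
杭研大厦 10 n
八一双鹿 3 nz
"""
with open('sample_userdict.txt', 'w', encoding='utf-8') as f:
    f.write(sample_dict)

jieba.load_userdict('sample_userdict.txt')
print(jieba.lcut('他来到了网易杭研大厦'))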