def __init__(self, dict_paths):
    # TODO: load the external dictionaries
    for p in dict_paths:
        jieba.load_userdict(p)
    # TODO: for words jieba cannot segment correctly, adjust their frequency by hand.
    jieba.suggest_freq(('特征', '症状', '症候'), True)
def init_jieba(self, seg_dic, userdic):
    jieba.load_userdict(userdic)
    jieba.set_dictionary(seg_dic)
    with open(userdic, 'r', encoding='utf-8') as input:
        for word in input:
            word = word.strip('\n')
            jieba.suggest_freq(word, True)
def data_process(data_paths='./data/cnews/news_{}.txt'):
    stop_words = get_stopword()
    text = ''
    for i in (1, 2, 3):
        with open(data_paths.format(i), 'r', encoding='utf8') as f:
            part = f.read()
        part = re.sub(r'\n', '', part)
        part = re.sub(r'\d+', '', part)
        text += part
    # Keep the name 易会满 as a single token (tune=True actually updates the dictionary).
    jieba.suggest_freq('易会满', True)
    processed_text = ' '.join([word for word in jieba.cut(text) if word not in stop_words])
    with open('./data/processed_txt', 'w', encoding='utf8') as f:
        f.write(processed_text)
    processed_text = word2vec.LineSentence('./data/processed_txt')
    return processed_text
def gen_wordcloud(text, filename):
    # 1) Boost words that the segmenter gets wrong.
    jieba.suggest_freq('微博', True)
    jieba.suggest_freq('热搜', True)
    # 2) Segment the Chinese text with jieba (lcut returns a list).
    result = jieba.lcut(text)
    print(result)

    # 3) Open the mask image and get its pixel data.
    imgObj = Image.open('./doc/wordcloud.jpg')
    img_mask = np.array(imgObj)
    # print(img_mask)

    # 4) Create the word-cloud object and set its attributes.
    wcObj = wordcloud.WordCloud(
        mask=img_mask,                  # how the words fill the mask image
        background_color="snow",        # background color
        font_path="/usr/share/fonts/wqy-zenhei/wqy-zenhei.ttc",  # Chinese text needs a CJK font (fc-list :lang=zh)
        min_font_size=5,                # smallest font size in the image
        max_font_size=50,               # largest font size in the image
        width=1000,                     # image width
        height=1000,                    # image height
    )

    # 5) Generate the image. WordCloud.generate expects a single string, so join
    #    the segmented words with commas.
    wcObj.generate(",".join(result))
    wcObj.to_file(filename)
def outofdate_func():
    """
    Set suggested words in jieba and write the word-segmented sentences to data/text_segment.txt.
    :return:
    """
    list_suggest_word = []
    with open('data/suggest_frep.txt') as fr:
        for line in fr:
            list_suggest_word.extend(line.strip().split())
    for suggest_word in list_suggest_word:
        jieba.suggest_freq(suggest_word, True)

    with open('data/text.txt', 'r') as f, open('data/text_segment.txt', 'w') as fw:
        for line in f:
            seg_list = jieba.cut(line, cut_all=False)
            new_line = " ".join(seg_list)
            # print(new_line)
            fw.write(new_line)
    print('the cut processor done!')
def word_seg(src_path='MRP_Analysis_2.txt', self_dict_path='self_dict.txt'):
    # Open the requirements document read-only.
    txt = ''
    with open(src_path, 'r', encoding='utf-8') as f:
        txt = f.read()
    logging.debug('---------File read done---------!')
    logging.debug(txt)

    # Segment with jieba.
    wordstr = ''
    try:
        try:
            jieba.load_userdict(self_dict_path)
        except FileNotFoundError:
            pass
        jieba.add_word('拜欧海多汀', freq=20000, tag=None)
        jieba.suggest_freq(('护', '将'), tune=True)
        # cut() returns a generator, lcut() returns a list. Use precise mode with
        # the HMM disabled and get a list back.
        words = jieba.lcut(txt, cut_all=False, HMM=False)
        logging.debug(words)
        wordstr = ''.join(words)
        logging.debug(wordstr)
    except Exception as e:
        raise e
    return wordstr
def set_dict():
    """
    Custom dictionary: one entry per line.
    Each line: word, frequency (optional), POS tag (optional), space-separated.
    **The field order must not change, and the file must be UTF-8 encoded.**
    ```
    华能 3 nz
    云泥 ns
    河势 n
    庙沟 ns
    ```
    Use add_word(word, freq=None, tag=None) and del_word(word) to **modify the
    dictionary dynamically** at runtime.
    Use suggest_freq(segment, tune=True) to adjust the frequency of a single word
    so that it can (or cannot) be segmented out.
    Note: the automatically computed frequency may be ineffective when the HMM
    new-word discovery feature is in use.
    :return:
    """
    jieba.load_userdict('../Data/dict/30wChinsesSeqDic_clean.txt')

    jieba.add_word('加入自定义词')
    test_sent1 = jieba.cut('在南京市长江大桥研究生命的起源和加入自定义词')
    print('test_sent1', "/".join(test_sent1))

    jieba.del_word('加入自定义词')
    test_sent2 = jieba.cut('在南京市长江大桥研究生命的起源和加入自定义词')
    print('test_sent2', "/".join(test_sent2))

    jieba.suggest_freq('研究生命', True)
    test_sent3 = jieba.cut('在南京市长江大桥研究生命的起源和加入自定义词')
    print('test_sent3', "/".join(test_sent3))
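# A minimal sketch (separate from set_dict above) of the user-dictionary file format
# its docstring describes; the path 'demo_userdict.txt' and its entries are
# illustrative assumptions, not part of the original code.
import jieba

with open('demo_userdict.txt', 'w', encoding='utf-8') as f:
    # One entry per line: "word [freq] [POS]", space-separated, UTF-8 encoded.
    f.write('华能 3 nz\n')
    f.write('云泥 ns\n')
    f.write('河势 n\n')
    f.write('庙沟 ns\n')

jieba.load_userdict('demo_userdict.txt')
print('/'.join(jieba.cut('庙沟附近的河势平稳')))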
def segment_words(stars):
    comments = None
    if stars == 'all':
        comments = data['Comments']
    else:
        comments = data[data['Star'] == stars]['Comments']
    comments_list = []
    for comment in comments:
        comment = str(comment).strip().replace('span', '').replace(
            'class', '').replace('emoji', '')
        comment = re.compile(r'1f\d+\w*|[<>/=]').sub('', comment)
        if len(comment) > 0:
            comments_list.append(comment)
    text = ''.join(comments_list)

    # word_list = list(jieba.cut(text))
    jieba.suggest_freq("无问西东", True)
    word_list = jieba.analyse.extract_tags(text, topK=50, withWeight=False, allowPOS=())
    print(word_list)

    c = Counter(word_list)
    print(c)
    common_c = c.most_common(50)
    print(common_c)
    # words = ''.join(word_list)
    return common_c
def _jiebaPOSRule():
    needRetain = [
        '去大脑',
        '广谱',  # as in 广谱抗生素 (broad-spectrum antibiotic)
        '阳转',
    ]
    for retain_word in needRetain:
        jieba.suggest_freq(retain_word, tune=True)

    needExtract = [
        '体格检查',
        '光反应',
        '对光',
        '创伤性',
        '细菌性',
        '行为矫正',
        '粟粒状',
        # '安全性',  # the rule says not to split XX性, but the given example requires the split
        '应予以',
        '常继发',
        # '迟发性',
        # '灵敏性',
        '若有阳',
        '完全恢复',
    ]
    for del_word in needExtract:
        jieba.del_word(del_word)
def split_word(my_str):
    jieba.suggest_freq('中美', True)
    jieba.add_word('中美')
    word_list = jieba.cut(my_str, cut_all=False)
    my_list = " ".join(word_list).split(' ')
    # pprint(my_list)
    return my_list
def set_jieba(self):
    """
    Suggest frequencies to jieba.
    """
    for line in open('adjust_words').readlines():
        word = line.strip()
        jieba.suggest_freq(word, True)
def writer_newword(self):
    # Note: the .encode(...)/.replace(...) calls below assume Python 2 byte-string semantics.
    dir_path = os.path.dirname(os.path.abspath(__file__))
    existWord = self.remove_dup('./jieba/industrydict/')
    infile = './jieba/industrydict/newword.dict'
    if os.path.isfile(infile):
        writer = open(infile, 'a+')
    else:
        writer = open(infile, 'wb')
    lword = self.read_newword()
    num = 0
    if lword:
        for sub in lword:
            if sub['word']:
                word = sub['word'].encode('utf-8', 'ignore').replace(' ', '')
                sex = sub['word_sex'].encode('utf-8', 'ignore').strip()
                if word not in existWord and sex:
                    freq = suggest_freq(word, True) if suggest_freq(word, True) else 1
                    num += 1
                    writer.write('%s %s %s\n' % (word, freq, sex))
    logger.info('%s words have been written to %s' % (num, infile))
    writer.close()
    logger.info('write newword task is finished')
def add_dict():
    # Load a custom user dictionary, built by hand after inspecting segmentation results.
    jieba.load_userdict("userdict.txt")
    dict1 = open("userdict.txt", "r", encoding='utf8')
    # Tune the frequency of each custom word so it is high enough to be segmented out.
    # For example, for 双肾区: if in jieba's base dictionary 双肾 has frequency 400, 区 has 500,
    # and 双肾区 only 100, the word would still be split into 双肾/区 even after being added.
    [jieba.suggest_freq(line.strip(), tune=True) for line in dict1]

    # Load a named-entity recognition dictionary.
    dic2 = csv.reader(open("DICT_NOW.csv", "r", encoding='utf8'))
    for row in dic2:
        if len(row) == 2:
            jieba.add_word(row[0].strip(), tag=row[1].strip())
            jieba.suggest_freq(row[0].strip(), tune=True)

    # Words matched by regular expressions are also added to the dictionary.
    fout_regex = open('regex_dict.txt', 'w', encoding='utf8')
    for file in os.listdir(path=c_root):
        if "txtoriginal.txt" in file:
            fp = open(c_root + file, "r", encoding="utf8")
            for line in fp.readlines():
                if line.strip():
                    # Regular-expression matching.
                    p1 = re.compile(r'\d+[次度]').findall(line)
                    p2 = re.compile(r'([a-zA-Z0-9+]+[\.^]*[A-Za-z0-9%(℃)]+(?![次度]))').findall(line)
                    p_merge = p1 + p2
                    for word in p_merge:
                        jieba.add_word(word.strip())
                        jieba.suggest_freq(word.strip(), tune=True)
                        fout_regex.write(word + '\n')
            fp.close()
    fout_regex.close()
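# A minimal sketch (separate from add_dict above) of the tuning effect described in its
# comments; it assumes '双肾区' is not already a high-frequency entry in the active dictionary.
import jieba

sentence = '双肾区未见明显异常'
print('/'.join(jieba.cut(sentence, HMM=False)))  # likely split, e.g. 双肾/区/...
jieba.suggest_freq('双肾区', tune=True)           # raise its frequency above the product of its parts
print('/'.join(jieba.cut(sentence, HMM=False)))  # now kept as 双肾区/...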
def Initialization():
    jieba.suggest_freq('采购单', True)
    jieba.suggest_freq('采购提交', True)
    jieba.suggest_freq('玻尿酸', True)
    jieba.suggest_freq("新增采购", True)
    jieba.suggest_freq("水泥", True)
    jieba.del_word('采购提交水泥')
def _build_vocabulary(dictionary_path='../data/vocabulary.dict', ngram=[2, 3], filter=True):
    '''
    The vocabulary is an important factor here: building the term matrix without
    filtering runs out of memory (OOM).
    '''
    id2laws = pd.read_pickle('../cache/law_vocab.pkl')
    for id, laws in id2laws.items():
        for law in laws:
            jieba.add_word(law)
            jieba.suggest_freq(law)
    with codecs.open('../data/form-laws.txt', encoding='utf-8') as f:
        ls = f.readlines()
        for i, line in enumerate(ls):
            for law in re.findall('【(.*?)】', line):
                for word in law.split(';'):
                    jieba.add_word(word)
                    jieba.suggest_freq(word)

    if os.path.exists(dictionary_path):
        dictionary = corpora.Dictionary().load(dictionary_path)
    else:
        doc_list = pd.read_pickle('../cache/doc_list.pkl')
        doc_list_te = pd.read_pickle('../cache/doc_list_te.pkl')
        doc_list.extend(doc_list_te)
        with codecs.open('../data/form-laws.txt', encoding='utf-8') as f:
            ls = f.readlines()
        doc_list.extend(ls)

        # cor = [token_extract(remove_previous_convictions(line)) for line in doc_list]
        cor = [token_extract(line, ngram=[]) for line in doc_list]
        dictionary = corpora.Dictionary(cor)
        if ngram:
            cor = [token_extract(line, ngram=[2, 3]) for line in doc_list]
            dictionary2 = corpora.Dictionary(cor)
            once_ids = [tokenid for tokenid, docfreq in dictionary2.dfs.items() if docfreq < 100]
            dictionary2.filter_tokens(once_ids)
            dictionary2.compactify()
            print('len dictionary = %s' % len(dictionary))  # len dictionary = 125156
            dict2_to_dict1 = dictionary.merge_with(dictionary2)
        if filter:
            once_ids = set([tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 3])
            dictionary.filter_tokens(once_ids.union(stop_words))
            dictionary.compactify()
            print('len dictionary = %s' % len(dictionary))  # len dictionary = 125156
        dictionary.save(dictionary_path)
        del doc_list, doc_list_te
        gc.collect()
    return dictionary
def transform_json(self, json_content):
    """
    Store the JSON content as a dict.
    """
    result_dict = {}
    # topic_keywords_list, e.g.: "悠闲": ["惬意", "休闲", "发呆", "放松", "宁静"]
    topic_keyword_dict = self.get_topic_keyword(
        pkg_resources.resource_filename(
            'HotelSentimentTagAnalysis.resource', 'topic_keyword.txt'))
    for json_line in json_content:
        for topic_word, keyword_critical_reverse in json_line.items():
            topic_word = topic_word.encode('utf8')
            # keywords associated with this topic
            keywords = set(topic_keyword_dict.get(topic_word, []))
            # e.g. 好 / 差 / 漂亮
            critical_word_set = set(
                keyword_critical_reverse.get('critical_words', []))
            defaultValue = int(keyword_critical_reverse.get('default', 0))
            for each_keyword in keywords:
                jieba.suggest_freq(each_keyword, True)
                result_dict[each_keyword.decode('utf8')] = {
                    'topic_word': topic_word,
                    'default_value': defaultValue,
                    'critical_word_set': critical_word_set
                }
    return result_dict
def load_suggest_words(self, suggestwords_file=None):
    """
    Argument:
        suggestwords_file : suggest-words file name
    Load suggested words.
    """
    if type(suggestwords_file) is str:
        filename = suggestwords_file
    else:
        filename = util.get_default_suggest_words_file()
    f = codecs.open(filename, 'r', 'utf-8')
    for word in f:
        word = util.normalize_sentence(word.strip())
        if len(word):
            if '\t' in word:
                w = word.split('\t')
                jieba.suggest_freq((w[0].strip(), w[1].strip()), True)
            else:
                jieba.suggest_freq(word.strip(), True)
def jieba_diy():
    diywords_filepath = PathUtil().diy_words_filepath
    jieba.load_userdict(diywords_filepath)
    # print(psg.lcut('什么是职业分类表'))
    for word_iter in jieba_add_words():
        jieba.add_word(word_iter)
    jieba.suggest_freq(['那', '不买'], tune=True)
def suggest_words(file_name, seg_list):
    # with open("suggest.txt", 'r', encoding='UTF-8') as f:
    with open(file_name, 'r', encoding='UTF-8') as f:
        lines = f.readlines()
        for data in lines:
            data = data.strip()
            jieba.suggest_freq(data, True)
def sep_words():
    sentence_treated_list = []
    stopwords = stopwords_list()
    jieba.load_userdict("/Users/chensx/Desktop/大学/毕业设计文献/THUOCL/THUOCL_it.txt")
    jieba.suggest_freq("下单", True)
    jieba.suggest_freq("WiFi", True)

    # Read review data
    sentence_csv = pd.read_csv('review_crawler/review.csv', index_col=0)
    sentence_list = sentence_csv['text']
    for sentence in sentence_list:
        sentence = sentence.rstrip('\n')
        sentence = emoji_filter(sentence)
        sentence_after = jieba.cut(sentence, cut_all=False)
        outstr = ''
        # Drop stopwords
        for word in sentence_after:
            if word not in stopwords and word != ' ':
                outstr += word
                outstr += ' '
        print(outstr)
        sentence_treated_list.append(outstr.rstrip())

    # Write back into the csv
    data = pd.read_csv('review_crawler/review.csv', encoding='utf-8')
    data['after_treatment'] = sentence_treated_list
    data.to_csv('review_crawler/review.csv', encoding='utf-8', index=False)
def segment_test(self, types=2, mode=True, noThu=True):
    """
    Word segmentation.
    @param types: 0 -> pkuseg, 1 -> jieba, 2 -> jieba_v2, 3 -> thulac, 4 -> thulac_v2, 5 -> pkuseg_v2
    @param mode: True -> prediction, False -> test
    """
    origin_word = self.origin_word if mode else self.test_origin
    # origin_word = ['小儿脑性瘫痪']

    # pkuseg
    seg = pkuseg.pkuseg(model_name='medicine')
    pos_pkuseg = [' '.join(seg.cut(ii)) for ii in origin_word]
    seg = pkuseg.pkuseg(model_name='medicine',
                        user_dict='{}medicine_dict.txt'.format(pickle_dir))
    pos_pkuseg_v2 = [' '.join(seg.cut(ii)) for ii in origin_word]

    # jieba
    pos_jieba = [' '.join(jieba.cut(ii)) for ii in origin_word]
    jieba.load_userdict(self.medicine_dict)
    jieba.suggest_freq('$$_', True)
    for ii in self.del_dict:
        jieba.del_word(ii)
    pos_jieba_v2 = [' '.join(jieba.cut(ii)) for ii in origin_word]

    # thulac
    if not noThu:
        thu1 = thulac.thulac(seg_only=True)
        pos_thulac = [thu1.cut(ii, text=True) for ii in origin_word]
        thu2 = thulac.thulac(seg_only=True,
                             user_dict='%smedicine_dict.txt' % pickle_dir)
        pos_thulac_v2 = [thu2.cut(ii, text=True) for ii in origin_word]

    if not mode:
        print('Pkuseg\n', pos_pkuseg)
        self.evaluation_pos(pos_pkuseg, self.test_seg)
        print('Pkuseg & medicine\n', pos_pkuseg_v2)
        self.evaluation_pos(pos_pkuseg_v2, self.test_seg)
        print('Jieba\n', pos_jieba)
        self.evaluation_pos(pos_jieba, self.test_seg)
        print('Jieba & medicine\n', pos_jieba_v2)
        self.evaluation_pos(pos_jieba_v2, self.test_seg)
        if not noThu:
            print('Thulac\n', pos_thulac)
            self.evaluation_pos(pos_thulac, self.test_seg)
            print('Thulac & medicine\n', pos_thulac_v2)
            self.evaluation_pos(pos_thulac_v2, self.test_seg)
        print('Reference\n', self.test_seg)

    if not types:
        self.pos_word = pos_pkuseg
    elif types == 1:
        self.pos_word = pos_jieba
    elif types == 2:
        self.pos_word = pos_jieba_v2
    elif types == 3:
        self.pos_word = pos_thulac
    elif types == 4:
        self.pos_word = pos_thulac_v2
    elif types == 5:
        self.pos_word = pos_pkuseg_v2
def go():
    comment_list = []
    for x in range(0, 50):
        url = 'https://movie.douban.com/subject/26931786/comments?start={}&limit=20&sort=new_score&status=P'.format(
            x * 20)
        text = requests.get(url).text
        html = etree.HTML(text)
        result = html.xpath("//div[@class='comment']//p//span/text()")
        if len(result) > 0:
            for comment in result:
                comment_list.append(comment)

    join = "".join(comment_list)
    jieba.suggest_freq('钢铁侠', True)
    cut_text = " ".join(jieba.cut(join))

    stopwords = set(STOPWORDS)
    stopwords.add('一部')
    stopwords.add('就是')
    stopwords.add('还是')
    stopwords.add('一个')
    stopwords.add('不过')
    stopwords.add('电影')

    wordcloud = WordCloud(stopwords=stopwords,
                          background_color="white",
                          width=1000,
                          font_path='simsun.ttc',
                          height=860,
                          margin=2).generate(cut_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def preprocess_regardless_stopwords():
    jieba.enable_paddle()
    for name in names:
        jieba.suggest_freq(name, tune=True)
    with codecs.open("corpus.txt", 'w', 'utf-8') as standard:
        standard.seek(0)
        standard.truncate()
        for novel in os.listdir('resources/'):
            path = 'resources/' + novel
            print("novel " + novel + " start loading")
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
                sentences = re.split("(。|!|\!|\.|?|\?)", text)
                print("there are " + str(len(sentences)) + " sentences in this novel")
                new_sents = []
                for i in range(int(len(sentences) / 2)):
                    sent = sentences[2 * i] + sentences[2 * i + 1]
                    new_sents.append(remove_punctuation(sent))
                for sent in new_sents:
                    if sent != '':
                        split_sent = ' '.join(
                            entity_mapping(jieba.cut(sent, use_paddle=True)))
                        standard.write(split_sent + '\n')
            print("novel " + novel + " finished")
def get_cipin(self):
    result = dict()
    cutResult = self.cut()
    jieba.suggest_freq(('服务', '质量'), True)
    resultList = list(cutResult)   # materialize the generator as a list (needed for counting)
    resultSet = set(resultList)    # the set gives the distinct tokens iterated over below
    # Build a dict of statistics: segmented token as key, occurrence count as value.
    resultForstatistic = dict()
    # Frequencies for the words in the keyword list.
    keywords = {'服务', '关系', '质量', '忠诚度', '满意', '顾客', '员工'}
    # keywords = resultSet  # count frequencies for every segmented token instead
    for item in resultSet:
        resultForstatistic[item] = resultList.count(item)
    # Keywords missing from the statistics get a count of 0.
    for keyword in keywords:
        try:
            keyCounts = resultForstatistic[keyword]
        except KeyError:
            keyCounts = 0
        result[keyword] = keyCounts
        # result.append((keyword, keyCounts))
    self.key_cipin = result
    return result
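# A small alternative sketch (not part of the class above): resultList.count(item) inside a
# loop is quadratic, while collections.Counter gives the same per-keyword counts in one pass.
# The names below are illustrative.
from collections import Counter

def keyword_counts(tokens, keywords):
    counts = Counter(tokens)                       # count every token once
    return {kw: counts.get(kw, 0) for kw in keywords}

# e.g. keyword_counts(resultList, {'服务', '质量', '顾客'})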
def adjust_jieba_dict(adjust_word_file):
    f = open(adjust_word_file, encoding='utf-8')
    adjust_list = f.readlines()
    for i in adjust_list:
        jieba.suggest_freq(i.strip(), True)
    f.close()
def other_discomm(self, third_house):
    # Collect all community names to use as stopwords
    # (PropertyCommunity values plus names extracted from HouseDesc).
    stop_word_comm = third_house['PropertyCommunity'].unique().tolist()

    # Extract community names from the description field.
    ext_comm = []
    cond = third_house['PropertyCommunity'].isnull()
    for desc in third_house[cond]['HouseDesc'].unique():
        pattern = re.compile(r'.*?·(.*?)\s')
        res = pattern.findall(desc)
        try:
            ext_comm.append(res[0])
        except:
            ext_comm.append(desc)

    # Merge the names extracted from the descriptions with the ones taken directly.
    finally_comm = set()
    for comm in ext_comm + stop_word_comm:
        patt = re.compile(r'(.*?)[\(\(]')
        try:
            # print(comm)
            res = patt.findall(comm)
            finally_comm.add(res[0])
        except:
            finally_comm.add(comm)

    lj_total_comm = pd.DataFrame(data=list(finally_comm), columns=['comm'])
    lj_total_comm.to_csv(self.community, index=None)

    # Read the stopword file back and keep these names from being split.
    stop_word_set = set(pd.read_csv(self.community)['comm'].to_list())
    # Load it as a custom user dictionary.
    jieba.load_userdict(self.community)
    # Tune the dictionary so these specific names are not split apart.
    for word in stop_word_set:
        if word is not None:
            jieba.suggest_freq(str(word), True)
def add_org(filename):
    with open(filename, 'r', encoding='utf8') as f:
        lines = f.readlines()
        for row in lines:
            row = row.strip().split(",")
            if len(row) == 2:
                jieba.add_word(row[0], tag=row[1])
                jieba.suggest_freq(row[0])
def segmentation(sentence, lists=[]):
    jieba.suggest_freq(u"刘超", True)
    # Segment
    sentence = jieba.cut(sentence, cut_all=False, HMM=False)
    # Join and re-split to get a plain list of tokens
    format_sentence = ",".join(sentence)
    lists = format_sentence.split(",")
    return lists
def wordseg(sentence, alter):
    """
    Word-segmentation helper.
    :return:
    """
    jieba.suggest_freq(alter, True)
    seg_list = jieba.cut(sentence, cut_all=False)
    return " ".join(seg_list).split(" ")
def load_suggest_freq():
    if os.path.exists('./suggest_freq.txt'):
        f = open('./suggest_freq.txt', 'r')
        lines = f.readlines()
        for line in lines:
            words = line.strip().split(' ')
            jieba.suggest_freq((words[0], words[1]), True)
        f.close()
def init_jieba(self, seg_dic, userdic):
    """
    jieba custom setting.
    """
    jieba.load_userdict(userdic)
    jieba.set_dictionary(seg_dic)
    with open(userdic, 'r', encoding='utf-8') as input:
        for word in input:
            word = word.strip('\n')
            jieba.suggest_freq(word, True)
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("快递", 10000)
    jieba.suggest_freq(("面", "太厚"))
    jieba.suggest_freq(("价格", "便宜"))
    jieba.suggest_freq(("服务", "周到"))
    jieba.suggest_freq(("速度", "快"))
def segmentation(sentence, dicts=None):
    """
    Take a sentence string, return its token list, and optionally update a
    word-frequency dict in place.
    """
    jieba.suggest_freq("BOS", True)
    jieba.suggest_freq("EOS", True)
    sentence = jieba.cut(sentence, HMM=False)
    # format_sentence = ",".join(sentence)
    # lists = format_sentence.split(",")
    # Count frequencies: increment a word's count in dicts if it was seen before,
    # otherwise initialize it to 1.
    lists = [t for t in sentence]
    if isinstance(dicts, dict):
        for index, word in enumerate(lists):
            if index != 0:
                # Bigram: also count the word together with the preceding word.
                word_2 = '{}_{}'.format(lists[index - 1], word)
                dicts.setdefault(word_2, 0)
                dicts[word_2] += 1
            dicts.setdefault(word, 0)
            dicts[word] += 1
    return lists
# Import WordCloud for building the word-cloud image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

if __name__ == '__main__':
    # Read all comments
    comments = []
    with open('/Users/youpeng/zhizhi/beastcancer/qas.txt', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
        for row in rows:
            comments.append(row)

    # Configure segmentation: keep these drug and term names as single tokens
    jieba.suggest_freq('他莫昔芬', True)
    jieba.suggest_freq('他莫西芬', True)
    jieba.suggest_freq('它莫昔芬', True)
    jieba.suggest_freq('阿那曲唑', True)
    jieba.suggest_freq('诺雷德', True)
    jieba.suggest_freq('导管癌', True)
    jieba.suggest_freq('赫赛汀', True)
    jieba.suggest_freq('戈舍瑞林', True)

    comment_after_split = jieba.cut(str(comments), cut_all=False)  # precise (non-full) mode
    words = ' '.join(comment_after_split)  # join with spaces
    print(words)
print('/'.join(words))
print("=" * 40)

result = pseg.cut(test_sent)
for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')
print("\n" + "=" * 40)

terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))
print("=" * 40)

# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-" * 40)
# ### Adjusting the dictionary
# Use add_word(word, freq=None, tag=None) and del_word(word) to modify the dictionary dynamically at runtime.
# Use suggest_freq(segment, tune=True) to adjust the frequency of a single word so that it can (or cannot) be segmented out.
# Note: the automatically computed frequency may be ineffective when the HMM new-word discovery feature is in use.

# In[7]:

print("/".join(jieba.cut("如果放到post中将出错。", HMM=False)))

# In[8]:

# Tune the frequencies so that 中 and 将 are both segmented out separately
jieba.suggest_freq(("中", "将"), tune=True)

# In[9]:

print("/".join(jieba.cut("如果放到post中将出错。", HMM=False)))

# In[16]:

Original = "/".join(jieba.cut("江州市长江大桥参加了长江大桥的通车仪式。", HMM=False))
print("Original: " + Original)

# In[21]:
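# The note at the top of this cell also mentions add_word(word, freq=None, tag=None) and
# del_word(word), which the cell itself never calls. A minimal sketch of both, using 石墨烯
# from jieba's own demo (an assumption relative to this notebook):
import jieba

print('/'.join(jieba.cut('石墨烯的应用前景', HMM=False)))  # 石墨烯 may be split
jieba.add_word('石墨烯')                                    # register the word at runtime
print('/'.join(jieba.cut('石墨烯的应用前景', HMM=False)))  # now kept as one token
jieba.del_word('石墨烯')                                    # remove it again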
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))
print("=" * 40)

# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-" * 40)

# quit()

jieba.add_word('石墨烯')
seg_list = jieba.cut(p, cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))      # full mode

seg_list = jieba.cut(p, cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))   # precise mode

seg_list = jieba.cut(p)                          # precise mode is the default
print(", ".join(seg_list))

seg_list = jieba.cut_for_search(p)               # search-engine mode
words = jieba.cut(test_sent)
# print('/'.join(words))
# print("="*40)

result = pseg.cut(test_sent)
# for w in result:
#     print(w.word, "/", w.flag, ", ", end=' ')
# print("\n" + "="*40)

terms = jieba.cut('easy_install is great')
# print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
# print('/'.join(terms))
# print("="*40)

# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
    # print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    # print("-"*40)
    '点击', '客服', 'QQ', '微信'
]

file_clean2 = open("all_text_clean2.txt", 'w')
with open("all_text_clean.txt", 'r') as f:
    text_clean = f.readlines()

for line in text_clean:
    # print line
    find = False
    for words in key_words:
        if line.find(words) != -1:
            find = True
            break
    if not find:
        file_clean2.write(line)
file_clean2.close()

with open("all_text_clean2.txt", 'r') as f:
    text_clean2 = f.read()

jieba.suggest_freq('融魔方', True)
tags = jieba.analyse.extract_tags(text_clean2, topK=20)
print(",".join(tags))


if __name__ == "__main__":
    main()
#!/usr/bin/python3
# coding: utf-8
import jieba
##################################################################
## suggest_freq(segment, tune=True) adjusts the frequency of a single word so that it can (or cannot) be segmented out
# each call to suggest_freq() raises the frequency by 1
print(jieba.get_FREQ(('中', '将')))  # None
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))  # 如果/放到/post/中将/出错/。
print(jieba.suggest_freq(('中', '将'), True))  # 494; i.e. 中将 should be split into the two characters 中 / 将
print(jieba.get_FREQ('中'), jieba.get_FREQ('将'))  # 243191 122305
print(jieba.get_FREQ('中', '将'))  # 243191; the second argument is a default value, so this prints the frequency of 中
print(jieba.get_FREQ(('中', '将')))  # None; a tuple key is meaningless here
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))  # 如果/放到/post/中/将/出错/。

print(jieba.get_FREQ('台中'))
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))  # 「/台/中/」/正确/应该/不会/被/切开
print(jieba.suggest_freq('台中', True))  # 69; grows if the call is repeated...
print(jieba.get_FREQ('台中'))
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))  # 「/台中/」/正确/应该/不会/被/切开
##################################################################
## 台中 keeps being split into 台 / 中 because P(台中) < P(台) x P(中): the frequency of 台中 is too low for it to win as a word
# Fix: force the frequency up with
# jieba.add_word('台中') or jieba.suggest_freq('台中', True)
##################################################################
## test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]
for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
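# The comment block above names jieba.add_word('台中') as an alternative fix to
# suggest_freq; a minimal sketch of that path on the same sentence:
import jieba

jieba.add_word('台中')  # gives 台中 a high enough frequency to be kept as one token
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))  # 「/台中/」/正确/应该/不会/被/切开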
seg_list = jieba.cut("我来到北京清华大学", cut_all=False) print("Default Mode: " + "/ ".join(seg_list)) # 默认模式 seg_list = jieba.cut("他来到了网易杭研大厦") print(", ".join(seg_list)) seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 print(", ".join(seg_list)) print('='*40) print('2. 添加自定义词典/调整词典') print('-'*40) print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False))) #如果/放到/post/中将/出错/。 print(jieba.suggest_freq(('中', '将'), True)) #494 print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False))) #如果/放到/post/中/将/出错/。 print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False))) #「/台/中/」/正确/应该/不会/被/切开 print(jieba.suggest_freq('台中', True)) #69 print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False))) #「/台中/」/正确/应该/不会/被/切开 print('='*40) print('3. 关键词提取') print('-'*40) print(' TF-IDF') print('-'*40)
def suggest_usr_dict(path):
    # Each line of the file is expected to look like "word1,word2".
    with open(path, 'r') as f:
        for l in f:
            word1, word2 = l.split(',')[0].rstrip(), l.split(',')[1].rstrip()
            jieba.suggest_freq((word1, word2), True)
f = open('/Users/xenia/Desktop/text.txt', 'r', buffering=-1, encoding='utf-8')
for line in f:
    print(line)
f.close()
"""

### Segment the document in `content` with jieba, write the tokens line by line into
### `revised`, and count the total number of tokens (punctuation excluded).

import jieba
import sys
sys.path.append("../")
import jieba.posseg
import jieba.analyse

content = open('/Users/xenia/Desktop/txt_file/text_1_1225_policy.txt', 'rb').read()
revised = open('/Users/xenia/Desktop/txt_file/out.txt', 'w', buffering=-1, encoding='utf-8')

jieba.suggest_freq('民進黨', True)  # for new words / proper nouns, raise the frequency on the fly (or use add_word)
words = jieba.cut(content, cut_all=False)

term = 0
for word in words:
    revised.write(word)
    revised.write('\n')
    if word not in (',', '。', ':', ';', '、', '「', '」', '?', '!', ' ', '\n', '(', ')', '”', '“'):
        term = term + 1
revised.close()

### For the segmented document in `revised`, search for a target word and count its
### occurrences and term frequency.
### Because of a jieba limitation, counting 「我」 requires subtracting the count of 「我們」.

import re

wordnumber = list()
regex = re.compile('立倫')  # target word
seg_list = jieba.cut("我来到北京清华大学", cut_all=False) print("Default Mode: " + "/ ".join(seg_list)) # 默认模式 seg_list = jieba.cut("他来到了网易杭研大厦") print(", ".join(seg_list)) seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 print(", ".join(seg_list)) print("=" * 40) print("2. 添加自定义词典/调整词典") print("-" * 40) print("/".join(jieba.cut("如果放到post中将出错。", HMM=False))) # 如果/放到/post/中将/出错/。 print(jieba.suggest_freq(("中", "将"), True)) # 494 print("/".join(jieba.cut("如果放到post中将出错。", HMM=False))) # 如果/放到/post/中/将/出错/。 print("/".join(jieba.cut("「台中」正确应该不会被切开", HMM=False))) # 「/台/中/」/正确/应该/不会/被/切开 print(jieba.suggest_freq("台中", True)) # 69 print("/".join(jieba.cut("「台中」正确应该不会被切开", HMM=False))) # 「/台中/」/正确/应该/不会/被/切开 print("=" * 40) print("3. 关键词提取") print("-" * 40) print(" TF-IDF") print("-" * 40)