def set_keyword(docId, db, question=None, answer=None):
    cursor = db.cursor()
    sqlSelect = "select * from chatrobot where id = %d"
    sqlInsert = "INSERT INTO key_word(id,word) VALUES (%d,'%s')"
    if question is None or answer is None:
        cursor.execute(sqlSelect % (docId))
        data = cursor.fetchone()
        question = data[2]
        answer = data[3]
    question_set = set(jieba.cut_for_search(question)) | set(question)
    answer_set = set(jieba.cut_for_search(answer)) | set(answer)
    inter = question_set & answer_set
    sqlSelect = "SELECT count(*) from chatrobot where question like '%%%s%%'"
    minCount = 10000
    keyWord = ''
    for i in inter:
        cursor.execute(sqlSelect % i)
        count = cursor.fetchone()[0]
        if count < minCount:
            minCount = count
            keyWord = i
    if keyWord == u',':
        keyWord = ''
    # print "keyword:",keyWord
    cursor.execute(sqlInsert % (docId, keyWord))
    db.commit()
def save():
    for course in Courses.query.all():
        seg_list = jieba.cut_for_search(course.name)
        str = '/'.join(seg_list)
        results = str.split('/')
        results.append(course.name)
        for result in results:
            if Search.query.filter_by(name=result).first() == None:
                s = Search(name=result)
                s.courses.append(course)
                db.session.add(s)
                db.session.commit()
            elif course not in Search.query.filter_by(name=result).first().courses.all():
                s = Search.query.filter_by(name=result).first()
                s.courses.append(course)
                db.session.add(s)
                db.session.commit()
    for tag in Tags.query.all():
        seg_list = jieba.cut_for_search(tag.name)
        str = '/'.join(seg_list)
        results = str.split('/')
        results.append(tag.name)
        for result in results:
            if Search.query.filter_by(name=result).first() == None:
                s = Search(name=result)
                s.tags.append(tag)
                db.session.add(s)
                db.session.commit()
            elif tag not in Search.query.filter_by(name=result).first().tags.all():
                s = Search.query.filter_by(name=result).first()
                s.tags.append(tag)
                db.session.add(s)
                db.session.commit()
def get_key_words(self, question):
    keywords = jieba.cut_for_search(question)
    keywordslist = list(keywords)
    if len(keywordslist) != 0:
        return {}.fromkeys(keywordslist).keys()
    else:
        return question
def endElement(self, tag):
    if self.CurrentData == "text":
        if self.title.startswith('Wikipedia:'):
            print "Skip", self.title
            self.title = ""
            return
        print self.title
        print len(self.text)
        time0 = time.time()
        line = Converter('zh-hans').convert(self.text.decode('utf-8'))
        self.text = line.encode('utf-8')
        #words = pseg.cut(self.text)
        time_set = time.time()
        words = jieba.cut_for_search(self.text)
        sentenceStart = True
        for w in words:
            self.file.write(w + ' ')
        print time.time() - time0
        self.counter += 1
        self.title = ""
        print "Counter", self.counter
    self.CurrentData = ""
    self.text = ""
def new_course():
    """
    Create a new course.
    :return:
    """
    # request.get_json.get('item', 'default')
    if request.method == "POST":
        course = Courses.from_json(request.get_json())
        db.session.add(course)
        db.session.commit()
        generator = jieba.cut_for_search(course.name)
        seg_list = '/'.join(generator)
        results = seg_list.split('/')
        if course.name not in results:
            results.append(course.name)
        for seg in results:
            s = Search.query.filter_by(name=seg).first()
            if not s:
                s = Search(name=seg)
            s.courses.append(course)
            db.session.add(s)
            db.session.commit()
        return jsonify({
            'id': course.id
        }), 201
def tokens(self, intext):
    intext = u' '.join(intext.split())
    if self.mode == 's':
        token_list = jieba.cut_for_search(intext)
    else:
        token_list = jieba.cut(intext)
    return [token for token in token_list
            if token.strip() != u'' and not token in self.stopword_set]
def Search(this, searchString, sortOrder):
    seglist = jieba.cut_for_search(searchString)
    timeSummary = {}
    articleList = []
    topicList = []
    this.SearchArticle(seglist, sortOrder)
    this.SearchRelated(seglist)
    print('search complete')
    i = 1
    for article in this.articleList:
        for post in this.articleDB.find({u'DocID': article[0]}):
            postTime = time.strptime((post[u'Time'].split(' '))[0], u'%Y-%m-%d')
            timeSummary.setdefault(postTime, 0)
            timeSummary[postTime] = timeSummary[postTime] + 1
            articleList.append(post)
            i = i + 1
            if i > 100:
                break
    for topic in this.topicList:
        for post in this.topicDB.find({u'TopicID': topic[0]}):
            topicList.append(post)
    finalResult = {
        u'Article': articleList,
        u'Topic': topicList,
        u'Summary': sorted(timeSummary.iteritems(), cmp=lambda x, y: cmp(x[0], y[0]))
    }
    return finalResult
def _fields_txt_2_dict(*txts):
    # txt1 = txts[0].encode('utf-8')
    # term_dict = seg_txt_2_dict(txt1)
    # for key in term_dict.iterkeys():
    #     term_dict[key] = 3
    # for txt in txts[1:]:
    #     txt = txt.encode('utf-8')
    #     d = seg_txt_2_dict(txt)
    #     term_dict.update(d)
    # return term_dict
    term_dict = {}
    for txt in txts:
        txt = txt.encode('utf-8')
        seg_list = cut_for_search(txt)
        for seg in seg_list:
            value = term_dict.get(seg)
            if value is None:
                term_dict[seg] = 1
            else:
                term_dict[seg] = value + 1
    return term_dict
def find_dian_word():
    pkl_file = open('../data/new_words.pkl', 'rb')
    preprocessed_word_lists = pkl.load(pkl_file)
    word = []
    count = 0
    tmpt = []
    ci = ['电', '网', '磁', '流', '感', '源', '揽', '频', '耦',
          '热', '压', '场', '量', '信', '圈', '耗', '能', '建',
          '机', '燃', '控', '负', '巡', '阻', '匝', '线', '度',
          '势', '经', '缘', '贮', '波', '气', '障', '操', '微',
          '谐', '联', '监', '光', '趋']
    for ele in preprocessed_word_lists:
        count = count + 1
        for each in ele:
            if '电' in each:
                word.append(each)
        sys.stdout.write('generated:{0}/total:{1}\r'.format(count, 950018))
        sys.stdout.flush()
    new_word = []
    for i in word:
        ss = '/'.join(jieba.cut_for_search(i, HMM=False))
        ss = ss.split('/')
        for eve in ss:
            for ele in ci:
                if ele in eve and len(eve) >= 2:
                    new_word.append(eve)
                else:
                    continue
    df = pd.DataFrame()
    df[''] = list(set(new_word))
    df.to_csv('9-27-one_2.csv', encoding='utf-8', index=False)
    print(len(set(new_word)))
    return new_word
def tokenize(text):
    tokens = []
    text = preprocess(text)
    tokens += ASCII_SLUG_RE.findall(text)    # ASCII tokens are already usable
    for unit in CJK_SLUG_RE.findall(text):   # CJK tokens need extraction
        # Search engine mode. Might return ambiguous result
        unit_tokens = list(jieba.cut_for_search(unit))
        # Make better word guessing by joining non-conjunction words
        i = 0
        length = len(unit_tokens)
        while i < length:
            j = i
            buf = ''
            while j < length:
                token = unit_tokens[j]
                if token in CONJUNCTIONS or len(token) > 1:
                    break
                else:
                    buf += token
                    j += 1
            if len(buf) > 1 and buf not in unit_tokens:
                unit_tokens.append(buf)
            i = j + 1
        tokens.extend(unit_tokens)
    return tokens
def make_index():
    dbfile = file("tieba.json")
    dat = dbfile.read()
    datas = dat.split('\n')
    database = xapian.WritableDatabase('indexes/', xapian.DB_CREATE_OR_OPEN)
    #stemmer = xapian.Stem("english")
    for data in datas:
        try:
            ddata = eval(data)
            use_data = {}
            use_data["title"] = ddata["title"]
            reply = {}
            reply["content"] = ddata["reply"]["content"]
            reply["name"] = ddata["reply"]["name"]
            reply["time"] = ddata["reply"]["time"]
            use_data["reply"] = reply
            doc = xapian.Document()
            doc.set_data(str(use_data))
            use_data = str(ddata["reply"]["name"]) + str(ddata["reply"]["time"]) + str(ddata["reply"]["content"]) + str(ddata["title"])
            for term in jieba.cut_for_search(str(use_data)):
                doc.add_term(term.encode('utf-8'))
            database.add_document(doc)
        except:
            pass
    database.commit()
    dbfile.close()
def put_course(id):
    """
    Update a course.
    """
    course = Courses.query.get_or_404(id)
    if request.method == "PUT":
        data_dict = eval(request.data)
        course.name = data_dict.get('name', course.name)
        course.teacher = data_dict.get('teacher', course.teacher)
        course.category_id = data_dict.get('category_id', course.category_id)
        course.subcategory_id = data_dict.get('sub_category_id', course.subcategory_id)
        course.type_id = data_dict.get('type_id', course.type_id)
        db.session.add(course)
        db.session.commit()
        generator = jieba.cut_for_search(course.name)
        seg_list = '/'.join(generator)
        results = seg_list.split('/')
        if course.name not in results:
            results.append(course.name)
        for seg in results:
            s = Search(name=seg)
            s.courses.append(course)
            db.session.add(s)
            db.session.commit()
        return jsonify({'update': id}), 200
def cut_doc(doc):
    text = []
    url = re.findall("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", doc)
    for link in url:
        # doc is a string, so strip each URL out of it before segmentation
        # (the original called doc.remove(link), which strings do not support)
        doc = doc.replace(link, "")
    words = jieba.cut_for_search(doc)
    chcontent = []
    encontent = []
    # Used regular expression to split Chinese and English
    ch = re.compile("[\u4e00-\u9fa5]")
    en = re.compile("[^\u4e00-\u9fa5]")
    for word in words:
        if ch.match(word):
            # chcontent.append(str(word.encode("utf-8")))
            chcontent.append(word)
        else:
            encontent.append(word.lower())
    Tokens = chcontent + encontent + url
    removewords = [" ", "", "~", "~"]
    filted = [x for x in Tokens if x not in removewords]
    phrase = [x for x in filted if len(x) > 1]
    print([x for x in phrase])
    return phrase
def testCutForSearch(self):
    for content in test_contents:
        result = jieba.cut_for_search(content)
        assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
        print >> sys.stderr, " , ".join(result)
def search(self, qstr, topn=None, limit=None, extend=True):
    if not qstr:
        print "query must not be empty"
        #sys.exit(701)
        return
    start_query = datetime.datetime.now()
    pre_query = datetime.timedelta()
    begin_index = datetime.datetime.now()
    raw_qs = wfilter(jieba.cut_for_search(qstr))
    end_index = datetime.datetime.now()
    pre_query += (end_index - begin_index)
    sortResult, Qs, raw_qs, hit, index_cost = self._search(raw_qs, topn=topn, limit=limit, extend=extend)
    index_cost += pre_query  # add time costs of pre_query
    index_cost = index_cost.total_seconds()
    end_query = datetime.datetime.now()
    delta_query = end_query - start_query
    delta_query = delta_query.total_seconds()
    return sortResult, Qs, raw_qs, hit, index_cost, delta_query
def preprocess_query_str(query_str):
    result = []
    keywords = [keyword for keyword in query_str.split(" ") if keyword.strip() != ""]
    for keyword in keywords:
        cutted_keyword = " ".join(["%s" % term for term in jieba.cut_for_search(keyword)])
        result.append(cutted_keyword)
    return result
def CHseparatewords(text):
    seg_list = jieba.cut_for_search(text)
    result = []
    for seg in seg_list:
        result.append(seg)
    return result
def get_search_videos(request):
    try:
        if request.GET.has_key('title'):
            q_titles = request.GET['title'].encode('utf8')
            seg_list = jieba.cut_for_search(q_titles)
            seg_list = list(seg_list)
            if "" in seg_list:
                seg_list.remove("")
            if " " in seg_list:
                seg_list.remove(" ")
            if getLen(seg_list) == 0:
                return None
                # return Video.objects.all()
            temp = []
            for i in range(len(seg_list)):
                if str(seg_list[i].encode("utf8")) not in stop_list:
                    print seg_list[i]
                    temp.append(seg_list[i])
                # seg_list.remove(seg_list[i])
                # continue
                # elif seg_list[i].encode("utf8") not in key_:
                #     seg_list.remove(seg_list[i])
            seg_list = temp
            if getLen(seg_list) == 0:
                return None
            q_title = seg_list[0]
            videos = Video.objects.filter(Q(title__icontains=q_title) | Q(kind_str__icontains=q_title) | Q(tags_str__icontains=q_title)).all()
            for i in range(1, len(seg_list)):
                q_title = seg_list[i]
                videos = videos | Video.objects.filter(Q(title__icontains=q_title) | Q(kind_str__icontains=q_title) | Q(tags_str__icontains=q_title)).all()
            # ids = []
            # for seg in seg_list:
            #     if seg.encode("utf8") in key_:
            #         v_ids = video_[key_.index(seg.encode("utf8"))]
            #         # print v_ids
            #         for v in v_ids.split(','):
            #             ids.append(v)
            # ids = list(set(ids))
            # videos = None
            # if getLen(ids) >= 1:
            #     q_id = ids[0]
            #     videos = Video.objects.filter(id=q_id).all()
            #     for i in range(1, len(ids)):
            #         q_id = ids[i]
            #         videos = videos | Video.objects.filter(id=q_id).all()
            # if getLen(videos) == 0:
            #     return Video.objects.all()
            return videos
        else:
            return Video.objects.all()
    except Exception, e:
        printError("search:" + str(e))
def jiebasplit(stringlist):
    stringset = set()
    for string in stringlist:
        string_seg_list = jieba.cut_for_search(string)
        string_seg_set = set(string_seg_list)
        stringset = stringset | string_seg_set
    return stringset
def split(self, input):
    chinese = []
    if JIEBA:
        chinese = list(jieba.cut_for_search(input))
    latin1 = self.latin1_letters.findall(input)
    return chinese + latin1
def getWords(doc):
    # _mood = set()
    # for i in re.findall(r'\[\S+?\]', doc.decode('utf-8')):
    #     # print 'data:' + i  # mood emoticons such as [xx]
    #     _mood.add(i)
    #     doc = doc.replace(i, '')
    # return dict([(w, 1) for w in jieba.cut(doc)])
    # _mood = set()
    # regxs = {r'\[\S+?\]': '', r'//@.*:': ''}
    # for key, value in regxs.items():
    #     print "pre:%s" % doc
    #     doc = re.sub(key, value, doc, flags=re.IGNORECASE)
    #     print "aft:%s" % doc
    # return dict([(w, 1) for w in jieba.cut_for_search(doc)])
    _mood = set()
    regxs = {r'\[.*\]': '', r'//@.*:': ''}
    for key, value in regxs.items():
        # print "pre:%s" % doc
        doc = re.sub(key, value, doc.decode('utf8'), flags=re.IGNORECASE)
        # print "aft:%s" % doc
    # table = string.maketrans("", "")
    # doc.translate(table, string.punctuation)
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    doc = regex.sub('', doc)
    print "reg:%s" % doc
    res = dict()
    for w in jieba.cut_for_search(doc):
        if w in string.punctuation + extra_punctuation or len(w) < 2:
            print "s", w
        else:
            res[w] = 1
    return res
def add_content(self, content, obj_key):
    """Add a document to the index."""
    seg_list = jieba.cut_for_search(content)
    seg_list = min_nlp.get_weight(seg_list)
    self.add_word_index(seg_list, obj_key)
def search(self, keywords, start=0, length=20):
    """Search for the given keywords."""
    seg_list = list(jieba.cut_for_search(keywords))
    key_list = self.search_by_words(seg_list, start, length)
    return key_list
def dosearch(query):
    weight = 0
    raw_query = lower_letters(query)
    query = query_parser(query)
    query = query + [raw_query]
    id_list = []
    res_name = []
    weight = {}
    if query:
        for term in query:
            if term in t_inverted_index:
                for key, value in t_inverted_index[term].iteritems():
                    if key not in weight:
                        weight[key] = tdxidf_weighting(term, key)
                    else:
                        weight[key] = weight[key] + tdxidf_weighting(term, key)
                    if key not in id_list:
                        id_list.append(key)
    rank_list = calc_vector_space(query, id_list)
    rank_fin = []
    q = list(set(jieba.cut_for_search(raw_query)))
    if u" " in q:
        q.remove(u" ")
    cnt = []
    for key_index, key in reversed(list(enumerate(rank_list))):
        info_term = list(set(jieba.cut_for_search(id_info_list[key])))
        if u" " in info_term:
            info_term.remove(u" ")
        for term in q:
            if term in info_term:
                cnt.append(key)
    freq_cnt = Counter(cnt)
    freq_cnt_tuples = freq_cnt.most_common()
    for item, cnt in freq_cnt_tuples:
        rank_fin.append(item)
    for item in rank_list:
        if item not in rank_fin:
            rank_fin.append(item)
    if id_list:
        for ids in rank_fin:
            res = os.path.splitext(doc_id_list[ids])[0]
            res = res[7:]
            res_name.append(res)
    return res_name
def split(self, input):
    # type: (unicode) -> List[unicode]
    chinese = []  # type: List[unicode]
    if JIEBA:
        chinese = list(jieba.cut_for_search(input))
    latin1 = self.latin1_letters.findall(input)
    return chinese + latin1
def jiebasplit2(stringlist):
    # Tests grouping results for keyword OR queries; the results are not ideal.
    stringsetlist = list()
    for string in stringlist:
        substringset = set()
        string_seg_list = jieba.cut_for_search(string)
        substringset = set(string_seg_list)
        stringsetlist.append(substringset)
    return stringsetlist
def search(self, query):
    keys = jieba.cut_for_search(query)
    p = self._search(keys)
    result = []
    while p is not None:
        result.append(p.word)
        p = p.next
    return result
def testCutForSearch_NOHMM(self):
    for content in test_contents:
        result = jieba.cut_for_search(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testCutForSearch_NOHMM", file=sys.stderr)
def segment(content):
    """Segment the input with jieba and write the result to fout."""
    print 'segment...please wait..'
    words = ' '.join(jieba.cut_for_search(content))
    f = file(fout, 'wb')
    f.write(words.encode('utf8'))
def cut_search(data):
    '''
    Segment in search-engine mode: on top of the accurate mode, long words
    are cut again into shorter sub-words, e.g.
    来到北京大学 --> 来到/北京/大学/北京大学
    '''
    temp_result = jieba.cut_for_search(data)
    temp_result = '/'.join(temp_result)
    return temp_result
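# A minimal usage sketch for cut_search above (illustrative only; the input
# string and the expected output are taken from the docstring example):
if __name__ == '__main__':
    print(cut_search('来到北京大学'))   # expected: 来到/北京/大学/北京大学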
def gen_keywords(self, text):
    result_list = list(jieba.cut_for_search(text))
    return self.__trim_stop_word__(result_list)
import jieba

# full mode
text = "我来到北京清华大学"
seg_list = jieba.cut(text, cut_all=True)
print(u"[全模式]: ", "/ ".join(seg_list))

# accurate mode
seg_list = jieba.cut(text, cut_all=False)
print(u"[精确模式]: ", "/ ".join(seg_list))

# the default is accurate mode
seg_list = jieba.cut(text)
print(u"[默认模式]: ", "/ ".join(seg_list))

# new word detection: "杭研" is not in the dictionary, but the Viterbi algorithm still recognizes it
seg_list = jieba.cut("他来到了网易杭研大厦")
print(u"[新词识别]: ", "/ ".join(seg_list))

# search engine mode
seg_list = jieba.cut_for_search(text)
print(u"[搜索引擎模式]: ", "/ ".join(seg_list))

'''
[全模式]: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
[精确模式]: 我/ 来到/ 北京/ 清华大学
[默认模式]: 我/ 来到/ 北京/ 清华大学
[新词识别]: 他/ 来到/ 了/ 网易/ 杭研/ 大厦
[搜索引擎模式]: 我/ 来到/ 北京/ 清华/ 华大/ 大学/ 清华大学
'''
if i == 0:
    y.append("交通")
else:
    y.append("计算机")
print y[0], y[1], y[2]

import jieba
"""
jieba's Chinese GitHub page: https://github.com/fxsjy/jieba
"""
x1 = '''
三个臭皮匠顶个诸葛亮,以此类推,如果能把一个人跟另外100万人的大脑连接起来,就会诞生“超级大脑”。正因如此,现在才出现了好几家公司争相开发脑机界面,希望把人的思维与机器连接起来。如果能够率先将笔记本电脑的功能植入你的大脑,就将为人们开辟一条道路,使之得以随意通过无缝渠道与任何人(甚至任何东西)交换信息。目前有两位IT行业的大佬都在参与这场角逐,他们分别是特斯拉创始人埃隆·马斯克(Elon Musk)和Facebook创始人马克·扎克伯格(Mark Zuckerberg)。他们两人的项目分别名为Neuralink和Building 8。而据知情人士透露,这两个项目都需要对大脑进行外科手术。然而,还有一些没有那么野心勃勃的微创方式,也可以解决脑机界面问题。只需要把脑电波的数据转化成简单的指令,然后由应用或设备进行处理即可。一家名为Nuro的创业公司就采取了这种方式。他们希望借助自己的软件平台,让那些因为严重受伤或疾病而丧失交流能力的人恢复这种能力。
'''
x2 = "本期企鹅评测团产品——华为MateBook X Pro笔记本电脑。作者是一名普通公务员,同时又是一名数码发烧友,多年来一直沉迷于各种新潮的数码产品,工作以后也不忘通过数码产品提升工作效率。随着笔记本电脑市场竞争的日益激烈,再加上硬件性能不断提升,越来越多的非游戏用户选择使用更加方便携带的超极本,各大厂商自然也是迎合用户需求,推出外观更加靓丽、身材更加小巧、功能更加丰富的超极本。"

seg_list = jieba.cut(x2.strip(), cut_all=True)  # seg_list is an iterable object
print("Full Mode: " + " ".join(seg_list))  # full mode

seg_list = jieba.cut("我来到北京清华大学,工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", cut_all=False)
print("Precise Mode: " + " ".join(seg_list))  # accurate mode, which is also the default

seg_list = jieba.cut_for_search("我来到北京清华大学,工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
print "搜索模式:", " ".join(seg_list)

mcase = {'a': 10, 'b': 34}
mcase_frequency = {v: k for k, v in mcase.items()}
print mcase_frequency
# -*- coding: utf-8 -*-
import sys
import os
import jieba

sent = '在包含問題的所有解的解空間樹中,按照深度優先搜尋的策略,從根節點出發深度優先搜尋解空間樹'

# full mode
wordList = jieba.cut(sent, cut_all=True)
print(' | '.join(wordList))

# accurate segmentation
wordList = jieba.cut(sent)
print(' | '.join(wordList))

# search engine mode
wordList = jieba.cut_for_search(sent)
print(' | '.join(wordList))
# -*- coding: utf-8 -*-
# coding=utf-8
import jieba
import jieba.analyse

text = "故宫的著名景点包括乾清宫、太和殿和午门等。其中乾清宫非常精美,午门是紫禁城的正门,午门居中向阳。"

# #jieba.load_userdict("jieba_dict.txt")  # user-defined dictionary (users can add their own custom words to this text file)
# f = open('jieba_text.txt', 'r', encoding='utf8')  # text file to segment (treat everything as utf8 to avoid trouble)
# lines = f.readlines()
# for line in lines:
#     text += line

# seg_list = jieba.cut(text, cut_all=False)  # accurate mode (the default)
seg_list = jieba.cut(text)  # accurate mode (the default)
for va in seg_list:
    print(va)
print(seg_list)
# note: seg_list is a generator and was consumed by the loop above, so this join yields an empty string
print("[精确模式]: ", " ".join(seg_list))

seg_list2 = jieba.cut(text, cut_all=True)  # full mode
print("[全模式]: ", " ".join(seg_list2))

seg_list3 = jieba.cut_for_search(text)  # search engine mode
print("[搜索引擎模式]: ", " ".join(seg_list3))

tags = jieba.analyse.extract_tags(text, topK=5)
print("关键词: ", " ".join(tags))
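# The commented-out jieba.load_userdict(...) call above refers to a custom
# dictionary file. A minimal illustrative sketch of that file (the file name
# and entries below are made up, not part of the original snippet): jieba's
# user dictionary is plain text, one entry per line in the form
# "word [frequency] [POS]", where frequency and POS are optional.
with open("jieba_dict.txt", "w", encoding="utf8") as userdict:
    userdict.write("乾清宫 5 n\n太和殿 5 n\n紫禁城 5 ns\n")
jieba.load_userdict("jieba_dict.txt")  # register the custom words before cutting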
stopwords = [
    line.strip() for line in open('stopwords.txt', encoding='UTF-8').readlines()
]

# read the csv line by line
file_object2 = open(args["file_name"]).read().split('\n')

# lists that store the segmentation results
Rs1 = []
Rs2 = []
# dictionary for word-frequency counts
dic = {}
for i in range(len(file_object2)):
    result = []
    # seg_list = jieba.cut(file_object2[i])  # choose the cut mode
    seg_list = jieba.cut_for_search(file_object2[i])
    # append the source text column
    result.append(file_object2[i])
    # collect the segmented words of this line
    for w in seg_list:
        if w not in stopwords:
            result.append(w)
            dic[w] = dic.get(w, 0) + 1
            continue
    # append the segmented words after the source row
    Rs1.append(result)

# write to CSV, naming the file with the current time to avoid duplicate names
# time format: 08 05 2019 09:49:02
doctime = str(time.strftime("%m %d %Y %H:%M:%S", time.localtime()))
mon = doctime[0:2]
def testTokenize(self):
    vocab_file = 'testdata/vocab_chinese.txt'
    user_dict_files = [
        'data/jieba/hello.txt',
        'naivenlp/tokenizers/data/dict.txt',
    ]
    intervener = DefaultIntervener()
    tokenizer = JiebaTokenizer(
        vocab_file=vocab_file,
        user_dict_files=user_dict_files,
        intervener=intervener,
        pad_token='[PAD]',
        unk_token='[UNK]',
        bos_token='<S>',
        eos_token='<T>',
        cls_token='[CLS]',
        sep_token='[SEP]',
        mask_token='[MASK]',
    )
    self.assertEqual(0, tokenizer.pad_id)
    self.assertEqual(100, tokenizer.unk_id)
    self.assertEqual(104, tokenizer.bos_id)
    self.assertEqual(105, tokenizer.eos_id)
    self.assertEqual(101, tokenizer.cls_id)
    self.assertEqual(102, tokenizer.sep_id)
    self.assertEqual(103, tokenizer.mask_id)

    sentences = [
        '我在上海工作',
        '我来到北京清华大学',
        '乒乓球拍卖完了',
        '中国科学技术大学',
    ]
    for sent in sentences:
        self.assertEqual(
            [t for t in jieba.cut(sent, cut_all=False, HMM=True)],
            tokenizer.tokenize(sent, mode='accurate', hmm=True))
        self.assertEqual(
            [t for t in jieba.cut(sent, cut_all=False, HMM=False)],
            tokenizer.tokenize(sent, mode='accurate', hmm=False))
        self.assertEqual(
            [t for t in jieba.cut(sent, cut_all=True, HMM=True)],
            tokenizer.tokenize(sent, mode='full', hmm=True))
        self.assertEqual(
            [t for t in jieba.cut(sent, cut_all=True, HMM=False)],
            tokenizer.tokenize(sent, mode='full', hmm=True))
        self.assertEqual(
            [t for t in jieba.cut_for_search(sent, HMM=True)],
            tokenizer.tokenize(sent, mode='search', hmm=True))
        self.assertEqual(
            [t for t in jieba.cut_for_search(sent, HMM=False)],
            tokenizer.tokenize(sent, mode='search', hmm=False))

    tokens = tokenizer.tokenize('高级javadeveloper')
    self.assertListEqual(['高级', 'javadeveloper'], tokens)

    intervener.add_split_token('javadeveloper', 'java developer')
    tokens = tokenizer.tokenize('高级javadeveloper')
    self.assertListEqual(['高级', 'java', 'developer'], tokens)

    intervener.add_combine_token('javadeveloper')
    tokens = tokenizer.tokenize('高级javadeveloper')
    self.assertListEqual(['高级', 'javadeveloper'], tokens)

    intervener.remove_combine_token('javadeveloper')
    tokens = tokenizer.tokenize('高级javadeveloper')
    self.assertListEqual(['高级', 'java', 'developer'], tokens)

    intervener.remove_split_token('javadeveloper')
    tokens = tokenizer.tokenize('高级javadeveloper')
    self.assertListEqual(['高级', 'javadeveloper'], tokens)
def nlp_jieba_cut(self, text):
    stop_words = '。,?:@—,、!![]【】《》“”.…#~ '
    self.data['jieba_cut'] = list(
        filter(lambda x: x.strip(stop_words), jieba.cut_for_search(text)))
import jieba
import sklearn

s1 = "我来贪心学院学习python"
s1_result = jieba.cut(s1)
print(list(s1_result))

s1_result = jieba.cut(s1, cut_all=True)
print(list(s1_result))

s1_result = jieba.cut_for_search(s1)
print(list(s1_result))

word_vector_list = ["我们", "来", "贪心", "学院", "学习", "人工智能", "和", "Python"]
question = "Python学习多久"
s1 = "我来贪心学院学习Python"
s2 = "我学习人工智能"
s3 = "Python课程的学习周期是多久"

import numpy as np

def get_vector(data):
    vector_list = []
    for i in word_vector_list:
        if i in list(jieba.cut(data)):
            vector_list.append(1)
        else:
            vector_list.append(0)
# encoding=utf-8
# jieba's three segmentation modes
import jieba

str = "2018汉马全马男子冠军诞生!摩洛哥选手卫冕"

seg_list = jieba.cut(str, cut_all=True)
print("全模式: " + "/".join(seg_list))  # full mode
print("-------------------------------------")

seg_list = jieba.cut(str)
print("默认模式: " + "/".join(seg_list))  # default mode = accurate mode
print("-------------------------------------")

seg_list = jieba.cut_for_search(str)  # search engine mode
print("搜索引擎模式: " + "/".join(seg_list))
def cut_search(label):
    seg_list = jieba.cut_for_search(label)
    return seg_list
def get_words_list(df):
    # Collect the search-mode tokens of the title into a plain list first and
    # assign it once; assigning an empty list to the field and appending to it,
    # as the original code did, does not work with pandas.
    words = []
    word_generator = jieba.cut_for_search(df['title'])
    for word in word_generator:
        words.append(word)
    df['words_list'] = words
    return df
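# If df is instead a full pandas DataFrame with a 'title' column (an assumption,
# not stated in the snippet above), the same idea can be applied row by row with
# Series.apply. This is only an illustrative sketch, not the original code.
import jieba
import pandas as pd

def add_words_list_column(frame):
    frame = frame.copy()
    # one token list per title, stored as a new column
    frame['words_list'] = frame['title'].apply(lambda t: list(jieba.cut_for_search(t)))
    return frame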
def read_post(flag):
    stop_words = stopwordslist()
    pre_path = "../Data/weibo/tweets/"
    file_list = [pre_path + "test_nonrumor.txt", pre_path + "test_rumor.txt",
                 pre_path + "train_nonrumor.txt", pre_path + "train_rumor.txt"]
    if flag == "train":
        id = pickle.load(open("../Data/weibo/train_id" + str(fold_id) + ".pkl", 'rb'))
    elif flag == "validate":
        id = pickle.load(open("../Data/weibo/validate_id.pickle", 'rb'))
    elif flag == "test":
        id = pickle.load(open("../Data/weibo/test_id" + str(fold_id) + ".pkl", 'rb'))

    post_content = []
    labels = []
    image_ids = []
    twitter_ids = []
    data = []
    column = ['post_id', 'image_id', 'original_post', 'post_text', 'label', 'event_label']
    key = -1
    map_id = {}
    top_data = []

    for k, f in enumerate(file_list):
        f = open(f, 'r')
        if (k + 1) % 2 == 1:
            label = 0  # real is 0
        else:
            label = 1  # fake is 1

        twitter_id = 0
        line_data = []
        top_line_data = []

        for i, l in enumerate(f.readlines()):
            if (i + 1) % 3 == 1:
                line_data = []
                twitter_id = l.split('|')[0]
                line_data.append(twitter_id)
            if (i + 1) % 3 == 2:
                line_data.append(l.lower())
            if (i + 1) % 3 == 0:
                l = clean_str_sst(l)
                seg_list = jieba.cut_for_search(l)
                new_seg_list = []
                for word in seg_list:
                    if word not in stop_words:
                        new_seg_list.append(word)
                clean_l = " ".join(new_seg_list)
                if len(clean_l) > 10 and line_data[0] in id:
                    post_content.append(l)
                    line_data.append(l)
                    line_data.append(clean_l)
                    line_data.append(label)
                    event = int(id[line_data[0]])
                    if event not in map_id:
                        map_id[event] = len(map_id)
                        event = map_id[event]
                    else:
                        event = map_id[event]
                    line_data.append(event)
                    data.append(line_data)
        f.close()
    # print(data)
    # return post_content
    data_df = pd.DataFrame(np.array(data), columns=column)
    write_txt(top_data)
    return post_content, data_df, len(map_id)
def get_cut_for_search(self, sentence):
    return jieba.cut_for_search(sentence, HMM=False)
import jieba
import time

jieba.initialize()  # initialize the jieba dictionary manually
time.sleep(1)

s = u'我想去北京故宫博物院参观和闲逛。'

cut = jieba.cut(s)
# print cut
print '精确模式-----------------------------'
print ','.join(cut)

print '全模式------------------------------'
print ','.join(jieba.cut(s, cut_all=True))

print '搜索引擎模式-------------------------'
print ','.join(jieba.cut_for_search(s))

print '获取词性----------------------------'
import jieba.posseg as psg
# print [(x.word,x.flag) for x in psg.cut(s)]
for x in psg.cut(s):
    print x.word + " " + x.flag + ",",

print '\n只获取名词--------------------------'
# print [(x.word,x.flag) for x in psg.cut(s) if x.flag.startswith('n')]
for x in psg.cut(s):
    if x.flag.startswith('n'):
        print x.word + " " + x.flag + ",",
print ''

# parallel segmentation
# enable parallel mode; the argument is the number of worker processes
jieba.enable_parallel(5)
import jieba

# accurate mode
seg_list = jieba.cut("我去过清华大学和北京大学。")
print("精确模式: " + "/".join(seg_list))

# full mode
seg_list = jieba.cut("我去过清华大学和北京大学。", cut_all=True)
print("全模式: " + "/".join(seg_list))

# search engine mode
seg_list = jieba.cut_for_search("我去过清华大学和北京大学。")
print("搜索引擎模式: " + "/".join(seg_list))

# new word discovery in accurate/full mode: "杭研" is not in the dictionary but is still recognized by the HMM (Viterbi) model
seg_list = jieba.cut("他来到了网易杭研大厦", HMM=True)
print("精确模式/全模式-新词发现: " + "/".join(seg_list))

# new word discovery in search engine mode: "杭研" is not in the dictionary but is still recognized by the HMM (Viterbi) model
seg_list = jieba.cut_for_search("他来到了网易杭研大厦", HMM=True)
print("搜索引擎模式-新词发现: " + "/".join(seg_list))
print('=' * 40)
print('1. 分词')
print('-' * 40)

prpaStr = "我来到北京清华大学,看到让我蓝瘦香菇的word哥,真是让人无语。"

seg_list = jieba.cut(prpaStr, cut_all=True)
print("全模式分词: \n" + "/ ".join(seg_list))  # full mode

seg_list = jieba.cut(prpaStr, cut_all=False)
print("默认模式分词: \n" + "/ ".join(seg_list))  # default (accurate) mode

seg_list = jieba.cut(prpaStr)
print("自定义分隔符分词:\n" + ",".join(seg_list))

seg_list = jieba.cut_for_search(prpaStr)  # search engine mode
print("搜索引擎模式:\n" + ",".join(seg_list))

print("\n" * 5 + '=' * 40)
print('2. 添加自定义词典/调整词典')
print('-' * 40)

prpaStr1 = '如果放到post中将出错。'
print("未调整词典的分词:\n" + '/'.join(jieba.cut(prpaStr1, HMM=False)))
print(jieba.suggest_freq(('中', '将'), True))
print("调整词典的分词:\n" + '/'.join(jieba.cut(prpaStr1, HMM=False)))

prpaStr2 = '「台中」正确应该不会被切开'
#encoding=utf-8
import sys
import jieba

# dic_path = 'sougou.txt'
# #dic_path = dic_path.encode('utf8')
# #print type(dic_path)
# jieba.load_userdict(dic_path)
# print(", ".join(jieba.cut("胆碱酯酶减少胆碱脂酶试剂盒胆碱酯酶试纸胆碱酯酶增加")))

#encoding=utf-8
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode:", "/ ".join(seg_list))  # full mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode:", "/ ".join(seg_list))  # accurate mode

seg_list = jieba.cut("他来到了网易杭研大厦")  # accurate mode is the default
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业后在日本京都大学深造")  # search engine mode
print(", ".join(seg_list))

import jieba.posseg as pseg
words = pseg.cut("我爱北京天安门")
for w in words:
    print(w.word, w.flag)
""" import jieba """ 算法: 基于前缀词典实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图 (DAG) 采用了动态规划查找最大概率路径, 找出基于词频的最大切分组合 对于未登录词,采用了基于汉字成词能力的 HMM 模型,使用了 Viterbi 算法 """ """ 1. 分词 jieba.cut 方法接受三个输入参数: 需要分词的字符串;cut_all 参数用来控制是否采用全模式;HMM 参数用来控制是否使用 HMM 模型 jieba.cut_for_search 方法接受两个参数:需要分词的字符串;是否使用 HMM 模型。该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细 待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。注意:不建议直接输入 GBK 字符串,可能无法预料地错误解码成 UTF-8 jieba.cut 以及 jieba.cut_for_search 返回的结构都是一个可迭代的 generator,可以使用 for 循环来获得分词后得到的每一个词语(unicode),或者用 jieba.lcut 以及 jieba.lcut_for_search 直接返回 list jieba.Tokenizer(dictionary=DEFAULT_DICT) 新建自定义分词器,可用于同时使用不同词典。jieba.dt 为默认分词器,所有全局分词相关函数都是该分词器的映射。 """ #代码示例 seg_list = jieba.cut("我来到北京清华大学", cut_all=True) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 seg_list = jieba.cut("我来到北京清华大学", cut_all=False) print("Default Mode: " + "/ ".join(seg_list)) # 精确模式 seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式 print(", ".join(seg_list)) seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 print(", ".join(seg_list))
from sklearn import metrics

if __name__ == '__main__':
    #f = open("C:\\Users\\Administrator\\Desktop\\python note\\craw\\taobaomm\\sj_names.txt")
    module_path = dirname(__file__)
    f = open(join(module_path, 'sj_names.txt'))
    class_list = []
    term_str = []
    try:
        for line in f:
            lt = line.split(',')
            if lt[1] == '全部':  # skip the "all categories" entries
                continue
            class_list.append(lt[0])
            temstr = lt[2].split('(')
            seg_list = jieba.cut_for_search(temstr[0])  # search engine mode
            terlist = ", ".join(seg_list)  # join the tokens into a string
            try:
                term_str.append([
                    term.strip() for term in terlist.split(',')
                    if len(term.strip()) > 1
                ])  # strip whitespace and convert to a list
            except UnicodeEncodeError:
                print 'err'
    finally:
        f.close()

    fs = fj.Feature_select()
    term_vec = fs.transform(term_str)
    x_train, x_test, y_train, y_test = train_test_split(term_vec, class_list, test_size=0.2)
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import numpy
import matplotlib.pyplot as plt
from os import path

str = "明明知识点都熟记于心,可是在考试的时候脑子一片空白,什么都想不起来了"

# Using a custom dictionary:
# format of the custom dictionary file: one word per line; each line has up to three parts,
# the word, its frequency, and its POS tag (the last two are optional), separated by spaces
# jieba.load_userdict('dict.txt')

ex_list1 = jieba.cut(str)
ex_list2 = jieba.cut(str, cut_all=True)
ex_list3 = jieba.cut_for_search(str)
print("精准模式:" + '/'.join(ex_list1))
print("全模式:" + '/'.join(ex_list2))
print("搜索引擎模式:" + '/'.join(ex_list3))
# full mode and search engine mode split the text into smaller pieces than accurate mode

# define the absolute base path
__file__ = r"/Users/jiaxiaopeng/"
# turn the path string into a file path
d = path.dirname(__file__)
# read the image with PIL's open and turn it into an array with numpy.array
backgroud_Image = numpy.array(Image.open(path.join(d, "111.jpg")))
# draw the word cloud
wc = WordCloud(
    background_color='white',  # background color, should match the background of the image
import jieba

# aa=jieba.cut('IHS Markit的最新调查报告称,苹果可能会为 iPhone 和 Apple Watch的屏幕长期采用一种全新的节能背板技术,有助于延长其电池续航时间。IHS认为未来的iPhone中改用LTPO TFT(低温多晶硅氧化物)背板,从理论上估算 LTPO可以比LTPS 节省 5 ~ 15% 的功耗,从而延长 iPhone的电池续航时间。',cut_all=True)
# print('Full Mode:'+'/'.join(aa))
#
# bb=jieba.cut('IHS Markit的最新调查报告称,苹果可能会为 iPhone 和 Apple Watch的屏幕长期采用一种全新的节能背板技术,有助于延长其电池续航时间。IHS认为未来的iPhone中改用LTPO TFT(低温多晶硅氧化物)背板,从理论上估算 LTPO可以比LTPS 节省 5 ~ 15% 的功耗,从而延长 iPhone的电池续航时间。',cut_all=False)
# print('Default Mode:'+'/'.join(bb))  # cut_all defaults to False
#
bb = jieba.cut_for_search('IHS Markit的最新调查报告称,苹果可能会为 iPhone 和 Apple Watch的屏幕长期采用一种全新的节能背板技术,有助于延长其电池续航时间。IHS认为未来的iPhone中改用LTPO TFT(低温多晶硅氧化物)背板,从理论上估算 LTPO可以比LTPS 节省 5 ~ 15% 的功耗,从而延长 iPhone的电池续航时间。', HMM=True)
print('/'.join(bb))
# bb is a generator, so the first join above consumes it; the repeated joins below print empty strings
print('/'.join(bb))
print('/'.join(bb))
print('/'.join(bb))

pp = 'IHS Markit的最新调查报告称,苹果可能会为 iPhone 和 Apple Watch的屏幕长期采用一种全新的节能背板技术,有助于延长其电池续航时间。'
# coding: utf-8
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# set the file paths
dir = path.dirname('.')
text1 = open(path.join(dir, 'chinese.txt')).read()
text2 = jieba.cut_for_search(text1)
text_ch = " ".join(text2)

# set the word-cloud mask
mask_coloring = imread(path.join(dir, "mask.png"))
wc = WordCloud(font_path='simsun.ttf', mask=mask_coloring,
               background_color="white", max_words=2000, max_font_size=80,
               random_state=80)

# generate the word cloud
wc.generate(text_ch)
image_colors = ImageColorGenerator(mask_coloring)

# original colors
plt.figure()
plt.imshow(wc)
plt.axis("off")

# draw the word cloud
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))  # recolor using the colors of the mask image
plt.axis("off")

# draw the word cloud
plt.figure()
import jieba
import jieba.analyse
from collections import Counter

# CsvFile = csv.reader(open('/Users/sallyfan/desktop/csat2017.csv'))
# contents = []
# # for i in CsvFile:
# #     contents1 = contents.append(i)
# print(type(CsvFile))
# for i in CsvFile:
#     print(i)

openfile = open('/Users/sallyfan/desktop/cusfeedbacks.txt')
# openfile = open('/Users/sallyfan/desktop/csat.txt')
# for i in file:
#     print(i)
file = []
for i in openfile:
    file.append(i)
# print(file)
finalfile = "".join(file)
# print(type(finalfile))

cutwords = jieba.cut_for_search(finalfile)
jieba.suggest_freq(('充电桩', '30米', '特斯拉'), True)

cipin = jieba.analyse.textrank(finalfile, topK=30, allowPOS=('a', 'v'), withFlag=False)
print(cipin)
cipin2 = jieba.analyse.extract_tags(finalfile, topK=30, allowPOS=('a', 'v', 'ver'), withFlag=False)
print(cipin2)

# count was never assigned in the original because this line was commented out
count = Counter(cutwords).most_common(30)
print(count)
def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
    for word in result:
        print(word, "/", end=' ')
    print("")
import jieba
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities

l1 = ["你的名字是什么", "你今年几岁了", "你有多高你胸多大", "你胸多大"]
a = "你今年多大了"

all_doc_list = []
for doc in l1:
    # doc = "你的名字是什么"
    doc_list = [word for word in jieba.cut_for_search(doc)]
    all_doc_list.append(doc_list)
# all_doc_list = [['你', '的', '名字', '是', '什么'], ['你', '今年', '几岁', '了'], ['你', '有', '多', '高', '你', '胸多大'], ['你', '胸多大']]
print(all_doc_list)

doc_test_list = [word for word in jieba.cut_for_search(a)]
# doc_test_list = [你, 今年, 多, 大, 了]

# build the corpus
dictionary = corpora.Dictionary(all_doc_list)  # build the bag of words
# {'你': 1, "的": 2, "名字": 3}
# Understanding the bag of words:
# a bag of words arranges many words into a dictionary that maps each word (key) to an id (value),
# e.g. {'什么': 0, '你': 1, '名字': 2, '是': 3, '的': 4, '了': 5, '今年': 6, '几岁': 7, '多': 8, '有': 9, '胸多大': 10, '高': 11}
# what it is used for becomes clear further below
# dictionary is the bag of words
# dictionary = {'什么': 0, '你': 1, '名字': 2, '是': 3, '的': 4, '了': 5, '今年': 6, '几岁': 7, '多': 8, '有': 9, '胸多大': 10, '高': 11}
print("token2id", dictionary.token2id)
print("dictionary", dictionary, type(dictionary))
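# A minimal illustrative sketch of what the word-to-id map described above is used
# for next (an assumed continuation, not the original author's code): doc2bow turns
# each token list into (id, count) pairs that downstream gensim models consume.
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
doc_test_vec = dictionary.doc2bow(doc_test_list)
print("corpus", corpus)
print("doc_test_vec", doc_test_vec)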
import datetime
import jieba
import jieba.posseg as psg
from jieba import analyse

# jieba POS tagging
sent = "六月12"
seg_list = psg.cut(sent)
print(' '.join(['{0}/{1}'.format(w, t) for w, t in seg_list]))

# segmentation - full mode
seg_list = jieba.cut(sent, cut_all=True)
print('/'.join(seg_list))

# segmentation - accurate mode
seg_list = jieba.cut(sent, cut_all=False)
print('/'.join(seg_list))

# segmentation - search engine mode
seg_list = jieba.cut_for_search(sent)
print('/'.join(seg_list))

# print(str(datetime.today().strftime('%Y年%m月%d日')))

# jieba POS tagging
# load a user-defined dictionary
jieba.load_userdict("dict.txt")
sent = '中文分词是文本处理中不可或缺的一部分,魏亚通今天很不错,支静阳今天也很不错'
seg_list = psg.cut(sent)
print(' '.join(['{0}/{1}'.format(w, t) for w, t in seg_list]))

# jieba keyword extraction
# import the TF-IDF keyword extraction interface
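# The snippet above breaks off at the TF-IDF keyword-extraction step; a minimal
# illustrative sketch of that interface (not the original author's code), using
# jieba.analyse.extract_tags on the sentence defined above:
keywords = analyse.extract_tags(sent, topK=5, withWeight=True)
for word, weight in keywords:
    print(word, weight)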
#!/usr/bin/env python3
"""Map example."""
import sys
import re

import jieba

for line in sys.stdin:
    tokens = line.split('\t')
    id = tokens[0]
    content = re.sub(r'[^a-zA-Z0-9\u4e00-\u9fff]+', ' ', tokens[2])
    for word in jieba.cut_for_search(content):
        if word.strip() != '':
            print(id + ' ' + word.lower() + '\t1')
    'subtitles/Dodo_Village.txt',
    'subtitles/Empty_Bottle_King.txt',
    'subtitles/Gamker.txt',
    'subtitles/Hello_Catie.txt',
    'subtitles/Huan.txt',
    'subtitles/Little_Hot_Sing.txt',
    'subtitles/Lulu.txt',
    'subtitles/Table_Games_Taichung.txt'
]

# with open('text/CYFIT.txt', 'r') as input:
for youtuber in youtubers:
    temp = youtuber.split('/')
    print(temp)
    data[temp[1]] = []
    with open(youtuber) as input1:
        # item_now = ''
        for i, item in enumerate(input1):
            data[temp[1]] += [
                t for t in jieba.cut_for_search(item) if t not in stops
            ]
            # if re.match(r'(.*?).wav', item):
            #     #print(item)
            #     item_now = item.strip()
            #     data[item_now] = [t for t in jieba.cut_for_search(item) if t not in stops] + [t for t in jieba.cut_for_search(item) if t not in stops]
            #     #print(data)
            # else:
            #     #print(item)
            #     data[item_now] += [t for t in jieba.cut_for_search(item) if t not in stops]
            #terms = [t for t in jieba.cut_for_search(item) if t not in stops]
            # print(sorted(Counter(terms).items(), key=lambda x:x[1], reverse=True))
    #print(data)
    #break
    # if(terms != []):
    #     done += terms
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
import tensorflow as tf
import jieba
import opencc

jieba.set_dictionary('dict_v2.txt')

with open('stopwords_only_symbol_v2.txt', 'r', encoding='utf8') as f:
    stops_symbol = f.read().split('\n')

input_str = input('請輸入文字:')  # enter a news headline
converter = opencc.OpenCC('s2twp.json')
s2twp_str = converter.convert(input_str)
jieba_str = ' '.join([t for t in jieba.cut_for_search(str(s2twp_str)) if t not in stops_symbol])
input_data_np = np.array([jieba_str])

vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore('search_jieba_no_stopwords_train_vocab.pickle')
input_data_pd = np.array(list(vocab_processor.transform(input_data_np)))

tf.reset_default_graph()
saver = tf.train.import_meta_graph("Saved_model/search_jieba_no_stopwords_train_vocab.ckpt.meta")
with tf.Session() as sess:
    saver.restore(sess, "Saved_model/search_jieba_no_stopwords_train_vocab.ckpt")
    prob_and_ans = {"Placeholder:0": input_data_pd, "Placeholder_2:0": 1}
    prob = sess.run("probability:0", feed_dict=prob_and_ans)
    ans = sess.run("ans:0", feed_dict=prob_and_ans)
    print(f'probability: {prob}')  # print the higher probability
    print(f'ans: {ans}')  # print real or fake (1 = real, 0 = fake)