Example #1
def set_keyword(docId, db, question=None, answer=None):
    cursor = db.cursor()
    sqlSelect = "select * from chatrobot where id = %d"
    sqlInsert = "INSERT INTO key_word(id,word) VALUES (%d,'%s')"
    if question is None or answer is None:
        cursor.execute(sqlSelect%(docId))
        data = cursor.fetchone()
        question = data[2]
        answer = data[3]
    question_set = set(jieba.cut_for_search(question))|set(question)
    answer_set = set(jieba.cut_for_search(answer))|set(answer)
    inter = question_set & answer_set
    sqlSelect = "SELECT count(*) from chatrobot where question like '%%%s%%'"
    minCount = 10000
    keyWord = ''
    for i in inter:
        cursor.execute(sqlSelect % i)
        count = cursor.fetchone()[0]
        if count < minCount:
            minCount = count
            keyWord = i
    if keyWord == u',':
        keyWord = ''
    # print "keyword:",keyWord
    cursor.execute(sqlInsert % (docId, keyWord))
    db.commit()
Example #2
def save():
   for course in Courses.query.all():
       seg_list = jieba.cut_for_search(course.name)
       str = '/'.join(seg_list)
       results = str.split('/')
       results.append(course.name)
       for result in results:
           if(Search.query.filter_by(name=result).first() == None):
               s = Search(name=result)
               s.courses.append(course)
               db.session.add(s)
               db.session.commit()
           elif(course not in Search.query.filter_by(name=result).first().courses.all()):
               s = Search.query.filter_by(name=result).first()
               s.courses.append(course)
               db.session.add(s)
               db.session.commit()

   for tag in Tags.query.all():
       seg_list = jieba.cut_for_search(tag.name)
       str = '/'.join(seg_list)
       results = str.split('/')
       results.append(tag.name)
       for result in results:
           if(Search.query.filter_by(name=result).first() == None):
               s = Search(name=result)
               s.tags.append(tag)
               db.session.add(s)
               db.session.commit()
           elif(tag not in Search.query.filter_by(name=result).first().tags.all()):
               s = Search.query.filter_by(name=result).first()
               s.tags.append(tag)
               db.session.add(s)
               db.session.commit()
Example #3
 def get_key_words(self,question):
     keywords = jieba.cut_for_search(question)
     keywordslist = list(keywords)
     if len(keywordslist)!=0:
         return {}.fromkeys(keywordslist).keys()
     else:
         return question
	def endElement(self, tag):
		if self.CurrentData == "text":
			if self.title.startswith('Wikipedia:'):
				print "Skip", self.title
				self.title = ""
				return
			print self.title
			print len(self.text)
			time0 = time.time()

			line = Converter('zh-hans').convert(self.text.decode('utf-8'))
			self.text = line.encode('utf-8')

			#words = pseg.cut(self.text)
			time_set=time.time()
			words = jieba.cut_for_search(self.text)
			sentenceStart = True
			for w in words:
				self.file.write(w + ' ')

			print time.time() - time0

			self.counter += 1
			self.title = ""

			print "Counter", self.counter

		self.CurrentData = ""
		self.text = ""
Example #5
def new_course():
    """
    Create a new course.
    :return:
    """
    # request.get_json.get('item', 'default')
    if request.method == "POST":
        course = Courses.from_json(request.get_json())
        db.session.add(course)
        db.session.commit()
        generator = jieba.cut_for_search(course.name)
        seg_list = '/'.join(generator)
        results = seg_list.split('/')
        if course.name not in results:
            results.append(course.name)
        for seg in results:
            s = Search.query.filter_by(name=seg).first()
            if not s:
                s = Search(name=seg)
            s.courses.append(course)
            db.session.add(s)
            db.session.commit()
        return jsonify({
            'id': course.id
        }), 201
Example #6
 def tokens(self, intext):
     intext = u' '.join(intext.split())
     if self.mode == 's':
         token_list = jieba.cut_for_search(intext)
     else:
         token_list = jieba.cut(intext)
     return [token for token in token_list if token.strip() != u'' and not token in self.stopword_set]
	def Search(this, searchString, sortOrder):
		seglist = jieba.cut_for_search(searchString)
		timeSummary = {}
		articleList = []
		topicList = []
		this.SearchArticle(seglist, sortOrder)
		this.SearchRelated(seglist)
		print('search complete')
		i = 1
		for article in this.articleList:
			for post in this.articleDB.find({u'DocID':article[0]}):
				postTime = time.strptime((post[u'Time'].split(' '))[0], u'%Y-%m-%d')
				timeSummary.setdefault(postTime, 0)
				timeSummary[postTime] = timeSummary[postTime] + 1
				articleList.append(post)
			i = i+1
			if i>100:
				break
				
		for topic in this.topicList:
			for post in this.topicDB.find({u'TopicID':topic[0]}):
				topicList.append(post)
		
		finalResult = {
			u'Article': articleList,
			u'Topic': topicList,
			u'Summary': sorted(timeSummary.iteritems(),cmp = lambda x,y:cmp(x[0],y[0]))
			}
		return finalResult
Example #8
def _fields_txt_2_dict(*txts):

	# txt1 = txts[0].encode('utf-8')
	# term_dict = seg_txt_2_dict(txt1)
	# for key in term_dict.iterkeys():
	# 	term_dict[key] = 3

	# for txt in txts[1:]:
	# 	txt = txt.encode('utf-8')
	# 	d = seg_txt_2_dict(txt)
	# 	term_dict.update(d)
	# return term_dict

	term_dict = {}
	for txt in txts:
		txt = txt.encode('utf-8')
		seg_list = cut_for_search(txt)
		for seg in seg_list:
			value = term_dict.get(seg)
			if value is None:
				term_dict[seg] = 1
			else:
				term_dict[seg] = value + 1

	return term_dict
Example #9
def find_dian_word():
    pkl_file = open('../data/new_words.pkl', 'rb')
    preprocessed_word_lists = pkl.load(pkl_file)
    word = []
    count = 0
    tmpt = []
    ci =['电','网','磁','流','感','源','揽','频','耦',
         '热','压','场','量','信','圈','耗','能','建',
         '机','燃','控','负','巡','阻','匝','线','度',
         '势','经','缘','贮','波','气','障','操','微',
         '谐','联','监','光','趋']
    for ele in preprocessed_word_lists:
        count = count + 1
        for each in ele:
            if '电' in each:
                word.append(each)
        sys.stdout.write('generated:{0}/total:{1}\r'.format(count, 950018))
        sys.stdout.flush()
    new_word = []
    for i in word:
        ss = '/'.join(jieba.cut_for_search(i, HMM=False))
        ss = ss.split('/')
        for eve in ss:
            for ele in ci:
                if ele in eve and len(eve)>=2:
                    new_word.append(eve)
    df = pd.DataFrame()
    df[''] = list(set(new_word))
    df.to_csv('9-27-one_2.csv',encoding='utf-8',index=False)
    print(len(set(new_word)))
    return new_word
Example #10
def tokenize(text):
	tokens = []

	text = preprocess(text)
	tokens += ASCII_SLUG_RE.findall(text)	# ASCII tokens are already usable

	for unit in CJK_SLUG_RE.findall(text):	# CJK tokens need extraction
		# Search engine mode. Might return ambiguous result
		unit_tokens = list(jieba.cut_for_search(unit))

		# Make better word guessing by joining non-conjunction words
		i = 0
		length = len(unit_tokens)
		while i < length:
			j = i
			buf = ''
			while j < length:
				token = unit_tokens[j]
				if token in CONJUNCTIONS or len(token) > 1:
					break
				else:
					buf += token
					j += 1

			if len(buf) > 1 and buf not in unit_tokens:
				unit_tokens.append(buf)
			
			i = j + 1

		tokens.extend(unit_tokens)

	return tokens
Example #11
def make_index():
	dbfile=file("tieba.json")
	dat=dbfile.read()
	datas=dat.split('\n')
	database = xapian.WritableDatabase('indexes/', xapian.DB_CREATE_OR_OPEN)
	#stemmer = xapian.Stem("english")
	for data in datas:
		try:
			ddata=eval(data)
			use_data={}
			use_data["title"]=ddata["title"]
			reply={}
			reply["content"]=ddata["reply"]["content"]
			reply["name"]=ddata["reply"]["name"]
			reply["time"]=ddata["reply"]["time"]
			use_data["reply"]=reply
			doc = xapian.Document()
			doc.set_data(str(use_data))
			use_data=str(ddata["reply"]["name"])+str(ddata["reply"]["time"])+str(ddata["reply"]["content"])+str(ddata["title"])
			for term in jieba.cut_for_search(str(use_data)):
				doc.add_term(term.encode('utf-8'))
			database.add_document(doc)
		except:
			pass	
	database.commit()
	dbfile.close()
Example #12
def put_course(id):
    """
    Update a course.
    """
    course = Courses.query.get_or_404(id)
    if request.method == "PUT":
        data_dict = eval(request.data)
        course.name = data_dict.get('name', course.name)
        course.teacher = data_dict.get('teacher', course.teacher)
        course.category_id = data_dict.get('category_id', course.category_id)
        course.subcategory_id = data_dict.get('sub_category_id', course.subcategory_id)
        course.type_id = data_dict.get('type_id', course.type_id)
        db.session.add(course)
        db.session.commit()
        generator = jieba.cut_for_search(course.name)
        seg_list = '/'.join(generator)
        results = seg_list.split('/')
        if course.name not in results:
            results.append(course.name)
        for seg in results:
            s = Search(name=seg)
            s.courses.append(course)
            db.session.add(s)
            db.session.commit()
    return jsonify({'update': id}), 200
Example #13
def cut_doc(doc):

    text = []
    url = re.findall("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", doc)
    for link in url:
        doc = doc.replace(link, "")  # str has no .remove(); strip each matched URL from the text instead
    words = jieba.cut_for_search(doc)

    chcontent = []
    encontent = []
    # Used regular expression to split Chinese and English
    ch = re.compile("[\u4e00-\u9fa5]")
    en = re.compile("[^\u4e00-\u9fa5]")

    for word in words:

        if ch.match(word):
            # chcontent.append(str(word.encode("utf-8")))
            chcontent.append(word)
        else:
            encontent.append(word.lower())
    Tokens = chcontent + encontent + url
    removewords = [" ", "", "~", "~"]
    filted = [x for x in Tokens if x not in removewords]
    phrase = [x for x in filted if len(x) > 1]
    print([x for x in phrase])
    return phrase
Example #14
 def testCutForSearch(self):
     for content in test_contents:
         result = jieba.cut_for_search(content)
         assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
         result = list(result)
         assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
         print >> sys.stderr, " , ".join(result)
Example #15
    def search(self, qstr, topn=None, limit=None, extend=True):
        if not qstr:
            print "query must not be empty"
            #sys.exit(701)
            return

        start_query = datetime.datetime.now()
        pre_query = datetime.timedelta()
        begin_index = datetime.datetime.now()

        raw_qs = wfilter(jieba.cut_for_search(qstr))

        end_index = datetime.datetime.now()
        pre_query += (end_index - begin_index)

        sortResult, Qs, raw_qs, hit, index_cost = self._search(raw_qs, topn=topn, limit=limit, extend=extend)
        
        index_cost += pre_query     # add time costs of pre_query
        index_cost = index_cost.total_seconds()

        end_query = datetime.datetime.now()
        delta_query = end_query - start_query
        delta_query = delta_query.total_seconds()

        return sortResult, Qs, raw_qs, hit, index_cost, delta_query
Example #16
def preprocess_query_str(query_str):
    result = []
    keywords = [keyword for keyword in query_str.split(" ") if keyword.strip() != ""]
    for keyword in keywords:
        cutted_keyword = " ".join(["%s" % term for term in jieba.cut_for_search(keyword)])
        result.append(cutted_keyword)
    return result
Example #17
def CHseparatewords(text):
    
    seg_list = jieba.cut_for_search(text)
    result = []
    for seg in seg_list:
        result.append(seg)
    return result
Example #18
def get_search_videos(request):
    try:
        if request.GET.has_key('title'):
            q_titles = request.GET['title'].encode('utf8')
            seg_list = jieba.cut_for_search(q_titles)
            seg_list = list(seg_list)
            if "" in seg_list:
                seg_list.remove("")
            if " " in seg_list:
                seg_list.remove(" ")

            if getLen(seg_list) == 0:
                return None
                # return Video.objects.all()

            temp = []
            for i in range(len(seg_list)):
                if str(seg_list[i].encode("utf8")) not in stop_list:
                    print seg_list[i]
                    temp.append(seg_list[i])
                    # seg_list.remove(seg_list[i])
                    # continue
                # elif seg_list[i].encode("utf8") not in key_:
                    # seg_list.remove(seg_list[i])
            seg_list = temp
            if getLen(seg_list) == 0:
                return None

            q_title = seg_list[0]
            videos = Video.objects.filter(Q(title__icontains=q_title)|Q(kind_str__icontains=q_title)|Q(tags_str__icontains=q_title)).all()
            for i in range(1, len(seg_list)):
                q_title = seg_list[i]
                videos = videos | Video.objects.filter(Q(title__icontains=q_title)|Q(kind_str__icontains=q_title)|Q(tags_str__icontains=q_title)).all()

            # ids = []
            # for seg in seg_list:
                # if seg.encode("utf8") in key_:
                    # v_ids = video_[key_.index(seg.encode("utf8"))]
                    # # print v_ids
                    # for v in v_ids.split(','):
                        # ids.append(v)

            # ids =  list(set(ids))
            # videos = None
            # if getLen(ids) >= 1:
                # q_id = ids[0]
                # videos = Video.objects.filter(id=q_id).all()
                # for i in range(1, len(ids)):
                    # q_id = ids[i]
                    # videos = videos | Video.objects.filter(id=q_id).all()

            # if getLen(videos) == 0:
                # return Video.objects.all()
            return videos

        else:
            return Video.objects.all()

    except Exception, e:
        printError("search:"+str(e))
def jiebasplit(stringlist):
    stringset = set()
    for string in stringlist:
        string_seg_list = jieba.cut_for_search(string)
        string_seg_set = set(string_seg_list)
        stringset = stringset | string_seg_set
    return stringset
Example #20
    def split(self, input):
        chinese = []
        if JIEBA:
            chinese = list(jieba.cut_for_search(input))

        latin1 = self.latin1_letters.findall(input)
        return chinese + latin1
Example #21
def getWords(doc):
	# _mood = set()
	# for i in re.findall(r'\[\S+?\]',doc.decode('utf-8')):  
	# 	# print 'data:' +i  #心情表情
	# 	_mood.add(i)
	# 	doc = doc.replace(i, '')
	# return dict([(w,1) for w in jieba.cut(doc)])
	# _mood = set()
	# regxs = {r'\[\S+?\]': '', r'//@.*:': ''}
	# for key,value in regxs.items():
	# 	print "pre:%s"%doc
	# 	doc = re.sub(key, value, doc, flags=re.IGNORECASE)
	# 	print "aft:%s"%doc
	# return dict([(w,1) for w in jieba.cut_for_search(doc)])
	_mood = set()
	regxs = {r'\[.*\]': '', r'//@.*:': ''}
	doc = doc.decode('utf8')  # decode once; re-decoding the already-unicode doc on later iterations raises UnicodeEncodeError
	for key,value in regxs.items():
	  # print "pre:%s"%doc
	  doc = re.sub(key, value, doc, flags=re.IGNORECASE)
	  # print "aft:%s"%doc
	# table = string.maketrans("", "")
	# doc.translate(table, string.punctuation)
	regex = re.compile('[%s]' % re.escape(string.punctuation))
	doc = regex.sub('', doc)
	print "reg:%s" % doc
	res = dict()
	for w in jieba.cut_for_search(doc):
		if w in string.punctuation+extra_punctuation or len(w)<2:
			print "s", w
		else:
			res[w] =1
	return res
Example #22
File: min.py Project: ziyueit/min
 def add_content(self, content, obj_key):
     """
      Add a document to the index.
     """
     seg_list = jieba.cut_for_search(content)
     seg_list = min_nlp.get_weight(seg_list)
     self.add_word_index(seg_list, obj_key)
Example #23
File: min.py Project: ziyueit/min
 def search(self, keywords, start=0, length=20):
     """
      Search by keywords.
     """
     seg_list = list(jieba.cut_for_search(keywords))
     key_list = self.search_by_words(seg_list, start, length)
     return key_list
def dosearch(query):
    weight = 0
    raw_query = lower_letters(query)
    query = query_parser(query)
    query = query + [raw_query]
    id_list = []
    res_name = []
    weight = {}
    if query:
        for term in query:
            if term in t_inverted_index:
                for key, value in t_inverted_index[term].iteritems():
                    if key not in weight:
                        weight[key] = tdxidf_weighting(term, key)
                    else:
                        weight[key] = weight[key] + tdxidf_weighting(term, key)
                    if key not in id_list:
                        id_list.append(key)

        rank_list = calc_vector_space(query, id_list)
        rank_fin = []
        q = list(set(jieba.cut_for_search(raw_query)))
        if u" " in q:
            q.remove(u" ")
        cnt = []
        for key_index, key in reversed(list(enumerate(rank_list))):
            info_term = list(set(jieba.cut_for_search(id_info_list[key])))
            if u" " in info_term:
                info_term.remove(u" ")
            for term in q:
                if term in info_term:
                    cnt.append(key)

        freq_cnt = Counter(cnt)
        freq_cnt_tuples = freq_cnt.most_common()
        for item, cnt in freq_cnt_tuples:
            rank_fin.append(item)

        for item in rank_list:
            if item not in rank_fin:
                rank_fin.append(item)
        if id_list:
            for ids in rank_fin:
                res = os.path.splitext(doc_id_list[ids])[0]
                res = res[7:]
                res_name.append(res)
        return res_name
Example #25
    def split(self, input):
        # type: (unicode) -> List[unicode]
        chinese = []  # type: List[unicode]
        if JIEBA:
            chinese = list(jieba.cut_for_search(input))

        latin1 = self.latin1_letters.findall(input)
        return chinese + latin1
def jiebasplit2(stringlist):   # test the grouping of keyword OR queries; the results are not ideal
    stringsetlist = list()
    for string in stringlist:
        substringset = set()
        string_seg_list = jieba.cut_for_search(string)
        substringset = set(string_seg_list)
        stringsetlist.append(substringset)
    return stringsetlist
Example #27
 def search(self, query):
     keys = jieba.cut_for_search(query)
     p = self._search(keys)
     result = []
     while p is not None:
         result.append(p.word)
         p = p.next
     return result;
Example #28
 def testCutForSearch_NOHMM(self):
     for content in test_contents:
         result = jieba.cut_for_search(content,HMM=False)
         assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
         result = list(result)
         assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
         print(" , ".join(result), file=sys.stderr)
     print("testCutForSearch_NOHMM", file=sys.stderr)
Example #29
def segment(content):
    """
    Use jieba to segment fin and write the result to fout.
    """
    print 'segment...please wait..'
    words = ' '.join(jieba.cut_for_search(content))
    f = file(fout, 'wb')
    f.write(words.encode('utf8'))
Example #30
def cut_search(data):
    '''
    Search-engine mode: building on accurate mode, long words are cut again, e.g.
    来到北京大学 --> 来到/北京/大学/北京大学
    '''
    temp_result = jieba.cut_for_search(data)
    temp_result = '/'.join(temp_result)
    return temp_result
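A minimal usage sketch of the helper above, assuming jieba is already imported; the expected output is the one quoted in the docstring:

# should print: 来到/北京/大学/北京大学
print(cut_search('来到北京大学'))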
Example #31
 def gen_keywords(self, text):
     result_list = list(jieba.cut_for_search(text))
     return self.__trim_stop_word__(result_list)
Example #32
import jieba

#全模式
text = "我来到北京清华大学"
seg_list = jieba.cut(text, cut_all=True)
print( u"[全模式]: ", "/ ".join(seg_list) )

#精确模式
seg_list = jieba.cut(text, cut_all=False)
print(u"[精确模式]: ", "/ ".join(seg_list) )

#默认是精确模式
seg_list = jieba.cut(text)
print( u"[默认模式]: ", "/ ".join(seg_list)  )

# New-word recognition: “杭研” is not in the dictionary, but the Viterbi algorithm still recognizes it
seg_list = jieba.cut("他来到了网易杭研大厦") 
print( u"[新词识别]: ", "/ ".join(seg_list) )

#搜索引擎模式
seg_list = jieba.cut_for_search(text) 
print( u"[搜索引擎模式]: ", "/ ".join(seg_list) )

'''
[全模式]:  我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
[精确模式]:  我/ 来到/ 北京/ 清华大学
[默认模式]:  我/ 来到/ 北京/ 清华大学
[新词识别]:  他/ 来到/ 了/ 网易/ 杭研/ 大厦
[搜索引擎模式]:  我/ 来到/ 北京/ 清华/ 华大/ 大学/ 清华大学
'''
    if i == 0:
        y.append("交通")
    else:
        y.append("计算机")

print y[0], y[1], y[2]

import jieba
"""
jieba's GitHub page (Chinese docs): https://github.com/fxsjy/jieba
"""
x1 = '''

    三个臭皮匠顶个诸葛亮,以此类推,如果能把一个人跟另外100万人的大脑连接起来,就会诞生“超级大脑”。正因如此,现在才出现了好几家公司争相开发脑机界面,希望把人的思维与机器连接起来。如果能够率先将笔记本电脑的功能植入你的大脑,就将为人们开辟一条道路,使之得以随意通过无缝渠道与任何人(甚至任何东西)交换信息。目前有两位IT行业的大佬都在参与这场角逐,他们分别是特斯拉创始人埃隆·马斯克(Elon Musk)和Facebook创始人马克·扎克伯格(Mark Zuckerberg)。他们两人的项目分别名为Neuralink和Building 8。而据知情人士透露,这两个项目都需要对大脑进行外科手术。然而,还有一些没有那么野心勃勃的微创方式,也可以解决脑机界面问题。只需要把脑电波的数据转化成简单的指令,然后由应用或设备进行处理即可。一家名为Nuro的创业公司就采取了这种方式。他们希望借助自己的软件平台,让那些因为严重受伤或疾病而丧失交流能力的人恢复这种能力。
    '''
x2 = "本期企鹅评测团产品——华为MateBook X Pro笔记本电脑。作者是一名普通公务员,同时又是一名数码发烧友,多年来一直沉迷于各种新潮的数码产品,工作以后也不忘通过数码产品提升工作效率。随着笔记本电脑市场竞争的日益激烈,再加上硬件性能不断提升,越来越多的非游戏用户选择使用更加方便携带的超极本,各大厂商自然也是迎合用户需求,推出外观更加靓丽、身材更加小巧、功能更加丰富的超极本。"
seg_list = jieba.cut(x2.strip(), cut_all=True)
# seg_list is an iterable (generator) object
print("Full Mode: " + " ".join(seg_list))  # 全模式

seg_list = jieba.cut("我来到北京清华大学,工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
                     cut_all=False)
print("Precise Mode: " + " ".join(seg_list))  #精确模式,默认状态下也是精确模式

seg_list = jieba.cut_for_search(
    "我来到北京清华大学,工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
print "搜索模式:", " ".join(seg_list)

mcase = {'a': 10, 'b': 34}
mcase_frequency = {v: k for k, v in mcase.items()}
print(mcase_frequency)
Example #34
# -*- coding: utf-8 -*-
import sys
import os
import jieba

sent='在包含問題的所有解的解空間樹中,按照深度優先搜尋的策略,從根節點出發深度優先搜尋解空間樹'

# 全模式
wordList=jieba.cut(sent, cut_all=True)
print(' | '.join(wordList)) 

# 精準切分
wordList=jieba.cut(sent)
print(' | '.join(wordList))

# 搜尋引擎模式
wordList=jieba.cut_for_search(sent)
print(' | '.join(wordList))
Example #35
# -*- coding: utf-8 -*-
# coding=utf-8

import jieba
import jieba.analyse

text = "故宫的著名景点包括乾清宫、太和殿和午门等。其中乾清宫非常精美,午门是紫禁城的正门,午门居中向阳。"

# #jieba.load_userdict("jieba_dict.txt")  # user-defined dictionary (list your own custom words in this text file)
# f = open('jieba_text.txt', 'r', encoding='utf8')  # the text file to segment (treat everything as utf8 to keep things simple)
# lines = f.readlines()
# for line in lines:
#     text += line

# seg_list = jieba.cut(text, cut_all=False)  # accurate mode (accurate mode is the default)
seg_list = jieba.cut(text)  # accurate mode (accurate mode is the default)
for va in seg_list:
    print(va)
print(seg_list)
print("[精确模式]: ", " ".join(seg_list))

seg_list2 = jieba.cut(text, cut_all=True)  #全模式
print("[全模式]: ", " ".join(seg_list2))

seg_list3 = jieba.cut_for_search(text)  #搜索引擎模式
print("[搜索引擎模式]: ", " ".join(seg_list3))

tags = jieba.analyse.extract_tags(text, topK=5)
print("关键词:    ", "  ".join(tags))
Example #36
stopwords = [
    line.strip()
    for line in open('stopwords.txt', encoding='UTF-8').readlines()
]
#一行行读取csv
file_object2 = open(args["file_name"]).read().split('\n')

#建立分词存储列表
Rs1 = []
Rs2 = []
#统计词频的字典
dic = {}

for i in range(len(file_object2)):
    result = []
    #	seg_list = jieba.cut(file_object2[i]) 选择cut的模式
    seg_list = jieba.cut_for_search(file_object2[i])
    #添加源数据列
    result.append(file_object2[i])
    #读取每一行分词
    for w in seg_list:
        if w not in stopwords:
            result.append(w)
            dic[w] = dic.get(w, 0) + 1
            continue
    #把分词写入源列表后面
    Rs1.append(result)

#写入CSV,并用时间命名文件 避免重名
# 08 05 2019 09:49:02 时间格式
doctime = str(time.strftime("%m %d %Y %H:%M:%S", time.localtime()))
mon = doctime[0:2]
    def testTokenize(self):
        vocab_file = 'testdata/vocab_chinese.txt'
        user_dict_files = [
            'data/jieba/hello.txt',
            'naivenlp/tokenizers/data/dict.txt',
        ]
        intervener = DefaultIntervener()
        tokenizer = JiebaTokenizer(
            vocab_file=vocab_file,
            user_dict_files=user_dict_files,
            intervener=intervener,
            pad_token='[PAD]',
            unk_token='[UNK]',
            bos_token='<S>',
            eos_token='<T>',
            cls_token='[CLS]',
            sep_token='[SEP]',
            mask_token='[MASK]',
        )

        self.assertEqual(0, tokenizer.pad_id)
        self.assertEqual(100, tokenizer.unk_id)
        self.assertEqual(104, tokenizer.bos_id)
        self.assertEqual(105, tokenizer.eos_id)
        self.assertEqual(101, tokenizer.cls_id)
        self.assertEqual(102, tokenizer.sep_id)
        self.assertEqual(103, tokenizer.mask_id)

        sentences = [
            '我在上海工作',
            '我来到北京清华大学',
            '乒乓球拍卖完了',
            '中国科学技术大学',
        ]

        for sent in sentences:
            self.assertEqual(
                [t for t in jieba.cut(sent, cut_all=False, HMM=True)],
                tokenizer.tokenize(sent, mode='accurate', hmm=True))
            self.assertEqual(
                [t for t in jieba.cut(sent, cut_all=False, HMM=False)],
                tokenizer.tokenize(sent, mode='accurate', hmm=False))
            self.assertEqual(
                [t for t in jieba.cut(sent, cut_all=True, HMM=True)],
                tokenizer.tokenize(sent, mode='full', hmm=True))
            self.assertEqual(
                [t for t in jieba.cut(sent, cut_all=True, HMM=False)],
                tokenizer.tokenize(sent, mode='full', hmm=True))
            self.assertEqual(
                [t for t in jieba.cut_for_search(sent, HMM=True)],
                tokenizer.tokenize(sent, mode='search', hmm=True))
            self.assertEqual(
                [t for t in jieba.cut_for_search(sent, HMM=False)],
                tokenizer.tokenize(sent, mode='search', hmm=False))

        tokens = tokenizer.tokenize('高级javadeveloper')
        self.assertListEqual(['高级', 'javadeveloper'], tokens)

        intervener.add_split_token('javadeveloper', 'java developer')
        tokens = tokenizer.tokenize('高级javadeveloper')
        self.assertListEqual(['高级', 'java', 'developer'], tokens)

        intervener.add_combine_token('javadeveloper')
        tokens = tokenizer.tokenize('高级javadeveloper')
        self.assertListEqual(['高级', 'javadeveloper'], tokens)

        intervener.remove_combine_token('javadeveloper')
        tokens = tokenizer.tokenize('高级javadeveloper')
        self.assertListEqual(['高级', 'java', 'developer'], tokens)

        intervener.remove_split_token('javadeveloper')
        tokens = tokenizer.tokenize('高级javadeveloper')
        self.assertListEqual(['高级', 'javadeveloper'], tokens)
Example #38
 def nlp_jieba_cut(self, text):
     stop_words = '。,?:@—,、!![]【】《》“”.…#~ '
     self.data['jieba_cut'] = list(
         filter(lambda x: x.strip(stop_words), jieba.cut_for_search(text)))
Example #39
import jieba
import sklearn

s1 = "我来贪心学院学习python"

s1_result = jieba.cut(s1)
print(list(s1_result))

s1_result = jieba.cut(s1, cut_all=True)
print(list(s1_result))

s1_result = jieba.cut_for_search(s1)
print(list(s1_result))

word_vector_list = ["我们", "来", "贪心", "学院", "学习", "人工智能", "和", "Python"]
question = "Python学习多久"
s1 = "我来贪心学院学习Python"
s2 = "我学习人工智能"
s3 = "Python课程的学习周期是多久"

import numpy as np


def get_vector(data):
    vector_list = []

    for i in word_vector_list:
        if i in list(jieba.cut(data)):
            vector_list.append(1)
        else:
            vector_list.append(0)

    return vector_list
Example #40
# encoding=utf-8
#jieba的三种模式
import jieba
str= "2018汉马全马男子冠军诞生!摩洛哥选手卫冕"

seg_list= jieba.cut(str, cut_all=True)
print("全模式: " + "/".join(seg_list)) # 全模式
print("-------------------------------------")

seg_list= jieba.cut(str)
print("默认模式: " + "/".join(seg_list)) # 默认模式= 精确模式
print("-------------------------------------")

seg_list= jieba.cut_for_search(str) # 搜索引擎模式
print("搜索引擎模式: " + "/".join(seg_list))
def cut_search(label):
    seg_list = jieba.cut_for_search(label)
    return seg_list
Example #42
def get_words_list(df):
    df['words_list'] = []
    word_generator = jieba.cut_for_search(df['title'])
    for word in word_generator:
        df['words_list'].append(word)
    return df
    def read_post(flag):
        stop_words = stopwordslist()
        pre_path = "../Data/weibo/tweets/"
        file_list = [pre_path + "test_nonrumor.txt", pre_path + "test_rumor.txt", \
                     pre_path + "train_nonrumor.txt", pre_path + "train_rumor.txt"]
        if flag == "train":
            id = pickle.load(open("../Data/weibo/train_id" + str(fold_id) + ".pkl", 'rb'))
        elif flag == "validate":
            id = pickle.load(open("../Data/weibo/validate_id.pickle", 'rb'))
        elif flag == "test":
            id = pickle.load(open("../Data/weibo/test_id" + str(fold_id) + ".pkl", 'rb'))

        post_content = []
        labels = []
        image_ids = []
        twitter_ids = []
        data = []
        column = ['post_id', 'image_id', 'original_post', 'post_text', 'label', 'event_label']
        key = -1
        map_id = {}
        top_data = []
        for k, f in enumerate(file_list):

            f = open(f, 'r')
            if (k + 1) % 2 == 1:
                label = 0  # real is 0
            else:
                label = 1  # fake is 1

            twitter_id = 0
            line_data = []
            top_line_data = []

            for i, l in enumerate(f.readlines()):

                if (i + 1) % 3 == 1:
                    line_data = []
                    twitter_id = l.split('|')[0]
                    line_data.append(twitter_id)

                if (i + 1) % 3 == 2:
                    line_data.append(l.lower())

                if (i + 1) % 3 == 0:
                    l = clean_str_sst(l)

                    seg_list = jieba.cut_for_search(l)
                    new_seg_list = []
                    for word in seg_list:
                        if word not in stop_words:
                            new_seg_list.append(word)

                    clean_l = " ".join(new_seg_list)
                    if len(clean_l) > 10 and line_data[0] in id:
                        post_content.append(l)
                        line_data.append(l)
                        line_data.append(clean_l)
                        line_data.append(label)
                        event = int(id[line_data[0]])
                        if event not in map_id:
                            map_id[event] = len(map_id)
                            event = map_id[event]
                        else:
                            event = map_id[event]

                        line_data.append(event)

                        data.append(line_data)

            f.close()
            # print(data)
            #     return post_content

        data_df = pd.DataFrame(np.array(data), columns=column)
        write_txt(top_data)

        return post_content, data_df, len(map_id)
 def get_cut_for_search(self, sentence):
     return jieba.cut_for_search(sentence, HMM=False)
Example #45
import jieba
import time
jieba.initialize()  # manually initialize the jieba dictionary
time.sleep(1)

s = u'我想去北京故宫博物院参观和闲逛。'

cut = jieba.cut(s)
# print cut
print '精确模式-----------------------------'
print ','.join(cut)
print '全模式------------------------------'
print ','.join(jieba.cut(s, cut_all=True))
print '搜索引擎模式-------------------------'
print ','.join(jieba.cut_for_search(s))
print '获取词性----------------------------'
import jieba.posseg as psg
# print [(x.word,x.flag) for x in psg.cut(s)]
for x in psg.cut(s):
    print x.word + " " + x.flag + ",",
print '\n只获取名词--------------------------'
# print [(x.word,x.flag) for x in psg.cut(s) if x.flag.startswith('n')]
for x in psg.cut(s):
    if x.flag.startswith('n'): print x.word + " " + x.flag + ",",

print ''
# Parallel segmentation
# Enable parallel mode; the argument is the number of processes to run concurrently
jieba.enable_parallel(5)
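A short, hedged sketch of how parallel mode is typically used; jieba.enable_parallel/disable_parallel are real jieba calls, but the corpus file name below is a placeholder and parallel mode only works on POSIX systems:

import jieba

jieba.enable_parallel(4)                   # start 4 worker processes
with open('big_corpus.txt') as f:          # placeholder file name
    words = '/'.join(jieba.cut(f.read()))  # cutting the whole text is what parallel mode accelerates
jieba.disable_parallel()                   # back to single-process mode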
Example #46
import jieba

# 精确模式
seg_list = jieba.cut("我去过清华大学和北京大学。")
print("精确模式: " + "/".join(seg_list))

# 全模式
seg_list = jieba.cut("我去过清华大学和北京大学。", cut_all=True)
print("全模式: " + "/".join(seg_list))

# 搜索引擎模式
seg_list = jieba.cut_for_search("我去过清华大学和北京大学。")
print("搜索引擎模式: " + "/".join(seg_list))

# New-word discovery in accurate/full mode: “杭研” is not in the dictionary, yet the HMM (Viterbi) model recognizes it
seg_list = jieba.cut("他来到了网易杭研大厦", HMM=True)
print("精确模式/全模式-新词发现: " + "/".join(seg_list))

# New-word discovery in search-engine mode: “杭研” is not in the dictionary, yet the HMM (Viterbi) model recognizes it
seg_list = jieba.cut_for_search("他来到了网易杭研大厦", HMM=True)
print("搜索引擎模式-新词发现: " + "/".join(seg_list))
Example #47
print('='*40)
print('1. 分词')
print('-'*40)

prpaStr="我来到北京清华大学,看到让我蓝瘦香菇的word哥,真是让人无语。"
seg_list = jieba.cut(prpaStr, cut_all=True)
print("全模式分词: \n" + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut(prpaStr, cut_all=False)
print("默认模式分词: \n" + "/ ".join(seg_list))  # 默认模式

seg_list = jieba.cut(prpaStr)
print("自定义分隔符分词:\n"+",".join(seg_list))

seg_list = jieba.cut_for_search(prpaStr)  # 搜索引擎模式
print("搜索引擎模式:\n"+",".join(seg_list))




print("\n"*5+'='*40)
print('2. 添加自定义词典/调整词典')
print('-'*40)

prpaStr1 = '如果放到post中将出错。'
print("未调整词典的分词:\n"+'/'.join(jieba.cut(prpaStr1, HMM=False)))
print(jieba.suggest_freq(('中', '将'), True))
print("调整词典的分词:\n"+'/'.join(jieba.cut(prpaStr1, HMM=False)))

prpaStr2 = '「台中」正确应该不会被切开'
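The listing breaks off after defining prpaStr2; a hedged sketch of the presumable continuation, mirroring the prpaStr1 pattern above with the same suggest_freq call:

print("未调整词典的分词:\n" + '/'.join(jieba.cut(prpaStr2, HMM=False)))
print(jieba.suggest_freq('台中', True))
print("调整词典的分词:\n" + '/'.join(jieba.cut(prpaStr2, HMM=False)))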
Example #48
#encoding=utf-8
import sys
import jieba
# dic_path='sougou.txt'
# #dic_path=dic_path.encode('utf8')
# #print type(dic_path)
# jieba.load_userdict(dic_path)
# print(", ".join(jieba.cut("胆碱酯酶减少胆碱脂酶试剂盒胆碱酯酶试纸胆碱酯酶增加")))

#encoding=utf-8
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode:", "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode:", "/ ".join(seg_list))  # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业后在日本京都大学深造")  # 搜索引擎模式
print(", ".join(seg_list))

import jieba.posseg as pseg
words = pseg.cut("我爱北京天安门")
for w in words:
    print(w.word, w.flag)
Example #49
"""

import jieba
"""
Algorithm:
    A prefix dictionary drives an efficient word-graph scan that builds a directed acyclic graph (DAG) of every possible word segmentation of the sentence.
    Dynamic programming then finds the maximum-probability path, i.e. the most likely split according to word frequency.
    For out-of-vocabulary words, an HMM over the word-forming ability of Chinese characters is applied, decoded with the Viterbi algorithm.
"""
"""
1. Segmentation
jieba.cut takes three parameters: the string to segment; cut_all, which selects full mode; and HMM, which controls whether the HMM model is used.
jieba.cut_for_search takes two parameters: the string to segment and whether to use the HMM model. It is suited to building inverted indexes for search engines, as its granularity is finer.
The input string may be unicode, UTF-8 or GBK. Note: passing GBK strings directly is discouraged, since they may be unpredictably mis-decoded as UTF-8.
jieba.cut and jieba.cut_for_search both return an iterable generator; use a for loop to obtain each resulting word (unicode), or use
jieba.lcut and jieba.lcut_for_search to get a list directly.
jieba.Tokenizer(dictionary=DEFAULT_DICT) creates a custom tokenizer, e.g. to use several dictionaries at the same time. jieba.dt is the default tokenizer, and all module-level segmentation functions delegate to it.
"""

#代码示例
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # 搜索引擎模式
print(", ".join(seg_list))
Example #50
from sklearn import metrics
if __name__ == '__main__':

    #f = open("C:\\Users\\Administrator\\Desktop\\python note\\craw\\taobaomm\\sj_names.txt")
    module_path = dirname(__file__)
    f = open(join(module_path, 'sj_names.txt'))
    class_list = []
    term_str = []
    try:
        for line in f:
            lt = line.split(',')
            if lt[1] == '全部':  #过滤掉全部分类
                continue
            class_list.append(lt[0])
            temstr = lt[2].split('(')
            seg_list = jieba.cut_for_search(temstr[0])  #搜索引擎模式
            terlist = ", ".join(seg_list)  #解析成字符串
            try:
                term_str.append([
                    term.strip() for term in terlist.split(',')
                    if len(term.strip()) > 1
                ])  #去掉空格字符转换为列表
            except UnicodeEncodeError:
                print 'err'
    finally:
        f.close()
    fs = fj.Feature_select()
    term_vec = fs.transform(term_str)
    x_train, x_test, y_train, y_test = train_test_split(term_vec,
                                                        class_list,
                                                        test_size=0.2)
Example #51
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import numpy
import matplotlib.pyplot as plt
from os import path

str = "明明知识点都熟记于心,可是在考试的时候脑子一片空白,什么都想不起来了"
# Use a custom dictionary
# Custom dictionary format: one word per line; each line has three space-separated parts: the word, its frequency, and last its part of speech (which may be omitted)

# jieba.load_userdict('dict.txt')
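# An illustrative, hypothetical dict.txt in the format described above
# (the frequency and the part-of-speech tag may both be left out):
#
#   脑子 5 n
#   知识点 3
#   空白
#
# After jieba.load_userdict('dict.txt'), these entries are kept as whole words.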

ex_list1 = jieba.cut(str)
ex_list2 = jieba.cut(str, cut_all=True)
ex_list3 = jieba.cut_for_search(str)
print("精准模式:" + '/'.join(ex_list1))
print("全模式:" + '/'.join(ex_list2))
print("搜索引擎模式:" + '/'.join(ex_list3))
# note that full mode and search-engine mode split the text more finely than accurate mode

# 定义绝对路径地址
__file__ = r"/Users/jiaxiaopeng/"
# 把路径地址字符串转换为文件路径
d = path.dirname(__file__)
# 调用包PIL中的open方法,读取图片文件,通过numpy中的array方法生成数组
backgroud_Image = numpy.array(Image.open(path.join(d, "111.jpg")))

# 绘制词云图
wc = WordCloud(
    background_color='white',  # 设置背景颜色,与图片的背景色相关
Example #52
import jieba
# aa=jieba.cut('IHS Markit的最新调查报告称,苹果可能会为 iPhone 和 Apple Watch的屏幕长期采用一种全新的节能背板技术,有助于延长其电池续航时间。IHS认为未来的iPhone中改用LTPO TFT(低温多晶硅氧化物)背板,从理论上估算 LTPO可以比LTPS 节省 5 ~ 15% 的功耗,从而延长 iPhone的电池续航时间。',cut_all=True)
# print('Full Mode:'+'/'.join(aa))
#
# bb=jieba.cut('IHS Markit的最新调查报告称,苹果可能会为 iPhone 和 Apple Watch的屏幕长期采用一种全新的节能背板技术,有助于延长其电池续航时间。IHS认为未来的iPhone中改用LTPO TFT(低温多晶硅氧化物)背板,从理论上估算 LTPO可以比LTPS 节省 5 ~ 15% 的功耗,从而延长 iPhone的电池续航时间。',cut_all=False)
# print('Default Mode:'+'/'.join(bb))  # cut_all defaults to False
#
bb=jieba.cut_for_search('IHS Markit的最新调查报告称,苹果可能会为 iPhone 和 Apple Watch的屏幕长期采用一种全新的节能背板技术,有助于延长其电池续航时间。IHS认为未来的iPhone中改用LTPO TFT(低温多晶硅氧化物)背板,从理论上估算 LTPO可以比LTPS 节省 5 ~ 15% 的功耗,从而延长 iPhone的电池续航时间。',HMM=True)
print('/'.join(bb))


pp = 'IHS Markit的最新调查报告称,苹果可能会为 iPhone 和 Apple Watch的屏幕长期采用一种全新的节能背板技术,有助于延长其电池续航时间。'
Example #53
# coding: utf-8
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# 设置文件路径
dir = path.dirname('.')
text1 = open(path.join(dir, 'chinese.txt')).read()
text2 = jieba.cut_for_search(text1)
text_ch = " ".join(text2)
# 设置词云蒙版
mask_coloring = imread(path.join(dir, "mask.png"))
wc = WordCloud(font_path='simsun.ttf',
               mask=mask_coloring,
               background_color="white",
               max_words=2000,
               max_font_size=80,
               random_state=80)
# 生成词云图
wc.generate(text_ch)
image_colors = ImageColorGenerator(mask_coloring)  # 原始色彩
plt.figure()
plt.imshow(wc)
plt.axis("off")
# 生成词云图
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))  # 重置为底图色彩蒙版
plt.axis("off")
# 生成词云图
plt.figure()
Example #54
# CsvFile = csv.reader(open('/Users/sallyfan/desktop/csat2017.csv'))
# contents = []
# # for i in CsvFile:
# #     contents1  = contents.append(i)
# print(type(CsvFile))
# for i in CsvFile:
#     print(i)


openfile = open('/Users/sallyfan/desktop/cusfeedbacks.txt')
# openfile = open('/Users/sallyfan/desktop/csat.txt')
# for i in file:
#     print(i)
file = []
for i in openfile:
    file.append(i)
# print(file)
finalfile = "".join(file)
# print(type((finalfile))

cutwords =  jieba.cut_for_search(finalfile)
jieba.suggest_freq(('充电桩','30米','特斯拉'), True )
cipin = jieba.analyse.textrank(finalfile,topK=30, allowPOS= ('a','v'), withFlag=False)
print(cipin)
cipin2 = jieba.analyse.extract_tags(finalfile, topK=30,allowPOS= ('a','v','ver'),withFlag=False)
print(cipin2)

#
count  = Counter(cutwords).most_common(30)
print(count)
def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
    for word in result:
        print(word, "/", end=' ')
    print("")
Example #56
import jieba
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities

l1 = ["你的名字是什么", "你今年几岁了", "你有多高你胸多大", "你胸多大"]
a = "你今年多大了"

all_doc_list = []
for doc in l1: # doc = "你的名字是什么"
    doc_list = [word for word in jieba.cut_for_search(doc)]
    all_doc_list.append(doc_list)

# all_doc_list = [['你', '的', '名字', '是', '什么'], ['你', '今年', '几岁', '了'], ['你', '有', '多', '高', '你', '胸多大'], ['你', '胸多大']]

print(all_doc_list)
doc_test_list = [word for word in jieba.cut_for_search(a)]
# doc_test_list = [你,今年,多,大,了]

# Build the corpus
dictionary = corpora.Dictionary(all_doc_list)  # build the bag of words (词袋, not 磁带)
#{'你':1,"的":2,"名字":3}
# Understanding the bag of words:
# a bag of words simply lays out many, many words as a dict mapping each word (key) to an id (value)
# e.g.: {'什么': 0, '你': 1, '名字': 2, '是': 3, '的': 4, '了': 5, '今年': 6, '几岁': 7, '多': 8, '有': 9, '胸多大': 10, '高': 11}
# as for what it is actually used for, keep that question in mind and read on
# dictionary is the bag of words
# dictionary = {'什么': 0, '你': 1, '名字': 2, '是': 3, '的': 4, '了': 5, '今年': 6, '几岁': 7, '多': 8, '有': 9, '胸多大': 10, '高': 11}
print("token2id", dictionary.token2id)
print("dictionary", dictionary, type(dictionary))
Example #57
import datetime
from jieba import analyse

#结巴词性标注
sent = "六月12"
seg_list = psg.cut(sent)
print(' '.join(['{0}/{1}'.format(w, t) for w, t in seg_list]))

#分词-全模式
seg_list = jieba.cut(sent, cut_all=True)
print('/'.join(seg_list))
#分词-精确模式
seg_list = jieba.cut(sent, cut_all=False)
print('/'.join(seg_list))
#分词-搜索引擎模式
seg_list = jieba.cut_for_search(sent)
print('/'.join(seg_list))

# print(str(datetime.today().strftime('%Y年%m月%d日')))

#结巴词性标注
#加在自定义词典
jieba.load_userdict("dict.txt")
sent = '中文分词是文本处理中不可或缺的一部分,魏亚通今天很不错,支静阳今天也很不错'

seg_list = psg.cut(sent)

print(' '.join(['{0}/{1}'.format(w, t) for w, t in seg_list]))

#结巴关键词提取技术
# 引入TF-IDF关键词抽取接口
Example #58
#!/usr/bin/env python3
"""Map example."""

import sys
import re
import jieba

for line in sys.stdin:
    tokens = line.split('\t')
    id = tokens[0]
    content = re.sub(r'[^a-zA-Z0-9\u4e00-\u9fff]+', ' ', tokens[2])
    for word in jieba.cut_for_search(content):
        if word.strip() != '':
            print(id + ' ' + word.lower() + '\t1')
youtubers = [
    'subtitles/Dodo_Village.txt', 'subtitles/Empty_Bottle_King.txt',
    'subtitles/Gamker.txt', 'subtitles/Hello_Catie.txt', 'subtitles/Huan.txt',
    'subtitles/Little_Hot_Sing.txt', 'subtitles/Lulu.txt',
    'subtitles/Table_Games_Taichung.txt'
]
# with open('text/CYFIT.txt', 'r') as input:
for youtuber in youtubers:
    temp = youtuber.split('/')
    print(temp)
    data[temp[1]] = []
    with open(youtuber) as input1:

        # item_now = ''
        for i, item in enumerate(input1):
            data[temp[1]] += [
                t for t in jieba.cut_for_search(item) if t not in stops
            ]
            # if re.match(r'(.*?).wav', item):
            # 	#print(item)
            # 	item_now = item.strip()
            # 	data[item_now] = [t for t in jieba.cut_for_search(item) if t not in stops] + [t for t in jieba.cut_for_search(item) if t not in stops]
            # 	#print(data)
            # else:
            # 	#print(item)
            # 	data[item_now] += [t for t in jieba.cut_for_search(item) if t not in stops]
            #terms = [t for t in jieba.cut_for_search(item) if t not in stops]
            # print(sorted(Counter(terms).items(), key=lambda x:x[1], reverse=True))
            #print(data)
            #break
            # if(terms != []):
            # done += terms
Example #60
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import tensorflow as tf
import jieba
import opencc
jieba.set_dictionary('dict_v2.txt')
with open('stopwords_only_symbol_v2.txt', 'r', encoding='utf8') as f:
    stops_symbol = f.read().split('\n')
input_str = input('請輸入文字:') # input: a news headline
converter = opencc.OpenCC('s2twp.json')
s2twp_str = converter.convert(input_str)
jieba_str = ' '.join([t for t in jieba.cut_for_search(str(s2twp_str)) if t not in stops_symbol])
input_data_np = np.array([jieba_str])
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore('search_jieba_no_stopwords_train_vocab.pickle')
input_data_pd = np.array(list(vocab_processor.transform(input_data_np)))
tf.reset_default_graph()
saver = tf.train.import_meta_graph("Saved_model/search_jieba_no_stopwords_train_vocab.ckpt.meta")
with tf.Session() as sess:
    saver.restore(sess, "Saved_model/search_jieba_no_stopwords_train_vocab.ckpt")
    prob_and_ans = {"Placeholder:0": input_data_pd, "Placeholder_2:0": 1}
    prob = sess.run("probability:0", feed_dict = prob_and_ans)
    ans = sess.run("ans:0", feed_dict = prob_and_ans)
    print(f'probability: {prob}') # print the higher of the two class probabilities
    print(f'ans: {ans}') # print real or fake (1 = real, 0 = fake)