import math
import re

from flask import render_template, request
from jieba import analyse
import jieba.posseg as pseg
from snownlp import SnowNLP

import sentiment  # project-local module (assumed): provides Sentiment().single_review_sentiment_score


def single():
    content = u""
    if request.method == 'GET':
        content = u"12日上午,国家主席习近平同美国总统特朗普通电话。两国元首就朝鲜半岛局势等共同关心的问题交换了意见。习近平强调,中方坚持实现半岛无核化目标,坚持维护半岛和平稳定,主张通过和平方式解决问题,愿同美方就半岛问题保持沟通协调。关于叙利亚问题,习近平指出,任何使用化学武器的行为都不可接受。叙利亚问题要坚持政治解决的方向。联合国安理会保持团结对解决叙利亚问题非常重要,希望安理会发出一致声音。两国元首同意通过各种方式保持密切联系。"
    if request.method == 'POST':
        content = request.form['content']
    # Strip whitespace; the original pattern had a bare 't', '\t' is clearly meant.
    content = re.sub(u'(\s|\n|\t)', u'', content)
    # Part-of-speech tagged segmentation.
    seg_list = [(word, flag) for word, flag in pseg.cut(content)]
    # Top-5 keywords by TextRank and by TF-IDF, with weights.
    textrank_key_list = analyse.textrank(content, topK=5, withWeight=True)
    tf_idf_key_list = analyse.tfidf(content, topK=5, withWeight=True)
    s = sentiment.Sentiment()
    sentiment_score = s.single_review_sentiment_score(content)
    # Squash the unbounded score into a 0-100 percentage with arctan.
    sentiment_score_up = (math.atan(sentiment_score) * 2 / math.pi + 1) / 2 * 100
    sentiment_score_down = 100 - sentiment_score_up
    # Three-sentence extractive summary via SnowNLP.
    s = SnowNLP(content)
    summary = s.summary(3)
    return render_template(
        "single.html",
        seg_list=seg_list,
        textrank_key_list=textrank_key_list,
        tf_idf_key_list=tf_idf_key_list,
        sentiment_score_up=sentiment_score_up,
        sentiment_score_down=sentiment_score_down,
        summary=summary,
        content=content,
    )
from jieba import analyse


def cut_word(sentence, tag='tfidf'):
    # Keyword-style segmentation: extract keywords, drop stopwords.
    # `stopwords` is expected to be defined at module level.
    if tag == 'textrank':
        return ' '.join(
            [i for i in analyse.textrank(sentence) if i not in stopwords])
    else:
        return ' '.join(
            [i for i in analyse.tfidf(sentence) if i not in stopwords])
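# A minimal usage sketch for cut_word above (hypothetical): the original module
# defines `stopwords` elsewhere, so a tiny placeholder set stands in here.
stopwords = {u'的', u'了', u'是'}  # placeholder stopword set, not from the source

print(cut_word(u'线程是程序执行时的最小单位,它是进程的一个执行流'))
print(cut_word(u'线程是程序执行时的最小单位,它是进程的一个执行流', tag='textrank'))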
from jieba import analyse
import jieba.posseg as pseg


def preprocess(self, sentence):
    # Clean the input sentence the chatbot received before querying with it.
    # Must stay consistent with the preprocessing in preparecorpus.py and
    # with the Statement data structure.
    from chinesechatterbot import simhash
    strip = ''.join(self.signstopword_withoutsome)
    in_response_to = sentence.strip(strip + ' ')  # the ' ' also trims surrounding whitespace
    # TF-IDF for short inputs, TextRank for longer ones.
    if len(in_response_to) < 50:
        keyword_in_response_to = analyse.tfidf(in_response_to, topK=10)
    else:
        keyword_in_response_to = analyse.textrank(in_response_to, topK=10)
    tags_in_response_to = [flag for word, flag in pseg.cut(in_response_to)]
    qhash = simhash.simhash(keyword_in_response_to).simhashindex()
    qhash = [None, str(qhash)][qhash > 0]  # None when the hash is 0
    self.logger.info("Input sentence:{}, keywords extracted:{}".format(
        in_response_to, keyword_in_response_to))
    return (in_response_to, keyword_in_response_to, tags_in_response_to, qhash)
import jieba.analyse as jieba_analyse


def extract_keywords(text, topk=5, method="tf-idf"):
    """Keyword extraction.

    Args:
        text: the text to extract keywords from.
        topk: number of keywords to return.
        method: "textrank" or "tf-idf" (default).

    Returns:
        A list of (keyword, weight) tuples.
    """
    if method == "textrank":
        return jieba_analyse.textrank(sentence=text, topK=topk, withWeight=True)
    else:
        return jieba_analyse.tfidf(sentence=text, topK=topk, withWeight=True)
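# A quick, assumed demonstration of extract_keywords above; the sample sentence
# is arbitrary, and both branches are exercised.
sample = u'两国元首就朝鲜半岛局势等共同关心的问题交换了意见'
print(extract_keywords(sample, topk=3))                     # TF-IDF, (word, weight) pairs
print(extract_keywords(sample, topk=3, method='textrank'))  # TextRank, (word, weight) pairs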
from jieba.analyse import tfidf


def output_html(self):
    # Write the scraped articles to an HTML file.
    fout = open('out7.html', 'w', encoding='utf-8')
    fout.write("<html>")
    fout.write('<meta http-equiv="Content-Type" content="text/html;charset=utf-8">')
    fout.write("<body>")
    fout.write("<table>")
    for data in self.datas:
        # Top-3 TF-IDF keywords of the title.
        keywords = ','.join(tfidf(data['title'])[:3])
        fout.write("<tr>")
        fout.write("<td>%s</td>" % data['url'])
        fout.write("<td>%s</td>" % data['title'])
        fout.write("<td>%s</td>" % keywords)
        fout.write("<td>%s</td>" % data['body'])
        fout.write("</tr>")
    fout.write("</table>")
    fout.write("</body>")
    fout.write("</html>")
    fout.close()
    # Save the same data to the database.
    self.insertdata()
from typing import List, Optional, Tuple

from jieba import analyse


def get_topk_keywords(self, text: str, topk: int,
                      method: Optional[str] = None) -> List[Tuple[str, float]]:
    """
    TODO: maybe add a mix mode?!
    TODO: make withWeight an argument?

    Return [(keyword, weight), ...]
    """
    if not method:
        method = self.default_method
    if method == 'jieba.textrank':
        result = analyse.textrank(text, topK=topk, withWeight=True)
    elif method == 'jieba.extract_tags':
        result = analyse.extract_tags(text, topK=topk, withWeight=True)
    elif method == 'jieba.tfidf':
        # jieba.analyse.tfidf is an alias of extract_tags.
        result = analyse.tfidf(text, topK=topk, withWeight=True)
    else:
        raise ValueError("unknown method: %s" % method)
    return result
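# A hypothetical harness for get_topk_keywords above: the enclosing class is not
# shown in the source, so this stub only supplies the `default_method` attribute
# the method reads.
class _KeywordStub:
    default_method = 'jieba.tfidf'
    get_topk_keywords = get_topk_keywords  # bind the function defined above

for word, weight in _KeywordStub().get_topk_keywords(u'线程是程序执行时的最小单位', topk=3):
    print(word, weight)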
import logging

from jieba import analyse

# `utils` below is a project-local helper module providing add_dict().


def working(self):
    logging.info("Start finding paragraph keywords...")
    count = 1
    self._msg_queue.put('keyword', 1)
    for news_dict in self._datareader.working():
        url = news_dict[1]
        title = news_dict[2]
        news = news_dict[4]
        # Top-10 TF-IDF keywords of the article body.
        keywords = "/".join(analyse.tfidf(news)[:10])
        seg_dict = utils.add_dict(url=url, title=title, keywords=keywords)
        logging.info("loop:%s, keyword:%s", count, keywords)
        self._datasaver.working(seg_dict)
        count += 1
    self._msg_queue.put('keyword', 2)
    self._datareader.close()
    self._datasaver.close()
    logging.info("End finding paragraph keywords...")
from jieba import analyse


def extractkeyword(self, sentence):
    # TF-IDF for short sentences, TextRank once there is enough text.
    if len(sentence) < 50:
        keyword = analyse.tfidf(sentence, topK=10)
    else:
        keyword = analyse.textrank(sentence, topK=10)
    return keyword
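# A small check of the 50-character cut-off above (hypothetical): `self` is
# unused, so the method body can be exercised with None in its place.
short_text = u'你是人工智能'   # under 50 characters -> TF-IDF branch
long_text = short_text * 10    # 50+ characters -> TextRank branch
print(extractkeyword(None, short_text))
print(extractkeyword(None, long_text))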
        for i in obj:
            cprint(i)
            print('\n')
    else:
        cprint(obj, bg)


xtqdm = xx(lambda iterable, desc=None: tqdm(iterable, desc))

# base types
xtuple, xlist, xset = xx(tuple), xx(list), xx(set)

# string
xjoin = xx(lambda s, sep=' ': sep.join(s))
xcut = xx(lambda s, cut_all=False: jieba.lcut(s, cut_all=cut_all))
xtfidf = xx(lambda s, topK=20: ja.tfidf(s, topK=topK))

xsame_key_dict_merge = xx(lambda dics: pd.DataFrame(dics))


@xx
def hump_str(string="a_b", pattern='_'):
    """Snake-case to camel-case conversion."""
    reg = re.compile(pattern)
    _ = reg.sub('', string.title())
    return _.replace(_[0], _[0].lower())


# list transform
xgroup_by_step = xx(
    lambda ls, step=3: [ls[idx:idx + step] for idx in range(0, len(ls), step)])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project      : tql-App.
# @File         : nlp_app
# @Time         : 2020/11/4 10:24 AM
# @Author       : yuanjie
# @Email        : [email protected]
# @Software     : PyCharm
# @Description  :

from appzoo import App
import jieba.analyse as ja

app = App()
# Expose TF-IDF keyword extraction as GET /get/{text}.
app.add_route('/get/{text}',
              lambda **kwargs: ja.tfidf(kwargs.get('text', '')),
              method="GET", text="")
app.run(port=9955, debug=False, reload=False)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project      : tql-App.
# @File         : nlp_app
# @Time         : 2020/11/4 10:24 AM
# @Author       : yuanjie
# @Email        : [email protected]
# @Software     : PyCharm
# @Description  :

from iapp import App
import jieba.analyse as ja

app = App()
# Expose TF-IDF keyword extraction at GET /get.
app.add_route('/get',
              lambda **kwargs: ja.tfidf(kwargs.get('text', '')),
              method="GET", result_key="keywords")
app.run(port=9955, debug=False, reload=False)
from jieba import analyse


def get_keyword(string):
    # Return TF-IDF keywords as a set, for fast membership and overlap tests.
    return set(analyse.tfidf(string))
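# One way the set return type above pays off (illustrative, not from the
# source): keyword overlap between two texts via set intersection.
a = get_keyword(u'中方坚持实现半岛无核化目标,坚持维护半岛和平稳定')
b = get_keyword(u'愿同美方就半岛问题保持沟通协调,维护半岛和平')
print(a & b)  # keywords the two texts share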
from jieba import analyse


def cut_word(sentence, tag='tfidf'):  # `tag` was undefined in the original; pass it in
    # Keyword-style segmentation, method chosen by `tag`.
    if tag == 'textrank':
        return ' '.join(analyse.textrank(sentence))
    else:
        return ' '.join(analyse.tfidf(sentence))
import time
from urllib.request import urlopen

import MySQLdb
from jieba.analyse import tfidf

import mvhtml  # project-local helper providing strip_tags()


def insertdata(self):
    db = MySQLdb.connect("101.200.208.135", "python", "admin!@#", "bbs",
                         charset="utf8")
    cursor = db.cursor()
    insert = ("INSERT INTO web_article(title,categroy_id,head_img,content,"
              "author_id,publish_date,hideden,weight,keywords,description) "
              "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    x = 0
    for data in self.datas:
        body = data['summary']
        try:
            if u'美国' in body:
                author_id = '2'
                categroy_id = '2'
                # Download the header image.
                url = data['head_img']
                tim = int(time.time())
                filename = str(x) + str(tim) + '.jpg'
                with open('/home/lmb/bbs02/static/uploads/' + filename, 'wb') as f:
                    f.write(urlopen(url).read())
                head_img = 'static/uploads/' + filename
                x = x + 1
                # Top-3 TF-IDF keywords of the summary; keep counts in web_tags.
                arr = []
                for s in tfidf(data['summary'])[:3]:
                    cursor.execute("SELECT num FROM web_tags WHERE tagname = %s", (s,))
                    results = cursor.fetchall()
                    update_time = time.strftime('%Y-%m-%d %X',
                                                time.localtime(time.time()))
                    if len(results) == 0:
                        # New tag: insert it with count 1.
                        cursor.execute(
                            "INSERT INTO web_tags(tagname,num,update_time) "
                            "VALUES(%s,%s,%s)", (s, 1, update_time))
                    else:
                        # Existing tag: bump its count.
                        num = int(results[0][0]) + 1
                        cursor.execute(
                            "UPDATE web_tags SET num = %s, update_time = %s "
                            "WHERE tagname = %s", (num, update_time, s))
                    db.commit()
                    arr.append(s)
                keywords = ','.join(arr)
                content = data['summary']
                description = mvhtml.strip_tags(content[0:200])
                publish_date = time.strftime('%Y-%m-%d %X',
                                             time.localtime(time.time()))
                hideden = '0'
                weight = '1000'
                row = (data['title'], categroy_id, head_img, content, author_id,
                       publish_date, hideden, weight, keywords, description)
                cursor.execute(insert, row)
                db.commit()
            else:
                print('skipped: body does not mention 美国')
        except Exception:
            print('exception while inserting article')
    db.close()
from jieba.analyse import tfidf

from db.dao import CommentOper

if __name__ == '__main__':
    # Concatenate every comment under one weibo post, then extract keywords.
    infos = CommentOper.get_all_comment_by_weibo_id(4081978523493142)
    test = ""
    for info in infos:
        test += info.comment_cont
    keyWords = tfidf(test)
    for keyWord in keyWords:
        print(keyWord)
#!/usr/bin/python
# -*- coding:utf-8 -*-
from jieba.analyse import extract_tags as tfidf  # the TF-IDF keyword-extraction interface

# The source text.
text = "线程是程序执行时的最小单位,它是进程的一个执行流," \
       "是CPU调度和分派的基本单位,一个进程可以由很多个线程组成," \
       "线程间共享进程的所有资源,每个线程有自己的堆栈和局部变量。" \
       "线程由CPU独立调度执行,在多CPU环境下就允许多个线程同时运行。" \
       "同样多线程也可以实现并发操作,每个请求分配一个线程来处理。"

# Extract keywords with the TF-IDF algorithm.
keywords = tfidf(text, topK=5)

# Print the extracted keywords on one line, separated by "/".
for keyword in keywords:
    print(keyword + "/", end='')
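# A follow-up sketch: the same call with withWeight=True yields
# (keyword, weight) pairs instead of bare strings.
for keyword, weight in tfidf(text, topK=5, withWeight=True):
    print('%s %.4f' % (keyword, weight))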
    tot = 0
    while x:
        tot += 1
        x &= x - 1
    return tot


# Similarity of two hash values, as a ratio.
def hash_similarity(thishash, otherhash):
    a = float(thishash)
    b = float(otherhash)
    if a > b:
        return b / a
    else:
        return a / b


if __name__ == '__main__':
    from jieba import analyse

    s = '你是一个人工智能软件'
    hash1 = simhash(analyse.tfidf(s))
    hash1.simhashindex()  # hash1's simhash value, as a 64-bit integer
    s = '你是人工智障'
    hash2 = simhash(analyse.tfidf(s))
    hash2.simhashindex()
    s = '你是人工智能'
    hash3 = simhash(analyse.tfidf(s))
    hash3.simhashindex()
    print(hash1.hamming_distance(hash2.simhashindex()), " ",
          hash1.similarity(hash2.simhashindex()))
    print(hash1.hamming_distance(hash3.simhashindex()), " ",
          hash1.similarity(hash3.simhashindex()))