Beispiel #1
0
def single():
    """Flask view: analyse one Chinese document and render ``single.html``.

    GET  -> analyses a built-in demo news paragraph.
    POST -> analyses the text submitted in the ``content`` form field.

    The template context contains the POS-tagged segmentation, the top-5
    TextRank and TF-IDF keywords, an atan-squashed sentiment percentage
    pair, and a 3-sentence SnowNLP summary.
    """
    content = u""
    if request.method == 'GET':
        content = u"12日上午,国家主席习近平同美国总统特朗普通电话。两国元首就朝鲜半岛局势等共同关心的问题交换了意见。习近平强调,中方坚持实现半岛无核化目标,坚持维护半岛和平稳定,主张通过和平方式解决问题,愿同美方就半岛问题保持沟通协调。关于叙利亚问题,习近平指出,任何使用化学武器的行为都不可接受。叙利亚问题要坚持政治解决的方向。联合国安理会保持团结对解决叙利亚问题非常重要,希望安理会发出一致声音。两国元首同意通过各种方式保持密切联系。"
    if request.method == 'POST':
        content = request.form['content']
    # Strip all whitespace.  BUGFIX: the old pattern u'(\\s|\\n|t)' also
    # deleted every literal letter 't' from the text -- the stray 't' was
    # clearly meant to be '\t', which \s already covers.
    content = re.sub(r'\s', u'', content)
    # (py2-only debug statement ``print content`` removed)
    # POS-tagged segmentation: [(word, flag), ...]
    seg_list = [(word, flag) for word, flag in pseg.cut(content)]
    textrank_key_list = analyse.textrank(content, topK=5, withWeight=True)
    tf_idf_key_list = analyse.tfidf(content, topK=5, withWeight=True)
    s = sentiment.Sentiment()
    sentiment_score = s.single_review_sentiment_score(content)
    # Squash the unbounded sentiment score into (0, 100) via arctan.
    sentiment_score_up = (math.atan(sentiment_score) * 2 / math.pi +
                          1) / 2 * 100
    sentiment_score_down = 100 - sentiment_score_up
    s = SnowNLP(content)
    summary = s.summary(3)
    return render_template(
        "single.html",
        seg_list=seg_list,
        textrank_key_list=textrank_key_list,
        tf_idf_key_list=tf_idf_key_list,
        sentiment_score_up=sentiment_score_up,
        sentiment_score_down=sentiment_score_down,
        summary=summary,
        content=content,
    )
def cut_word(sentence, tag='tfidf'):
    """Return the keywords of *sentence* joined by single spaces.

    ``tag == 'textrank'`` selects TextRank extraction; any other value
    falls back to TF-IDF.  Words listed in ``stopwords`` are dropped.
    """
    extractor = analyse.textrank if tag == 'textrank' else analyse.tfidf
    kept = (word for word in extractor(sentence) if word not in stopwords)
    return ' '.join(kept)
 def preprocess(self, sentence):
     """Clean an incoming chatbot query before lookup.

     Mirrors the preprocessing done in preparecorpus.py and matches the
     Statement data structure.  Returns a tuple of
     (cleaned sentence, keywords, POS flags, simhash string or None).
     """
     from chinesechatterbot import simhash

     # Strip the configured punctuation stopwords plus whitespace from
     # both ends of the sentence.
     edge_chars = ''.join(self.signstopword_withoutsome) + ' '
     in_response_to = sentence.strip(edge_chars)
     # TF-IDF for short text (< 50 chars), TextRank for longer passages.
     extract = analyse.tfidf if len(in_response_to) < 50 else analyse.textrank
     keywords = extract(in_response_to, topK=10)
     pos_flags = [flag for word, flag in pseg.cut(in_response_to)]
     raw_hash = simhash.simhash(keywords).simhashindex()
     # A non-positive simhash is treated as "no hash available".
     qhash = str(raw_hash) if raw_hash > 0 else None
     self.logger.info("Input sentence:{},keywords extracted:{}".format(
         in_response_to, keywords))
     return (in_response_to, keywords, pos_flags, qhash)
def extract_keywords(text, topk=5, method="tf-idf"):
    """Extract weighted keywords from *text*.

    Args:
        text: the raw text to analyse.
        topk: how many keywords to return.
        method: "textrank" selects jieba's TextRank extractor; any other
            value (including the default "tf-idf") selects TF-IDF.

    Returns:
        A list of (keyword, weight) pairs.
    """
    if method == "textrank":
        extractor = jieba_analyse.textrank
    else:
        extractor = jieba_analyse.tfidf
    return extractor(sentence=text, topK=topk, withWeight=True)
Beispiel #5
0
    def output_html(self):
        """Dump the scraped articles to ``out7.html`` as an HTML table
        (url / title / top-3 title keywords / body), then persist the
        data to the database via ``insertdata()``.
        """
        from itertools import islice

        # ``with`` closes the handle even if a row raises mid-loop
        # (the old code leaked the handle on any exception).
        with open('out7.html', 'w') as fout:
            fout.write("<html>")
            fout.write(
                "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\">"
            )
            fout.write("<body>")
            fout.write("<table>")

            for data in self.datas:
                # Top 3 TF-IDF keywords of the title, comma-separated
                # (replaces the manual count-to-3 append loop).
                keywords = ','.join(islice(tfidf(data['title']), 3))

                fout.write("<tr>")
                fout.write("<td>%s</td>" % data['url'])
                fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
                fout.write("<td>%s</td>" % keywords.encode('utf-8'))
                fout.write("<td>%s</td>" % data['body'])
                fout.write("</tr>")

            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")

        # Persist to the database after the report is written.
        self.insertdata()
    def get_topk_keywords(self,
                          text: str,
                          topk: int,
                          method: str = None) -> List[Tuple[str, float]]:
        """Return the ``topk`` strongest keywords of *text*.

        Args:
            text: the document to analyse.
            topk: number of keywords to return.
            method: one of 'jieba.textrank', 'jieba.extract_tags' or
                'jieba.tfidf'; defaults to ``self.default_method``.

        Returns:
            [(keyword, weight), ...].  jieba reports float weights, so
            the annotation is Tuple[str, float] (the old ``int`` was wrong).

        Raises:
            ValueError: for an unrecognised *method*.  (Previously this
                path crashed with an unbound-``result`` NameError.)

        TODO: maybe add a mix mode?!
        TODO: make withWeight an argument?
        """
        if not method:
            method = self.default_method

        if method == 'jieba.textrank':
            return analyse.textrank(text, topK=topk, withWeight=True)
        if method == 'jieba.extract_tags':
            return analyse.extract_tags(text, topK=topk, withWeight=True)
        if method == 'jieba.tfidf':
            return analyse.tfidf(text, topK=topk, withWeight=True)
        raise ValueError("unsupported keyword extraction method: %r" % method)
Beispiel #7
0
 def working(self):
     """Extract the top-10 TF-IDF keywords for every news row produced by
     the data reader, persist each result through the data saver, and
     signal stage start/end through the message queue.
     """
     logging.info("Start finding paragraph keywords...")
     count = 1
     # Announce "keyword stage started".  NOTE(review): the exact
     # semantics of the ('keyword', 1/2) pair depend on _msg_queue --
     # confirm against its implementation.
     self._msg_queue.put('keyword', 1)
     for news_dict in self._datareader.working():
         # NOTE(review): assumes reader rows are indexable with the url
         # at [1], title at [2] and body text at [4] -- confirm against
         # _datareader.
         url = news_dict[1]
         title = news_dict[2]
         news = news_dict[4]
         # seg = "/".join(jieba.cut(news))
         # tfidf = "/".join(analyse.tfidf(news))
         # textrank = "/".join(analyse.textrank(news))
         # Top-10 TF-IDF keywords, slash-separated.
         keywords = "/".join(analyse.tfidf(news)[:10])
         seg_dict = utils.add_dict(url=url, title=title, keywords=keywords)
         # seg_dict = utils.add_dict(
         #     title=title, news=news, seg=seg,
         #     tfidf=tfidf, textrank=textrank, keywords=keywords)
         logging.info("loop:%s, keyword:%s", count, keywords)
         self._datasaver.working(seg_dict)
         count += 1
     # Announce "keyword stage finished".
     self._msg_queue.put('keyword', 2)
     self._datareader.close()
     self._datasaver.close()
     logging.info("End finding paragraph keywords...")
Beispiel #8
0
 def extractkeyword(self, sentence):
     """Return up to 10 keywords: TF-IDF for short sentences (fewer than
     50 characters), TextRank otherwise.
     """
     extract = analyse.tfidf if len(sentence) < 50 else analyse.textrank
     return extract(sentence, topK=10)
Beispiel #9
0
        for i in obj:
            cprint(i)
            print('\n')
    else:
        cprint(obj, bg)


# NOTE(review): ``xx`` appears to wrap callables into pipeline-style
# helpers (defined earlier in this file) -- confirm its exact semantics.
xtqdm = xx(lambda iterable, desc=None: tqdm(iterable, desc))

# base types
xtuple, xlist, xset = xx(tuple), xx(list), xx(set)

# string
xjoin = xx(lambda s, sep=' ': sep.join(s))  # join items of s with sep
xcut = xx(lambda s, cut_all=False: jieba.lcut(s, cut_all=cut_all))  # jieba word segmentation
xtfidf = xx(lambda s, topK=20: ja.tfidf(s, topK=topK))  # top-K TF-IDF keywords

# Merge dicts that share keys into a DataFrame (one row per dict).
xsame_key_dict_merge = xx(lambda dics: pd.DataFrame(dics))


@xx
def hump_str(string="a_b", pattern='_'):
    """Convert a *pattern*-delimited string to lowerCamelCase ("hump") form.

    ``hump_str("a_b") -> "aB"``.  An empty result is returned unchanged
    (the old code raised IndexError on it).
    """
    reg = re.compile(pattern)
    camel = reg.sub('', string.title())
    if not camel:
        return camel
    # BUGFIX: lower-case only the FIRST character.  The old
    # ``_.replace(_[0], _[0].lower())`` lowered EVERY occurrence of that
    # character (e.g. "abc_aef" -> "abcaef" instead of "abcAef").
    return camel[0].lower() + camel[1:]


# list transform
# Split ``ls`` into consecutive chunks of ``step`` items; the last chunk
# may be shorter than ``step``.
xgroup_by_step = xx(
    lambda ls, step=3: [ls[idx:idx + step] for idx in range(0, len(ls), step)])
Beispiel #10
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project      : tql-App.
# @File         : nlp_app
# @Time         : 2020/11/4 10:24 上午
# @Author       : yuanjie
# @Email        : [email protected]
# @Software     : PyCharm
# @Description  :

from appzoo import App
import jieba.analyse as ja

# Minimal keyword-extraction web service: GET /get/{text} returns the
# TF-IDF keywords of the ``text`` path parameter.
app = App()
app.add_route('/get/{text}',
              lambda **kwargs: ja.tfidf(kwargs.get('text', '')),
              method="GET",
              # NOTE(review): presumably declares a default for the route
              # parameter -- confirm against appzoo's add_route API.
              text="")
app.run(port=9955, debug=False, reload=False)
Beispiel #11
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project      : tql-App.
# @File         : nlp_app
# @Time         : 2020/11/4 10:24 上午
# @Author       : yuanjie
# @Email        : [email protected]
# @Software     : PyCharm
# @Description  : 


from iapp import App
import jieba.analyse  as ja

# Minimal keyword-extraction web service: GET /get with a ``text`` query
# argument returns its TF-IDF keywords under the ``keywords`` result key.
app = App()
app.add_route('/get', lambda **kwargs: ja.tfidf(kwargs.get('text', '')), method="GET", result_key="keywords")
app.run(port=9955, debug=False, reload=False)



 def get_keyword(string):
     """Return the TF-IDF keywords of *string* as a set (order discarded)."""
     keywords = analyse.tfidf(string)
     return set(keywords)
Beispiel #13
0
def cut_word(sentence, tag='tfidf'):
    """Return the keywords of *sentence* joined by spaces.

    Args:
        sentence: text to analyse.
        tag: 'textrank' selects TextRank extraction; anything else
            selects TF-IDF.  BUGFIX: the old body read a ``tag`` name
            that was never defined in this scope (NameError at call time
            unless a global happened to exist), so it is now an explicit
            parameter defaulting to 'tfidf'.
    """
    if tag == 'textrank':
        return ' '.join(analyse.textrank(sentence))
    return ' '.join(analyse.tfidf(sentence))
Beispiel #14
0
    def insertdata(self):
        """Filter scraped articles (keep only those whose body mentions
        '美国'), download each article's head image, upsert the article's
        top-3 keywords into web_tags, and (currently disabled) insert the
        article itself into web_article.

        NOTE(review): Python 2 code.  Several issues worth fixing:
        database credentials hardcoded in source; SQL strings built with
        %-formatting (injection risk -- use parameterized queries); the
        image file handle ``f`` and urllib2 response ``req`` are never
        closed; bare ``except:`` clauses swallow all errors.
        """
        # NOTE(review): credentials in source + SQL injection risk below.
        db = MySQLdb.connect("101.200.208.135", "python", "admin!@#", "bbs",charset="utf8")
        cursor = db.cursor()
       # insert = ("INSERT INTO web_article(title,categroy_id,head_img,content,author_id,publish_date,hideden,weight,keywords,description)" "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")




        x = 0
        for data in self.datas:
            title = data['title'].encode('utf-8')
            body =  data['summary']

            selecttitle= "SELECT title from web_article WHERE title ='%s'" % (title)
            print body
            #cursor.execute(selecttitle)
            #resultstitle = cursor.fetchall()
            try:
                #if len(resultstitle) ==0:
                # Only keep articles whose body mentions '美国' (USA).
                if '美国'.decode("utf-8") in body:

                    author_id = '2'
                    categroy_id = '2'
                    # download the head image
                    #urllib2.urlretrieve(data['head_img'],'/home/lmb/bbs02/static/uploads\\d.jpg'%x)
                    url = data['head_img']
                    tim = int(time.time())

                    # NOTE(review): ``f`` and ``req`` are never closed.
                    f = open('/home/lmb/bbs02/static/uploads/'+str(x)+ '.jpg','w')
                    req = urllib2.urlopen(url)
                    buf = req.read()
                    f.write(buf)


                    # NOTE(review): the file was saved as str(x)+'.jpg'
                    # above but the DB path adds str(tim) -- mismatch?
                    head_img = 'static/uploads/'+str(x) + str(tim) +'.jpg'
                    x=x+1

                    # keywords
                    #textrank = analyse.textrank
                    keywords = tfidf(data['summary'])
                    # combine the first 3 keywords in the loop below
                    arr = []
                    n=0
                    for s in keywords:
                        selecttag = "SELECT num from web_tags WHERE tagname ='%s'" % (s)
                        print selecttag
                        cursor.execute(selecttag)
                        results = cursor.fetchall()
                        # save each keyword into the tags table
                        try:
                            ISOTIMEFORMAT = '%Y-%m-%d %X'
                            update_time = time.strftime(ISOTIMEFORMAT, time.localtime(time.time()))
                            if len(results) ==0:
                                # tag does not exist yet: insert the keyword directly
                                #inserttag = ("INSERT INTO web_tags(tagname,num,update_time)" "VALUES(%s,%s,%s)")
                                #print inserttag
                                #datatag = (s,1,update_time)
                                #cursor.execute(inserttag, datatag)
                                db.commit()
                            else:
                                # tag exists: bump its usage counter
                                for row in results:
                                    num = int(row[0])
                                    num = num + 1
                                    #updatetag = "UPDATE web_tags SET num = '%s',update_time="%(num)+"\'"+ update_time +"\'"+"  WHERE tagname =" + "\'"+ s +"\'"
                                    #print updatetag
                                    #cursor.execute(updatetag)
                                    db.commit()
                        except:
                            # NOTE(review): bare except aborts the whole
                            # method and hides the actual error.
                            return

                        n=n+1
                        arr.append(s)
                        if n==3:
                            break
                    strs = ','.join(arr)
                    keywords =strs

                    content = data['summary'].encode('utf-8')

                    # First 200 chars, HTML tags stripped, as description.
                    description = mvhtml.strip_tags(content[0:200])

                    ISOTIMEFORMAT ='%Y-%m-%d %X'
                    publish_date = time.strftime(ISOTIMEFORMAT, time.localtime(time.time()))
                    hideden = '0'
                    weight = '1000'
                   # data = (title, categroy_id, head_img, content, author_id,publish_date,hideden,weight,keywords,description)
                    #try:
                       #cursor.execute(insert, data)
                   # except:
                    #    return
                  #  db.commit()
                else:
                    print '标题重复'
            except:
                print "异常"

        db.close()
Beispiel #15
0
from jieba.analyse import tfidf

from db.dao import CommentOper
from jieba import analyse

if __name__ == '__main__':
    # Fetch every comment of the given weibo post, concatenate them, and
    # print the TF-IDF keywords of the combined text.
    infos = CommentOper.get_all_comment_by_weibo_id(4081978523493142)
    # ''.join is linear; the old repeated ``test += ...`` was quadratic.
    test = ''.join(info.comment_cont for info in infos)
    keyWords = tfidf(test)
    for keyWord in keyWords:
        print(keyWord)
Beispiel #16
0
# !/usr/bin/python
# -*- coding:utf-8 -*-
from jieba.analyse import extract_tags as tfidf

# Import the TF-IDF keyword-extraction interface
# Original text
text = "线程是程序执行时的最小单位,它是进程的一个执行流,\
        是CPU调度和分派的基本单位,一个进程可以由很多个线程组成,\
        线程间共享进程的所有资源,每个线程有自己的堆栈和局部变量。\
        线程由CPU独立调度执行,在多CPU环境下就允许多个线程同时运行。\
        同样多线程也可以实现并发操作,每个请求分配一个线程来处理。"

# Extract keywords with the TF-IDF algorithm
keywords = tfidf(text, topK=5)
# Print the extracted keywords
for keyword in keywords:
    # NOTE(review): the trailing comma is a no-op inside py3 ``print()``
    # -- looks like a leftover from py2's ``print x,`` form; confirm
    # whether same-line output was intended (``end=''``).
    print(keyword + "/", )
Beispiel #17
0
    tot = 0
    while x:
        tot += 1
        x &= x - 1
    return tot


# similarity ratio of two simhash values
def hash_similarity(thishash, otherhash):
    """Return the min/max ratio similarity of two hash values.

    For positive inputs the result is in [0.0, 1.0]; equal inputs give
    1.0.  BUGFIX: two zero hashes used to raise ZeroDivisionError --
    identical values (including 0, 0) now uniformly return 1.0.
    """
    a = float(thishash)
    b = float(otherhash)
    if a == b:
        return 1.0
    if a > b:
        return b / a
    return a / b


if __name__ == '__main__':
    from jieba import analyse
    # Demo: build simhashes from the TF-IDF keywords of three short
    # sentences and compare their hamming distance / similarity.
    s = '你是一个人工智能软件'
    hash1 = simhash(analyse.tfidf(s))
    # simhash value of hash1 (64-bit).  NOTE(review): the return value is
    # discarded here -- presumably the call caches the index internally;
    # confirm against the simhash class.
    hash1.simhashindex()

    s = '你是人工智障'
    hash2 = simhash(analyse.tfidf(s))
    hash2.simhashindex()
    s = '你是人工智能'
    hash3 = simhash(analyse.tfidf(s))
    hash3.simhashindex()
    # Distance and similarity of hash1 vs. the other two sentences.
    print(hash1.hamming_distance(hash2.simhashindex()), "   ",
          hash1.similarity(hash2.simhashindex()))
    print(hash1.hamming_distance(hash3.simhashindex()), "   ",
          hash1.similarity(hash3.simhashindex()))