Example 1
def analysis_data():
    """

    :return:
    """
    with open("../resource/bilibili/bilibili_data.json", "r") as f:
        d = json.load(f)
    d["text_analysis"] = list()
    count1, count2 = 0, 0
    for text in d["text"]:
        t = snownlp.SnowNLP(text)
        if t.sentiments > 0.5:
            d["text_analysis"].append(1)
            count1 += 1
        else:
            d["text_analysis"].append(-1)
            count2 += 1
    print("count1 : ", count1, "\t count2 : ", count2)
    '''analysis'''
    text = "".join(d["text"])
    s = snownlp.SnowNLP(text)
    dict_word = dict()
    s_word = s.words
    for it in s_word:
        dict_word[it] = s_word.count(it)
    dict_word = sorted(dict_word.items(),
                       key=operator.itemgetter(1),
                       reverse=True)[:10]
    for t in dict_word:
        print(t[0], "\t", t[1])
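
A note on the frequency count above: calling s_word.count(it) inside the loop makes it quadratic in the number of tokens. Below is a minimal sketch of the same top-10 count done in one pass with collections.Counter, assuming the same SnowNLP tokenization (the sample text is made up):

import collections
import snownlp

def top_words(text, n=10):
    # tokenize once with SnowNLP, then count every token in a single pass
    words = snownlp.SnowNLP(text).words
    return collections.Counter(words).most_common(n)

for word, freq in top_words(u"这个东西真心很赞,真心很赞,就是物流有点慢"):
    print(word, "\t", freq)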
Example 2
    def save_mysql(self, feedback_json_all):
        cur = self.db.cursor()
        cur.execute('use feedback_database')
        try:
            create_projectid_table = 'create table projectid_%d(author varchar(100),version varchar(10),rating int(20),id varchar(30),title text(50),content text(2000),sent_title float(8),sent_content float(8))' % self.ProjectID
            #print(create_projectid_table)
            cur.execute(create_projectid_table)
            self.db.commit()
            print('----------------Table 创建成功----------------')
        except Exception as e:
            pass
            #print("Mysql Err:{}".format(e))
        #j = 1
        projectid = 'projectid_%s' % self.ProjectID
        idcount = 0
        for i in feedback_json_all:
            # author_name = i['author']['name']['label']
            argv = "author,version,rating,id,title,content,sent_title,sent_content"

            author = str(i['author']['name']['label'])
            #print(author)
            version = str(i['im:version']['label'])
            rating = str(i['im:rating']['label'])
            id = str(i['id']['label'])
            title = str(i['title']['label'])
            content = str(i['content']['label'])
            sent_title = float('%.2f' % snownlp.SnowNLP(title).sentiments)
            sent_content = float('%.2f' % snownlp.SnowNLP(content).sentiments)
            #print(sent_title)
            mql_value = (author, version, rating, id, title, content,
                         sent_title, sent_content)
            #print(mql_value)
            try:
                ss = "select  * from {} where id = {} ".format(projectid, id)
                idcount = cur.execute(ss)
            except Exception as e:
                print('运行错误', e)

            #print(idcount)
            if idcount == 0:  # check whether this id already exists in the table; insert a new row only if it does not
                insert_content = 'insert into {}({}) values{}'.format(
                    projectid, argv, mql_value)
                #print(insert_content)
                cur.execute(insert_content)
            elif idcount != 0:
                print('有重复!! {}'.format(id))
            #j += 1
            self.db.commit()  # commit the data
        cur.close()
        self.db.close()
        print('mql saved')
Example 3
	def __computeOneDaySentiments(self, date):
		meassageInOneDay = self.messageDataFrame[self.messageDataFrame.date == date + ' ' + '00:00:00']
		sentimentsGrades = []

		# If there is no news, we set this date's sentimental grade to be 0.5 (which is a neutral value).
		if len(meassageInOneDay.message) == 0:
			self.messageSentiments[date] = 0.5
		# If there is news, we get the sentimental grade of every item of news, and count their mean to be day's final grade.
		else:
			for Message in meassageInOneDay.message:
				s = sn.SnowNLP(Message)
				MessageSentGrades = np.mean([sn.SnowNLP(word).sentiments for word in s.words]) 	# The item of news' sentimental grade.
				sentimentsGrades.append(MessageSentGrades) 										# Append every item of news sentimental grade to the day's list.
			self.messageSentiments[date] = np.mean(sentimentsGrades)
Example 4
def get_sentiment_cn(text):
    s = snownlp.SnowNLP(text)
    res = s.sentiments
    if res > 0.3:
        return "积极"
    else:
        return "消极"
Example 5
 def commentparse(self, response):
     # print("-----------------------scrapy爬取微博评论---------------------------------------")
     status_after_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id=%s&max_id_type=%s"
     message_id = response.meta.get("message_id")
     keyword = response.meta.get("keyword")
     results = json.loads(response.text)
     if results.get("ok"):
         max_id = results.get("data").get("max_id")
         max_id_type = results.get("data").get("max_id_type")
         if max_id:
             # comments come in pages of 10; the cursor for the next page is defined in the previous page's JSON:
             yield scrapy.Request(
                 url=status_after_url %
                 (message_id, message_id, str(max_id), str(max_id_type)),
                 callback=self.commentparse,
                 meta={
                     "keyword": keyword,
                     "message_id": message_id
                 })
         datas = results.get("data").get("data")
         for data in datas:
             text1 = data.get("text")
             like_count = data.get("like_count")
             user1 = data.get("user").get("screen_name")
             user_url = data.get("user").get("profile_url")
             emotion = snownlp.SnowNLP(
                 text1).sentiments  # sentiment analysis with SnowNLP
             weibocommentitem = WeiboCommentItem()
             weibocommentitem["title"] = keyword
             weibocommentitem["message_id"] = message_id
             weibocommentitem["text1"] = text1
             weibocommentitem["user1"] = user1
             weibocommentitem["user_url"] = user_url
             weibocommentitem["emotion"] = emotion
             yield weibocommentitem
Example 6
def run():
    df = pd.read_csv('./cleanfile.csv', encoding='utf-8', sep=',')
    df["sentiments"] = df["content"].map(
        lambda c: snownlp.SnowNLP(c).sentiments)
    df["keywords"] = df["content"].map(getKeyWord)
    df["input_time"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    #engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/sina')
    engine = create_engine('mysql+mysqlconnector://root:@127.0.0.1:3306/sina')

    dtypedict = {
        'id': Integer(),
        'mid': VARCHAR(length=50),
        'content': TEXT,
        'uid': VARCHAR(length=15),
        'area': VARCHAR(length=15),
        'nick': VARCHAR(length=50),
        'ip': VARCHAR(length=15),
        'newsid': VARCHAR(length=50),
        'time': DATETIME(),
        'sentiments': DECIMAL(precision=10, scale=10),
        'keywords': VARCHAR(length=100),
        'input_time': DATETIME(),
    }
    df.to_sql(name='news',
              con=engine,
              chunksize=100000,
              if_exists='replace',
              index=True,
              index_label='id',
              dtype=dtypedict)
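
The same DataFrame-to-SQL pattern as above, reduced to a self-contained sketch against an in-memory SQLite engine so it runs without a MySQL server (column names and values are made up; Float is used instead of DECIMAL because SQLite has no native decimal type):

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import TEXT, Float

df = pd.DataFrame({"content": ["很好", "一般般"], "sentiments": [0.93, 0.45]})
engine = create_engine("sqlite://")  # in-memory database
dtypedict = {"content": TEXT, "sentiments": Float()}
df.to_sql(name="news", con=engine, if_exists="replace",
          index=True, index_label="id", dtype=dtypedict)
print(pd.read_sql("select * from news", engine))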
Example 7
def test(filename, to_filename):
    """商品评论-情感分析-测试"""
    with open(f'scrapingfile/{filename}.csv', 'r', encoding='utf-8-sig') as fr:
        for line in fr.readlines():
            s = snownlp.SnowNLP(line)

            if s.sentiments >= 0.8:
                res = '超赞'
                res_list.append(1)
            elif 0.6 <= s.sentiments < 0.8:
                res = '喜欢'
                res_list.append(0.5)
            elif 0.2 <= s.sentiments < 0.4:
                res = '还行'
                res_list.append(-0.5)
            elif s.sentiments < 0.2:
                res = '厌恶'
                res_list.append(-1)
            else:
                res = '一般'
                res_list.append(0)
            sent_dict = {
                '情感分析结果': s.sentiments,
                '评价倾向': res,
                '商品评论': line.replace('\n', '')
            }
            sentiment_list.append(sent_dict)
            print(sent_dict)
        df = pd.DataFrame(sentiment_list)
        df.to_csv(f'scrapingfile/{to_filename}.txt', index=None, encoding='utf-8-sig', index_label=None, mode='w')
    fr.close()
Example 8
    def generateTextFeature(self, item):
        content = BeautifulSoup(item['mcontent'], self.DEFAULT_PARSER)
        content = re.sub(r"\s", "", content.get_text())

        if content and len(content) != 0:
            title_words = list(jieba.cut(item['mtitle']))
            jb_tags = jieba.analyse.extract_tags(content,
                                                 self.KEYWORD_NUMBER,
                                                 withWeight=True)
            result = snownlp.SnowNLP(content)
            sn_tags = result.keywords(self.KEYWORD_NUMBER)

            #             print('\ '.join(title_words))
            #             print('\ '.join(jieba.cut(item[self.ITEM_COLNAME_DICT['mTitle']], True)))
            #             print(jb_tags)
            #             print(sn_tags)
            #             print(result.keywords(self.KEYWORD_NUMBER, True))

            result_tag, tag_len = self.clacWordWeight(jb_tags, sn_tags,
                                                      title_words)
            top_tags = list(result_tag.items())
            top_tags = sorted(top_tags, key=lambda x: x[1], reverse=True)
            #             print(top_tags[:tag_len])
            top_tag_map = {}
            for index in range(0, tag_len):
                top_tag_map[top_tags[index][0]] = top_tags[index][1]
            result_tag_text = {'type': item['mtags'], 'tag': top_tag_map}

            #             print(item[self.ITEM_COLNAME_DICT['mSource']])
            #             print(result_tag_text)

            #             print(json.dumps(result_tag_text, indent=4, separators=(',', ': ')))
            global tag_relation_tmp
            tag_relation_tmp = [item['mtags'], list(result_tag.keys())]
            item['mtags'] = json.dumps(result_tag_text, ensure_ascii=False)
Example 9
def test(path):
    with open(path, 'r', encoding='utf-8') as fr:
        for line in fr.readlines():
            s = snownlp.SnowNLP(line)
            sent_dict = {'情感分析结果': s.sentiments, '微博内容': line}
            sentiment_list.append(sent_dict)
            print(sent_dict)
Example 10
def test(filename, to_filename):
    """商品评论-情感分析-测试"""
    with open(filename, 'r', encoding='gbk') as fr:
        for line in fr.readlines():
            s = snownlp.SnowNLP(line)
            if s.sentiments > 0.6:
                res = '喜欢'
                res_list.append(1)
            elif s.sentiments < 0.4:
                res = '不喜欢'
                res_list.append(-1)
            else:
                res = '一般'
                res_list.append(0)
            sent_dict = {
                '情感分析结果': s.sentiments,
                '评价倾向': res,
                '商品评论': line.replace('\n', '')
            }
            sentiment_list.append(sent_dict)
            print(sent_dict)
        df = pd.DataFrame(sentiment_list)
        df.to_csv(to_filename,
                  index=None,
                  encoding='gbk',
                  index_label=None,
                  mode='w')
Example 11
def postTiezi(word):
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get('https://passport.baidu.com/v2/?login')
    time.sleep(2)
    driver.find_element_by_id("TANGRAM__PSP_3__footerULoginBtn").click()
    driver.find_element_by_name("userName").clear()
    driver.find_element_by_name("userName").send_keys('13009806115')
    driver.find_element_by_name("password").clear()
    driver.find_element_by_name("password").send_keys('1245586921')
    driver.find_element_by_id("TANGRAM__PSP_3__submit").click()
    time.sleep(50)
    driver.get('https://tieba.baidu.com/f?kw=万元归一诀&fr=index')
    time.sleep(6)
    if snownlp.SnowNLP(word).sentiments > 0.5:
        for i in range(0, 10):
            driver.find_element_by_name("title").send_keys('怎么会有这么弱智的问题')
            driver.find_element_by_id("ueditor_replace").send_keys('爬爬爬')
            time.sleep(2)
            driver.find_element_by_xpath(
                '//*[@id="tb_rich_poster"]/div[3]/div[5]/div/button[1]').click(
                )
    else:
        for i in range(0, 10):
            driver.find_element_by_name("title").send_keys('是个好问题')
            driver.find_element_by_id("ueditor_replace").send_keys('赞赞赞')
            time.sleep(2)
            driver.find_element_by_xpath(
                '//*[@id="tb_rich_poster"]/div[3]/div[5]/div/button[1]').click(
                )
Example 12
def run():
    df = pd.read_csv('./cleanfile.csv', encoding='utf-8', sep=',')
    df["sentiments"] = df["content"].map(
        lambda c: snownlp.SnowNLP(c).sentiments)
    df["keywords"] = df["content"].map(getKeyWord)

    #engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/sina')
    engine = create_engine(
        'mysql+mysqlconnector://root:@127.0.0.1:3306/sina?charset=utf8&connect_timeout=10'
    )

    dtypedict = {
        'id': Integer(),
        'uid': VARCHAR(length=15),
        'area': VARCHAR(length=15),
        'ipadd': VARCHAR(length=15),
        'usertype': VARCHAR(length=10),
        'agree': VARCHAR(length=10),
        'cmttime': DATETIME(),
        'content': TEXT,
        'sentiments': DECIMAL(precision=10, scale=10),
        'keywords': VARCHAR(length=100),
    }
    df.to_sql(name='news',
              con=engine,
              chunksize=100000,
              if_exists='replace',
              index=True,
              index_label='id',
              dtype=dtypedict)
Example 13
def analyse_Signature(friends):
    """
    分析好友签名
    :param friends:
    :return:
    """
    signatures = ''
    emotions = []
    for friend in friends:
        signature = friend['Signature']
        print(signature)
        if signature != None:
            signature = signature.strip().replace('span', '').replace(
                'class', '').replace('emoji', '')
            signature = re.sub(r'1f(\d.+)', '', signature)
            print('signature>>>', signature)
        if signature is not None and len(signature) > 0:
            # sentiment weight
            nlp = snownlp.SnowNLP(signature)
            emotions.append(nlp.sentiments)
            # keyword extraction
            signatures += ' '.join(jieba.analyse.extract_tags(signature, 5))
            print('signatures>>>', signatures)
    # word cloud of the signature keywords
    # load the background image
    img_back = imread('heart.png')
    wordcloud = WordCloud(
        background_color='white',  # background colour (areas where no words are drawn)
        max_words=2000,  # maximum number of words
        mask=img_back,
        font_path='E:\Web-Crawler\Wechat\SimHei.ttf',
        # font to use; without it Chinese characters cannot be displayed, and the font name must not be Chinese
        max_font_size=45,  # maximum font size
        random_state=30,
        scale=1.5,
    )
    wordcloud.generate(signatures)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    wordcloud.to_file('signatures.jpg')
    # the sentiment split reflects outlook on life
    positive = len(list(filter(lambda x: x > 0.66, emotions)))
    negative = len(list(filter(lambda x: x < 0.33, emotions)))
    neutral = len(list(filter(lambda x: x >= 0.33 and x <= 0.66, emotions)))
    labels = [u'积极', u'中性', u'消极']
    values = [positive, neutral, negative]
    # plt.rcParams['font.sans-serif'] = ['simHei']
    # plt.rcParams['axes.unicode_minus'] = False
    plt.xlabel('情感判断')
    plt.ylabel('频数')
    plt.xticks(range(3), labels)
    plt.legend(loc='upper right')
    plt.bar(range(3), values, color='rgb')
    plt.title(u'%s好友的情感分析' % friends[0]['NickName'])
    plt.show()
Example 14
 def analysis_mention(text):
     s = snownlp.SnowNLP(text)
     mention = s.sentiments
     if mention > 0.6:
         return 1
     elif mention < 0.4:
         return -1
     else:
         return 0
     pass
Example 15
def save_data():
    df = pd.read_csv('./douban_book/comment_25984204.txt')
    df["sentiments"] = df["content"].map(
        lambda c: snownlp.SnowNLP(c).sentiments)
    engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/douban')
    df.to_sql(name='book',
              con=engine,
              chunksize=1000,
              if_exists='replace',
              index=None)
Example 16
	def test_seg(self):
		# Jiagu segmentation
		jiagu_result = []
		for sen in sentence:
			jiagu_result.append(jiagu.seg(sen))

		# jieba segmentation
		jieba_result = []
		for sen in sentence:
			jieba_result.append(jieba.cut(sen))
		
		# HIT LTP
		pyltp_result = []
		for sen in sentence:
			pyltp_result.append(self.ltpseg.segment(sen))
			
		# HanLP
		pyhanlp_result = []
		for sen in sentence:
			words = []
			for term in pyhanlp.HanLP.segment(sen):
				words.append(term.word)
			pyhanlp_result.append(words)
		
		# THULAC (Tsinghua University segmenter)
		thulac_result = []
		for sen in sentence:
			thulac_result.append(self.thu1.cut(sen, text=True).split())
			
		# NLPIR
		pynlpir_result = []
		for sen in sentence:
			pynlpir_result.append(pynlpir.segment(sen, pos_tagging=False))
			
		# SnowNLP
		snownlp_result = []
		for sen in sentence:
			snownlp_result.append(snownlp.SnowNLP(sen).words)
			
		# FoolNLTK
		fool_result = fool.cut(sentence)

		for sen, jgr, jbr, ltp, hanlp, thu, nlpir, snow, fnltk, in zip(sentence, jiagu_result,
					jieba_result, pyltp_result, pyhanlp_result,
					thulac_result, pynlpir_result, snownlp_result, fool_result):
			print('句子:\t\t' + sen + '\n')
			print('结巴:\t\t' + ' '.join(jbr))
			print('HanLP:\t\t' + ' '.join(hanlp))
			print('SnowNLP\t\t' + ' '.join(snow))
			print('FoolNLTK\t' + ' '.join(fnltk))
			print('甲骨:\t\t' + ' '.join(jgr))
			print('哈工大:\t' + ' '.join(ltp))
			print('清华:\t\t' + ' '.join(thu))
			print('NLPIR:\t\t' + ' '.join(nlpir))
			print('\n')
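
The comparison above needs several extra packages and, for some of them, locally installed models or licence files (LTP, THULAC, NLPIR, FoolNLTK). A cut-down sketch limited to the two segmenters that work straight from pip, jieba and SnowNLP (the test sentence is made up):

import jieba
import snownlp

sen = u"自然语言处理是人工智能的重要方向"
print("jieba:  \t" + " ".join(jieba.cut(sen)))
print("SnowNLP:\t" + " ".join(snownlp.SnowNLP(sen).words))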
Example 17
 def get_sentiment_cn(text):
     try:
         s = snownlp.SnowNLP(text).sentiments
         if s == 0.5:
             return '中性'
         elif s > 0.5:
             return "积极"
         else:
             return "消极"
     except:
         return text
Example 18
    def sentimentDeal(self):
        self.result = [0, 0, 0]
        for i in range(len(self.data)):
            v = snownlp.SnowNLP(self.data[i]).sentiments

            if v <= 0.33:
                self.result[0] += 1
            elif v > 0.33 and v <= 0.66:
                self.result[1] += 1
            else:
                self.result[2] += 1
Example 19
def wordsence(self):
    sentimentslist = []
    for li in self:
        #print(li)
        s = snownlp.SnowNLP(li)
        #print(s.sentiments)
        sentimentslist.append(s.sentiments)
    myfont = fm.FontProperties(fname='C:\Windows\Fonts\simsun.ttc')

    plt.title('琅琊榜之风起长林豆瓣评论情感分析', fontproperties=myfont)
    plt.hist(sentimentslist, bins=np.arange(0, 1, 0.01))
    plt.show()
Example 20
    def save_feedback(self, path, feedback_json_all):
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.worksheet = self.workbook.add_sheet('Feedback')
        excel_title = [
            '作者', '版本', '打分', '评论id', '标题', '内容', '标题情感分析', '内容情感分析'
        ]
        for i in range(0, 8):
            self.worksheet.write(0, i, label=excel_title[i])
        j = 1
        for i in feedback_json_all:
            # if i['id']['label'] does not exist
            self.worksheet.write(j, 0, label=i['author']['name']['label'])
            self.worksheet.write(j, 1, label=i['im:version']['label'])
            #print(i['im:rating']['label'])
            self.worksheet.write(j, 2, label=i['im:rating']['label'])
            #print(i['id']['label'])
            self.worksheet.write(j, 3, label=i['id']['label'])
            #print(i['title']['label'])
            self.worksheet.write(j, 4, label=i['title']['label'])
            #print(i['content']['label'])
            self.worksheet.write(j, 5, label=i['content']['label'])
            #sentiments
            s_title = snownlp.SnowNLP(i['title']['label'])
            s_title = float('%.2f' % s_title.sentiments)
            #print(s)
            self.worksheet.write(j, 6, label=s_title)

            s_content = snownlp.SnowNLP(i['content']['label'])
            s_content = float('%.2f' % s_content.sentiments)
            #print(s)
            self.worksheet.write(j, 7, label=s_content)

            j += 1
        #else:
        #   break

        self.workbook.save(path)
        print('Excel文件保存完成到:{}'.format(path))
Example 21
 def predict(self, sentence):
     words = list(jieba.cut(snownlp.SnowNLP(sentence).han))
     length = len(words)
     print(
         list(map(lambda x: x
                  if x in self.vocab else UNKNOWN_TOKEN, words)))
     words = list(
         map(
             lambda x: self.vocab[x]
             if x in self.vocab else self.vocab[UNKNOWN_TOKEN], words))
     matrix = csr_matrix((np.ones(length, ), (range(length), words)),
                         shape=(length, self.x_dim),
                         dtype=np.float32)
     print(self.predictor.eval({self.x: matrix})[0] + 1)
Example 22
    def chinese2pinyin_v1(self, x, method='xpinyin'):
        if not self.is_chinese(x):
            return x
        else:
            if method == 'xpinyin':
                res = self.P.get_pinyin(x, "").lower()
                return res

            elif method == 'snowNLP':
                pin_yin = snownlp.SnowNLP(x)
                try:
                    res = pin_yin.pinyin
                    return "".join(res)
                except Exception as e:
                    pass
Example 23
def test(filename, to_filename):
    '''Product reviews - sentiment analysis'''
    averageSentiment = 0

    with open(f'{filename}.csv', 'r', encoding=ENCODING) as fr:
        
        for line in fr.readlines():
            s = snownlp.SnowNLP(line)
            averageSentiment += s.sentiments
            if s.sentiments > 0.6:
                res = '喜欢'
                res_list.append(1)
            elif s.sentiments < 0.4:
                res = '不喜欢'
                res_list.append(-1)
            else:
                res = '一般'
                res_list.append(0)
            sent_dict = {
                '情感分析结果': s.sentiments,
                '评价倾向': res,
                '商品评论': line.replace('\n', '')
            }
            sentiment_list.append(sent_dict)
            # print(sent_dict)

        # compute the mean of the sentiment scores
        averageSentiment = averageSentiment / len(sentiment_list)
        # convert to a JSON object; note: the earlier variant wrote the fields to file without the JSON braces
        result = {'comments_num': len(sentiment_list), 'average_sentiment': averageSentiment }
        json_result = json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
        print(json_result)

        # write the result to a file
        if os.path.exists('average_sentiment.txt'):
            os.remove('average_sentiment.txt')
        with open(f'average_sentiment.txt','x', encoding='utf8') as text_file:
            text_file.write(json_result)
            # json_item = "'commentsNum': " + str(len(sentiment_list)) + ",\n"
            # text_file.write(json_item)
            # json_item = "'averageSentiment': " + str(averageSentiment) + ",\n"
            # text_file.write(json_item)


        df = pd.DataFrame(sentiment_list)
        df.to_csv(f'{to_filename}.csv', index=None, encoding=ENCODING,
                  index_label=None, mode='w')
Example 24
 def itchat_friends_sign(self):
     signatures = ''
     emotions = []
     pattern = re.compile(r"1f\d.+")
     for friend in self.friends:
         signature = friend['Signature']
         if signature!=None:
             signature = signature.strip().replace('span','').replace('class','').replace('emoji','')
             if (len(signature) > 0):
                 nlp = snownlp.SnowNLP(signature)
                 emotions.append(nlp.sentiments)
                 signatures += ' '.join(jieba.analyse.extract_tags(signature, 5))
     with open('signatures.txt', 'wt', encoding='utf-8') as file:
         file.write(signatures)
     back_coloring = np.array(Image.open('timg (1).jpg'))
     wordcloud =WordCloud(
         font_path='C:/Windows/Fonts/simfang.ttf',
         background_color="white",
         max_words=1200,
         mask=back_coloring,
         max_font_size=75,
         random_state=45,
         width=960,
         height=720,
         margin=15
     )
     wordcloud.generate(signatures)
     plt.imshow(wordcloud)
     plt.axis("off")
     plt.show()
     # Signature Emotional Judgment
     count_good = len(list(filter(lambda x: x > 0.66, emotions)))
     count_normal = len(list(filter(lambda x: x >= 0.33 and x <= 0.66, emotions)))
     count_bad = len(list(filter(lambda x: x < 0.33, emotions)))
     labels = [u'负面消极', u'中性', u'正面积极']
     values = (count_bad, count_normal, count_good)
     plt.rcParams['font.sans-serif'] = ['simHei']
     plt.rcParams['axes.unicode_minus'] = False
     plt.xlabel(u'情感判断')
     plt.ylabel(u'频数')
     plt.xticks(range(3), labels)
     plt.legend(loc='upper right', )
     plt.bar(range(3), values, color='rgb')
     plt.title(u'%s的微信好友签名信息情感分析' % self.friends[0]['NickName'])
     plt.show()
Example 25
def useful_word_filter(words):
    '''Remove useless words such as time words and adverbs, plus words like '豆瓣' that are frequent but carry no information'''
    wordlist = []
    for word in words:
        # filter by part-of-speech tag
        if word.flag in ['f', 'm', 'v', 'nt', 'r', 'c', 'd']:
            pass
        # filter out irrelevant words
        elif word.word in ['豆瓣', '小说', '平装', '王德威']:
            pass
        # sentiment check
        elif snownlp.SnowNLP(word.word).sentiments > 0.7:
            wordlist.append(word.word)
    # the above can be expressed in a single line of code:
    # wordlist = [word.word if not(word.flag in ['f', 'm', 'v', 'nt', 'r', 'c', 'd'] and word.word in ['豆瓣', '小说', '平装', '王德威'] and snownlp.SnowNLP(word.word).sentiments <= 0.7) else '' for word in words]
    # with open('result_useful_word_filter.txt') as f:
    #     pickle.dump(wordlist, f)
    return wordlist
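
The commented one-liner above does not quite reproduce the loop (it combines the conditions with 'and' and inserts empty strings). Here is a self-contained sketch of an equivalent comprehension, assuming the words come from jieba.posseg so that each item has .word and .flag:

import jieba.posseg
import snownlp

def useful_word_filter_oneline(text):
    # same filter as the loop above, written as a single comprehension
    bad_flags = ['f', 'm', 'v', 'nt', 'r', 'c', 'd']
    bad_words = ['豆瓣', '小说', '平装', '王德威']
    return [w.word for w in jieba.posseg.cut(text)
            if w.flag not in bad_flags
            and w.word not in bad_words
            and snownlp.SnowNLP(w.word).sentiments > 0.7]

print(useful_word_filter_oneline(u"这本小说的故事非常精彩"))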
Example 26
    def processOnePost(self, book):
        url1 = "https://bbs.hupu.com/%s.html" % (book[0])
        status_code, html = utils.getHtml(url1)
        print("status_code, ", status_code)
        if status_code != 200:
            return

        soup = BeautifulSoup(html, "lxml")
        chucuole=soup.find("h4")
        if chucuole!= None:
            print(chucuole.get_text())
            if chucuole.get_text().startswith("\n嗯,出错了..."):
                print("嗯,出错了...")
                return

        subhead = soup.find_all(attrs={"class": "subhead"})
        subheadstr = subhead[0].get_text()

        quote_content = soup.find_all(attrs={"class": "quote-content"})
        quote_content_str = quote_content[0].get_text()



        sn = snownlp.SnowNLP(subheadstr)
        print(subheadstr)
        print(sn.summary()[0], sn.sentiments, sn.keywords())
        senti = ", 呵呵"
        if sn.sentiments>=0.8:
            senti = "happy, 哈哈"
        if sn.sentiments<=0.2:
            senti = "悲伤, 气愤"

        reponse_str = "%s, %s" % (chat_utils.deepThought.get_response(sn.summary()[0]), senti)
        self._post_writer.write("url\001%s\n" % url1)
        self._post_writer.write("subhead\001%s\n" % subheadstr)
        self._post_writer.write("quote_content\001%s\n" % quote_content_str)
        self._post_writer.write("sn.summary()[0]\001%s\n" % sn.summary()[0])
        self._post_writer.write("reponse_str\001%s\n" % reponse_str)
Example 27
    def setSentimentSimilarity(self):
        """
        计算评论文本的情感相似度
        使用snownlp(背后是朴素贝叶斯方法)来判断评论的情感,从0(消极)~1(积极)分布,然后计算其标准差
        有待改进:分类精度问题,即目前的情感分类的工具的都很笨,对于复杂一点的句式就不行了,也许用自己以前的可能更好
        :return: none
        """
        col = self.mdb.sentimentSimilarity
        if not col.find_one():
            logging.info('sentimentSimilarity为空,设置主键为wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"')
        # all_wblog = swblog + wblog

        swblog = self.swblog
        wblog = self.wblog
        unknown = self.unknown
        all_wblog = swblog + wblog + unknown

        # some comments are very short or contain no real words
        # such weibos do not take part in the sentiment-polarity computation
        # filtering method: after word segmentation, drop texts in which not a single word remains
        stop_words = WblogFeature.get_stop_words(
            os.path.dirname(os.getcwd()) + '/microblog/stop_words.txt')

        cc = MongoClient().comment.comment

        for wblogId in all_wblog:
            corpus = []
            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    text = self.remove_html(comment['json_text']['text'])
                    text = self.remove_tag(text)
                    fenci = list(jieba.cut_for_search(text))
                    if len(fenci) == 0:
                        continue
                    # jieba segmentation provides no stop-word interface, so stop words are removed manually
                    stop_cnt = 0
                    for word in fenci:
                        if word in stop_words:
                            stop_cnt += 1
                    if stop_cnt == len(fenci):
                        continue
                    corpus.append(text)
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

            std = 0.0
            if len(corpus) > 3:
                sentiment_list = []
                for text in corpus:
                    sentiment_list.append(snownlp.SnowNLP(text).sentiments)
                std = numpy.std(numpy.array(sentiment_list), ddof=1)

            try:
                if wblogId in swblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'true',
                        'sentiment_similarity': std
                    })
                elif wblogId in wblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'false',
                        'sentiment_similarity': std
                    })
                elif wblogId in unknown:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'unknown',
                        'sentiment_similarity': std
                    })

            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
        logging.info('setSentimentSimilarity finished')
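
The similarity measure used above is simply the sample standard deviation (ddof=1) of the per-comment SnowNLP scores for one weibo. A tiny illustration with made-up scores:

import numpy

scores = [0.91, 0.88, 0.10, 0.95]  # hypothetical SnowNLP sentiments of one weibo's comments
std = numpy.std(numpy.array(scores), ddof=1)
print(std)  # a large value means the comments disagree in sentiment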
Example 28
# from snownlp import SnowNLP
# s = SnowNLP(u'这个东西真心很赞')
# print(s.words)

import snownlp as nlp
# s = nlp.SnowNLP(u'这个东西真心很赞')
# print(s.words)
# print(s.tags)
print('请输入评价:')
comment = input()
while comment != 'Q':
    s = nlp.SnowNLP(comment)
    # probability that this Chinese text expresses positive sentiment
    positive_prob = s.sentiments
    print(s.sentiments)
    if positive_prob > 0.7:
        print('持积极评价')
    elif positive_prob > 0.3:
        print('中性评价')
    else:
        print('持消极评价')

    print('请输入评价:')
    comment = input()
Example 29
                        mask=mk,
                        scale=15)

# Segment the text from the external file into Chinese words, producing a list of positive words and a list of negative words
f = open('三体黑暗森林.txt',encoding='GBK')
txt = f.read()
txtlist = jieba.lcut(txt)
positivelist = []
negativelist = []

# Run sentiment analysis on every word in the text: sentiment > 0.96 counts as a positive word, < 0.06 as a negative word
print('开始进行情感分析,请稍等,三国演义全文那么长的文本需要三分钟左右')
# import the third-party NLP library snownlp
import snownlp
for each in txtlist:
    each_word = snownlp.SnowNLP(each)
    feeling = each_word.sentiments
    if feeling > 0.96:
        positivelist.append(each)
    elif feeling < 0.06:
        negativelist.append(each)
    else:
        pass
# Merge the positive and negative lists into a positive string and a negative string, with words separated by spaces
positive_string = " ".join(positivelist)
negative_string = " ".join(negativelist)


# Pass the strings to the word clouds' generate() methods to supply their text
w1.generate(positive_string)
w2.generate(negative_string)
Example 30
    def parse(self, response):
        # print(response.text)
        # [parse]
        weibo_list = response.xpath("//div[@class='c' and @id]")
        # split the page into individual weibos
        for weibo in weibo_list:
            # find the divs inside this weibo
            div_list = weibo.xpath("./div")
            item = WeiboproItem()
            if len(div_list) == 1:
                # original post without a picture
                item["categary"] = "YC NO PIC"
                item["name"] = weibo.xpath(
                    ".//a[@class='nk']/text()").extract_first()
                item["content"] = "\n".join(
                    weibo.xpath(".//span[@class='ctt']//text()").extract())
                item["dianzan"] = weibo.xpath(".//div/a/text()").extract()[-4]
                item["pinglun"] = weibo.xpath(".//div/a/text()").extract()[-2]
                item["zhuanfa"] = weibo.xpath(".//div/a/text()").extract()[-3]
                q = snownlp.SnowNLP("\n".join(
                    weibo.xpath(".//span[@class='ctt']//text()").extract()))
                qingganzhi = q.sentiments
                item["qingganzhi"] = float(qingganzhi)

            elif len(div_list) == 2:

                item["name"] = weibo.xpath(
                    ".//a[@class='nk']/text()").extract_first()
                item["content"] = "\n".join(
                    weibo.xpath(".//span[@class='ctt']//text()").extract())
                item["dianzan"] = weibo.xpath(
                    ".//div[2]/a/text()").extract()[-4]
                item["pinglun"] = weibo.xpath(
                    ".//div[2]/a/text()").extract()[-2]
                item["zhuanfa"] = weibo.xpath(
                    ".//div[2]/a/text()").extract()[-3]
                # distinguish further by whether there is a picture
                img_src = weibo.xpath(".//img[@class='ib']/@src")
                if len(img_src) > 0:
                    # original post with a picture
                    item["categary"] = "YC PIC"
                    item["pic"] = img_src.extract_first()
                    q = snownlp.SnowNLP("\n".join(
                        weibo.xpath(
                            ".//span[@class='ctt']//text()").extract()))
                    qingganzhi = q.sentiments
                    item["qingganzhi"] = float(qingganzhi)
                else:
                    # repost without a picture
                    item["categary"] = "ZF NO PIC"
                    item["liyou"] = weibo.xpath(
                        ".//div[2]//text()").extract()[1]
                    q = snownlp.SnowNLP(
                        weibo.xpath(".//div[2]//text()").extract()[1])
                    qingganzhi = q.sentiments
                    item["qingganzhi"] = float(qingganzhi)
            else:
                # repost with a picture
                item["categary"] = "ZF PIC"
                item["name"] = weibo.xpath(
                    ".//a[@class='nk']/text()").extract_first()
                item["content"] = "\n".join(
                    weibo.xpath(".//span[@class='ctt']//text()").extract())
                item["dianzan"] = weibo.xpath(
                    ".//div[3]/a/text()").extract()[-4]
                item["pinglun"] = weibo.xpath(
                    ".//div[3]/a/text()").extract()[-2]
                item["zhuanfa"] = weibo.xpath(
                    ".//div[3]/a/text()").extract()[-3]

                item["pic"] = weibo.xpath(
                    ".//img[@class='ib']/@src").extract_first()
                item["liyou"] = weibo.xpath(".//div[3]//text()").extract()[1]
                q = snownlp.SnowNLP(
                    weibo.xpath(".//div[3]//text()").extract()[1])
                qingganzhi = q.sentiments
                item["qingganzhi"] = float(qingganzhi)

            print(item)
            yield item