import json
import operator

import snownlp


def analysis_data():
    """Load the crawled Bilibili data, tag each text's sentiment, and print the top-10 most frequent words.

    :return: None
    """
    with open("../resource/bilibili/bilibili_data.json", "r") as f:
        d = json.load(f)
    d["text_analysis"] = list()
    count1, count2 = 0, 0
    for text in d["text"]:
        t = snownlp.SnowNLP(text)
        if t.sentiments > 0.5:   # scores above 0.5 count as positive
            d["text_analysis"].append(1)
            count1 += 1
        else:                    # otherwise negative
            d["text_analysis"].append(-1)
            count2 += 1
    print("count1 : ", count1, "\t count2 : ", count2)

    # word-frequency analysis over the whole corpus
    text = "".join(d["text"])
    s = snownlp.SnowNLP(text)
    dict_word = dict()
    s_word = s.words
    for it in s_word:
        dict_word[it] = s_word.count(it)
    dict_word = sorted(dict_word.items(), key=operator.itemgetter(1), reverse=True)[:10]
    for t in dict_word:
        print(t[0], "\t", t[1])
def save_mysql(self, feedback_json_all):
    cur = self.db.cursor()
    cur.execute('use feedback_database')
    try:
        create_projectid_table = 'create table projectid_%d(author varchar(100),version varchar(10),rating int(20),id varchar(30),title text(50),content text(2000),sent_title float(8),sent_content float(8))' % self.ProjectID
        cur.execute(create_projectid_table)
        self.db.commit()
        print('----------------Table 创建成功----------------')
    except Exception:
        # the table most likely already exists
        pass
    projectid = 'projectid_%s' % self.ProjectID
    idcount = 0
    for i in feedback_json_all:
        argv = "author,version,rating,id,title,content,sent_title,sent_content"
        author = str(i['author']['name']['label'])
        version = str(i['im:version']['label'])
        rating = str(i['im:rating']['label'])
        id = str(i['id']['label'])
        title = str(i['title']['label'])
        content = str(i['content']['label'])
        # sentiment scores for the review title and body, rounded to two decimals
        sent_title = float('%.2f' % snownlp.SnowNLP(title).sentiments)
        sent_content = float('%.2f' % snownlp.SnowNLP(content).sentiments)
        mql_value = (author, version, rating, id, title, content, sent_title, sent_content)
        try:
            ss = "select * from {} where id = {} ".format(projectid, id)
            idcount = cur.execute(ss)
        except Exception as e:
            print('运行错误', e)
        if idcount == 0:
            # no row with this id yet: insert a new record
            insert_content = 'insert into {}({}) values{}'.format(projectid, argv, mql_value)
            cur.execute(insert_content)
        else:
            print('有重复!! {}'.format(id))
    self.db.commit()  # commit the inserts
    cur.close()
    self.db.close()
    print('mql saved')
def __computeOneDaySentiments(self, date):
    meassageInOneDay = self.messageDataFrame[self.messageDataFrame.date == date + ' ' + '00:00:00']
    sentimentsGrades = []
    if len(meassageInOneDay.message) == 0:
        # If there is no news, set this date's sentiment grade to 0.5 (a neutral value).
        self.messageSentiments[date] = 0.5
    else:
        # If there is news, score every news item and use the mean as the day's final grade.
        for Message in meassageInOneDay.message:
            s = sn.SnowNLP(Message)
            # A news item's grade is the mean of its words' sentiment scores.
            MessageSentGrades = np.mean([sn.SnowNLP(word).sentiments for word in s.words])
            # Collect every news item's grade for this day.
            sentimentsGrades.append(MessageSentGrades)
        self.messageSentiments[date] = np.mean(sentimentsGrades)
import snownlp


def get_sentiment_cn(text):
    s = snownlp.SnowNLP(text)
    res = s.sentiments
    if res > 0.3:
        return "积极"
    else:
        return "消极"
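# --- Usage sketch (added for illustration, not part of the original source) ---
# get_sentiment_cn() only depends on snownlp, which is imported above; the sample
# sentences below are made up.
if __name__ == '__main__':
    for sample in ['这个东西真心很赞', '质量太差,非常失望']:
        print(sample, '->', get_sentiment_cn(sample))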
def commentparse(self, response):
    # Scrapy callback that crawls Weibo comments
    status_after_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id=%s&max_id_type=%s"
    message_id = response.meta.get("message_id")
    keyword = response.meta.get("keyword")
    results = json.loads(response.text)
    if results.get("ok"):
        max_id = results.get("data").get("max_id")
        max_id_type = results.get("data").get("max_id_type")
        if max_id:
            # Comments come 10 per page; the next page's max_id is given in the current JSON.
            yield scrapy.Request(
                url=status_after_url % (message_id, message_id, str(max_id), str(max_id_type)),
                callback=self.commentparse,
                meta={"keyword": keyword, "message_id": message_id})
        datas = results.get("data").get("data")
        for data in datas:
            text1 = data.get("text")
            like_count = data.get("like_count")
            user1 = data.get("user").get("screen_name")
            user_url = data.get("user").get("profile_url")
            emotion = snownlp.SnowNLP(text1).sentiments  # sentiment score from SnowNLP
            weibocommentitem = WeiboCommentItem()
            weibocommentitem["title"] = keyword
            weibocommentitem["message_id"] = message_id
            weibocommentitem["text1"] = text1
            weibocommentitem["user1"] = user1
            weibocommentitem["user_url"] = user_url
            weibocommentitem["emotion"] = emotion
            yield weibocommentitem
def run():
    df = pd.read_csv('./cleanfile.csv', encoding='utf-8', sep=',')
    df["sentiments"] = df["content"].map(lambda c: snownlp.SnowNLP(c).sentiments)
    df["keywords"] = df["content"].map(getKeyWord)
    df["input_time"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/sina')
    engine = create_engine('mysql+mysqlconnector://root:@127.0.0.1:3306/sina')
    dtypedict = {
        'id': Integer(),
        'mid': VARCHAR(length=50),
        'content': TEXT,
        'uid': VARCHAR(length=15),
        'area': VARCHAR(length=15),
        'nick': VARCHAR(length=50),
        'ip': VARCHAR(length=15),
        'newsid': VARCHAR(length=50),
        'time': DATETIME(),
        'sentiments': DECIMAL(10, 10),
        'keywords': VARCHAR(length=100),
        'input_time': DATETIME(),
    }
    df.to_sql(name='news', con=engine, chunksize=100000, if_exists='replace',
              index=True, index_label='id', dtype=dtypedict)
def test(filename, to_filename):
    """Product review sentiment analysis (test)."""
    with open(f'scrapingfile/{filename}.csv', 'r', encoding='utf-8-sig') as fr:
        for line in fr.readlines():
            s = snownlp.SnowNLP(line)
            if s.sentiments >= 0.8:
                res = '超赞'
                res_list.append(1)
            elif 0.6 <= s.sentiments < 0.8:
                res = '喜欢'
                res_list.append(0.5)
            elif 0.2 <= s.sentiments < 0.4:
                res = '还行'
                res_list.append(-0.5)
            elif s.sentiments < 0.2:
                res = '厌恶'
                res_list.append(-1)
            else:
                res = '一般'
                res_list.append(0)
            sent_dict = {
                '情感分析结果': s.sentiments,
                '评价倾向': res,
                '商品评论': line.replace('\n', '')
            }
            sentiment_list.append(sent_dict)
            print(sent_dict)
    # the with-block closes the file, so no explicit fr.close() is needed
    df = pd.DataFrame(sentiment_list)
    df.to_csv(f'scrapingfile/{to_filename}.txt', index=None,
              encoding='utf-8-sig', index_label=None, mode='w')
def generateTextFeature(self, item):
    content = BeautifulSoup(item['mcontent'], self.DEFAULT_PARSER)
    content = re.sub(r"\s", "", content.get_text())
    if content and len(content) != 0:
        title_words = list(jieba.cut(item['mtitle']))
        # keyword candidates from jieba's TF-IDF extractor and from SnowNLP
        jb_tags = jieba.analyse.extract_tags(content, self.KEYWORD_NUMBER, withWeight=True)
        result = snownlp.SnowNLP(content)
        sn_tags = result.keywords(self.KEYWORD_NUMBER)
        # combine the two candidate lists with the title words via clacWordWeight()
        result_tag, tag_len = self.clacWordWeight(jb_tags, sn_tags, title_words)
        top_tags = list(result_tag.items())
        top_tags = sorted(top_tags, key=lambda x: x[1], reverse=True)
        top_tag_map = {}
        for index in range(0, tag_len):
            top_tag_map[top_tags[index][0]] = top_tags[index][1]
        result_tag_text = {'type': item['mtags'], 'tag': top_tag_map}
        global tag_relation_tmp
        tag_relation_tmp = [item['mtags'], list(result_tag.keys())]
        item['mtags'] = json.dumps(result_tag_text, ensure_ascii=False)
def test(path):
    with open(path, 'r', encoding='utf-8') as fr:
        for line in fr.readlines():
            s = snownlp.SnowNLP(line)
            sent_dict = {'情感分析结果': s.sentiments, '微博内容': line}
            sentiment_list.append(sent_dict)
            print(sent_dict)
def test(filename, to_filename):
    """Product review sentiment analysis (test)."""
    with open(filename, 'r', encoding='gbk') as fr:
        for line in fr.readlines():
            s = snownlp.SnowNLP(line)
            if s.sentiments > 0.6:
                res = '喜欢'
                res_list.append(1)
            elif s.sentiments < 0.4:
                res = '不喜欢'
                res_list.append(-1)
            else:
                res = '一般'
                res_list.append(0)
            sent_dict = {
                '情感分析结果': s.sentiments,
                '评价倾向': res,
                '商品评论': line.replace('\n', '')
            }
            sentiment_list.append(sent_dict)
            print(sent_dict)
    df = pd.DataFrame(sentiment_list)
    df.to_csv(to_filename, index=None, encoding='gbk', index_label=None, mode='w')
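# --- Usage sketch (added for illustration, not part of the original source) ---
# test() above relies on module-level lists res_list and sentiment_list as well as pandas
# and snownlp; the file names below are placeholders, not paths from the original project.
import pandas as pd
import snownlp

res_list = []
sentiment_list = []
# test('comments.csv', 'comments_sentiment.csv')  # run against a real GBK-encoded review file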
def postTiezi(word):
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get('https://passport.baidu.com/v2/?login')
    time.sleep(2)
    driver.find_element_by_id("TANGRAM__PSP_3__footerULoginBtn").click()
    driver.find_element_by_name("userName").clear()
    driver.find_element_by_name("userName").send_keys('13009806115')
    driver.find_element_by_name("password").clear()
    driver.find_element_by_name("password").send_keys('1245586921')
    driver.find_element_by_id("TANGRAM__PSP_3__submit").click()
    time.sleep(50)
    driver.get('https://tieba.baidu.com/f?kw=万元归一诀&fr=index')
    time.sleep(6)
    if snownlp.SnowNLP(word).sentiments > 0.5:
        for i in range(0, 10):
            driver.find_element_by_name("title").send_keys('怎么会有这么弱智的问题')
            driver.find_element_by_id("ueditor_replace").send_keys('爬爬爬')
            time.sleep(2)
            driver.find_element_by_xpath('//*[@id="tb_rich_poster"]/div[3]/div[5]/div/button[1]').click()
    else:
        for i in range(0, 10):
            driver.find_element_by_name("title").send_keys('是个好问题')
            driver.find_element_by_id("ueditor_replace").send_keys('赞赞赞')
            time.sleep(2)
            driver.find_element_by_xpath('//*[@id="tb_rich_poster"]/div[3]/div[5]/div/button[1]').click()
def run():
    df = pd.read_csv('./cleanfile.csv', encoding='utf-8', sep=',')
    df["sentiments"] = df["content"].map(lambda c: snownlp.SnowNLP(c).sentiments)
    df["keywords"] = df["content"].map(getKeyWord)
    # engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/sina')
    engine = create_engine(
        'mysql+mysqlconnector://root:[email protected]:3306/sina?charset=utf8&connect_timeout=10'
    )
    dtypedict = {
        'id': Integer(),
        'uid': VARCHAR(length=15),
        'area': VARCHAR(length=15),
        'ipadd': VARCHAR(length=15),
        'usertype': VARCHAR(length=10),
        'agree': VARCHAR(length=10),
        'cmttime': DATETIME(),
        'content': TEXT,
        'sentiments': DECIMAL(10, 10),
        'keywords': VARCHAR(length=100),
    }
    df.to_sql(name='news', con=engine, chunksize=100000, if_exists='replace',
              index=True, index_label='id', dtype=dtypedict)
def analyse_Signature(friends):
    """
    Analyse friends' signatures.
    :param friends:
    :return:
    """
    signatures = ''
    emotions = []
    for friend in friends:
        signature = friend['Signature']
        print(signature)
        if signature is not None:
            signature = signature.strip().replace('span', '').replace('class', '').replace('emoji', '')
            signature = re.sub(r'1f(\d.+)', '', signature)
            print('signature>>>', signature)
            if len(signature) > 0:
                # sentiment score of the signature
                nlp = snownlp.SnowNLP(signature)
                emotions.append(nlp.sentiments)
                # keyword extraction
                signatures += ' '.join(jieba.analyse.extract_tags(signature, 5))
    print('signatures>>>', signatures)

    # word cloud of the signature keywords
    # load the background image
    img_back = imread('heart.png')
    wordcloud = WordCloud(
        background_color='white',  # background colour of the cloud
        max_words=2000,            # maximum number of words
        mask=img_back,
        font_path=r'E:\Web-Crawler\Wechat\SimHei.ttf',  # a Chinese font is required, otherwise Chinese characters will not render
        max_font_size=45,          # maximum font size
        random_state=30,
        scale=1.5,
    )
    wordcloud.generate(signatures)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    wordcloud.to_file('signatures.jpg')

    # the sentiment proportions reflect the friends' outlook on life
    positive = len(list(filter(lambda x: x > 0.66, emotions)))
    negative = len(list(filter(lambda x: x < 0.33, emotions)))
    neutral = len(list(filter(lambda x: 0.33 <= x <= 0.66, emotions)))
    lables = [u'积极', u'中性', u'消极']
    # the values must follow the same order as the labels: positive, neutral, negative
    values = [positive, neutral, negative]
    # plt.rcParams['font.sans-serif'] = ['simHei']
    # plt.rcParams['axes.unicode_minus'] = False
    plt.xlabel('情感判断')
    plt.ylabel('频数')
    plt.xticks(range(3), lables)
    plt.legend(loc='upper right')
    plt.bar(range(3), values, color='rgb')
    plt.title(u'%s好友的情感分析' % friends[0]['NickName'])
    plt.show()
import snownlp


def analysis_mention(text):
    s = snownlp.SnowNLP(text)
    mention = s.sentiments
    if mention > 0.6:
        return 1
    elif mention < 0.4:
        return -1
    else:
        return 0
def save_data():
    df = pd.read_csv('./douban_book/comment_25984204.txt')
    df["sentiments"] = df["content"].map(lambda c: snownlp.SnowNLP(c).sentiments)
    engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/douban')
    df.to_sql(name='book', con=engine, chunksize=1000, if_exists='replace', index=None)
def test_seg(self):
    # Jiagu segmentation
    jiagu_result = []
    for sen in sentence:
        jiagu_result.append(jiagu.seg(sen))
    # jieba segmentation
    jieba_result = []
    for sen in sentence:
        jieba_result.append(jieba.cut(sen))
    # HIT LTP
    pyltp_result = []
    for sen in sentence:
        pyltp_result.append(self.ltpseg.segment(sen))
    # HanLP
    pyhanlp_result = []
    for sen in sentence:
        words = []
        for term in pyhanlp.HanLP.segment(sen):
            words.append(term.word)
        pyhanlp_result.append(words)
    # THULAC (Tsinghua)
    thulac_result = []
    for sen in sentence:
        thulac_result.append(self.thu1.cut(sen, text=True).split())
    # NLPIR
    pynlpir_result = []
    for sen in sentence:
        pynlpir_result.append(pynlpir.segment(sen, pos_tagging=False))
    # SnowNLP
    snownlp_result = []
    for sen in sentence:
        snownlp_result.append(snownlp.SnowNLP(sen).words)
    # FoolNLTK
    fool_result = fool.cut(sentence)

    for sen, jgr, jbr, ltp, hanlp, thu, nlpir, snow, fnltk in zip(
            sentence, jiagu_result, jieba_result, pyltp_result, pyhanlp_result,
            thulac_result, pynlpir_result, snownlp_result, fool_result):
        print('句子:\t\t' + sen + '\n')
        print('结巴:\t\t' + ' '.join(jbr))
        print('HanLP:\t\t' + ' '.join(hanlp))
        print('SnowNLP\t\t' + ' '.join(snow))
        print('FoolNLTK\t' + ' '.join(fnltk))
        print('甲骨:\t\t' + ' '.join(jgr))
        print('哈工大:\t' + ' '.join(ltp))
        print('清华:\t\t' + ' '.join(thu))
        print('NLPIR:\t\t' + ' '.join(nlpir))
        print('\n')
import snownlp


def get_sentiment_cn(text):
    try:
        s = snownlp.SnowNLP(text).sentiments
        if s == 0.5:
            return '中性'
        elif s > 0.5:
            return "积极"
        else:
            return "消极"
    except Exception:
        # fall back to returning the raw text when scoring fails
        return text
def sentimentDeal(self):
    # result = [negative count, neutral count, positive count]
    self.result = [0, 0, 0]
    for i in range(len(self.data)):
        v = snownlp.SnowNLP(self.data[i]).sentiments
        if v <= 0.33:
            self.result[0] += 1
        elif 0.33 < v <= 0.66:
            self.result[1] += 1
        else:
            self.result[2] += 1
def wordsence(self):
    sentimentslist = []
    for li in self:
        s = snownlp.SnowNLP(li)
        sentimentslist.append(s.sentiments)
    myfont = fm.FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc')
    plt.title('琅琊榜之风起长林豆瓣评论情感分析', fontproperties=myfont)
    plt.hist(sentimentslist, bins=np.arange(0, 1, 0.01))
    plt.show()
def save_feedback(self, path, feedback_json_all):
    self.workbook = xlwt.Workbook(encoding='utf-8')
    self.worksheet = self.workbook.add_sheet('Feedback')
    excel_title = ['作者', '版本', '打分', '评论id', '标题', '内容', '标题情感分析', '内容情感分析']
    for i in range(0, 8):
        self.worksheet.write(0, i, label=excel_title[i])
    j = 1
    for i in feedback_json_all:
        self.worksheet.write(j, 0, label=i['author']['name']['label'])
        self.worksheet.write(j, 1, label=i['im:version']['label'])
        self.worksheet.write(j, 2, label=i['im:rating']['label'])
        self.worksheet.write(j, 3, label=i['id']['label'])
        self.worksheet.write(j, 4, label=i['title']['label'])
        self.worksheet.write(j, 5, label=i['content']['label'])
        # sentiment scores of the title and the content, rounded to two decimals
        s_title = snownlp.SnowNLP(i['title']['label'])
        s_title = float('%.2f' % s_title.sentiments)
        self.worksheet.write(j, 6, label=s_title)
        s_content = snownlp.SnowNLP(i['content']['label'])
        s_content = float('%.2f' % s_content.sentiments)
        self.worksheet.write(j, 7, label=s_content)
        j += 1
    self.workbook.save(path)
    print('Excel文件保存完成到:{}'.format(path))
def predict(self, sentence):
    # SnowNLP's .han converts traditional Chinese characters to simplified before segmentation
    words = list(jieba.cut(snownlp.SnowNLP(sentence).han))
    length = len(words)
    print(list(map(lambda x: x if x in self.vocab else UNKNOWN_TOKEN, words)))
    words = list(map(lambda x: self.vocab[x] if x in self.vocab else self.vocab[UNKNOWN_TOKEN], words))
    # one-hot-style sparse matrix: one row per token, one column per vocabulary entry
    matrix = csr_matrix((np.ones(length), (range(length), words)),
                        shape=(length, self.x_dim), dtype=np.float32)
    print(self.predictor.eval({self.x: matrix})[0] + 1)
def chinese2pinyin_v1(self, x, method='xpinyin'):
    if not self.is_chinese(x):
        return x
    else:
        if method == 'xpinyin':
            res = self.P.get_pinyin(x, "").lower()
            return res
        elif method == 'snowNLP':
            pin_yin = snownlp.SnowNLP(x)
            try:
                res = pin_yin.pinyin
                return "".join(res)
            except Exception:
                pass
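# --- Usage sketch (added for illustration, not part of the original source) ---
# chinese2pinyin_v1() above wraps two real APIs: xpinyin's Pinyin.get_pinyin() and SnowNLP's
# .pinyin property. Outside the class, the two can be compared directly like this:
import snownlp
from xpinyin import Pinyin

text = '你好世界'
print(Pinyin().get_pinyin(text, '').lower())   # xpinyin, e.g. 'nihaoshijie'
print(''.join(snownlp.SnowNLP(text).pinyin))   # SnowNLP's pinyin tokens joined together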
def test(filename, to_filename):
    '''Product review sentiment analysis'''
    averageSentiment = 0
    with open(f'{filename}.csv', 'r', encoding=ENCODING) as fr:
        for line in fr.readlines():
            s = snownlp.SnowNLP(line)
            averageSentiment += s.sentiments
            if s.sentiments > 0.6:
                res = '喜欢'
                res_list.append(1)
            elif s.sentiments < 0.4:
                res = '不喜欢'
                res_list.append(-1)
            else:
                res = '一般'
                res_list.append(0)
            sent_dict = {
                '情感分析结果': s.sentiments,
                '评价倾向': res,
                '商品评论': line.replace('\n', '')
            }
            sentiment_list.append(sent_dict)
    # mean sentiment score over all comments
    averageSentiment = averageSentiment / len(sentiment_list)
    # serialise the summary as JSON
    result = {'comments_num': len(sentiment_list), 'average_sentiment': averageSentiment}
    json_result = json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
    print(json_result)
    # write the summary to a file
    if os.path.exists('average_sentiment.txt'):
        os.remove('average_sentiment.txt')
    with open('average_sentiment.txt', 'x', encoding='utf8') as text_file:
        text_file.write(json_result)
    df = pd.DataFrame(sentiment_list)
    df.to_csv(f'{to_filename}.csv', index=None, encoding=ENCODING, index_label=None, mode='w')
def itchat_friends_sign(self):
    signatures = ''
    emotions = []
    pattern = re.compile(r"1f\d.+")
    for friend in self.friends:
        signature = friend['Signature']
        if signature is not None:
            signature = signature.strip().replace('span', '').replace('class', '').replace('emoji', '')
            if len(signature) > 0:
                nlp = snownlp.SnowNLP(signature)
                emotions.append(nlp.sentiments)
                signatures += ' '.join(jieba.analyse.extract_tags(signature, 5))
    with open('signatures.txt', 'wt', encoding='utf-8') as file:
        file.write(signatures)
    back_coloring = np.array(Image.open('timg (1).jpg'))
    wordcloud = WordCloud(
        font_path='C:/Windows/Fonts/simfang.ttf',
        background_color="white",
        max_words=1200,
        mask=back_coloring,
        max_font_size=75,
        random_state=45,
        width=960,
        height=720,
        margin=15
    )
    wordcloud.generate(signatures)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

    # Signature emotional judgment
    count_good = len(list(filter(lambda x: x > 0.66, emotions)))
    count_normal = len(list(filter(lambda x: 0.33 <= x <= 0.66, emotions)))
    count_bad = len(list(filter(lambda x: x < 0.33, emotions)))
    labels = [u'负面消极', u'中性', u'正面积极']
    values = (count_bad, count_normal, count_good)
    plt.rcParams['font.sans-serif'] = ['simHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.xlabel(u'情感判断')
    plt.ylabel(u'频数')
    plt.xticks(range(3), labels)
    plt.legend(loc='upper right')
    plt.bar(range(3), values, color='rgb')
    plt.title(u'%s的微信好友签名信息情感分析' % self.friends[0]['NickName'])
    plt.show()
def useful_word_filter(words):
    '''Filter out useless words: time words, adverbs, and words such as '豆瓣' that appear often but carry no information.'''
    wordlist = []
    for word in words:
        # part-of-speech filter
        if word.flag in ['f', 'm', 'v', 'nt', 'r', 'c', 'd']:
            pass
        # irrelevant-word filter
        elif word.word in ['豆瓣', '小说', '平装', '王德威']:
            pass
        # keep only words with a clearly positive sentiment score
        elif snownlp.SnowNLP(word.word).sentiments > 0.7:
            wordlist.append(word.word)
    return wordlist
def processOnePost(self, book):
    url1 = "https://bbs.hupu.com/%s.html" % (book[0])
    status_code, html = utils.getHtml(url1)
    print("status_code, ", status_code)
    if status_code != 200:
        return
    soup = BeautifulSoup(html, "lxml")
    chucuole = soup.find("h4")
    if chucuole is not None:
        print(chucuole.get_text())
        if chucuole.get_text().startswith("\n嗯,出错了..."):
            print("嗯,出错了...")
            return
    subhead = soup.find_all(attrs={"class": "subhead"})
    subheadstr = subhead[0].get_text()
    quote_content = soup.find_all(attrs={"class": "quote-content"})
    quote_content_str = quote_content[0].get_text()
    sn = snownlp.SnowNLP(subheadstr)
    print(subheadstr)
    print(sn.summary()[0], sn.sentiments, sn.keywords())
    senti = ", 呵呵"
    if sn.sentiments >= 0.8:
        senti = "happy, 哈哈"
    if sn.sentiments <= 0.2:
        senti = "悲伤, 气愤"
    reponse_str = "%s, %s" % (chat_utils.deepThought.get_response(sn.summary()[0]), senti)
    self._post_writer.write("url\001%s\n" % url1)
    self._post_writer.write("subhead\001%s\n" % subheadstr)
    self._post_writer.write("quote_content\001%s\n" % quote_content_str)
    self._post_writer.write("sn.summary()[0]\001%s\n" % sn.summary()[0])
    self._post_writer.write("reponse_str\001%s\n" % reponse_str)
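# --- Usage sketch (added for illustration, not part of the original source) ---
# processOnePost() above also uses SnowNLP's summarisation and keyword extraction; in
# isolation those calls look like this (the sample text is made up):
import snownlp

sn = snownlp.SnowNLP('这部剧剧情紧凑,演员演技在线,看完非常满意,强烈推荐。')
print(sn.summary(1))   # the single most representative sentence
print(sn.keywords(3))  # the top-3 keywords
print(sn.sentiments)   # sentiment score in [0, 1]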
def setSentimentSimilarity(self):
    """
    Compute the sentiment similarity of the comments under each wblog.

    snownlp (naive Bayes underneath) scores each comment's sentiment on a scale from
    0 (negative) to 1 (positive); the standard deviation of those scores is stored.
    Possible improvement: the off-the-shelf classifier is weak on complex sentences,
    so a purpose-built sentiment classifier might work better.
    :return: none
    """
    col = self.mdb.sentimentSimilarity
    if not col.find_one():
        logging.info('sentimentSimilarity为空,设置主键为wblogId')
        col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)
    swblog = self.swblog
    wblog = self.wblog
    unknown = self.unknown
    all_wblog = swblog + wblog + unknown
    # Some comments are very short or contain no real words; such texts are excluded from
    # the sentiment computation. The filter tokenizes each comment and drops it when
    # nothing remains after stop-word removal.
    stop_words = WblogFeature.get_stop_words(
        os.path.dirname(os.getcwd()) + '/microblog/stop_words.txt')
    cc = MongoClient().comment.comment
    for wblogId in all_wblog:
        corpus = []
        try:
            for comment in cc.find({'wblogId': str(wblogId)}):
                text = self.remove_html(comment['json_text']['text'])
                text = self.remove_tag(text)
                fenci = list(jieba.cut_for_search(text))
                if len(fenci) == 0:
                    continue
                # jieba has no built-in stop-word interface, so stop words are removed manually
                stop_cnt = 0
                for word in fenci:
                    if word in stop_words:
                        stop_cnt += 1
                if stop_cnt == len(fenci):
                    continue
                corpus.append(text)
        except Exception as e:
            logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
        std = 0.0
        if len(corpus) > 3:
            sentiment_list = []
            for text in corpus:
                sentiment_list.append(snownlp.SnowNLP(text).sentiments)
            std = numpy.std(numpy.array(sentiment_list), ddof=1)
        try:
            if wblogId in swblog:
                col.insert_one({'wblogId': wblogId, 'swblog': 'true',
                                'sentiment_similarity': std})
            elif wblogId in wblog:
                col.insert_one({'wblogId': wblogId, 'swblog': 'false',
                                'sentiment_similarity': std})
            elif wblogId in unknown:
                col.insert_one({'wblogId': wblogId, 'swblog': 'unknown',
                                'sentiment_similarity': std})
        except Exception as e:
            logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
    logging.info('setSentimentSimilarity finished')
# A small interactive demo: type a review, get its sentiment; enter 'Q' to quit.
import snownlp as nlp

# s = nlp.SnowNLP(u'这个东西真心很赞')
# print(s.words)
# print(s.tags)
print('请输入评价:')
comment = input()
while comment != 'Q':
    s = nlp.SnowNLP(comment)
    # probability that the Chinese text expresses positive sentiment
    positive_prob = s.sentiments
    print(s.sentiments)
    if positive_prob > 0.7:
        print('持积极评价')
    elif positive_prob > 0.3:
        print('中性评价')
    else:
        print('持消极评价')
    print('请输入评价:')
    comment = input()
    mask=mk, scale=15)  # continuation of a WordCloud(...) call whose beginning is not included in this snippet

# Tokenize the text from an external file to build two lists: positive words and negative words
f = open('三体黑暗森林.txt', encoding='GBK')
txt = f.read()
txtlist = jieba.lcut(txt)
positivelist = []
negativelist = []
# Score every word's sentiment below: > 0.96 counts as a positive word, < 0.06 as a negative word
print('开始进行情感分析,请稍等,三国演义全文那么长的文本需要三分钟左右')
# import the third-party NLP library snownlp
import snownlp
for each in txtlist:
    each_word = snownlp.SnowNLP(each)
    feeling = each_word.sentiments
    if feeling > 0.96:
        positivelist.append(each)
    elif feeling < 0.06:
        negativelist.append(each)
    else:
        pass
# Join the two lists into space-separated strings of positive and negative words
positive_string = " ".join(positivelist)
negative_string = " ".join(negativelist)
# Pass each string to generate() to feed text into the word clouds
w1.generate(positive_string)
w2.generate(negative_string)
def parse(self, response):
    # parse the list page: split out every weibo on it
    weibo_list = response.xpath("//div[@class='c' and @id]")
    for weibo in weibo_list:
        # the divs inside this weibo decide its type
        div_list = weibo.xpath("./div")
        item = WeiboproItem()
        if len(div_list) == 1:
            # original post without a picture
            item["categary"] = "YC NO PIC"
            item["name"] = weibo.xpath(".//a[@class='nk']/text()").extract_first()
            item["content"] = "\n".join(weibo.xpath(".//span[@class='ctt']//text()").extract())
            item["dianzan"] = weibo.xpath(".//div/a/text()").extract()[-4]
            item["pinglun"] = weibo.xpath(".//div/a/text()").extract()[-2]
            item["zhuanfa"] = weibo.xpath(".//div/a/text()").extract()[-3]
            q = snownlp.SnowNLP("\n".join(weibo.xpath(".//span[@class='ctt']//text()").extract()))
            qingganzhi = q.sentiments
            item["qingganzhi"] = float(qingganzhi)
        elif len(div_list) == 2:
            item["name"] = weibo.xpath(".//a[@class='nk']/text()").extract_first()
            item["content"] = "\n".join(weibo.xpath(".//span[@class='ctt']//text()").extract())
            item["dianzan"] = weibo.xpath(".//div[2]/a/text()").extract()[-4]
            item["pinglun"] = weibo.xpath(".//div[2]/a/text()").extract()[-2]
            item["zhuanfa"] = weibo.xpath(".//div[2]/a/text()").extract()[-3]
            # use the presence of a picture to tell the two cases apart
            img_src = weibo.xpath(".//img[@class='ib']/@src")
            if len(img_src) > 0:
                # original post with a picture
                item["categary"] = "YC PIC"
                item["pic"] = img_src.extract_first()
                q = snownlp.SnowNLP("\n".join(weibo.xpath(".//span[@class='ctt']//text()").extract()))
                qingganzhi = q.sentiments
                item["qingganzhi"] = float(qingganzhi)
            else:
                # repost without a picture
                item["categary"] = "ZF NO PIC"
                item["liyou"] = weibo.xpath(".//div[2]//text()").extract()[1]
                q = snownlp.SnowNLP(weibo.xpath(".//div[2]//text()").extract()[1])
                qingganzhi = q.sentiments
                item["qingganzhi"] = float(qingganzhi)
        else:
            # repost with a picture
            item["categary"] = "ZF PIC"
            item["name"] = weibo.xpath(".//a[@class='nk']/text()").extract_first()
            item["content"] = "\n".join(weibo.xpath(".//span[@class='ctt']//text()").extract())
            item["dianzan"] = weibo.xpath(".//div[3]/a/text()").extract()[-4]
            item["pinglun"] = weibo.xpath(".//div[3]/a/text()").extract()[-2]
            item["zhuanfa"] = weibo.xpath(".//div[3]/a/text()").extract()[-3]
            item["pic"] = weibo.xpath(".//img[@class='ib']/@src").extract_first()
            item["liyou"] = weibo.xpath(".//div[3]//text()").extract()[1]
            q = snownlp.SnowNLP(weibo.xpath(".//div[3]//text()").extract()[1])
            qingganzhi = q.sentiments
            item["qingganzhi"] = float(qingganzhi)
        print(item)
        yield item