matplotlib.rcParams['axes.unicode_minus'] = False plt.figure() plt.pie(title_num[0:maxNum], labels=title_name[0:maxNum], autopct='%3.1f%%') font2 = { 'weight': 'normal', 'size': 12 } plt.title('收藏、回答、点赞、评论四个维度各取top100后,400个数据分专题占比(取前10)', font2) # plt.legend(loc=0,ncol=2) plt.axis('equal') if __name__ == '__main__': # 获得专题的记录数 selectCountSql = 'SELECT keyword,COUNT(*) AS num FROM tb_wukong GROUP BY keyword ORDER BY num DESC' countResult = my_mysql.selectTable(selectCountSql) print('-----悟空问答大家关注的专题top5-----') for i in range(5): print('第 {} 名:'.format(i + 1),countResult[i][0]) resultList = title_top(100) result_top_num = 10 result_top100_4 = resultList[0] result_top100_collectNum = result_top100_4[0] result_top100_replyNum = result_top100_4[1] result_top100_likeNum = result_top100_4[2] result_top100_commentNum = result_top100_4[3] print('-----悟空问答 收藏数 top10的问题-----') for i in range(result_top_num): print('第 {} 名:'.format(i + 1),result_top100_collectNum[i][1])
def title_top(topNum=100):
    """Collect the top rows of ``tb_wukong`` per metric and tally their topics.

    The table is queried four times, ordered descending by ``collectNum``,
    ``replyNum``, ``likeNum`` and ``commentNum`` in turn, and the first
    ``topNum`` rows of each result are kept.  The first field of every kept
    row (``keyword`` in the SELECT list) is then counted against a fixed set
    of known topics; keywords outside that set are reported on stdout and
    not counted.  Intermediate results are printed as the function runs.

    Args:
        topNum: Number of leading rows to keep from each ordered result.

    Returns:
        A three-element list ``[result_topNum_4, title_name, title_num]``:
        the four per-metric row slices (in the metric order listed above),
        the topic names sorted by descending frequency, and the frequencies
        in that same order.
    """
    # The ORDER BY columns are fixed literals, so concatenating them into
    # the SQL text does not bring any external input into the query.
    order_columns = ['wk.collectNum', 'wk.replyNum', 'wk.likeNum', 'wk.commentNum']
    select_prefix = ('SELECT wk.keyword,wk.title,wk.collectNum,wk.replyNum,'
                     'wk.likeNum,wk.commentNum FROM tb_wukong wk ORDER BY ')

    # One slice of at most topNum rows per metric, in order_columns order.
    result_topNum_4 = []
    for column in order_columns:
        rows = my_mysql.selectTable(select_prefix + column + ' DESC;')
        result_topNum_4.append(rows[0:topNum])

    # Known topics; insertion order is kept so ties in the final ranking
    # come out in this order (sorted() is stable).
    topic_names = (
        "科技", "美食", "军事", "财经", "动漫", "汽车", "热门",
        "国际", "育儿", "旅游", "三农", "文化", "数码", "家居",
        "时尚", "科学", "游戏", "历史", "收藏", "健康", "心理",
        "电影", "教育", "宠物", "职场", "娱乐", "社会", "体育",
    )
    dic_title = dict.fromkeys(topic_names, 0)
    print(dic_title.keys())
    print(dic_title.values())

    # Every occurrence of a known topic among the kept rows adds one to
    # that topic's frequency.
    for result_topNum in result_topNum_4:
        for row in result_topNum:
            keyword = row[0]
            if keyword in dic_title:
                dic_title[keyword] += 1
            else:
                print(keyword, ' not in dic_title')

    # Report each topic's frequency and the overall total.
    for key, count in dic_title.items():
        print(key, ' -- ', count)
    print('sum = ', sum(dic_title.values()))

    # Rank topics by descending frequency: [(title1, num1), (title2, num2), ...]
    title_num_from_top_100 = sorted(
        dic_title.items(), key=lambda item: item[1], reverse=True)
    print(title_num_from_top_100)

    # Split the ranked pairs into parallel name / frequency lists.
    title_name = [name for name, _ in title_num_from_top_100]
    title_num = [count for _, count in title_num_from_top_100]
    print(title_name)
    print(title_num)
    return [result_topNum_4, title_name, title_num]
from bs4 import BeautifulSoup  # HTML parsing
import time  # time utilities
from selenium import webdriver  # browser automation
from datetime import datetime
import my_mysql

# NOTE(review): the original source had lost its line breaks; the loop below
# is assumed to contain only the print statement — confirm against the
# author's intent.

# One row per keyword together with that keyword's MAX(replyNum).
# NOTE(review): title, url and the other counters are neither aggregated nor
# grouped, so whether they belong to the row holding the maximum replyNum
# depends on the database's GROUP BY handling — verify.
selectNumOne = 'SELECT wk.title,wk.keyword,wk.url,MAX( wk.replyNum ),collectNum,likeNum,commentNum FROM tb_wukong wk GROUP BY wk.keyword;'
numOneResult = my_mysql.selectTable(selectNumOne)

# Pre-bind the loop variable so an empty result leaves a testable value
# instead of an unbound name.
li = None
for li in numOneResult:
    print(li[0], li[1], li[2])

firstUrl = ''
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# NOTE(review): newer Selenium releases expect `options=` instead of
# `chrome_options=`; kept as-is because the installed version is not visible
# from here.
driv = webdriver.Chrome(chrome_options=chrome_options)
try:
    # Only navigate when there is an address to open.
    if firstUrl:
        driv.get(firstUrl)  # open the address in Chrome
    # Open the url field of the last row printed above, if there was one.
    if li is not None:
        driv.get(li[2])
finally:
    # quit() ends the whole driver session even when a navigation fails.
    driv.quit()