def update_num_of_comment_commentlike():
    """Update the comment_num and commentlike_num columns of the `content` table.

    For every row, the serialized `comment` field (a Python-literal list of
    dicts rendered as a string) is parsed; the number of comments and the
    summed 'elected' (comment-like) counts are written back to the row.
    """
    import ast  # local import: safe replacement for eval() below

    time0 = time.time()
    sql = "SELECT id, `comment` FROM content"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (id, comment) in result:
        if len(comment) > 2:
            # Strip the surrounding "[{" ... "}]" and split into single dicts.
            comment = comment[2:len(comment) - 2]
            aa = comment.split('}, {')
            comment_list = []
            for a in aa:
                a = "{%s}" % a
                comment_list.append(a)
            comment_like = 0
            for discuss in comment_list:
                # Fix: ast.literal_eval only accepts Python literals, unlike
                # the original eval(), which would execute arbitrary code.
                di = ast.literal_eval(discuss)
                tmp = di.get('elected')
                # 'elected' is stored inconsistently: ints (0 when missing),
                # and strings such as "1.2万" for counts above ten thousand.
                if isinstance(tmp, str) and '万' in tmp:
                    comment_like += float(tmp.split('万')[0]) * 10000
                else:
                    comment_like += int(tmp)
            sql2 = "update content set comment_num = %d, commentlike_num = %d WHERE id = %d" \
                   % (len(comment_list), comment_like, id)
            dbutil.exec_sql(conn, sql2)
    print('用时 ', time.time()-time0)
def read_daily(figurename):
    """Plot daily total read counts for 2019 and save <figurename>_read_daily.png.

    Accumulator keys are dates formatted as YYYYMMDD; values are the total
    read count of every article published on that day.
    """
    daily_reads = collections.OrderedDict()
    sql = "SELECT DISTINCT(datetime) FROM test where datetime between 1546272000 and 1577808000 ORDER BY datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (datetime, ) in result:
        # Derive the YYYYMMDD day string from the unix timestamp.
        day = time.strftime('%Y%m%d %H:%M:%S', time.localtime(datetime)).split(' ')[0]
        if day not in daily_reads:
            daily_reads[day] = 0
        sql2 = "SELECT readnum FROM test WHERE readnum!='' and datetime = %s" % datetime
        for (readnum, ) in dbutil.query_with_sql(conn, sql2):
            # Counts above ten thousand are stored as strings like "1.2万".
            if '万' in readnum:
                readnum = float(readnum.split('万')[0]) * 10000
            daily_reads[day] += int(readnum)
    # Daily read counts over time (y-axis unit: 100k reads).
    plt.title('Read Daily Information Analysis')
    x = list(daily_reads.keys())
    y = np.array(list(daily_reads.values())) / 100000
    plt.xticks(np.arange(0, len(x), 30))
    plt.plot(x, y)
    plt.legend()
    plt.savefig('%s_read_daily.png' % figurename)
    plt.close()
def update_readnum():
    """Normalize readnum values such as "1.2万" in `content` to plain integers."""
    start = time.time()
    sql = "select id, readnum from content where readnum like '%万%'"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (id, readnum) in result:
        # "1.2万" -> 12000
        plain = int(float(readnum.split('万')[0]) * 10000)
        dbutil.exec_sql(conn, "update content set readnum= %s where id = %d" % (plain, id))
    print('用时 ', time.time()-start)
def drag_index_to_tabel():
    """Parse the article index (idx) out of each content URL and persist it.

    Each URL contains "&idx=N" where N is a single digit; that digit is
    written to the `idx` column of the corresponding row.
    """
    sql = "SELECT id, contenturl FROM content"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (id, contenturl) in result:
        pos = contenturl.find('&idx=')
        # idx is assumed to be a single digit right after "&idx=".
        idx = int(contenturl[pos + 5:pos + 6])
        # Fix: the original also accumulated {'id', 'idx'} dicts into a list
        # that was never used; that dead code is removed here.
        dbutil.exec_sql(conn, 'update content set idx= %d where id = %d' % (idx, id))
def read_vary_with_time(figurename):
    """Plot read count over time and the read-count distribution.

    :param figurename: figure file prefix, usually the account nickname
    :return: None
    """
    data = []  # read counts, in publish-time order
    sql = "SELECT readnum FROM test where readnum!='' and datetime between 1546272000 and 1577808000 ORDER BY datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (readnum, ) in result:
        # Counts above ten thousand are stored as strings like "1.2万".
        if '万' in readnum:
            readnum = float(readnum.split('万')[0]) * 10000
        data.append(int(readnum))
    mean = sum(data) / len(data)
    print('阅读量均值', mean)
    # Fix: count articles above the mean with a generator expression instead
    # of a manual counter loop.
    count = sum(1 for v in data if v > mean)
    print('超过均值的占比', count / len(data))
    # Read count over time.
    plt.title('Read Information Analysis')
    x = list(range(0, len(data)))
    y = data
    plt.scatter(x, y)
    plt.legend()
    plt.savefig('%s_read_vary.png' % figurename)
    plt.close()
    # Read-count distribution. Fix: the original named this dict `map`,
    # shadowing the builtin.
    freq = {}
    for v in data:
        freq.setdefault(v, 0)
        freq[v] += 1
    plt.title('Read Distribute Analysis')
    x = []
    y = []
    for v in sorted(freq):
        print((v, freq[v]), end=" ")
        x.append(v)
        y.append(freq[v])
    plt.plot(x, y)
    plt.legend()
    plt.savefig('%s_read_distribute.png' % figurename)
    plt.show()
    plt.close()
def index_each_biz():
    """Per-account effect of index position: average reads and likes per idx.

    For every account (biz) in `bizinfo`, group its articles by index
    position and plot the average read count (in units of 10k) and the
    average like count for each position.
    """
    sql = "select biz, nickname from bizinfo"
    for (biz, nickname) in dbutil.query_with_sql(conn, sql):
        print("公众号昵称", nickname)
        read_map = {}   # idx -> list of read counts
        read_id = {}    # idx -> list of article ids
        like_map = {}   # idx -> list of like counts
        sql2 = "SELECT id, contenturl, readnum, likenum, title, digest FROM content where biz = '%s'" % biz
        result2, rowcount = dbutil.query_with_sql_rowcount(conn, sql2)
        for (id, contenturl, readnum, likenum, title, digest) in result2:
            pos = contenturl.find('&idx=')
            idx = int(contenturl[pos + 5:pos + 6])
            read_map.setdefault(idx, []).append(readnum)
            read_id.setdefault(idx, []).append(id)
            like_map.setdefault(idx, []).append(likenum)
        # Averages per index position (reads in units of 10k).
        x = list(range(1, len(read_map) + 1))
        idx_read_list = [round(sum(read_map[i]) / (len(read_map[i]) * 10000), 2) for i in x]
        idx_like_list = [round(sum(like_map[i]) / len(like_map[i]), 1) for i in x]
        plot_single_figure(x, idx_read_list, "索引位置对阅读量的影响", "索引位置",
                           "阅读量/万", nickname + "_read")
        plot_single_figure(x, idx_like_list, "索引位置对点赞量的影响", "索引位置",
                           "点赞量", nickname + "_like")
def comment_to_table():
    """Copy per-comment rows out of content.comment into the comment table.

    Parses the serialized list of comment dicts stored in each `content`
    row and bulk-inserts one row per comment.
    """
    import ast  # local import: safe replacement for eval() below

    discusses = []
    sql = "SELECT biz, contenturl, title, datetime,`comment` FROM content order by datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (biz, contenturl, title, datetime, comment) in result:
        if len(comment) > 2:
            # Strip "[{" ... "}]" and split the serialized list into dicts.
            comment = comment[2:len(comment) - 2]
            aa = comment.split('}, {')
            comment_list = []
            for a in aa:
                a = "{%s}" % a
                comment_list.append(a)
            for discuss in comment_list:
                comment_like = 0
                # Fix: ast.literal_eval only parses Python literals, unlike
                # the original eval(), which would execute arbitrary code.
                di = ast.literal_eval(discuss)
                tmp = di.get('elected')
                # 'elected' is stored inconsistently: ints (0 when missing),
                # and strings such as "1.2万" above ten thousand.
                if isinstance(tmp, str) and '万' in tmp:
                    print('评论点赞量过万: %s' % discuss)
                    comment_like += float(tmp.split('万')[0]) * 10000
                else:
                    comment_like += int(tmp)
                nickname = di.get('nickname')
                comment0 = di.get('comment')
                single = {
                    'biz': biz,
                    'contenturl': contenturl,
                    'title': title,
                    'datetime': datetime,
                    'comment': comment0,
                    'nickname': nickname,
                    'commentlike': comment_like
                }
                discusses.append(single)
    dbutil.insert_by_many_comment(conn, discusses)
def index_totol_infection():
    """Plot the effect of index position on normalized article metrics.

    Per-index averages of reads, likes, comments and comment-likes are
    min-max scaled to [0, 1] and drawn together on one figure.
    """
    x = list(range(1, 9))  # index positions 1..8
    sql = "SELECT ROUND(AVG(readnum)) a, ROUND(AVG(likenum)) b, ROUND(AVG(comment_num)) c, ROUND(AVG(commentlike_num)) d FROM content GROUP BY idx ORDER BY idx"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    readnum, likenum, commentnum, commentlikenum = [], [], [], []
    for (a, b, c, d) in result:
        readnum.append(a)
        likenum.append(b)
        commentnum.append(c)
        commentlikenum.append(d)
    # Scale every series into [0, 1] so they are comparable on one axis.
    scaler = preprocessing.MinMaxScaler()
    y1, y2, y3, y4 = (scaler.fit_transform(np.array(series).reshape(-1, 1))
                      for series in (readnum, likenum, commentnum, commentlikenum))
    plt.title('索引对微文各项数据的影响', fontproperties=font_set)
    plt.xlabel('索引位置', fontproperties=font_set)
    plt.plot(x, y1, color='green', label='阅读量')
    plt.plot(x, y2, color='red', label='点赞量')
    plt.plot(x, y3, color='skyblue', label='评论量')
    plt.plot(x, y4, color='blue', label='评论点赞量')
    plt.legend(prop=font_set2, loc="upper right")
    plt.savefig('./索引对微文各项数据的影响')
    plt.show()
def index_total_biz():
    """Aggregate effect of index position across all accounts.

    Groups every article by its index position and plots the average read
    count (in units of 10k) and average like count per position.
    """
    # An mp publish batch holds at most 8 articles.
    read_map = {}  # idx -> list of read counts
    read_id = {}   # idx -> list of article ids
    like_map = {}  # idx -> list of like counts
    sql = "SELECT id, contenturl, readnum, likenum, title, digest FROM content"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (id, contenturl, readnum, likenum, title, digest) in result:
        pos = contenturl.find('&idx=')
        idx = int(contenturl[pos + 5:pos + 6])
        read_map.setdefault(idx, []).append(readnum)
        read_id.setdefault(idx, []).append(id)
        like_map.setdefault(idx, []).append(likenum)
    # Averages per index position (reads in units of 10k).
    x = list(range(1, len(read_map) + 1))
    idx_read_list = [round(sum(read_map[i]) / (len(read_map[i]) * 10000), 2) for i in x]
    idx_like_list = [round(sum(like_map[i]) / len(like_map[i]), 1) for i in x]
    pltutil.plot_single_figure(x, idx_read_list, "索引位置对阅读量的影响", "索引位置",
                               "阅读量/万", "total_read")
    pltutil.plot_single_figure(x, idx_like_list, "索引位置对点赞量的影响", "索引位置",
                               "点赞量", "total_like")
from matplotlib.font_manager import FontProperties

# Chinese-capable font for matplotlib labels (macOS Songti file path).
font_set = FontProperties(fname='/Library/Fonts/Songti.ttc', size=15)
# NOTE(review): module-level DB connection -- opened as a side effect of import.
conn = dbutil.connectdb_wechatcluster()


def millions(x, pos):
    'The two args are the value and tick position'
    # Formats an axis value in millions, e.g. 2_500_000 -> "$2.5M".
    return '$%1.1fM' % (x * 1e-6)


if __name__ == '__main__':
    # Total advertisement insertions per account, ascending by total.
    sql = "SELECT nickname, sum(num) as total from advisement GROUP BY nickname ORDER BY sum(num)"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    names = []
    nums = []
    for (nickname, total) in result:
        names.append(nickname)
        nums.append(total)
    # Figure title.
    plt.title("各公众号广告植入", fontproperties=font_set)
    # x-axis label.
    plt.xlabel("公众号昵称", fontproperties=font_set)
    # y-axis label. NOTE(review): chunk appears to end before any plot/show
    # call -- presumably the plotting continues elsewhere; confirm.
    plt.ylabel("2019广告植入量", fontproperties=font_set)
def publish_abnormal_data(figurename):
    """
    Time dimension vs publish-count dimension.
    1. Count how many articles are published at each minute of the day.
    2. Plot that distribution.
    3. Use the yearly per-publish mean as a threshold; minutes below it are
       treated as abnormal publish times (suggesting a breaking topic).
    4. Query the database again and collect the articles at those times.
    :param figurename: prefix for the saved figure file
    :return: None
    """
    # key is HHMM in [0000, 2400); value is the article count at that minute
    map = collections.OrderedDict()
    sql = "SELECT datetime, COUNT(*) as num FROM test where datetime between 1546272000 and 1577808000 GROUP BY datetime ORDER BY datetime"
    result, n = dbutil.query_with_sql_rowcount(conn, sql)
    total = 0
    for (datetime, num) in result:
        time_local = time.localtime(datetime)
        time_format = time.strftime('%Y%m%d %H%M', time_local)
        minute = time_format.split(' ')[1]  # take the HHMM part
        map.setdefault(minute, 0)
        map[minute] += num
        total += num
    print('一年内发文次数', n)
    # print(len(map))
    mean = total / n  # mean number of articles per publish event
    print('每次发布平均值:', mean)
    # Iterate minutes in sorted (chronological-within-day) order.
    x = []
    y = []
    abnormal = []  # minutes whose article count is below the per-publish mean
    for i in sorted(map):
        x.append(i)
        y.append(map[i])
        print((i, map[i]), end=" ")
        if map[i] < mean:  # below the per-publish mean -> abnormal minute
            abnormal.append(i)
    # print(x)
    # print(y)
    plt.xticks(np.arange(0, 2400, 30))  # one tick every 30 data points
    plt.plot(x, y)
    # plt.scatter(x, y)
    plt.legend()
    plt.savefig('%s_minute.png' % figurename)
    plt.show()
    plt.close()
    print('\n')
    print('异常时间点数据:', abnormal)
    print('长度:', len(abnormal))
    # Membership set for the abnormal minutes. (Original note: the set is
    # arguably redundant since the minutes are already unique, but it keeps
    # the lookup below O(1).)
    aa = set(abnormal)
    sql = "SELECT title, digest, datetime FROM test where datetime between 1546272000 and 1577808000 ORDER BY datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    bb = collections.OrderedDict()  # key YYYYMMDD -> abnormal-article count
    count = 0
    for (title, digest, datetime) in result:
        time_local = time.localtime(datetime)
        time_format = time.strftime('%Y%m%d %H%M', time_local)
        minute = time_format.split(' ')[1]
        if minute in aa:
            bb.setdefault(time_format.split(' ')[0], 0)
            bb[time_format.split(' ')[0]] += 1
            count += 1
            print(time_format)
            print(title)
            print(digest)
    print('异常数据时间序列:', bb.keys())
    print('异常数据长度:', count)
    print('异常数据百分比 ', round(count / rowcount, 2))
def comment_vary_with_time(figurename):
    """Plot comment count and comment-like count over (publish-order) time.

    :param figurename: prefix for the saved figure files
    :return: None
    """
    import ast  # local import: safe replacement for eval() below

    discusses = []    # per-article comment counts
    commentlike = []  # per-article comment-like totals
    sql = "SELECT `comment` FROM test order by datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (comment,) in result:
        if len(comment) > 2:
            # Strip "[{" ... "}]" and split the serialized list into dicts.
            comment = comment[2:len(comment) - 2]
            aa = comment.split('}, {')
            comment_list = []
            for a in aa:
                a = "{%s}" % a
                comment_list.append(a)
            discusses.append(len(comment_list))
            # Each comment itself is weighted as one like.
            comment_like = len(comment_list)
            for discuss in comment_list:
                # Fix: ast.literal_eval parses literals only, unlike the
                # original eval(), which would execute arbitrary code.
                di = ast.literal_eval(discuss)
                tmp = di.get('elected')
                # 'elected' is stored inconsistently: ints (0 when missing),
                # and strings such as "1.2万" above ten thousand.
                if isinstance(tmp, str) and '万' in tmp:
                    print('评论点赞量过万: %s' % discuss)
                    comment_like += float(tmp.split('万')[0]) * 10000
                else:
                    comment_like += int(tmp)
            commentlike.append(comment_like)
        else:
            discusses.append(0)
            commentlike.append(0)
    mean_commentnum = int(sum(discusses) / rowcount)
    mean_commentlikenum = int(sum(commentlike) / rowcount)
    print('平均评论量 ', mean_commentnum)
    print('平均评论点赞量 ', mean_commentlikenum)
    # Comment count over time.
    x = list(range(0, len(discusses)))
    plt.title('Comment Vary With Time')
    plt.plot(x, discusses, color='skyblue', label='comment')
    plt.legend()
    plt.savefig('%s_comment_vary.png' % figurename)
    plt.close()
    # Comment-like count over time.
    plt.title('CommentLike Vary With Time')
    plt.plot(x, commentlike, color='blue', label='comment like')
    plt.legend()
    plt.savefig('%s_comment_like_vary.png' % figurename)
    plt.close()
def comment_distribute(figurename):
    """Plot the comment-count and comment-like-count distributions.

    :param figurename: prefix for the saved figure files
    :return: None
    """
    import ast  # local import: safe replacement for eval() below

    discusses = []    # per-article comment counts
    commentlike = []  # per-article comment-like totals
    sql = "SELECT `comment` FROM test order by datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (comment,) in result:
        if len(comment) > 2:
            # Strip "[{" ... "}]" and split the serialized list into dicts.
            comment = comment[2:len(comment) - 2]
            aa = comment.split('}, {')
            comment_list = []
            for a in aa:
                a = "{%s}" % a
                comment_list.append(a)
            discusses.append(len(comment_list))
            # Each comment itself is weighted as one like.
            comment_like = len(comment_list)
            for discuss in comment_list:
                # Fix: ast.literal_eval parses literals only, unlike the
                # original eval(), which would execute arbitrary code.
                di = ast.literal_eval(discuss)
                tmp = di.get('elected')
                # 'elected' is stored inconsistently: ints (0 when missing),
                # and strings such as "1.2万" above ten thousand.
                if isinstance(tmp, str) and '万' in tmp:
                    print('评论点赞量过万: %s' % discuss)
                    comment_like += float(tmp.split('万')[0]) * 10000
                else:
                    comment_like += int(tmp)
            commentlike.append(comment_like)
        else:
            discusses.append(0)
            commentlike.append(0)
    mean_commentnum = int(sum(discusses) / rowcount)
    mean_commentlikenum = int(sum(commentlike) / rowcount)
    print('平均评论量 ', mean_commentnum)
    print('平均评论点赞量 ', mean_commentlikenum)
    # Comment-count histogram.
    map_comment = {}
    for i in discusses:
        map_comment.setdefault(i, 0)
        map_comment[i] += 1
    # Comment-like-count histogram.
    map_comment_like = {}
    for i in commentlike:
        map_comment_like.setdefault(i, 0)
        map_comment_like[i] += 1
    x = []
    y = []
    for i in sorted(map_comment):
        x.append(i)
        y.append(map_comment[i])
    plt.title('Comment Distribute')
    plt.plot(x, y, color='skyblue', label='comment')
    plt.legend()
    plt.savefig('%s_comment_distribute.png' % figurename)
    plt.close()
    x = []
    y = []
    for i in sorted(map_comment_like):
        print((i, map_comment_like[i]), end=' ')
        x.append(i)
        y.append(map_comment_like[i])
    plt.title('CommentLike Distribute')
    plt.plot(x, y, color='blue', label='comment like')
    plt.legend()
    plt.savefig('%s_comment_like_distribute.png' % figurename)
    plt.close()
def comment_vary_with_every_punish(figurename):
    """Plot comment count and comment-like count per publish batch (datetime).

    :param figurename: prefix for the saved figure files
    :return: None
    """
    import ast  # local import: safe replacement for eval() below

    discusses = []    # per-batch comment counts
    commentlike = []  # per-batch comment-like totals
    date = []         # distinct publish timestamps, ascending
    sql = "SELECT distinct(datetime) FROM test order by datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (datetime,) in result:
        date.append(datetime)
        sql2 = "SELECT `comment` FROM test where datetime= %d " % datetime
        result2, rowcount2 = dbutil.query_with_sql_rowcount(conn, sql2)
        discuss_count = 0
        comment_like = 0
        for (comment,) in result2:
            if len(comment) > 2:
                # Strip "[{" ... "}]" and split the serialized list into dicts.
                comment = comment[2:len(comment) - 2]
                aa = comment.split('}, {')
                comment_list = []
                for a in aa:
                    a = "{%s}" % a
                    comment_list.append(a)
                discuss_count += len(comment_list)
                # Each comment counts as one like (adjustable weight).
                # Fix: the original used '=' here, discarding likes already
                # accumulated from earlier articles of the same batch.
                comment_like += len(comment_list)
                for discuss in comment_list:
                    # Fix: ast.literal_eval parses literals only, unlike the
                    # original eval(), which would execute arbitrary code.
                    di = ast.literal_eval(discuss)
                    tmp = di.get('elected')
                    # 'elected' is stored inconsistently: ints (0 when
                    # missing), and strings such as "1.2万" above ten thousand.
                    if isinstance(tmp, str) and '万' in tmp:
                        print('评论点赞量过万: %s' % discuss)
                        comment_like += float(tmp.split('万')[0]) * 10000
                    else:
                        comment_like += int(tmp)
        discusses.append(discuss_count)
        commentlike.append(comment_like)
    mean_commentnum = int(sum(discusses) / len(discusses))
    mean_commentlikenum = int(sum(commentlike) / len(commentlike))
    print('平均评论量 ', mean_commentnum)
    print('平均评论点赞量 ', mean_commentlikenum)
    # Comment count per publish time.
    x = date
    plt.title('Comment Vary With Time')
    plt.plot(x, discusses, color='skyblue', label='comment')
    plt.legend()
    plt.savefig('%s_comment_vary_with_punish.png' % figurename)
    plt.close()
    # Comment-like count per publish time.
    plt.title('CommentLike Vary With Time')
    plt.plot(x, commentlike, color='blue', label='comment like')
    plt.legend()
    plt.savefig('%s_comment_like_vary_with_punish.png' % figurename)
    plt.close()
def like_var_with_time(figurename):
    """Plot like count over time and the like-count distribution.

    :param figurename: figure file prefix, usually the account nickname
    :return: None
    """
    data = []   # like counts, in publish-time order
    count = 0   # number of articles above the mean
    sql = "SELECT likenum,contenturl FROM test where datetime between 1546272000 and 1577808000 ORDER BY datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (likenum, contenturl) in result:
        if likenum == '':
            likenum = 0
        elif '万' in likenum:
            # Counts above ten thousand are stored as strings like "1.2万".
            print('点赞量过万的文章:', contenturl)
            likenum = float(likenum.split('万')[0]) * 10000
        data.append(int(likenum))
    mean = sum(data) / len(data)
    print('均值', mean)
    for i in data:
        if i > mean:
            count += 1
    print('超过均值的占比', count / len(data))
    # Like count over time.
    plt.title('Like Information Analysis')
    x = list(range(0, len(data)))
    y = data
    plt.plot(x, y)
    plt.legend()
    plt.savefig('%s_like_vary.png' % figurename)
    plt.close()
    # Drop the largest entries. NOTE(review): the original comment said
    # "top 5" but the code removes 10 items; the original slice is kept
    # here -- confirm which was intended.
    data = sorted(data)
    data = data[:len(data)-10]
    mean = sum(data)/len(data)
    print("去除top5之后的均值", mean)
    # Fix: np.searchsorted returns how many values fall BELOW the mean, so
    # the original printed the below-mean fraction under an "above mean"
    # label; count the values strictly above the mean instead.
    above = len(data) - np.searchsorted(data, mean, side='right')
    print('超过均值的占比', above / len(data))
    # Like-count distribution. Fix: renamed the dict from `map`, which
    # shadowed the builtin.
    freq = {}
    for v in data:
        freq.setdefault(v, 0)
        freq[v] += 1
    plt.title('Like Distribute Analysis')
    x = []
    y = []
    for v in sorted(freq):
        print((v, freq[v]), end=" ")
        x.append(v)
        y.append(freq[v])
    plt.scatter(x, y)
    plt.legend()
    plt.savefig('%s_like_distribute.png' % figurename)
    plt.show()
    plt.close()