def heat_quantity2():
    """
    Intersection of hot-discussion articles and high-quality articles. The constant
    weighting between comments and comment-likes (commentlike_num/200) could be
    fitted per official account.
    :return:
    """
    sql = "select biz, nickname from bizinfo"
    result = dbutil.query_with_sql(conn, sql)
    total = 0
    for (biz, nickname) in result:
        print("Official account nickname", nickname)
        heat_id = set()
        # inner queries use their own variables so the outer result is not shadowed
        sql2 = "SELECT AVG((comment_num+commentlike_num/200)/readnum) as mean FROM content WHERE biz = '%s'" % biz
        row = dbutil.query_with_sql_one(conn, sql2)
        mean = row[0]
        sql2 = "SELECT id, (comment_num+commentlike_num/200)/readnum as num FROM content WHERE biz = '%s'" % biz
        rows = dbutil.query_with_sql(conn, sql2)
        for (id, num) in rows:
            if num and num > mean:
                heat_id.add(id)
        print(len(heat_id))
        quality_id = set()
        sql2 = "SELECT AVG(likenum/readnum) as mean FROM content WHERE biz = '%s'" % biz
        row = dbutil.query_with_sql_one(conn, sql2)
        mean = row[0]
        sql2 = "SELECT id, likenum/readnum as num FROM content WHERE biz = '%s'" % biz
        rows = dbutil.query_with_sql(conn, sql2)
        for (id, num) in rows:
            if num and num > mean:
                quality_id.add(id)
        print(len(quality_id))
        cross = heat_id & quality_id
        total += len(cross)
        plt.figure(figsize=(4, 4))
        venn2(subsets=[heat_id, quality_id], set_labels=('heat', 'quality'))
        plt.savefig("./heat/" + '%s.png' % nickname)
        plt.close()
    print("Total", total)
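# heat_quantity2() labels an article as "hot" or "high-quality" when its engagement
# ratio exceeds the per-account mean. The same selection rule, restated as a
# hypothetical standalone helper over (id, ratio) rows (illustration only, not
# used above):
def _above_mean_ids(rows):
    """Return the ids whose ratio is strictly above the mean of all non-null ratios."""
    ratios = [num for (_, num) in rows if num is not None]
    if not ratios:
        return set()
    mean = sum(ratios) / len(ratios)
    return {row_id for (row_id, num) in rows if num is not None and num > mean}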
def publish_daily(biz, nickname):
    """
    Time dimension vs. publication-count dimension: a daily publication-count line
    chart per official account, used to spot days where the count suddenly spikes.
    :param biz: official account biz identifier
    :param nickname: official account nickname
    :return:
    """
    num_map = collections.OrderedDict()  # key is a date such as 20190102, value is the number of articles published that day
    sql = "SELECT FROM_UNIXTIME(datetime) as stamp, COUNT(*) as num FROM content where biz = '%s' group by datetime ORDER BY datetime" % biz
    result = dbutil.query_with_sql(conn, sql)
    for (stamp, num) in result:
        day = str(stamp).split(' ')[0]  # keep only the date part
        print(day)
        num_map.setdefault(day, 0)
        num_map[day] += num
    x = list(range(1, len(num_map.values()) + 1))
    # x = list(num_map.keys())  # too dense to use the dates themselves as the axis
    y = list(num_map.values())
    print(x)
    print(y)
    title = '%s publication count' % nickname
    xlabel = 'time/day'
    ylabel = 'articles published'
    path = './%s_publish_daily.png' % nickname
    pltutil.plot_single_figure(x, y, title, xlabel, ylabel, path)
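# publish_daily() and publish_hour() delegate plotting to pltutil.plot_single_figure,
# whose implementation is not shown in this file. A minimal sketch of what such a
# helper might look like, with the signature inferred from the call sites (an
# assumption, not the actual pltutil code):
def _plot_single_figure_sketch(x, y, title, xlabel, ylabel, path):
    """Draw a single line chart and save it to `path`."""
    plt.figure()
    plt.plot(x, y)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.savefig(path)
    plt.close()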
def read_daily(figurename):
    """
    Daily read count over time.
    :param figurename: name used for the saved figure
    :return:
    """
    num_map = collections.OrderedDict()  # key is a date such as 20190102, value is that day's total read count
    sql = "SELECT DISTINCT(datetime) FROM test where datetime between 1546272000 and 1577808000 ORDER BY datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (datetime, ) in result:
        time_local = time.localtime(datetime)
        time_format = time.strftime('%Y%m%d %H:%M:%S', time_local)
        day = time_format.split(' ')[0]  # keep only the date part
        num_map.setdefault(day, 0)
        sql2 = "SELECT readnum FROM test WHERE readnum!='' and datetime = %s" % datetime
        result2 = dbutil.query_with_sql(conn, sql2)
        for (readnum, ) in result2:
            if '万' in readnum:  # '万' means tens of thousands
                readnum = float(readnum.split('万')[0]) * 10000
            readnum = int(readnum)
            num_map[day] += readnum
    # plot: daily read count over time
    plt.title('Read Daily Information Analysis')
    x = list(num_map.keys())
    y = np.array(list(num_map.values())) / 100000  # in units of 100k reads
    plt.xticks(np.arange(0, len(x), 30))
    plt.plot(x, y)
    # plt.scatter(x, y)
    plt.legend()  # show the legend (no labels are set, so this is effectively a no-op)
    plt.savefig('%s_read_daily.png' % figurename)
    # plt.show()
    plt.close()
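# read_daily() parses readnum strings inline; values such as '3.2万' mean
# 3.2 * 10000. The same conversion as a hypothetical helper (illustration only,
# not used above):
def _parse_readnum(raw):
    """Convert a readnum string like '3.2万' or '856' to an int."""
    if '万' in raw:
        return int(float(raw.split('万')[0]) * 10000)
    return int(raw)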
def publish_time(figurename):
    """
    Time dimension: rate of change of publish times, which roughly reveals an
    account's publishing schedule.
    :param figurename: name used for the saved figure
    :return:
    """
    date = []
    sql = "SELECT datetime FROM test where datetime between 1546272000 and 1577808000 GROUP BY datetime ORDER BY datetime"
    result = dbutil.query_with_sql(conn, sql)
    for (datetime,) in result:
        tmp = round((datetime - START_TIME) / 3600, 1)  # hours since START_TIME
        date.append(tmp)
    print(date)
    delta = []  # rate of change: gap between consecutive publish times
    for i in range(1, len(date)):
        delta.append(round(date[i] - date[i - 1], 1))
    x = list(range(0, len(delta)))
    y = delta
    print(delta)
    plt.scatter(x, y)  # a scatter plot works better here
    plt.legend()
    plt.savefig('%s_delta.png' % figurename)
    plt.close()
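# publish_time() plots the gap in hours between consecutive publish timestamps.
# The same first-difference computation as a hypothetical helper using zip
# (illustration only, not used above):
def _first_differences(hours):
    return [round(b - a, 1) for a, b in zip(hours, hours[1:])]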
def extract_comment():
    sql = "SELECT id, biz, title, digest, content, strong_content, color_content,`comment` FROM content WHERE datetime BETWEEN UNIX_TIMESTAMP('2019-06-01 00:00:00') and UNIX_TIMESTAMP('2019-07-01 00:00:00') "
    result = dbutil.query_with_sql(conn, sql)
    id_list = []
    biz_list = []
    title_list = []
    digest_list = []
    content_list = []
    strong_content_list = []
    color_content_list = []
    for (id, biz, title, digest, content, strong_content, color_content, comment) in result:
        if content:
            contents = content.split('\n')
            print('paragraph count', len(contents))
            if len(contents) == 1:
                tmp = contents[0].split('。')
                if len(tmp) < 4:
                    content = re.sub("\n", "。", contents[0])
                else:
                    content = "。".join(tmp[0:4])  # first four sentences
            else:
                content = contents[0] + contents[1]  # first two paragraphs
        id_list.append(id)
        biz_list.append(biz)
        title_list.append(title)
        digest_list.append(digest)
        content_list.append(content)
        if strong_content and len(strong_content.split('。')) > 4:
            tmp = strong_content.split('。')
            strong_content = '。'.join(tmp[0:4])
        if color_content and len(color_content.split('。')) > 4:
            tmp = color_content.split('。')
            color_content = '。'.join(tmp[0:4])
        strong_content_list.append(strong_content)
        color_content_list.append(color_content)
    dataframe = pd.DataFrame({
        'id': id_list,
        'biz': biz_list,
        'title': title_list,
        'digest': digest_list,
        'content': content_list,
        'strong_content': strong_content_list,
        'color_content': color_content_list
    })
    dataframe.to_csv("trainsix_wechat.csv", index=False, sep=',', quotechar='\"')
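# extract_comment() truncates content to the first two paragraphs, or to the first
# four sentences when there is only a single paragraph. The same rule as a
# hypothetical helper (illustration only, not used above):
def _truncate_content(content):
    paragraphs = content.split('\n')
    if len(paragraphs) > 1:
        return paragraphs[0] + paragraphs[1]  # first two paragraphs
    sentences = paragraphs[0].split('。')
    if len(sentences) < 4:
        return paragraphs[0]
    return '。'.join(sentences[:4])  # first four sentences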
def write_content_to_file_with_sql(sql):
    """
    Write article content to a file.
    :return:
    """
    time0 = time.time()
    result = dbutil.query_with_sql(conn, sql)
    with open("content.txt", "w") as file:
        for (content,) in result:
            if content:
                file.write(content)
                file.write("\n")
    print('elapsed ', time.time() - time0)
def write_content_to_file(table):
    """
    Write article content to a file, separated by a divider line.
    :return:
    """
    time0 = time.time()
    sql = "SELECT id,content FROM %s" % table
    result = dbutil.query_with_sql(conn, sql)
    with open("content.txt", "w") as file:
        for (id, content) in result:
            if content:
                file.write(content)
                file.write("\n---------------------------\n")
    print('elapsed ', time.time() - time0)
def publish_minute(figurename):
    """
    Time dimension vs. publication-count dimension: publication counts per minute
    of the day.
    :param figurename:
    :return:
    """
    minute_map = collections.OrderedDict()  # key is the minute of day in [0000, 2400), value is the publication count for that minute
    sql = "SELECT datetime, COUNT(*) as num FROM test where datetime between 1546272000 and 1577808000 GROUP BY datetime ORDER BY datetime"
    result = dbutil.query_with_sql(conn, sql)
    for (datetime, num) in result:
        time_local = time.localtime(datetime)
        time_format = time.strftime('%Y%m%d %H%M', time_local)
        minute = time_format.split(' ')[1]
        minute_map.setdefault(minute, 0)
        minute_map[minute] += num
    # iterate in key order
    x = []
    y = []
    abnormal = []
    counts = set()
    for i in sorted(minute_map):
        print((i, minute_map[i]), end=" ")
        x.append(i)
        counts.add(minute_map[i])
        y.append(minute_map[i])
        if minute_map[i] < 7:
            abnormal.append(i)
    print('\n')
    print(abnormal)
    print(len(abnormal))
    print(counts)
    print('\ndistinct minutes %d ' % len(x))
    # print(x)
    # print(y)
    plt.xticks(np.arange(0, 2400, 30))
    plt.plot(x, y)
    # plt.scatter(x, y)
    plt.legend()
    plt.savefig('%s_minute.png' % figurename)
    plt.show()
    plt.close()
def publish_hour(biz, figurename):
    """
    Time dimension vs. publication-count dimension: hourly publication-count line
    chart per official account, used to find posts outside the usual schedule;
    timestamps are rounded to the nearest hour (error within ±30 minutes).
    :param biz: official account biz identifier
    :param figurename: name used for the saved figure
    :return:
    """
    hour_map = {}  # key is the hour in [0, 24), value is the publication count for that hour
    sql = "SELECT FROM_UNIXTIME(datetime) as stamp, COUNT(*) as num FROM content where biz = '%s' group by datetime ORDER BY datetime" % biz
    result = dbutil.query_with_sql(conn, sql)
    total = 0
    for (stamp, num) in result:
        total += num
        tmp = str(stamp).split(' ')[1].split(':')  # hour, minute, second
        hour = int(tmp[0])
        if int(tmp[1]) > 30:  # round up past the half hour
            hour += 1
        if hour == 24:
            hour = 0
        hour_map.setdefault(hour, 0)
        hour_map[hour] += num
    # iterate in key order
    x = []
    y = []
    for i in sorted(hour_map):
        print((i, hour_map[i]), end=" ")
        x.append(i)
        y.append(hour_map[i])
    print('\n--------------')
    # print(sorted(hour_map.items(), key=lambda kv: (kv[1], kv[0])))
    print(x)
    print(y)
    title = '%s publish-time distribution' % figurename  # the original used an undefined `nickname` here
    xlabel = 'time/h'
    ylabel = 'articles published'
    path = './%s_publish_hour.png' % figurename
    pltutil.plot_single_figure(x, y, title, xlabel, ylabel, path)
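# publish_hour() rounds each timestamp to the nearest hour: minutes past 30 round
# up, and hour 24 wraps to 0. The same rule as a hypothetical standalone helper
# (illustration only, not used above):
def _nearest_hour(hour, minute):
    if minute > 30:
        hour += 1
    return 0 if hour == 24 else hour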
def delete_white_line():
    """ Remove redundant blank lines and leading/trailing whitespace from digests. """
    time0 = time.time()
    sql = "SELECT id,digest FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for (id, digest) in result:
        if digest:  # skip empty digests
            # collapse runs of blank lines down to a single newline
            content = digest.lstrip("\n").rstrip("\n")
            content = re.sub("\n{2,}", "\n", content)
            digest = content.strip()
            update_sql = "update content set digest = '{}' where id = {}"\
                .format(pymysql.escape_string(digest), id)
            dbutil.exec_sql(conn, update_sql)
            # print(digest)
    print('elapsed ', time.time() - time0)
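# delete_white_line() escapes values manually with pymysql.escape_string and string
# formatting. A sketch of the same UPDATE using a parameterized query instead,
# assuming `conn` is a pymysql connection (parameter binding makes the manual
# escaping unnecessary and avoids SQL injection):
def _update_digest_parameterized(article_id, digest):
    with conn.cursor() as cursor:
        cursor.execute("update content set digest = %s where id = %s", (digest, article_id))
    conn.commit()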
def index_each_biz():
    """
    Effect of the index position (idx) on each official account: compute the
    average read count and like count per index position.
    :return:
    """
    sql = "select biz, nickname from bizinfo"
    result = dbutil.query_with_sql(conn, sql)
    for (biz, nickname) in result:
        print("Official account nickname", nickname)
        read_map = {}  # read counts per index position
        read_id = {}   # article ids per index position
        like_map = {}  # like counts per index position
        sql2 = "SELECT id, contenturl, readnum, likenum, title, digest FROM content where biz = '%s'" % biz
        result2, rowcount = dbutil.query_with_sql_rowcount(conn, sql2)
        for (id, contenturl, readnum, likenum, title, digest) in result2:
            index = contenturl.find('&idx=')
            idx = int(contenturl[index + 5:index + 6])
            read_map.setdefault(idx, [])
            read_map[idx].append(readnum)
            read_id.setdefault(idx, [])
            read_id[idx].append(id)
            like_map.setdefault(idx, [])
            like_map[idx].append(likenum)
        idx_read_list = []  # average read count per index position
        idx_like_list = []  # average like count per index position
        x = list(range(1, len(read_map) + 1))
        for i in x:
            idx_read_list.append(
                round(sum(read_map[i]) / (len(read_map[i]) * 10000), 2))  # in units of 10k
            idx_like_list.append(round(sum(like_map[i]) / len(like_map[i]), 1))
        # distribution plots
        plot_single_figure(x, idx_read_list, "effect of index position on reads",
                           "index position", "reads/10k", nickname + "_read")
        plot_single_figure(x, idx_like_list, "effect of index position on likes",
                           "index position", "likes", nickname + "_like")
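# index_each_biz() extracts the article index by slicing around '&idx='. A more
# robust sketch using the standard-library URL parser (illustration only; assumes
# the idx query parameter is always present):
def _extract_idx(contenturl):
    """Return the idx query parameter of an article URL as an int."""
    from urllib.parse import parse_qs, urlparse
    return int(parse_qs(urlparse(contenturl).query)['idx'][0])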
def count_comment_nickname():
    nickname_count = {}  # renamed from `dict`, which shadowed the builtin
    sql = "SELECT `comment` FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for (comment, ) in result:
        if comment and len(comment) > 2:
            comment = comment[2:len(comment) - 2]
            parts = comment.split('}, {')
            comment_list = ["{%s}" % a for a in parts]
            for discuss in comment_list:
                # print(discuss)
                di = eval(discuss)  # parses the dict literal; eval is unsafe on untrusted input
                # Comment like counts can also exceed 10k. During persistence the
                # 'elected' field was stored inconsistently: some values are
                # strings, and missing values were stored as integer 0.
                tmp = di.get('nickname')
                nickname_count.setdefault(tmp, 0)
                nickname_count[tmp] += 1
    # jsObj = json.dumps(nickname_count)
    # fileObject = open('jsonFile.json', 'w')
    # fileObject.write(jsObj)
    # fileObject.close()
    # csv_file = "nickname_output.csv"
    # csv_columns = ['Nickname', 'Count']
    # with open(csv_file, 'w') as csvfile:
    #     writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    #     writer.writeheader()
    #     for key, val in nickname_count.items():
    #         writer.writerow([key, val])
    filename = "nickname_output.csv"
    with open(filename, 'w') as csv_file:
        # alternatively: with open(filename, 'w', encoding='utf_8_sig') as csv_file:
        csv_file.write(codecs.BOM_UTF8.decode())  # without the BOM, Chinese text is garbled in Excel
        writer = csv.writer(csv_file)
        for key, val in nickname_count.items():
            writer.writerow([key, val])
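# count_comment_nickname() parses each comment dict with eval(), which will execute
# arbitrary expressions embedded in the data. A safer sketch using ast.literal_eval,
# which only accepts Python literals (assumes the stored comments are plain dict
# literals):
def _parse_comment(discuss):
    import ast
    return ast.literal_eval(discuss)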
def delete_white_space():
    """ Remove redundant full stops ('。') and leading/trailing whitespace from strong_content and color_content. """
    time0 = time.time()
    sql = "SELECT id,strong_content, color_content FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for (id, strong_content, color_content) in result:
        if strong_content:  # skip empty values
            # collapse runs of '。' down to a single one
            content = strong_content.lstrip("。").rstrip("。")
            content = re.sub("。{2,}", "。", content)
            strong_content = content.strip()
        if color_content:
            content = color_content.lstrip("。").rstrip("。")
            content = re.sub("。{2,}", "。", content)
            color_content = content.strip()
        update_sql = "update content set strong_content = '{}' , color_content='{}' where id = {}"\
            .format(pymysql.escape_string(strong_content or ''),
                    pymysql.escape_string(color_content or ''), id)  # `or ''` guards against None values, which escape_string cannot handle
        dbutil.exec_sql(conn, update_sql)
    print('elapsed ', time.time() - time0)
if __name__ == '__main__':
    headlines = set()
    quans = set()
    hot = set()
    sql = 'select id from content where idx=1'
    result = dbutil.query_with_sql(conn, sql)
    for (id, ) in result:
        headlines.add(id)
    # data = pd.read_excel('优质和热议的文章.xls')
    # mean_quan = 106.8859
    # mean_hot = 84.6293
    # quans = data['id'][data['quan']>mean_quan]
    # print(quans)
    sql = 'select id from content where likenum/readnum > 0.0106886'
    result = dbutil.query_with_sql(conn, sql)
    for (id, ) in result:
        quans.add(id)
    sql = 'select id from content where (comment_num+0.004*commentlike_num)/readnum > 0.0008463'