def showLastYears(word, years):
    """Plot how many posts per calendar year contain ``word``.

    The window ends at the newest date reported by ``getTimeDomain`` and
    starts ``years * 365`` days earlier.  Each post handed back by
    ``getPostDatebyTimeDomain`` is checked for ``word`` and counted in the
    bucket of its calendar year; the series is drawn with
    ``drawGraphic.linePlotGraphics``.

    Args:
        word: Substring looked for in the post content (``post[0]``).
        years: Whole number of years to look back; ``years + 1`` yearly
            buckets are plotted.
    """
    _, enddate = getTimeDomain(RFF.getDateList())
    begdate = enddate - datetime.timedelta(days=years * 365)
    spostdate = getPostDatebyTimeDomain(begdate, enddate, RFF.getPostDataList())
    print("begdate=", begdate, "enddate=", enddate)

    # Bucket i stands for calendar year ``begdate.year + i``.  The labels are
    # derived from the year number itself: stepping a date by 365 days can
    # produce the same year twice around a leap year, which would shift every
    # following label away from the bucket it describes.
    feqlist = [0] * (years + 1)
    timeline = []
    for offset in range(years + 1):
        label_year = begdate.year + offset
        timeline.append(str(label_year) + "年")
        print(str(label_year))

    # Each post is [content, author, time].
    for post in spostdate:
        if word in post[0]:
            postdate = datetime.datetime.strptime(post[2], "%Y-%m-%d %H:%M")
            satpos = postdate.year - begdate.year
            print("satpos=", satpos, "\tpostdate=", postdate, "\tbegdate=",
                  begdate, "\tyear1=", postdate.year, "\tyear2=", begdate.year)
            # Skip anything outside the plotted years; a negative index would
            # otherwise silently wrap around to the newest bucket.
            if 0 <= satpos <= years:
                feqlist[satpos] += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间',
        '出现次数(帖子/回帖总数:' + str(len(spostdate)) + ')',
        timeline,
        feqlist,
        '时间频率图(' + str(begdate.year) + "->" + str(enddate.year) + ")")
    print('>>>>>图像加载完毕')
def showLastDays(word, days):
    """Plot how many posts per day contain ``word`` over the last ``days`` days.

    The window ends at the newest date reported by ``getTimeDomain`` and
    starts ``days`` days earlier.  One bucket is kept for every calendar day
    from the first to the last day of the window (``days + 1`` buckets), and
    the series is drawn with ``drawGraphic.linePlotGraphics``.

    Args:
        word: Substring looked for in the post content (``post[0]``).
        days: Whole number of days to look back.
    """
    _, enddate = getTimeDomain(RFF.getDateList())
    begdate = enddate - datetime.timedelta(days=days)
    spostdate = getPostDatebyTimeDomain(begdate, enddate, RFF.getPostDataList())

    # Bucket i is the calendar day ``first_day + i``; the label and the count
    # are both derived from that same offset so they cannot drift apart.
    first_day = begdate.date()
    feqlist = [0] * (days + 1)
    timeline = []
    for offset in range(days + 1):
        day = first_day + datetime.timedelta(days=offset)
        timeline.append(str(day.month) + "-" + str(day.day))

    # Each post is [content, author, time].
    for post in spostdate:
        if word in post[0]:
            postdate = datetime.datetime.strptime(post[2], "%Y-%m-%d %H:%M")
            satpos = (postdate.date() - first_day).days
            # Index by the offset itself.  Subtracting one sent first-day
            # posts to index -1, i.e. into the newest bucket.
            if 0 <= satpos <= days:
                feqlist[satpos] += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间',
        '出现次数(帖子/回帖总数:' + str(len(spostdate)) + ')',
        timeline,
        feqlist,
        '时间频率图(' + str(begdate.date()) + "->" + str(enddate.date()) + ")")
    print('>>>>>图像加载完毕')
def showLastDays(authorname, days):
    """Plot the daily activity of ``authorname`` over the last ``days`` days.

    Loads the stored result via ``RFF.openResult``, restricts the posts to
    the requested window and to the given author, asks ``getCountByDate``
    for one value per day and draws the series with
    ``drawGraphic.linePlotGraphics``.

    Args:
        authorname: Author whose posts are selected by ``getPostByAuthor``.
        days: Whole number of days to look back from the newest date.  A
            value ``<= 0`` selects the whole date range reported by
            ``getTimeDomain``.
    """
    print("加载任务结果文件...")
    buf = RFF.openResult()
    begdate, enddate = getTimeDomain(RFF.getDateList(buf))

    print("计算时间区间...")
    if days > 0:
        begdate = enddate - datetime.timedelta(days=days)
    else:
        # Keep the full range.  Overwriting ``begdate`` unconditionally (as
        # was done before) collapsed the window to nothing for days <= 0 and
        # made the whole-range branch unreachable in practice.
        days = (enddate - begdate).days

    print("解析回帖数据...")
    spostdate = getPostDatebyTimeDomain(begdate, enddate,
                                        RFF.getPostDataList(buf))
    del buf

    print("开始统计.")
    spostdate = getPostByAuthor(authorname, spostdate)
    llen = len(spostdate)

    # Long windows get thinned x labels: only every ``label_every``-th day is
    # labelled (ceil(days / 30)), the rest get an empty string.  Long windows
    # also include the end day itself, hence one extra point.
    long_window = days > 30
    if long_window:
        label_every = (days + 29) // 30
        points = days + 1
    else:
        label_every = 1
        points = days

    one_day = datetime.timedelta(days=1)
    feqlist = []
    timeline = []
    for offset in range(points):
        day = begdate + datetime.timedelta(days=offset)
        if offset % label_every == 0:
            timeline.append(str(day.month) + "-" + str(day.day))
        else:
            timeline.append("")
        # NOTE(review): the count is requested for the day AFTER the labelled
        # one, exactly as before.  Whether that is an off-by-one depends on
        # how getCountByDate interprets its date argument - confirm.
        feqlist.append(getCountByDate(day + one_day, spostdate))

    if long_window:
        # Always show the full date on the final point.  This used to be
        # written with ``==`` and therefore never took effect.
        timeline[-1] = str((begdate + datetime.timedelta(days=days)).date())

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间',
        '出现次数(帖子/回帖总数:' + str(llen) + ')',
        timeline,
        feqlist,
        "【" + authorname + '】的活跃程度图(' + str(begdate.date()) + "->"
        + str(enddate.date()) + ")")
    print('>>>>>图像加载完毕')
def showLastDays(word, days):
    """Plot how many posts per day contain ``word`` over the last ``days`` days.

    Loads the stored result via ``RFF.openResult``, restricts the posts to
    the window ending at the newest date reported by ``getTimeDomain``,
    counts the posts containing ``word`` per calendar day and draws the
    series with ``drawGraphic.linePlotGraphics``.

    Args:
        word: Substring looked for in the post content (``post[0]``).
        days: Whole number of days to look back; ``days + 1`` daily buckets
            (first to last day of the window) are plotted.
    """
    print("加载任务结果文件...")
    buf = RFF.openResult()
    _, enddate = getTimeDomain(RFF.getDateList(buf))

    print("计算时间区间...")
    begdate = enddate - datetime.timedelta(days=days)

    print("解析回帖数据...")
    spostdate = getPostDatebyTimeDomain(begdate, enddate,
                                        RFF.getPostDataList(buf))
    del buf

    print("开始统计.")
    # For windows longer than 30 days only every ``label_every``-th day
    # (ceil(days / 30)) gets an x label; the others get an empty string.
    long_window = days > 30
    label_every = (days + 29) // 30 if long_window else 1

    # Bucket i is the calendar day ``first_day + i``.
    first_day = begdate.date()
    feqlist = [0] * (days + 1)
    timeline = []
    for offset in range(days + 1):
        if offset % label_every == 0:
            day = first_day + datetime.timedelta(days=offset)
            timeline.append(str(day.month) + "-" + str(day.day))
        else:
            timeline.append("")

    if long_window:
        # Always show the full date on the final point.  This used to be
        # written with ``==`` and therefore never took effect.
        timeline[-1] = str(first_day + datetime.timedelta(days=days))

    # Each post is [content, author, time].
    for post in spostdate:
        if word in post[0]:
            postdate = datetime.datetime.strptime(post[2], "%Y-%m-%d %H:%M")
            satpos = (postdate.date() - first_day).days
            # Index by the offset itself.  Subtracting one sent first-day
            # posts to index -1 (the newest bucket) and shifted every other
            # count one label to the left.
            if 0 <= satpos <= days:
                feqlist[satpos] += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间',
        '出现次数(帖子/回帖总数:' + str(len(spostdate)) + ')',
        timeline,
        feqlist,
        "【" + word + '】的时间频率图(' + str(begdate.date()) + "->"
        + str(enddate.date()) + ")")
    print('>>>>>图像加载完毕')
def singleWordTF(word, datalist, scale=30):
    """Plot how often ``word`` occurs over time, in buckets of ``scale`` days.

    The time axis spans the range reported by ``getTimeDomain``.  Every
    thread title and every reply in ``datalist`` that contains ``word`` is
    counted in the bucket its timestamp falls into, and the series is drawn
    with ``drawGraphic.linePlotGraphics``.

    Args:
        word: Substring looked for in titles and replies.
        datalist: Threads shaped like
            ``[[title, author, time], [[reply, author, time], ...]]``.
        scale: Width of one bucket in days.
    """
    # Work out the overall time range first.
    begtime, endtime = getTimeDomain(RFF.getDateList())
    print("对比日期范围:", begtime, "->", endtime)
    blocks = int((endtime - begtime).days / scale)

    print('>>>>>开始处理.....')
    feqlist = [0] * (blocks + 1)
    timeline = [
        str((begtime + datetime.timedelta(days=scale * index)).date())
        for index in range(blocks + 1)
    ]

    def count_hit(stamp):
        # Add one hit to the bucket containing ``stamp``.  Floor division
        # keeps timestamps before ``begtime`` out of bucket 0, and the range
        # check avoids an IndexError (or a negative-index wrap-around) for
        # anything outside the plotted range.
        when = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M")
        feqpos = int((when - begtime).days // scale)
        if 0 <= feqpos <= blocks:
            feqlist[feqpos] += 1

    # Number of titles and replies actually examined.  The previous figure
    # was len(datalist) multiplied by the length of the first title, which is
    # unrelated to the data and raised IndexError for an empty ``datalist``.
    examined = 0
    for post in datalist:
        title = post[0]
        examined += 1
        if word in title[0]:
            count_hit(title[2])
        for reply in post[1]:
            # Replies without content/author/time cannot be dated; skip them.
            if len(reply) < 3:
                continue
            examined += 1
            if word in reply[0]:
                count_hit(reply[2])

    print('>>>>>处理完成,加载图像中.....')
    print(str(feqlist))
    print(str(timeline), str(feqlist))

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间',
        '出现次数(帖子/回帖总数:' + str(examined) + ')',
        timeline,
        feqlist,
        '时间频率图(' + str(begtime) + "->" + str(endtime) + ")")
    print('>>>>>图像加载完毕')
def activeTimeAnaylize(authorname, days):
    """Plot at which hours of the day ``authorname`` posts.

    Posts are loaded from ``RFF.openResult``, limited to the last ``days``
    days (the whole available range when ``days <= 0``), filtered by author
    and grouped per day by ``sortandget`` / ``gatherbyDays``.  A 24-slot
    hourly histogram is built and printed for every day, the histograms are
    summed slot by slot, and the totals are drawn with
    ``drawGraphic.linePlotGraphics``.

    Args:
        authorname: Author whose posts are selected by ``getPostByAuthor``.
        days: Days to look back from the newest date; ``<= 0`` keeps the
            full range.
    """
    raw = RFF.openResult()
    begdate, enddate = getTimeDomain(RFF.getDateList(raw))
    all_posts = RFF.getPostDataList(raw)
    if days > 0:
        begdate = enddate - datetime.timedelta(days=days)
    in_window = getPostDatebyTimeDomain(begdate, enddate, all_posts)
    authored = getPostByAuthor(authorname, in_window)

    # gatherbyDays yields entries shaped like [date, [timestamps...]].
    per_day = gatherbyDays(sortandget(authored))

    hours = list(range(24))

    # One histogram per day: slot h counts that day's timestamps in hour h.
    daily_histograms = []
    for day_entry in per_day:
        histogram = [0] * 24
        for stamp in day_entry[1]:
            histogram[stamp.hour] += 1
        daily_histograms.append(histogram)
        print(str(histogram))

    # Add the daily histograms together, hour by hour.
    totals = [
        sum(histogram[slot] for histogram in daily_histograms)
        for slot in hours
    ]
    print("after add up all :\n\n", str(totals))

    drawGraphic.linePlotGraphics(
        '时间(小时)',
        '发帖次数',
        hours,
        totals,
        "【" + authorname + '】的活跃时间段图(共 ' + str(len(daily_histograms))
        + " 天数据)")
def showKeyWord(authorname, days):
    """Print and chart the top keywords in the posts written by ``authorname``.

    Posts are loaded from ``RFF.openResult``, limited to the last ``days``
    days (the whole available range when ``days <= 0``) and filtered by
    author.  Their contents are joined into one text, up to ten keywords are
    extracted with ``jieba.analyse.extract_tags``, each keyword's number of
    occurrences in the text is counted, and the counts are drawn with
    ``drawGraphic.barHonGraphics``.

    Args:
        authorname: Author whose posts are selected by ``getPostByAuthor``.
        days: Days to look back from the newest date; ``<= 0`` keeps the
            full range.
    """
    buf = RFF.openResult()
    begdate, enddate = getTimeDomain(RFF.getDateList(buf))
    buf = RFF.getPostDataList(buf)
    if days > 0:
        begdate = enddate - datetime.timedelta(days=days)
    spostdate = getPostDatebyTimeDomain(begdate, enddate, buf)
    del buf
    spostdate = getPostByAuthor(authorname, spostdate)

    # Merge all post contents into one text, each one preceded by a "。"
    # separator.  Joining once avoids rebuilding the growing string for
    # every post.
    dp = "".join("。" + post[0] for post in spostdate)
    del spostdate

    # Extract the keywords.
    kd = jieba.analyse.extract_tags(dp, topK=10, allowPOS=('n', 'v'))
    print("\n\n贴吧ID:", authorname, ":\n总计回帖长度(基于已有数据):", len(dp), "\n关键词:\n")
    for keyword in kd:
        print(keyword, end="\t")
    print("\n\n")

    # Count how often each keyword occurs in the merged text.
    feqlist = [dp.count(keyword) for keyword in kd]
    print(str(feqlist))

    # Show the bar chart.
    drawGraphic.barHonGraphics("关键字", "出现次数", kd, feqlist,
                               "用户【" + authorname + "】的关键字")