async def getNewsCrawled(self): # 抓取新闻类型 # 抓取新闻条数 ret = {} table = '{}_countNewsByType'.format(self.LOCAL) sql = ''' SELECT newsType, newsCount FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) newsTypeName = { '0': 'News', '1': 'FunnyPictures', '2': 'Gallery', '3': 'Video', '4': 'Joke', '5': 'Blog', '6': 'Forum', '10000': 'AppWall', } for item in res: newsType = newsTypeName.get(item['newsType'], 'AppWall') newsCount = item['newsCount'] ret.update({newsType: newsCount}) return ret
async def getFreshPushCount(self): # 推送到达数 # 推送阅读数 ret = {} table = '{}_freshPushRealCtr'.format(self.LOCAL) sql = ''' SELECT freshPushSuccessCount, freshPushCount, freshPushRead FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({ 'freshPushSuccessCount': item.get('freshPushSuccessCount', 0) }) ret.update({'freshPushCount': item.get('freshPushCount', 0)}) ret.update({'freshPushRead': item.get('freshPushRead', 0)}) return ret
async def getSearchKeywordDesc(self): lst = [] ret = {} table = '{}_keywordSearchCountDesc'.format(self.LOCAL) sql = ''' SELECT keyword, sum(keywordCount) AS keyword_count FROM {db}.{table} WHERE date = {date} AND keywordCount > 1 GROUP BY keyword ORDER BY keyword_count DESC '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: lst.append({ 'keyword': item['keyword'], 'keywordCount': int(item.get('keyword_count', 0)) }) ret.update({self.date: lst}) return ret
async def getTypeCtr(self): # 展示新闻数 ret = {} table = '{}_fetchReadByNewsType'.format(self.LOCAL) sql = ''' SELECT newsType, newsTypeRead, newsTypeFetch FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: newsTypeRead = item['newsTypeRead'] newsTypeFetch = item['newsTypeFetch'] try: ctr = '{:3%}'.format(newsTypeRead / newsTypeFetch) except ZeroDivisionError: ctr = '0.00%' ret.update({ item['newsType']: { 'newsTypeRead': newsTypeRead, 'newsTypeFetch': newsTypeFetch, 'ctr': ctr } }) return ret
async def getPushSummary(self): # 推送到达数 # 推送阅读数 ret = {} table = '{}_pushSummary'.format(self.LOCAL) sql = ''' SELECT validPushCount, validPushSuccess, validPushClick FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: validPushCount = item['validPushCount'] validPushSuccess = item['validPushSuccess'] validPushClick = item['validPushClick'] ret.update({'validPushCount': validPushCount}) ret.update({'validPushSuccess': validPushSuccess}) ret.update({'validPushClick': validPushClick}) try: ret.update( {'pushReachRatio': validPushSuccess / validPushCount}) except ZeroDivisionError as e: ret.update({'pushReachRatio': 0.00}) try: ret.update({'pushRealCtr': validPushClick / validPushSuccess}) except ZeroDivisionError as e: ret.update({'pushRealCtr': 0.00}) return ret
async def getPushNewsTotalNormalReadCount(self): # 非推送阅读数 date = [] totalNormalReadCount = [] sql = ''' SELECT CAST(SUM(readCount) AS UNSIGNED) AS totalNormalReadCount FROM {table1} INNER JOIN {table2} ON {table1}.date = {date} AND {db}.{table1}.date = {db}.{table2}.date AND {db}.{table1}.newsId = {db}.{table2}.newsId '''.format(db=DATABASE, table1='{}_ctrNews'.format(LOCAL), table2='{}_pushCtrNews'.format(LOCAL), date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: try: date.append(self.date) totalNormalReadCount.append(int(item['totalNormalReadCount'])) except Exception: pass if len(date) == 0: date.append(self.date) totalNormalReadCount.append(0) await self.insert(date=date, table='pushNewsTotalNormalReadCount', totalNormalReadCount=totalNormalReadCount)
async def getFetchReadByCategory(self): # 类别 ret = {} table = '{}_fetchReadByCategory'.format(self.LOCAL) sql = ''' SELECT categoryName, readCount, fetchCount FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: readCount = item['readCount'] fetchCount = item['fetchCount'] categoryName = item['categoryName'] try: ctr = '{:.3%}'.format(readCount / fetchCount) except ZeroDivisionError: ctr = '0.00%' ret.update({ categoryName: { 'readCount': readCount, 'fetchCount': fetchCount, 'ctr': ctr } }) return ret
async def getCrawledNews(self): # 抓取新闻数 # 含相关新闻的新闻比例 ret = {} table = '{}_crawledNews'.format(self.LOCAL) sql = ''' SELECT crawledNews, crawledNewsWithRelative FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: crawledNews = item['crawledNews'] crawledNewsWithRelative = item['crawledNewsWithRelative'] relativeRatio = '{:.3%}'.format(crawledNewsWithRelative / crawledNews) ret.update({'crawledNews': crawledNews}) ret.update({'relativeRatio': relativeRatio}) return ret
async def getTrendingCtr(self): # trending展示 # trending点击 ret = {} table = '{}_trendingCtr'.format(self.LOCAL) sql = ''' SELECT CAST(SUM(impression) AS UNSIGNED) AS trendingImpression, CAST(SUM(click) AS UNSIGNED) AS trendingClick FROM {db}.{table} WHERE date = {date}; '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'trendingImpression': item['trendingImpression']}) ret.update({'trendingClick': item['trendingClick']}) return ret
async def getDisplayCount(self): # 总展示次数 ret = {} table = '{}_fetchReadByTag'.format(self.LOCAL) sql = ''' SELECT CAST(SUM(categoryTagFetch) AS UNSIGNED) AS displayCount, CAST(SUM(categoryTagRead) AS UNSIGNED) AS readCount FROM {db}.{table} WHERE date = {date}; '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'displayCount': item['displayCount']}) ret.update({'readCount': item['readCount']}) return ret
async def getNewsToPush(self): # 推送新闻数 # 总推送数 # 推送阅读数 ret = {} table = '{}_pushCtrNews'.format(self.LOCAL) sql = ''' SELECT COUNT(pushNews) AS newsToPush, CAST(SUM(pushNews) AS UNSIGNED) AS totalPushCount, CAST(SUM(pushNewsRead) AS UNSIGNED) AS totalPushReadCount FROM {db}.{table} WHERE date = {date} '''.format( db=DATABASE, table=table, date=self.date ) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'newsToPush': item['newsToPush']}) try: totalPushCount = int(item['totalPushCount']) except TypeError: totalPushCount = 0 try: totalPushReadCount = int(item['totalPushReadCount']) except TypeError: totalPushReadCount = 0 ret.update({'totalPushCount': totalPushCount}) ret.update({'totalPushReadCount': totalPushReadCount}) try: pushReadRatio = '{:.3%}'.format( totalPushReadCount / totalPushCount) except Exception: pushReadRatio = 0.00 ret.update({'pushReadRatio': pushReadRatio}) return ret
def transforDataFromSQLServerToMysql(): utc = pytz.utc maxDateTime = getMySQLMaxCreatedTime() print('mysql most recent time : ', maxDateTime) totalSQLServerCnt = getSQLServerNewInfo(maxDateTime) print('sql server total info: ' + str(totalSQLServerCnt)) sql = ''' SELECT NewsId, Type, DATEDIFF(s, '1970-01-01 00:00:00', CreatedTime) AS time FROM News WHERE CAST( CreatedTime AS DATETIMEOFFSET) > CAST('{}' AS DATETIMEOFFSET) '''.format(maxDateTime) datas = '' with pymssql.connect(**mssqlconfig_online_br) as conn: with conn.cursor(as_dict=True) as cursor: cursor.execute(sql) datas = cursor.fetchall() for item in datas: newsId = item['NewsId'] newsCreatedTime = item['time'] newsType = item['Type'] newsCreatedTime = datetime.datetime.utcfromtimestamp( int(newsCreatedTime)).replace(tzinfo=utc) # 如果newsCreatedTIme 比当前大于等于当前时间则跳过(说明是问题数据) now = datetime.datetime.utcfromtimestamp( time.time()).replace(tzinfo=utc) if newsCreatedTime >= now: continue newsCreatedTime = newsCreatedTime.strftime('%Y-%m-%d %H:%M:%S') sql = ('INSERT INTO daily_report.br_crawlFrequency(' 'newsId, newsCreatedTime, newsType) VALUES (' '{}, str_to_date("{}", "%Y-%m-%d %H:%i:%s"), {})').format( newsId, newsCreatedTime, newsType) try: DB(**DBCONFIG).insert(sql) except Exception as e: print(e)
async def getNewsReadTimeStatistics(self): # 新闻阅读时长 ret = {} table = '{}_readsNewsTimeStatistics'.format(self.LOCAL) sql = ''' SELECT lessThan5sec, moreThan5secLessThan20sec, moreThan20secLessThan60sec, moreThan60sec, avgReadTime FROM {db}.{table} WHERE date = {date} '''.format( db=DATABASE, table=table, date=self.date, ) res = DB(**DBCONFIG).query(sql) for item in res: lessThan5sec = int(item['lessThan5sec']) moreThan5secLessThan20sec = int(item['moreThan5secLessThan20sec']) moreThan20secLessThan60sec =\ int(item['moreThan20secLessThan60sec']) moreThan60sec = int(item['moreThan60sec']) totalReadTime = sum((lessThan5sec, moreThan5secLessThan20sec, moreThan20secLessThan60sec, moreThan60sec)) lessThan5sec = '{:.3%}'.format(lessThan5sec / totalReadTime) moreThan5secLessThan20sec = '{:.3%}'.format( moreThan5secLessThan20sec / totalReadTime) moreThan20secLessThan60sec = '{:.3%}'.format( moreThan20secLessThan60sec / totalReadTime) moreThan60sec = '{:.3%}'.format(moreThan60sec / totalReadTime) ret.update({'lessThan5sec': lessThan5sec}) ret.update( {'moreThan5secLessThan20sec': moreThan5secLessThan20sec}) ret.update( {'moreThan20secLessThan60sec': moreThan20secLessThan60sec}) ret.update({'moreThan60sec': moreThan60sec}) ret.update({'totalReadTime': totalReadTime}) ret.update({'avgReadTime': float(item['avgReadTime'])}) return ret
async def getBannerCtr(self): # 新闻ID 展示数 点击数 CTR res = [] table = '{}_bannerCtr'.format(self.LOCAL) sql = ''' SELECT news_id, title, start_time, end_time, impression_cnt, click_cnt, page_id FROM {db}.{table} ORDER BY end_time DESC '''.format( db=DATABASE, table=table, ) query_res = DB(**DBCONFIG).query(sql) for item in query_res: news_id = item['news_id'] title = item['title'] start_time = item['start_time'] end_time = item['end_time'] impression_cnt = int(item['impression_cnt']) click_cnt = int(item['click_cnt']) page_id = item['page_id'] ctr = '{:.3%}'.format(click_cnt / impression_cnt) ret = {} ret.update({ 'news_id': news_id, 'title': title, 'page_id': page_id, 'start_time': start_time, 'end_time': end_time, 'impression_cnt': impression_cnt, 'click_cnt': click_cnt, 'ctr': ctr }) res.append(ret) return res
async def insert(self, **kwargs): table = LOCAL + '_' + kwargs['table'] del kwargs['table'] sql = ''' INSERT INTO {db}.{table} ({keys}) VALUES '''.format(db=DATABASE, table=table, keys=', '.join(kwargs.keys())) for item in zip(*kwargs.values()): sql += str(item) sql += ', ' sql = sql[:-2] print('start inserting') try: DB(**DBCONFIG).insert(sql) except Exception as e: print(e)
async def getActiveUser(self): # 活跃用户 ret = {} table = '{}_fetchesUserNews'.format(self.LOCAL) sql = ''' SELECT activeUser FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'activeUser': item['activeUser']}) return ret
async def getReadNewsUser(self): # 读新闻用户 ret = {} table = '{}_readsUserNews'.format(self.LOCAL) sql = ''' SELECT readNewsUser FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'readNewsUser': item['readNewsUser']}) return ret
async def getFreshUser(self): # 新用户 ret = {} table = '{}_freshUserCount'.format(self.LOCAL) sql = ''' SELECT freshUser FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'freshUser': item['freshUser']}) return ret
async def getDisplayedNews(self): # 展示新闻数 ret = {} table = '{}_fetchesUserNews'.format(self.LOCAL) sql = ''' SELECT displayedNews FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'displayedNews': item['displayedNews']}) return ret
async def getLoginUser(self): # 登录用户 ret = {} table = '{}_loginUserTotal'.format(self.LOCAL) sql = ''' SELECT loginUser FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'loginUser': item['loginUser']}) return ret
async def getPullNewsCount(self): # 总刷新次数 ret = {} table = '{}_pullCount'.format(self.LOCAL) sql = ''' SELECT pullNewsCount FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'pullNewsCount': item['pullNewsCount']}) return ret
async def getLatestDate(self): # 活跃用户 ret = {} table = '{}_fetchesUserNews'.format(self.LOCAL) sql = ''' SELECT MAX(date) AS date FROM {db}.{table} '''.format( db=DATABASE, table=table, ) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'date': item['date']}) return ret
def readNewsCtrByType(start, length): try: sqlstring = ''' SELECt * FROM {db}.{table} LIMIT {start}, {length} '''.format( db=DATABASE, table='id_ctr_news', start=start, length=length ) data = DB(**DBCONFIG).query(sqlstring) return data except Exception as e: print(e, sqlstring) return False
async def getdisplayNewsWithoutRelative(self): # 展示新闻数 ret = {} table = '{}_fetchNewsCountWithoutRelative'.format(self.LOCAL) sql = ''' SELECT displayNewsWithoutRelative FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({ 'displayNewsWithoutRelative': item['displayNewsWithoutRelative'] }) return ret
async def getCommentUser(self): # 评论用户 # 评论用户比例 ret = {} table = '{}_commentUser'.format(self.LOCAL) sql = ''' SELECT commentUser, totalComments FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'commentUser': item['commentUser']}) ret.update({'totalComments': item['totalComments']}) return ret
def writeNewsCtrByType(logDate, categoryName, readCount, fetchCount, ctr): try: sqlstring = ''' INSERT INTO {db}.{table}(logDate, categoryName, readCount, fetchCount, ctr) VALUES ({logDate}, '{categoryName}', {readCount}, {fetchCount}, {ctr}) '''.format( db=DATABASE, table='id_ctr_news', logDate=logDate, categoryName=categoryName, readCount=readCount, fetchCount=fetchCount, ctr=ctr ) data = DB(**DBCONFIG).insert(sqlstring) return data except Exception as e: print(e, sqlstring) return False
async def getPushNewsTotalNormalReadCount(self): # 非推送阅读数 ret = {} table = '{}_pushNewsTotalNormalReadCount'.format(self.LOCAL) sql = ''' SELECT totalNormalReadCount FROM {db}.{table} WHERE date={date} '''.format( db=DATABASE, table=table, date=self.date ) res = DB(**DBCONFIG).query(sql) for item in res: ret.update({'totalNormalReadCount': item['totalNormalReadCount']}) return ret
def getKeywordSearchCountDesc(self): # 搜索关键词以及搜索次数 date = [] keyword = [] keywordCount = [] content = self.readFile('keywordSearchCountDesc') for line in content: if line[0].startswith('""'): continue else: date.append(self.date) line[0] = '"' + line[0] + '"' keyword.append(line[0]) try: keywordCount.append(int(line[1])) except IndexError: keywordCount.append(0) table = 'br_keywordSearchCountDesc' sql_base = ''' INSERT INTO {db}.{table} (date, keyword, keywordCount) VALUES ( '''.format( db=DATABASE, table=table, ) for index, dt in enumerate(date): sql = sql_base + ', '.join( [str(dt), keyword[index], str(keywordCount[index])]) sql += ')' try: DB(**DBCONFIG).insert(sql) except Exception: pass
async def getCrawlFrequency(self, newsType): # 新闻标签-展示新闻数 table = '{}_crawlFrequency'.format(self.LOCAL) sql = ''' SELECT crawledNewsNum, intervals FROM( SELECT count(*) AS crawledNewsNum, sec_to_time( time_to_sec(newsCreatedTime) - time_to_sec(newsCreatedTime) % (10 * 60) ) AS intervals FROM {db}.{table} WHERE date(newsCreatedTime) = '{newsCreatedTime}' '''.format( db=DATABASE, table=table, newsCreatedTime=self.date ) if newsType is None: sql = sql + ''' GROUP BY intervals) as tmp ORDER BY intervals ''' else: sql = sql + ''' AND newsType = {newsType} GROUP BY intervals) as tmp ORDER BY intervals '''.format( newsType=newsType ) ret = {} res = DB(**DBCONFIG).query(sql) crawledNewsNum = 0 for item in res: crawledNewsNum += int(item['crawledNewsNum']) newsCreatedTime = str(item['intervals']) ret.update({newsCreatedTime: crawledNewsNum}) interval = arrow.get(str(self.date), 'YYYYMMDD') key = str(interval.format('H:mm:ss')) if key not in ret.keys(): ret.update({key: 0}) for _ in range(144): previous_key = key interval = interval.replace(minutes=+10) key = str(interval.format('H:mm:ss')) if key not in ret.keys(): # {time: crawledNewsNum} ret.update({key: ret[previous_key]}) utc = arrow.utcnow() _to_delete_keys = [] for key in ret.keys(): time = str(self.date) + ' ' + key if arrow.get(time, 'YYYYMMDD H:mm:ss') > utc: _to_delete_keys.append(key) for key in _to_delete_keys: ret.pop(key, None) ret_lst = [{'time': item[0], 'crawledNewsNum': item[1]} for item in sorted( ret.items(), key=lambda x: arrow.get(x[0], 'H:mm:ss'))] return ret_lst
async def getCategoryTagFetch(self): # 新闻标签-展示新闻数 ret = {} table = '{}_hotFetchReadByTag'.format(self.LOCAL) partreads = {} partfetches = {} totalRead = 0 totalFetch = 0 validRead = 0 validFetch = 0 videoRead = 0 videoFetch = 0 sql = ''' SELECT categoryTag, hotTagFetch, hotTagRead FROM {db}.{table} WHERE date = {date} '''.format(db=DATABASE, table=table, date=self.date) res = DB(**DBCONFIG).query(sql) for item in res: categoryTag = item['categoryTag'] hotTagRead = item['hotTagRead'] hotTagFetch = item['hotTagFetch'] totalRead += hotTagRead totalFetch += hotTagFetch validTags = [ 'JOKE', 'VIDEO', 'FUNNY', 'RELATIVENEWS', 'LOCKEDSCREEN' ] if all(tag not in categoryTag.upper() for tag in validTags): validRead += hotTagRead validFetch += hotTagFetch if 'VIDEO' in categoryTag.upper(): videoRead += hotTagRead videoFetch += hotTagFetch partname = categoryTag.split(':')[0] partreads.update( {partname: (partreads.get(partname, 0) + hotTagRead)}) partfetches.update( {partname: (partfetches.get(partname, 0) + hotTagFetch)}) for item in res: categoryTag = item['categoryTag'] hotTagRead = item['hotTagRead'] hotTagFetch = item['hotTagFetch'] partname = categoryTag.split(':')[0] percent = '{:.3%}'.format(hotTagFetch / partfetches[partname]) ctr = '{:.3%}'.format(hotTagRead / hotTagFetch) ret.update({ categoryTag: { 'hotTagRead': hotTagRead, 'hotTagFetch': hotTagFetch, 'percent': percent, 'ctr': ctr } }) ret.update({ 'Video': { 'hotTagFetch': videoFetch, 'hotTagRead': videoRead, 'percent': '==', 'ctr': '{:.3%}'.format(videoRead / videoFetch) } }) ret.update({ 'Total': { 'hotTagFetch': totalFetch, 'hotTagRead': totalRead, 'percent': '==', 'ctr': '{:.3%}'.format(totalRead / totalFetch) } }) ret.update({ 'Valid': { 'hotTagFetch': validFetch, 'hotTagRead': validRead, 'percent': '==', 'ctr': '{:.3%}'.format(validRead / validFetch) } }) for partname in partfetches.keys(): ret.update({ partname: { 'hotTagFetch': partfetches[partname], 'hotTagRead': partreads[partname], 'percent': '==', 'ctr': '{:.3%}'.format(partreads[partname] / partfetches[partname]) } }) return ret