@classmethod
def computeVideoRankTable(cls):
    task_name = '计算视频排名对照表'  # compute the video rank lookup table
    videoCollection = MongoDbDao.getCollection('video')
    count = videoCollection.estimated_document_count()
    top_n = 60
    keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare']
    task = ProgressTask(task_name,
                        top_n * len(keys),
                        collection=MongoDbDao.getCollection('tracer'))
    videoRankTable = {'name': 'video_rank'}
    skip = int(count / 100)  # one sample per percentile step
    for each_key_index in range(len(keys)):
        each_key = keys[each_key_index]
        videoRankTable[each_key] = {}
        videoRankTable[each_key]['rate'] = []
        last_value = 9999999999
        # record the exact rank of the top-200 titles for this metric
        videoCollectionResult = videoCollection.find({}, {
            'title': 1
        }).limit(200).sort(each_key, DESCENDING).batch_size(200)
        top = 1
        for each_video in videoCollectionResult:
            videoRankTable[each_key][each_video['title']] = top
            top += 1
        # sample one threshold value per percentile to build the rate table
        for i in range(1, top_n + 1):
            task.current_value = i + top_n * each_key_index
            videoCollectionResult = list(
                videoCollection.find({
                    each_key: {
                        '$lt': last_value
                    }
                }, {
                    each_key: 1
                }).limit(1).skip(skip).sort(each_key, DESCENDING))
            if len(videoCollectionResult) != 0:
                videoCollectionResult = videoCollectionResult[0]
            else:
                continue
            if each_key not in videoCollectionResult:
                break
            last_value = videoCollectionResult[each_key]
            videoRankTable[each_key]['rate'].append(last_value)
    videoRankTable['update_time'] = datetime.datetime.now()
    rankTableCollection = MongoDbDao.getCollection('rank_table')
    rankTableCollection.update_one({'name': 'video_rank'},
                                   {'$set': videoRankTable},
                                   upsert=True)
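# A minimal usage sketch (not part of the source): each 'rate' list holds
# one metric threshold per percentile step in descending order, so a
# video's rough percentile can be recovered with a reverse bisect.
# `lookup_percentile` is a hypothetical helper name.
import bisect

def lookup_percentile(rank_table, key, value):
    rate = rank_table[key]['rate']       # descending thresholds
    ascending = rate[::-1]               # bisect expects ascending order
    pos = bisect.bisect_right(ascending, value)
    return len(rate) - pos + 1           # 1 means roughly the top percentile

# e.g. lookup_percentile(videoRankTable, 'cView', 150000)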
def process_item(self, item, spider):
    try:
        if item['c_fans'] != 0:
            # upsert the author doc and prepend the latest data point
            self.authorCollection.update_one({
                'mid': item['mid']
            }, {
                '$set': {
                    'focus': True,
                    'sex': item['sex'],
                    'name': item['name'],
                    'face': item['face'],
                    'level': item['level'],
                    'cFans': item['c_fans'],
                    'cLike': item['c_like'],
                    'cRate': item['c_rate'],
                    'official': item['official'],
                },
                '$push': {
                    'data': {
                        '$each': [item['data']],
                        '$position': 0
                    }
                }
            }, upsert=True)
        if 'object_id' in item:
            sentCallBack(item['object_id'],
                         MongoDbDao.getCollection('user_record'))
        return item
    except Exception as error:
        logging.error('{}: {}'.format(spider.name, error))
def process_item(self, item, spider):
    try:
        # upsert the video doc and prepend the latest data point
        self.videoCollection.update_one({
            'aid': int(item['aid'])
        }, {
            '$set': {
                'cView': item['current_view'],
                'cFavorite': item['current_favorite'],
                'cDanmaku': item['current_danmaku'],
                'cCoin': item['current_coin'],
                'cShare': item['current_share'],
                'cLike': item['current_like'],
                'cDatetime': item['current_datetime'],
                'author': item['author'],
                'subChannel': item['subChannel'],
                'channel': item['channel'],
                'mid': item['mid'],
                'pic': item['pic'],
                'title': item['title'],
                'datetime':
                datetime.datetime.fromtimestamp(item['datetime'])
            },
            '$push': {
                'data': {
                    '$each': [item['data']],
                    '$position': 0
                }
            }
        }, upsert=True)
        if 'object_id' in item:
            sentCallBack(item['object_id'],
                         MongoDbDao.getCollection('user_record'))
        return item
    except Exception as error:
        logging.error('{}: {}'.format(spider.name, error))
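# For reference (an illustration, not from the source), the upsert above
# yields video documents shaped roughly like this; every value below is
# hypothetical, and the newest crawl always sits at data[0].
example_video_doc = {
    'aid': 170001,
    'title': 'example title',
    'mid': 123456,
    'channel': 'example channel',
    'subChannel': 'example sub-channel',
    'cView': 100000,
    'cLike': 5000,
    'cDanmaku': 800,
    'cFavorite': 900,
    'cCoin': 700,
    'cShare': 60,
    'data': [{'view': 100000, 'like': 5000}],  # history, newest first
}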
class OnlineService:
    __online_url = 'https://www.bilibili.com/video/online.html'
    __online_key = "online:start_urls"
    __redisConnection = RedisDao.getRedisConnect()
    __tracerCollection = MongoDbDao.getCollection('tracer')

    @classmethod
    def genOnline(cls):
        task_name = "生成在线人数爬取链接"  # queue the online-count crawl URL
        progressTask = ProgressTask(task_name, 1,
                                    collection=cls.__tracerCollection)
        cls.__redisConnection.rpush(cls.__online_key, cls.__online_url)
        progressTask.current_value = 1

    @classmethod
    def crawlOnlineTopListData(cls):
        task_name = "生成强力追踪待爬链接"  # queue strong-tracking crawl URLs
        logger.info(task_name)
        response = requests.get(cls.__online_url)
        data_text = etree.HTML(
            response.content.decode('utf8')).xpath('//script/text()')[-2]
        # slice off the 'window.__INITIAL_STATE__=' prefix and the trailing
        # 122 characters of non-JSON script text, then parse the JSON
        # (str.lstrip strips a character set, not a prefix, so slice instead)
        prefix = 'window.__INITIAL_STATE__='
        j = json.loads(data_text[len(prefix):][:-122])
        total = len(j['onlineList'])
        progressTask = ProgressTask(task_name, total,
                                    collection=cls.__tracerCollection)
        for each_video in j['onlineList']:
            mid = each_video['owner']['mid']
            if mid not in [7584632, 928123]:  # skip excluded authors
                AuthorService.pushAuthorRedisUrlToRedis(mid)
            VideoService.pushVideoRedisUrlToRedis(each_video['aid'])
            progressTask.current_value += 1
@classmethod
def __autoCrawlTask(cls):
    task_name = "自动爬虫计划调度服务"  # automatic crawl scheduling service
    logger.info(task_name)
    ExistsTask(task_name,
               update_frequency=60,
               collection=MongoDbDao.getCollection('tracer'))
    # run pending `schedule` jobs forever, polling once a minute
    while True:
        schedule.run_pending()
        sleep(60)
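# A minimal wiring sketch (an assumption, not from the source) of how jobs
# might be registered with the `schedule` library before __autoCrawlTask
# starts polling; the times are placeholders and the service methods are
# the ones defined elsewhere in this repo.
import schedule

def registerJobs():
    schedule.every().day.at("01:00").do(VideoService.updateVideo)
    schedule.every().day.at("02:00").do(VideoService.updateAutoAddVideo)
    schedule.every().hour.do(OnlineService.genOnline)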
@classmethod
def updateAuthor(cls):
    task_name = "生成每日作者待爬链接"  # queue daily author crawl URLs
    logger.info(task_name)
    coll = MongoDbDao.getCollection('author')
    filter_dict = {'$or': [{'focus': True}, {'forceFocus': True}]}
    cursor = coll.find(filter_dict, {"mid": 1}).batch_size(200)
    total = coll.count_documents(filter_dict)
    if total != 0:
        t = ProgressTask(task_name, total,
                         collection=MongoDbDao.getCollection('tracer'))
        for each_doc in cursor:
            t.current_value += 1
            cls.pushAuthorRedisUrlToRedis(each_doc['mid'])
class VideoService:
    __video_url = "https://api.bilibili.com/x/article/archives?ids={aid}"
    __video_key = "videoRedis:start_urls"
    __redisConnection = RedisDao.getRedisConnect()
    __tracerCollection = MongoDbDao.getCollection('tracer')

    @classmethod
    def updateVideo(cls, focus=True):
        if focus:
            task_name = "生成每日视频待爬链接"      # daily video crawl URLs
        else:
            task_name = "生成保守观测视频待爬链接"  # low-frequency watch URLs
        logger.info(task_name)
        doc_filter = {'focus': focus}
        videoCollection = MongoDbDao.getCollection('video')
        total = videoCollection.count_documents(doc_filter)
        cursor = videoCollection.find(doc_filter, {"aid": 1}).batch_size(200)
        if total == 0:
            return
        countNum = 0
        aid_list = ''
        progressTask = ProgressTask(task_name, total,
                                    collection=cls.__tracerCollection)
        # batch aids 50 at a time into one comma-separated API URL
        for each_doc in cursor:
            aid_list += str(each_doc['aid']) + ','
            countNum += 1
            logger.info(each_doc['aid'])
            if countNum == 50:
                progressTask.current_value += countNum
                cls.pushVideoRedisUrlToRedis(aid_list[:-1])
                aid_list = ''
                countNum = 0
        # flush the final partial batch, if any
        if aid_list:
            progressTask.current_value += countNum
            cls.pushVideoRedisUrlToRedis(aid_list[:-1])

    @classmethod
    def pushVideoRedisUrlToRedis(cls, aid):
        cls.__redisConnection.rpush(cls.__video_key,
                                    cls.__video_url.format(aid=aid))

    @classmethod
    def updateAutoAddVideo(cls):
        task_name = "生成作者最新发布的视频的待爬链接"  # authors' newest videos
        logger.info(task_name)
        authorCollection = MongoDbDao.getCollection('author')
        doc_filter = {'$or': [{'focus': True}, {'forceFocus': True}]}
        total = authorCollection.count_documents(doc_filter)
        authorCollectionResult = authorCollection.find(doc_filter, {'mid': 1})
        if total != 0:
            progressTask = ProgressTask(task_name, total,
                                        collection=cls.__tracerCollection)
            for each_doc in authorCollectionResult:
                progressTask.current_value += 1
                url = ('https://space.bilibili.com/ajax/member/getSubmitVideos'
                       '?mid={}&pagesize=10&page=1&order=pubdate').format(
                           each_doc['mid'])
                cls.__redisConnection.rpush("videoAutoAdd:start_urls", url)
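# A hypothetical consumer-side sketch (not from the source): the URLs pushed
# to "videoRedis:start_urls" above are what a scrapy-redis spider keyed on
# the same list would consume; the spiders in this repo do subclass
# RedisSpider. The class name and parse body here are illustrative only.
from scrapy_redis.spiders import RedisSpider

class VideoRedisSpiderSketch(RedisSpider):
    name = 'videoRedisSketch'
    redis_key = 'videoRedis:start_urls'  # the key VideoService pushes to

    def parse(self, response):
        # items yielded here would flow into pipelines like process_item
        pass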
class TagAdderService:
    __videoCollection = MongoDbDao.getCollection('video')
    __tracerCollection = MongoDbDao.getCollection('tracer')
    __redisConnection = RedisDao.getRedisConnect()

    @classmethod
    def addTagTask(cls):
        task_name = "生成待爬标签视频链接"  # queue tag-crawl video URLs
        doc_filter = {'tag': {'$exists': False}}
        # Cursor.count() is gone in modern PyMongo; count the filter instead
        total = cls.__videoCollection.count_documents(doc_filter)
        cursor = cls.__videoCollection.find(doc_filter, {
            "aid": 1
        }).batch_size(100)
        progressTask = ProgressTask(task_name, total,
                                    collection=cls.__tracerCollection)
        url = 'https://www.bilibili.com/video/av{}'
        for each_video in cursor:
            progressTask.current_value += 1
            aid = each_video['aid']
            logger.info("待爬AV号{}".format(aid))  # av id queued for crawling
            cls.__redisConnection.rpush("tagAdder:start_urls", url.format(aid))
def process_item(self, item, spider):
    # store per-page danmaku aggregates under danmaku_aggregate.<page_number>
    self.coll.update_one({
        'aid': int(item['aid'])
    }, {
        '$set': {
            'danmaku_aggregate.{}'.format(item['page_number']): {
                'duration': item['duration'],
                'p_name': item['p_name'],
                'danmaku_density': item['danmaku_density'],
                'word_frequency': item['word_frequency']
            },
            'danmaku_aggregate.updatetime':
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
    }, upsert=True)
    sentCallBack(item['object_id'], MongoDbDao.getCollection('user_record'))
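# An illustration (not from the source): with page_number == 1 the dotted
# $set keys above leave a nested structure like this on the video document;
# all values are hypothetical.
example_danmaku_aggregate = {
    'danmaku_aggregate': {
        '1': {                        # one entry per crawled page
            'duration': 245,
            'p_name': 'P1',
            'danmaku_density': [],
            'word_frequency': [],
        },
        'updatetime': '2020-01-01 00:00:00',
    }
}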
class BangumiAndDonghuaService:
    __tracerCollection = MongoDbDao.getCollection('tracer')
    __redisConnection = RedisDao.getRedisConnect()
    __bangumiAndDonghua_key = "bangumiAndDonghua:start_urls"

    @classmethod
    def autoCrawlBangumi(cls):
        task_name = "生成番剧国创待爬链接"  # bangumi / domestic animation URLs
        logger.info(task_name)
        urls = [
            "https://www.bilibili.com/ranking/bangumi/167/0/7",
            "https://www.bilibili.com/ranking/bangumi/13/0/7"
        ]
        # the task total must match the number of pushed URLs, not 1
        progressTask = ProgressTask(task_name, len(urls),
                                    collection=cls.__tracerCollection)
        for url in urls:
            cls.__redisConnection.rpush(cls.__bangumiAndDonghua_key, url)
            progressTask.current_value += 1
def __init__(self):
    self.coll = MongoDbDao.getCollection('video')
class KeywordAdderService:
    __authorCollection = MongoDbDao.getCollection('author')
    __videoCollection = MongoDbDao.getCollection('video')
    __searchWordCollection = MongoDbDao.getCollection('search_word')
    __tracerCollection = MongoDbDao.getCollection('tracer')
    jieba.load_userdict('./resources/dict.txt')

    @classmethod
    def addOmitted(cls):
        if cls.__searchWordCollection.count_documents({}) < 100:
            return
        progressTask = ProgressTask(
            "更新查询关键词字典",  # refresh the search keyword dictionary
            total_value=cls.__searchWordCollection.count_documents({}),
            collection=cls.__tracerCollection)
        # read the current user dictionary, then append every searched
        # aid/mid that is not in it yet
        with open('./resources/dict.txt', 'r', encoding='utf8') as fileConnection:
            words = fileConnection.read().split('\n')
        for each in cls.__searchWordCollection.find():
            if 'aid' in each and str(each['aid']) not in words:
                words.append(str(each['aid']))
            elif 'mid' in each and str(each['mid']) not in words:
                words.append(str(each['mid']))
            progressTask.current_value += 1
        progressTask.finished = True
        with open('./resources/dict.txt', 'w', encoding='utf8',
                  newline='') as fileConnection:
            for each in words:
                fileConnection.write(each + '\n')
        cls.__searchWordCollection.delete_many({})
        jieba.load_userdict('./resources/dict.txt')
        cls.__refreshAllAuthor()
        cls.__refreshAllVideo()

    @classmethod
    def __refreshAllAuthor(cls):
        for each_author in cls.__authorCollection.find({}, {
                '_id': 0,
                'mid': 1
        }).batch_size(100):
            mid = each_author['mid']
            print("[mid]" + str(mid))
            # keywords are extracted from name and official
            authorCollectionResult = cls.__authorCollection.find_one(
                {'mid': mid}, {
                    '_id': 0,
                    'name': 1,
                    'official': 1,
                    'keyword': 1
                })
            keyWord = []
            for each_key in authorCollectionResult:
                if each_key != 'keyword':
                    keyWord.append(
                        str(authorCollectionResult[each_key]).lower())
                else:
                    keyWord += authorCollectionResult['keyword']
            # search-engine mode segmentation
            seg_list = jieba.lcut_for_search(' '.join(keyWord), True)
            # the full name also counts as a keyword
            if 'name' in authorCollectionResult and authorCollectionResult[
                    'name'].lower() not in seg_list:
                seg_list.append(authorCollectionResult['name'].lower())
            while ' ' in seg_list:
                seg_list.remove(' ')
            while '、' in seg_list:
                seg_list.remove('、')
            cls.__authorCollection.update_one(
                {'mid': mid}, {'$set': {'keyword': list(set(seg_list))}})
            sleep(0.01)

    @classmethod
    def __refreshAllVideo(cls):
        for each_video in cls.__videoCollection.find({}, {
                '_id': 0,
                'aid': 1
        }).batch_size(100):
            aid = each_video['aid']
            print("[aid]" + str(aid))
            # keywords are extracted from title, channels, author and tags
            videoCollectionResult = cls.__videoCollection.find_one(
                {'aid': aid}, {
                    '_id': 0,
                    'title': 1,
                    'channel': 1,
                    'subChannel': 1,
                    'author': 1,
                    'tag': 1
                })
            keyword = []
            for each_key in videoCollectionResult:
                # list-valued fields are extended; everything else is
                # appended as plain lowercase text ('or' here would always
                # be true, so the condition must use 'and')
                if each_key != 'keyword' and each_key != 'tag':
                    keyword.append(
                        str(videoCollectionResult[each_key]).lower())
                elif each_key == 'tag':
                    keyword += videoCollectionResult['tag']
                else:
                    keyword += videoCollectionResult['keyword']
            # search-engine mode segmentation
            seg_list = jieba.lcut_for_search(' '.join(keyword), True)
            # the full author name also counts as a keyword
            if 'author' in videoCollectionResult and videoCollectionResult[
                    'author'].lower() not in seg_list:
                seg_list.append(videoCollectionResult['author'].lower())
            while ' ' in seg_list:
                seg_list.remove(' ')
            while '、' in seg_list:
                seg_list.remove('、')
            cls.__videoCollection.update_one(
                {'aid': aid}, {'$set': {'keyword': list(set(seg_list))}})
            sleep(0.01)
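# A small self-contained illustration (not from the source) of the
# segmentation step used above: lcut_for_search produces overlapping
# segments suited to search indexing, and the full string is kept as a
# keyword of its own, mirroring what the service does. The title is made up.
import jieba

title = '某科学的超电磁炮'
segments = jieba.lcut_for_search(title.lower(), True)  # search-engine mode
segments.append(title.lower())  # the full name also counts as a keyword
keywords = list(set(segments))  # dedupe, as the service does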
def __init__(self):
    self.videoCollection = MongoDbDao.getCollection('video')
    self.task = SpiderTask('视频数据更新爬虫',
                           collection=MongoDbDao.getCollection('tracer'))
def __init__(self):
    self.task = SpiderTask('同时在线人数爬虫',
                           collection=MongoDbDao.getCollection('tracer'))
def __init__(self):
    self.task = SpiderTask("番剧动画爬虫",
                           collection=MongoDbDao.getCollection('tracer'))
def __init__(self):
    RedisSpider.__init__(self)
    self.task = SpiderTask('活跃作者自动追加爬虫',
                           collection=MongoDbDao.getCollection('tracer'))
class FansWatcherService:
    __videoCollection = MongoDbDao.getCollection('video')
    __authorCollection = MongoDbDao.getCollection('author')
    __fansVariationCollection = MongoDbDao.getCollection('fans_variation')

    @classmethod
    def watchBigAuthor(cls):
        # authors with more than 10k fans and at least one data point in
        # the last 32 days
        a = cls.__authorCollection.aggregate([{
            '$match': {
                'data': {'$exists': True},
                'cFans': {'$gt': 10000}
            }
        }, {
            '$project': {
                'mid': 1,
                'face': 1,
                'name': 1,
                'data': {
                    '$filter': {
                        'input': '$data',
                        'as': 'data',
                        'cond': {
                            '$gt': [
                                '$$data.datetime',
                                datetime.datetime.now() -
                                datetime.timedelta(32)
                            ]
                        }
                    }
                }
            }
        }, {
            '$match': {
                'data.0': {'$exists': True}
            }
        }])
        for each_author in a:
            cls.__judge(each_author)

    @classmethod
    def __insertEvent(cls, delta_rate, d_daily, author, info, date):
        out_data = {
            'variation': int(d_daily),
            'mid': author['mid'],
            'author': author['name'],
            'face': author['face'],
            'deltaRate': delta_rate,
            'datetime': date.strftime('%Y-%m-%d'),
            'info': info,
        }
        # pick the author's most-viewed video published within roughly a
        # week of the event as the probable cause
        videos = cls.__videoCollection.find({'mid': author['mid']})
        temp_video = {}
        cause = {'type': 'video'}
        for each_v in videos:
            if type(each_v['datetime']) == str:
                pass
            elif -1 <= (date - each_v['datetime']).days <= 7:
                temp_video['aid'] = each_v['aid']
                temp_video['title'] = each_v['title']
                temp_video['pic'] = each_v['pic']
                temp_video['cView'] = each_v['data'][0]['view']
                temp_video['channel'] = each_v['channel']
                temp_video['subChannel'] = each_v['subChannel']
                if ('cView' not in temp_video or 'aid' not in cause
                        or temp_video['cView'] > cause['cView']):
                    cause['aid'] = temp_video['aid']
                    cause['title'] = temp_video['title']
                    cause['pic'] = temp_video['pic']
                    cause['cView'] = temp_video['cView']
                    cause['channel'] = temp_video['channel']
                    cause['subChannel'] = temp_video['subChannel']
        if cause != {'type': 'video'}:
            out_data['cause'] = cause
        cls.__fansVariationCollection.replace_one(
            {
                'mid': out_data['mid'],
                'datetime': out_data['datetime']
            }, out_data, upsert=True)

    @classmethod
    def __judge(cls, author):
        """
        Possible events:
        1. 大量涨粉:   daily gain over 25x last week's average
        2. 史诗级涨粉: daily gain over 50x last week's average, or over 100k in one day
        3. 传说级涨粉: daily gain over 100x last week's average, or over 200k in one day
        4. 急转直下:   an author on an upward track suddenly loses fans
        5. 大量掉粉:   daily loss over 5k
        6. 雪崩级掉粉: daily loss over 20k
        7. 末日级掉粉: daily loss over 50k
        8. 新星爆发:   daily gain over 20% of total fans
        """
        data = sorted(author['data'], key=lambda x: x['datetime'])
        start_date = data[0]['datetime'].timestamp()
        end_date = data[-1]['datetime'].timestamp()
        x = []
        y = []
        for each in data:
            x.append(each['datetime'].timestamp())
            y.append(each['fans'])
        if len(x) <= 1:
            return
        # linear interpolation turns the raw samples into a fans(t) function
        interrupted_fans = interp1d(x, y, kind='linear')
        temp_date = datetime.datetime.fromtimestamp(start_date)
        c_date = datetime.datetime(temp_date.year, temp_date.month,
                                   temp_date.day).timestamp() + 86400 * 3
        if c_date - 86400 * 2 <= start_date:
            return
        while c_date <= end_date:
            date = datetime.datetime.fromtimestamp(c_date)
            daily_array = interrupted_fans([c_date - 86400, c_date])
            p_daily_array = interrupted_fans(
                [c_date - 86400 * 2, c_date - 86400])
            # fan gain over the preceding 24-hour window
            pd_daily = p_daily_array[1] - p_daily_array[0]
            # fan gain over the last 24 hours
            d_daily = daily_array[1] - daily_array[0]
            if d_daily >= 5000 or d_daily <= -2000:
                # guard against a flat previous day (division by zero)
                delta_rate = (round(d_daily / pd_daily * 100, 2)
                              if pd_daily != 0 else 0)
                if d_daily >= daily_array[1] * 0.50:
                    cls.__insertEvent(
                        round(d_daily / daily_array[1] * 100, 2), d_daily,
                        author, '新星爆发', date)
                if d_daily <= 0 and pd_daily >= 0:
                    cls.__insertEvent('-', d_daily, author, '急转直下', date)
                    c_date += 86400
                    continue
                if d_daily <= -50000:
                    cls.__insertEvent(delta_rate, d_daily, author,
                                      '末日级掉粉', date)
                elif d_daily <= -20000:
                    cls.__insertEvent(delta_rate, d_daily, author,
                                      '雪崩级掉粉', date)
                elif d_daily <= -5000:
                    cls.__insertEvent(delta_rate, d_daily, author,
                                      '大量掉粉', date)
                # weekly comparison once a full week of data exists
                if c_date >= start_date + 86400 * 8 and d_daily > 0:
                    weekly_array = interrupted_fans(
                        [c_date - 86400 * 8, c_date - 86400])
                    # average daily gain over the previous week
                    weekly_mean = (weekly_array[1] - weekly_array[0]) / 7
                    delta_rate = round(d_daily / weekly_mean * 100, 2)
                    if delta_rate >= 10000 or d_daily >= 200000:
                        # over 100x the weekly average, or 200k in one day
                        cls.__insertEvent(delta_rate, d_daily, author,
                                          '传说级涨粉', date)
                    elif delta_rate >= 5000 or d_daily >= 100000:
                        # over 50x the weekly average, or 100k in one day
                        cls.__insertEvent(delta_rate, d_daily, author,
                                          '史诗级涨粉', date)
                    elif delta_rate >= 2500:
                        # over 25x the weekly average
                        cls.__insertEvent(delta_rate, d_daily, author,
                                          '大量涨粉', date)
            c_date += 86400
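# A toy check (assuming scipy, which __judge already uses) of the
# interpolation step: two fan samples two days apart are evaluated at
# one-day spacing, and the difference of consecutive interpolated points
# is the daily gain. The timestamps and fan counts are made up.
from scipy.interpolate import interp1d

t0 = 1600000000                          # hypothetical base timestamp
x = [t0, t0 + 86400 * 2]                 # samples two days apart
y = [1000, 3000]                         # fans at those times
fans_at = interp1d(x, y, kind='linear')
daily = fans_at([t0 + 86400, t0 + 86400 * 2])
d_daily = daily[1] - daily[0]            # 1000.0 fans gained in that day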
def __init__(self):
    jieba.load_userdict('./resources/dict.txt')
    self.videoCollection = MongoDbDao.getCollection('video')
    self.errorCollection = MongoDbDao.getCollection('error')
def __init__(self):
    self.authorCollection = MongoDbDao.getCollection('author')
    self.redisConnection = RedisDao.getRedisConnect()
def __init__(self):
    self.videoCollection = MongoDbDao.getCollection('video')
    self.redisConnection = RedisDao.getRedisConnect()
def __init__(self):
    self.videoOnlineCollection = MongoDbDao.getCollection('video_online')
def __init__(self):
    self.authorCollection = MongoDbDao.getCollection('author')
    self.task = SpiderTask('观测UP主的视频数据自动追加爬虫',
                           collection=MongoDbDao.getCollection('tracer'))
def __init__(self):
    self.coll = MongoDbDao.getCollection('site_info')
def __init__(self):
    self.authorCollection = MongoDbDao.getCollection('author')
def __init__(self):
    self.mongoDbConnection = MongoDbDao.getMongoDb()
@classmethod
def calculateAuthorRank(cls):
    task_name = "计算作者排名数据"  # compute author ranking data
    authorCollection = MongoDbDao.getCollection('author')
    keys = ['cFans', 'cArchive_view', 'cArticle_view']
    allCount = authorCollection.count_documents({keys[0]: {'$exists': 1}})
    progressTask = ProgressTask(
        task_name,
        allCount * len(keys),
        collection=MongoDbDao.getCollection('tracer'))
    for each_key in keys:
        logger.info("开始计算作者{}排名".format(each_key))
        authorCollectionResult = authorCollection.find(
            {each_key: {'$exists': 1}},
            {'mid': 1, 'rank': 1, each_key: 1}
        ).batch_size(300).sort(each_key, DESCENDING)
        if each_key == 'cFans':
            each_rank = 'fansRank'
            each_d_rank = 'dFansRank'
            each_p_rank = 'pFansRank'
        elif each_key == 'cArchive_view':
            each_rank = 'archiveViewRank'
            each_d_rank = 'dArchiveViewRank'
            each_p_rank = 'pArchiveViewRank'
        elif each_key == 'cArticle_view':
            each_rank = 'articleViewRank'
            each_d_rank = 'dArticleViewRank'
            each_p_rank = 'pArticleViewRank'
        iTh = 1
        for each_author in authorCollectionResult:
            progressTask.current_value += 1
            logger.info("计算{}排名".format(each_author['mid']))
            if each_key in each_author:
                if 'rank' in each_author:
                    rank = each_author['rank']
                    # the d-rank is the movement since the last calculation
                    if each_rank in each_author['rank']:
                        rank[each_d_rank] = each_author['rank'][
                            each_rank] - iTh
                    else:
                        rank[each_d_rank] = 0
                    rank[each_rank] = iTh
                    rank[each_p_rank] = cls.__format_p_rank(iTh, allCount)
                else:
                    # initialize
                    rank = {
                        each_rank: iTh,
                        each_d_rank: 0,
                        each_p_rank: cls.__format_p_rank(iTh, allCount)
                    }
                # a zero metric gets no rank (-1 is the sentinel)
                if each_author[each_key] == 0:
                    if 'rank' in each_author:
                        rank = each_author['rank']
                        rank[each_d_rank] = 0
                        rank[each_rank] = -1
                        rank[each_p_rank] = -1
                    else:
                        rank = {each_rank: -1, each_d_rank: 0,
                                each_p_rank: -1}
                if each_key == 'cArticle_view':
                    # last key in the loop; stamp the update time once
                    rank['updateTime'] = datetime.datetime.now()
                authorCollection.update_one({'mid': each_author['mid']},
                                            {'$set': {'rank': rank}})
                iTh += 1
    progressTask.current_value = progressTask.total_value
    logger.info("计算作者排名结束")
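# A toy illustration (not from the source) of the d-rank bookkeeping above:
# an author who held fansRank 12 on the previous run and now sits at
# position iTh == 10 gets dFansRank == +2, i.e. moved up two places.
previous_rank = {'fansRank': 12}
iTh = 10
dFansRank = previous_rank['fansRank'] - iTh  # == 2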
def __init__(self):
    self.authorCollection = MongoDbDao.getCollection('author')
    self.redisCollection = RedisDao.getRedisConnect()
    self.task = SpiderTask('作者数据更新爬虫',
                           collection=MongoDbDao.getCollection('tracer'))