def add_omitted(self):
    # Append aids/mids that appeared in the query-word log but are not yet
    # in the jieba user dictionary, then reload the dictionary and refresh.
    total_value = self.mongo_word.count_documents({})
    if total_value < 100:
        return
    t = ProgressTask("更新查询关键词字典", total_value=total_value,
                     collection=db['tracer'])
    with open('./biliob_analyzer/dict.txt', 'r', encoding='utf8') as f:
        d = f.read().split('\n')
    for each in self.mongo_word.find():
        # aid/mid may be stored as ints; the dictionary holds strings, so
        # convert before the membership test and the write below.
        if 'aid' in each and str(each['aid']) not in d:
            d.append(str(each['aid']))
        elif 'mid' in each and str(each['mid']) not in d:
            d.append(str(each['mid']))
        t.current_value += 1
    t.finished = True
    with open('./biliob_analyzer/dict.txt', 'w', encoding='utf8',
              newline='') as o:
        for each in d:
            o.write(each + '\n')
    self.mongo_word.delete_many({})
    jieba.load_userdict('./biliob_analyzer/dict.txt')
    self.refresh_all_author()
    self.refresh_all_video()

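# ProgressTask is imported from elsewhere in the project; the functions in
# this module rely only on the interface used above: a constructor taking a
# name, a total, and a Mongo collection, plus `current_value`, `total_value`
# and `finished`. A minimal sketch of that assumed interface (the name and
# persistence details below are guesses, not the project's implementation):
import datetime


class ProgressTaskSketch:
    """Hypothetical stand-in: persists task progress to a Mongo collection."""

    def __init__(self, name, total_value, collection):
        self.name = name
        self.total_value = total_value
        self.collection = collection
        self._current = 0
        self._finished = False
        self._save()

    def _save(self):
        # Upsert one progress document per task name.
        self.collection.update_one(
            {'name': self.name},
            {'$set': {
                'current_value': self._current,
                'total_value': self.total_value,
                'finished': self._finished,
                'update_time': datetime.datetime.utcnow(),
            }},
            upsert=True)

    @property
    def current_value(self):
        return self._current

    @current_value.setter
    def current_value(self, value):
        self._current = value
        self._save()

    @property
    def finished(self):
        return self._finished

    @finished.setter
    def finished(self, value):
        self._finished = value
        self._save()
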
def compute_video_rank_table():
    # Build a lookup table per statistic: the exact rank of the current
    # top-200 videos, plus the value threshold at each top-i% boundary
    # for i = 1..top_n.
    task_name = '计算视频排名对照表'
    coll = db['video']  # handle of the video collection
    count = coll.estimated_document_count()
    top_n = 60
    print(count)
    keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare']
    task = ProgressTask(task_name, top_n * len(keys), collection=db['tracer'])
    o = {'name': 'video_rank'}
    skip = int(count / 100)  # one percentile step
    for each_key_index, each_key in enumerate(keys):
        o[each_key] = {'rate': []}
        i = 1
        last_value = 9999999999
        # Record the exact rank of the current top-200 videos by this key.
        video = coll.find({}, {
            'title': 1
        }).limit(200).sort(each_key, DESCENDING).batch_size(200)
        top = 1
        for each_video in video:
            o[each_key][each_video['title']] = top
            top += 1
        # Walk down the distribution one percentile step at a time,
        # sampling the key's value at each boundary.
        while i <= top_n:
            task.current_value = i + top_n * each_key_index
            video = list(
                coll.find({
                    each_key: {
                        '$lt': last_value
                    }
                }, {
                    each_key: 1
                }).limit(1).skip(skip).sort(each_key, DESCENDING))
            print(video)
            if len(video) != 0:
                video = video[0]
            else:
                i += 1
                continue
            if each_key not in video:
                break
            last_value = video[each_key]
            o[each_key]['rate'].append(last_value)
            print(last_value)
            i += 1
    o['update_time'] = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    output_coll = db['rank_table']
    output_coll.update_one({'name': 'video_rank'}, {'$set': o}, upsert=True)

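# A hedged usage sketch (not from the project): once `rank_table` is
# written, a video's approximate top-percent bucket for a statistic can be
# read off the descending `rate` list with a binary search. The helper and
# the `table`/`doc` names below are illustrative assumptions.
import bisect


def approx_percentile(rate, value):
    """`rate` is descending: rate[i] is the value at the top-(i+1)% boundary.

    Returns the estimated top-percent bucket for `value`."""
    # Search the reversed (ascending) copy, then map the index back.
    ascending = rate[::-1]
    pos = bisect.bisect_right(ascending, value)
    return len(rate) - pos + 1  # 1 means "top 1%"

# Example: approx_percentile(table['cView']['rate'], doc['cView'])
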
def auto_crawl_bangumi():
    # Queue the bangumi / domestic-animation ranking pages for crawling.
    task_name = "生成番剧国创待爬链接"
    logger.info(task_name)
    t = ProgressTask(task_name, 1, collection=db['tracer'])
    redis_connection.rpush("bangumiAndDonghua:start_urls",
                           "https://www.bilibili.com/ranking/bangumi/167/0/7")
    redis_connection.rpush("bangumiAndDonghua:start_urls",
                           "https://www.bilibili.com/ranking/bangumi/13/0/7")
    t.current_value += 1

def add_tag_task():
    # Queue every video that has no tags yet for the tag crawler.
    task_name = "生成待爬标签视频链接"
    coll = db['video']
    doc_filter = {'tag': {'$exists': False}}
    # Cursor.count() is deprecated (removed in PyMongo 4); count on the
    # collection instead.
    total = coll.count_documents(doc_filter)
    cursor = coll.find(doc_filter, {"aid": 1}).batch_size(100)
    t = ProgressTask(task_name, total, collection=db['tracer'])
    url = 'https://www.bilibili.com/video/av{}'
    for each_video in cursor:
        t.current_value += 1
        aid = each_video['aid']
        redis_connection.rpush("tagAdder:start_urls", url.format(aid))

def update_author():
    # Queue the daily crawl for every focused author.
    task_name = "生成每日作者待爬链接"
    logger.info(task_name)
    coll = db['author']
    filter_dict = {'$or': [{'focus': True}, {'forceFocus': True}]}
    cursor = coll.find(filter_dict, {"mid": 1}).batch_size(200)
    total = coll.count_documents(filter_dict)
    if total != 0:
        t = ProgressTask(task_name, total, collection=db['tracer'])
        for each_doc in cursor:
            t.current_value += 1
            redis_connection.rpush(AUTHOR_KEY,
                                   AUTHOR_URL.format(mid=each_doc['mid']))

def __judge_author(self, author_filter):
    # Detect abnormal fan-count changes for every matching author, looking
    # only at data points recorded within the last 32 days.
    # Cursor.count() is deprecated; count on the collection instead.
    count = author_coll.count_documents(author_filter)
    a = author_coll.aggregate([{
        '$match': author_filter
    }, {
        '$project': {
            "mid": 1,
            "face": 1,
            "name": 1,
            "data": {
                "$filter": {
                    "input": "$data",
                    "as": "data",
                    "cond": {
                        "$gt": [
                            "$$data.datetime",
                            datetime.datetime.now() - datetime.timedelta(32)
                        ]
                    }
                }
            }
        }
    }, {
        # Keep only authors that still have at least one data point.
        "$match": {
            "data.0": {
                "$exists": True
            }
        }
    }])
    print("待爬取作者数量:{}".format(count))
    t = ProgressTask("粉丝数变动探测", total_value=count,
                     collection=db['tracer'])
    for each_author in a:
        print(each_author['mid'])
        t.current_value += 1
        self.__judge(each_author)
    t.finished = True

def auto_add_video():
    # Queue each focused author's latest-submissions page.
    task_name = "生成作者最新发布的视频的待爬链接"
    logger.info(task_name)
    coll = db['author']
    doc_filter = {'$or': [{'focus': True}, {'forceFocus': True}]}
    total = coll.count_documents(doc_filter)
    c = coll.find(doc_filter, {'mid': 1})
    if total != 0:
        t = ProgressTask(task_name, total, collection=db['tracer'])
        for each_doc in c:
            t.current_value += 1
            url = ('https://space.bilibili.com/ajax/member/getSubmitVideos'
                   '?mid={}&pagesize=10&page=1&order=pubdate').format(
                       each_doc['mid'])
            redis_connection.rpush("videoAutoAdd:start_urls", url)

def crawlOnlineTopListData():
    # Push high-priority crawl requests for every video currently on the
    # online top list, skipping two excluded authors.
    task_name = "生成强力追踪待爬链接"
    logger.info(task_name)
    ONLINE_URL = 'https://www.bilibili.com/video/online.html'
    response = requests.get(ONLINE_URL)
    data_text = etree.HTML(
        response.content.decode('utf8')).xpath('//script/text()')[-2]
    # str.lstrip strips a *character set*, not a prefix, so the original
    # lstrip('window.__INITIAL_STATE__=') could also eat leading JSON
    # characters; slice the prefix off instead. The last 122 characters
    # are the non-JSON tail of the script.
    prefix = 'window.__INITIAL_STATE__='
    j = json.loads(data_text[len(prefix):-122])
    total = len(j['onlineList'])
    t = ProgressTask(task_name, total, collection=db['tracer'])
    for each_video in j['onlineList']:
        aid = each_video['aid']
        mid = each_video['owner']['mid']
        if mid not in [7584632, 928123]:
            priorityAuthorCrawlRequest(mid)
            priorityVideoCrawlRequest(aid)
        t.current_value += 1

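# priorityAuthorCrawlRequest / priorityVideoCrawlRequest are project
# helpers defined elsewhere. A hedged sketch of what they plausibly do —
# the lpush-for-priority choice is an assumption, not the project's
# confirmed implementation:
def priority_author_crawl_request_sketch(mid):
    # lpush instead of rpush so priority requests jump the queue.
    redis_connection.lpush(AUTHOR_KEY, AUTHOR_URL.format(mid=mid))


def priority_video_crawl_request_sketch(aid):
    redis_connection.lpush(VIDEO_KEY, VIDEO_URL.format(aid=aid))
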
def send_aids(task_name, total, cursor):
    # Batch aids into comma-separated groups of 100 and queue one crawl
    # URL per batch.
    if total == 0:
        return
    t = ProgressTask(task_name, total, collection=db['tracer'])
    aid_list = ''
    i = 0
    for each_doc in cursor:
        aid_list += str(each_doc['aid']) + ','
        i += 1
        if i == 100:
            t.current_value += i
            redis_connection.rpush(VIDEO_KEY,
                                   VIDEO_URL.format(aid=aid_list[:-1]))
            aid_list = ''
            i = 0
    # Flush the final partial batch. The original pushed unconditionally,
    # which enqueued a malformed URL whenever the total was an exact
    # multiple of 100.
    if aid_list:
        t.current_value += i
        redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=aid_list[:-1]))

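# A hedged usage sketch: callers pair a count with a cursor of documents
# containing 'aid'. The filter and task name below are illustrative only,
# not from the project:
#
#   f = {'cView': {'$exists': False}}
#   send_aids('示例任务', db['video'].count_documents(f),
#             db['video'].find(f, {'aid': 1}).batch_size(200))
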
def auto_add_author():
    # Queue the main ranking page plus the three-day ranking of each zone.
    task_name = "生成排行榜待爬链接"
    logger.info(task_name)
    start_urls = [
        'https://www.bilibili.com/ranking',
        'https://www.bilibili.com/ranking/all/1/0/3',
        'https://www.bilibili.com/ranking/all/168/0/3',
        'https://www.bilibili.com/ranking/all/3/0/3',
        'https://www.bilibili.com/ranking/all/129/0/3',
        'https://www.bilibili.com/ranking/all/4/0/3',
        'https://www.bilibili.com/ranking/all/36/0/3',
        'https://www.bilibili.com/ranking/all/160/0/3',
        'https://www.bilibili.com/ranking/all/119/0/3',
        'https://www.bilibili.com/ranking/all/155/0/3',
        'https://www.bilibili.com/ranking/all/5/0/3',
        'https://www.bilibili.com/ranking/all/181/0/3',
    ]
    t = ProgressTask(task_name, len(start_urls), collection=db['tracer'])
    for each in start_urls:
        t.current_value += 1
        redis_connection.rpush('authorAutoAdd:start_urls', each)

def gen_online():
    # Queue the online-viewers page for crawling.
    task_name = "生成在线人数爬取链接"
    t = ProgressTask(task_name, 1, collection=db['tracer'])
    ONLINE_URL = 'https://www.bilibili.com/video/online.html'
    redis_connection.rpush("online:start_urls", ONLINE_URL)
    t.current_value = 1

def author_fans_rate_caculate():
    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s')
    logger = logging.getLogger(__name__)
    coll = db['author']  # handle of the author collection
    logger.info('开始计算粉丝增速')
    # Yesterday's fan delta: interpolate the fan count at midnight one and
    # two days ago, then take the difference.
    c_datetime = datetime.datetime.now()
    end_date = (
        datetime.datetime(c_datetime.year, c_datetime.month, c_datetime.day) -
        datetime.timedelta(1)).timestamp()
    start_date = (
        datetime.datetime(c_datetime.year, c_datetime.month, c_datetime.day) -
        datetime.timedelta(2)).timestamp()
    task = ProgressTask(
        "计算粉丝增速", coll.count_documents({}), collection=db['tracer'])
    c = 0
    for each in coll.find({}, {'mid': 1, '_id': 0}).batch_size(200):
        c += 1
        task.current_value = c
        # Keep only the author's data points from the last 7 days.
        ag = coll.aggregate([{
            '$match': {
                'mid': each['mid']
            }
        }, {
            '$project': {
                'mid': 1,
                'data': {
                    "$filter": {
                        "input": "$data",
                        "as": "each_data",
                        "cond": {
                            "$gt": [
                                "$$each_data.datetime",
                                datetime.datetime.now() -
                                datetime.timedelta(7)
                            ]
                        }
                    }
                }
            }
        }]).batch_size(1)
        each_author = next(ag)
        if 'data' in each_author and each_author['data'] is not None:
            data = sorted(each_author['data'], key=lambda d: d['datetime'])
            if len(data) >= 2:
                logger.info(each_author['mid'])
                x = tuple(map(lambda d: d['datetime'].timestamp(), data))
                y = tuple(map(lambda d: d['fans'], data))
                inter_fun = interp1d(x, y, kind='linear')
                # Only interpolate when both boundaries lie inside the
                # observed range; interp1d does not extrapolate by default.
                if start_date > x[0] and end_date < x[-1]:
                    inter_data = inter_fun([start_date, end_date])
                    delta_fans = inter_data[1] - inter_data[0]
                    coll.update_one({'mid': each_author['mid']},
                                    {"$set": {
                                        'cRate': int(delta_fans)
                                    }})

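# A minimal worked example of the interpolation step above, with synthetic
# numbers rather than project data: given observations (t=0, fans=1000)
# and (t=86400*3, fans=1300), scipy's linear interp1d estimates the fan
# count at any instant in between, so the one-day delta is 100.
#
#   from scipy.interpolate import interp1d
#   f = interp1d((0, 86400 * 3), (1000, 1300), kind='linear')
#   day1, day2 = f([86400 * 1, 86400 * 2])   # 1100.0, 1200.0
#   print(int(day2 - day1))                  # 100
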
def calculate_author_rank():
    # `coll` and `logger` are module-level: the author collection and the
    # module logger.
    task_name = "计算作者排名数据"
    keys = ['cFans', 'cArchive_view', 'cArticle_view']
    count = coll.count_documents({keys[0]: {'$exists': 1}})
    t = ProgressTask(task_name, count * len(keys), collection=db['tracer'])
    # Field names for rank, day-over-day rank delta, and percentile rank.
    rank_fields = {
        'cFans': ('fansRank', 'dFansRank', 'pFansRank'),
        'cArchive_view': ('archiveViewRank', 'dArchiveViewRank',
                          'pArchiveViewRank'),
        'cArticle_view': ('articleViewRank', 'dArticleViewRank',
                          'pArticleViewRank'),
    }
    for each_key in keys:
        logger.info("开始计算作者{}排名".format(each_key))
        each_rank, each_d_rank, each_p_rank = rank_fields[each_key]
        i = 1
        authors = coll.find({
            each_key: {
                '$exists': 1
            }
        }, {
            'mid': 1,
            'rank': 1,
            each_key: 1
        }).batch_size(300).sort(each_key, DESCENDING)
        for each_author in authors:
            t.current_value += 1
            logger.info("计算{}排名".format(each_author['mid']))
            # Authors without this key's data are skipped entirely.
            if each_key in each_author:
                if 'rank' in each_author:
                    # Already ranked before: record the rank change.
                    rank = each_author['rank']
                    if each_rank in rank:
                        rank[each_d_rank] = rank[each_rank] - i
                    else:
                        rank[each_d_rank] = 0
                    rank[each_rank] = i
                    rank[each_p_rank] = format_p_rank(i, count)
                else:
                    # First time: initialize the rank document.
                    rank = {
                        each_rank: i,
                        each_d_rank: 0,
                        each_p_rank: format_p_rank(i, count)
                    }
                # A zero value means "unranked": use the -1 sentinel.
                if each_author[each_key] == 0:
                    if 'rank' in each_author:
                        rank = each_author['rank']
                        rank[each_d_rank] = 0
                        rank[each_rank] = -1
                        rank[each_p_rank] = -1
                    else:
                        rank = {each_rank: -1, each_d_rank: 0,
                                each_p_rank: -1}
                if each_key == 'cArticle_view':
                    rank['updateTime'] = datetime.datetime.now()
                coll.update_one({'mid': each_author['mid']},
                                {'$set': {
                                    'rank': rank,
                                }})
            i += 1
    t.current_value = t.total_value
    logger.info("计算作者排名结束")

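# format_p_rank is defined elsewhere in the project. Judging from how it
# is used above (rank position i out of count, with -1 as the unranked
# sentinel), it presumably maps a rank to a percentile; a hedged sketch,
# not the project's actual formula:
def format_p_rank_sketch(i, count):
    # "Top X%" with two decimals, e.g. rank 50 of 10000 -> 0.5.
    if count == 0:
        return -1
    return round(i / count * 100, 2)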