def auto_crawl_bangumi():
    """Queue the two bangumi ranking pages for crawling.

    Logs the task name, then appends each ranking URL to the
    ``bangumiAndDonghua:start_urls`` list through ``redis_connection.rpush``.
    """
    task_name = "生成番剧国创待爬链接"
    logger.info(task_name)

    queue_key = "bangumiAndDonghua:start_urls"
    ranking_pages = (
        "https://www.bilibili.com/ranking/bangumi/167/0/7",
        "https://www.bilibili.com/ranking/bangumi/13/0/7",
    )
    # One rpush per URL, in the order listed above.
    for page in ranking_pages:
        redis_connection.rpush(queue_key, page)
def auto_crawl_bangumi():
    """Queue the two bangumi ranking pages for crawling, with progress tracking.

    Logs the task name, creates a single-step ``ProgressTask`` backed by
    ``db['tracer']``, appends both ranking URLs to the
    ``bangumiAndDonghua:start_urls`` list, and then advances the task by one.
    """
    task_name = "生成番剧国创待爬链接"
    logger.info(task_name)
    progress = ProgressTask(task_name, 1, collection=db['tracer'])

    queue_key = "bangumiAndDonghua:start_urls"
    ranking_pages = (
        "https://www.bilibili.com/ranking/bangumi/167/0/7",
        "https://www.bilibili.com/ranking/bangumi/13/0/7",
    )
    for page in ranking_pages:
        redis_connection.rpush(queue_key, page)

    # The whole task counts as one step, completed once both URLs are queued.
    progress.current_value += 1
def add_tag_task():
    """Queue a crawl URL for every video document that has no ``tag`` field.

    Iterates ``db['video']`` documents matching ``{'tag': {'$exists': False}}``
    (fetching only ``aid``, in batches of 100), logs each ``aid`` and appends
    the corresponding video page URL to the ``tagAdder:start_urls`` list.
    """
    url = 'https://www.bilibili.com/video/av{}'
    doc_filter = {'tag': {'$exists': False}}
    # No document count is taken here: nothing in this function consumes it.
    cursor = db['video'].find(doc_filter, {"aid": 1}).batch_size(100)
    for each_video in cursor:
        aid = each_video['aid']
        logger.info("待爬AV号{}".format(aid))
        redis_connection.rpush("tagAdder:start_urls", url.format(aid))
def update_author():
    """Queue a crawl URL for every author flagged ``focus`` or ``forceFocus``.

    Logs the task name, selects matching ``db['author']`` documents (fetching
    only ``mid``, in batches of 200) and, when at least one document matches,
    appends ``AUTHOR_URL`` formatted with each ``mid`` to the ``AUTHOR_KEY``
    list.
    """
    task_name = "生成每日作者待爬链接"
    logger.info(task_name)

    authors = db['author']
    focused = {'$or': [{'focus': True}, {'forceFocus': True}]}
    matches = authors.find(focused, {"mid": 1}).batch_size(200)

    # Nothing matched: leave the queue untouched.
    if authors.count_documents(focused) == 0:
        return

    for author in matches:
        target = AUTHOR_URL.format(mid=author['mid'])
        redis_connection.rpush(AUTHOR_KEY, target)
def add_tag_task():
    """Queue a crawl URL for every video document that has no ``tag`` field.

    Counts the ``db['video']`` documents matching
    ``{'tag': {'$exists': False}}``, creates a ``ProgressTask`` of that size
    backed by ``db['tracer']``, then walks the matching documents (fetching
    only ``aid``, in batches of 100), advancing the task by one and appending
    the video page URL to the ``tagAdder:start_urls`` list for each.
    """
    task_name = "生成待爬标签视频链接"
    coll = db['video']
    doc_filter = {'tag': {'$exists': False}}

    # count_documents is used instead of Cursor.count(), which PyMongo
    # deprecated in 3.7 and removed in 4.0.
    total = coll.count_documents(doc_filter)
    cursor = coll.find(doc_filter, {"aid": 1}).batch_size(100)

    t = ProgressTask(task_name, total, collection=db['tracer'])
    url = 'https://www.bilibili.com/video/av{}'
    for each_video in cursor:
        t.current_value += 1
        redis_connection.rpush("tagAdder:start_urls", url.format(each_video['aid']))
def auto_add_video():
    """Queue the latest-submissions URL for every focused author.

    Logs the task name, selects ``db['author']`` documents where ``focus`` or
    ``forceFocus`` is true (fetching only ``mid``) and, when at least one
    document matches, appends one ``getSubmitVideos`` URL per author to the
    ``videoAutoAdd:start_urls`` list.
    """
    task_name = "生成作者最新发布的视频的待爬链接"
    logger.info(task_name)

    authors = db['author']
    focused = {'$or': [{'focus': True}, {'forceFocus': True}]}
    match_count = authors.count_documents(focused)
    matches = authors.find(focused, {'mid': 1})

    # Nothing matched: leave the queue untouched.
    if match_count == 0:
        return

    submit_url = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=10&page=1&order=pubdate'
    for author in matches:
        redis_connection.rpush("videoAutoAdd:start_urls", submit_url.format(author['mid']))
def send_aids(task_name, total, cursor):
    """Queue video crawl URLs in comma-joined batches of up to 100 aids.

    Args:
        task_name: Name passed to ``ProgressTask``.
        total: Size passed to ``ProgressTask``; when it is 0 the function
            returns immediately without touching anything.
        cursor: Iterable of mappings, each providing an ``aid`` key.

    Each full batch (and the final partial batch, if any) is formatted into
    ``VIDEO_URL`` and appended to the ``VIDEO_KEY`` list. The task's
    ``current_value`` is advanced by the size of each batch.
    """
    if total == 0:
        return

    t = ProgressTask(task_name, total, collection=db['tracer'])
    batch_size = 100
    batch = []
    for each_doc in cursor:
        batch.append(str(each_doc['aid']))
        if len(batch) == batch_size:
            t.current_value += len(batch)
            redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=','.join(batch)))
            batch = []

    # Account for the remainder (adds 0 when the last batch was exactly full).
    t.current_value += len(batch)
    # Only flush a non-empty remainder: otherwise a URL with a blank aid list
    # would be queued whenever the document count is a multiple of the batch
    # size or the cursor yields nothing.
    if batch:
        redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=','.join(batch)))
def send_aids(task_name, total, cursor):
    """Queue video crawl URLs in comma-joined batches of up to 50 aids.

    Args:
        task_name: Accepted for interface compatibility; not used here.
        total: When 0 the function returns immediately without touching
            anything.
        cursor: Iterable of mappings, each providing an ``aid`` key.

    Every ``aid`` is logged. Each full batch (and the final partial batch, if
    any) is formatted into ``VIDEO_URL`` and appended to the ``VIDEO_KEY``
    list; the running document count is logged whenever a full batch is sent.
    """
    if total == 0:
        return

    batch_size = 50
    batch = []
    for sent, each_doc in enumerate(cursor, start=1):
        batch.append(str(each_doc['aid']))
        logger.info(each_doc['aid'])
        if len(batch) == batch_size:
            logger.info('传送第{}个'.format(sent))
            redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=','.join(batch)))
            batch = []

    # Only flush a non-empty remainder: otherwise a URL with a blank aid list
    # would be queued whenever the document count is a multiple of the batch
    # size or the cursor yields nothing.
    if batch:
        redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=','.join(batch)))
def auto_add_author():
    """Queue the ranking pages listed below for crawling.

    Logs the task name, then appends each ranking URL, in order, to the
    ``authorAutoAdd:start_urls`` list through ``redis_connection.rpush``.
    """
    task_name = "生成排行榜待爬链接"
    logger.info(task_name)

    queue_key = 'authorAutoAdd:start_urls'
    ranking_pages = (
        'https://www.bilibili.com/ranking',
        'https://www.bilibili.com/ranking/all/1/0/3',
        'https://www.bilibili.com/ranking/all/168/0/3',
        'https://www.bilibili.com/ranking/all/3/0/3',
        'https://www.bilibili.com/ranking/all/129/0/3',
        'https://www.bilibili.com/ranking/all/188/0/3',
        'https://www.bilibili.com/ranking/all/4/0/3',
        'https://www.bilibili.com/ranking/all/36/0/3',
        'https://www.bilibili.com/ranking/all/160/0/3',
        'https://www.bilibili.com/ranking/all/119/0/3',
        'https://www.bilibili.com/ranking/all/155/0/3',
        'https://www.bilibili.com/ranking/all/5/0/3',
        'https://www.bilibili.com/ranking/all/181/0/3',
    )
    for page in ranking_pages:
        redis_connection.rpush(queue_key, page)
def auto_add_author():
    """Queue the ranking pages listed below for crawling, with progress tracking.

    Logs the task name, creates a ``ProgressTask`` sized to the number of
    ranking URLs (backed by ``db['tracer']``), then for each URL advances the
    task by one and appends the URL to the ``authorAutoAdd:start_urls`` list.
    """
    task_name = "生成排行榜待爬链接"
    logger.info(task_name)

    queue_key = 'authorAutoAdd:start_urls'
    ranking_pages = (
        'https://www.bilibili.com/ranking',
        'https://www.bilibili.com/ranking/all/1/0/3',
        'https://www.bilibili.com/ranking/all/168/0/3',
        'https://www.bilibili.com/ranking/all/3/0/3',
        'https://www.bilibili.com/ranking/all/129/0/3',
        'https://www.bilibili.com/ranking/all/4/0/3',
        'https://www.bilibili.com/ranking/all/36/0/3',
        'https://www.bilibili.com/ranking/all/160/0/3',
        'https://www.bilibili.com/ranking/all/119/0/3',
        'https://www.bilibili.com/ranking/all/155/0/3',
        'https://www.bilibili.com/ranking/all/5/0/3',
        'https://www.bilibili.com/ranking/all/181/0/3',
    )

    progress = ProgressTask(task_name, len(ranking_pages), collection=db['tracer'])
    for page in ranking_pages:
        # Progress is bumped before the push, one step per URL.
        progress.current_value += 1
        redis_connection.rpush(queue_key, page)
def gen_online():
    """Queue the online-count page for crawling, with progress tracking.

    Creates a single-step ``ProgressTask`` backed by ``db['tracer']``, appends
    the online page URL to the ``online:start_urls`` list, and then sets the
    task's ``current_value`` to 1.
    """
    task_name = "生成在线人数爬取链接"
    progress = ProgressTask(task_name, 1, collection=db['tracer'])

    online_page = 'https://www.bilibili.com/video/online.html'
    redis_connection.rpush("online:start_urls", online_page)

    # Assigned outright (not incremented): the task has exactly one step.
    progress.current_value = 1
def sendSiteInfoCrawlRequest():
    """Append ``SITEINFO_URL`` to the ``SITEINFO_KEY`` list via ``redis_connection.rpush``."""
    queue_key = SITEINFO_KEY
    target = SITEINFO_URL
    redis_connection.rpush(queue_key, target)
def sendVideoCrawlRequest(aid):
    """Append the crawl URL for one video to the ``VIDEO_KEY`` list.

    Args:
        aid: Value substituted for the ``aid`` field of ``VIDEO_URL``.
    """
    target = VIDEO_URL.format(aid=aid)
    redis_connection.rpush(VIDEO_KEY, target)
def sendAuthorCrawlRequest(mid):
    """Append the crawl URL for one author to the ``AUTHOR_KEY`` list.

    Args:
        mid: Value substituted for the ``mid`` field of ``AUTHOR_URL``.
    """
    target = AUTHOR_URL.format(mid=mid)
    redis_connection.rpush(AUTHOR_KEY, target)
def gen_online():
    """Queue the online-count page for crawling.

    Appends the online page URL to the ``online:start_urls`` list through
    ``redis_connection.rpush``.
    """
    # The task name is assigned but not consumed anywhere in this function.
    task_name = "生成在线人数爬取链接"

    queue_key = "online:start_urls"
    online_page = 'https://www.bilibili.com/video/online.html'
    redis_connection.rpush(queue_key, online_page)