class OnlineService:
    """Seed Redis crawl queues from bilibili's "online" page.

    The class attributes (Redis connection and tracer collection) are
    resolved once, when the class body is executed.
    """

    __online_url = 'https://www.bilibili.com/video/online.html'
    __online_key = "online:start_urls"
    __redisConnection = RedisDao.getRedisConnect()
    __tracerCollection = MongoDbDao.getCollection('tracer')

    @classmethod
    def genOnline(cls):
        """Push the online page URL onto the ``online:start_urls`` list.

        Progress is reported through a one-step ``ProgressTask``.
        """
        task_name = "生成在线人数爬取链接"
        progressTask = ProgressTask(task_name, 1,
                                    collection=cls.__tracerCollection)
        cls.__redisConnection.rpush(cls.__online_key, cls.__online_url)
        progressTask.current_value = 1

    @staticmethod
    def _loadInitialState(script_text):
        """Decode the JSON object embedded in an inline ``<script>`` text.

        The script is expected to look like
        ``window.__INITIAL_STATE__={...};<more javascript>``.

        The prefix is removed as a *prefix*: ``str.lstrip`` takes a set of
        characters, so using it here only worked by accident.
        ``raw_decode`` then parses the first JSON value and ignores whatever
        JavaScript follows it, so no hard-coded trailer length is needed.

        Raises:
            json.JSONDecodeError: if no JSON value starts the payload.
        """
        prefix = 'window.__INITIAL_STATE__='
        payload = script_text.strip()
        if payload.startswith(prefix):
            payload = payload[len(prefix):]
        state, _end = json.JSONDecoder().raw_decode(payload.lstrip())
        return state

    @classmethod
    def crawlOnlineTopListData(cls):
        """Queue author and video URLs for every entry of the online list.

        Downloads the online page, takes the second-to-last ``<script>``
        text, decodes its embedded state and walks ``onlineList``. For each
        entry the owner's ``mid`` is queued through ``AuthorService`` unless
        it is one of two hard-coded mids, and the entry's ``aid`` is always
        queued through ``VideoService``.
        """
        task_name = "生成强力追踪待爬链接"
        logger.info(task_name)
        response = requests.get(cls.__online_url)
        data_text = etree.HTML(
            response.content.decode('utf8')).xpath('//script/text()')[-2]
        j = cls._loadInitialState(data_text)
        online_list = j['onlineList']
        progressTask = ProgressTask(task_name, len(online_list),
                                    collection=cls.__tracerCollection)
        # NOTE(review): the reason these two mids are excluded is not
        # visible in this file - confirm with the owner before changing.
        skipped_mids = (7584632, 928123)
        for each_video in online_list:
            mid = each_video['owner']['mid']
            if mid not in skipped_mids:
                AuthorService.pushAuthorRedisUrlToRedis(mid)
            VideoService.pushVideoRedisUrlToRedis(each_video['aid'])
            progressTask.current_value += 1
class SiteService:
    """Queue the site-info API URL for crawling."""

    __site_url = 'https://api.bilibili.com/x/web-interface/online'
    __site_key = "site:start_urls"
    # Shared connection, resolved once when the class body is executed.
    redis_connection = RedisDao.getRedisConnect()

    @classmethod
    def sendSiteInfoCrawlRequest(cls):
        """Append the site URL to the ``site:start_urls`` Redis list."""
        queue_key, target_url = cls.__site_key, cls.__site_url
        cls.redis_connection.rpush(queue_key, target_url)
class VideoService:
    """Generate video-related crawl URLs and push them onto Redis lists."""

    __video_url = "https://api.bilibili.com/x/article/archives?ids={aid}"
    __video_key = "videoRedis:start_urls"
    __redisConnection = RedisDao.getRedisConnect()
    __tracerCollection = MongoDbDao.getCollection('tracer')

    @classmethod
    def updateVideo(cls, focus=True):
        """Queue archive URLs for all videos whose ``focus`` field matches.

        Video ids are grouped into comma-separated batches of 50 and one URL
        is pushed per batch.

        Args:
            focus: value matched against the ``focus`` field of the
                documents in the ``video`` collection; it also selects the
                task name that is logged and traced.
        """
        if focus:
            task_name = "生成每日视频待爬链接"
        else:
            task_name = "生成保守观测视频待爬链接"
        logger.info(task_name)

        doc_filter = {'focus': focus}
        videoCollection = MongoDbDao.getCollection('video')
        total = videoCollection.count_documents(doc_filter)
        if total == 0:
            return
        cursor = videoCollection.find(doc_filter, {"aid": 1}).batch_size(200)

        batch_limit = 50
        aid_batch = []
        progressTask = ProgressTask(task_name, total,
                                    collection=cls.__tracerCollection)
        for each_doc in cursor:
            aid_batch.append(str(each_doc['aid']))
            logger.info(each_doc['aid'])
            if len(aid_batch) == batch_limit:
                progressTask.current_value += len(aid_batch)
                cls.pushVideoRedisUrlToRedis(','.join(aid_batch))
                aid_batch = []

        # Flush the remainder only when there is one. Pushing unconditionally
        # queued a URL with an empty ``ids=`` whenever the number of
        # documents was an exact multiple of the batch size.
        if aid_batch:
            progressTask.current_value += len(aid_batch)
            cls.pushVideoRedisUrlToRedis(','.join(aid_batch))

    @classmethod
    def pushVideoRedisUrlToRedis(cls, aid):
        """Push the archive URL for ``aid`` onto ``videoRedis:start_urls``.

        Args:
            aid: a single video id, or several ids joined with commas; it is
                substituted verbatim into the URL template.
        """
        cls.__redisConnection.rpush(cls.__video_key,
                                    cls.__video_url.format(aid=aid))

    @classmethod
    def updateAutoAddVideo(cls):
        """Queue the latest-submissions URL of every focused author.

        Authors are the documents of the ``author`` collection with
        ``focus`` or ``forceFocus`` set to ``True``; one URL per author is
        pushed onto ``videoAutoAdd:start_urls``.
        """
        task_name = "生成作者最新发布的视频的待爬链接"
        logger.info(task_name)
        authorCollection = MongoDbDao.getCollection('author')
        doc_filter = {'$or': [{'focus': True}, {'forceFocus': True}]}
        total = authorCollection.count_documents(doc_filter)
        authorCollectionResult = authorCollection.find(doc_filter, {'mid': 1})
        if total == 0:
            return

        url_template = ('https://space.bilibili.com/ajax/member/'
                        'getSubmitVideos?mid={}&pagesize=10&page=1'
                        '&order=pubdate')
        progressTask = ProgressTask(task_name, total,
                                    collection=cls.__tracerCollection)
        for each_doc in authorCollectionResult:
            progressTask.current_value += 1
            url = url_template.format(each_doc['mid'])
            cls.__redisConnection.rpush("videoAutoAdd:start_urls", url)
def updateAutoAddAuthor(cls):
    """Push every ranking page URL onto ``authorAutoAdd:start_urls``.

    Args:
        cls: unused; kept so the signature is unchanged.
    """
    # Each entry needs its own trailing comma: without one after the first
    # URL, Python concatenated the first two literals into a single bogus URL.
    start_urls = [
        'https://www.bilibili.com/ranking',
        'https://www.bilibili.com/ranking/all/1/0/3',
        'https://www.bilibili.com/ranking/all/168/0/3',
        'https://www.bilibili.com/ranking/all/3/0/3',
        'https://www.bilibili.com/ranking/all/129/0/3',
        'https://www.bilibili.com/ranking/all/188/0/3',
        'https://www.bilibili.com/ranking/all/4/0/3',
        'https://www.bilibili.com/ranking/all/36/0/3',
        'https://www.bilibili.com/ranking/all/160/0/3',
        'https://www.bilibili.com/ranking/all/119/0/3',
        'https://www.bilibili.com/ranking/all/155/0/3',
        'https://www.bilibili.com/ranking/all/5/0/3',
        'https://www.bilibili.com/ranking/all/181/0/3',
    ]
    # Look the connection up once instead of once per URL.
    redis_connection = RedisDao.getRedisConnect()
    for url in start_urls:
        redis_connection.rpush("authorAutoAdd:start_urls", url)
class AuthorService:
    """Generate author-related crawl URLs and push them onto Redis lists."""

    __author_url = "https://api.bilibili.com/x/web-interface/card?mid={mid}"
    __author_key = "authorRedis:start_urls"
    __redisConnection = RedisDao.getRedisConnect()

    @classmethod
    def updateAuthor(cls):
        """Queue the card URL of every focused author.

        Authors are the documents of the ``author`` collection with
        ``focus`` or ``forceFocus`` set to ``True``. Nothing is traced or
        pushed when no document matches.
        """
        task_name = "生成每日作者待爬链接"
        logger.info(task_name)
        coll = MongoDbDao.getCollection('author')
        filter_dict = {'$or': [{'focus': True}, {'forceFocus': True}]}
        cursor = coll.find(filter_dict, {"mid": 1}).batch_size(200)
        total = coll.count_documents(filter_dict)
        if total == 0:
            return

        t = ProgressTask(task_name, total,
                         collection=MongoDbDao.getCollection('tracer'))
        for each_doc in cursor:
            t.current_value += 1
            cls.pushAuthorRedisUrlToRedis(each_doc['mid'])

    @classmethod
    def pushAuthorRedisUrlToRedis(cls, mid):
        """Push the card URL for ``mid`` onto ``authorRedis:start_urls``.

        Args:
            mid: author id substituted into the URL template.
        """
        cls.__redisConnection.rpush(cls.__author_key,
                                    cls.__author_url.format(mid=mid))

    @classmethod
    def updateAutoAddAuthor(cls):
        """Push every ranking page URL onto ``authorAutoAdd:start_urls``."""
        # Each entry needs its own trailing comma: without one after the
        # first URL, Python concatenated the first two literals into a
        # single bogus URL.
        start_urls = [
            'https://www.bilibili.com/ranking',
            'https://www.bilibili.com/ranking/all/1/0/3',
            'https://www.bilibili.com/ranking/all/168/0/3',
            'https://www.bilibili.com/ranking/all/3/0/3',
            'https://www.bilibili.com/ranking/all/129/0/3',
            'https://www.bilibili.com/ranking/all/188/0/3',
            'https://www.bilibili.com/ranking/all/4/0/3',
            'https://www.bilibili.com/ranking/all/36/0/3',
            'https://www.bilibili.com/ranking/all/160/0/3',
            'https://www.bilibili.com/ranking/all/119/0/3',
            'https://www.bilibili.com/ranking/all/155/0/3',
            'https://www.bilibili.com/ranking/all/5/0/3',
            'https://www.bilibili.com/ranking/all/181/0/3',
        ]
        # Look the connection up once instead of once per URL.
        redis_connection = RedisDao.getRedisConnect()
        for url in start_urls:
            redis_connection.rpush("authorAutoAdd:start_urls", url)
class BangumiAndDonghuaService:
    """Queue the two bangumi ranking pages for crawling."""

    __tracerCollection = MongoDbDao.getCollection('tracer')
    __redisConnection = RedisDao.getRedisConnect()
    __bangumiAndDonghua_key = "bangumiAndDonghua:start_urls"

    @classmethod
    def autoCrawlBangumi(cls):
        """Push both ranking URLs onto ``bangumiAndDonghua:start_urls``.

        Progress is incremented once per pushed URL, so the task total is
        the number of URLs. It used to be hard-coded to 1, which let the
        counter end above its total.
        """
        task_name = "生成番剧国创待爬链接"
        logger.info(task_name)
        urls = [
            "https://www.bilibili.com/ranking/bangumi/167/0/7",
            "https://www.bilibili.com/ranking/bangumi/13/0/7",
        ]
        progressTask = ProgressTask(task_name, len(urls),
                                    collection=cls.__tracerCollection)
        for url in urls:
            cls.__redisConnection.rpush(cls.__bangumiAndDonghua_key, url)
            progressTask.current_value += 1
class TagAdderService:
    """Queue video pages whose documents have no ``tag`` field yet."""

    __videoCollection = MongoDbDao.getCollection('video')
    __tracerCollection = MongoDbDao.getCollection('tracer')
    __redisConnection = RedisDao.getRedisConnect()

    @classmethod
    def addTagTask(cls):
        """Push one video page URL per untagged video onto Redis.

        Untagged videos are the documents of the ``video`` collection
        without a ``tag`` field; URLs go to ``tagAdder:start_urls``.
        """
        task_name = "生成待爬标签视频链接"
        doc_filter = {'tag': {'$exists': False}}
        # ``Cursor.count()`` is deprecated and was removed in PyMongo 4;
        # ``count_documents`` is what the other services in this file use.
        total = cls.__videoCollection.count_documents(doc_filter)
        cursor = cls.__videoCollection.find(doc_filter, {
            "aid": 1
        }).batch_size(100)
        progressTask = ProgressTask(task_name, total,
                                    collection=cls.__tracerCollection)
        url = 'https://www.bilibili.com/video/av{}'
        for each_video in cursor:
            progressTask.current_value += 1
            aid = each_video['aid']
            logger.info("待爬AV号{}".format(aid))
            cls.__redisConnection.rpush("tagAdder:start_urls", url.format(aid))
def __init__(self):
    """Store the 'video' collection and a Redis connection on the instance."""
    video_collection = MongoDbDao.getCollection('video')
    self.videoCollection = video_collection
    redis_connection = RedisDao.getRedisConnect()
    self.redisConnection = redis_connection
def __init__(self):
    """Store the 'author' collection and a Redis connection on the instance."""
    author_collection = MongoDbDao.getCollection('author')
    self.authorCollection = author_collection
    redis_connection = RedisDao.getRedisConnect()
    self.redisConnection = redis_connection
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html # ITEM_PIPELINES = { # 'nspider.pipelines.NspiderPipeline': 300, # } # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html # AUTOTHROTTLE_ENABLED = True # The initial download delay # AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies # AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: # AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # HTTPCACHE_ENABLED = True # HTTPCACHE_EXPIRATION_SECS = 0 # HTTPCACHE_DIR = 'httpcache' # HTTPCACHE_IGNORE_HTTP_CODES = [] # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' # ---------新加属性------ REDIS_URL = RedisDao.getRedisConnectString() COMMANDS_MODULE = "nspider.commands"
def __init__(self):
    """Store the 'author' collection, a Redis connection and a SpiderTask.

    The SpiderTask is created with the 'tracer' collection.
    """
    author_collection = MongoDbDao.getCollection('author')
    self.authorCollection = author_collection
    redis_connection = RedisDao.getRedisConnect()
    self.redisCollection = redis_connection
    tracer_collection = MongoDbDao.getCollection('tracer')
    self.task = SpiderTask('作者数据更新爬虫', collection=tracer_collection)