Esempio n. 1
0
class OnlineService:
    """Seeds crawl work derived from bilibili's "online" video page.

    The Redis connection and the tracer collection are obtained once, when
    the class body is executed, and shared by both class methods.
    """
    __online_url = 'https://www.bilibili.com/video/online.html'
    __online_key = "online:start_urls"
    __redisConnection = RedisDao.getRedisConnect()
    __tracerCollection = MongoDbDao.getCollection('tracer')

    @classmethod
    def genOnline(cls):
        """Push the online-page URL onto the ``online:start_urls`` Redis list.

        A single-step ProgressTask is created with the tracer collection and
        its ``current_value`` is set to 1 once the URL has been pushed.
        """
        task_name = "生成在线人数爬取链接"
        progressTask = ProgressTask(task_name,
                                    1,
                                    collection=cls.__tracerCollection)
        cls.__redisConnection.rpush(cls.__online_key, cls.__online_url)
        progressTask.current_value = 1

    @classmethod
    def crawlOnlineTopListData(cls):
        """Fetch the online page and queue its videos and their authors.

        The page state is read from the second-to-last ``<script>`` text
        node, which is expected to look like
        ``window.__INITIAL_STATE__={...};<trailing javascript>``.
        For every entry of ``onlineList`` the video ``aid`` is queued through
        VideoService and the owner's ``mid`` through AuthorService.

        Raises:
            IndexError: if the page has fewer than two script text nodes.
            json.JSONDecodeError: if the script does not contain valid JSON
                right after the state prefix.
        """
        task_name = "生成强力追踪待爬链接"
        logger.info(task_name)
        response = requests.get(cls.__online_url)
        data_text = etree.HTML(
            response.content.decode('utf8')).xpath('//script/text()')[-2]

        # str.lstrip() takes a *set of characters*, not a prefix, so the
        # prefix is removed explicitly here.
        prefix = 'window.__INITIAL_STATE__='
        payload = data_text.strip()
        if payload.startswith(prefix):
            payload = payload[len(prefix):].lstrip()
        # raw_decode parses the leading JSON value and ignores whatever
        # javascript follows it, so no hard-coded tail length is needed.
        j, _ = json.JSONDecoder().raw_decode(payload)

        online_list = j['onlineList']
        progressTask = ProgressTask(task_name,
                                    len(online_list),
                                    collection=cls.__tracerCollection)

        # These two author ids are never queued (reason not visible here).
        skipped_mids = (7584632, 928123)
        for each_video in online_list:
            mid = each_video['owner']['mid']
            if mid not in skipped_mids:
                AuthorService.pushAuthorRedisUrlToRedis(mid)
            VideoService.pushVideoRedisUrlToRedis(each_video['aid'])
            progressTask.current_value += 1
Esempio n. 2
0
class SiteService:
    """Queues the site-wide "online" API request for the crawler."""
    __site_url = 'https://api.bilibili.com/x/web-interface/online'
    __site_key = "site:start_urls"
    redis_connection = RedisDao.getRedisConnect()

    @classmethod
    def sendSiteInfoCrawlRequest(cls):
        """Append the site-info API URL to the ``site:start_urls`` Redis list."""
        queue_key, target_url = cls.__site_key, cls.__site_url
        cls.redis_connection.rpush(queue_key, target_url)
Esempio n. 3
0
class VideoService:
    """Generates video crawl URLs and pushes them onto Redis lists."""
    __video_url = "https://api.bilibili.com/x/article/archives?ids={aid}"
    __video_key = "videoRedis:start_urls"
    __redisConnection = RedisDao.getRedisConnect()
    __tracerCollection = MongoDbDao.getCollection('tracer')

    @classmethod
    def updateVideo(cls, focus=True):
        """Queue crawl URLs for every video whose ``focus`` field equals *focus*.

        Video ids are grouped 50 per URL (comma separated). Progress is
        reported through a ProgressTask created with the tracer collection.

        Args:
            focus: value matched against the ``focus`` field of the video
                documents; it also selects the task name that is logged.
        """
        if focus:
            task_name = "生成每日视频待爬链接"
        else:
            task_name = "生成保守观测视频待爬链接"
        logger.info(task_name)

        doc_filter = {'focus': focus}
        videoCollection = MongoDbDao.getCollection('video')
        total = videoCollection.count_documents(doc_filter)
        if total == 0:
            return

        cursor = videoCollection.find(doc_filter, {"aid": 1}).batch_size(200)
        ids_per_url = 50
        pending_aids = []
        progressTask = ProgressTask(task_name, total, collection=cls.__tracerCollection)
        for each_doc in cursor:
            pending_aids.append(str(each_doc['aid']))
            logger.info(each_doc['aid'])
            if len(pending_aids) == ids_per_url:
                progressTask.current_value += len(pending_aids)
                cls.pushVideoRedisUrlToRedis(','.join(pending_aids))
                pending_aids = []

        # Flush the last partial group only when it is non-empty; otherwise a
        # URL with an empty ``ids=`` parameter would be queued whenever the
        # number of documents is an exact multiple of the group size.
        if pending_aids:
            progressTask.current_value += len(pending_aids)
            cls.pushVideoRedisUrlToRedis(','.join(pending_aids))

    @classmethod
    def pushVideoRedisUrlToRedis(cls, aid):
        """Push the archive API URL for *aid* onto ``videoRedis:start_urls``.

        Args:
            aid: a single video id or a comma-separated string of ids; it is
                substituted verbatim into the ``ids`` query parameter.
        """
        cls.__redisConnection.rpush(cls.__video_key, cls.__video_url.format(aid=aid))

    @classmethod
    def updateAutoAddVideo(cls):
        """Queue the latest-submissions URL of every focused author.

        Authors are selected when ``focus`` or ``forceFocus`` is true; one
        URL per author is pushed onto ``videoAutoAdd:start_urls``.
        """
        task_name = "生成作者最新发布的视频的待爬链接"
        logger.info(task_name)
        authorCollection = MongoDbDao.getCollection('author')
        doc_filter = {'$or': [{'focus': True}, {'forceFocus': True}]}
        total = authorCollection.count_documents(doc_filter)
        authorCollectionResult = authorCollection.find(doc_filter, {'mid': 1})
        if total != 0:
            progressTask = ProgressTask(task_name, total, collection=cls.__tracerCollection)
            for each_doc in authorCollectionResult:
                progressTask.current_value += 1
                url = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=10&page=1&order=pubdate'.format(
                    each_doc['mid'])
                cls.__redisConnection.rpush("videoAutoAdd:start_urls", url)
Esempio n. 4
0
 def updateAutoAddAuthor(cls):
     """Push the ranking-page URLs onto the ``authorAutoAdd:start_urls`` list.

     Each URL is pushed as its own Redis list entry, in the order listed.
     """
     start_urls = [
         # The comma after this first entry is required: without it Python
         # joins the first two literals into one invalid URL.
         'https://www.bilibili.com/ranking',
         'https://www.bilibili.com/ranking/all/1/0/3',
         'https://www.bilibili.com/ranking/all/168/0/3',
         'https://www.bilibili.com/ranking/all/3/0/3',
         'https://www.bilibili.com/ranking/all/129/0/3',
         'https://www.bilibili.com/ranking/all/188/0/3',
         'https://www.bilibili.com/ranking/all/4/0/3',
         'https://www.bilibili.com/ranking/all/36/0/3',
         'https://www.bilibili.com/ranking/all/160/0/3',
         'https://www.bilibili.com/ranking/all/119/0/3',
         'https://www.bilibili.com/ranking/all/155/0/3',
         'https://www.bilibili.com/ranking/all/5/0/3',
         'https://www.bilibili.com/ranking/all/181/0/3',
     ]
     # Look the connection up once instead of once per URL.
     redis_connection = RedisDao.getRedisConnect()
     for url in start_urls:
         redis_connection.rpush("authorAutoAdd:start_urls", url)
Esempio n. 5
0
class AuthorService:
    """Generates author crawl URLs and pushes them onto Redis lists."""
    __author_url = "https://api.bilibili.com/x/web-interface/card?mid={mid}"
    __author_key = "authorRedis:start_urls"
    __redisConnection = RedisDao.getRedisConnect()

    @classmethod
    def updateAuthor(cls):
        """Queue the card API URL of every focused author.

        Authors are selected when ``focus`` or ``forceFocus`` is true.
        Progress is reported through a ProgressTask created with the
        ``tracer`` collection; nothing is created when no author matches.
        """
        task_name = "生成每日作者待爬链接"
        logger.info(task_name)
        coll = MongoDbDao.getCollection('author')
        filter_dict = {
            '$or': [{
                'focus': True
            }, {
                'forceFocus': True
            }]
        }
        cursor = coll.find(filter_dict, {"mid": 1}).batch_size(200)
        total = coll.count_documents(filter_dict)
        if total != 0:
            t = ProgressTask(task_name, total, collection=MongoDbDao.getCollection('tracer'))
            for each_doc in cursor:
                t.current_value += 1
                cls.pushAuthorRedisUrlToRedis(each_doc['mid'])

    @classmethod
    def pushAuthorRedisUrlToRedis(cls, mid):
        """Push the card API URL for author *mid* onto ``authorRedis:start_urls``."""
        cls.__redisConnection.rpush(cls.__author_key, cls.__author_url.format(mid=mid))

    @classmethod
    def updateAutoAddAuthor(cls):
        """Push the ranking-page URLs onto the ``authorAutoAdd:start_urls`` list.

        Each URL is pushed as its own Redis list entry, in the order listed.
        """
        start_urls = [
            # The comma after this first entry is required: without it Python
            # joins the first two literals into one invalid URL.
            'https://www.bilibili.com/ranking',
            'https://www.bilibili.com/ranking/all/1/0/3',
            'https://www.bilibili.com/ranking/all/168/0/3',
            'https://www.bilibili.com/ranking/all/3/0/3',
            'https://www.bilibili.com/ranking/all/129/0/3',
            'https://www.bilibili.com/ranking/all/188/0/3',
            'https://www.bilibili.com/ranking/all/4/0/3',
            'https://www.bilibili.com/ranking/all/36/0/3',
            'https://www.bilibili.com/ranking/all/160/0/3',
            'https://www.bilibili.com/ranking/all/119/0/3',
            'https://www.bilibili.com/ranking/all/155/0/3',
            'https://www.bilibili.com/ranking/all/5/0/3',
            'https://www.bilibili.com/ranking/all/181/0/3',
        ]
        # Reuse the class-level connection, as pushAuthorRedisUrlToRedis does,
        # instead of requesting a connection for every URL.
        for url in start_urls:
            cls.__redisConnection.rpush("authorAutoAdd:start_urls", url)
Esempio n. 6
0
class BangumiAndDonghuaService:
    """Queues the two bangumi ranking pages (ids 167 and 13) for crawling."""
    __tracerCollection = MongoDbDao.getCollection('tracer')
    __redisConnection = RedisDao.getRedisConnect()
    __bangumiAndDonghua_key = "bangumiAndDonghua:start_urls"

    @classmethod
    def autoCrawlBangumi(cls):
        """Push both ranking URLs onto ``bangumiAndDonghua:start_urls``.

        A single-step ProgressTask is created with the tracer collection
        before the pushes and advanced by one after both URLs are queued.
        """
        task_name = "生成番剧国创待爬链接"
        logger.info(task_name)
        tracker = ProgressTask(task_name, 1, collection=cls.__tracerCollection)

        ranking_urls = (
            "https://www.bilibili.com/ranking/bangumi/167/0/7",
            "https://www.bilibili.com/ranking/bangumi/13/0/7",
        )
        queue_key = cls.__bangumiAndDonghua_key
        push = cls.__redisConnection.rpush
        for ranking_url in ranking_urls:
            push(queue_key, ranking_url)

        tracker.current_value += 1
Esempio n. 7
0
class TagAdderService:
    """Queues video pages that still lack a ``tag`` field for crawling."""
    __videoCollection = MongoDbDao.getCollection('video')
    __tracerCollection = MongoDbDao.getCollection('tracer')
    __redisConnection = RedisDao.getRedisConnect()

    @classmethod
    def addTagTask(cls):
        """Push one video-page URL per untagged video onto ``tagAdder:start_urls``.

        A video counts as untagged when its document has no ``tag`` field.
        Progress is reported through a ProgressTask created with the tracer
        collection and sized by the number of matching documents.
        """
        task_name = "生成待爬标签视频链接"
        doc_filter = {'tag': {'$exists': False}}
        # Cursor.count() is deprecated and removed in PyMongo 4;
        # count_documents() is what the other services in this code use.
        total = cls.__videoCollection.count_documents(doc_filter)
        cursor = cls.__videoCollection.find(doc_filter, {
            "aid": 1
        }).batch_size(100)

        progressTask = ProgressTask(task_name,
                                    total,
                                    collection=cls.__tracerCollection)
        url = 'https://www.bilibili.com/video/av{}'
        for each_video in cursor:
            progressTask.current_value += 1
            aid = each_video['aid']
            logger.info("待爬AV号{}".format(aid))
            cls.__redisConnection.rpush("tagAdder:start_urls", url.format(aid))
Esempio n. 8
0
 def __init__(self):
     """Attach the ``video`` Mongo collection and a Redis connection."""
     video_collection = MongoDbDao.getCollection('video')
     self.videoCollection = video_collection
     redis_connection = RedisDao.getRedisConnect()
     self.redisConnection = redis_connection
Esempio n. 9
0
 def __init__(self):
     """Attach the ``author`` Mongo collection and a Redis connection."""
     author_collection = MongoDbDao.getCollection('author')
     self.authorCollection = author_collection
     redis_connection = RedisDao.getRedisConnect()
     self.redisConnection = redis_connection
Esempio n. 10
0
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    'nspider.pipelines.NspiderPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


# --------- Newly added settings ---------
# Redis connection string, obtained from RedisDao when this module is loaded.
# NOTE(review): presumably consumed by a Redis-backed Scrapy extension
# (e.g. scrapy-redis) — confirm against the project's dependencies.
REDIS_URL = RedisDao.getRedisConnectString()
# Module in which Scrapy looks up the project's custom commands.
COMMANDS_MODULE = "nspider.commands"
Esempio n. 11
0
 def __init__(self):
     """Attach the ``author`` collection, a Redis connection and a SpiderTask.

     The SpiderTask is created with the ``tracer`` collection.
     """
     author_collection = MongoDbDao.getCollection('author')
     self.authorCollection = author_collection
     redis_connection = RedisDao.getRedisConnect()
     self.redisCollection = redis_connection
     tracer_collection = MongoDbDao.getCollection('tracer')
     self.task = SpiderTask('作者数据更新爬虫', collection=tracer_collection)