Beispiel #1
0
    def computeVideoRankTable(cls):
        """Rebuild the 'video_rank' document stored in the rank_table collection.

        For each counter in ``keys`` the table holds:

        * the 1-based position of the 200 videos with the highest value,
          keyed by video title;
        * a 'rate' list with up to ``top_n`` sampled counter values, each one
          taken ``count / 100`` documents further down the descending order
          than the previous sample.

        Progress is reported through a ProgressTask backed by the tracer
        collection, and the finished table is upserted under
        ``{'name': 'video_rank'}``.
        """
        task_name = '计算视频排名对照表'
        videos = MongoDbDao.getCollection('video')
        video_total = videos.estimated_document_count()
        top_n = 60
        keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare']
        task = ProgressTask(task_name,
                            top_n * len(keys),
                            collection=MongoDbDao.getCollection('tracer'))

        table = {}
        stride = int(video_total / 100)
        for key_index, key in enumerate(keys):
            table[key] = {'rate': []}
            table['name'] = 'video_rank'

            # Positions of the 200 highest-valued videos, keyed by title.
            leaders = videos.find({}, {'title': 1}) \
                .limit(200).sort(key, DESCENDING).batch_size(200)
            for position, video in enumerate(list(leaders), start=1):
                table[key][video['title']] = position

            # Walk down the descending order in steps of ``stride`` documents.
            threshold = 9999999999
            for step in range(1, top_n + 1):
                task.current_value = step + top_n * key_index
                below = {key: {'$lt': threshold}}
                sample = list(
                    videos.find(below, {key: 1})
                    .limit(1).skip(stride).sort(key, DESCENDING))
                if not sample:
                    continue
                document = sample[0]
                if key not in document:
                    break
                threshold = document[key]
                table[key]['rate'].append(threshold)

        table['update_time'] = datetime.datetime.now()
        MongoDbDao.getCollection('rank_table').update_one(
            {'name': 'video_rank'}, {'$set': table}, upsert=True)
Beispiel #2
0
 def process_item(self, item, spider):
     """Upsert one scraped author into the author collection.

     Items whose ``c_fans`` is 0 are ignored and ``None`` is returned.
     Otherwise the profile fields are written with ``$set``, ``item['data']``
     is pushed to the front of the ``data`` array, and the item is returned.
     When the item carries an ``object_id`` it is passed to ``sentCallBack``
     together with the user_record collection.

     Any exception is logged with the spider name and swallowed, in which
     case ``None`` is returned.
     """
     try:
         if item['c_fans'] == 0:
             return None

         selector = {'mid': item['mid']}
         profile = {
             'focus': True,
             'sex': item['sex'],
             'name': item['name'],
             'face': item['face'],
             'level': item['level'],
             'cFans': item['c_fans'],
             'cLike': item['c_like'],
             'cRate': item['c_rate'],
             'official': item['official'],
         }
         # Newest snapshot goes to the head of the history array.
         history = {'data': {'$each': [item['data']], '$position': 0}}
         # Third positional argument of update_one is ``upsert``.
         self.authorCollection.update_one(
             selector, {'$set': profile, '$push': history}, True)

         if 'object_id' in item:
             user_records = MongoDbDao.getCollection('user_record')
             sentCallBack(item['object_id'], user_records)
         return item
     except Exception as error:
         logging.error('{}: {}'.format(spider.name, error))
Beispiel #3
0
 def process_item(self, item, spider):
     """Upsert one scraped video into the video collection.

     The document is matched on the integer ``aid``. Current counters and
     descriptive fields are written with ``$set``, ``item['data']`` is pushed
     to the front of the ``data`` array, and the item is returned. When the
     item carries an ``object_id`` it is passed to ``sentCallBack`` together
     with the user_record collection.

     Any exception is logged with the spider name and swallowed, in which
     case ``None`` is returned.
     """
     try:
         selector = {'aid': int(item['aid'])}
         fields = {
             'cView': item['current_view'],
             'cFavorite': item['current_favorite'],
             'cDanmaku': item['current_danmaku'],
             'cCoin': item['current_coin'],
             'cShare': item['current_share'],
             'cLike': item['current_like'],
             'cDatetime': item['current_datetime'],
             'author': item['author'],
             'subChannel': item['subChannel'],
             'channel': item['channel'],
             'mid': item['mid'],
             'pic': item['pic'],
             'title': item['title'],
             'datetime': datetime.datetime.fromtimestamp(item['datetime']),
         }
         # Newest snapshot goes to the head of the history array.
         history = {'data': {'$each': [item['data']], '$position': 0}}
         # Third positional argument of update_one is ``upsert``.
         self.videoCollection.update_one(
             selector, {'$set': fields, '$push': history}, True)

         if 'object_id' in item:
             user_records = MongoDbDao.getCollection('user_record')
             sentCallBack(item['object_id'], user_records)
         return item
     except Exception as error:
         logging.error('{}: {}'.format(spider.name, error))
    def updateVideo(cls, focus=True):
        """Queue crawl requests for every video whose ``focus`` flag equals *focus*.

        The matching ``aid`` values are joined with commas in batches of up to
        50 and each batch is handed to ``cls.pushVideoRedisUrlToRedis``.
        Progress is reported through a ProgressTask.

        Args:
            focus: value the ``focus`` field of a video document must have.
        """
        if focus:
            task_name = "生成每日视频待爬链接"
        else:
            task_name = "生成保守观测视频待爬链接"
        logger.info(task_name)

        doc_filter = {'focus': focus}
        videoCollection = MongoDbDao.getCollection('video')
        total = videoCollection.count_documents(doc_filter)
        if total == 0:
            return

        batch_size = 50
        cursor = videoCollection.find(doc_filter, {"aid": 1}).batch_size(200)
        progressTask = ProgressTask(task_name, total, collection=cls.__tracerCollection)

        batch = []
        for each_doc in cursor:
            logger.info(each_doc['aid'])
            batch.append(str(each_doc['aid']))
            if len(batch) == batch_size:
                progressTask.current_value += len(batch)
                cls.pushVideoRedisUrlToRedis(','.join(batch))
                batch = []

        # Flush the remainder only when there is one; otherwise a request
        # with an empty id list would be queued whenever the number of
        # videos is an exact multiple of the batch size.
        if batch:
            progressTask.current_value += len(batch)
            cls.pushVideoRedisUrlToRedis(','.join(batch))
Beispiel #5
0
class OnlineService:
    """Queue crawl URLs derived from Bilibili's "online" page."""
    __online_url = 'https://www.bilibili.com/video/online.html'
    __online_key = "online:start_urls"
    __redisConnection = RedisDao.getRedisConnect()
    __tracerCollection = MongoDbDao.getCollection('tracer')

    @classmethod
    def genOnline(cls):
        """Push the online page URL onto the ``online:start_urls`` Redis list."""
        task_name = "生成在线人数爬取链接"
        progressTask = ProgressTask(task_name,
                                    1,
                                    collection=cls.__tracerCollection)
        cls.__redisConnection.rpush(cls.__online_key, cls.__online_url)
        progressTask.current_value = 1

    @classmethod
    def crawlOnlineTopListData(cls):
        """Download the online page and queue its authors and videos.

        The second-to-last ``<script>`` of the page is expected to hold
        ``window.__INITIAL_STATE__=<json>`` followed by 122 characters of
        trailing script (NOTE(review): the 122 is tied to the page layout at
        the time of writing; confirm it is still valid). Every entry of
        ``onlineList`` gets its video queued through ``VideoService`` and,
        unless its owner is in the skip list, its author queued through
        ``AuthorService``.
        """
        task_name = "生成强力追踪待爬链接"
        logger.info(task_name)
        response = requests.get(cls.__online_url)
        page = etree.HTML(response.content.decode('utf8'))
        data_text = page.xpath('//script/text()')[-2]

        # Remove the assignment prefix explicitly. str.lstrip() takes a *set
        # of characters*, so the former lstrip(prefix) only worked because the
        # JSON payload starts with '{', which is not in that set.
        prefix = 'window.__INITIAL_STATE__='
        if data_text.startswith(prefix):
            data_text = data_text[len(prefix):]
        j = json.loads(data_text[:-122])

        online_list = j['onlineList']
        progressTask = ProgressTask(task_name,
                                    len(online_list),
                                    collection=cls.__tracerCollection)

        # Owners that are never queued as authors.
        # NOTE(review): the reason for excluding these two mids is not
        # visible here.
        skipped_mids = (7584632, 928123)
        for each_video in online_list:
            mid = each_video['owner']['mid']
            if mid not in skipped_mids:
                AuthorService.pushAuthorRedisUrlToRedis(mid)
            VideoService.pushVideoRedisUrlToRedis(each_video['aid'])
            progressTask.current_value += 1
Beispiel #6
0
 def __autoCrawlTask(cls):
     """Run the ``schedule`` loop forever, polling once a minute.

     An ExistsTask named after the service is created with the tracer
     collection before the loop starts. This function never returns.
     """
     poll_seconds = 60
     task_name = "自动爬虫计划调度服务"
     logger.info(task_name)
     tracer = MongoDbDao.getCollection('tracer')
     ExistsTask(task_name, update_frequency=60, collection=tracer)
     while True:
         schedule.run_pending()
         sleep(poll_seconds)
Beispiel #7
0
 def updateAuthor(cls):
     """Queue a crawl request for every author flagged ``focus`` or ``forceFocus``.

     Each matching ``mid`` is handed to ``cls.pushAuthorRedisUrlToRedis``.
     Nothing happens, and no ProgressTask is created, when no author matches.
     """
     task_name = "生成每日作者待爬链接"
     logger.info(task_name)
     authors = MongoDbDao.getCollection('author')
     watched = {'$or': [{'focus': True}, {'forceFocus': True}]}
     cursor = authors.find(watched, {"mid": 1}).batch_size(200)
     total = authors.count_documents(watched)
     if total == 0:
         return

     progress = ProgressTask(task_name, total, collection=MongoDbDao.getCollection('tracer'))
     for author_doc in cursor:
         progress.current_value += 1
         cls.pushAuthorRedisUrlToRedis(author_doc['mid'])
class VideoService:
    """Queue video crawl URLs on Redis lists."""
    __video_url = "https://api.bilibili.com/x/article/archives?ids={aid}"
    __video_key = "videoRedis:start_urls"
    __redisConnection = RedisDao.getRedisConnect()
    __tracerCollection = MongoDbDao.getCollection('tracer')

    @classmethod
    def updateVideo(cls, focus=True):
        """Queue crawl URLs for every video whose ``focus`` flag equals *focus*.

        The matching ``aid`` values are joined with commas in batches of up to
        50; each batch becomes one URL on the ``videoRedis:start_urls`` list.
        Progress is reported through a ProgressTask.

        Args:
            focus: value the ``focus`` field of a video document must have.
        """
        if focus:
            task_name = "生成每日视频待爬链接"
        else:
            task_name = "生成保守观测视频待爬链接"
        logger.info(task_name)

        doc_filter = {'focus': focus}
        videoCollection = MongoDbDao.getCollection('video')
        total = videoCollection.count_documents(doc_filter)
        if total == 0:
            return

        batch_size = 50
        cursor = videoCollection.find(doc_filter, {"aid": 1}).batch_size(200)
        progressTask = ProgressTask(task_name, total, collection=cls.__tracerCollection)

        batch = []
        for each_doc in cursor:
            logger.info(each_doc['aid'])
            batch.append(str(each_doc['aid']))
            if len(batch) == batch_size:
                progressTask.current_value += len(batch)
                cls.pushVideoRedisUrlToRedis(','.join(batch))
                batch = []

        # Flush the remainder only when there is one; otherwise a URL with an
        # empty ``ids=`` would be queued whenever the number of videos is an
        # exact multiple of the batch size.
        if batch:
            progressTask.current_value += len(batch)
            cls.pushVideoRedisUrlToRedis(','.join(batch))

    @classmethod
    def pushVideoRedisUrlToRedis(cls, aid):
        """Append the archive API URL for *aid* to ``videoRedis:start_urls``.

        Args:
            aid: a single aid, or several aids joined with commas as
                ``updateVideo`` passes them.
        """
        cls.__redisConnection.rpush(cls.__video_key, cls.__video_url.format(aid=aid))

    @classmethod
    def updateAutoAddVideo(cls):
        """Queue the newest-submissions URL of every watched author.

        Authors flagged ``focus`` or ``forceFocus`` each get one URL pushed to
        the ``videoAutoAdd:start_urls`` list. Nothing happens when no author
        matches.
        """
        task_name = "生成作者最新发布的视频的待爬链接"
        logger.info(task_name)
        authorCollection = MongoDbDao.getCollection('author')
        doc_filter = {'$or': [{'focus': True}, {'forceFocus': True}]}
        total = authorCollection.count_documents(doc_filter)
        if total == 0:
            return

        progressTask = ProgressTask(task_name, total, collection=cls.__tracerCollection)
        for each_doc in authorCollection.find(doc_filter, {'mid': 1}):
            progressTask.current_value += 1
            url = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=10&page=1&order=pubdate'.format(
                each_doc['mid'])
            cls.__redisConnection.rpush("videoAutoAdd:start_urls", url)
 def updateAutoAddVideo(cls):
     """Queue the newest-submissions URL of every watched author.

     Authors flagged ``focus`` or ``forceFocus`` each get one URL pushed to
     the ``videoAutoAdd:start_urls`` Redis list. Nothing happens, and no
     ProgressTask is created, when no author matches.
     """
     task_name = "生成作者最新发布的视频的待爬链接"
     logger.info(task_name)
     authors = MongoDbDao.getCollection('author')
     watched = {'$or': [{'focus': True}, {'forceFocus': True}]}
     total = authors.count_documents(watched)
     matches = authors.find(watched, {'mid': 1})
     if total == 0:
         return

     url_template = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=10&page=1&order=pubdate'
     progress = ProgressTask(task_name, total, collection=cls.__tracerCollection)
     for author_doc in matches:
         progress.current_value += 1
         cls.__redisConnection.rpush("videoAutoAdd:start_urls",
                                     url_template.format(author_doc['mid']))
class TagAdderService:
    """Queue page URLs of videos that have no ``tag`` field yet."""
    __videoCollection = MongoDbDao.getCollection('video')
    __tracerCollection = MongoDbDao.getCollection('tracer')
    __redisConnection = RedisDao.getRedisConnect()

    @classmethod
    def addTagTask(cls):
        """Push one video page URL per untagged video to ``tagAdder:start_urls``.

        A video counts as untagged when its document has no ``tag`` field.
        Progress is reported through a ProgressTask sized by the number of
        matching documents.
        """
        task_name = "生成待爬标签视频链接"
        doc_filter = {'tag': {'$exists': False}}
        # Cursor.count() is deprecated since PyMongo 3.7 and removed in 4.0;
        # count_documents() is what the other services already use.
        total = cls.__videoCollection.count_documents(doc_filter)
        cursor = cls.__videoCollection.find(doc_filter, {
            "aid": 1
        }).batch_size(100)

        progressTask = ProgressTask(task_name,
                                    total,
                                    collection=cls.__tracerCollection)
        url = 'https://www.bilibili.com/video/av{}'
        for each_video in cursor:
            progressTask.current_value += 1
            aid = each_video['aid']
            logger.info("待爬AV号{}".format(aid))
            cls.__redisConnection.rpush("tagAdder:start_urls", url.format(aid))
Beispiel #11
0
 def process_item(self, item, spider):
     """Store the danmaku aggregate of one video page and notify the requester.

     The aggregate is written to ``danmaku_aggregate.<page_number>`` of the
     video matched by integer ``aid`` (upserting if needed), together with a
     ``danmaku_aggregate.updatetime`` string in local time. Afterwards
     ``sentCallBack`` is called with ``item['object_id']`` and the
     user_record collection.

     Returns:
         The unchanged *item*, so that later pipeline stages still receive
         it, as the Scrapy pipeline contract requires.
     """
     page_field = 'danmaku_aggregate.{}'.format(item['page_number'])
     updated_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     # Third positional argument of update_one is ``upsert``.
     self.coll.update_one({
         'aid': int(item['aid'])
     }, {
         '$set': {
             page_field: {
                 'duration': item['duration'],
                 'p_name': item['p_name'],
                 'danmaku_density': item['danmaku_density'],
                 'word_frequency': item['word_frequency']
             },
             'danmaku_aggregate.updatetime': updated_at
         }
     }, True)
     sentCallBack(item['object_id'], MongoDbDao.getCollection('user_record'))
     return item
Beispiel #12
0
class BangumiAndDonghuaService:
    """Queue the two bangumi ranking pages for crawling."""
    __tracerCollection = MongoDbDao.getCollection('tracer')
    __redisConnection = RedisDao.getRedisConnect()
    __bangumiAndDonghua_key = "bangumiAndDonghua:start_urls"

    @classmethod
    def autoCrawlBangumi(cls):
        """Push both ranking URLs to the ``bangumiAndDonghua:start_urls`` list.

        The ProgressTask has a total of 1 and is advanced once, after both
        URLs have been pushed.
        """
        task_name = "生成番剧国创待爬链接"
        logger.info(task_name)
        progress = ProgressTask(task_name, 1, collection=cls.__tracerCollection)

        ranking_pages = (
            "https://www.bilibili.com/ranking/bangumi/167/0/7",
            "https://www.bilibili.com/ranking/bangumi/13/0/7",
        )
        redis_list = cls.__bangumiAndDonghua_key
        for page_url in ranking_pages:
            cls.__redisConnection.rpush(redis_list, page_url)

        progress.current_value += 1
Beispiel #13
0
 def __init__(self):
     """Keep a handle to the 'video' collection obtained from MongoDbDao."""
     self.coll = MongoDbDao.getCollection('video')
Beispiel #14
0
class KeywordAdderService:
    """Maintain the jieba user dictionary and the keywords derived from it.

    The user dictionary at ./resources/dict.txt is loaded into jieba when the
    class body is executed.
    """
    __authorCollection = MongoDbDao.getCollection('author')
    __videoCollection = MongoDbDao.getCollection('video')
    __searchWordCollection = MongoDbDao.getCollection('search_word')
    __tracerCollection = MongoDbDao.getCollection('tracer')
    jieba.load_userdict('./resources/dict.txt')

    @classmethod
    def addOmitted(cls):
        """Merge collected search words into the dictionary and rebuild keywords.

        Does nothing while the search_word collection holds fewer than 100
        documents. Otherwise the ``aid`` (or, failing that, ``mid``) value of
        every search_word document that is not yet in ./resources/dict.txt is
        appended to the file, the collection is emptied, the dictionary is
        reloaded into jieba, and the keywords of all authors and videos are
        regenerated.
        """
        pending = cls.__searchWordCollection.count_documents({})
        if pending < 100:
            return

        progressTask = ProgressTask(
            "更新查询关键词字典",
            total_value=pending,
            collection=cls.__tracerCollection)

        # Read the current dictionary. Blank lines are dropped so the file
        # does not gain an extra empty line on every rewrite.
        with open('./resources/dict.txt', 'r', encoding='utf8') as dict_file:
            words = [line for line in dict_file.read().split('\n') if line]
        known = set(words)  # O(1) membership test; ``words`` keeps file order

        for each in cls.__searchWordCollection.find():
            # Prefer 'aid'; fall back to 'mid' when 'aid' is absent or
            # already known.
            for field in ('aid', 'mid'):
                if field not in each:
                    continue
                # str() guards the later write against non-string values.
                word = str(each[field])
                if word not in known:
                    known.add(word)
                    words.append(word)
                    break
            progressTask.current_value += 1

        progressTask.finished = True

        with open('./resources/dict.txt', 'w', encoding='utf8',
                  newline='') as dict_file:
            dict_file.writelines(word + '\n' for word in words)

        cls.__searchWordCollection.delete_many({})
        jieba.load_userdict('./resources/dict.txt')

        cls.__refreshAllAuthor()
        cls.__refreshAllVideo()

    @staticmethod
    def __collectFragments(document, list_fields):
        """Return the lower-cased text fragments of *document*.

        Fields named in *list_fields* contribute one fragment per element
        when they hold a list or tuple; every other field contributes the
        string form of its value.
        """
        fragments = []
        for field, value in document.items():
            if field in list_fields and isinstance(value, (list, tuple)):
                fragments.extend(str(element).lower() for element in value)
            else:
                fragments.append(str(value).lower())
        return fragments

    @staticmethod
    def __segment(fragments, full_name):
        """Cut *fragments* with jieba's search mode and return unique keywords.

        *full_name*, when it is a string, is always included in lower case.
        The separator tokens ' ' and '、' are removed.
        """
        # Second positional argument enables jieba's HMM.
        seg_list = jieba.lcut_for_search(' '.join(fragments), True)
        if isinstance(full_name, str) and full_name.lower() not in seg_list:
            seg_list.append(full_name.lower())
        return list({token for token in seg_list if token not in (' ', '、')})

    @classmethod
    def __refreshAllAuthor(cls):
        """Regenerate the ``keyword`` list of every author document.

        Keywords are cut from ``name``, ``official`` and the existing
        ``keyword`` list; the full lower-cased name is always kept.
        """
        for each_author in cls.__authorCollection.find({}, {
                '_id': 0,
                'mid': 1
        }).batch_size(100):
            mid = each_author['mid']
            print("[mid]" + str(mid))
            authorCollectionResult = cls.__authorCollection.find_one(
                {'mid': mid}, {
                    '_id': 0,
                    'name': 1,
                    'official': 1,
                    'keyword': 1
                })
            if authorCollectionResult is None:
                # The document disappeared between the two queries.
                continue

            fragments = cls.__collectFragments(authorCollectionResult,
                                               ('keyword', ))
            keywords = cls.__segment(fragments,
                                     authorCollectionResult.get('name'))

            cls.__authorCollection.update_one(
                {'mid': mid}, {'$set': {
                    'keyword': keywords
                }})
            # Short pause between updates (presumably to limit database load).
            sleep(0.01)

    @classmethod
    def __refreshAllVideo(cls):
        """Regenerate the ``keyword`` list of every video document.

        Keywords are cut from ``title``, ``channel``, ``subChannel``,
        ``author`` and the elements of ``tag``; the full lower-cased author
        name is always kept.
        """
        for each_video in cls.__videoCollection.find({}, {
                '_id': 0,
                'aid': 1
        }).batch_size(100):
            aid = each_video['aid']
            print("[aid]" + str(aid))
            videoCollectionResult = cls.__videoCollection.find_one(
                {'aid': aid}, {
                    '_id': 0,
                    'title': 1,
                    'channel': 1,
                    'subChannel': 1,
                    'author': 1,
                    'tag': 1
                })
            if videoCollectionResult is None:
                # The document disappeared between the two queries.
                continue

            # 'tag' is expanded element by element. The former condition
            # (key != 'keyword' or key != 'tag') was always true, so the list
            # was stringified with its brackets and quotes.
            fragments = cls.__collectFragments(videoCollectionResult,
                                               ('tag', 'keyword'))
            keywords = cls.__segment(fragments,
                                     videoCollectionResult.get('author'))

            cls.__videoCollection.update_one(
                {'aid': aid}, {'$set': {
                    'keyword': keywords
                }})
            # Short pause between updates (presumably to limit database load).
            sleep(0.01)
 def __init__(self):
     """Fetch the 'video' collection and create the spider's SpiderTask.

     The SpiderTask is given the 'tracer' collection.
     """
     self.videoCollection = MongoDbDao.getCollection('video')
     tracer = MongoDbDao.getCollection('tracer')
     self.task = SpiderTask('视频数据更新爬虫', collection=tracer)
Beispiel #16
0
 def __init__(self):
     """Create the spider's SpiderTask, backed by the 'tracer' collection."""
     tracer = MongoDbDao.getCollection('tracer')
     self.task = SpiderTask('同时在线人数爬虫', collection=tracer)
Beispiel #17
0
 def __init__(self):
     """Create the spider's SpiderTask, backed by the 'tracer' collection."""
     tracer = MongoDbDao.getCollection('tracer')
     self.task = SpiderTask("番剧动画爬虫", collection=tracer)
 def __init__(self):
     """Initialise the RedisSpider base and create the spider's SpiderTask.

     The SpiderTask is given the 'tracer' collection.
     """
     RedisSpider.__init__(self)
     tracer = MongoDbDao.getCollection('tracer')
     self.task = SpiderTask('活跃作者自动追加爬虫', collection=tracer)
Beispiel #19
0
class FansWatcherService:
    """Detect unusual daily follower changes of large authors and record them."""
    __videoCollection = MongoDbDao.getCollection('video')
    __authorCollection = MongoDbDao.getCollection('author')
    __fansVariationCollection = MongoDbDao.getCollection('fans_variation')

    @classmethod
    def watchBigAuthor(cls):
        """Judge every author with more than 10000 fans and recent history.

        Only ``data`` entries from the last 32 days are kept, and authors
        with no entry left in that window are skipped.
        """
        window_start = datetime.datetime.now() - datetime.timedelta(32)
        pipeline = [{
            '$match': {
                'data': {
                    '$exists': True
                },
                'cFans': {
                    '$gt': 10000
                }
            }
        }, {
            '$project': {
                "mid": 1,
                "face": 1,
                "name": 1,
                "data": {
                    "$filter": {
                        "input": "$data",
                        "as": "data",
                        "cond": {
                            "$gt": ["$$data.datetime", window_start]
                        }
                    }
                }
            }
        }, {
            "$match": {
                "data.0": {
                    "$exists": True
                }
            }
        }]
        for each_author in cls.__authorCollection.aggregate(pipeline):
            cls.__judge(each_author)

    @classmethod
    def __insertEvent(cls, delta_rate, d_daily, author, info, date):
        """Upsert one fan-variation event for *author* on *date*.

        Among the author's videos published between 7 days before and 1 day
        after *date*, the one with the highest latest view count is attached
        as the probable ``cause``. Videos whose ``datetime`` is stored as a
        string are ignored.

        Args:
            delta_rate: percentage to store, or '-' when not applicable.
            d_daily: follower change of the day; stored as an int.
            author: author document providing ``mid``, ``name`` and ``face``.
            info: label of the event.
            date: datetime of the day the event belongs to.
        """
        out_data = {
            'variation': int(d_daily),
            'mid': author['mid'],
            'author': author['name'],
            'face': author['face'],
            'deltaRate': delta_rate,
            'datetime': date.strftime("%Y-%m-%d"),
            'info': info,
        }

        cause = {'type': 'video'}
        for each_v in cls.__videoCollection.find({'mid': author['mid']}):
            published = each_v['datetime']
            if isinstance(published, str):
                continue
            if not -1 <= (date - published).days <= 7:
                continue
            views = each_v['data'][0]['view']
            if 'aid' not in cause or views > cause['cView']:
                cause['aid'] = each_v['aid']
                cause['title'] = each_v['title']
                cause['pic'] = each_v['pic']
                cause['cView'] = views
                cause['channel'] = each_v['channel']
                cause['subChannel'] = each_v['subChannel']

        if 'aid' in cause:
            out_data['cause'] = cause
        cls.__fansVariationCollection.replace_one(
            {
                'mid': out_data['mid'],
                'datetime': out_data['datetime']
            },
            out_data,
            upsert=True)

    @classmethod
    def __judge(cls, author):
        """Scan an author's follower history day by day and record events.

        The follower count is linearly interpolated between samples and
        evaluated at local midnight of each day. A day is examined only when
        the daily change is at least +5000 or at most -2000. The possible
        events are:

        1. Massive growth: daily gain is at least 25x the previous week's
           daily average.
        2. Epic growth: at least 50x that average, or a gain of 100,000+.
        3. Legendary growth: at least 100x that average, or a gain of 200,000+.
        4. Sudden reversal: followers drop after a day that did not drop.
        5. Heavy loss: daily loss of 5,000 or more.
        6. Avalanche loss: daily loss of 20,000 or more.
        7. Doomsday loss: daily loss of 50,000 or more.
        8. Nova: daily gain is at least 50% of the follower total.
        """
        day = 86400  # seconds

        data = sorted(author['data'], key=lambda x: x['datetime'])
        if len(data) <= 1:
            return
        x = [each['datetime'].timestamp() for each in data]
        y = [each['fans'] for each in data]
        start_date = x[0]
        end_date = x[-1]

        # Linear interpolation of the follower count over time.
        interrupted_fans = interp1d(x, y, kind='linear')

        # First evaluated instant: midnight of the first sample's day + 3
        # days, so that the two preceding days lie inside the sampled range.
        temp_date = datetime.datetime.fromtimestamp(start_date)
        c_date = datetime.datetime(temp_date.year, temp_date.month,
                                   temp_date.day).timestamp() + day * 3
        if c_date - day * 2 <= start_date:
            return

        while c_date <= end_date:
            cls.__judgeDay(author, interrupted_fans, start_date, c_date)
            c_date += day

    @classmethod
    def __judgeDay(cls, author, interrupted_fans, start_date, c_date):
        """Classify the follower change of the 24 hours ending at *c_date*."""
        day = 86400  # seconds
        date = datetime.datetime.fromtimestamp(c_date)
        fans_two_days_ago, fans_yesterday, fans_now = interrupted_fans(
            [c_date - day * 2, c_date - day, c_date])

        # Change during the previous 24 hours.
        pd_daily = fans_yesterday - fans_two_days_ago
        # Change during the latest 24 hours.
        d_daily = fans_now - fans_yesterday

        if not (d_daily >= 5000 or d_daily <= -2000):
            return

        if d_daily >= fans_now * 0.50:
            cls.__insertEvent(round(d_daily / fans_now * 100, 2), d_daily,
                              author, '新星爆发', date)

        if d_daily <= 0 and pd_daily >= 0:
            cls.__insertEvent('-', d_daily, author, '急转直下', date)
            return

        if d_daily <= -5000:
            # pd_daily is negative here (the reversal case returned above),
            # so the division is safe.
            delta_rate = round(d_daily / pd_daily * 100, 2)
            if d_daily <= -50000:
                info = '末日级掉粉'
            elif d_daily <= -20000:
                info = '雪崩级掉粉'
            else:
                info = '大量掉粉'
            cls.__insertEvent(delta_rate, d_daily, author, info, date)

        # Growth tiers need 8 full days of history before c_date. The offset
        # must be added to start_date; the former ``start_date * 86400 * 8``
        # could never be reached, so these events were never recorded.
        if c_date >= start_date + day * 8 and d_daily > 0:
            weekly_array = interrupted_fans([c_date - day * 8, c_date - day])
            # Average daily change over the preceding week.
            # NOTE(review): weekly_mean can be zero or negative, which gives
            # an infinite or negative delta_rate; confirm what consumers of
            # fans_variation expect in that case.
            weekly_mean = (weekly_array[1] - weekly_array[0]) / 7
            delta_rate = round(d_daily / weekly_mean * 100, 2)
            if delta_rate >= 10000 or d_daily >= 200000:
                # At least 100x the weekly average, or 200,000+ in one day.
                cls.__insertEvent(delta_rate, d_daily, author, '传说级涨粉',
                                  date)
            elif delta_rate >= 5000 or d_daily >= 100000:
                # At least 50x the weekly average, or 100,000+ in one day.
                cls.__insertEvent(delta_rate, d_daily, author, '史诗级涨粉',
                                  date)
            elif delta_rate >= 2500:
                # At least 25x the weekly average.
                cls.__insertEvent(delta_rate, d_daily, author, '大量涨粉',
                                  date)
Beispiel #20
0
 def __init__(self):
     """Load the jieba user dictionary and fetch the 'video' and 'error' collections."""
     jieba.load_userdict('./resources/dict.txt')
     self.videoCollection = MongoDbDao.getCollection('video')
     self.errorCollection = MongoDbDao.getCollection('error')
Beispiel #21
0
 def __init__(self):
     """Fetch the 'author' collection and a Redis connection."""
     self.authorCollection = MongoDbDao.getCollection('author')
     self.redisConnection = RedisDao.getRedisConnect()
Beispiel #22
0
 def __init__(self):
     """Fetch the 'video' collection and a Redis connection."""
     self.videoCollection = MongoDbDao.getCollection('video')
     self.redisConnection = RedisDao.getRedisConnect()
Beispiel #23
0
 def __init__(self):
     """Keep a handle to the 'video_online' collection obtained from MongoDbDao."""
     self.videoOnlineCollection = MongoDbDao.getCollection('video_online')
Beispiel #24
0
 def __init__(self):
     """Fetch the 'author' collection and create the spider's SpiderTask.

     The SpiderTask is given the 'tracer' collection.
     """
     self.authorCollection = MongoDbDao.getCollection('author')
     tracer = MongoDbDao.getCollection('tracer')
     self.task = SpiderTask('观测UP主的视频数据自动追加爬虫', collection=tracer)
Beispiel #25
0
 def __init__(self):
     """Keep a handle to the 'site_info' collection obtained from MongoDbDao."""
     self.coll = MongoDbDao.getCollection('site_info')
Beispiel #26
0
 def __init__(self):
     """Keep a handle to the 'author' collection obtained from MongoDbDao."""
     self.authorCollection = MongoDbDao.getCollection('author')
Beispiel #27
0
 def __init__(self):
     """Keep the object returned by ``MongoDbDao.getMongoDb()``."""
     self.mongoDbConnection = MongoDbDao.getMongoDb()
Beispiel #28
0
    def calculateAuthorRank(cls):
        """Recompute the ``rank`` sub-document of every author.

        For each of ``cFans``, ``cArchive_view`` and ``cArticle_view`` the
        authors that have the field are walked in descending order. Each one
        receives its 1-based position, the change against the previously
        stored position, and the value returned by ``__format_p_rank`` for
        that position. Authors whose counter is 0 get position -1,
        percentile -1 and change 0. ``updateTime`` is stamped during the
        ``cArticle_view`` pass.
        """
        task_name = "计算作者排名数据"
        authors = MongoDbDao.getCollection('author')
        # counter field -> (position field, change field, percentile field)
        rank_fields = {
            'cFans': ('fansRank', 'dFansRank', 'pFansRank'),
            'cArchive_view':
            ('archiveViewRank', 'dArchiveViewRank', 'pArchiveViewRank'),
            'cArticle_view':
            ('articleViewRank', 'dArticleViewRank', 'pArticleViewRank'),
        }
        keys = ['cFans', 'cArchive_view', 'cArticle_view']
        allCount = authors.count_documents({keys[0]: {'$exists': 1}})
        progressTask = ProgressTask(
            task_name,
            allCount * len(keys),
            collection=MongoDbDao.getCollection('tracer'))

        for each_key in keys:
            logger.info("开始计算作者{}排名".format(each_key))
            has_counter = {each_key: {'$exists': 1}}
            wanted = {'mid': 1, 'rank': 1, each_key: 1}
            ordered = authors.find(has_counter, wanted) \
                .batch_size(300).sort(each_key, DESCENDING)
            rank_field, delta_field, percent_field = rank_fields[each_key]

            for position, each_author in enumerate(ordered, start=1):
                progressTask.current_value += 1
                logger.info("计算{}排名".format(each_author['mid']))
                has_rank = 'rank' in each_author

                if each_key in each_author:
                    if has_rank:
                        rank = each_author['rank']
                        if rank_field in rank:
                            rank[delta_field] = rank[rank_field] - position
                        else:
                            rank[delta_field] = 0
                        rank[rank_field] = position
                        rank[percent_field] = cls.__format_p_rank(
                            position, allCount)
                    else:
                        # First time this author is ranked.
                        rank = {
                            rank_field: position,
                            delta_field: 0,
                            percent_field: cls.__format_p_rank(
                                position, allCount)
                        }

                # A zero counter overrides whatever was computed above.
                if each_author[each_key] == 0:
                    rank = each_author['rank'] if has_rank else {}
                    rank.update({
                        rank_field: -1,
                        delta_field: 0,
                        percent_field: -1
                    })

                if each_key == 'cArticle_view':
                    rank['updateTime'] = datetime.datetime.now()
                authors.update_one({'mid': each_author['mid']},
                                   {'$set': {
                                       'rank': rank,
                                   }})

        progressTask.current_value = progressTask.total_value
        logger.info("计算作者排名结束")
Beispiel #29
0
 def __init__(self):
     """Fetch the 'author' collection, a Redis connection and the SpiderTask.

     The SpiderTask is given the 'tracer' collection.
     """
     self.authorCollection = MongoDbDao.getCollection('author')
     self.redisCollection = RedisDao.getRedisConnect()
     tracer = MongoDbDao.getCollection('tracer')
     self.task = SpiderTask('作者数据更新爬虫', collection=tracer)