Example #1
def add_omitted(self):
    # Rebuild the query-keyword dictionary once enough new words accumulate.
    total_value = self.mongo_word.count_documents({})
    if total_value < 100:
        return
    t = ProgressTask("更新查询关键词字典",  # "Update the query-keyword dictionary"
                     total_value=total_value,
                     collection=db['tracer'])
    # Load the existing user dictionary, one entry per line.
    with open('./biliob_analyzer/dict.txt', 'r', encoding='utf8') as f:
        d = f.read().split('\n')
    # Append any aid/mid keywords that are not in the dictionary yet.
    for each in self.mongo_word.find():
        if 'aid' in each and each['aid'] not in d:
            d.append(each['aid'])
        elif 'mid' in each and each['mid'] not in d:
            d.append(each['mid'])
        t.current_value += 1
    t.finished = True
    # Write the merged dictionary back, reload it into jieba, and refresh.
    with open('./biliob_analyzer/dict.txt', 'w',
              encoding='utf8', newline='') as o:
        for each in d:
            o.write(each + '\n')
    self.mongo_word.delete_many({})
    jieba.load_userdict('./biliob_analyzer/dict.txt')
    self.refresh_all_author()
    self.refresh_all_video()
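Every example in this listing reports progress through ProgressTask, whose definition is not shown. A minimal sketch of what such a class could look like, assuming it mirrors its progress fields into a document in the tracer collection (the document field names here are guesses, not the project's actual schema):

import datetime

class ProgressTask:
    """Hypothetical minimal stand-in for the ProgressTask used in these examples."""

    def __init__(self, name, total_value=0, collection=None):
        self.total_value = total_value
        self.collection = collection
        # Create one tracer document per task run (field names are assumptions).
        self._id = collection.insert_one({
            'name': name,
            'total_value': total_value,
            'current_value': 0,
            'start_time': datetime.datetime.utcnow(),
        }).inserted_id
        self._current_value = 0

    @property
    def current_value(self):
        return self._current_value

    @current_value.setter
    def current_value(self, value):
        # Persist every progress update so a dashboard can poll the tracer.
        self._current_value = value
        self.collection.update_one({'_id': self._id},
                                   {'$set': {'current_value': value}})

    @property
    def finished(self):
        return self._current_value >= self.total_value

    @finished.setter
    def finished(self, value):
        if value:
            self.collection.update_one(
                {'_id': self._id},
                {'$set': {'current_value': self.total_value,
                          'end_time': datetime.datetime.utcnow()}})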
Example #2
def compute_video_rank_table():
    task_name = '计算视频排名对照表'  # "Compute the video rank lookup table"
    coll = db['video']  # get a handle to the collection
    count = coll.estimated_document_count()
    top_n = 60
    keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare']
    task = ProgressTask(task_name, top_n * len(keys), collection=db['tracer'])
    o = {'name': 'video_rank'}
    # Each sampling step skips 1% of the collection, so the thresholds
    # approximate the counter value at every percentile.
    skip = int(count / 100)
    for each_key_index, each_key in enumerate(keys):
        o[each_key] = {'rate': []}
        i = 1
        last_value = 9999999999  # sentinel larger than any real counter
        # Record the exact rank of the top 200 videos by title.
        video = coll.find({}, {
            'title': 1
        }).limit(200).sort(each_key, DESCENDING).batch_size(200)
        top = 1
        for each_video in video:
            o[each_key][each_video['title']] = top
            top += 1
        # Sample one document per percentile to build the threshold list.
        while i <= top_n:
            task.current_value = i + top_n * each_key_index
            video = list(
                coll.find({
                    each_key: {
                        '$lt': last_value
                    }
                }, {
                    each_key: 1
                }).limit(1).skip(skip).sort(each_key, DESCENDING))
            if len(video) != 0:
                video = video[0]
            else:
                i += 1
                continue
            if each_key not in video:
                break
            last_value = video[each_key]
            o[each_key]['rate'].append(last_value)
            i += 1
    # Store the update time as UTC+8 (Beijing time).
    o['update_time'] = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    output_coll = db['rank_table']
    output_coll.update_one({'name': 'video_rank'}, {'$set': o}, upsert=True)
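The stored document maps each counter key to a descending list of roughly one-per-percentile thresholds (plus exact ranks for the top 200 titles). A video's approximate "top X%" position can then be read off by scanning that list; a minimal sketch (the helper name and its use are assumptions, not part of the original code):

def approx_top_percent(rate, value):
    # 'rate' is rank_table[key]['rate']: thresholds in descending order,
    # where rate[i] approximates the value at the (i + 1)-th percentile.
    for i, threshold in enumerate(rate):
        if value >= threshold:
            return i + 1  # the video sits within the top (i + 1) percent
    return len(rate)      # below every sampled threshold

# e.g. approx_top_percent(doc['cView']['rate'], 123456)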
Example #3
def auto_crawl_bangumi():
    task_name = "生成番剧国创待爬链接"  # "Generate bangumi / domestic-animation crawl links"
    logger.info(task_name)
    t = ProgressTask(task_name, 1, collection=db['tracer'])
    # Seed the spider with the two ranking pages (bangumi and donghua).
    redis_connection.rpush("bangumiAndDonghua:start_urls",
                           "https://www.bilibili.com/ranking/bangumi/167/0/7")
    redis_connection.rpush("bangumiAndDonghua:start_urls",
                           "https://www.bilibili.com/ranking/bangumi/13/0/7")
    t.current_value += 1
Example #4
def add_tag_task():
    task_name = "生成待爬标签视频链接"  # "Generate crawl links for videos missing tags"
    coll = db['video']
    doc_filter = {'tag': {'$exists': False}}
    # Cursor.count() was removed in PyMongo 4; count on the collection instead.
    total = coll.count_documents(doc_filter)
    cursor = coll.find(doc_filter, {"aid": 1}).batch_size(100)
    t = ProgressTask(task_name, total, collection=db['tracer'])
    url = 'https://www.bilibili.com/video/av{}'
    for each_video in cursor:
        t.current_value += 1
        aid = each_video['aid']
        redis_connection.rpush("tagAdder:start_urls", url.format(aid))
Example #5
def update_author():
    task_name = "生成每日作者待爬链接"  # "Generate daily author crawl links"
    logger.info(task_name)
    coll = db['author']
    # Only authors that are focused (manually or automatically) get crawled.
    filter_dict = {'$or': [{'focus': True}, {'forceFocus': True}]}
    cursor = coll.find(filter_dict, {"mid": 1}).batch_size(200)
    total = coll.count_documents(filter_dict)
    if total != 0:
        t = ProgressTask(task_name, total, collection=db['tracer'])
        for each_doc in cursor:
            t.current_value += 1
            redis_connection.rpush(AUTHOR_KEY,
                                   AUTHOR_URL.format(mid=each_doc['mid']))
Example #6
def __judge_author(self, author_filter):
    # author_coll is the module-level author collection handle.
    count = author_coll.count_documents(author_filter)
    # Keep only the last 32 days of each author's data, then drop
    # authors with no datapoints left in that window.
    a = author_coll.aggregate([{
        '$match': author_filter
    }, {
        '$project': {
            "mid": 1,
            "face": 1,
            "name": 1,
            "data": {
                "$filter": {
                    "input": "$data",
                    "as": "data",
                    "cond": {
                        "$gt": [
                            "$$data.datetime",
                            datetime.datetime.now() -
                            datetime.timedelta(32)
                        ]
                    }
                }
            }
        }
    }, {
        "$match": {
            "data.0": {
                "$exists": True
            }
        }
    }])
    print("Number of authors to crawl: {}".format(count))
    t = ProgressTask("粉丝数变动探测",  # "Fan-count change detection"
                     total_value=count, collection=db['tracer'])
    for each_author in a:
        print(each_author['mid'])
        t.current_value += 1
        self.__judge(each_author)
    t.finished = True
Example #7
def auto_add_video():
    task_name = "生成作者最新发布的视频的待爬链接"  # "Generate crawl links for authors' latest videos"
    logger.info(task_name)
    coll = db['author']
    doc_filter = {'$or': [{'focus': True}, {'forceFocus': True}]}
    total = coll.count_documents(doc_filter)
    c = coll.find(doc_filter, {'mid': 1})
    if total != 0:
        t = ProgressTask(task_name, total, collection=db['tracer'])
        for each_doc in c:
            t.current_value += 1
            # First page of the author's submissions, newest first.
            URL = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=10&page=1&order=pubdate'.format(
                each_doc['mid'])
            redis_connection.rpush("videoAutoAdd:start_urls", URL)
Example #8
def crawlOnlineTopListData():
    task_name = "生成强力追踪待爬链接"  # "Generate crawl links for closely tracked videos"
    logger.info(task_name)
    ONLINE_URL = 'https://www.bilibili.com/video/online.html'
    response = requests.get(ONLINE_URL)
    # The page embeds its data as `window.__INITIAL_STATE__=...` in the
    # second-to-last <script> tag.
    data_text = etree.HTML(
        response.content.decode('utf8')).xpath('//script/text()')[-2]
    # str.lstrip() strips a character set, not a prefix, so cut the prefix
    # off explicitly; the [:-122] slice drops the trailing non-JSON tail.
    prefix = 'window.__INITIAL_STATE__='
    j = json.loads(data_text[len(prefix):][:-122])
    total = len(j['onlineList'])
    t = ProgressTask(task_name, total, collection=db['tracer'])
    for each_video in j['onlineList']:
        aid = each_video['aid']
        mid = each_video['owner']['mid']
        # Two hard-coded mids are excluded from priority author crawling.
        if mid not in [7584632, 928123]:
            priorityAuthorCrawlRequest(mid)
        priorityVideoCrawlRequest(aid)
        t.current_value += 1
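The [:-122] slice is tied to the current length of the page's trailing script, so it breaks whenever the page changes. A slightly more robust alternative is to cut at the tail with a regular expression; a hedged sketch (the ';(function' boundary is an assumption about how the page is laid out, not a documented contract):

import json
import re

def extract_initial_state(script_text):
    # Assumes the script has the shape
    #   window.__INITIAL_STATE__={...};(function(){...}());
    m = re.search(r'window\.__INITIAL_STATE__=(.*?);\(function',
                  script_text, re.S)
    if m is None:
        raise ValueError('__INITIAL_STATE__ not found')
    return json.loads(m.group(1))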
Example #9
def send_aids(task_name, total, cursor):
    if total == 0:
        return
    t = ProgressTask(task_name, total, collection=db['tracer'])
    # Batch aids into comma-separated groups of 100 per crawl request.
    aid_list = ''
    i = 0
    for each_doc in cursor:
        aid_list += str(each_doc['aid']) + ','
        i += 1
        if i == 100:
            t.current_value += i
            redis_connection.rpush(VIDEO_KEY,
                                   VIDEO_URL.format(aid=aid_list[:-1]))
            aid_list = ''
            i = 0
    # Flush the final partial batch; skip it when the total was an exact
    # multiple of 100, which would otherwise push an empty aid list.
    if i != 0:
        t.current_value += i
        redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=aid_list[:-1]))
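A plausible call site, pairing the function with a cursor over db['video'] (the filter and task name below are illustrative; VIDEO_KEY and VIDEO_URL are the same module-level constants the function itself uses):

doc_filter = {'tag': {'$exists': False}}  # illustrative filter
coll = db['video']
send_aids("生成视频待爬链接",  # "Generate video crawl links" (made-up task name)
          coll.count_documents(doc_filter),
          coll.find(doc_filter, {'aid': 1}).batch_size(200))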
Example #10
def auto_add_author():
    task_name = "生成排行榜待爬链接"
    logger.info(task_name)
    start_urls = [
        'https://www.bilibili.com/ranking',
        'https://www.bilibili.com/ranking/all/1/0/3',
        'https://www.bilibili.com/ranking/all/168/0/3',
        'https://www.bilibili.com/ranking/all/3/0/3',
        'https://www.bilibili.com/ranking/all/129/0/3',
        'https://www.bilibili.com/ranking/all/4/0/3',
        'https://www.bilibili.com/ranking/all/36/0/3',
        'https://www.bilibili.com/ranking/all/160/0/3',
        'https://www.bilibili.com/ranking/all/119/0/3',
        'https://www.bilibili.com/ranking/all/155/0/3',
        'https://www.bilibili.com/ranking/all/5/0/3',
        'https://www.bilibili.com/ranking/all/181/0/3'
    ]
    t = ProgressTask(task_name, len(start_urls), collection=db['tracer'])
    for each in start_urls:
        t.current_value += 1
        redis_connection.rpush('authorAutoAdd:start_urls', each)
Example #11
def gen_online():
    task_name = "生成在线人数爬取链接"  # "Generate the online-viewer-count crawl link"
    t = ProgressTask(task_name, 1, collection=db['tracer'])
    ONLINE_URL = 'https://www.bilibili.com/video/online.html'
    redis_connection.rpush("online:start_urls", ONLINE_URL)
    t.current_value = 1
Example #12
def author_fans_rate_caculate():

    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s')
    logger = logging.getLogger(__name__)

    coll = db['author']  # get a handle to the collection
    logger.info('Start computing fan growth rate')

    c_datetime = datetime.datetime.now()

    # Midnight boundaries for yesterday and the day before.
    end_date = (
        datetime.datetime(c_datetime.year, c_datetime.month, c_datetime.day) -
        datetime.timedelta(1)).timestamp()
    start_date = (
        datetime.datetime(c_datetime.year, c_datetime.month, c_datetime.day) -
        datetime.timedelta(2)).timestamp()

    task = ProgressTask("计算粉丝增速",  # "Compute fan growth rate"
                        coll.count_documents({}),
                        collection=db['tracer'])

    c = 0
    for each in coll.find({}, {'mid': 1, '_id': 0}).batch_size(200):
        c += 1
        task.current_value = c
        # Keep only the author's datapoints from the last 7 days.
        ag = coll.aggregate([{
            '$match': {
                'mid': each['mid']
            }
        }, {
            '$project': {
                'mid': 1,
                'data': {
                    "$filter": {
                        "input": "$data",
                        "as": "each_data",
                        "cond": {
                            "$gt": [
                                "$$each_data.datetime",
                                datetime.datetime.now() - datetime.timedelta(7)
                            ]
                        }
                    }
                }
            }
        }]).batch_size(1)
        each_author = next(ag)
        if 'data' in each_author and each_author['data'] is not None:
            data = sorted(each_author['data'], key=lambda d: d['datetime'])
            if len(data) >= 2:
                logger.info(each_author['mid'])
                # Fit a piecewise-linear function of fans over time, then read
                # it off at the two day boundaries; the difference is
                # yesterday's fan gain.
                x = tuple(d['datetime'].timestamp() for d in data)
                y = tuple(d['fans'] for d in data)
                inter_fun = interp1d(x, y, kind='linear')
                if start_date > x[0] and end_date < x[-1]:
                    inter_data = inter_fun([start_date, end_date])
                    delta_fans = inter_data[1] - inter_data[0]
                    coll.update_one({'mid': each_author['mid']},
                                    {"$set": {
                                        'cRate': int(delta_fans)
                                    }})
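The interpolation step is easier to see in isolation. A self-contained sketch on synthetic samples (all numbers are made up): it fits a linear interpolant over (timestamp, fans) points and evaluates it at two midnight boundaries, exactly as the function above does.

import datetime
from scipy.interpolate import interp1d

now = datetime.datetime(2019, 6, 10)
samples = [  # synthetic (datetime, fans) measurements
    (now - datetime.timedelta(days=3, hours=5), 1000),
    (now - datetime.timedelta(days=1, hours=20), 1180),
    (now - datetime.timedelta(hours=2), 1300),
]
x = [s[0].timestamp() for s in samples]
y = [s[1] for s in samples]
f = interp1d(x, y, kind='linear')

end = (now - datetime.timedelta(1)).timestamp()    # yesterday 00:00
start = (now - datetime.timedelta(2)).timestamp()  # two days ago 00:00
print(int(f(end) - f(start)))  # fans gained during that day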
Example #13
def calculate_author_rank():
    task_name = "计算作者排名数据"  # "Compute author ranking data"
    # Uses the module-level coll and logger defined elsewhere in this module.
    keys = ['cFans', 'cArchive_view', 'cArticle_view']
    count = coll.count_documents({keys[0]: {'$exists': 1}})
    t = ProgressTask(task_name, count * len(keys), collection=db['tracer'])
    for each_key in keys:
        logger.info("Start computing author rank by {}".format(each_key))
        i = 1
        authors = coll.find({
            each_key: {
                '$exists': 1
            }
        }, {
            'mid': 1,
            'rank': 1,
            each_key: 1
        }).batch_size(300).sort(each_key, DESCENDING)
        # Map the counter to its current / delta / percentile rank fields.
        if each_key == 'cFans':
            each_rank = 'fansRank'
            each_d_rank = 'dFansRank'
            each_p_rank = 'pFansRank'
        elif each_key == 'cArchive_view':
            each_rank = 'archiveViewRank'
            each_d_rank = 'dArchiveViewRank'
            each_p_rank = 'pArchiveViewRank'
        elif each_key == 'cArticle_view':
            each_rank = 'articleViewRank'
            each_d_rank = 'dArticleViewRank'
            each_p_rank = 'pArticleViewRank'
        for each_author in authors:
            t.current_value += 1
            logger.info("Computing rank for {}".format(each_author['mid']))
            # If the author has no data for this key, move on to the next one.
            if each_key in each_author:
                # A rank has been computed before: update it in place.
                if 'rank' in each_author:
                    rank = each_author['rank']
                    if each_rank in each_author['rank']:
                        rank[each_d_rank] = each_author['rank'][each_rank] - i
                    else:
                        rank[each_d_rank] = 0
                    rank[each_rank] = i
                    rank[each_p_rank] = format_p_rank(i, count)
                else:
                    # First-time initialization.
                    rank = {
                        each_rank: i,
                        each_d_rank: 0,
                        each_p_rank: format_p_rank(i, count)
                    }
            # Authors with a zero counter are marked as unranked (-1).
            if each_author[each_key] == 0:
                if 'rank' in each_author:
                    rank = each_author['rank']
                    rank[each_d_rank] = 0
                    rank[each_rank] = -1
                    rank[each_p_rank] = -1
                else:
                    rank = {each_rank: -1, each_d_rank: 0, each_p_rank: -1}
            if each_key == 'cArticle_view':
                rank['updateTime'] = datetime.datetime.now()
            coll.update_one({'mid': each_author['mid']},
                            {'$set': {
                                'rank': rank,
                            }})
            i += 1
    t.current_value = t.total_value
    logger.info("Finished computing author ranks")