Esempio n. 1
0
def init_parse_user(mid):
    """Seed the detect table by walking follow/fan lists starting at ``mid``.

    Each step registers the current user, fetches the first page (10 entries)
    of the users they follow and of their fans, registers every followed user,
    and then hops to a randomly chosen user from the combined lists.  The walk
    stops once the module-level ``START_NUM`` counter reaches ``MAX_USER_NUM``
    or when the current user yields no candidates to hop to.

    The walk is a loop rather than self-recursion so that a long crawl cannot
    exhaust the interpreter's recursion limit.

    Args:
        mid: id of the user to start from.

    Returns:
        None.
    """
    global MAX_USER_NUM, START_NUM, db
    while True:
        db.insert_or_update(mid, table_sql.insert_user_detect())

        # Following / follower lists of the current user.
        user_following_json = prase_content.return_json(
            api.return_user_follow(1, 10, mid), None, return_header())
        user_follower_json = prase_content.return_json(
            api.return_user_fans(1, 10, mid), None, return_header())

        # Parse each response independently so that one malformed payload
        # does not discard the other; a missing/None list becomes [].
        parsed = []
        for payload in (user_following_json, user_follower_json):
            try:
                parsed.append(payload.get('data').get('list') or [])
            except AttributeError as aerr:
                logging.exception(aerr)
                parsed.append([])
        user_following, user_follower = parsed

        # Followed users are stored in the database as crawl targets.
        for followed in user_following:
            db.insert_or_update(followed.get('mid'),
                                table_sql.insert_user_detect())
            START_NUM = START_NUM + 1
            gl.set_value('current_user', START_NUM)

        # Enough users collected: stop.
        if START_NUM >= MAX_USER_NUM:
            return

        candidates = user_following + user_follower
        if not candidates:
            # Nothing to hop to; the original code crashed here with
            # ValueError from random.randint(0, -1).
            logging.warning(
                "init_parse_user: no following/follower entries for mid %s, "
                "stopping", mid)
            return

        # Pick a random user and continue the walk from there.
        mid = random.choice(candidates)['mid']
Esempio n. 2
0
def update_video():
    """Refresh tracked videos, then check every tracked user for new uploads.

    Steps:
      1. Open the database engine.
      2. Select the rows returned by ``table_sql.query_update_video_list()``
         and call ``update_old_video`` on the first column of each row,
         advancing the ``current_video`` progress counter.
      3. Sleep 120 seconds.
      4. Select the rows returned by ``table_sql.query_detect_list(0)`` and
         call ``insert_new_video`` on the first column of each row.
      5. Close the database (also when a step above raises).

    A failure on a single video/user is logged with its traceback and does
    not abort the run.
    """
    global db
    db.start_sql_engine()
    try:
        # Crawl the videos in the database that need updating.
        need_update_id_list = db.select(table_sql.query_update_video_list())
        if need_update_id_list:
            gl.set_value('total_video', len(need_update_id_list))
            for update_aid in need_update_id_list:
                try:
                    gl.set_value('current_video',
                                 gl.get_value('current_video') + 1)
                    update_old_video(update_aid[0])
                except Exception:
                    # The id is passed as a lazy %-argument; passing it as
                    # the message itself breaks logging's formatting.
                    logging.exception("update_old_video failed for aid %s",
                                      update_aid[0])
                    continue
        time.sleep(120)
        # Check whether any crawled user has uploaded new videos.
        user_list = db.select(table_sql.query_detect_list(0))
        for mid in user_list:
            try:
                insert_new_video(mid[0])
            except Exception:
                logging.exception("insert_new_video failed for mid %s",
                                  mid[0])
                continue
    finally:
        # Always release the database, even if a select or the sleep raised.
        db.close_db()
Esempio n. 3
0
def parse_user_info(mid):
    """Fetch one user's profile data and store it in the database.

    Three requests are issued (user info, follower/following counts, video
    count), with a short random pause between them.  The packaged user info
    and official-verification records are written first, followed by one
    record per entry of the video-count response's ``tlist`` field.

    Args:
        mid: id of the user to fetch.

    Returns:
        None.  Returns early when the user-info response carries a non-zero
        ``code`` (after executing ``table_sql.delete_detect_user(mid)``) or
        when ``tlist`` is empty, missing, or neither a dict nor a list.

    Raises:
        Exception: if any of the three responses is ``None``.
    """
    gl.set_value("current_id", "mid" + str(mid))
    global MAX_USER_NUM, START_NUM, db

    # Basic user information.
    user_info_json = prase_content.return_json(
        api.return_user_info(mid), None, return_header())
    time.sleep(random.uniform(0.2, 0.4))
    user_info_ff_json = prase_content.return_json(
        api.return_user_follower_following(mid), None, return_header())
    time.sleep(random.uniform(0.2, 0.4))
    user_video_count_json = prase_content.return_json(
        api.return_user_video_count(mid, None), None, return_header())

    if (user_info_json is None or user_info_ff_json is None
            or user_video_count_json is None):
        raise Exception(str(mid) + "遭遇反扒")

    # The user does not exist, or the account is meaningless.
    if user_info_json.get('code') != 0:
        db.insert_or_update(item=None, sql=table_sql.delete_detect_user(mid))
        return

    video_data = user_video_count_json.get('data')
    user_video_count = video_data.get('count')

    user_info = package.package_user_info(user_info_json, user_info_ff_json,
                                          user_video_count)
    user_official = package.package_user_official(user_info_json)

    db.insert_or_update(item=user_info.return_tup(),
                        sql=table_sql.insert_user_info())
    db.insert_or_update(item=user_official.return_tup(),
                        sql=table_sql.replace_user_official())

    # Per-category distribution of the user's videos.
    user_video_category = video_data.get('tlist')
    if not user_video_category:
        # Covers both an empty container and a missing/None field (the
        # latter used to raise TypeError from len()).
        return
    if isinstance(user_video_category, dict):
        categories = list(user_video_category.values())
    elif isinstance(user_video_category, list):
        categories = user_video_category
    else:
        # Unknown shape: nothing was stored for it before either.
        return

    owner_mid = user_info_json.get('data').get('mid')
    for category in categories:
        uv_info = package.package_video_count(category, owner_mid)
        db.insert_or_update(item=uv_info.return_tup(),
                            sql=table_sql.replace_uv_count())
Esempio n. 4
0
def update_parse_user():
    """Run ``parse_user_info`` for every row of the detect list.

    Opens the database engine, selects ``table_sql.query_detect_list(0)``,
    publishes the row count as ``total_user`` and, after each successfully
    parsed user, the 1-based position as ``current_user``.  A failure for one
    user prints its traceback and moves on to the next.  The database is
    closed once the loop has finished.
    """
    global db
    db.start_sql_engine()
    rows = db.select(table_sql.query_detect_list(0))
    gl.set_value('total_user', len(rows))
    # Iterate over a snapshot of the result, counting from 1.
    for position, row in enumerate(list(rows), start=1):
        try:
            parse_user_info(row[0])
        except Exception:
            print(traceback.format_exc())
        else:
            gl.set_value('current_user', position)
    db.close_db()
Esempio n. 5
0
def update_old_video(aid):
    """Fetch the current info of video ``aid`` and store it.

    When the response carries a non-zero ``code`` (the video does not exist
    or has been deleted), the SQL from
    ``table_sql.update_complete_video_detect(aid)`` is executed instead and
    nothing else is stored.  Otherwise the packaged video info is inserted
    and ``table_sql.update_video_detect_time(aid)`` is executed.

    Args:
        aid: id of the video to refresh.

    Returns:
        None.

    Raises:
        Exception: if the fetch returned ``None``.  This mirrors the explicit
            check in ``parse_user_info``; previously the failure surfaced as
            an opaque AttributeError on ``None.get``.
    """
    gl.set_value("current_id", "aid" + str(aid))
    global db
    video_json = prase_content.return_json(api.return_video_info(aid), None,
                                           return_header())
    if video_json is None:
        raise Exception(str(aid) + "遭遇反扒")

    # The video does not exist or has been deleted: mark detection complete.
    if video_json.get('code') != 0:
        db.insert_or_update(item=None,
                            sql=table_sql.update_complete_video_detect(aid))
        return

    video_info = package.package_video_info(video_json)
    db.insert_or_update(item=video_info.return_tup(),
                        sql=table_sql.insert_video_info())
    db.insert_or_update(item=None, sql=table_sql.update_video_detect_time(aid))
Esempio n. 6
0
def update_video():
    """Scheduled task wrapper around ``spider.update_video``.

    Publishes the task id, a "continue" status and the start time through
    ``gl``, runs the spider, and logs the elapsed wall-clock time in whole
    seconds.  On any exception the traceback is logged, the status is set to
    the error value and the ``update_video`` scheduler job is paused.
    """
    try:
        app.logger.debug('开始update_video')
        gl.set_value('current_task', enum.TaskList.update_video.value)
        gl.set_value('status', enum.Status.Continue.value)
        start = datetime.datetime.now()
        gl.set_value('start_time', start)

        spider.update_video()

        end = datetime.datetime.now()
        # end - start, not start - end: the reversed subtraction yields a
        # negative timedelta whose .seconds is ~86400 minus the real value.
        # total_seconds() also keeps whole days that .seconds would drop.
        runtime = end - start
        app.logger.debug('runtime:' + str(int(runtime.total_seconds())))
    except Exception:
        app.logger.error(traceback.format_exc())
        gl.set_value('status', enum.Status.Error.value)
        scheduler.pause_job('update_video')
Esempio n. 7
0
def insert_new_video(mid):
    """Register videos that user ``mid`` has uploaded since yesterday's count.

    Yesterday's video count is read from the database; if no row exists the
    function returns without doing anything.  The current count is then
    fetched, and when it is larger, up to 100 entries of the user's video
    list are requested and ``new_video_detect`` is called for each returned
    ``aid``.  The ``current_video`` and ``total_video`` progress values are
    advanced once per registered video.

    Args:
        mid: id of the user to check.

    Returns:
        None.

    Raises:
        Exception: if either fetch returned ``None`` (same convention as
            ``parse_user_info``).
    """
    gl.set_value("current_id", "mid" + str(mid))
    global db
    yesterday_rows = db.select(
        table_sql.query_yesterday_user_video_count(mid))
    if len(yesterday_rows) == 0:
        # No data recorded for that day.
        return
    yesterday_video_count = yesterday_rows[0][0]

    current_video_count_json = prase_content.return_json(
        api.return_user_video_count(mid, None), None, return_header())
    time.sleep(0.2)
    if current_video_count_json is None:
        raise Exception(str(mid) + "遭遇反扒")

    current_video_count = current_video_count_json.get('data').get('count')
    if yesterday_video_count >= current_video_count:
        return

    # Request only as many entries as there are new videos, capped at 100.
    pagesize = min(current_video_count - yesterday_video_count, 100)
    current_video_list_json = prase_content.return_json(
        api.return_user_video_count(mid, pagesize), None, return_header())
    if current_video_list_json is None:
        raise Exception(str(mid) + "遭遇反扒")

    # A missing/None list is treated as empty instead of raising TypeError.
    video_list = current_video_list_json.get('data').get('vlist') or []
    if len(video_list) < pagesize:
        logging.error(
            "mid %s: expected %s new videos but the response listed only %s",
            mid, pagesize, len(video_list))

    for video in video_list[:pagesize]:
        aid = video.get('aid')
        gl.set_value('current_video', gl.get_value('current_video') + 1)
        gl.set_value('total_video', gl.get_value('current_video'))
        # Start monitoring the video.
        new_video_detect(aid)