Ejemplo n.º 1
0
def enter_space(user_id):
    """
    Open a user's space page and, on success, fetch the user's basic info.

    Requests ``https://space.bilibili.com/<user_id>``. On HTTP 200 it calls
    ``get_basic_userinfo(user_id)``; on any other status it logs the code.
    Connection problems and timeouts are logged and swallowed.

    :param user_id: user id appended to the space URL
    :return: None
    """
    try:
        url = 'https://space.bilibili.com/' + str(user_id)
        response = requests.get(url, headers=headers, timeout=6)
        if response.status_code == 200:
            get_basic_userinfo(user_id)
        else:
            logger.info('进入主页失败, use_id = {}, code = {}'.format(
                user_id, response.status_code))
    # The bare ConnectionError name is kept as-is (it may be the builtin or an
    # imported class); the requests classes are added so that the connection
    # failures and the timeout configured above are handled here as well.
    except (ConnectionError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as e:
        # Was "'...{}', format(e)": the comma passed the text as a stray
        # logging argument instead of formatting it into the message.
        logger.error('网络连接异常,e = {}'.format(e))
Ejemplo n.º 2
0
def save_userinfo_mysql(result):
    """
    Insert one user's info row into MySQL unless the user is already stored.

    Related: database ``bilibili``, table ``bilibili_userinfo``.
    The table is first queried for rows whose ``mid`` equals ``result[0]``;
    the insert only happens when none exist. A failed insert is rolled back
    and logged. ``conn.commit()`` is always called at the end.

    :param result: sequence of column values for the INSERT placeholders;
        ``result[0]`` is the user's mid
    :return: None
    """
    # Bind the mid as a query parameter (same %s style the INSERT below
    # already uses) instead of formatting it into the SQL text.
    sql_select = 'select count(*) from bilibili_userinfo where mid = %s;'
    cur.execute(sql_select, (result[0],))
    count = cur.fetchall()[0][0]
    logger.info('count = {}'.format(count))
    if count != 0:
        logger.info('用户个人信息在数据库中已存在, user_id = {}'.format(result[0]))
    else:
        sql_insert = 'insert into bilibili_userinfo values(NULL, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,' \
                     ' %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
        try:
            cur.execute(sql_insert, result)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate.
            conn.rollback()
            logger.info('用户个人信息保存到数据库中失败, user_id = {}'.format(result[0]))
    conn.commit()
Ejemplo n.º 3
0
def get_add_userview(user_id):
    """
    Fetch a user's archive view count and article view count.

    Calls ``https://api.bilibili.com/x/space/upstat?mid=<user_id>`` and reads
    ``data.archive.view`` and ``data.article.view`` from the JSON reply.

    :param user_id: user id sent as the ``mid`` query parameter
    :return: ``(archive_view, article_view)`` tuple, or ``None`` when the
        request fails, the status is not 200, or the reply has no ``data``
    """
    try:
        url = 'https://api.bilibili.com/x/space/upstat?mid=' + str(user_id)
        response = requests.get(url, headers=headers, timeout=6)
        if response.status_code != 200:
            logger.info('获取用户播放数和阅读数失败, use_id = {}, code = {}'.format(
                user_id, response.status_code))
            return None

        data = response.json().get('data')
        if not data:
            logger.info('获取用户播放数和阅读数失败, use_id = {}'.format(user_id))
            return None

        return (data['archive']['view'], data['article']['view'])
    # The bare ConnectionError name is kept as-is; the requests classes are
    # added so connection failures and the configured timeout are handled.
    except (ConnectionError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as e:
        # Was "'...{}', format(e)": the comma passed the text as a stray
        # logging argument instead of formatting it into the message.
        logger.error('网络连接异常,e = {}'.format(e))
        return None
Ejemplo n.º 4
0
def get_add_userfollow(user_id):
    """
    Fetch a user's following count and follower count.

    Calls ``https://api.bilibili.com/x/relation/stat?vmid=<user_id>`` and
    reads ``data.following`` and ``data.follower`` from the JSON reply.

    :param user_id: user id sent as the ``vmid`` query parameter
    :return: ``(following, follower)`` tuple (in that order), or ``None``
        when the request fails, the status is not 200, or the reply has no
        ``data``
    """
    try:
        url = 'https://api.bilibili.com/x/relation/stat?vmid=' + str(user_id)
        response = requests.get(url, headers=headers, timeout=6)
        if response.status_code != 200:
            logger.info('获取用户粉丝数和关注数失败, use_id = {}, code = {}'.format(
                user_id, response.status_code))
            return None

        data = response.json().get('data')
        if not data:
            logger.info('获取用户粉丝数和关注数失败, use_id = {}'.format(user_id))
            return None

        return (data['following'], data['follower'])
    # The bare ConnectionError name is kept as-is; the requests classes are
    # added so connection failures and the configured timeout are handled.
    except (ConnectionError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as e:
        # Was "'...{}', format(e)": the comma passed the text as a stray
        # logging argument instead of formatting it into the message.
        logger.error('网络连接异常,e = {}'.format(e))
        return None
Ejemplo n.º 5
0
def get_userfollowing_list(user_id, user_name):
    """
    Fetch a user's following list, store the relations and continue crawling.

    Calls ``https://api.bilibili.com/x/relation/followings?vmid=<user_id>``.
    From the reply it builds ``followings``: the reported total first, then up
    to 50 ``(mid, uname)`` tuples. That list is passed to
    ``save_userinfo_mysql(followings, user_id, user_name)``, ``user_id`` is
    appended to the module-level ``complete`` list, and
    ``get_userfollowing_list_repeat(followings)`` is called.

    :param user_id: user id sent as the ``vmid`` query parameter
    :param user_name: name of that user, forwarded to the save step
    :return: None
    """
    try:
        url = 'https://api.bilibili.com/x/relation/followings?vmid=' + str(
            user_id)
        response = requests.get(url, headers=headers, timeout=6)
        if response.status_code != 200:
            logger.info('获取用户关注列表失败, use_id = {}, code = {}'.format(
                user_id, response.status_code))
            return

        data = response.json().get('data')
        if not data:
            logger.info('获取用户关注列表失败, use_id = {}, user_name = {}'.format(
                user_id, user_name))
            return

        totals = data['total']
        # The first element keeps the uncapped total reported by the API.
        followings = [totals]

        # Due to a system limit only the first 50 followings can be fetched.
        for i in range(min(totals, 50)):
            try:
                entry = data['list'][i]
                followings.append((entry['mid'], entry['uname']))
            except (KeyError, IndexError, TypeError):
                # The list is shorter than the reported total or an entry is
                # malformed: keep what was collected so far.
                break

        # Save the user relations to the MySQL database.
        save_userinfo_mysql(followings, user_id, user_name)

        complete.append(user_id)
        logger.info(complete)

        get_userfollowing_list_repeat(followings)
    # The bare ConnectionError name is kept as-is; the requests classes are
    # added so connection failures and the configured timeout are handled.
    except (ConnectionError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as e:
        # Was "'...{}', format(e)": the comma passed the text as a stray
        # logging argument instead of formatting it into the message.
        logger.error('网络连接异常,e = {}'.format(e))
Ejemplo n.º 6
0
def save_userinfo_mysql(followings, user_id, user_name):
    """
    Store the follow relations of one user in MySQL.

    Related: database ``bilibili``, table ``bilibili_userrelation``.
    Each pair of users is kept as one row with the smaller mid in
    ``user1_mid``. ``status`` is 0 when user1 follows user2, 1 when user2
    follows user1, and 2 when they follow each other. For every followed
    user the row is inserted if missing, or promoted to status 2 when the
    opposite direction is already recorded.

    :param followings: list whose first item is the total following count and
        whose remaining items are ``(mid, uname)`` tuples; at most the first
        50 tuples are processed
    :param user_id: mid of the user who follows the listed users
    :param user_name: name of that user
    :return: None
    """
    try:
        totals = min(followings[0], 50)

        # One direction: A follows B
        sql_A2B = 'insert into bilibili_userrelation(user1_mid, user1_name, user2_mid, user2_name, status) ' \
                     'values(%s, %s, %s, %s, 0);'
        # One direction: B follows A
        sql_B2A = 'insert into bilibili_userrelation(user1_mid, user1_name, user2_mid, user2_name, status) ' \
                     'values(%s, %s, %s, %s, 1);'
        # Both directions: A and B follow each other
        sql_AB = 'update bilibili_userrelation set status=2 where user1_mid = %s AND user2_mid = %s;'
        # Check whether a row for the pair already exists
        sql_selectAB = 'select count(*) from bilibili_userrelation where user1_mid = %s AND user2_mid = %s;'
        sql_selectABstatus = 'select status from bilibili_userrelation where user1_mid = %s AND user2_mid = %s;'

        for row in followings[1:totals + 1]:
            if user_id < row[0]:
                # The crawled user is user1, so "user follows row" is status 0.
                cur.execute(sql_selectAB, (user_id, row[0]))
                count = cur.fetchall()[0][0]
                if 0 == count:
                    result = (user_id, user_name) + row
                    try:
                        cur.execute(sql_A2B, result)
                    except Exception:
                        conn.rollback()
                        logger.info('用户关系信息保存到数据库中失败,A2BB,mid分别是{}和{}'.format(
                            user_id, row[0]))
                elif 1 == count:
                    try:
                        cur.execute(sql_selectABstatus, (user_id, row[0]))
                        status = cur.fetchone()[0]
                        if 1 == status:
                            # The opposite direction is stored: now mutual.
                            try:
                                cur.execute(sql_AB, (user_id, row[0]))
                            except Exception:
                                conn.rollback()
                                logger.info(
                                    '用户关系信息保存到数据库中失败,A2B,mid分别是{}和{}'.format(
                                        user_id, row[0]))
                        else:
                            logger.info(
                                '用户关系信息在数据库中,A2B已经执行过,mid分别是{}和{}'.format(
                                    user_id, row[0]))
                    except Exception:
                        conn.rollback()
                        logger.info('用户关系信息保存到数据库中失败,A2B,mid分别是{}和{}'.format(
                            user_id, row[0]))
                else:
                    logger.info('用户关系信息在数据库中重复有多条,A2B,mid分别是{}和{}'.format(
                        user_id, row[0]))
            elif user_id > row[0]:
                # The crawled user is user2, so "user follows row" is status 1.
                result = row + (user_id, user_name)
                cur.execute(sql_selectAB, (row[0], user_id))
                count = cur.fetchall()[0][0]
                if 0 == count:
                    try:
                        cur.execute(sql_B2A, result)
                    except Exception:
                        conn.rollback()
                        logger.info('用户关系信息保存到数据库中失败,B2AA,mid分别是{}和{}'.format(
                            row[0], user_id))
                elif 1 == count:
                    try:
                        cur.execute(sql_selectABstatus, (row[0], user_id))
                        status = cur.fetchone()[0]
                    except Exception:
                        logger.info('用户关系信息查询状态出错,B2A,mid分别是{}和{}'.format(
                            row[0], user_id))
                        # Without a status there is nothing to decide on;
                        # previously execution fell through and read an
                        # unbound (or stale) `status`.
                        continue
                    if 0 == status:
                        # The opposite direction is stored: now mutual.
                        try:
                            cur.execute(sql_AB, (row[0], user_id))
                        except Exception:
                            conn.rollback()
                            logger.info(
                                '用户关系信息保存到数据库中失败,B2A,mid分别是{}和{}'.format(
                                    row[0], user_id))
                    else:
                        logger.info('用户关系信息在数据库中,B2A已经执行过,mid分别是{}和{}'.format(
                            row[0], user_id))
                else:
                    logger.info('用户关系信息在数据库中重复有多条,B2A,mid分别是{}和{}'.format(
                        row[0], user_id))
            else:
                # A user following themselves. Was "'...{}', format(user_id)":
                # the comma broke the message formatting.
                logger.info('用户关系信息出错,自关,mid是{}'.format(user_id))
    except ConnectionError as e:
        # Was "'...{}', format(e)": the comma passed the text as a stray
        # logging argument instead of formatting it into the message.
        logger.error('网络连接异常,e = {}'.format(e))
Ejemplo n.º 7
0
                                '用户关系信息保存到数据库中失败,B2A,mid分别是{}和{}'.format(
                                    row[0], user_id))
                    else:
                        logger.info('用户关系信息在数据库中,B2A已经执行过,mid分别是{}和{}'.format(
                            row[0], user_id))
                else:
                    logger.info('用户关系信息在数据库中重复有多条,B2A,mid分别是{}和{}'.format(
                        row[0], user_id))
            else:
                logger.info('用户关系信息出错,自关,mid是{}', format(user_id))
    except ConnectionError as e:
        logger.error('网络连接异常,e = {}', format(e))


# Script entry point: crawl user relations starting from one hard-coded user
# and measure how long the run takes.
if __name__ == '__main__':
    logger.info('运行开始,开始爬取B站用户的用户关系')

    # Wall-clock start of the crawl.
    time_start = datetime.datetime.now()

    # Starting user for the crawl (id and name are hard-coded).
    # user_id = 546195
    user_id = 2
    user_name = '碧诗'
    run(user_id, user_name)

    # Manual check of save_userinfo_mysql with a hand-made followings list.
    # user_id = 1
    # user_name = 'test'
    # followings = [1,(2,'a')]
    # save_userinfo_mysql(followings,user_id,user_name)

    time_end = datetime.datetime.now()
    # NOTE(review): timedelta.seconds is only the seconds component (days are
    # dropped); total_seconds() would give the full duration.
    # NOTE(review): this binds the module-level name `time`; if the file also
    # imports the `time` module, that name is shadowed from here on - confirm.
    time = (time_end - time_start).seconds
Ejemplo n.º 8
0
def get_basic_userinfo(user_id):
    """
    Fetch a user's basic info, enrich it with counters and save it to MySQL.

    Posts ``mid=<user_id>`` to
    ``https://space.bilibili.com/ajax/member/GetInfo``, builds the row tuple
    from the reply's ``data`` object, appends the tuples returned by
    ``get_add_userfollow``, ``get_add_usercount`` and ``get_add_userview``,
    and passes the result to ``save_userinfo_mysql``. The module-level
    ``total`` counter is incremented after each save call and logged every
    100 users.

    :param user_id: user id sent as ``mid`` and used in the Referer header
    :return: None
    """
    global total

    url = 'https://space.bilibili.com/ajax/member/GetInfo'
    payload = {'mid': user_id}
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
        '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Referer': 'https://space.bilibili.com/' + str(user_id)
    }
    try:
        response = requests.post(url, headers=head, data=payload, timeout=6)
        if response.status_code != 200:
            logger.info('获取用户个人信息失败, use_id = {}, code = {}'.format(
                user_id, response.status_code))
            return

        data = response.json().get('data')
        if not data:
            logger.info('获取用户个人信息失败, use_id = {}'.format(user_id))
            return

        # regtime and birthday may be missing from the reply; fall back to 0.
        try:
            regtime = time.strftime("%Y-%m-%d %H:%M:%S",
                                    time.localtime(data['regtime']))
        except (KeyError, TypeError, ValueError, OverflowError, OSError):
            logger.info(
                '用户没有regtime这个标签, user_id = {}'.format(user_id))
            regtime = 0
        try:
            birthday = data['birthday']
        except (KeyError, TypeError):
            logger.info(
                '用户没有birthday这个标签, user_id = {}'.format(user_id))
            # Only birthday falls back here; the previous code also reset
            # regtime to 0, discarding a valid registration time.
            birthday = 0

        result = (
            data['mid'],
            data['name'],
            data['sex'],
            data['rank'],
            data['face'],
            regtime,
            data['spacesta'],
            birthday,
            data['sign'],
            data['level_info']['current_level'],
            data['official_verify']['desc'],
            data['vip']['vipType'],
            data['vip']['vipStatus'],
            data['toutu'],
            data['toutuId'],
            data['theme'],
            data['theme_preview'],
            data['coins'],
            data['im9_sign'],
            data['fans_badge'])

        # Append the extra counters in column order. A helper that returns
        # None (its failure path) would make `result += None` raise
        # TypeError, so skip this user instead of aborting the crawl.
        for fetch_extra in (get_add_userfollow, get_add_usercount,
                            get_add_userview):
            extra = fetch_extra(user_id)
            if extra is None:
                logger.info('获取用户附加信息失败, user_id = {}'.format(user_id))
                return
            result += extra

        # Save the user's info to the MySQL database.
        save_userinfo_mysql(result)
        total += 1
        if total % 100 == 0:
            logger.info('目前共计爬取到{}条数据'.format(total))
    # The bare ConnectionError name is kept as-is; the requests classes are
    # added so connection failures and the configured timeout are handled.
    except (ConnectionError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as e:
        # Was "'...{}', format(e)": the comma passed the text as a stray
        # logging argument instead of formatting it into the message.
        logger.error('网络连接异常,e = {}'.format(e))
Ejemplo n.º 9
0
    cur.execute(sql_select)
    count = cur.fetchall()[0][0]
    logger.info('count = {}'.format(count))
    if 0 != count:
        logger.info('用户个人信息在数据库中已存在, user_id = {}'.format(result[0]))
    else:
        sql_insert = 'insert into bilibili_userinfo values(NULL, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,' \
                     ' %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
        try:
            cur.execute(sql_insert, result)
            # logger.info('用户个人信息保存到数据库中成功, user_id = {}'.format(result[0]))
        except:
            conn.rollback()
            logger.info('用户个人信息保存到数据库中失败, user_id = {}'.format(result[0]))
    conn.commit()


# Script entry point: crawl user ids 102..200, then report how many rows were
# collected and how long the run took.
if __name__ == '__main__':
    logger.info('运行开始,开始爬取B站用户个人信息数据')

    time_start = datetime.datetime.now()

    try:
        for user_id in range(102, 201):
            run(user_id)

        time_end = datetime.datetime.now()
        # total_seconds() covers the whole duration (timedelta.seconds drops
        # whole days). A separate name is used so the module-level name
        # `time` is not rebound to an integer.
        elapsed = int((time_end - time_start).total_seconds())

        logger.info('运行结束,共计爬取到{}条数据,运行时间是{}秒'.format(total, elapsed))
    finally:
        # Close the database connection even if the crawl raised.
        conn.close()