Esempio n. 1
0
def _pt_detail_text(node):
    """Return the text of the first ``pt_detail`` element inside *node*."""
    return node.find(attrs={'class': 'pt_detail'}).get_text()


def _fill_basic_info(user, rows):
    """Copy the '基本信息' (basic info) rows found in *rows* onto *user*.

    The branch order matters because labels are matched by substring and the
    first matching branch wins for each row.
    """
    for row in rows:
        label = row.get_text()
        if '昵称' in label:
            user.screen_name = _pt_detail_text(row)
        elif '所在地' in label:
            location = _pt_detail_text(row)
            user.location = location
            if ' ' in location:
                # "province city": the first two space-separated tokens.
                parts = location.split(' ')
                user.province = parts[0]
                user.city = parts[1]
            else:
                user.province = location
                user.city = ''
        elif '性别' in label:
            user.gender = _pt_detail_text(row)
        elif '性取向' in label:
            user.gender_prefer = _pt_detail_text(row)
        elif '生日' in label:
            user.birthday = _pt_detail_text(row)
        elif '血型' in label:
            user.blood_type = _pt_detail_text(row)
        elif '博客' in label:
            user.blog_url = row.find('a').get_text()
        elif '简介' in label:
            # Round-trip through gbk to drop characters gbk cannot represent.
            user.description = _pt_detail_text(row).encode('gbk', 'ignore').decode('gbk')
        elif '注册时间' in label:
            user.register_time = _pt_detail_text(row).replace('\t', '').replace('\r\n', '')
        elif '个性域名' in label:
            user.domain_name = row.find('a').get_text()


def _fill_tag_info(user, rows):
    """Store the '标签' (tags) row as a ';'-separated string on *user*."""
    for row in rows:
        if '标签' in row.get_text():
            tags = _pt_detail_text(row).replace('\t', '').replace('\n\n\n', '')
            user.owntag_info = tags.strip().replace('\r\n', ';')


def _fill_education_info(user, rows):
    """Store the '大学' (university) row as a flattened string on *user*."""
    for row in rows:
        if '大学' in row.get_text():
            school_info = _pt_detail_text(row).replace('\r\n', ',').replace('\t', '')
            user.educate_info = school_info.replace('\n', ';').strip(';')


def _fill_work_info(user, rows):
    """Join every '公司' (company) entry into ``user.work_info`` with ';'."""
    jobs_info = []
    for row in rows:
        if '公司' in row.get_text():
            for job in row.find_all(attrs={'class': 'pt_detail'}):
                jobs_info.append(
                    job.get_text().replace('\r\n', '').replace('\t', '').replace('\n', ''))
    user.work_info = ';'.join(jobs_info)


def _fill_contact_info(user, rows):
    """Join QQ / email / MSN entries into ``user.contact_info`` with ';'."""
    contact_info = []
    for row in rows:
        label = row.get_text()
        # Independent checks: a single row may mention several contact kinds.
        if 'QQ' in label:
            contact_info.append('qq:' + _pt_detail_text(row).replace('\n', ''))
        if '邮箱' in label:
            contact_info.append('email:' + _pt_detail_text(row))
        if 'MSN' in label:
            contact_info.append('msn:' + _pt_detail_text(row))
    user.contact_info = ';'.join(contact_info)


def get_detail(html):
    """Parse a personal-info page and return a populated ``User``.

    The fragment produced by ``get_publicinfo.get_right(html)`` is parsed with
    BeautifulSoup. Every ``WB_cardwrap S_bg2`` module is inspected and, based
    on its title, the basic / tag / education / work / contact fields are
    filled in.

    Unlike the previous implementation, which returned from inside the loop
    (``finally: return user``) and therefore handled only the first module,
    all modules are now processed. A parsing error in one module is reported
    and does not prevent the remaining modules from being parsed.

    :param html: raw page source handed to ``get_publicinfo.get_right``.
    :return: a ``User`` instance; fields that could not be found are left
        untouched. A ``User`` is always returned (never ``None``), including
        when the fragment is empty or contains no modules.
    """
    user = User()
    cont = get_publicinfo.get_right(html)
    if not cont:
        return user

    soup = BeautifulSoup(cont, 'html.parser')
    row_attrs = {'class': 'li_1 clearfix'}
    modules = soup.find_all(attrs={'class': 'WB_cardwrap S_bg2'})
    # Basic info has always been read from every row in the fragment rather
    # than from the rows of its own module; that scope is kept as-is.
    all_rows = soup.find_all(attrs=row_attrs)

    for each_module in modules:
        try:
            title = each_module.find(attrs={'class': 'main_title W_fb W_f14'}).get_text()
            if '基本信息' in title:
                _fill_basic_info(user, all_rows)
            if '标签信息' in title:
                _fill_tag_info(user, each_module.find_all(attrs=row_attrs))
            if '教育信息' in title:
                _fill_education_info(user, each_module.find_all(attrs=row_attrs))
            if '工作信息' in title:
                _fill_work_info(user, each_module.find_all(attrs=row_attrs))
            if '联系信息' in title:
                _fill_contact_info(user, each_module.find_all(attrs=row_attrs))
        except Exception as why:
            # Best-effort scraping: report the problem and move on to the
            # next module instead of discarding everything parsed so far.
            print('解析出错,具体原因为{why}'.format(why=why))

    return user
Esempio n. 2
0
def get_profile(user_id, session, headers):
    """
    Return the ``User`` for *user_id*, from storage if present, else by crawling.

    Original author notes (translated):
    Accounts are assumed to be ordinary personal users by default. A writer
    account needs one more fetch, while an enterprise account gets redirected
    to its enterprise home page, which can be parsed directly.
    After logging in, http://weibo.com/u/userId can be used to locate a user's
    home page; it is unclear how stable that is. TODO: test this path.
    'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more' seems to
    cover most cases, except that non-ordinary users are redirected to their
    home page, and for some it does not work at all, e.g. domain=100106.

    :param user_id: user id as a string; an empty string yields an empty User.
    :param session: passed through to ``get_page``.
    :param headers: passed through to ``get_page``.
    :return: a ``User`` instance (possibly empty when nothing could be fetched).
    """
    if user_id == '':
        return User()

    user = User()
    stored = get_user(user_id)

    if stored:
        # (attribute on User, key in the stored record), in assignment order.
        field_map = (
            ('screen_name', 'name'),
            ('province', 'province'),
            ('city', 'city'),
            ('location', 'location'),
            ('description', 'description'),
            ('headimg_url', 'headimg_url'),
            ('blog_url', 'blog_url'),
            ('domain_name', 'domain_name'),
            ('gender', 'gender'),
            ('followers_count', 'followers_count'),
            ('friends_count', 'friends_count'),
            ('status_count', 'status_count'),
            ('birthday', 'birthday'),
            ('verify_type', 'verify_type'),
            ('verify_info', 'verify_info'),
            ('register_time', 'register_time'),
        )
        user.id = user_id
        for attr_name, record_key in field_map:
            setattr(user, attr_name, stored.get(record_key))

        # Avoid encode() problems later when inserting into the database.
        attributes = user.__dict__
        for key in attributes:
            if attributes[key] is None:
                setattr(user, key, '')

        storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
        return user

    # Not stored yet: crawl the profile page.
    url = 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
    html = get_page(url, session, headers)
    if is_404(html):
        return user

    domain = get_publicinfo.get_userdomain(html)
    if domain in ('100505', '103505', '100306'):
        user = get_personalinfo.get_detail(html)
        if user is None:
            user = User()
        else:
            user.followers_count = get_personalinfo.get_fans(html)
            user.friends_count = get_personalinfo.get_friends(html)
            user.status_count = get_personalinfo.get_status(html)
    else:
        # To fetch as few URLs as possible, not every service-account
        # domain is handled here.
        if domain == '100106':
            url = 'http://weibo.com/p/' + domain + user_id + '/home'
            html = get_page(url, session, headers)
            if html == '':
                return user

        user.followers_count = get_enterpriseinfo.get_fans(html)
        user.friends_count = get_enterpriseinfo.get_friends(html)
        user.status_count = get_enterpriseinfo.get_status(html)
        raw_description = get_enterpriseinfo.get_description(html)
        user.description = raw_description.encode('gbk', 'ignore').decode('gbk')

    user.id = user_id
    user.screen_name = get_publicinfo.get_username(html)
    user.headimg_url = get_publicinfo.get_headimg(html)
    verify_type = get_publicinfo.get_verifytype(html)
    user.verify_type = verify_type
    user.verify_info = get_publicinfo.get_verifyreason(html, user.verify_type)

    save_user(user)
    storage.info('已经成功保存ID为{id}的用户信息'.format(id=user_id))

    return user