def _parse_basic(user, basic_info):
    # Fill the fields of the '基本信息' (basic info) card: nickname, location,
    # gender, birthday, blood type, blog, description, register time, domain.
    for each in basic_info:
        each_str = each.get_text()
        if '昵称' in each_str:
            user.screen_name = each.find(attrs={'class': 'pt_detail'}).get_text()
        elif '所在地' in each_str:
            location = each.find(attrs={'class': 'pt_detail'}).get_text()
            user.location = location
            if ' ' in location:
                # "province city" separated by a single space
                parts = location.split(' ')
                user.province = parts[0]
                user.city = parts[1]
            else:
                user.province = location
                user.city = ''
        elif '性取向' in each_str:
            # Checked before '性别' would also match this row's text in the
            # original elif chain order; keep both branches distinct.
            user.gender_prefer = each.find(attrs={'class': 'pt_detail'}).get_text()
        elif '性别' in each_str:
            user.gender = each.find(attrs={'class': 'pt_detail'}).get_text()
        elif '生日' in each_str:
            user.birthday = each.find(attrs={'class': 'pt_detail'}).get_text()
        elif '血型' in each_str:
            user.blood_type = each.find(attrs={'class': 'pt_detail'}).get_text()
        elif '博客' in each_str:
            user.blog_url = each.find('a').get_text()
        elif '简介' in each_str:
            description = each.find(attrs={'class': 'pt_detail'}).get_text()
            # gbk round-trip drops characters the database encoding rejects
            user.description = description.encode('gbk', 'ignore').decode('gbk')
        elif '注册时间' in each_str:
            user.register_time = each.find(attrs={'class': 'pt_detail'}).get_text() \
                .replace('\t', '').replace('\r\n', '')
        elif '个性域名' in each_str:
            user.domain_name = each.find('a').get_text()


def _parse_tags(user, each_module):
    # Fill user.owntag_info from the '标签信息' (tags) card.
    rows = each_module.find_all(attrs={'class': 'li_1 clearfix'})
    for each in rows:
        if '标签' in each.get_text():
            tags = each.find(attrs={'class': 'pt_detail'}).get_text() \
                .replace('\t', '').replace('\n\n\n', '').strip().replace('\r\n', ';')
            user.owntag_info = tags


def _parse_education(user, each_module):
    # Fill user.educate_info from the '教育信息' (education) card.
    rows = each_module.find_all(attrs={'class': 'li_1 clearfix'})
    for each in rows:
        if '大学' in each.get_text():
            school_info = each.find(attrs={'class': 'pt_detail'}).get_text() \
                .replace('\r\n', ',').replace('\t', '').replace('\n', ';') \
                .lstrip(';').rstrip(';')
            user.educate_info = school_info


def _parse_jobs(user, each_module):
    # Fill user.work_info (';'-joined company entries) from the '工作信息' card.
    rows = each_module.find_all(attrs={'class': 'li_1 clearfix'})
    jobs_info = []
    for each in rows:
        if '公司' in each.get_text():
            for job in each.find_all(attrs={'class': 'pt_detail'}):
                jobs_info.append(
                    job.get_text().replace('\r\n', '').replace('\t', '').replace('\n', ''))
    user.work_info = ';'.join(jobs_info)


def _parse_contact(user, each_module):
    # Fill user.contact_info (qq/email/msn, ';'-joined) from the '联系信息' card.
    rows = each_module.find_all(attrs={'class': 'li_1 clearfix'})
    contact_info = []
    for each in rows:
        text = each.get_text()
        if 'QQ' in text:
            contact_info.append(
                'qq:' + each.find(attrs={'class': 'pt_detail'}).get_text().replace('\n', ''))
        if '邮箱' in text:
            contact_info.append('email:' + each.find(attrs={'class': 'pt_detail'}).get_text())
        if 'MSN' in text:
            contact_info.append('msn:' + each.find(attrs={'class': 'pt_detail'}).get_text())
    user.contact_info = ';'.join(contact_info)


def get_detail(html):
    """Parse a weibo profile page into a ``User`` object.

    :param html: raw HTML of the user's info page
    :return: a ``User`` populated with whatever info cards were found;
             an empty ``User`` if the page content could not be extracted.
    """
    user = User()
    cont = get_publicinfo.get_right(html)
    if cont == '':
        return user
    soup = BeautifulSoup(cont, 'html.parser')
    basic_modules = soup.find_all(attrs={'class': 'WB_cardwrap S_bg2'})
    basic_info = soup.find_all(attrs={'class': 'li_1 clearfix'})
    for each_module in basic_modules:
        try:
            basic_str = each_module.find(attrs={'class': 'main_title W_fb W_f14'}).get_text()
            if '基本信息' in basic_str:
                _parse_basic(user, basic_info)
            if '标签信息' in basic_str:
                _parse_tags(user, each_module)
            if '教育信息' in basic_str:
                _parse_education(user, each_module)
            if '工作信息' in basic_str:
                _parse_jobs(user, each_module)
            if '联系信息' in basic_str:
                _parse_contact(user, each_module)
        except Exception as why:
            # Keep going with the remaining cards; one broken card should not
            # abort the whole parse.
            print('解析出错,具体原因为{why}'.format(why=why))
    # BUG FIX: the original ended the loop with ``finally: return user``, which
    # returned after the FIRST module (skipping the tag/education/job/contact
    # cards) and also swallowed any exception in flight.  Return once all
    # modules have been processed instead.
    return user
def get_profile(user_id, session, headers):
    """Return the ``User`` profile for ``user_id``, fetching it if needed.

    Accounts are assumed to be personal by default; writer accounts need one
    more fetch, and enterprise accounts redirect to their home page, which is
    parsed directly.  After login http://weibo.com/u/userId should locate the
    user's home page — stability unknown, todo: test that path.  The path
    'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more' appears to
    handle most cases; non-ordinary users get redirected to their home page,
    but some (e.g. domain=100106) do not.
    """
    if user_id == '':
        return User()

    user = User()
    info = get_user(user_id)
    if info:
        # Already stored in the database: copy the cached fields onto the
        # User object instead of re-crawling.
        user.id = user_id
        user.screen_name = info.get('name')
        for attr in ('province', 'city', 'location', 'description',
                     'headimg_url', 'blog_url', 'domain_name', 'gender',
                     'followers_count', 'friends_count', 'status_count',
                     'birthday', 'verify_type', 'verify_info', 'register_time'):
            setattr(user, attr, info.get(attr))
        # Normalize None to '' so later DB inserts don't choke on encode()
        for key, value in user.__dict__.items():
            if value is None:
                setattr(user, key, '')
        storage.info('ID为{id}的用户信息已经存在于数据库中'.format(id=user_id))
    else:
        url = 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
        html = get_page(url, session, headers)
        if not is_404(html):
            domain = get_publicinfo.get_userdomain(html)
            if domain in ('100505', '103505', '100306'):
                # Ordinary / writer account: parse the info page itself.
                user = get_personalinfo.get_detail(html)
                if user is None:
                    user = User()
                else:
                    user.followers_count = get_personalinfo.get_fans(html)
                    user.friends_count = get_personalinfo.get_friends(html)
                    user.status_count = get_personalinfo.get_status(html)
            elif domain == '100106':
                # To keep the number of crawled urls small, only this service
                # domain gets a dedicated fetch of its home page.
                url = 'http://weibo.com/p/' + domain + user_id + '/home'
                html = get_page(url, session, headers)
                if html == '':
                    return user
                user.followers_count = get_enterpriseinfo.get_fans(html)
                user.friends_count = get_enterpriseinfo.get_friends(html)
                user.status_count = get_enterpriseinfo.get_status(html)
                user.description = get_enterpriseinfo.get_description(html) \
                    .encode('gbk', 'ignore').decode('gbk')
            # Fields shared by every account type.
            user.id = user_id
            user.screen_name = get_publicinfo.get_username(html)
            user.headimg_url = get_publicinfo.get_headimg(html)
            user.verify_type = get_publicinfo.get_verifytype(html)
            user.verify_info = get_publicinfo.get_verifyreason(html, user.verify_type)
            save_user(user)
            storage.info('已经成功保存ID为{id}的用户信息'.format(id=user_id))
    return user