def getPrivateUserInfo(self, uid):
    """Fetch and store the partial profile of a user whose page is private.

    A private page cannot be scraped, so the data comes from the
    ``GetUserFace.ashx`` JSONP endpoint instead.

    Args:
        uid: User id as a string (it is concatenated into URLs and log
            messages).

    Returns:
        The saved ``ListenUser``, or ``None`` when the request fails or the
        response is not a usable JSONP document.

    Raises:
        Exception: When ``user.save`` fails; the original error is chained
            as ``__cause__``.
    """
    user = ListenUser(Utils.userHost + '/u/' + uid + '/')
    # The page is private, so the record is flagged and filled from JSONP.
    user.isPrivate = 1

    # Example of the request being imitated (URL-decoded form):
    # http://bulo.hujiang.com/service/GetUserFace.ashx?ver=2016/11/23 下午8:29:28&userId=5326257&callback=jQuery17202552129787287378_1479904148908&_=1479904168992
    now = datetime.datetime.now()
    # Take the hour from the same instant as the formatted time so the
    # AM/PM marker can never disagree with it.
    half = '上午' if now.hour < 12 else '下午'
    # Millisecond epoch timestamp. Slicing str(time.time()) is unreliable
    # because the float may print with fewer than three decimals.
    timeStamp = str(int(time.time() * 1000))
    queryParams = {
        'ver': now.strftime('%Y/%m/%d {half}%I:%M:%S').format(half=half),
        'userId': uid,
        'callback': 'jQuery17202552129787287378_' + timeStamp,
        '_': timeStamp,
    }
    info_url = Utils.urlCreate(
        Utils.userHost + '/service/GetUserFace.ashx?', queryParams)

    try:
        infoRespond = requests.get(info_url, headers=Utils.headers)
    except Exception:
        self.logger.error('隐私用户信息获取失败: ' + uid)
        return None

    # Unwrap ``callback( ... )``: keep what lies between the first '(' and
    # the last ')', which also tolerates a trailing ';' or newline.
    _, opened, remainder = infoRespond.text.partition('(')
    payload, closed, _ = remainder.rpartition(')')
    try:
        if not (opened and closed):
            raise ValueError('response is not JSONP')
        info = json.loads(payload)
        if not isinstance(info, dict):
            raise ValueError('JSONP payload is not an object')
    except ValueError:
        # Handled like a failed request instead of crashing the crawl.
        self.logger.error('隐私用户信息获取失败: ' + uid)
        return None

    user.name = info.get('UserName', '')
    # The first and last characters of NickName are dropped, as before
    # (presumably wrapping quotes/brackets - confirm against live data).
    user.nickName = info.get('NickName', '')[1:-1]
    user.signature = info.get('UserSign', '')
    user.city = info.get('city', '')
    user.signinLast = info.get('PunchCount', '')
    # Previously guarded by the presence of 'PunchCount', which raised
    # KeyError whenever 'Gender' itself was missing.
    gender = info.get('Gender', '')
    if gender == '1' or gender == '0':
        user.gender = '男' if gender == '1' else '女'

    try:
        user.save(self.mysql_session)
    except Exception as err:
        self.logger.error('存储隐私用户信息失败')
        raise Exception('存储隐私用户信息失败') from err
    return user
def getUserInfo(self, uid, frequentAdd=1, frequentReduce=2):
    """Fetch, parse and store the profile of user ``uid``.

    The profile page is requested without following redirects. A non-200
    answer is either a private profile (the redirect hint starts with
    '用户'), which is filled from a JSONP endpoint, or a blocked visit,
    which is recorded for retry. A 200 answer is scraped with BeautifulSoup.

    Args:
        uid: User id as a string (it is concatenated into URLs and log
            messages).
        frequentAdd: Amount added to ``self.tooFrequent`` on a blocked
            visit once it is already non-zero.
        frequentReduce: Amount subtracted from ``self.tooFrequent`` (floored
            at 0) after a successful page fetch or private-profile save.

    Returns:
        The saved ``ListenUser``, or ``None`` when the page could not be
        fetched, was blocked, or could not be parsed.

    Raises:
        Exception: When ``user.save`` fails; the original error is chained
            as ``__cause__``.
    """
    full_url = Utils.userHost + '/u/' + uid + '/'
    user = ListenUser(full_url)
    try:
        content = self.session.get(
            full_url, headers=Utils.headers, allow_redirects=False)
    except Exception:
        self.logger.error('获取用户页面失败: ' + full_url)
        return None

    # Some users make their page private; the site then answers with a 302
    # redirect to an error page.
    if content.status_code != 200:
        self.logger.warning(
            full_url + ': redirect ' + str(content.status_code))
        # The hint is the value of the first query parameter of the redirect
        # target. A missing Location header or a query without '=' yields ''
        # (treated as a blocked visit) instead of KeyError / IndexError.
        location = content.headers.get('Location', '')
        responseText = urlparse(unquote(location)).query.partition('=')[2]
        self.logger.warning('提示: ' + responseText)

        if responseText[0:2] != '用户':
            self._noteBlockedVisit(uid, frequentAdd)
            return None

        # Private profile: fall back to the JSONP endpoint.
        if not self._loadPrivateProfile(user, uid):
            return None
        try:
            user.save(self.mysql_session)
        except Exception as err:
            self.logger.error('存储隐私用户信息失败')
            raise Exception('存储隐私用户信息失败') from err
        # Private users are counted and marked as visited.
        self.privateUids += 1
        self.userUids.add(uid)
        self._easeTooFrequent(frequentReduce)
        self.logger.debug(user)
        return user

    # This answer was not a "too frequent" rejection, so the back-off
    # counter may shrink.
    self._easeTooFrequent(frequentReduce)

    try:
        soup = BeautifulSoup(content.text, "lxml")
    except Exception:
        self.logger.error('解析用户页面失败: ' + full_url)
        return None
    if not self._scrapeProfilePage(user, soup, content.text):
        self.logger.error('解析用户页面失败: ' + full_url)
        return None

    try:
        user.save(self.mysql_session)
    except Exception as err:
        self.logger.error('存储用户信息失败')
        raise Exception('存储用户信息失败') from err
    self.userUids.add(uid)
    self.logger.debug(user)
    return user


def _easeTooFrequent(self, frequentReduce):
    """Lower ``self.tooFrequent`` by ``frequentReduce``, never below 0."""
    if self.tooFrequent > 0:
        self.tooFrequent = max(0, self.tooFrequent - frequentReduce)


def _noteBlockedVisit(self, uid, frequentAdd):
    """Record a blocked visit to ``uid`` and raise the back-off counter."""
    # Count consecutive blocked visits to the same user.
    if self.lastUserVisitInfo[0] == uid:
        self.lastUserVisitInfo[1] += 1
    else:
        self.lastUserVisitInfo[0] = uid
        self.lastUserVisitInfo[1] = 0

    if self.lastUserVisitInfo[1] <= self.failedToVisitCountLimit:
        # Still within the retry budget: queue for another attempt instead
        # of marking the uid as visited.
        self.appendUidPriority(uid)
    else:
        # Give up: remember the failure and mark the uid as visited so it
        # is not requested again.
        self.failedToVisit.append(uid)
        self.userUids.add(uid)

    # The first rejection jumps the counter to 4; later ones grow it by
    # frequentAdd.
    if self.tooFrequent == 0:
        self.tooFrequent = 4
    else:
        self.tooFrequent += frequentAdd


def _loadPrivateProfile(self, user, uid):
    """Fill ``user`` from the GetUserFace JSONP endpoint; return success."""
    # Example of the request being imitated (URL-decoded form):
    # http://bulo.hujiang.com/service/GetUserFace.ashx?ver=2016/11/23 下午8:29:28&userId=5326257&callback=jQuery17202552129787287378_1479904148908&_=1479904168992
    now = datetime.datetime.now()
    # Same instant for the hour test and the formatted time.
    half = '上午' if now.hour < 12 else '下午'
    # Millisecond epoch timestamp. Slicing str(time.time()) is unreliable
    # because the float may print with fewer than three decimals.
    timeStamp = str(int(time.time() * 1000))
    queryParams = {
        'ver': now.strftime('%Y/%m/%d {half}%I:%M:%S').format(half=half),
        'userId': uid,
        'callback': 'jQuery17202552129787287378_' + timeStamp,
        '_': timeStamp,
    }
    info_url = Utils.urlCreate(
        Utils.userHost + '/service/GetUserFace.ashx?', queryParams)

    try:
        infoRespond = requests.get(info_url, headers=Utils.headers)
    except Exception:
        self.logger.error('隐私用户信息获取失败: ' + uid)
        return False

    # Unwrap ``callback( ... )``: keep what lies between the first '(' and
    # the last ')', which also tolerates a trailing ';' or newline.
    _, opened, remainder = infoRespond.text.partition('(')
    payload, closed, _ = remainder.rpartition(')')
    try:
        if not (opened and closed):
            raise ValueError('response is not JSONP')
        info = json.loads(payload)
        if not isinstance(info, dict):
            raise ValueError('JSONP payload is not an object')
    except ValueError:
        # Handled like a failed request instead of crashing the crawl.
        self.logger.error('隐私用户信息获取失败: ' + uid)
        return False

    user.name = info.get('UserName', '')
    # The first and last characters of NickName are dropped, as before
    # (presumably wrapping quotes/brackets - confirm against live data).
    user.nickName = info.get('NickName', '')[1:-1]
    user.signature = info.get('UserSign', '')
    user.city = info.get('city', '')
    user.signinLast = info.get('PunchCount', '')
    # Previously guarded by the presence of 'PunchCount', which raised
    # KeyError whenever 'Gender' itself was missing.
    gender = info.get('Gender', '')
    if gender == '1' or gender == '0':
        user.gender = '男' if gender == '1' else '女'
    return True


def _scrapeProfilePage(self, user, soup, pageText):
    """Copy profile data from a parsed page into ``user``.

    Returns ``False`` when the name header (``#cont_h1``) is missing, in
    which case the page is treated as unparseable; ``True`` otherwise.
    """
    # --- statistics block ---
    countList = soup.find(attrs={'id': 'LeftCnt_divUserCount'})
    if countList is not None:
        # The visitor count is stored directly in the <li>.
        viewCount = countList.find(attrs={'id': 'li_viewCount'})
        if viewCount is not None and len(viewCount) != 0:
            user.viewCount = viewCount.string
        # The other counters keep their value inside an <a>: messages,
        # short posts, blog entries, dictations, speaking, gifts.
        linkedCounters = (
            ('li_msgCount', 'msgCount'),
            ('li_ingCount', 'ingCount'),
            ('li_blogCount', 'blogCount'),
            ('li_listenCount', 'listenCount'),
            ('li_talkCount', 'talkCount'),
            ('li_giftCount', 'giftCount'),
        )
        for elementId, attrName in linkedCounters:
            item = countList.find(attrs={'id': elementId})
            link = item.find('a') if item is not None else None
            # A counter without a link is skipped rather than raising
            # AttributeError.
            if link is not None:
                setattr(user, attrName, link.string)

    # --- personal information list ---
    profileBox = soup.find(id='u_profile')
    profileList = profileBox.find('ul') if profileBox is not None else None
    if profileList is not None:
        for child in profileList.children:
            if child.name != 'li':
                continue
            # Labels are matched against the text captured before any
            # span is removed below, so the checks stay independent.
            text = child.get_text(strip=True)
            if '性别' in text:
                user.gender = child.find_all('span')[1].string
            if '城市' in text:
                user.city = child.find_all('span')[1].string
            if '昵称' in text:
                # Drop the first span (presumably the label), keep the rest.
                child.span.replace_with('')
                user.nickName = child.get_text(strip=True)
            if '签名' in text:
                child.span.replace_with('')
                user.signature = child.get_text(strip=True)
            if '沪龄' in text:
                # The date is in the span's title, after a 5-character
                # prefix.
                user.registDate = child.find_all('span')[1]['title'][5:]
            if '打卡' in text:
                child.span.replace_with('')
                # The last character (presumably a unit suffix) is cut
                # before converting to int.
                user.signinLast = int(child.get_text(strip=True)[0:-1])
            if '登录' in text:
                user.lastSignin = child.find_all('span')[1].string

    # --- self introduction: the <div> just before the report link ---
    reportLink = soup.find(id='user_Profile_span_reportIt')
    selfIntro = (reportLink.find_previous_sibling()
                 if reportLink is not None else None)
    if selfIntro is not None and selfIntro.name == 'div':
        user.selfIntroduction = selfIntro.get_text(strip=True)

    # --- city: this markup sits inside an HTML comment, so bs4 cannot
    # find it and a regex over the raw text is used instead ---
    cityMatch = re.search(
        r'<li id="user_Profile_span_city.*?<span>(.*?)</span></li>',
        pageText, re.S)
    if cityMatch:
        user.city = cityMatch.group(1)

    # --- user name ---
    userNameHtml = soup.find(id='cont_h1')
    if userNameHtml is None:
        return False
    # Remove the first <a>, then the first remaining <span>, looked up in
    # that order as before.
    for tagName in ('a', 'span'):
        extra = userNameHtml.find(tagName)
        if extra is not None:
            extra.replace_with('')
    # The trailing 5 characters of the heading are cut off (presumably a
    # fixed suffix after the name - confirm against a live page).
    user.name = userNameHtml.get_text(strip=True)[0:-5].strip()
    return True