def _extractAnswer(self, block):
    """Collect the fields of one answer from its HTML element.

    ``block`` is indexed for its 'data-aid' and 'data-created' attributes
    and searched with ``find()`` for the author link, the body, the vote
    count and the comment toggle (presumably a BeautifulSoup tag -- confirm
    against the caller).

    Returns a dict with the keys 'aid', 'responder', 'date', 'content',
    'upvote' and 'commentsCount'. 'responder' is -1 when the answer has no
    avatar link (anonymous user).
    """
    # Answer id.
    answerId = block['data-aid']

    # Author: a missing avatar link means an anonymous user.
    avatarLink = block.find('a', class_='zm-item-link-avatar')
    # The href has the form /people/<responder>; drop the 8-character prefix.
    author = -1 if avatarLink is None else avatarLink['href'][8:]

    # Creation date, kept exactly as found in the attribute.
    created = block['data-created']

    # Body text.
    bodyNode = block.find('div', class_='zm-editable-content')
    body = bodyNode.text.strip()

    # Upvote count.
    voteNode = block.find('span', class_='count')
    votes = parseNum(voteNode.text)

    # Comment count: only a non-empty prefix in front of the marker text is
    # parsed; a missing marker or an empty prefix counts as zero comments.
    toggleText = block.find('a', class_='toggle-comment').text.strip()
    leading, marker, _ = toggleText.partition('条评论')
    commentTotal = int(leading) if marker and leading else 0

    return {
        'aid': answerId,
        'responder': author,
        'date': created,
        'content': body,
        'upvote': votes,
        'commentsCount': commentTotal,
    }
def update(self):
    """Refresh this question from its page and collect its answers.

    Stamps ``self.lastModified`` with the current time, fetches the question
    page for ``self.qid`` and, on success, fills ``self.title``,
    ``self.detail``, ``self.tags``, ``self.followersCount`` and
    ``self.answers``.

    Returns False when the page request does not answer with HTTP 200,
    True otherwise.
    """
    self.lastModified = str(datetime.datetime.now())

    qurl = 'http://www.zhihu.com/question/%d' % (self.qid)
    response = get(qurl)
    if response.status_code != 200:
        return False
    page = BeautifulSoup(response.text)

    # Title.
    titleNode = page.find('h2', class_='zm-item-title')
    self.title = titleNode.text.strip()

    # Question body.
    detailNode = page.find('div', id='zh-question-detail')
    self.detail = detailNode.div.text.strip()

    # Topic tags attached to the question.
    self.tags = [tagLink.string.strip()
                 for tagLink in page.find_all("a", class_='zm-item-tag')]

    # Follower count. When nobody follows the question yet the block has no
    # <strong> child, which counts as zero followers.
    followBox = page.find('div', class_='zg-gray-normal')
    hasFollowers = followBox is not None and followBox.strong is not None
    self.followersCount = parseNum(followBox.strong.text) if hasFollowers else 0

    self.answers = []

    # Number of answers. Without the counter header, the presence of a vote
    # count span is taken to mean exactly one answer.
    countHeader = page.find('h3', id='zh-question-answer-num')
    if countHeader is not None:
        total = int(countHeader['data-num'])
    elif page.find('span', class_='count') is not None:
        total = 1
    else:
        total = 0

    # Answers arrive 50 at a time; the first batch is in the page itself.
    for answerBlock in page.find_all('div', class_='zm-item-answer'):
        # Skip answers carrying a status block (flagged as "suggested for
        # revision" by the original author's note).
        if answerBlock.find('div', class_='answer-status') is None:
            self.answers.append(self._extractAnswer(answerBlock))

    if total > 50:
        _xsrf = page.find('input', attrs={'name': '_xsrf'})['value']
        otherHeaders = {'Referer': qurl}
        pageCount = math.ceil(total/50)
        # Remaining batches are requested by offset: 50, 100, ...
        for offset in range(50, pageCount * 50, 50):
            params = '{"url_token": %d, "pagesize": 50, "offset": %d}' % (self.qid, offset)
            data = {"_xsrf": _xsrf, "method": 'next', 'params': params}
            response = post('http://www.zhihu.com/node/QuestionAnswerListV2', otherHeaders, data)
            for fragment in response.json()['msg']:
                answerDiv = BeautifulSoup(fragment).div
                # Same filter as for the first batch.
                if answerDiv.find('div', class_='answer-status') is None:
                    self.answers.append(self._extractAnswer(answerDiv))

    return True