import json
import random
import re
import time

import html2text
import requests
from lxml import etree
from pymongo.errors import DuplicateKeyError


def questions_per_topic(topic_id, header, rQ):
    """Walk a topic's top-answers pages and enqueue every question found."""
    for page in range(1, 51):
        topic_url = 'https://www.zhihu.com/topic/%s/top-answers?page=%d' % (
            topic_id, page)
        proxy = rand_proxy()
        user_agent = random.choice(agents)
        header.update({'User-Agent': user_agent})
        try:
            html = requests.get(topic_url, headers=header,
                                proxies=proxy).content.decode('utf-8')
        except Exception as e:
            logger.error('exception url: %s' % topic_url)
            logger.error(e)
            continue
        # Check the vote count of the first answer on this page;
        # if it is below 1000, skip this and all later pages.
        first_vote = max_vote_p.search(html)
        if first_vote:
            max_vote = first_vote.group(1)
            if int(max_vote) < 1000:
                break
        tree = etree.HTML(html)
        questions = tree.xpath(
            '//div[@class="feed-main"]//a[@class="question_link"]')
        for q in questions:
            # Hand each question off to an RQ worker for detailed scraping.
            rQ.enqueue(per_question, q.attrib['href'])
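# The functions in this module reference several module-level names defined
# elsewhere in the project: `logger`, `agents` (a pool of User-Agent strings),
# `requestHeader`, `subTopic_url`, the compiled regexes `max_vote_p`,
# `question_p`, `top_answer_p` and `subTopic_p`, plus `rand_proxy()` and
# `mongo_conn()`. Below is a minimal sketch of the two callables, assuming a
# local proxy pool and a local MongoDB instance -- an illustration of the
# expected return shapes, not the project's actual definitions.

def _rand_proxy_sketch():
    # requests expects a {scheme: url} mapping for its `proxies=` argument
    p = random.choice(['http://127.0.0.1:8118'])  # hypothetical proxy pool
    return {'http': p, 'https': p}


def _mongo_conn_sketch():
    # Returns the database that owns the questions / answers / sub_topic /
    # top_answers collections used below.
    from pymongo import MongoClient
    return MongoClient('mongodb://localhost:27017')['zhihu']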
def max_page(topic_id, header):
    """Return the last page number of a topic's top-answers listing."""
    question_url = 'https://www.zhihu.com/topic/{0}/top-answers'.format(
        topic_id)
    err = 0
    while True:
        user_agent = random.choice(agents)
        proxy = rand_proxy()
        header.update({'User-Agent': user_agent})
        try:
            response = requests.get(question_url, headers=header,
                                    proxies=proxy)
        except Exception as e:
            logger.error(e)
            logger.error(topic_id)
            continue
        logger.info('visit: %s' % question_url)
        if response.status_code != 200:
            logger.error('{0} ERROR'.format(question_url))
            logger.error(header)
            return
        html = response.content.decode('utf-8')
        html_tree = etree.HTML(html)
        page_numbers = html_tree.xpath(
            '//div[@class="zm-invite-pager"]/span/a/text()')
        try:
            # The pager text reads "上一页 1 2 3 ... 13801 下一页" (prev/next),
            # so the second-to-last entry is the last page number.
            return page_numbers[-2]
        except Exception as e:
            # "系统检测到您的帐号或IP存在异常流量" is Zhihu's abnormal-traffic
            # block page; rotate proxy/User-Agent and retry up to 5 times.
            if html.find('系统检测到您的帐号或IP存在异常流量') > -1:
                logger.error(
                    'abnormal traffic detected for this account/IP, '
                    'proxy: {0}, user-agent: {1}'.format(proxy, user_agent))
                if err == 5:
                    break
                err += 1
                continue
            logger.error(e)
            logger.error('topic_id: {0}'.format(topic_id))
            return 1
    # Too many flagged attempts: fall back to a single page.
    return 1
def questions_per_page(topic_id, page, header):
    """Scrape one /questions page of a topic and store each question in MongoDB."""
    question_url = 'https://www.zhihu.com/topic/{0}/questions?page={1}'.format(
        topic_id, page)
    user_agent = random.choice(agents)
    header.update({'User-Agent': user_agent})
    html = requests.get(question_url, headers=header,
                        proxies=rand_proxy()).content.decode('utf-8')
    questions = re.findall(question_p, html)
    for q in questions:
        try:
            mongo_conn().questions.insert_one({
                'qid': q[1],
                'stid': topic_id,
                'href': q[0],
                'name': q[2]
            })
        except DuplicateKeyError as e:
            logger.error(e)
            logger.info('topic_id: {0}, href: {1} exists'.format(
                topic_id, q[0]))
def top_answers(topic_id, page, header):
    """Scrape one /top-answers page of a topic and store each answer in MongoDB."""
    question_url = 'https://www.zhihu.com/topic/{0}/top-answers?page={1}'.format(
        topic_id, page)
    proxy = rand_proxy()
    user_agent = random.choice(agents)
    header.update({'User-Agent': user_agent})
    try:
        html = requests.get(question_url, headers=header,
                            proxies=proxy).content.decode('utf-8')
    except Exception as e:
        logger.error('exception url: %s' % question_url)
        logger.error(e)
        # Retry via recursion; return here so we do not fall through with
        # `html` undefined.
        return top_answers(topic_id, page, header)
    # Check the vote count of the first answer on this page;
    # if it is below 1000, ignore the whole page.
    first_vote = max_vote_p.search(html)
    if first_vote:
        max_vote = first_vote.group(1)
        if int(max_vote) < 1000:
            logger.info('ignore %s, max_vote:%s' % (question_url, max_vote))
            return
    answers = re.findall(top_answer_p, html)
    if len(answers) == 0:
        logger.error('{0} answers not found, proxy: {1}'.format(
            question_url, proxy))
        return
    logger.info('{0} found answer {1}'.format(question_url, len(answers)))
    for a in answers:
        href, qid, aid = a[0], a[1], a[2]
        try:
            mongo_conn().answers.insert_one({
                'topic': topic_id,
                'question': qid,
                'answer': aid,
                'href': href
            })
        except DuplicateKeyError:
            # An existing record means this page was scraped before; stop.
            return
def per_question(q_href):
    """Scrape a single question page and store answers with >= 800 votes."""
    time.sleep(random.randint(1, 8))  # throttle to look less like a bot
    q_url = 'https://www.zhihu.com%s' % q_href
    proxy = rand_proxy()
    user_agent = random.choice(agents)
    header = dict(requestHeader)  # copy so the shared default is not mutated
    header.update({'User-Agent': user_agent})
    try:
        response = requests.get(q_url, headers=header, proxies=proxy).content
        html = response.decode('utf-8')
    except Exception as e:
        logger.error('exception url: %s' % q_url)
        logger.error(e)
        # Retry via recursion; return here so we do not continue with
        # `html` undefined.
        return per_question(q_href)
    tree = etree.HTML(html)
    question_a = tree.xpath('//title[@data-react-helmet="true"]/text()')
    if question_a:
        title = question_a[0].replace(' - 知乎', '')  # strip " - Zhihu" suffix
        if '安全验证' == title:  # "security check": the proxy got flagged
            logger.error('proxy error, {0}'.format(proxy))
            raise Exception
        logger.info(title)
    else:
        logger.error('%s title not found' % q_url)
        if '你正在使用的浏览器版本过低' in html:  # "your browser is too old"
            logger.info(user_agent)
            return per_question(q_href)
        else:
            raise Exception
    topics = tree.xpath('//a[@class="TopicLink"]')
    sub_topic = mongo_conn().sub_topic
    for t in topics:
        # href looks like https://www.zhihu.com/topic/19552832
        tid = t.attrib['href'].split('/')[-1]
        name = t.xpath('.//text()')[0]
        try:
            sub_topic.insert_one({'sub_tid': tid, 'sub_name': name})
        except DuplicateKeyError:
            continue
    items = tree.xpath('//div[@class="ContentItem AnswerItem"]')
    for i in items:
        # vote_text looks like "1792 人赞同了该回答"
        # ("1792 people upvoted this answer")
        vote_text = i.xpath('.//span[@class="Voters"]/button/text()')
        if len(vote_text) == 0:
            logger.info('%s no votes' % q_url)
            break
        vote_num = re.match(r'\d+', vote_text[0]).group()
        if int(vote_num) >= 800:
            href = i.xpath('.//meta[@itemprop="url"]')[1].attrib['content']
            answer = i.xpath(
                './/span[@class="RichText CopyrightRichText-richText"]')[0]
            s = etree.tostring(answer).decode('utf-8')
            body = html2text.html2text(s.replace('<br>', ''))
            try:
                mongo_conn().top_answers.insert_one({
                    'title': title,
                    'answer': body,
                    'href': href,
                    'vote': vote_num
                })
            except DuplicateKeyError:
                continue
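# per_question and top_answers retry failed requests by calling themselves,
# which has no depth bound and can hit Python's recursion limit on a flaky
# proxy pool. A bounded fetch loop is one alternative -- a sketch built on
# the same `rand_proxy`/`logger` helpers, not the module's current behavior:

def fetch_html(url, header, retries=5):
    """Fetch a URL through rotating proxies, giving up after `retries` tries."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=header, proxies=rand_proxy(),
                                timeout=10)
            return resp.content.decode('utf-8')
        except Exception as e:
            logger.error('attempt %d failed for %s: %s' % (attempt + 1, url, e))
    return None  # caller decides how to handle a permanent failure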
def getSubTopics(topic_id):
    """Yield sub-topic matches for a topic, paging 20 entries per request."""
    offset = 0
    while True:
        form_data = {
            'method': 'next',
            'params': '{"topic_id": %s, "offset": %s, "hash_id": ""}' % (
                topic_id, offset)
        }
        try:
            response = requests.post(url=subTopic_url, data=form_data,
                                     headers=requestHeader,
                                     proxies=rand_proxy())
            datas = response.content.decode('utf-8')
            jr = json.loads(datas)
            # jr['msg'] is a list of HTML fragments; join them into one string.
            body = ''.join(jr['msg'])
            items = subTopic_p.findall(body)
            if len(items) == 0:
                break
            for item in items:
                yield item
            offset += 20
        except Exception as e:
            # A 400 means that the request was malformed; in other words, the
            # data stream sent by the client didn't follow the rules.
            logger.error(e)
            logger.info('args -> topic_id: {0}, offset: {1}'.format(
                topic_id, offset))
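# One plausible way to drive the pipeline above: expand a root topic into its
# sub-topics, then enqueue each sub-topic's questions for RQ workers started
# with `rq worker`. This is a sketch assuming a local Redis, that `subTopic_p`
# captures (id, name) pairs, and an example root topic id -- not part of the
# original module:

if __name__ == '__main__':
    from redis import Redis
    from rq import Queue

    # `rQ` matches the queue parameter of questions_per_topic.
    rQ = Queue(connection=Redis())
    root_topic = '19776749'  # hypothetical starting topic id
    for sub_tid, sub_name in getSubTopics(root_topic):
        # Pass a copy of the default header so each call can set its own
        # User-Agent without mutating the shared dict.
        questions_per_topic(sub_tid, dict(requestHeader), rQ)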