Example #1
def questions_per_topic(topic_id, header, rQ):
    for page in range(1, 51):
        topic_url = 'https://www.zhihu.com/topic/%s/top-answers?page=%d' % (
            topic_id, page)
        proxy = rand_proxy()
        user_agent = random.choice(agents)
        header.update({'User-Agent': user_agent})
        try:
            html = requests.get(topic_url, headers=header,
                                proxies=proxy).content.decode('utf-8')
        except Exception as e:
            logger.error('exception url: %s' % topic_url)
            logger.error(e)
            continue

        # Check the vote count of the first question on this page;
        # if it is below 1000, stop paging (later pages rank even lower).
        first_vote = max_vote_p.search(html)
        if first_vote:
            max_vote = first_vote.group(1)
            if int(max_vote) < 1000:
                break

        tree = etree.HTML(html)
        questions = tree.xpath(
            '//div[@class="feed-main"]//a[@class="question_link"]')
        for q in questions:
            rQ.enqueue(per_question, q.attrib['href'])
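
This function leans on module-level helpers that are not shown: rand_proxy(), agents, and the max_vote_p pattern. A minimal sketch of what they might look like, assuming a hard-coded proxy list and a js-voteCount class in the crawled markup (both are assumptions, not the project's actual definitions):

import random
import re

# Hypothetical proxy pool; the real project presumably loads this from a
# database or a proxy service.
PROXIES = ['http://127.0.0.1:8001', 'http://127.0.0.1:8002']

# A couple of desktop User-Agent strings to rotate through.
agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]

def rand_proxy():
    # requests expects a {scheme: url} mapping.
    p = random.choice(PROXIES)
    return {'http': p, 'https': p}

# Assumed shape of max_vote_p: capture the first vote count on the page.
max_vote_p = re.compile(r'class="js-voteCount"[^>]*>(\d+)<')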
Example #2
def max_page(topic_id, header):
    question_url = 'https://www.zhihu.com/topic/{0}/top-answers'.format(
        topic_id)
    err = 0
    while True:
        user_agent = random.choice(agents)
        proxy = rand_proxy()
        header.update({'User-Agent': user_agent})
        try:
            response = requests.get(question_url,
                                    headers=header,
                                    proxies=proxy)
        except Exception as e:
            logger.error(e)
            logger.error(topic_id)
            continue

        logger.info('visit: %s' % question_url)
        if response.status_code != 200:
            logger.error('{0} ERROR'.format(question_url))
            logger.error(header)
            return
        html = response.content.decode('utf-8')
        html_tree = etree.HTML(html)
        page_numbers = html_tree.xpath(
            '//div[@class="zm-invite-pager"]/span/a/text()')
        try:
            # Pager text looks like "上一页 1 2 3 ... 13801 下一页"
            # (Prev / page numbers / Next); the second-to-last entry is
            # the highest page number.
            return int(page_numbers[-2])
        except Exception as e:
            if html.find('系统检测到您的帐号或IP存在异常流量') > -1:
                # Anti-crawler block: "anomalous traffic detected from your
                # account or IP". Rotate proxy/UA and retry, up to 5 times.
                logger.error(
                    '系统检测到您的帐号或IP存在异常流量, proxy: {0}, user-agent: {1}'.format(
                        proxy, user_agent))
                if err == 5:
                    break
                err += 1
                continue

            logger.error(e)
            logger.error('topic_id: {0}'.format(topic_id))
            return 1
    # Gave up after repeated anti-crawler blocks; fall back to one page.
    return 1
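
For context, a plausible call site, assuming requestHeader is the module's base header dict (this driver snippet is hypothetical, not part of the project):

topic_id = '19550517'
last_page = max_page(topic_id, dict(requestHeader)) or 1
for page in range(1, int(last_page) + 1):
    questions_per_page(topic_id, page, dict(requestHeader))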
Example #3
def questions_per_page(topic_id, page, header):
    question_url = 'https://www.zhihu.com/topic/{0}/questions?page={1}'.format(
        topic_id, page)
    user_agent = random.choice(agents)
    header.update({'User-Agent': user_agent})
    html = requests.get(question_url, headers=header,
                        proxies=rand_proxy()).content.decode('utf-8')
    questions = re.findall(question_p, html)
    for q in questions:
        try:
            mongo_conn().questions.insert_one({
                'qid': q[1],
                'stid': topic_id,
                'href': q[0],
                'name': q[2]
            })
        except DuplicateKeyError:
            # Already crawled; the unique index rejects the re-insert.
            logger.info("topic_id: {0}, href: {1} exists".format(
                topic_id, q[0]))
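
Both question_p and the DuplicateKeyError handling depend on setup outside this function. One way that setup could look; the regex shape and the unique index are guesses based on how the captured groups are used, not the project's actual code:

import re
from pymongo import MongoClient, ASCENDING

# Assumed capture order: href, question id, title - matching
# q[0], q[1], q[2] above.
question_p = re.compile(r'<a[^>]*href="(/question/(\d+))"[^>]*>(.*?)</a>')

def mongo_conn():
    db = MongoClient('localhost', 27017).zhihu
    # The unique index is what turns repeat inserts into DuplicateKeyError.
    db.questions.create_index([('href', ASCENDING)], unique=True)
    return db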
Example #4
def top_answers(topic_id, page, header):
    question_url = 'https://www.zhihu.com/topic/{0}/top-answers?page={1}'.format(
        topic_id, page)
    proxy = rand_proxy()
    user_agent = random.choice(agents)
    header.update({'User-Agent': user_agent})
    try:
        html = requests.get(question_url, headers=header,
                            proxies=proxy).content.decode('utf-8')
    except Exception as e:
        logger.error('exception url: %s' % question_url)
        logger.error(e)
        # Retry with a fresh proxy/UA and return, so execution never
        # falls through with `html` undefined.
        return top_answers(topic_id, page, header)

    # Check the vote count of the first question on this page;
    # if it is below 1000, skip this page.
    first_vote = max_vote_p.search(html)
    if first_vote:
        max_vote = first_vote.group(1)
        if int(max_vote) < 1000:
            logger.info('ignore %s, max_vote:%s' % (question_url, max_vote))
            return

    answers = re.findall(top_answer_p, html)
    if len(answers) == 0:
        logger.error('{0} answers not found, proxy: {1}'.format(
            question_url, proxy))

        return
    logger.info('{0} found answer {1}'.format(question_url, len(answers)))
    for a in answers:
        qid, aid, href = a[1], a[2], a[0]
        try:
            mongo_conn().answers.insert_one({
                'topic': topic_id,
                'question': qid,
                'answer': aid,
                'href': href
            })
        except DuplicateKeyError:
            # Seen before; skip this answer but keep going on the page.
            continue
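
The indices a[0], a[1], a[2] suggest that top_answer_p captures an answer link plus its question and answer ids. A guess at its shape, based on Zhihu's /question/<qid>/answer/<aid> URL scheme; the real pattern may differ:

import re

# Captures: full href, question id, answer id.
top_answer_p = re.compile(r'href="(/question/(\d+)/answer/(\d+))"')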
Example #5
def per_question(q_href):
    # Randomized delay so the workers do not hammer the site in lockstep.
    time.sleep(random.randint(1, 8))
    q_url = 'https://www.zhihu.com%s' % q_href
    proxy = rand_proxy()
    user_agent = random.choice(agents)
    # Copy the shared header template so concurrent jobs don't mutate it.
    header = dict(requestHeader)
    header.update({'User-Agent': user_agent})
    try:
        response = requests.get(q_url, headers=header, proxies=proxy).content
        html = response.decode('utf-8')
    except Exception as e:
        logger.error('exception url: %s' % q_url)
        logger.error(e)
        # Retry with a fresh proxy and return, so `html` is never
        # used while undefined.
        return per_question(q_href)

    tree = etree.HTML(html)
    tags = tree.xpath('//div[@class="Popover"]/text()')

    # The <title> carries the question name, suffixed with " - 知乎".
    question_a = tree.xpath('//title[@data-react-helmet="true"]/text()')
    if question_a:
        title = question_a[0].replace(' - 知乎', '')
        if '安全验证' == title:
            # "Security check" page: the proxy has been flagged.
            logger.error('proxy error, {0}'.format(proxy))
            raise Exception('security check page, proxy: {0}'.format(proxy))

        logger.info(title)
    else:
        logger.error('%s title not found' % q_url)
        if '你正在使用的浏览器版本过低' in html:
            # "Your browser version is too old": the UA was rejected,
            # so retry with a different one.
            logger.info(user_agent)
            return per_question(q_href)
        else:
            raise Exception('title not found: %s' % q_url)

    topics = tree.xpath('//a[@class="TopicLink"]')
    sub_topic = mongo_conn().sub_topic
    for t in topics:
        # https://www.zhihu.com/topic/19552832
        tid = t.attrib['href'].split('/')[-1]
        name = t.xpath('.//text()')[0]
        try:
            sub_topic.insert_one({'sub_tid': tid, 'sub_name': name})
        except DuplicateKeyError:
            continue

    items = tree.xpath('//div[@class="ContentItem AnswerItem"]')
    for i in items:
        # "1792 人赞同了该回答"
        vote_text = i.xpath('.//span[@class="Voters"]/button/text()')
        if len(vote_text) == 0:
            logger.info('%s no votes' % q_url)
            break

        vote_num = re.match(r'\d+', vote_text[0]).group()
        if int(vote_num) >= 800:
            href = i.xpath('.//meta[@itemprop="url"]')[1].attrib['content']
            answer = i.xpath(
                './/span[@class="RichText CopyrightRichText-richText"]')[0]
            s = etree.tostring(answer).decode('utf-8')
            body = html2text.html2text(s.replace('<br>', ''))

            try:
                mongo_conn().top_answers.insert_one({
                    'title': title,
                    'answer': body,
                    'href': href,
                    'vote': vote_num
                })
            except DuplicateKeyError:
                continue
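
rQ.enqueue(per_question, ...) back in Example #1 suggests the per-question crawls are fanned out over RQ workers. A minimal sketch of that wiring, assuming a local Redis instance (the queue name and href are made up):

from redis import Redis
from rq import Queue

# Producer side: one job per question href.
rQ = Queue('zhihu', connection=Redis())
rQ.enqueue(per_question, '/question/19550225')

# Consumer side, from a shell:
#   rq worker zhihu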
Example #6
def getSubTopics(topic_id):
    offset = 0
    err = 0

    while True:
        form_data = {
            'method': 'next',
            'params': '{"topic_id": %s, "offset": %s, "hash_id": ""}'
                      % (topic_id, offset)
        }
        try:
            response = requests.post(url=subTopic_url, data=form_data, headers=requestHeader, proxies=rand_proxy())
            datas = response.content.decode('utf-8')
            jr = json.loads(datas)
            # convert string array to string
            body = ''.join(jr['msg'])
            items = subTopic_p.findall(body)
            if len(items) == 0:
                break

            for item in items:
                yield item

            offset += 20
        except Exception as e:
            # An HTTP 400 here means the request was malformed, i.e. the
            # form data didn't follow the API's rules; other errors are
            # usually transient proxy failures.
            logger.error(e)
            logger.info('args -> topic_id: {0}, offset: {1}'.format(
                topic_id, offset))
            err += 1
            if err == 5:
                # Give up on this topic rather than retry forever.
                break
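
Since getSubTopics is a generator, a caller can stream the sub-topics straight into further crawls. A plausible consumer, assuming each yielded item is an (id, name) tuple as the capture groups imply (hypothetical glue code, topic id made up):

for sub_tid, sub_name in getSubTopics('19776749'):
    pages = max_page(sub_tid, dict(requestHeader)) or 1
    for page in range(1, int(pages) + 1):
        top_answers(sub_tid, page, dict(requestHeader))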