def get_answer_info(topic_url: str):
    """
    Scrape the replies on one page of a topic and persist each as an ``Answer``.

    On the first page (a URL that does not contain "page") the first post
    block belongs to the topic itself, so replies start at the second block.
    After the current page is processed, the "next page" link is followed
    recursively as long as its page number is <= 3.

    :param topic_url: URL of the topic page to fetch. The first run of
        digits in it is used as the topic id.
    :return: None. Replies are stored through ``Answer.save``; a message is
        printed and nothing is stored when the page has no replies.
    """
    sel = Selector(text=requests.get(topic_url).text)
    post_divs = sel.xpath("//div[starts-with(@id, 'post-')]")
    if "page" not in topic_url:
        # First page: the leading block is the topic body, not a reply.
        if len(post_divs) <= 1:
            print(topic_url + '这个帖子还没有回帖')
            return
        post_divs = post_divs[1:]
    elif not post_divs:
        print(topic_url + '在这一页中没有回帖内容')
        return

    # The topic id is the same for every reply on the page, so compute it once.
    topic_id = int(re.search(r'(\d+)', topic_url).group())

    for post in post_divs:
        answer = Answer()
        # Read the id relative to this post block. An absolute "//" query
        # run from a sub-selector still searches the whole document, and
        # indexing that document-wide list by loop position goes out of step
        # with the reply list once the topic block has been sliced off above.
        answer.answer_id = int(post.xpath(
            "descendant-or-self::div/@data-post-id").extract()[0])
        answer.topic_id = topic_id
        author_url = post.xpath(
            ".//div[@class='nick_name']//a[1]/@href").extract()[0]
        # The last path segment of the author link is stored as the uuid.
        answer.author_uuid = author_url.split('/')[-1]
        created_text = post.xpath(
            ".//label[@class='date_time']/text()").extract()[0]
        answer.create_time = datetime.strptime(
            created_text, "%Y-%m-%d %H:%M:%S")
        # The body may be split over several text nodes; strip and glue them.
        answer.answer_content = "".join(
            text.strip() for text in post.xpath(
                ".//div[@class='post_body post_body_min_h']/text()").extract())
        answer.praised_nums = post.xpath(
            ".//label[@class='red_praise digg']//em/text()").extract()[0]
        get_authors_info(author_url)
        # A row with this id already exists -> plain save(); otherwise force
        # an INSERT. NOTE(review): presumably Answer is a peewee model keyed
        # by answer_id -- confirm against the model definition.
        if Answer.select().where(Answer.answer_id == answer.answer_id):
            answer.save()
        else:
            answer.save(force_insert=True)

    next_page = sel.xpath(
        "//a[@class='pageliststy next_page']/@href").extract()
    if next_page and "page" in next_page[-1]:
        next_href = next_page[-1]
        # Guard against hrefs that mention "page" without a "page=N" query.
        page_match = re.search(r"page=(\d+)", next_href)
        # Only the first three pages of replies are crawled.
        if page_match and int(page_match.group(1)) <= 3:
            get_answer_info(parse.urljoin(DOMAIN, next_href))
Ejemplo n.º 2
0
    def fetchAnswers(self, q_id, start, count):
        """Fetch a slice of the answers to one question, newest first.

        Args:
            q_id: Question id matched against the ``q_id`` column.
            start: Number of rows to skip (SQL OFFSET); coerced with ``int``.
            count: Maximum number of rows to return (SQL LIMIT); coerced
                with ``int``.

        Returns:
            A list of ``Answer`` objects ordered by ``c_time`` descending,
            each populated with ``id``, ``question_id``, ``owner_id``,
            ``content`` and ``create_time``. Empty when no rows match.

        Raises:
            ValueError / TypeError: if ``start`` or ``count`` cannot be
                converted with ``int``.
        """
        query = (
            "SELECT id, owner_id, content, c_time FROM answers WHERE q_id = %s ORDER BY c_time DESC LIMIT %s OFFSET %s ;"
        )
        # Coerce before opening the cursor so bad arguments never allocate one.
        params = (q_id, int(count), int(start))

        answers = []
        cursor = self.db.cursor()
        try:
            cursor.execute(query, params)
            # answer_id rather than id: avoid shadowing the builtin.
            for (answer_id, owner_id, content, c_time) in cursor:
                answer = Answer()
                answer.id = answer_id
                answer.question_id = q_id
                answer.owner_id = owner_id
                answer.content = content
                answer.create_time = c_time
                answers.append(answer)
        finally:
            # Release the cursor even if execute() or row iteration raises.
            cursor.close()
        return answers