def get_answer_info(topic_url: str):
    """Scrape the replies on a topic page and store each one as an ``Answer``.

    On the first page of a topic (a URL that does not contain ``page``) the
    first ``post-*`` block is the topic itself, so replies start at the second
    block. For every reply the author's profile URL is also handed to
    ``get_authors_info``. After the page is processed, the "next page" link is
    followed recursively as long as its ``page=`` number is at most 3.

    Args:
        topic_url: URL of the topic page to scrape. The first run of digits
            found in it is stored as the topic id of every reply.

    Returns:
        None. Results are persisted through ``Answer.save``.

    Raises:
        IndexError: If a reply block lacks one of the expected elements
            (post id, author link, timestamp or praise counter).
        AttributeError: If ``topic_url`` contains no digits.
        ValueError: If the timestamp is not in ``%Y-%m-%d %H:%M:%S`` form.
    """
    sel = Selector(text=requests.get(topic_url).text)
    all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")

    if "page" not in topic_url:
        # First page: the leading block is the topic, so we need more than one.
        if len(all_divs) <= 1:
            print(topic_url + '这个帖子还没有回帖')
            return
        all_divs = all_divs[1:]
    elif not all_divs:
        print(topic_url + '在这一页中没有回帖内容')
        return

    # Loop-invariant: every reply on this page belongs to the same topic.
    topic_id = int(re.search(r'(\d+)', topic_url).group())

    for post in all_divs:
        answer = Answer()

        # Look the id up relative to this reply block. The previous absolute
        # "//..." query searched the whole document and was indexed by loop
        # position, which is off by one on the first page where the topic
        # block has been sliced away.
        # NOTE(review): assumes the div carrying data-post-id is the reply
        # block itself or one of its descendants - confirm against the markup.
        answer.answer_id = int(
            post.xpath("descendant-or-self::div/@data-post-id").extract()[0])
        answer.topic_id = topic_id

        author_url = post.xpath(
            ".//div[@class='nick_name']//a[1]/@href").extract()[0]
        answer.author_uuid = author_url.split('/')[-1]

        answer.create_time = datetime.strptime(
            post.xpath(".//label[@class='date_time']/text()").extract()[0],
            "%Y-%m-%d %H:%M:%S")

        # Join the stripped text nodes of the reply body into one string.
        answer.answer_content = "".join(
            fragment.strip() for fragment in post.xpath(
                ".//div[@class='post_body post_body_min_h']/text()").extract())

        answer.praised_nums = post.xpath(
            ".//label[@class='red_praise digg']//em/text()").extract()[0]

        get_authors_info(author_url)

        # Plain save() when a row with this answer_id already exists,
        # otherwise force an INSERT.
        # NOTE(review): presumably answer_id is a non-auto primary key on the
        # model - confirm.
        if Answer.select().where(Answer.answer_id == answer.answer_id):
            answer.save()
        else:
            answer.save(force_insert=True)

    next_page = sel.xpath(
        "//a[@class='pageliststy next_page']/@href").extract()
    if next_page and "page" in next_page[-1]:
        # Only the first three pages of replies are crawled.
        if int(re.search(r"page=(\d+)", next_page[-1]).group(1)) <= 3:
            get_answer_info(parse.urljoin(DOMAIN, next_page[-1]))
def fetchAnswers(self, q_id, start, count):
    """Return one page of answers to a question, newest first.

    Runs a parameterized SELECT against ``self.db`` ordered by ``c_time``
    descending and maps every row onto a new ``Answer`` object.

    Args:
        q_id: Id of the question whose answers are fetched.
        start: Number of rows to skip (SQL OFFSET); converted with ``int``.
        count: Maximum number of rows to return (SQL LIMIT); converted with
            ``int``.

    Returns:
        A list of ``Answer`` objects with ``id``, ``question_id``,
        ``owner_id``, ``content`` and ``create_time`` set; empty if no row
        matches.

    Raises:
        ValueError, TypeError: If ``start`` or ``count`` cannot be converted
            to ``int``.
    """
    query = (
        "SELECT id, owner_id, content, c_time FROM answers WHERE q_id = %s ORDER BY c_time DESC LIMIT %s OFFSET %s ;"
    )
    cursor = self.db.cursor()
    ret = []
    try:
        cursor.execute(query, (q_id, int(count), int(start)))
        # "answer_id" rather than "id" so the builtin is not shadowed.
        for (answer_id, owner_id, content, c_time) in cursor:
            answer = Answer()
            answer.id = answer_id
            answer.question_id = q_id
            answer.owner_id = owner_id
            answer.content = content
            answer.create_time = c_time
            ret.append(answer)
    finally:
        # Release the cursor even when the query or row mapping fails.
        cursor.close()
    return ret