import scrapy

# `crawldata` (a scrapy.Item subclass) and `util` are project-local modules;
# adjust these import paths to match your project layout.
from .items import crawldata
from . import util


def parse_articles_follow_next_page(self, response):
    # Simple variant: store the raw page body, then follow the "next page" link.
    _item = crawldata()
    _item['data'] = response.body
    _item['url'] = response.url
    yield _item

    next_page = response.css("ul.navigation > li.next-page > a::attr('href')")
    if next_page:
        url = response.urljoin(next_page[0].extract())
        yield scrapy.Request(url, self.parse_articles_follow_next_page)
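# Both variants fill a `crawldata` Item. Its definition is not shown in this
# file; a minimal sketch under that assumption, with the field names taken
# from the code here (the class location and comments are hypothetical):
#
#     class crawldata(scrapy.Item):
#         url = scrapy.Field()
#         data = scrapy.Field()      # raw body, or the assembled post text
#         title = scrapy.Field()
#         tag = scrapy.Field()
#         postdate = scrapy.Field()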
def parse_articles_follow_next_page(self, response):
    # Forum-thread variant (Discuz-style markup, e.g. hzbike.com): extract the
    # thread title, tag, per-post text and attached images, then follow the
    # thread's "next page" link.
    _item = crawldata()
    _item['url'] = response.url
    _item['title'] = response.xpath("//span[@id='thread_subject']/text()").extract_first()
    _item['tag'] = response.xpath("//h1[@class='ts']/a/text()").extract_first()
    try:
        # The callback that scheduled this request is expected to pass the
        # post date along via request meta.
        _item['postdate'] = response.meta['postdate']
    except KeyError:
        util.exc_info()

    _root = response.xpath(
        "//div[@id='postlist']/div[starts-with(@id,'post_')]/table/tr"
        "/td[@class='plc']/div[@class='pct']/div[@class='pcb']/div[@class='t_fsz']")
    _message = []
    for _root_item in _root:
        _second_root = _root_item.xpath("table/tr/td/child::node()")
        for _second_item in _second_root:
            # name() cannot be evaluated on text nodes, so extract_first()
            # returns None there.
            _node_type = _second_item.xpath("name()").extract_first()
            if _node_type is None:
                # Plain text node: keep its content.
                _message.append(_second_item.extract())
                _message.append("\n")
            elif _node_type == "ignore_js_op":
                # Inline image attachment: record its absolute URL.
                _img_url = _second_item.xpath("div//img/@file").extract_first()
                if _img_url is not None:
                    _message.append(response.urljoin(_img_url))
                    _message.append("\n")
        # Extract images from "pattl" attachment blocks, as on pages like
        # http://hzbike.com/forum.php?mod=viewthread&tid=118823&page=1&authorid=22591
        _img_list = _root_item.xpath("div[@class='pattl']/ignore_js_op")
        for _img in _img_list:
            _img_url = _img.xpath(".//img/@file").extract_first()
            if _img_url is not None:
                _img_desc = _img.xpath(".//p[@class='mbn xg2']/text()").extract_first()
                if _img_desc is not None:
                    _message.append(_img_desc)
                    _message.append("\n")
                _message.append(response.urljoin(_img_url))
                _message.append("\n")
    _item['data'] = "".join(_message).encode("utf8")
    yield _item

    next_page = response.xpath("//div[@class='pgt']/div[@class='pg']/a[@class='nxt']/@href")
    if next_page:
        url = response.urljoin(next_page.extract_first())
        yield scrapy.Request(url, self.parse_articles_follow_next_page)
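# The thread-page variant above reads `postdate` from response.meta, so the
# callback that schedules thread requests has to pass it along. A hypothetical
# sketch of such a listing-page callback (the `parse_listing` name and the
# selector paths are assumptions, not taken from this project):
def parse_listing(self, response):
    for row in response.xpath("//tbody[starts-with(@id,'normalthread_')]"):
        href = row.xpath(".//a[@class='s xst']/@href").extract_first()
        postdate = row.xpath(".//em/span/text()").extract_first()
        if href is not None:
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_articles_follow_next_page,
                                 meta={'postdate': postdate})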