def parse_page(self, response):
    """Yield a JokeItem for each sufficiently long <p> text node on the page.

    The page's <title> text is passed through ``stripcats`` once and the
    result is attached to every item as its ``categories``. Each text node
    selected by ``//p/text()`` is passed through ``stripjokes``; only
    results longer than 15 characters are emitted.
    """
    selector = HtmlXPathSelector(response)
    page_categories = stripcats(selector.select('//title/text()').extract())

    for raw_text in selector.select('//p/text()').extract():
        cleaned = stripjokes(raw_text)
        # Skip fragments too short to keep (15 characters or fewer).
        if len(cleaned) <= 15:
            continue
        yield JokeItem(joke=cleaned, categories=page_categories)
def parse(self, response):
    """Schedule the following listing page, then yield this page's jokes.

    If the response URL matches ``ym<2 digits>-<page>.html``, a request for
    ``<page> + 1`` is yielded with this method as its callback. Otherwise
    the current URL is requested again.

    The <title> text is passed through ``stripcats`` and attached to every
    item as ``categories``. Each text node directly under
    ``div#Joke_box`` is split on the literal ``'<br><br>'``, every piece is
    passed through ``stripjokes``, and pieces longer than 15 characters are
    yielded as ``JokeItem`` objects.
    """
    hxs = HtmlXPathSelector(response)

    m = re.search(r'ym\d{2}-(\d+)\.html', response.url)
    if m:
        next_page = int(m.group(1)) + 1
        # Cut at the start of the matched page number rather than at the
        # first '-' in the whole URL: a hyphen in the host name or in an
        # earlier path segment would otherwise truncate the URL too early.
        # m.start(1) sits right after the '-' of 'ymNN-', so the prefix
        # already ends with that hyphen.
        prefix = str(response.url)[:m.start(1)]
        url = urlparse.urljoin(response.url, prefix + '%s.html' % next_page)
    else:
        # NOTE(review): this re-requests the page just fetched; presumably
        # the crawler's duplicate filter drops it -- confirm.
        url = response.url
    self.log(url)
    yield Request(url, callback=self.parse)

    categories = stripcats(hxs.select('//title/text()').extract())
    for joke_box in hxs.select('//div[@id="Joke_box"]/text()').extract():
        # NOTE(review): text() nodes normally contain no markup, so this
        # split may never find '<br><br>' -- verify against a real page.
        for joke in joke_box.split('<br><br>'):
            joke = stripjokes(joke)
            if len(joke) > 15:
                yield JokeItem(joke=joke, categories=categories)