def parse_item(self, response):
     """Yield one ``NuggetItem`` for each content element matched on the page.

     Elements are selected through the node(s) whose ``id`` is
     ``mw-content-text``; when no such node exists, nothing is yielded.

     Args:
         response: The response object to query; it must provide ``xpath()``.

     Yields:
         NuggetItem: ``content`` holds the result of ``extract()`` on the
         matched element and ``title`` holds the list produced by extracting
         the page's ``//title/text()`` nodes.
     """
     scope = response.xpath('//*[@id="mw-content-text"]')
     # The page title does not depend on the element being processed, so the
     # query is evaluated once instead of once per matched element.
     page_title = response.xpath('//title/text()').extract()
     # NOTE(review): '//h1' is document-absolute even though it is evaluated
     # on `scope`, while the other terms ('.//h2', ...) are relative to it.
     # Presumably deliberate, to pick up a heading outside the content
     # container -- confirm before changing it to './/h1'.
     for sel in scope.xpath('//h1 | .//h2 | .//h3 | .//h4 | .//p | .//ol | .//ul | .//pre'):
         item = NuggetItem()
         item['content'] = sel.extract()
         # Copy so every item owns its own list, as it did when the query was
         # re-run for each element.
         item['title'] = list(page_title)
         yield item
 def parse_start_url(self, response):
     """Yield one ``NuggetItem`` for each content element of the response.

     Elements are selected through the node(s) whose ``id`` is
     ``mw-content-text``; when no such node exists, nothing is yielded.

     Args:
         response: The response object to query; it must provide ``xpath()``.

     Yields:
         NuggetItem: ``content`` holds the result of ``extract()`` on the
         matched element and ``title`` holds the list produced by extracting
         the page's ``//title/text()`` nodes.
     """
     scope = response.xpath('//*[@id="mw-content-text"]')
     # Identical for every item, so it is queried once rather than on each
     # loop iteration.
     page_title = response.xpath('//title/text()').extract()
     # Alternative selector: use '//h2[1]/preceding-sibling::p' to get the
     # intro paragraphs.
     # NOTE(review): '//h1' is document-absolute even though it is evaluated
     # on `scope`; the remaining terms are relative to it. Presumably
     # intentional -- confirm before changing it to './/h1'.
     for sel in scope.xpath('//h1 | .//h2 | .//h3 | .//h4 | .//p | .//ol | .//ul | .//pre'):
         item = NuggetItem()
         item['content'] = sel.extract()
         # A fresh copy keeps each item's title list independent of the others.
         item['title'] = list(page_title)
         yield item
Beispiel #3
0
 def parse_start_url(self, response):
     """Yield one ``NuggetItem`` for each heading, paragraph, list or
     preformatted element of the response.

     The selector is evaluated directly on the response, so it matches
     ``h1``, ``h2``, ``p``, ``pre``, ``ul`` and ``ol`` elements anywhere in
     the document.

     Args:
         response: The response object to query; it must provide ``xpath()``.

     Yields:
         NuggetItem: ``content`` holds the result of ``extract()`` on the
         matched element and ``title`` holds the list produced by extracting
         the page's ``//title/text()`` nodes.
     """
     # The title is the same for every item; evaluate the query a single time
     # instead of once per matched element.
     page_title = response.xpath('//title/text()').extract()
     for sel in response.xpath('//h1 | //h2 | //p | //pre | //ul | //ol'):
         item = NuggetItem()
         item['content'] = sel.extract()
         # Each item receives its own copy of the title list.
         item['title'] = list(page_title)
         yield item