Example #1
import os
import unittest

from scrapy.http import Request, TextResponse

# CustomScraper is the spider under test; import it from the project package.


class TestCustomScraper(unittest.TestCase):
    def setUp(self):
        # Load a saved copy of the DMOZ index page to use as the response body.
        with open(os.path.join(os.path.dirname(__file__),
                               "test_pages/dmoz_index.html"), "r") as f:
            self.body = f.read()
        self.url = "http://www.example.com/"
        self.request = Request(self.url)
        self.response = TextResponse(self.url,
                                     request=self.request,
                                     body=self.body,
                                     encoding='utf-8')
        self.scraper = CustomScraper(index="test_index",
                                     start_urls=[self.url],
                                     parser_string="//a",
                                     parser_dict={
                                         "text": "text()",
                                         "link": "@href",
                                     })

    def test_parse(self):
        # parse_item() should yield one dict per <a> element matched by //a.
        parsed = list(self.scraper.parse_item(self.response))
        assert parsed[1] == {
            'text': [u'about dmoz'],
            'link': [u'http://www.dmoz.org/docs/en/about.html']
        }
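For context, a minimal sketch of a CustomScraper that would satisfy this test might look like the following. This is an assumption inferred only from how the tests use the spider (parser_string as the base XPath, parser_dict mapping field names to relative XPath expressions); the project's real implementation may differ.

# Hypothetical sketch, not the project's actual code: a Scrapy spider whose
# parse_item() yields one dict per node matched by parser_string, with each
# parser_dict entry extracted relative to that node.
import scrapy


class CustomScraper(scrapy.Spider):
    name = "custom_scraper"

    def __init__(self, index, start_urls, parser_string, parser_dict, **kwargs):
        super(CustomScraper, self).__init__(**kwargs)
        self.index = index
        self.start_urls = start_urls
        self.parser_string = parser_string
        self.parser_dict = parser_dict

    def parse_item(self, response):
        for node in response.xpath(self.parser_string):
            # .extract() returns a list of strings, matching the lists
            # the test asserts against.
            yield {field: node.xpath(expr).extract()
                   for field, expr in self.parser_dict.items()}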
Example #2
def setUp(self):
    with open(os.path.join(os.path.dirname(__file__),
                           "test_pages/dmoz_index.html"), "r") as f:
        self.body = f.read()
    self.url = "http://www.example.com/"
    self.request = Request(self.url)
    self.response = TextResponse(self.url,
                                 request=self.request,
                                 body=self.body,
                                 encoding='utf-8')
    self.scraper = CustomScraper(index="test_index",
                                 start_urls=[self.url],
                                 parser_string="//a",
                                 parser_dict={
                                     "text": "text()",
                                     "link": "@href",
                                 })
Example #3
def setUp(self):
    # Connect to a local Elasticsearch instance and open the pipeline
    # for the spider, as Scrapy would at crawl start.
    self.es = Elasticsearch()
    self.pipeline = ElasticsearchPipeline()
    self.spider = CustomScraper(
        index="test_index",
        start_urls=["http://www.dmoz.org"],
        parser_string="//a",
        parser_dict={
            "text": "text()",
            "link": "@href",
        }
    )
    self.pipeline.open_spider(self.spider)
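A test fixture like this usually also needs cleanup so repeated runs do not accumulate documents in test_index. A possible tearDown, assuming the elasticsearch-py 7.x client (which accepts the ignore transport parameter), could be:

def tearDown(self):
    # Drop the test index; ignore 404 if it was never created.
    self.es.indices.delete(index="test_index", ignore=[404])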
Example #4
def setUp(self):
    with open(os.path.join(os.path.dirname(__file__),
                           "test_pages/dmoz_index.html"), "r") as f:
        self.body = f.read()
    self.url = "http://www.example.com/"
    self.request = Request(self.url)
    self.response = TextResponse(self.url, request=self.request, body=self.body, encoding='utf-8')
    self.scraper = CustomScraper(
        index="test_index",
        start_urls=[self.url],
        parser_string="//a",
        parser_dict={
            "text": "text()",
            "link": "@href",
        }
    )
Example #5
class TestCustomScraper(unittest.TestCase):

    def setUp(self):
        with open(os.path.join(os.path.dirname(__file__),
                               "test_pages/dmoz_index.html"), "r") as f:
            self.body = f.read()
        self.url = "http://www.example.com/"
        self.request = Request(self.url)
        self.response = TextResponse(self.url, request=self.request, body=self.body, encoding='utf-8')
        self.scraper = CustomScraper(
            index="test_index",
            start_urls=[self.url],
            parser_string="//a",
            parser_dict={
                "text": "text()",
                "link": "@href",
            }
        )

    def test_parse(self):
        parsed = list(self.scraper.parse_item(self.response))
        assert parsed[1] == {'text': [u'about dmoz'], 'link': [u'http://www.dmoz.org/docs/en/about.html']}