def test_header(self):
    """A "column" spec with value 1 yields the two expected links from csvfeed3."""
    extractor = create_linkextractor_from_specs({"type": "column", "value": 1})
    feed_response = UTF8TextResponse(url='http://www.example.com/', body=csvfeed3)
    found = list(extractor.links_to_follow(feed_response))
    expected_urls = [
        'http://www.example.com/path',
        'http://www.example.com/path2',
    ]
    self.assertEqual(len(found), 2)
    # Check each extracted link against its expected URL, in order.
    for position, expected in enumerate(expected_urls):
        self.assertEqual(found[position].url, expected)
Esempio n. 2
0
 def test_extra_params(self):
     """A "column" spec carrying a "|" delimiter yields two links from csvfeed2."""
     extractor = create_linkextractor_from_specs(
         {"type": "column", "value": 1, "delimiter": "|"})
     feed_response = TextResponse(url="http://www.example.com/", body=csvfeed2)
     found = list(extractor.links_to_follow(feed_response))
     expected_urls = [
         "http://www.example.com/path",
         "http://www.example.com/path2",
     ]
     self.assertEqual(len(found), 2)
     # Check each extracted link against its expected URL, in order.
     for position, expected in enumerate(expected_urls):
         self.assertEqual(found[position].url, expected)
 def test_simple(self):
     """An "html" spec extracts one link, with its anchor text, from the html fixture."""
     extractor = create_linkextractor_from_specs({"type": "html", "value": None})
     page = UTF8HtmlResponse(url='http://www.example.com/', body=html)
     found = list(extractor.links_to_follow(page))
     self.assertEqual(len(found), 1)
     only_link = found[0]
     self.assertEqual(only_link.url, 'http://www.example.com/path')
     self.assertEqual(only_link.text, 'Click here')
 def test_custom_withargs(self):
     """A "regex" spec limited to the http scheme keeps only the http match.

     The text holds an http and an https www.example.com URL that the
     pattern captures, plus an aws.amazon.com URL that it does not; with
     ``allowed_schemes`` set to ``['http']`` a single link is expected.
     """
     specs = {
         "type": "regex",
         # Raw string: "\w" inside a plain literal is an invalid escape
         # sequence that newer Python versions warn about. The pattern
         # text itself is unchanged.
         "value": r'url: ((?:http|https)://www.example.com/[\w/]+)',
         'allowed_schemes': ['http'],
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = UTF8TextResponse(url='http://www.example.com/', body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
 def test_default(self):
     """A "regex" spec with an empty value yields both URLs embedded in the text.

     The second expected URL stops before the ``#tre?`` suffix present in
     the input text.
     """
     extractor = create_linkextractor_from_specs({"type": "regex", "value": ''})
     body_text = "Hello http://www.example.com/path, more text https://aws.amazon.com/product?id=23#tre?"
     text_response = UTF8TextResponse(url='http://www.example.com/', body=body_text)
     found = list(extractor.links_to_follow(text_response))
     expected_urls = [
         'http://www.example.com/path',
         'https://aws.amazon.com/product?id=23',
     ]
     self.assertEqual(len(found), 2)
     # Check each extracted link against its expected URL, in order.
     for position, expected in enumerate(expected_urls):
         self.assertEqual(found[position].url, expected)
Esempio n. 6
0
 def test_simple(self):
     """A "pagination" spec extracts one link, with its text, from an HtmlPage.

     The page is built from the html fixture and gets an ``n_items``
     header of 1 before extraction.
     """
     extractor = create_linkextractor_from_specs(
         {"type": "pagination", "value": None})
     raw_response = HtmlResponse(url="http://www.example.com/", body=html)
     page = htmlpage_from_response(raw_response)
     page.headers["n_items"] = 1
     found = list(extractor.links_to_follow(page))
     self.assertEqual(len(found), 1)
     only_link = found[0]
     self.assertEqual(only_link.url, "http://www.example.com/path")
     self.assertEqual(only_link.text, "Click here")
Esempio n. 7
0
 def test_custom(self):
     """A "regex" spec with a custom pattern extracts both www.example.com URLs.

     The text also contains an aws.amazon.com URL, which the pattern does
     not capture, so exactly two links are expected.
     """
     specs = {
         "type": "regex",
         # Raw string: "\w" inside a plain literal is an invalid escape
         # sequence that newer Python versions warn about. The pattern
         # text itself is unchanged.
         "value": r"url: ((?:http|https)://www.example.com/[\w/]+)",
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = TextResponse(url="http://www.example.com/", body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, "http://www.example.com/path")
     self.assertEqual(links[1].url, "https://www.example.com/path2")
    def test_sitemap(self):
        """A "sitemap" spec extracts links from both the sitemap and sitemapindex fixtures."""
        extractor = create_linkextractor_from_specs({"type": "sitemap", "value": ""})

        def extract(source):
            # Materialise the extractor output so it can be counted and indexed.
            return list(extractor.links_to_follow(source))

        sitemap_links = extract(self.sitemap)
        self.assertEqual(len(sitemap_links), 3)
        self.assertEqual(sitemap_links[0].url, 'http://www.accommodationforstudents.com/')

        index_links = extract(self.sitemapindex)
        self.assertEqual(len(index_links), 1)
        self.assertEqual(index_links[0].url, 'http://www.example.com/sitemap1.xml.gz')
Esempio n. 9
0
 def _create_start_request_from_specs(self, info):
     """Build the start Request for ``info["url"]``.

     Without a truthy ``"link_extractor"`` entry in ``info`` the request
     calls back into ``self.parse``. Otherwise a link extractor is created
     from that entry and the request's callback yields one Request per
     extracted link, each routed to ``spider.parse``.
     """
     url = info["url"]
     extractor_specs = info.get("link_extractor")
     if not extractor_specs:
         return Request(url=url, callback=self.parse)

     extractor = create_linkextractor_from_specs(extractor_specs)

     def _callback(spider, response):
         # Turn every link found in the start response into a follow-up request.
         for found in extractor.links_to_follow(response):
             yield Request(url=found.url, callback=spider.parse)

     return Request(url=url, callback=_callback)
Esempio n. 10
0
    def test_sitemap(self):
        """A "sitemap" spec extracts links from both the sitemap and sitemapindex fixtures."""
        extractor = create_linkextractor_from_specs({"type": "sitemap", "value": ""})

        # Plain sitemap: three links, the first being the site root.
        from_sitemap = list(extractor.links_to_follow(self.sitemap))
        self.assertEqual(len(from_sitemap), 3)
        first_url = from_sitemap[0].url
        self.assertEqual(first_url, 'http://www.accommodationforstudents.com/')

        # Sitemap index: a single link to the nested sitemap.
        from_index = list(extractor.links_to_follow(self.sitemapindex))
        self.assertEqual(len(from_index), 1)
        nested_url = from_index[0].url
        self.assertEqual(nested_url, 'http://www.example.com/sitemap1.xml.gz')
Esempio n. 11
0
 def handle_xml(self, response, seen):
     """Yield the requests produced from links extracted out of an XML response.

     The extractor type is the part of the response's content subtype
     before any ``+``. If no extractor can be created for it (ValueError),
     an ``XmlLinkExtractor`` is used. Each link goes through
     ``self._filter_link`` and only truthy results are yielded.
     """
     subtype = content_type(response).subtype
     extractor_specs = {'type': subtype.split('+')[0], 'value': ''}
     try:
         extractor = create_linkextractor_from_specs(extractor_specs)
     except ValueError:
         extractor = XmlLinkExtractor()
     candidates = (
         self._filter_link(found, seen)
         for found in extractor.links_to_follow(response)
     )
     for request in candidates:
         if request:
             yield request
Esempio n. 12
0
 def test_custom(self):
     """A "regex" spec with a custom pattern extracts both www.example.com URLs.

     The text also contains an aws.amazon.com URL, which the pattern does
     not capture, so exactly two links are expected.
     """
     specs = {
         "type": "regex",
         # Raw string: "\w" inside a plain literal is an invalid escape
         # sequence that newer Python versions warn about. The pattern
         # text itself is unchanged.
         "value": r'url: ((?:http|https)://www.example.com/[\w/]+)'
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = TextResponse(url='http://www.example.com/', body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
     self.assertEqual(links[1].url, 'https://www.example.com/path2')
Esempio n. 13
0
    def _create_start_request_from_specs(self, info):
        """Build the start Request for ``info["url"]``.

        Without a truthy ``"link_extractor"`` entry in ``info`` the request
        calls back into ``self.parse``. Otherwise a link extractor is
        created from that entry and the request's callback yields one
        Request per extracted link, each routed to ``spider.parse``.
        """
        url = info["url"]
        extractor_specs = info.get("link_extractor")
        if not extractor_specs:
            return Request(url=url, callback=self.parse)

        extractor = create_linkextractor_from_specs(extractor_specs)

        def _callback(spider, response):
            # Turn every link found in the start response into a follow-up request.
            for found in extractor.links_to_follow(response):
                yield Request(url=found.url, callback=spider.parse)

        return Request(url=url, callback=_callback)
Esempio n. 14
0
 def handle_xml(self, response, seen):
     """Yield the requests produced from links extracted out of an XML response.

     ``XML_APPLICATION_TYPE`` is applied to the Content-Type header; when
     its result is truthy, the ``type`` entry of its ``groupdict()`` names
     the extractor type, otherwise ``'xml'`` is used. If no extractor can
     be created for that type (ValueError), a ``SitemapLinkExtractor`` is
     used. Each link goes through ``self._filter_link`` and only truthy
     results are yielded.
     """
     header_value = response.headers.get('Content-Type', '')
     type_match = XML_APPLICATION_TYPE(header_value)
     if type_match:
         feed_type = type_match.groupdict()['type']
     else:
         feed_type = 'xml'
     extractor_specs = {'type': feed_type, 'value': ''}
     try:
         extractor = create_linkextractor_from_specs(extractor_specs)
     except ValueError:
         extractor = SitemapLinkExtractor()
     candidates = (
         self._filter_link(found, seen)
         for found in extractor.links_to_follow(response)
     )
     for request in candidates:
         if request:
             yield request
Esempio n. 15
0
 def handle_xml(self, response, seen):
     """Extract links from an XML response and yield the resulting requests.

     The extractor type comes from ``XML_APPLICATION_TYPE`` applied to the
     Content-Type header: the ``type`` entry of its ``groupdict()`` when
     the result is truthy, ``'xml'`` otherwise. A ValueError while creating
     the extractor falls back to ``SitemapLinkExtractor``. Links whose
     ``self._filter_link`` result is falsy are skipped.
     """
     type_match = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
     feed_type = 'xml'
     if type_match:
         feed_type = type_match.groupdict()['type']
     try:
         extractor = create_linkextractor_from_specs(
             {'type': feed_type, 'value': ''})
     except ValueError:
         extractor = SitemapLinkExtractor()
     for found in extractor.links_to_follow(response):
         request = self._filter_link(found, seen)
         if not request:
             continue
         yield request
Esempio n. 16
0
 def handle_xml(self, response, seen):
     """Extract links from an XML response and yield the resulting requests.

     The extractor type is the response's content subtype up to the first
     ``+``. A ValueError while creating the extractor falls back to
     ``XmlLinkExtractor``. Links whose ``self._filter_link`` result is
     falsy are skipped.
     """
     subtype_parts = content_type(response).subtype.split('+')
     feed_type = subtype_parts[0]
     try:
         extractor = create_linkextractor_from_specs(
             {'type': feed_type, 'value': ''})
     except ValueError:
         extractor = XmlLinkExtractor()
     for found in extractor.links_to_follow(response):
         request = self._filter_link(found, seen)
         if not request:
             continue
         yield request
Esempio n. 17
0
 def test_start_urls(self):
     """A "pagination" spec with start_urls returns the three spam.com links.

     Links are sorted by URL before checking, so the assertions do not
     depend on the order the extractor returns them in.
     """
     start_urls = ['http://www.spam.com/?p=1',
                   'http://www.eggs.com/?page=0']
     extractor = create_linkextractor_from_specs(
         {"type": "pagination", "value": None, "start_urls": start_urls})
     html = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     raw_response = HtmlResponse(url='http://www.example.com/', body=html)
     page = htmlpage_from_response(raw_response)
     found = sorted(extractor.links_to_follow(page), key=lambda link: link.url)
     self.assertEqual(len(found), 3)
     expected_urls = ["http://www.spam.com/?p=100",
                      "http://www.spam.com/?p=200",
                      "http://www.spam.com/?p=300"]
     expected_texts = ['Click here 1', 'Click here 2', 'Click here 3']
     # All URLs are checked first, then all anchor texts.
     for position, expected in enumerate(expected_urls):
         self.assertEqual(found[position].url, expected)
     for position, expected in enumerate(expected_texts):
         self.assertEqual(found[position].text, expected)
Esempio n. 18
0
 def test_start_urls(self):
     """A "pagination" spec with start_urls returns the three spam.com links.

     Links are sorted by URL before checking, so the assertions do not
     depend on the order the extractor returns them in.
     """
     start_urls = ['http://www.spam.com/?p=1',
                   'http://www.eggs.com/?page=0']
     extractor = create_linkextractor_from_specs(
         {"type": "pagination", "value": None, "start_urls": start_urls})
     html = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     raw_response = UTF8HtmlResponse(url='http://www.example.com/', body=html)
     page = htmlpage_from_response(raw_response)
     found = sorted(extractor.links_to_follow(page), key=lambda link: link.url)
     self.assertEqual(len(found), 3)
     expected_urls = ["http://www.spam.com/?p=100",
                      "http://www.spam.com/?p=200",
                      "http://www.spam.com/?p=300"]
     expected_texts = ['Click here 1', 'Click here 2', 'Click here 3']
     # All URLs are checked first, then all anchor texts.
     for position, expected in enumerate(expected_urls):
         self.assertEqual(found[position].url, expected)
     for position, expected in enumerate(expected_texts):
         self.assertEqual(found[position].text, expected)
Esempio n. 19
0
 def test_rss(self):
     """An "rss" spec extracts the single wikipedia.org link from self.response."""
     extractor = create_linkextractor_from_specs({"type": "rss", "value": ""})
     found = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(found), 1)
     first_url = found[0].url
     self.assertEqual(first_url, "http://www.wikipedia.org/")
Esempio n. 20
0
 def test_xml_remove_namespaces(self):
     """An "xpath" spec with remove_namespaces set finds three links in self.atom."""
     extractor = create_linkextractor_from_specs({
         "type": "xpath",
         "value": "//link/@href",
         "remove_namespaces": True,
     })
     found = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(found), 3)
     first_url = found[0].url
     self.assertEqual(first_url, 'http://example.org/feed/')
Esempio n. 21
0
 def test_xml_remove_namespaces(self):
     """With remove_namespaces enabled, ``//link/@href`` yields three links from self.atom."""
     xpath_specs = dict(type="xpath", value="//link/@href", remove_namespaces=True)
     extractor = create_linkextractor_from_specs(xpath_specs)
     atom_links = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(atom_links), 3)
     self.assertEqual(atom_links[0].url, 'http://example.org/feed/')
Esempio n. 22
0
 def test_atom(self):
     """An "atom" spec extracts three links from self.atom, the feed URL first."""
     extractor = create_linkextractor_from_specs({"type": "atom", "value": ""})
     found = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(found), 3)
     first_url = found[0].url
     self.assertEqual(first_url, 'http://example.org/feed/')
Esempio n. 23
0
 def test_xml(self):
     """An "xpath" spec on ``//item/link/text()`` extracts one link from self.response."""
     extractor = create_linkextractor_from_specs(
         {"type": "xpath", "value": "//item/link/text()"})
     found = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(found), 1)
     first_url = found[0].url
     self.assertEqual(first_url, 'http://www.wikipedia.org/')
Esempio n. 24
0
 def test_xml(self):
     """The ``//item/link/text()`` xpath yields the single wikipedia.org link."""
     xpath_specs = dict(type="xpath", value="//item/link/text()")
     extractor = create_linkextractor_from_specs(xpath_specs)
     item_links = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(item_links), 1)
     self.assertEqual(item_links[0].url, 'http://www.wikipedia.org/')
Esempio n. 25
0
 def test_atom(self):
     """The "atom" extractor returns three links for self.atom, starting with the feed URL."""
     atom_specs = dict(type="atom", value="")
     extractor = create_linkextractor_from_specs(atom_specs)
     atom_links = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(atom_links), 3)
     self.assertEqual(atom_links[0].url, 'http://example.org/feed/')