Esempi in Python per create_linkextractor_from_specs, esempi in Python per slybot.linkextractor.create_linkextractor_from_specs

Esempio n. 1

0

Mostra file

File: test_linkextractors.py Progetto: daqv/portia-dashboard

 def test_header(self):
     """Column extractor (column 1) should yield two links from ``csvfeed3``.

     NOTE(review): the test name suggests the feed starts with a header
     row; confirm against the ``csvfeed3`` fixture.
     """
     extractor = create_linkextractor_from_specs({"type": "column", "value": 1})
     feed = UTF8TextResponse(url='http://www.example.com/', body=csvfeed3)
     followed = list(extractor.links_to_follow(feed))
     self.assertEqual(len(followed), 2)
     # Length is pinned to two above, so unpacking is safe here.
     first, second = followed
     self.assertEqual(first.url, 'http://www.example.com/path')
     self.assertEqual(second.url, 'http://www.example.com/path2')

Esempio n. 2

0

Mostra file

File: test_linkextractors.py Progetto: plafl/portia

 def test_extra_params(self):
     """Column extractor should honour the extra ``delimiter`` spec key.

     Builds a column extractor with ``"|"`` as delimiter and expects two
     links to be followed from the ``csvfeed2`` body.
     """
     extractor = create_linkextractor_from_specs(
         {"type": "column", "value": 1, "delimiter": "|"})
     feed = TextResponse(url="http://www.example.com/", body=csvfeed2)
     followed = list(extractor.links_to_follow(feed))
     self.assertEqual(len(followed), 2)
     # Length is pinned to two above, so unpacking is safe here.
     first, second = followed
     self.assertEqual(first.url, "http://www.example.com/path")
     self.assertEqual(second.url, "http://www.example.com/path2")

Esempio n. 3

0

Mostra file

File: test_linkextractors.py Progetto: daqv/portia-dashboard

 def test_simple(self):
     """HTML extractor should return the single link (URL and text) in ``html``."""
     extractor = create_linkextractor_from_specs({"type": "html", "value": None})
     page = UTF8HtmlResponse(url='http://www.example.com/', body=html)
     followed = list(extractor.links_to_follow(page))
     self.assertEqual(len(followed), 1)
     only_link = followed[0]
     self.assertEqual(only_link.url, 'http://www.example.com/path')
     self.assertEqual(only_link.text, 'Click here')

Esempio n. 4

0

Mostra file

File: test_linkextractors.py Progetto: daqv/portia-dashboard

 def test_custom_withargs(self):
     """Regex extractor limited to ``http`` should keep only the http match.

     The body contains three ``url: ...`` fragments; the test expects
     exactly one followed link, ``http://www.example.com/path``.
     """
     specs = {
         "type": "regex",
         # Raw string: "\w" inside a plain literal is an invalid escape
         # sequence (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
         # The pattern text itself is unchanged.
         "value": r'url: ((?:http|https)://www.example.com/[\w/]+)',
         'allowed_schemes': ['http'],
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = UTF8TextResponse(url='http://www.example.com/', body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, 'http://www.example.com/path')

Esempio n. 5

0

Mostra file

File: test_linkextractors.py Progetto: daqv/portia-dashboard

 def test_default(self):
     """Regex extractor with an empty pattern should find both URLs in the text.

     The second expected URL is asserted without its ``#tre?`` suffix.
     """
     extractor = create_linkextractor_from_specs({"type": "regex", "value": ''})
     body = "Hello http://www.example.com/path, more text https://aws.amazon.com/product?id=23#tre?"
     page = UTF8TextResponse(url='http://www.example.com/', body=body)
     followed = list(extractor.links_to_follow(page))
     self.assertEqual(len(followed), 2)
     # Length is pinned to two above, so unpacking is safe here.
     first, second = followed
     self.assertEqual(first.url, 'http://www.example.com/path')
     self.assertEqual(second.url, 'https://aws.amazon.com/product?id=23')

Esempio n. 6

0

Mostra file

File: test_linkextractors.py Progetto: plafl/portia

 def test_simple(self):
     """Pagination extractor should return the single link found in ``html``.

     The page's ``n_items`` header is set to 1 before extraction.
     """
     extractor = create_linkextractor_from_specs(
         {"type": "pagination", "value": None})
     raw_response = HtmlResponse(url="http://www.example.com/", body=html)
     page = htmlpage_from_response(raw_response)
     page.headers["n_items"] = 1
     followed = list(extractor.links_to_follow(page))
     self.assertEqual(len(followed), 1)
     only_link = followed[0]
     self.assertEqual(only_link.url, "http://www.example.com/path")
     self.assertEqual(only_link.text, "Click here")

Esempio n. 7

0

Mostra file

File: test_linkextractors.py Progetto: plafl/portia

 def test_custom(self):
     """Custom regex extractor should follow both ``www.example.com`` URLs.

     The body contains three ``url: ...`` fragments; the test expects the
     two ``www.example.com`` ones, in order of appearance.
     """
     specs = {
         "type": "regex",
         # Raw string: "\w" inside a plain literal is an invalid escape
         # sequence (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
         # The pattern text itself is unchanged.
         "value": r"url: ((?:http|https)://www.example.com/[\w/]+)",
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = TextResponse(url="http://www.example.com/", body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, "http://www.example.com/path")
     self.assertEqual(links[1].url, "https://www.example.com/path2")

Esempio n. 8

0

Mostra file

File: test_linkextractors.py Progetto: daqv/portia-dashboard

    def test_sitemap(self):
        """Sitemap extractor should handle both ``self.sitemap`` and ``self.sitemapindex``.

        Expects three links from the sitemap and one from the sitemap index,
        checking the first URL of each.
        """
        extractor = create_linkextractor_from_specs({"type": "sitemap", "value": ""})

        # Plain sitemap fixture.
        from_sitemap = list(extractor.links_to_follow(self.sitemap))
        self.assertEqual(len(from_sitemap), 3)
        self.assertEqual(from_sitemap[0].url, 'http://www.accommodationforstudents.com/')

        # Sitemap index fixture, using the same extractor instance.
        from_index = list(extractor.links_to_follow(self.sitemapindex))
        self.assertEqual(len(from_index), 1)
        self.assertEqual(from_index[0].url, 'http://www.example.com/sitemap1.xml.gz')

Esempio n. 9

0

Mostra file

File: spider.py Progetto: Kola0o0/slybot

 def _create_start_request_from_specs(self, info):
     """Build the start ``Request`` for ``info["url"]``.

     Without a ``"link_extractor"`` entry in *info* (or with a falsy one)
     the request calls back into ``self.parse``. Otherwise a link extractor
     is built from those specs and the request's callback yields one
     ``Request`` per extracted link, each handled by ``spider.parse``.
     """
     start_url = info["url"]
     extractor_specs = info.get("link_extractor")
     if not extractor_specs:
         return Request(url=start_url, callback=self.parse)

     extractor = create_linkextractor_from_specs(extractor_specs)

     # NOTE(review): this callback takes (spider, response) but is handed
     # to Request unbound; confirm how the caller invokes it.
     def _callback(spider, response):
         for found in extractor.links_to_follow(response):
             yield Request(url=found.url, callback=spider.parse)

     return Request(url=start_url, callback=_callback)

Esempio n. 10

0

Mostra file

File: test_linkextractors.py Progetto: torome/portia

    def test_sitemap(self):
        """Sitemap extractor should handle both ``self.sitemap`` and ``self.sitemapindex``.

        Expects three links from the sitemap and one from the sitemap index,
        checking the first URL of each.
        """
        extractor = create_linkextractor_from_specs({"type": "sitemap", "value": ""})

        # Plain sitemap fixture.
        from_sitemap = list(extractor.links_to_follow(self.sitemap))
        self.assertEqual(len(from_sitemap), 3)
        first_url = from_sitemap[0].url
        self.assertEqual(first_url, 'http://www.accommodationforstudents.com/')

        # Sitemap index fixture, using the same extractor instance.
        from_index = list(extractor.links_to_follow(self.sitemapindex))
        self.assertEqual(len(from_index), 1)
        first_url = from_index[0].url
        self.assertEqual(first_url, 'http://www.example.com/sitemap1.xml.gz')

Esempio n. 11

0

Mostra file

File: annotations.py Progetto: fakegit/portia

 def handle_xml(self, response, seen):
     """Yield the truthy results of ``_filter_link`` for links in *response*.

     The extractor type is the response's content subtype up to the first
     ``+``. If ``create_linkextractor_from_specs`` raises ``ValueError`` for
     that type, an ``XmlLinkExtractor`` is used instead.
     """
     kind = content_type(response).subtype.split('+')[0]
     specs = {'type': kind, 'value': ''}
     try:
         extractor = create_linkextractor_from_specs(specs)
     except ValueError:
         extractor = XmlLinkExtractor()
     for link in extractor.links_to_follow(response):
         request = self._filter_link(link, seen)
         if not request:
             continue
         yield request

Esempio n. 12

0

Mostra file

File: test_linkextractors.py Progetto: torome/portia

 def test_custom(self):
     """Custom regex extractor should follow both ``www.example.com`` URLs.

     The body contains three ``url: ...`` fragments; the test expects the
     two ``www.example.com`` ones, in order of appearance.
     """
     specs = {
         "type": "regex",
         # Raw string: "\w" inside a plain literal is an invalid escape
         # sequence (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
         # The pattern text itself is unchanged.
         "value": r'url: ((?:http|https)://www.example.com/[\w/]+)',
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = TextResponse(url='http://www.example.com/', body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
     self.assertEqual(links[1].url, 'https://www.example.com/path2')

Esempio n. 13

0

Mostra file

File: spider.py Progetto: semutter/portia

    def _create_start_request_from_specs(self, info):
        """Build the start ``Request`` for ``info["url"]``.

        Without a ``"link_extractor"`` entry in *info* (or with a falsy one)
        the request calls back into ``self.parse``. Otherwise a link
        extractor is built from those specs and the request's callback
        yields one ``Request`` per extracted link, each handled by
        ``spider.parse``.
        """
        start_url = info["url"]
        extractor_specs = info.get("link_extractor")
        if not extractor_specs:
            return Request(url=start_url, callback=self.parse)

        extractor = create_linkextractor_from_specs(extractor_specs)

        # NOTE(review): this callback takes (spider, response) but is handed
        # to Request unbound; confirm how the caller invokes it.
        def _callback(spider, response):
            for found in extractor.links_to_follow(response):
                yield Request(url=found.url, callback=spider.parse)

        return Request(url=start_url, callback=_callback)

Esempio n. 14

0

Mostra file

File: annotations.py Progetto: tomzhang/portia

 def handle_xml(self, response, seen):
     """Yield the truthy results of ``_filter_link`` for links in *response*.

     The extractor type comes from the ``type`` group of
     ``XML_APPLICATION_TYPE`` applied to the ``Content-Type`` header, or
     ``'xml'`` when that yields nothing truthy. If
     ``create_linkextractor_from_specs`` raises ``ValueError`` for the type,
     a ``SitemapLinkExtractor`` is used instead.
     """
     matched = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
     if matched:
         kind = matched.groupdict()['type']
     else:
         kind = 'xml'
     specs = {'type': kind, 'value': ''}
     try:
         extractor = create_linkextractor_from_specs(specs)
     except ValueError:
         extractor = SitemapLinkExtractor()
     for link in extractor.links_to_follow(response):
         request = self._filter_link(link, seen)
         if not request:
             continue
         yield request

Esempio n. 15

0

Mostra file

File: annotations.py Progetto: BenJamesbabala/portia

 def handle_xml(self, response, seen):
     """Yield filtered requests for every link extracted from *response*.

     The ``Content-Type`` header is run through ``XML_APPLICATION_TYPE``;
     its ``type`` group names the extractor, defaulting to ``'xml'`` when
     the result is falsy. A ``ValueError`` from
     ``create_linkextractor_from_specs`` falls back to
     ``SitemapLinkExtractor``.
     """
     header_value = response.headers.get('Content-Type', '')
     matched = XML_APPLICATION_TYPE(header_value)
     kind = 'xml' if not matched else matched.groupdict()['type']
     try:
         extractor = create_linkextractor_from_specs(
             {'type': kind, 'value': ''})
     except ValueError:
         extractor = SitemapLinkExtractor()
     # Lazy generator: each link is filtered just before it may be yielded.
     candidates = (self._filter_link(link, seen)
                   for link in extractor.links_to_follow(response))
     for request in candidates:
         if request:
             yield request

Esempio n. 16

0

Mostra file

File: annotations.py Progetto: monocleman1/dd

 def handle_xml(self, response, seen):
     """Yield filtered requests for every link extracted from *response*.

     The part of the content subtype before the first ``+`` names the
     extractor. A ``ValueError`` from ``create_linkextractor_from_specs``
     falls back to ``XmlLinkExtractor``.
     """
     subtype = content_type(response).subtype
     kind = subtype.split('+')[0]
     try:
         extractor = create_linkextractor_from_specs(
             {'type': kind, 'value': ''})
     except ValueError:
         extractor = XmlLinkExtractor()
     # Lazy generator: each link is filtered just before it may be yielded.
     candidates = (self._filter_link(link, seen)
                   for link in extractor.links_to_follow(response))
     for request in candidates:
         if request:
             yield request

Esempio n. 17

0

Mostra file

File: test_linkextractors.py Progetto: 01-/portia

 def test_start_urls(self):
     """Pagination extractor with ``start_urls`` should follow all three anchors.

     Links are sorted by URL before the assertions, so the checks do not
     depend on the order in which the extractor returns them.
     """
     specs = {
         "type": "pagination",
         "value": None,
         "start_urls": [
             'http://www.spam.com/?p=1',
             'http://www.eggs.com/?page=0',
         ],
     }
     extractor = create_linkextractor_from_specs(specs)
     markup = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     raw_response = HtmlResponse(url='http://www.example.com/', body=markup)
     page = htmlpage_from_response(raw_response)
     found = list(extractor.links_to_follow(page))
     found.sort(key=lambda link: link.url)
     self.assertEqual(len(found), 3)
     # Length is pinned to three above, so unpacking is safe here.
     first, second, third = found
     self.assertEqual(first.url, "http://www.spam.com/?p=100")
     self.assertEqual(second.url, "http://www.spam.com/?p=200")
     self.assertEqual(third.url, "http://www.spam.com/?p=300")
     self.assertEqual(first.text, 'Click here 1')
     self.assertEqual(second.text, 'Click here 2')
     self.assertEqual(third.text, 'Click here 3')

Esempio n. 18

0

Mostra file

 def test_start_urls(self):
     """Pagination extractor with ``start_urls`` should follow all three anchors.

     Links are sorted by URL before the assertions, so the checks do not
     depend on the order in which the extractor returns them.
     """
     specs = {
         "type": "pagination",
         "value": None,
         "start_urls": [
             'http://www.spam.com/?p=1',
             'http://www.eggs.com/?page=0',
         ],
     }
     extractor = create_linkextractor_from_specs(specs)
     markup = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     raw_response = UTF8HtmlResponse(url='http://www.example.com/', body=markup)
     page = htmlpage_from_response(raw_response)
     found = list(extractor.links_to_follow(page))
     found.sort(key=lambda link: link.url)
     self.assertEqual(len(found), 3)
     # Length is pinned to three above, so unpacking is safe here.
     first, second, third = found
     self.assertEqual(first.url, "http://www.spam.com/?p=100")
     self.assertEqual(second.url, "http://www.spam.com/?p=200")
     self.assertEqual(third.url, "http://www.spam.com/?p=300")
     self.assertEqual(first.text, 'Click here 1')
     self.assertEqual(second.text, 'Click here 2')
     self.assertEqual(third.text, 'Click here 3')

Esempio n. 19

0

Mostra file

File: test_linkextractors.py Progetto: plafl/portia

 def test_rss(self):
     """RSS extractor should return exactly one link from ``self.response``."""
     extractor = create_linkextractor_from_specs({"type": "rss", "value": ""})
     followed = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(followed), 1)
     only_link = followed[0]
     self.assertEqual(only_link.url, "http://www.wikipedia.org/")

Esempio n. 20

0

Mostra file

File: test_linkextractors.py Progetto: r2k0/slybot

 def test_xml_remove_namespaces(self):
     """XPath extractor with ``remove_namespaces`` should find three links in ``self.atom``.

     Only the first link's URL is checked.
     """
     extractor = create_linkextractor_from_specs({
         "type": "xpath",
         "value": "//link/@href",
         "remove_namespaces": True,
     })
     followed = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(followed), 3)
     first_link = followed[0]
     self.assertEqual(first_link.url, 'http://example.org/feed/')

Esempio n. 21

0

Mostra file

File: test_linkextractors.py Progetto: daqv/portia-dashboard

 def test_xml_remove_namespaces(self):
     """XPath ``//link/@href`` with namespaces removed should match three links.

     Runs against ``self.atom`` and checks the first link's URL.
     """
     xpath_specs = {
         "type": "xpath",
         "value": "//link/@href",
         "remove_namespaces": True,
     }
     extractor = create_linkextractor_from_specs(xpath_specs)
     found = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(found), 3)
     self.assertEqual(found[0].url, 'http://example.org/feed/')

Esempio n. 22

0

Mostra file

File: test_linkextractors.py Progetto: daqv/portia-dashboard

 def test_atom(self):
     """Atom extractor should return three links from ``self.atom``.

     Only the first link's URL is checked.
     """
     extractor = create_linkextractor_from_specs({"type": "atom", "value": ""})
     followed = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(followed), 3)
     first_link = followed[0]
     self.assertEqual(first_link.url, 'http://example.org/feed/')

Esempio n. 23

0

Mostra file

 def test_xml(self):
     """XPath ``//item/link/text()`` should yield one link from ``self.response``."""
     extractor = create_linkextractor_from_specs(
         {"type": "xpath", "value": "//item/link/text()"})
     followed = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(followed), 1)
     only_link = followed[0]
     self.assertEqual(only_link.url, 'http://www.wikipedia.org/')

Esempio n. 24

0

Mostra file

File: test_linkextractors.py Progetto: daqv/portia-dashboard

 def test_xml(self):
     """XPath extractor over ``self.response`` should return a single link."""
     xpath_specs = {"type": "xpath", "value": "//item/link/text()"}
     extractor = create_linkextractor_from_specs(xpath_specs)
     found = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(found), 1)
     self.assertEqual(found[0].url, 'http://www.wikipedia.org/')

Esempio n. 25

0

Mostra file

 def test_atom(self):
     """Atom extractor over ``self.atom`` should return three links.

     The first link's URL is the only one asserted.
     """
     atom_specs = {"type": "atom", "value": ""}
     extractor = create_linkextractor_from_specs(atom_specs)
     found = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(found), 3)
     self.assertEqual(found[0].url, 'http://example.org/feed/')