Beispiel #1
0
 def test_link_text_wrong_encoding(self):
     """Link text holding a byte that is invalid for the declared encoding.

     The body carries a lone 0xED byte while the response is declared as
     UTF-8; the extracted link text is expected to contain U+FFFD in its
     place.
     """
     markup = """<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>"""
     page = HtmlResponse("http://www.example.com", body=markup, encoding='utf-8')

     extracted = BaseSgmlLinkExtractor().extract_links(page)
     expected = [
         Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd'),
     ]
     self.assertEqual(extracted, expected)
    def test_extraction_encoding(self):
        """Extract links from the 'noenc' and 'latin1' fixture pages.

        The 'noenc' fixture is served twice -- once with a utf-8 Content-Type
        header and once with no declared encoding -- and both responses are
        expected to yield the same links. The latin1 fixture is served without
        a declared encoding and has its own expected links.
        """
        noenc_body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8',
            body=noenc_body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=noenc_body)
        latin1_body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=latin1_body)

        extractor = BaseSgmlLinkExtractor()

        # Same fixture, same expectation, whether or not utf-8 was declared.
        for response in (response_utf8, response_noenc):
            self.assertEqual(extractor.extract_links(response), [
                Link(url='http://example.com/sample_%C3%B1.html', text=''),
                Link(url='http://example.com/sample_%E2%82%AC.html',
                     text='sample \xe2\x82\xac text'.decode('utf-8')),
            ])

        latin1_links = extractor.extract_links(response_latin1)
        self.assertEqual(latin1_links, [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html',
                 text='sample \xe1 text'.decode('latin1')),
        ])
Beispiel #3
0
 def __init__(self,
              allow=(),
              deny=(),
              allow_domains=(),
              deny_domains=(),
              restrict_xpaths=(),
              tags=('a', 'area'),
              attrs=('href',),
              canonicalize=True,
              unique=True,
              process_value=None):
     """Store the filtering rules and configure the base extractor.

     ``allow`` / ``deny`` entries may be compiled patterns or pattern
     strings; strings are compiled with ``re.compile``. ``allow_domains``
     and ``deny_domains`` are stored as sets and ``restrict_xpaths`` as a
     tuple. ``tags`` and ``attrs`` name the elements and attributes the
     base extractor should consider; ``unique`` and ``process_value`` are
     forwarded to ``BaseSgmlLinkExtractor.__init__`` unchanged.
     """
     self.allow_res = [
         x if isinstance(x, _re_type) else re.compile(x)
         for x in arg_to_iter(allow)
     ]
     self.deny_res = [
         x if isinstance(x, _re_type) else re.compile(x)
         for x in arg_to_iter(deny)
     ]
     self.allow_domains = set(arg_to_iter(allow_domains))
     self.deny_domains = set(arg_to_iter(deny_domains))
     self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
     self.canonicalize = canonicalize

     # Normalise to sets of names so the membership tests below are exact.
     # Testing ``x in 'href'`` against a bare string is a substring check
     # and would also accept 'h', 'ref' or ''. Going through arg_to_iter
     # (the helper already used for the other arguments) is meant to keep
     # a bare string argument working as a single name.
     tags = set(arg_to_iter(tags))
     attrs = set(arg_to_iter(attrs))
     tag_func = lambda x: x in tags
     attr_func = lambda x: x in attrs
     BaseSgmlLinkExtractor.__init__(self,
                                    tag=tag_func,
                                    attr=attr_func,
                                    unique=unique,
                                    process_value=process_value)
    def test_matches(self):
        """An extractor built with no arguments reports both sample URLs as matching."""
        extractor = BaseSgmlLinkExtractor()
        sample_urls = (
            'http://lotsofstuff.com/stuff1/index',
            'http://evenmorestuff.com/uglystuff/index',
        )
        for sample in sample_urls:
            self.assertEqual(extractor.matches(sample), True)
    def test_base_url(self):
        """Relative links are resolved against the page's ``<base href>``.

        Three base forms are exercised: an absolute URL on another domain, a
        host-relative path ("/"), and a scheme-less "//host/path/" URL.
        """
        markup = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        page = HtmlResponse(
            "http://example.org/somepage/index.html", body=markup)

        extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
        found = extractor.extract_links(page)
        self.assertEqual(
            found,
            [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

        # Base is an absolute path, so it is relative to the page's host.
        markup = """<html><head><title>Page title<title><base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>"""
        page = HtmlResponse(
            "https://example.org/somepage/index.html", body=markup)
        found = extractor.extract_links(page)
        self.assertEqual(
            found,
            [Link(url='https://example.org/item/12.html', text='Item 12')])

        # Base has no scheme; the expected URL uses the page's https scheme.
        markup = """<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>"""
        page = HtmlResponse(
            "https://example.org/somepage/index.html", body=markup)
        found = extractor.extract_links(page)
        self.assertEqual(
            found,
            [Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')])
    def test_base_url(self):
        """Check link resolution for three different ``<base href>`` forms.

        Each case is (page URL, page markup, expected absolute link URL); the
        single link on every page is expected to carry the text 'Item 12'.
        """
        cases = (
            # base url is an absolute URL on another domain
            ("http://example.org/somepage/index.html",
             """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>""",
             'http://otherdomain.com/base/item/12.html'),
            # base url is an absolute path and relative to host
            ("https://example.org/somepage/index.html",
             """<html><head><title>Page title<title><base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>""",
             'https://example.org/item/12.html'),
            # base url has no scheme
            ("https://example.org/somepage/index.html",
             """<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>""",
             'https://noschemedomain.com/path/to/item/12.html'),
        )

        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
        for page_url, markup, expected_url in cases:
            response = HtmlResponse(page_url, body=markup)
            self.assertEqual(lx.extract_links(response),
                             [Link(url=expected_url, text='Item 12')])
    def test_extraction_encoding(self):
        """Compare extracted links for utf-8 declared, undeclared and latin1 pages.

        The first two responses share one fixture body and differ only in
        whether a utf-8 Content-Type header is present; the third uses the
        latin1 fixture with no declared encoding.
        """
        raw = get_testdata('link_extractor', 'linkextractor_noenc.html')
        utf8_headers = {'Content-Type': ['text/html; charset=utf-8']}
        response_utf8 = HtmlResponse(url='http://example.com/utf8',
                                     body=raw,
                                     headers=utf8_headers)
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=raw)
        raw = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=raw)

        extractor = BaseSgmlLinkExtractor()

        utf8_links = extractor.extract_links(response_utf8)
        self.assertEqual(utf8_links, [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8'))
        ])

        noenc_links = extractor.extract_links(response_noenc)
        self.assertEqual(noenc_links, [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8'))
        ])

        latin1_links = extractor.extract_links(response_latin1)
        self.assertEqual(latin1_links, [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html',
                 text='sample \xe1 text'.decode('latin1'))
        ])
 def test_link_text_wrong_encoding(self):
     """Expect U+FFFD in the link text where the body has a stray 0xED byte.

     The response is explicitly declared as UTF-8, for which that lone byte
     is not a valid sequence.
     """
     broken_html = """<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>"""
     response = HtmlResponse("http://www.example.com", body=broken_html, encoding='utf-8')
     extractor = BaseSgmlLinkExtractor()
     links = extractor.extract_links(response)
     self.assertEqual(
         links,
         [Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd')],
     )
    def test_extraction_encoding(self):
        """Link URLs and text must come out right for each fixture encoding.

        One fixture is served with and without a utf-8 Content-Type header
        (same expected links either way); a second, latin1 fixture is served
        with no declared encoding.
        """
        fixture = get_testdata("link_extractor", "linkextractor_noenc.html")
        response_utf8 = HtmlResponse(
            url="http://example.com/utf8",
            body=fixture,
            headers={"Content-Type": ["text/html; charset=utf-8"]},
        )
        response_noenc = HtmlResponse(url="http://example.com/noenc", body=fixture)
        fixture = get_testdata("link_extractor", "linkextractor_latin1.html")
        response_latin1 = HtmlResponse(url="http://example.com/latin1", body=fixture)

        extractor = BaseSgmlLinkExtractor()

        # Both responses wrap the same bytes and must produce the same links.
        for same_body_response in (response_utf8, response_noenc):
            got = extractor.extract_links(same_body_response)
            want = [
                Link(url="http://example.com/sample_%C3%B1.html", text=""),
                Link(url="http://example.com/sample_%E2%82%AC.html", text="sample \xe2\x82\xac text".decode("utf-8")),
            ]
            self.assertEqual(got, want)

        got = extractor.extract_links(response_latin1)
        want = [
            Link(url="http://example.com/sample_%F1.html", text=""),
            Link(url="http://example.com/sample_%E1.html", text="sample \xe1 text".decode("latin1")),
        ]
        self.assertEqual(got, want)
    def test_matches(self):
        """``matches`` returns True for both sample URLs on a default extractor."""
        first_url = 'http://lotsofstuff.com/stuff1/index'
        second_url = 'http://evenmorestuff.com/uglystuff/index'

        extractor = BaseSgmlLinkExtractor()
        first_result = extractor.matches(first_url)
        self.assertEqual(first_result, True)
        second_result = extractor.matches(second_url)
        self.assertEqual(second_result, True)
    def test_base_url(self):
        """A relative link is resolved against the ``<base href>`` on another domain."""
        markup = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        page = HtmlResponse("http://example.org/somepage/index.html", body=markup)

        extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
        found = extractor.extract_links(page)
        expected = [Link(url="http://otherdomain.com/base/item/12.html", text="Item 12")]
        self.assertEqual(found, expected)
    def test_basic(self):
        """Extract the anchor links of a small page, in document order.

        The page mixes relative, host-relative and parent-relative hrefs, an
        ``<img>`` (expected to be ignored) and an empty self-closed anchor
        (expected to yield a link with empty text).
        """
        markup = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p>
        </body></html>"""
        page = HtmlResponse("http://example.org/somepage/index.html", body=markup)

        extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
        found = extractor.extract_links(page)

        expected_pairs = (
            ('http://example.org/somepage/item/12.html', 'Item 12'),
            ('http://example.org/about.html', 'About us'),
            ('http://example.org/othercat.html', 'Other category'),
            ('http://example.org/', ''),
        )
        expected = [Link(url=target, text=label) for target, label in expected_pairs]
        self.assertEqual(found, expected)
Beispiel #13
0
    def _process_links(self, links):
        """Filter ``links`` by this extractor's rules, then defer to the base class.

        Stages run in order, each over the survivors of the previous one: URL
        validity (skipped when ``self.check_url`` is falsy), ``allow_res``,
        ``deny_res``, ``allow_domains``, ``deny_domains``. A falsy rule set
        disables its stage. When ``self.canonicalize`` is set, each surviving
        link's ``url`` is rewritten in place with ``canonicalize_url``.

        Returns whatever ``BaseSgmlLinkExtractor._process_links`` returns for
        the filtered list.
        """
        kept = [
            candidate for candidate in links
            if not self.check_url or _is_valid_url(candidate.url)
        ]

        # (rule set, keep-predicate) pairs, applied strictly in this order.
        stages = (
            (self.allow_res,
             lambda url, rules: _matches(url, rules)),
            (self.deny_res,
             lambda url, rules: not _matches(url, rules)),
            (self.allow_domains,
             lambda url, rules: url_is_from_any_domain(url, rules)),
            (self.deny_domains,
             lambda url, rules: not url_is_from_any_domain(url, rules)),
        )
        for rules, keep in stages:
            if rules:
                kept = [
                    candidate for candidate in kept
                    if keep(candidate.url, rules)
                ]

        if self.canonicalize:
            for candidate in kept:
                candidate.url = canonicalize_url(candidate.url)

        return BaseSgmlLinkExtractor._process_links(self, kept)
Beispiel #14
0
    def _process_links(self, links):
        """Filter ``links`` and keep only those with an accepted customer id.

        Links are first filtered by URL validity and by the allow/deny regex
        and domain rules (a falsy rule set disables its filter). The customer
        id is then read from the seventh '/'-separated component of each URL
        (index 6); links for which ``self._ignore_identifier`` returns a
        truthy value are dropped. URLs with fewer than seven components carry
        no id and are skipped rather than raising IndexError. Surviving URLs
        are canonicalized when ``self.canonicalize`` is set, and the list is
        handed to ``BaseSgmlLinkExtractor._process_links``.
        """
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        id_index = 6  # position of the customer id within url.split('/')
        new_links = []
        for link in links:
            parts = link.url.split('/')
            if len(parts) <= id_index:
                # Too short to contain an id; one odd URL must not abort
                # processing of the whole page.
                continue
            customer_id = parts[id_index]
            if not self._ignore_identifier(customer_id):
                log.msg("Found CustomerId: " + customer_id, level=log.DEBUG)
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Beispiel #15
0
    def _process_links(self, links):
        """Apply validity, regex and domain filters, then defer to the base class.

        Domain checks go through ``self._url_is_from_any_domain``. A falsy
        rule set disables its filter. When ``self.canonicalize`` is set, each
        surviving link's ``url`` is replaced with ``canonicalize_url(url)``
        before the list is passed to ``BaseSgmlLinkExtractor._process_links``.
        """
        def narrow(pool, keep):
            # Members of ``pool`` whose URL satisfies ``keep``, order preserved.
            return [item for item in pool if keep(item.url)]

        remaining = narrow(links, _is_valid_url)

        if self.allow_res:
            remaining = narrow(
                remaining, lambda url: _matches(url, self.allow_res))
        if self.deny_res:
            remaining = narrow(
                remaining, lambda url: not _matches(url, self.deny_res))
        if self.allow_domains:
            remaining = narrow(
                remaining,
                lambda url: self._url_is_from_any_domain(url, self.allow_domains))
        if self.deny_domains:
            remaining = narrow(
                remaining,
                lambda url: not self._url_is_from_any_domain(url, self.deny_domains))

        if self.canonicalize:
            for item in remaining:
                item.url = canonicalize_url(item.url)

        return BaseSgmlLinkExtractor._process_links(self, remaining)
Beispiel #16
0
    def _process_links(self, links):
        """Filter ``links`` and rewrite the survivors to product-review URLs.

        Links are first filtered by URL validity and by the allow/deny regex
        and domain rules (a falsy rule set disables its filter). The ASIN is
        then read from the sixth '/'-separated component of each URL
        (index 5). Links for which ``self._ignore_identifier`` returns a
        truthy value are dropped; the rest have their ``url`` replaced by the
        product-reviews URL built from that ASIN. URLs with fewer than six
        components carry no ASIN and are skipped rather than raising
        IndexError. Surviving URLs are canonicalized when
        ``self.canonicalize`` is set, and the list is handed to
        ``BaseSgmlLinkExtractor._process_links``.
        """
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        asin_index = 5  # position of the ASIN within url.split('/')
        new_links = []
        for link in links:
            parts = link.url.split('/')
            if len(parts) <= asin_index:
                # Too short to contain an ASIN; one odd URL must not abort
                # processing of the whole page.
                continue
            asin = parts[asin_index]
            if not self._ignore_identifier(asin):
                log.msg("Found ASIN: " + asin, level=log.DEBUG)
                # Plain concatenation on purpose: the literal contains '%3D',
                # which %-formatting would misread as a format directive.
                link.url = "http://www.amazon.com/product-reviews/" + asin + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Beispiel #17
0
    def test_basic(self):
        """Extract anchor links from a small page and compare in document order.

        Covers relative, host-relative and parent-relative hrefs, link text
        written with ``&gt;`` entities (expected as '>>'), an ``<img>`` that
        must not produce a link, and an empty self-closed anchor (expected
        with empty text).
        """
        markup = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/">&gt;&gt;</a></p>
        <p><a href="/" /></p>
        </body></html>"""
        page = HtmlResponse("http://example.org/somepage/index.html", body=markup)

        extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
        found = extractor.extract_links(page)

        expected_pairs = (
            ('http://example.org/somepage/item/12.html', 'Item 12'),
            ('http://example.org/about.html', 'About us'),
            ('http://example.org/othercat.html', 'Other category'),
            ('http://example.org/', '>>'),
            ('http://example.org/', ''),
        )
        expected = [Link(url=target, text=label) for target, label in expected_pairs]
        self.assertEqual(found, expected)
    def _process_links(self, links):
        """Run ``links`` through the extractor's filters and the base-class step.

        Order of filtering: URL validity (only when ``self.check_url`` is
        truthy), allow regexes, deny regexes, allowed domains, denied domains.
        A falsy rule set disables its filter. With ``self.canonicalize`` set,
        surviving URLs are rewritten in place via ``canonicalize_url``.
        """
        survivors = []
        for link in links:
            if not self.check_url or _is_valid_url(link.url):
                survivors.append(link)

        # (rule set, test function, wanted outcome); applied top to bottom.
        rule_table = (
            (self.allow_res, _matches, True),
            (self.deny_res, _matches, False),
            (self.allow_domains, url_is_from_any_domain, True),
            (self.deny_domains, url_is_from_any_domain, False),
        )
        for rules, test, wanted in rule_table:
            if not rules:
                continue
            survivors = [link for link in survivors if bool(test(link.url, rules)) is wanted]

        if self.canonicalize:
            for link in survivors:
                link.url = canonicalize_url(link.url)

        return BaseSgmlLinkExtractor._process_links(self, survivors)
Beispiel #19
0
    def _process_links(self, links):
        """Filter ``links`` and point the survivors at their product-review pages.

        After the validity, allow/deny regex and allow/deny domain filters (a
        falsy rule set disables its filter), the ASIN is taken from index 5 of
        ``url.split('/')``. Links whose ASIN is rejected by
        ``self._ignore_identifier`` are dropped; the others get a
        product-reviews URL built from the ASIN. URLs too short to contain
        that component are skipped instead of raising IndexError. Surviving
        URLs are canonicalized when ``self.canonicalize`` is set, and the list
        is passed to ``BaseSgmlLinkExtractor._process_links``.
        """
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        asin_index = 5  # position of the ASIN within url.split('/')
        new_links = []
        for link in links:
            parts = link.url.split('/')
            if len(parts) <= asin_index:
                # No ASIN component; skip so one short URL cannot abort the
                # whole batch.
                continue
            asin = parts[asin_index]
            if not self._ignore_identifier(asin):
                log.msg("Found ASIN: " + asin, level=log.DEBUG)
                # Plain concatenation on purpose: the literal contains '%3D',
                # which %-formatting would misread as a format directive.
                link.url = "http://www.amazon.com/product-reviews/" + asin + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Beispiel #20
0
    def _process_links(self, links):
        """Filter ``links`` down to those whose customer id is accepted.

        After the validity, allow/deny regex and allow/deny domain filters (a
        falsy rule set disables its filter), the customer id is taken from
        index 6 of ``url.split('/')``. Links whose id is rejected by
        ``self._ignore_identifier`` are dropped. URLs too short to contain
        that component are skipped instead of raising IndexError. Surviving
        URLs are canonicalized when ``self.canonicalize`` is set, and the list
        is passed to ``BaseSgmlLinkExtractor._process_links``.
        """
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        id_index = 6  # position of the customer id within url.split('/')
        new_links = []
        for link in links:
            parts = link.url.split('/')
            if len(parts) <= id_index:
                # No id component; skip so one short URL cannot abort the
                # whole batch.
                continue
            customer_id = parts[id_index]
            if not self._ignore_identifier(customer_id):
                log.msg("Found CustomerId: " + customer_id, level=log.DEBUG)
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links