Esempio n. 1
0
 def setUp(self):
     """Build the ``Page`` under test with every collaborator mocked out."""
     def _autospec_instance(spec):
         # ``create_autospec`` on a class yields a mock class; its
         # ``return_value`` is the matching mock instance.
         return mock.create_autospec(spec).return_value

     page_scraper = mock_factory.create_mock_page_scraper()
     self.page = Page(
         "http://test.com/index1.html",
         page_scraper,
         _autospec_instance(RequestFactory),
         _autospec_instance(ExtractorFactory),
         _autospec_instance(UrlJoiner),
         _autospec_instance(RequestSender),
         _autospec_instance(CompatibilityFactory),
     )

     # Attributes are overridden in the original order: ``content`` first,
     # ``extractor`` afterwards (presumably assigning content may touch the
     # extractor -- keep this ordering).
     overrides = (
         ("content", Element),
         ("request", Request),
         ("extractor", Extractor),
         ("string_converter", CompatibleStringConverter),
     )
     for attribute, spec in overrides:
         setattr(self.page, attribute, _autospec_instance(spec))
Esempio n. 2
0
 def setUp(self):
     """Create ``self.page`` from mocked dependencies before each test."""
     autospec = mock.create_autospec
     page_scraper = mock_factory.create_mock_page_scraper()

     # Positional constructor arguments that follow the url and the scraper.
     dependency_specs = (RequestFactory, ExtractorFactory, UrlJoiner, RequestSender, CompatibilityFactory)
     dependencies = [autospec(spec).return_value for spec in dependency_specs]
     self.page = Page("http://test.com/index1.html", page_scraper, *dependencies)

     # Replace the page's own state with mocks as well; ``content`` is set
     # before ``extractor``, exactly as in the original sequence.
     fake_content = autospec(Element).return_value
     self.page.content = fake_content
     fake_request = autospec(Request).return_value
     self.page.request = fake_request
     fake_extractor = autospec(Extractor).return_value
     self.page.extractor = fake_extractor
     fake_converter = autospec(CompatibleStringConverter).return_value
     self.page.string_converter = fake_converter
Esempio n. 3
0
class TestPage(unittest.TestCase):
    """Unit tests for ``Page`` with every collaborator replaced by a mock.

    Assertions use ``assertEqual``; the ``assertEquals`` alias is deprecated
    and no longer exists in Python 3.12+.
    """

    def setUp(self):
        """Create a ``Page`` whose dependencies are all autospec'd mocks."""
        url = "http://test.com/index1.html"
        scraper = mock_factory.create_mock_page_scraper()
        request_factory = mock.create_autospec(RequestFactory).return_value
        extractor_factory = mock.create_autospec(ExtractorFactory).return_value
        url_joiner = mock.create_autospec(UrlJoiner).return_value
        request_sender = mock.create_autospec(RequestSender).return_value
        compatibility_factory = mock.create_autospec(
            CompatibilityFactory).return_value
        self.page = Page(url, scraper, request_factory, extractor_factory,
                         url_joiner, request_sender, compatibility_factory)
        # ``content`` is assigned before ``extractor``: test_set_content shows
        # that assigning content replaces the extractor, so the mock
        # extractor must be installed afterwards.
        self.page.content = mock.create_autospec(Element).return_value
        self.page.request = mock.create_autospec(Request).return_value
        self.page.extractor = mock.create_autospec(Extractor).return_value
        self.page.string_converter = mock.create_autospec(
            CompatibleStringConverter).return_value

    def test_set_content(self):
        """Assigning ``content`` installs the extractor built by the factory."""
        mock_content = mock.create_autospec(Element).return_value
        mock_extractor = mock.create_autospec(Extractor).return_value
        self.page.extractor_factory.create_extractor.return_value = mock_extractor
        self.page.content = mock_content
        self.assertEqual(self.page.extractor, mock_extractor)

    def test_extract_items(self):
        """``extract_items`` returns the scraper's ``extract_items_list`` result."""
        mock_items_list = mock.Mock()
        self.page.scraper.extract_items_list.return_value = mock_items_list
        items_list = self.page.extract_items()
        self.assertEqual(items_list, mock_items_list)

    def test_extract_pages(self):
        """``extract_pages`` returns the scraper's ``extract_pages_list`` result."""
        mock_pages_list = mock_factory.create_mock_pages(10)
        self.page.scraper.extract_pages_list.return_value = mock_pages_list
        pages_list = self.page.extract_pages()
        self.assertEqual(pages_list, mock_pages_list)

    def test_xpath(self):
        """``xpath`` yields the elements produced by the extractor."""
        mock_path = "//div[@class='header_blue']"
        # NOTE(review): this assigns a plain str (not a callable) to __str__;
        # it looks like descriptive fixture data only -- confirm it is never
        # invoked via str(content). Same pattern in the css tests below.
        self.page.content.__str__ = "<html><div class='header_blue'>text1</div><div class='header_blue'>text2</div></html>"
        self.page.extractor.xpath.return_value = mock_factory.create_mock_fallback_list(
            ["<div>", "<div>"])
        result = self.page.xpath(mock_path)
        self.assertEqual(result, ["<div>", "<div>"])

    def test_css(self):
        """``css`` yields the elements produced by the extractor."""
        mock_path = "a"
        self.page.content.__str__ = "<html><a href='url1'>text1</a><a href='url2'>text2</a></html>"
        self.page.extractor.css.return_value = mock_factory.create_mock_fallback_list(
            ["<a>", "<a>"])
        result = self.page.css(mock_path)
        self.assertEqual(result, ["<a>", "<a>"])

    def test_css_text(self):
        """``css_text`` yields the text values produced by the extractor."""
        mock_path = "a"
        self.page.content.__str__ = "<html><a href='url1'>text1</a><a href='url2'>text2</a></html>"
        self.page.extractor.css_text.return_value = mock_factory.create_mock_fallback_list(
            ["text1", "text2"])
        result = self.page.css_text(mock_path)
        self.assertEqual(result, ["text1", "text2"])

    def test_css_attr(self):
        """``css_attr`` yields the attribute values produced by the extractor."""
        mock_path = "a"
        mock_attribute_name = "href"
        self.page.content.__str__ = "<html><a href='url1'>text1</a><a href='url2'>text2</a></html>"
        self.page.extractor.css_attr.return_value = mock_factory.create_mock_fallback_list(
            ["url1", "url2"])
        result = self.page.css_attr(mock_path, mock_attribute_name)
        self.assertEqual(result, ["url1", "url2"])

    def test_file(self):
        """``file`` returns the base64 payload obtained from the request sender."""
        mock_url = "http://test.com/files/example_file.txt"
        mock_base64 = "IlRoZSBmaXJzdCBsaW5lLlxuIFRoZSBzZWNvbmQgbGluZS4i"
        mock_request = mock.create_autospec(Request).return_value
        self.page.request_factory.create_request.return_value = mock_request
        self.page.request_sender.get_base64.return_value = mock_base64
        result = self.page.file(mock_url)
        self.assertEqual(result, mock_base64)

    @mock.patch.object(Page, 'to_url')
    def test_to_urls(self, mock_to_url):
        """``to_urls`` calls ``to_url`` once per link and collects the results."""
        self.page.url = "http://test.com/page/url.html"
        links = [
            "http://test.com/link/to/example_page.html",
            "link/to/example_page.html", "/link/to/example_page.html"
        ]
        mock_to_url.return_value = "http://test.com/link/to/example_page.html"
        result = self.page.to_urls(links)
        self.assertEqual(mock_to_url.call_count, len(links))
        self.assertEqual(result, [
            "http://test.com/link/to/example_page.html",
            "http://test.com/link/to/example_page.html",
            "http://test.com/link/to/example_page.html"
        ])

    def test_to_url(self):
        """``to_url`` delegates to ``url_joiner.join_protocol_domain``."""
        self.page.url = "http://test.com/page/url.html"
        link = "/link/to/example_page.html"
        self.page.url_joiner.join_protocol_domain.return_value = "http://test.com/link/to/example_page.html"
        result = self.page.to_url(link)
        self.page.url_joiner.join_protocol_domain.assert_called_once_with(
            self.page.url, link)
        self.assertEqual(result, "http://test.com/link/to/example_page.html")

    @mock.patch('xcrawler.core.crawler.page.etree')
    def test_str_python2(self, mock_etree_module):
        """``__str__`` matches the converter's type when it yields a byte string."""
        mock_etree_module.tostring.return_value = b"<html><br>Page title</br></html>"
        self.page.string_converter.convert_to_string.return_value = b"<html><br>Page title</br></html>"
        # Called directly rather than via str(): the byte-string result would
        # make str() raise TypeError on Python 3.
        result = self.page.__str__()
        self.assertEqual(
            type(result),
            type(self.page.string_converter.convert_to_string.return_value))

    @mock.patch('xcrawler.core.crawler.page.etree')
    def test_str_python3(self, mock_etree_module):
        """``__str__`` matches the converter's type when it yields unicode text."""
        mock_etree_module.tostring.return_value = b"<html><br>Page title</br></html>"
        self.page.string_converter.convert_to_string.return_value = u"<html><br>Page title</br></html>"
        result = self.page.__str__()
        self.assertEqual(
            type(result),
            type(self.page.string_converter.convert_to_string.return_value))
Esempio n. 4
0
class TestPage(unittest.TestCase):
    """Tests for ``Page`` in isolation: all of its dependencies are mocks.

    ``assertEqual`` is used throughout because the legacy ``assertEquals``
    alias is deprecated and was removed in Python 3.12.
    """

    def setUp(self):
        """Construct the ``Page`` under test from autospec'd collaborators."""
        url = "http://test.com/index1.html"
        scraper = mock_factory.create_mock_page_scraper()
        request_factory = mock.create_autospec(RequestFactory).return_value
        extractor_factory = mock.create_autospec(ExtractorFactory).return_value
        url_joiner = mock.create_autospec(UrlJoiner).return_value
        request_sender = mock.create_autospec(RequestSender).return_value
        compatibility_factory = mock.create_autospec(CompatibilityFactory).return_value
        self.page = Page(url, scraper, request_factory, extractor_factory, url_joiner, request_sender,
                         compatibility_factory)
        # Order matters: test_set_content shows that assigning ``content``
        # replaces ``extractor``, so the mock extractor is set afterwards.
        self.page.content = mock.create_autospec(Element).return_value
        self.page.request = mock.create_autospec(Request).return_value
        self.page.extractor = mock.create_autospec(Extractor).return_value
        self.page.string_converter = mock.create_autospec(CompatibleStringConverter).return_value

    def test_set_content(self):
        """Setting ``content`` swaps in the extractor created by the factory."""
        mock_content = mock.create_autospec(Element).return_value
        mock_extractor = mock.create_autospec(Extractor).return_value
        self.page.extractor_factory.create_extractor.return_value = mock_extractor
        self.page.content = mock_content
        self.assertEqual(self.page.extractor, mock_extractor)

    def test_extract_items(self):
        """``extract_items`` hands back what ``scraper.extract_items_list`` returns."""
        mock_items_list = mock.Mock()
        self.page.scraper.extract_items_list.return_value = mock_items_list
        items_list = self.page.extract_items()
        self.assertEqual(items_list, mock_items_list)

    def test_extract_pages(self):
        """``extract_pages`` hands back what ``scraper.extract_pages_list`` returns."""
        mock_pages_list = mock_factory.create_mock_pages(10)
        self.page.scraper.extract_pages_list.return_value = mock_pages_list
        pages_list = self.page.extract_pages()
        self.assertEqual(pages_list, mock_pages_list)

    def test_xpath(self):
        """``xpath`` returns the extractor's matches for the given path."""
        mock_path = "//div[@class='header_blue']"
        # NOTE(review): a plain str (not a callable) is assigned to __str__
        # here and in the css tests below; presumably illustrative fixture
        # data only -- confirm str(content) is never evaluated.
        self.page.content.__str__ = "<html><div class='header_blue'>text1</div><div class='header_blue'>text2</div></html>"
        self.page.extractor.xpath.return_value = mock_factory.create_mock_fallback_list(["<div>", "<div>"])
        result = self.page.xpath(mock_path)
        self.assertEqual(result, ["<div>", "<div>"])

    def test_css(self):
        """``css`` returns the extractor's matches for the given selector."""
        mock_path = "a"
        self.page.content.__str__ = "<html><a href='url1'>text1</a><a href='url2'>text2</a></html>"
        self.page.extractor.css.return_value = mock_factory.create_mock_fallback_list(["<a>", "<a>"])
        result = self.page.css(mock_path)
        self.assertEqual(result, ["<a>", "<a>"])

    def test_css_text(self):
        """``css_text`` returns the extractor's text values for the selector."""
        mock_path = "a"
        self.page.content.__str__ = "<html><a href='url1'>text1</a><a href='url2'>text2</a></html>"
        self.page.extractor.css_text.return_value = mock_factory.create_mock_fallback_list(["text1", "text2"])
        result = self.page.css_text(mock_path)
        self.assertEqual(result, ["text1", "text2"])

    def test_css_attr(self):
        """``css_attr`` returns the extractor's attribute values for the selector."""
        mock_path = "a"
        mock_attribute_name = "href"
        self.page.content.__str__ = "<html><a href='url1'>text1</a><a href='url2'>text2</a></html>"
        self.page.extractor.css_attr.return_value = mock_factory.create_mock_fallback_list(["url1", "url2"])
        result = self.page.css_attr(mock_path, mock_attribute_name)
        self.assertEqual(result, ["url1", "url2"])

    def test_file(self):
        """``file`` returns the base64 string supplied by the request sender."""
        mock_url = "http://test.com/files/example_file.txt"
        mock_base64 = "IlRoZSBmaXJzdCBsaW5lLlxuIFRoZSBzZWNvbmQgbGluZS4i"
        mock_request = mock.create_autospec(Request).return_value
        self.page.request_factory.create_request.return_value = mock_request
        self.page.request_sender.get_base64.return_value = mock_base64
        result = self.page.file(mock_url)
        self.assertEqual(result, mock_base64)

    @mock.patch.object(Page, 'to_url')
    def test_to_urls(self, mock_to_url):
        """``to_urls`` invokes ``to_url`` once per link and gathers the results."""
        self.page.url = "http://test.com/page/url.html"
        links = ["http://test.com/link/to/example_page.html", "link/to/example_page.html", "/link/to/example_page.html"]
        mock_to_url.return_value = "http://test.com/link/to/example_page.html"
        result = self.page.to_urls(links)
        self.assertEqual(mock_to_url.call_count, len(links))
        self.assertEqual(result, ["http://test.com/link/to/example_page.html", "http://test.com/link/to/example_page.html",
                                  "http://test.com/link/to/example_page.html"])

    def test_to_url(self):
        """``to_url`` forwards the page url and the link to the url joiner."""
        self.page.url = "http://test.com/page/url.html"
        link = "/link/to/example_page.html"
        self.page.url_joiner.join_protocol_domain.return_value = "http://test.com/link/to/example_page.html"
        result = self.page.to_url(link)
        self.page.url_joiner.join_protocol_domain.assert_called_once_with(self.page.url, link)
        self.assertEqual(result, "http://test.com/link/to/example_page.html")

    @mock.patch('xcrawler.core.crawler.page.etree')
    def test_str_python2(self, mock_etree_module):
        """``__str__`` has the converter's result type for a byte-string result."""
        mock_etree_module.tostring.return_value = b"<html><br>Page title</br></html>"
        self.page.string_converter.convert_to_string.return_value = b"<html><br>Page title</br></html>"
        # __str__ is called directly: str() would reject a bytes result on
        # Python 3 with a TypeError.
        result = self.page.__str__()
        self.assertEqual(type(result), type(self.page.string_converter.convert_to_string.return_value))

    @mock.patch('xcrawler.core.crawler.page.etree')
    def test_str_python3(self, mock_etree_module):
        """``__str__`` has the converter's result type for a unicode result."""
        mock_etree_module.tostring.return_value = b"<html><br>Page title</br></html>"
        self.page.string_converter.convert_to_string.return_value = u"<html><br>Page title</br></html>"
        result = self.page.__str__()
        self.assertEqual(type(result), type(self.page.string_converter.convert_to_string.return_value))