def get_links(self, url):
    """
    Scrape ``url`` with HTMLScraper and return the links it reports.

    A falsy scrape result produces an empty list. Otherwise the item at
    index 1 of every entry under the ``'links'`` key is returned, in the
    order the scraper listed them (presumably the href of a (text, href)
    pair -- confirm against HTMLScraper).
    """
    page = HTMLScraper(url=url).get_content()
    if page:
        return [entry[1] for entry in page['links']]
    return []
Example #2
0
class ScraperTestCase(unittest.TestCase):
    """
    Tests for the TagSelector parser and the HTMLScraper wrapper.

    Every test gets a small HTML fixture, a fresh TagSelector and an
    HTMLScraper pointed at ``self.url`` (see ``setUp``).
    """

    def setUp(self):
        """
        Create the HTML fixture, target url, selector and scraper.
        """
        self.html = """
            <html>
            <head>
                <title>title</title>
            </head>
            <body>
                <div>div<a href="http://www.google.com">link</a></div>
            </body>
            </html>
        """
        self.url = "http://www.google.com"
        self.selector = TagSelector()
        self.scraper = HTMLScraper(url=self.url)

    def test_attr_dict(self):
        # attr_dict turns a list of (name, value) pairs into a dict;
        # an empty list maps to an empty dict.
        self.assertEqual(self.selector.attr_dict([]), {})
        self.assertEqual(self.selector.attr_dict([('class', 'test_class')]),
                         {'class': 'test_class'})

    def test_tag_selector_handle_starttag(self):
        current_tag = "div"
        current_attrs = [('class', 'test_class')]
        self.selector.handle_starttag(current_tag, current_attrs)

        # The selector must remember the tag it just opened and expose its
        # attributes in attr_dict form.
        self.assertEqual(self.selector.current_tag, current_tag)
        self.assertEqual(self.selector.current_attrs,
                         self.selector.attr_dict(current_attrs))

    def test_tag_selector_get_data(self):
        # Feeding the fixture should yield its text nodes, the
        # (text, href) link pairs and the page title.
        self.selector.feed(self.html)
        data = self.selector.get_data()
        self.assertEqual(
            data, {
                'content': ['title', 'div', 'link'],
                'links': [('link', 'http://www.google.com')],
                'title': 'title',
            })

    def test_html_scraper_get_url_content(self):
        # NOTE(review): presumably fetches self.url for real -- confirm
        # whether this test needs network access.
        self.assertNotEqual(self.scraper.get_url_content(), "")

    def test_html_scraper_get_content(self):
        # NOTE(review): the "Google" checks suggest the live page at
        # self.url is scraped -- confirm.
        content = self.scraper.get_content()
        self.assertNotEqual(content, {})
        # assertIn reports both operands on failure, whereas
        # assertTrue(x in y) only says "False is not true".
        self.assertIn("Google", content['content'])
        self.assertIn("Google", content['title'])
        self.assertEqual(self.url, content['url'])

    def tearDown(self):
        # Nothing to release; kept as an explicit hook.
        pass

    def get_links(self, url):
        """
        Use the scraper class to scrape the links for the given url.

        Returns an empty list when the scraper yields no content,
        otherwise the item at index 1 of every entry under ``'links'``.
        """
        scraper = HTMLScraper(url=url)
        content = scraper.get_content()

        if not content:
            return []

        return [link[1] for link in content['links']]
Example #4
0
 def setUp(self):
     """
     Prepare the fixtures shared by the tests: the target url, a sample
     HTML document, a TagSelector and an HTMLScraper bound to that url.
     """
     target = "http://www.google.com"
     self.url = target
     self.html = """
         <html>
         <head>
             <title>title</title>
         </head>
         <body>
             <div>div<a href="http://www.google.com">link</a></div>
         </body>
         </html>
     """
     # Selector is built before the scraper, as in the original fixture.
     self.selector = TagSelector()
     self.scraper = HTMLScraper(url=target)
Example #5
0
class ScraperTestCase(unittest.TestCase):
    """
    Exercises TagSelector parsing and HTMLScraper content retrieval.

    ``setUp`` provides a sample HTML document, a target url, a
    TagSelector and an HTMLScraper for that url to each test.
    """

    def setUp(self):
        """
        Build the sample document, url, selector and scraper.
        """
        self.html = """
            <html>
            <head>
                <title>title</title>
            </head>
            <body>
                <div>div<a href="http://www.google.com">link</a></div>
            </body>
            </html>
        """
        self.url = "http://www.google.com"
        self.selector = TagSelector()
        self.scraper = HTMLScraper(url=self.url)

    def test_attr_dict(self):
        # A list of (name, value) pairs becomes a dict; [] becomes {}.
        self.assertEqual(self.selector.attr_dict([]), {})
        self.assertEqual(
            self.selector.attr_dict([('class', 'test_class')]),
            {'class': 'test_class'},
        )

    def test_tag_selector_handle_starttag(self):
        current_tag = "div"
        current_attrs = [('class', 'test_class')]
        self.selector.handle_starttag(current_tag, current_attrs)

        # test the current attrs and tag has been set correctly
        self.assertEqual(self.selector.current_tag, current_tag)
        self.assertEqual(
            self.selector.current_attrs,
            self.selector.attr_dict(current_attrs),
        )

    def test_tag_selector_get_data(self):
        # The parsed fixture should expose its text nodes, its
        # (text, href) link pairs and its title.
        self.selector.feed(self.html)
        data = self.selector.get_data()
        expected = {
            'content': ['title', 'div', 'link'],
            'links': [('link', 'http://www.google.com')],
            'title': 'title',
        }
        self.assertEqual(data, expected)

    def test_html_scraper_get_url_content(self):
        # NOTE(review): presumably retrieves self.url for real -- confirm
        # whether this test requires network access.
        self.assertNotEqual(self.scraper.get_url_content(), "")

    def test_html_scraper_get_content(self):
        # NOTE(review): the "Google" expectations suggest the live page
        # at self.url is scraped -- confirm.
        content = self.scraper.get_content()
        self.assertNotEqual(content, {})
        # assertIn shows the member and container on failure, which
        # assertTrue(x in y) does not.
        self.assertIn("Google", content['content'])
        self.assertIn("Google", content['title'])
        self.assertEqual(self.url, content['url'])

    def tearDown(self):
        # No resources to clean up; kept as an explicit hook.
        pass
Example #6
0
 def setUp(self):
     """
     Set up the per-test state: a sample HTML page, the url under test,
     a TagSelector instance and an HTMLScraper created for that url.
     """
     page_url = "http://www.google.com"
     sample_html = """
         <html>
         <head>
             <title>title</title>
         </head>
         <body>
             <div>div<a href="http://www.google.com">link</a></div>
         </body>
         </html>
     """
     self.html = sample_html
     self.url = page_url
     # Keep selector construction ahead of the scraper, as before.
     self.selector = TagSelector()
     self.scraper = HTMLScraper(url=page_url)