Python HTMLScraper Examples

Programming Language: Python

Namespace/Package Name: searchengine.scraper

Class/Type: HTMLScraper

Examples at hotexamples.com: 6

Python HTMLScraper - 6 examples found. These are the top-rated real-world Python examples of searchengine.scraper.HTMLScraper, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

HTMLScraper(2)

get_content(2)

get_url_content(1)

Example #1

Show file

File: spider.py Project: MrskyBoatin/python-searchengine

 def get_links(self, url):
     """
     Scrape ``url`` with HTMLScraper and return its links.

     Returns an empty list when the scraper yields no content; otherwise
     returns element 1 of every entry in the content's 'links' list
     (presumably the href of a (text, href) pair -- confirm against
     HTMLScraper).
     """
     page = HTMLScraper(url=url).get_content()

     # Guard clause: nothing scraped means nothing to follow.
     if not page:
         return []

     return [entry[1] for entry in page['links']]

Example #2

Show file

File: tests.py Project: tanmoydeb07/python-searchengine

class ScraperTestCase(unittest.TestCase):
    def setUp(self):
        self.html = """
            <html>
            <head>
                <title>title</title>
            </head>
            <body>
                <div>div<a href="http://www.google.com">link</a></div>
            </body>
            </html>
        """
        self.url = "http://www.google.com"
        self.selector = TagSelector()
        self.scraper = HTMLScraper(url=self.url)

    def test_attr_dict(self):
        self.assertEqual(self.selector.attr_dict([]), {})
        self.assertEqual(self.selector.attr_dict([('class', 'test_class')]),
                         {'class': 'test_class'})

    def test_tag_selector_handle_starttag(self):
        current_tag = "div"
        current_attrs = [('class', 'test_class')]
        self.selector.handle_starttag(current_tag, current_attrs)

        # test the current attrs and tag has been set correctly
        self.assertEqual(self.selector.current_tag, current_tag)
        self.assertEqual(self.selector.current_attrs,
                         self.selector.attr_dict(current_attrs))

    def test_tag_selector_get_data(self):
        self.selector.feed(self.html)
        data = self.selector.get_data()
        self.assertEqual(
            data, {
                'content': ['title', 'div', 'link'],
                'links': [('link', 'http://www.google.com')],
                'title': 'title'
            })

    def test_html_scraper_get_url_content(self):
        self.assertNotEqual(self.scraper.get_url_content(), "")

    def test_html_scraper_get_content(self):
        content = self.scraper.get_content()
        self.assertNotEqual(content, {})
        self.assertTrue("Google" in content['content'])
        self.assertTrue("Google" in content['title'])
        self.assertEqual(self.url, content['url'])

    def tearDown(self):
        pass

Example #3

Show file

File: spider.py Project: tanmoydeb07/python-searchengine

    def get_links(self, url):
        """
        Collect the links found on the page at ``url``.

        The page is scraped through HTMLScraper. A falsy scrape result
        gives an empty list; otherwise element 1 of each item under the
        'links' key is returned (presumably the link target -- confirm
        against HTMLScraper).
        """
        scraped = HTMLScraper(url=url).get_content()
        return [item[1] for item in scraped['links']] if scraped else []

Example #4

Show file

File: tests.py Project: tanmoydeb07/python-searchengine

 def setUp(self):
     """Prepare the fixtures: sample HTML, URL, selector and scraper."""
     target_url = "http://www.google.com"

     # Minimal document with one title, one div and one link.
     self.html = """
         <html>
         <head>
             <title>title</title>
         </head>
         <body>
             <div>div<a href="http://www.google.com">link</a></div>
         </body>
         </html>
     """
     self.url = target_url
     self.selector = TagSelector()
     self.scraper = HTMLScraper(url=target_url)

Example #5

Show file

File: tests.py Project: MrskyBoatin/python-searchengine

class ScraperTestCase(unittest.TestCase):
    def setUp(self):
        self.html = """
            <html>
            <head>
                <title>title</title>
            </head>
            <body>
                <div>div<a href="http://www.google.com">link</a></div>
            </body>
            </html>
        """
        self.url = "http://www.google.com"
        self.selector = TagSelector()
        self.scraper = HTMLScraper(url=self.url)
        
    def test_attr_dict(self):
        self.assertEqual(self.selector.attr_dict([]), {})
        self.assertEqual(self.selector.attr_dict([('class', 'test_class')]), {'class': 'test_class'})
    
    def test_tag_selector_handle_starttag(self):
        current_tag = "div"
        current_attrs = [('class', 'test_class')]
        self.selector.handle_starttag(current_tag, current_attrs)
        
        # test the current attrs and tag has been set correctly
        self.assertEqual(self.selector.current_tag, current_tag)
        self.assertEqual(self.selector.current_attrs, self.selector.attr_dict(current_attrs))
        
    def test_tag_selector_get_data(self):
        self.selector.feed(self.html)
        data = self.selector.get_data()
        self.assertEqual(data, {'content': ['title', 'div', 'link'], 'links': [('link', 'http://www.google.com')], 'title': 'title'})
    
    def test_html_scraper_get_url_content(self):
        self.assertNotEqual(self.scraper.get_url_content(), "")
        
    def test_html_scraper_get_content(self):
        content = self.scraper.get_content()
        self.assertNotEqual(content, {})
        self.assertTrue("Google" in content['content'])
        self.assertTrue("Google" in content['title'])
        self.assertEqual(self.url, content['url'])
    
    def tearDown(self):
        pass

Example #6

Show file

File: tests.py Project: MrskyBoatin/python-searchengine

 def setUp(self):
     """Set up the HTML sample, the URL, a TagSelector and an HTMLScraper."""
     # Small page holding a title, a div and a single anchor.
     self.html = """
         <html>
         <head>
             <title>title</title>
         </head>
         <body>
             <div>div<a href="http://www.google.com">link</a></div>
         </body>
         </html>
     """
     self.url = address = "http://www.google.com"
     self.selector = TagSelector()
     self.scraper = HTMLScraper(url=address)