Ejemplo n.º 1
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_retrieve_error(self):
        """find_images raises RetrieveError for not_exist_url and broken_url."""
        haul = Haul()

        # Attribute lookup stays inside the assertRaises context, one
        # fixture at a time, exactly as in the unrolled form.
        for fixture_name in ('not_exist_url', 'broken_url'):
            with self.assertRaises(exceptions.RetrieveError):
                haul.find_images(getattr(self, fixture_name))
Ejemplo n.º 2
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_find_html_fragment(self):
        """
        find_images on the fragmented_html fixture returns a HaulResult
        carrying exactly six image URLs.
        """
        result = Haul().find_images(self.fragmented_html)

        self.assertIsInstance(result, HaulResult)
        self.assertEqual(len(result.image_urls), 6)
Ejemplo n.º 3
0
    def test_find_html_fragment(self):
        """
        find_images on the fragmented_html fixture returns a HaulResult
        carrying exactly six image URLs.
        """
        result = Haul().find_images(self.fragmented_html)

        self.assertIsInstance(result, HaulResult)
        self.assertEqual(len(result.image_urls), 6)
Ejemplo n.º 4
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_tumblr_image_url(self):
        """
        find_images(tumblr_image_url, extend=True) returns a HaulResult with
        an 'image/' content type and exactly two image URLs.
        """
        result = Haul().find_images(self.tumblr_image_url, extend=True)

        self.assertIsInstance(result, HaulResult)
        self.assertIn('image/', result.content_type)
        self.assertEqual(len(result.image_urls), 2)
Ejemplo n.º 5
0
    def test_tumblr_image_url(self):
        """
        find_images(tumblr_image_url, extend=True) returns a HaulResult with
        an 'image/' content type and exactly two image URLs.
        """
        result = Haul().find_images(self.tumblr_image_url, extend=True)

        self.assertIsInstance(result, HaulResult)
        self.assertIn('image/', result.content_type)
        self.assertEqual(len(result.image_urls), 2)
Ejemplo n.º 6
0
    def test_find_html_document(self):
        """
        Run a finder pipeline that ends with a custom data-src finder against
        the complete_html fixture.

        Checks that the result is a HaulResult, that a known image URL is in
        finder_image_urls, and that image_urls has exactly seven entries.
        """
        from haul.compat import str

        def img_data_src_finder(pipeline_index,
                                soup,
                                finder_image_urls=None,
                                *args,
                                **kwargs):
            """
            Find image URL in <img>'s data-src attribute

            Returns a dict whose 'finder_image_urls' entry is the incoming
            URLs followed by every non-empty data-src value found in soup.
            """
            # None sentinel instead of a shared mutable [] default: each call
            # without an explicit list starts from a fresh empty one.
            if finder_image_urls is None:
                finder_image_urls = []

            data_src_values = (
                img.get('data-src') for img in soup.find_all('img')
            )
            now_finder_image_urls = [
                str(value) for value in data_src_values if value
            ]

            return {
                'finder_image_urls': finder_image_urls + now_finder_image_urls,
            }

        # Built-in finders are referenced by dotted path; the custom finder
        # is passed as a callable and runs last.
        FINDER_PIPELINE = (
            'haul.finders.pipeline.html.img_src_finder',
            'haul.finders.pipeline.html.a_href_finder',
            'haul.finders.pipeline.css.background_image_finder',
            img_data_src_finder,
        )

        h = Haul(finder_pipeline=FINDER_PIPELINE)
        hr = h.find_images(self.complete_html)

        self.assertIsInstance(hr, HaulResult)

        test_image_url = 'http://files.heelsfetishism.com/media/heels/2013/10/03/18099_307a62430fa045cc9b2124d16de63f33.jpg'
        self.assertIn(test_image_url, hr.finder_image_urls)

        self.assertEqual(len(hr.image_urls), 7)
Ejemplo n.º 7
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_find_html_document(self):
        """
        Run a finder pipeline that ends with a custom data-src finder against
        the complete_html fixture.

        Checks that the result is a HaulResult, that a known image URL is in
        finder_image_urls, and that image_urls has exactly seven entries.
        """
        from haul.compat import str

        def img_data_src_finder(pipeline_index,
                                soup,
                                finder_image_urls=None,
                                *args, **kwargs):
            """
            Find image URL in <img>'s data-src attribute

            Returns a dict whose 'finder_image_urls' entry is the incoming
            URLs followed by every non-empty data-src value found in soup.
            """
            # None sentinel instead of a shared mutable [] default: each call
            # without an explicit list starts from a fresh empty one.
            if finder_image_urls is None:
                finder_image_urls = []

            data_src_values = (
                img.get('data-src') for img in soup.find_all('img')
            )
            now_finder_image_urls = [
                str(value) for value in data_src_values if value
            ]

            return {
                'finder_image_urls': finder_image_urls + now_finder_image_urls,
            }

        # Built-in finders are referenced by dotted path; the custom finder
        # is passed as a callable and runs last.
        FINDER_PIPELINE = (
            'haul.finders.pipeline.html.img_src_finder',
            'haul.finders.pipeline.html.a_href_finder',
            'haul.finders.pipeline.css.background_image_finder',
            img_data_src_finder,
        )

        h = Haul(finder_pipeline=FINDER_PIPELINE)
        hr = h.find_images(self.complete_html)

        self.assertIsInstance(hr, HaulResult)

        test_image_url = 'http://files.heelsfetishism.com/media/heels/2013/10/03/18099_307a62430fa045cc9b2124d16de63f33.jpg'
        self.assertIn(test_image_url, hr.finder_image_urls)

        self.assertEqual(len(hr.image_urls), 7)
Ejemplo n.º 8
0
    def test_retrieve_error(self):
        """find_images raises RetrieveError for not_exist_url and broken_url."""
        haul = Haul()

        # Attribute lookup stays inside the assertRaises context, one
        # fixture at a time, exactly as in the unrolled form.
        for fixture_name in ('not_exist_url', 'broken_url'):
            with self.assertRaises(exceptions.RetrieveError):
                haul.find_images(getattr(self, fixture_name))
Ejemplo n.º 9
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_wordpress(self):
        """
        find_images(wordpress_html, extend=True) returns a HaulResult whose
        content_type contains 'text/html'.
        """
        result = Haul().find_images(self.wordpress_html, extend=True)

        self.assertIsInstance(result, HaulResult)
        self.assertIn('text/html', result.content_type)
Ejemplo n.º 10
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_find_image_url(self):
        """
        find_images(image_url) returns a HaulResult whose content_type
        contains 'image/'.
        """
        result = Haul().find_images(self.image_url)

        self.assertIsInstance(result, HaulResult)
        self.assertIn('image/', result.content_type)
Ejemplo n.º 11
0
    def test_find_image_url(self):
        """
        find_images(image_url) returns a HaulResult whose content_type
        contains 'image/'.
        """
        result = Haul().find_images(self.image_url)

        self.assertIsInstance(result, HaulResult)
        self.assertIn('image/', result.content_type)
Ejemplo n.º 12
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_is_found_true(self):
        """The result for the complete_html fixture reports is_found as truthy."""
        result = Haul().find_images(self.complete_html)

        self.assertTrue(result.is_found)
Ejemplo n.º 13
0
    def test_content_type_not_supported(self):
        """find_images(not_supported_url) raises ContentTypeNotSupported."""
        haul = Haul()
        expected_error = exceptions.ContentTypeNotSupported

        with self.assertRaises(expected_error):
            haul.find_images(self.not_supported_url)
Ejemplo n.º 14
0
    def test_invalid_parameter_error(self):
        """find_images(None) raises InvalidParameterError."""
        haul = Haul()
        expected_error = exceptions.InvalidParameterError

        with self.assertRaises(expected_error):
            haul.find_images(None)
Ejemplo n.º 15
0
    def test_wordpress(self):
        """
        find_images(wordpress_html, extend=True) returns a HaulResult whose
        content_type contains 'text/html'.
        """
        result = Haul().find_images(self.wordpress_html, extend=True)

        self.assertIsInstance(result, HaulResult)
        self.assertIn('text/html', result.content_type)
Ejemplo n.º 16
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_invalid_parameter_error(self):
        """find_images(None) raises InvalidParameterError."""
        haul = Haul()
        expected_error = exceptions.InvalidParameterError

        with self.assertRaises(expected_error):
            haul.find_images(None)
Ejemplo n.º 17
0
    def test_is_found_true(self):
        """The result for the complete_html fixture reports is_found as truthy."""
        result = Haul().find_images(self.complete_html)

        self.assertTrue(result.is_found)
Ejemplo n.º 18
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_content_type_not_supported(self):
        """find_images(not_supported_url) raises ContentTypeNotSupported."""
        haul = Haul()
        expected_error = exceptions.ContentTypeNotSupported

        with self.assertRaises(expected_error):
            haul.find_images(self.not_supported_url)
Ejemplo n.º 19
0
    def test_is_found_false(self):
        """The result for the no_image_html fixture reports is_found as falsy."""
        result = Haul().find_images(self.no_image_html)

        self.assertFalse(result.is_found)
Ejemplo n.º 20
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_is_found_false(self):
        """The result for the no_image_html fixture reports is_found as falsy."""
        result = Haul().find_images(self.no_image_html)

        self.assertFalse(result.is_found)
Ejemplo n.º 21
0
    def test_find_html_url(self):
        """
        find_images(webpage_url) returns a HaulResult whose content_type
        contains 'text/html'.
        """
        result = Haul().find_images(self.webpage_url)

        self.assertIsInstance(result, HaulResult)
        self.assertIn('text/html', result.content_type)
Ejemplo n.º 22
0
Archivo: test.py Proyecto: Dmdv/Haul
    def test_find_html_url(self):
        """
        find_images(webpage_url) returns a HaulResult whose content_type
        contains 'text/html'.
        """
        result = Haul().find_images(self.webpage_url)

        self.assertIsInstance(result, HaulResult)
        self.assertIn('text/html', result.content_type)
Ejemplo n.º 23
0
    for img in soup.find_all('img'):
        src = img.get('data-src', None)
        if src:
            src = str(src)
            now_finder_image_urls.append(src)

    output = {}
    output['finder_image_urls'] = finder_image_urls + now_finder_image_urls

    return output


# Finder stages handed to Haul: two finders referenced by dotted path,
# followed by the img_data_src_finder callable.
MY_FINDER_PIPELINE = (
    'haul.finders.pipeline.html.img_src_finder',
    'haul.finders.pipeline.css.background_image_finder',
    img_data_src_finder,
)

# Extender stages, all referenced by dotted path.
# NOTE(review): "PIEPLINE" is a misspelling of "PIPELINE"; the name is left
# as-is here because it is a module-level identifier.
GOOGLE_SITES_EXTENDER_PIEPLINE = (
    'haul.extenders.pipeline.google.blogspot_s1600_extender',
    'haul.extenders.pipeline.google.ggpht_s1600_extender',
    'haul.extenders.pipeline.google.googleusercontent_s1600_extender',
)

# Pixabay photo search for "computer"; the remaining query parameters are
# present but empty.
url = 'https://pixabay.com/en/photos/?q=computer&image_type=&cat=&min_width=&min_height='
h = Haul(parser='lxml',
         finder_pipeline=MY_FINDER_PIPELINE,
         extender_pipeline=GOOGLE_SITES_EXTENDER_PIEPLINE)
# NOTE(review): extend=True presumably turns on the extender pipeline
# configured above -- confirm against the Haul documentation.
result = h.find_images(url, extend=True)
print(result)