def test_retrieve_error(self):
    """Unreachable and broken URLs both raise RetrieveError."""
    haul = Haul()
    # Same two checks as before, expressed as one loop over the bad URLs.
    for bad_url in (self.not_exist_url, self.broken_url):
        with self.assertRaises(exceptions.RetrieveError):
            haul.find_images(bad_url)
def test_find_html_fragment(self):
    """Parsing an HTML fragment yields a HaulResult with six image URLs."""
    result = Haul().find_images(self.fragmented_html)
    self.assertIsInstance(result, HaulResult)
    self.assertEqual(len(result.image_urls), 6)
def test_tumblr_image_url(self):
    """A Tumblr image URL with extend=True reports an image content type
    and two image URLs."""
    result = Haul().find_images(self.tumblr_image_url, extend=True)
    self.assertIsInstance(result, HaulResult)
    self.assertIn('image/', result.content_type)
    self.assertEqual(len(result.image_urls), 2)
def test_find_html_document(self):
    """A custom finder appended to the pipeline contributes its URLs.

    Builds a finder pipeline that adds an <img data-src=...> finder on
    top of the stock finders, runs it over the complete HTML fixture,
    and checks both the custom finder's output and the total URL count.

    NOTE(review): a method with this exact name appears again later in
    the class, so only the last definition is collected by the test
    runner -- confirm and rename one of them.
    """
    from haul.compat import str

    def img_data_src_finder(pipeline_index, soup, finder_image_urls=None,
                            *args, **kwargs):
        """Find image URLs in <img> elements' data-src attributes."""
        # Default to None instead of a mutable [] default, which would be
        # shared across calls; copy the input so the caller's list is
        # never mutated.
        found = [] if finder_image_urls is None else list(finder_image_urls)
        for img in soup.find_all('img'):
            src = img.get('data-src', None)
            if src:
                found.append(str(src))
        return {'finder_image_urls': found}

    FINDER_PIPELINE = (
        'haul.finders.pipeline.html.img_src_finder',
        'haul.finders.pipeline.html.a_href_finder',
        'haul.finders.pipeline.css.background_image_finder',
        img_data_src_finder,
    )

    h = Haul(finder_pipeline=FINDER_PIPELINE)
    hr = h.find_images(self.complete_html)
    self.assertIsInstance(hr, HaulResult)

    # This URL is only discoverable via the data-src finder above.
    test_image_url = 'http://files.heelsfetishism.com/media/heels/2013/10/03/18099_307a62430fa045cc9b2124d16de63f33.jpg'
    self.assertIn(test_image_url, hr.finder_image_urls)
    self.assertEqual(len(hr.image_urls), 7)
def test_find_html_document(self):
    """A custom finder appended to the pipeline contributes its URLs.

    NOTE(review): this is a byte-for-byte duplicate of an earlier method
    with the same name; the earlier definition is silently shadowed and
    never runs -- confirm and rename or delete one of the two.
    """
    from haul.compat import str

    def img_data_src_finder(pipeline_index, soup, finder_image_urls=None,
                            *args, **kwargs):
        """Find image URLs in <img> elements' data-src attributes."""
        # Default to None instead of a mutable [] default, which would be
        # shared across calls; copy the input so the caller's list is
        # never mutated.
        found = [] if finder_image_urls is None else list(finder_image_urls)
        for img in soup.find_all('img'):
            src = img.get('data-src', None)
            if src:
                found.append(str(src))
        return {'finder_image_urls': found}

    FINDER_PIPELINE = (
        'haul.finders.pipeline.html.img_src_finder',
        'haul.finders.pipeline.html.a_href_finder',
        'haul.finders.pipeline.css.background_image_finder',
        img_data_src_finder,
    )

    h = Haul(finder_pipeline=FINDER_PIPELINE)
    hr = h.find_images(self.complete_html)
    self.assertIsInstance(hr, HaulResult)

    # This URL is only discoverable via the data-src finder above.
    test_image_url = 'http://files.heelsfetishism.com/media/heels/2013/10/03/18099_307a62430fa045cc9b2124d16de63f33.jpg'
    self.assertIn(test_image_url, hr.finder_image_urls)
    self.assertEqual(len(hr.image_urls), 7)
def test_wordpress(self):
    """WordPress HTML with extend=True produces an HTML HaulResult."""
    result = Haul().find_images(self.wordpress_html, extend=True)
    self.assertIsInstance(result, HaulResult)
    self.assertIn('text/html', result.content_type)
def test_find_image_url(self):
    """A direct image URL produces a HaulResult with an image content type."""
    result = Haul().find_images(self.image_url)
    self.assertIsInstance(result, HaulResult)
    self.assertIn('image/', result.content_type)
def test_is_found_true(self):
    """is_found is True for HTML that contains images."""
    result = Haul().find_images(self.complete_html)
    self.assertTrue(result.is_found)
def test_content_type_not_supported(self):
    """An unsupported content type raises ContentTypeNotSupported."""
    haul = Haul()
    with self.assertRaises(exceptions.ContentTypeNotSupported):
        haul.find_images(self.not_supported_url)
def test_invalid_parameter_error(self):
    """Passing None as the URL/HTML argument raises InvalidParameterError."""
    haul = Haul()
    with self.assertRaises(exceptions.InvalidParameterError):
        haul.find_images(None)
def test_is_found_false(self):
    """is_found is False for HTML that contains no images."""
    result = Haul().find_images(self.no_image_html)
    self.assertFalse(result.is_found)
def test_find_html_url(self):
    """Fetching a web page URL produces an HTML HaulResult."""
    result = Haul().find_images(self.webpage_url)
    self.assertIsInstance(result, HaulResult)
    self.assertIn('text/html', result.content_type)
for img in soup.find_all('img'): src = img.get('data-src', None) if src: src = str(src) now_finder_image_urls.append(src) output = {} output['finder_image_urls'] = finder_image_urls + now_finder_image_urls return output MY_FINDER_PIPELINE = ( 'haul.finders.pipeline.html.img_src_finder', 'haul.finders.pipeline.css.background_image_finder', img_data_src_finder, ) GOOGLE_SITES_EXTENDER_PIEPLINE = ( 'haul.extenders.pipeline.google.blogspot_s1600_extender', 'haul.extenders.pipeline.google.ggpht_s1600_extender', 'haul.extenders.pipeline.google.googleusercontent_s1600_extender', ) url = 'https://pixabay.com/en/photos/?q=computer&image_type=&cat=&min_width=&min_height=' h = Haul(parser='lxml', finder_pipeline=MY_FINDER_PIPELINE, extender_pipeline=GOOGLE_SITES_EXTENDER_PIEPLINE) result = h.find_images(url, extend=True) print(result)