def get_best_image(self, doc, top_node):
    """Return the best candidate Image for the article.

    Tries each extraction strategy in decreasing order of confidence:
    known page elements, large images under *top_node*, then meta tags.
    Falls back to an empty Image() when every strategy comes up empty.
    """
    # Strategies are callables so that each one runs only if the
    # previous ones failed (check_large_images is expensive).
    strategies = (
        self.check_known_elements,
        lambda: self.check_large_images(top_node, 0, 0),
        self.check_meta_tag,
    )
    for strategy in strategies:
        found = strategy()
        if found:
            return found
    # Nothing matched: signal "no image" with an empty instance.
    return Image()
def get_image(self, src, score=100, extraction_type="N/A"):
    """Build an Image object for *src*, annotated with a confidence
    score and the extraction strategy that produced it.

    When a locally cached copy of the file exists, its raw bytes and
    pixel dimensions are copied onto the returned object as well.
    """
    img = Image()
    img._src = self.build_image_path(src)
    img._extraction_type = extraction_type
    img._confidence_score = score

    # Enrich with byte/size info if we already hold the file locally.
    cached = self.get_local_image(img.src)
    if cached:
        img._bytes = cached.bytes
        img._height = cached.height
        img._width = cached.width

    return img
def assert_top_image(self, fields, expected_value, result_image):
    """Assert that *result_image* matches the image described by the
    *expected_value* attribute dict.

    NOTE(review): the *fields* parameter is shadowed below before it is
    ever read — it is kept only for signature compatibility.
    """
    # The extractor must hand back a Goose Image instance.
    self.assertTrue(
        isinstance(result_image, Image),
        msg="Result value is not a Goose Image instance")

    # Materialise the expectation as an Image with '_'-prefixed attrs.
    expected_image = Image()
    for key, value in list(expected_value.items()):
        setattr(expected_image, '_{}'.format(key), value)
    self.assertTrue(
        isinstance(expected_image, Image),
        msg="Expected value is not a Goose Image instance")

    # The result's src must occur within the expected src.
    self.assertIn(
        result_image.src, expected_image.src,
        msg="Returned Image is not the one expected")

    # Every attribute set on the expectation must match the result.
    fields = vars(expected_image)
    for key in list(fields):
        self.assertEqual(
            getattr(expected_image, key),
            getattr(result_image, key),
            msg="Returned Image attribute '%s' is not the one expected" % key)
def check_large_images(self, node, parent_depth_level, sibling_depth_level):
    """Find the best image by inspecting the actual candidate files.

    Although slow, downloading the images and checking their real
    dimensions is the most reliable approach, so we go in phases:

    1. collect ALL images under *node*
    2. drop known-bad names (gifs, ads, etc.)
    3. HEAD-request each file to check bare requirements
    4. GET the survivors to disk and measure their dimensions
    5. score on factors such as height/width (and possibly
       things like colour density)

    Recurses to a parent/sibling node when the current node yields
    nothing; returns None once the search is exhausted.
    """
    candidates = self.get_image_candidates(node)
    if candidates:
        scored = self.fetch_images(candidates, parent_depth_level)
        if scored:
            # Highest score wins; ties resolve to the first maximum,
            # matching a stable descending sort.
            best = max(scored.items(), key=lambda pair: pair[1])[0]
            winner = Image()
            winner._src = best.src
            winner._width = best.width
            winner._height = best.height
            winner._extraction_type = "bigimage"
            n_scored = len(scored)
            # NOTE: the > 0 guard is defensive; scored is non-empty here.
            winner._confidence_score = 100 / n_scored if n_scored > 0 else 0
            return winner

    # Nothing usable here: step to the next node in the tree and retry.
    depth_obj = self.get_depth_level(
        node, parent_depth_level, sibling_depth_level)
    if depth_obj:
        return self.check_large_images(
            depth_obj.node, depth_obj.parent_depth, depth_obj.sibling_depth)
    return None
def get_best_image(self, doc, top_node):
    """Return the best Image for the article.

    Tries each extraction strategy in decreasing order of confidence,
    then — as a last resort — scans the raw document for any <img>
    with a plausible src before giving up with an empty Image().
    """
    # Remember which page we are extracting from.
    self.target_url = self.article.final_url

    # Strategies are callables so each one runs only if the previous
    # ones failed (check_large_images is expensive).
    strategies = (
        self.check_known_elements,
        lambda: self.check_large_images(top_node, 0, 0),
        self.check_meta_tag,
    )
    for strategy in strategies:
        found = strategy()
        if found:
            return found

    # Since nothing worked, accept any img whose src is non-empty,
    # does not end in "/", and is not a known-bad image name.
    node = self._check_elements(self.article.raw_doc)
    if node is not None:
        src = self.parser.getAttribute(node, attr='src')
        if src and src[-1] != "/" and not self.badimages_names_re.search(src):
            return self.get_image(src, score=80, extraction_type='any')
    return Image()