def check_large_images(self, node, parent_depth_level, sibling_depth_level):
    """Return the best-scoring image found for ``node``, or ``None``.

    Although slow, the most reliable way to pick the best image is to
    download the candidates and check their real dimensions once on disk.
    The original notes describe a phased pipeline, carried out by the
    helpers called here rather than by this method itself:

    1. get a list of ALL images from the parent node
    2. filter out bad image names that we know of (gifs, ads, etc.)
    3. do a HEAD request on each file to make sure it meets the bare
       requirements
    4. do a full GET on whatever is left, download it to disk and check
       its dimensions
    5. score images on factors like height/width and possibly things
       like color density

    Args:
        node: Node passed to ``get_image_candidates`` and, when nothing
            is found here, to ``get_depth_level``.
        parent_depth_level: Forwarded to ``fetch_images`` and
            ``get_depth_level``.
        sibling_depth_level: Forwarded to ``get_depth_level``.

    Returns:
        An ``Image`` with ``extraction_type`` set to ``"bigimage"`` built
        from the highest-scoring candidate; otherwise the result of
        retrying on the node returned by ``get_depth_level``; ``None``
        when there is no further node to try.
    """
    good_images = self.get_image_candidates(node)
    if good_images:
        scored_images = self.fetch_images(good_images, parent_depth_level)
        if scored_images:
            # max() yields the first entry holding the top score -- the
            # same winner a stable descending sort puts first -- without
            # ordering every candidate.
            highscore_image, _ = max(
                scored_images.items(), key=lambda item: item[1]
            )
            main_image = Image()
            main_image.src = highscore_image.src
            main_image.width = highscore_image.width
            main_image.height = highscore_image.height
            main_image.extraction_type = "bigimage"
            # scored_images is known to be non-empty in this branch, so
            # the division needs no zero guard.
            main_image.confidence_score = 100 / len(scored_images)
            return main_image

    # Nothing usable at this node: ask for the next node to inspect and
    # retry there with the depth counters it reports.
    depth_obj = self.get_depth_level(
        node, parent_depth_level, sibling_depth_level
    )
    if depth_obj:
        return self.check_large_images(
            depth_obj.node, depth_obj.parent_depth, depth_obj.sibling_depth
        )
    return None
def get_best_image(self, doc, topNode):
    """Return the first image produced by the extraction strategies.

    The strategies are tried in a fixed order and the first truthy
    result wins; later strategies are not invoked once one succeeds:

    1. ``check_known_elements()``
    2. ``check_large_images(topNode, 0, 0)``
    3. ``check_meta_tag()``

    Args:
        doc: Accepted for interface compatibility; not used here.
        topNode: Node handed to ``check_large_images`` with both depth
            counters starting at 0.

    Returns:
        The first truthy strategy result, or a fresh empty ``Image``
        when every strategy comes back empty.
    """
    # ``or`` short-circuits, so each strategy runs only if all the
    # previous ones returned something falsy.
    return (
        self.check_known_elements()
        or self.check_large_images(topNode, 0, 0)
        or self.check_meta_tag()
        or Image()
    )
def get_image(self, element, src, score=100, extraction_type="N/A"):
    """Build an ``Image`` for ``src`` and enrich it when possible.

    Args:
        element: Accepted for interface compatibility; not used here.
        src: Raw image source, resolved through ``build_image_path``.
        score: Value stored as the image's ``confidence_score``.
        extraction_type: Label stored as the image's ``extraction_type``.

    Returns:
        The populated ``Image``. ``bytes``, ``height`` and ``width`` are
        copied over only when ``get_local_image`` returns something
        truthy for the resolved source.
    """
    result = Image()
    result.src = self.build_image_path(src)
    result.extraction_type = extraction_type
    result.confidence_score = score

    # Look up a local counterpart of the image; without one there is
    # nothing more to add.
    local_copy = self.get_local_image(result.src)
    if not local_copy:
        return result

    # Carry the extra details across, one attribute at a time.
    for attr in ("bytes", "height", "width"):
        setattr(result, attr, getattr(local_copy, attr))
    return result
def check_large_images(self, node, parent_depth_level, sibling_depth_level):
    """Return the best-scoring image found for ``node``, or ``None``.

    Although slow, the most reliable way to pick the best image is to
    download the candidates and check their real dimensions once on disk.
    The original notes describe a phased pipeline, carried out by the
    helpers called here rather than by this method itself:

    1. get a list of ALL images from the parent node
    2. filter out bad image names that we know of (gifs, ads, etc.)
    3. do a HEAD request on each file to make sure it meets the bare
       requirements
    4. do a full GET on whatever is left, download it to disk and check
       its dimensions
    5. score images on factors like height/width and possibly things
       like color density

    Args:
        node: Node passed to ``get_image_candidates`` and, when nothing
            is found here, to ``get_depth_level``.
        parent_depth_level: Forwarded to ``fetch_images`` and
            ``get_depth_level``.
        sibling_depth_level: Forwarded to ``get_depth_level``.

    Returns:
        An ``Image`` with ``extraction_type`` set to ``"bigimage"`` built
        from the highest-scoring candidate; otherwise the result of
        retrying on the node returned by ``get_depth_level``; ``None``
        when there is no further node to try.
    """
    good_images = self.get_image_candidates(node)
    if good_images:
        scored_images = self.fetch_images(good_images, parent_depth_level)
        if scored_images:
            # max() yields the first entry holding the top score -- the
            # same winner a stable descending sort puts first -- without
            # ordering every candidate.
            highscore_image, _ = max(
                scored_images.items(), key=lambda item: item[1]
            )
            main_image = Image()
            main_image.src = highscore_image.src
            main_image.width = highscore_image.width
            main_image.height = highscore_image.height
            main_image.extraction_type = "bigimage"
            # scored_images is known to be non-empty in this branch, so
            # the division needs no zero guard.
            main_image.confidence_score = 100 / len(scored_images)
            return main_image

    # Nothing usable at this node: ask for the next node to inspect and
    # retry there with the depth counters it reports.
    depth_obj = self.get_depth_level(
        node, parent_depth_level, sibling_depth_level
    )
    if depth_obj:
        return self.check_large_images(
            depth_obj.node, depth_obj.parent_depth, depth_obj.sibling_depth
        )
    return None
def getExpectedImage(self, expected_value):
    """Create an ``Image`` whose attributes mirror ``expected_value``.

    Args:
        expected_value: Mapping of attribute name to value; every pair
            is applied to the new image with ``setattr``.

    Returns:
        The new ``Image`` carrying one attribute per mapping entry.
    """
    expected_image = Image()
    attribute_pairs = expected_value.items()
    for attr_name, attr_value in attribute_pairs:
        setattr(expected_image, attr_name, attr_value)
    return expected_image