Beispiel #1
0
    def check_large_images(self, node, parent_depth_level, sibling_depth_level):
        """\
        although slow the best way to determine the best image is to download
        them and check the actual dimensions of the image when on disk
        so we'll go through a phased approach...
        1. get a list of ALL images from the parent node
        2. filter out any bad image names that we know of (gifs, ads, etc..)
        3. do a head request on each file to make sure it meets
           our bare requirements
        4. any images left over let's do a full GET request,
           download em to disk and check their dimensions
        5. Score images based on different factors like height/width
           and possibly things like color density
        """
        good_images = self.get_image_candidates(node)

        if good_images:
            scored_images = self.fetch_images(good_images, parent_depth_level)
            if scored_images:
                highscore_image = sorted(scored_images.items(), key=lambda x: x[1], reverse=True)[0][0]
                main_image = Image()
                main_image.src = highscore_image.src
                main_image.width = highscore_image.width
                main_image.height = highscore_image.height
                main_image.extraction_type = "bigimage"
                main_image.confidence_score = 100 / len(scored_images) if len(scored_images) > 0 else 0
                return main_image

        depth_obj = self.get_depth_level(node, parent_depth_level, sibling_depth_level)
        if depth_obj:
            return self.check_large_images(depth_obj.node, depth_obj.parent_depth, depth_obj.sibling_depth)

        return None
Beispiel #2
0
    def get_best_image(self, doc, topNode):
        image = self.check_known_elements()
        if image:
            return image

        image = self.check_large_images(topNode, 0, 0)
        if image:
            return image

        image = self.check_meta_tag()
        if image:
            return image
        return Image()
Beispiel #3
0
    def get_image(self, element, src, score=100, extraction_type="N/A"):
        # build the Image object
        image = Image()
        image.src = self.build_image_path(src)
        image.extraction_type = extraction_type
        image.confidence_score = score

        # check if we have a local image
        # in order to add more information
        # on the Image object
        local_image = self.get_local_image(image.src)
        if local_image:
            image.bytes = local_image.bytes
            image.height = local_image.height
            image.width = local_image.width

        # return the image
        return image
Beispiel #4
0
    def get_image(self, element, src, score=100, extraction_type="N/A"):
        # build the Image object
        image = Image()
        image.src = self.build_image_path(src)
        image.extraction_type = extraction_type
        image.confidence_score = score

        # check if we have a local image
        # in order to add more information
        # on the Image object
        local_image = self.get_local_image(image.src)
        if local_image:
            image.bytes = local_image.bytes
            image.height = local_image.height
            image.width = local_image.width

        # return the image
        return image
Beispiel #5
0
    def check_large_images(self, node, parent_depth_level,
                           sibling_depth_level):
        """\
        although slow the best way to determine the best image is to download
        them and check the actual dimensions of the image when on disk
        so we'll go through a phased approach...
        1. get a list of ALL images from the parent node
        2. filter out any bad image names that we know of (gifs, ads, etc..)
        3. do a head request on each file to make sure it meets
           our bare requirements
        4. any images left over let's do a full GET request,
           download em to disk and check their dimensions
        5. Score images based on different factors like height/width
           and possibly things like color density
        """
        good_images = self.get_image_candidates(node)

        if good_images:
            scored_images = self.fetch_images(good_images, parent_depth_level)
            if scored_images:
                highscore_image = sorted(scored_images.items(),
                                         key=lambda x: x[1],
                                         reverse=True)[0][0]
                main_image = Image()
                main_image.src = highscore_image.src
                main_image.width = highscore_image.width
                main_image.height = highscore_image.height
                main_image.extraction_type = "bigimage"
                main_image.confidence_score = 100 / len(scored_images) \
                                    if len(scored_images) > 0 else 0
                return main_image

        depth_obj = self.get_depth_level(node, parent_depth_level,
                                         sibling_depth_level)
        if depth_obj:
            return self.check_large_images(depth_obj.node,
                                           depth_obj.parent_depth,
                                           depth_obj.sibling_depth)

        return None
Beispiel #6
0
 def getExpectedImage(self, expected_value):
     image = Image()
     for k, v in expected_value.items():
         setattr(image, k, v)
     return image