Example #1
0
    def get_best_image(self, doc, top_node):
        """Return the first image produced by the extraction strategies.

        The strategies are tried in a fixed order: known elements, large
        images searched from ``top_node``, then the meta tag. The first
        truthy result is returned as-is; later strategies are not run.
        When none of them yields anything, an empty ``Image()`` is returned.

        ``doc`` is accepted but not used by this method.
        """
        # ``or`` short-circuits, so each strategy only runs if every
        # earlier one came back empty.
        return (
            self.check_known_elements()
            or self.check_large_images(top_node, 0, 0)
            or self.check_meta_tag()
            or Image()
        )
Example #2
0
    def get_image(self, src, score=100, extraction_type="N/A"):
        """Build an ``Image`` for ``src`` and enrich it when a local copy exists.

        ``src`` is passed through ``self.build_image_path`` before being
        stored. ``score`` becomes the confidence score and
        ``extraction_type`` is recorded unchanged. If
        ``self.get_local_image`` returns something truthy for the image's
        ``src``, its ``bytes``, ``height`` and ``width`` are copied onto
        the result.
        """
        result = Image()
        result._src = self.build_image_path(src)
        result._extraction_type = extraction_type
        result._confidence_score = score

        # the lookup uses the public ``src`` attribute of the new Image,
        # not the raw ``src`` argument
        local_copy = self.get_local_image(result.src)
        if not local_copy:
            # no local copy: size information stays at Image defaults
            return result

        result._bytes = local_copy.bytes
        result._height = local_copy.height
        result._width = local_copy.width
        return result
Example #3
0
    def assert_top_image(self, fields, expected_value, result_image):
        """Assert that ``result_image`` matches the image described by ``expected_value``.

        ``expected_value`` is a mapping of attribute names (without the
        leading underscore) to values; it is turned into an ``Image`` whose
        ``_<name>`` attributes carry those values. Every instance attribute
        of that expected image must then equal the same attribute on
        ``result_image``.

        The incoming ``fields`` argument is not consulted: the attributes
        compared are always those of the expected image.
        """
        # the result must be a Goose Image instance
        self.assertTrue(isinstance(result_image, Image),
                        msg="Result value is not a Goose Image instance")

        # build the expected image from the plain mapping
        expected_image = Image()
        for name, value in list(expected_value.items()):
            setattr(expected_image, '_{}'.format(name), value)
        self.assertTrue(isinstance(expected_image, Image),
                        msg="Expected value is not a Goose Image instance")

        # the returned src has to be contained in the expected src
        self.assertIn(result_image.src, expected_image.src,
                      msg="Returned Image is not the one expected")

        # attribute-by-attribute comparison over a snapshot of the
        # expected image's instance attributes
        for attr in list(vars(expected_image)):
            failure = "Returned Image attribute '%s' is not the one expected" % attr
            wanted = getattr(expected_image, attr)
            actual = getattr(result_image, attr)
            self.assertEqual(wanted, actual, msg=failure)
Example #4
0
    def check_large_images(self, node, parent_depth_level,
                           sibling_depth_level):
        """Return the highest-scoring image found from ``node``, or None.

        Steps, as performed here:
        1. collect candidates with ``self.get_image_candidates(node)``
        2. score them with ``self.fetch_images(candidates,
           parent_depth_level)``, which yields a mapping of image -> score
        3. if anything was scored, wrap the best-scoring entry in an
           ``Image`` tagged "bigimage"; its confidence score is 100
           divided by the number of scored images
        4. otherwise ask ``self.get_depth_level`` for the next node to
           inspect and repeat the search there; give up with None when
           there is no next node
        """
        candidates = self.get_image_candidates(node)
        scores = None
        if candidates:
            scores = self.fetch_images(candidates, parent_depth_level)

        if scores:
            # best score first; the sort is stable, so ties keep the
            # mapping's own order
            ranking = sorted(scores.items(),
                             key=lambda entry: entry[1],
                             reverse=True)
            winner = ranking[0][0]

            best = Image()
            best._src = winner.src
            best._width = winner.width
            best._height = winner.height
            best._extraction_type = "bigimage"
            total = len(scores)
            if total > 0:
                best._confidence_score = 100 / total
            else:
                best._confidence_score = 0
            return best

        # nothing usable at this node: move on to wherever
        # get_depth_level points next, if anywhere
        next_level = self.get_depth_level(node, parent_depth_level,
                                          sibling_depth_level)
        if not next_level:
            return None
        return self.check_large_images(next_level.node,
                                       next_level.parent_depth,
                                       next_level.sibling_depth)
Example #5
0
    def get_best_image(self, doc, top_node):
        """Return the best image for the article, or an empty ``Image()``.

        Records the article's final URL on ``self.target_url``, then tries
        the known-elements, large-image and meta-tag strategies in that
        order, returning the first truthy result. As a last resort it takes
        whatever element ``self._check_elements`` finds in the raw document
        and uses its ``src`` attribute, provided the value is non-empty,
        does not end in "/" and is not matched by
        ``self.badimages_names_re``.

        ``doc`` is accepted but not used by this method.
        """
        # the webpage url that we're extracting content from
        self.target_url = self.article.final_url

        # ``or`` short-circuits: later strategies run only if the
        # earlier ones produced nothing
        found = (
            self.check_known_elements()
            or self.check_large_images(top_node, 0, 0)
            or self.check_meta_tag()
        )
        if found:
            return found

        # Since nothing worked, pick any img.
        fallback = self._check_elements(self.article.raw_doc)
        if fallback is None:
            return Image()

        src = self.parser.getAttribute(fallback, attr='src')
        if not src or src[-1] == "/" or self.badimages_names_re.search(src):
            return Image()
        return self.get_image(src, score=80, extraction_type='any')