Ejemplo n.º 1
0
    def check_large_images(self, node, parent_depth_level, sibling_depth_level):
        """\
        Return the highest scoring image found at or around ``node``.

        although slow the best way to determine the best image is to download
        them and check the actual dimensions of the image when on disk
        so we'll go through a phased approach...
        1. get a list of ALL images from the parent node
        2. filter out any bad image names that we know of (gifs, ads, etc..)
        3. do a head request on each file to make sure it meets
           our bare requirements
        4. any images left over let's do a full GET request,
           download em to disk and check their dimensions
        5. Score images based on different factors like height/width
           and possibly things like color density

        The candidate lookup and the scoring are delegated to
        ``get_image_candidates`` and ``fetch_images``. When the current node
        yields no scored image, the search moves on to whatever node
        ``get_depth_level`` proposes and repeats from there.

        Returns an ``Image`` with ``extraction_type`` "bigimage" and a
        confidence of 100 divided by the number of scored images, or
        ``None`` once ``get_depth_level`` has nothing more to offer.
        """
        good_images = self.get_image_candidates(node)

        if good_images:
            scored_images = self.fetch_images(good_images, parent_depth_level)
            if scored_images:
                # max() yields the first entry carrying the highest score,
                # which is the entry a stable descending sort puts at index 0,
                # without copying and sorting the whole mapping.
                highscore_image = max(scored_images.items(),
                                      key=lambda item: item[1])[0]
                main_image = Image()
                main_image.src = highscore_image.src
                main_image.extraction_type = "bigimage"
                # scored_images is non-empty in this branch, so the
                # division cannot hit zero.
                main_image.confidence_score = 100 / len(scored_images)
                return main_image

        depth_obj = self.get_depth_level(node, parent_depth_level, sibling_depth_level)
        if depth_obj:
            return self.check_large_images(depth_obj.node,
                            depth_obj.parent_depth, depth_obj.sibling_depth)

        return None
Ejemplo n.º 2
0
    def check_large_images(self, node, parent_depth_level, sibling_depth_level):
        """\
        Find the best "big" image for ``node``, widening the search when
        the node itself has nothing usable.

        Downloading images and measuring them on disk is slow but is the
        most reliable way to judge them, so the work is phased:
        1. collect ALL images below the parent node
        2. drop image names known to be bad (gifs, ads, etc..)
        3. issue a head request per file to check the bare requirements
        4. fully GET whatever survives and check its dimensions on disk
        5. score the images on factors such as height/width
           and possibly things like color density

        Returns an ``Image`` tagged "bigimage", or ``None`` when neither
        this node nor any node offered by ``get_depth_level`` scores one.
        """
        candidates = self.get_image_candidates(node)
        scores = None
        if candidates:
            scores = self.fetch_images(candidates, parent_depth_level)

        if scores:
            # best score first; ties keep the mapping's own order
            ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
            winner = ranked[0][0]

            result = Image()
            result.src = winner.src
            result.extraction_type = "bigimage"
            total = len(scores)
            if total > 0:
                result.confidence_score = 100 / total
            else:
                result.confidence_score = 0
            return result

        # nothing usable at this level: retry on the next node, if any
        next_level = self.get_depth_level(node, parent_depth_level, sibling_depth_level)
        if not next_level:
            return None
        return self.check_large_images(
            next_level.node, next_level.parent_depth, next_level.sibling_depth)
Ejemplo n.º 3
0
    def from_image_node_to_image(self, image_node, src=None):
        """\
        Build an ``Image`` from ``image_node``.

        A truthy ``src`` is used as the image source as given; otherwise
        the node's ``src`` attribute is read through the parser. Width and
        height are taken from the node via ``size_to_int``.
        """
        result = Image()
        result.src = src if src else self.parser.getAttribute(image_node, 'src')
        for dimension in ('width', 'height'):
            setattr(result, dimension, self.size_to_int(image_node, dimension))

        return result
Ejemplo n.º 4
0
 def scored_image_to_result_image(self, scored_img, scored_imgs_len):
     """\
     Convert a scored image into the ``Image`` handed back to callers.

     ``src``, ``width`` and ``height`` are copied from ``scored_img`` and
     the result is tagged with extraction type "bigimage".
     The confidence score is 100 divided by ``scored_imgs_len``; a count
     that is not positive gives a confidence of 0 instead of raising
     ZeroDivisionError.
     """
     img = Image()
     img.src = scored_img.src
     img.width = scored_img.width
     img.height = scored_img.height
     img.extraction_type = "bigimage"
     # guard the division: an empty score set must not crash the extractor
     img.confidence_score = 100 / scored_imgs_len if scored_imgs_len > 0 else 0
     return img
Ejemplo n.º 5
0
    def get_best_image(self, doc, topNode):
        """\
        Return the first truthy image produced by the extraction
        strategies, tried in this order:
        1. known elements
        2. large images around ``topNode``
        3. meta tags
        An empty ``Image`` is returned when every strategy comes up empty.
        ``doc`` is accepted but not used here.
        """
        # ``or`` short-circuits, so later strategies only run when the
        # earlier ones returned nothing truthy
        return (self.check_known_elements()
                or self.check_large_images(topNode, 0, 0)
                or self.check_meta_tag()
                or Image())
Ejemplo n.º 6
0
 def scored_image_to_result_image(self, scored_img, scored_imgs_len):
     """\
     Wrap the winning scored image in a fresh ``Image`` result.

     The source URL and dimensions come from ``scored_img``; the
     extraction type is always "bigimage". Confidence is spread over the
     number of scored images (100 / ``scored_imgs_len``) and falls back
     to 0 when that count is zero or negative, so the call never raises
     ZeroDivisionError.
     """
     if scored_imgs_len > 0:
         confidence = 100 / scored_imgs_len
     else:
         # nothing was scored: report no confidence rather than dividing by 0
         confidence = 0

     img = Image()
     img.src = scored_img.src
     img.width = scored_img.width
     img.height = scored_img.height
     img.extraction_type = "bigimage"
     img.confidence_score = confidence
     return img
Ejemplo n.º 7
0
    def get_image(self, element, src, score=100, extraction_type="N/A"):
        """\
        Create an ``Image`` for ``src`` with the given confidence
        ``score`` and ``extraction_type``.

        The source is normalised through ``build_image_path``. When
        ``get_local_image`` returns a local copy, its bytes, height and
        width are copied onto the result. ``element`` is not used here.
        """
        result = Image()
        result.src = self.build_image_path(src)
        result.extraction_type = extraction_type
        result.confidence_score = score

        # enrich the result only when a local copy is available
        downloaded = self.get_local_image(result.src)
        if not downloaded:
            return result

        for field in ("bytes", "height", "width"):
            setattr(result, field, getattr(downloaded, field))
        return result
Ejemplo n.º 8
0
    def from_image_node_to_image(self, image_node, src=None, extraction_type="NA"):
        """\
        Turn ``image_node`` into an ``Image`` carrying ``extraction_type``.

        The source is ``src`` when it is truthy, otherwise the node's own
        ``src`` attribute as read by the parser. Width and height are
        obtained from the node with ``size_to_int``.
        """
        source = src or self.parser.getAttribute(image_node, 'src')

        result = Image()
        result.src = source
        result.width = self.size_to_int(image_node, 'width')
        result.height = self.size_to_int(image_node, 'height')
        result.extraction_type = extraction_type
        return result
Ejemplo n.º 9
0
    def check_known_elements(self):
        """\
        in here we check for known image containers from sites
        we've checked out like yahoo, techcrunch, etc... that have
        * known  places to look for good images.
        * TODO: enable this to use a series of settings files
          so people can define what the image ids/classes
          are on specific sites

        Each known name is looked up first as an element id and then as a
        class value (first match); the first ``img`` inside the matching
        element becomes the candidate. The loop does not stop at the first
        hit, so the last known name that yields an image wins.

        Returns an ``Image`` with extraction type "known" and confidence
        90 (enriched with bytes/height/width when a local copy is
        available), or ``None`` when no known element holds an image.
        """
        domain = self.get_clean_domain()

        # Work on a per-call copy. Appending to the module-level
        # KNOWN_IMG_DOM_NAMES made that list grow on every call and leaked
        # one site's custom class names into lookups for every other site.
        known_names = list(KNOWN_IMG_DOM_NAMES)
        if domain in self.custom_site_mapping:
            known_names.extend(self.custom_site_mapping[domain].split('|'))

        known_image = None

        for known_name in known_names:
            known = self.parser.getElementById(self.article.raw_doc, known_name)
            if not known:
                known = self.parser.getElementsByTag(self.article.raw_doc,
                                                attr='class', value=known_name)
                if known:
                    known = known[0]
            if known:
                main_image = self.parser.getElementsByTag(known, tag='img')
                if main_image:
                    known_image = main_image[0]

        if known_image is None:
            return None

        known_image_source = self.parser.getAttribute(known_image, attr='src')
        main_image = Image()
        main_image.src = self.build_image_path(known_image_source)
        main_image.extraction_type = "known"
        main_image.confidence_score = 90
        local_image = self.get_local_image(main_image.src)
        if local_image:
            main_image.bytes = local_image.bytes
            main_image.height = local_image.height
            main_image.width = local_image.width

        return main_image
Ejemplo n.º 10
0
 def check_opengraph_tag(self):
     """\
     Look for an image advertised through ``og:image`` meta tags.

     Tags are tried in document order. A tag is accepted only when it
     has a non-empty ``content`` and ``get_local_image`` returns a local
     copy for it; the resulting ``Image`` has extraction type
     "opengraph", confidence 100 and the local copy's bytes, height and
     width. Returns ``None`` when no tag qualifies.
     """
     document = self.article.raw_doc
     og_nodes = self.parser.getElementsByTag(
         document, tag='meta', attr='property', value='og:image')

     for og_node in og_nodes:
         url = self.parser.getAttribute(og_node, attr='content')
         if not url:
             continue

         candidate = Image()
         candidate.src = url
         candidate.extraction_type = "opengraph"
         candidate.confidence_score = 100

         fetched = self.get_local_image(candidate.src)
         if not fetched:
             # no local copy: skip this tag and try the next one
             continue

         candidate.bytes = fetched.bytes
         candidate.height = fetched.height
         candidate.width = fetched.width
         return candidate

     return None
Ejemplo n.º 11
0
 def check_link_tag(self):
     """\
     Look for an image advertised through ``<link rel="image_src">``.

     Links are tried in document order. A link is accepted only when it
     has a non-empty ``href`` and ``get_local_image`` returns a local
     copy for it; the resulting ``Image`` has extraction type "linktag",
     confidence 100 and the local copy's bytes, height and width.
     Returns ``None`` when no link qualifies.
     """
     document = self.article.raw_doc
     link_nodes = self.parser.getElementsByTag(
         document, tag='link', attr='rel', value='image_src')

     for link_node in link_nodes:
         url = self.parser.getAttribute(link_node, attr='href')
         if not url:
             continue

         candidate = Image()
         candidate.src = url
         candidate.extraction_type = "linktag"
         candidate.confidence_score = 100

         fetched = self.get_local_image(candidate.src)
         if not fetched:
             # no local copy: skip this link and try the next one
             continue

         candidate.bytes = fetched.bytes
         candidate.height = fetched.height
         candidate.width = fetched.width
         return candidate

     return None
Ejemplo n.º 12
0
    def get_image(self, element, src, score=100, extraction_type="N/A"):
        """\
        Assemble the ``Image`` describing ``src``.

        ``src`` is passed through ``build_image_path``; ``score`` becomes
        the confidence score and ``extraction_type`` is stored unchanged.
        If ``get_local_image`` provides a local copy, its size information
        (bytes, height, width) is added. ``element`` is unused in this
        method.
        """
        picture = Image()
        picture.src = self.build_image_path(src)
        picture.extraction_type = extraction_type
        picture.confidence_score = score

        # a local copy, when present, supplies the measured details
        on_disk = self.get_local_image(picture.src)
        if on_disk:
            picture.bytes, picture.height, picture.width = (
                on_disk.bytes, on_disk.height, on_disk.width)

        return picture
Ejemplo n.º 13
0
    def check_known_elements(self):
        """\
        in here we check for known image containers from sites
        we've checked out like yahoo, techcrunch, etc... that have
        * known  places to look for good images.
        * TODO: enable this to use a series of settings files
          so people can define what the image ids/classes
          are on specific sites

        Every known name is resolved by id first and by class second
        (first match); the first ``img`` under the resolved element is the
        candidate, and a later name that also yields an image replaces an
        earlier one.

        Returns an ``Image`` ("known", confidence 90, plus
        bytes/height/width when a local copy exists), or ``None`` when no
        known element contains an image.
        """
        domain = self.get_clean_domain()

        # Build the lookup list locally instead of appending to the shared
        # module-level KNOWN_IMG_DOM_NAMES: the old in-place append grew
        # the global on every call and made site-specific class names
        # apply to all later documents.
        names_to_try = list(KNOWN_IMG_DOM_NAMES)
        if domain in self.custom_site_mapping:
            names_to_try += self.custom_site_mapping[domain].split('|')

        known_image = None

        for known_name in names_to_try:
            known = Parser.getElementById(self.article.raw_doc, known_name)
            if not known:
                known = Parser.getElementsByTag(self.article.raw_doc,
                                                attr='class', value=known_name)
                if known:
                    known = known[0]
            if known:
                main_image = Parser.getElementsByTag(known, tag='img')
                if main_image:
                    known_image = main_image[0]

        if known_image is None:
            return None

        known_image_source = Parser.getAttribute(known_image, attr='src')
        main_image = Image()
        main_image.src = self.build_image_path(known_image_source)
        main_image.extraction_type = "known"
        main_image.confidence_score = 90
        local_image = self.get_local_image(main_image.src)
        if local_image:
            main_image.bytes = local_image.bytes
            main_image.height = local_image.height
            main_image.width = local_image.width

        return main_image
Ejemplo n.º 14
0
 def check_opengraph_tag(self):
     """\
     Return the first open graph image that can be fetched locally.

     Every ``<meta property="og:image">`` of the raw document is
     examined in order. One with a non-empty ``content`` for which
     ``get_local_image`` returns a copy produces an ``Image``
     ("opengraph", confidence 100, with the copy's bytes, height and
     width). ``None`` is returned when no tag gets that far.
     """
     root = self.article.raw_doc
     for tag in Parser.getElementsByTag(root, tag='meta', attr='property', value='og:image'):
         content = Parser.getAttribute(tag, attr='content')
         if not content:
             continue

         found = Image()
         found.src = content
         found.extraction_type = "opengraph"
         found.confidence_score = 100

         copy_on_disk = self.get_local_image(found.src)
         if copy_on_disk:
             found.bytes = copy_on_disk.bytes
             found.height = copy_on_disk.height
             found.width = copy_on_disk.width
             return found
         # otherwise fall through and try the next og:image tag

     return None
Ejemplo n.º 15
0
 def check_link_tag(self):
     """\
     Return the first ``image_src`` link image that can be fetched
     locally.

     Every ``<link rel="image_src">`` of the raw document is examined
     in order. One with a non-empty ``href`` for which
     ``get_local_image`` returns a copy produces an ``Image``
     ("linktag", confidence 100, with the copy's bytes, height and
     width). ``None`` is returned when no link gets that far.
     """
     root = self.article.raw_doc
     for tag in Parser.getElementsByTag(root, tag='link', attr='rel', value='image_src'):
         target = Parser.getAttribute(tag, attr='href')
         if not target:
             continue

         found = Image()
         found.src = target
         found.extraction_type = "linktag"
         found.confidence_score = 100

         copy_on_disk = self.get_local_image(found.src)
         if copy_on_disk:
             found.bytes = copy_on_disk.bytes
             found.height = copy_on_disk.height
             found.width = copy_on_disk.width
             return found
         # otherwise fall through and try the next image_src link

     return None
Ejemplo n.º 16
0
 def getExpectedImage(self, expected_value):
     """\
     Build an ``Image`` whose attributes mirror ``expected_value``:
     every key/value pair of the mapping is set as an attribute of the
     same name on a fresh ``Image``.
     """
     expected = Image()
     for attr_name, attr_value in expected_value.items():
         setattr(expected, attr_name, attr_value)
     return expected