def checkForLargeImages(self, node, parentDepthLevel, siblingDepthLevel):
        """\
        although slow the best way to determine the best image is to download
        them and check the actual dimensions of the image when on disk
        so we'll go through a phased approach...
        1. get a list of ALL images from the parent node
        2. filter out any bad image names that we know of (gifs, ads, etc..)
        3. do a head request on each file to make sure it meets
           our bare requirements
        4. any images left over let's do a full GET request,
           download em to disk and check their dimensions
        5. Score images based on different factors like height/width
           and possibly things like color density
        """
        goodImages = self.getImageCandidates(node)

        if goodImages:
            scoredImages = self.downloadImagesAndGetResults(goodImages, parentDepthLevel)
            if scoredImages:
                highScoreImage = sorted(scoredImages.items(),
                                        key=lambda x: x[1], reverse=True)[0][0]
                mainImage = Image()
                mainImage.imageSrc = highScoreImage.imgSrc
                mainImage.imageExtractionType = "bigimage"
                mainImage.confidenceScore = 100 / len(scoredImages) \
                                    if len(scoredImages) > 0 else 0
                return mainImage

        depthObj = self.getDepthLevel(node, parentDepthLevel, siblingDepthLevel)
        if depthObj:
            return self.checkForLargeImages(depthObj.node,
                            depthObj.parentDepth, depthObj.siblingDepth)

        return None
 def checkForOpenGraphTag(self):
     """\
     checks to see if we were able to 
     find open graph tags on this page
     """
     node = self.article.rawDoc
     meta = Parser.getElementsByTag(node,
                                    tag='meta',
                                    attr='property',
                                    value='og:image')
     for item in meta:
         href = Parser.getAttribute(item, attr='content')
         if href:
             mainImage = Image()
             mainImage.imageSrc = href
             mainImage.imageExtractionType = "opengraph"
             mainImage.confidenceScore = 100
             locallyStoredImage = self.getLocallyStoredImage(
                 mainImage.imageSrc)
             if locallyStoredImage:
                 mainImage.bytes = locallyStoredImage.bytes
                 mainImage.height = locallyStoredImage.height
                 mainImage.width = locallyStoredImage.width
                 return mainImage
     return None
 def checkForLinkTag(self):
     """\
     checks to see if we were able to 
     find open link_src on this page
     """
     node = self.article.rawDoc
     meta = Parser.getElementsByTag(node,
                                    tag='link',
                                    attr='rel',
                                    value='image_src')
     for item in meta:
         href = Parser.getAttribute(item, attr='href')
         if href:
             mainImage = Image()
             mainImage.imageSrc = href
             mainImage.imageExtractionType = "linktag"
             mainImage.confidenceScore = 100
             locallyStoredImage = self.getLocallyStoredImage(
                 mainImage.imageSrc)
             if locallyStoredImage:
                 mainImage.bytes = locallyStoredImage.bytes
                 mainImage.height = locallyStoredImage.height
                 mainImage.width = locallyStoredImage.width
                 return mainImage
     return None
    def getBestImage(self, doc, topNode):
        image = self.checkForKnownElements()
        if image:
            return image

        image = self.checkForLargeImages(topNode, 0, 0)
        if image:
            return image

        image = self.checkForMetaTag()
        if image:
            return image
        return Image()
    def checkForLargeImages(self, node, parentDepthLevel, siblingDepthLevel):
        """\
        although slow the best way to determine the best image is to download
        them and check the actual dimensions of the image when on disk
        so we'll go through a phased approach...
        1. get a list of ALL images from the parent node
        2. filter out any bad image names that we know of (gifs, ads, etc..)
        3. do a head request on each file to make sure it meets 
           our bare requirements
        4. any images left over let's do a full GET request, 
           download em to disk and check their dimensions
        5. Score images based on different factors like height/width 
           and possibly things like color density
        """
        goodImages = self.getImageCandidates(node)

        if goodImages:
            scoredImages = self.downloadImagesAndGetResults(
                goodImages, parentDepthLevel)
            if scoredImages:
                highScoreImage = sorted(scoredImages.items(),
                                        key=lambda x: x[1],
                                        reverse=True)[0][0]
                mainImage = Image()
                mainImage.imageSrc = highScoreImage.imgSrc
                mainImage.imageExtractionType = "bigimage"
                mainImage.confidenceScore = 100 / len(scoredImages) \
                                    if len(scoredImages) > 0 else 0
                return mainImage

        depthObj = self.getDepthLevel(node, parentDepthLevel,
                                      siblingDepthLevel)
        if depthObj:
            return self.checkForLargeImages(depthObj.node,
                                            depthObj.parentDepth,
                                            depthObj.siblingDepth)

        return None
Ejemplo n.º 6
0
    def checkForKnownElements(self):
        """\
        in here we check for known image contains from sites
        we've checked out like yahoo, techcrunch, etc... that have
        * known  places to look for good images.
        * TODO: enable this to use a series of settings files
          so people can define what the image ids/classes
          are on specific sites
        """
        domain = self.getCleanDomain()
        if domain in self.customSiteMapping.keys():
            classes = self.customSiteMapping.get(domain).split('|')
            for classname in classes:
                KNOWN_IMG_DOM_NAMES.append(classname)

        knownImage = None

        for knownName in KNOWN_IMG_DOM_NAMES:
            known = Parser.getElementById(self.article.rawDoc, knownName)
            if not known:
                known = Parser.getElementsByTag(self.article.rawDoc,
                                                attr='class',
                                                value=knownName)
                if known:
                    known = known[0]
            if known:
                mainImage = Parser.getElementsByTag(known, tag='img')
                if mainImage:
                    knownImage = mainImage[0]

        if knownImage is not None:
            knownImgSrc = Parser.getAttribute(knownImage, attr='src')
            mainImage = Image()
            mainImage.imageSrc = self.buildImagePath(knownImgSrc)
            mainImage.imageExtractionType = "known"
            mainImage.confidenceScore = 90
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width

            return mainImage
    def checkForKnownElements(self):
        """\
        in here we check for known image contains from sites
        we've checked out like yahoo, techcrunch, etc... that have
        * known  places to look for good images.
        * TODO: enable this to use a series of settings files
          so people can define what the image ids/classes
          are on specific sites
        """
        domain = self.getCleanDomain()
        if domain in self.customSiteMapping.keys():
            classes = self.customSiteMapping.get(domain).split('|')
            for classname in classes:
                KNOWN_IMG_DOM_NAMES.append(classname)

        knownImage = None

        for knownName in KNOWN_IMG_DOM_NAMES:
            known = Parser.getElementById(self.article.rawDoc, knownName)
            if not known:
                known = Parser.getElementsByTag(self.article.rawDoc,
                                                attr='class', value=knownName)
                if known:
                    known = known[0]
            if known:
                mainImage = Parser.getElementsByTag(known, tag='img')
                if mainImage:
                    knownImage = mainImage[0]

        if knownImage is not None:
            knownImgSrc = Parser.getAttribute(knownImage, attr='src')
            mainImage = Image()
            mainImage.imageSrc = self.buildImagePath(knownImgSrc)
            mainImage.imageExtractionType = "known"
            mainImage.confidenceScore = 90
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width

            return mainImage
 def checkForOpenGraphTag(self):
     """\
     checks to see if we were able to
     find open graph tags on this page
     """
     node = self.article.rawDoc
     meta = Parser.getElementsByTag(node, tag='meta', attr='property', value='og:image')
     for item in meta:
         href = Parser.getAttribute(item, attr='content')
         if href:
             mainImage = Image()
             mainImage.imageSrc = href
             mainImage.imageExtractionType = "opengraph"
             mainImage.confidenceScore = 100
             locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
             if locallyStoredImage:
                 mainImage.bytes = locallyStoredImage.bytes
                 mainImage.height = locallyStoredImage.height
                 mainImage.width = locallyStoredImage.width
                 return mainImage
     return None
 def checkForLinkTag(self):
     """\
     checks to see if we were able to
     find open link_src on this page
     """
     node = self.article.rawDoc
     meta = Parser.getElementsByTag(node, tag='link', attr='rel', value='image_src')
     for item in meta:
         href = Parser.getAttribute(item, attr='href')
         if href:
             mainImage = Image()
             mainImage.imageSrc = href
             mainImage.imageExtractionType = "linktag"
             mainImage.confidenceScore = 100
             locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
             if locallyStoredImage:
                 mainImage.bytes = locallyStoredImage.bytes
                 mainImage.height = locallyStoredImage.height
                 mainImage.width = locallyStoredImage.width
                 return mainImage
     return None