def getBestImage(self, doc, topNode):
    image = self.checkForKnownElements()
    if image:
        return image

    image = self.checkForLargeImages(topNode, 0, 0)
    if image:
        return image

    image = self.checkForMetaTag()
    if image:
        return image
    return Image()
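# checkForMetaTag is called above but not shown in this section. Below is a
# minimal sketch of what it might look like, assuming it simply tries the
# <link rel="image_src"> check first and falls back to the open graph check;
# that ordering is an assumption, not something confirmed by the code shown.
def checkForMetaTag(self):
    image = self.checkForLinkTag()
    if image:
        return image

    image = self.checkForOpenGraphTag()
    if image:
        return image
    return None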
def checkForLargeImages(self, node, parentDepthLevel, siblingDepthLevel):
    """\
    although slow, the best way to determine the best image is to download
    them and check the actual dimensions of the image when on disk,
    so we'll go through a phased approach...
    1. get a list of ALL images from the parent node
    2. filter out any bad image names that we know of (gifs, ads, etc..)
    3. do a head request on each file to make sure it meets
       our bare requirements
    4. any images left over, let's do a full GET request,
       download em to disk and check their dimensions
    5. Score images based on different factors like height/width
       and possibly things like color density
    """
    goodImages = self.getImageCandidates(node)

    if goodImages:
        scoredImages = self.downloadImagesAndGetResults(goodImages, parentDepthLevel)
        if scoredImages:
            highScoreImage = sorted(scoredImages.items(),
                                    key=lambda x: x[1], reverse=True)[0][0]
            mainImage = Image()
            mainImage.imageSrc = highScoreImage.imgSrc
            mainImage.imageExtractionType = "bigimage"
            mainImage.confidenceScore = 100 / len(scoredImages) \
                if len(scoredImages) > 0 else 0
            return mainImage

    depthObj = self.getDepthLevel(node, parentDepthLevel, siblingDepthLevel)
    if depthObj:
        return self.checkForLargeImages(depthObj.node,
                                        depthObj.parentDepth,
                                        depthObj.siblingDepth)

    return None
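# The phased approach described in the docstring above (a cheap HEAD request
# to weed out unsuitable files, then a full GET to measure real dimensions)
# is delegated to downloadImagesAndGetResults, which is not shown here. A
# standalone sketch of that idea follows, assuming the `requests` and Pillow
# libraries and hypothetical minimum-size thresholds; it is illustrative
# only and not part of the original extractor.
import io

import requests
from PIL import Image as PILImage


def measure_remote_image(url, min_bytes=2000, min_width=50, min_height=50):
    """Return (width, height) for url, or None if it fails the bare requirements."""
    # phase 1: HEAD request to check content type and size before downloading
    head = requests.head(url, allow_redirects=True, timeout=5)
    content_type = head.headers.get('content-type', '')
    content_length = int(head.headers.get('content-length', 0) or 0)
    if not content_type.startswith('image/') or content_type == 'image/gif':
        return None
    if content_length and content_length < min_bytes:
        return None

    # phase 2: full GET, open the bytes with Pillow and read the real dimensions
    resp = requests.get(url, timeout=10)
    img = PILImage.open(io.BytesIO(resp.content))
    width, height = img.size
    if width < min_width or height < min_height:
        return None
    return width, height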
def checkForKnownElements(self):
    """\
    in here we check for known image containers from sites we've
    checked out like yahoo, techcrunch, etc... that have known
    places to look for good images.
    TODO: enable this to use a series of settings files so people
    can define what the image ids/classes are on specific sites
    """
    domain = self.getCleanDomain()
    if domain in self.customSiteMapping.keys():
        classes = self.customSiteMapping.get(domain).split('|')
        for classname in classes:
            KNOWN_IMG_DOM_NAMES.append(classname)

    knownImage = None

    for knownName in KNOWN_IMG_DOM_NAMES:
        known = Parser.getElementById(self.article.rawDoc, knownName)
        if not known:
            known = Parser.getElementsByTag(self.article.rawDoc,
                                            attr='class', value=knownName)
            if known:
                known = known[0]
        if known:
            mainImage = Parser.getElementsByTag(known, tag='img')
            if mainImage:
                knownImage = mainImage[0]

    if knownImage is not None:
        knownImgSrc = Parser.getAttribute(knownImage, attr='src')
        mainImage = Image()
        mainImage.imageSrc = self.buildImagePath(knownImgSrc)
        mainImage.imageExtractionType = "known"
        mainImage.confidenceScore = 90
        locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
        if locallyStoredImage:
            mainImage.bytes = locallyStoredImage.bytes
            mainImage.height = locallyStoredImage.height
            mainImage.width = locallyStoredImage.width
        return mainImage
    return None
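# The TODO above mentions driving customSiteMapping from settings files.
# A rough sketch of what that could look like, assuming a hypothetical JSON
# file that maps a clean domain to a pipe-separated list of known image
# container ids/classes; the file name and format here are assumptions.
import json


def load_custom_site_mapping(path='known_image_css.json'):
    """Return a dict like {'techcrunch.com': 'article-entry|post-thumb'}."""
    with open(path) as fp:
        return json.load(fp)


# e.g. self.customSiteMapping = load_custom_site_mapping()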
def checkForOpenGraphTag(self):
    """\
    checks to see if we were able to find open graph tags on this page
    """
    node = self.article.rawDoc
    meta = Parser.getElementsByTag(node, tag='meta',
                                   attr='property', value='og:image')
    for item in meta:
        href = Parser.getAttribute(item, attr='content')
        if href:
            mainImage = Image()
            mainImage.imageSrc = href
            mainImage.imageExtractionType = "opengraph"
            mainImage.confidenceScore = 100
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width
            return mainImage
    return None
def checkForLinkTag(self):
    """\
    checks to see if we were able to find a link_src tag on this page
    """
    node = self.article.rawDoc
    meta = Parser.getElementsByTag(node, tag='link',
                                   attr='rel', value='image_src')
    for item in meta:
        href = Parser.getAttribute(item, attr='href')
        if href:
            mainImage = Image()
            mainImage.imageSrc = href
            mainImage.imageExtractionType = "linktag"
            mainImage.confidenceScore = 100
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width
            return mainImage
    return None
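# For reference, the two tag checks above look for markup like
# <meta property="og:image" content="..."> and <link rel="image_src" href="...">.
# Below is a small standalone example of pulling both out of raw HTML with
# lxml, which the Parser wrapper is assumed to sit on top of; the xpath
# approach here is illustrative, not the extractor's own code path.
import lxml.html


def find_declared_images(html):
    doc = lxml.html.fromstring(html)
    return {
        'opengraph': doc.xpath("//meta[@property='og:image']/@content"),
        'linktag': doc.xpath("//link[@rel='image_src']/@href"),
    }


sample = """
<html><head>
  <meta property="og:image" content="http://example.com/a.jpg"/>
  <link rel="image_src" href="http://example.com/b.jpg"/>
</head><body></body></html>
"""
print(find_declared_images(sample))
# {'opengraph': ['http://example.com/a.jpg'], 'linktag': ['http://example.com/b.jpg']}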