def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        items = [
            {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'},
            {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
        ]
        for item in items:
            meta = Parser.getElementsByTag(article.doc, **item)
            if meta:
                attr = Parser.getAttribute(meta[0], attr='content')
                break
    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            self.language = value.lower()
            return value.lower()
    return None
def get_replacement_nodes(self, doc, div):
    replacement_text = []
    nodes_to_return = []
    nodes_to_remove = []
    childs = Parser.childNodesWithText(div)
    for kid in childs:
        # node is a p
        # and already have some replacement text
        if Parser.getTag(kid) == 'p' and len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
            nodes_to_return.append(kid)
        # node is a text node
        elif Parser.isTextNode(kid):
            kid_text_node = kid
            kid_text = Parser.getText(kid)
            replace_text = self.tablines_replacements.replaceAll(kid_text)
            if len(replace_text) > 1:
                # pull in leading <a> siblings of the text node
                previous_sibling_node = Parser.previousSibling(kid_text_node)
                while previous_sibling_node is not None \
                        and Parser.getTag(previous_sibling_node) == "a" \
                        and Parser.getAttribute(previous_sibling_node, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(previous_sibling_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(previous_sibling_node)
                    Parser.setAttribute(previous_sibling_node,
                                        attr='grv-usedalready', value='yes')
                    previous_sibling_node = Parser.previousSibling(previous_sibling_node)
                # append replace_text
                replacement_text.append(replace_text)
                # pull in trailing <a> siblings as well
                next_sibling_node = Parser.nextSibling(kid_text_node)
                while next_sibling_node is not None \
                        and Parser.getTag(next_sibling_node) == "a" \
                        and Parser.getAttribute(next_sibling_node, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(next_sibling_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(next_sibling_node)
                    Parser.setAttribute(next_sibling_node,
                                        attr='grv-usedalready', value='yes')
                    next_sibling_node = Parser.nextSibling(next_sibling_node)
        # otherwise
        else:
            nodes_to_return.append(kid)
    # flush out anything still remaining
    if len(replacement_text) > 0:
        new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
        nodes_to_return.append(new_node)
        replacement_text = []
    for n in nodes_to_remove:
        Parser.remove(n)
    return nodes_to_return
def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        items = [{
            'tag': 'meta',
            'attr': 'http-equiv',
            'value': 'content-language'
        }, {
            'tag': 'meta',
            'attr': 'name',
            'value': 'lang'
        }]
        for item in items:
            meta = Parser.getElementsByTag(article.doc, **item)
            if meta:
                attr = Parser.getAttribute(meta[0], attr='content')
                break
    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            self.language = value.lower()
            return value.lower()
    return None
def checkForOpenGraphTag(self):
    """\
    checks to see if we were able to find open graph tags on this page
    """
    node = self.article.rawDoc
    meta = Parser.getElementsByTag(node, tag='meta',
                                   attr='property', value='og:image')
    for item in meta:
        href = Parser.getAttribute(item, attr='content')
        if href:
            mainImage = Image()
            mainImage.imageSrc = href
            mainImage.imageExtractionType = "opengraph"
            mainImage.confidenceScore = 100
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width
            return mainImage
    return None
def getMetaLang(self, article):
    """\
    Extract content languages from metas
    """
    # we have a lang attribute in html
    meta_langs = []
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is not None:
        meta_langs += attr.replace(' ', '').lower().split(',')
    # look up for a Content-Language in meta
    # (attribute name, expected value prefix) pairs
    attrs = [
        ('http-equiv', 'content-language'),
        ('name', 'lang'),
        ('name', 'og:lang'),
    ]
    head = article.doc.find('head')
    if head is not None:
        metas = Parser.getElementsByTag(head, tag='meta')
        for meta in metas:
            for attr, value in attrs:
                if meta.attrib.get(attr, '').lower().startswith(value):
                    langs = meta.attrib.get('content', None)
                    if langs is not None:
                        meta_langs += langs.replace(' ', '').lower().split(',')
            if 'lang' in meta.attrib:
                meta_langs += meta.attrib['lang'].replace(' ', '').lower().split(',')
    result = []
    for lang in meta_langs:
        lang = lang[:2]
        if re.search(RE_LANG, lang):
            result.append(lang)
    return result
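# A minimal, self-contained sketch of the language-normalization step used in the
# variant above: comma-separated values such as "en-US, fr" are stripped of spaces,
# lowercased, truncated to two letters, and validated. The helper name and the
# regex below are illustrative assumptions, not part of the extractor itself.
import re

RE_LANG = r'^[A-Za-z]{2}$'


def normalize_meta_langs(raw_values):
    """Return the distinct two-letter codes found in raw meta-language strings."""
    langs = []
    for raw in raw_values:
        for lang in raw.replace(' ', '').lower().split(','):
            lang = lang[:2]
            if re.search(RE_LANG, lang) and lang not in langs:
                langs.append(lang)
    return langs

# e.g. normalize_meta_langs(['en-US, fr', 'de']) -> ['en', 'fr', 'de']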
def checkForLinkTag(self):
    """\
    checks to see if we were able to find a link_src on this page
    """
    node = self.article.rawDoc
    meta = Parser.getElementsByTag(node, tag='link',
                                   attr='rel', value='image_src')
    for item in meta:
        href = Parser.getAttribute(item, attr='href')
        if href:
            mainImage = Image()
            mainImage.imageSrc = href
            mainImage.imageExtractionType = "linktag"
            mainImage.confidenceScore = 100
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width
            return mainImage
    return None
def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        kwargs = {'tag': 'meta',
                  'attr': 'http-equiv',
                  'value': 'content-language'}
        meta = Parser.getElementsByTag(article.doc, **kwargs)
        if meta:
            attr = Parser.getAttribute(meta[0], attr='content')
    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            return value.lower()
    return None
def isOkImageFileName(self, imageNode):
    """\
    will check the image src against a list
    of bad image files we know of like buttons, etc...
    """
    imgSrc = Parser.getAttribute(imageNode, attr='src')
    if not imgSrc:
        return False
    if self.matchBadImageNames.search(imgSrc):
        return False
    return True
def is_valid_filename(self, imageNode):
    """\
    will check the image src against a list
    of bad image files we know of like buttons, etc...
    """
    src = Parser.getAttribute(imageNode, attr='src')
    if not src:
        return False
    if self.badimages_names_re.search(src):
        return False
    return True
def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        kwargs = {
            'tag': 'meta',
            'attr': 'http-equiv',
            'value': 'content-language'
        }
        meta = Parser.getElementsByTag(article.doc, **kwargs)
        if meta:
            attr = Parser.getAttribute(meta[0], attr='content')
    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            return value.lower()
    return None
def downloadImagesAndGetResults(self, images, depthLevel):
    """\
    download the images to temp disk and set their dimensions
    - we're going to score the images in the order in which they appear
      so images higher up will have more importance,
    - we'll count the area of the 1st image as a score of 1 and then calculate
      how much larger or small each image after it is
    - we'll also make sure to try and weed out banner type ad blocks
      that have big widths and small heights or vice versa
    - so if the image is 3rd found in the dom its sequence score would be
      1 / 3 = .33 * diff in area from the first image
    """
    imageResults = {}
    initialArea = float(0.0)
    totalScore = float(0.0)
    cnt = float(1.0)
    MIN_WIDTH = 50
    for image in images[:30]:
        imgSrc = Parser.getAttribute(image, attr='src')
        imgSrc = self.buildImagePath(imgSrc)
        locallyStoredImage = self.getLocallyStoredImage(imgSrc)
        width = locallyStoredImage.width
        height = locallyStoredImage.height
        imageSrc = locallyStoredImage.imgSrc
        fileExtension = locallyStoredImage.fileExtension
        if fileExtension != '.gif' and fileExtension != 'NA':
            if (depthLevel >= 1 and locallyStoredImage.width > 300) or depthLevel < 1:
                if not self.isBannerDimensions(width, height):
                    if width > MIN_WIDTH:
                        sequenceScore = float(1.0 / cnt)
                        area = float(width * height)
                        totalScore = float(0.0)
                        if initialArea == 0:
                            initialArea = area * float(1.48)
                            totalScore = 1
                        else:
                            areaDifference = float(area / initialArea)
                            totalScore = sequenceScore * areaDifference
                        imageResults.update({locallyStoredImage: totalScore})
                        cnt += 1
        cnt += 1
    return imageResults
def fetch_images(self, images, depth_level):
    """\
    download the images to temp disk and set their dimensions
    - we're going to score the images in the order in which they appear
      so images higher up will have more importance,
    - we'll count the area of the 1st image as a score of 1 and then calculate
      how much larger or small each image after it is
    - we'll also make sure to try and weed out banner type ad blocks
      that have big widths and small heights or vice versa
    - so if the image is 3rd found in the dom its sequence score would be
      1 / 3 = .33 * diff in area from the first image
    """
    image_results = {}
    initial_area = float(0.0)
    total_score = float(0.0)
    cnt = float(1.0)
    MIN_WIDTH = 50
    for image in images[:30]:
        src = Parser.getAttribute(image, attr='src')
        src = self.build_image_path(src)
        local_image = self.get_local_image(src)
        width = local_image.width
        height = local_image.height
        src = local_image.src
        file_extension = local_image.file_extension
        if file_extension != '.gif' and file_extension != 'NA':
            if (depth_level >= 1 and local_image.width > 300) or depth_level < 1:
                if not self.is_banner_dimensions(width, height):
                    if width > MIN_WIDTH:
                        sequence_score = float(1.0 / cnt)
                        area = float(width * height)
                        total_score = float(0.0)
                        if initial_area == 0:
                            initial_area = area * float(1.48)
                            total_score = 1
                        else:
                            area_difference = float(area / initial_area)
                            total_score = sequence_score * area_difference
                        image_results.update({local_image: total_score})
                        cnt += 1
        cnt += 1
    return image_results
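# A standalone sketch of the scoring scheme described in the docstrings above:
# each image gets a sequence score of 1 / position, the first usable image sets
# the reference area (padded by 1.48), and later images score as sequence score
# times their area relative to that reference. The helper and the dimensions in
# the usage note are hypothetical and not part of the extractor.
def score_images(dimensions):
    """dimensions: list of (width, height) tuples in document order."""
    scores = []
    initial_area = 0.0
    for position, (width, height) in enumerate(dimensions, start=1):
        area = float(width * height)
        if initial_area == 0.0:
            # first image becomes the reference and gets a score of 1
            initial_area = area * 1.48
            scores.append(1.0)
        else:
            scores.append((1.0 / position) * (area / initial_area))
    return scores

# e.g. score_images([(600, 400), (600, 400), (120, 600)])
# -> [1.0, ~0.34, ~0.07]: later and smaller images score lower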
def checkForKnownElements(self):
    """\
    in here we check for known image containers from sites
    we've checked out like yahoo, techcrunch, etc... that have
    * known places to look for good images.
    * TODO: enable this to use a series of settings files
      so people can define what the image ids/classes are on specific sites
    """
    domain = self.getCleanDomain()
    if domain in self.customSiteMapping.keys():
        classes = self.customSiteMapping.get(domain).split('|')
        for classname in classes:
            KNOWN_IMG_DOM_NAMES.append(classname)

    knownImage = None

    for knownName in KNOWN_IMG_DOM_NAMES:
        known = Parser.getElementById(self.article.rawDoc, knownName)
        if not known:
            known = Parser.getElementsByTag(self.article.rawDoc,
                                            attr='class', value=knownName)
            if known:
                known = known[0]
        if known:
            mainImage = Parser.getElementsByTag(known, tag='img')
            if mainImage:
                knownImage = mainImage[0]

    if knownImage is not None:
        knownImgSrc = Parser.getAttribute(knownImage, attr='src')
        mainImage = Image()
        mainImage.imageSrc = self.buildImagePath(knownImgSrc)
        mainImage.imageExtractionType = "known"
        mainImage.confidenceScore = 90
        locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
        if locallyStoredImage:
            mainImage.bytes = locallyStoredImage.bytes
            mainImage.height = locallyStoredImage.height
            mainImage.width = locallyStoredImage.width
        return mainImage
def checkForKnownElements(self):
    """\
    in here we check for known image containers from sites
    we've checked out like yahoo, techcrunch, etc... that have
    * known places to look for good images.
    * TODO: enable this to use a series of settings files
      so people can define what the image ids/classes are on specific sites
    """
    domain = self.getCleanDomain()
    if domain in self.customSiteMapping.keys():
        classes = self.customSiteMapping.get(domain).split('|')
        for classname in classes:
            KNOWN_IMG_DOM_NAMES.append(classname)

    knownImage = None

    for knownName in KNOWN_IMG_DOM_NAMES:
        known = Parser.getElementById(self.article.rawDoc, knownName)
        if known is None:
            known = Parser.getElementsByTag(self.article.rawDoc,
                                            attr='class', value=knownName)
            known = known[0] if known else None
        if known is not None:
            mainImage = Parser.getElementsByTag(known, tag='img')
            if mainImage:
                knownImage = mainImage[0]

    if knownImage is not None:
        knownImgSrc = Parser.getAttribute(knownImage, attr='src')
        mainImage = Image()
        mainImage.imageSrc = self.buildImagePath(knownImgSrc)
        mainImage.imageExtractionType = "known"
        mainImage.confidenceScore = 90
        locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
        if locallyStoredImage:
            mainImage.bytes = locallyStoredImage.bytes
            mainImage.height = locallyStoredImage.height
            mainImage.width = locallyStoredImage.width
        return mainImage
def check_known_elements(self):
    """\
    in here we check for known image containers from sites
    we've checked out like yahoo, techcrunch, etc... that have
    * known places to look for good images.
    * TODO: enable this to use a series of settings files
      so people can define what the image ids/classes are on specific sites
    """
    domain = self.get_clean_domain()
    if domain in self.custom_site_mapping.keys():
        classes = self.custom_site_mapping.get(domain).split('|')
        for classname in classes:
            KNOWN_IMG_DOM_NAMES.append(classname)

    known_image = None

    for known_name in KNOWN_IMG_DOM_NAMES:
        known = Parser.getElementById(self.article.raw_doc, known_name)
        if not known:
            known = Parser.getElementsByTag(self.article.raw_doc,
                                            attr='class', value=known_name)
            if known:
                known = known[0]
        if known:
            main_image = Parser.getElementsByTag(known, tag='img')
            if main_image:
                known_image = main_image[0]

    if known_image is not None:
        known_image_source = Parser.getAttribute(known_image, attr='src')
        main_image = Image()
        main_image.src = self.build_image_path(known_image_source)
        main_image.extraction_type = "known"
        main_image.confidence_score = 90
        local_image = self.get_local_image(main_image.src)
        if local_image:
            main_image.bytes = local_image.bytes
            main_image.height = local_image.height
            main_image.width = local_image.width
        return main_image
def check_link_tag(self):
    """\
    checks to see if we were able to find a link_src on this page
    """
    node = self.article.raw_doc
    meta = Parser.getElementsByTag(node, tag='link',
                                   attr='rel', value='image_src')
    for item in meta:
        href = Parser.getAttribute(item, attr='href')
        if href:
            main_image = Image()
            main_image.src = href
            main_image.extraction_type = "linktag"
            main_image.confidence_score = 100
            local_image = self.get_local_image(main_image.src)
            if local_image:
                main_image.bytes = local_image.bytes
                main_image.height = local_image.height
                main_image.width = local_image.width
            return main_image
    return None
def check_opengraph_tag(self):
    """\
    checks to see if we were able to find open graph tags on this page
    """
    node = self.article.raw_doc
    meta = Parser.getElementsByTag(node, tag='meta',
                                   attr='property', value='og:image')
    for item in meta:
        href = Parser.getAttribute(item, attr='content')
        if href:
            main_image = Image()
            main_image.src = href
            main_image.extraction_type = "opengraph"
            main_image.confidence_score = 100
            local_image = self.get_local_image(main_image.src)
            if local_image:
                main_image.bytes = local_image.bytes
                main_image.height = local_image.height
                main_image.width = local_image.width
            return main_image
    return None
def findImagesThatPassByteSizeTest(self, images):
    """\
    loop through all the images and find the ones
    that have the best bytes to even make them a candidate
    """
    cnt = 0
    MAX_BYTES_SIZE = 15728640
    goodImages = []
    # iterate over a copy so removals don't skip elements
    for image in images[:]:
        if cnt > 30:
            return goodImages
        imgSrc = Parser.getAttribute(image, attr='src')
        imgSrc = self.buildImagePath(imgSrc)
        locallyStoredImage = self.getLocallyStoredImage(imgSrc)
        if locallyStoredImage:
            bytes = locallyStoredImage.bytes
            if (bytes == 0 or bytes > self.minBytesForImages) \
                    and bytes < MAX_BYTES_SIZE:
                goodImages.append(image)
            else:
                images.remove(image)
        cnt += 1
    return goodImages if len(goodImages) > 0 else None
def get_images_bytesize_match(self, images):
    """\
    loop through all the images and find the ones
    that have the best bytes to even make them a candidate
    """
    cnt = 0
    MAX_BYTES_SIZE = 15728640
    good_images = []
    # iterate over a copy so removals don't skip elements
    for image in images[:]:
        if cnt > 30:
            return good_images
        src = Parser.getAttribute(image, attr='src')
        src = self.build_image_path(src)
        local_image = self.get_local_image(src)
        if local_image:
            bytes = local_image.bytes
            if (bytes == 0 or bytes > self.images_min_bytes) \
                    and bytes < MAX_BYTES_SIZE:
                good_images.append(image)
            else:
                images.remove(image)
        cnt += 1
    return good_images if len(good_images) > 0 else None
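# A small sketch of the byte-size filter applied above: an image passes when its
# reported size is 0 or above the configured minimum, and in either case below
# the 15 MB ceiling. The 50 kB minimum here is a hypothetical default used only
# for illustration.
MAX_BYTES_SIZE = 15728640  # 15 MB


def passes_bytesize_test(size_in_bytes, min_bytes=50000):
    return (size_in_bytes == 0 or size_in_bytes > min_bytes) \
        and size_in_bytes < MAX_BYTES_SIZE

# e.g. passes_bytesize_test(0) -> True, passes_bytesize_test(1024) -> False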
def getReplacementNodes(self, doc, div):
    replacementText = []
    nodesToReturn = []
    nodesToRemove = []
    childs = Parser.childNodesWithText(div)
    for kid in childs:
        # node is a p
        # and already have some replacement text
        if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
            nodesToReturn.append(kid)
        # node is a text node
        elif Parser.isTextNode(kid):
            kidTextNode = kid
            kidText = Parser.getText(kid)
            replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
            if len(replaceText) > 0:
                # pull in leading <a> siblings of the text node
                prevSibNode = Parser.previousSibling(kidTextNode)
                while prevSibNode is not None \
                        and Parser.getTag(prevSibNode) == "a" \
                        and Parser.getAttribute(prevSibNode, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(prevSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(prevSibNode)
                    Parser.setAttribute(prevSibNode,
                                        attr='grv-usedalready', value='yes')
                    prevSibNode = Parser.previousSibling(prevSibNode)
                # append replaceText
                replacementText.append(replaceText)
                # pull in trailing <a> siblings as well
                nextSibNode = Parser.nextSibling(kidTextNode)
                while nextSibNode is not None \
                        and Parser.getTag(nextSibNode) == "a" \
                        and Parser.getAttribute(nextSibNode, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(nextSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(nextSibNode)
                    Parser.setAttribute(nextSibNode,
                                        attr='grv-usedalready', value='yes')
                    nextSibNode = Parser.nextSibling(nextSibNode)
        # otherwise
        else:
            # skip anchors already folded into the replacement buffer
            if Parser.getTag(kid) == "a" and Parser.getAttribute(kid, 'grv-usedalready') == 'yes':
                continue
            if len(replacementText) > 0:
                newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
            nodesToReturn.append(kid)
    # flush out anything still remaining
    if len(replacementText) > 0:
        newNode = self.getFlushedBuffer(''.join(replacementText), doc)
        nodesToReturn.append(newNode)
        replacementText = []
    for n in nodesToRemove:
        Parser.remove(n)
    return nodesToReturn
def getReplacementNodes(self, doc, div):
    replacementText = []
    nodesToReturn = []
    nodesToRemove = []
    childs = Parser.childNodesWithText(div)
    for kid in childs:
        # node is a p
        # and already have some replacement text
        if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
            nodesToReturn.append(kid)
        # node is a text node
        elif Parser.isTextNode(kid):
            kidTextNode = kid
            kidText = Parser.getText(kid)
            replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
            if len(replaceText) > 1:
                # pull in leading <a> siblings of the text node
                prevSibNode = Parser.previousSibling(kidTextNode)
                while prevSibNode is not None \
                        and Parser.getTag(prevSibNode) == "a" \
                        and Parser.getAttribute(prevSibNode, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(prevSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(prevSibNode)
                    Parser.setAttribute(prevSibNode,
                                        attr='grv-usedalready', value='yes')
                    prevSibNode = Parser.previousSibling(prevSibNode)
                # append replaceText
                replacementText.append(replaceText)
                # pull in trailing <a> siblings as well
                nextSibNode = Parser.nextSibling(kidTextNode)
                while nextSibNode is not None \
                        and Parser.getTag(nextSibNode) == "a" \
                        and Parser.getAttribute(nextSibNode, 'grv-usedalready') != 'yes':
                    outer = " " + Parser.outerHtml(nextSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(nextSibNode)
                    Parser.setAttribute(nextSibNode,
                                        attr='grv-usedalready', value='yes')
                    nextSibNode = Parser.nextSibling(nextSibNode)
        # otherwise
        else:
            nodesToReturn.append(kid)
    # flush out anything still remaining
    if len(replacementText) > 0:
        newNode = self.getFlushedBuffer(''.join(replacementText), doc)
        nodesToReturn.append(newNode)
        replacementText = []
    # drop the anchors we folded into the buffer
    for n in nodesToRemove:
        Parser.remove(n)
    return nodesToReturn
def getReplacementNodes(self, doc, div):
    replacementText = []
    nodesToReturn = []
    nodesToRemove = []
    childs = Parser.childNodesWithText(div)
    for kid in childs:
        # node is a p
        # and already have some replacement text
        if Parser.getTag(kid) == "p" and len(replacementText) > 0:
            newNode = self.getFlushedBuffer("".join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
            nodesToReturn.append(kid)
        # node is a text node
        elif Parser.isTextNode(kid):
            kidTextNode = kid
            kidText = Parser.getText(kid)
            replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
            if len(replaceText) > 1:
                # pull in leading <a> siblings of the text node
                prevSibNode = Parser.previousSibling(kidTextNode)
                while (
                    prevSibNode is not None
                    and Parser.getTag(prevSibNode) == "a"
                    and Parser.getAttribute(prevSibNode, "grv-usedalready") != "yes"
                ):
                    outer = " " + Parser.outerHtml(prevSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(prevSibNode)
                    Parser.setAttribute(prevSibNode, attr="grv-usedalready", value="yes")
                    prevSibNode = Parser.previousSibling(prevSibNode)
                # append replaceText
                replacementText.append(replaceText)
                # pull in trailing <a> siblings as well
                nextSibNode = Parser.nextSibling(kidTextNode)
                while (
                    nextSibNode is not None
                    and Parser.getTag(nextSibNode) == "a"
                    and Parser.getAttribute(nextSibNode, "grv-usedalready") != "yes"
                ):
                    outer = " " + Parser.outerHtml(nextSibNode) + " "
                    replacementText.append(outer)
                    nodesToRemove.append(nextSibNode)
                    Parser.setAttribute(nextSibNode, attr="grv-usedalready", value="yes")
                    nextSibNode = Parser.nextSibling(nextSibNode)
        # otherwise
        else:
            nodesToReturn.append(kid)
    # flush out anything still remaining
    if len(replacementText) > 0:
        newNode = self.getFlushedBuffer("".join(replacementText), doc)
        nodesToReturn.append(newNode)
        replacementText = []
    for n in nodesToRemove:
        Parser.remove(n)
    return nodesToReturn