def get_image(self, element, src, score=100, extraction_type="N/A"):
    """\
    assemble an Image for the given source

    src is run through build_image_path, and the extraction type and
    confidence score are recorded exactly as passed in. when
    get_local_image returns something for that path, its bytes,
    height and width are copied onto the result as well.
    element is accepted but not used here.
    """
    result = Image()
    result.src = self.build_image_path(src)
    result.extraction_type = extraction_type
    result.confidence_score = score

    # get_local_image gave us nothing: hand back the bare description
    stored = self.get_local_image(result.src)
    if not stored:
        return result

    # enrich the result with what the local copy knows
    for field in ("bytes", "height", "width"):
        setattr(result, field, getattr(stored, field))
    return result
def check_known_elements(self):
    """\
    look for the main image inside containers known to hold it

    some sites (yahoo, techcrunch, etc...) keep their lead image in
    an element with a well known id or class. the candidate names are
    KNOWN_IMG_DOM_NAMES plus any '|' separated names configured for
    the current domain in self.custom_site_mapping.

    each name is tried as an element id first, then as a class
    (first match only). when several names yield an <img>, the last
    one wins, so the per-domain names take precedence over the
    shared ones.

    returns an Image (extraction_type "known", confidence 90), or
    None when no known container holds an image.

    TODO: enable this to use a series of settings files so people
    can define what the image ids/classes are on specific sites
    """
    # work on a copy: appending to the module level list would leak
    # one site's names into every later lookup and grow it per call
    candidate_names = list(KNOWN_IMG_DOM_NAMES)
    domain = self.get_clean_domain()
    if domain in self.custom_site_mapping:
        candidate_names.extend(self.custom_site_mapping[domain].split('|'))

    known_image = None
    for known_name in candidate_names:
        known = Parser.getElementById(self.article.raw_doc, known_name)
        if not known:
            # no usable element with that id, fall back to the class name
            known = Parser.getElementsByTag(self.article.raw_doc,
                                            attr='class', value=known_name)
            if known:
                known = known[0]
        if known:
            main_image = Parser.getElementsByTag(known, tag='img')
            if main_image:
                # deliberately no break: a later name overrides this one
                known_image = main_image[0]

    if known_image is None:
        return None

    known_image_source = Parser.getAttribute(known_image, attr='src')
    main_image = Image()
    main_image.src = self.build_image_path(known_image_source)
    main_image.extraction_type = "known"
    main_image.confidence_score = 90

    # add size information when get_local_image has something for it
    local_image = self.get_local_image(main_image.src)
    if local_image:
        main_image.bytes = local_image.bytes
        main_image.height = local_image.height
        main_image.width = local_image.width
    return main_image
def check_known_elements(self):
    """\
    look for the main image inside containers known to hold it

    some sites (yahoo, techcrunch, etc...) keep their lead image in
    an element with a well known id or class. the candidate names are
    KNOWN_IMG_DOM_NAMES plus any '|' separated names configured for
    the current domain in self.custom_site_mapping.

    each name is tried as an element id first, then as a class
    (first match only). when several names yield an <img>, the last
    one wins, so the per-domain names take precedence over the
    shared ones.

    returns an Image (extraction_type "known", confidence 90), or
    None when no known container holds an image.

    TODO: enable this to use a series of settings files so people
    can define what the image ids/classes are on specific sites
    """
    # work on a copy: appending to the module level list would leak
    # one site's names into every later lookup and grow it per call
    candidate_names = list(KNOWN_IMG_DOM_NAMES)
    domain = self.get_clean_domain()
    if domain in self.custom_site_mapping:
        candidate_names.extend(self.custom_site_mapping[domain].split('|'))

    known_image = None
    for known_name in candidate_names:
        known = self.parser.getElementById(self.article.raw_doc, known_name)
        if not known:
            # no usable element with that id, fall back to the class name
            known = self.parser.getElementsByTag(self.article.raw_doc,
                                                 attr='class', value=known_name)
            if known:
                known = known[0]
        if known:
            main_image = self.parser.getElementsByTag(known, tag='img')
            if main_image:
                # deliberately no break: a later name overrides this one
                known_image = main_image[0]

    if known_image is None:
        return None

    known_image_source = self.parser.getAttribute(known_image, attr='src')
    main_image = Image()
    main_image.src = self.build_image_path(known_image_source)
    main_image.extraction_type = "known"
    main_image.confidence_score = 90

    # add size information when get_local_image has something for it
    local_image = self.get_local_image(main_image.src)
    if local_image:
        main_image.bytes = local_image.bytes
        main_image.height = local_image.height
        main_image.width = local_image.width
    return main_image
def check_opengraph_tag(self):
    """\
    checks to see if we were able to find open graph tags on this page

    walks the <meta property="og:image"> elements of the raw document
    and builds an Image from the first one that has a non-empty
    content attribute (extraction_type "opengraph", confidence 100).
    returns None when no such tag is found.
    """
    node = self.article.raw_doc
    meta = Parser.getElementsByTag(node, tag='meta', attr='property', value='og:image')
    for item in meta:
        href = Parser.getAttribute(item, attr='content')
        if not href:
            continue
        main_image = Image()
        # go through build_image_path, as check_known_elements does, so
        # a relative og:image value is not stored and fetched verbatim
        # NOTE(review): assumes build_image_path leaves absolute urls
        # untouched - confirm against its implementation
        main_image.src = self.build_image_path(href)
        main_image.extraction_type = "opengraph"
        main_image.confidence_score = 100
        # add size information when get_local_image has something for it
        local_image = self.get_local_image(main_image.src)
        if local_image:
            main_image.bytes = local_image.bytes
            main_image.height = local_image.height
            main_image.width = local_image.width
        return main_image
    return None
def check_link_tag(self):
    """\
    checks to see if we were able to find a link image_src on this page

    walks the <link rel="image_src"> elements of the raw document and
    builds an Image from the first one that has a non-empty href
    attribute (extraction_type "linktag", confidence 100).
    returns None when no such tag is found.
    """
    node = self.article.raw_doc
    meta = Parser.getElementsByTag(node, tag='link', attr='rel', value='image_src')
    for item in meta:
        href = Parser.getAttribute(item, attr='href')
        if not href:
            continue
        main_image = Image()
        # go through build_image_path, as check_known_elements does, so
        # a relative href is not stored and fetched verbatim
        # NOTE(review): assumes build_image_path leaves absolute urls
        # untouched - confirm against its implementation
        main_image.src = self.build_image_path(href)
        main_image.extraction_type = "linktag"
        main_image.confidence_score = 100
        # add size information when get_local_image has something for it
        local_image = self.get_local_image(main_image.src)
        if local_image:
            main_image.bytes = local_image.bytes
            main_image.height = local_image.height
            main_image.width = local_image.width
        return main_image
    return None
def check_opengraph_tag(self):
    """\
    checks to see if we were able to find open graph tags on this page

    walks the <meta property="og:image"> elements of the raw document
    and builds an Image from the first one that has a non-empty
    content attribute (extraction_type "opengraph", confidence 100).
    returns None when no such tag is found.
    """
    node = self.article.raw_doc
    meta = self.parser.getElementsByTag(node, tag='meta', attr='property', value='og:image')
    for item in meta:
        href = self.parser.getAttribute(item, attr='content')
        if not href:
            continue
        main_image = Image()
        # go through build_image_path, as check_known_elements does, so
        # a relative og:image value is not stored and fetched verbatim
        # NOTE(review): assumes build_image_path leaves absolute urls
        # untouched - confirm against its implementation
        main_image.src = self.build_image_path(href)
        main_image.extraction_type = "opengraph"
        main_image.confidence_score = 100
        # add size information when get_local_image has something for it
        local_image = self.get_local_image(main_image.src)
        if local_image:
            main_image.bytes = local_image.bytes
            main_image.height = local_image.height
            main_image.width = local_image.width
        return main_image
    return None
def check_link_tag(self):
    """\
    checks to see if we were able to find a link image_src on this page

    walks the <link rel="image_src"> elements of the raw document and
    builds an Image from the first one that has a non-empty href
    attribute (extraction_type "linktag", confidence 100).
    returns None when no such tag is found.
    """
    node = self.article.raw_doc
    meta = self.parser.getElementsByTag(node, tag='link', attr='rel', value='image_src')
    for item in meta:
        href = self.parser.getAttribute(item, attr='href')
        if not href:
            continue
        main_image = Image()
        # go through build_image_path, as check_known_elements does, so
        # a relative href is not stored and fetched verbatim
        # NOTE(review): assumes build_image_path leaves absolute urls
        # untouched - confirm against its implementation
        main_image.src = self.build_image_path(href)
        main_image.extraction_type = "linktag"
        main_image.confidence_score = 100
        # add size information when get_local_image has something for it
        local_image = self.get_local_image(main_image.src)
        if local_image:
            main_image.bytes = local_image.bytes
            main_image.height = local_image.height
            main_image.width = local_image.width
        return main_image
    return None