class HTMLDocument(object):
    """
    Fundamental class of the rrslib web framework.

    HTMLDocument provides an API for visibility-driven manipulation: a new
    attribute "style" is added to every element of the tree. This style is of
    type CSSStyle and represents the result of the cascading styles parsed
    from the page and from external files.

    HTMLDocument uses the persistent-tree API for lxml.

    The class also provides methods for high-level page operations such as:
     - frame checking
     - metadata parsing
     - navigation storage (should parse the page implicitly)
    """

    def __init__(self, elemtree, url):
        # object tree representing this document
        self._lxmletree = elemtree
        # CSS parser parses external and inline CSS declarations on the page
        self.cssparser = CSSParser()
        # metadata, additional information about the document
        self.frames = []
        self.url = url
        self._meta = {}
        # content
        self.navigation = {}
        self.name = None

    def _normalize_meta_property(self, property):
        # strip a namespace prefix such as "DC." or "og:"
        for delim in (".", ":"):
            if delim in property:
                property = property.split(delim)[1]
        # classify upper-case letters
        firstupper = property[0].isupper()
        middleupper = not property[1:].islower()
        if firstupper and not middleupper:
            # e.g. "Keywords" -> "keywords"
            property = property.lower()
        elif middleupper:
            # split camelCase names, e.g. "siteName" -> "site name"
            buff = []
            for i, letter in enumerate(property):
                if i == 0:
                    buff.append(letter.lower())
                    continue
                if letter.isupper():
                    buff.append(" ")
                buff.append(letter.lower())
            property = "".join(buff)
        return property

    @lazy
    def _parse_meta(self):
        title = self._lxmletree.find(".//title")
        if title is not None:
            self.name = title.text
        meta = self._lxmletree.findall(".//meta[@content]")
        for tag in meta:
            content = tag.get("content")
            name, httpequiv, property = (tag.get("name"),
                                         tag.get("http-equiv"),
                                         tag.get("property"))
            if name is not None:
                name = self._normalize_meta_property(name)
                if name == "keywords":
                    self._meta[name] = [x.strip() for x in content.split(",")]
                elif name in self._meta:
                    if content not in self._meta[name]:
                        self._meta[name].append(content)
                else:
                    self._meta[name] = [content]
            elif httpequiv is not None:
                httpequiv = httpequiv.lower()
                if httpequiv == "content-type" and ";" in content:
                    contenttype, charset = content.split(";", 1)
                    self._meta[httpequiv] = contenttype
                    if "=" in charset:
                        self._meta["charset"] = charset.split("=")[1]
                else:
                    self._meta[httpequiv] = content
            elif property is not None:
                property = self._normalize_meta_property(property)
                if property in self._meta:
                    if content not in self._meta[property]:
                        self._meta[property].append(content)
                else:
                    self._meta[property] = [content]

    def get_meta(self, name):
        self._parse_meta()
        try:
            return self._meta[name]
        except KeyError:
            return None

    def get_meta_map(self):
        self._parse_meta()
        return self._meta

    @lazy
    def parse_document(self):
        """
        Parse the whole HTML document on the basis of lxml.etree.ElementTree.
        """
        # use the persistent lxml.etree.ElementTree API (rrslib extension)
        persist_ElementTree(self._lxmletree)
        # parse CSS
        self.cssparser.parse(self._lxmletree, self.url)
        # parse metadata
        self._parse_meta()

    @cached
    def get_language(self):
        identifier = LanguageIdentifier()
        return identifier.identify(self.text_content())

    @cached
    def text_content(self):
        return self._lxmletree.getroot().text_content()

    def get_element_visibility(self, elem):
        """
        Return an integer representing the visibility of the element's text.
""" return elem.style.get_visibility() @cached def get_frames(self): """ If page contains frames, returns their urls (from "src" attribute) @return list of frame's URL's or None if no frames on the page """ # get all frames on the page f = [] try: frames = self._lxmletree.findall("//frameset/frame") frames.extend(self._lxmletree.findall("//iframe")) except: return None # nothing found, it is noframe page if not frames: return None # frames found, get URLs from them for frame in frames: # make frame URLs absolute if self.url is not None: base = self.url basesplit = urlsplit(self.url) if re.match("/[^.]*[^/]$", basesplit[2], re.I): base = self.url + "/" frame.make_links_absolute(base) # URL is in src attribute f.append(frame.get("src")) return f # list of frames URLs def add_menu_item(self, text, link): self.navigation[text] = link def get_name(self): return self.name def set_name(self, name): self.name = name def get_url(self): return self.url def get_menu(self): return self.navigation def get_etree(self): return self._lxmletree def __str__(self): return "<" + __modulename__ + ".HTMLDocument url='" + self.url + "'>"