def getblocks(self, e):
    """Chop element ``e`` into text blocks and return the long-enough ones.

    ``self.chunk(e, [], False)`` is run first (presumably it fills
    ``self.blocks`` -- confirm against ``chunk``).  Every ``(path, texts)``
    entry of ``self.blocks`` is then turned into a ``TextBlock`` built from:

    * the last ``self.PATH_COMPS`` components of ``path`` joined by ``'/'``,
    * the concatenation of every text in ``texts``,
    * the concatenation of only those texts whose paired flag is falsy.

    A block is kept only when ``len(block.sig_text)`` is at least
    ``self.MIN_CHARS``.  Returns the kept blocks as a list, in the order of
    ``self.blocks``.
    """
    self.chunk(e, [], False)
    kept = []
    for (path, texts) in self.blocks:
        label = '/'.join(path[-self.PATH_COMPS:])
        full_text = concat(t for (t, a) in texts)
        # Only the entries whose flag is falsy contribute to the second text.
        unflagged_text = concat(t for (t, a) in texts if not a)
        block = TextBlock(label, full_text, unflagged_text)
        if not (self.MIN_CHARS <= len(block.sig_text)):
            # Too little text in sig_text; drop this block.
            continue
        kept.append(block)
    return kept
def getblocks(self, e):
    """Return the ``TextBlock`` objects for ``e`` that meet the size minimum.

    Calls ``self.chunk(e, [], False)`` and then walks the ``(path, texts)``
    pairs in ``self.blocks`` (assumed to be populated by ``chunk`` -- verify).
    Each ``texts`` item is a ``(text, flag)`` pair.  Blocks whose
    ``sig_text`` is shorter than ``self.MIN_CHARS`` are left out of the
    returned list.
    """
    self.chunk(e, [], False)

    def make_block(path, texts):
        # Name: trailing PATH_COMPS path components joined with '/'.
        # Second argument: all texts; third: only texts with a falsy flag.
        return TextBlock(
            '/'.join(path[-self.PATH_COMPS:]),
            concat(t for (t, a) in texts),
            concat(t for (t, a) in texts if not a),
        )

    candidates = (make_block(path, texts) for (path, texts) in self.blocks)
    return [blk for blk in candidates if self.MIN_CHARS <= len(blk.sig_text)]
def get_links(self, normalize=False):
    """Yield a ``(text, url)`` pair for each link found under this element.

    Every node produced by ``self.walk()`` is examined; a node counts as a
    link when ``element_tag(node)`` is ``"a"`` and its ``href`` attribute is
    truthy.  The ``href`` value is passed through ``unquote``.  When
    ``normalize`` is true the url is additionally passed through
    ``self.root.normalize_url`` (per the original docstring, this resolves
    relative urls against the document base).  The text part is
    ``concat(node.get_text())``.

    This is a generator: links are produced lazily, in walk order.
    """
    for node in self.walk():
        if element_tag(node) != "a" or not node.get_attr("href"):
            # Not an anchor, or an anchor without a usable href.
            continue
        url = unquote(node.get_attr("href"))
        url = self.root.normalize_url(url) if normalize else url
        yield (concat(node.get_text()), url)
def get_text(x, elimtags=('style', 'script', 'comment', 'option')):
    """Return the text of ``x``.

    If ``x`` is an ``HTMLElement``, the result is
    ``concat(x.get_text(elimtags=elimtags))``; ``elimtags`` is forwarded
    unchanged (presumably the tags whose content is excluded -- confirm
    against ``HTMLElement.get_text``).  Any other value is returned as is.
    """
    if not isinstance(x, HTMLElement):
        # Non-element values are handed back untouched.
        return x
    return concat(x.get_text(elimtags=elimtags))