Ejemplo n.º 1
0
 def getblocks(self, e):
   """Chop element `e` into text blocks and return the ones worth keeping.

   `self.chunk` is run on `e` first; the (path, texts) pairs then found in
   `self.blocks` are each turned into a TextBlock. A block is kept only
   when its `sig_text` is at least `self.MIN_CHARS` characters long.
   """
   self.chunk(e, [], False)
   kept = []
   for (path, texts) in self.blocks:
     # Only the last PATH_COMPS components of the path make up the name.
     name = '/'.join(path[-self.PATH_COMPS:])
     everything = concat(t for (t, a) in texts)
     # Second argument drops the pieces whose flag `a` is truthy
     # (presumably anchor text -- confirm against chunk()).
     unflagged = concat(t for (t, a) in texts if not a)
     block = TextBlock(name, everything, unflagged)
     if len(block.sig_text) < self.MIN_CHARS:
       continue
     kept.append(block)
   return kept
Ejemplo n.º 2
0
 def getblocks(self, e):
     """Split element `e` into TextBlocks, keeping only the long-enough ones.

     Calls `self.chunk` on `e`, then builds one TextBlock per (path, texts)
     entry of `self.blocks`. Blocks whose `sig_text` is shorter than
     `self.MIN_CHARS` are discarded.
     """
     self.chunk(e, [], False)
     # Built lazily so each block is constructed and then length-checked
     # before the next one is created.
     candidates = (
         TextBlock(
             # name: the trailing PATH_COMPS path components
             '/'.join(path[-self.PATH_COMPS:]),
             # all text pieces
             concat(t for (t, a) in texts),
             # only the pieces whose flag `a` is falsy
             concat(t for (t, a) in texts if not a),
         )
         for (path, texts) in self.blocks
     )
     return [blk for blk in candidates if self.MIN_CHARS <= len(blk.sig_text)]
Ejemplo n.º 3
0
  def get_links(self, normalize=False):
    """Yield a (text, url) pair for every "a" element with a truthy href.

    Elements are visited in the order produced by self.walk(). The href
    value is passed through unquote(); when `normalize` is true it is
    additionally passed through self.root.normalize_url().
    """
    for node in self.walk():
      is_anchor = element_tag(node) == "a"
      # Skip anything that is not an anchor or has no usable href.
      if not is_anchor or not node.get_attr("href"):
        continue
      target = unquote(node.get_attr("href"))
      if normalize:
        target = self.root.normalize_url(target)
      yield (concat(node.get_text()), target)
Ejemplo n.º 4
0
def get_text(x, elimtags=('style', 'script', 'comment', 'option')):
  """Return the concatenated text of `x`, or `x` itself if not an element.

  For an HTMLElement, `elimtags` is forwarded to its get_text() method and
  the result is joined with concat(). Any other value is returned as is.
  """
  if not isinstance(x, HTMLElement):
    return x
  return concat(x.get_text(elimtags=elimtags))