Ejemplo n.º 1
0
 def _extractor(txt, htmlpage=None):
     if txt is None:
         return
     page = getattr(htmlpage, 'htmlpage', htmlpage)
     if not hasattr(txt, 'text_content'):
         txt = HtmlPageRegion(page, txt)
     data = extractor.extract(txt)
     if data:
         return extractor.adapt(data, page)
Ejemplo n.º 2
0
 def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
     data = []
     for (f, ta, ea) in self.tag_data:
         tag_value = extraction_page.htmlpage_tag(start_index).attributes.get(ta)
         if tag_value:
             region = HtmlPageRegion(extraction_page.htmlpage, tag_value)
             extracted = f(region)
             if extracted is not None:
                 data.append((ea, extracted))
     return data
Ejemplo n.º 3
0
 def __new__(cls, htmlpage, regions):
     """Build the instance from the concatenated text of ``regions``.

     The items of ``regions`` are joined with no separator and the
     resulting text is passed, together with ``htmlpage``, to
     ``HtmlPageRegion.__new__``.
     """
     joined_text = u''.join(regions)
     instance = HtmlPageRegion.__new__(cls, htmlpage, joined_text)
     return instance
Ejemplo n.º 4
0
 def extract(self, region):
     """Extract text from ``region`` and wrap it as a new page region.

     The text comes from ``self.extract_text(region)``.  A truthy result
     is returned as an ``HtmlPageRegion`` on ``region.htmlpage``; a falsy
     result yields ``None``.
     """
     extracted_text = self.extract_text(region)
     if not extracted_text:
         return None
     return HtmlPageRegion(region.htmlpage, extracted_text)
Ejemplo n.º 5
0
 def _exec(x):
     """Apply ``g`` to ``x``, strip the result with ``remove_tags``, then apply ``f``.

     When ``g(x)`` is ``None`` the chain stops and ``None`` is returned.
     Otherwise the result's ``text_content`` is passed through
     ``remove_tags``, wrapped in a new ``HtmlPageRegion`` on the same
     ``htmlpage``, and handed to ``f``.
     """
     region = g(x)
     if region is None:
         return None
     page = region.htmlpage
     stripped_text = remove_tags(region.text_content)
     return f(HtmlPageRegion(page, stripped_text))
Ejemplo n.º 6
0
 def __new__(cls, htmlpage, regions):
     """Build the instance from the concatenated text of ``regions``.

     The items of ``regions`` are joined with no separator and the
     resulting text is passed, together with ``htmlpage``, to
     ``HtmlPageRegion.__new__``.
     """
     joined_text = u''.join(regions)
     instance = HtmlPageRegion.__new__(cls, htmlpage, joined_text)
     return instance