def _extractor(txt, htmlpage=None):
    """Run ``extractor`` over ``txt`` and adapt whatever it extracts.

    ``extractor`` is taken from the surrounding scope.

    Args:
        txt: The value to extract from. Anything lacking a ``text_content``
            attribute is wrapped in an ``HtmlPageRegion`` first.
        htmlpage: Either the page itself or an object exposing the page
            through an ``htmlpage`` attribute.

    Returns:
        The result of ``extractor.adapt`` when the extraction is truthy,
        otherwise ``None`` (also when ``txt`` is ``None``).
    """
    if txt is None:
        return None

    # Unwrap objects that carry the page in their own ``htmlpage`` attribute.
    source_page = getattr(htmlpage, 'htmlpage', htmlpage)

    region = txt if hasattr(txt, 'text_content') else HtmlPageRegion(source_page, txt)
    extracted = extractor.extract(region)
    if not extracted:
        return None
    return extractor.adapt(extracted, source_page)
def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
    """Extract values from the attributes of the tag at ``start_index``.

    Each entry of ``self.tag_data`` is a triple: a callable, the name of the
    tag attribute to read, and the name to report the value under. Attributes
    that are missing or empty are skipped, as are values for which the
    callable returns ``None``.

    Args:
        extraction_page: Page object providing ``htmlpage_tag(index)`` and
            ``htmlpage``.
        start_index: Index passed to ``extraction_page.htmlpage_tag``.
        end_index: Accepted but not used by this method.
        ignored_regions: Accepted but not used by this method.
        **kwargs: Accepted but not used by this method.

    Returns:
        A list of ``(name, extracted_value)`` tuples, in ``self.tag_data``
        order.
    """
    results = []
    for extract_fn, tag_attribute, extracted_attribute in self.tag_data:
        attributes = extraction_page.htmlpage_tag(start_index).attributes
        raw_value = attributes.get(tag_attribute)
        if not raw_value:
            continue
        # The callable receives the attribute value wrapped as a page region.
        value = extract_fn(HtmlPageRegion(extraction_page.htmlpage, raw_value))
        if value is not None:
            results.append((extracted_attribute, value))
    return results
def __new__(cls, htmlpage, regions):
    """Create the instance from the concatenation of ``regions``.

    Args:
        htmlpage: Page handed through to ``HtmlPageRegion.__new__``.
        regions: Iterable of text pieces; they are joined with no separator.

    Returns:
        Whatever ``HtmlPageRegion.__new__`` returns for ``cls``, ``htmlpage``
        and the joined text.
    """
    return HtmlPageRegion.__new__(cls, htmlpage, u''.join(regions))
def extract(self, region):
    """Extract a region from the region passed.

    Delegates the text extraction to ``self.extract_text``.

    Args:
        region: The source region; its ``htmlpage`` is reused for the result.

    Returns:
        An ``HtmlPageRegion`` over the extracted text, or ``None`` when
        ``self.extract_text`` returns something falsy.
    """
    extracted_text = self.extract_text(region)
    if not extracted_text:
        return None
    return HtmlPageRegion(region.htmlpage, extracted_text)
def _exec(x):
    """Chain ``g`` and ``f`` with a ``remove_tags`` pass in between.

    ``g``, ``f`` and ``remove_tags`` are taken from the surrounding scope.
    ``g(x)`` is expected to return either ``None`` or an object exposing
    ``htmlpage`` and ``text_content``.

    Returns:
        ``None`` when ``g(x)`` is ``None``; otherwise the result of ``f``
        applied to a new ``HtmlPageRegion`` built from the same page and the
        ``remove_tags``-processed text.
    """
    intermediate = g(x)
    if intermediate is None:
        return None
    cleaned = HtmlPageRegion(intermediate.htmlpage, remove_tags(intermediate.text_content))
    return f(cleaned)