def detokenize_single(self, html_tokens, tags):
    """
    Build annotated ``lxml.etree.ElementTree`` from
    ``html_tokens`` (a list of :class:`.HtmlToken` instances)
    and ``tags`` (a list of their tags).

    Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__``
    text tokens (this is the format :mod:`webstruct.loaders` use).

    :param html_tokens: tokens to re-assemble; all are assumed to share
        the same ``.root`` tree — TODO confirm against callers.
    :param tags: one tag string per token (IOB-style: ``'O'`` or
        ``'B-X'``/``'I-X'``, since ``tag[2:]`` strips a 2-char prefix).
    :returns: a deep copy of the tokens' source tree with entity
        boundary markers inserted into text, or ``None`` when
        ``html_tokens`` is empty.
    :raises ValueError: if the two lists differ in length.
    """
    if len(html_tokens) != len(tags):
        raise ValueError("len(html_tokens) must be equal to len(tags)")
    if not html_tokens:
        return None

    # Annotate a deep copy so the original tree is left untouched;
    # XPathEvaluator lets us map elements of the original onto the copy.
    orig_tree = html_tokens[0].root
    tree = copy.deepcopy(orig_tree)
    xpatheval = XPathEvaluator(tree)

    # find starts/ends of token groups:
    # the sequence encoder collapses (token, tag) pairs into contiguous
    # groups; record global token positions where a non-'O' entity
    # starts and where it ends (inclusive).
    token_groups = self.sequence_encoder.group(zip(html_tokens, tags))
    starts, ends = set(), set()
    pos = 0
    for gr_tokens, gr_tag in token_groups:
        n_tokens = len(gr_tokens)
        if gr_tag != 'O':
            starts.add(pos)
            ends.add(pos + n_tokens - 1)
        pos += n_tokens

    # mark starts/ends with special tokens.
    # Tokens are grouped by the (element, is_tail) slot of text they came
    # from; groupby works here because html_tokens is in document order,
    # so tokens of the same slot are consecutive.
    data = zip(html_tokens, tags, range(len(html_tokens)))
    keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail)
    for (orig_elem, is_tail), g in groupby(data, keyfunc):
        g = list(g)
        fix = False
        # Copy of the slot's full token list (all tokens share it),
        # edited in place at each token's position within the slot.
        tokens = g[0][0].tokens[:]
        for token, tag, token_idx in g:
            if token_idx in starts:
                # Prepend the start marker; a token can be both start
                # and end, in which case both rewrites apply in order.
                text = ' __START_%s__ %s' % (tag[2:], tokens[token.index])
                tokens[token.index] = text
                fix = True
            if token_idx in ends:
                text = '%s __END_%s__ ' % (tokens[token.index], tag[2:])
                tokens[token.index] = text
                fix = True
        if fix:
            # Locate the corresponding element in the annotated copy via
            # the element's canonical xpath in the original tree.
            xpath = orig_tree.getpath(orig_elem)
            elem = xpatheval(xpath)[0]
            if is_tail:
                elem.tail = smart_join(tokens)
            else:
                elem.text = smart_join(tokens)
    return tree
def detokenize_single(self, html_tokens, tags):
    """
    Reconstruct an annotated ``lxml.etree.ElementTree`` from
    ``html_tokens`` (a list of :class:`.HtmlToken` instances) and
    ``tags`` (a list of their tags).

    Entity boundaries are written back into the tree text as
    ``__START_TAG__`` / ``__END_TAG__`` marker tokens — the format
    that :mod:`webstruct.loaders` understands.

    Returns a deep copy of the tokens' source tree (the original is
    not modified), or ``None`` for empty input. Raises ``ValueError``
    when the two lists have different lengths.
    """
    if len(html_tokens) != len(tags):
        raise ValueError("len(html_tokens) must be equal to len(tags)")
    if not html_tokens:
        return None

    # Work on a copy; the evaluator maps original elements to the copy.
    source_tree = html_tokens[0].root
    annotated = copy.deepcopy(source_tree)
    find_in_copy = XPathEvaluator(annotated)

    # Collect the global token indices where entities begin and end.
    entity_starts = set()
    entity_ends = set()
    offset = 0
    grouped = self.sequence_encoder.group(zip(html_tokens, tags))
    for group_tokens, group_tag in grouped:
        size = len(group_tokens)
        if group_tag != 'O':
            entity_starts.add(offset)
            entity_ends.add(offset + size - 1)
        offset += size

    def _slot(record):
        # Group tokens by the text slot they came from: (element, tail?).
        return record[0].elem, record[0].is_tail

    indexed = zip(html_tokens, tags, range(len(html_tokens)))
    for (src_elem, in_tail), records in groupby(indexed, _slot):
        records = list(records)
        # Shared token list for this slot; edit a private copy.
        words = records[0][0].tokens[:]
        changed = False
        for tok, tok_tag, global_idx in records:
            if global_idx in entity_starts:
                words[tok.index] = ' __START_%s__ %s' % (tok_tag[2:], words[tok.index])
                changed = True
            if global_idx in entity_ends:
                words[tok.index] = '%s __END_%s__ ' % (words[tok.index], tok_tag[2:])
                changed = True
        if not changed:
            continue
        # Map the original element onto the annotated copy via its xpath.
        target = find_in_copy(source_tree.getpath(src_elem))[0]
        if in_tail:
            target.tail = smart_join(words)
        else:
            target.text = smart_join(words)
    return annotated
def build_entity(self, html_tokens, tag):
    """
    Join tokens to an entity. Return an entity, as text.

    By default this function uses :func:`webstruct.utils.smart_join`.
    Override it to customize :meth:`extract`, :meth:`extract_from_url`
    and :meth:`extract_groups` results. If this function returns
    empty string or None, entity is dropped.
    """
    # ``tag`` is unused by the default implementation but kept in the
    # signature so overrides can join differently per entity type.
    texts = [tok.token for tok in html_tokens]
    return smart_join(texts)
def _join_tokens(html_tokens):
    """Return the text of *html_tokens* joined with ``smart_join``."""
    return smart_join([tok.token for tok in html_tokens])