コード例 #1
0
    def detokenize_single(self, html_tokens, tags):
        """
        Build annotated ``lxml.etree.ElementTree`` from
        ``html_tokens`` (a list of :class:`.HtmlToken` instances)
        and ``tags`` (a list of their tags).

        Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__``
        text tokens (this is the format :mod:`webstruct.loaders` use).
        """
        if len(html_tokens) != len(tags):
            raise ValueError("len(html_tokens) must be equal to len(tags)")

        if not html_tokens:
            return None

        orig_tree = html_tokens[0].root
        tree = copy.deepcopy(orig_tree)
        xpatheval = XPathEvaluator(tree)

        # find starts/ends of token groups
        token_groups = self.sequence_encoder.group(zip(html_tokens, tags))
        starts, ends = set(), set()
        pos = 0
        for gr_tokens, gr_tag in token_groups:
            n_tokens = len(gr_tokens)
            if gr_tag != 'O':
                starts.add(pos)
                ends.add(pos + n_tokens - 1)
            pos += n_tokens

        # mark starts/ends with special tokens
        data = zip(html_tokens, tags, range(len(html_tokens)))
        keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail)

        for (orig_elem, is_tail), g in groupby(data, keyfunc):
            g = list(g)
            fix = False
            tokens = g[0][0].tokens[:]
            for token, tag, token_idx in g:
                if token_idx in starts:
                    text = ' __START_%s__ %s' % (tag[2:], tokens[token.index])
                    tokens[token.index] = text
                    fix = True
                if token_idx in ends:
                    text = '%s __END_%s__ ' % (tokens[token.index], tag[2:])
                    tokens[token.index] = text
                    fix = True

            if fix:
                xpath = orig_tree.getpath(orig_elem)
                elem = xpatheval(xpath)[0]
                if is_tail:
                    elem.tail = smart_join(tokens)
                else:
                    elem.text = smart_join(tokens)

        return tree
コード例 #2
0
    def detokenize_single(self, html_tokens, tags):
        """
        Build annotated ``lxml.etree.ElementTree`` from
        ``html_tokens`` (a list of :class:`.HtmlToken` instances)
        and ``tags`` (a list of their tags).

        Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__``
        text tokens (this is the format :mod:`webstruct.loaders` use).
        """
        if len(html_tokens) != len(tags):
            raise ValueError("len(html_tokens) must be equal to len(tags)")

        if not html_tokens:
            return None

        orig_tree = html_tokens[0].root
        tree = copy.deepcopy(orig_tree)
        xpatheval = XPathEvaluator(tree)

        # find starts/ends of token groups
        token_groups = self.sequence_encoder.group(zip(html_tokens, tags))
        starts, ends = set(), set()
        pos = 0
        for gr_tokens, gr_tag in token_groups:
            n_tokens = len(gr_tokens)
            if gr_tag != 'O':
                starts.add(pos)
                ends.add(pos + n_tokens - 1)
            pos += n_tokens

        # mark starts/ends with special tokens
        data = zip(html_tokens, tags, range(len(html_tokens)))
        keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail)

        for (orig_elem, is_tail), g in groupby(data, keyfunc):
            g = list(g)
            fix = False
            tokens = g[0][0].tokens[:]
            for token, tag, token_idx in g:
                if token_idx in starts:
                    tokens[token.index] = ' __START_%s__ %s' % (tag[2:], tokens[token.index])
                    fix = True
                if token_idx in ends:
                    tokens[token.index] = '%s __END_%s__ ' % (tokens[token.index], tag[2:])
                    fix = True

            if fix:
                xpath = orig_tree.getpath(orig_elem)
                elem = xpatheval(xpath)[0]
                if is_tail:
                    elem.tail = smart_join(tokens)
                else:
                    elem.text = smart_join(tokens)

        return tree
コード例 #3
0
ファイル: model.py プロジェクト: pombredanne/webstruct
    def build_entity(self, html_tokens, tag):
        """
        Join tokens to an entity. Return an entity, as text.
        By default this function uses :func:`webstruct.utils.smart_join`.

        Override it to customize :meth:`extract`, :meth:`extract_from_url`
        and :meth:`extract_groups` results. If this function returns empty
        string or None, entity is dropped.
        """
        return smart_join(t.token for t in html_tokens)
コード例 #4
0
ファイル: model.py プロジェクト: etongle/webstruct
    def build_entity(self, html_tokens, tag):
        """
        Join tokens to an entity. Return an entity, as text.
        By default this function uses :func:`webstruct.utils.smart_join`.

        Override it to customize :meth:`extract`, :meth:`extract_from_url`
        and :meth:`extract_groups` results. If this function returns empty
        string or None, entity is dropped.
        """
        return smart_join(t.token for t in html_tokens)
コード例 #5
0
def _join_tokens(html_tokens):
    return smart_join(t.token for t in html_tokens)
コード例 #6
0
ファイル: model.py プロジェクト: scrapinghub/webstruct
def _join_tokens(html_tokens):
    return smart_join(t.token for t in html_tokens)