Ejemplo n.º 1
0
    def test_etree_sax_ns_attributes(self):
        """Check that ``startElement`` rejects a prefixed attribute name with ``ValueError``."""
        content_handler = sax.ElementTreeContentHandler()
        content_handler.startDocument()

        prefixed_attrs = {"blaA:attr_a1": "a1"}
        with self.assertRaises(ValueError):
            content_handler.startElement("a", prefixed_attrs)
Ejemplo n.º 2
0
def TestOneInput(data):
    """Parse ``data`` as XML and replay the resulting tree as SAX events.

    The function name follows the fuzz-target convention (presumably an
    atheris/libFuzzer harness calls it -- confirm against the harness setup).

    Args:
        data: Raw bytes to feed to the XML parser.

    Any ``et.LxmlError`` raised while parsing or saxifying is swallowed; every
    other exception type propagates to the caller.
    """
    try:
        parsed = et.parse(io.BytesIO(data))

        handler = sax.ElementTreeContentHandler()
        sax.ElementTreeProducer(parsed, handler).saxify()
    except et.LxmlError:
        # Rejecting malformed input is the expected outcome for most inputs,
        # so library-level errors are deliberately ignored here.
        pass
Ejemplo n.º 3
0
    def test_etree_sax_no_ns_attributes(self):
        """Check that plain attributes passed to ``startElement`` land on the built elements."""
        content_handler = sax.ElementTreeContentHandler()
        content_handler.startDocument()
        # Open <a> then <b>, each with one un-namespaced attribute.
        for name, attrs in (("a", {"attr_a1": "a1"}), ("b", {"attr_b1": "b1"})):
            content_handler.startElement(name, attrs)
        # Close in reverse order of opening.
        for name in ("b", "a"):
            content_handler.endElement(name)
        content_handler.endDocument()

        root = content_handler.etree.getroot()
        self.assertEqual("a", root.tag)
        child = root[0]
        self.assertEqual("b", child.tag)
        self.assertEqual("a1", root.attrib["attr_a1"])
        self.assertEqual("b1", child.attrib["attr_b1"])
Ejemplo n.º 4
0
    def startElement(self, tag, attrib):
        """
        Handle the start of an XML element.

        When ``tag`` equals ``constants.EVENT_TAG_EVENT`` a fresh
        ``sax.ElementTreeContentHandler`` is created and started. While an
        event is in progress, the element is forwarded to that handler with
        its attributes re-keyed as ``(None, name)`` tuples.

        ``attrib`` must provide ``getLength()``, ``keys()`` and item access
        (presumably an ``xml.sax`` attributes object -- confirm with caller).

        NOTE(review): newer lxml releases appear to wrap plain attribute names
        into ``(None, name)`` themselves inside ``startElement`` -- confirm
        that this re-keying is still right for the lxml version in use.
        """
        if tag == constants.EVENT_TAG_EVENT:
            self.event_handler = sax.ElementTreeContentHandler()
            self.event_handler.startDocument()
            self.event_started = True

        if not self.event_started:
            return

        # ugly, but necessary (incompatibilities between lxml and sax)
        namespaced = {}
        if attrib.getLength() > 0:
            namespaced = {(None, name): attrib[name] for name in attrib.keys()}
        self.event_handler.startElement(tag, namespaced)
Ejemplo n.º 5
0
    def test_etree_sax_no_ns(self):
        """Build ``<a><b/><c/></a>`` through un-namespaced SAX calls and check the tags."""
        content_handler = sax.ElementTreeContentHandler()
        content_handler.startDocument()
        content_handler.startElement("a", {})
        content_handler.startElement("b", {})
        content_handler.endElement("b")
        # The attributes argument is omitted on purpose for <c>.
        content_handler.startElement("c")
        content_handler.endElement("c")
        content_handler.endElement("a")
        content_handler.endDocument()

        root = content_handler.etree.getroot()
        self.assertEqual("a", root.tag)
        for position, expected_tag in enumerate(("b", "c")):
            self.assertEqual(expected_tag, root[position].tag)
Ejemplo n.º 6
0
    def test_etree_sax_redefine_ns(self):
        """Build a tree while prefix ``ns`` is remapped for a nested element; check the tags."""
        builder = sax.ElementTreeContentHandler()
        # (namespace URI, local name) pairs for the three elements.
        outer, inner, sibling = ("blaA", "a"), ("blaB", "b"), ("blaA", "c")

        builder.startDocument()
        builder.startPrefixMapping("ns", "blaA")
        builder.startElementNS(outer, "ns:a", {})
        # Rebind the same prefix only for the duration of <b>.
        builder.startPrefixMapping("ns", "blaB")
        builder.startElementNS(inner, "ns:b", {})
        builder.endElementNS(inner, "ns:b")
        builder.endPrefixMapping("ns")
        builder.startElementNS(sibling, "ns:c", {})
        builder.endElementNS(sibling, "ns:c")
        builder.endElementNS(outer, "ns:a")
        builder.endPrefixMapping("ns")
        builder.endDocument()

        root = builder.etree.getroot()
        self.assertEqual("{blaA}a", root.tag)
        self.assertEqual("{blaB}b", root[0].tag)
        self.assertEqual("{blaA}c", root[1].tag)
Ejemplo n.º 7
0
    def test_etree_sax_handler_default_ns_None(self):
        """Check that ``(None, name)`` elements get the URI mapped to the ``None`` prefix."""
        builder = sax.ElementTreeContentHandler()
        # Element names carry no namespace of their own.
        elem_a, elem_b, elem_c = (None, "a"), (None, "b"), (None, "c")

        builder.startDocument()
        builder.startPrefixMapping(None, "blaA")
        builder.startElementNS(elem_a, "a", {})
        # Remap the default namespace only while <b> is open.
        builder.startPrefixMapping(None, "blaB")
        builder.startElementNS(elem_b, "b", {})
        builder.endElementNS(elem_b, "b")
        builder.endPrefixMapping(None)
        builder.startElementNS(elem_c, "c", {})
        builder.endElementNS(elem_c, "c")
        builder.endElementNS(elem_a, "a")
        builder.endPrefixMapping(None)
        builder.endDocument()

        root = builder.etree.getroot()
        self.assertEqual("{blaA}a", root.tag)
        self.assertEqual("{blaB}b", root[0].tag)
        self.assertEqual("{blaA}c", root[1].tag)
Ejemplo n.º 8
0
    def test_etree_sax_redefine_ns(self):
        """Replay SAX events that remap prefix ``ns`` for a nested element; check the tags."""
        handler = sax.ElementTreeContentHandler()
        handler.startDocument()

        # Ordered (method, arguments) pairs; replayed verbatim below.
        events = (
            (handler.startPrefixMapping, ('ns', 'blaA')),
            (handler.startElementNS, (('blaA', 'a'), 'ns:a', {})),
            (handler.startPrefixMapping, ('ns', 'blaB')),
            (handler.startElementNS, (('blaB', 'b'), 'ns:b', {})),
            (handler.endElementNS, (('blaB', 'b'), 'ns:b')),
            (handler.endPrefixMapping, ('ns',)),
            (handler.startElementNS, (('blaA', 'c'), 'ns:c', {})),
            (handler.endElementNS, (('blaA', 'c'), 'ns:c')),
            (handler.endElementNS, (('blaA', 'a'), 'ns:a')),
            (handler.endPrefixMapping, ('ns',)),
        )
        for emit, arguments in events:
            emit(*arguments)
        handler.endDocument()

        root = handler.etree.getroot()
        self.assertEqual('{blaA}a', root.tag)
        self.assertEqual('{blaB}b', root[0].tag)
        self.assertEqual('{blaA}c', root[1].tag)
Ejemplo n.º 9
0
    def test_etree_sax_handler_default_ns_None(self):
        """Map the ``None`` prefix to two URIs in turn and check the resulting element tags."""
        tree_builder = sax.ElementTreeContentHandler()
        tree_builder.startDocument()

        # Outer default namespace, in effect for <a> and <c>.
        tree_builder.startPrefixMapping(None, 'blaA')
        tree_builder.startElementNS((None, 'a'), 'a', {})

        # Inner default namespace, in effect for <b> only.
        tree_builder.startPrefixMapping(None, 'blaB')
        tree_builder.startElementNS((None, 'b'), 'b', {})
        tree_builder.endElementNS((None, 'b'), 'b')
        tree_builder.endPrefixMapping(None)

        tree_builder.startElementNS((None, 'c'), 'c', {})
        tree_builder.endElementNS((None, 'c'), 'c')

        tree_builder.endElementNS((None, 'a'), 'a')
        tree_builder.endPrefixMapping(None)
        tree_builder.endDocument()

        root = tree_builder.etree.getroot()
        self.assertEqual('{blaA}a', root.tag)
        self.assertEqual('{blaB}b', root[0].tag)
        self.assertEqual('{blaA}c', root[1].tag)
Ejemplo n.º 10
0
 def _saxify_unsaxify(self, saxifiable):
     """Emit ``saxifiable`` as SAX events into a fresh handler and return the handler's tree."""
     tree_builder = sax.ElementTreeContentHandler()
     producer = sax.ElementTreeProducer(saxifiable, tree_builder)
     producer.saxify()
     return tree_builder.etree
Ejemplo n.º 11
0
 def test_etree_sax_error2(self):
     """Check that ending "a" while "b" is still open raises ``sax.SaxError``."""
     content_handler = sax.ElementTreeContentHandler()
     content_handler.startDocument()
     for name in ("a", "b"):
         content_handler.startElement(name)
     with self.assertRaises(sax.SaxError):
         content_handler.endElement("a")
Ejemplo n.º 12
0
    def make_pars(self,
                  pars,
                  parent_el,
                  left_strip_text=None,
                  last_page_label=None,
                  include_block_label=False):
        """Build one element per paragraph in ``pars`` and append it to ``parent_el``.

        Makes each <p class='label'> or <label> element. For every paragraph a
        fresh ``sax.ElementTreeContentHandler`` is created, the handler calls
        needed to build the paragraph are queued in ``tag_stack``, and the
        queue is then replayed to build the element tree.

        Args:
            pars: Iterable of paragraph dicts; the keys read here are 'id',
                'class', 'block_ids' and (optionally) 'redacted'.
            parent_el: Element that receives each non-empty paragraph element
                via ``append``.
            left_strip_text: Optional text removed character by character from
                the start of text tokens for as long as it matches; it is set
                to None at the first mismatch. The remaining value carries
                over between tokens, blocks and paragraphs.
            last_page_label: Page label seen most recently before this call;
                a page marker is only written when this is not None and the
                current block's label differs from it.
            include_block_label: When true and ``self.format == 'xml'``, the
                class of the paragraph's first block is written as a 'label'
                attribute if that class exists and is not 'p'.

        Returns:
            The last page label seen (unchanged when ``self.original_xml`` is
            truthy, since labels are not inspected in that mode).
        """
        for par in pars:
            # skip paragraphs flagged as redacted when producing redacted output
            if self.redacted and par.get('redacted'):
                continue
            handler = sax.ElementTreeContentHandler()
            # queue of (bound handler method, args tuple) pairs, replayed in order below
            tag_stack = []
            # names of footnotemark/bracketnum tags opened in this paragraph and not yet closed
            open_tags = set()

            # opening tag
            if self.format == 'xml':
                par_attrs = {'id': par['id']}

                # special handling for duplicative files -- alto block label gets applied as casemets paragraph label attr
                if include_block_label and par['block_ids']:
                    first_block = self.blocks_by_id[par['block_ids'][0]]
                    if 'class' in first_block and first_block['class'] != 'p':
                        par_attrs['label'] = first_block['class']

                # in xml format the paragraph class itself is used as the tag name
                tag_stack.append((handler.startElement, (
                    par['class'],
                    par_attrs,
                )))
            else:
                # non-xml format: 'p' and 'blockquote' map to same-named tags with only an id;
                # any other class is looked up in par_class_to_tag (defaulting to 'p') and
                # kept as the 'class' attribute
                if par['class'] == 'p':
                    tag = (
                        'p',
                        {
                            'id': par['id']
                        },
                    )
                elif par['class'] == 'blockquote':
                    tag = (
                        'blockquote',
                        {
                            'id': par['id']
                        },
                    )
                else:
                    tag = (
                        par_class_to_tag.get(par['class'], 'p'),
                        {
                            'class': par['class'],
                            'id': par['id']
                        },
                    )
                tag_stack.append((handler.startElement, tag))

            # write each block in the paragraph
            for block_id in par['block_ids']:
                block = self.blocks_by_id[block_id]

                # write <page-number> or <a class='page-label'> between blocks
                if not self.original_xml:
                    page_label = self.labels_by_block_id[block_id]
                    if page_label != last_page_label:
                        # no marker is written for the very first label seen (last_page_label is None)
                        if last_page_label is not None:
                            if self.format == 'xml':
                                tag_stack.append((handler.startElement, (
                                    'page-number',
                                    {
                                        'label': page_label,
                                        'citation-index': '1'
                                    },
                                )))
                                tag_stack.append(
                                    (handler.characters, ('*' + page_label, )))
                                tag_stack.append(
                                    (handler.endElement, ('page-number', )))
                            else:
                                tag_stack.append((handler.startElement, (
                                    'a',
                                    {
                                        'id': 'p' + page_label,
                                        'href': '#p' + page_label,
                                        'data-label': page_label,
                                        'data-citation-index': '1',
                                        'class': 'page-label'
                                    },
                                )))
                                tag_stack.append(
                                    (handler.characters, ('*' + page_label, )))
                                tag_stack.append((handler.endElement, ('a', )))
                        last_page_label = page_label

                # write <img>
                # (a redacted image block in redacted mode falls through to the token branch instead)
                if block.get('format') == 'image' and not (
                        self.redacted and block.get('redacted')):
                    if self.original_xml:
                        # placeholder text instead of an <img> element
                        tag_stack.append(
                            (handler.characters, ('[[Image here]]', )))
                    else:
                        # block['rect'] indexes 2 and 3 are used as width and height
                        tag_stack.append((handler.startElement, (
                            'img',
                            {
                                'src': 'data:' + block['data'],
                                'class': block['class'],
                                'width': str(round(block['rect'][2])),
                                'height': str(round(block['rect'][3]))
                            },
                        )))
                        tag_stack.append((handler.endElement, ('img', )))

                # write tokens
                else:
                    # font tags currently open within this block (a local list, distinct from
                    # the self.open_font_tags method called below)
                    open_font_tags = []
                    for token in filter_tokens(block, self.html_token_filter,
                                               self.redacted):

                        # text token
                        if type(token) == str:
                            if left_strip_text:
                                # drop matching leading characters from both strings; stop
                                # stripping for good at the first mismatch
                                while left_strip_text and token:
                                    if left_strip_text[0] == token[0]:
                                        left_strip_text = left_strip_text[1:]
                                        token = token[1:]
                                    else:
                                        left_strip_text = None
                            tag_stack.append((handler.characters, (token, )))
                            continue

                        # non-text tokens are lists: [name] or [name, attrs]; pad with an
                        # empty attrs dict so unpacking always yields two values
                        token_name, token_attrs = (token + [{}])[:2]

                        # handle opening and closing font tags
                        if token_name == 'font':
                            if self.original_xml:
                                continue
                            font_obj = self.fonts_by_id[token_attrs['id']]
                            # keep the tags whose font_string occurs in this font's style
                            open_font_tags = [
                                tag for tag, font_string in self.font_style_map
                                if font_string in font_obj.style
                            ]
                            self.open_font_tags(handler, tag_stack,
                                                open_font_tags)
                        elif token_name == '/font':
                            if self.original_xml:
                                continue
                            self.close_font_tags(handler, tag_stack,
                                                 open_font_tags)
                            open_font_tags = []

                        # handle footnotemark and bracketnum
                        elif token_name == 'footnotemark' or token_name == 'bracketnum':
                            if self.original_xml:
                                tag_stack.append(
                                    (handler.startElement, (token_name, )))
                            elif self.format == 'xml':
                                # NOTE(review): wrap_font_tags presumably closes and reopens the
                                # open font tags around the queued call -- confirm in its definition
                                with self.wrap_font_tags(
                                        handler, tag_stack, open_font_tags):
                                    tag_stack.append(
                                        (handler.startElement, (token_name, )))
                            else:
                                # non-xml format: written as an <a> whose class is the token name,
                                # with href/id derived from the 'ref' attribute when present
                                attrs = {'class': token_name}
                                ref = token_attrs.get('ref')
                                if ref:
                                    attrs['href'] = '#' + ref
                                    attrs['id'] = 'ref_' + ref
                                with self.wrap_font_tags(
                                        handler, tag_stack, open_font_tags):
                                    tag_stack.append((handler.startElement, (
                                        'a',
                                        attrs,
                                    )))
                            open_tags.add(token_name)
                        elif token_name == '/footnotemark' or token_name == '/bracketnum':
                            # we could hit a close tag without an open tag, if the open tag was in a previous redacted block
                            tag_name = token_name[1:]
                            if tag_name in open_tags:
                                with self.wrap_font_tags(
                                        handler, tag_stack, open_font_tags):
                                    # close with the token's own name in xml format, otherwise 'a'
                                    tag_stack.append(
                                        (handler.endElement,
                                         (token_name[1:]
                                          if self.format == 'xml' else 'a', )))
                                open_tags.remove(tag_name)

            # run all of our commands, like "handler.startElement(*args)", to actually build the xml tree
            for method, args in tag_stack:
                method(*args)

            # remove empty tags, which would typically be created by redacted spans
            # (reads the handler's private _root attribute to get the built paragraph element)
            par_el = handler._root
            remove_empty_tags(par_el, ignore_tags={'img'})

            # append element if not empty (contents not redacted)
            if par_el.text or len(par_el):
                parent_el.append(par_el)

        return last_page_label