def __init__(self, xml, source=None):
    """Build the instance from raw XML bytes or from an lxml element.

    Binary input is passed through ``replace_html_entities`` and then
    parsed with ``etree.fromstring``. An lxml element is deep-copied, so
    the tree stored on this instance is independent of the caller's.

    `source` represents the provenance of this xml. It is _not_
    serialized and hence does not follow the xml through the index.

    Raises:
        ValueError: if `xml` is neither binary nor an lxml element.
    """
    # Reject unsupported input up front; no attribute is set in that case.
    if not isinstance(xml, (six.binary_type, etree._Element)):
        raise ValueError("xml should be either binary or an lxml node")

    if isinstance(xml, six.binary_type):
        cleaned = replace_html_entities(xml)
        tree = etree.fromstring(cleaned)
    else:
        # Already an lxml node: copy it rather than sharing the caller's tree.
        tree = deepcopy(xml)

    self.xml = tree
    self.source = source
def test_replace_html_entities():
    """Check ``preprocessors.replace_html_entities`` on UTF-8 encoded bytes.

    The input mixes XML-special characters (``<``, ``>``, ``'``) with
    non-ASCII characters (a right double quote and a cent sign). The result
    must equal the expected text encoded as UTF-8.
    """
    # A bytes literal may only contain ASCII characters on Python 3
    # (non-ASCII is a SyntaxError), so the input is built by encoding a
    # text literal instead of being written as b"...".
    # NOTE(review): as written, the input text and the expected text are
    # identical, so this only checks that the text passes through unchanged.
    # Confirm whether the input was meant to contain entity references.
    xml_str = u"text <with field='''> But ” and > + 2¢s".encode('utf-8')
    expected = u"text <with field='''> But ” and > + 2¢s".encode('utf-8')
    assert preprocessors.replace_html_entities(xml_str) == expected