def __init__(self, xml, source=None):
     """Wrap `xml` as an lxml element owned by this instance.

     `xml` may be a binary string, which has its HTML entities replaced
     and is then parsed, or an existing lxml element, which is deep-copied
     so the caller's tree is never shared with this object. Any other type
     raises ValueError.

     `source` records the provenance of this xml. It is _not_ serialized
     and hence does not follow the xml through the index.
     """
     if isinstance(xml, six.binary_type):
         # Entities are replaced before the bytes are handed to the parser.
         tree = etree.fromstring(replace_html_entities(xml))
     elif isinstance(xml, etree._Element):
         # Copy for safety: later edits must not leak back to the caller.
         tree = deepcopy(xml)
     else:
         raise ValueError("xml should be either binary or an lxml node")

     self.xml = tree
     self.source = source
def test_replace_html_entities():
    """Check which entities `replace_html_entities` rewrites in a byte string.

    The expected output keeps `&apos;` and `&gt;` as written, while
    `&rdquo;` and `&cent;` are expected to come back as the UTF-8 encoded
    characters they name.
    """
    raw_input = b"text <with field='&apos;'> But &rdquo; and &gt; + 2&cent;s"
    # Written as text for readability, compared as UTF-8 bytes.
    wanted = u"text <with field='&apos;'> But ” and &gt; + 2¢s".encode('utf-8')

    actual = preprocessors.replace_html_entities(raw_input)

    assert actual == wanted