def _load(self):
    """
    Normalise ``self.source`` and parse it into ``self.tree``.

    The text is cleaned first so that later string searches are reliable:
    directional quotation marks become plain quotes, the full-width colon
    becomes ``:``, and line breaks, tabs and zero width joiners are
    removed. The cleaned text is written back to ``self.source``, parsed
    with an ``HTMLParser`` and passed through the lxml ``Cleaner``; the
    result is stored on ``self.tree``.
    """
    # (needle, replacement) pairs, applied in order. Plain u'' literals are
    # used rather than ur'' so the module also compiles on Python 3, where
    # the ur prefix is a SyntaxError; the resulting strings are identical.
    substitutions = (
        # Convert directional quotation marks to regular quotes
        (u'\u201c', u'"'),
        (u'\u201d', u'"'),
        (u'\u2019', u"'"),
        (u'\u2018', u"'"),
        # Convert full-width colons
        (u'\uff1a', u':'),
        # Remove line breaks and tabs
        (u'\n', u''),
        (u'\t', u''),
        # There are "zero width joiners" (\u200d) in random places in the
        # text; remove them since they make string search unreliable.
        (u'\u200d', u''),
    )
    # Non-breaking spaces (\u00a0) also showed up previously, but this may
    # have been fixed by changing the parser below, so they are left alone.

    text = self.source
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    self.source = text

    # Use the lxml cleaner
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')

    # Finally, load the cleaned string to an ElementTree
    parsed = lxml.html.fromstring(to_string(self.source), parser=parser)
    self.tree = cleaner.clean_html(parsed)
def _load(self):
    """
    Normalise ``self.source`` and parse it into ``self.tree``.

    The text is cleaned first so that later string searches are reliable:
    directional quotation marks become plain quotes, the full-width colon
    becomes ``:``, and line breaks, tabs and zero width joiners are
    removed. The cleaned text is written back to ``self.source``, parsed
    with an ``HTMLParser`` and passed through the lxml ``Cleaner``; the
    result is stored on ``self.tree``.
    """
    # (needle, replacement) pairs, applied in order. Plain u'' literals are
    # used rather than ur'' so the module also compiles on Python 3, where
    # the ur prefix is a SyntaxError; the resulting strings are identical.
    substitutions = (
        # Convert directional quotation marks to regular quotes
        (u'\u201c', u'"'),
        (u'\u201d', u'"'),
        (u'\u2019', u"'"),
        (u'\u2018', u"'"),
        # Convert full-width colons
        (u'\uff1a', u':'),
        # Remove line breaks and tabs
        (u'\n', u''),
        (u'\t', u''),
        # There are "zero width joiners" (\u200d) in random places in the
        # text; remove them since they make string search unreliable.
        (u'\u200d', u''),
    )
    # Non-breaking spaces (\u00a0) also showed up previously, but this may
    # have been fixed by changing the parser below, so they are left alone.

    text = self.source
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    self.source = text

    # Use the lxml cleaner
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')

    # Finally, load the cleaned string to an ElementTree
    parsed = lxml.html.fromstring(to_string(self.source), parser=parser)
    self.tree = cleaner.clean_html(parsed)
def __repr__(self):
    """
    Return a debugging label built from ``self.title`` and
    ``self.reading``, passed through ``to_string``.
    """
    label = u'<BillReading {} {}>'.format(self.title, self.reading)
    return to_string(label)