def parse_html(self, file):
    """Parse the HTML document stored at *file* and return its root element.

    The file is read as raw bytes, decoded as UTF-8 (undecodable bytes are
    dropped), passed through ``convertentities`` and then parsed with
    ``etree.HTMLParser``.

    Args:
        file: Path of the HTML file to read.

    Returns:
        The root element of the parsed tree (``tree.getroot()``).
    """
    # Read in binary mode inside a context manager: the handle is closed
    # deterministically, and the explicit decode below works on both
    # Python 2 (str) and Python 3 (bytes).
    with open(file, 'rb') as handle:
        html = handle.read()
    html = html.decode('utf-8', 'ignore')
    # NOTE(review): convertentities is defined outside this block; presumably
    # it replaces HTML entities with their characters -- confirm.
    html = convertentities(html)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    return tree.getroot()
def load_source(self):
    """Read ``self.ft_source``, clean it up and cache it on the instance.

    Steps, in order:
      1. read the file as raw bytes;
      2. remove every ``<!-- body`` and ``endbody -->`` marker;
      3. decode as UTF-8 (undecodable bytes are dropped) and pass the text
         through ``convertentities``;
      4. remove every ``<?CDATA ... ?>`` span.

    The result is stored in ``self.source_content`` and
    ``self.source_loaded`` is set to ``True``. Returns ``None``.
    """
    # Binary mode keeps the explicit decode below valid on Python 3, where
    # text-mode reads return ``str`` (which has no ``decode`` method).
    with open(self.ft_source, 'rb') as f:
        raw_xml = f.read()
    # Bytes pattern: the markers are stripped *before* decoding, exactly as
    # the original ordering did.
    raw_xml = re.sub(b'(<!-- body|endbody -->)', b'', raw_xml)
    # NOTE(review): convertentities is defined outside this block and is
    # assumed to return text -- confirm.
    raw_xml = convertentities(raw_xml.decode('utf-8', 'ignore'))
    # Raw string avoids the invalid-escape warning for ``\?``; the pattern
    # value itself is unchanged.
    raw_xml = re.sub(r'<\?CDATA.+?\?>', '', raw_xml)
    self.source_content = raw_xml
    self.source_loaded = True
def load_source(self):
    """Read ``self.ft_source``, clean it up and cache it on the instance.

    Steps, in order:
      1. read the file as raw bytes;
      2. remove every ``<!-- body`` and ``endbody -->`` marker;
      3. decode as UTF-8 (undecodable bytes are dropped) and pass the text
         through ``convertentities``;
      4. remove every ``<?CDATA ... ?>`` span.

    The result is stored in ``self.source_content`` and
    ``self.source_loaded`` is set to ``True``. Returns ``None``.
    """
    # Binary mode keeps the explicit decode below valid on Python 3, where
    # text-mode reads return ``str`` (which has no ``decode`` method).
    with open(self.ft_source, 'rb') as f:
        raw_xml = f.read()
    # Bytes pattern: the markers are stripped *before* decoding, exactly as
    # the original ordering did.
    raw_xml = re.sub(b'(<!-- body|endbody -->)', b'', raw_xml)
    # NOTE(review): convertentities is defined outside this block and is
    # assumed to return text -- confirm.
    raw_xml = convertentities(raw_xml.decode('utf-8', 'ignore'))
    # Raw string avoids the invalid-escape warning for ``\?``; the pattern
    # value itself is unchanged.
    raw_xml = re.sub(r'<\?CDATA.+?\?>', '', raw_xml)
    self.source_content = raw_xml
    self.source_loaded = True