def convert(self, filename, encoding=None, mimetype=None):
    """Read *filename* and return its plain text as a UTF-8 byte stream.

    The raw file content is decoded to unicode, passed through
    ``convert_entities`` and ``html2text``, and re-encoded as UTF-8.

    Parameters:
        filename -- path of the file to read.
        encoding -- character encoding of the file.  When empty or None,
                    the encoding is taken from the first match of
                    ``charset_reg`` in the raw document; if nothing
                    matches, UTF-8 is assumed.
        mimetype -- accepted for interface compatibility; not used here.

    Returns:
        A ``(stream, 'utf-8')`` tuple where ``stream`` is a
        ``StringIO.StringIO`` holding the UTF-8 encoded text.
    """
    # XXX: dont read entire file into memory
    fp = open(filename, 'r')
    try:
        doc = fp.read()
    finally:
        # Release the descriptor deterministically instead of relying on
        # garbage collection of the anonymous file object.
        fp.close()

    # convert to unicode
    if not encoding:
        mo = charset_reg.search(doc)
        if mo:
            encoding = mo.group(1)
        else:
            # No charset declaration found: fall back to UTF-8 rather than
            # failing with AttributeError on ``None.group``.  Undecodable
            # bytes are substituted because of the 'replace' error mode.
            encoding = 'utf-8'
    doc = unicode(doc, encoding, 'replace')
    doc = convert_entities(doc)
    result = html2text(doc)

    # convert back to utf-8
    return StringIO.StringIO(result.encode('utf-8')), 'utf-8'
def convert(self, filename, encoding, mimetype):
    """Read *filename*, strip its markup and return the text as UTF-8.

    Parameters:
        filename -- path of the file to read.
        encoding -- character encoding suggested by the caller.  It is
                    overridden by the encoding captured by
                    ``encoding_reg`` when that pattern matches the raw
                    document; if neither yields a value,
                    ``default_encoding`` is used.
        mimetype -- accepted for interface compatibility; not used here.

    Returns:
        A ``(stream, 'utf-8')`` tuple where ``stream`` is a
        ``StringIO.StringIO`` built from the string form of the
        ``StripTagParser`` instance after the document has been fed to it.
    """
    # XXX: dont read entire file into memory
    fp = open(filename, 'r')
    try:
        doc = fp.read()
    finally:
        # Release the descriptor deterministically instead of relying on
        # garbage collection of the anonymous file object.
        fp.close()

    # Use encoding from XML preamble if present
    mo = encoding_reg.search(doc)
    if mo:
        encoding = mo.group(1)
    if not encoding:
        encoding = default_encoding

    if not isinstance(doc, unicode):
        doc = unicode(doc, encoding, 'replace')
    doc = convert_entities(doc)
    # The parser is fed UTF-8 encoded bytes, not unicode.
    doc = doc.encode('utf-8')

    p = StripTagParser()
    p.feed(doc)
    p.close()

    # StringIO.StringIO applies str() to non-string buffers anyway; doing
    # the conversion explicitly makes it obvious what ends up in the stream.
    return StringIO.StringIO(str(p)), 'utf-8'
def extract_text_from_html(text):
    """Return the result of running *text* through the HTML-to-text helpers.

    *text* may be a unicode object or a byte string.  Byte strings are
    decoded as UTF-8 first, with undecodable bytes replaced.  The unicode
    value is then handed to ``convert_entities`` and the outcome of that
    call to ``html2text``, whose return value is passed back unchanged.
    """
    if isinstance(text, unicode):
        decoded = text
    else:
        decoded = unicode(text, 'utf-8', 'replace')

    without_entities = convert_entities(decoded)
    return html2text(without_entities)