def get_text(self):
    """Download the document named by ``self.fargs[0]`` and return it as text.

    The first element of ``self.fargs`` is treated as a URL. When it carries
    no ``http://`` or ``https://`` scheme, ``http://`` is prepended. The
    response body is decoded using the charset declared in the Content-Type
    header and the result is passed through ``html_unescape``.

    When the header declares no charset, ``text/plain`` is decoded as ASCII
    and everything else as UTF-8.

    Returns:
        The value returned by ``html_unescape`` for the decoded body.

    Raises:
        LookupError: the server declared a charset Python has no codec for.
        UnicodeDecodeError: the body is not valid in the selected encoding.
    """
    url = self.fargs[0]
    # Leave https:// URLs alone; prefixing them with http:// yields an
    # unusable 'http://https://...' address.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

    fp = urllib.urlopen(url)
    try:
        print('retrieving text from %s' % url)
        raw = fp.read()
        # A response without a Content-Type header must not raise KeyError.
        content_type = fp.headers.get('content-type') or ''
    finally:
        # Always release the connection, even when reading fails.
        fp.close()

    # Content-Type looks like 'type/subtype; name=value; name=value'.
    # Parse the parameters properly so that trailing parameters or an
    # unexpected media type are never mistaken for a codec name.
    pieces = [piece.strip() for piece in content_type.split(';')]
    media_type = pieces[0].lower()
    encoding = None
    for parameter in pieces[1:]:
        name, _, value = parameter.partition('=')
        if name.strip().lower() == 'charset':
            encoding = value.strip().strip('"\'') or None
            break

    if encoding is None:
        # No declared charset: keep the historical defaults.
        encoding = 'ascii' if media_type == 'text/plain' else 'utf-8'

    ustring = raw.decode(encoding)
    return html_unescape(ustring)
def escape_text(self, raw_text, encoding=None):
    """Return ``raw_text`` after passing it through ``html_unescape``.

    Args:
        raw_text: the text to process; it is handed to ``html_unescape``
            unchanged (no decoding is performed here).
        encoding: accepted for backward compatibility with existing
            callers but currently ignored.

    Returns:
        Whatever ``html_unescape`` returns for ``raw_text``.
    """
    # NOTE(review): the method name says "escape" but the body unescapes;
    # presumably html_unescape resolves HTML entities -- confirm with callers.
    return html_unescape(raw_text)