Example #1
0
 def get_text(self):
     """Fetch the document at the URL in ``self.fargs[0]`` and return its text.

     The URL is given the ``http://`` scheme unless it already starts with
     ``http://`` or ``https://``. The response body is decoded using the
     charset announced in the Content-Type header and then passed through
     ``html_unescape``.

     Returns:
         The decoded body as returned by ``html_unescape``.

     Raises:
         IOError: if the URL cannot be opened.
         LookupError: if the server announces a charset Python does not know.
         UnicodeDecodeError: if the body is not valid in the chosen charset.
     """
     url = self.fargs[0]
     # Accept https as well: a plain startswith('http://') test would turn
     # 'https://host' into 'http://https://host'.
     if not url.lower().startswith(('http://', 'https://')):
         url = 'http://' + url

     fp = urllib.urlopen(url)
     try:
         print('retrieving text from %s' % url)
         raw = fp.read()
         # .get() tolerates responses that carry no Content-Type header.
         content_type = fp.headers.get('content-type', '') or ''
     finally:
         # Release the connection even if reading fails.
         fp.close()

     # Content-Type looks like 'type/subtype; name=value; ...'. Pick out the
     # charset parameter explicitly so other parameters, quoting and unknown
     # media types do not end up being used as a codec name.
     pieces = content_type.split(';')
     media_type = pieces[0].strip().lower()
     encoding = None
     for piece in pieces[1:]:
         name, _, value = piece.partition('=')
         if name.strip().lower() == 'charset':
             encoding = value.strip().strip('"\'') or None
             break

     if not encoding:
         # No charset announced: keep the historical defaults (ascii for
         # text/plain, utf-8 for text/html) and use utf-8 for anything else.
         encoding = 'ascii' if media_type == 'text/plain' else 'utf-8'

     return html_unescape(raw.decode(encoding))
Example #2
0
 def get_text(self):
     """Fetch the document at the URL in ``self.fargs[0]`` and return its text.

     The URL is given the ``http://`` scheme unless it already starts with
     ``http://`` or ``https://``. The response body is decoded using the
     charset announced in the Content-Type header and then passed through
     ``html_unescape``.

     Returns:
         The decoded body as returned by ``html_unescape``.

     Raises:
         IOError: if the URL cannot be opened.
         LookupError: if the server announces a charset Python does not know.
         UnicodeDecodeError: if the body is not valid in the chosen charset.
     """
     url = self.fargs[0]
     # Accept https as well: a plain startswith('http://') test would turn
     # 'https://host' into 'http://https://host'.
     if not url.lower().startswith(('http://', 'https://')):
         url = 'http://' + url

     fp = urllib.urlopen(url)
     try:
         print('retrieving text from %s' % url)
         raw = fp.read()
         # .get() tolerates responses that carry no Content-Type header.
         content_type = fp.headers.get('content-type', '') or ''
     finally:
         # Release the connection even if reading fails.
         fp.close()

     # Content-Type looks like 'type/subtype; name=value; ...'. Pick out the
     # charset parameter explicitly so other parameters, quoting and unknown
     # media types do not end up being used as a codec name.
     pieces = content_type.split(';')
     media_type = pieces[0].strip().lower()
     encoding = None
     for piece in pieces[1:]:
         name, _, value = piece.partition('=')
         if name.strip().lower() == 'charset':
             encoding = value.strip().strip('"\'') or None
             break

     if not encoding:
         # No charset announced: keep the historical defaults (ascii for
         # text/plain, utf-8 for text/html) and use utf-8 for anything else.
         encoding = 'ascii' if media_type == 'text/plain' else 'utf-8'

     return html_unescape(raw.decode(encoding))
Example #3
0
 def escape_text(self, raw_text, encoding=None):
     """Decode ``raw_text`` if needed and pass it through ``html_unescape``.

     Args:
         raw_text: Text to process. Byte strings are decoded first; text
             that is already decoded is handed on unchanged.
         encoding: Codec used to decode byte-string input. Falls back to
             ``'utf8'`` when omitted or empty.

     Returns:
         The result of ``html_unescape`` on the (decoded) text.
     """
     if not encoding:
         encoding = 'utf8'
     # ``encoding`` used to be computed and then ignored. Decode only byte
     # input: decoding text that is already unicode raises TypeError, so
     # the isinstance guard keeps that case working.
     if isinstance(raw_text, bytes):
         # 'ignore' drops undecodable bytes instead of raising (best effort).
         raw_text = raw_text.decode(encoding, 'ignore')
     return html_unescape(raw_text)
Example #4
0
	def escape_text(self, raw_text, encoding=None):
		"""Decode ``raw_text`` if needed and pass it through ``html_unescape``.

		Args:
			raw_text: Text to process. Byte strings are decoded first; text
				that is already decoded is handed on unchanged.
			encoding: Codec used to decode byte-string input. Falls back to
				``'utf8'`` when omitted or empty.

		Returns:
			The result of ``html_unescape`` on the (decoded) text.
		"""
		if not encoding:
			encoding = 'utf8'
		# ``encoding`` used to be computed and then ignored. Decode only byte
		# input: decoding text that is already unicode raises TypeError, so
		# the isinstance guard keeps that case working.
		if isinstance(raw_text, bytes):
			# 'ignore' drops undecodable bytes instead of raising (best effort).
			raw_text = raw_text.decode(encoding, 'ignore')
		return html_unescape(raw_text)