def decode_raw_line(line):
    """Decode one raw line of bytes into text.

    Strict UTF-8 is tried first.  If that fails and ``charadeLoaded`` is
    true, a ``UniversalDetector`` is fed the line and its guessed encoding
    is tried strictly.  If there is no detector, no guess, or the guess
    cannot be used, the line is decoded as UTF-8 with the offending bytes
    replaced.

    line -- the raw bytes to decode.

    Returns the decoded text.
    """
    # First, try to decode using utf-8.
    try:
        return line.decode('utf8', 'strict')
    except UnicodeError:
        pass

    # If this fails and charade is loaded, try to guess the correct encoding.
    guessed = None
    if charadeLoaded:
        detector = UniversalDetector()
        detector.feed(line)
        detector.close()
        guessed = detector.result['encoding']

    if guessed:
        try:
            return line.decode(guessed, 'strict')
        except (UnicodeError, LookupError):
            # UnicodeError: the guess was wrong for these bytes.
            # LookupError: bytes.decode() has no codec registered under the
            # guessed name.  Either way, fall through to the lossy decode.
            pass

    # Last resort: utf-8, replacing any offending characters.
    return line.decode('utf8', 'replace')
def runTest(self):
    """Check that the encoding detected for ``self.file_name`` matches
    ``self.encoding``.

    The file is fed to a ``UniversalDetector`` line by line, stopping as
    soon as the detector reports it is done.
    """
    u = UniversalDetector()
    # Context manager so the handle is closed even when we break out early
    # or feed() raises.
    with open(self.file_name, 'rb') as test_file:
        for line in test_file:
            u.feed(line)
            if u.done:
                break
    u.close()
    detected = u.result['encoding']
    # The detected encoding may be None; compare it as-is in that case so
    # the test fails with the message below rather than erroring out with
    # an AttributeError on .lower().
    normalized = detected.lower() if detected else detected
    self.assertEqual(normalized, self.encoding,
                     "Expected %s, but got %r in %s" %
                     (self.encoding, detected, self.file_name))
def description_of(path):
    """Return a string describing the probable encoding of a file.

    path -- path of the file to inspect; it is opened in binary mode.

    Returns ``'<path>: <encoding> with confidence <confidence>'`` when the
    detector produced an encoding, otherwise ``'<path>: no result'``.
    """
    u = UniversalDetector()
    # Context manager so the file handle is always released.
    with open(path, 'rb') as source:
        for line in source:
            u.feed(line)
            # No point reading the rest of the file once the detector has
            # made up its mind.
            if u.done:
                break
    u.close()
    result = u.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (path,
                                              result['encoding'],
                                              result['confidence'])
    else:
        return '%s: no result' % path
def _read(self):
    """Called by _select() when we can read data.

    Receives up to 1024 bytes from ``self.conn``, splits the buffered data
    on newlines, decodes each complete line (on Python 3), parses it with
    ``drivers.parseMsg`` and feeds the resulting message to ``self.irc``.
    Socket and SSL errors are passed to ``self._handleSocketError``, except
    read timeouts, which are ignored.
    """
    try:
        self.inbuffer += self.conn.recv(1024)
        self.eagains = 0  # If we successfully recv'ed, we can reset this.
        lines = self.inbuffer.split(b'\n')
        # The last element is the trailing, not yet terminated line; keep
        # it buffered until the rest of it arrives.
        self.inbuffer = lines.pop()
        for line in lines:
            if sys.version_info[0] >= 3:
                raw = line
                # First, try to decode using utf-8.
                try:
                    line = raw.decode('utf8', 'strict')
                except UnicodeError:
                    # If this fails and charade is loaded, try to guess
                    # the correct encoding.
                    guessed = None
                    if charadeLoaded:
                        u = UniversalDetector()
                        u.feed(raw)
                        u.close()
                        guessed = u.result['encoding']
                    if guessed:
                        # Try to use the guessed encoding.
                        try:
                            line = raw.decode(guessed, 'strict')
                        except (UnicodeError, LookupError):
                            # UnicodeError: wrong guess.  LookupError: no
                            # codec is registered under the guessed name;
                            # letting it escape would bypass every handler
                            # below.  Give up and replace the offending
                            # characters.
                            line = raw.decode('utf8', 'replace')
                    else:
                        # No usable guess (or charade is not loaded): fall
                        # back to utf-8 and replace offending characters.
                        line = raw.decode('utf8', 'replace')

            msg = drivers.parseMsg(line)
            if msg is not None and self.irc is not None:
                self.irc.feedMsg(msg)
    except socket.timeout:
        pass
    except SSLError as e:
        if e.args[0] == 'The read operation timed out':
            pass
        else:
            self._handleSocketError(e)
            return
    except socket.error as e:
        self._handleSocketError(e)
        return
    if self.irc and not self.irc.zombie:
        self._sendIfMsgs()
def detectEncoding(self, parseMeta=True, chardet=True):
    """Determine the encoding of the raw stream.

    The sources are tried in order: a byte order mark, then (if
    ``parseMeta``) meta elements, then (if ``chardet``) statistical
    detection with charade/chardet when one of them can be imported, and
    finally ``self.defaultEncoding``.

    parseMeta -- whether to look for encoding information in meta elements.
    chardet -- whether to try charade/chardet detection.

    Returns an ``(encoding, confidence)`` tuple where confidence is
    ``"certain"`` for an encoding taken from a BOM and ``"tentative"``
    otherwise.
    """
    # First look for a BOM
    # This will also read past the BOM if present
    encoding = self.detectBOM()
    confidence = "certain"
    # If there is no BOM need to look for meta elements with encoding
    # information
    if encoding is None and parseMeta:
        encoding = self.detectEncodingMeta()
        confidence = "tentative"
    # Guess with chardet, if available
    if encoding is None and chardet:
        confidence = "tentative"
        try:
            try:
                from charade.universaldetector import UniversalDetector
            except ImportError:
                from chardet.universaldetector import UniversalDetector
            detector = UniversalDetector()
            # Feed the detector chunk by chunk until it is confident or
            # the stream is exhausted.
            while not detector.done:
                chunk = self.rawStream.read(self.numBytesChardet)
                assert isinstance(chunk, bytes)
                if not chunk:
                    break
                detector.feed(chunk)
            detector.close()
            encoding = detector.result['encoding']
            # Rewind so later reads start from the beginning again.
            self.rawStream.seek(0)
        except ImportError:
            # Neither detector library is installed; fall through to the
            # default encoding.
            pass
    # If all else fails use the default encoding
    if encoding is None:
        confidence = "tentative"
        encoding = self.defaultEncoding

    # Substitute for equivalent encodings:
    encodingSub = {"iso-8859-1": "windows-1252"}

    if encoding.lower() in encodingSub:
        encoding = encodingSub[encoding.lower()]

    return encoding, confidence
def runTest(self):
    """Run encoding detection on ``self.file_name`` and assert that the
    result equals ``self.encoding``.

    Lines are fed to a ``UniversalDetector`` until it reports it is done
    or the file ends.
    """
    u = UniversalDetector()
    # Open via a context manager so the file is closed on every exit path,
    # including the early break.
    with open(self.file_name, 'rb') as handle:
        for line in handle:
            u.feed(line)
            if u.done:
                break
    u.close()
    found = u.result['encoding']
    # The encoding may be None; only lowercase a real name so that case
    # shows up as an assertion failure carrying the message below instead
    # of an AttributeError.
    if found:
        actual = found.lower()
    else:
        actual = found
    self.assertEqual(
        actual,
        self.encoding,
        "Expected %s, but got %r in %s" %
        (self.encoding, found, self.file_name))