Esempio n. 1
0
 def decode_raw_line(line):
     """Decode a raw byte string to text, trying progressively laxer strategies.

     Strict UTF-8 is tried first. If that fails and ``charadeLoaded`` is true,
     a ``UniversalDetector`` is asked to guess the encoding and a strict decode
     with that guess is attempted. If no guess is available, or the guess
     cannot be used, the line is decoded as UTF-8 with offending characters
     replaced.

     :param line: the undecoded line (an object with a ``decode`` method).
     :returns: the decoded line.
     """
     # first, try to decode using utf-8
     try:
         return line.decode('utf8', 'strict')
     except UnicodeError:
         pass

     # if this fails and charade is loaded, try to guess the correct encoding
     guessed = None
     if charadeLoaded:
         u = UniversalDetector()
         u.feed(line)
         u.close()
         guessed = u.result['encoding']

     if guessed:
         try:
             return line.decode(guessed, 'strict')
         # LookupError covers a guessed name for which Python has no codec;
         # UnicodeError covers a wrong guess. Either way, give up guessing.
         except (UnicodeError, LookupError):
             pass

     # last resort: utf-8 with the offending characters replaced (named
     # explicitly rather than relying on the implicit default codec)
     return line.decode('utf8', 'replace')
Esempio n. 2
0
 def runTest(self):
     """Check that the detector finds ``self.encoding`` for ``self.file_name``.

     The file is fed to a ``UniversalDetector`` line by line until the
     detector reports it is done (or the file ends); the lower-cased detected
     encoding is then compared with ``self.encoding``.
     """
     u = UniversalDetector()
     # The context manager closes the file even when the loop stops early or
     # feeding raises.
     with open(self.file_name, 'rb') as f:
         for line in f:
             u.feed(line)
             if u.done:
                 break
     u.close()
     detected = u.result['encoding']
     # When nothing was detected there is no string to lower-case; compare the
     # empty value as-is so the assertion fails with the message below rather
     # than erroring out with AttributeError.
     normalized = detected.lower() if detected else detected
     self.assertEqual(normalized, self.encoding,
                      "Expected %s, but got %r in %s" %
                      (self.encoding, detected, self.file_name))
Esempio n. 3
0
def description_of(path):
    """Return a string describing the probable encoding of a file.

    :param path: path of the file to inspect; it is opened in binary mode.
    :returns: ``'<path>: <encoding> with confidence <confidence>'`` when the
        detector reports an encoding, otherwise ``'<path>: no result'``.
    """
    u = UniversalDetector()
    # The context manager guarantees the file handle is released.
    with open(path, 'rb') as f:
        for line in f:
            u.feed(line)
            # Stop reading as soon as the detector has made up its mind.
            if u.done:
                break
    u.close()
    result = u.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (path,
                                              result['encoding'],
                                              result['confidence'])
    return '%s: no result' % path
Esempio n. 4
0
    def _read(self):
        """Called by _select() when we can read data.

        Receives up to 1024 bytes, appends them to ``self.inbuffer`` and splits
        the buffer on newlines. Each complete line is decoded (on Python 3
        only), parsed with ``drivers.parseMsg`` and fed to ``self.irc``; the
        trailing unterminated fragment stays in ``self.inbuffer`` for the next
        call. Socket timeouts are ignored; other socket/SSL errors are handed
        to ``self._handleSocketError`` and end the call early.
        """
        try:
            self.inbuffer += self.conn.recv(1024)
            self.eagains = 0  # If we successfully recv'ed, we can reset this.
            lines = self.inbuffer.split(b'\n')
            # The last piece has no terminating newline yet; keep it buffered.
            self.inbuffer = lines.pop()
            for line in lines:
                if sys.version_info[0] >= 3:
                    raw = line
                    # first, try to decode using utf-8
                    try:
                        line = raw.decode('utf8', 'strict')
                    except UnicodeError:
                        # Default outcome: utf-8 with the offending characters
                        # replaced. It is only overridden below when charade
                        # is loaded and its guess decodes cleanly.
                        line = raw.decode('utf8', 'replace')
                        if charadeLoaded:
                            u = UniversalDetector()
                            u.feed(raw)
                            u.close()
                            guessed = u.result['encoding']
                            if guessed:
                                try:
                                    line = raw.decode(guessed, 'strict')
                                # LookupError: the guessed name is not a codec
                                # Python knows; it must not escape this loop.
                                except (UnicodeError, LookupError):
                                    pass

                msg = drivers.parseMsg(line)
                if msg is not None and self.irc is not None:
                    self.irc.feedMsg(msg)
        except socket.timeout:
            pass
        except SSLError as e:
            # Guard against an empty args tuple before inspecting the message.
            if not (e.args and e.args[0] == 'The read operation timed out'):
                self._handleSocketError(e)
                return
        except socket.error as e:
            self._handleSocketError(e)
            return
        if self.irc and not self.irc.zombie:
            self._sendIfMsgs()
Esempio n. 5
0
    def detectEncoding(self, parseMeta=True, chardet=True):
        """Determine the encoding of the stream and how certain the result is.

        Sources are consulted in order until one yields an encoding: the BOM,
        meta elements (when ``parseMeta`` is true), the charade/chardet
        detector (when ``chardet`` is true and one of the libraries can be
        imported), and finally ``self.defaultEncoding``.

        :param parseMeta: look for encoding information in meta elements.
        :param chardet: allow guessing with charade/chardet.
        :returns: an ``(encoding, confidence)`` tuple; confidence is
            ``"certain"`` only when the BOM supplied the encoding, otherwise
            ``"tentative"``.
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
            except ImportError:
                # Neither library is installed: skip guessing altogether.
                pass
            else:
                # Only the imports are guarded, so errors raised while reading
                # or detecting are not mistaken for a missing library.
                detector = UniversalDetector()
                while not detector.done:
                    chunk = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(chunk, bytes)
                    if not chunk:
                        break
                    detector.feed(chunk)
                detector.close()
                encoding = detector.result['encoding']
                # Rewind so later reads start from the beginning again.
                self.rawStream.seek(0)
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence
Esempio n. 6
0
 def decode_raw_line(line):
     """Decode a raw byte string to text, trying progressively laxer strategies.

     Strict UTF-8 is tried first. If that fails and ``charadeLoaded`` is true,
     a ``UniversalDetector`` is asked to guess the encoding and a strict decode
     with that guess is attempted. If no guess is available, or the guess
     cannot be used, the line is decoded as UTF-8 with offending characters
     replaced.

     :param line: the undecoded line (an object with a ``decode`` method).
     :returns: the decoded line.
     """
     # first, try to decode using utf-8
     try:
         return line.decode('utf8', 'strict')
     except UnicodeError:
         pass

     # if this fails and charade is loaded, try to guess the correct encoding
     guessed = None
     if charadeLoaded:
         u = UniversalDetector()
         u.feed(line)
         u.close()
         guessed = u.result['encoding']

     if guessed:
         try:
             return line.decode(guessed, 'strict')
         # LookupError covers a guessed name for which Python has no codec;
         # UnicodeError covers a wrong guess. Either way, give up guessing.
         except (UnicodeError, LookupError):
             pass

     # last resort: utf-8 with the offending characters replaced (named
     # explicitly rather than relying on the implicit default codec)
     return line.decode('utf8', 'replace')
Esempio n. 7
0
 def runTest(self):
     """Check that the detector finds ``self.encoding`` for ``self.file_name``.

     The file is fed to a ``UniversalDetector`` line by line until the
     detector reports it is done (or the file ends); the lower-cased detected
     encoding is then compared with ``self.encoding``.
     """
     u = UniversalDetector()
     # The context manager closes the file even when the loop stops early or
     # feeding raises.
     with open(self.file_name, 'rb') as f:
         for line in f:
             u.feed(line)
             if u.done:
                 break
     u.close()
     detected = u.result['encoding']
     # When nothing was detected there is no string to lower-case; compare the
     # empty value as-is so the assertion fails with the message below rather
     # than erroring out with AttributeError.
     normalized = detected.lower() if detected else detected
     self.assertEqual(
         normalized, self.encoding,
         "Expected %s, but got %r in %s" %
         (self.encoding, detected, self.file_name))
Esempio n. 8
0
def description_of(path):
    """Return a string describing the probable encoding of a file.

    :param path: path of the file to inspect; it is opened in binary mode.
    :returns: ``'<path>: <encoding> with confidence <confidence>'`` when the
        detector reports an encoding, otherwise ``'<path>: no result'``.
    """
    u = UniversalDetector()
    # The context manager guarantees the file handle is released.
    with open(path, 'rb') as f:
        for line in f:
            u.feed(line)
            # Stop reading as soon as the detector has made up its mind.
            if u.done:
                break
    u.close()
    result = u.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (path, result['encoding'],
                                              result['confidence'])
    return '%s: no result' % path
Esempio n. 9
0
    def _read(self):
        """Called by _select() when we can read data.

        Receives up to 1024 bytes, appends them to ``self.inbuffer`` and splits
        the buffer on newlines. Each complete line is decoded (on Python 3
        only), parsed with ``drivers.parseMsg`` and fed to ``self.irc``; the
        trailing unterminated fragment stays in ``self.inbuffer`` for the next
        call. Socket timeouts are ignored; other socket/SSL errors are handed
        to ``self._handleSocketError`` and end the call early.
        """
        try:
            self.inbuffer += self.conn.recv(1024)
            self.eagains = 0  # If we successfully recv'ed, we can reset this.
            lines = self.inbuffer.split(b'\n')
            # The last piece has no terminating newline yet; keep it buffered.
            self.inbuffer = lines.pop()
            for line in lines:
                if sys.version_info[0] >= 3:
                    raw = line
                    # first, try to decode using utf-8
                    try:
                        line = raw.decode('utf8', 'strict')
                    except UnicodeError:
                        # Default outcome: utf-8 with the offending characters
                        # replaced. It is only overridden below when charade
                        # is loaded and its guess decodes cleanly.
                        line = raw.decode('utf8', 'replace')
                        if charadeLoaded:
                            u = UniversalDetector()
                            u.feed(raw)
                            u.close()
                            guessed = u.result['encoding']
                            if guessed:
                                try:
                                    line = raw.decode(guessed, 'strict')
                                # LookupError: the guessed name is not a codec
                                # Python knows; it must not escape this loop.
                                except (UnicodeError, LookupError):
                                    pass

                msg = drivers.parseMsg(line)
                if msg is not None and self.irc is not None:
                    self.irc.feedMsg(msg)
        except socket.timeout:
            pass
        except SSLError as e:
            # Guard against an empty args tuple before inspecting the message.
            if not (e.args and e.args[0] == 'The read operation timed out'):
                self._handleSocketError(e)
                return
        except socket.error as e:
            self._handleSocketError(e)
            return
        if self.irc and not self.irc.zombie:
            self._sendIfMsgs()
    def detectEncoding(self, parseMeta=True, chardet=True):
        """Determine the encoding of the stream and how certain the result is.

        Sources are consulted in order until one yields an encoding: the BOM,
        meta elements (when ``parseMeta`` is true), the charade/chardet
        detector (when ``chardet`` is true and one of the libraries can be
        imported), and finally ``self.defaultEncoding``.

        :param parseMeta: look for encoding information in meta elements.
        :param chardet: allow guessing with charade/chardet.
        :returns: an ``(encoding, confidence)`` tuple; confidence is
            ``"certain"`` only when the BOM supplied the encoding, otherwise
            ``"tentative"``.
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
            except ImportError:
                # Neither library is installed: skip guessing altogether.
                pass
            else:
                # Only the imports are guarded, so errors raised while reading
                # or detecting are not mistaken for a missing library.
                detector = UniversalDetector()
                while not detector.done:
                    chunk = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(chunk, bytes)
                    if not chunk:
                        break
                    detector.feed(chunk)
                detector.close()
                encoding = detector.result['encoding']
                # Rewind so later reads start from the beginning again.
                self.rawStream.seek(0)
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence