def convert_to_utf8(self, filename):
    """Convert *filename* from ANSI to UTF-8 in place, keeping a ``.bak`` copy.

    :param filename: path of the file to convert

    Exits the process (status 1) when the file cannot be read, as the
    original comment promised for IOError.
    """
    # Read the whole file up front; close the handle instead of leaking
    # the anonymous object returned by open().
    try:
        src = open(filename, 'r')
        try:
            contents = src.read()
        finally:
            src.close()
    except IOError as e:
        # Surface the reason before exiting instead of dying silently.
        print(e)
        sys.exit(1)
    # Detect the encoding once and reuse the result (the original called
    # fastchardet.detect twice on the same data).
    detected = fastchardet.detect(contents)
    print(detected)
    encoding_ = detected['encoding']
    if encoding_ and encoding_ == "ANSI":
        data = contents.decode(encoding_)
        # Back up the original next to it as <absolute path>.bak.
        fpath = os.path.abspath(filename)
        shutil.copy(filename, fpath + '.bak')
        # Rewrite the original file as UTF-8, always closing the handle.
        out = open(filename, 'w')
        try:
            out.write(data.encode('utf-8'))
        except Exception as e:
            print(e)
        finally:
            out.close()
def convert_to_utf8(self, filename):
    # try to open the file and exit if some IOError occurs
    """Convert an ANSI-encoded file to UTF-8 in place, backing it up first.

    :param filename: path of the file to convert
    """
    try:
        # NOTE(review): the file object returned by open() is never
        # closed here — the handle leaks; only its contents are kept.
        f = open(filename, 'r').read()
        print fastchardet.detect(f)
    except Exception:
        # Any failure to read aborts the whole program with status 1.
        sys.exit(1)
    try:
        encoding_ = fastchardet.detect(f)['encoding']
        # Only rewrite when the detector reports "ANSI".
        if encoding_ and encoding_ == "ANSI":
            data = f.decode(encoding_)
            # now get the absolute path of our filename and append .bak
            # to the end of it (for our backup file)
            fpath = os.path.abspath(filename)
            newfilename = fpath + '.bak'
            # and make our backup file with shutil
            shutil.copy(filename, newfilename)
            # and at last convert it to utf-8
            f = open(filename, 'w')
            try:
                f.write(data.encode('utf-8'))
            except Exception, e:
                print e
    finally:
        # NOTE(review): when the encoding is not "ANSI", `f` is still the
        # string read above, so f.close() would raise AttributeError —
        # confirm against callers whether that path is ever taken.
        f.close()
def _parse_l10n_doc(name, doc, no_encoding=False):
    """Parse an L10n document and tag it with encoding expectations."""
    ext = name.split('.')[-1].lower()
    parser_map = {'dtd': dtd.DTDParser,
                  'properties': properties.PropertiesParser}
    if ext not in parser_map:
        # Unrecognized file type; nothing we can parse.
        return None
    # Encodings we consider acceptable for these files.
    acceptable = ('ASCII', 'UTF_8')
    parsed = parser_map[ext](StringIO(doc))
    # The caller may ask to skip the encoding check entirely.
    if no_encoding:
        return parsed
    # Try the cheap UTF-8 decode first; it is much faster than
    # fastchardet and succeeds more often than it fails.
    try:
        doc.decode('utf-8')
        detected = 'UTF_8'
    except UnicodeDecodeError:
        detected = fastchardet.detect(doc)['encoding'].upper()
    parsed.expected_encoding = detected in acceptable
    parsed.suitable_encoding = acceptable
    return parsed
def test_unicode():
    """Verify detection yields the 'unicode' marker for unicode input."""
    result = fastchardet.detect(unicode('foo'))
    assert result['encoding'] == 'unicode'
def _parse_l10n_doc(name, doc, no_encoding=False):
    'Parses an L10n document.'
    # Choose the parser class from the file extension.
    extension = name.split('.')[-1].lower()
    handlers = {
        'dtd': dtd.DTDParser,
        'properties': properties.PropertiesParser
    }
    # These are expected encodings for the various files.
    handler_formats = ('ASCII', 'UTF_8')
    if extension not in handlers:
        # Not a file type we know how to parse.
        return None
    wrapper = StringIO(doc)
    loc_doc = handlers[extension](wrapper)
    # Allow the parse to specify files to skip for encoding checks
    if not no_encoding:
        try:
            # This is much faster than fastchardet, and succeeds more often
            # than fails.
            doc.decode('utf-8')
            encoding = 'UTF_8'
        except UnicodeDecodeError:
            encoding = fastchardet.detect(doc)['encoding'].upper()
        # True when the detected encoding is one of the accepted formats.
        loc_doc.expected_encoding = encoding in handler_formats
        loc_doc.suitable_encoding = handler_formats
    return loc_doc
def test_unicode():
    """Passing a unicode object should be reported as encoding 'unicode'."""
    detected = fastchardet.detect(unicode("foo"))
    assert detected["encoding"] == "unicode"
def test_esoteric():
    """Make sure that fastchardet can detect other encodings."""
    def detected(code):
        # Return only the encoding name for the given byte string.
        return fastchardet.detect(code)['encoding']
    # High-byte input should be reported as windows-1252.
    print(detected('High Byte:\x91'))
    assert detected('High Byte:\x91') == 'windows-1252'
    # A UTF-8 sequence without a BOM should still be recognized.
    print(detected('\xc2\xbc + \xc2\xbd = \xcd\xbe'))
    assert detected('\xc2\xbc + \xc2\xbd = \xcd\xbe') == 'utf_8'
def test_esoteric():
    """Make sure that fastchardet can detect other encodings."""
    # Shorthand: detect and return only the encoding name.
    a = lambda code: fastchardet.detect(code)["encoding"]
    # High Bytes
    print a("High Byte:\x91")
    assert a("High Byte:\x91") == "windows-1252"
    # UTF-8 without BOM
    print a("\xc2\xbc + \xc2\xbd = \xcd\xbe")
    assert a("\xc2\xbc + \xc2\xbd = \xcd\xbe") == "utf_8"
def _parse_l10n_doc(name, doc, no_encoding=False):
    """Parse an L10n document, recording whether its encoding is expected."""
    ext = name.split(".")[-1].lower()
    parser_by_ext = {"dtd": dtd.DTDParser,
                     "properties": properties.PropertiesParser}
    if ext not in parser_by_ext:
        # Unknown extension: nothing we can parse.
        return None
    # Encodings considered acceptable for these files.
    accepted = ("ASCII", "UTF_8")
    result = parser_by_ext[ext](StringIO(doc))
    if not no_encoding:
        # Detect the document's encoding and flag whether it is acceptable.
        found = fastchardet.detect(doc)["encoding"].upper()
        result.expected_encoding = found in accepted
        result.suitable_encoding = accepted
    return result
def _parse_l10n_doc(name, doc, no_encoding=False):
    "Parses an L10n document."
    # Pick the parser class based on the file extension.
    extension = name.split(".")[-1].lower()
    handlers = {
        "dtd": dtd.DTDParser,
        "properties": properties.PropertiesParser
    }
    # These are expected encodings for the various files.
    handler_formats = ("ASCII", "UTF_8")
    if extension not in handlers:
        # Not a supported document type.
        return None
    wrapper = StringIO(doc)
    loc_doc = handlers[extension](wrapper)
    # Allow the parse to specify files to skip for encoding checks
    if not no_encoding:
        encoding = fastchardet.detect(doc)["encoding"].upper()
        # True when the detected encoding is one of the accepted formats.
        loc_doc.expected_encoding = encoding in handler_formats
        loc_doc.suitable_encoding = handler_formats
    return loc_doc
def test_utf8():
    """Determine that fastchardet properly detects UTF-8."""
    # The \xEF\xBB\xBF prefix is the UTF-8 byte-order mark.
    payload = """\xEF\xBB\xBF Haldo, UTF-8 """
    assert fastchardet.detect(payload)['encoding'] == 'utf_8'
def test_ascii():
    """Determines that fastchardet detects ASCII properly."""
    # Pure 7-bit text should be reported as plain ascii.
    result = fastchardet.detect('This is plain ASCII')
    assert result['encoding'] == 'ascii'
def test_utfn():
    """Determine that fastchardet properly detects UTF-N."""
    # \xFF\xFE\x00\x00 is a non-UTF-8 byte-order mark prefix.
    payload = """\xFF\xFE\x00\x00 Haldo, UTF-Not 8 """
    assert fastchardet.detect(payload)['encoding'] == 'utf_n'
def test_utf8():
    """Determine that fastchardet properly detects UTF-8."""
    # The \xEF\xBB\xBF prefix is the UTF-8 byte-order mark.
    assert fastchardet.detect("""\xEF\xBB\xBF Haldo, UTF-8 """)["encoding"] == "utf_8"
def test_ascii():
    """Determines that fastchardet detects ASCII properly."""
    # Pure 7-bit input should be reported as plain ascii.
    assert fastchardet.detect("This is plain ASCII")["encoding"] == "ascii"
def test_utfn():
    """Determine that fastchardet properly detects UTF-N."""
    # \xFF\xFE\x00\x00 is a byte-order-mark prefix for a non-UTF-8 encoding.
    assert fastchardet.detect("""\xFF\xFE\x00\x00 Haldo, UTF-Not 8 """)["encoding"] == "utf_n"