Example #1
    def convert_to_utf8(self, filename):
        """Detect the encoding of `filename` and, when it reports "ANSI",
        back the file up and rewrite it as UTF-8.

        :param filename: path of the file to convert
        """
        # try to open the file and exit if some IOError occurs
        try:
            f = open(filename, 'r').read()
            print fastchardet.detect(f)
        except IOError:
            sys.exit(1)

        try:
            encoding_ = fastchardet.detect(f)['encoding']
            if encoding_ and encoding_ == "ANSI":
                data = f.decode(encoding_)
                # now get the absolute path of our filename and append .bak
                # to the end of it (for our backup file)
                fpath = os.path.abspath(filename)
                newfilename = fpath + '.bak'
                # and make our backup file with shutil
                shutil.copy(filename, newfilename)

                # and at last convert it to utf-8
                f = open(filename, 'w')
                try:
                    f.write(data.encode('utf-8'))
                except Exception, e:
                    print e
                finally:
                    f.close()
        except Exception, e:
            # report anything that goes wrong while detecting or rewriting
            print e
Example #2
    def convert_to_utf8(self, filename):
        """Detect the encoding of `filename` and, when it reports "ANSI",
        back the file up and rewrite it as UTF-8.

        :param filename: path of the file to convert
        """
        # try to open the file and exit if some IOError occurs
        try:
            f = open(filename, 'r').read()
            print fastchardet.detect(f)
        except IOError:
            sys.exit(1)

        try:
            encoding_ = fastchardet.detect(f)['encoding']
            if encoding_ and encoding_ == "ANSI":
                data = f.decode(encoding_)
                # now get the absolute path of our filename and append .bak
                # to the end of it (for our backup file)
                fpath = os.path.abspath(filename)
                newfilename = fpath + '.bak'
                # and make our backup file with shutil
                shutil.copy(filename, newfilename)

                # and at last convert it to utf-8
                f = open(filename, 'w')
                try:
                    f.write(data.encode('utf-8'))
                except Exception, e:
                    print e
                finally:
                    f.close()
        except Exception, e:
            # report anything that goes wrong while detecting or rewriting
            print e
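The convert_to_utf8 method in Examples #1 and #2 is lifted out of a larger class, so its imports (os, sys, shutil, fastchardet) are not shown, and the "ANSI" label it checks for never appears in the tests further down. A minimal standalone sketch of the same detect / back-up / rewrite flow, restricted to the labels those tests do exercise (Python 2, to match the examples; convert_file_to_utf8 and the cp1252 mapping are this sketch's own choices, not part of the original code):

import os
import shutil

import fastchardet


def convert_file_to_utf8(path):
    # read the raw bytes and ask fastchardet what they look like
    raw = open(path, 'rb').read()
    encoding = fastchardet.detect(raw)['encoding']

    # 'windows-1252' is the only non-ASCII, non-UTF label the tests in this
    # listing exercise; anything else is left untouched by this sketch
    if encoding != 'windows-1252':
        return

    # keep a backup next to the original, as Examples #1 and #2 do
    shutil.copy(path, os.path.abspath(path) + '.bak')

    # decode with Python's cp1252 codec (an assumed mapping for the
    # 'windows-1252' label) and rewrite the file as UTF-8
    data = raw.decode('cp1252')
    out = open(path, 'wb')
    try:
        out.write(data.encode('utf-8'))
    finally:
        out.close()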
Example #3
def _parse_l10n_doc(name, doc, no_encoding=False):
    'Parses an L10n document.'

    extension = name.split('.')[-1].lower()

    handlers = {'dtd': dtd.DTDParser,
                'properties': properties.PropertiesParser}
    # These are expected encodings for the various files.
    handler_formats = ('ASCII', 'UTF_8')
    if extension not in handlers:
        return None

    wrapper = StringIO(doc)
    loc_doc = handlers[extension](wrapper)

    # Allow the caller to specify files to skip for encoding checks
    if not no_encoding:
        try:
            # This is much faster than fastchardet, and succeeds more often
            # than fails.
            doc.decode('utf-8')
            encoding = 'UTF_8'
        except UnicodeDecodeError:
            encoding = fastchardet.detect(doc)['encoding'].upper()
        loc_doc.expected_encoding = encoding in handler_formats
        loc_doc.suitable_encoding = handler_formats

    return loc_doc
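The encoding check above tries a plain UTF-8 decode first and only falls back to fastchardet when that decode fails. Pulled out on its own, the pattern is roughly this (Python 2; the helper name detect_doc_encoding is invented for this sketch):

import fastchardet


def detect_doc_encoding(doc):
    # fast path: a clean UTF-8 decode is cheaper than running the detector
    # and also covers plain ASCII input
    try:
        doc.decode('utf-8')
        return 'UTF_8'
    except UnicodeDecodeError:
        # fall back to fastchardet and upper-case the label, as
        # _parse_l10n_doc does before comparing against handler_formats
        return fastchardet.detect(doc)['encoding'].upper()

Because ASCII text is also valid UTF-8, the fast path reports 'UTF_8' for it; since handler_formats lists both 'ASCII' and 'UTF_8', the expected_encoding flag comes out True either way.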
Example #4
def test_unicode():
    """
    Make sure that things turn out right when we're silly sallies and pass
    unicode in.
    """

    assert fastchardet.detect(unicode('foo'))['encoding'] == 'unicode'
Example #5
def _parse_l10n_doc(name, doc, no_encoding=False):
    'Parses an L10n document.'

    extension = name.split('.')[-1].lower()

    handlers = {
        'dtd': dtd.DTDParser,
        'properties': properties.PropertiesParser
    }
    # These are expected encodings for the various files.
    handler_formats = ('ASCII', 'UTF_8')
    if extension not in handlers:
        return None

    wrapper = StringIO(doc)
    loc_doc = handlers[extension](wrapper)

    # Allow the caller to specify files to skip for encoding checks
    if not no_encoding:
        try:
            # This is much faster than fastchardet, and succeeds more often
            # than fails.
            doc.decode('utf-8')
            encoding = 'UTF_8'
        except UnicodeDecodeError:
            encoding = fastchardet.detect(doc)['encoding'].upper()
        loc_doc.expected_encoding = encoding in handler_formats
        loc_doc.suitable_encoding = handler_formats

    return loc_doc
Example #6
def test_unicode():
    """
    Make sure that things turn out right when we're silly sallies and pass
    unicode in.
    """

    assert fastchardet.detect(unicode("foo"))["encoding"] == "unicode"
Example #7
def test_esoteric():
    """Make sure that fastchardet can detect other encodings."""

    a = lambda code: fastchardet.detect(code)['encoding']

    # High Bytes
    print a('High Byte:\x91')
    assert a('High Byte:\x91') == 'windows-1252'

    # UTF-8 without BOM
    print a('\xc2\xbc + \xc2\xbd = \xcd\xbe')
    assert a('\xc2\xbc + \xc2\xbd = \xcd\xbe') == 'utf_8'
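Both labels asserted here, 'windows-1252' and 'utf_8', also happen to be names that Python's codec registry accepts, so the detector's output can be passed straight to decode(). A small illustration (Python 2; the decode round-trip is this sketch's addition, not part of the test suite):

import fastchardet

samples = ['High Byte:\x91', '\xc2\xbc + \xc2\xbd = \xcd\xbe']

for raw in samples:
    label = fastchardet.detect(raw)['encoding']
    # 'windows-1252' and 'utf_8' are both valid Python codec names,
    # so the detected label can be used to decode the bytes directly
    print label, repr(raw.decode(label))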
Example #8
def test_esoteric():
    """Make sure that fastchardet can detect other encodings."""

    a = lambda code: fastchardet.detect(code)["encoding"]

    # High Bytes
    print a("High Byte:\x91")
    assert a("High Byte:\x91") == "windows-1252"

    # UTF-8 without BOM
    print a("\xc2\xbc + \xc2\xbd = \xcd\xbe")
    assert a("\xc2\xbc + \xc2\xbd = \xcd\xbe") == "utf_8"
Example #9
def _parse_l10n_doc(name, doc, no_encoding=False):
    "Parses an L10n document."

    extension = name.split(".")[-1].lower()

    handlers = {"dtd": dtd.DTDParser, "properties": properties.PropertiesParser}
    # These are expected encodings for the various files.
    handler_formats = ("ASCII", "UTF_8")
    if extension not in handlers:
        return None

    wrapper = StringIO(doc)
    loc_doc = handlers[extension](wrapper)

    # Allow the caller to specify files to skip for encoding checks
    if not no_encoding:
        encoding = fastchardet.detect(doc)["encoding"].upper()
        loc_doc.expected_encoding = encoding in handler_formats
        loc_doc.suitable_encoding = handler_formats

    return loc_doc
Example #10
def _parse_l10n_doc(name, doc, no_encoding=False):
    "Parses an L10n document."

    extension = name.split(".")[-1].lower()

    handlers = {
        "dtd": dtd.DTDParser,
        "properties": properties.PropertiesParser
    }
    # These are expected encodings for the various files.
    handler_formats = ("ASCII", "UTF_8")
    if extension not in handlers:
        return None

    wrapper = StringIO(doc)
    loc_doc = handlers[extension](wrapper)

    # Allow the caller to specify files to skip for encoding checks
    if not no_encoding:
        encoding = fastchardet.detect(doc)["encoding"].upper()
        loc_doc.expected_encoding = encoding in handler_formats
        loc_doc.suitable_encoding = handler_formats

    return loc_doc
Example #11
def test_utf8():
    """Determine that fastchardet properly detects UTF-8."""

    assert fastchardet.detect("""\xEF\xBB\xBF
            Haldo, UTF-8
            """)['encoding'] == 'utf_8'
Example #12
def test_ascii():
    """Determines that fastchardet detects ASCII properly."""
    assert fastchardet.detect('This is plain ASCII')['encoding'] == 'ascii'
Example #13
def test_utfn():
    """Determine that fastchardet properly detects UTF-N."""

    assert fastchardet.detect("""\xFF\xFE\x00\x00
            Haldo, UTF-Not 8
            """)['encoding'] == 'utf_n'
Example #14
def test_utf8():
    """Determine that fastchardet properly detects UTF-8."""

    assert fastchardet.detect("""\xEF\xBB\xBF
            Haldo, UTF-8
            """)["encoding"] == "utf_8"
Example #15
def test_ascii():
    """Determines that fastchardet detects ASCII properly."""
    assert fastchardet.detect("This is plain ASCII")["encoding"] == "ascii"
Example #16
def test_utfn():
    """Determine that fastchardet properly detects UTF-N."""

    assert fastchardet.detect("""\xFF\xFE\x00\x00
            Haldo, UTF-Not 8
            """)["encoding"] == "utf_n"