Example #1
def detect(view, file_name, encoding):
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(" with BOM"):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status("origin_encoding", "Detecting encoding, please wait..."), 0)
    detector = UniversalDetector()
    cnt = SETTINGS["max_detect_lines"]
    fp = open(file_name, "rb")
    for line in fp:
        # cut MS-Windows CR code
        line = line.replace(b"\r", b"")
        detector.feed(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result["encoding"]
    if encoding:
        encoding = encoding.upper()
        if encoding == "BIG5":
            encoding = "BIG5-HKSCS"
        elif encoding == "GB2312":
            encoding = "GBK"
    confidence = detector.result["confidence"]
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
Example #2
def listTextBasedFiles(file):
    try:
        # Detect MIME type for file
        # https://github.com/kaito834/myNotes/blob/master/snippets/python/magic_from_file.py
        # https://github.com/ahupp/python-magic#usage
        f_mimetype = magic.from_file(file, mime=True)
    except Exception as e:
        print("[!] Exception: {0} ({1})".format(e, type(e)))
        # Bail out: f_mimetype is undefined if detection failed
        return

    # Open and count lines if MIME type of the file is text/*
    if f_mimetype.split('/')[0] == 'text':
        # Detect encoding by chardet.universaldetector.UniversalDetector()
        # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage
        detector = UniversalDetector()
        with open(file, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()

        with open(file, "r", encoding=detector.result['encoding']) as f:
            line_count = 0
            for line in f.readlines():
                line_count += 1
            print("{0}: {1}, {2}, {3} lines".format(file, f_mimetype, detector.result['encoding'], line_count))
    else:
        print("{0}: NOT txet based file (reason: MIME type isn't text/*: {1})".format(file, f_mimetype))
Example #3
def detect_encoding(bytes, encoding=None):
    """Detect encoding of a byte stream.
    """
    # To reduce tabulator import time
    from chardet.universaldetector import UniversalDetector
    if encoding is not None:
        if encoding.lower() == 'utf-8':
            prefix = bytes.read(len(codecs.BOM_UTF8))
            if prefix == codecs.BOM_UTF8:
                encoding = 'utf-8-sig'
            bytes.seek(0)
        return encoding
    detector = UniversalDetector()
    num_lines = config.ENCODING_DETECTION_MAX_LINES
    while num_lines > 0:
        line = bytes.readline()
        detector.feed(line)
        if detector.done:
            break
        num_lines -= 1
    detector.close()
    bytes.seek(0)
    confidence = detector.result['confidence']
    encoding = detector.result['encoding']
    # Do not use if not confident
    if confidence < config.ENCODING_DETECTION_MIN_CONFIDENCE:
        encoding = config.DEFAULT_ENCODING
    # Default to utf-8 for safety
    if encoding == 'ascii':
        encoding = config.DEFAULT_ENCODING
    return encoding
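For reference, a self-contained sketch of the same idea without the tabulator config machinery; the line limit, confidence threshold, and default encoding below are illustrative assumptions, not values from the project:

import codecs
from io import BytesIO
from chardet.universaldetector import UniversalDetector

def sketch_detect_encoding(stream, max_lines=100, min_confidence=0.5, default='utf-8'):
    # Honor a UTF-8 BOM explicitly, as the example above does.
    prefix = stream.read(len(codecs.BOM_UTF8))
    stream.seek(0)
    if prefix == codecs.BOM_UTF8:
        return 'utf-8-sig'
    detector = UniversalDetector()
    for _ in range(max_lines):
        line = stream.readline()
        if not line:
            break
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    stream.seek(0)
    encoding = detector.result['encoding']
    # Fall back when chardet is unsure, or widen plain ASCII for safety.
    if not encoding or detector.result['confidence'] < min_confidence or encoding == 'ascii':
        return default
    return encoding

print(sketch_detect_encoding(BytesIO(u'caf\xe9\n'.encode('latin-1'))))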
Example #4
File: dyr.py Project: iacopy/dyr
def description_of(file_path, name='stdin', byte=1000000):
    """
    Return a string describing the probable encoding of a file.
    """
    from chardet.universaldetector import UniversalDetector
    file = open(file_path, 'rb')
    u = UniversalDetector()
    i = 0
    for line in file:
        l = len(line)
        if i + l > byte:
            bytoread = byte-i
            u.feed(line[:bytoread])
            break
        else:
            bytoread = l
            u.feed(line)
        i += bytoread
    file.close()
    u.close()
    result = u.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (name,
                                              result['encoding'],
                                              result['confidence'])
    else:
        return '%s: no result' % name
Example #5
def guessWithChardet(content):
    u = UniversalDetector()
    for line in content:
        u.feed(line)
    u.close()
    result = u.result
    return result
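A possible call site for the helper above; `content` just needs to be an iterable of byte strings, so a file opened in binary mode works (the file name here is hypothetical):

with open('sample.txt', 'rb') as f:   # hypothetical input file
    print(guessWithChardet(f))        # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}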
Example #6
def detect_encoding(file):
    detector = UniversalDetector()
    # Use a context manager so the file handle is always closed
    with open(file, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
Example #7
def get_file_encoding(file_name):
    if not os.path.isfile(file_name):
        return ""
    u = UniversalDetector()
    with open(file_name, "rb") as f:
        for index, line in enumerate(f):
            u.feed(line)
            if index > 500:
                break
        u.close()
    if u.result["encoding"].lower() == "gb2312":
        try:
            _file = codecs.open(file_name, encoding="gb2312")
            _file.readlines()
            result = "gb2312"
        except Exception as e:
            print(e)
            try:
                _file = codecs.open(file_name, encoding="gbk")
                _file.readlines()
                result = "gbk"
            except Exception as e:
                print(e)
                result = "gb18030"
    else:
        result = u.result["encoding"]
    return result
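The fallback chain above works because GB2312, GBK, and GB18030 are nested supersets; a generic sketch of the same idea (not the original project's code) could try each codec in turn:

import codecs

def widen_gb_encoding(file_name, candidates=('gb2312', 'gbk', 'gb18030')):
    # GB2312 is a subset of GBK, which is a subset of GB18030, so try
    # each candidate until one decodes the whole file cleanly.
    for encoding in candidates:
        try:
            with codecs.open(file_name, encoding=encoding) as f:
                f.read()
            return encoding
        except UnicodeDecodeError:
            continue
    return candidates[-1]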
Example #8
def get_unicode_content(file_path, encoding=None):
    """
    Return a unicode string of the file's contents using the given encoding. If no
    encoding is given, chardet will be used to determine the encoding.
    Note that this uses the chardet library and may cause problems; if an error is
    thrown, a utf-8 encoding is assumed and unrecognized characters are discarded.
    """
    from chardet.universaldetector import UniversalDetector
    
    try:
        if not encoding:
            detector = UniversalDetector()
            contents = ''
            with open(file_path, 'rb') as f:
                contents = f.read()
                detector.feed(contents)
            detector.close()
            determined_encoding = detector.result['encoding']
            return contents.decode(encoding=determined_encoding)
        else:
            with open(file_path, 'r') as f:
                return unicode(f.read(), encoding=encoding, errors='ignore')
    except UnicodeError:
        with open(file_path, 'r') as f:
            return unicode(f.read(), encoding='utf-8', errors='ignore')
Example #9
    def _guessEncoding(self, path):
        """Opens a file from the given `path` and checks the file encoding.

        The file must exist on the file system and end with the extension
        `.csv`. The file is read line by line until the encoding can be
        guessed.
        On a successful identification, the widgets of this dialog will be
        updated.

        Args:
            path (string): Path to a csv file on the file system.

        """
        if os.path.exists(path) and path.lower().endswith('csv'):
            encodingDetector = UniversalDetector()
            with open(path, 'rb') as fp:  # chardet expects bytes, not text
                for line in fp:
                    encodingDetector.feed(line)
                    if encodingDetector.done:
                        break
            encodingDetector.close()
            result = encodingDetector.result['encoding']
            result = result.replace('-','_')

            self._encodingKey = _calculateEncodingKey(result)
            if self._encodingKey:
                index = self._encodingComboBox.findText(result.upper())
                self._encodingComboBox.setCurrentIndex(index)
Example #10
def validate_csv(f):
    """Return dialect information about given csv file."""
    with open(f.fullpath, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        u = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        try:
            for row in reader:
                u.feed(dialect.delimiter.join(row))
            is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
        finally:
            u.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': u.result['encoding'],
        'is_valid': is_valid
    }
Example #11
def get_coding(text, force_chardet=False):
    """
    Function to get the coding of a text.
    @param text text to inspect (string)
    @return coding string
    """
    if not force_chardet:
        for line in text.splitlines()[:2]:
            try:
                result = CODING_RE.search(to_text_string(line))
            except UnicodeDecodeError:
                # This could fail because to_text_string assume the text
                # is utf8-like and we don't know the encoding to give
                # it to to_text_string
                pass
            else:
                if result:
                    codec = result.group(1)
                    # sometimes we find a false encoding that can
                    # result in errors
                    if codec in CODECS:
                        return codec

    # Fallback using chardet
    if is_binary_string(text):
        detector = UniversalDetector()
        for line in text.splitlines()[:2]:
            detector.feed(line)
            if detector.done: break

        detector.close()
        return detector.result['encoding']

    return None
Example #12
File: mime.py Project: Darshnik/dxr
def decode_data(data, encoding_guess, can_be_binary=True):
    """Given string data, return an (is_text, data) tuple, where data is
    returned as unicode if we think it's text and were able to determine an
    encoding for it.
    If can_be_binary is False, then skip the initial is_binary check.
    """
    if not (can_be_binary and is_binary_string(data[:1024])):
        try:
            # Try our default encoding.
            data = data.decode(encoding_guess)
            return True, data
        except UnicodeDecodeError:
            # Fall back to chardet - chardet is really slow, which is why we
            # don't just do chardet from the start.
            detector = UniversalDetector()
            for chunk in ichunks(80, data):
                detector.feed(chunk)
                if detector.done:
                    break
            detector.close()
            if detector.result['encoding']:
                try:
                    data = data.decode(detector.result['encoding'])
                    return True, data
                except (UnicodeDecodeError, LookupError):
                    # Either we couldn't decode or chardet gave us an encoding
                    # that python doesn't recognize (yes, it can do that).
                    pass  # Leave data as str.
    return False, data
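`ichunks` here is a dxr utility; a self-contained equivalent of the chunked feeding loop might look like the sketch below (the 80-byte chunk size is carried over from the example):

from chardet.universaldetector import UniversalDetector

def detect_in_chunks(data, chunk_size=80):
    # Feed the byte string to chardet in fixed-size slices so the
    # detector can stop as soon as it is confident.
    detector = UniversalDetector()
    for start in range(0, len(data), chunk_size):
        detector.feed(data[start:start + chunk_size])
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']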
Example #13
def decode(string):
    """ detects string encoding and returns decoded string"""
    u = UniversalDetector()
    u.feed(string)
    u.close()
    result = u.result
    return string.decode(result['encoding'])
Example #14
def decode(filename, data):
    if '.m3u8' in filename:
        encoding = 'utf-8'
        data = data.decode(encoding)
    elif '.m3u' in filename or '.pls' in filename:
        try:
            encoding = 'ISO-8859-2'
            data = data.decode(encoding)
        except:
            if chardet:
                u = UniversalDetector()
                u.feed(data)
                u.close()
                if u.result['confidence'] > 0.5:
                    try:
                        encoding = u.result['encoding']
                        data = data.decode(encoding)
                    except:
                        encoding = 'ascii'
                else:
                    encoding = 'ascii'
            else:
                encoding = 'ascii'
    elif '.xml' in filename or '.xspf' in filename:
        encoding = 'utf-8'

    return {'data' : data, 'encoding' : encoding}
Example #15
def transferToEncoding(filename, toCode):
	if os.path.isdir(filename):
		print "error:not file"
		return False

	try:
		detector = UniversalDetector()
		f = open(filename, 'r')
		ls = f.readlines()
		f.close()

		# An empty file cannot be detected, so bail out with a message
		if len(ls) == 0:
			print printRed(filename), printRed(' is blank file, can not detect encoding')
			return False

		# Detect the encoding
		for l in ls:
			detector.feed(l)
			if detector.done: break
		detector.close()
		
		encode = gb(detector.result['encoding'])
		if encode.lower() != toCode.lower():
			f = open(filename, 'w')
			print printGreen(filename) + ' ====> ' + toCode + ' SUCCESS'
			for l in ls:
				f.write(unicode(l, encode).encode(toCode))
			f.close()
		else:
			pass		
	except Exception, e:
		traceback.print_exc()
		print 'exception'
Example #16
def detect(view, file_name, cnt):
	#traceback.print_stack()
	print("detect...")
	if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
		return
	encoding = encoding_cache.pop(file_name)
	
	if encoding:
		print("it is already at cache encoding_cache.json:",encoding)
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	fp = open(file_name, 'rb')
	for line in fp:
		# cut MS-Windows CR code
		line = line.replace(b'\r',b'')
		detector.feed(line)
		#print(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	print(encoding)
	if encoding:
		encoding = encoding.upper()
	confidence = detector.result['confidence']
	print(confidence)
	sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
Example #17
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        u = UniversalDetector()
        for line in f:
            u.feed(line)
        u.close()
        return u.result['encoding']
Example #18
def sanitize_texts(directory):
    """
    Strip all header and copyright information from downloaded text files in the
    specified directory using gutenberg.strip_headers module and ensure proper
    file encodings.

    :param directory: <String> A string containing the full path to directory containing files to strip
    :return:
    """

    for item in os.listdir(directory):
        file_path = os.path.join(directory, item)
        if os.path.isfile(file_path):

            # Detect file encoding, takes time to run
            with open(file_path, 'rb') as inf:
                text = inf.readlines()
            detector = UniversalDetector()
            for line in text:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result['encoding']

            # Open file, strip headers, and save result
            with open(file_path, 'r', encoding=encoding) as inf:
                text = inf.read()
            text = strip_headers(text).strip()
            os.remove(file_path)
            with open(file_path, 'w+', encoding=encoding) as outf:
                outf.write(text)
Example #19
def get_csv_reader(filename, charset=None):
    logger.info("Reading CSV file %s", filename)

    myfile = open(filename, "rb")

    if not charset:
        # Detect encoding
        detector = UniversalDetector()
        for line in myfile:  # direct iteration replaces the removed xreadlines()
            detector.feed(line)

            if detector.result["confidence"] > 0.01:
                logger.debug("Result so far: %s", detector.result)

            if detector.done:
                break
        detector.close()
        charset = detector.result["encoding"]

        logger.info("Found encoding %s", charset)

        # Reset the file index
        myfile.seek(0)

    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))

    logger.info("Found dialect %s", dialect)

    # Reset the file index
    myfile.seek(0)

    return UnicodeReader(myfile, dialect=dialect, encoding=charset)
Example #20
def safe_open(path, mode='r'):
    '''
    Retrieves a file's encoding and returns the opened file. If the opened file
    begins with a BOM, it is read before the file object is returned. This
    allows callers to not have to handle BOMs of files.

    :param str path: file path to open
    :param str mode: the mode to open the file (see :func:`open`)
    :returns file: the opened file object
    '''
    u = UniversalDetector()
    first = None
    with open(path, 'rb') as fp:
        bin = first = fp.read(0x1000)

        while not u.done and bin:
            u.feed(bin)
            if not u.done:
                bin = fp.read(0x1000)
    u.close()

    if not first:
        return open(path, mode)

    fp = codecs.open(path, mode, encoding=u.result['encoding'])
    for bom in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8,
                codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
        if first.startswith(bom):
            fp.seek(len(bom))
            break

    return fp
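A quick round trip showing the intended behavior of safe_open; the file name is hypothetical, and this assumes chardet recognizes the UTF-8 BOM:

import codecs

with open('bom_example.txt', 'wb') as f:          # hypothetical test file
    f.write(codecs.BOM_UTF8 + u'hello\n'.encode('utf-8'))

with safe_open('bom_example.txt') as f:
    print(repr(f.read()))                         # no u'\ufeff' prefix expected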
Example #21
    def detectEncoding(self, parseMeta=True, chardet=True):
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = lookupEncoding(self.defaultEncoding)

        return encoding, confidence
Example #22
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    u = UniversalDetector()
    for line in lines:
        line = bytearray(line)
        u.feed(line)
        # shortcut out of the loop to save reading further - particularly useful if we read a BOM.
        if u.done:
            break
    u.close()
    result = u.result
    if PY2:
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
                                                     result['confidence'])
    else:
        return '{0}: no result'.format(name)
Example #23
def validate_csv(document):
    """Return dialect information about given csv file."""
    with open(document.document.uri, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        universal_detector = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        try:
            for row in reader:
                universal_detector.feed(
                    dialect.delimiter.join(row).encode('utf-8'))
            is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
        finally:
            universal_detector.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': universal_detector.result['encoding'],
        'is_valid': is_valid
    }
Example #24
def process_buffer(buf, d):
    if not buf:
        return
    header = buf[0]
    url = header.split()[1]
    skip = 0
    empty_lines = 0
    while empty_lines < 2:
        skip += 1
        if not buf[skip].strip():
            empty_lines += 1

    rawhtml = "".join(buf[skip + 1:])
    html = None
    try:
        html = rawhtml.decode("utf-8")
    except:
        try:
            detector = UniversalDetector()
            for line in buf[skip + 1:]:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result
            html = rawhtml.decode(encoding["encoding"])
        except:
            html = rawhtml.decode("utf-8", errors='ignore')
    assert html is not None, "Error processing %s\n" % rawhtml
    html = html.replace(r"\r", "")
    d[url] = (header, html)
Example #25
def detect_local_charset(filepath):
    global VERBOSE
    # Open to read in binary.
    fp = open(filepath, "rb")
    detector = UniversalDetector()

    if VERBOSE:
        print "Reading file to detect encoding..."

    for line in fp:
        line = line.replace(b'\r',b'')
        detector.feed(line)
        if detector.done:
            break

    fp.close()
    detector.close()

    if VERBOSE:
        print "Encoding: %s" % detector.result["encoding"]
        print "Confidence: {0:.0f}% ".format(detector.result["confidence"]*100)

    if detector.result["confidence"] > 0.75:
        encoding = detector.result["encoding"]
        return encoding.replace('-','_').lower() # Format for codecs
    else:
        return None
Example #26
def detect(view, file_name):
	if not os.path.exists(file_name):
		return
	encoding = encoding_cache.pop(file_name)
	if encoding:
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	cnt = SETTINGS['max_detect_lines']
	fp = open(file_name, 'rb')
	for line in fp:
		detector.feed(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	confidence = detector.result['confidence']
	if not encoding or confidence < 0.7:
		sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Encoding can not be detected, please choose one manually. (%s/%.2f)' % (encoding, confidence)), 0)
		return
	encoding = encoding.upper()
	if encoding == 'BIG5':
		encoding = 'BIG5-HKSCS'
	elif encoding == 'GB2312':
		encoding = 'GBK'
	sublime.set_timeout(lambda: init_encoding_vars(view, encoding), 0)
Example #27
def detect_encoding(f, verbose=False):
    """Detects a file's encoding.

    Args:
        f (obj): The file-like object to detect.
        verbose (Optional[bool]): Whether to print the detection result
            (default: False).

    Returns:
        dict: The encoding result

    Examples:
        >>> filepath = p.join(DATA_DIR, 'test.csv')
        >>> with open(filepath, 'rb') as f:
        ...     result = detect_encoding(f)
        ...     result == {'confidence': 0.99, 'encoding': 'utf-8'}
        True
    """
    pos = f.tell()
    detector = UniversalDetector()

    for line in f:
        detector.feed(line)

        if detector.done:
            break

    detector.close()
    f.seek(pos)

    if verbose:
        print('result', detector.result)

    return detector.result
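The tell/seek pair above means the caller's position is preserved; a small check of that behavior with an in-memory stream (illustrative only):

from io import BytesIO

stream = BytesIO(b'spam\neggs\n')
stream.read(5)                     # move the cursor past the first line
result = detect_encoding(stream)   # feeds lines from the current position
assert stream.tell() == 5          # cursor restored to where it was
print(result)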
Example #28
    def deserialize(file_bytes):
        try:
            file_string = file_bytes.decode('utf-8')
        except UnicodeDecodeError as ude:
            detector = UniversalDetector()
            for line in BytesIO(file_bytes):
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            if detector.result['confidence'] < 0.5:
                raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files.")
            try:
                file_string = file_bytes.decode(detector.result['encoding'])
            except UnicodeDecodeError:
                raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files. "
                                 "(The invalid character is '{char:#x}' at {pos})".format(pos=ude.start,
                                                                                          char=file_bytes[ude.start]))
        csv_lines = file_string.splitlines()
        first_line = csv_lines[:1]
        first_row_tab = next(csv.reader(first_line, delimiter="\t"))
        first_row_semicolon = next(csv.reader(first_line, delimiter=";"))
        if len(first_row_tab) > 1:
            rows = csv.reader(csv_lines, delimiter="\t")
        elif len(first_row_semicolon) > 1:
            rows = csv.reader(csv_lines, delimiter=";")
        else:
            raise ValueError("Csv file is not delimited by ';' or 'tab'")

        return rows
Example #29
 def repo_cat_file(self, repo_path, commit_hash, path):
     (commit_hash, path) = self._all_to_utf8(commit_hash, path)
     if not self._path_check_chdir(repo_path, commit_hash, path):
         return ''
     path = self._get_quote_path(path)
     if path.startswith('./'):
         path = path[2:]
     file_type = path.split('.')[-1]
     if file_type in BINARY_FILE_TYPE:
         return u'Binary file'
     stage_file = self._get_stage_file(repo_path, commit_hash, path)
     result = self._read_load_stage_file(stage_file)
     if result is not None:
         return result['blob']
     command = '/usr/bin/git show %s:%s | /usr/bin/head -c 524288' % (commit_hash, path)
     try:
         signal.signal(signal.SIGPIPE, signal.SIG_DFL)
         result = check_output(command, shell=True)
         ud = UniversalDetector()
         ud.feed(result)
         ud.close()
         if ud.result['encoding']:
             encoding = ud.result['encoding']
             if encoding not in ('utf-8', 'utf8'):
                 result = result.decode(encoding).encode('utf-8')
         self._dumps_write_stage_file({'blob': result}, stage_file)
         return result
     except Exception, e:
         logger.exception(e)
Example #30
def detect(view, file_name, encoding):
	if not file_name or not os.path.exists(file_name):
		return
	if not encoding.endswith(' with BOM'):
		encoding = encoding_cache.pop(file_name)
	if encoding:
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	cnt = SETTINGS['max_detect_lines']
	fp = open(file_name, 'rb')
	for line in fp:
		# cut MS-Windows CR code
		line = line.replace(b'\r',b'')
		detector.feed(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	if encoding:
		encoding = encoding.upper()
	confidence = detector.result['confidence']
	sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
Example #31
def dump_dictionary(out_path, lt_dir, tag_dict_path, tag_info_path):
    # dump dictionary, see https://dev.languagetool.org/developing-a-tagger-dictionary
    os.system(
        f"java -cp {lt_dir / 'languagetool.jar'} org.languagetool.tools.DictionaryExporter "
        f"-i {tag_dict_path} -info {tag_info_path} -o {out_path}")

    # the dumped dictionary is sometimes not in utf-8
    detector = UniversalDetector()
    with open(out_path, "rb") as f:
        for i, line in enumerate(f):
            detector.feed(line)

            if detector.done or i > 10_000:
                break
    # Always close so the result is finalized, even for short files
    detector.close()

    result = detector.result

    print(
        f"Dump was encoded as {result['encoding']} with confidence {result['confidence']}."
    )
    with open(out_path, "rb") as f:
        dump_bytes = f.read()

    with open(out_path, "w") as f:
        f.write(dump_bytes.decode(result["encoding"]))
Example #32
def make_array(dir):
    result = []
    detector = UniversalDetector()
    counter = 0
    for name in os.listdir(dir):
        if name == ".DS_Store":
            continue
        else:
            counter += 1
            name = os.path.join(dir, name)  # use the dir argument, not a hardcoded path
            temp = open(name, 'rb')
            detector.reset()
            for line in temp.readlines():
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            print(str(counter) + ') ' + str(detector.result))
            temp = open(name, 'rb').read().decode(detector.result['encoding'])
            result.append(bytes(temp, encoding=detector.result['encoding']))
    # temp = open('example.fb2', 'r', encoding='utf-8').read().replace('\n', ' ')
    # temp = open('kek.xml', 'r', encoding='utf-8').read()
    # result.append(temp)
    return result
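Example #32 leans on UniversalDetector.reset(), which is the supported way to reuse one detector across many inputs; a stripped-down sketch of that pattern:

from chardet.universaldetector import UniversalDetector

def detect_many(paths):
    # Reuse a single detector: reset() clears state between files.
    detector = UniversalDetector()
    results = {}
    for path in paths:
        detector.reset()
        with open(path, 'rb') as f:
            for line in f:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        results[path] = detector.result['encoding']
    return results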