Ejemplo n.º 1
0
def get_file_encoding(file_name):
    """Detect the text encoding of *file_name* with chardet.

    Reads at most ~500 lines.  When chardet answers GB2312 the guess is
    widened (gb2312 -> gbk -> gb18030) by actually decoding the file,
    because chardet frequently reports GB2312 for the wider GBK/GB18030
    encodings.

    Returns the encoding name ("" if the file does not exist; may be
    None when chardet cannot decide).
    """
    if not os.path.isfile(file_name):
        return ""
    u = UniversalDetector()
    with open(file_name, "rb") as f:
        for index, line in enumerate(f):
            u.feed(line)
            if index > 500:  # cap the work: chardet is slow on huge files
                break
        u.close()
    encoding = u.result["encoding"]
    # BUG FIX: guard against encoding being None (the original crashed on
    # .lower()); also close the probe files via context managers.
    if encoding and encoding.lower() == "gb2312":
        for candidate in ("gb2312", "gbk"):
            try:
                with codecs.open(file_name, encoding=candidate) as probe:
                    probe.readlines()
                return candidate
            except Exception as e:
                print(e)
        return "gb18030"
    return encoding
Ejemplo n.º 2
0
 def repo_cat_file(self, repo_path, commit_hash, path):
     """Return the contents of *path* at *commit_hash* in *repo_path*.

     Binary file types are short-circuited with a placeholder string.
     Text output (capped at 512 KiB by the piped `head`) is re-encoded
     to utf-8 when chardet detects another encoding, and cached in a
     stage file for subsequent calls.
     """
     (commit_hash, path) = self._all_to_utf8(commit_hash, path)
     if not self._path_check_chdir(repo_path, commit_hash, path):
         return ''
     path = self._get_quote_path(path)
     if path.startswith('./'):
         path = path[2:]
     file_type = path.split('.')[-1]
     if file_type in BINARY_FILE_TYPE:
         return u'二进制文件'
     stage_file = self._get_stage_file(repo_path, commit_hash, path)
     result = self._read_load_stage_file(stage_file)
     if result is not None:
         return result['blob']
     # NOTE(review): command is built from commit_hash/path with
     # shell=True; presumably _get_quote_path shell-escapes -- confirm.
     command = '/usr/bin/git show %s:%s | /usr/bin/head -c 524288' % (commit_hash, path)
     try:
         signal.signal(signal.SIGPIPE, signal.SIG_DFL)
         result = check_output(command, shell=True)
         ud = UniversalDetector()
         ud.feed(result)
         ud.close()
         if ud.result['encoding']:
             encoding = ud.result['encoding']
             # BUG FIX: the original tested `encoding != 'utf-8' or
             # encoding != 'utf8'`, which is always True; re-encode only
             # when the detected encoding is not already utf-8.
             if encoding not in ('utf-8', 'utf8'):
                 result = result.decode(encoding).encode('utf-8')
         self._dumps_write_stage_file({'blob': result}, stage_file)
         return result
     except Exception as e:
         logger.exception(e)
Ejemplo n.º 3
0
Archivo: mime.py Proyecto: Darshnik/dxr
def decode_data(data, encoding_guess, can_be_binary=True):
    """Try to interpret *data* (bytes) as text.

    Returns an ``(is_text, data)`` tuple; on success ``data`` is the
    decoded unicode string, otherwise the original bytes come back
    unchanged.  When *can_be_binary* is False the up-front binary sniff
    is skipped.
    """
    if can_be_binary and is_binary_string(data[:1024]):
        return False, data
    try:
        # Cheap path first: assume the caller's guess is right.
        return True, data.decode(encoding_guess)
    except UnicodeDecodeError:
        pass
    # Expensive path: chardet is really slow, which is why we only get
    # here after the guess has failed.
    detector = UniversalDetector()
    for chunk in ichunks(80, data):
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    guessed = detector.result['encoding']
    if guessed:
        try:
            return True, data.decode(guessed)
        except (UnicodeDecodeError, LookupError):
            # Either undecodable, or chardet produced an encoding name
            # Python does not recognize (yes, it can do that).
            pass
    return False, data
Ejemplo n.º 4
0
def safe_open(path, mode='r'):
    '''
    Retrieves a file's encoding and returns the opened file. If the opened file
    begins with a BOM, it is read before the file object is returned. This
    allows callers to not have to handle BOMs of files.

    :param str path: file path to open
    :param str mode: the mode to open the file (see :func:`open`)
    :returns file: the opened file object
    '''
    u = UniversalDetector()
    with open(path, 'rb') as fp:
        bin = first = fp.read(0x1000)
        while not u.done and bin:
            u.feed(bin)
            if not u.done:
                bin = fp.read(0x1000)
    u.close()

    # ROBUSTNESS FIX: also fall back to a plain open when chardet could
    # not determine any encoding (result['encoding'] is None).
    if not first or not u.result['encoding']:
        return open(path, mode)

    fp = codecs.open(path, mode, encoding=u.result['encoding'])
    # Skip a leading BOM so callers never see it.
    for bom in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8,
                codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
        if first.startswith(bom):
            fp.seek(len(bom))
            break

    return fp
Ejemplo n.º 5
0
def get_csv_reader(filename, charset=None):
    """Open *filename* and return a UnicodeReader over it.

    When *charset* is not given, the encoding is guessed with chardet;
    the csv dialect is always sniffed from the first 1 KiB.

    NOTE(review): Python 2 style code (``xreadlines``).  The file object
    is intentionally left open: the returned reader keeps using it.
    """
    logger.info("Reading CSV file %s", filename)

    myfile = open(filename, "rb")

    if not charset:
        # Detect encoding
        detector = UniversalDetector()
        for line in myfile.xreadlines():
            detector.feed(line)

            if detector.result["confidence"] > 0.01:
                logger.debug("Result so far: %s", detector.result)

            if detector.done:
                break
        detector.close()
        charset = detector.result["encoding"]

        logger.info("Found encoding %s", charset)

        # Reset the file index
        myfile.seek(0)

    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))

    logger.info("Found dialect %s", dialect)

    # Reset the file index
    myfile.seek(0)

    return UnicodeReader(myfile, dialect=dialect, encoding=charset)
Ejemplo n.º 6
0
def detect_encoding(bytes, encoding=None):
    """Detect encoding of a byte stream.
    """
    # To reduce tabulator import time
    from chardet.universaldetector import UniversalDetector
    if encoding is not None:
        if encoding.lower() == 'utf-8':
            # Promote an explicit utf-8 to utf-8-sig when a BOM is present.
            prefix = bytes.read(len(codecs.BOM_UTF8))
            if prefix == codecs.BOM_UTF8:
                encoding = 'utf-8-sig'
            bytes.seek(0)
        return encoding
    detector = UniversalDetector()
    for _ in range(config.ENCODING_DETECTION_MAX_LINES):
        detector.feed(bytes.readline())
        if detector.done:
            break
    detector.close()
    bytes.seek(0)
    outcome = detector.result
    encoding = outcome['encoding']
    # Discard low-confidence guesses, and never answer plain ascii
    # (the default encoding is a safe superset).
    if outcome['confidence'] < config.ENCODING_DETECTION_MIN_CONFIDENCE:
        encoding = config.DEFAULT_ENCODING
    if encoding == 'ascii':
        encoding = config.DEFAULT_ENCODING
    return encoding
Ejemplo n.º 7
0
def decode(string):
    """ detects string encoding and returns decoded string"""
    detector = UniversalDetector()
    detector.feed(string)
    detector.close()
    return string.decode(detector.result['encoding'])
Ejemplo n.º 8
0
def detect(view, file_name, encoding):
	"""Detect *file_name*'s encoding in the background (Sublime Text).

	A cached encoding short-circuits detection; otherwise up to
	``max_detect_lines`` lines are fed to chardet and the outcome is
	handed to ``check_encoding`` on the main thread.

	NOTE(review): *encoding* is the editor-reported value; a name ending
	in " with BOM" bypasses the cache lookup.
	"""
	if not file_name or not os.path.exists(file_name):
		return
	if not encoding.endswith(' with BOM'):
		encoding = encoding_cache.pop(file_name)
	if encoding:
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	cnt = SETTINGS['max_detect_lines']
	fp = open(file_name, 'rb')
	for line in fp:
		# cut MS-Windows CR code
		line = line.replace(b'\r',b'')
		detector.feed(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	if encoding:
		encoding = encoding.upper()
	confidence = detector.result['confidence']
	sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def listTextBasedFiles(file):
    """Print ``<file>: <mime>, <encoding>, <N> lines`` for text files.

    Files whose libmagic MIME type is not ``text/*`` are reported and
    skipped; MIME detection failures are reported and abort early.
    """
    try:
        # Detect MIME type for file
        # https://github.com/kaito834/myNotes/blob/master/snippets/python/magic_from_file.py
        # https://github.com/ahupp/python-magic#usage
        f_mimetype = magic.from_file(file, mime=True)
    except Exception as e:
        # BUG FIX: the original fell through with f_mimetype undefined
        # (NameError); stop once the failure has been reported.
        print("[!] Exception: {0} ({1})".format(e, type(e)))
        return

    # Open and count lines if MIME type of the file is text/*
    if f_mimetype.split('/')[0] == 'text':
        # Detect encoding by chardet.universaldetector.UniversalDetector()
        # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage
        detector = UniversalDetector()
        with open(file, 'rb') as f:
            for line in f:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()

        with open(file, "r", encoding=detector.result['encoding']) as f:
            # Idiomatic line count; avoids materialising readlines().
            line_count = sum(1 for _ in f)
        print("{0}: {1}, {2}, {3} lines".format(file, f_mimetype, detector.result['encoding'], line_count))
    else:
        # BUG FIX: corrected "txet" typo in the user-facing message.
        print("{0}: NOT text based file (reason: MIME type isn't text/*: {1})".format(file, f_mimetype))
Ejemplo n.º 10
0
def get_unicode_content(file_path, encoding=None):
    """
    Return a unicode string of the files contents using the given encoding.  If no encoding is given
    then chardet will be used to determine the encoding.
    Note that this uses the chardet library and may cause problems, if an error is thrown then
    a utf-8 encoding is assumed and unrecognize caracters are discarded.
    """
    from chardet.universaldetector import UniversalDetector

    try:
        if not encoding:
            # No encoding supplied: sniff it from the raw bytes.
            detector = UniversalDetector()
            contents = ''
            with open(file_path, 'rb') as f:
                contents = f.read()
                detector.feed(contents)
            detector.close()
            determined_encoding = detector.result['encoding']
            # NOTE(review): determined_encoding may be None for
            # undecidable input; decode() would then raise TypeError,
            # which the UnicodeError handler below does NOT catch.
            return contents.decode(encoding=determined_encoding)
        else:
            # Python 2 style: `unicode` builtin with explicit encoding.
            with open(file_path, 'r') as f:
                return unicode(f.read(), encoding=encoding, errors='ignore')
    except UnicodeError:
        # Last resort: assume utf-8 and drop undecodable characters.
        with open(file_path, 'r') as f:
            return unicode(f.read(), encoding='utf-8', errors='ignore')
Ejemplo n.º 11
0
def detect(view, file_name, encoding):
    """Detect *file_name*'s encoding in the background (Sublime Text).

    A cached encoding short-circuits detection; otherwise up to
    ``max_detect_lines`` lines are fed to chardet and the outcome is
    handed to ``check_encoding`` on the main thread.
    """
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(" with BOM"):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status("origin_encoding", "Detecting encoding, please wait..."), 0)
    detector = UniversalDetector()
    cnt = SETTINGS["max_detect_lines"]
    fp = open(file_name, "rb")
    for line in fp:
        # cut MS-Windows CR code
        line = line.replace(b"\r", b"")
        detector.feed(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result["encoding"]
    if encoding:
        encoding = encoding.upper()
        # Widen to supersets: chardet often reports the narrower family
        # member (BIG5 -> BIG5-HKSCS, GB2312 -> GBK).
        if encoding == "BIG5":
            encoding = "BIG5-HKSCS"
        elif encoding == "GB2312":
            encoding = "GBK"
    confidence = detector.result["confidence"]
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
Ejemplo n.º 12
0
def detect_encoding(file_path):
    """Guess the character encoding of the file at *file_path* with chardet."""
    detector = UniversalDetector()
    with open(file_path, 'rb') as stream:
        for chunk in stream:
            detector.feed(chunk)
        detector.close()
        return detector.result['encoding']
Ejemplo n.º 13
0
    def detectEncoding(self, parseMeta=True, chardet=True):
        """Determine the character encoding of ``self.rawStream``.

        Tried in order: BOM sniffing (also consumes the BOM), HTML meta
        parsing (when *parseMeta*), chardet (when *chardet* and the
        library is importable), then ``self.defaultEncoding``.

        Returns:
            (encoding, confidence): confidence is "certain" only for a
            BOM match, "tentative" otherwise.
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                # Feed fixed-size chunks until chardet decides or EOF.
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                # Rewind: detection consumed the stream.
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = lookupEncoding(self.defaultEncoding)

        return encoding, confidence
Ejemplo n.º 14
0
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    detector = UniversalDetector()
    for raw in lines:
        # Stop early once chardet is certain - especially cheap when the
        # input starts with a BOM.
        detector.feed(bytearray(raw))
        if detector.done:
            break
    detector.close()
    outcome = detector.result
    if PY2:
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if outcome['encoding']:
        return '{0}: {1} with confidence {2}'.format(
            name, outcome['encoding'], outcome['confidence'])
    return '{0}: no result'.format(name)
Ejemplo n.º 15
0
def get_coding(text, force_chardet=False):
    """
    Function to get the coding of a text.
    @param text text to inspect (string)
    @return coding string
    """
    if not force_chardet:
        # PEP 263 style: look for a coding cookie in the first two lines.
        for line in text.splitlines()[:2]:
            try:
                result = CODING_RE.search(to_text_string(line))
            except UnicodeDecodeError:
                # This could fail because to_text_string assume the text
                # is utf8-like and we don't know the encoding to give
                # it to to_text_string
                pass
            else:
                if result:
                    codec = result.group(1)
                    # sometimes we find a false encoding that can
                    # result in errors
                    if codec in CODECS:
                        return codec

    # Fallback using chardet
    # NOTE(review): only the first two lines are fed to chardet, which is
    # fast but may limit detection accuracy - confirm this is intended.
    if is_binary_string(text):
        detector = UniversalDetector()
        for line in text.splitlines()[:2]:
            detector.feed(line)
            if detector.done: break

        detector.close()
        return detector.result['encoding']

    return None
def validate_csv(document):
    """Return dialect information about given csv file."""
    with open(document.document.uri, 'rU') as csvfile:
        is_valid = False
        try:
            # Sniff the dialect from the first 1 KiB of the file.
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        universal_detector = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        try:
            # Strict re-parse of the whole file; each re-joined row is
            # fed to chardet for the encoding guess.
            for row in csv.reader(csvfile, dialect):
                joined = dialect.delimiter.join(row)
                universal_detector.feed(joined.encode('utf-8'))
            is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
        finally:
            universal_detector.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': universal_detector.result['encoding'],
        'is_valid': is_valid
    }
def detect(view, file_name):
	"""Detect *file_name*'s encoding and initialise the view (Sublime Text).

	A cached encoding short-circuits detection.  Results with confidence
	below 0.7 are reported to the user instead of being applied.

	NOTE(review): Python 2 code - uses the ``file`` builtin.
	"""
	if not os.path.exists(file_name):
		return
	encoding = encoding_cache.pop(file_name)
	if encoding:
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	cnt = SETTINGS['max_detect_lines']
	fp = file(file_name, 'rb')
	for line in fp:
		detector.feed(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	confidence = detector.result['confidence']
	if not encoding or confidence < 0.7:
		sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Encoding can not be detected, please choose one manually. (%s/%.2f)' % (encoding, confidence)), 0)
		return
	encoding = encoding.upper()
	# Widen to supersets: chardet often reports the narrower family member.
	if encoding == 'BIG5':
		encoding = 'BIG5-HKSCS'
	elif encoding == 'GB2312':
		encoding = 'GBK'
	sublime.set_timeout(lambda: init_encoding_vars(view, encoding), 0)
Ejemplo n.º 18
0
def validate_csv(f):
    """Return dialect information about given csv file.

    Sniffs the dialect from the first 1 KiB, then strictly re-parses the
    whole file to validate it while feeding the rows to chardet for an
    encoding guess.  Returns a dict with 'delimiter', 'encoding' and
    'is_valid' keys.
    """
    with open(f.fullpath, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        u = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        try:
            # NOTE(review): u.feed() receives str rows here (text-mode
            # file) while chardet's API expects bytes - confirm this
            # works with the chardet version in use.
            for row in reader:
                u.feed(dialect.delimiter.join(row))
            is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
        finally:
            u.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': u.result['encoding'],
        'is_valid': is_valid
    }
def process_buffer(buf, d):
    """Parse one raw response buffer and store ``d[url] = (header, html)``.

    *buf* is a list of lines: a request/status line first, then headers,
    then (after two blank lines) the body.  The body is decoded as
    utf-8, falling back to chardet, then to utf-8 with errors ignored.
    """
    if not buf:
        return
    header = buf[0]
    # The URL is the second whitespace-separated token of the first line.
    url = header.split()[1]
    skip = 0
    empty_lines = 0
    # Advance past two blank lines (the end of the header block).
    # NOTE(review): raises IndexError when buf contains fewer than two
    # blank lines - presumably guaranteed by the caller; confirm.
    while empty_lines < 2:
        skip += 1
        if not buf[skip].strip():
            empty_lines += 1

    rawhtml = "".join(buf[skip + 1:])
    html = None
    try:
        html = rawhtml.decode("utf-8")
    except:
        try:
            # utf-8 failed: let chardet guess from the body lines.
            detector = UniversalDetector()
            for line in buf[skip + 1:]:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result
            html = rawhtml.decode(encoding["encoding"])
        except:
            # Last resort: force utf-8 and drop undecodable characters.
            html = rawhtml.decode("utf-8", errors='ignore')
    assert html is not None, "Error processing %s\n" % rawhtml
    html = html.replace(r"\r", "")
    d[url] = (header, html)
Ejemplo n.º 20
0
def decode(filename, data):
    """Decode playlist *data* according to *filename*'s extension.

    Returns a dict ``{'data': ..., 'encoding': ...}``; *data* is decoded
    for m3u8/m3u/pls files and passed through otherwise.
    """
    if '.m3u8' in filename:
        encoding = 'utf-8'
        data = data.decode(encoding)
    elif '.m3u' in filename or '.pls' in filename:
        try:
            encoding = 'ISO-8859-2'
            data = data.decode(encoding)
        except:
            if chardet:
                u = UniversalDetector()
                u.feed(data)
                u.close()
                if u.result['confidence'] > 0.5:
                    try:
                        # BUG FIX: the original read `result['encoding']`
                        # (NameError); the detection lives in u.result.
                        encoding = u.result['encoding']
                        data = data.decode(encoding)
                    except:
                        encoding = 'ascii'
                else:
                    encoding = 'ascii'
            else:
                encoding = 'ascii'
    elif '.xml' in filename or '.xspf' in filename:
        encoding = 'utf-8'
    else:
        # BUG FIX: with no recognised extension `encoding` was unbound
        # and the return below raised; report the data as-is instead.
        encoding = 'ascii'

    return {'data' : data, 'encoding' : encoding}
Ejemplo n.º 21
0
def transferToEncoding(filename, toCode):
	"""Convert *filename* in place to the *toCode* encoding.

	The source encoding is sniffed with chardet (post-processed by
	``gb``); nothing is rewritten when the file already matches the
	target.  Returns False for directories and empty files.
	(Python 2 code: print statements, ``unicode`` builtin.)
	"""
	if os.path.isdir(filename):
		print "error:not file"
		return False

	try:
		detector = UniversalDetector()
		f = open(filename, 'r')
		ls = f.readlines()
		f.close()

		# An empty file gives chardet nothing to detect, so bail out with a hint.
		if len(ls) == 0: 
			print printRed(filename), printRed(' is blank file, can not detect encoding')
			return False;

		# Sniff the encoding.
		for l in ls:
			detector.feed(l)
			if detector.done: break
		detector.close()
		
		encode = gb(detector.result['encoding'])
		if encode.lower() != toCode.lower():
			f = open(filename, 'w')
			print printGreen(filename) + ' ====> ' + toCode + ' SUCCESS'
			for l in ls:
				f.write(unicode(l, encode).encode(toCode))
			f.close()
		else:
			pass		
	except Exception, e:
		traceback.print_exc()
		print 'exception'
Ejemplo n.º 22
0
    def deserialize(file_bytes):
        """Parse *file_bytes* as a ';'- or tab-delimited CSV.

        Decodes as utf-8 first, falling back to chardet; raises
        ValueError when the encoding cannot be guessed or when neither
        ';' nor tab delimits the first line.  Returns a csv.reader over
        the decoded lines.
        """
        try:
            file_string = file_bytes.decode('utf-8')
        except UnicodeDecodeError as ude:
            # Not utf-8: let chardet have a look, line by line.
            detector = UniversalDetector()
            for chunk in BytesIO(file_bytes):
                detector.feed(chunk)
                if detector.done:
                    break
            detector.close()
            if detector.result['confidence'] < 0.5:
                raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files.")
            try:
                file_string = file_bytes.decode(detector.result['encoding'])
            except UnicodeDecodeError:
                raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files. "
                                 "(The invalid character is '{char:#x}' at {pos})".format(pos=ude.start,
                                                                                          char=file_bytes[ude.start]))
        csv_lines = file_string.splitlines()
        sample = csv_lines[:1]
        # Probe the first line with each candidate delimiter.
        if len(next(csv.reader(sample, delimiter="\t"))) > 1:
            return csv.reader(csv_lines, delimiter="\t")
        if len(next(csv.reader(sample, delimiter=";"))) > 1:
            return csv.reader(csv_lines, delimiter=";")
        raise ValueError("Csv file is not delimited by ';' or 'tab'")
Ejemplo n.º 23
0
def detect_encoding(file):
    """Return the chardet-detected encoding of the file at path *file*."""
    detector = UniversalDetector()
    # BUG FIX: the original leaked the file handle opened inline; a
    # context manager closes it, and iterating streams instead of
    # materialising readlines().
    with open(file, 'rb') as fp:
        for line in fp:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
Ejemplo n.º 24
0
    def _guessEncoding(self, path):
        """Opens a file from the given `path` and checks the file encoding.

        The file must exists on the file system and end with the extension
        `.csv`. The file is read line by line until the encoding could be
        guessed.
        On a successfull identification, the widgets of this dialog will be
        updated.

        Args:
            path (string): Path to a csv file on the file system.

        """
        if os.path.exists(path) and path.lower().endswith('csv'):
            encodingDetector = UniversalDetector()
            # BUG FIX: read in binary mode - chardet expects bytes, and a
            # text-mode read would itself need the (unknown) encoding.
            # The context manager also closes the previously-leaked handle.
            with open(path, 'rb') as fp:
                for line in fp:
                    encodingDetector.feed(line)
                    if encodingDetector.done:
                        break
            encodingDetector.close()
            result = encodingDetector.result['encoding']
            if result is None:
                # ROBUSTNESS FIX: undetectable input (e.g. empty file)
                # previously crashed on .replace(); leave widgets as-is.
                return
            result = result.replace('-','_')

            self._encodingKey = _calculateEncodingKey(result)
            if self._encodingKey:
                index = self._encodingComboBox.findText(result.upper())
                self._encodingComboBox.setCurrentIndex(index)
def sanitize_texts(directory):
    """
    Strip all header and copyright information from downloaded text files in the
    specified directory using gutenberg.strip_headers module and ensure proper
    file encodings.

    :param directory: <String> A string containing the full path to directory containing files to strip
    :return:
    """

    for item in os.listdir(directory):
        file_path = os.path.join(directory, item)
        if not os.path.isfile(file_path):
            continue

        # Sniff the file's encoding first (chardet; can take a while).
        with open(file_path, 'rb') as inf:
            raw_lines = inf.readlines()
        detector = UniversalDetector()
        for raw_line in raw_lines:
            detector.feed(raw_line)
            if detector.done:
                break
        detector.close()
        encoding = detector.result['encoding']

        # Re-read as text, strip the boilerplate, rewrite in place.
        with open(file_path, 'r', encoding=encoding) as inf:
            stripped = strip_headers(inf.read()).strip()
        os.remove(file_path)
        with open(file_path, 'w+', encoding=encoding) as outf:
            outf.write(stripped)
Ejemplo n.º 26
0
def detect_local_charset(filepath):
    """Guess *filepath*'s encoding; return a codecs-style name or None.

    Only answers when chardet's confidence exceeds 0.75.
    (Python 2 code: print statements.)
    """
    global VERBOSE
    # Open to read in binary.
    fp = open(filepath, "rb")
    detector = UniversalDetector()

    if VERBOSE:
        print "Reading file to detect encoding..."

    for line in fp:
        # Strip CRs so Windows line endings do not skew detection.
        line = line.replace(b'\r',b'')
        detector.feed(line)
        if detector.done:
            break

    fp.close()
    detector.close()

    if VERBOSE:
        print "Encoding: %s" % detector.result["encoding"]
        print "Confidence: {0:.0f}% ".format(detector.result["confidence"]*100)

    if detector.result["confidence"] > 0.75:
        encoding = detector.result["encoding"]
        return encoding.replace('-','_').lower() # Format for codecs
    else:
        return None
Ejemplo n.º 27
0
def detect_encoding(f, verbose=False):
    """Detects a file's encoding.

    Args:
        f (obj): The file like object to detect.
        verbose (Optional[bool]): Print the raw chardet result (default: False).

    Returns:
        dict: The encoding result

    Examples:
        >>> filepath = p.join(DATA_DIR, 'test.csv')
        >>> with open(filepath, 'rb') as f:
        ...     result = detect_encoding(f)
        ...     result == {'confidence': 0.99, 'encoding': 'utf-8'}
        True
    """
    start = f.tell()
    detector = UniversalDetector()
    for chunk in f:
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    # Rewind so the caller can re-read from where it started.
    f.seek(start)

    if verbose:
        print('result', detector.result)

    return detector.result
def detect(view, file_name, cnt):
	"""Detect *file_name*'s encoding, feeding at most *cnt* lines to chardet.

	A cached encoding short-circuits detection; the outcome is handed to
	``check_encoding`` on the main thread (Sublime Text plugin code).
	"""
	#traceback.print_stack()
	print("detect...")
	if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
		return
	encoding = encoding_cache.pop(file_name)
	
	if encoding:
		print("it is already at cache encoding_cache.json:",encoding)
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	fp = open(file_name, 'rb')
	for line in fp:
		# cut MS-Windows CR code
		line = line.replace(b'\r',b'')
		detector.feed(line)
		#print(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	print(encoding)
	if encoding:
		encoding = encoding.upper()
	confidence = detector.result['confidence']
	print(confidence)
	sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
Ejemplo n.º 29
0
def guessWithChardet(content):
    """Feed each element of *content* to chardet and return the raw result dict."""
    detector = UniversalDetector()
    for piece in content:
        detector.feed(piece)
    detector.close()
    return detector.result
Ejemplo n.º 30
0
def getEncoding(filename):
    """Return the chardet-detected encoding of *filename*."""
    # BUG FIX: read raw bytes - chardet expects bytes, and a text-mode
    # read would itself require knowing the encoding (it errors under
    # Python 3).  The context manager also closes the handle reliably.
    with open(filename, 'rb') as fp:
        orig_content = fp.read()
    detector = UniversalDetector()
    detector.feed(orig_content)
    detector.close()
    return detector.result["encoding"]
Ejemplo n.º 31
0
def detect_encoding(filename):
    """
    Detect encoding of `filename`, which can be a ``str`` filename, a
    ``file``-like object, or ``bytes``.
    """
    # Try with Unix file utility first because it's faster (~10ms vs 100ms)
    if isinstance(filename, str) and not filename.endswith(Compression.all):
        try:
            with subprocess.Popen(('file', '--brief', '--mime-encoding', filename),
                                  stdout=subprocess.PIPE) as process:
                process.wait()
                if process.returncode == 0:
                    encoding = process.stdout.read().strip()
                    # file only supports these encodings; for others it says
                    # unknown-8bit or binary. So we give chardet a chance to do
                    # better
                    if encoding in (b'utf-8', b'us-ascii', b'iso-8859-1',
                                    b'utf-7', b'utf-16le', b'utf-16be', b'ebcdic'):
                        return encoding.decode('us-ascii')
        except OSError:
            pass  # windoze

    # file not available or unable to guess the encoding, have chardet do it
    detector = UniversalDetector()
    # We examine only first N 4kB blocks of file because chardet is really slow
    MAX_BYTES = 4*1024*12

    def _from_file(f):
        # Low-confidence guesses (< .85) fall back to utf-8.
        detector.feed(f.read(MAX_BYTES))
        detector.close()
        return (detector.result.get('encoding')
                if detector.result.get('confidence', 0) >= .85 else
                'utf-8')

    if isinstance(filename, str):
        with open_compressed(filename, 'rb') as f:
            return _from_file(f)
    elif isinstance(filename, bytes):
        detector.feed(filename[:MAX_BYTES])
        detector.close()
        return detector.result.get('encoding')
    elif hasattr(filename, 'encoding'):
        # Already-open text file: trust its declared encoding.
        return filename.encoding
    else:  # assume file-like object that you can iter through
        return _from_file(filename)
Ejemplo n.º 32
0
    def detect_data(self, req, csvfile, csvsep=',', csvdel='"', csvcode='utf-8', jsonp='callback'):
        """Preview-parse an uploaded CSV file.

        Detects the encoding when *csvcode* is 'auto', widens GB2312 to
        GBK, strips any BOM by rewriting the file buffer in place, and
        returns the parsed rows - or a JSONP error payload on parse
        failure.  (Python 2 code.)
        """
        #detect encoding
        if csvcode == 'auto':
            u = UniversalDetector()
            for line in csvfile:
                u.feed(line)
            u.close()
            csvcode = u.result['encoding'].lower()
            csvfile.seek(0)

        # gb2312 gbk hz-gb-2312 hz-gb
        if csvcode == 'gb2312': csvcode = 'gbk'
        if 'hz' in csvcode: csvcode = 'hz'

        #remove bom
        if 'utf' in csvcode:
            if 'utf-8' in csvcode:
                contents = csvfile.read().decode('utf-8-sig').encode('utf-8')
                csvcode = 'utf-8'
            #FIXME not support utf-16
            if 'utf-16' in csvcode:
                contents = csvfile.read().decode(csvcode).encode('utf-16')
                csvcode = 'utf-16'
            #FIXME not support utf-32
            if 'utf-32' in csvcode:
                contents = csvfile.read().decode('utf-32be').encode('utf-32')
                csvcode = 'utf-32'
            csvfile.truncate(0)
            csvfile.write(contents)
            csvfile.seek(0)

        try:
            data = list(csv.reader(
                csvfile, quotechar=str(csvdel), delimiter=str(csvsep)))
        except csv.Error, e:
            csvfile.seek(0)
            return '<script>window.top.%s(%s);</script>' % (
                jsonp, simplejson.dumps({'error': {
                    'message': 'Error parsing CSV file: %s' % e,
                    # decodes each byte to a unicode character, which may or
                    # may not be printable, but decoding will succeed.
                    # Otherwise simplejson will try to decode the `str` using
                    # utf-8, which is very likely to blow up on characters out
                    # of the ascii range (in range [128, 256))
                    'preview': csvfile.read(200).decode('iso-8859-1')}}))
def my_open_source_file(
    path
):  # copied from C:\ProgramData\CAST\CAST\Extensions\com.castsoftware.sqlscript.1.2.0-alpha1\analyser.py
    """Open *path* for text reading, auto-detecting its encoding with chardet.

    Returns an open text-mode file object; the caller is responsible for
    closing it. When chardet cannot decide (result encoding is None),
    open() falls back to the locale default, matching the original
    behaviour.
    """
    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    # Feed the raw bytes line by line; stop as soon as chardet is confident.
    with open(path, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    # (Removed a commented-out debug line that was also invalid syntax.)
    return open(path, 'r', encoding=detector.result['encoding'])
def find_response_encoding(response):
    """Return the charset declared in the HTML body when present;
    otherwise estimate the page's encoding with chardet.
    """
    r = response.body
    encoding = html_body_declared_encoding(r)
    if encoding:
        return encoding
    else:
        # No declared charset: sniff the body incrementally (Python 2,
        # hence cStringIO over the raw byte string).
        my_stringio = cStringIO.StringIO(r)
        my_detector = UniversalDetector()
        for x in my_stringio:
            my_detector.feed(x)
            if my_detector.done:
                break
        my_detector.close()
        return my_detector.result['encoding']
Ejemplo n.º 35
0
def guess_encoding(file_bytes: bytes) -> str:
    """Guess the encoding using chardet's UniversalDetector, fed
    incrementally until it reaches a confident answer.

    The original annotated the parameter as ``bytes`` but then called
    ``readlines()`` on it, which bytes objects do not have. This version
    honours the documented contract (raw bytes) while remaining backward
    compatible with binary file-like objects.

    Args:
        file_bytes (bytes): raw bytes, or a binary file-like object
            exposing ``readlines()``

    Returns:
        str: the detected encoding name (None when detection fails)
    """
    detector = UniversalDetector()
    if isinstance(file_bytes, (bytes, bytearray)):
        # Raw bytes: feed fixed-size chunks so large inputs still allow
        # the detector to stop early.
        for start in range(0, len(file_bytes), 4096):
            detector.feed(file_bytes[start:start + 4096])
            if detector.done:
                break
    else:
        # Backward compatible path for file-like objects.
        for line in file_bytes.readlines():
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
Ejemplo n.º 36
0
def open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True):
    """Open *file* like io.open(), auto-detecting the text encoding with
    chardet when the caller does not supply one.

    NOTE: deliberately shadows the built-in open() in this module.

    Fixes over the original: an explicitly passed ``encoding`` is now
    honoured instead of being silently ignored; the raw detection handle
    is closed even on error; the file is streamed instead of fully
    materialised with readlines().
    """
    if encoding is None:
        detector = UniversalDetector()
        # Read raw bytes only for detection, then reopen in the requested mode.
        with io.open(file, mode='rb') as rawdata:
            for line in rawdata:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result["encoding"]

    return io.open(file, mode=mode, buffering=buffering,
                   encoding=encoding,
                   errors=errors,
                   newline=newline,
                   closefd=closefd)
Ejemplo n.º 37
0
class Encoding(object):
    """Thin reusable wrapper around chardet's UniversalDetector."""

    def __init__(self):
        self.detector = UniversalDetector()

    def _detect(self, data):
        # Reset so the shared detector can serve repeated calls.
        self.detector.reset()
        self.detector.feed(data)
        self.detector.close()
        return self.detector.result

    def detect(self, data, safe=False):
        """Detect the encoding of *data*.

        When *safe* is true, detection errors yield None instead of
        propagating; otherwise the exception is re-raised.
        """
        try:
            return self._detect(data)
        except Exception:
            # Fix: was a bare `except:`, which also swallowed SystemExit
            # and KeyboardInterrupt when safe=True.
            if safe:
                return None

            raise
Ejemplo n.º 38
0
def id_encodefile(file__):
    """
    Find encoding of file using chardet.

    Fixes over the original: the file is opened in binary mode (on
    Python 3, feeding text-mode str lines to chardet fails, since feed()
    expects bytes) and the handle is closed via a context manager.
    """

    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()

    with open(file__, 'rb') as fh:
        for l in fh:
            detector.feed(l)

            if detector.done:
                break

    detector.close()

    return detector.result['encoding']
Ejemplo n.º 39
0
def read_file(archive, name):
    """Yield the rows of CSV member *name* of *archive* as dicts.

    The member's text encoding is sniffed with chardet before parsing.
    Yields nothing when the member does not exist in the archive.
    """
    try:
        with archive.open(name) as member:
            sniffer = UniversalDetector()
            for raw_line in member:
                sniffer.feed(raw_line)
                if sniffer.done:
                    break
            sniffer.close()
            member.seek(0)
            wrapper = io.TextIOWrapper(member,
                                       encoding=sniffer.result['encoding'])
            with wrapper as text_stream:
                for row in csv.DictReader(text_stream):
                    yield row
    except KeyError:
        # member is absent from the archive
        return
Ejemplo n.º 40
0
	def detect(self, begin_line, end_line):
		"""Sniff the encoding of the given view line range and, when chardet
		is confident enough, trigger the convert_text_to_utf8 command."""
		begin_line = int(begin_line)
		end_line = int(end_line)
		begin_point = self.view.text_point(begin_line + 1, 0)
		end_point = self.view.text_point(end_line, 0) - 1
		region = sublime.Region(begin_point, end_point)
		content = self.get_text(region)
		if not content:
			return
		detector = UniversalDetector()
		detector.feed(content)
		detector.close()
		encoding = detector.result['encoding']
		confidence = detector.result['confidence']
		# Fix: chardet may return None when it cannot decide; the original
		# crashed with AttributeError on encoding.upper().
		if not encoding:
			return
		encoding = encoding.upper()
		if confidence < SETTINGS['confidence'] or encoding in SKIP_ENCODINGS:
			return
		self.view.run_command('convert_text_to_utf8', {'begin_line': begin_line, 'end_line': end_line, 'encoding': encoding})
def get_file_encode(file_name: str) -> str:
    """Return the detected text encoding of *file_name*.

    On a missing file or a csv error, print a message and terminate the
    process (original behaviour).
    """
    detector = UniversalDetector()
    try:
        with open(file_name, mode='rb') as stream:
            while not detector.done:
                chunk = stream.readline()
                if not chunk:
                    break
                detector.feed(chunk)
        detector.close()
        return detector.result['encoding']
    except FileNotFoundError:
        print("%s file is not found!" % (file_name))
        sys.exit()
    except csv.Error as err:
        print(err)
        sys.exit()
Ejemplo n.º 42
0
def translate():
    """Interactively pick a file from ./Source, detect its encoding with
    chardet, send its text to the Yandex Translate API (target: Russian),
    and write/print the result under ./Result.

    The source language is derived from the first two letters of the
    file name.
    """
    detector = UniversalDetector()
    source = 'Source'
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), source)
    file_list = os.listdir(path)

    result_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Result')

    if not os.path.exists(result_folder):
        os.makedirs(result_folder)

    print("Для перевода доступны следующиее файлы:")
    for i in file_list:
        print(i)

    file_name = input('Укажите, какой файл из указанных выше Вы хотитк перевести:')
    file_path = os.path.join(path, file_name)

    # Sniff the encoding before reopening the file in text mode.
    with open(file_path, 'rb') as file:
        for line in file:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        code_type = detector.result['encoding']
        print('Файл {} выполнен в кодировке {}' .format(file_name, code_type))

    to_lang = os.path.basename(file_path).lower()[0]+os.path.basename(file_path).lower()[1]

    with open(file_path, encoding=code_type) as translate_file:
        params = {
            'key': API_KEY,
            # Fix: send the file's contents; the original passed the file
            # object itself, which requests stringifies to its repr.
            'text': translate_file.read(),
            'lang': '{}-ru'.format(to_lang)
        }
        response = requests.get(URL, params=params)
        json_ = response.json()

        with open(os.path.join(result_folder, 'Translate_{}'.format(file_name)), 'w', encoding='utf-8') as new_file:
            new_file.write(''.join(json_['text']))
        print('Результат перевода:')
        with open(os.path.join(result_folder, 'Translate_{}'.format(file_name)), 'r', encoding='utf-8') as read_result:
            for line in read_result:
                pprint(line)
Ejemplo n.º 43
0
    def insert_resets(csv_path):
        """Load named-account password resets from the CSV at *csv_path*,
        committing one NamedAccountPasswordReset row per record.

        Rows whose 7th column is 'aux-support' or 'aux-testUser' are
        skipped; rows whose date fails to parse are silently dropped.
        """

        # Attempt to detect file encoding
        detector = UniversalDetector()
        file = codecs.open(csv_path, 'rb')
        for line in file:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        file.close()
        # Fall back to utf-8 when chardet produced no usable result.
        encoding = detector.result['encoding'] if detector.result else 'utf-8'

        with codecs.open(csv_path, 'r', encoding) as named_resets_file:

            # Check if CSV file has a header row.
            sniffer = csv.Sniffer()
            has_header = sniffer.has_header(named_resets_file.read(2048))
            named_resets_file.seek(0)
            named_resets = csv.reader(named_resets_file)
            iter_resets = iter(named_resets)

            if has_header:
                next(iter_resets)

            # Expected column order (by index): agent, acct, acct_location,
            # reset_date, reset_day, reset_type, type — inferred from the
            # constructor call below.
            for r in iter_resets:
                if r[6] not in ['aux-support', 'aux-testUser']:
                    try:
                        named_password_resets = NamedAccountPasswordReset(
                            agent=r[0],
                            acct=r[1],
                            acct_location=r[2] if r[2] else '00099',
                            reset_date=parse(r[3],
                                             dayfirst=False,
                                             yearfirst=True,
                                             fuzzy=True,
                                             ignoretz=True),
                            reset_day=r[4],
                            reset_type=r[5],
                            type=r[6] if r[6] else 'Unknown')
                        db.session.add(named_password_resets)
                        db.session.commit()
                    except ValueError:
                        # Unparseable date (or bad field): skip this row.
                        pass
 def extract_raw_text(self, filePath, encoding):
     """Return the text content of *filePath* as a single string, with
     line endings normalised to '\\n'.

     When *encoding* is "(auto-detect)", the encoding is sniffed with
     chardet first. The file is read in CHUNK_LENGTH pieces, flushed to
     the result every CHUNK_NUM chunks.

     NOTE(review): the UnicodeError handler references `progressBar` and
     `myFiles`, which are not parameters or attributes visible here —
     presumably module/outer-scope names; verify they exist at runtime.
     On that path the method reports the error via self.infoBox and
     returns None.
     """
     if encoding == "(auto-detect)":
         detector = UniversalDetector()
         fh = open(filePath, 'rb')
         for line in fh:
             detector.feed(line)
             if detector.done: break
         detector.close()
         fh.close()
         encoding = detector.result['encoding']
     # mode='rU' (universal newlines) is deprecated in Python 3 but kept
     # here byte-identical.
     fh = open(
         filePath,
         mode='rU',
         encoding=encoding,
     )
     try:
         i = 0
         fileContent = ""
         chunks = list()
         for chunk in iter(lambda: fh.read(CHUNK_LENGTH), ""):
             # splitlines + join normalises \r\n and \r to \n.
             chunks.append('\n'.join(chunk.splitlines()))
             i += CHUNK_LENGTH
             if i % (CHUNK_NUM * CHUNK_LENGTH) == 0:
                 fileContent += "".join(chunks)
                 chunks = list()
         if len(chunks):
             fileContent += "".join(chunks)
         del chunks
         return fileContent
     except UnicodeError:
         progressBar.finish()
         if len(myFiles) > 1:
             message = u"Please select another encoding "    \
                     + u"for file %s." % filePath
         else:
             message = u"Please select another encoding."
         self.infoBox.setText(message, 'error')
         self.send('Text data', None, self)
         self.controlArea.setDisabled(False)
         return
     finally:
         fh.close()
Ejemplo n.º 45
0
class LDictText:
    """Text to be analysed for word usage."""

    def __init__(self, fileName):
        self.fileName = fileName
        self.detector = UniversalDetector()

    def getWords(self):
        """Return an iterator of regex matches over the words in the text.

        The file encoding is sniffed with chardet first; undecodable
        characters are replaced rather than raising.
        """
        self.detector.reset()
        # Fix: both file handles were leaked in the original.
        with open(self.fileName, 'rb') as rawFile:
            for line in rawFile:
                self.detector.feed(line)
                if self.detector.done:
                    break
        self.detector.close()
        with open(self.fileName, 'r',
                  encoding=self.detector.result['encoding'],
                  errors='replace') as inFile:
            t = inFile.read()
        # Raw string avoids invalid-escape warnings; pattern unchanged.
        patt = r"[\w][\w\d-]*"
        return re.finditer(patt, t, flags=re.I)
def check_encoding(file_path):
    """Return the encoding of *file_path* as detected by chardet.

    NOTE(review): when open() raises, the error is only printed and the
    final line still reads detector.result — chardet initialises the
    result mapping on construction, so this presumably returns None
    rather than raising; confirm against the chardet version in use.
    """
    try:
        detector = UniversalDetector()
        with open(file_path, mode='rb') as f:
            for binary in f:
                detector.feed(binary)
                # Stop once enough of the file has been read for a
                # confident guess.
                if detector.done:
                    break
            detector.close()
            
    # The file did not exist.
    except FileNotFoundError as e: 
       print('ファイルが見つかりません', e)
    # Any other unexpected error.
    except Exception as e: 
       print('予期せぬエラーです', e)
    
    return detector.result['encoding'] 
Ejemplo n.º 47
0
    def get_encoding(full_path: str) -> str:
        """Detect and return the text encoding of the file at *full_path*.

        Raises ProcessingError (chained to the underlying exception)
        when anything goes wrong during detection.
        """
        try:
            sniffer = UniversalDetector()
            with open(full_path, 'rb') as source:
                for raw_line in source:
                    sniffer.feed(raw_line)
                    if sniffer.done:
                        break
                sniffer.close()
            return sniffer.result['encoding']
        except BaseException as miss:
            raise ProcessingError('Encoding detection error') from miss
Ejemplo n.º 48
0
def get_file_encoding(filename):
    """
    Utility function to incrementally detect the file encoding.

    :param filename: Filename for the file to determine the encoding for. Str
    :return: A dict with the keys 'encoding' and 'confidence', or None if
        the file could not be read.
    """
    detector = UniversalDetector()
    try:
        with open(filename, 'rb') as detect_file:
            # Feed 1 KiB at a time until the detector is confident or EOF.
            chunk = detect_file.read(1024)
            while chunk and not detector.done:
                detector.feed(chunk)
                chunk = detect_file.read(1024)
            detector.close()
        return detector.result
    except OSError:
        log.exception('Error detecting file encoding')
Ejemplo n.º 49
0
def get_detail_line(root, filepath):
    """Build a metadata row for *filepath* as a list of strings:
    [md5, size, mimetype, detected encoding, path relative to *root*]."""
    md5check = md5(filepath)
    filetype = mimetypes.guess_type(filepath)[0]
    purepath = '/' + filepath[len(root):]
    filesize = os.path.getsize(filepath)
    # encoding detection — fix: the original leaked the file handle.
    detector = UniversalDetector()
    with open(filepath, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding']
    return [
        str(md5check),
        str(filesize),
        str(filetype),
        str(encoding),
        str(purepath)
    ]
Ejemplo n.º 50
0
def detect_encoding(filename):
    """
    Takes a filename and attempts to detect the character encoding of the file
    using `chardet`.

    :param filename: Name of the file to process
    :type filename: string

    :returns: encoding : string
    """
    sniffer = UniversalDetector()
    with open(filename, 'rb') as unknown_file:
        raw = unknown_file.readline()
        while raw and not sniffer.done:
            sniffer.feed(raw)
            raw = unknown_file.readline()
    sniffer.close()

    return sniffer.result['encoding']
Ejemplo n.º 51
0
def getFileCod(file):
    """Determine the encoding of *file* with chardet.

    Returns the encoding name (None when chardet cannot decide).
    Typical usage: detect the codec once up front, then decode the
    file's lines with it while reading.
    """
    sniffer = UniversalDetector()
    with open(file, 'rb') as handle:
        raw = handle.readline()
        while raw:
            sniffer.feed(raw)
            if sniffer.done:
                # Confident enough — no need to read further.
                break
            raw = handle.readline()
        sniffer.close()
    return sniffer.result['encoding']
Ejemplo n.º 52
0
def get_encoding(data):
    """Try to get encoding incrementally.

    See http://chardet.readthedocs.org/en/latest/usage.html#example-detecting-encoding-incrementally  # noqa
    """
    started_at = time.time()
    logger.info('detecting file encoding...')
    detector = UniversalDetector()
    # Feed the bytes line by line; stop once the detector is confident.
    for chunk in io.BytesIO(data):
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    message = 'encoding found in %s sec' % str(time.time() - started_at)
    logger.info(message + str(detector.result))
    return detector.result
Ejemplo n.º 53
0
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.
    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    detector = UniversalDetector()
    for raw in lines:
        detector.feed(raw)
    detector.close()
    verdict = detector.result
    if not verdict['encoding']:
        return '{0}: no result'.format(name)
    return '{0}: {1} with confidence {2}'.format(
        name, verdict['encoding'], verdict['confidence'])
Ejemplo n.º 54
0
    def open(self):
        """Open self.path as a text stream, sniffing and caching its codec
        on first use.

        The detected codec is preferred in its "sloppy-" variant
        (presumably registered by the ftfy package — TODO confirm),
        falling back to the plain codec when that lookup fails.

        NOTE(review): if chardet returns None, `"sloppy-" + codec` raises
        TypeError and `file` is left open; readlines() also loads the
        whole file despite the early break.
        """
        file = self.path.open(mode="rb")

        if not self._codec:
            detector = UniversalDetector()
            for line in file.readlines():
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            # Rewind so the wrapper below starts at the beginning.
            file.seek(0)
            codec = detector.result["encoding"]

            try:
                self._codec = codecs.lookup("sloppy-" + codec)
            except LookupError:
                self._codec = codecs.lookup(codec)

        return TextIOWrapper(file, encoding=self._codec.name)
Ejemplo n.º 55
0
def CheckPattern(ownname,FileName):
	"""Validate *FileName* against the JSON pattern file named after
	*ownname* and return a JSON-ish status string with encoding info,
	error info and the columns of the CSV header row.

	Fixes over the original: the JSON config handle and the CSV handle
	are closed via context managers, and the duplicated cp1251/utf-8
	header-parsing branches are merged.
	"""
	jsonFName = ownname.split('.')[0] + '.json'
	# "r" instead of "r+": the config is only read, never written.
	with open(jsonFName, "r") as json_data:
		JD = json.loads(json_data.read())
	extension = '*.'+FileName.split('.')[-1]
	for key,val in JD.items():
		# NOTE: returns on the FIRST non-matching pattern, so effectively
		# only one entry is checked (behaviour preserved).
		patFN = val["FileName"]
		if extension!=patFN:
			return 'none'
	detector = UniversalDetector()
	with open(FileName, 'rb') as fh:
		for line in fh:
			detector.feed(line)
			if detector.done:
				break
		detector.close()
	code=detector.result
	enc='"EncodingInfo" :("'+code['encoding']+'")'
	err='"ErrorInfo" :("Ошибок нет")'
	col_lst = '"Colls" : ['
	# Both supported encodings parse the header identically; only the
	# codec passed to open() differs.
	codec = {'windows-1251': 'cp1251', 'utf-8': 'utf-8'}.get(code['encoding'])
	if codec is not None:
		with open(FileName, 'r', encoding=codec) as csvf:
			line1 = next(csv.reader(csvf))
		for coll in line1:
			dlm=coll.find(';')
			if dlm>0:
				err='"ErrorInfo" :("Неверный формат")'
			else:
				col_lst += '"%s",' % (coll)
	else:
		err='"ErrorInfo" :("Кодировка не поддерживается")'
	col_lst = col_lst.rstrip(",") + "]"
	return '{"ScriptName" :("csv_l"), "TargetInfo" : ("Сценарий загрузки Акта сверки (csv)"), '+enc+','+err+','+col_lst+'}'
Ejemplo n.º 56
0
def _encoding_heuristic(stream,
                        confidence_threshold,
                        encoding_preferences=('1252', '8859-1'),
                        encoding_excludes=('koi', 'mik', 'iscii', 'tscii',
                                           'viscii', 'jis', 'gb', 'big5',
                                           'hkscs', 'ks', 'euc', '2022'),
                        encoding_default='ascii',
                        error_no_encoding=True):
    """Return a pair of (encoding, confidence) for *stream*, with Dutch-market
    heuristics layered on top of chardet.

    Encoding_preferences overrule other code-pages to prevent mistakes.
    Should be changed in other countries than NL!!!

    The stream position is restored before returning. Raises ValueError
    when confidence ends below *confidence_threshold* and
    *error_no_encoding* is true.
    """
    pos = stream.tell()
    detector = UniversalDetector()
    i = 0
    size = 2**10
    # Feed at most 8 chunks, doubling the chunk size up to 64 KiB, until
    # chardet is confident.
    while not (detector.done or i > 7):
        detector.feed(stream.read(size))
        i += 1
        if size < 2**16: size <<= 1
    stream.seek(pos)
    certain = detector.done
    detector.close()
    # Only trust chardet's numbers if it finished on its own.
    confidence, encoding = (detector.result['confidence'],
                            detector.result['encoding']) if certain else (0.0,
                                                                          None)
    if not encoding:
        encoding = encoding_default
    elif confidence < confidence_threshold + .5 * (1 - confidence_threshold) and \
       (any([base in encoding.lower() for base in ('windows', 'iso')]) and not
        any([cp in encoding for cp in encoding_preferences])):
        # Heuristic: ANSI (mbcs) is much more probable than exotic windows/iso encoding:
        encoding = encoding_default
    # Exotic (mostly East-Asian/Cyrillic) codecs are treated as misdetections.
    if any([enc in encoding.lower() for enc in encoding_excludes]):
        encoding = encoding_default
        confidence = 0.0
    if encoding == 'ascii':
        # Plain ASCII is very improbable. Windows is superset of ascii, so does not hurt.
        # NOTE(review): 'mbcs' is a Windows-only codec — this presumably
        # runs on Windows only; confirm.
        encoding = 'mbcs'
    if confidence < confidence_threshold and error_no_encoding:
        raise ValueError(
            'Heuristic determination of encoding failed: {:.1%} confidence in "{}", {:.1%} required.'
            .format(confidence, encoding, confidence_threshold))
    return encoding, confidence
Ejemplo n.º 57
0
def detect_encoding(filename):
    """
    Detects the encoding of a file.

    Parameters
    ----------
    filename: str
        Name of the file to detect the encoding.

    Returns
    -------
    encoding: str
        The encoding of the file.

    """
    detector = UniversalDetector()
    # Fix: the original leaked the file handle and read the entire file
    # into memory via readlines(). Stream it and stop once the detector
    # is confident — feed() is a no-op after done, so the result is
    # unchanged.
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
Ejemplo n.º 58
0
def detect_file_encode(file):
    """Detect the character encoding of *file*.

    Returns chardet's full result dict (keys: 'encoding', 'confidence').
    The detector is always closed, even when reading fails.
    """
    detector = UniversalDetector()

    try:
        with open(file, mode='rb') as f:
            # Read line by line until EOF or until the detector reaches
            # a confident answer.
            for raw in iter(f.readline, b''):
                detector.feed(raw)
                if detector.done:
                    break
    finally:
        detector.close()

    return detector.result
Ejemplo n.º 59
0
def detect_encoding(filepath):
    """
    Given a path to a CSV of unknown encoding
    read lines to detects its encoding type

    :param filepath: Filepath to check
    :type  filepath: str

    :return: Example `{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}`
    :rtype: dict
    """

    sniffer = UniversalDetector()
    with io.open(filepath, 'rb') as stream:
        for raw in stream:
            sniffer.feed(raw)
            if sniffer.done:
                break
    sniffer.close()
    return sniffer.result
Ejemplo n.º 60
0
def guess_file_encoding(path):
    """Sniff the encoding of the file at *path* with chardet.

    Returns 'utf-8' unless chardet is fully confident (confidence >= 1)
    in a non-None, non-ascii answer.
    """
    sniffer = UniversalDetector()

    with open(path, 'rb') as handle:
        for raw in handle:
            sniffer.feed(raw)
            if sniffer.done:
                break

    sniffer.close()
    verdict = sniffer.result

    # De Morgan of the original acceptance test: reject when the guess is
    # missing, plain ascii, or not fully confident.
    if (verdict['encoding'] is None
            or verdict['encoding'] == 'ascii'
            or verdict['confidence'] < 1):
        return 'utf-8'

    return verdict['encoding']