def detect(view, file_name, encoding):
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(" with BOM"):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status("origin_encoding", "Detecting encoding, please wait..."), 0)
    detector = UniversalDetector()
    cnt = SETTINGS["max_detect_lines"]
    fp = open(file_name, "rb")
    for line in fp:
        # cut MS-Windows CR code
        line = line.replace(b"\r", b"")
        detector.feed(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result["encoding"]
    if encoding:
        encoding = encoding.upper()
        if encoding == "BIG5":
            encoding = "BIG5-HKSCS"
        elif encoding == "GB2312":
            encoding = "GBK"
    confidence = detector.result["confidence"]
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def listTextBasedFiles(file):
    try:
        # Detect MIME type for file
        # https://github.com/kaito834/myNotes/blob/master/snippets/python/magic_from_file.py
        # https://github.com/ahupp/python-magic#usage
        f_mimetype = magic.from_file(file, mime=True)
    except Exception as e:
        print("[!] Exception: {0} ({1})".format(e, type(e)))
        # Bail out: f_mimetype is undefined past this point
        return
    # Open and count lines if MIME type of the file is text/*
    if f_mimetype.split('/')[0] == 'text':
        # Detect encoding by chardet.universaldetector.UniversalDetector()
        # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage
        detector = UniversalDetector()
        with open(file, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        with open(file, "r", encoding=detector.result['encoding']) as f:
            line_count = 0
            for line in f.readlines():
                line_count += 1
        print("{0}: {1}, {2}, {3} lines".format(file, f_mimetype, detector.result['encoding'], line_count))
    else:
        print("{0}: NOT text based file (reason: MIME type isn't text/*: {1})".format(file, f_mimetype))
def detect_encoding(bytes, encoding=None):
    """Detect encoding of a byte stream.
    """
    # To reduce tabulator import time
    from chardet.universaldetector import UniversalDetector
    if encoding is not None:
        if encoding.lower() == 'utf-8':
            prefix = bytes.read(len(codecs.BOM_UTF8))
            if prefix == codecs.BOM_UTF8:
                encoding = 'utf-8-sig'
            bytes.seek(0)
        return encoding
    detector = UniversalDetector()
    num_lines = config.ENCODING_DETECTION_MAX_LINES
    while num_lines > 0:
        line = bytes.readline()
        detector.feed(line)
        if detector.done:
            break
        num_lines -= 1
    detector.close()
    bytes.seek(0)
    confidence = detector.result['confidence']
    encoding = detector.result['encoding']
    # Do not use if not confident
    if confidence < config.ENCODING_DETECTION_MIN_CONFIDENCE:
        encoding = config.DEFAULT_ENCODING
    # Default to utf-8 for safety
    if encoding == 'ascii':
        encoding = config.DEFAULT_ENCODING
    return encoding
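# Hedged usage sketch for detect_encoding() above. The stream must be a
# seekable binary file object, and the function assumes a module-level
# `config`; the stand-in class and its values below are assumptions for
# illustration, not the original settings.
import codecs
import io

class config:  # stand-in for the config module the snippet assumes
    ENCODING_DETECTION_MAX_LINES = 100
    ENCODING_DETECTION_MIN_CONFIDENCE = 0.5
    DEFAULT_ENCODING = 'utf-8'

stream = io.BytesIO(codecs.BOM_UTF8 + b'hello\n')
print(detect_encoding(stream, encoding='utf-8'))  # 'utf-8-sig': BOM spotted
stream.seek(0)
print(detect_encoding(stream))  # detector result, e.g. 'UTF-8-SIG'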
def description_of(file_path, name='stdin', byte=1000000):
    """ Return a string describing the probable encoding of a file. """
    from chardet.universaldetector import UniversalDetector
    file = open(file_path, 'rb')
    u = UniversalDetector()
    i = 0
    for line in file:
        l = len(line)
        if i + l > byte:
            # Only feed up to the byte budget, then stop reading.
            bytes_to_read = byte - i
            u.feed(line[:bytes_to_read])
            break
        else:
            bytes_to_read = l
            u.feed(line)
        i += bytes_to_read
    file.close()
    u.close()
    result = u.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (name, result['encoding'], result['confidence'])
    else:
        return '%s: no result' % name
def guessWithChardet(content):
    u = UniversalDetector()
    for line in content:
        u.feed(line)
    u.close()
    result = u.result
    return result
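# Hedged usage sketch for guessWithChardet() above: content is expected to be
# an iterable of byte strings, e.g. a file opened in binary mode. The file
# name is hypothetical.
with open('some_file.txt', 'rb') as f:
    result = guessWithChardet(f)
print(result)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}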
def detect_encoding(file):
    detector = UniversalDetector()
    with open(file, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def get_file_encoding(file_name):
    if not os.path.isfile(file_name):
        return ""
    u = UniversalDetector()
    with open(file_name, "rb") as f:
        for index, line in enumerate(f):
            u.feed(line)
            if index > 500:
                break
    u.close()
    if u.result["encoding"].lower() == "gb2312":
        # GB2312 is a subset of GBK, which is a subset of GB18030: try each
        # in turn and fall back to the wider encoding on failure.
        try:
            _file = codecs.open(file_name, encoding="gb2312")
            _file.readlines()
            result = "gb2312"
        except Exception as e:
            print e
            try:
                _file = codecs.open(file_name, encoding="gbk")
                _file.readlines()
                result = "gbk"
            except Exception as e:
                print e
                result = "gb18030"
    else:
        result = u.result["encoding"]
    return result
def get_unicode_content(file_path, encoding=None):
    """
    Return a unicode string of the file's contents using the given encoding.
    If no encoding is given, chardet will be used to determine the encoding.
    Note that this uses the chardet library and may cause problems; if an
    error is thrown, a utf-8 encoding is assumed and unrecognized characters
    are discarded.
    """
    from chardet.universaldetector import UniversalDetector
    try:
        if not encoding:
            detector = UniversalDetector()
            contents = ''
            with open(file_path, 'rb') as f:
                contents = f.read()
            detector.feed(contents)
            detector.close()
            determined_encoding = detector.result['encoding']
            return contents.decode(determined_encoding)
        else:
            with open(file_path, 'r') as f:
                return unicode(f.read(), encoding=encoding, errors='ignore')
    except UnicodeError:
        with open(file_path, 'r') as f:
            return unicode(f.read(), encoding='utf-8', errors='ignore')
def _guessEncoding(self, path):
    """Opens a file from the given `path` and checks the file encoding.

    The file must exist on the file system and end with the extension
    `.csv`. The file is read line by line until the encoding could be
    guessed. On a successful identification, the widgets of this dialog
    will be updated.

    Args:
        path (string): Path to a csv file on the file system.

    """
    if os.path.exists(path) and path.lower().endswith('csv'):
        encodingDetector = UniversalDetector()
        # Open in binary mode: UniversalDetector.feed() expects bytes.
        with open(path, 'rb') as fp:
            for line in fp:
                encodingDetector.feed(line)
                if encodingDetector.done:
                    break
        encodingDetector.close()
        result = encodingDetector.result['encoding']
        result = result.replace('-', '_')
        self._encodingKey = _calculateEncodingKey(result)
        if self._encodingKey:
            index = self._encodingComboBox.findText(result.upper())
            self._encodingComboBox.setCurrentIndex(index)
def validate_csv(f):
    """Return dialect information about given csv file."""
    with open(f.fullpath, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        u = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        try:
            for row in reader:
                u.feed(dialect.delimiter.join(row))
                is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
        finally:
            u.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': u.result['encoding'],
        'is_valid': is_valid
    }
def get_coding(text, force_chardet=False):
    """
    Function to get the coding of a text.

    @param text text to inspect (string)
    @return coding string
    """
    if not force_chardet:
        for line in text.splitlines()[:2]:
            try:
                result = CODING_RE.search(to_text_string(line))
            except UnicodeDecodeError:
                # This could fail because to_text_string assumes the text
                # is utf8-like and we don't know the encoding to give
                # it to to_text_string
                pass
            else:
                if result:
                    codec = result.group(1)
                    # sometimes we find a false encoding that can
                    # result in errors
                    if codec in CODECS:
                        return codec

    # Fallback using chardet
    if is_binary_string(text):
        detector = UniversalDetector()
        for line in text.splitlines()[:2]:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        return detector.result['encoding']

    return None
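# Hedged sketch of the module-level helpers get_coding() assumes: a CODING_RE
# matching a PEP 263 coding declaration ("# -*- coding: <name> -*-") and a
# CODECS whitelist. The exact pattern and list contents are assumptions, not
# the original code.
import re

CODING_RE = re.compile(r"coding[:=]\s*([-\w.]+)")
CODECS = ['utf-8', 'latin-1', 'iso-8859-1', 'iso-8859-15', 'ascii', 'cp1252']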
def decode_data(data, encoding_guess, can_be_binary=True):
    """Given string data, return an (is_text, data) tuple, where data is
    returned as unicode if we think it's text and were able to determine an
    encoding for it.

    If can_be_binary is False, then skip the initial is_binary check.
    """
    if not (can_be_binary and is_binary_string(data[:1024])):
        try:
            # Try our default encoding.
            data = data.decode(encoding_guess)
            return True, data
        except UnicodeDecodeError:
            # Fall back to chardet - chardet is really slow, which is why we
            # don't just do chardet from the start.
            detector = UniversalDetector()
            for chunk in ichunks(80, data):
                detector.feed(chunk)
                if detector.done:
                    break
            detector.close()
            if detector.result['encoding']:
                try:
                    data = data.decode(detector.result['encoding'])
                    return True, data
                except (UnicodeDecodeError, LookupError):
                    # Either we couldn't decode or chardet gave us an encoding
                    # that python doesn't recognize (yes, it can do that).
                    pass
    # Leave data as str.
    return False, data
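# Hedged sketch of the ichunks() helper decode_data() relies on, written to be
# consistent with how it is called above (chunk size first, then the data);
# the original implementation is not shown in this collection.
def ichunks(size, data):
    """Yield successive fixed-size chunks of data."""
    for offset in range(0, len(data), size):
        yield data[offset:offset + size]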
def decode(string):
    """ detects string encoding and returns decoded string"""
    u = UniversalDetector()
    u.feed(string)
    u.close()
    result = u.result
    return string.decode(result['encoding'])
def decode(filename, data):
    # Default so `encoding` is defined even for unmatched filenames (the
    # original left it unbound in that case).
    encoding = 'ascii'
    if '.m3u8' in filename:
        encoding = 'utf-8'
        data = data.decode(encoding)
    elif '.m3u' in filename or '.pls' in filename:
        try:
            encoding = 'ISO-8859-2'
            data = data.decode(encoding)
        except:
            if chardet:
                u = UniversalDetector()
                u.feed(data)
                u.close()
                if u.result['confidence'] > 0.5:
                    try:
                        # Bug fix: the original read `result['encoding']`;
                        # the detector's result lives on `u.result`.
                        encoding = u.result['encoding']
                        data = data.decode(encoding)
                    except:
                        encoding = 'ascii'
                else:
                    encoding = 'ascii'
            else:
                encoding = 'ascii'
    elif '.xml' in filename or '.xspf' in filename:
        encoding = 'utf-8'
    return {'data': data, 'encoding': encoding}
def transferToEncoding(filename, toCode):
    if os.path.isdir(filename):
        print "error:not file"
        return False
    try:
        detector = UniversalDetector()
        f = open(filename, 'r')
        ls = f.readlines()
        f.close()
        # An empty file cannot be detected, so bail out with a message.
        if len(ls) == 0:
            print printRed(filename), printRed(' is blank file, can not detect encoding')
            return False
        # Detect the encoding
        for l in ls:
            detector.feed(l)
            if detector.done:
                break
        detector.close()
        encode = gb(detector.result['encoding'])
        if encode.lower() != toCode.lower():
            f = open(filename, 'w')
            print printGreen(filename) + ' ====> ' + toCode + ' SUCCESS'
            for l in ls:
                f.write(unicode(l, encode).encode(toCode))
            f.close()
        else:
            pass
    except Exception, e:
        traceback.print_exc()
        print 'exception'
def detect(view, file_name, cnt):
    #traceback.print_stack()
    print("detect...")
    if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
        return
    encoding = encoding_cache.pop(file_name)
    if encoding:
        print("it is already at cache encoding_cache.json:", encoding)
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    fp = open(file_name, 'rb')
    for line in fp:
        # cut MS-Windows CR code
        line = line.replace(b'\r', b'')
        detector.feed(line)
        #print(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result['encoding']
    print(encoding)
    if encoding:
        encoding = encoding.upper()
    confidence = detector.result['confidence']
    print(confidence)
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        u = UniversalDetector()
        for line in f:
            u.feed(line)
        u.close()
        return u.result['encoding']
def sanitize_texts(directory):
    """
    Strip all header and copyright information from downloaded text files in
    the specified directory using gutenberg.strip_headers module and ensure
    proper file encodings.

    :param directory: <String> A string containing the full path to directory
        containing files to strip
    :return:
    """
    for item in os.listdir(directory):
        file_path = os.path.join(directory, item)
        if os.path.isfile(file_path):
            # Detect file encoding, takes time to run
            with open(file_path, 'rb') as inf:
                text = inf.readlines()
            detector = UniversalDetector()
            for line in text:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result['encoding']
            # Open file, strip headers, and save result
            with open(file_path, 'r', encoding=encoding) as inf:
                text = inf.read()
            text = strip_headers(text).strip()
            os.remove(file_path)
            with open(file_path, 'w+', encoding=encoding) as outf:
                outf.write(text)
def get_csv_reader(filename, charset=None):
    logger.info("Reading CSV file %s", filename)
    myfile = open(filename, "rb")

    if not charset:
        # Detect encoding
        detector = UniversalDetector()
        for line in myfile.xreadlines():
            detector.feed(line)
            if detector.result["confidence"] > 0.01:
                logger.debug("Result so far: %s", detector.result)
            if detector.done:
                break
        detector.close()
        charset = detector.result["encoding"]
        logger.info("Found encoding %s", charset)
        # Reset the file index
        myfile.seek(0)

    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))
    logger.info("Found dialect %s", dialect)
    # Reset the file index
    myfile.seek(0)

    return UnicodeReader(myfile, dialect=dialect, encoding=charset)
def safe_open(path, mode='r'):
    '''
    Retrieves a file's encoding and returns the opened file. If the opened
    file begins with a BOM, it is read before the file object is returned.
    This allows callers to not have to handle BOMs of files.

    :param str path: file path to open
    :param str mode: the mode to open the file (see :func:`open`)
    :returns file: the opened file object
    '''
    u = UniversalDetector()
    first = None
    with open(path, 'rb') as fp:
        bin = first = fp.read(0x1000)
        while not u.done and bin:
            u.feed(bin)
            if not u.done:
                bin = fp.read(0x1000)
    u.close()

    if not first:
        return open(path, mode)

    fp = codecs.open(path, mode, encoding=u.result['encoding'])
    for bom in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8,
                codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
        if first.startswith(bom):
            fp.seek(len(bom))
            break
    return fp
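# Hedged usage sketch for safe_open() above: a file written with a UTF-8 BOM
# is opened and the BOM is consumed before the handle comes back, so the
# first read starts at the real content. The file name is hypothetical.
with open('bom_file.txt', 'wb') as out:
    out.write(codecs.BOM_UTF8 + b'payload')
fp = safe_open('bom_file.txt')
print(fp.read())  # 'payload' - no BOM character at the front
fp.close()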
def detectEncoding(self, parseMeta=True, chardet=True):
    # First look for a BOM
    # This will also read past the BOM if present
    encoding = self.detectBOM()
    confidence = "certain"
    # If there is no BOM need to look for meta elements with encoding
    # information
    if encoding is None and parseMeta:
        encoding = self.detectEncodingMeta()
        confidence = "tentative"
    # Guess with chardet, if available
    if encoding is None and chardet:
        confidence = "tentative"
        try:
            from chardet.universaldetector import UniversalDetector
            buffers = []
            detector = UniversalDetector()
            while not detector.done:
                buffer = self.rawStream.read(self.numBytesChardet)
                assert isinstance(buffer, bytes)
                if not buffer:
                    break
                buffers.append(buffer)
                detector.feed(buffer)
            detector.close()
            encoding = lookupEncoding(detector.result['encoding'])
            self.rawStream.seek(0)
        except ImportError:
            pass
    # If all else fails use the default encoding
    if encoding is None:
        confidence = "tentative"
        encoding = lookupEncoding(self.defaultEncoding)
    return encoding, confidence
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    u = UniversalDetector()
    for line in lines:
        line = bytearray(line)
        u.feed(line)
        # shortcut out of the loop to save reading further - particularly
        # useful if we read a BOM.
        if u.done:
            break
    u.close()
    result = u.result
    if PY2:
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
                                                     result['confidence'])
    else:
        return '{0}: no result'.format(name)
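# Hedged usage sketch for description_of() above (this is essentially the
# helper behind chardet's own chardetect command-line tool); the file name
# is hypothetical.
with open('unknown.txt', 'rb') as f:
    print(description_of(f, name='unknown.txt'))
# e.g. "unknown.txt: utf-8 with confidence 0.99"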
def validate_csv(document):
    """Return dialect information about given csv file."""
    with open(document.document.uri, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        universal_detector = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        try:
            for row in reader:
                universal_detector.feed(
                    dialect.delimiter.join(row).encode('utf-8'))
                is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
        finally:
            universal_detector.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': universal_detector.result['encoding'],
        'is_valid': is_valid
    }
def process_buffer(buf, d):
    if not buf:
        return
    header = buf[0]
    url = header.split()[1]
    # Skip past two blank lines to find the start of the HTML body.
    skip = 0
    empty_lines = 0
    while empty_lines < 2:
        skip += 1
        if not buf[skip].strip():
            empty_lines += 1
    rawhtml = "".join(buf[skip + 1:])
    html = None
    try:
        html = rawhtml.decode("utf-8")
    except:
        try:
            detector = UniversalDetector()
            for line in buf[skip + 1:]:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result
            html = rawhtml.decode(encoding["encoding"])
        except:
            html = rawhtml.decode("utf-8", errors='ignore')
    assert html is not None, "Error processing %s\n" % rawhtml
    html = html.replace(r"\r", "")
    d[url] = (header, html)
def detect_local_charset(filepath):
    global VERBOSE
    # Open to read in binary.
    fp = open(filepath, "rb")
    detector = UniversalDetector()
    if VERBOSE:
        print "Reading file to detect encoding..."
    for line in fp:
        line = line.replace(b'\r', b'')
        detector.feed(line)
        if detector.done:
            break
    fp.close()
    detector.close()
    if VERBOSE:
        print "Encoding: %s" % detector.result["encoding"]
        print "Confidence: {0:.0f}% ".format(detector.result["confidence"] * 100)
    if detector.result["confidence"] > 0.75:
        encoding = detector.result["encoding"]
        return encoding.replace('-', '_').lower()  # Format for codecs
    else:
        return None
def detect(view, file_name):
    if not os.path.exists(file_name):
        return
    encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    cnt = SETTINGS['max_detect_lines']
    fp = file(file_name, 'rb')
    for line in fp:
        detector.feed(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result['encoding']
    confidence = detector.result['confidence']
    if not encoding or confidence < 0.7:
        sublime.set_timeout(lambda: view.set_status('origin_encoding',
            'Encoding can not be detected, please choose one manually. (%s/%.2f)' % (encoding, confidence)), 0)
        return
    encoding = encoding.upper()
    if encoding == 'BIG5':
        encoding = 'BIG5-HKSCS'
    elif encoding == 'GB2312':
        encoding = 'GBK'
    sublime.set_timeout(lambda: init_encoding_vars(view, encoding), 0)
def detect_encoding(f, verbose=False):
    """Detects a file's encoding.

    Args:
        f (obj): The file like object to detect.
        verbose (Optional[bool]): Print the detector result (default: False).

    Returns:
        dict: The encoding result

    Examples:
        >>> filepath = p.join(DATA_DIR, 'test.csv')
        >>> with open(filepath, 'rb') as f:
        ...     result = detect_encoding(f)
        ...     result == {'confidence': 0.99, 'encoding': 'utf-8'}
        True
    """
    pos = f.tell()
    detector = UniversalDetector()
    for line in f:
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    f.seek(pos)
    if verbose:
        print('result', detector.result)
    return detector.result
def deserialize(file_bytes):
    try:
        file_string = file_bytes.decode('utf-8')
    except UnicodeDecodeError as ude:
        detector = UniversalDetector()
        for line in BytesIO(file_bytes):
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        if detector.result['confidence'] < 0.5:
            raise ValueError("Failed to guess the encoding of the file "
                             "(it's not utf-8). Use utf-8 encoded files.")
        try:
            file_string = file_bytes.decode(detector.result['encoding'])
        except UnicodeDecodeError:
            raise ValueError("Failed to guess the encoding of the file (it's not utf-8). "
                             "Use utf-8 encoded files. "
                             "(The invalid character is '{char:#x}' at {pos})".format(
                                 pos=ude.start, char=file_bytes[ude.start]))
    csv_lines = file_string.splitlines()
    first_line = csv_lines[:1]
    first_row_tab = next(csv.reader(first_line, delimiter="\t"))
    first_row_semicolon = next(csv.reader(first_line, delimiter=";"))
    if len(first_row_tab) > 1:
        rows = csv.reader(csv_lines, delimiter="\t")
    elif len(first_row_semicolon) > 1:
        rows = csv.reader(csv_lines, delimiter=";")
    else:
        raise ValueError("Csv file is not delimited by ';' or 'tab'")
    return rows
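# Hedged usage sketch for deserialize() above: a semicolon-delimited byte
# payload comes back as a csv.reader over the decoded lines.
payload = 'col1;col2\nv1;v2\n'.encode('utf-8')
for row in deserialize(payload):
    print(row)  # ['col1', 'col2'] then ['v1', 'v2']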
def repo_cat_file(self, repo_path, commit_hash, path):
    (commit_hash, path) = self._all_to_utf8(commit_hash, path)
    if not self._path_check_chdir(repo_path, commit_hash, path):
        return ''
    path = self._get_quote_path(path)
    if path.startswith('./'):
        path = path[2:]
    file_type = path.split('.')[-1]
    if file_type in BINARY_FILE_TYPE:
        return u'二进制文件'  # i.e. "binary file"
    stage_file = self._get_stage_file(repo_path, commit_hash, path)
    result = self._read_load_stage_file(stage_file)
    if result is not None:
        return result['blob']
    command = '/usr/bin/git show %s:%s | /usr/bin/head -c 524288' % (commit_hash, path)
    try:
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
        result = check_output(command, shell=True)
        ud = UniversalDetector()
        ud.feed(result)
        ud.close()
        if ud.result['encoding']:
            encoding = ud.result['encoding']
            # Bug fix: the original tested `encoding != 'utf-8' or
            # encoding != 'utf8'`, which is always true; `and` is intended.
            if encoding != 'utf-8' and encoding != 'utf8':
                result = result.decode(encoding).encode('utf-8')
        self._dumps_write_stage_file({'blob': result}, stage_file)
        return result
    except Exception, e:
        logger.exception(e)
def detect(view, file_name, encoding):
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(' with BOM'):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    cnt = SETTINGS['max_detect_lines']
    fp = open(file_name, 'rb')
    for line in fp:
        # cut MS-Windows CR code
        line = line.replace(b'\r', b'')
        detector.feed(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result['encoding']
    if encoding:
        encoding = encoding.upper()
    confidence = detector.result['confidence']
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def dump_dictionary(out_path, lt_dir, tag_dict_path, tag_info_path):
    # dump dictionary, see https://dev.languagetool.org/developing-a-tagger-dictionary
    os.system(
        f"java -cp {lt_dir / 'languagetool.jar'} org.languagetool.tools.DictionaryExporter "
        f"-i {tag_dict_path} -info {tag_info_path} -o {out_path}")

    # the dumped dictionary is sometimes not in utf-8
    detector = UniversalDetector()
    for i, line in enumerate(open(out_path, "rb")):
        detector.feed(line)
        if detector.done or i > 10_000:
            break
    # close() must run even when the file has fewer than 10 000 lines,
    # otherwise the result may be left incomplete (the original only closed
    # the detector inside the loop's break branch).
    detector.close()
    result = detector.result
    print(
        f"Dump was encoded as {result['encoding']} with confidence {result['confidence']}."
    )

    dump_bytes = open(out_path, "rb").read()
    with open(out_path, "w") as f:
        f.write(dump_bytes.decode(result["encoding"]))
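# Hedged usage sketch for dump_dictionary() above: lt_dir must be a
# pathlib.Path, since it is joined with the '/' operator. All paths below
# are hypothetical.
from pathlib import Path

dump_dictionary(
    out_path=Path("dictionary_dump.txt"),
    lt_dir=Path("/opt/languagetool"),
    tag_dict_path=Path("english.dict"),
    tag_info_path=Path("english.info"),
)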
def make_array(dir):
    result = []
    detector = UniversalDetector()
    counter = 0
    for name in os.listdir(dir):
        if name == ".DS_Store":
            continue
        else:
            counter += 1
            name = 'fb2/' + name
            temp = open(name, 'rb')
            detector.reset()
            for line in temp.readlines():
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            print(str(counter) + ') ' + str(detector.result))
            temp = open(name, 'rb').read().decode(detector.result['encoding'])
            result.append(bytes(temp, encoding=detector.result['encoding']))
    # temp = open('example.fb2', 'r', encoding='utf-8').read().replace('\n', ' ')
    # temp = open('kek.xml', 'r', encoding='utf-8').read()
    # result.append(temp)
    return result