from typing import BinaryIO

from chardet.universaldetector import UniversalDetector


def detect_encoding(f: BinaryIO, limit: int = 2500) -> str:
    """
    Return the encoding of the provided input stream.

    Most of the time it's UTF-8, but if we are unable to decode the input
    natively, use `chardet` to determine the encoding heuristically.
    """
    unicode_decodable = True
    for line_no, line in enumerate(f):
        try:
            line.decode("utf-8")
        except UnicodeDecodeError:
            unicode_decodable = False
            break
        if line_no > limit:
            break

    if unicode_decodable:
        return "utf-8"

    f.seek(0)
    u = UniversalDetector()
    for line_no, line in enumerate(f):
        u.feed(line)
        if u.done or line_no > limit:
            break
    u.close()
    return u.result["encoding"]

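# A minimal usage sketch for detect_encoding above; "data.csv" is a
# hypothetical file name used purely for illustration.
with open("data.csv", "rb") as fh:
    enc = detect_encoding(fh)
    fh.seek(0)  # rewind before re-reading with the detected encoding
    text = fh.read().decode(enc)
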
def _get_dict_reader(self, resource_data_fp):
    '''
    Returns a dict reader for the given source file pointer. If the encoding of the
    source hasn't been guessed yet then this function will read from the file pointer
    until EOF and guess the encoding. The file pointer is reset to the start after
    this occurs.

    :param resource_data_fp: the file pointer to the source
    :return: a DictReader object
    '''
    if self.encoding is None:
        with ensure_reset(resource_data_fp):
            detector = UniversalDetector()
            while True:
                chunk = resource_data_fp.read(8192)
                if chunk:
                    detector.feed(chunk)
                else:
                    detector.close()
                    break
            self.encoding = detector.result[u'encoding']
        # if the detector failed to work out the encoding (unlikely) or if the encoding
        # it comes up with is ASCII, just default to UTF-8 (UTF-8 is a superset of ASCII)
        if self.encoding is None or self.encoding == u'ASCII':
            self.encoding = u'utf-8'

    # create and return the dict reader
    line_iterator = iter_universal_lines(resource_data_fp, self.encoding)
    return unicodecsv.DictReader(line_iterator, dialect=self.dialect,
                                 encoding=self.encoding)

def guessWithChardet(content):
    u = UniversalDetector()
    for line in content.split(b"\n"):
        u.feed(line)
    u.close()
    result = u.result
    return result

def detect_encoding(f):
    detector = UniversalDetector()
    detector.reset()
    for line in f:
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result["encoding"]

def detect_encoding(path):
    detector = UniversalDetector()
    detector.reset()
    with open(path, "rb") as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result["encoding"]

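# Example call for the path-based variant above; the path is hypothetical,
# and chardet may return None when it cannot reach a verdict.
print(detect_encoding("/tmp/unknown_encoding.txt"))  # e.g. "utf-8" or "SHIFT_JIS"
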
def detect_encoding(the_file):
    '''Detects file encoding using chardet, feeding lines until the detector
    is confident or the file is exhausted.
    '''
    detector = UniversalDetector()
    for line in the_file.readlines():
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result

def _file_encoding(self):
    detector = UniversalDetector()
    for chunk in self.chunks:
        for line in chunk.splitlines():
            detector.feed(line)
            if detector.done:
                break
        # stop reading chunks too once the detector has reached a verdict
        if detector.done:
            break
    detector.close()
    encoding = detector.result['encoding']
    return encoding

def SetEncoding(self):
    if os.path.isfile(self.filename):
        from cchardet import UniversalDetector
        detector = UniversalDetector()
        with open(self.filename, "rb") as bigdata:
            for line in bigdata:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        result = detector.result
        # keep the current encoding if detection came back empty
        self.encoding = result.get('encoding') or self.encoding

def detect_encoding(f: BinaryIO, limit: int = 2500) -> str:
    u = UniversalDetector()
    for line in f:
        u.feed(line)
        limit -= 1
        if u.done or limit < 1:
            break
    u.close()
    encoding = u.result["encoding"]
    # chardet reports plain ASCII input as "ascii"; treat it as UTF-8 (a
    # superset), and fall back to UTF-8 when detection fails outright
    if encoding is None or encoding.lower() == "ascii":
        return "utf-8"
    return encoding

def detect_encoding_with_chardet(iostream, min_lines=10,
                                 max_lines=anycsvconfig.NO_SNIFF_LINES):
    detector = UniversalDetector()
    if max_lines != -1:
        max_lines = max(min_lines, max_lines)
    c = 0
    for line in iostream:
        c += 1
        detector.feed(line)
        # always feed at least min_lines; past that, stop once the detector is
        # done or the line budget is exhausted (note: when max_lines == -1 the
        # comparison is always true, so the loop stops right after min_lines)
        if c > min_lines and (detector.done or c >= max_lines):
            break
    detector.close()
    return detector.result  # {'encoding': ..., 'confidence': ...}

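# Sketch of consuming the full result dict returned above; unlike the other
# helpers this one also exposes chardet's confidence score. The file name is
# hypothetical and the 0.5 threshold is an arbitrary choice for the example.
result = detect_encoding_with_chardet(open("table.csv", "rb"))
if result["encoding"] is None or result["confidence"] < 0.5:
    encoding = "utf-8"  # no (or low-confidence) guess: fall back to a default
else:
    encoding = result["encoding"]
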
def detect_convert(filename):
    detector = UniversalDetector()
    detector.reset()
    cache = b''
    with open(filename, 'rb') as f:
        for line in f:
            detector.feed(line)
            cache += line
            if detector.done:
                break
        detector.close()
        encoding = detector.result['encoding'] or args.fallback_enc
        cache = cache.decode(encoding, errors='ignore')
        cache += f.read().decode(encoding, errors='ignore')
    cf = convertfunc(cache, args.locale, args.locale_only)
    return cf(cache)

def read_file(usr_fil, log=None):
    """Reads a file and returns its contents as a string.

    :param usr_fil: Required - Input file
    :param log: Optional - Log file
    :return: str: The file contents as a string
    """
    out = ""
    try:
        print(usr_fil)
        # Detect file encoding
        det = UniversalDetector()
        with open(usr_fil, 'rb') as file:
            for line in file:
                det.feed(line)
                if det.done:
                    break
        det.close()
        usr_enc = det.result.get("encoding")
        out += f"File encoding: {usr_enc}\n"
        # Open file using the detected encoding, falling back to UTF-8
        with open(usr_fil, 'r', encoding=usr_enc or 'UTF-8',
                  errors='ignore') as text_read:
            text = text_read.read()
        return text
    except IOError:
        out += "\nFile reading failed\n"
        return False
    finally:
        # Log info
        if log is not None:
            log.write(out)
        else:
            print(out, end='')

def convert(asmfile, opt, verbose, stdout) -> None:
    encoding = "utf-8"
    requires_manual_conversion = False
    try:
        with open(asmfile, "r") as f:
            text = f.readlines()
    except Exception:
        detector = UniversalDetector()
        with open(asmfile, "rb") as raw:
            for line in raw:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encod = detector.result
        if encod["confidence"] is not None and encod["confidence"] >= 0.5:
            encoding = encod["encoding"]
        else:
            # if confidence is low (or missing), try japanese
            encoding = "SHIFT_JIS"
        try:
            if verbose:
                print(f"Guessed encoding {encoding}, will try to parse now...")
            with open(asmfile, "r", encoding=encoding) as f:
                text = f.readlines()
        except Exception as e:
            raise e  # propagate

    bw_defs = []
    outputfile = asmfile.replace(".asm", "_sa1.asm")
    outfile = open(outputfile, "w", encoding=encoding)
    stdout.write(bytes(f"Processing file {asmfile}:\n", encoding=encoding))
    outlines = []
    if opt:
        outfile.write("incsrc conv_defines.asm\n")
    tot_conversions = 0
    for index, line in enumerate(text, start=1):
        outlines.append("")
        data_types = ["db", "dw", "dl", "dd"]
        in_comment = False
        in_data = False
        define_found = re.match(
            r"![A-Za-z\d_]+\s+=\s+((\$)?[\dA-Fa-f]{2,6})\S*", line.strip())
        words = re.split(r"([ \t;])", line.rstrip())
        if line.strip() == "" or line.lstrip().startswith(";") or define_found:
            # shortcuts for comments, blank lines and defines
            if define_found:
                requires_manual_conversion = True
                is_hex = define_found.group(2) is not None
                value = int(
                    define_found.group(1).replace("$", "0x")
                    if is_hex else define_found.group(1),
                    16 if is_hex else 10,
                )
                if value == 12:
                    stdout.write(
                        bytes(
                            f"There is define {define_found.group(0)} at line {index} which is equal to 12,"
                            f" this might be a define related to how many sprites can be loaded by the game;"
                            f" if so, change it to 22 or $16, or (even better) use the following\n"
                            f"\tif read1($00FFD5) == $23\n\t\t{define_found.group(0)}\n\telse\n\t\t"
                            f'{define_found.group(0).split("=")[0]}= {"$16" if is_hex else "22"}\n\tendif\n',
                            encoding=encoding,
                        ))
                elif is_hex and value in sprite_addr_list:
                    stdout.write(
                        bytes(
                            f"There is define {define_found.group(0)} at line {index} which is a sprite "
                            f"address; usually replacing the $ with ! works in most tools. It didn't get "
                            f"converted automatically because it might not be necessary to do so, so make "
                            f"sure to convert it manually ONLY if needed.\n",
                            encoding=encoding,
                        ))
                elif 0x0100 <= value <= 0x1FFF:
                    stdout.write(
                        bytes(
                            f"There is define {define_found.group(0)} at line {index} which might be a ram"
                            f" address; if it is, convert it by adding |!addr at the end of it, if it's not"
                            f" a ram address leave it alone\n",
                            encoding=encoding,
                        ))
            outlines[index - 1] = line.rstrip()
            continue
        ignore_next_address = False
        for og_word in words:
            stripped_word = og_word.strip()
            to_insert = ""
            if in_comment or in_data:
                pass
            elif stripped_word.startswith(";"):
                in_comment = True
            elif any(stripped_word.startswith(a) for a in data_types):
                in_data = True
            elif stripped_word.startswith("PEA") or stripped_word.startswith("PER"):
                ignore_next_address = True
            elif addr := re.findall(r"\$[\da-fA-F]{1,6}\|![a-zA-Z\d_]+\b", og_word):
                stdout.write(
                    bytes(
                        f"Possibly address {addr[0]} at line {index} was already hybrid.\n",
                        encoding=encoding,
                    ))
            elif re.findall(r"\$[^, \n()\[\]]{1,6}", og_word):
                if ignore_next_address:
                    ignore_next_address = False
                    outlines[index - 1] += og_word
                    continue
                splitted = re.split(r"([\[\](), ])", og_word)
                word_tuples = []
                for i, word in enumerate(splitted):
                    if word.startswith("$"):
                        try:
                            proc_word = eval(word.replace("$", "0x"))
                            # +\-^*~<> are some asar math ops
                            expr = re.split(r"[+\\\-^*~<>|]", word.replace("$", ""))
                            word = "${:0{}X}".format(proc_word,
                                                     max(len(e) for e in expr))
                            word_tuples.append((WordType.ADDR, word, i))
                        except SyntaxError:
                            bunch = re.split(r"([+\-^*~<>| ])", word)
                            for w in bunch:
                                if w.startswith("$"):
                                    word_tuples.append((WordType.ADDR, w, i))
                                else:
                                    word_tuples.append((WordType.OTHER, w, i))
                    elif word.startswith(","):
                        word_tuples.append((WordType.COMMA, word, i))
                    else:
                        word_tuples.append((WordType.OTHER, word, i))
                for wordtype, word, i in word_tuples:
                    if wordtype == WordType.ADDR:
                        try:
                            try:
                                comma_index = (i + 1
                                               if word_tuples[i + 1][0] == WordType.COMMA
                                               else -1)
                            except IndexError:
                                comma_index = -1
                            (ww, bwram_define_needed, converted,
                             manual_conversion) = process_word(
                                 word.replace("$", ""), stdout, encoding,
                                 index, splitted, comma_index)
                            if manual_conversion:
                                requires_manual_conversion = True
                            if converted:
                                tot_conversions += 1
                                stdout.write(
                                    bytes(f"Conversion: {word} -> {ww}\n",
                                          encoding=encoding))
                            bw_defs.append(bwram_define_needed)
                            to_insert += ww
                        except ValueError:
                            to_insert += word
                    else:
                        to_insert += word
            outlines[index - 1] += to_insert if to_insert != "" else og_word

import os
import sys

from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
arg = sys.argv[1:]
if arg:
    fns = list(filter(os.path.isfile, arg))
else:
    fns = ['-']
for fn in fns:
    if fn == '-':
        stream = sys.stdin.buffer
    else:
        stream = open(fn, 'rb')
    detector.reset()
    cache = []
    # feed lines until the detector is confident, keeping them for replay
    for line in stream:
        detector.feed(line)
        cache.append(line)
        if detector.done:
            break
    detector.close()
    encoding = detector.result['encoding'] or 'utf-8'
    # first emit the cached lines, then stream the remainder of the input
    for line in cache:
        sys.stdout.write(line.decode(encoding, errors='replace'))
    for line in stream:
        sys.stdout.write(line.decode(encoding, errors='replace'))
    stream.close()
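
# Assumed invocation of the script above (the script name is illustrative):
#   python recode_to_stdout.py notes.txt legacy.csv
#   cat legacy.csv | python recode_to_stdout.py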