def normalise_to_utf8(bytes_or_filepath):
    """Convert any text input with unknown encoding to utf-8.

    Parameters
    ----------
    bytes_or_filepath : bytes or str
        A binary string or path to any text file in any encoding.

    Returns
    -------
    str
        A string with correct utf-8 encoding.

    Raises
    ------
    TypeError
        Input is not of type bytes or a valid path to an existing file.
    """
    if type(bytes_or_filepath) == bytes:
        utf8_str = str(cnm.from_bytes(bytes_or_filepath).best().first())
    elif os.path.isfile(bytes_or_filepath):
        utf8_str = str(cnm.from_path(bytes_or_filepath).best().first())
    else:
        raise TypeError('Input must be bytes or a valid file path')
    return utf8_str
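# A minimal usage sketch for normalise_to_utf8 above, not part of the original
# snippet: it assumes the same imports (`import os` and
# `from charset_normalizer import CharsetNormalizerMatches as cnm`) and a
# hypothetical file path. Detection quality naturally depends on input size.
raw = 'h\xe9llo world!\n'.encode('cp1252')
print(normalise_to_utf8(raw))              # decoded str from the detected charset
# print(normalise_to_utf8('sample.txt'))   # same idea for an on-disk file (hypothetical path)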
def test_encode_decode(self):
    with self.subTest('Encode & Detect UTF-8 WITHOUT SIG SMALL CONTENT'):
        self.assertEqual(
            CnM.from_bytes('h\xe9llo world!\n'.encode('utf_8')).best().first().encoding,
            'utf_8')

    with self.subTest('Encode & Detect GB18030 WITHOUT SIG'):
        self.assertEqual(
            CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode('gb18030')).best().first().encoding,
            'gb18030')

    with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'):
        self.assertEqual(
            CnM.from_bytes(
                (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030')).best().first().encoding,
            'gb18030')

    with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'):
        self.assertEqual(
            CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')).best().first().encoding,
            'utf_8')

    with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'):
        self.assertEqual(
            CnM.from_bytes('我没有埋怨,蹉跎的只是一些时间。'.encode('utf_7')).best().first().encoding,
            'utf_7')

    with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'):
        self.assertEqual(
            CnM.from_bytes(b'\x2b\x2f\x76\x38' +
                           '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')).best().first().encoding,
            'utf_7')

    with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'):
        self.assertEqual(
            CnM.from_bytes(
                'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно,'
                .encode('utf_7')).best().first().encoding,
            'utf_7')

    with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'):
        self.assertEqual(
            CnM.from_bytes(b'\xef\xbb\xbf' +
                           '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')).best().first().encoding,
            'utf_8')

    with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
        self.assertEqual(
            CnM.from_bytes(
                'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, '
                'поне що се отнася до началното и основното образование.'
                .encode('utf_8')).best().first().encoding,
            'utf_8')

    with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
        self.assertEqual(
            CnM.from_bytes('Bсеки човек има право на образование.'.encode('utf_8')).best().first().encoding,
            'utf_8')
def test_file_input(self):
    for path_name in glob('./data/*.srt') + glob('./data/*.txt'):
        with self.subTest('test_file_input <{}>'.format(path_name)):
            matches = CnM.from_path(path_name)

            self.assertGreater(
                len(matches),
                0
            )

            r_ = matches.best().first()

            self.assertIsNotNone(
                r_
            )

            if isinstance(TestFileCharsetNormalizer.SHOULD_BE[basename(path_name)], str):
                self.assertEqual(
                    r_.encoding,
                    TestFileCharsetNormalizer.SHOULD_BE[basename(path_name)]
                )
            else:
                self.assertIn(
                    r_.encoding,
                    TestFileCharsetNormalizer.SHOULD_BE[basename(path_name)]
                )
def process_drive(img_burn_exe: str, drive: str, output_folder: str):
    """
    Processes a drive: tests whether it is ready and, if it is, tries to back it up.

    :param img_burn_exe: path to the ImgBurn exe file
    :param drive: drive from which the backup will be performed
    :param output_folder: folder to which output will be saved
    :return:
    """
    if not test_drive(drive):
        logging.info("Waiting for drive: %s to be ready", drive)
        return

    autorun_file = Path(f"{drive}Autorun.inf")
    autorun_label = ""
    if autorun_file.is_file():
        parser = ConfigParser()
        encoding = CnM.from_path(autorun_file).best().first().encoding
        logging.debug("Detected autorun.inf encoding: %s", encoding)
        try:
            parser.read_string(
                autorun_file.read_text(encoding=encoding).lower())
        except DuplicateOptionError as err:
            pass
        else:
            if 'label' in parser['autorun']:
                autorun_label = parser['autorun']['label'].upper()

    backup_disk(autorun_label, drive, img_burn_exe, output_folder)
def _detect_file_encoding(path: Path) -> str:
    """Return an approximate encoding of a text file.

    Performs an encoding detection and BOM check.

    Args:
        path: The path to the playlist file.

    Returns:
        A string with the "best" encoding from the following:
        'utf-8', 'utf-8-sig', 'cp1251', 'cp1252', 'utf_16_le'.

    Raises:
        ClickException: The file was not found or the encoding could not be
            retrieved by 'charset_normalizer'.
    """
    try:
        detection_result = (CnM.from_path(
            path,
            cp_isolation=["utf_8", "cp1252", "cp1251", "utf_16_le"]).best().first())
        encoding = "utf-8"
        if path.suffix == ".aimppl4":
            encoding = "utf-16-le"
        elif detection_result.encoding == "utf_8":
            if detection_result.byte_order_mark:
                encoding = "utf-8-sig"
        else:
            encoding = detection_result.encoding
        return encoding
    except (OSError, AttributeError) as error:
        message = str(error)
        raise ClickException(message)
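# Hedged usage sketch for _detect_file_encoding above, not part of the original
# module: the playlist path is hypothetical, and the function still needs the
# surrounding imports (Path, ClickException, CnM) to be available.
playlist = Path("playlists/example.m3u8")
enc = _detect_file_encoding(playlist)
text = playlist.read_text(encoding=enc)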
def encoding_from_path(txt_file_path):
    file_encoding = 'utf-8'
    enc = CnM.from_path(txt_file_path).best().first()
    file_encoding = enc.encoding
    ## remap some mis-detected encodings to utf-8
    if (file_encoding == 'big5' or file_encoding == 'cp1252'):
        file_encoding = 'utf-8'
    return file_encoding
def source(source_file_s):
    # TODO: file missing exception
    text_str = str(cnm.from_path(source_file_s).best().first())
    if len(text_str) == 0:
        ui.message.addItem("Файл оказался пустым!")  # "The file turned out to be empty!"
        ui.save_button.setVisible(False)
    return text_str
def get_corpus(self):
    ## extract training and development datasets
    ## do data merge; ArchiveImporter makes the final train/test/dev datasets
    utterances = {}
    audios = []
    wav_dir = os.path.join(self.origin_data_path, self.extract_dir, "wav", "IT")
    text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt", "IT")

    for subdir, dirs, files in os.walk(wav_dir):
        for _dir in dirs:
            curr_wav_dir = os.path.join(subdir, _dir)
            curr_txt_dir = os.path.join(text_dir, _dir)
            ## iterate over wav files in the current folder
            for fname in os.listdir(curr_wav_dir):
                fname = os.fsdecode(fname)
                wav_file_path = os.path.join(wav_dir, _dir, fname)
                txt_file_path = os.path.join(curr_txt_dir, fname.split('.')[0] + '.txt')
                if not os.path.isfile(txt_file_path):
                    print('audio file {} doesn\'t have a transcript file'.format(wav_file_path))
                    continue
                ## read the file transcript
                transcript = ''
                ## files have different encodings (utf-8, utf_16_be, etc.)
                ## need to detect the correct encoding before opening the file
                file_encoding = 'utf-8'
                enc = CnM.from_path(txt_file_path).best().first()
                file_encoding = enc.encoding
                ## remap some mis-detected encodings to utf-8
                if (file_encoding == 'big5' or file_encoding == 'cp1252'):
                    file_encoding = 'utf-8'
                with open(txt_file_path, "r", encoding=file_encoding) as f:
                    transcript += f.readline()
                transcript = transcript.strip()
                ## append to the data manifest
                utterances[wav_file_path] = transcript
                audios.append(wav_file_path)

    ## collect corpus
    corpus = Corpus(utterances, audios)
    #################
    ## SIWIS clips are 44100Hz 706 kb/s (1 channel) and need wav resampling
    corpus.make_wav_resample = True
    return corpus
def read_file(path: str, filename: str = '') -> str:
    if filename:
        path = join_path(path, filename=filename)

    file_bytes = Path(path).read_bytes()
    encodings = CharsetNormalizerMatches.from_bytes(file_bytes).best()
    if len(encodings) == 0:
        encoding = None
    else:
        encoding = encodings.first().encoding

    return Path(path).read_text(encoding=encoding)
def test_bom_detection(self):
    with self.subTest('GB18030 UNAVAILABLE SIG'):
        self.assertFalse(
            CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode('gb18030')).best().first().byte_order_mark)

    with self.subTest('GB18030 AVAILABLE SIG'):
        self.assertTrue(
            CnM.from_bytes(
                (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030')).best().first().byte_order_mark)

    with self.subTest('UTF-7 AVAILABLE BOM'):
        self.assertTrue(
            CnM.from_bytes(b'\x2b\x2f\x76\x38' +
                           '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')).best().first().byte_order_mark)

    with self.subTest('UTF-8 AVAILABLE BOM'):
        self.assertTrue(
            CnM.from_bytes(b'\xef\xbb\xbf' +
                           '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')).best().first().byte_order_mark)
def read_txt_file(self, txt_file_path):
    transcript = ''
    ## files have different encodings (utf-8, utf_16_be, etc.)
    ## need to detect the correct encoding before opening the file
    file_encoding = 'utf-8'
    enc = CnM.from_path(txt_file_path).best().first()
    file_encoding = enc.encoding
    ## remap some mis-detected encodings to utf-8
    if (file_encoding == 'big5' or file_encoding == 'cp1252'):
        file_encoding = 'utf-8'
    with open(txt_file_path, "r", encoding=file_encoding) as f:
        transcript += f.readline()
    transcript = transcript.strip()
    return transcript
def get_data(cls, self, report_type, option_type):
    """Construct and make get request."""
    rpt_date = self.report_date.strftime('%Y%m%d')
    p1_url = f"{self.occ_burl}/flex-reports?reportType={report_type}"
    p2_url = f"&optionType={option_type}&reportDate={rpt_date}"
    # Make get request with passed url
    flex_bytes = requests.get(f"{p1_url}{p2_url}")

    # If a short error message, assume wrong date
    if len(flex_bytes.content) < 500:
        self.report_date = self.report_date - timedelta(days=1)
        rpt_date = self.report_date.strftime('%Y%m%d')
        p2_url = f"&optionType={option_type}&reportDate={rpt_date}"
        # Make get request with passed url
        flex_bytes = requests.get(f"{p1_url}{p2_url}")

    self.byte_length = len(flex_bytes.content)
    self.rpt_to_print = CnM.from_bytes(flex_bytes.content).best().first()

    return flex_bytes
def detect():
    if 'file' not in request.files:
        return jsonify({'message': 'No file has been sent'}), 400

    my_file = request.files['file']  # type: FileStorage
    byte_str = my_file.stream.read()

    r_ = CnM.from_bytes(byte_str).best()

    k_ = chardet_detect(byte_str)
    k_['confidence'] = str(round(
        k_['confidence'] * 100., ndigits=3)) + ' %' if k_['confidence'] is not None else None

    z_ = cchardet_detect(byte_str)
    z_['confidence'] = str(round(
        z_['confidence'] * 100., ndigits=3)) + ' %' if z_['confidence'] is not None else None

    return jsonify({
        'charset-normalizer': {
            'encoding': r_.encoding,
            'aliases': r_.encoding_aliases,
            'alphabets': r_.alphabets,
            'language': r_.language,
            'chaos': str(r_.percent_chaos) + ' %',
            'coherence': str(r_.percent_coherence) + ' %',
            'could_be': r_.could_be_from_charset
        } if r_ is not None else None,
        'chardet': k_,
        'cchardet': z_,
        'filename': my_file.filename,
        'b64_content': b64encode(r_.output()).decode('ascii') if r_ is not None else ''
    })
def cli_detect(argv=None):
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
                    "Discover originating encoding used on text file. "
                    "Normalize text to unicode.")

    parser.add_argument('file',
                        type=argparse.FileType('rb'),
                        nargs='+',
                        help='Filename')
    parser.add_argument(
        '--verbose',
        action="store_true",
        default=False,
        dest='verbose',
        help='Display complementary information about file if any.')
    parser.add_argument(
        '--normalize',
        action="store_true",
        default=False,
        dest='normalize',
        help='Permit to normalize input file. If not set, program does not write anything.')
    parser.add_argument(
        '--replace',
        action="store_true",
        default=False,
        dest='replace',
        help='Replace file when trying to normalize it instead of creating a new one.')
    parser.add_argument(
        '--force',
        action="store_true",
        default=False,
        dest='force',
        help='Replace file without asking if you are sure, use this flag with caution.')

    args = parser.parse_args(argv)

    if len(args.file) == 0:
        print('This command purpose is to analyse text file. Please specify any filename.',
              file=sys.stderr)
        parser.print_help(file=sys.stderr)
        return 1

    if args.replace is True and args.normalize is False:
        print('Use --replace in addition of --normalize only.', file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print('Use --force in addition of --replace only.', file=sys.stderr)
        return 1

    for my_file in args.file:

        matches = CharsetNormalizerMatches.from_fp(my_file)

        if len(matches) == 0:
            print('Unable to identify originating encoding for "{}".'.format(my_file.name),
                  file=sys.stderr)
            continue

        x_ = PrettyTable(['Filename', 'Encoding', 'Language', 'Alphabets', 'Chaos', 'Coherence'])

        r_ = matches.best()
        p_ = r_.first()

        x_.add_row([
            my_file.name,
            p_.encoding,
            p_.language,
            (' and ' if len(p_.alphabets) < 4 else '\n').join(
                [el if 'and' not in el else '"{}"'.format(el) for el in p_.alphabets]),
            '{} %'.format(round(p_.chaos * 100., ndigits=3)),
            '{} %'.format(round(100. - p_.coherence * 100., ndigits=3))
        ])

        if len(matches) > 1 and args.verbose:
            for el in matches:
                if el != p_:
                    x_.add_row([
                        '** ALTERNATIVE ' + my_file.name + '**',
                        el.encoding,
                        el.language,
                        (' and ' if len(el.alphabets) < 4 else '\n').join(
                            [el if 'and' not in el else '"{}"'.format(el) for el in el.alphabets]),
                        '{} %'.format(round(el.chaos * 100., ndigits=3)),
                        '{} %'.format(round(100. - el.coherence * 100., ndigits=3))
                    ])

        print(x_)

        if args.verbose is True:
            print('"{}" could be also originating from {}.'.format(
                my_file.name, ','.join(r_.could_be_from_charset)))
            print('"{}" could be also be written in {}.'.format(
                my_file.name, ' or '.join(p_.languages)))

        if args.normalize is True:

            if p_.encoding.startswith('utf') is True:
                print('"{}" file does not need to be normalized, as it already came from unicode.'.format(
                    my_file.name))
                continue

            o_ = my_file.name.split('.')  # type: list[str]

            if args.replace is False:
                o_.insert(-1, p_.encoding)
            else:
                if args.force is False and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(my_file.name),
                        'no') is False:
                    continue

            try:
                with open('./{}'.format('.'.join(o_)), 'w', encoding='utf-8') as fp:
                    fp.write(str(p_))
            except IOError as e:
                print(str(e), file=sys.stderr)
                return 2

    return 0
for i in range(N_REQUIRED_LOOP):
    st_t = perf_counter_ns()
    z_ = n_detect(seq_)
    l_.append(perf_counter_ns() - st_t)

st_ar[srt_file]['cchardet'] = locale.format_string('%d', mean(l_), grouping=True)
st_re[srt_file]['cchardet'] = z_['encoding']

l_.clear()

for i in range(N_REQUIRED_LOOP):
    st_t = perf_counter_ns()
    y_ = CharsetNormalizerMatches.from_bytes(seq_)
    l_.append(perf_counter_ns() - st_t)

st_ar[srt_file]['charset_normalizer'] = locale.format_string(
    '%d', mean(l_), grouping=True)
st_re[srt_file]['charset_normalizer'] = y_.best().first().encoding

x_ = prettytable.PrettyTable(
    ['File', 'Chardet', 'cChardet', 'Charset Normalizer'])

for k, v in st_ar.items():
    x_.add_row([k, v['chardet'], v['cchardet'], v['charset_normalizer']])

print(x_)

x_ = prettytable.PrettyTable(
def parse(self, file_name):
    content = str(CnM.from_path(file_name).best().first())
    return self.parser.parse(content)[0]
pd.set_option('display.max_columns', 100)
# Display maximum rows
pd.set_option('display.max_rows', 50)

# %% codecell
########################################
url = "https://www.sec.gov/Archives/edgar/daily-index/2021/QTR2/sitemap.20210426.xml"
get = requests.get(url)

fpath = '/Users/unknown1/Algo/data/sec/raw/daily_index/2021/20210426'
f = open(fpath, 'wb')
f.write(get.content)
f.close()

print(CnM.from_bytes(get.content).best().first())
root = ET.fromstring(get.content.decode('UTF-8'))

# %% codecell
#####################################################################
data = []

for i, child in enumerate(root):
    data.append([subchild.text for subchild in child])

# Write in DF
df = pd.DataFrame(data)
df.columns = ['url', 'lastmod', 'changefreq', 'priority']
def detect_file_encoding(file_path, buffer_size=1024, max_lines=20):
    """
    Determines the encoding of files within the initial `max_lines` of
    length `buffer_size`.

    :param file_path: path to the file
    :type file_path: str
    :param buffer_size: buffer length for each line being read
    :type buffer_size: int
    :param max_lines: number of lines to read from file of length buffer_size
    :type max_lines: int
    :return: encoding type
    :rtype: str
    """
    detector = UniversalDetector()
    line_count = 0
    with FileOrBufferHandler(file_path, 'rb') as input_file:
        chunk = input_file.read(buffer_size)
        while chunk and line_count < max_lines:
            detector.feed(chunk)
            chunk = input_file.read(buffer_size)
            line_count += 1
    detector.close()
    encoding = detector.result["encoding"]

    # Typical file representation is utf-8 instead of ascii, treat as such.
    if not encoding or encoding.lower() in ['ascii', 'windows-1254']:
        encoding = 'utf-8'

    # Check if encoding can be used to decode without throwing an error
    def _decode_is_valid(encoding):
        try:
            with FileOrBufferHandler(file_path, encoding=encoding) as input_file:
                input_file.read(1024 * 1024)
                return True
        except:
            return False

    if not _decode_is_valid(encoding):
        try:
            from charset_normalizer import CharsetNormalizerMatches as CnM

            # Try with small sample
            with FileOrBufferHandler(file_path, 'rb') as input_file:
                raw_data = input_file.read(10000)
                result = CnM.from_bytes(raw_data,
                                        steps=5, chunk_size=512,
                                        threshold=0.2,
                                        cp_isolation=None,
                                        cp_exclusion=None,
                                        preemptive_behaviour=True,
                                        explain=False)
                result = result.best()
                if result:
                    if result.first():
                        encoding = result.first().encoding

            # Try again with full sample
            if not _decode_is_valid(encoding):
                with FileOrBufferHandler(file_path, 'rb') as input_file:
                    raw_data = input_file.read(max_lines * buffer_size)
                    result = CnM.from_bytes(raw_data,
                                            steps=max_lines,
                                            chunk_size=buffer_size,
                                            threshold=0.2,
                                            cp_isolation=None,
                                            cp_exclusion=None,
                                            preemptive_behaviour=True,
                                            explain=False)
                    result = result.best()
                    if result:
                        if result.first():
                            encoding = result.first().encoding
        except:
            logger.info("Install charset_normalizer for improved file "
                        "encoding detection")

    # If no encoding is still found, default to utf-8
    if not encoding:
        encoding = 'utf-8'
    return encoding.lower()
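# Hedged usage sketch for detect_file_encoding above, not part of the original
# module: the CSV path is hypothetical, and the helper relies on the module's
# own FileOrBufferHandler, UniversalDetector and logger being importable.
encoding = detect_file_encoding('data/input.csv', buffer_size=2048, max_lines=10)
with open('data/input.csv', encoding=encoding) as fh:
    print(encoding, fh.readline().rstrip())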
def get_data(folder_path: str) -> dict:
    """
    Iterates through a directory to create a Dict of Pandas DataFrames
    with filepaths as their keys.

    :type folder_path: str
    :rtype: dict
        keys: filepaths
        values: pd.DataFrame
    """
    # print("This is the name of the script: ", sys.argv[0])
    print("Initializing Data Retrieval...")
    csvfiles = glob.glob(folder_path + "/**/*.csv", recursive=True)
    xlfiles = glob.glob(folder_path + "/**/*.xls?", recursive=True)
    xlfiles = xlfiles + glob.glob(folder_path + "/**/*.xls", recursive=True)
    # xlfiles = []
    file_dict = {}
    i = 1
    for file in xlfiles:
        print("Reading File %d of %d:" % (i, len(csvfiles) + len(xlfiles)))
        print("\tFull Path: ", file)
        # csv_from_excel(file)
        try:
            df = pd.read_excel(file, sheet_name=None)
            for sheet in df.keys():
                print("\t\t", sheet, "processed...")
                df[sheet].index.rename('file_index', inplace=True)
                file_dict.update({file.join(['', sheet]): df[sheet]})
        except:
            logging.error('COULD NOT LOAD %s' % file)
            print('\t\tFAILED')
        i += 1
    for file in csvfiles:
        # print("Reading File %d of %d:" % (i, len(csvfiles) + len(xlfiles)))
        print("Reading File %d of %d:" % (i, len(csvfiles) + len(xlfiles)))
        print("\tFull Path: ", file)
        try:
            df = pd.read_csv(file, low_memory=False, header='infer',
                             encoding=detect_encoding(file))
            df.index.rename('file_index', inplace=True)
            file_dict.update({file: df})
        except UnicodeDecodeError:
            try:
                print("Encoding Detection Failed... Attempting to Normalize...")
                normalized = StringIO(str(CnM.from_path(file).best().first()))
                df = pd.read_csv(normalized, low_memory=False, header='infer')
                df.index.rename('file_index', inplace=True)
                file_dict.update({file: df})
                print("Success!")
            except:
                print('Encoding Normalization Failed')
        except:
            logging.error('COULD NOT LOAD %s' % file)
            print('\t\tFAILED')
        i += 1
    return file_dict
######################################################################################
def ClearScreen():
    if platform.system() == "Windows":
        os.system("cls")
    if platform.system() == "Linux":
        os.system("clear")
######################################################################################
ClearScreen()

ModelPath = easygui.fileopenbox("Selecione a legenda modelo:")  # "Select the model subtitle:"
TargetPath = easygui.fileopenbox("Selecione a legenda alvo:")   # "Select the target subtitle:"
######################################################################################
ModelCoding = CnM.from_path(ModelPath).best().first().encoding
TargetCoding = CnM.from_path(TargetPath).best().first().encoding
######################################################################################
ModelSub = open(ModelPath, encoding=ModelCoding)
TargetSub = open(TargetPath, encoding=TargetCoding)

ModelContent = ModelSub.readlines()
TargetContent = TargetSub.readlines()

i = 0
ModelTimesPos = []
for l in ModelContent:
    if IsSubTimeLine(l):
        ModelTimesPos.append(i)
    i += 1

i = 0
TargetTimesPos = []
def normalize(text_file):
    try:
        CnM.normalize(text_file)  # should write to disk my_subtitle-***.srt
    except IOError as e:
        print('Sadly, we are unable to perform charset normalization.', str(e))
args = sys.argv
enc = None
vk = None
data = []

try:
    use_cp1251, use_auth = parse_arg(args[1]), parse_arg(args[2])
except ValueError as exc:
    # "Invalid argument: {...}. Only the arguments {POSSIBLE_ARGS} are allowed"
    print(f"Неверный аргумент: {exc.args[0]}. Возможны только аргументы {POSSIBLE_ARGS}")
    exit(-1)
except IndexError:
    pass

try:
    enc = CnM.from_path(INPUT_FNAME).best().first().encoding
    if enc != 'utf-8':
        print("\n\n", ENC_WARN, "\n\n")
        if use_cp1251 is None:
            # "Use cp1251 instead of the current encoding?"
            use_cp1251 = yes_no("Использовать cp1251 вместо текущей кодировки?")
        if use_cp1251:
            enc = 'cp1251'

    # parse the file with group IDs
    print("Используется кодировка: ", enc)  # "Using encoding:"
    with open(INPUT_FNAME, 'r', newline='', encoding=enc) as csvf:
        dialect = csv.Sniffer().sniff(csvf.read(1024))
        csvf.seek(0)
        reader = csv.reader(csvf, dialect=dialect)
        for row in reader:
            if row[0] == INPUT_FILE_HEADER_GROUP_TITLE:
                continue
from charset_normalizer import CharsetNormalizerMatches as cnm

text_str = str(cnm.from_path("test.txt").best().first())
print(text_str)
import xml.etree.ElementTree as ET

# Display all columns
pd.set_option('display.max_columns', None)
# Display all rows
pd.set_option('display.max_rows', None)

# %% codecell
##############################################################
url = 'https://marketdata.theocc.com/delo-download?prodType=ALL&downloadFields=OS;US;SN&format=txt'
get = requests.get(url)

dlp_df = pd.read_csv(BytesIO(get.content), escapechar='\n', delimiter='\t')

print(CnM.from_bytes(get.content).best().first())

get_sample = get.content[0:1000]
get_sample

"""
sym = 'IBM'
occ = requests.get(f"https://marketdata.theocc.com/series-search?symbolType=U&symbol={sym}")
occ_df = pd.read_csv(BytesIO(occ.content), skiprows=6, escapechar='\n', delimiter='\t')
cols = occ_df.columns[:-1]
occ_df.drop('year', inplace=True, axis=1)
occ_df.columns = cols
def cli_detect(argv=None):
    """
    CLI assistant using ARGV and ArgumentParser
    :param argv:
    :return: 0 if everything is fine, anything else equal trouble
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
                    "Discover originating encoding used on text file. "
                    "Normalize text to unicode."
    )

    parser.add_argument('file', type=argparse.FileType('rb'), nargs='+', help='Filename')
    parser.add_argument('--verbose', action="store_true", default=False, dest='verbose',
                        help='Display complementary information about file if any.')
    parser.add_argument('--normalize', action="store_true", default=False, dest='normalize',
                        help='Permit to normalize input file. If not set, program does not write anything.')
    parser.add_argument('--replace', action="store_true", default=False, dest='replace',
                        help='Replace file when trying to normalize it instead of creating a new one.')
    parser.add_argument('--force', action="store_true", default=False, dest='force',
                        help='Replace file without asking if you are sure, use this flag with caution.')
    parser.add_argument('--threshold', action="store", default=0.2, type=float, dest='threshold',
                        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.")

    args = parser.parse_args(argv)

    if len(args.file) == 0:
        print('This command purpose is to analyse text file. Please specify any filename.', file=sys.stderr)
        parser.print_help(file=sys.stderr)
        return 1

    if args.replace is True and args.normalize is False:
        print('Use --replace in addition of --normalize only.', file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print('Use --force in addition of --replace only.', file=sys.stderr)
        return 1

    if args.threshold < 0. or args.threshold > 1.:
        print('--threshold VALUE should be between 0. AND 1.')
        return 1

    for my_file in args.file:

        matches = CharsetNormalizerMatches.from_fp(
            my_file,
            threshold=args.threshold
        )

        if len(matches) == 0:
            print('Unable to identify originating encoding for "{}". {}'.format(
                my_file.name,
                'Maybe try increasing maximum amount of chaos.' if args.threshold < 1. else ''),
                file=sys.stderr)
            if my_file.closed is False:
                my_file.close()
            continue

        x_ = PrettyTable(['Filename', 'Encoding', 'Language', 'Alphabets', 'Chaos', 'Coherence'])

        r_ = matches.best()
        p_ = r_.first()

        x_.add_row(
            [
                my_file.name,
                p_.encoding,
                p_.language,
                (' and ' if len(p_.alphabets) < 4 else '\n').join(
                    [el if 'and' not in el else '"{}"'.format(el) for el in p_.alphabets]),
                '{} %'.format(round(p_.chaos * 100., ndigits=3)),
                '{} %'.format(round(100. - p_.coherence * 100., ndigits=3))
            ]
        )

        if len(matches) > 1 and args.verbose:
            for el in matches:
                if el != p_:
                    x_.add_row(
                        [
                            '** ALTERNATIVE ' + my_file.name + '**',
                            el.encoding,
                            el.language,
                            (' and ' if len(el.alphabets) < 4 else '\n').join(
                                [el if 'and' not in el else '"{}"'.format(el) for el in el.alphabets]),
                            '{} %'.format(round(el.chaos * 100., ndigits=3)),
                            '{} %'.format(round(100. - el.coherence * 100., ndigits=3))
                        ]
                    )

        print(x_)

        if args.verbose is True:
            if len(r_.could_be_from_charset) > 1:
                print('"{}" could be also originating from {}.'.format(
                    my_file.name, ','.join(r_.could_be_from_charset)))
            if len(p_.could_be_from_charset) > 1:
                print('"{}" produce the EXACT same output with those encoding : {}.'.format(
                    my_file.name, ' OR '.join(p_.could_be_from_charset)))
            if len(p_.languages) > 1:
                print('"{}" could be also be written in {}.'.format(
                    my_file.name, ' or '.join(p_.languages)))
            if p_.byte_order_mark is True:
                print('"{}" has a signature or byte order mark (BOM) in it.'.format(my_file.name))

        if args.normalize is True:

            if p_.encoding.startswith('utf') is True:
                print('"{}" file does not need to be normalized, as it already came from unicode.'.format(
                    my_file.name))
                if my_file.closed is False:
                    my_file.close()
                continue

            o_ = my_file.name.split('.')  # type: list[str]

            if args.replace is False:
                o_.insert(-1, p_.encoding)
                if my_file.closed is False:
                    my_file.close()
            else:
                if args.force is False and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(my_file.name),
                        'no') is False:
                    if my_file.closed is False:
                        my_file.close()
                    continue

            try:
                with open('./{}'.format('.'.join(o_)), 'w', encoding='utf-8') as fp:
                    fp.write(str(p_))
                print('"{}" has been successfully written to disk.'.format('.'.join(o_)))
            except IOError as e:
                print(str(e), file=sys.stderr)
                if my_file.closed is False:
                    my_file.close()
                return 2

        if my_file.closed is False:
            my_file.close()

    return 0