def findCodecName(text, displayEncodingCodecName=sys.stdout.encoding, encodingErrorFlag=0): from sets import Set from encodings.aliases import aliases print text encodingCodecSet = Set() for encodingName in aliases.items(): encodingCodecSet.add(encodingName[1].replace('_', '-')) """ # [DUMP ENCODING CODEC] for encodingCodec in encodingCodecSet : if encodingCodec == 'cp949' : print 'FOUND' print encodingCodec, """ if len(encodingCodecSet.intersection([displayEncodingCodecName])) == 0: print 'invalid displayEncodingCodecName : %s' % ( displayEncodingCodecName) return for encodingCodec in encodingCodecSet: try: encodedStr = text.decode(encodingCodec).encode( displayEncodingCodecName) # except UnicodeEncodeError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except UnicodeDecodeError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except ValueError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except TypeError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except IOError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except LookupError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue except BaseException as e: if encodingErrorFlag == 1: print '"', encodingCodec, '"', '->', '"', sys.stdout.encoding, '":', e continue print '"', encodingCodec, '"', '->', '"', sys.stdout.encoding, '":', encodedStr
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    # Decode only the leading window, ignoring any non-ASCII noise.
    window = sequence[: min(len(sequence), search_zone)].decode(
        "ascii", errors="ignore"
    )

    for candidate in findall(RE_POSSIBLE_ENCODING_INDICATION, window):
        normalized = candidate.lower().replace("-", "_")
        # Accept a hit on either side of the alias table: the alias itself
        # or the canonical (IANA) name it maps to.
        for alias, iana in aliases.items():
            if normalized in (alias, iana):
                return iana

    return None
def check_encoding_supported(encoding):
    """Return True if *encoding* appears in the codec aliases table.

    Matches either an alias key (e.g. 'u8') or a canonical codec name
    (e.g. 'utf_8').
    """
    # Membership over keys and values replaces the original enumerate()
    # scan, which ignored its index and rebuilt a list for every row.
    return encoding in aliases or encoding in aliases.values()
def findCodecName( text, displayEncodingCodecName = sys.stdout.encoding, encodingErrorFlag = 0 ) : from sets import Set from encodings.aliases import aliases print text encodingCodecSet = Set() for encodingName in aliases.items() : encodingCodecSet.add(encodingName[1].replace('_','-')) """ # [DUMP ENCODING CODEC] for encodingCodec in encodingCodecSet : if encodingCodec == 'cp949' : print 'FOUND' print encodingCodec, """ if len(encodingCodecSet.intersection([displayEncodingCodecName])) == 0 : print 'invalid displayEncodingCodecName : %s'%(displayEncodingCodecName) return for encodingCodec in encodingCodecSet : try : encodedStr = text.decode(encodingCodec).encode(displayEncodingCodecName) # except UnicodeEncodeError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except UnicodeDecodeError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except ValueError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except TypeError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except IOError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue # except LookupError as e : # if encodingErrorFlag == 1 : # print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":',e # continue except BaseException as e : if encodingErrorFlag == 1 : print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":', e continue print '"',encodingCodec,'"','->','"',sys.stdout.encoding,'":', encodedStr
def populate_cmb_unicodes(self, combo):
    """ Populate combo with full list of codes """
    # Iterate the alias keys directly: the original walked aliases.items()
    # but only ever used element [0] of each (alias, canonical) pair.
    unicode_list = [str(alias) for alias in aliases]
    sorted_list = sorted(unicode_list, key=str.lower)
    utils_giswater.set_autocompleter(combo, sorted_list)
def encoding_aliases(self) -> List[str]: """ Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855. """ also_known_as = [] # type: List[str] for u, p in aliases.items(): if self.encoding == u: also_known_as.append(p) elif self.encoding == p: also_known_as.append(u) return also_known_as
def iana_name(cp_name: str, strict: bool = True) -> str:
    """Normalise *cp_name* and resolve it to its canonical (IANA) codec name.

    Raises ValueError when the name is unknown and *strict* is True;
    otherwise returns the normalised input unchanged.
    """
    normalized = cp_name.lower().replace("-", "_")
    for alias, iana in aliases.items():
        if normalized in (alias, iana):
            return iana
    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(normalized))
    return normalized
def fill_combo_unicodes(combo):
    """ Populate combo with full list of codes """
    # str.startswith accepts a tuple of prefixes, which replaces the inner
    # match loop; only the alias key (item[0]) was ever used, so iterate
    # the keys directly instead of .items().
    prefixes = ("utf8", "windows", "latin")
    unicode_list = [str(alias) for alias in aliases if alias.startswith(prefixes)]
    sorted_list = sorted(unicode_list, key=str.lower)
    if sorted_list:
        set_autocompleter(combo, sorted_list)
def _calculateEncodingKey(comparator):
    """Gets the first key of all available encodings where the corresponding
    value matches the comparator.

    Args:
        comparator (string): A view name for an encoding.

    Returns:
        str: A key for a specific encoding used by python, or None when no
        entry matches.
    """
    # next() with a default collapses the scan-and-break loop.
    return next((key for key, view in _encodings.items() if view == comparator), None)
def findEncodings(q):
    """Return every (alias, canonical) pair whose alias or canonical codec
    name contains the substring q."""
    hits = []
    for alias, canonical in aliases.items():
        if q in alias or q in canonical:
            hits.append((alias, canonical))
    return hits
parser.add_argument("-s", "--starts-with", help="Searches the start of the multi-byte character", action="store_true") parser.add_argument("-c", "--contains", help="Searches entire multi-byte character", action="store_true") args = parser.parse_args() if(args.ends_with and args.starts_with) or (args.ends_with and args.contains) or (args.starts_with and args.contains): print("You may not select more than one search position at a time") print("Please choose only one of the following arguments: -e, -s, -c") exit() if args.starts_with: print("You are currently searching for a character set that begins with: " + args.needle) elif args.contains: print("You are currently searching for a character set that contains: " + args.needle) else: print("You are currently searching for a character set that ends with: " + args.needle) chars = list(str for str in map(chr, range(0,1114112)) if str.isprintable()) search_needles = dict() for v,encoding in aliases.items(): for char in chars: try: if char == args.needle: search_needles[encoding] = ' '.join(map(hex,char.encode(encoding))) except LookupError: pass for encoding,code in search_needles.items(): for char in chars: try: if args.starts_with: if ' '.join(map(hex,char.encode(encoding))).startswith(code) and char != args.needle: print(encoding + " | " + char + " | " + ' '.join(map(hex,char.encode(encoding))) + " | " + unicodedata.name(char)) elif args.contains: if code in ' '.join(map(hex,char.encode(encoding))) and char != args.needle: print(encoding + " | " + char + " | " + ' '.join(map(hex,char.encode(encoding))) + " | " + unicodedata.name(char))
from encodings.aliases import aliases

s = 'El Niño'
print("Codecs")
for codec in ['latin_1', 'utf_8', 'utf_16']:
    print("codec({}) {}: {}".format(codec, s, s.encode(codec)))

# NB: stackoverflow suggests this is an incomplete set, still interesting IMO
# Union of the alias keys and their canonical values gives every known name.
all_the_things = set(aliases) | set(aliases.values())


def chunks(l, n):
    """Yield successive n-sized slices of list l."""
    for start in range(0, len(l), n):
        yield l[start:start + n]


for elts in chunks(list(all_the_things), 8):
    print(", ".join(elts))
# Copyright: ZopeChina Corp, Ltd. http://zopechina.com # hack python's default encoding to 'utf-8' import sys reload(sys) sys.setdefaultencoding('utf-8') del sys.setdefaultencoding import os from encodings.aliases import aliases # gb2312 is obsoleted, use gbk for k,v in aliases.items(): if v == 'cjkcodecs.gb2312': aliases[k] = 'cjkcodecs.gbk' if os.name == 'nt': import encodings for ec in ['gb2312', 'gbk', 'gb18030', 'big5']: if not encodings.aliases.aliases.has_key(ec): encodings.aliases.aliases[ec] = 'mbcs' # clear cache if encodings._cache.has_key(ec): del encodings._cache[ec] import ZopePak import StructuredTextPak import setup try: import PlonePak
# -*- coding: utf-8 -*- #from cxFile import cxFile from sets import Set from encodings.aliases import aliases #import sys #reload(sys) #sys.setdefaultencoding('utf-8') #resultFile = cxFile() encodingCodecSet = Set() #for encodingName in aliases.keys() : for encodingName in aliases.items() : encodingCodecSet.add(encodingName[1].replace('_','-')) #string = encodingName[0] + encodingName[1] + '\n' #string = encodingName[0] + '\t:\t' + encodingName[1].replace('_','-') + '\n' #resultFile.write(string) #for encodingCodec in encodingCodecSet : # string = encodingCodec + '\n' # resultFile.write(string) #resultFile.close() testString = '¿ÀÁø¿ø' print testString import sys print sys.stdout.encoding
# -*- coding: utf-8 -*- #from cxFile import cxFile from sets import Set from encodings.aliases import aliases #import sys #reload(sys) #sys.setdefaultencoding('utf-8') #resultFile = cxFile() encodingCodecSet = Set() #for encodingName in aliases.keys() : for encodingName in aliases.items(): encodingCodecSet.add(encodingName[1].replace('_', '-')) #string = encodingName[0] + encodingName[1] + '\n' #string = encodingName[0] + '\t:\t' + encodingName[1].replace('_','-') + '\n' #resultFile.write(string) #for encodingCodec in encodingCodecSet : # string = encodingCodec + '\n' # resultFile.write(string) #resultFile.close() testString = '¿ÀÁø¿ø' print testString import sys print sys.stdout.encoding
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
    """
    Take a sequence of bytes that could potentially be decoded to str and
    discard all obvious non supported charset encoding.
    :param bytearray sequences: Actual sequence of bytes to analyse
    :param float threshold: Maximum amount of chaos allowed on first pass
    :param int chunk_size: Size to extract and analyse in each step
    :param int steps: Number of steps
    :return: List of potential matches
    :rtype: CharsetNormalizerMatches
    """
    # Dict ordering is only guaranteed from CPython 3.6; sort on older
    # runtimes so results are reproducible.
    py_v = [int(el) for el in python_version_tuple()]
    py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)

    supported = sorted(aliases.items()) if py_need_sort else aliases.items()

    tested = set()
    working = dict()

    maximum_length = len(sequences)

    if maximum_length == 0:
        # Empty payload: nothing to probe. The original computed a zero
        # range() step here and crashed with ValueError.
        return CharsetNormalizerMatches([])

    # max(..., 1) guards against a zero step when the input is shorter than
    # `steps` bytes (range() raises ValueError on step == 0).
    step = max(int(maximum_length / steps), 1)

    for support in supported:
        # Renamed from `k`: the original shadowed it in the inner
        # ranges loop below.
        alias, p = support

        if p in tested:
            continue

        tested.add(p)

        # Cheap pre-filter: drop codecs that cannot decode the payload at all.
        try:
            str(sequences, encoding=p)
        except UnicodeDecodeError:
            continue
        except LookupError:
            # Alias registered but codec unavailable on this interpreter.
            continue

        chaos_measures = list()
        ranges_encountered_t = dict()
        decoded_len_t = 0

        for i in range(0, maximum_length, step):
            chunk = sequences[i:i + chunk_size]
            decoded = str(chunk, encoding=p, errors='ignore')
            # BUGFIX: this total was computed but never accumulated, so
            # working[p]['len'] was always reported as 0.
            decoded_len_t += len(decoded)

            probe_chaos = ProbeChaos(decoded)
            chaos_measure, ranges_encountered = probe_chaos.ratio, probe_chaos.encountered_unicode_range_occurrences

            for range_name, occurrences in ranges_encountered.items():
                if range_name not in ranges_encountered_t.keys():
                    ranges_encountered_t[range_name] = 0
                ranges_encountered_t[range_name] += occurrences

            # Too chaotic: give up on this codec entirely.
            if chaos_measure > threshold:
                if p in working.keys():
                    del working[p]
                break

            chaos_measures.append(chaos_measure)

            if p not in working.keys():
                working[p] = dict()

        if p in working.keys():
            working[p]['ratio'] = statistics.mean(chaos_measures)
            working[p]['ranges'] = ranges_encountered_t
            working[p]['chaos'] = sum(chaos_measures)
            working[p]['len'] = decoded_len_t

            # ASCII with zero chaos is an unambiguous winner; stop early.
            if p == 'ascii' and working[p]['ratio'] == 0.:
                break

    return CharsetNormalizerMatches([
        CharsetNormalizerMatch(sequences, enc, working[enc]['ratio'], working[enc]['ranges'])
        for enc in (sorted(working.keys()) if py_need_sort else working.keys())
        if working[enc]['ratio'] <= threshold
    ])
print("You may not select more than one search position at a time") print("Please choose only one of the following arguments: -e, -s, -c") exit() if args.starts_with: print( "You are currently searching for a character set that begins with: " + args.needle) elif args.contains: print("You are currently searching for a character set that contains: " + args.needle) else: print("You are currently searching for a character set that ends with: " + args.needle) chars = list(str for str in map(chr, range(0, 1114112)) if str.isprintable()) search_needles = dict() for v, encoding in aliases.items(): for char in chars: try: if char == args.needle: search_needles[encoding] = ' '.join( map(hex, char.encode(encoding))) except LookupError: pass for encoding, code in search_needles.items(): for char in chars: try: if args.starts_with: if ' '.join(map(hex, char.encode(encoding))).startswith( code) and char != args.needle: print(encoding + " | " + char + " | " + ' '.join(map(hex, char.encode(encoding))) + " | " +
from ..utils import add_in_db, exists_in_db dataset_controller = Controller(component=Dataset, format_fn=format_dataset, control_fn=control_dataset, module_fn=load_dataset_modules_in_background) pandas_prof_controller = Controller(component=ModulePandasProfiling, format_fn=pandas_profiling.format_module, control_fn=pandas_profiling.control_module) bias_controller = Controller(component=ModuleBias, format_fn=bias.format_module, control_fn=bias.control_module) encodings = list(sorted(set([v for k, v in aliases.items()]))) def index(): title = _('Datasets') header = get_header_attributes() datasets = dataset_controller.index() return render_template("datasets/index.html", session=session, datasets=datasets, header=header) def get_all_instances_json(): datasets = dataset_controller.index()
def __init__(self, parent):
    """Build the application menu bar and attach it to parent's root window.

    *parent* is presumably the main editor window (exposes textarea,
    constante, save/open handlers) — TODO confirm against the caller.
    Every dropdown is styled through self.config() after being populated.
    """
    menubar = tk.Menu(parent.master)
    self.config(menubar, parent)
    parent.master.config(menu = menubar)
    # --- "Fichier" (File) menu: new / open / reopen / save / info / search / quit ---
    file_dropdown = tk.Menu(menubar)
    file_dropdown.add_command(label = "Nouveau", accelerator = "Ctrl+N", command = parent.new_file)
    file_dropdown.add_command(label = "Ouvrir", accelerator = "Ctrl+O", command = parent.ask_file)
    # Submenu listing recently opened files that still exist on disk.
    reopen_dropdown = tk.Menu(file_dropdown)
    for file in parent.constante.reopen_files :
        if path.isfile(file) :
            # Default-argument binding captures the current `file` value
            # (late-binding closures would all open the last entry).
            reopen_dropdown.add_command(label = file, command = lambda file = file : parent.open_file(file))
    self.config(reopen_dropdown, parent)
    file_dropdown.add_cascade(label = "Ré-ouvrir", menu = reopen_dropdown)
    file_dropdown.add_command(label = "Sauvegarder", accelerator = "Ctrl+S", command = parent.save)
    file_dropdown.add_command(label = "Sauvegarder sous...", accelerator = "Ctrl+Shift+S", command = parent.save_as)
    file_dropdown.add_separator()
    file_dropdown.add_command(label = "info", accelerator = "Ctrl+I", command = parent.get_info)
    file_dropdown.add_command(label = "Rechercher", accelerator = "Ctrl+F", command = parent.find)
    file_dropdown.add_separator()
    file_dropdown.add_command(label = "Quitter", command = parent.exit)
    self.config(file_dropdown, parent)
    # --- "Themes" menu: dark / light / none ---
    theme_dropdown = tk.Menu(menubar)
    theme_dropdown.add_command(label = "Sombre", command = lambda : self.modif_config(parent, "Sombre"))
    theme_dropdown.add_command(label = "Clair", command = lambda : self.modif_config(parent, "Clair"))
    theme_dropdown.add_command(label = "Pas de theme", command = lambda : self.modif_config(parent))
    self.config(theme_dropdown, parent)
    # --- "Cryptage" menu ---
    # NOTE(review): the "weak" mode is rot13 + base16 + base85, which is an
    # encoding, not encryption; the "strong" mode is a repeating-key XOR.
    # Each command is a single walrus-assignment tuple executed for its
    # side effects (open / write / close / print).
    crypt_dropdown = tk.Menu(menubar)
    crypt_dropdown.add_command(label = "Cryptage : faible (sans clés)", command = lambda : (f:=open(parent.constante.file_full_name+".crypt",'wb'),f.write(b85encode(b16encode(codecs.encode(parent.textarea.get(1.0,tk.END),"rot13").encode()))),f.close(),print("Le fichier a été crypté avec succès.")))
    crypt_dropdown.add_command(label = "Cryptage : fort (avec clés)", command = lambda : (key:=simpledialog.askstring("Clef","Clef de cryptage",show="*"),f:=open(parent.constante.file_full_name+".crypt","wb"),f.write(File.XOR(parent.textarea.get(1.0,tk.END).encode(),key)),f.close(),print("Le fichier a été crypté avec succès")))
    crypt_dropdown.add_command(label = "Décryptage : faible (sans clés)", command = lambda : (file:=filedialog.askopenfilename(defaultextension=".crypt").replace("/", "\\"),f:=open(file,'rb'),code:=f.read(),f.close(),f:=open(file+".decrypt","w"),f.write(codecs.decode(b16decode(b85decode(code)).decode(),"rot13")),f.close(),print("Le fichier a été décrypté avec succès.")))
    crypt_dropdown.add_command(label = "Décrypter : fort (avec clés)", command = lambda : (file:=filedialog.askopenfilename(defaultextension=".crypt").replace("/", "\\"),key:=simpledialog.askstring("Clef","Clef de cryptage",show="*"),f:=open(file,'rb'),code:=File.XOR(f.read(),key),f.close(),f:=open(file+".decrypt",'wb'),f.write(code),f.close(),print("Le fichier a été décrypté avec succès.")))
    self.config(crypt_dropdown, parent)
    # --- "Checksums" menu: display hashes or write them to a .hash file ---
    checksums_dropdown = tk.Menu(menubar)
    checksums_dropdown.add_command(label = "Voir les checksums", command = lambda : (print(f"""SELECT :\n{File.hashs(parent.get_select())}""")if parent.get_select()else None,print(f"""FILE :\n{File.hashs(parent.textarea.get(1.0,tk.END))}""")))
    checksums_dropdown.add_command(label = "Généré un fichier de checksums", command = lambda : (f:=open(parent.constante.file_full_name+".hash","w", encoding = "utf-8"),f.write(File.hashs(parent.textarea.get(1.0,tk.END))),f.close(),print("Le fichier a été créé avec succès.")))
    self.config(checksums_dropdown, parent)
    # --- "Compression" menu ---
    compress_dropdown = tk.Menu(menubar)
    compress_dropdown.add_command(label = "Compresser le fichier", command = lambda : File.compress(parent.constante))
    compress_dropdown.add_command(label = "Décompresser le fichier", command = lambda : (file:=filedialog.askopenfilename(defaultextension=".zip").replace("/", "\\"),pwd:=simpledialog.askstring("Mot de passe", "Mot de passe (facultatif) : ",show="*"),File.decompress(file,pwd)))
    self.config(compress_dropdown, parent)
    # --- "Script Python" menu: byte-compile or collapse to one line ---
    python_dropdown = tk.Menu(menubar)
    python_dropdown.add_command(label = "Compilation du fichier", command = lambda : (py_compile.compile(parent.constante.file_full_name,cfile=parent.constante.file_full_name+"c")if re.match(r"^(.*)\.py$",parent.constante.file_full_name)else print("Ce fichier n'est pas un fichier Python..."),print("Fin de la compilation")))
    python_dropdown.add_command(label = "Script en 1 ligne", command = lambda : (File.script_one_line(parent)if re.match(r"^(.*)\.py$", parent.constante.file_full_name)else print("Ce fichier n'est pas un fichier Python..."),print("Fin de la transformation")))
    self.config(python_dropdown, parent)
    # --- "Encoding" menu: reload the file under each configured codec ---
    encoding_dropdown = tk.Menu(menubar)
    for encoding in parent.constante.config["general"]["encodings"] :
        # enc=encoding binds the current value (avoids late-binding closure).
        encoding_dropdown.add_command(label = encoding, command = lambda enc=encoding : (parent.save(), parent.read_file([enc])))
    encoding_dropdown.add_command(label = "Hexadecimal", command = lambda : (parent.save(), parent.read_file(["hex"])))
    encoding_dropdown.add_command(label = "Liste des encodings", command = lambda : [print(alias) for alias, enc in aliases.items()])
    self.config(encoding_dropdown, parent)
    # --- "A propos" (About) menu ---
    about_dropdown = tk.Menu(menubar)
    about_dropdown.add_command(label = "Version", command = self.show_release_notes)
    about_dropdown.add_separator()
    about_dropdown.add_command(label = "A propos...", command = self.show_about_message)
    self.config(about_dropdown, parent)
    # Attach every dropdown to the menu bar.
    menubar.add_cascade(label = "Fichier", menu = file_dropdown)
    menubar.add_cascade(label = "Themes", menu = theme_dropdown)
    menubar.add_cascade(label = "Encoding", menu = encoding_dropdown)
    menubar.add_cascade(label = "Compression", menu = compress_dropdown)
    menubar.add_cascade(label = "Cryptage", menu = crypt_dropdown)
    menubar.add_cascade(label = "Checksums", menu = checksums_dropdown)
    menubar.add_cascade(label = "Script Python", menu = python_dropdown)
    menubar.add_command(label = "Execute", accelerator = "f5")
    menubar.add_cascade(label = "A propos", menu = about_dropdown)
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
    """
    Take a sequence of bytes that could potentially be decoded to str and
    discard all obvious non supported charset encoding.
    Will test input like this (with steps=4 & chunk_size=4) --> [#### #### #### ####]
    :param bytes sequences: Actual sequence of bytes to analyse
    :param float threshold: Maximum amount of chaos allowed on first pass
    :param int chunk_size: Size to extract and analyse in each step
    :param int steps: Number of steps
    :return: List of potential matches
    :rtype: CharsetNormalizerMatches
    """
    # Dict ordering is only guaranteed from CPython 3.6; sort on older
    # runtimes so results are reproducible.
    py_v = [int(el) for el in python_version_tuple()]
    py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)

    supported = sorted(aliases.items()) if py_need_sort else aliases.items()

    tested = set()
    matches = list()

    maximum_length = len(sequences)

    if maximum_length == 0:
        # Empty payload: nothing to probe; the original crashed computing a
        # zero range() step below.
        return CharsetNormalizerMatches([])

    if maximum_length <= chunk_size:
        chunk_size = maximum_length
        steps = 1

    for support in supported:
        alias, p = support

        if p in tested:
            continue

        tested.add(p)

        bom_available = False
        bom_len = None

        try:
            # Detect a byte order mark / signature for this codec, if any.
            if p in BYTE_ORDER_MARK.keys():
                if isinstance(BYTE_ORDER_MARK[p], bytes) and sequences.startswith(BYTE_ORDER_MARK[p]):
                    bom_available = True
                    bom_len = len(BYTE_ORDER_MARK[p])
                elif isinstance(BYTE_ORDER_MARK[p], list):
                    bom_c_list = [sequences.startswith(el) for el in BYTE_ORDER_MARK[p]]
                    if any(bom_c_list) is True:
                        bom_available = True
                        bom_len = len(BYTE_ORDER_MARK[p][bom_c_list.index(True)])

            # Cheap pre-filter: drop codecs that cannot decode the payload.
            str(
                sequences if bom_available is False else sequences[bom_len:],
                encoding=p
            )
        except UnicodeDecodeError:
            continue
        except LookupError:
            continue

        # max(..., 1): a caller-supplied chunk_size with steps > len(sequences)
        # previously produced a zero step and range() raised ValueError.
        r_ = range(
            0 if bom_available is False else bom_len,
            maximum_length,
            max(int(maximum_length / steps), 1)
        )

        measures = [ProbeChaos(str(sequences[i:i + chunk_size], encoding=p, errors='ignore'), giveup_threshold=threshold) for i in r_]
        ratios = [el.ratio for el in measures]
        nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el in measures].count(True)

        chaos_means = statistics.mean(ratios)
        chaos_median = statistics.median(ratios)

        # Discard when a quarter of the probes gave up or the median chaos
        # exceeds the allowed threshold.
        if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
            continue

        encountered_unicode_range_occurrences = dict()

        for el in measures:
            for u_name, u_occ in el.encountered_unicode_range_occurrences.items():
                if u_name not in encountered_unicode_range_occurrences.keys():
                    encountered_unicode_range_occurrences[u_name] = 0
                encountered_unicode_range_occurrences[u_name] += u_occ

        cnm = CharsetNormalizerMatch(
            sequences if not bom_available else sequences[bom_len:],
            p,
            chaos_means,
            encountered_unicode_range_occurrences,
            bom_available
        )

        fingerprint_tests = [el.fingerprint == cnm.fingerprint for el in matches]

        if any(fingerprint_tests) is True:
            # Same decoded fingerprint as a previous match: record as submatch.
            matches[fingerprint_tests.index(True)].submatch.append(cnm)
        else:
            # Reuse cnm: the original rebuilt an identical second instance here.
            matches.append(cnm)

        # ASCII with zero median chaos, or an explicit BOM, is decisive.
        if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
            return CharsetNormalizerMatches([matches[-1]])

    return CharsetNormalizerMatches(matches)