def get_coding(url):
    """Detect the character encoding of the web page at *url*.

    The charset is taken from the ``Content-Type`` response header when
    present, otherwise from a ``charset=...`` token found in the first
    15 lines of the body.  The detected name is normalized and mapped
    through the module-level ``cdn`` alias table.

    Returns the encoding name, 'utf_8' when nothing usable is found, or
    None when the page cannot be downloaded.
    """
    pattern = re.compile(r'charset=([-\w\d]+)', re.IGNORECASE)

    def _normalize(raw):
        # Lower-case, dash -> underscore, then map through the cdn table.
        # Falls back to 'utf_8' for unknown names (the original returned
        # None in this case for the header path only — an inconsistency).
        ch = raw.lower().replace('-', '_')
        if ch in cdn:
            return cdn[ch]
        if ch in cdn.values():
            return ch
        return 'utf_8'

    try:
        response = urllib.request.urlopen(url)
        # Content-Type may be absent; guard against findall(None).
        found = pattern.findall(response.info()['Content-Type'] or '')
        if found:
            return _normalize(found[0])
        # No charset in the header: scan at most 15 lines of the body.
        for lineno, line in enumerate(response, start=1):
            found = pattern.findall(str(line))
            if found:
                return _normalize(found[0])
            if lineno >= 15:
                break
        return 'utf_8'
    except Exception:
        # Best-effort by design: report and return None on any failure.
        # Message kept verbatim (Ukrainian: "page download error").
        print('Помилка завантаження сторінки:\n', url)
        return None
def read_text_from_file(filename, enc='utf8'):
    """Read *filename* decoded as *enc*.

    On a decode error, shows a dialog asking the user to pick another
    character encoding and retries recursively with the chosen one.

    :param filename: path of the file to read
    :param enc: encoding to try first (default 'utf8')
    :return: file contents, or "" if the user cancels the dialog
    """
    try:
        # `with` closes the handle deterministically; the original
        # leaked it via open(...).read().
        with open(filename, encoding=enc) as fh:
            return fh.read()
    except UnicodeDecodeError:
        # Renamed from `list`, which shadowed the builtin.
        encodings = sorted(aliases.keys())
        combobox = widget.ComboBox()
        # Offer the common encodings first, then the full alias list.
        for item in macros.major_character_encodings_list:
            if item in encodings:
                combobox.add_item(item)
        for item in encodings:
            combobox.add_item(item)
        combobox.set_active(0)
        dlg = dialog.Dialog(
            _("{} Decode error Select Other Character Encoding".format(enc)),
            (_("Select"), dialog.Dialog.BUTTON_ID_1))
        dlg.add_widget_with_label(combobox, _("Character Encoding : "))
        combobox.grab_focus()
        dlg.show_all()
        response = dlg.run()
        if response == dialog.Dialog.BUTTON_ID_1:
            index = combobox.get_active()
            dlg.destroy()
            # Retry with the user-selected encoding.
            return read_text_from_file(filename, enc=encodings[index])
        dlg.destroy()
        return ""
def read_text_from_file(filename, enc='utf8'):
    """Return the text of *filename* decoded with *enc*.

    If decoding fails, a dialog lets the user choose a different
    character encoding and the read is retried with it.

    :param filename: path of the file to read
    :param enc: encoding to try first (default 'utf8')
    :return: the decoded text, or "" when the dialog is cancelled
    """
    try:
        # Context manager fixes the leaked file handle of the original
        # open(...).read() one-liner.
        with open(filename, encoding=enc) as fh:
            return fh.read()
    except UnicodeDecodeError:
        # `available` replaces the original name `list` (builtin shadowing).
        available = sorted(aliases.keys())
        combobox = widget.ComboBox()
        # Frequently used encodings go to the top of the list.
        for item in macros.major_character_encodings_list:
            if item in available:
                combobox.add_item(item)
        for item in available:
            combobox.add_item(item)
        combobox.set_active(0)
        dlg = dialog.Dialog(
            _("{} Decode error Select Other Character Encoding".format(enc)),
            (_("Select"), dialog.Dialog.BUTTON_ID_1))
        dlg.add_widget_with_label(combobox, _("Character Encoding : "))
        combobox.grab_focus()
        dlg.show_all()
        response = dlg.run()
        if response == dialog.Dialog.BUTTON_ID_1:
            index = combobox.get_active()
            dlg.destroy()
            return read_text_from_file(filename, enc=available[index])
        dlg.destroy()
        return ""
def cod_page(f):
    """Return the character encoding declared in *f*'s Content-Type header.

    The name is lower-cased, dashes become underscores, and the result is
    mapped through the module-level ``cdn`` alias table.

    :param f: an HTTP response object whose ``info()`` mapping holds
              the 'Content-Type' header
    :return: an encoding name; 'utf_8' when no usable charset is found
    """
    # The original pattern ended with the inline flag `(?i)`, which is a
    # hard re.error on Python 3.11+ ("global flags not at the start of
    # the expression"); pass re.IGNORECASE explicitly instead.
    cod_p = re.findall(r'charset=([-\w\d]+)', f.info()['Content-Type'],
                       re.IGNORECASE)
    if not cod_p:
        return 'utf_8'
    ch = cod_p[0].lower().replace('-', '_')
    if ch in cdn:
        return cdn[ch]
    if ch in cdn.values():
        return ch
    # Fall back to utf_8 for unknown names (the original fell off the
    # end and returned None here).
    return 'utf_8'
def get_encodings():
    """Return every known encoding name, normalized and sorted.

    'BIN' is always included; transfer codecs (base64, zlib, ...) are
    filtered out.
    """
    skip = {'BASE64', 'BZ2', 'HEX', 'QUOPRI', 'ROT-13', 'UU', 'ZLIB'}
    # Normalize every alias; drop failures (None) and the excluded set.
    names = {normalize_encoding_name(alias) for alias in aliases}
    names.discard(None)
    names -= skip
    names.add('BIN')
    return sorted(names, key=numeric_sort)
def __init__(self):
    """Set up empty response state plus charset / content-type lookup tables."""
    # Response bookkeeping — populated later by the request machinery.
    self.content_type = None
    self.status = None
    self.response = None
    self.charset = None
    # Squashed alias name (underscores removed) -> dash-separated codec
    # name, e.g. the 'iso8859_1' value becomes 'iso8859-1'.
    table = {}
    for alias in aliases:
        table[alias.replace('_', '')] = aliases[alias].replace('_', '-')
    self.charsets = table
    # Shorthand key -> MIME content type.
    self.types = dict(
        json='application/json',
        xml='application/xml',
        soap='application/soap+xml',
    )
    self.transactions = None
def known_encodings():
    """\
    Render a sorted list of all-known-to-Python character encodings
    (including all known aliases), upper-cased with dashes.
    """
    from encodings.aliases import aliases
    # Union of alias names and canonical codec names.
    names = set(aliases.keys()) | set(aliases.values())
    # The set comprehension dedupes after normalization, replacing the
    # original's sort / re-sort / project `unique` helper chain with a
    # single stdlib pass.
    return sorted({name.upper().replace('_', '-') for name in names})
def parse_header_value(header_value: str) -> str:
    """Decode a (possibly RFC 2047 encoded) email header to a string.

    The original implementation returned after the first decoded chunk,
    silently dropping the rest of a multi-part header; every chunk is
    now decoded and concatenated.  An unknown or missing charset falls
    back to utf-8 with replacement characters.

    :param header_value: raw header value as a string
    :return: parsed, decoded header value
    """
    parts = []
    for value, charset in decode_header(header_value):
        if isinstance(value, bytes):
            encoding = None
            if charset:
                # Accept the charset only if Python knows it — as an
                # alias (keys) or a canonical codec name (values); the
                # original checked keys only, missing e.g. 'utf_8'.
                clean_charset = charset.replace("-", "_").lower()
                if clean_charset in aliases or clean_charset in aliases.values():
                    encoding = clean_charset
            parts.append(value.decode(encoding or "utf-8", errors="replace"))
        else:
            parts.append(value)
    if parts:
        return "".join(parts)
    return str(header_value)
# Print all charsets from encodings.aliases import aliases encs = list(aliases.keys()) # from bin/hex to dec int('11111111', 2) int('aa', 16) # to hex hex(123) # string to hex bytes import codecs codecs.encode(b'a', 'hex_codec') # 61 (0x61 = 97) # from bytes to string b'\x00\x61'.decode('utf_16_be') # a b'\x61\x00'.decode('utf_16_le') # a b'\x00\x01\x30\x20'.decode('utf_32_be') # egyptian symbol U013020 '𓀠' # encodings b'\x02\xd8\x01\xdd'.decode('utf_16_le') # phoenician letter Bet '𐤁' b'\xd8\x02\xdd\x01'.decode('utf_16_be') chr(0x10901) # unicode codepoint b'\x00\x01\x09\x01'.decode('utf_32_be') b'\x01\x09\x01\x00'.decode('utf_32_le')
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA """ import sys import re from encodings.aliases import aliases from splitcode import header, footer charset_pattern = re.compile(r"<meta[^>]*charset=([a-zA-Z\-0-9\"\']*)") available_encodings = set((_.lower() for _ in aliases.keys())) available_encodings |= set((_.lower() for _ in aliases.values())) # for detect invalid positions in UnicodeError message position_interval_pattern = re.compile(r"position ([0-9]*)-([0-9]*)") position_pattern = re.compile(r"position ([0-9]*):") def test_encoding(t, enc, stop_at=None): """ tests a "t" text decoding with enc and returns how many decode errors occured in the whole text """ c = 0 while True: try: t = t.decode(enc) break
def check_if_encoding_exist(encoding):
    """Return True if *encoding* is a known alias or canonical codec name."""
    known = set(aliases) | set(aliases.values())
    return encoding in known
class ReadTXT(Read):
    """.txt Reader: reads a plain-text file line by line for conversion."""

    # required class variables for extensions, interface labels, and description
    EXTENSIONS = ['txt']
    GUI_LABELS = ['Plain Text']
    CLI_LABELS = ['txt']
    DESCRIPTION = 'All files with the ".txt" extension.'

    # UC_PROPS class variable with the base class UC_PROPS added.
    # Each dict describes one user-configurable property in an
    # argparse-like schema (flag, help, default, position for ordering).
    UC_PROPS = Read.UC_PROPS + [{
        'flag': '--indirsubdir',
        'name': '--input-dir-subdir',
        'label': 'Folder and Subfolders',
        'action': 'store',
        'default': None,
        'type': str,
        'help': 'Choose a directory that contains subfolders with files to be converted',
        'var': 'input_dir_subdir',
        'intype': 'dir',
        'position': -1000,
        'required': False
    }, {
        'flag': '--indir',
        'name': '--input-dir',
        'label': 'Single Folder',
        'action': 'store',
        'default': None,
        'type': str,
        'help': 'Choose a directory that contains all files to be converted',
        'var': 'input_dir',
        'intype': 'dir',
        'position': -999,
        'required': False
    }, {
        'flag': '--infile',
        'name': '--input-file',
        'label': 'File',
        'action': 'store',
        'default': None,
        'type': str,
        'help': 'Choose a single file to convert',
        'var': 'input_file',
        'intype': 'file',
        'position': -998,
        'required': False
    }, {
        'flag': '--renc',
        'name': '--r-encoding',
        'label': 'Encoding',
        'action': 'store',
        'default': 'utf8',
        'type': str,
        'help': ('Choose what encoding to use for reading the input file. '
                 'utf8 is the default, which will work for most files.'),
        'var': 'r_encoding',
        # Every Python codec alias is a valid choice in the GUI.
        'gui_choices': sorted(aliases.keys()),
        'position': 4,
        'required': True
    }]
    # sort the UC_PROPS on the position key
    UC_PROPS = sorted(UC_PROPS, key=lambda k: k['position'])

    def __init__(self, *args, **kwargs):
        """Validate the input path and record file metadata and progress info.

        NOTE(review): self.info / self.progress / put_error / prog_wait are
        provided by the Read base class — contract assumed, confirm there.
        """
        super().__init__(*args, **kwargs)
        # Ensure that the input is a file
        if not os.path.isfile(self.info['metadata']['location']):
            self.put_error(self.info['metadata']['location'],
                           "Not a valid file path.")
            return
        self.info['data'] = []
        self.info['metadata'].update({
            'filename': os.path.basename(self.info['metadata']['location']),
            'filepath': self.info['metadata']['location'],
            'size': os.path.getsize(self.info['metadata']['location']),
            'conversion_id': os.path.abspath(self.info['metadata']['location'])
        })
        self.progress.update({
            'filename': self.info['metadata']['filename'],
            'size': self.info['metadata']['size'],
            'conversion_id': os.path.abspath(self.info['metadata']['location'])
        })
        # check that the file is not empty
        if not self.progress['size'] > 0:
            # put_error will put a progress dict in, so don't put another in
            self.put_error(self.info['metadata']['filename'],
                          'File has no content.')
        else:
            # if there's no error, report the progress
            self.prog_wait()

    def read_data(self):
        """Generator to yield lines in file"""
        # open the file using with statement to avoid having to close file
        with open(self.info['metadata']['filepath'], 'r',
                  encoding=self.options['r_encoding']) as file:
            # set state to Reading
            self.progress['state'] = 'Reading'
            # put the progress dict without waiting
            self.prog_nowait()
            # iterate through lines in file
            for index, line in self.file_gen(file):
                # add the line to the info dictionary
                self.info['data'].append(line)
            # set the state to writing
            self.progress['state'] = 'Writing'
            # flush the read-ahead buffer, get position, report progress
            file.flush()
            self.progress['progress'] = file.tell()
            # let it block if it needs to, this message must go through
            self.prog_wait()
            # yield the info dictionary
            # although this seems weird as a generator that only yields once,
            # it's necessary so that the writers work with all readers
            yield self.info
from encodings.aliases import aliases

# Probe every known encoding alias: encode '\nos\n' and print codecs
# whose output starts with the same leading byte but differs overall.
# NOTE(review): this looks like Python 2 code — under Python 3, a[0] is
# an int while s[0] is a str, so a[0]==s[0] can never be true and the
# loop prints nothing; confirm which interpreter this targets.
s='\nos\n'
for x in aliases.keys():
    try:
        a=s.encode(x)
        if a[0]==s[0]and a!=s:
            print(x, a)
    except:
        # Some aliases name transfer codecs that reject str.encode;
        # those (and any other failure) are skipped silently.
        pass
class WriteTXT(Write):
    """.txt Writer: writes converted records out as plain-text files."""

    # required class variables for interface labels and description
    GUI_LABELS = ['Plain Text']
    CLI_LABELS = ['txt', 'text', 'plain_text']
    DESCRIPTION = 'All files with the ".txt" extension.'

    # UC_PROPS class variable with the base class UC_PROPS added.
    # Each dict describes one user-configurable property (flag, help,
    # default, position used for ordering).
    UC_PROPS = Write.UC_PROPS + [
        {
            'flag': '--outdir',
            'name': '--output-directory',
            'label': 'Folder',
            'action': 'store',
            'default': None,
            'type': str,
            'help': 'Choose the directory to write the converted files to',
            'var': 'output_dir',
            'outtype': 'dir',
            'position': -1001,
            'required': False
        },
        {
            'flag': '--filename',
            'name': '--output-filename',
            'label': 'Output Filename',
            'action': 'store',
            'default': None,
            'gui_default': '',
            'type': str,
            'help': 'The name for your output file(s)',
            'var': 'output_filename',
            'position': -1000,
            'required': False
        },
        {
            'flag': '--wenc',
            'name': '--w-encoding',
            'label': 'Encoding',
            'action': 'store',
            'default': 'utf8',
            'type': str,
            'help': 'Choose what encoding to use for writing the output file. '
                    'utf8 is the default, which will work for most files.',
            'var': 'w_encoding',
            # Every Python codec alias is a valid choice in the GUI.
            'gui_choices': sorted(aliases.keys()),
            'position': 1,
            'required': True
        },
        {
            'flag': '--ibl',
            'name': '--ignore-blank-lines',
            'label': 'Ignore Blank Lines',
            'action': 'store_true',
            'default': False,
            'help': 'Choose whether to ignore blank lines when writing file',
            'var': 'ignore_blank_lines',
            'position': 2,
            'required': False
        },
        # {'flag': '--mll',
        #  'name': '--min-line-length',
        #  'label': 'Minimum Line Length',
        #  'action': 'store',
        #  'default': None,
        #  'type': int,
        #  'help': 'Choose a minimum line length where lines greater than this value are ignored',
        #  'var': 'min_line_len',
        #  'position': 4,
        #  'required': False},
        {
            'flag': '--lc',
            'name': '--lowercase',
            'label': 'Lowercase',
            'action': 'store_true',
            'default': False,
            'help': 'Write the output file in all lowercase',
            'var': 'lowercase',
            'position': 3,
            'required': False
        },
        {
            'flag': '--unwrap',
            'name': '--text-unwrap',
            'label': 'Text Unwrapping',
            'action': 'store_true',
            'gui_default': False,
            'default': False,
            'help': 'This option attempts to remove end-of-lines that do not represent ends of sentences, common in word-wrapped documents. '
                    'Enabling can improve sentence boundary detection.',
            'var': 'text_unwrap',
            'position': 4,
            'required': False
        },
        {
            'flag': '--wrap',
            'name': '--text-wrap',
            'label': 'Text Wrapping',
            'action': 'store',
            'gui_default': 40,
            'default': None,
            'type': int,
            'help': 'Wrap text with the specified line length (in characters).',
            'var': 'text_wrap',
            'position': 5,
            'required': False
        },
    ]
    # sort the UC_PROPS on the position key
    UC_PROPS = sorted(UC_PROPS, key=lambda k: k['position'])

    def __init__(self, options, read_file):
        """Derive the output filename and set up optional wrap/unwrap helpers."""
        super().__init__(options, read_file)
        # if the filename was provided, make sure the extension is there
        if self.options['output_filename'] is not None and len(
                self.options['output_filename']):
            if self.options['output_filename'].split('.')[-1] != 'txt':
                self.options['output_filename'] += '.txt'
        # else create the filename from input
        else:
            # get input filename
            filename = self.read_file.info['metadata']['filename']
            # make sure you make the extension .txt
            namelist = filename.split('.')
            namelist[-1] = 'txt'
            # join it back together
            filename = '.'.join(namelist)
            # set the output filename
            self.options['output_filename'] = filename
        self.wrap = None
        if self.options["text_wrap"]:
            import textwrap
            self.wrap = textwrap.TextWrapper(self.options["text_wrap"])
        self.unwrapper = None
        if self.options["text_unwrap"]:
            from ..utils.textunwrapper.unwrapper import RuleBasedUnwrapper
            self.unwrapper = RuleBasedUnwrapper()
        #print(self.unwrapper)

    def write_dir(self):
        """Writes file(s) to a directory"""
        # count files so we can distinguish multiple output files
        count = 1
        # iterate through records yielded by reader generator
        for info in self.read_file.read_data():
            # add numbers to output filename for multiple file output
            name = self.options['output_filename'].split('.')
            name[-2] = '{} ({})'.format(name[-2], count)
            name = '.'.join(name)
            path = os.path.join(self.options['output_dir'], name)
            # make sure you aren't overwriting
            path = self.get_safe_path(path)
            # get buffer size
            buffer = cdc.CONFIG.getint('WRITE', 'OutputBufferSize',
                                       fallback=8192)
            try:
                # open output file for writing
                with open(path, 'w', buffer,
                          encoding=self.options['w_encoding']) as file:
                    # run it through process_data generator and write line-by-line
                    file.write(self.get_document(info))
                # increment count for the next file
                count += 1
            # NOTE(review): bare except (and re-raise) deletes the partial
            # output file on ANY failure, including KeyboardInterrupt —
            # that appears intentional (no partial files), but confirm.
            except:
                os.remove(path)
                raise

    def process_data(self, info):
        """Generator for processing the data with the UC_PROPS"""
        # NOTE(review): despite the docstring, this is a plain function
        # that mutates info['data'] in place and returns info.
        # go through the lines and perform any in-place modifications
        for i, line in enumerate(info['data']):
            # process user-specified properties
            if self.options['lowercase']:
                info['data'][i] = line.lower()
        # Remove blank lines
        if self.options['ignore_blank_lines']:
            info["data"] = [line for line in info["data"] if line.strip()]
        # text wrapping
        if self.wrap:
            info["data"] = self.wrap.wrap(''.join(info['data']))
            info["data"] = [line + "\n" for line in info["data"]
                            ]  # should not need to do this
        if self.unwrapper:
            # do not include the first line
            unwrapped = self.unwrapper.process(''.join(info['data'][1:]))
            info["data"] = [info["data"][0]] + self.unwrapper.render(
                unwrapped, "reflow").splitlines(True)
        return info

    def get_document(self, info):
        # Full output document: processed lines joined into one string.
        document = ''.join(self.process_data(info)['data'])
        return document
# -*- coding: utf-8 -*- from encodings.aliases import aliases import nkf all_encodings = set(aliases.values()) | set(aliases.keys()) def normalize_encoding(encoding): encoding = encoding.lower() if encoding in ('windows-31j', 'shift-jis', 'shift_jis', 'x-sjis', 'sjis'): return 'cp932' return encoding def decode(text, encoding=None, *args): if not encoding or encoding in ('ISO-8859-1', 'iso-8859-1'): encoding = nkf.guess(text) if encoding in ('BINARY', 'ISO-8859-1'): encoding = 'utf8' encoding = normalize_encoding(encoding) if not encoding in all_encodings: return nkf.nkf('-w', text).decode('utf8') return text.decode(encoding, *args)
from encodings.aliases import aliases

# Probe every known encoding alias: encode '\nos\n' and print the codecs
# whose output keeps the same leading byte ('\n') yet differs from the
# UTF-8 encoding overall.
#
# Fix: the original compared a[0] == s[0], which under Python 3 compares
# an int (bytes item) with a str (string item) and is always False, so
# nothing was ever printed; compare bytes with bytes instead.
s = '\nos\n'
for x in aliases.keys():
    try:
        a = s.encode(x)
        if a[:1] == s[:1].encode() and a != s.encode():
            print(x, a)
    except Exception:
        # Some aliases name transfer codecs (base64, hex, ...) that
        # reject str.encode; skip those and any other failure.
        pass