Example #1
import re
import urllib.request

# `cdn` is assumed to be a module-level dict mapping normalized charset
# aliases to Python codec names (see the sketch after this function).


def get_coding(url):
    """Try to detect the character encoding of the page at `url`."""
    pattern = re.compile(r'charset=([-\w\d]+)', re.IGNORECASE)
    try:
        response = urllib.request.urlopen(url)
        # first look for a charset in the Content-Type header
        charsets = pattern.findall(response.info()['Content-Type'])
        if len(charsets) == 0:
            # nothing in the header: scan up to 15 lines of the body,
            # then fall back to UTF-8
            lines_left = 15
            for line in response:
                charsets = pattern.findall(str(line))
                lines_left -= 1
                if lines_left == 0:
                    return 'utf_8'
                if len(charsets) != 0:
                    ch = charsets[0].lower().replace('-', '_')
                    if ch in cdn.keys():
                        return cdn[ch]
                    if ch in cdn.values():
                        return ch
            return 'utf_8'
        else:
            ch = charsets[0].lower().replace('-', '_')
            if ch in cdn.keys():
                return cdn[ch]
            if ch in cdn.values():
                return ch
    except Exception:
        print('Помилка завантаження сторінки:\n', url)  # "error loading page"
        return None
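The `cdn` mapping that get_coding() consults is not part of the snippet. Judging from how it is used (`cdn[ch]` and `cdn.values()`), it maps normalized charset aliases to Python codec names; a minimal, hypothetical sketch and call might look like this:

# Hypothetical stand-in for the `cdn` dict the function expects: lowercase,
# underscore-normalized charset aliases mapped to Python codec names.
cdn = {
    'windows_1251': 'cp1251',
    'koi8_u': 'koi8_u',
    'utf8': 'utf_8',
}

# encoding = get_coding('https://example.com/')  # -> a codec name, or None on error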
Example #2
from encodings.aliases import aliases

# `widget`, `dialog`, `macros` and `_` are assumed to come from the surrounding
# editor/plugin framework; they are not standard-library modules.


def read_text_from_file(filename, enc='utf8'):
    try:
        with open(filename, encoding=enc) as f:
            return f.read()
    except UnicodeDecodeError:
        # decoding failed: let the user pick another encoding from a combobox
        enc_names = sorted(aliases.keys())
        combobox = widget.ComboBox()
        # keep a parallel list of items in the order they are added, so the
        # combobox index maps back to the right encoding name
        items = [item for item in macros.major_character_encodings_list
                 if item in enc_names]
        items += enc_names
        for item in items:
            combobox.add_item(item)
        combobox.set_active(0)

        dlg = dialog.Dialog(
            _("{} Decode error Select Other Character Encoding".format(enc)),
            (_("Select"), dialog.Dialog.BUTTON_ID_1))
        dlg.add_widget_with_label(combobox, _("Character Encoding : "))
        combobox.grab_focus()
        dlg.show_all()
        response = dlg.run()

        if response == dialog.Dialog.BUTTON_ID_1:
            index = combobox.get_active()
            dlg.destroy()
            # retry with the encoding the user picked
            return read_text_from_file(filename, enc=items[index])
        else:
            dlg.destroy()
            return ""
Example #3
import re

# like get_coding() above, this relies on a module-level `cdn` dict that maps
# charset aliases to Python codec names
def cod_page(f):
    # pass re.IGNORECASE as a flag; a trailing inline (?i) in the pattern is an
    # error on Python 3.11+
    cod_p = re.findall(r'charset=([-\w\d]+)', f.info()['Content-Type'],
                       re.IGNORECASE)
    if len(cod_p) == 0:
        return 'utf_8'
    ch = cod_p[0].lower().replace('-', '_')
    if ch in cdn.keys():
        return cdn[ch]
    if ch in cdn.values():
        return ch
Example #4
from encodings.aliases import aliases

# `normalize_encoding_name` and `numeric_sort` are helpers defined elsewhere
# in the same project; hypothetical stand-ins are sketched after this snippet.


def get_encodings():
    """Get list of all encodings."""

    exclude = ('BASE64', 'BZ2', 'HEX', 'QUOPRI', 'ROT-13', 'UU', 'ZLIB')
    elist = set()
    elist.add('BIN')
    for k in aliases.keys():
        value = normalize_encoding_name(k)
        if value is not None and value not in exclude:
            elist.add(value)
    return sorted(elist, key=numeric_sort)
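get_encodings() leans on two project-specific helpers that the snippet does not show. Purely hypothetical stand-ins, just to make the idea concrete:

import re

def normalize_encoding_name(name):
    # hypothetical: uppercase, hyphenated form, e.g. 'iso8859_2' -> 'ISO8859-2',
    # or None for names that should be skipped
    return name.upper().replace('_', '-')

def numeric_sort(name):
    # hypothetical sort key that orders embedded numbers numerically
    return [int(part) if part.isdigit() else part
            for part in re.split(r'(\d+)', name)]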
Example #5
    # assumes `from encodings.aliases import aliases` at module level
    def __init__(self):
        self.content_type = None
        self.status = None
        self.response = None
        self.charset = None
        # squashed alias names mapped to hyphenated codec names,
        # e.g. self.charsets['utf8'] == 'utf-8', self.charsets['latin1'] == 'latin-1'
        self.charsets = {
            key.replace('_', ''): aliases[key].replace('_', '-')
            for key in aliases
        }
        self.types = {
            'json': 'application/json',
            'xml': 'application/xml',
            'soap': 'application/soap+xml'
        }
        self.transactions = None
Example #6
def known_encodings():
    """\
    Render a list of all-known-to-Python character encodings (including
    all known aliases).

    """
    from encodings.aliases import aliases
    # `unique` is assumed to be a project helper that removes duplicate
    # entries from the list (it is not shown in this snippet)
    _raw_encname_list = []
    _raw_encname_list.extend(aliases.keys())
    _raw_encname_list.extend(aliases.values())
    _raw_encname_list.sort()
    _encname_list = []
    for _raw_encname in _raw_encname_list:
        _encname = _raw_encname.upper()
        _encname = _encname.replace('_', '-')
        _encname_list.append(_encname)
    _encname_list.sort()
    _encname_list = unique(_encname_list)
    return _encname_list
Example #7
from email.header import decode_header
from encodings.aliases import aliases


def parse_header_value(header_value: str) -> str:
    """
    Email header to be parsed and decoded to string.

    :param header_value: header value as string
    :return: parsed decoded header value
    """
    for value, charset in decode_header(header_value):
        if charset:
            # Check that the charset is one Python knows about; `aliases`
            # maps alias -> canonical codec name, so test both sides
            clean_charset = charset.replace("-", "_")
            if clean_charset in aliases or clean_charset in aliases.values():
                return str(value, encoding=clean_charset, errors="replace")
        else:
            # Convert bytes to string
            if isinstance(value, bytes):
                return value.decode(errors="replace")

    return str(header_value)
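A small usage sketch for parse_header_value(); the encoded-word below is just an illustration:

subject = '=?utf-8?b?0J/RgNC40LLRltGC?='          # RFC 2047 encoded-word
print(parse_header_value(subject))                # -> 'Привіт'
print(parse_header_value('plain ASCII subject'))  # -> 'plain ASCII subject'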
Example #8
# All charset alias names known to Python
from encodings.aliases import aliases
encs = list(aliases.keys())


# from a bin/hex string to an int
int('11111111', 2)   # 255
int('aa', 16)        # 170

# from an int to a hex string
hex(123)             # '0x7b'


# bytes to their hex representation
import codecs
codecs.encode(b'a', 'hex_codec')  # b'61' (0x61 == 97)


# from bytes to string
b'\x00\x61'.decode('utf_16_be')   # 'a'
b'\x61\x00'.decode('utf_16_le')   # 'a'
b'\x00\x01\x30\x20'.decode('utf_32_be')   # Egyptian hieroglyph U+13020 '𓀠'


# characters outside the BMP: UTF-16 surrogate pairs vs. UTF-32
b'\x02\xd8\x01\xdd'.decode('utf_16_le')   # Phoenician letter Bet '𐤁' (U+10901)
b'\xd8\x02\xdd\x01'.decode('utf_16_be')   # same character, big-endian
chr(0x10901)                              # same character from its code point
b'\x00\x01\x09\x01'.decode('utf_32_be')
b'\x01\x09\x01\x00'.decode('utf_32_le')
Example #9

import sys
import re
from encodings.aliases import aliases

from splitcode import header, footer

charset_pattern = re.compile(r"<meta[^>]*charset=([a-zA-Z\-0-9\"\']*)")
available_encodings = {enc.lower() for enc in aliases.keys()}
available_encodings |= {enc.lower() for enc in aliases.values()}
# for detect invalid positions in UnicodeError message
position_interval_pattern = re.compile(r"position ([0-9]*)-([0-9]*)")
position_pattern = re.compile(r"position ([0-9]*):")

def test_encoding(t, enc, stop_at=None):
    """
    Tries to decode the bytes `t` with the encoding `enc` and returns how many
    decode errors occurred in the whole text.
    """
    c = 0
    while True:
        try:
            t = t.decode(enc)
            break
Example #10
def check_if_encoding_exist(encoding):
    return encoding in aliases.keys() or encoding in aliases.values()
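A quick check of how check_if_encoding_exist() behaves against the stdlib table (it assumes `from encodings.aliases import aliases` at module level):

from encodings.aliases import aliases

print(check_if_encoding_exist('latin1'))  # True  -- 'latin1' is an alias key
print(check_if_encoding_exist('utf_8'))   # True  -- 'utf_8' is a canonical codec name (a value)
print(check_if_encoding_exist('utf-8'))   # False -- hyphenated spellings are not in the table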
Example #11
class ReadTXT(Read):
    """.txt Reader"""

    # required class variables for extensions, interface labels, and description
    EXTENSIONS = ['txt']
    GUI_LABELS = ['Plain Text']
    CLI_LABELS = ['txt']
    DESCRIPTION = 'All files with the ".txt" extension.'

    # UC_PROPS class variable with the base class UC_PROPS added
    UC_PROPS = Read.UC_PROPS + [{
        'flag': '--indirsubdir',
        'name': '--input-dir-subdir',
        'label': 'Folder and Subfolders',
        'action': 'store',
        'default': None,
        'type': str,
        'help':
        'Choose a directory that contains subfolders with files to be converted',
        'var': 'input_dir_subdir',
        'intype': 'dir',
        'position': -1000,
        'required': False
    }, {
        'flag': '--indir',
        'name': '--input-dir',
        'label': 'Single Folder',
        'action': 'store',
        'default': None,
        'type': str,
        'help': 'Choose a directory that contains all files to be converted',
        'var': 'input_dir',
        'intype': 'dir',
        'position': -999,
        'required': False
    }, {
        'flag': '--infile',
        'name': '--input-file',
        'label': 'File',
        'action': 'store',
        'default': None,
        'type': str,
        'help': 'Choose a single file to convert',
        'var': 'input_file',
        'intype': 'file',
        'position': -998,
        'required': False
    }, {
        'flag': '--renc',
        'name': '--r-encoding',
        'label': 'Encoding',
        'action': 'store',
        'default': 'utf8',
        'type': str,
        'help': ('Choose what encoding to use for reading the input file. '
                 'utf8 is the default, which will work for most files.'),
        'var': 'r_encoding',
        'gui_choices': sorted(aliases.keys()),
        'position': 4,
        'required': True
    }]
    # sort the UC_PROPS on the position key
    UC_PROPS = sorted(UC_PROPS, key=lambda k: k['position'])

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Ensure that the input is a file
        if not os.path.isfile(self.info['metadata']['location']):
            self.put_error(self.info['metadata']['location'],
                           "Not a valid file path.")
            return

        self.info['data'] = []
        self.info['metadata'].update({
            'filename': os.path.basename(self.info['metadata']['location']),
            'filepath': self.info['metadata']['location'],
            'size': os.path.getsize(self.info['metadata']['location']),
            'conversion_id': os.path.abspath(self.info['metadata']['location'])
        })
        self.progress.update({
            'filename': self.info['metadata']['filename'],
            'size': self.info['metadata']['size'],
            'conversion_id': os.path.abspath(self.info['metadata']['location'])
        })

        # check that the file is not empty
        if not self.progress['size'] > 0:
            # put_error will put a progress dict in, so don't put another in
            self.put_error(self.info['metadata']['filename'],
                           'File has no content.')
        else:
            # if there's no error, report the progress
            self.prog_wait()

    def read_data(self):
        """Generator to yield lines in file"""
        # open the file using with statement to avoid having to close file
        with open(self.info['metadata']['filepath'],
                  'r',
                  encoding=self.options['r_encoding']) as file:
            # set state to Reading
            self.progress['state'] = 'Reading'
            # put the progress dict without waiting
            self.prog_nowait()
            # iterate through lines in file
            for index, line in self.file_gen(file):
                # add the line to the info dictionary
                self.info['data'].append(line)
            # set the state to writing
            self.progress['state'] = 'Writing'
            # flush the read-ahead buffer, get position, report progress
            file.flush()
            self.progress['progress'] = file.tell()
            # let it block if it needs to, this message must go through
            self.prog_wait()
            # yield the info dictionary
            # although this seems weird as a generator that only yields once,
            # it's necessary so that the writers work with all readers
            yield self.info
Example #12
class WriteTXT(Write):
    """.txt Writer"""
    # required class variables for interface labels and description
    GUI_LABELS = ['Plain Text']
    CLI_LABELS = ['txt', 'text', 'plain_text']
    DESCRIPTION = 'All files with the ".txt" extension.'

    # UC_PROPS class variable with the base class UC_PROPS added
    UC_PROPS = Write.UC_PROPS + [
        {
            'flag': '--outdir',
            'name': '--output-directory',
            'label': 'Folder',
            'action': 'store',
            'default': None,
            'type': str,
            'help': 'Choose the directory to write the converted files to',
            'var': 'output_dir',
            'outtype': 'dir',
            'position': -1001,
            'required': False
        },
        {
            'flag': '--filename',
            'name': '--output-filename',
            'label': 'Output Filename',
            'action': 'store',
            'default': None,
            'gui_default': '',
            'type': str,
            'help': 'The name for your output file(s)',
            'var': 'output_filename',
            'position': -1000,
            'required': False
        },
        {
            'flag': '--wenc',
            'name': '--w-encoding',
            'label': 'Encoding',
            'action': 'store',
            'default': 'utf8',
            'type': str,
            'help':
            'Choose what encoding to use for writing the output file. utf8 is the default, which will work for most files.',
            'var': 'w_encoding',
            'gui_choices': sorted(aliases.keys()),
            'position': 1,
            'required': True
        },
        {
            'flag': '--ibl',
            'name': '--ignore-blank-lines',
            'label': 'Ignore Blank Lines',
            'action': 'store_true',
            'default': False,
            'help': 'Choose whether to ignore blank lines when writing file',
            'var': 'ignore_blank_lines',
            'position': 2,
            'required': False
        },
        # {'flag': '--mll',
        #  'name': '--min-line-length',
        #  'label': 'Minimum Line Length',
        #  'action': 'store',
        #  'default': None,
        #  'type': int,
        #  'help': 'Choose a minimum line length where lines greater than this value are ignored',
        #  'var': 'min_line_len',
        #  'position': 4,
        #  'required': False},
        {
            'flag': '--lc',
            'name': '--lowercase',
            'label': 'Lowercase',
            'action': 'store_true',
            'default': False,
            'help': 'Write the output file in all lowercase',
            'var': 'lowercase',
            'position': 3,
            'required': False
        },
        {
            'flag': '--unwrap',
            'name': '--text-unwrap',
            'label': 'Text Unwrapping',
            'action': 'store_true',
            'gui_default': False,
            'default': False,
            'help':
            'This option attempts to remove end-of-lines that do not represent ends of sentences, common in word-wrapped documents. Enabling can improve sentence boundary detection.',
            'var': 'text_unwrap',
            'position': 4,
            'required': False
        },
        {
            'flag': '--wrap',
            'name': '--text-wrap',
            'label': 'Text Wrapping',
            'action': 'store',
            'gui_default': 40,
            'default': None,
            'type': int,
            'help':
            'Wrap text with the specified line length (in characters).',
            'var': 'text_wrap',
            'position': 5,
            'required': False
        },
    ]

    # sort the UC_PROPS on the position key
    UC_PROPS = sorted(UC_PROPS, key=lambda k: k['position'])

    def __init__(self, options, read_file):
        super().__init__(options, read_file)

        # if the filename was provided, make sure the extension is there
        if self.options['output_filename'] is not None and len(
                self.options['output_filename']):
            if self.options['output_filename'].split('.')[-1] != 'txt':
                self.options['output_filename'] += '.txt'
        # else create the filename from input
        else:
            # get input filename
            filename = self.read_file.info['metadata']['filename']
            # make sure you make the extension .txt
            namelist = filename.split('.')
            namelist[-1] = 'txt'
            # join it back together
            filename = '.'.join(namelist)
            # set the output filename
            self.options['output_filename'] = filename

        self.wrap = None
        if self.options["text_wrap"]:
            import textwrap
            self.wrap = textwrap.TextWrapper(self.options["text_wrap"])

        self.unwrapper = None
        if self.options["text_unwrap"]:
            from ..utils.textunwrapper.unwrapper import RuleBasedUnwrapper
            self.unwrapper = RuleBasedUnwrapper()
            #print(self.unwrapper)

    def write_dir(self):
        """Writes file(s) to a directory"""
        # count files so we can distinguish multiple output files
        count = 1
        # iterate through records yielded by reader generator
        for info in self.read_file.read_data():
            # add numbers to output filename for multiple file output
            name = self.options['output_filename'].split('.')
            name[-2] = '{} ({})'.format(name[-2], count)
            name = '.'.join(name)
            path = os.path.join(self.options['output_dir'], name)

            # make sure you aren't overwriting
            path = self.get_safe_path(path)

            # get buffer size
            buffer = cdc.CONFIG.getint('WRITE',
                                       'OutputBufferSize',
                                       fallback=8192)

            try:
                # open output file for writing
                with open(path,
                          'w',
                          buffer,
                          encoding=self.options['w_encoding']) as file:
                    # run it through process_data generator and write line-by-line
                    file.write(self.get_document(info))

                # increment count for the next file
                count += 1
            except:
                os.remove(path)
                raise

    def process_data(self, info):
        """Generator for processing the data with the UC_PROPS"""

        # go through the lines and perform any in-place modifications
        for i, line in enumerate(info['data']):
            # process user-specified properties
            if self.options['lowercase']:
                info['data'][i] = line.lower()

        # remove blank lines
        if self.options['ignore_blank_lines']:
            info["data"] = [line for line in info["data"] if line.strip()]

        # text wrapping
        if self.wrap:
            info["data"] = self.wrap.wrap(''.join(info['data']))
            info["data"] = [line + "\n" for line in info["data"]
                            ]  # should not need to do this

        if self.unwrapper:
            # do not include the first line
            unwrapped = self.unwrapper.process(''.join(info['data'][1:]))
            info["data"] = [info["data"][0]] + self.unwrapper.render(
                unwrapped, "reflow").splitlines(True)

        return info

    def get_document(self, info):
        document = ''.join(self.process_data(info)['data'])
        return document
Example #13
# -*- coding: utf-8 -*-
from encodings.aliases import aliases
import nkf

all_encodings = set(aliases.values()) | set(aliases.keys())


def normalize_encoding(encoding):
    encoding = encoding.lower()
    if encoding in ('windows-31j', 'shift-jis', 'shift_jis', 'x-sjis', 'sjis'):
        return 'cp932'
    return encoding


def decode(text, encoding=None, *args):
    if not encoding or encoding in ('ISO-8859-1', 'iso-8859-1'):
        encoding = nkf.guess(text)
        if encoding in ('BINARY', 'ISO-8859-1'):
            encoding = 'utf8'
    encoding = normalize_encoding(encoding)
    if encoding not in all_encodings:
        return nkf.nkf('-w', text).decode('utf8')
    return text.decode(encoding, *args)
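A hedged usage sketch, assuming the third-party `nkf` module imported above is installed:

raw = 'こんにちは'.encode('cp932')
print(decode(raw))               # encoding guessed by nkf, then decoded
print(decode(raw, 'Shift_JIS'))  # normalize_encoding() maps shift-jis spellings to cp932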
Example #14
File: a.py Project: est/snippets
# Written for Python 2, where str.encode() returned a byte string comparable
# to `s` itself; the comparisons below are ported so they work on Python 3.
from encodings.aliases import aliases

s = '\nos\n'
raw = s.encode('ascii')
for x in aliases.keys():
    try:
        a = s.encode(x)
        # keep encodings whose output starts with the same byte as plain ASCII
        # but differs from it somewhere else
        if a[:1] == raw[:1] and a != raw:
            print(x, a)
    except Exception:
        pass