Esempio n. 1
0
def _known_charset(name):
    """Map a raw charset label onto a codec name using ``cdn``.

    The label is lower-cased and its hyphens are turned into underscores
    before the lookup. Returns the mapped codec name, the normalised label
    itself when it already is one of the ``cdn`` values, or None when the
    label is unknown.
    """
    ch = name.lower().replace('-', '_')
    if ch in cdn:
        return cdn[ch]
    if ch in cdn.values():
        return ch
    return None


def get_coding(url):
    """Detect the character encoding announced by the page at *url*.

    The ``Content-Type`` response header is inspected first. When it
    carries no ``charset=`` parameter, the first 15 lines of the body are
    scanned for one (e.g. inside a ``<meta>`` tag).

    Parameters
    ----------
    url : str
        Address of the page to open with ``urllib.request.urlopen``.

    Returns
    -------
    str or None
        The codec name; ``'utf_8'`` when no known charset is announced in
        the header-less case; None when the header names a charset that
        is not known to ``cdn`` or when the page cannot be loaded (an
        error message is printed in the latter case).
    """
    patern = re.compile(r'charset=([-\w\d]+)', re.IGNORECASE)
    max_scan_lines = 15
    try:
        # The context manager guarantees the connection is closed.
        with urllib.request.urlopen(url) as response:
            # A missing header yields None; treat it as "no charset given".
            header = response.info()['Content-Type'] or ''
            chrset = patern.findall(header)
            if chrset:
                return _known_charset(chrset[0])

            for index, line in enumerate(response):
                if index >= max_scan_lines:
                    break
                # Body lines arrive as bytes; latin-1 never fails to decode
                # and leaves the ASCII "charset=..." text untouched.
                if isinstance(line, bytes):
                    text = line.decode('latin-1')
                else:
                    text = str(line)
                chrset = patern.findall(text)
                if chrset:
                    codec = _known_charset(chrset[0])
                    if codec is not None:
                        return codec
            return 'utf_8'
    except Exception:
        print('Помилка завантаження сторінки:\n', url)
        return None
Esempio n. 2
0
def get_available_charsets():
    """Return ``(name, name)`` pairs for every distinct codec in ``aliases``.

    Codec names have their underscores replaced by hyphens, are
    de-duplicated and sorted alphabetically.
    """
    names = sorted({codec.replace('_', '-') for codec in aliases.values()})
    return [(name, name) for name in names]
Esempio n. 3
0
def cod_page(f):
    """Return the codec announced in the ``Content-Type`` header of *f*.

    Parameters
    ----------
    f : response-like object
        Must provide ``info()`` returning a mapping with a
        ``'Content-Type'`` entry (as ``urllib`` responses do).

    Returns
    -------
    str or None
        ``'utf_8'`` when the header is missing or has no ``charset=``
        parameter; the codec name resolved through ``cdn`` when the
        charset is known; None when the charset is not known to ``cdn``.
    """
    # A missing header is reported as None; treat it as an empty string.
    content_type = f.info()['Content-Type'] or ''
    # The flag is passed as an argument: an inline "(?i)" that is not at the
    # very start of the pattern is an error on Python 3.11+.
    cod_p = re.findall(r'charset=([-\w\d]+)', content_type, re.IGNORECASE)
    if not cod_p:
        return 'utf_8'
    ch = cod_p[0].lower().replace('-', '_')
    if ch in cdn:
        return cdn[ch]
    if ch in cdn.values():
        return ch
    return None
Esempio n. 4
0
    def get_encodings(self, filepath):
        """Print every codec with which pandas can read a CSV file.

        Each distinct codec name from ``encodings.aliases`` is tried in
        alphabetical order; for every codec that lets ``pd.read_csv``
        succeed a line ``successful <codec>`` is printed. Codecs that fail
        for any reason are skipped silently.

        Parameters
        ----------
        filepath: string
            Path to the file to analyse.
        """
        from encodings.aliases import aliases

        # Sorted so the output order is the same on every run.
        for encoding in sorted(set(aliases.values())):
            try:
                pd.read_csv(filepath, encoding=encoding)
            except Exception:
                # Wrong or non-text codec for this file: try the next one.
                continue
            print('successful', encoding)
def write_encodings(filename, line_number, final_encoding):
    """Decode one line of a file with every known codec and save the results.

    The line is read as raw bytes, decoded with each codec from the union
    of the predefined ``encs`` set and ``encodings.aliases``, and every
    successful decoding is written to ``<filename>.encodings`` as
    ``<codec name padded to 20 columns><decoded line>``.

    Parameters
    ----------
    filename : str
        Path of the file to inspect.
    line_number : int
        1-based number of the line to decode.
    final_encoding : str
        Encoding of the output file. On Python 2 each decoded line is
        encoded to it before being written; on Python 3 the output file is
        opened with it.

    Raises
    ------
    SystemExit
        With status 1 when the requested line cannot be read.
    LookupError
        On Python 3, when ``final_encoding`` is not a known codec name.
    """
    # To ensure that we cover as many encodings as possible, we take the
    # union between our predefined encoding set and the set of the values
    # from encodings.aliases.aliases.
    encodings = encs.union(set(aliases.values()))

    data = dict()

    # Read the requested line from the file as bytes.
    try:
        with io.open(filename, "rb") as f:
            lines = f.readlines()
            line = lines[line_number - 1]
            print("\nProcessing line number: " + str(line_number))
            if len(line) < 3:
                print("!!!Warning!!!: Possible empty line.")
            print("")
    except Exception as err:
        print("Error reading " + filename)
        print(err)
        sys.exit(1)

    # Decode it using every possible encoding.
    for enc in encodings:
        try:
            data[enc] = line.decode(enc)
        except Exception:
            print("Cannot decode using " + enc)

    # The results go to a new text file named after the input file plus an
    # '.encodings' extension.
    fpath = os.path.abspath(filename)
    newfilename = fpath + '.encodings'
    print("\nWriting successfully tested encodings in " + newfilename)

    is_py2 = sys.version_info[0] < 3
    if is_py2:
        # Python 2 files take byte strings; lines are encoded by hand below.
        out = open(newfilename, 'w')
    else:
        # Let the file object perform the encoding to final_encoding.
        out = io.open(newfilename, 'w', encoding=final_encoding)

    with out:
        c = 0
        for enc in sorted(data.keys()):
            try:
                if is_py2:
                    text = data[enc].encode(final_encoding)
                else:
                    text = data[enc]
                # One write per record, so a failed encoding cannot leave a
                # codec name without its line. "\n" is used because text
                # mode already translates it to the platform line ending.
                out.write("%-20s%s\n" % (enc, text))
                c += 1
            except Exception:
                print("Cannot encode " + enc + " to " + str(final_encoding))

    print("\n" + str(c) + " out of " + str(len(encodings)) +
          " tested encodings were written.\n")
Esempio n. 6
0
def get_encodings_list():
    """
    Return the sorted list of distinct codec names known to Python.

    The names are the values of ``encodings.aliases.aliases``. If they
    cannot be obtained, a short hard-coded list of common encodings is
    returned instead.
    """
    try:
        # A set removes duplicates in linear time.
        return sorted(set(aliases.values()))
    except Exception:
        return ['UTF-8', 'UTF-16', 'CP1251', 'CP866', 'KOI8-R']
Esempio n. 7
0
def find_encoding(file_path):
    """Return the codecs with which pandas can read the CSV at *file_path*.

    Every distinct codec name from ``encodings.aliases`` is tried in
    alphabetical order; the ones for which ``pd.read_csv`` succeeds are
    collected. Requires Python 3.6 or higher.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    list of str
        Names of the codecs that could read the file (possibly empty).
    """
    from encodings.aliases import aliases

    encodings = []
    # Sorted so the result order is the same on every run.
    for alias in sorted(set(aliases.values())):
        try:
            pd.read_csv(str(file_path), encoding=alias)
        except Exception:
            # Wrong or non-text codec for this file: try the next one.
            continue
        encodings.append(alias)
    return encodings
Esempio n. 8
0
    def train(self):
        '''
        Load the CSV at ``self.path``, run the training pipeline and predict.

        The dataset is read with a codec taken from ``encodings.aliases``.
        The remaining steps call this object's own helpers (cleaning, fold
        creation, scaling, train/validation split, model scoring) and
        prompt on stdin for an encoder name and for a file path.

        Returns a "[RESULT] ..." string on success. If any step raises,
        the error is printed and the method returns None.
        '''
        # NOTE(review): the except clause at the bottom breaks out of the
        # loop and the success path returns, so only the first codec yielded
        # by the set is ever tried -- confirm whether ``continue`` was meant.
        for encoding in set(aliases.values()):
            try:
                dataframe = pd.read_csv(self.path, encoding=encoding)
                print(f"[SUCCESS] Dataset Loaded Successfully")
                self.display_option(dataframe)
                # The type of the first label value selects the branch below.
                label_type = dataframe[self.label][0]
                self.data_clean(df=dataframe)  # Data cleaning null columns
                if self.fold == True:
                    df = self.create_folds(df=dataframe)
                else:
                    df = self.with_out_kfold(df=dataframe)

                if (isinstance(label_type, np.int64)):
                    print("True")
                    # Integer labels: the feature columns are used as they
                    # are; scale_data is not called in this branch.
                    std_df = df.drop([self.label], axis=1)
                else:
                    data_input = input(
                        f"[INPUT] ENTER THE NAME OF ENCODER BECAUSE LABEL DTYPE IS OBJECT : "
                    )
                    split_df, inv_label = self.split_data(df, data_input)
                    X = split_df.drop([self.label], axis=1)
                    print("False")
                    std_df = self.scale_data(X)
                # Label column first, followed by the feature columns.
                new_scaled_data = pd.concat([df[self.label], std_df], axis=1)
                X_train, X_test, y_train, y_test = self.train_val_data(
                    new_scaled_data)
                scores = self.model_(X_train, X_test, y_train, y_test)
                self.display_option(
                    scores.sort_values(by="best_score",
                                       ascending=False,
                                       ignore_index=True))
                filename = input(f"[INPUT] ENTER THE PATH : ")
                # The existence check only reports; load_datafile is called
                # either way.
                if os.path.exists(filename):
                    print(f"[INFO] LOADED SUCCESSFULLY")
                else:
                    print(f"[ERROR] FILE NOT FOUND")
                pred = self.load_datafile(
                    filename, new_scaled_data.drop([self.label], axis=1))
                # NOTE(review): inv_label is assigned only in the non-integer
                # branch above; with integer labels this line raises
                # NameError, which the handler below prints -- confirm.
                return f"[RESULT] predicted label : {inv_label[pred]}"
            except Exception as e:
                print(f"[ERROR] {e}")
                break
def ReadCsvFiles(files, delimiter=',', merge=False):
    '''
    Read one or more CSV files into pandas data frames.

    Each file is first read with pandas' default encoding. Files that fail
    are retried with every codec listed in ``encodings.aliases`` until one
    of them succeeds.

    INPUT:
    files: a file name, or a list/tuple of file names
    delimiter: column separator passed to ``pd.read_csv``
    merge: currently unused; kept so existing callers keep working

    OUTPUT:
    allsucc: True when every file could be loaded, False otherwise
    dfs: dict mapping each file name to its data frame (None when the
         file could not be loaded with any codec)

    RAISES:
    ValueError: if ``files`` is None or an empty list/tuple
    '''
    if files is None:
        raise ValueError('Fnc "ReadCsvFiles": files is None or empty')
    if not isinstance(files, (list, tuple)):
        files = [files]
    if len(files) == 0:
        raise ValueError('Fnc "ReadCsvFiles": files is None or empty')
    dfs = {}
    PrintLine('Start reading files')
    notworked = []
    for file in files:
        try:
            curdf = pd.read_csv(file, delimiter=delimiter)
            dfs[file] = curdf
            print('Dataframe loaded from {}: shape = {}'.format(
                file, curdf.shape))
        except Exception as e:
            print('Could not read file ', file, ': ', str(e))
            notworked.append(file)
            dfs[file] = None
    log = 'Reading files successfully finished'

    if notworked:
        # Built once; the candidate codecs are the same for every file.
        candidate_encodings = set(aliases.values())
        for file in notworked:
            print('Trying to load file with encodings: ', file)
            for encoding in candidate_encodings:
                try:
                    # Keep the caller's delimiter on the retry as well.
                    curdf = pd.read_csv(file,
                                        delimiter=delimiter,
                                        encoding=encoding)
                except Exception:
                    # Wrong or non-text codec: try the next candidate.
                    continue
                dfs[file] = curdf
                print('Encoding found to load file: ', encoding)
                break

    allsucc = True
    # items() yields (file name, frame); a None frame marks a failed file.
    for key, val in dfs.items():
        if val is None:
            print('File could not be loaded: ', key)
            allsucc = False
    PrintLine(log)
    return allsucc, dfs
Esempio n. 10
0
def known_encodings():
    """Return the character encoding names known to Python, aliases included.

    Every key and value of ``encodings.aliases.aliases`` is upper-cased,
    has its underscores replaced by hyphens, and the sorted result is
    passed through ``unique`` before being returned.
    """
    from encodings.aliases import aliases
    raw_names = list(aliases.keys()) + list(aliases.values())
    raw_names.sort()
    display_names = [name.upper().replace('_', '-') for name in raw_names]
    display_names.sort()
    return unique(display_names)
Esempio n. 11
0
def main():
    '''
    Try to read the input file with every available codec and report, for
    each codec that can decode it, which of the expected terms appear in
    the decoded text.
    '''
    logging.basicConfig(level=logging.INFO,
                        format='[%(levelname)8s]: %(message)s')
    source_path = ARGS.infile
    wanted = ARGS.exp
    for codec in list(set(aliases.values())):
        try:
            with open(source_path, 'r', encoding=codec) as handle:
                try:
                    text = handle.read()
                    hits, misses = [], []
                    for term in wanted:
                        (hits if term in text else misses).append(term)

                    if not wanted:
                        logging.info(
                            '%s: readable. Use expected terms (--exp) to narrow results.',
                            codec)
                    else:
                        if misses:
                            logging.debug('%s: Missed %s', codec, misses)
                        if hits:
                            logging.info('%s: Found %s', codec, hits)

                except UnicodeError as problem:
                    # The file cannot be decoded with this codec.
                    logging.debug('%s: %s', codec, type(problem).__name__)

        except LookupError as problem:
            # The name does not refer to a usable text codec.
            logging.debug('%s: %s', codec, type(problem).__name__)
Esempio n. 12
0
 def _get_encoding(self, cr, user, context=None):
     result = [(x, x.replace('_', '-')) for x in set(aliases.values())]
     result.sort()
     return result
Esempio n. 13
0
    def _initUI(self):
        """Build the dialog's widgets and arrange them in a grid layout.

        From top to bottom the grid holds: the file chooser, the encoding
        selector, the header checkbox, the delimiter selector, the
        preview / column-type tabs, the status bar and the Load / Cancel
        button box.
        """
        self.setModal(self._modal)
        self.setWindowTitle(self._windowTitle)

        layout = QtGui.QGridLayout()

        # Row 0: label, editable file name and a tool button that triggers
        # self._openFile.
        self._filenameLabel = QtGui.QLabel('Choose File', self)
        self._filenameLineEdit = QtGui.QLineEdit(self)
        self._filenameLineEdit.textEdited.connect(self._updateFilename)
        chooseFileButtonIcon = QtGui.QIcon(
            QtGui.QPixmap(':/icons/document-open.png'))
        self._chooseFileAction = QtGui.QAction(self)
        self._chooseFileAction.setIcon(chooseFileButtonIcon)
        self._chooseFileAction.triggered.connect(self._openFile)

        self._chooseFileButton = QtGui.QToolButton(self)
        self._chooseFileButton.setDefaultAction(self._chooseFileAction)

        layout.addWidget(self._filenameLabel, 0, 0)
        layout.addWidget(self._filenameLineEdit, 0, 1, 1, 2)
        layout.addWidget(self._chooseFileButton, 0, 3)

        # Row 1: combo box listing the distinct codec names from _encodings,
        # upper-cased and sorted.
        self._encodingLabel = QtGui.QLabel('File Encoding', self)

        encoding_names = list(
            [x.upper() for x in sorted(list(set(_encodings.values())))])
        self._encodingComboBox = QtGui.QComboBox(self)
        self._encodingComboBox.addItems(encoding_names)
        self._encodingComboBox.activated.connect(self._updateEncoding)

        layout.addWidget(self._encodingLabel, 1, 0)
        layout.addWidget(self._encodingComboBox, 1, 1, 1, 1)

        # Row 2: checkbox stating whether the file has a header row.
        self._hasHeaderLabel = QtGui.QLabel('Header Available?', self)
        self._headerCheckBox = QtGui.QCheckBox(self)
        self._headerCheckBox.toggled.connect(self._updateHeader)

        layout.addWidget(self._hasHeaderLabel, 2, 0)
        layout.addWidget(self._headerCheckBox, 2, 1)

        # Row 3: delimiter selector; its current selection becomes the
        # initial value of self._delimiter.
        self._delimiterLabel = QtGui.QLabel('Column Delimiter', self)
        self._delimiterBox = DelimiterSelectionWidget(self)
        self._delimiter = self._delimiterBox.currentSelected()
        self._delimiterBox.delimiter.connect(self._updateDelimiter)

        layout.addWidget(self._delimiterLabel, 3, 0)
        layout.addWidget(self._delimiterBox, 3, 1, 1, 3)

        # Rows 4-6: tab widget with a preview table and a table for changing
        # column types.
        self._tabWidget = QtGui.QTabWidget(self)
        self._previewTableView = QtGui.QTableView(self)
        self._datatypeTableView = QtGui.QTableView(self)
        self._tabWidget.addTab(self._previewTableView, 'Preview')
        self._tabWidget.addTab(self._datatypeTableView, 'Change Column Types')
        layout.addWidget(self._tabWidget, 4, 0, 3, 4)

        # Column 1 of the type table is edited through DtypeComboDelegate.
        self._datatypeTableView.horizontalHeader().setDefaultSectionSize(200)
        self._datatypeTableView.setItemDelegateForColumn(
            1, DtypeComboDelegate(self._datatypeTableView))

        self._loadButton = QtGui.QPushButton('Load Data', self)
        #self.loadButton.setAutoDefault(False)

        self._cancelButton = QtGui.QPushButton('Cancel', self)
        # self.cancelButton.setDefault(False)
        # self.cancelButton.setAutoDefault(True)

        # Row 9: Load has the accept role and Cancel the reject role; the
        # box's signals are forwarded to self.accepted / self.rejected.
        self._buttonBox = QtGui.QDialogButtonBox(self)
        self._buttonBox.addButton(self._loadButton,
                                  QtGui.QDialogButtonBox.AcceptRole)
        self._buttonBox.addButton(self._cancelButton,
                                  QtGui.QDialogButtonBox.RejectRole)
        self._buttonBox.accepted.connect(self.accepted)
        self._buttonBox.rejected.connect(self.rejected)
        layout.addWidget(self._buttonBox, 9, 2, 1, 2)
        self._loadButton.setDefault(False)
        # Keyboard focus starts in the file name field.
        self._filenameLineEdit.setFocus()

        # Row 8: status bar without a size grip.
        self._statusBar = QtGui.QStatusBar(self)
        self._statusBar.setSizeGripEnabled(False)
        # Checked only after the toggled signal has been connected above.
        self._headerCheckBox.setChecked(True)
        layout.addWidget(self._statusBar, 8, 0, 1, 4)
        self.setLayout(layout)
Esempio n. 14
0
    def _initUI(self):
        """Build the export dialog's widgets and arrange them in a grid layout.

        From top to bottom the grid holds: the output file chooser, the
        encoding selector, the header checkbox, the delimiter selector,
        the status bar and the Export / Cancel button box.
        """
        self.setModal(self._modal)
        self.setWindowTitle(self._windowTitle)

        layout = QtGui.QGridLayout()

        # Row 0: label, editable file name and a tool button that triggers
        # self._createFile.
        self._filenameLabel = QtGui.QLabel('Output File', self)
        self._filenameLineEdit = QtGui.QLineEdit(self)
        chooseFileButtonIcon = QtGui.QIcon(
            QtGui.QPixmap(':/icons/document-save-as.png'))
        self._chooseFileAction = QtGui.QAction(self)
        self._chooseFileAction.setIcon(chooseFileButtonIcon)
        self._chooseFileAction.triggered.connect(self._createFile)

        self._chooseFileButton = QtGui.QToolButton(self)
        self._chooseFileButton.setDefaultAction(self._chooseFileAction)

        layout.addWidget(self._filenameLabel, 0, 0)
        layout.addWidget(self._filenameLineEdit, 0, 1, 1, 2)
        layout.addWidget(self._chooseFileButton, 0, 3)

        # Row 1: combo box listing the distinct codec names from _encodings,
        # upper-cased and sorted.
        self._encodingLabel = QtGui.QLabel('File Encoding', self)

        encoding_names = list(
            map(lambda x: x.upper(), sorted(list(set(_encodings.values())))))

        self._encodingComboBox = QtGui.QComboBox(self)
        self._encodingComboBox.addItems(encoding_names)
        # Preselect UTF_8; index() raises ValueError if that name is absent.
        self._idx = encoding_names.index('UTF_8')
        self._encodingComboBox.setCurrentIndex(self._idx)
        #self._encodingComboBox.activated.connect(self._updateEncoding)

        layout.addWidget(self._encodingLabel, 1, 0)
        layout.addWidget(self._encodingComboBox, 1, 1, 1, 1)

        # Row 2: header checkbox (no signal is connected here).
        self._hasHeaderLabel = QtGui.QLabel('Header Available?', self)
        self._headerCheckBox = QtGui.QCheckBox(self)
        #self._headerCheckBox.toggled.connect(self._updateHeader)

        layout.addWidget(self._hasHeaderLabel, 2, 0)
        layout.addWidget(self._headerCheckBox, 2, 1)

        # Row 3: delimiter selector.
        self._delimiterLabel = QtGui.QLabel('Column Delimiter', self)
        self._delimiterBox = DelimiterSelectionWidget(self)

        layout.addWidget(self._delimiterLabel, 3, 0)
        layout.addWidget(self._delimiterBox, 3, 1, 1, 3)

        self._exportButton = QtGui.QPushButton('Export Data', self)
        self._cancelButton = QtGui.QPushButton('Cancel', self)

        # Row 5: Export has the accept role and Cancel the reject role; the
        # box's signals are forwarded to self.accepted / self.rejected.
        self._buttonBox = QtGui.QDialogButtonBox(self)
        self._buttonBox.addButton(self._exportButton,
                                  QtGui.QDialogButtonBox.AcceptRole)
        self._buttonBox.addButton(self._cancelButton,
                                  QtGui.QDialogButtonBox.RejectRole)

        self._buttonBox.accepted.connect(self.accepted)
        self._buttonBox.rejected.connect(self.rejected)

        layout.addWidget(self._buttonBox, 5, 2, 1, 2)
        self._exportButton.setDefault(False)
        # Keyboard focus starts in the file name field.
        self._filenameLineEdit.setFocus()

        # Row 4: status bar without a size grip.
        self._statusBar = QtGui.QStatusBar(self)
        self._statusBar.setSizeGripEnabled(False)
        layout.addWidget(self._statusBar, 4, 0, 1, 4)
        self.setLayout(layout)
Esempio n. 15
0
# Words listed as "secondary" Unicode range keywords.
UNICODE_SECONDARY_RANGE_KEYWORD = [
    'Supplement',
    'Extended',
    'Extensions',
    'Modifier',
    'Marks',
    'Punctuation',
    'Symbols',
    'Forms',
    'Operators',
    'Miscellaneous',
    'Drawing',
    'Block',
    'Shapes',
    'Supplemental',
    'Tags',
]  # type: List[str]

# Matches an "encoding" / "charset" / "coding" declaration and captures the
# declared name (optionally quoted).
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
    IGNORECASE)

# Sorted distinct codec names from ``aliases``, leaving out the "*_codec"
# entries and three explicitly excluded names.
IANA_SUPPORTED = sorted(
    codec for codec in set(aliases.values())
    if not codec.endswith("_codec")
    and codec not in {"rot_13", "tactis", "mbcs"}
)  # type: List[str]

IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED)  # type: int

# pre-computed code page that are similar using the function cp_similarity.
IANA_SUPPORTED_SIMILAR = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
Esempio n. 16
0
# Run this code cell to install and import the pycountry library
#!pip install pycountry
from pycountry import countries
import pandas as pd

# Run this code cell to see an example of how the library works
countries.get(name='Spain')

# Run this code cell to see how you can also look up countries without specifying the key
countries.lookup('Kingdom of Spain')

#encoding
from encodings.aliases import aliases

# Distinct codec names known to Python.
alias_values = set(aliases.values())

# Find the encodings that work for the file: try each codec in turn and
# report the ones pandas can read the file with.
for encoding in alias_values:
    try:
        df = pd.read_csv("mystery.csv", encoding=encoding)
        print('successful', encoding)
    except Exception:
        # Wrong or non-text codec (or unreadable file): try the next one.
        continue

# Fill null values.
# Each missing GDP value is replaced by the mean GDP of the rows that share
# the same 'Country Name'; the result is stored in a new 'GDP_filled' column.

df_melt = pd.read_csv('gdp_data.csv')
df_melt['GDP_filled'] = df_melt.groupby('Country Name')['GDP'].transform(
    lambda x: x.fillna(x.mean()))
Esempio n. 17
0
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
"""

import sys
import re
from encodings.aliases import aliases

from splitcode import header, footer

# Captures the charset value declared inside a <meta ...> tag.
charset_pattern = re.compile(r"<meta[^>]*charset=([a-zA-Z\-0-9\"\']*)")
# Lower-cased alias names and codec names from ``aliases``.
available_encodings = {name.lower() for name in aliases}
available_encodings.update(name.lower() for name in aliases.values())
# Used to detect the invalid positions reported in a UnicodeError message.
position_interval_pattern = re.compile(r"position ([0-9]*)-([0-9]*)")
position_pattern = re.compile(r"position ([0-9]*):")

def test_encoding(t, enc, stop_at=None):
    """
    tests a "t" text decoding with enc and returns how many decode errors
    occured in the whole text
    """
    c = 0
    while True:
        try:
            t = t.decode(enc)
            break
        except LookupError:
Esempio n. 18
0
    'for', 'from',
    'have', 'he', 'her', 'him', 'his', 'has',
    'i', 'if', 'in', 'is', 'it',
    'just',
    'like',
    'man', 'may', 'more', 'most', 'my',
    'no', 'not', 'now',
    'of', 'on', 'only', 'or', 'out', 'over',
    'say', 'see', 'she', 'should', "shouldn't", 'so',
    'than', 'that', 'the', 'then', 'there', 'they', 'this', 'to',
    'was', 'way', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'would', 'wouldn', "won't",
    'you'
}

DEFAULT_ENCODING = 'utf-8'
# Distinct codec names from ``aliases`` in alphabetical order.
SUPPORTED_ENCODINGS = sorted(set(aliases.values()))

# Precompiled expressions.
# Whole string consists of upper-case ASCII letters.
EXPR_UPPERCASE = re.compile('^[A-Z]+$')
# Whole string consists of upper-case ASCII letters and digits.
EXPR_CAPITALS = re.compile('^[A-Z0-9]+$')
# A run of ASCII letters and digits.
EXPR_PHRASE = re.compile('[A-Za-z0-9]+')
# A leading run without capitals/digits, or a run of capitals/digits
# followed by any characters that are neither.
EXPR_WORD = re.compile('^[^A-Z0-9]+|[A-Z0-9]+[^A-Z0-9]*')

logger = logging.getLogger(__name__)


class HTMLStripper(HTMLParser):
    """HTMLParser subclass that starts out with an empty ``_raw`` list.

    NOTE(review): ``_raw`` presumably collects text fragments seen while
    parsing -- confirm against the handler methods.
    """

    def __init__(self):
        """Initialise the base parser, reset it and create ``_raw``."""
        HTMLParser.__init__(self)
        self.reset()
        self._raw = []
Esempio n. 19
0
def print_encodings():
    """Print, one per line and sorted, every codec name not ending in ``_codec``."""
    text_codecs = [
        name for name in set(aliases.values()) if not name.endswith("_codec")
    ]
    text_codecs.sort()
    for name in text_codecs:
        print(name)
#!/usr/bin/env python3

from sys import argv, stderr, stdout, exit
import argparse
from encodings.aliases import aliases
from codecs import encode as cencode
from pathlib import Path

# Codecs to try: every distinct codec name except rot_13 and the ones whose
# name starts with "base". Built in one pass, because removing items from a
# list while iterating over it skips the element after each removal.
ENCODINGS = [
    name for name in set(aliases.values())
    if name != 'rot_13' and not name.startswith('base')
]


def encode(s, encoding='utf-16'):
    """Encode *s* with *encoding* and percent-encode every resulting byte.

    Parameters
    ----------
    s : str
        Text to encode.
    encoding : str
        Codec name understood by ``codecs.encode`` (UTF-16 by default).

    Returns
    -------
    str or None
        The bytes of the encoded text written as ``%xx`` pairs
        (lower-case hex), or None when the codec is unknown or the
        encoding fails. Failures other than an unknown codec are reported
        on stderr.
    """
    try:
        try:
            raw = cencode(s, encoding)
        except TypeError:
            # Bytes-to-bytes codecs (hex, zlib, ...) reject str input;
            # retry with the UTF-8 bytes of the text.
            if not isinstance(s, str):
                raise
            raw = cencode(s.encode('utf-8'), encoding)
        return ''.join(['%{:0>2x}'.format(b) for b in raw])
    except LookupError:
        # Unknown codec name.
        return None
    except Exception as e:
        print(f'[+] Failed encoding for: {encoding}', file=stderr)
        print(f'Error Message: {e}', file=stderr)
        return None
Esempio n. 21
0
from encodings.aliases import aliases

encoding_list = list(set(aliases.values()))


def main():
    """Print every readable re-decoding of a garbled string.

    The text typed by the user is encoded with each codec in
    ``encoding_list`` and the resulting bytes are decoded with each codec
    again; every combination that works is printed as
    ``<encode codec>-><decode codec>:<text>``.
    """
    src_text = input("需要恢复的字符串:")
    for item_i in encoding_list:
        # The encoded bytes do not depend on the decoding codec, so they are
        # computed once per outer iteration instead of once per pair.
        try:
            raw = src_text.encode(encoding=item_i)
        except Exception:
            # The text cannot be represented in this codec: no pair with it
            # can succeed.
            continue
        for item_j in encoding_list:
            try:
                guess_text = raw.decode(encoding=item_j)
                print(f'{item_i}->{item_j}:{guess_text}')
            except Exception:
                # Undecodable (or unprintable) combination: skip it.
                continue


if __name__ == '__main__':
    main()
Esempio n. 22
0
def check_if_encoding_exist(encoding):
    """Return True when *encoding* is an alias or a codec name in ``aliases``."""
    if encoding in aliases:
        return True
    return encoding in aliases.values()
Esempio n. 23
0
    "Block",
    "Shapes",
    "Supplemental",
    "Tags",
]  # type: List[str]

# Matches an "encoding" / "charset" / "coding" declaration and captures the
# declared name (optionally quoted).
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
    IGNORECASE,
)

# Sorted distinct codec names from ``aliases``, leaving out the "*_codec"
# entries and three explicitly excluded names.
IANA_SUPPORTED = sorted(
    codec for codec in set(aliases.values())
    if not codec.endswith("_codec")
    and codec not in {"rot_13", "tactis", "mbcs"}
)  # type: List[str]

IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED)  # type: int

# pre-computed code page that are similar using the function cp_similarity.
IANA_SUPPORTED_SIMILAR = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
Esempio n. 24
0
# -*- coding: utf-8 -*-
from encodings.aliases import aliases
import nkf

all_encodings = set(aliases.values()) | set(aliases.keys())


def normalize_encoding(encoding):
    """Lower-case *encoding* and map the Shift-JIS spellings to ``'cp932'``."""
    lowered = encoding.lower()
    shift_jis_names = ('windows-31j', 'shift-jis', 'shift_jis', 'x-sjis',
                       'sjis')
    return 'cp932' if lowered in shift_jis_names else lowered


def decode(text, encoding=None, *args):
    """Decode the byte string *text* and return the resulting text.

    When no encoding is given, or it is ISO-8859-1, the encoding is guessed
    with ``nkf.guess``; a guess of 'BINARY' or 'ISO-8859-1' is replaced by
    'utf8'. The name is then normalised. If it is a known Python codec name
    the text is decoded with it (extra positional arguments are passed on
    to ``decode``); otherwise the text is converted with ``nkf -w`` and
    decoded as UTF-8.
    """
    needs_guess = not encoding or encoding in ('ISO-8859-1', 'iso-8859-1')
    if needs_guess:
        encoding = nkf.guess(text)
        if encoding in ('BINARY', 'ISO-8859-1'):
            encoding = 'utf8'
    encoding = normalize_encoding(encoding)
    # NOTE(review): all_encodings holds the names from encodings.aliases,
    # which use underscores; hyphenated names such as 'utf-8' therefore take
    # the nkf path below -- confirm that this is intended.
    if encoding in all_encodings:
        return text.decode(encoding, *args)
    return nkf.nkf('-w', text).decode('utf8')
Esempio n. 25
0
# In[4]:

# TODO: Figure out what the encoding of the mystery.csv file is
# HINT: pd.read_csv('mystery.csv', encoding=?) where ? is the string for an encoding like 'ascii'
# HINT: This link has a list of encodings that Python recognizes https://docs.python.org/3/library/codecs.html#standard-encodings

# Python has a file containing a dictionary of encoding names and associated aliases
# This line imports the dictionary and then creates a set of all available encodings
# You can use this set of encodings to search for the correct encoding
# If you'd like to see what this file looks like, execute the following Python code to see where the file is located
#    from encodings import aliases
#    aliases.__file__

from encodings.aliases import aliases

alias_values = set(aliases.values())

# TODO: iterate through the alias_values set trying out the different encodings to see which one or ones work
# HINT: Use a try - except statement. Otherwise your code will produce an error when reading in the csv file
#       with the wrong encoding.
# HINT: In the try statement, print out the encoding name so that you know which one(s) worked.

for encoding in alias_values:
    try:
        pd.read_csv('mystery.csv', encoding=encoding)
        print("Successfully read the csv with encoding of ", encoding)
    except Exception:
        # Catch ordinary errors only, so Ctrl-C can still stop the loop.
        print("Failed: Encoding of ", encoding)

# # Conclusion
#
Esempio n. 26
0
# Literal names mapped to the Python constants they stand for.
CONSTANT_EVALS = {'true': True, 'false': False, 'null': None}

# Common English words. The comma after 'by' is required: without it Python
# joins the adjacent literals 'by' and 'can' into the single entry 'bycan'.
COMMON_TERMS = {
    'a', 'about', 'all', 'and', 'are', 'as', 'at', 'be', 'but', 'by',
    'can', 'cannot', 'could', "couldn't", 'do', 'did', "didn't", 'for', 'from',
    'have', 'he', 'her', 'him', 'his', 'has', 'i', 'if', 'in', 'is', 'it',
    'just', 'like', 'man', 'may', 'more', 'most', 'my', 'no', 'not', 'now',
    'of', 'on', 'only', 'or', 'out', 'over', 'say', 'see', 'she', 'should',
    "shouldn't", 'so', 'than', 'that', 'the', 'then', 'there', 'they', 'this',
    'to', 'was', 'way', 'we', 'were', 'what', 'when', 'which', 'who', 'will',
    'with', 'would', 'wouldn', "won't", 'you',
}

DEFAULT_ENCODING = 'utf-8'
# Distinct codec names from ``aliases`` in alphabetical order.
SUPPORTED_ENCODINGS = sorted(set(aliases.values()))

# Precompiled expressions.
# Whole string consists of upper-case ASCII letters.
EXPR_UPPERCASE = re.compile('^[A-Z]+$')
# Whole string consists of upper-case ASCII letters and digits.
EXPR_CAPITALS = re.compile('^[A-Z0-9]+$')
# A run of ASCII letters and digits.
EXPR_PHRASE = re.compile('[A-Za-z0-9]+')
# A leading run without capitals/digits, or a run of capitals/digits
# followed by any characters that are neither.
EXPR_WORD = re.compile('^[^A-Z0-9]+|[A-Z0-9]+[^A-Z0-9]*')

logger = logging.getLogger(__name__)


class HTMLStripper(HTMLParser):
    """HTMLParser subclass that starts out with an empty ``_raw`` list.

    NOTE(review): ``_raw`` presumably collects text fragments seen while
    parsing -- confirm against the handler methods.
    """

    def __init__(self):
        """Initialise the base parser, reset it and create ``_raw``."""
        HTMLParser.__init__(self)
        self.reset()
        self._raw = []
Esempio n. 27
0
import re
from pathlib import Path
from encodings.aliases import aliases

# Regular expressions (DOTALL lets "." span line breaks).
re_p_content = re.compile(r"<P>.*?</P>", flags=re.DOTALL)  # <P>...</P>
re_a_content = re.compile(r"<A.*?</A>", flags=re.DOTALL)  # <A ...>...</A>
re_b_content = re.compile(r"<B>.*?</B>", flags=re.DOTALL)  # <B>...</B>
re_tag = re.compile(r"</?.*?>", flags=re.DOTALL)  # any single tag
re_empty_lines = re.compile(r"\n\s*\n")  # blank / whitespace-only lines
re_parenth = re.compile(r"[{(].*?[})]")  # text in parentheses or braces

# Encodings: the Hebrew candidates, then every other codec name Python
# knows about.
heb_encodings = [
    'utf-8',
    'cp1255',
    'iso8859_8',
    'cp424',
    'cp856',
    'cp862',
]
other_encodings = set(aliases.values()).difference(heb_encodings)


def read_heb_file(file_path):
    """Read *file_path*, trying the Hebrew encodings before all the others.

    Returns whatever ``read_file`` returns for the Hebrew encodings when
    that result is truthy, otherwise its result for the remaining
    encodings.
    """
    return (read_file(file_path, heb_encodings)
            or read_file(file_path, other_encodings))


def read_file(file_path, enc_list):