    def test_encode_decode(self):

        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG SMALL CONTENT'):
            self.assertEqual(
                CnM.from_bytes('h\xe9llo world!\n'.encode(
                    'utf_8')).best().first().encoding, 'utf_8')

        with self.subTest('Encode & Detect GB18030 WITHOUT SIG'):
            self.assertEqual(
                CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode(
                    'gb18030')).best().first().encoding, 'gb18030')

        with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'):
            self.assertEqual(
                CnM.from_bytes((u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。'
                                ).encode('gb18030')).best().first().encoding,
                'gb18030')

        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'):
            self.assertEqual(
                CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode(
                    'utf_8')).best().first().encoding, 'utf_8')

        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'):
            self.assertEqual(
                CnM.from_bytes('我没有埋怨,蹉跎的只是一些时间。'.encode(
                    'utf_7')).best().first().encoding, 'utf_7')

        with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'):
            self.assertEqual(
                CnM.from_bytes(b'\x2b\x2f\x76\x38' +
                               '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')).best().
                first().encoding, 'utf_7')

        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'):
            self.assertEqual(
                CnM.from_bytes(
                    'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно,'
                    .encode('utf_7')).best().first().encoding, 'utf_7')

        with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'):
            self.assertEqual(
                CnM.from_bytes(b'\xef\xbb\xbf' +
                               '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')).best().
                first().encoding, 'utf_8')

        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
            self.assertEqual(
                CnM.from_bytes(
                    'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, '
                    'поне що се отнася до началното и основното образование.'.
                    encode('utf_8')).best().first().encoding, 'utf_8')

        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG SMALL CONTENT (CYRILLIC)'):
            self.assertEqual(
                CnM.from_bytes('Bсеки човек има право на образование.'.encode(
                    'utf_8')).best().first().encoding, 'utf_8')
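
These tests target the legacy charset_normalizer 1.x API, in which CharsetNormalizerMatches (imported as CnM) returns a list-like result and .best().first() narrows it down to a single match. A minimal standalone sketch of that call chain, with an illustrative sample string (note that in charset_normalizer 2.x and later, .best() already returns a single match, so the trailing .first() is no longer needed):

# Minimal sketch of the 1.x call chain used throughout these examples;
# the payload is an illustrative sample, not taken from the original tests.
from charset_normalizer import CharsetNormalizerMatches as CnM

payload = 'héllo world!'.encode('utf_8')
match = CnM.from_bytes(payload).best().first()
print(match.encoding)    # detected codec name, e.g. 'utf_8'
print(str(match))        # the payload decoded with that codec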
Example n. 2
def normalise_to_utf8(bytes_or_filepath):
    """Convert any text input with unknown encoding to utf-8.

    Parameters
    ----------
    bytes_or_filepath : bytes or str
        A binary string or path to any text file in any encoding.

    Returns
    -------
    str
        A string with correct utf-8 encoding.

    Raises
    ------
    TypeError
        Input is not of type bytes or a valid path to an existing file.
    """
    if isinstance(bytes_or_filepath, bytes):
        utf8_str = str(cnm.from_bytes(bytes_or_filepath).best().first())
    elif os.path.isfile(bytes_or_filepath):
        utf8_str = str(cnm.from_path(bytes_or_filepath).best().first())
    else:
        raise TypeError('Input must be bytes or a valid file path')

    return utf8_str
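
A short usage sketch for normalise_to_utf8; the sample bytes and the file path below are hypothetical:

# Hypothetical usage of normalise_to_utf8() defined above.
latin1_bytes = 'héllo wörld'.encode('latin_1')   # illustrative input bytes
print(normalise_to_utf8(latin1_bytes))           # decoded text as a str
# normalise_to_utf8('docs/some_file.txt')        # hypothetical path variant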
Example n. 3
def read_file(path: str, filename: str = '') -> str:
    if filename:
        path = join_path(path, filename=filename)

    file_bytes = Path(path).read_bytes()
    encodings = CharsetNormalizerMatches.from_bytes(file_bytes).best()

    if len(encodings) == 0:
        encoding = None
    else:
        encoding = encodings.first().encoding

    return Path(path).read_text(encoding=encoding)
Example n. 4
    def test_bom_detection(self):
        with self.subTest('GB18030 UNAVAILABLE SIG'):
            self.assertFalse(
                CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode(
                    'gb18030')).best().first().byte_order_mark)

        with self.subTest('GB18030 AVAILABLE SIG'):
            self.assertTrue(
                CnM.from_bytes(
                    (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。'
                     ).encode('gb18030')).best().first().byte_order_mark)

        with self.subTest('UTF-7 AVAILABLE BOM'):
            self.assertTrue(
                CnM.from_bytes(b'\x2b\x2f\x76\x38' +
                               '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')).best().
                first().byte_order_mark)

        with self.subTest('UTF-8 AVAILABLE BOM'):
            self.assertTrue(
                CnM.from_bytes(b'\xef\xbb\xbf' +
                               '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')).best().
                first().byte_order_mark)
Example n. 5
    def get_data(cls, self, report_type, option_type):
        """Construct and make get request."""
        rpt_date = self.report_date.strftime('%Y%m%d')
        p1_url = f"{self.occ_burl}/flex-reports?reportType={report_type}"
        p2_url = f"&optionType={option_type}&reportDate={rpt_date}"
        # Make GET request with the constructed URL
        flex_bytes = requests.get(f"{p1_url}{p2_url}")

        # A very short response usually indicates a bad report date:
        # back up one day and retry the request.
        if len(flex_bytes.content) < 500:
            self.report_date = self.report_date - timedelta(days=1)
            rpt_date = self.report_date.strftime('%Y%m%d')
            p2_url = f"&optionType={option_type}&reportDate={rpt_date}"
            flex_bytes = requests.get(f"{p1_url}{p2_url}")

        self.byte_length = len(flex_bytes.content)
        self.rpt_to_print = CnM.from_bytes(flex_bytes.content).best().first()

        return flex_bytes
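
For readability, a hedged sketch of the URL shape that get_data() assembles; the f-string structure mirrors the method above, but the base URL and parameter values are illustrative assumptions, not confirmed OCC endpoints:

# Illustrative only: mirrors the URL construction in get_data() above;
# the base URL and parameter values are assumptions.
occ_burl = 'https://marketdata.theocc.com'   # assumed value of self.occ_burl
report_type, option_type, rpt_date = 'OI', 'E', '20210426'
url = (f"{occ_burl}/flex-reports?reportType={report_type}"
       f"&optionType={option_type}&reportDate={rpt_date}")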
Example n. 6
def detect():
    if 'file' not in request.files:
        return jsonify({'message': 'No file has been sent'}), 400

    my_file = request.files['file']  # type: FileStorage

    byte_str = my_file.stream.read()

    r_ = CnM.from_bytes(byte_str).best()

    k_ = chardet_detect(byte_str)
    k_['confidence'] = (str(round(k_['confidence'] * 100., ndigits=3)) + ' %'
                        if k_['confidence'] is not None else None)

    z_ = cchardet_detect(byte_str)
    z_['confidence'] = (str(round(z_['confidence'] * 100., ndigits=3)) + ' %'
                        if z_['confidence'] is not None else None)

    return jsonify({
        'charset-normalizer': {
            'encoding': r_.encoding,
            'aliases': r_.encoding_aliases,
            'alphabets': r_.alphabets,
            'language': r_.language,
            'chaos': str(r_.percent_chaos) + ' %',
            'coherence': str(r_.percent_coherence) + ' %',
            'could_be': r_.could_be_from_charset
        } if r_ is not None else None,
        'chardet': k_,
        'cchardet': z_,
        'filename': my_file.filename,
        'b64_content': (b64encode(r_.output()).decode('ascii')
                        if r_ is not None else '')
    })
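
A hedged client-side sketch for exercising the view above, assuming it is registered at POST /detect on a local Flask development server; the route and the sample file name are assumptions:

# Hypothetical client; the /detect route and sample file are assumptions.
import requests

with open('sample.txt', 'rb') as fp:
    resp = requests.post('http://127.0.0.1:5000/detect', files={'file': fp})
print(resp.json()['charset-normalizer'])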
Example n. 7
            for i in range(N_REQUIRED_LOOP):
                st_t = perf_counter_ns()
                z_ = n_detect(seq_)
                l_.append(perf_counter_ns() - st_t)

            st_ar[srt_file]['cchardet'] = locale.format_string('%d',
                                                               mean(l_),
                                                               grouping=True)
            st_re[srt_file]['cchardet'] = z_['encoding']

            l_.clear()

            for i in range(N_REQUIRED_LOOP):
                st_t = perf_counter_ns()
                y_ = CharsetNormalizerMatches.from_bytes(seq_)
                l_.append(perf_counter_ns() - st_t)

            st_ar[srt_file]['charset_normalizer'] = locale.format_string(
                '%d', mean(l_), grouping=True)
            st_re[srt_file]['charset_normalizer'] = y_.best().first().encoding

    x_ = prettytable.PrettyTable(
        ['File', 'Chardet', 'cChardet', 'Charset Normalizer'])

    for k, v in st_ar.items():
        x_.add_row([k, v['chardet'], v['cchardet'], v['charset_normalizer']])

    print(x_)

    x_ = prettytable.PrettyTable(
Example n. 8
# Display at most 100 columns
pd.set_option('display.max_columns', 100)
# Display at most 50 rows
pd.set_option('display.max_rows', 50)
# %% codecell
########################################

url = "https://www.sec.gov/Archives/edgar/daily-index/2021/QTR2/sitemap.20210426.xml"
get = requests.get(url)

fpath = '/Users/unknown1/Algo/data/sec/raw/daily_index/2021/20210426'

f = open(fpath, 'wb')
f.write(get.content)
f.close()

print(CnM.from_bytes(get.content).best().first())

root = ET.fromstring(get.content.decode('UTF-8'))



# %% codecell
#####################################################################

data = []
for i, child in enumerate(root):
    data.append([subchild.text for subchild in child])

df = pd.DataFrame(data)  # Collect the parsed rows into a DataFrame
df.columns = ['url', 'lastmod', 'changefreq', 'priority']
Example n. 9
def detect_file_encoding(file_path, buffer_size=1024, max_lines=20):
    """
    Determines the encoding of files within the initial `max_lines` of length
    `buffer_size`.
    
    :param file_path: path to the file
    :type file_path: str
    :param buffer_size: buffer length for each line being read
    :type buffer_size: int
    :param max_lines: number of lines to read from file of length buffer_size
    :type max_lines: int
    :return: encoding type
    :rtype: str
    """
    detector = UniversalDetector()
    line_count = 0
    with FileOrBufferHandler(file_path, 'rb') as input_file:
        chunk = input_file.read(buffer_size)
        while chunk and line_count < max_lines:
            detector.feed(chunk)
            chunk = input_file.read(buffer_size)
            line_count += 1
    detector.close()
    encoding = detector.result["encoding"]

    # Typical file representation is utf-8 instead of ascii, treat as such.
    if not encoding or encoding.lower() in ['ascii', 'windows-1254']:
        encoding = 'utf-8'

    # Check if encoding can be used to decode without throwing an error
    def _decode_is_valid(encoding):
        try:
            with FileOrBufferHandler(file_path,
                                     encoding=encoding) as input_file:
                input_file.read(1024 * 1024)
                return True
        except Exception:
            return False

    if not _decode_is_valid(encoding):
        try:
            from charset_normalizer import CharsetNormalizerMatches as CnM

            # Try with small sample
            with FileOrBufferHandler(file_path, 'rb') as input_file:
                raw_data = input_file.read(10000)
                result = CnM.from_bytes(raw_data,
                                        steps=5,
                                        chunk_size=512,
                                        threshold=0.2,
                                        cp_isolation=None,
                                        cp_exclusion=None,
                                        preemptive_behaviour=True,
                                        explain=False)
                result = result.best()
            if result and result.first():
                encoding = result.first().encoding

            # Try again with full sample
            if not _decode_is_valid(encoding):
                with FileOrBufferHandler(file_path, 'rb') as input_file:
                    raw_data = input_file.read(max_lines * buffer_size)
                    result = CnM.from_bytes(raw_data,
                                            steps=max_lines,
                                            chunk_size=buffer_size,
                                            threshold=0.2,
                                            cp_isolation=None,
                                            cp_exclusion=None,
                                            preemptive_behaviour=True,
                                            explain=False)
                    result = result.best()
                if result and result.first():
                    encoding = result.first().encoding

        except Exception:
            logger.info("Install charset_normalizer for improved file "
                        "encoding detection")

    # If no encoding is still found, default to utf-8
    if not encoding:
        encoding = 'utf-8'
    return encoding.lower()
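
A short usage sketch for detect_file_encoding; the file path is hypothetical:

# Hypothetical usage of detect_file_encoding() defined above.
enc = detect_file_encoding('data/unknown_encoding.csv')  # illustrative path
with open('data/unknown_encoding.csv', encoding=enc) as fp:
    text = fp.read()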
Example n. 10
import xml.etree.ElementTree as ET

# Display all columns
pd.set_option('display.max_columns', None)
# Display all rows
pd.set_option('display.max_rows', None)



# %% codecell
##############################################################

url = 'https://marketdata.theocc.com/delo-download?prodType=ALL&downloadFields=OS;US;SN&format=txt'
get = requests.get(url)
dlp_df = pd.read_csv(BytesIO(get.content), escapechar='\n', delimiter='\t')
print(CnM.from_bytes(get.content).best().first())
get_sample = get.content[0:1000]
get_sample



"""
sym = 'IBM'
occ = requests.get(f"https://marketdata.theocc.com/series-search?symbolType=U&symbol={sym}")
occ_df = pd.read_csv(BytesIO(occ.content), skiprows=6, escapechar='\n', delimiter='\t')
cols = occ_df.columns[:-1]
occ_df.drop('year', inplace=True, axis=1)
occ_df.columns = cols