Example #1
import os

from charset_normalizer import CharsetNormalizerMatches as cnm


def normalise_to_utf8(bytes_or_filepath):
    """Convert any text input with unknown encoding to utf-8.

    Parameters
    ----------
    bytes_or_filepath : bytes or str
        A binary string or path to any text file in any encoding.

    Returns
    -------
    str
        The decoded text as a unicode str, ready to be re-encoded as utf-8.

    Raises
    ------
    TypeError
        Input is not of type bytes or a valid path to an existing file.
    """
    if isinstance(bytes_or_filepath, bytes):
        utf8_str = str(cnm.from_bytes(bytes_or_filepath).best().first())
    elif os.path.isfile(bytes_or_filepath):
        utf8_str = str(cnm.from_path(bytes_or_filepath).best().first())
    else:
        raise TypeError('Input must be bytes or a valid file path')

    return utf8_str
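
A minimal usage sketch for the function above (the file name is hypothetical); detection on very short inputs is unreliable, so real inputs should carry more text than this:

raw = 'Olá, mundo! Este texto serve apenas de exemplo.'.encode('cp1252')
print(normalise_to_utf8(raw))            # bytes branch
print(normalise_to_utf8('legacy.txt'))   # file-path branch, hypothetical file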
Example #2
    def test_encode_decode(self):

        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG SMALL CONTENT'):
            self.assertEqual(
                CnM.from_bytes('h\xe9llo world!\n'.encode(
                    'utf_8')).best().first().encoding, 'utf_8')

        with self.subTest('Encode & Detect GB18030 WITHOUT SIG'):
            self.assertEqual(
                CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode(
                    'gb18030')).best().first().encoding, 'gb18030')

        with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'):
            self.assertEqual(
                CnM.from_bytes((u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。'
                                ).encode('gb18030')).best().first().encoding,
                'gb18030')

        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'):
            self.assertEqual(
                CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode(
                    'utf_8')).best().first().encoding, 'utf_8')

        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'):
            self.assertEqual(
                CnM.from_bytes('我没有埋怨,蹉跎的只是一些时间。'.encode(
                    'utf_7')).best().first().encoding, 'utf_7')

        with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'):
            self.assertEqual(
                CnM.from_bytes(b'\x2b\x2f\x76\x38' +
                               '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')).best().
                first().encoding, 'utf_7')

        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'):
            self.assertEqual(
                CnM.from_bytes(
                    'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно,'
                    .encode('utf_7')).best().first().encoding, 'utf_7')

        with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'):
            self.assertEqual(
                CnM.from_bytes(b'\xef\xbb\xbf' +
                               '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')).best().
                first().encoding, 'utf_8')

        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
            self.assertEqual(
                CnM.from_bytes(
                    'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, '
                    'поне що се отнася до началното и основното образование.'.
                    encode('utf_8')).best().first().encoding, 'utf_8')

        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
            self.assertEqual(
                CnM.from_bytes('Bсеки човек има право на образование.'.encode(
                    'utf_8')).best().first().encoding, 'utf_8')
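
The same encode-and-detect round trip can be sketched outside unittest; this assumes the CnM alias for CharsetNormalizerMatches used throughout these examples:

payload = '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030')
match = CnM.from_bytes(payload).best().first()
assert match.encoding == 'gb18030'
assert match.byte_order_mark is False   # no SIG was prepended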
Example #3
    def test_file_input(self):
        for path_name in glob('./data/*.srt') + glob('./data/*.txt'):

            with self.subTest('test_file_input <{}>'.format(path_name)):

                matches = CnM.from_path(path_name)

                self.assertGreater(
                    len(matches),
                    0
                )

                r_ = matches.best().first()

                self.assertIsNotNone(
                    r_
                )

                if isinstance(TestFileCharsetNormalizer.SHOULD_BE[basename(path_name)], str):
                    self.assertEqual(
                        r_.encoding,
                        TestFileCharsetNormalizer.SHOULD_BE[basename(path_name)]
                    )
                else:
                    self.assertIn(
                        r_.encoding,
                        TestFileCharsetNormalizer.SHOULD_BE[basename(path_name)]
                    )
Example #4
def process_drive(img_burn_exe: str, drive: str, output_folder: str):
    """
    Processes drive, tests if it is ready and if it is - tries to backup
    :param img_burn_exe: path to Exe file
    :param drive: Drive from which backup will be performed
    :param output_folder: Folder to which output will be saved
    :return:
    """
    if not test_drive(drive):
        logging.info("Waiting for drive: %s to be ready", drive)
        return
    autorun_file = Path(f"{drive}Autorun.inf")
    autorun_label = ""
    if autorun_file.is_file():
        parser = ConfigParser()
        encoding = CnM.from_path(autorun_file).best().first().encoding
        logging.debug("Detected autorun.inf encoding: %s", encoding)
        try:
            parser.read_string(
                autorun_file.read_text(encoding=encoding).lower())
        except DuplicateOptionError:
            # Malformed Autorun.inf; fall back to an empty label.
            pass
        else:
            if 'label' in parser['autorun']:
                autorun_label = parser['autorun']['label'].upper()

    backup_disk(autorun_label, drive, img_burn_exe, output_folder)
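
The detect-then-read pattern in process_drive, reduced to a hedged sketch (the drive path is hypothetical, and the None check covers files charset_normalizer cannot match):

ini_path = Path("D:/Autorun.inf")   # hypothetical drive path
best = CnM.from_path(ini_path).best().first()
if best is not None:
    content = ini_path.read_text(encoding=best.encoding).lower()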
Example #5
def _detect_file_encoding(path: Path) -> str:
    """Return an approximate encoding of text file.

    Performs an encoding detection and BOM check.

    Args:
        path: The path to playlist file

    Returns:
        A string with "best" encoding from following:
        'utf-8', 'utf-8-sig', 'cp1251', 'cp1252', 'utf_16_le'.

    Raises:
        ClickException: The file was not found or
            the encoding was not retrieved from 'charset_normalizer'
    """
    try:
        detection_result = (CnM.from_path(
            path, cp_isolation=["utf_8", "cp1252", "cp1251",
                                "utf_16_le"]).best().first())

        encoding = "utf-8"
        if path.suffix == ".aimppl4":
            encoding = "utf-16-le"
        elif detection_result.encoding == "utf_8":
            if detection_result.byte_order_mark:
                encoding = "utf-8-sig"
        else:
            encoding = detection_result.encoding

        return encoding
    except (OSError, AttributeError) as error:
        message = str(error)
        raise ClickException(message)
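
A usage sketch for _detect_file_encoding (the playlist path is hypothetical); cp_isolation above restricts detection to the expected codecs, which is what lets the docstring promise one of five labels:

playlist = Path("playlists/example.aimppl4")   # hypothetical
print(_detect_file_encoding(playlist))         # 'utf-16-le' by the suffix rule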
Example #6
def encoding_from_path(txt_file_path):
    file_encoding = 'utf-8'
    enc = CnM.from_path(txt_file_path).best().first()
    if enc is not None:
        file_encoding = enc.encoding
    ##override encodings these files are commonly misdetected as
    if file_encoding in ('big5', 'cp1252'):
        file_encoding = 'utf-8'
    return file_encoding
Example #7
def source(source_file_s):
    # TODO: file missing exception
    text_str = str(cnm.from_path(source_file_s).best().first())

    if len(text_str) == 0:
        # "The file turned out to be empty!"
        ui.message.addItem("Файл оказался пустым!")
        ui.save_button.setVisible(False)

    return text_str
Example #8
    def get_corpus(self):
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []
        wav_dir = os.path.join(self.origin_data_path, self.extract_dir, "wav",
                               "IT")
        text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt",
                                "IT")

        for subdir, dirs, files in os.walk(wav_dir):
            for _dir in dirs:
                curr_wav_dir = os.path.join(subdir, _dir)
                curr_txt_dir = os.path.join(text_dir, _dir)

                ##iterate wav file current folder
                for fname in os.listdir(curr_wav_dir):
                    fname = os.fsdecode(fname)

                    wav_file_path = os.path.join(wav_dir, _dir, fname)
                    txt_file_path = os.path.join(curr_txt_dir,
                                                 fname.split('.')[0] + '.txt')
                    if not os.path.isfile(txt_file_path):
                        print('audio file {} doesn\'t have a transcript file'
                              .format(wav_file_path))
                        continue

                    ##read file transcript
                    transcript = ''

                    ##files come in different encodings (utf-8, utf_16_be, etc.)
                    ##so detect the right one before opening the file
                    file_encoding = 'utf-8'
                    enc = CnM.from_path(txt_file_path).best().first()
                    if enc is not None:
                        file_encoding = enc.encoding
                    ##override encodings these files are commonly misdetected as
                    if file_encoding in ('big5', 'cp1252'):
                        file_encoding = 'utf-8'

                    with open(txt_file_path, "r", encoding=file_encoding) as f:
                        transcript += f.readline()

                    transcript = transcript.strip()
                    ##append data manifest
                    utterances[wav_file_path] = transcript
                    audios.append(wav_file_path)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        #################
        ## SIWIS clips are 44100 Hz, 706 kb/s, mono, so they need resampling
        corpus.make_wav_resample = True
        return corpus
Example #9
def read_file(path: str, filename: str = '') -> str:
    if filename:
        path = join_path(path, filename=filename)

    file_bytes = Path(path).read_bytes()
    encodings = CharsetNormalizerMatches.from_bytes(file_bytes).best()

    if len(encodings) == 0:
        encoding = None
    else:
        encoding = encodings.first().encoding

    return Path(path).read_text(encoding=encoding)
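
read_file above touches the file twice: once as bytes for detection and once more for read_text. Since the old API already decodes the best match, a single-read variant can be sketched as:

def read_file_once(path: str) -> str:
    # Variant sketch: decode the in-memory match instead of re-reading.
    file_bytes = Path(path).read_bytes()
    match = CharsetNormalizerMatches.from_bytes(file_bytes).best().first()
    return str(match) if match is not None else file_bytes.decode()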
Example #10
    def test_bom_detection(self):
        with self.subTest('GB18030 UNAVAILABLE SIG'):
            self.assertFalse(
                CnM.from_bytes('我没有埋怨,磋砣的只是一些时间。'.encode(
                    'gb18030')).best().first().byte_order_mark)

        with self.subTest('GB18030 AVAILABLE SIG'):
            self.assertTrue(
                CnM.from_bytes(
                    (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。'
                     ).encode('gb18030')).best().first().byte_order_mark)

        with self.subTest('UTF-7 AVAILABLE BOM'):
            self.assertTrue(
                CnM.from_bytes(b'\x2b\x2f\x76\x38' +
                               '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')).best().
                first().byte_order_mark)

        with self.subTest('UTF-8 AVAILABLE BOM'):
            self.assertTrue(
                CnM.from_bytes(b'\xef\xbb\xbf' +
                               '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')).best().
                first().byte_order_mark)
Example #11
    def read_txt_file(self, txt_file_path):
        transcript = ''

        ##files come in different encodings (utf-8, utf_16_be, etc.)
        ##so detect the right one before opening the file
        file_encoding = 'utf-8'
        enc = CnM.from_path(txt_file_path).best().first()
        if enc is not None:
            file_encoding = enc.encoding
        ##override encodings these files are commonly misdetected as
        if file_encoding in ('big5', 'cp1252'):
            file_encoding = 'utf-8'

        with open(txt_file_path, "r", encoding=file_encoding) as f:
            transcript += f.readline()

        transcript = transcript.strip()
        return transcript
Example #12
    def get_data(cls, self, report_type, option_type):
        """Construct and make get request."""
        rpt_date = self.report_date.strftime('%Y%m%d')
        p1_url = f"{self.occ_burl}/flex-reports?reportType={report_type}"
        p2_url = f"&optionType={option_type}&reportDate={rpt_date}"
        # Make get request with passed url
        flex_bytes = requests.get(f"{p1_url}{p2_url}")

        # If a short error message assume wrong date
        if len(flex_bytes.content) < 500:
            self.report_date = self.report_date - timedelta(days=1)
            rpt_date = self.report_date.strftime('%Y%m%d')
            p2_url = f"&optionType={option_type}&reportDate={rpt_date}"
            # Make get request with passed url
            flex_bytes = requests.get(f"{p1_url}{p2_url}")

        self.byte_length = len(flex_bytes.content)
        self.rpt_to_print = CnM.from_bytes(flex_bytes.content).best().first()

        return flex_bytes
Example #13
def detect():
    if 'file' not in request.files:
        return jsonify({'message': 'No file has been sent'}), 400

    my_file = request.files['file']  # type: FileStorage

    byte_str = my_file.stream.read()

    r_ = CnM.from_bytes(byte_str).best().first()

    k_ = chardet_detect(byte_str)
    k_['confidence'] = str(round(
        k_['confidence'] *
        100., ndigits=3)) + ' %' if k_['confidence'] is not None else None

    z_ = cchardet_detect(byte_str)
    z_['confidence'] = str(round(
        z_['confidence'] *
        100., ndigits=3)) + ' %' if z_['confidence'] is not None else None

    return jsonify({
        'charset-normalizer': {
            'encoding': r_.encoding,
            'aliases': r_.encoding_aliases,
            'alphabets': r_.alphabets,
            'language': r_.language,
            'chaos': str(r_.percent_chaos) + ' %',
            'coherence': str(r_.percent_coherence) + ' %',
            'could_be': r_.could_be_from_charset
        } if r_ is not None else None,
        'chardet': k_,
        'cchardet': z_,
        'filename': my_file.filename,
        'b64_content': b64encode(r_.output()).decode('ascii') if r_ is not None else ''
    })
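
A hypothetical client for the endpoint above (the route and port are assumptions; adjust them to wherever the Flask app is mounted). files= performs the multipart upload that request.files['file'] expects:

import requests

with open('sample.srt', 'rb') as fh:   # hypothetical file
    resp = requests.post('http://127.0.0.1:5000/detect', files={'file': fh})
print(resp.json()['charset-normalizer'])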
Example #14
def cli_detect(argv=None):
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode.")

    parser.add_argument('file',
                        type=argparse.FileType('rb'),
                        nargs='+',
                        help='Filename')
    parser.add_argument(
        '--verbose',
        action="store_true",
        default=False,
        dest='verbose',
        help='Display complementary information about file if any.')
    parser.add_argument(
        '--normalize',
        action="store_true",
        default=False,
        dest='normalize',
        help=
        'Permit to normalize input file. If not set, program does not write anything.'
    )
    parser.add_argument(
        '--replace',
        action="store_true",
        default=False,
        dest='replace',
        help=
        'Replace file when trying to normalize it instead of creating a new one.'
    )
    parser.add_argument(
        '--force',
        action="store_true",
        default=False,
        dest='force',
        help=
        'Replace file without asking if you are sure, use this flag with caution.'
    )

    args = parser.parse_args(argv)

    if len(args.file) == 0:
        print(
            'The purpose of this command is to analyse text files. Please specify a filename.',
            file=sys.stderr)
        parser.print_help(file=sys.stderr)
        return 1

    if args.replace is True and args.normalize is False:
        print('Use --replace in addition to --normalize only.',
              file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print('Use --force in addition to --replace only.', file=sys.stderr)
        return 1

    for my_file in args.file:

        matches = CharsetNormalizerMatches.from_fp(my_file)

        if len(matches) == 0:
            print('Unable to identify originating encoding for "{}".'.format(
                my_file.name),
                  file=sys.stderr)
            continue

        x_ = PrettyTable([
            'Filename', 'Encoding', 'Language', 'Alphabets', 'Chaos',
            'Coherence'
        ])

        r_ = matches.best()
        p_ = r_.first()

        x_.add_row([
            my_file.name, p_.encoding, p_.language,
            (' and ' if len(p_.alphabets) < 4 else '\n').join([
                el if 'and' not in el else '"{}"'.format(el)
                for el in p_.alphabets
            ]), '{} %'.format(round(p_.chaos * 100., ndigits=3)),
            '{} %'.format(round(100. - p_.coherence * 100., ndigits=3))
        ])

        if len(matches) > 1 and args.verbose:
            for el in matches:
                if el != p_:
                    x_.add_row([
                        '** ALTERNATIVE ' + my_file.name + '**', el.encoding,
                        el.language,
                        (' and ' if len(el.alphabets) < 4 else '\n').join([
                            el if 'and' not in el else '"{}"'.format(el)
                            for el in el.alphabets
                        ]), '{} %'.format(round(el.chaos * 100., ndigits=3)),
                        '{} %'.format(
                            round(100. - el.coherence * 100., ndigits=3))
                    ])

        print(x_)

        if args.verbose is True:
            print('"{}" could be also originating from {}.'.format(
                my_file.name, ','.join(r_.could_be_from_charset)))
            print('"{}" could be also be written in {}.'.format(
                my_file.name, ' or '.join(p_.languages)))

        if args.normalize is True:

            if p_.encoding.startswith('utf') is True:
                print(
                    '"{}" file does not need to be normalized, as it already came from unicode.'
                    .format(my_file.name))
                continue

            o_ = my_file.name.split('.')  # type: list[str]

            if args.replace is False:
                o_.insert(-1, p_.encoding)
            else:
                if args.force is False and query_yes_no(
                        'Are you sure you want to normalize "{}" by replacing it?'.
                        format(my_file.name), 'no') is False:
                    continue

            try:
                with open('./{}'.format('.'.join(o_)), 'w',
                          encoding='utf-8') as fp:
                    fp.write(str(p_))
            except IOError as e:
                print(str(e), file=sys.stderr)
                return 2

    return 0
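
Because argv is an explicit parameter, cli_detect can be driven from tests or other scripts without touching sys.argv; a sketch with a hypothetical file:

exit_code = cli_detect(['./my_subtitle.srt', '--verbose'])
assert exit_code in (0, 1, 2)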
Example #15
            for i in range(N_REQUIRED_LOOP):
                st_t = perf_counter_ns()
                z_ = n_detect(seq_)
                l_.append(perf_counter_ns() - st_t)

            st_ar[srt_file]['cchardet'] = locale.format_string('%d',
                                                               mean(l_),
                                                               grouping=True)
            st_re[srt_file]['cchardet'] = z_['encoding']

            l_.clear()

            for i in range(N_REQUIRED_LOOP):
                st_t = perf_counter_ns()
                y_ = CharsetNormalizerMatches.from_bytes(seq_)
                l_.append(perf_counter_ns() - st_t)

            st_ar[srt_file]['charset_normalizer'] = locale.format_string(
                '%d', mean(l_), grouping=True)
            st_re[srt_file]['charset_normalizer'] = y_.best().first().encoding

    x_ = prettytable.PrettyTable(
        ['File', 'Chardet', 'cChardet', 'Charset Normalizer'])

    for k, v in st_ar.items():
        x_.add_row([k, v['chardet'], v['cchardet'], v['charset_normalizer']])

    print(x_)

    x_ = prettytable.PrettyTable(
        ['File', 'Chardet', 'cChardet', 'Charset Normalizer'])
Example #16
    def parse(self, file_name):
        content = str(CnM.from_path(file_name).best().first())
        return self.parser.parse(content)[0]
Example #17
# Display max 100 columns
pd.set_option('display.max_columns', 100)
# Display maximum rows
pd.set_option('display.max_rows', 50)
# %% codecell
########################################

url = "https://www.sec.gov/Archives/edgar/daily-index/2021/QTR2/sitemap.20210426.xml"
get = requests.get(url)

fpath = '/Users/unknown1/Algo/data/sec/raw/daily_index/2021/20210426'

with open(fpath, 'wb') as f:
    f.write(get.content)

print(CnM.from_bytes(get.content).best().first())

root = ET.fromstring(get.content.decode('UTF-8'))



# %% codecell
#####################################################################

data = []
for i, child in enumerate(root):
    data.append([subchild.text for subchild in child])

df = pd.DataFrame(data)  # Write in DF
df.columns = ['url', 'lastmod', 'changefreq', 'priority']
Example #18
def detect_file_encoding(file_path, buffer_size=1024, max_lines=20):
    """
    Determines the encoding of files within the initial `max_lines` of length
    `buffer_size`.
    
    :param file_path: path to the file
    :type file_path: str
    :param buffer_size: buffer length for each line being read
    :type buffer_size: int
    :param max_lines: number of lines to read from file of length buffer_size
    :type max_lines: int
    :return: encoding type
    :rtype: str
    """
    detector = UniversalDetector()
    line_count = 0
    with FileOrBufferHandler(file_path, 'rb') as input_file:
        chunk = input_file.read(buffer_size)
        while chunk and line_count < max_lines:
            detector.feed(chunk)
            chunk = input_file.read(buffer_size)
            line_count += 1
    detector.close()
    encoding = detector.result["encoding"]

    # Typical file representation is utf-8 instead of ascii, treat as such.
    if not encoding or encoding.lower() in ['ascii', 'windows-1254']:
        encoding = 'utf-8'

    # Check if encoding can be used to decode without throwing an error
    def _decode_is_valid(encoding):
        try:
            with FileOrBufferHandler(file_path,
                                     encoding=encoding) as input_file:
                input_file.read(1024 * 1024)
                return True
        except Exception:
            return False

    if not _decode_is_valid(encoding):
        try:
            from charset_normalizer import CharsetNormalizerMatches as CnM

            # Try with small sample
            with FileOrBufferHandler(file_path, 'rb') as input_file:
                raw_data = input_file.read(10000)
                result = CnM.from_bytes(raw_data,
                                        steps=5,
                                        chunk_size=512,
                                        threshold=0.2,
                                        cp_isolation=None,
                                        cp_exclusion=None,
                                        preemptive_behaviour=True,
                                        explain=False)
                result = result.best()
            if result:
                if result.first():
                    encoding = result.first().encoding

            # Try again with full sample
            if not _decode_is_valid(encoding):
                with FileOrBufferHandler(file_path, 'rb') as input_file:
                    raw_data = input_file.read(max_lines * buffer_size)
                    result = CnM.from_bytes(raw_data,
                                            steps=max_lines,
                                            chunk_size=buffer_size,
                                            threshold=0.2,
                                            cp_isolation=None,
                                            cp_exclusion=None,
                                            preemptive_behaviour=True,
                                            explain=False)
                    result = result.best()
                if result:
                    if result.first():
                        encoding = result.first().encoding

        except Exception:
            logger.info("Install charset_normalizer for improved file "
                        "encoding detection")

    # If no encoding is still found, default to utf-8
    if not encoding:
        encoding = 'utf-8'
    return encoding.lower()
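
A usage sketch for detect_file_encoding (the CSV path is hypothetical); buffer_size * max_lines bounds how many bytes the chardet pass inspects before the charset_normalizer fallback kicks in:

enc = detect_file_encoding('data/input.csv', buffer_size=2048, max_lines=10)
with open('data/input.csv', encoding=enc) as fh:
    header = fh.readline()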
Example #19
    def get_data(folder_path: str) -> dict:
        """
        Iterates through a directory to create a Dict of Pandas DataFrames with
        filepaths as their keys.

        :type folder_path: str
        :rtype: dict

        keys: filepaths
        values: pd.DataFrame
        """

        # print("This is the name of the script: ", sys.argv[0])
        print("Initializing Data Retrieval...")

        csvfiles = glob.glob(folder_path + "/**/*.csv", recursive=True)
        xlfiles = glob.glob(folder_path + "/**/*.xls?", recursive=True)
        xlfiles = xlfiles + glob.glob(folder_path + "/**/*.xls", recursive=True)
        # xlfiles = []
        file_dict = {}
        i = 1

        for file in xlfiles:
            print("Reading File %d of %d:" % (i, len(csvfiles) + len(xlfiles)))
            print("\tFull Path: ", file)
            # csv_from_excel(file)
            try:
                df = pd.read_excel(file, sheet_name=None)

                for sheet in df.keys():
                    print("\t\t", sheet, "processed...")
                    df[sheet].index.rename('file_index', inplace=True)
                    file_dict.update({file.join(['', sheet]): df[sheet]})
            except Exception:
                logging.error('COULD NOT LOAD %s' % file)
                print('\t\tFAILED')

            i += 1

        for file in csvfiles:
            print("Reading File %d of %d:" % (i, len(csvfiles) + len(xlfiles)))
            print("\tFull Path: ", file)
            try:
                df = pd.read_csv(file, low_memory=False, header='infer',
                                 encoding=detect_encoding(file))
                df.index.rename('file_index', inplace=True)
                file_dict.update({file: df})
            except UnicodeDecodeError:
                try:
                    print("Encoding Detection Failed... Attempting to Normalize...")
                    normalized = StringIO(str(CnM.from_path(file).best().first()))
                    df = pd.read_csv(normalized, low_memory=False, header='infer')
                    df.index.rename('file_index', inplace=True)
                    file_dict.update({file: df})
                    print("Success!")
                except Exception:
                    print('Encoding Normalization Failed')
            except Exception:
                logging.error('COULD NOT LOAD %s' % file)
                print('\t\tFAILED')
            i += 1
        return file_dict
Example #20

######################################################################################
def ClearScreen():
    if platform.system() == "Windows":
        os.system("cls")
    if platform.system() == "Linux":
        os.system("clear")


######################################################################################
ClearScreen()
ModelPath = easygui.fileopenbox("Selecione a legenda modelo:")  # "Select the model subtitle:"
TargetPath = easygui.fileopenbox("Selecione a legenda alvo:")  # "Select the target subtitle:"
######################################################################################
ModelCoding = CnM.from_path(ModelPath).best().first().encoding
TargetCoding = CnM.from_path(TargetPath).best().first().encoding
######################################################################################

ModelSub = open(ModelPath, encoding=ModelCoding)
TargetSub = open(TargetPath, encoding=TargetCoding)
ModelContent = ModelSub.readlines()
TargetContent = TargetSub.readlines()
ModelTimesPos = []
for i, line in enumerate(ModelContent):
    if IsSubTimeLine(line):
        ModelTimesPos.append(i)
TargetTimesPos = []
Example #21
def normalize(text_file):
    try:
        CnM.normalize(text_file)  # should write to disk my_subtitle-***.srt
    except IOError as e:
        print('Sadly, we are unable to perform charset normalization.', str(e))
Example #22
args = sys.argv
enc = None
vk = None
data = []


try:
    use_cp1251, use_auth = parse_arg(args[1]), parse_arg(args[2])
except ValueError as exc:
    print(f"Неверный аргумент: {exc.args[0]}. Возможны только аргументы {POSSIBLE_ARGS}")
    exit(-1)
except IndexError:
    pass

try:
    enc = CnM.from_path(INPUT_FNAME).best().first().encoding
    if enc != 'utf-8':
        print("\n\n", ENC_WARN, "\n\n")
        if use_cp1251 is None:
            use_cp1251 = yes_no("Использовать cp1251 вместо текущей кодировки?")
        if use_cp1251:
            enc = 'cp1251'
    # parse the file with group IDs
    print("Используется кодировка: ", enc)
    with open(INPUT_FNAME, 'r', newline='', encoding=enc) as csvf:
        dialect = csv.Sniffer().sniff(csvf.read(1024))
        csvf.seek(0)
        reader = csv.reader(csvf, dialect=dialect)
        for row in reader:
            if row[0] == INPUT_FILE_HEADER_GROUP_TITLE:
                continue
Example #23
from charset_normalizer import CharsetNormalizerMatches as cnm

text_str = str(cnm.from_path("test.txt").best().first())
print(text_str)
Example #24
import xml.etree.ElementTree as ET

# Display all columns
pd.set_option('display.max_columns', None)
# Display all rows
pd.set_option('display.max_rows', None)



# %% codecell
##############################################################

url = 'https://marketdata.theocc.com/delo-download?prodType=ALL&downloadFields=OS;US;SN&format=txt'
get = requests.get(url)
dlp_df = pd.read_csv(BytesIO(get.content), escapechar='\n', delimiter='\t')
print(CnM.from_bytes(get.content).best().first())
get_sample = get.content[0:1000]
get_sample



"""
sym = 'IBM'
occ = requests.get(f"https://marketdata.theocc.com/series-search?symbolType=U&symbol={sym}")
occ_df = pd.read_csv(BytesIO(occ.content), skiprows=6, escapechar='\n', delimiter='\t')
cols = occ_df.columns[:-1]
occ_df.drop('year', inplace=True, axis=1)
occ_df.columns = cols


Example #25
def cli_detect(argv=None):
    """
    CLI assistant using ARGV and ArgumentParser
    :param argv:
    :return: 0 if everything is fine, anything else equal trouble
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
                    "Discover originating encoding used on text file. "
                    "Normalize text to unicode."
    )

    parser.add_argument('file', type=argparse.FileType('rb'), nargs='+', help='Filename')
    parser.add_argument('--verbose', action="store_true", default=False, dest='verbose',
                        help='Display complementary information about file if any.')
    parser.add_argument('--normalize', action="store_true", default=False, dest='normalize',
                        help='Permit to normalize input file. If not set, program does not write anything.')
    parser.add_argument('--replace', action="store_true", default=False, dest='replace',
                        help='Replace file when trying to normalize it instead of creating a new one.')
    parser.add_argument('--force', action="store_true", default=False, dest='force',
                        help='Replace file without asking if you are sure, use this flag with caution.')
    parser.add_argument('--threshold', action="store", default=0.2, type=float, dest='threshold',
                        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.")

    args = parser.parse_args(argv)

    if len(args.file) == 0:
        print('The purpose of this command is to analyse text files. Please specify a filename.', file=sys.stderr)
        parser.print_help(file=sys.stderr)
        return 1

    if args.replace is True and args.normalize is False:
        print('Use --replace in addition to --normalize only.', file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print('Use --force in addition to --replace only.', file=sys.stderr)
        return 1

    if args.threshold < 0. or args.threshold > 1.:
        print('--threshold VALUE should be between 0. AND 1.')
        return 1

    for my_file in args.file:

        matches = CharsetNormalizerMatches.from_fp(
            my_file,
            threshold=args.threshold
        )

        if len(matches) == 0:
            print('Unable to identify originating encoding for "{}". {}'.format(my_file.name, 'Maybe try increasing the maximum amount of chaos allowed.' if args.threshold < 1. else ''), file=sys.stderr)
            if my_file.closed is False:
                my_file.close()
            continue

        x_ = PrettyTable(['Filename', 'Encoding', 'Language', 'Alphabets', 'Chaos', 'Coherence'])

        r_ = matches.best()
        p_ = r_.first()

        x_.add_row(
            [
                my_file.name,
                p_.encoding,
                p_.language,
                (' and ' if len(p_.alphabets) < 4 else '\n').join([el if 'and' not in el else '"{}"'.format(el) for el in p_.alphabets]),
                '{} %'.format(round(p_.chaos * 100., ndigits=3)),
                '{} %'.format(round(100. - p_.coherence * 100., ndigits=3))
            ]
        )

        if len(matches) > 1 and args.verbose:
            for el in matches:
                if el != p_:
                    x_.add_row(
                        [
                            '** ALTERNATIVE '+my_file.name+'**',
                            el.encoding,
                            el.language,
                            (' and ' if len(el.alphabets) < 4 else '\n').join([el if 'and' not in el else '"{}"'.format(el) for el in el.alphabets]),
                            '{} %'.format(round(el.chaos * 100., ndigits=3)),
                            '{} %'.format(round(100. - el.coherence * 100., ndigits=3))
                        ]
                    )

        print(x_)

        if args.verbose is True:
            if len(r_.could_be_from_charset) > 1:
                print('"{}" could also be originating from {}.'.format(my_file.name, ','.join(r_.could_be_from_charset)))
            if len(p_.could_be_from_charset) > 1:
                print('"{}" produces the EXACT same output with these encodings: {}.'.format(my_file.name, ' OR '.join(p_.could_be_from_charset)))
            if len(p_.languages) > 1:
                print('"{}" could also be written in {}.'.format(my_file.name, ' or '.join(p_.languages)))
            if p_.byte_order_mark is True:
                print('"{}" has a signature or byte order mark (BOM) in it.'.format(my_file.name))

        if args.normalize is True:

            if p_.encoding.startswith('utf') is True:
                print('"{}" file does not need to be normalized, as it already came from unicode.'.format(my_file.name))
                if my_file.closed is False:
                    my_file.close()
                continue

            o_ = my_file.name.split('.')  # type: list[str]

            if args.replace is False:
                o_.insert(-1, p_.encoding)
                if my_file.closed is False:
                    my_file.close()
            else:
                if args.force is False and query_yes_no(
                        'Are you sure you want to normalize "{}" by replacing it?'.format(my_file.name), 'no') is False:
                    if my_file.closed is False:
                        my_file.close()
                    continue

            try:
                with open('./{}'.format('.'.join(o_)), 'w', encoding='utf-8') as fp:
                    fp.write(
                        str(p_)
                    )
                print('"{}" has been successfully written to disk.'.format('.'.join(o_)))
            except IOError as e:
                print(str(e), file=sys.stderr)
                if my_file.closed is False:
                    my_file.close()
                return 2

        if my_file.closed is False:
            my_file.close()

    return 0