def check_file_paths_parsing_error(main, file_paths):
    """Partition file paths by whether their contents can be decoded.

    Parameters:
        main: application object providing ``settings_custom``
        file_paths: iterable of file paths to probe

    Returns:
        (file_paths_pass, file_paths_parsing_error) — paths that decoded
        cleanly (or are not text-based formats) vs. paths that raised while
        being read with the chosen encoding.
    """
    file_paths_parsing_error = []
    file_paths_pass = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            # Only text-based formats are probed; all other types pass through
            if os.path.splitext(file_path)[1] in [
                '.txt', '.csv', '.htm', '.html', '.xml', '.tmx'
            ]:
                if main.settings_custom['files']['auto_detection_settings'][
                        'detect_encodings']:
                    encoding = wl_detection.detect_encoding(main, file_path)
                else:
                    encoding = main.settings_custom['auto_detection'][
                        'default_settings']['default_encoding']

                try:
                    # Iterate the file to force decoding of every line; the
                    # text itself is not needed (the original accumulated it
                    # into an unused string, which was quadratic)
                    with open(file_path, 'r', encoding=encoding) as f:
                        for line in f:
                            pass
                # Narrowed from a bare "except:" so KeyboardInterrupt and
                # SystemExit are no longer swallowed
                except Exception:
                    file_paths_parsing_error.append(file_path)
                else:
                    file_paths_pass.append(file_path)
            else:
                file_paths_pass.append(file_path)

    return file_paths_pass, file_paths_parsing_error
def test_detection_encoding(file_path):
    """Verify that the detected encoding matches the code embedded in the
    file name, e.g. ``sample (utf_8).txt``."""
    file_name = os.path.basename(file_path)

    print(f'Detecting encoding for file "{file_name}"... ', end='')

    detected = wl_detection.detect_encoding(main, file_path)
    # The expected encoding code sits between the last "(...)" pair before ".txt"
    expected = re.search(r'(?<=\()[^\(\)]+?(?=\)\.txt)', file_name).group()

    print(f'Detected: {detected}')

    assert detected == expected
def check_files_parsing_error(main, files):
    """Partition files by whether their contents can be decoded.

    Parameters:
        main: application object providing ``settings_custom``
        files: either a list of Wordless file dicts (with ``'path'`` and
            ``'encoding'`` keys) or a list of raw file path strings

    Returns:
        (files_ok, files_parsing_error) — elements of ``files`` that decoded
        cleanly (or are not in the probed formats) vs. those that raised
        while being read.
    """
    files_parsing_error = []
    files_ok = []

    if files:
        # Wordless files
        if isinstance(files[0], dict):
            for file in files:
                file_path = file['path']

                if os.path.splitext(file_path)[1] in [
                    '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    try:
                        # Force decoding of every line; contents are unused
                        with open(file_path, 'r',
                                  encoding=file['encoding']) as f:
                            for line in f:
                                pass
                    # Narrowed from a bare "except:"
                    except Exception:
                        files_parsing_error.append(file)
                    else:
                        files_ok.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif isinstance(files[0], str):
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if os.path.splitext(file_path)[1] in [
                    '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    if main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding, _ = wl_detection.detect_encoding(
                            main, file_path)
                    else:
                        encoding = main.settings_custom['auto_detection'][
                            'default_settings']['default_encoding']

                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            for line in f:
                                pass
                    except Exception:
                        files_parsing_error.append(file_path)
                    else:
                        files_ok.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_parsing_error
def test_detection_encoding(file_name):
    """Verify encoding detection against the code embedded in the file name
    (e.g. ``sample (UTF-8).txt`` -> ``utf_8``)."""
    file = {
        'path': f'wl_tests/files/wl_utils/wl_detection/encoding/{file_name}'
    }
    file['name'] = os.path.basename(file['path'])
    file['encoding'] = 'utf_8'

    detected = wl_detection.detect_encoding(main, file['path'])

    # Normalize the expected code from the file name to the detector's
    # convention: lowercase, hyphens replaced by underscores
    expected = re.search(r'(?<=\()[^\(\)]+?(?=\)\.txt)', file_name).group()
    expected = expected.lower().replace('-', '_')

    assert detected == expected
def _new_file(self, file_path): new_file = {} detection_success_encoding = True detection_success_text_type = True detection_success_lang = True new_file['selected'] = True new_file['path'] = file_path new_file['name'], _ = os.path.splitext( os.path.basename(new_file['path'])) new_file['name_old'] = new_file['name'] # Detect encodings if self.main.settings_custom['files']['auto_detection_settings'][ 'detect_encodings']: (new_file['encoding'], detection_success_encoding) = wl_detection.detect_encoding( self.main, new_file['path']) else: new_file['encoding'] = self.main.settings_custom['auto_detection'][ 'default_settings']['default_encoding'] # Detect text types if self.main.settings_custom['files']['auto_detection_settings'][ 'detect_text_types']: (new_file['text_type'], detection_success_text_type) = wl_detection.detect_text_type( self.main, new_file) else: new_file['text_type'] = self.main.settings_custom[ 'auto_detection']['default_settings']['default_text_type'] # Detect languages if self.main.settings_custom['files']['auto_detection_settings'][ 'detect_langs']: (new_file['lang'], detection_success_lang) = wl_detection.detect_lang( self.main, new_file) else: new_file['lang'] = self.main.settings_custom['auto_detection'][ 'default_settings']['default_lang'] return (new_file, detection_success_encoding, detection_success_text_type, detection_success_lang)
def check_file_paths_empty(main, file_paths):
    """Partition file paths into non-empty and empty files.

    A text file counts as empty when every line is blank or whitespace;
    other file types count as empty when their size is zero.

    Parameters:
        main: application object providing ``settings_custom``
        file_paths: iterable of file paths

    Returns:
        (file_paths_pass, file_paths_empty)
    """
    file_paths_empty = []
    file_paths_pass = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            # Text files
            if os.path.splitext(file_path)[1] in [
                '.txt', '.csv', '.htm', '.html', '.xml', '.tmx'
            ]:
                if main.settings_custom['files']['auto_detection_settings'][
                        'detect_encodings']:
                    encoding = wl_detection.detect_encoding(main, file_path)
                else:
                    encoding = main.settings_custom['auto_detection'][
                        'default_settings']['default_encoding']

                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        # Empty iff every line is blank (all() short-circuits
                        # on the first non-blank line)
                        empty_file = all(not line.strip() for line in f)
                # Narrowed from a bare "except:". Unreadable/undecodable
                # files are deliberately passed through: decoding problems
                # are reported by a separate check, not treated as "empty"
                except Exception:
                    file_paths_pass.append(file_path)
                else:
                    if empty_file:
                        file_paths_empty.append(file_path)
                    else:
                        file_paths_pass.append(file_path)
            # Other file types
            else:
                if os.stat(file_path).st_size:
                    file_paths_pass.append(file_path)
                else:
                    file_paths_empty.append(file_path)

    return file_paths_pass, file_paths_empty
def run(self):
    """Worker entry point: convert each selected file to plain text (when
    needed), register the results via ``wl_files._new_file``, then signal
    completion.

    Side effects: writes converted copies into the configured temp-files
    directory, updates the import default path setting, and emits
    ``progress_updated`` / ``worker_done`` Qt signals.
    """
    new_files = []

    if self.file_paths:
        len_file_paths = len(self.file_paths)

        for i, file_path in enumerate(self.file_paths):
            self.progress_updated.emit(
                self.tr(f'Opening files ... ({i + 1}/{len_file_paths})'))

            # Temp-file destination directory and output encoding come from
            # the user's import settings
            default_dir = wl_checking_misc.check_dir(
                self.main.settings_custom['import']['temp_files']
                ['default_path'])
            default_encoding = self.main.settings_custom['import'][
                'temp_files']['default_encoding']

            file_path = wl_misc.get_normalized_path(file_path)
            file_name, file_ext = os.path.splitext(
                os.path.basename(file_path))
            file_ext = file_ext.lower()

            # Text files
            if file_ext == '.txt':
                # Plain text needs no conversion — register directly
                new_files.append(self.main.wl_files._new_file(file_path))
            else:
                if file_ext in ['.docx', '.xlsx', '.xls']:
                    new_path = wl_checking_misc.check_new_path(
                        os.path.join(default_dir, f'{file_name}.txt'))

                    # NOTE(review): '.xls' passes the extension check above
                    # but has no converter branch below, so nothing is ever
                    # written to new_path for .xls inputs — confirm whether
                    # .xls support was intended here.

                    # Word documents
                    if file_ext == '.docx':
                        lines = []  # NOTE(review): dead local, never used

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            doc = docx.Document(file_path)

                            # Paragraphs become lines; table rows become
                            # tab-separated lines
                            for block in self.iter_block_items(doc):
                                if type(block
                                        ) == docx.text.paragraph.Paragraph:
                                    f.write(f'{block.text}\n')
                                elif type(block) == docx.table.Table:
                                    for row in self.iter_visual_cells(block):
                                        cells = []

                                        for cell in row:
                                            cells.append(' '.join([
                                                item.text for item in
                                                self.iter_cell_items(cell)
                                            ]))

                                        f.write('\t'.join(cells) + '\n')
                    # Excel workbooks
                    elif file_ext == '.xlsx':
                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            workbook = openpyxl.load_workbook(
                                file_path, data_only=True)

                            # Each worksheet row becomes a tab-separated line
                            for worksheet_name in workbook.sheetnames:
                                worksheet = workbook[worksheet_name]

                                for row in worksheet.rows:
                                    # NOTE(review): "!= None" should be
                                    # "is not None"
                                    f.write('\t'.join([
                                        (cell.value
                                         if cell.value != None else '')
                                        for cell in row
                                    ]) + '\n')

                    new_paths = [new_path]
                else:
                    # Detect encoding
                    if self.main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding_code, _ = wl_detection.detect_encoding(
                            self.main, file_path)
                    else:
                        encoding_code = self.main.settings_custom[
                            'auto_detection']['default_settings'][
                                'default_encoding']

                    # CSV files
                    if file_ext == '.csv':
                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        # Rows become tab-separated lines
                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            with open(file_path, 'r', newline='',
                                      encoding=encoding_code) as f_csv:
                                csv_reader = csv.reader(f_csv)

                                for row in csv_reader:
                                    f.write('\t'.join(row) + '\n')

                        new_paths = [new_path]
                    # HTML files
                    elif file_ext in ['.htm', '.html']:
                        # Keep visible text only
                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml')

                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            f.write(soup.get_text())

                        new_paths = [new_path]
                    # XML files
                    elif file_ext == '.xml':
                        # XML is copied as-is, only re-encoded
                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            xml_text = f.read()

                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.xml'))

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            f.write(xml_text)

                        new_paths = [new_path]
                    # Translation memory files
                    elif file_ext == '.tmx':
                        lines_src = []
                        lines_target = []

                        # Each <tu> holds a source and a target <seg>;
                        # they are split into two separate text files
                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml-xml')

                            for tu in soup.find_all('tu'):
                                seg_src, seg_target = tu.find_all('seg')

                                lines_src.append(seg_src.get_text())
                                lines_target.append(seg_target.get_text())

                        path_src = wl_checking_misc.check_new_path(
                            os.path.join(default_dir,
                                         f'{file_name}_source.txt'))
                        path_target = wl_checking_misc.check_new_path(
                            os.path.join(default_dir,
                                         f'{file_name}_target.txt'))

                        with open(path_src, 'w',
                                  encoding=default_encoding) as f:
                            f.write('\n'.join(lines_src))
                            f.write('\n')

                        with open(path_target, 'w',
                                  encoding=default_encoding) as f:
                            f.write('\n'.join(lines_target))
                            f.write('\n')

                        new_paths = [path_src, path_target]

                # Register every converted file (txt=False: already plain
                # text in the temp dir, skip re-conversion)
                for new_path in new_paths:
                    new_files.append(
                        self.main.wl_files._new_file(new_path, txt=False))

        # Remember the directory of the first opened file as the next
        # default import location
        self.main.settings_custom['import']['files'][
            'default_path'] = wl_misc.get_normalized_dir(self.file_paths[0])

    self.progress_updated.emit(self.tr('Updating table ...'))

    time.sleep(0.1)

    self.worker_done.emit(new_files)
def _new_file(self, file_path, txt=True): new_file = {} detect_pass_encoding = True detect_pass_lang = True new_file['selected'] = True new_file['path'] = file_path if new_file['path'].endswith('.txt'): new_file['tokenized'] = 'No' new_file['tagged'] = 'No' elif new_file['path'].endswith('.xml'): new_file['tokenized'] = 'Yes' new_file['tagged'] = 'Yes' new_file['name'], _ = os.path.splitext( os.path.basename(new_file['path'])) new_file['name_old'] = new_file['name'] # Detect encodings if self.main.settings_custom['files']['auto_detection_settings'][ 'detect_encodings']: new_file['encoding'] = wl_detection.detect_encoding( self.main, new_file['path']) else: new_file['encoding'] = self.main.settings_custom['auto_detection'][ 'default_settings']['default_encoding'] # Detect languages if self.main.settings_custom['files']['auto_detection_settings'][ 'detect_langs']: new_file['lang'] = wl_detection.detect_lang(self.main, new_file) else: new_file['lang'] = self.main.settings_custom['auto_detection'][ 'default_settings']['default_lang'] if txt: default_dir = wl_checking_misc.check_dir( self.main.settings_custom['import']['temp_files'] ['default_path']) new_file['path'] = os.path.join(default_dir, re.split(r'[/\\]', file_path)[-1]) new_file['path'] = wl_checking_misc.check_new_path( new_file['path']) # Remove header tags tags_header = [] for _, _, tag_opening, _ in self.main.settings_custom['tags'][ 'tags_header']: tags_header.append(tag_opening[1:-1]) text = '' with open(file_path, 'r', encoding=new_file['encoding']) as f: for line in f: text += line # The "lxml" parser will add <html><body> to the text, which is undesirable with open(new_file['path'], 'w', encoding='utf_8') as f: soup = bs4.BeautifulSoup(text, features='html.parser') for tag_header in tags_header: for header_element in soup.select(tag_header): header_element.decompose() f.write(str(soup)) return new_file
def check_files_empty(main, files):
    """Partition files into non-empty and empty files.

    A text file counts as empty when every line is blank or whitespace;
    other file types count as empty when their size is zero.

    Parameters:
        main: application object providing ``settings_custom``
        files: either a list of Wordless file dicts (with ``'path'`` and
            ``'encoding'`` keys) or a list of raw file path strings

    Returns:
        (files_ok, files_empty)
    """
    files_empty = []
    files_ok = []

    text_exts = ['.txt', '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc']

    if files:
        # Wordless files
        if isinstance(files[0], dict):
            for file in files:
                file_path = file['path']

                # Text files
                if os.path.splitext(file_path)[1] in text_exts:
                    try:
                        with open(file_path, 'r',
                                  encoding=file['encoding']) as f:
                            # Empty iff every line is blank (short-circuits
                            # on the first non-blank line)
                            empty_file = all(not line.strip() for line in f)
                    # Narrowed from a bare "except:". Undecodable files are
                    # deliberately kept in files_ok: a separate decoding
                    # check reports them
                    except Exception:
                        files_ok.append(file)
                    else:
                        if empty_file:
                            files_empty.append(file)
                        else:
                            files_ok.append(file)
                # Other file types
                else:
                    if os.stat(file_path).st_size:
                        files_ok.append(file)
                    else:
                        files_empty.append(file)
        # File paths
        elif isinstance(files[0], str):
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                # Text files
                if os.path.splitext(file_path)[1] in text_exts:
                    if main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding, _ = wl_detection.detect_encoding(
                            main, file_path)
                    else:
                        encoding = main.settings_custom['auto_detection'][
                            'default_settings']['default_encoding']

                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            empty_file = all(not line.strip() for line in f)
                    except Exception:
                        files_ok.append(file_path)
                    else:
                        if empty_file:
                            files_empty.append(file_path)
                        else:
                            files_ok.append(file_path)
                # Other file types
                else:
                    if os.stat(file_path).st_size:
                        files_ok.append(file_path)
                    else:
                        files_empty.append(file_path)

    return files_ok, files_empty
def import_list(self, settings):
    """Import items into this list widget from one or more text files.

    Opens a file dialog, checks the chosen files for emptiness and
    decoding errors, and either shows an error dialog or loads all new,
    de-duplicated lines into the list.

    Parameters:
        settings: key under ``settings_custom['import']`` naming the
            settings group for this list (default path, encoding options)

    Side effects: updates the group's default import path, mutates the
    list model, emits ``itemChanged``, and shows status-bar messages.
    """
    files = []

    # Fall back to the default-settings path if the remembered one is gone
    if os.path.exists(
            self.main.settings_custom['import'][settings]['default_path']):
        default_dir = self.main.settings_custom['import'][settings][
            'default_path']
    else:
        default_dir = self.main.settings_default['import'][settings][
            'default_path']

    file_paths = QFileDialog.getOpenFileNames(
        self.main, self.tr('Import from File(s)'), default_dir,
        self.tr('Text File (*.txt)'))[0]

    if file_paths:
        # Remember the chosen directory for next time
        self.main.settings_custom['import'][settings][
            'default_path'] = os.path.normpath(
                os.path.dirname(file_paths[0]))

        # Detect encodings
        if self.main.settings_custom['import'][settings][
                'detect_encodings']:
            for file_path in file_paths:
                files.append({
                    'path': wl_misc.get_normalized_path(file_path),
                    'encoding':
                        wl_detection.detect_encoding(self.main,
                                                     file_path)[0]
                })
        else:
            for file_path in file_paths:
                files.append({
                    'path': wl_misc.get_normalized_path(file_path),
                    'encoding': self.main.settings_custom['auto_detection']
                    ['default_settings']['default_encoding']
                })

        # Filter out empty files first, then files that fail to decode
        files_ok, files_empty = wl_checking_file.check_files_empty(
            self.main, files)
        files_ok, files_decoding_error = wl_checking_file.check_files_decoding_error(
            self.main, files_ok)

        # Extract file paths
        files_empty = [file['path'] for file in files_empty]
        files_decoding_error = [
            file['path'] for file in files_decoding_error
        ]

        if files_empty or files_decoding_error:
            wl_dialog_error.wl_dialog_error_import(
                self.main, files_empty=files_empty,
                files_decoding_error=files_decoding_error)

            wl_msg.wl_msg_import_list_error(self.main)
        else:
            # Check duplicate items
            items_to_import = []
            items_cur = self.get_items()
            num_prev = len(items_cur)

            for file in files_ok:
                with open(file['path'], 'r',
                          encoding=file['encoding']) as f:
                    for line in f:
                        line = line.strip()

                        # Skip lines already present in the list
                        if line not in items_cur:
                            items_to_import.append(line)

            # OrderedDict.fromkeys de-duplicates while preserving order
            self.load_items(
                collections.OrderedDict.fromkeys(items_to_import))
            self.itemChanged.emit(self.item(0))

            wl_msg.wl_msg_import_list_success(self.main, num_prev,
                                              len(self.get_items()))
def imp_list(self):
    """Import items into this list widget from one or more text files.

    Opens a file dialog, rejects empty files with an error dialog, and
    otherwise appends all new, non-duplicate lines to the list model.

    Side effects: updates the remembered import path, mutates the list
    model, opens dialogs, and shows status-bar messages.
    """
    # Fall back to the default-settings path if the remembered one is gone
    if os.path.exists(self.main.settings_custom['imp'][self.settings]
                      ['default_path']):
        default_dir = self.main.settings_custom['imp'][
            self.settings]['default_path']
    else:
        default_dir = self.main.settings_default['imp'][
            self.settings]['default_path']

    file_paths = QFileDialog.getOpenFileNames(
        self.main,
        _tr('Wl_List_Add_Ins_Del_Clr_Imp_Exp', 'Import from Files'),
        default_dir,
        _tr('Wl_List_Add_Ins_Del_Clr_Imp_Exp', 'Text File (*.txt)'))[0]

    if file_paths:
        # Modify default path
        self.main.settings_custom['imp'][
            self.settings]['default_path'] = os.path.normpath(
                os.path.dirname(file_paths[0]))

        file_paths, file_paths_empty = wl_checking_files.check_file_paths_empty(
            self.main, file_paths)

        if file_paths_empty:
            dialog_err_files = wl_dialogs_errs.Wl_Dialog_Err_Files(
                self.main,
                _tr('Wl_List_Add_Ins_Del_Clr_Imp_Exp', 'Import Error'))
            dialog_err_files.label_err.set_text(
                _tr(
                    'Wl_List_Add_Ins_Del_Clr_Imp_Exp', '''
                        <div>
                            An error occurred during import, please check the following files and try again.
                        </div>
                    '''))

            dialog_err_files.table_err_files.model().setRowCount(
                len(file_paths_empty))

            # Suspend repaints while the error table is populated
            dialog_err_files.table_err_files.disable_updates()

            for i, file_path in enumerate(file_paths_empty):
                dialog_err_files.table_err_files.model().setItem(
                    i, 0,
                    QStandardItem(
                        _tr('Wl_List_Add_Ins_Del_Clr_Imp_Exp',
                            'Empty File')))
                dialog_err_files.table_err_files.model().setItem(
                    i, 1, QStandardItem(file_path))

            dialog_err_files.table_err_files.enable_updates()

            dialog_err_files.open()

            # Fixed typo: "occured" -> "occurred"
            self.main.statusBar().showMessage(
                _tr('Wl_List_Add_Ins_Del_Clr_Imp_Exp',
                    'An error occurred during import!'))
        else:
            # Check duplicate items
            items_to_imp = []
            items_cur = self.model().stringList()
            num_prev = len(items_cur)

            for file_path in file_paths:
                # Detect encodings
                if self.main.settings_custom['imp'][
                        self.settings]['detect_encodings']:
                    encoding = wl_detection.detect_encoding(
                        self.main, file_path)
                else:
                    encoding = self.main.settings_custom['imp'][
                        self.settings]['default_encoding']

                # errors='replace': undecodable bytes must not abort import
                with open(file_path, 'r', encoding=encoding,
                          errors='replace') as f:
                    text = f.read()

                for line in text.split('\n'):
                    line = line.strip()

                    # Skip blank lines and lines already in the list
                    if line and line not in items_cur:
                        items_to_imp.append(line)

            self._add_items(items_to_imp)

            num_imps = self.model().rowCount() - num_prev
            msg_item = _tr('Wl_List_Add_Ins_Del_Clr_Imp_Exp',
                           'item') if num_imps == 1 else _tr(
                               'Wl_List_Add_Ins_Del_Clr_Imp_Exp', 'items')

            self.main.statusBar().showMessage(
                _tr('Wl_List_Add_Ins_Del_Clr_Imp_Exp',
                    '{} {} has been successfully imported into the list.').
                format(num_imps, msg_item))
def run(self):
    """Worker entry point: convert each selected file to plain text (when
    needed), register the results via ``wl_files._new_file``, collect
    detection failures, then signal completion.

    Side effects: writes converted copies into the configured temp-files
    directory, updates the import default path setting, and emits
    ``progress_updated`` / ``worker_done`` Qt signals. ``worker_done``
    carries the new files plus lists of paths whose encoding, text type,
    or language could not be auto-detected.
    """
    new_files = []
    files_detection_error_encoding = []
    files_detection_error_text_type = []
    files_detection_error_lang = []

    if self.file_paths:
        len_file_paths = len(self.file_paths)

        for i, file_path in enumerate(self.file_paths):
            self.progress_updated.emit(
                self.tr(f'Opening files ... ({i + 1}/{len_file_paths})'))

            # Temp-file destination directory and output encoding come from
            # the user's import settings
            default_dir = wl_checking_misc.check_dir(
                self.main.settings_custom['import']['temp_files']
                ['default_path'])
            default_encoding = self.main.settings_custom['import'][
                'temp_files']['default_encoding']

            file_path = wl_misc.get_normalized_path(file_path)
            file_name, file_ext = os.path.splitext(
                os.path.basename(file_path))
            file_ext = file_ext.lower()

            # Text files
            if file_ext == '.txt':
                # Plain text needs no conversion — register directly and
                # record any detection failures
                (new_file, detection_success_encoding,
                 detection_success_text_type, detection_success_lang
                 ) = self.main.wl_files._new_file(file_path)

                new_files.append(new_file)

                if not detection_success_encoding:
                    files_detection_error_encoding.append(new_file['path'])
                if not detection_success_text_type:
                    files_detection_error_text_type.append(
                        new_file['path'])
                if not detection_success_lang:
                    files_detection_error_lang.append(new_file['path'])
            else:
                if file_ext in ['.docx', '.xlsx', '.xls']:
                    new_path = wl_checking_misc.check_new_path(
                        os.path.join(default_dir, f'{file_name}.txt'))

                    # Word documents
                    if file_ext == '.docx':
                        lines = []  # NOTE(review): dead local, never used

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            doc = docx.Document(file_path)

                            # Paragraphs become lines; table rows become
                            # tab-separated lines
                            for block in self.iter_block_items(doc):
                                if type(block
                                        ) == docx.text.paragraph.Paragraph:
                                    f.write(f'{block.text}\n')
                                elif type(block) == docx.table.Table:
                                    for row in self.iter_visual_cells(block):
                                        cells = []

                                        for cell in row:
                                            cells.append(' '.join([
                                                item.text for item in
                                                self.iter_cell_items(cell)
                                            ]))

                                        f.write('\t'.join(cells) + '\n')
                    # Excel workbooks
                    elif file_ext == '.xlsx':
                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            workbook = openpyxl.load_workbook(
                                file_path, data_only=True)

                            # Each worksheet row becomes a tab-separated line
                            for worksheet_name in workbook.sheetnames:
                                worksheet = workbook[worksheet_name]

                                for row in worksheet.rows:
                                    # NOTE(review): "!= None" should be
                                    # "is not None"
                                    f.write('\t'.join([
                                        (cell.value
                                         if cell.value != None else '')
                                        for cell in row
                                    ]) + '\n')
                    elif file_ext == '.xls':
                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            workbook = xlrd.open_workbook(file_path)

                            for i_sheet in range(workbook.nsheets):
                                worksheet = workbook.sheet_by_index(i_sheet)

                                for row in range(worksheet.nrows):
                                    # NOTE(review): cell_value may return
                                    # non-str (float) — join would raise;
                                    # confirm inputs are text cells
                                    f.write('\t'.join([
                                        worksheet.cell_value(row, col)
                                        for col in range(worksheet.ncols)
                                    ]) + '\n')

                    new_paths = [new_path]
                else:
                    # Detect encoding
                    if self.main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding_code, _ = wl_detection.detect_encoding(
                            self.main, file_path)
                    else:
                        encoding_code = self.main.settings_custom[
                            'auto_detection']['default_settings'][
                                'default_encoding']

                    # CSV files
                    if file_ext == '.csv':
                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        # Rows become tab-separated lines
                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            with open(file_path, 'r', newline='',
                                      encoding=encoding_code) as f_csv:
                                csv_reader = csv.reader(f_csv)

                                for row in csv_reader:
                                    f.write('\t'.join(row) + '\n')

                        new_paths = [new_path]
                    # HTML files
                    elif file_ext in ['.htm', '.html']:
                        # Keep visible text only
                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml')

                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            f.write(soup.get_text())

                        new_paths = [new_path]
                    # XML files
                    elif file_ext == '.xml':
                        # XML contents are copied verbatim into a .txt copy,
                        # only re-encoded
                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            xml_text = f.read()

                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            f.write(xml_text)

                        new_paths = [new_path]
                    # Translation memory files
                    elif file_ext == '.tmx':
                        lines_src = []
                        lines_target = []

                        # Each <tu> holds a source and a target <seg>;
                        # they are split into two separate text files
                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml-xml')

                            for tu in soup.find_all('tu'):
                                seg_src, seg_target = tu.find_all('seg')

                                lines_src.append(seg_src.get_text())
                                lines_target.append(seg_target.get_text())

                        path_src = wl_checking_misc.check_new_path(
                            os.path.join(default_dir,
                                         f'{file_name}_source.txt'))
                        path_target = wl_checking_misc.check_new_path(
                            os.path.join(default_dir,
                                         f'{file_name}_target.txt'))

                        with open(path_src, 'w',
                                  encoding=default_encoding) as f:
                            f.write('\n'.join(lines_src))
                            f.write('\n')

                        with open(path_target, 'w',
                                  encoding=default_encoding) as f:
                            f.write('\n'.join(lines_target))
                            f.write('\n')

                        new_paths = [path_src, path_target]
                    # Lyrics files
                    elif file_ext == '.lrc':
                        lyrics = {}

                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            for line in f:
                                time_tags = []

                                line = line.strip()

                                # Strip time tags
                                while re.search(r'^\[[^\]]+?\]', line):
                                    time_tags.append(
                                        re.search(r'^\[[^\]]+?\]',
                                                  line).group())

                                    line = line[len(time_tags[-1]):].strip()

                                # Strip word time tags
                                line = re.sub(r'<[^>]+?>', r'', line)
                                line = re.sub(r'\s{2,}', r' ', line).strip()

                                # Keep only lines tagged with a valid
                                # [mm:ss.xx] timestamp
                                for time_tag in time_tags:
                                    if re.search(
                                            r'^\[[0-9]{2}:[0-5][0-9]\.[0-9]{2}\]$',
                                            time_tag):
                                        lyrics[time_tag] = line

                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            # Sorting the [mm:ss.xx] keys restores playback
                            # order. NOTE(review): the loop variable shadows
                            # the "lyrics" dict (harmless since sorted()
                            # materialized the items, but fragile)
                            for _, lyrics in sorted(lyrics.items()):
                                f.write(f'{lyrics}\n')

                        new_paths = [new_path]

                # Register every converted file and record detection failures
                for new_path in new_paths:
                    (new_file, detection_success_encoding,
                     detection_success_text_type, detection_success_lang
                     ) = self.main.wl_files._new_file(new_path)

                    new_files.append(new_file)

                    if not detection_success_encoding:
                        files_detection_error_encoding.append(
                            new_file['path'])
                    if not detection_success_text_type:
                        files_detection_error_text_type.append(
                            new_file['path'])
                    if not detection_success_lang:
                        files_detection_error_lang.append(new_file['path'])

        # Remember the directory of the first opened file as the next
        # default import location
        self.main.settings_custom['import']['files'][
            'default_path'] = wl_misc.get_normalized_dir(self.file_paths[0])

    self.progress_updated.emit(self.tr('Updating table ...'))

    time.sleep(0.1)

    self.worker_done.emit(new_files, files_detection_error_encoding,
                          files_detection_error_text_type,
                          files_detection_error_lang)
def _new_file(self, file_path, txt=True):
    """Build a new file record for *file_path*.

    Encoding, text type and language are auto-detected when enabled,
    otherwise defaults are used. When ``txt`` is True the file is copied
    into the temp-files directory with configured header-tag regions
    stripped line by line.

    Returns:
        (new_file, detection_success_encoding,
         detection_success_text_type, detection_success_lang)
    """
    new_file = {}

    detection_success_encoding = True
    detection_success_text_type = True
    detection_success_lang = True

    new_file['selected'] = True
    new_file['path'] = file_path
    new_file['name'], _ = os.path.splitext(
        os.path.basename(new_file['path']))
    new_file['name_old'] = new_file['name']

    # Detect encodings
    if self.main.settings_custom['files']['auto_detection_settings'][
            'detect_encodings']:
        (new_file['encoding'],
         detection_success_encoding) = wl_detection.detect_encoding(
             self.main, new_file['path'])
    else:
        new_file['encoding'] = self.main.settings_custom['auto_detection'][
            'default_settings']['default_encoding']

    # Detect text types
    if self.main.settings_custom['files']['auto_detection_settings'][
            'detect_text_types']:
        (new_file['text_type'],
         detection_success_text_type) = wl_detection.detect_text_type(
             self.main, new_file)
    else:
        new_file['text_type'] = self.main.settings_custom[
            'auto_detection']['default_settings']['default_text_type']

    # Detect languages
    if self.main.settings_custom['files']['auto_detection_settings'][
            'detect_langs']:
        (new_file['lang'],
         detection_success_lang) = wl_detection.detect_lang(
             self.main, new_file)
    else:
        new_file['lang'] = self.main.settings_custom['auto_detection'][
            'default_settings']['default_lang']

    # Remove header tags
    tags_header_opening = []
    tags_header_closing = []

    if txt:
        default_dir = wl_checking_misc.check_dir(
            self.main.settings_custom['import']['temp_files']
            ['default_path'])

        # Re-home the record's path into the temp directory.
        # NOTE(review): re.escape() escapes for *pattern* syntax, but here
        # its output is used inside a re.sub *replacement* template, where
        # escape rules differ — on paths containing regex metacharacters
        # this can produce "bad escape" errors or wrong output. Also the
        # pattern assumes backslash path separators (Windows). Confirm.
        new_file['path'] = re.sub(r'^.+?\\([^\\]+?$)',
                                  fr'{re.escape(default_dir)}\\\1',
                                  file_path)
        new_file['path'] = wl_checking_misc.check_new_path(
            new_file['path'])

        # Build alternation patterns matching from an opening header tag to
        # end of line, and from start of line to a closing header tag
        for tag_opening, tag_closing in self.main.settings_custom['tags'][
                'tags_header']:
            tags_header_opening.append(fr"{tag_opening}.+?")
            tags_header_closing.append(fr".+?{tag_closing}")

        tag_header_opening = '|'.join(tags_header_opening)
        tag_header_closing = '|'.join(tags_header_closing)

        # Copy the file, dropping everything between an opening and a
        # closing header tag (inclusive); output is always UTF-8
        with open(file_path, 'r',
                  encoding=new_file['encoding']) as f, open(
                      new_file['path'], 'w', encoding='utf_8') as f_temp:
            # True while we are inside an unclosed header region
            tags_header = False

            for line in f:
                if tags_header:
                    # NOTE(review): lines wholly inside a header region
                    # (no closing tag on the line) are silently dropped —
                    # appears intentional
                    if re.search(tag_header_closing, line):
                        f_temp.write(re.sub(tag_header_closing, '', line))

                        tags_header = False
                elif re.search(tag_header_opening, line):
                    f_temp.write(re.sub(tag_header_opening, '', line))

                    tags_header = True
                else:
                    f_temp.write(line)

    return (new_file, detection_success_encoding,
            detection_success_text_type, detection_success_lang)