Example #1
    def __init__(self):
        super().__init__()

        self.threads_check_updates = []

        self.setWindowTitle(self.tr('Wordless'))
        self.setWindowIcon(
            QIcon(wordless_misc.get_abs_path('imgs/wordless_icon.ico')))

        # Default settings
        wordless_settings_default.init_settings_default(self)

        # Custom settings
        path_settings = wordless_misc.get_abs_path('wordless_settings.pickle')

        if os.path.exists(path_settings):
            with open(path_settings, 'rb') as f:
                settings_custom = pickle.load(f)

            if wordless_checking_misc.check_custom_settings(
                    settings_custom, self.settings_default):
                self.settings_custom = settings_custom
            else:
                self.settings_custom = copy.deepcopy(self.settings_default)
        else:
            self.settings_custom = copy.deepcopy(self.settings_default)

        # Global settings
        wordless_settings_global.init_settings_global(self)

        # Settings
        self.wordless_settings = wordless_settings.Wordless_Settings(self)

        # Menu
        self.init_menu()

        # Work Area & File Area
        self.init_central_widget()

        # Status Bar
        self.statusBar().showMessage(self.tr('Ready!'))

        self.statusBar().setFixedHeight(22)
        self.statusBar().setStyleSheet('''
            QStatusBar {
                background-color: #D0D0D0;
            }
        ''')

        # Check for updates on startup
        if self.settings_custom['general']['update_settings'][
                'check_updates_on_startup']:
            self.dialog_check_updates = self.help_check_updates(
                on_startup=True)

        self.load_settings()

        # Fix layout on macOS
        if platform.system() == 'Darwin':
            self.fix_macos_layout(self)
Example #2
def check_files_unsupported(main, files):
    files_unsupported = []
    files_ok = []

    file_exts = [
        ext for file_type in main.settings_global['file_types']['files']
        for ext in re.findall(r'(?<=\*)\.[a-z]+', file_type)
    ]

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if os.path.splitext(file['path'])[1].lower() not in file_exts:
                    files_unsupported.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wordless_misc.get_abs_path(file_path)

                if os.path.splitext(file_path)[1].lower() not in file_exts:
                    files_unsupported.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_unsupported
Example #3
    def __init__(self, main, title):
        super().__init__(main)

        self.main = main

        self.setWindowTitle(title)
        self.setWindowIcon(
            QIcon(wordless_misc.get_abs_path('imgs/wordless_icon.ico')))
        self.setWindowFlag(Qt.MSWindowsFixedSizeDialogHint, True)
        self.setWindowFlag(Qt.WindowContextHelpButtonHint, False)
Example #4
    def __init__(self):
        super().__init__(QPixmap(wordless_misc.get_abs_path('imgs/wordless_loading.png')))

        msg_font = QFont('Times New Roman')
        msg_font.setPixelSize(14)

        self.setFont(msg_font)
        self.showMessage(
            self.tr(' Loading Wordless ...\n Please wait, it should only take a few seconds.'),
            color = Qt.white,
            alignment = Qt.AlignLeft | Qt.AlignBottom
        )
Example #5
def check_files_parsing_error(main, files):
    files_parsing_error = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                file_path = file['path']

                if os.path.splitext(file_path)[1] in [
                        '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    try:
                        with open(file_path, 'r',
                                  encoding=file['encoding']) as f:
                            for line in f:
                                pass
                    except Exception:
                        files_parsing_error.append(file)
                    else:
                        files_ok.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wordless_misc.get_abs_path(file_path)

                if os.path.splitext(file_path)[1] in [
                        '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    if main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding, _ = wordless_detection.detect_encoding(
                            main, file_path)
                    else:
                        encoding = main.settings_custom['auto_detection'][
                            'default_settings']['default_encoding']

                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            for line in f:
                                pass
                    except Exception:
                        files_parsing_error.append(file_path)
                    else:
                        files_ok.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_parsing_error
Example #6
    def __init__(self, main, title, width = 0, height = 0):
        super().__init__(main)

        self.main = main

        if width:
            self.setFixedWidth(width)
        if height:
            self.setFixedHeight(height)

        self.setWindowTitle(title)
        self.setWindowIcon(QIcon(wordless_misc.get_abs_path('imgs/wordless_icon.ico')))
        self.setWindowFlag(Qt.MSWindowsFixedSizeDialogHint, True)
        self.setWindowFlag(Qt.WindowContextHelpButtonHint, False)
Example #7
def check_files_missing(main, files):
    files_missing = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if os.path.exists(file['path']):
                    files_ok.append(file)
                else:
                    files_missing.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wordless_misc.get_abs_path(file_path)

                if os.path.exists(file_path):
                    files_ok.append(file_path)
                else:
                    files_missing.append(file_path)

    return files_ok, files_missing
Example #8
def check_files_duplicate(main, files):
    files_duplicate = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if main.wordless_files.find_file_by_path(file['path']):
                    files_duplicate.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wordless_misc.get_abs_path(file_path)

                if main.wordless_files.find_file_by_path(file_path):
                    files_duplicate.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_duplicate
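
Each of the check_files_* helpers in this collection takes (main, files) and returns a (files_ok, files_rejected) pair, so the checks can be chained by feeding each call's files_ok into the next, which is how Example #15 below uses check_files_empty. A minimal sketch, assuming main is the running Wordless main window and file_paths is a list of paths to load; the order of checks shown here is only illustrative:

# Hedged sketch: chain the file checks, narrowing files_ok at each step.
files_ok, files_missing = check_files_missing(main, file_paths)
files_ok, files_duplicate = check_files_duplicate(main, files_ok)
files_ok, files_unsupported = check_files_unsupported(main, files_ok)
files_ok, files_parsing_error = check_files_parsing_error(main, files_ok)
files_ok, files_empty = check_files_empty(main, files_ok)
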
Example #9
    def restart(self):
        if getattr(sys, '_MEIPASS', False):
            if platform.system() == 'Windows':
                subprocess.Popen([wordless_misc.get_abs_path('Wordless.exe')])
            elif platform.system() == 'Darwin':
                subprocess.Popen([wordless_misc.get_abs_path('Wordless')])
            elif platform.system() == 'Linux':
                subprocess.Popen([wordless_misc.get_abs_path('Wordless')])
        else:
            if platform.system() == 'Windows':
                subprocess.Popen(['python', wordless_misc.get_abs_path(__file__)])
            elif platform.system() == 'Darwin':
                subprocess.Popen(['python3', wordless_misc.get_abs_path(__file__)])
            elif platform.system() == 'Linux':
                subprocess.Popen(['python3.7', wordless_misc.get_abs_path(__file__)])

        self.save_settings()
        sys.exit(0)
Example #10
def wordless_get_stop_words(main, lang, list_stop_words='default'):
    if list_stop_words == 'default':
        list_stop_words = main.settings_custom['stop_words']['stop_words'][
            lang]

    lang_639_1 = wordless_conversion.to_iso_639_1(main, lang)

    # Chinese (Simplified)
    if lang_639_1 == 'zh_cn':
        lang_639_1 = 'zh'

    if 'Stopwords ISO' in list_stop_words:
        # Norwegian Bokmål & Norwegian Nynorsk
        if lang_639_1 in ['nb', 'nn']:
            lang_639_1 = 'no'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_abs_path(
                    'stop_words/Stopwords ISO/stop_words_zh_tw.txt'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            with open(wordless_misc.get_abs_path(
                    'stop_words/Stopwords ISO/stopwords_iso.json'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = json.load(f)[lang_639_1]
    elif 'spaCy' in list_stop_words:
        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_abs_path(
                    'stop_words/spaCy/stop_words_zh_tw.txt'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

            stop_words = spacy_lang.STOP_WORDS
    elif 'NLTK' in list_stop_words:
        lang_texts = {
            'ara': 'arabic',
            'aze': 'azerbaijani',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            'ell': 'greek',
            'hun': 'hungarian',
            'ind': 'indonesian',
            'ita': 'italian',
            'kaz': 'kazakh',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish'
        }

        stop_words = nltk.corpus.stopwords.words(lang_texts[lang])
    # Greek (Ancient)
    elif list_stop_words == main.tr(
            'grk-stoplist - Greek (Ancient) Stop Words'):
        with open(wordless_misc.get_abs_path(
                'stop_words/grk-stoplist/stoplist-greek.txt'),
                  'r',
                  encoding='utf_8') as f:
            stop_words = [line.rstrip() for line in f.readlines()]
    # Thai
    elif list_stop_words == main.tr('PyThaiNLP - Thai Stop Words'):
        stop_words = pythainlp.corpus.common.thai_stopwords()
    # Custom Lists
    elif list_stop_words == main.tr('Custom List'):
        stop_words = main.settings_custom['stop_words']['custom_lists'][lang]

    return sorted(stop_words)
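
A minimal usage sketch, assuming main is the running application instance and that English ('eng') and Thai ('tha') stop word lists are configured; only the PyThaiNLP list name appears verbatim above:

# Hedged sketch: 'default' falls back to the list selected in the settings,
# while an explicit list name can be passed directly.
stop_words_default = wordless_get_stop_words(main, 'eng')
stop_words_thai = wordless_get_stop_words(main, 'tha', list_stop_words='PyThaiNLP - Thai Stop Words')
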
Example #11
def wordless_lemmatize(main,
                       tokens,
                       lang,
                       text_type=('untokenized', 'untagged'),
                       lemmatizer='default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wordless_matching.get_re_tags(main, tags='all')
    re_tags_pos = wordless_matching.get_re_tags(main, tags='pos')
    re_tags_non_pos = wordless_matching.get_re_tags(main, tags='non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [
            ''.join(re.findall(re_tags_non_pos, token)) for token in tokens
        ]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Remove empty tokens, recording their original positions
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            tokens.remove(token)

            empty_offsets.append(i)

    wordless_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][
                lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wordless_pos_tag(
                    main,
                    tokens,
                    lang='eng',
                    pos_tagger='NLTK - Perceptron POS Tagger',
                    tagset='universal'):
                if pos == 'ADJ':
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr(
                'lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wordless_misc.get_abs_path(
                    'lemmatization/lemmalist-greek/lemmalist-greek.txt'),
                      'r',
                      encoding='utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

            for token in tokens:
                lemmas.append(
                    morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('pybo - Tibetan Lemmatizer'):
            word_tokenizer = main.settings_custom['word_tokenization'][
                'word_tokenizers'][lang]

            wordless_text_utils.check_pybo_tokenizers(
                main, word_tokenizer=word_tokenizer)

            if word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (GMD)'):
                tokens = main.pybo_tokenizer_gmd.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (POS)'):
                tokens = main.pybo_tokenizer_pos.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (tsikchen)'):
                tokens = main.pybo_tokenizer_tsikchen.tokenize(
                    ' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wordless_conversion.to_iso_639_1(main, lang)

            with open(wordless_misc.get_abs_path(
                    f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'
            ),
                      'r',
                      encoding='utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except Exception:
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Insert empty lemmas
    for empty_offset in empty_offsets:
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
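
A minimal usage sketch, assuming main is the running application instance with an English lemmatizer configured in the settings:

# Hedged sketch: lemmatize a few untokenized, untagged English tokens
# with whatever lemmatizer is set as the default for 'eng'.
lemmas = wordless_lemmatize(main, ['The', 'striped', 'bats', 'were', 'hanging'], lang='eng')
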
Example #12
def wordless_get_stop_words(main, lang, list_stop_words='default'):
    if list_stop_words == 'default':
        list_stop_words = main.settings_custom['stop_words']['stop_words'][
            lang]

    lang_639_1 = wordless_conversion.to_iso_639_1(main, lang)

    # Chinese (Simplified)
    if lang_639_1 == 'zh_cn':
        lang_639_1 = 'zh'

    # extra-stopwords
    if 'extra-stopwords' in list_stop_words:
        LANG_TEXTS = {
            'sqi': 'albanian',
            'ara': 'arabic',
            'hye': 'armenian',
            'eus': 'basque',
            'bel': 'belarusian',
            'ben': 'bengali',
            'bul': 'bulgarian',
            'cat': 'catalan',
            'zho_cn': 'chinese',
            # Chinese (Traditional)
            'zho_tw': 'chinese-traditional',
            'hrv': 'croatian',
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'glg': 'galician',
            'deu': 'german',
            'ell': 'greek',
            'hau': 'hausa',
            'heb': 'hebrew',
            'hin': 'hindi',
            'hun': 'hungarian',
            'isl': 'icelandic',
            'ind': 'indonesian',
            'gle': 'irish',
            'ita': 'italian',
            'jpn': 'japanese',
            'kor': 'korean',
            'kur': 'kurdish',
            'lav': 'latvian',
            'lit': 'lithuanian',
            'msa': 'malay',
            'mar': 'marathi',
            'mon': 'mongolian',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'fas': 'persian',
            'pol': 'polish',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'srp_cyrl': 'serbian-cyrillic',
            'srp_latn': 'serbian',
            'slk': 'slovak',
            'slv': 'slovenian',
            'spa': 'spanish',
            'swa': 'swahili',
            'swe': 'swedish',
            'tgl': 'tagalog',
            'tel': 'telugu',
            'tha': 'thai',
            'tur': 'turkish',
            'ukr': 'ukranian',
            'urd': 'urdu',
            'vie': 'vietnamese',
            'yor': 'yoruba'
        }

        with open(wordless_misc.get_abs_path(
                f'stop_words/extra-stopwords/{LANG_TEXTS[lang]}'),
                  'r',
                  encoding='utf_8') as f:
            stop_words = [
                line.rstrip() for line in f if not line.startswith('#')
            ]
    # NLTK
    elif 'NLTK' in list_stop_words:
        LANG_TEXTS = {
            'ara': 'arabic',
            'aze': 'azerbaijani',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            'ell': 'greek',
            'hun': 'hungarian',
            'ind': 'indonesian',
            'ita': 'italian',
            'kaz': 'kazakh',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tgk': 'tajik',
            'tur': 'turkish'
        }

        stop_words = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
    # spaCy
    elif 'spaCy' in list_stop_words:
        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_abs_path(
                    'stop_words/spaCy/stop_words_zh_tw.txt'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            # Serbian (Cyrillic) & Serbian (Latin)
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
                stop_words = wordless_text_utils.to_srp_latn(stop_words)
            else:
                spacy_lang = importlib.import_module(
                    f'spacy.lang.{lang_639_1}')

                stop_words = spacy_lang.STOP_WORDS
    # Stopwords ISO
    elif 'Stopwords ISO' in list_stop_words:
        # Norwegian Bokmål & Norwegian Nynorsk
        if lang_639_1 in ['nb', 'nn']:
            lang_639_1 = 'no'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_abs_path(
                    'stop_words/Stopwords ISO/stop_words_zh_tw.txt'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            with open(wordless_misc.get_abs_path(
                    'stop_words/Stopwords ISO/stopwords_iso.json'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = json.load(f)[lang_639_1]
    # Greek (Ancient)
    elif list_stop_words == main.tr(
            'grk-stoplist - Greek (Ancient) Stop Words'):
        with open(wordless_misc.get_abs_path(
                'stop_words/grk-stoplist/stoplist-greek.txt'),
                  'r',
                  encoding='utf_8') as f:
            stop_words = [line.rstrip() for line in f.readlines()]
    # Thai
    elif list_stop_words == main.tr('PyThaiNLP - Thai Stop Words'):
        stop_words = pythainlp.corpus.common.thai_stopwords()
    # Custom Lists
    elif list_stop_words == main.tr('Custom List'):
        stop_words = main.settings_custom['stop_words']['custom_lists'][lang]

    # Remove empty tokens
    stop_words = [stop_word for stop_word in stop_words if stop_word]

    return sorted(set(stop_words))
Example #13
    def add_files(self):
        new_files = []

        files_detection_failed_encoding = []
        files_detection_failed_text_type = []
        files_detection_failed_lang = []

        if self.file_paths:
            len_file_paths = len(self.file_paths)

            for i, file_path in enumerate(self.file_paths):
                self.progress_updated.emit(
                    self.tr(f'Loading files ... ({i + 1}/{len_file_paths})'))

                default_dir = wordless_checking_misc.check_dir(
                    self.main.settings_custom['import']['temp_files']
                    ['default_path'])
                default_encoding = self.main.settings_custom['import'][
                    'temp_files']['default_encoding']

                file_path = wordless_misc.get_abs_path(file_path)
                file_name, file_ext = os.path.splitext(
                    os.path.basename(file_path))
                file_ext = file_ext.lower()

                # Text Files
                if file_ext == '.txt':
                    (new_file, detection_success_encoding,
                     detection_success_text_type, detection_success_lang
                     ) = self.main.wordless_files._new_file(file_path)

                    new_files.append(new_file)

                    if not detection_success_encoding:
                        files_detection_failed_encoding.append(
                            new_file['path'])

                    if not detection_success_text_type:
                        files_detection_failed_text_type.append(
                            new_file['path'])

                    if not detection_success_lang:
                        files_detection_failed_lang.append(new_file['path'])
                else:
                    if file_ext in ['.docx', '.xlsx', '.xls']:
                        new_path = wordless_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        # Word Documents
                        if file_ext == '.docx':
                            lines = []

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                doc = docx.Document(file_path)

                                for block in self.iter_block_items(doc):
                                    if type(block) == docx.text.paragraph.Paragraph:
                                        f.write(f'{block.text}\n')
                                    elif type(block) == docx.table.Table:
                                        for row in self.iter_visual_cells(
                                                block):
                                            cells = []

                                            for cell in row:
                                                cells.append(' '.join([
                                                    item.text for item in
                                                    self.iter_cell_items(cell)
                                                ]))

                                            f.write('\t'.join(cells) + '\n')

                        # Excel Workbooks
                        elif file_ext == '.xlsx':
                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                workbook = openpyxl.load_workbook(
                                    file_path, data_only=True)

                                for worksheet_name in workbook.sheetnames:
                                    worksheet = workbook[worksheet_name]

                                    for row in worksheet.rows:
                                        f.write('\t'.join([
                                            str(cell.value) if cell.value is not None else ''
                                            for cell in row
                                        ]) + '\n')
                        elif file_ext == '.xls':
                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                workbook = xlrd.open_workbook(file_path)

                                for i_sheet in range(workbook.nsheets):
                                    worksheet = workbook.sheet_by_index(
                                        i_sheet)

                                    for row in range(worksheet.nrows):
                                        f.write('\t'.join([
                                            str(worksheet.cell_value(row, col))
                                            for col in range(worksheet.ncols)
                                        ]) + '\n')

                        new_paths = [new_path]
                    else:
                        # Detect encoding
                        if self.main.settings_custom['files'][
                                'auto_detection_settings']['detect_encodings']:
                            encoding_code, _ = wordless_detection.detect_encoding(
                                self.main, file_path)
                        else:
                            encoding_code = self.main.settings_custom[
                                'encoding_detection']['default_settings'][
                                    'default_encoding']

                        # CSV Files
                        if file_ext == '.csv':
                            new_path = wordless_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                with open(file_path,
                                          'r',
                                          newline='',
                                          encoding=encoding_code) as f_csv:
                                    csv_reader = csv.reader(f_csv)

                                    for row in csv_reader:
                                        f.write('\t'.join(row) + '\n')

                            new_paths = [new_path]

                        # HTML Files
                        elif file_ext in ['.htm', '.html']:
                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                soup = bs4.BeautifulSoup(f.read(), 'lxml')

                            new_path = wordless_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                f.write(soup.get_text())

                            new_paths = [new_path]

                        # Translation Memory Files
                        elif file_ext == '.tmx':
                            lines_src = []
                            lines_target = []

                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                soup = bs4.BeautifulSoup(f.read(), 'lxml-xml')

                                for tu in soup.find_all('tu'):
                                    seg_src, seg_target = tu.find_all('seg')

                                    lines_src.append(seg_src.get_text())
                                    lines_target.append(seg_target.get_text())

                            path_src = wordless_checking_misc.check_new_path(
                                os.path.join(default_dir,
                                             f'{file_name}_source.txt'))
                            path_target = wordless_checking_misc.check_new_path(
                                os.path.join(default_dir,
                                             f'{file_name}_target.txt'))

                            with open(path_src, 'w',
                                      encoding=default_encoding) as f:
                                f.write('\n'.join(lines_src))
                                f.write('\n')

                            with open(path_target,
                                      'w',
                                      encoding=default_encoding) as f:
                                f.write('\n'.join(lines_target))
                                f.write('\n')

                            new_paths = [path_src, path_target]

                        # Lyrics Files
                        elif file_ext == '.lrc':
                            lyrics = {}

                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                for line in f:
                                    time_tags = []

                                    line = line.strip()

                                    # Strip time tags
                                    while re.search(r'^\[[^\]]+?\]', line):
                                        time_tags.append(
                                            re.search(r'^\[[^\]]+?\]',
                                                      line).group())

                                        line = line[len(time_tags[-1]):].strip()

                                    # Strip word time tags
                                    line = re.sub(r'<[^>]+?>', r'', line)
                                    line = re.sub(r'\s{2,}', r' ',
                                                  line).strip()

                                    for time_tag in time_tags:
                                        if re.search(
                                                r'^\[[0-9]{2}:[0-5][0-9]\.[0-9]{2}\]$',
                                                time_tag):
                                            lyrics[time_tag] = line

                            new_path = wordless_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                for _, lyric in sorted(lyrics.items()):
                                    f.write(f'{lyric}\n')

                            new_paths = [new_path]

                    for new_path in new_paths:
                        (new_file, detection_success_encoding,
                         detection_success_text_type, detection_success_lang
                         ) = self.main.wordless_files._new_file(new_path)

                        new_files.append(new_file)

                        if not detection_success_encoding:
                            files_detection_failed_encoding.append(
                                new_file['path'])

                        if not detection_success_text_type:
                            files_detection_failed_text_type.append(
                                new_file['path'])

                        if not detection_success_lang:
                            files_detection_failed_lang.append(
                                new_file['path'])

            self.main.settings_custom['import']['files'][
                'default_path'] = wordless_misc.get_abs_path(
                    os.path.dirname(self.file_paths[0]))

        self.files_added.emit(new_files, files_detection_failed_encoding,
                              files_detection_failed_text_type,
                              files_detection_failed_lang)
Example #14
def get_path(file_name):
    return wordless_misc.get_abs_path(f'wordless_tests/files/checking/{file_name}')
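
A usage sketch (the fixture name is hypothetical):

# Hedged sketch: resolve a test fixture under wordless_tests/files/checking/.
file_path = get_path('file_missing.txt')
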
Example #15
    def import_list(self, settings):
        files = []

        if os.path.exists(self.main.settings_custom['import'][settings]['default_path']):
            default_dir = self.main.settings_custom['import'][settings]['default_path']
        else:
            default_dir = self.main.settings_default['import'][settings]['default_path']

        file_paths = QFileDialog.getOpenFileNames(self.main,
                                                  self.tr('Import from File(s)'),
                                                  default_dir,
                                                  self.tr('Text File (*.txt)'))[0]

        if file_paths:
            self.main.settings_custom['import'][settings]['default_path'] = os.path.normpath(os.path.dirname(file_paths[0]))

            # Detect encodings
            if self.main.settings_custom['import'][settings]['detect_encodings']:
                for file_path in file_paths:
                    files.append({
                        'path': wordless_misc.get_abs_path(file_path),
                        'encoding': wordless_detection.detect_encoding(self.main, file_path)[0]
                    })
            else:
                for file_path in file_paths:
                    files.append({
                        'path': wordless_misc.get_abs_path(file_path),
                        'encoding': self.main.settings_custom['auto_detection']['default_settings']['default_encoding']
                    })

            files_ok, files_empty = wordless_checking_file.check_files_empty(self.main, files)
            files_ok, files_decoding_error = wordless_checking_file.check_files_decoding_error(self.main, files_ok)

            # Extract file paths
            files_empty = [file['path'] for file in files_empty]
            files_decoding_error = [file['path'] for file in files_decoding_error]

            if files_empty or files_decoding_error:
                wordless_dialog_error.wordless_dialog_error_import(self.main,
                                                                   files_empty = files_empty,
                                                                   files_decoding_error = files_decoding_error)

                wordless_msg.wordless_msg_import_list_error(self.main)
            else:
                # Check duplicate items
                items_to_import = []
                items_cur = self.get_items()

                num_prev = len(items_cur)

                for file in files_ok:
                    with open(file['path'], 'r', encoding = file['encoding']) as f:
                        for line in f:
                            line = line.strip()

                            if line not in items_cur:
                                items_to_import.append(line)

                self.load_items(collections.OrderedDict.fromkeys(items_to_import))
                self.itemChanged.emit(self.item(0))

                wordless_msg.wordless_msg_import_list_success(self.main, num_prev, len(self.get_items()))
Example #16
def check_files_empty(main, files):
    files_empty = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                file_path = file['path']

                # Text files
                if os.path.splitext(file_path)[1] in [
                        '.txt', '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    try:
                        with open(file_path, 'r',
                                  encoding=file['encoding']) as f:
                            empty_file = True

                            for line in f:
                                if line.strip():
                                    empty_file = False

                                    break

                            if empty_file:
                                files_empty.append(file)
                            else:
                                files_ok.append(file)
                    except Exception:
                        files_ok.append(file)
                # Other file types
                else:
                    if os.stat(file_path).st_size:
                        files_ok.append(file)
                    else:
                        files_empty.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wordless_misc.get_abs_path(file_path)

                # Text files
                if os.path.splitext(file_path)[1] in [
                        '.txt', '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    if main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding, _ = wordless_detection.detect_encoding(
                            main, file_path)
                    else:
                        encoding = main.settings_custom['auto_detection'][
                            'default_settings']['default_encoding']

                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            empty_file = True

                            for line in f:
                                if line.strip():
                                    empty_file = False

                                    break

                            if empty_file:
                                files_empty.append(file_path)
                            else:
                                files_ok.append(file_path)
                    except Exception:
                        files_ok.append(file_path)
                # Other file types
                else:
                    if os.stat(file_path).st_size:
                        files_ok.append(file_path)
                    else:
                        files_empty.append(file_path)

    return files_ok, files_empty