Code Example #1
def testing_lemmatize(lang, lemmatizer):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    print(f'{lang_text} / {lemmatizer}:')

    wordless_text_utils.check_lemmatizers(main, lang, lemmatizer = lemmatizer)

    lemmas = wordless_text_processing.wordless_lemmatize(main, globals()[f'tokens_{lang}'],
                                                         lang = lang,
                                                         lemmatizer = lemmatizer)

    print(f"\t{lemmas}")
Code Example #2
def testing_lemmatize(lang, lemmatizer):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    print(f'{lang_text} / {lemmatizer}:')

    tokens_sentences = wordless_text_processing.wordless_word_tokenize(
        main, globals()[f'SENTENCE_{lang.upper()}'], lang=lang)
    tokens = [token for tokens in tokens_sentences for token in tokens]

    lemmas = wordless_text_processing.wordless_lemmatize(main,
                                                         tokens,
                                                         lang=lang,
                                                         lemmatizer=lemmatizer)

    print(f"\t{lemmas}")
Code Example #3
def test_lemmatize(lang, lemmatizer, show_results=False):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    tokens = wordless_text_processing.wordless_word_tokenize(
        main,
        text=getattr(wordless_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)

    lemmas = wordless_text_processing.wordless_lemmatize(main,
                                                         tokens=tokens,
                                                         lang=lang,
                                                         lemmatizer=lemmatizer)

    if show_results:
        print(lemmas)

    if lang == 'ast':
        assert lemmas == [
            "L'asturianu", 'ser', 'unu', 'llingua', 'romance', 'propiu',
            "d'Asturies,[1", ']', 'perteneciente', 'al', 'subgrupu',
            'asturllionés', '.'
        ]
    elif lang == 'bul':
        assert lemmas == [
            'Бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от',
            'група', 'на', 'южнославянските', 'език', '.'
        ]
    elif lang == 'cat':
        assert lemmas == [
            'El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya',
            ',', 'a', 'ell', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a',
            'ell', 'ciutat', 'de', 'ell', 'Alguer', 'i', 'tradicional', 'a',
            'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació',
            'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al',
            'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a',
            'Catalunya', ',', 'ell', 'País', 'Valencià', '(', 'treure', 'de',
            'algun', 'comarca', 'i', 'localitat', 'de', 'ell', 'interior', ')',
            ',', 'ell', 'Illes', 'Balears', ',', 'Andorra', ',', 'ell',
            'Franja', 'de', 'Ponent', '(', 'a', 'ell', 'Aragó', ')', ',',
            'ell', 'ciutat', 'de', 'ell', 'Alguer', '(', 'a', 'ell', 'illa',
            'de', 'Sardenya', ')', ',', 'ell', 'Catalunya', 'del', 'Nord,[8',
            ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de',
            'Múrcia', 'poblar', 'per', 'immigrar', 'valencians),[9][10', ']',
            'i', 'en', 'petita', 'comunitat', 'arreu', 'del', 'món', '(',
            'entrar', 'ell', 'qual', 'destacar', 'ell', 'de', 'ell',
            'Argentina', ',', 'amb', '195.000', 'parlants).[11', ']'
        ]
    elif lang == 'ces':
        assert lemmas == [
            'Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský',
            'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické',
            'srbštině', 'a', 'polštině', '.'
        ]
    elif lang == 'nld':
        assert lemmas == [
            'het', 'nederlands', 'zijn', 'een', 'west-germaans', 'taal', 'en',
            'de', 'moedertaal', 'van', 'de', 'veel', 'inwoner', 'van',
            'nederland', ',', 'belgië', 'en', 'suriname', '.'
        ]
    elif lang == 'eng':
        if lemmatizer == 'Lemmatization Lists - English Lemma List':
            assert lemmas == [
                'English', 'be', 'a', 'West', 'Germanic', 'language', 'that',
                'be', '1', 'speak', 'in', 'early', 'medieval', 'England',
                'and', 'eventually', 'become', 'a', 'global', 'lingua',
                'franca.[4][5', ']'
            ]
        elif lemmatizer == 'NLTK - WordNet Lemmatizer':
            assert lemmas == [
                'English', 'be', 'a', 'West', 'Germanic', 'language', 'that',
                'be', 'first', 'speak', 'in', 'early', 'medieval', 'England',
                'and', 'eventually', 'become', 'a', 'global', 'lingua',
                'franca.[4][5', ']'
            ]
        elif lemmatizer == 'spaCy - English Lemmatizer':
            assert lemmas == [
                'English', 'be', 'a', 'West', 'germanic', 'language', 'that',
                'be', 'first', 'speak', 'in', 'early', 'medieval', 'England',
                'and', 'eventually', 'become', 'a', 'global', 'lingua',
                'franca.[4][5', ']'
            ]
    elif lang == 'est':
        assert lemmas == [
            'Eesti', 'kee', '(', 'varasem', 'nimetu', ':', 'maakeel', ')',
            'olema', 'läänemeresoome', 'lõunarühma', 'kuuluma', 'kee', '.'
        ]
    elif lang == 'fra':
        if lemmatizer == 'Lemmatization Lists - French Lemma List':
            assert lemmas == [
                'Le', 'français', 'être', 'un', 'langue', 'indo-européen',
                'de', 'le', 'famille', 'un', 'langue', 'roman', '.'
            ]
        elif lemmatizer == 'spaCy - French Lemmatizer':
            assert lemmas == [
                'le', 'français', 'être', 'un', 'langue', 'indo-européen',
                'de', 'le', 'famille', 'un', 'langue', 'roman', '.'
            ]
    elif lang == 'glg':
        assert lemmas == [
            'O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', ')', 'ser', 'un',
            'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de',
            'lingua', 'románico', '.'
        ]
    elif lang == 'deu':
        if lemmatizer == 'Lemmatization Lists - German Lemma List':
            assert lemmas == [
                'Die', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[',
                'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch',
                '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.'
            ]
        elif lemmatizer == 'spaCy - German Lemmatizer':
            assert lemmas == [
                'der', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[',
                'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch',
                '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.'
            ]
    elif lang == 'grc':
        assert lemmas == [
            'Με', 'τον', 'όρο', 'αρχαία', 'ελληνική', 'γλώσσα', 'εννοείται',
            'μια', 'μορφή', 'της', 'ελληνικής', 'γλώσσας', ',', 'πού',
            'ομιλούνταν', 'κατά', 'τους', 'αρχαϊκούς', 'χρόνους', 'και', 'την',
            'κλασική', 'αρχαιότητα', '.'
        ]
    elif lang == 'ell':
        assert lemmas == [
            'η', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκός',
            'οικογένεια[9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό',
            'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνικός', ',', 'ενώ',
            'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της',
            'Κύπρου', '.'
        ]
    elif lang == 'hun':
        assert lemmas == [
            'A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',',
            'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv',
            'egyik', '.'
        ]
    elif lang == 'gle':
        assert lemmas == [
            'Is', 'ceann', 'de', 'na', 'teangach', 'Ceilteacha', 'í', 'an',
            'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a',
            'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'den',
            'trí', 'ceann', 'de', 'teangach', 'Ceilteacha', 'air', 'a',
            'tabhair', 'na', 'teangach', 'Gaelacha', '(', '.i.', 'an',
            'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge',
            'Mhanann', ')', 'go', 'áirithe', '.'
        ]
    elif lang == 'ita':
        assert lemmas == [
            "L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']',
            'ascolta[?·info', ']', ')', 'essere', 'una', 'lingua', 'romanzo',
            'parlato', 'principalmente', 'in', 'Italia', '.'
        ]
    elif lang == 'lit':
        assert lemmas == [
            'lietuvė', 'kalbėti', '–', 'ižti', 'baltas', 'prokalbės', 'kilęs',
            'lietuvė', 'tauta', 'kalbėti', ',', '-PRON-', 'Lietuvoje', 'būti',
            'valstybinis', ',', 'o', 'Europos', 'sąjunga', '–', 'viena',
            'ižti', 'oficialus', 'kalbus', '.'
        ]
    elif lang == 'glv':
        assert lemmas == [
            'She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey',
            'Gaelagh', 'Mannin', '.'
        ]
    elif lang == 'nob':
        assert lemmas == [
            'bokmål', 'være', 'en', 'varietet', 'av', 'norsk', 'språk', '.'
        ]
    elif lang == 'fas':
        assert lemmas == [
            'فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای',
            'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب',
            'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳',
            ']', 'تاجیکستان[۴', ']', 'را', 'ازبکستان[۵', ']', 'به', 'آن',
            'سخن', 'می\u200cگویند', '.'
        ]
    elif lang == 'por':
        assert lemmas == [
            'A', 'língua', 'portuguesar', ',', 'também', 'designar',
            'português', ',', 'ser', 'umar', 'língua', 'românico', 'flexivo',
            'ocidental', 'originar', 'o', 'galego-português', 'falar', 'o',
            'Reino', 'da', 'Galiza', 'e', 'o', 'norte', 'de', 'Portugal', '.'
        ]
    elif lang == 'ron':
        assert lemmas == [
            'Limba', 'român', 'fi', 'vrea', 'limbă', 'indo', '-', 'european',
            ',', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental',
            'al', 'limbă', 'romanice', '.'
        ]
    elif lang == 'rus':
        assert lemmas == [
            'ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']',
            'информация', 'о', 'файл', 'слушать', ')', '[', '~', '3', ']', '[',
            '⇨', ']', '—', 'один', 'из', 'восточнославянский', 'язык', ',',
            'национальный', 'язык', 'русский', 'народ', '.'
        ]
    elif lang == 'gla':
        assert lemmas == [
            "'S", 'i', 'cànan', 'dùthchasach', 'na', 'h', '-', 'Alba', 'a',
            'th', "'", 'anns', 'a', "'", 'Ghàidhlig', '.'
        ]
    elif lang == 'slk':
        assert lemmas == [
            'Slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský',
            'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',',
            'horný', 'as', 'dolný', 'lužickou', 'srbčina', 'as', 'kašubčinou',
            ')', '.'
        ]
    elif lang == 'slv':
        assert lemmas == [
            'Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina',
            ']', 'onbiti', 'združen', 'naziv', 'za', 'uraden', 'knjižen',
            'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in',
            'govoriti', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on',
            'nekoč', 'govoriti', 'Slovenec', '.'
        ]
    elif lang == 'spa':
        assert lemmas == [
            'El', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua',
            'romance', 'procedente', 'del', 'latín', 'hablar', '.'
        ]
    elif lang == 'swe':
        assert lemmas == [
            'Svenska', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en',
            'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio',
            'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk',
            'hare', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',',
            'mena', 'även', 'som', 'en', 'en', 'nationalspråk', 'i', 'Finland',
            'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.'
        ]
    elif lang == 'bod':
        assert lemmas == [
            '༄༅། ། ', 'རྒྱ་གར་', 'སྐད་', 'དུ་', ' ། ', 'བོ་', ' དྷི་', ' སཏྭ་',
            ' ཙརྻ་', 'ཨ་བ་', 'ཏ་', 'ར་', ' ། ', 'བོད་སྐད་', 'དུ་', ' ། ',
            'བྱང་ཆུབ་', 'སེམས་དཔའ་', 'གི་', 'སྤྱོད་པ་', 'ལ་', 'འཇུག་པ་',
            ' ། ། ', 'སངས་རྒྱས་', 'དང་', 'བྱང་ཆུབ་', 'སེམས་དཔའ་', 'ཐམས་ཅད་',
            'ལ་', 'ཕྱག་', 'འཚལ་', 'ལོ་', ' ། ། ', 'བདེ་གཤེགས་', 'ཆོ་', 'ཀྱི་',
            'སྐུ་', 'མངའ་', 'སྲ་', 'བཅའ་', 'དང་', ' ། ། ', 'ཕྱག་འོས་', 'ཀུན་',
            'ལ་', 'ཀྱང་', 'གུས་པ་', 'ལ་', 'ཕྱག་', 'འཚལ་', 'ཏེ་', ' ། ། ',
            'བདེ་གཤེགས་', 'སྲ་', 'ཀྱི་', 'སྡོམ་', 'ལ་', 'འཇུག་པ་', 'ནི་',
            ' ། ། ', 'ལུང་', 'བཞིན་', 'མདོར་བསྡུས་', 'ན་', 'ནི་', 'བརྗོད་པ་',
            'ལ་', 'བྱ་', ' ། །'
        ]
    elif lang == 'ukr':
        if lemmatizer == 'Lemmatization Lists - Ukrainian Lemma List':
            assert lemmas == [
                'Украї́нська', 'мо́ва', '(', 'МФА', ':', '[',
                'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назвати',
                '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*',
                '2', ']', ')', '—', 'національний', 'мова', 'українець', '.'
            ]
        elif lemmatizer == 'pymorphy2 - Morphological Analyzer':
            assert lemmas == [
                'украї́нський', 'мо́вий', '(', 'мфа', ':', '[',
                'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва',
                '—', 'ру́ський', ',', 'руси́нська[9][10][11', ']', '[', '*',
                '2', ']', ')', '—', 'національний', 'мова', 'українець', '.'
            ]
    elif lang == 'cym':
        assert lemmas == [
            'Aelod', "o'r", 'cangen', 'Frythonaidd', "o'r", 'iaith',
            'Celtaidd', 'a', 'siarad', 'bod', 'brodorol', 'yn', 'Nghymru', ',',
            'can', 'Gymry', 'a', 'pobl', 'arall', 'aredig', 'gwasgar', 'bod',
            'Lloegr', ',', 'a', 'can', 'cymuno', 'bechan', 'bod', 'Y',
            'Wladfa', ',', 'gwybod', 'Ariannin[7', ']', "yw'r", 'Gymraeg', '(',
            'hefyd', 'Cymraeg', 'heb', 'yr', 'bannod', ')', '.'
        ]
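A hedged example of running this assertion-based test for a couple of the language/lemmatizer pairs covered above (the pairs come straight from the assertions; whether the test runner actually calls it this way is an assumption):

# Hypothetical direct invocations; a pytest-style runner would normally
# parametrize over languages and lemmatizers instead.
test_lemmatize('eng', 'NLTK - WordNet Lemmatizer', show_results=True)
test_lemmatize('fra', 'spaCy - French Lemmatizer')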
Code Example #4
def wordless_process_tokens(text, token_settings):
    main = text.main
    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuations
    if not settings['puncs']:
        i_tokens = 0

        # Mark tokens to be removed
        for para in text.tokens_hierarchical:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wordless_checking_token.is_token_punc(token):
                            clause[i] = ''

                            text.tags_pos[i_tokens + i] = ''
                            text.tags_non_pos[i_tokens + i] = ''
                            text.tags_all[i_tokens + i] = ''

                    i_tokens += len(clause)

        # Remove punctuations
        for para in text.tokens_hierarchical:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = [token for token in clause if token]

        text.tags_pos = [tags for tags in text.tags_pos if tags != '']
        text.tags_non_pos = [tags for tags in text.tags_non_pos if tags != '']
        text.tags_all = [tags for tags in text.tags_all if tags != '']

    # Lemmatize all tokens
    if not settings['use_tags'] and settings['lemmatize_tokens']:
        for para in text.tokens_hierarchical:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = wordless_text_processing.wordless_lemmatize(
                        main, clause,
                        lang = text.lang
                    )

    # Treat as all lowercase
    if settings['treat_as_lowercase']:
        for para in text.tokens_hierarchical:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = [token.lower() for token in clause]

        text.tags_pos = [[tag.lower() for tag in tags] for tags in text.tags_pos]
        text.tags_non_pos = [[tag.lower() for tag in tags] for tags in text.tags_non_pos]
        text.tags_all = [[tag.lower() for tag in tags] for tags in text.tags_all]

    # Words
    if settings['words']:
        # Lowercase
        if not settings['lowercase']:
            for para in text.tokens_hierarchical:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wordless_checking_token.is_token_word_lowercase(token):
                                clause[i] = ''
        # Uppercase
        if not settings['uppercase']:
            for para in text.tokens_hierarchical:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wordless_checking_token.is_token_word_uppercase(token):
                                clause[i] = ''
        # Title Case
        if not settings['title_case']:
            for para in text.tokens_hierarchical:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wordless_checking_token.is_token_word_title_case(token):
                                clause[i] = ''
    else:
        for para in text.tokens_hierarchical:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wordless_checking_token.is_token_word(token):
                            clause[i] = ''

    # Numerals
    if not settings['nums']:
        for para in text.tokens_hierarchical:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wordless_checking_token.is_token_num(token):
                            clause[i] = ''

    # Filter stop words
    if settings['filter_stop_words']:
        for para in text.tokens_hierarchical:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = wordless_text_processing.wordless_filter_stop_words(
                        main, clause,
                        lang = text.lang
                    )

    # Ignore tags
    i_token = 0

    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            for para in text.tokens_hierarchical:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, [])
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            for para in text.tokens_hierarchical:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, text.tags_non_pos[i_token + i])

                        i_token += len(clause)

        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            for para in text.tokens_hierarchical:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, text.tags_pos[i_token + i])

                        i_token += len(clause)
    else:
        for para in text.tokens_hierarchical:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = (token, text.tags_all[i_token + i])

                    i_token += len(clause)

    # Use tags only
    if settings['use_tags']:
        for para in text.tokens_hierarchical:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = clause[i][1]
    else:
        for para in text.tokens_hierarchical:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = f"{clause[i][0]}{''.join(clause[i][1])}"

    text.tokens_flat = list(wordless_misc.flatten_list(text.tokens_hierarchical))

    return text
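A hedged sketch of the token_settings dictionary this function expects. Every key below is read somewhere in the function body; the chosen values, and the assumption that ignore_tags_type holds the plain string 'all' rather than a translated label, are illustrative only:

# Illustrative settings; all keys are looked up in wordless_process_tokens,
# but the values are arbitrary examples.
token_settings = {
    'words': True,
    'lowercase': True,
    'uppercase': True,
    'title_case': True,
    'nums': True,
    'puncs': False,               # drop punctuation tokens
    'treat_as_lowercase': True,
    'lemmatize_tokens': False,
    'filter_stop_words': False,
    'use_tags': False,
    'ignore_tags': True,
    'ignore_tags_type': 'all',    # compared against main.tr('all') above
    'ignore_tags_tags': False,
    'ignore_tags_type_tags': 'all',
}

# text is assumed to be a Wordless text object exposing main, lang,
# tokens_hierarchical and the tags_pos / tags_non_pos / tags_all lists.
text = wordless_process_tokens(text, token_settings)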
Code Example #5
def wordless_process_tokens(text, token_settings):
    main = text.main
    tokens = text.tokens.copy()

    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuations
    if not settings['puncs']:
        for i, token in reversed(list(enumerate(tokens))):
            if wordless_checking_token.is_token_punc(token):
                del tokens[i]

                del text.tags_pos[i]
                del text.tags_non_pos[i]
                del text.tags_all[i]

    # Lemmatize all tokens
    if not settings['use_tags'] and settings['lemmatize_tokens']:
        tokens = wordless_text_processing.wordless_lemmatize(main,
                                                             tokens,
                                                             lang=text.lang)

    # Treat as all lowercase
    if settings['treat_as_lowercase']:
        tokens = [token.lower() for token in tokens]

        text.tags_pos = [[tag.lower() for tag in tags]
                         for tags in text.tags_pos]
        text.tags_non_pos = [[tag.lower() for tag in tags]
                             for tags in text.tags_non_pos]
        text.tags_all = [[tag.lower() for tag in tags]
                         for tags in text.tags_all]

    text.tokens = copy.deepcopy(tokens)

    # Words
    if settings['words']:
        # Lowercase
        if not settings['lowercase']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_lowercase(token):
                    tokens[i] = ''
        # Uppercase
        if not settings['uppercase']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_uppercase(token):
                    tokens[i] = ''
        # Title Case
        if not settings['title_case']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_title_case(token):
                    tokens[i] = ''
    else:
        for i, token in enumerate(tokens):
            if wordless_checking_token.is_token_word(token):
                tokens[i] = ''

    # Numerals
    if not settings['nums']:
        for i, token in enumerate(tokens):
            if wordless_checking_token.is_token_num(token):
                tokens[i] = ''

    # Filter stop words
    if settings['filter_stop_words']:
        tokens_filtered = wordless_text_processing.wordless_filter_stop_words(
            main, [token for token in tokens], lang=text.lang)

        for i, token in enumerate(tokens):
            if token not in tokens_filtered:
                tokens[i] = ''

    # Ignore tags
    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            tokens = [(token, []) for token in tokens]
            text.tokens = [(token, []) for token in text.tokens]
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            tokens = [(token, tags)
                      for token, tags in zip(tokens, text.tags_non_pos)]
            text.tokens = [
                (token, tags)
                for token, tags in zip(text.tokens, text.tags_non_pos)
            ]
        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            tokens = [(token, tags)
                      for token, tags in zip(tokens, text.tags_pos)]
            text.tokens = [(token, tags)
                           for token, tags in zip(text.tokens, text.tags_pos)]
    else:
        tokens = [(token, tags) for token, tags in zip(tokens, text.tags_all)]
        text.tokens = [(token, tags)
                       for token, tags in zip(text.tokens, text.tags_all)]

    return tokens
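The same illustrative token_settings dictionary could drive this flat-token variant; unlike the hierarchical version above, it returns a new list of (token, tags) pairs in addition to updating text.tokens:

# Hedged usage; token_settings as sketched after the previous example, and
# text assumed to expose main, lang, tokens and the tags_* lists.
tokens_with_tags = wordless_process_tokens(text, token_settings)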
Code Example #6
def match_ngrams(main, search_terms, tokens, lang, text_type, token_settings,
                 search_settings):
    search_terms_matched = set()

    settings = copy.deepcopy(search_settings)

    re_tags_all = get_re_tags(main, tags='all')
    re_tags_pos = get_re_tags(main, tags='pos')
    re_tags_non_pos = get_re_tags(main, tags='non_pos')

    search_term_tokens = [
        search_term_token for search_term in search_terms
        for search_term_token in search_term.split()
    ]

    if search_settings['use_regex']:
        regexes_matched = {
            search_term_token: set()
            for search_term_token in search_term_tokens
        }
        tokens_matched = {}
    else:
        tokens_matched = {
            search_term_token: set()
            for search_term_token in search_term_tokens
        }

    # Search Settings
    if settings['match_tags']:
        settings['match_inflected_forms'] = False

        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Token Settings
    if token_settings['use_tags']:
        settings['match_inflected_forms'] = False
        settings['match_tags'] = False

        if token_settings['ignore_tags_tags']:
            settings['ignore_tags'] = False
    else:
        if token_settings['ignore_tags']:
            if token_settings['ignore_tags_type'] == main.tr('all'):
                settings['ignore_tags'] = False
                settings['match_tags'] = False

    # Match Tags Only & Ignore Tags
    if settings['match_tags']:
        if settings['ignore_tags']:
            if text_type[1] == 'untagged':
                tokens_searched = []
            else:
                if settings['ignore_tags_type'] == main.tr('POS'):
                    if text_type[1] in ['tagged_both', 'tagged_non_pos']:
                        tokens_searched = [
                            ''.join(re.findall(re_tags_non_pos, token))
                            for token in tokens
                        ]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = []
                elif settings['ignore_tags_type'] == main.tr('non-POS'):
                    if text_type[1] in ['tagged_both', 'tagged_pos']:
                        tokens_searched = [
                            ''.join(re.findall(re_tags_pos, token))
                            for token in tokens
                        ]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = []
        else:
            if text_type[1] == 'untagged':
                tokens_searched = []
            elif text_type[1] == 'tagged_pos':
                tokens_searched = [
                    ''.join(re.findall(re_tags_pos, token)) for token in tokens
                ]
            elif text_type[1] == 'tagged_non_pos':
                tokens_searched = [
                    ''.join(re.findall(re_tags_non_pos, token))
                    for token in tokens
                ]
            elif text_type[1] == 'tagged_both':
                tokens_searched = [
                    ''.join(re.findall(re_tags_all, token)) for token in tokens
                ]
    else:
        if settings['ignore_tags']:
            if text_type[1] == 'untagged':
                tokens_searched = tokens
            else:
                if settings['ignore_tags_type'] == main.tr('all'):
                    if text_type[1] == 'tagged_both':
                        tokens_searched = [
                            re.sub(re_tags_all, '', token) for token in tokens
                        ]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = [
                            re.sub(re_tags_pos, '', token) for token in tokens
                        ]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = [
                            re.sub(re_tags_non_pos, '', token)
                            for token in tokens
                        ]
                elif settings['ignore_tags_type'] == main.tr('POS'):
                    if text_type[1] in ['tagged_both', 'tagged_pos']:
                        tokens_searched = [
                            re.sub(re_tags_pos, '', token) for token in tokens
                        ]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = tokens
                elif settings['ignore_tags_type'] == main.tr('non-POS'):
                    if text_type[1] in ['tagged_both', 'tagged_non_pos']:
                        tokens_searched = [
                            re.sub(re_tags_non_pos, '', token)
                            for token in tokens
                        ]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = tokens
        else:
            tokens_searched = tokens

    if tokens_searched:
        if settings['use_regex']:
            for search_term_token in search_term_tokens:
                if settings['match_whole_words']:
                    regex = fr'(^|\s+){search_term_token}(\s+|$)'
                else:
                    regex = search_term_token

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags=flags):
                        regexes_matched[search_term_token].add(token)
                        tokens_matched[token] = set()
        else:
            for search_term_token in search_term_tokens:
                regex = re.escape(search_term_token)

                if settings['match_whole_words']:
                    regex = fr'(^|\s+){regex}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags=flags):
                        tokens_matched[search_term_token].add(token)

        if settings['match_inflected_forms']:
            lemmas_searched = wordless_text_processing.wordless_lemmatize(
                main, tokens_searched, lang, text_type)
            lemmas_matched = wordless_text_processing.wordless_lemmatize(
                main, list(tokens_matched), lang, text_type)

            for token_matched, lemma_matched in zip(list(tokens_matched),
                                                    lemmas_matched):
                lemma_matched = re.escape(lemma_matched)
                lemma_matched = fr'(^|\s+){lemma_matched}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, lemma_searched in zip(tokens, lemmas_searched):
                    if re.search(lemma_matched, lemma_searched, flags=flags):
                        tokens_matched[token_matched].add(token)

    if search_settings['use_regex']:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set())

                for regex_matched in regexes_matched[search_term_token]:
                    search_term_tokens_matched[-1].add(regex_matched)
                    search_term_tokens_matched[-1] |= set(
                        tokens_matched[regex_matched])

            for item in itertools.product(*search_term_tokens_matched):
                search_terms_matched.add(item)
    else:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(
                    set(tokens_matched[search_term_token]))

            for item in itertools.product(*search_term_tokens_matched):
                search_terms_matched.add(item)

    return search_terms_matched
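Finally, a hedged sketch of calling match_ngrams. The search_settings keys mirror the lookups in the function, only the second element of text_type is inspected, and all concrete values (the search term, language code, and the placeholder first tuple element) are illustrative assumptions:

# Illustrative search settings; every key is read in match_ngrams above.
search_settings = {
    'use_regex': False,
    'match_whole_words': True,
    'ignore_case': True,
    'match_inflected_forms': True,
    'match_tags': False,
    'ignore_tags': False,
    'ignore_tags_tags': False,
    'ignore_tags_type': 'all',
    'ignore_tags_type_tags': 'all',
}

matches = match_ngrams(
    main,
    search_terms=['English language'],   # multi-token terms are split on whitespace
    tokens=tokens,
    lang='eng',
    text_type=('', 'untagged'),          # only text_type[1] is checked here
    token_settings=token_settings,
    search_settings=search_settings,
)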