Code Example #1
    def run(self):
        preview_results = []

        # Read the preview language and sample text from the user's settings
        preview_lang = self.main.settings_custom['lemmatization'][
            'preview_lang']
        preview_samples = self.main.settings_custom['lemmatization'][
            'preview_samples']

        for line in preview_samples.split('\n'):
            line = line.strip()

            if line:
                # Tokenize the sample line and flatten the nested token lists
                tokens = wl_word_tokenization.wl_word_tokenize(
                    self.main, line, lang=preview_lang)
                tokens = wl_misc.flatten_list(tokens)

                # Lemmatize the tokens with the selected lemmatizer
                lemmas = wl_lemmatization.wl_lemmatize(
                    self.main,
                    tokens,
                    lang=preview_lang,
                    lemmatizer=self.lemmatizer)

                # Detokenize the lemmas back into a single preview string
                text = wl_word_detokenization.wl_word_detokenize(
                    self.main, lemmas, lang=preview_lang)

                preview_results.append(text)
            else:
                preview_results.append('')

        # Emit the finished signal with the samples and their lemmatized previews
        self.worker_done.emit(preview_samples, preview_results)
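For quick experimentation outside the worker class, the same tokenize → lemmatize → detokenize round trip can be distilled into a standalone helper. The sketch below is hypothetical: the function name is invented here, the `wl_*` modules are assumed to be imported as in the examples on this page, and `main`, `lang`, and `lemmatizer` are supplied by the caller just as the worker reads them from `self.main` and `self.lemmatizer`.

def preview_lemmatization_line(main, line, lang, lemmatizer):
    # Hypothetical helper mirroring the per-line logic of the worker above
    tokens = wl_word_tokenization.wl_word_tokenize(main, line, lang=lang)
    tokens = wl_misc.flatten_list(tokens)

    lemmas = wl_lemmatization.wl_lemmatize(
        main, tokens, lang=lang, lemmatizer=lemmatizer)

    return wl_word_detokenization.wl_word_detokenize(main, lemmas, lang=lang)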
Code Example #2
def test_lemmatize(lang, lemmatizer, show_results = False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang
    )
    
    lemmas = wl_lemmatization.wl_lemmatize(
        main,
        tokens = tokens,
        lang = lang,
        lemmatizer = lemmatizer
    )

    if show_results:
        print(f'{lang_text} / {lemmatizer}:')
        print(lemmas)

    # Check the results against the expected lemmas for each language / lemmatizer
    if lang == 'ast':
        assert lemmas == ["L'asturianu", 'ser', 'unu', 'llingua', 'romance', 'propiu', "d'Asturies,[1", ']', 'perteneciente', 'al', 'subgrupu', 'asturllionés', '.']
    elif lang == 'bul':
        assert lemmas == ['Бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от', 'група', 'на', 'южнославянските', 'език', '.']
    elif lang == 'cat':
        assert lemmas == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'ell', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'ell', 'ciutat', 'de', 'ell', 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a', 'Catalunya', ',', 'ell', 'País', 'Valencià', '(', 'treure', 'de', 'algun', 'comarca', 'i', 'localitat', 'de', 'ell', 'interior', ')', ',', 'ell', 'Illes', 'Balears', ',', 'Andorra', ',', 'ell', 'Franja', 'de', 'Ponent', '(', 'a', 'ell', 'Aragó', ')', ',', 'ell', 'ciutat', 'de', 'ell', 'Alguer', '(', 'a', 'ell', 'illa', 'de', 'Sardenya', ')', ',', 'ell', 'Catalunya', 'del', 'Nord,[8', ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblar', 'per', 'immigrar', 'valencians),[9][10', ']', 'i', 'en', 'petita', 'comunitat', 'arreu', 'del', 'món', '(', 'entrar', 'ell', 'qual', 'destacar', 'ell', 'de', 'ell', 'Argentina', ',', 'amb', '195.000', 'parlants).[11', ']']
    elif lang == 'ces':
        assert lemmas == ['Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský', 'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.']
    elif lang == 'nld':
        assert lemmas == ['het', 'nederlands', 'zijn', 'een', 'west-germaans', 'taal', 'en', 'de', 'moedertaal', 'van', 'de', 'veel', 'inwoner', 'van', 'nederland', ',', 'belgië', 'en', 'suriname', '.']
    elif lang == 'eng':
        if lemmatizer == 'Lemmatization Lists - English Lemma List':
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'that', 'be', '1', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
        elif lemmatizer == 'NLTK - WordNet Lemmatizer':
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'that', 'be', 'first', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
        elif lemmatizer == 'spaCy - English Lemmatizer':
            assert lemmas == ['English', 'be', 'a', 'West', 'germanic', 'language', 'that', 'be', 'first', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
    elif lang == 'est':
        assert lemmas == ['Eesti', 'kee', '(', 'varasem', 'nimetu', ':', 'maakeel', ')', 'olema', 'läänemeresoome', 'lõunarühma', 'kuuluma', 'kee', '.']
    elif lang == 'fra':
        if lemmatizer == 'Lemmatization Lists - French Lemma List':
            assert lemmas == ['Le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', '.']
        elif lemmatizer == 'spaCy - French Lemmatizer':
            assert lemmas == ['le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', '.']
    elif lang == 'glg':
        assert lemmas == ['O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', ')', 'ser', 'un', 'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de', 'lingua', 'románico', '.']
    elif lang == 'deu':
        if lemmatizer == 'Lemmatization Lists - German Lemma List':
            assert lemmas == ['Die', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.']
        elif lemmatizer == 'spaCy - German Lemmatizer':
            assert lemmas == ['der', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.']
    elif lang == 'grc':
        assert lemmas == ['Με', 'τον', 'όρο', 'αρχαία', 'ελληνική', 'γλώσσα', 'εννοείται', 'μια', 'μορφή', 'της', 'ελληνικής', 'γλώσσας', ',', 'πού', 'ομιλούνταν', 'κατά', 'τους', 'αρχαϊκούς', 'χρόνους', 'και', 'την', 'κλασική', 'αρχαιότητα', '.']
    elif lang == 'ell':
        assert lemmas == ['η', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκός', 'οικογένεια[9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό', 'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνικός', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
    elif lang == 'hun':
        assert lemmas == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv', 'egyik', '.']
    elif lang == 'gle':
        assert lemmas == ['Is', 'ceann', 'de', 'na', 'teangach', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'trí', 'ceann', 'de', 'teangach', 'Ceilteacha', 'air', 'a', 'tabhair', 'na', 'teangach', 'Gaelacha', '(', '.i.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'áirithe', '.']
    elif lang == 'ita':
        assert lemmas == ["L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']', 'ascolta[?·info', ']', ')', 'essere', 'una', 'lingua', 'romanzo', 'parlato', 'principalmente', 'in', 'Italia', '.']
    elif lang == 'lit':
        assert lemmas == ['lietuvė', 'kalbėti', '–', 'ižti', 'baltas', 'prokalbės', 'kilęs', 'lietuvė', 'tauta', 'kalbėti', ',', '-PRON-', 'Lietuvoje', 'būti', 'valstybinis', ',', 'o', 'Europos', 'sąjunga', '–', 'viena', 'ižti', 'oficialus', 'kalbus', '.']
    elif lang == 'glv':
        assert lemmas == ['She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey', 'Gaelagh', 'Mannin', '.']
    elif lang == 'nob':
        assert lemmas == ['bokmål', 'være', 'en', 'varietet', 'av', 'norsk', 'språk', '.']
    elif lang == 'fas':
        assert lemmas == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'را', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
    elif lang == 'por':
        assert lemmas == ['A', 'língua', 'portuguesar', ',', 'também', 'designar', 'português', ',', 'ser', 'umar', 'língua', 'românico', 'flexivo', 'ocidental', 'originar', 'o', 'galego-português', 'falar', 'o', 'Reino', 'da', 'Galiza', 'e', 'o', 'norte', 'de', 'Portugal', '.']
    elif lang == 'ron':
        assert lemmas == ['Limba', 'român', 'fi', 'vrea', 'limbă', 'indo', '-', 'european', ',', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbă', 'romanice', '.']
    elif lang == 'rus':
        assert lemmas == ['ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'информация', 'о', 'файл', 'слушать', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянский', 'язык', ',', 'национальный', 'язык', 'русский', 'народ', '.']
    elif lang == 'gla':
        assert lemmas == ["'S", 'i', 'cànan', 'dùthchasach', 'na', 'h', '-', 'Alba', 'a', 'th', "'", 'anns', 'a', "'", 'Ghàidhlig', '.']
    elif lang == 'slk':
        assert lemmas == ['Slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský', 'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',', 'horný', 'as', 'dolný', 'lužickou', 'srbčina', 'as', 'kašubčinou', ')', '.']
    elif lang == 'slv':
        assert lemmas == ['Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina', ']', 'onbiti', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govoriti', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govoriti', 'Slovenec', '.']
    elif lang == 'spa':
        assert lemmas == ['El', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', '.']
    elif lang == 'swe':
        assert lemmas == ['Svenska', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en', 'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'hare', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'mena', 'även', 'som', 'en', 'en', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.']
    elif lang == 'bod':
        assert lemmas == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'དེ་', 'གི་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'ཏེ་', ' །']
    elif lang == 'ukr':
        if lemmatizer == 'Lemmatization Lists - Ukrainian Lemma List':
            assert lemmas == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назвати', '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
        elif lemmatizer == 'pymorphy2 - Morphological Analyzer':
            assert lemmas == ['украї́нський', 'мо́вий', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'ру́ський', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
    elif lang == 'cym':
        assert lemmas == ['Aelod', "o'r", 'cangen', 'Frythonaidd', "o'r", 'iaith', 'Celtaidd', 'a', 'siarad', 'bod', 'brodorol', 'yn', 'Nghymru', ',', 'can', 'Gymry', 'a', 'pobl', 'arall', 'aredig', 'gwasgar', 'bod', 'Lloegr', ',', 'a', 'can', 'cymuno', 'bechan', 'bod', 'Y', 'Wladfa', ',', 'gwybod', 'Ariannin[7', ']', "yw'r", 'Gymraeg', '(', 'hefyd', 'Cymraeg', 'heb', 'yr', 'bannod', ')', '.']
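A call to the test above takes a language code and a lemmatizer name, both of which must match the expectations hard-coded in the asserts. A hypothetical invocation, assuming `main` has already been initialized by the test module, might look like this:

# Hypothetical invocation; 'eng' and the lemmatizer name both appear in the asserts above
test_lemmatize(lang = 'eng', lemmatizer = 'NLTK - WordNet Lemmatizer', show_results = True)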
Code Example #3
def wl_process_tokens(text, token_settings):
    main = text.main
    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuations
    if not settings['puncs']:
        i_tokens = 0

        # Mark tokens to be removed
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_punc(token):
                            clause[i] = ''

                            text.tags_pos[i_tokens + i] = ''
                            text.tags_non_pos[i_tokens + i] = ''
                            text.tags_all[i_tokens + i] = ''

                    i_tokens += len(clause)

        # Remove punctuations
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = [token for token in clause if token]

        text.tags_pos = [tags for tags in text.tags_pos if tags != '']
        text.tags_non_pos = [tags for tags in text.tags_non_pos if tags != '']
        text.tags_all = [tags for tags in text.tags_all if tags != '']

    # Lemmatize all tokens
    if not settings['use_tags'] and settings['lemmatize_tokens']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = wl_lemmatization.wl_lemmatize(main,
                                                                clause,
                                                                lang=text.lang)

    # Treat as all lowercase
    if settings['treat_as_lowercase']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = [token.lower() for token in clause]

        text.tags_pos = [[tag.lower() for tag in tags]
                         for tags in text.tags_pos]
        text.tags_non_pos = [[tag.lower() for tag in tags]
                             for tags in text.tags_non_pos]
        text.tags_all = [[tag.lower() for tag in tags]
                         for tags in text.tags_all]

    # Words
    if settings['words']:
        # Lowercase
        if not settings['lowercase']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_lowercase(
                                    token):
                                clause[i] = ''
        # Uppercase
        if not settings['uppercase']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_uppercase(
                                    token):
                                clause[i] = ''
        # Title Case
        if not settings['title_case']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_title_case(
                                    token):
                                clause[i] = ''
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_word(token):
                            clause[i] = ''

    # Numerals
    if not settings['nums']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_num(token):
                            clause[i] = ''

    # Filter stop words
    if settings['filter_stop_words']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = wl_stop_word_lists.wl_filter_stop_words(
                        main, clause, lang=text.lang)

    # Ignore tags
    i_token = 0

    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, [])
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, text.tags_non_pos[i_token + i])

                        i_token += len(clause)

        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, text.tags_pos[i_token + i])

                        i_token += len(clause)
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = (token, text.tags_all[i_token + i])

                    i_token += len(clause)

    # Use tags only
    if settings['use_tags']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = clause[i][1]
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = f"{clause[i][0]}{''.join(clause[i][1])}"

    text.tokens_flat = list(wl_misc.flatten_list(text.tokens_multilevel))

    return text
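`wl_process_tokens` branches on a number of flags in `token_settings`. The dictionary below is purely illustrative (the values shown are not the application's defaults), but every key in it is read somewhere in the function above:

# Illustrative token settings; each key is consumed by wl_process_tokens
token_settings = {
    'words': True,                   # keep word tokens
    'lowercase': True,               # keep lowercase words
    'uppercase': True,               # keep uppercase words
    'title_case': True,              # keep title-case words
    'nums': True,                    # keep numerals
    'puncs': False,                  # drop punctuation tokens
    'treat_as_lowercase': False,     # lowercase all tokens and tags
    'lemmatize_tokens': False,       # lemmatize every token
    'filter_stop_words': False,      # remove stop words
    'use_tags': False,               # operate on tags instead of tokens
    'ignore_tags': False,            # strip tags from the output
    'ignore_tags_type': 'all',       # 'all', 'POS', or 'non-POS'
    'ignore_tags_tags': False,       # counterparts applied when 'use_tags' is on
    'ignore_tags_type_tags': 'all',
}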
Code Example #4
def match_ngrams(
    main, search_terms, tokens,
    lang, tokenized, tagged,
    token_settings, search_settings
):
    search_terms_matched = set()

    settings = copy.deepcopy(search_settings)
    re_tags = get_re_tags(main)

    search_term_tokens = [search_term_token
                          for search_term in search_terms
                          for search_term_token in search_term.split()]

    if search_settings['use_regex']:
        regexes_matched = {search_term_token: set() for search_term_token in search_term_tokens}
        tokens_matched = {}
    else:
        tokens_matched = {search_term_token: set() for search_term_token in search_term_tokens}

    # Search Settings
    if settings['match_tags']:
        settings['match_inflected_forms'] = False

    # Token Settings
    if token_settings['use_tags']:
        settings['match_inflected_forms'] = False
        settings['match_tags'] = False
    elif token_settings['ignore_tags']:
        settings['ignore_tags'] = False
        settings['match_tags'] = False

    # Match tags only & Ignore tags
    if settings['match_tags']:
        if tagged == 'No':
            tokens_searched = []
        else:
            tokens_searched = [''.join(re.findall(re_tags, token)) for token in tokens]
    else:
        if settings['ignore_tags']:
            if tagged == 'No':
                tokens_searched = tokens
            elif tagged == 'Yes':
                tokens_searched = [re.sub(re_tags, '', token) for token in tokens]
        else:
            tokens_searched = tokens

    if tokens_searched:
        if settings['use_regex']:
            for search_term_token in search_term_tokens:
                if settings['match_whole_words']:
                    regex = fr'(^|\s+){search_term_token}(\s+|$)'
                else:
                    regex = search_term_token

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags = flags):
                        regexes_matched[search_term_token].add(token)
                        tokens_matched[token] = set()
        else:
            for search_term_token in search_term_tokens:
                regex = re.escape(search_term_token)

                if settings['match_whole_words']:
                    regex = fr'(^|\s+){regex}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags = flags):
                        tokens_matched[search_term_token].add(token)

        if settings['match_inflected_forms']:
            lemmas_searched = wl_lemmatization.wl_lemmatize(main, tokens_searched, lang, tokenized, tagged)
            lemmas_matched = wl_lemmatization.wl_lemmatize(main, list(tokens_matched), lang, tokenized, tagged)

            for token_matched, lemma_matched in zip(list(tokens_matched), lemmas_matched):
                lemma_matched = re.escape(lemma_matched)
                lemma_matched = fr'(^|\s+){lemma_matched}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, lemma_searched in zip(tokens, lemmas_searched):
                    if re.search(lemma_matched, lemma_searched, flags = flags):
                        tokens_matched[token_matched].add(token)

    if search_settings['use_regex']:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set())

                for regex_matched in regexes_matched[search_term_token]:
                    search_term_tokens_matched[-1].add(regex_matched)
                    search_term_tokens_matched[-1] |= set(tokens_matched[regex_matched])

            for item in itertools.product(*search_term_tokens_matched):
                search_terms_matched.add(item)
    else:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set(tokens_matched[search_term_token]))

            for item in itertools.product(*search_term_tokens_matched):
                search_terms_matched.add(item)

    return search_terms_matched
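Finally, a hypothetical call to `match_ngrams` is sketched below. Here `main` and `tokens` are assumed to come from the surrounding application, the two-token search term reuses wording from the English sample sentence in Code Example #2, and only keys actually read by the function appear in the settings dictionaries:

# Hypothetical call; all values are illustrative
search_settings = {
    'use_regex': False,              # treat search terms literally
    'match_whole_words': True,       # anchor matches at token boundaries
    'ignore_case': True,             # case-insensitive matching
    'match_inflected_forms': True,   # also match tokens sharing a lemma
    'match_tags': False,             # search tokens rather than their tags
    'ignore_tags': True,             # strip tags before matching
}
token_settings = {'use_tags': False, 'ignore_tags': False}

search_terms_matched = match_ngrams(
    main, ['lingua franca'], tokens,
    lang = 'eng', tokenized = 'Yes', tagged = 'No',
    token_settings = token_settings, search_settings = search_settings
)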