Example #1
0
 def testOverrides(self):
     """Verify the inflection overrides dictionary can be swapped at runtime.

     Also checks that an invalid UPOS ('X') logs a warning and yields an
     empty result from each lemma lookup entry point.
     """
     # Run the inflection system once to assure the overrides is loaded
     # (ie.. lazy loading).  BUGFIX: the original line built a tuple
     # `getInflection(...), ('watched', )` and discarded it; the intended
     # assertion is restored here.
     self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))
     # Keep a reference so the real overrides can be restored afterwards.
     orig_dict = lemminflect.Inflections().overrides_dict
     with self.assertLogs():
         lemmas = lemminflect.getLemma('WORD', 'X')
     self.assertEqual(lemmas, ())
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmas('WORD', 'X')
     self.assertEqual(lemmas, {})
     with self.assertLogs():
         lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
     self.assertEqual(lemmas, {})
     token = self.nlp('I')[0]
     self.assertEqual(token._.lemma(), 'I')
     try:
         # Hack the code to replace the overrides dictionary and confirm
         # the lookup path actually consults it.
         lemminflect.Inflections().overrides_dict = {
             'watch': {
                 'VBD': ('xxx', )
             }
         }
         inflections = lemminflect.getInflection('watch',
                                                 'VBD',
                                                 inflect_oov=False)
         self.assertEqual(inflections, ('xxx', ))
     finally:
         # Always put the original dictionary back so later tests are not
         # affected, even if an assertion above fails.
         lemminflect.Inflections().overrides_dict = orig_dict
 def _save_lemmas(self, digest_record, cleared_description):
     """Tokenize the description, lemmatize each token and persist
     per-record lemma counts as DigestRecordLemma rows."""
     tokens = nltk.word_tokenize(cleared_description)
     lemma_counts = {}
     pos_keys = ('NOUN', 'VERB', 'AUX', 'ADV', 'ADJ')
     for token in tokens:
         all_lemmas = lemminflect.getAllLemmas(token)
         plain_lemmas = [lemma.lower()
                         for pos in pos_keys
                         for lemma in all_lemmas.get(pos, ())]
         # Unknown word-like tokens fall back to their own lowercase form.
         if not all_lemmas and re.match(r'\w', token):
             plain_lemmas.append(token.lower())
         for lemma in plain_lemmas:
             lemma_counts[lemma] = lemma_counts.get(lemma, 0) + 1
     for lemma_text, count_in_record in lemma_counts.items():
         # Reuse an existing Lemma row when present, otherwise create one.
         matches = Lemma.objects.filter(text=lemma_text)
         if not matches:
             lemma_object = Lemma(text=lemma_text)
             lemma_object.save()
         else:
             lemma_object = matches[0]
         # Link the lemma to this record only once.
         already_linked = DigestRecordLemma.objects.filter(
             lemma=lemma_object, digest_record=digest_record)
         if not already_linked:
             DigestRecordLemma(lemma=lemma_object,
                               digest_record=digest_record,
                               count=count_in_record).save()
    def _get_replacement_words(self, word, word_part_of_speech):
        """Return inflected forms of *word* usable as replacements.

        Only POS tags present in the fine-grained->universal map (nouns,
        verbs, adjectives) are considered; the original word itself is
        excluded from the result.
        """
        if word_part_of_speech not in self._enptb_to_universal:
            # POS not eligible for replacement.
            return []

        # Dict mapping universal POS -> available lemmas for this word.
        pos_to_lemmas = lemminflect.getAllLemmas(word)
        if not pos_to_lemmas:
            # Word unknown to lemminflect: nothing to offer.
            return []

        universal_pos = self._enptb_to_universal[word_part_of_speech]

        # Prefer a lemma matching the word's own POS; otherwise pick one
        # at random from whatever POS entries exist.
        try:
            lemma = pos_to_lemmas[universal_pos][0]
        except KeyError:
            lemma = random.choice(list(pos_to_lemmas.values()))[0]

        # Flatten all inflection tuples for the chosen lemma, dedupe, and
        # drop the original surface form.
        inflection_tuples = lemminflect.getAllInflections(
            lemma, upos=universal_pos).values()
        candidates = {form for tup in inflection_tuples for form in tup}
        candidates.discard(word)
        return list(candidates)
    def get_inflections(orig_tokenized, pos_tagged, constrain_pos):
        """Collect candidate inflections for each content token.

        Returns a list of (i, inflections) pairs, where i is the token's
        position in *orig_tokenized* and inflections is a shuffled list of
        unique inflected forms for that token's lemma.  When
        *constrain_pos* is true, inflections are limited to the token's
        tagged part of speech.
        """
        content_pos = {'NOUN', 'VERB', 'ADJ'}
        token_inflections = []

        for idx, token in enumerate(orig_tokenized):
            lemma_dict = lemminflect.getAllLemmas(token)
            tag = pos_tagged[idx][1]
            # Skip tokens without lemmas or outside noun/verb/adjective.
            if not lemma_dict or tag not in content_pos:
                continue
            # Prefer the lemma for the tagged POS; otherwise pick any.
            if tag in lemma_dict:
                lemma = lemma_dict[tag][0]
            else:
                lemma = random.choice(list(lemma_dict.values()))[0]

            if constrain_pos:
                infl_values = lemminflect.getAllInflections(
                    lemma, upos=tag).values()
            else:
                infl_values = lemminflect.getAllInflections(lemma).values()
            # Flatten the per-tag tuples and deduplicate.
            forms = list({form for tup in infl_values for form in tup})

            random.shuffle(forms)
            token_inflections.append((idx, forms))
        return token_inflections
Example #5
0
 def __init__(self):
     """Import lemminflect lazily and warm up its lookup tables."""
     global lemminflect
     import lemminflect
     self.name = 'LemmInflect'
     self.version_string = 'LemmInflect version: %s' % lemminflect.__version__
     # Trigger dictionary and model loading up-front so lazy loading does
     # not show up in later run-time measurements.
     lemminflect.getAllLemmas('testing', 'VERB')
     lemminflect.getAllLemmasOOV('xxtesting', 'VERB')
 def _get_replacement_words(self, word, word_part_of_speech):
     """Return candidate lemma replacements for *word*, or [] when none.

     Only POS tags present in the flair->lemminflect map (nouns, verbs,
     adjectives) are eligible for replacement.
     """
     if word_part_of_speech not in self._flair_to_lemminflect_pos_map:
         # Only nouns, verbs, and adjectives have proper inflections.
         return []
     # `lemminflect.getAllLemmas` returns a dict mapping part-of-speech
     # to available lemmas.
     replacement_inflections_dict = lemminflect.getAllLemmas(word)
     # Map part-of-speech from flair POS tag to lemminflect.
     lemminflect_pos = self._flair_to_lemminflect_pos_map[word_part_of_speech]
     # BUGFIX: default to [] (not None) when no lemmas exist for this POS,
     # so the return type matches the empty-list early return above and
     # callers can iterate the result unconditionally.
     return replacement_inflections_dict.get(lemminflect_pos, [])
Example #7
0
def lemmatize_eng(word):
    """Lemmatize a whitespace-separated English string.

    Returns a dict with:
      - ``normal_form``: space-joined lemmas (with a trailing space, as the
        original implementation produced);
      - ``is_known``: False if any token was unknown to lemminflect;
      - ``is_multiple_forms``: True if any token had lemmas for more than
        one part of speech;
      - ``pos_tag``: always the placeholder 'UNKNW'.
    """
    from lemminflect import getAllLemmas, getAllLemmasOOV
    result = ""
    is_known = True
    is_multiple_forms = False
    for w in word.split():
        # Fetch the lemma dict once per token (the original called
        # getAllLemmas twice for known tokens).
        lemma_forms = list(getAllLemmas(w).values())
        if lemma_forms:
            # First lemma of the first POS entry is taken as the base form.
            result += lemma_forms[0][0] + " "
            if len(lemma_forms) > 1:
                is_multiple_forms = True
        else:
            # Out-of-vocabulary token: fall back to the OOV noun lemmatizer.
            is_known = False
            result += list(getAllLemmasOOV(w, upos="NOUN").values())[0][0] + " "
    return {
        "normal_form": result,
        "is_known": is_known,
        "is_multiple_forms": is_multiple_forms,
        "pos_tag": "UNKNW",
    }
Example #8
0
 def testUPOSLog(self):
     """An invalid universal POS ('X') must log a warning and return an
     empty result from every lemma lookup entry point."""
     cases = [
         (lemminflect.getLemma, ()),
         (lemminflect.getAllLemmas, {}),
         (lemminflect.getAllLemmasOOV, {}),
     ]
     for lookup, expected_empty in cases:
         with self.assertLogs():
             lemmas = lookup('WORD', 'X')
         self.assertEqual(lemmas, expected_empty)
     # The pronoun 'I' keeps its capitalized lemma via the spaCy extension.
     token = self.nlp('I')[0]
     self.assertEqual(token._.lemma(), 'I')
def fill_lemmas_and_connections_to_digest_records(apps, schema_editor):
    """Data migration: lemmatize every English digest record and persist
    per-record lemma counts.

    For each English DigestRecord, tokenizes title + cleared description,
    maps tokens to lowercase lemmas via lemminflect (falling back to the
    token itself for unknown word-like tokens), then creates any missing
    Lemma and DigestRecordLemma rows.  Progress is printed at most once
    per percent of processed records.
    """
    all_valued_records = DigestRecord.objects.filter(
        language=Language.ENGLISH.name)
    # Number of records per "percent", used for progress reporting below.
    one_percent_count = math.ceil(all_valued_records.count() / 100)
    last_printed_percent = None
    for dr_i, dr in enumerate(all_valued_records):
        # Lemmatize the title plus the description (when present).
        s = dr.title
        if dr.cleared_description:
            s += ' ' + dr.cleared_description
        words = nltk.word_tokenize(s)
        word_lemmas_counts = {}
        for word in words:
            word_lemmas = lemminflect.getAllLemmas(word)
            # Only these POS classes contribute lemmas.
            lemmas_keys = ('NOUN', 'VERB', 'AUX', 'ADV', 'ADJ')
            word_lemmas_plain = []
            for lk in lemmas_keys:
                if lk in word_lemmas:
                    word_lemmas_plain += (l.lower() for l in word_lemmas[lk])
            # Unknown word-like tokens count as their own lowercase lemma.
            if not word_lemmas and re.match(r'\w', word):
                word_lemmas_plain.append(word.lower())
            for l in word_lemmas_plain:
                if l not in word_lemmas_counts:
                    word_lemmas_counts[l] = 1
                else:
                    word_lemmas_counts[l] += 1
        for lemma_text, lemma_count_in_dr in word_lemmas_counts.items():
            # Reuse an existing Lemma row if one exists, else create it.
            existing_lemmas = Lemma.objects.filter(text=lemma_text)
            if not existing_lemmas:
                lemma_object = Lemma(text=lemma_text)
                lemma_object.save()
            else:
                lemma_object = existing_lemmas[0]
            # Link the lemma to this record only once (keeps the migration
            # idempotent if re-run).
            existing_digest_record_lemmas = DigestRecordLemma.objects.filter(
                lemma=lemma_object, digest_record=dr)
            if not existing_digest_record_lemmas:
                digest_record_lemma = DigestRecordLemma(
                    lemma=lemma_object,
                    digest_record=dr,
                    count=lemma_count_in_dr)
                digest_record_lemma.save()

        if (dr_i + 1) % one_percent_count == 0:
            # Print progress at most once per percent milestone.
            current_percent = math.ceil((dr_i + 1) / one_percent_count)
            if last_printed_percent is None or current_percent != last_printed_percent:
                last_printed_percent = current_percent
                print(
                    f'Processed {current_percent}% ({dr_i + 1} records, {all_valued_records.count()} total, {all_valued_records.count() - dr_i - 1} left)'
                )
Example #10
0
 def testContractionLemmas(self):
     """Contracted auxiliaries must expose their expanded AUX lemma(s).

     Refactored to a data table; the original asserted "'ve" twice.
     """
     # Each contraction with the minimum AUX lemmas it must expose.
     expected = [
         ("'d", ('will', 'have')),
         ("'ll", ('will', )),
         ("'m", ('be', )),
         ("'re", ('be', )),
         ("'s", ('be', )),
         ("'ve", ('have', )),
     ]
     for contraction, aux_lemmas in expected:
         lemmas = lemminflect.getAllLemmas(contraction)
         self.assertTrue(lemmas.items() >= {'AUX': aux_lemmas}.items(),
                         msg='contraction=%s  lemmas=%s' % (contraction,
                                                            str(lemmas)))
Example #11
0
    def get_lemmas(self, word, tag=None, pos=None):
        """Return all lemmas for *word* as a list.

        If *tag* is given, the universal POS is inferred from it.  With a
        POS available, only lemmas for that POS are returned; otherwise
        lemmas from every part of speech are combined.
        """
        if tag:
            # A fine-grained tag wins: derive the universal POS from it.
            pos = Inflector.tag_to_pos(tag)

        if pos:
            # getLemma returns a tuple of lemma strings for this POS.
            return list(lemminflect.getLemma(word, upos=pos))

        # No POS available: flatten lemmas across all parts of speech.
        return [lemma
                for forms in lemminflect.getAllLemmas(word).values()
                for lemma in forms]
Example #12
0
 def candidate_edits(self, text: str) -> List[Edit]:
     """Generate candidate token-substitution edits for *text*.

     Every token is expanded to all inflections of all of its lemmas;
     each inflection other than the token itself becomes a substitute.
     """
     doc = self._spacy.tokenizer(text)
     result: List[Edit] = []
     for token in doc:
         # All lemmas of this token, across every part of speech.
         lemma_set = set()
         for lemma_tuple in lemminflect.getAllLemmas(token.text).values():
             lemma_set.update(lemma_tuple)
         # All inflected forms of those lemmas.
         inflection_set = set()
         for lemma in lemma_set:
             for infl_tuple in lemminflect.getAllInflections(lemma).values():
                 inflection_set.update(infl_tuple)
         # Never propose replacing a token with itself.
         inflection_set.discard(token.text)
         result.extend(_edits(token.i, doc, inflection_set))
     return result
Example #13
0
def base_form(word):
    """
    Return the base form of the given word.

    When the word can be read as several parts of speech with different
    base forms, the shortest one wins; ties are broken alphabetically (via
    base_form_sort_key).  E.g. "outing" is both a noun base form and an
    inflection of the verb "out", so "out" is returned.
    :param word:
    :return: base form
    """
    lowered = word.lower()
    # getAllLemmas may list several base forms per POS (e.g. British vs
    # American spellings); the first listed form is the most common, so it
    # is the only one considered per POS.
    candidates = {forms[0] for forms in getAllLemmas(lowered).values()}
    if not candidates:
        # Unknown word: fall back to the lowercased input itself.
        return lowered
    return min(candidates, key=base_form_sort_key)
Example #14
0
def random_inflect(source: str,
                   inflection_counts: Dict[str, int] = None) -> str:
    """Randomly re-inflect content words (nouns/verbs/adjectives) in
    *source*, optionally weighting choices by *inflection_counts*."""
    content_pos = {'NOUN', 'VERB', 'ADJ'}
    # Tokenize, remembering whether the sentence started capitalized so
    # the capitalization can be restored at the end.
    tokens = MosesTokenizer(lang='en').tokenize(source)
    was_capitalized = tokens[0][0].isupper()
    if was_capitalized:
        tokens[0] = tokens[0].lower()

    # Universal-tagset POS tags, aligned with the token list.
    tagged = nltk.pos_tag(tokens, tagset='universal')

    for idx, token in enumerate(tokens):
        lemma_dict = lemminflect.getAllLemmas(token)
        tag = tagged[idx][1]
        # Only operate on content words whose tagged POS has lemmas.
        if not lemma_dict or tag not in content_pos or tag not in lemma_dict:
            continue
        lemma = lemma_dict[tag][0]
        # (inflection tag, surface form) pairs for the tagged POS.
        candidates = [(infl_tag, form)
                      for infl_tag, tup in lemminflect.getAllInflections(
                          lemma, upos=tag).items()
                      for form in tup]
        if not candidates:
            continue
        # Weighted random sampling by the supplied inflection-tag counts
        # when given, otherwise uniform.
        if inflection_counts:
            weights = [inflection_counts[infl_tag]
                       for infl_tag, _ in candidates]
            tokens[idx] = random.choices(candidates, weights=weights)[0][1]
        else:
            tokens[idx] = random.choices(candidates)[0][1]
    if was_capitalized:
        tokens[0] = tokens[0].title()
    return MosesDetokenizer(lang='en').detokenize(tokens)
def preprocessing_raw_data(**kwargs):
    """Airflow task: flag documents as English/non-English and lemmatize
    English ones with lemminflect.

    Scans ES documents without an ``is_english`` flag, partitioned across
    workers by doc id modulo ``total_proc``.  Latin-script documents are
    cleaned, stopword-filtered, and lemmatized into
    ``text_lemmatized_eng_lemminflect``.

    NOTE(review): ``streaming_bulk`` and ``update_generator`` are imported
    but unused in this visible span — the ES write-back step presumably
    follows after this chunk; confirm against the full file.
    """
    import re

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from lemminflect import getAllLemmas, getAllLemmasOOV
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from nltk.corpus import stopwords
    from stop_words import get_stop_words
    from util.service_es import search, update_generator
    from util.util import is_latin

    # This worker's shard of the modulo partition.
    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']

    # NOTE(review): int(None) would raise TypeError before the None check
    # below ever fires when the Variable is unset — likely a latent bug.
    number_of_documents = int(
        Variable.get("lemmatize_number_of_documents_eng", default_var=None))
    if number_of_documents is None:
        raise Exception("No variable!")

    s = search(ES_CLIENT,
               ES_INDEX_DOCUMENT,
               query={},
               source=['id', 'text'],
               sort=['id'],
               get_search_obj=True)
    # Only documents not yet flagged either way.
    s = s.exclude('exists', field="is_english")

    # Combined Russian + English stopword set (rebinds the local name,
    # shadowing the imported nltk `stopwords` module after this line).
    stopwords = set(
        get_stop_words('ru') + get_stop_words('en') +
        stopwords.words('english'))
    success = 0
    documents = []
    for doc in s.params(raise_on_error=False).scan():
        # Skip documents belonging to other workers.
        if int(doc.id) % total_proc != process_num:
            continue
        success += 1
        # Hard cap of 50k documents per run.
        if success > 50_000:
            break
        if success % 10_000 == 0:
            print(f"{success}/{50_000}")
        if not is_latin(doc.text):
            doc['is_english'] = False
            documents.append(doc)
            continue
        # Replace non-letter characters and *-containing tokens with
        # spaces, lowercase, then drop stopwords and tokens of length <= 2.
        cleaned_doc = [
            x.lower() for x in ' '.join(
                re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ',
                       doc.text).split()).split()
            if not x in stopwords and len(x) > 2
        ]
        result = ""
        for word in cleaned_doc:
            try:
                # First lemma of the first POS entry, when known.
                result += list(getAllLemmas(word).values())[0][0] + " "
            except IndexError:
                # Unknown word: guess a noun lemma via the OOV model.
                result += list(getAllLemmasOOV(
                    word, upos="NOUN").values())[0][0] + " "
        doc['text_lemmatized_eng_lemminflect'] = result
        doc['is_english'] = True
        documents.append(doc)
Example #16
0
 def runGetAllLemmasTests(self, tests):
     """Run (base, upos, form) triples: the lemmas returned for *form*
     under *upos* must include *base*."""
     for test in tests:
         base, upos, form = test
         # BUGFIX/consistency: default to an empty tuple (not {}) to match
         # the tuple type getAllLemmas returns for each POS.
         lemmas = lemminflect.getAllLemmas(form, upos).get(upos, ())
         self.assertTrue(base in set(lemmas),
                         msg='base=%s  lemmas=%s' % (base, str(lemmas)))
Example #17
0
def api_getAllLemmas():
    """HTTP endpoint: look up all lemmas for the posted word/upos pair."""
    payload = request.json
    lemmas = getAllLemmas(payload['word'], payload['upos'])
    return jsonify(lemmas)
Example #18
0
 def getLemmaDictOnly(self, entry, upos):
     """Return the first dictionary lemma of entry.infl for *upos*, or ()
     when the dictionary has no entry for that part of speech."""
     lemma_tuple = lemminflect.getAllLemmas(entry.infl, upos).get(upos, ())
     if lemma_tuple:
         return lemma_tuple[0]
     return ()