Python Annotation Examples

Programming Language: Python

Namespace/Package Name: estnltk

Class/Type: Annotation

Examples at hotexamples.com: 6

Python Annotation - 6 examples found. These are the top-rated real-world Python examples of estnltk.Annotation, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Annotation(6)

Frequently Used Methods

Annotation (6)

Example #1

Show file

File: carry_over_original_morph_analysis.py Project: estnltk/estnltk-workflows

def make_new_ot_morph_layer(old_text_obj,
                            new_text_obj,
                            new_layer='original_words_morph_analysis',
                            old_layer='ot_morph_analysis',
                            new_parent_layer='original_words'):
    '''Builds a new morph analysis layer from the annotations of an old one.

    Every annotation of ``old_text_obj[old_layer]`` is copied onto a span
    that reuses the base span of the ``new_parent_layer`` span at the same
    index. Each copy also gets a ``normalized_text`` value taken from that
    parent span's text. Returns the newly built layer.
    '''
    assert old_layer in old_text_obj.layers
    assert new_parent_layer in new_text_obj.layers
    assert new_layer not in new_text_obj.layers
    source_layer = old_text_obj[old_layer]
    assert 'normalized_text' not in source_layer.attributes
    words_layer = new_text_obj[new_parent_layer]
    result = Layer(name=new_layer,
                   text_object=new_text_obj,
                   attributes=('normalized_text', ) + source_layer.attributes,
                   parent=new_parent_layer,
                   ambiguous=True)
    # Spans are paired by position, so both layers must have equal length.
    assert len(words_layer) == len(source_layer)
    # Attribute names that are never copied from the old annotations.
    not_copied = ('start', 'end', 'text', 'normalized_text')
    for index, word_span in enumerate(words_layer):
        source_span = source_layer[index]
        target_span = Span(base_span=word_span.base_span, layer=result)
        for old_annotation in source_span.annotations:
            values = {'normalized_text': word_span.text}
            values.update((name, old_annotation[name])
                          for name in result.attributes
                          if name not in not_copied)
            target_span.add_annotation(Annotation(target_span, **values))
        result.add_span(target_span)
    return result

Example #2

Show file

def add_normalized_form_to_words(layer):
    '''Extends the layer with a normalized_form attribute.

    Every span of the layer loses its existing annotations and receives a
    single new annotation whose normalized_form is None. The same layer
    object is returned.
    '''
    layer.attributes += ('normalized_form',)
    for span in layer:
        span.clear_annotations()
        blank_annotation = Annotation(span, normalized_form=None)
        span.add_annotation(blank_annotation)
    return layer

Example #3

Show file

File: morph_pipeline.py Project: estnltk/processing-old-estonian

 def _change_layer(self, text, layers, status):
     # Get changeble layer
     changeble_layer = layers[self.output_layer]
     # Add new attribute to the layer
     changeble_layer.attributes += (self.output_attributes[-1], )
     # Iterate over words and add new normalizations
     for span in changeble_layer:
         # Get current normalized forms of the word
         current_norm_forms = [
             a['normalized_form'] for a in span.annotations
         ]
         if current_norm_forms == [None]:
             current_norm_forms = [span.text]
         # Try to replace current normalized forms with forms from the lexicon
         new_forms = []
         change_status = []
         for cur_form in current_norm_forms:
             for letter in self.letters_replaced:
                 new_form = cur_form.replace(letter,
                                             self.letters_replaced[letter])
                 if new_form != cur_form:
                     new_forms.append(new_form)
                     change_status.append(True)
                 else:
                     new_forms.append(cur_form)
                     change_status.append(False)
         # Clear existing annotations and add new ones that have 1 extra attribute
         span.clear_annotations()
         for form_id, new_form in enumerate(new_forms):
             span.add_annotation(
                 Annotation(span,
                            normalized_form=new_form,
                            is_prenormalized=change_status[form_id]))

Example #4

Show file

File: morph_pipeline.py Project: estnltk/processing-old-estonian

def add_punctuation_analysis(text):
    '''Fills in morph analyses for punctuation tokens that have none.

    A word is re-analysed when its first morph annotation is empty
    (according to _is_empty_annotation), its text is non-empty and none of
    its characters is alphanumeric. The word's annotations are then
    replaced by the first analysis produced by a VabamorfAnalyzer with
    guessing enabled.
    '''
    # Separate analyser instance with guessing switched on
    punct_analyser = VabamorfAnalyzer(guess=True, propername=True)
    for word in text.morph_analysis:
        if not _is_empty_annotation(word.annotations[0]):
            continue
        # Anything empty or containing a letter/digit is not punctuation
        if not len(word.text) > 0:
            continue
        if any(char.isalnum() for char in word.text):
            continue

        # Analyse the token on its own as a one-word text
        token_text = Text(word.text)
        token_text.tag_layer(['sentences'])
        punct_analyser.tag(token_text)
        analyses = token_text.morph_analysis[0].annotations

        # Only the first analysis is kept, even if there are several
        word.clear_annotations()
        word.add_annotation(Annotation(word, **analyses[0]))

Example #5

Show file

File: NormalizeWordsRetagger.py Project: estnltk/ettenten-experiments

    def _change_layer(self, text, layers, status):

        words = layers[self.output_layer]

        words.attributes = words.attributes + self.output_attributes

        # normalizes letter repetitions (e.g. väääääga)
        if self.use_letter_reps == True:

            # checks if created normalized form should be changed or additional forms added
            def add_normalized_form(outcome, form_to_use, candidates,
                                    spelling_list):
                """Extend *candidates* with normalized forms derived from *outcome*.

                *outcome* is only processed when it is a ``str``; any other
                value leaves both lists untouched. Forms are collected only
                when the lowercased *form_to_use* contains a letter repeated
                three or more times in a row, or when ``find_repeats``
                returns a value for it.

                Every collected form is appended to *candidates*, and the
                ``"spelling"`` value that ``vm.spellcheck`` reports for it is
                appended to *spelling_list* in the same order. Nothing is
                returned.
                """
                forms_to_add = [
                ]  # all the normalized forms will be added here
                dict_of_other_forms = {"w": "www", "x": "xxx", "z": "zzz"}

                if type(outcome) == str:

                    # normalized form is changed if needed to match the original word (upper/lowercase, capital letter)
                    if form_to_use.isupper() and outcome.islower():
                        outcome = outcome.upper()
                    elif form_to_use.islower() and outcome.isupper():
                        outcome = outcome.lower()
                    elif form_to_use[0].isupper() and outcome[0].islower():
                        outcome = outcome.capitalize()
                    # normalized form is added if the original word contains more than 2 letter reps
                    if re.search(r"([a-zšžõäöü])\1{2,}",
                                 form_to_use.lower()) != None or find_repeats(
                                     form_to_use.lower()) != None:
                        forms_to_add.append(outcome)

                        wo_repeats = without_recurrent_letters(outcome)
                        # finding and adding possible alternative forms
                        outcome_modif = []
                        # e.g. "prrrr-ga" -- so that it will then be "prrrr"--"prr" and would get special alternative forms too ("pr")
                        if "-" in outcome and outcome.count("-") == 1:
                            splitted_outcome = outcome.split("-")
                            to_add = splitted_outcome[0], splitted_outcome[
                                1]  # word and the word ending etc
                            outcome_modif.append(to_add)
                            # from here on form_to_use and outcome hold only the part before the hyphen
                            splitted_form_to_use = form_to_use.split("-")
                            form_to_use = splitted_form_to_use[0]  # e.g. prr
                            outcome = splitted_outcome[0]  # e.g. original prrr

                        # NOTE(review): outcome_modif is a list, so `outcome_modif!=0` is always
                        # true and the whole condition reduces to "1 to 3 distinct letters";
                        # `len(outcome_modif)!=0` was probably intended -- confirm before changing.
                        if (len(set(without_recurrent_letters(outcome))) in [2,3]) or \
                        (len(set(without_recurrent_letters(outcome))) in [1,2,3] and outcome_modif!=0): # certain shorter forms get possible other alternatives
                            # if max 3 different letters in word: new alternative form: e.g. "ma" (orig: "maaaa", first norm_form: "maa")
                            new_form = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{1,}",
                                              r"\1", form_to_use)
                            # if max 2 different letters in word: new alternative form: e.g. "krr" (orig: "krrrr", first norm_form: "kr")
                            new_form2 = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}",
                                               r"\1\1", form_to_use)
                            if len(outcome_modif) == 0:
                                if len(outcome) == 2 and len(new_form2) == len(
                                        outcome) + 1:  # e.g. krr
                                    forms_to_add.append(new_form2)
                                # max 5 letters or max 2 diff letters (e.g. maa - ma, ahhaa - aha)
                                if (len(outcome)<5 or len(set(without_recurrent_letters(outcome)))==2) \
                                and new_form!=outcome and len(new_form)<len(outcome):
                                    forms_to_add.append(new_form)

                            else:  # words with "-", e.g. "prrrr-ga"
                                if (new_form!=outcome_modif[0][0] and len(new_form)<len(outcome_modif[0][0])) or \
                                    (len(set(without_recurrent_letters(outcome)))==1):
                                    if outcome not in dict_of_other_forms:
                                        forms_to_add.append(
                                            new_form + "-" +
                                            outcome_modif[0][1])
                                    else:
                                        forms_to_add.append(
                                            dict_of_other_forms[outcome] +
                                            "-" + outcome_modif[0][1])

                        # special alternative forms from dict, e.g. w - www
                        if len(outcome_modif
                               ) == 0 and outcome in dict_of_other_forms:
                            forms_to_add.append(dict_of_other_forms[outcome])

                        # roman numeral, e.g. "xii"/"Xii" get "XII" as an alternative
                        if (all(c in ["i","v","x","l","c","d","m"] for c in form_to_use)==True) or \
                        (all(c in ["i","v","x","l","c","d","m"] for c in form_to_use[1:])==True and form_to_use[0] in ["I","V","X","L","C","D","M"]):
                            if len(outcome_modif) == 0:
                                forms_to_add.append(form_to_use.upper())
                            else:  # if there is "-"
                                forms_to_add.append(form_to_use.upper() + "-" +
                                                    outcome_modif[0][1])

                        # if besides 3+x letter reps there are also double letters in word and it is not a word
                        # alternative form with 1x letters will be added, e.g. "uiijjjee - uiijee; uije"
                        if re.search(r"(\w)\1",outcome.lower()) and MorphAnalyzedToken(outcome).is_word==False \
                        and wo_repeats!=outcome.lower():
                            if not any(i.lower() == wo_repeats
                                       for i in candidates):
                                if form_to_use.isupper(
                                ) and wo_repeats.islower():
                                    forms_to_add.append(wo_repeats.upper())
                                elif form_to_use[0].isupper(
                                ) and outcome[0].islower():
                                    forms_to_add.append(
                                        wo_repeats.capitalize())
                                else:
                                    forms_to_add.append(wo_repeats)
                        # shorter words in uppercase get the original word as an alternative (e.g. EEEL)
                        if outcome.isupper(
                        ) and len(form_to_use) < 5 and outcome != form_to_use:
                            if not any(i.lower() == form_to_use.lower()
                                       for i in candidates):
                                forms_to_add.append(form_to_use)

                # every collected form is recorded together with its spell-check verdict
                for form in forms_to_add:
                    candidates.append(form)
                    spelling_new_form = vm.spellcheck(
                        [form], suggestions=True)[0]["spelling"]
                    spelling_list.append(spelling_new_form)

            # removes all letter reps, e.g. noonohhh -- nonoh
            def without_recurrent_letters(word):
                """Lowercase *word* and collapse each run of a repeated character to one."""
                return re.sub(r'(.)\1{1,}', r'\1', word.lower(),
                              flags=re.IGNORECASE)

            # compares word without letter reps
            def compare_words_wo_repeating_letters(speller_sugg_list,
                                                   new_form):
                """Return the first suggestion that matches *new_form* up to letter repetitions.

                A suggestion is accepted when it is longer than one character,
                equals *new_form* once both have passed through
                ``without_recurrent_letters``, and none of its letter runs is
                longer than the run at the same position in *new_form*
                (compared case-insensitively). The last rule avoids forms
                where letters are added, e.g. "urr" [1, 2] vs "uur" [2, 1].

                Returns the accepted suggestion, or None when no suggestion
                qualifies.
                """

                def run_lengths(text):
                    # lengths of consecutive runs of the same letter, e.g. "urr" -> [1, 2]
                    return [
                        sum(1 for _ in run)
                        for _, run in groupby(text.lower())
                    ]

                # Both values depend only on new_form, so compute them once.
                form_runs = run_lengths(new_form)
                form_collapsed = without_recurrent_letters(new_form)

                for word in speller_sugg_list:
                    word_runs = run_lengths(word)
                    # zip stops at the shorter list, so only aligned runs are compared
                    adds_letters = any(
                        theirs > ours
                        for ours, theirs in zip(form_runs, word_runs))
                    if len(word.lower()) > 1 and \
                            form_collapsed == without_recurrent_letters(word) and \
                            not adds_letters:
                        return word
                return None

            # finds if the word consists of repetitive chunks (e.g. blabla, kluklu, nununu)
            def find_repeats(word):
                """Detect a word ending in a repeated chunk (e.g. blabla, kluklu, nununu).

                Returns the word with the repeated chunk reduced to one copy
                when that result is longer than 4 characters and
                MorphAnalyzedToken reports it as a word (midagigi - midagi).
                Otherwise returns the form with the chunk reduced to two
                copies when it qualifies, and None when nothing applies.
                """
                lowered = word.lower()
                # e.g. lalala, blablablabla, midagigigi, muhahaha
                chunk_pattern = re.compile(r"(.+?)\1+$")

                single = chunk_pattern.sub(r'\1', lowered)
                # e.g. midagigi - midagi vs. lala - lala
                if MorphAnalyzedToken(single).is_word == True and len(single) > 4:
                    return single

                doubled = chunk_pattern.sub(r'\1\1', lowered)
                # the second test keeps out e.g. programmmerija
                if doubled != word and doubled != without_recurrent_letters(word):
                    return doubled
                if single + single == lowered:  # e.g. EIEI
                    return doubled
                return None

            # running spell_check
            def check_spelling(word_to_use):
                """Spell-check one word via ``vm.spellcheck`` and return the first result.

                Returns None when the spell-checker yields no results.
                """
                results = vm.spellcheck([word_to_use], suggestions=True)
                return next(iter(results), None)

            # checks the spelling and runs all the rules
            def use_rules(word_to_use, rule, form_to_use):
                """Spell-check *word_to_use* and pass the result through ``compare``."""
                return compare(check_spelling(word_to_use), rule, form_to_use)

            # rules; normalized form is found or not
            def compare(speller_info, rule, form_to_use):
                """Apply the letter-repetition rules to one spell-checker result.

                speller_info -- mapping whose "text", "spelling" and
                    "suggestions" entries are read here; its "text" entry may
                    be rewritten in place (lowercased), and its "suggestions"
                    list may be sorted in place by length.
                rule -- compared against the strings "nr 2" and "analysis OK".
                form_to_use -- the word form the candidate forms are built from.

                Returns one of:
                  * None -- nothing to normalize (abbreviation-like input, or
                    rule "analysis OK" with correct spelling);
                  * a str -- the normalized form that was found;
                  * speller_info itself -- marked "goes to next round" below.
                """

                # candidate with every 3+ letter run cut to one letter (lowercased)
                new_form = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}", r"\1",
                                  form_to_use.lower())
                abbrev_w_other_char = re.match(
                    r'^[A-ZÜÕÄÖŠŽ]{1,4}[\.\-][a-züõäöšž]+$',
                    speller_info["text"])
                # candidate with every 3+ letter run cut to two letters (case kept)
                two_letters = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}", r"\1\1",
                                     form_to_use)

                # avoid abbrevs with endings, e.g. XXXX-le
                if abbrev_w_other_char != None:
                    return None
                # words like Mmm, Ooo, Eee etc to lowercase
                if len(speller_info["text"])>1 and len(set(without_recurrent_letters(speller_info["text"]))) == 1 \
                and speller_info["text"][0].isupper() and speller_info["text"][1:].islower():
                    speller_info["text"] = speller_info["text"].lower()

                # if SPELLING==TRUE
                if speller_info["spelling"] == True:

                    if not speller_info["text"].isupper(
                    ) and not speller_info["text"].islower():
                        if len(re.findall(r'[A-ZÜÕÄÖŠŽ]', speller_info["text"])
                               ) > 1:  # e.g. MMMnjaah -- mmmnjaah
                            speller_info["text"] = speller_info["text"].lower()

                        # e.g. Aaagaa - change to lowercase to avoid counting only as a proper name
                        # (the same check_spelling call appears in both halves of the condition)
                        if len(check_spelling(without_recurrent_letters(speller_info["text"]))["suggestions"])!=0 or \
                        check_spelling(without_recurrent_letters(speller_info["text"]))["spelling"]==True:
                            speller_info["text"] = speller_info["text"].lower()

                    # if word is uppercase + is a word with reps changed to either 2x or 1x
                    if rule!="nr 2" and speller_info["text"].isupper() and \
                    (MorphAnalyzedToken(without_recurrent_letters(speller_info["text"])).is_word==True or \
                     MorphAnalyzedToken(two_letters).is_word==True or len(speller_info["text"])>5) \
                    and all(c in ["I","V","X","L","C","D","M"] for c in speller_info["text"])==False:
                        # e.g. JAAAA, SEEEEE have to be changed, but roman numerals kept the same
                        return speller_info  # goes to next round

                    elif rule == "analysis OK":
                        return None
                    else:
                        if MorphAnalyzedToken(
                                speller_info["text"]
                        ).is_word == True:  # e.g. NII, EEL
                            return speller_info["text"]
                        elif MorphAnalyzedToken(
                                without_recurrent_letters(speller_info["text"])
                        ).is_word == True:  # e.g. AAAHHH
                            speller_info["text"] = speller_info["text"].lower()
                            return without_recurrent_letters(
                                speller_info["text"])
                        else:
                            return form_to_use

                else:
                    if len(speller_info["suggestions"]
                           ) == 1:  # if spelling==False, but 1 suggestion
                        w1 = without_recurrent_letters(speller_info["text"])
                        w2 = without_recurrent_letters(
                            speller_info["suggestions"][0])
                        # lists - e.g. "urr" - [1, 2]
                        count_letters1 = [
                            (sum(1 for i in group))
                            for label, group in groupby(speller_info["text"])
                        ]
                        count_letters2 = [(sum(1 for i in group))
                                          for label, group in groupby(
                                              speller_info["suggestions"][0])]
                        rep_word = find_repeats(w1)
                        # counts positions where the suggestion's letter run is longer than the text's
                        test = 0  # e.g. "Urr" -- "uur"
                        for i, i2 in zip(count_letters1, count_letters2):
                            if i2 > i:
                                test += 1
                        # not in [1,2,3] - to avoid unnecessary suggestions by speller
                        if w1 == w2 and (
                            (test == 0) or
                            (test != 0 and len(set(w1)) not in [1, 2, 3])):
                            return speller_info["suggestions"][0]
                        elif rep_word != None:
                            return rep_word
                        # proper names
                        elif len(speller_info["suggestions"][0])>1 and (speller_info["suggestions"][0][0].isupper() \
                             and speller_info["suggestions"][0][1].islower()) and rule=="nr 2":
                            return speller_info["text"]
                        else:
                            if rule == "nr 2":
                                return new_form
                            else:
                                return speller_info

                    elif len(speller_info["suggestions"]) > 1:
                        # shortest suggestions are tried first
                        speller_info["suggestions"].sort(
                            key=len)  # e.g. vitamiine vs vitamiinne
                        func_compare_words = compare_words_wo_repeating_letters(
                            speller_info["suggestions"], speller_info["text"])
                        if func_compare_words:
                            return func_compare_words

                        elif rule == "nr 2":
                            speller_try = check_spelling(new_form)
                            if len(speller_try["suggestions"]) == 1:
                                func_compare_words = compare_words_wo_repeating_letters(
                                    speller_try["suggestions"],
                                    speller_info["text"])
                                if func_compare_words:
                                    return func_compare_words
                                else:
                                    return new_form
                            else:
                                rep_word = find_repeats(
                                    without_recurrent_letters(new_form))
                                if rep_word != None:
                                    return rep_word
                                else:
                                    return new_form
                        else:
                            return speller_info

                    elif len(speller_info["suggestions"]
                             ) == 0 and rule == "nr 2":

                        w1 = without_recurrent_letters(new_form)
                        rep_word = find_repeats(w1)
                        if rep_word != None:
                            new_form = rep_word
                        speller_try = check_spelling(new_form)
                        if speller_try["spelling"] == True:
                            return new_form
                        # i.e. at least one suggestion
                        elif len(speller_try["suggestions"]) == 1 or len(
                                speller_try["suggestions"]) > 1:
                            speller_try["suggestions"].sort(key=len)
                            func_compare_words = compare_words_wo_repeating_letters(
                                speller_try["suggestions"], new_form)
                            if func_compare_words:
                                return func_compare_words
                            else:
                                return new_form
                        else:
                            # goes as an abbrev, e.g. CCCPis
                            # NOTE(review): `and` binds tighter than `or`, so this reads
                            # (rep_word == None and isupper) or (mixed case with two leading capitals);
                            # the second alternative ignores rep_word -- confirm this is intended.
                            if rep_word == None and (form_to_use.isupper(
                            )) or (not form_to_use.isupper()
                                   and not form_to_use.islower()
                                   and form_to_use[0:2].isupper()):
                                return None
                            elif rep_word == None and speller_info["text"][
                                    0].isupper():
                                return speller_info["text"]
                            else:
                                return new_form
                    else:
                        return speller_info

        # words without diacritics, eg voimalus-võimalus
        if self.use_diacritics_fixes == True:

            # checks, whether word is corrected, if only one of the letters is changed
            def check_changes_separately(how_many, form_to_use, k, v, dict_nr):
                """Replace only the first 1..how_many occurrences of *k* and look for a word.

                For each count from 1 to *how_many*, the first ``count``
                occurrences of *k* in *form_to_use* are replaced with *v*.
                The first result that MorphAnalyzedToken reports as a word
                and that contains a non-digit character is returned.

                how_many -- number of replacement counts to try; expected to
                    be a non-negative count of occurrences of *k*.
                dict_nr -- "dict 3" makes a failed search return None; any
                    other value makes it return True.

                Returns the corrected form (str) on success; otherwise True,
                or None for "dict 3".
                """
                for count in range(1, how_many + 1):
                    candidate = form_to_use.replace(k, v, count)
                    if MorphAnalyzedToken(
                            candidate).is_word == True and re.search(
                                r'[^0-9]+', candidate):
                        return candidate
                # Every attempt failed (or there was nothing to try). The old
                # failure counter always equalled how_many here, so it is gone.
                if dict_nr == "dict 3":
                    return None
                return True

            # makes changes
            def find_diacritics(form_to_use, k, v, dict_nr, new_form_to_use):
                """Try replacing letter *k* in *form_to_use* with the replacement(s) *v*.

                v -- either a one-character string, or a sequence of
                    alternative replacement letters that are tried in turn.
                dict_nr -- "dict 1" resets the accumulated *new_form_to_use*
                    to *form_to_use*; "dict 3" makes a failed search return
                    None instead of a ``(True, new_form_to_use)`` tuple.
                new_form_to_use -- form carrying replacements accumulated so
                    far; *k* is replaced in it as well.

                Returns one of:
                  * a list of alternative forms that MorphAnalyzedToken
                    reports as words and that contain a non-digit character;
                  * the tuple ``(True, new_form_to_use)`` when no replacement
                    produced a word and dict_nr is not "dict 3";
                  * None otherwise, including when *k* does not occur in the
                    lowercased *form_to_use*.
                """
                alternatives = []
                if dict_nr == "dict 1":
                    new_form_to_use = form_to_use
                else:
                    # no-op: the passed-in value is kept as is
                    new_form_to_use = new_form_to_use
                # counts replacement letters (multi-letter branch) that yielded no word
                check_letters = 0
                # Presence and count of k are taken from the lowercased form,
                # while the str.replace calls below act on the original casing.
                how_many = form_to_use.lower().count(k)
                if k in form_to_use.lower():
                    if len(v) == 1:
                        # 1) replace every occurrence in the original form
                        form_to_use_rep = form_to_use.replace(k, v)
                        new_form_to_use = new_form_to_use.replace(k, v)

                        if MorphAnalyzedToken(
                                form_to_use_rep).is_word == True and re.search(
                                    r'[^0-9]+', form_to_use_rep):
                            alternatives.append(form_to_use_rep)
                        else:
                            # 2) try the accumulated form
                            if MorphAnalyzedToken(
                                    new_form_to_use
                            ).is_word == True and re.search(
                                    r'[^0-9]+', new_form_to_use):
                                alternatives.append(new_form_to_use)
                            else:
                                # 3) try replacing only doubled letters, then
                                # 4) fall back to replacing occurrences one by one
                                if k + k in form_to_use.lower():
                                    form_to_use_rep = form_to_use.replace(
                                        k + k, v + v)
                                    if MorphAnalyzedToken(
                                            form_to_use_rep
                                    ).is_word == True and re.search(
                                            r'[^0-9]+', form_to_use_rep):
                                        alternatives.append(form_to_use_rep)
                                    else:
                                        check = check_changes_separately(
                                            how_many, form_to_use, k, v,
                                            dict_nr)
                                        if type(check) == str:
                                            alternatives.append(check)
                                        elif check == True:
                                            return True, new_form_to_use
                                else:
                                    check = check_changes_separately(
                                        how_many, form_to_use, k, v, dict_nr)
                                    if type(check) == str:
                                        alternatives.append(check)
                                    elif check == True:
                                        return True, new_form_to_use

                    else:
                        # same cascade as above, repeated for every replacement letter
                        for letter in v:
                            form_to_use_rep = form_to_use.replace(k, letter)
                            # NOTE(review): new_form_to_use is reassigned on every pass, so
                            # later letters presumably find no k left in it -- confirm intent.
                            new_form_to_use = new_form_to_use.replace(
                                k, letter)

                            if MorphAnalyzedToken(
                                    form_to_use_rep
                            ).is_word == True and re.search(
                                    r'[^0-9]+', form_to_use_rep):
                                alternatives.append(form_to_use_rep)
                            else:
                                if MorphAnalyzedToken(
                                        new_form_to_use
                                ).is_word == True and re.search(
                                        r'[^0-9]+', new_form_to_use):
                                    alternatives.append(new_form_to_use)
                                else:
                                    if k + k in form_to_use.lower():
                                        form_to_use_rep = form_to_use.replace(
                                            k + k, letter + letter)
                                        if MorphAnalyzedToken(
                                                form_to_use_rep
                                        ).is_word == True and re.search(
                                                r'[^0-9]+', form_to_use_rep):
                                            alternatives.append(
                                                form_to_use_rep)
                                        else:
                                            check = check_changes_separately(
                                                how_many, form_to_use, k,
                                                letter, dict_nr)
                                            if type(check) == str:
                                                alternatives.append(check)
                                            else:
                                                check_letters += 1
                                    else:
                                        check = check_changes_separately(
                                            how_many, form_to_use, k, letter,
                                            dict_nr)
                                        if type(check) == str:
                                            alternatives.append(check)
                                        else:
                                            check_letters += 1

                if len(alternatives) != 0:
                    return alternatives
                # reached with check_letters == len(v) only when every letter of a
                # multi-letter v failed
                elif check_letters == len(v) and dict_nr != "dict 3":
                    return True, new_form_to_use
                elif check_letters == len(v) and dict_nr == "dict 3":
                    return None

            # new form is found
            def use_diacritics_rules(form_to_use):
                """Look for diacritic-restored alternatives of *form_to_use*.

                Three replacement tables are tried in order through
                ``find_diacritics``, each only when the matching
                ``self.use_diacritics_fixes_1`` / ``_2`` / ``_3`` flag is
                True. The first list that ``find_diacritics`` returns is
                returned immediately; when no table produces a list the
                function falls off the end and returns None.
                """

                # table 1: substitute characters mapped to one Estonian letter each
                dict_of_alterns_1 = {
                    "y": "ü",
                    "6": "õ",
                    "2": "ä",
                    "å": "ä",
                    "ô": "õ",
                    "ó": "õ",
                    "ō": "õ",
                    "û": "ü",
                    "ú": "ü"
                }
                # table 2: plain vowels mapped to their diacritic variants
                dict_of_alterns_2 = {"a": "ä", "o": ["õ", "ö"], "u": "ü"}
                # table 3: several candidate letters per key
                dict_of_alterns_3 = {
                    "ö": ["ü", "õ", "ö", "ä"],
                    "õ": ["ü", "õ", "ö", "ä"],
                    "ü": ["ü", "õ", "ö", "ä"],
                    "ä": ["ü", "õ", "ö", "ä"],
                    "e": ["ä", "ö", "õ"],
                    "?": ["ü", "õ", "ö", "ä"]
                }

                # set once a table has been tried without producing a word
                if_no_result = False
                # form carrying the replacements made so far, handed on to the next table
                new_form_to_use = ""

                # dict_of_alterns_1
                if self.use_diacritics_fixes_1 == True:
                    for k, v in dict_of_alterns_1.items():
                        find_dict1 = find_diacritics(form_to_use, k, v,
                                                     "dict 1", "")
                        if type(find_dict1) is list:
                            return find_dict1
                        # NOTE(review): find_diacritics, as defined in this file, returns a
                        # list, a (True, form) tuple or None, so this branch looks
                        # unreachable -- confirm.
                        elif find_dict1 == True:
                            if_no_result = True
                        elif find_dict1 is not None and len(find_dict1) == 2:
                            if_no_result = True
                            new_form_to_use = find_dict1[1]

                # dict_of_alterns_2
                if self.use_diacritics_fixes_2 == True:
                    # tried when table 1 failed or the form has no table-1 characters
                    if if_no_result == True or [
                            l for l in form_to_use.lower()
                            if l in dict_of_alterns_1
                    ] == []:
                        for k, v in dict_of_alterns_2.items():
                            find_dict2 = find_diacritics(
                                form_to_use, k, v, "dict 2", new_form_to_use)
                            if type(find_dict2) is list:
                                return find_dict2
                            elif find_dict2 == True:
                                if_no_result = True
                            elif find_dict2 is not None and len(
                                    find_dict2) == 2:
                                if_no_result = True
                                new_form_to_use = find_dict2[1]

                # dict_of_alterns_3
                if self.use_diacritics_fixes_3 == True:
                    # NOTE(review): the two emptiness checks are joined with `or`, so table 3
                    # is tried as soon as the form lacks characters from either earlier table.
                    if if_no_result==True or [l for l in form_to_use.lower() if l in dict_of_alterns_2]==[] or \
                    [l for l in form_to_use.lower() if l in dict_of_alterns_1]==[]:
                        for k, v in dict_of_alterns_3.items():
                            find_dict3 = find_diacritics(
                                form_to_use, k, v, "dict 3", new_form_to_use)
                            if type(find_dict3) is list:
                                return find_dict3

        # Main pass: for every word, collect alternative normalized forms and
        # store them on the word as `normalized_form` annotations.
        for word_id, w in enumerate(words):
            # `candidates` and `spelling_results` are parallel lists: entry i of
            # the second one is the spellchecker's "spelling" value for
            # candidate i.  The first candidate is the word's own text as
            # returned by _get_word_text.
            form_to_use = _get_word_text(w)
            candidates = [form_to_use]
            spelling_results = [
                vm.spellcheck([form_to_use], suggestions=True)[0]["spelling"]
            ]

            if self.use_letter_reps == True:  # normalizes letter repetitions (e.g. väääääga)

                new_candidates = []
                for candidate, spelling in zip(candidates, spelling_results):
                    # Only candidates whose "spelling" value is False are processed.
                    if spelling == False:
                        # use_rules yields either a str (taken as a new
                        # candidate), None (give up), or an object indexed
                        # with "text" that is fed to the next, stricter rule.
                        first_try = use_rules(candidate, "analysis OK",
                                              candidate)
                        if type(first_try) == str:
                            new_candidates.append(first_try)
                        elif first_try != None:
                            # Squeeze every run of 3+ identical letters down to
                            # exactly three.
                            rule_1 = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}",
                                            r"\1\1\1",
                                            first_try["text"])  # nr 1 - 3 reps
                            second_try = use_rules(rule_1, "nr 1", form_to_use)
                            if type(second_try) == str:
                                new_candidates.append(second_try)
                            elif second_try != None:
                                # Squeeze every run of 3+ identical letters
                                # down to exactly two.
                                rule_2 = re.sub(
                                    r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}", r"\1\1",
                                    second_try["text"])  # nr 2 - 2 reps
                                third_try = use_rules(rule_2, "nr 2",
                                                      form_to_use)
                                if type(third_try) == str:
                                    new_candidates.append(third_try)

                # Registered only after the zip loop above has finished.
                # NOTE(review): add_normalized_form presumably extends
                # `candidates` and `spelling_results` - confirm in its definition.
                for new_c in new_candidates:
                    add_normalized_form(new_c, form_to_use, candidates,
                                        spelling_results)

            if self.use_diacritics_fixes == True:
                new_candidates = []
                for candidate, spelling in zip(candidates, spelling_results):
                    if spelling == False:
                        # Neighbouring words; each stays an empty string when
                        # the current word is first/last in `words`.
                        prev_word = ""
                        next_word = ""
                        if word_id - 1 > -1:
                            prev_word = words[word_id - 1]
                        if len(words) - 1 > word_id:
                            next_word = words[word_id + 1]
                        # The candidate is skipped when any of these holds:
                        #  1) it starts with an upper-case letter and the slice
                        #     words.text[w.start-2:w.start-1] is "!", "?" or "."
                        #     NOTE(review): assumes `words.text` can be sliced
                        #     by character offsets - TODO confirm;
                        #  2) it is in self._english_words and the text of the
                        #     previous or the next word is in there as well;
                        #  3) MorphAnalyzedToken reports its capitalized form
                        #     as a word.
                        if (candidate[0].isupper() and (w.start>2 and words.text[w.start-2:w.start-1] in ["!","?","."])) \
                        or (candidate in self._english_words and ((not type(prev_word) is str and prev_word.text in self._english_words) \
                                                            or (not type(next_word) is str and next_word.text in self._english_words))) \
                        or (MorphAnalyzedToken(candidate.capitalize()).is_word==True):
                            continue
                        else:
                            # Only a list result contributes candidates;
                            # duplicates are dropped, order is kept.
                            first_try = use_diacritics_rules(candidate)
                            if type(first_try) is list:
                                for i in first_try:
                                    if i not in new_candidates:
                                        new_candidates.append(i)

                # Append the diacritic-based candidates together with their
                # spellcheck "spelling" value to keep the two lists parallel.
                for new_c in new_candidates:
                    candidates.append(new_c)
                    spelling_new_form = vm.spellcheck(
                        [new_c], suggestions=True)[0]["spelling"]
                    spelling_results.append(spelling_new_form)

            # remove if first in the list is the original incorrect word
            if w.text == candidates[0]:
                candidates.pop(0)
                spelling_results.pop(0)

            if candidates:
                # Replace the word's annotations with one per remaining candidate.
                w.clear_annotations()
                for candidate in candidates:
                    w.add_annotation(Annotation(w, normalized_form=candidate))
            elif self.use_vabamorf_speller == True:
                # No candidates left: if the first normalized_form value is
                # None, fall back to the spellchecker's own suggestions for
                # the word's text (annotations stay untouched when there are
                # no suggestions).
                if w.normalized_form[0] == None:
                    spell_check = vm.spellcheck([w.text], suggestions=True)
                    if len(spell_check[0]["suggestions"]) != 0:
                        w.clear_annotations()
                        for spell_sugg in spell_check[0]["suggestions"]:
                            w.add_annotation(
                                Annotation(w, normalized_form=spell_sugg))

Example #6

Show file

File: make_user_dicts.py Project: estnltk/processing-old-estonian

                  encoding='utf-8') as fin:
            raw_words = []
            normalized_forms = []
            for line in fin:
                line = line.strip()
                line = line.split(" ")
                #The first element of a line is the non-standard word, the other one is the normalized form
                raw_words.append(line[0])
                normalized_forms.append(line[1])
            raw_text = " ".join(raw_words)
            text = Text(raw_text)
            text.tag_layer(['sentences'])
            for index, w in enumerate(text['words']):
                w.clear_annotations()
                w.add_annotation(
                    Annotation(w, normalized_form=normalized_forms[index]))
            vm.tag(text)
            for w in text['morph_analysis']:
                for annotation in w.annotations:
                    annotation['text'] = w.text
                    dicts[location].append(annotation)

#Write the dicts into tsv files
for location in dicts:
    dict = dicts[location]
    outfile = os.path.join(user_dict_dir, location + ".tsv")
    with open(outfile, 'w', encoding='utf-8', newline='\n') as csvfile:
        fieldnames = [
            'text', 'root', 'ending', 'clitic', 'partofspeech', 'form'
        ]
        writer = csv.writer(csvfile, delimiter='\t')