def make_new_ot_morph_layer(old_text_obj,
                            new_text_obj,
                            new_layer='original_words_morph_analysis',
                            old_layer='ot_morph_analysis',
                            new_parent_layer='original_words'):
    '''Creates new 'original_words_morph_analysis' layer based on the old morph analysis layer.'''
    assert old_layer in old_text_obj.layers
    assert new_parent_layer in new_text_obj.layers
    assert new_layer not in new_text_obj.layers
    original_layer = old_text_obj[old_layer]
    assert 'normalized_text' not in original_layer.attributes
    parent_layer = new_text_obj[new_parent_layer]
    layer = Layer(name=new_layer,
                  text_object=new_text_obj,
                  attributes=('normalized_text', ) + original_layer.attributes,
                  parent=new_parent_layer,
                  ambiguous=True)
    assert len(parent_layer) == len(original_layer)
    for wid, parent_span in enumerate(parent_layer):
        old_morph_span = original_layer[wid]
        new_span = Span(base_span=parent_span.base_span, layer=layer)
        for ann in old_morph_span.annotations:
            new_annotation = {'normalized_text': parent_span.text}
            for a in layer.attributes:
                if a in ['start', 'end', 'text', 'normalized_text']:
                    continue
                new_annotation[a] = ann[a]
            new_span.add_annotation(Annotation(new_span, **new_annotation))
        layer.add_span(new_span)
    return layer
Example #2
0
def add_normalized_form_to_words( layer ):
    '''Rewrites words layer and add normalized_form attribute.'''
    layer.attributes += ('normalized_form',)
    for item in layer:
        item.clear_annotations()
        item.add_annotation( Annotation(item, normalized_form=None) )
    return layer
 def _change_layer(self, text, layers, status):
     # Get changeble layer
     changeble_layer = layers[self.output_layer]
     # Add new attribute to the layer
     changeble_layer.attributes += (self.output_attributes[-1], )
     # Iterate over words and add new normalizations
     for span in changeble_layer:
         # Get current normalized forms of the word
         current_norm_forms = [
             a['normalized_form'] for a in span.annotations
         ]
         if current_norm_forms == [None]:
             current_norm_forms = [span.text]
         # Try to replace current normalized forms with forms from the lexicon
         new_forms = []
         change_status = []
         for cur_form in current_norm_forms:
             for letter in self.letters_replaced:
                 new_form = cur_form.replace(letter,
                                             self.letters_replaced[letter])
                 if new_form != cur_form:
                     new_forms.append(new_form)
                     change_status.append(True)
                 else:
                     new_forms.append(cur_form)
                     change_status.append(False)
         # Clear existing annotations and add new ones that have 1 extra attribute
         span.clear_annotations()
         for form_id, new_form in enumerate(new_forms):
             span.add_annotation(
                 Annotation(span,
                            normalized_form=new_form,
                            is_prenormalized=change_status[form_id]))
def add_punctuation_analysis(text):
    #Redefine the tagger for analysing punctuation
    punct_analyser = VabamorfAnalyzer(guess=True, propername=True)
    for word in text.morph_analysis:
        if _is_empty_annotation(word.annotations[0]):
            # Check if it is punctuation
            if len(word.text) > 0 and not any([c.isalnum()
                                               for c in word.text]):
                # It is a punctuation. Generate the analyses with guessing enabled and add them to the text
                w = Text(word.text)
                w.tag_layer(['sentences'])
                punct_analyser.tag(w)

                analysis = w.morph_analysis[0].annotations
                # If for some reason there are multiple analyses
                # the only first one will remain.
                if len(analysis) > 1:
                    analysis = [analysis[0]]

                #Rewrite the analysis
                word.clear_annotations()
                word.add_annotation(Annotation(word, **analysis[0]))
    def _change_layer(self, text, layers, status):

        words = layers[self.output_layer]

        words.attributes = words.attributes + self.output_attributes

        # normalizes letter repetitions (e.g. väääääga)
        if self.use_letter_reps == True:

            # checks if created normalized form should be changed or additional forms added
            def add_normalized_form(outcome, form_to_use, candidates,
                                    spelling_list):
                forms_to_add = [
                ]  # all the normalized forms will be added here
                dict_of_other_forms = {"w": "www", "x": "xxx", "z": "zzz"}

                if type(outcome) == str:

                    # normalized form is changed if needed to match the original word (upper/lowercase, capital letter)
                    if form_to_use.isupper() and outcome.islower():
                        outcome = outcome.upper()
                    elif form_to_use.islower() and outcome.isupper():
                        outcome = outcome.lower()
                    elif form_to_use[0].isupper() and outcome[0].islower():
                        outcome = outcome.capitalize()
                    # normalized form is added if the original word contains more than 2 letter reps
                    if re.search(r"([a-zšžõäöü])\1{2,}",
                                 form_to_use.lower()) != None or find_repeats(
                                     form_to_use.lower()) != None:
                        forms_to_add.append(outcome)

                        wo_repeats = without_recurrent_letters(outcome)
                        # finding and adding possible alternative forms
                        outcome_modif = []
                        # e.g. "prrrr-ga" -- so that it will then be "prrrr"--"prr" and would get special alternative forms too ("pr")
                        if "-" in outcome and outcome.count("-") == 1:
                            splitted_outcome = outcome.split("-")
                            to_add = splitted_outcome[0], splitted_outcome[
                                1]  # word and the word ending etc
                            outcome_modif.append(to_add)
                            splitted_form_to_use = form_to_use.split("-")
                            form_to_use = splitted_form_to_use[0]  # e.g. prr
                            outcome = splitted_outcome[0]  # e.g. original prrr

                        if (len(set(without_recurrent_letters(outcome))) in [2,3]) or \
                        (len(set(without_recurrent_letters(outcome))) in [1,2,3] and outcome_modif!=0): # certain shorter forms get possible other alternatives
                            # if max 3 different letters in word: new alternative form: e.g. "ma" (orig: "maaaa", first norm_form: "maa")
                            new_form = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{1,}",
                                              r"\1", form_to_use)
                            # if max 2 different letters in word: new alternative form: e.g. "krr" (orig: "krrrr", first norm_form: "kr")
                            new_form2 = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}",
                                               r"\1\1", form_to_use)
                            if len(outcome_modif) == 0:
                                if len(outcome) == 2 and len(new_form2) == len(
                                        outcome) + 1:  # e.g. krr
                                    forms_to_add.append(new_form2)
                                # max 5 letters or max 2 diff letters (e.g. maa - ma, ahhaa - aha)
                                if (len(outcome)<5 or len(set(without_recurrent_letters(outcome)))==2) \
                                and new_form!=outcome and len(new_form)<len(outcome):
                                    forms_to_add.append(new_form)

                            else:  # words with "-", e.g. "prrrr-ga"
                                if (new_form!=outcome_modif[0][0] and len(new_form)<len(outcome_modif[0][0])) or \
                                    (len(set(without_recurrent_letters(outcome)))==1):
                                    if outcome not in dict_of_other_forms:
                                        forms_to_add.append(
                                            new_form + "-" +
                                            outcome_modif[0][1])
                                    else:
                                        forms_to_add.append(
                                            dict_of_other_forms[outcome] +
                                            "-" + outcome_modif[0][1])

                        # special alternative forms from dict, e.g. w - www
                        if len(outcome_modif
                               ) == 0 and outcome in dict_of_other_forms:
                            forms_to_add.append(dict_of_other_forms[outcome])

                        # roman numeral, e.g. "xii"/"Xii" get "XII" as an alternative
                        if (all(c in ["i","v","x","l","c","d","m"] for c in form_to_use)==True) or \
                        (all(c in ["i","v","x","l","c","d","m"] for c in form_to_use[1:])==True and form_to_use[0] in ["I","V","X","L","C","D","M"]):
                            if len(outcome_modif) == 0:
                                forms_to_add.append(form_to_use.upper())
                            else:  # if there is "-"
                                forms_to_add.append(form_to_use.upper() + "-" +
                                                    outcome_modif[0][1])

                        # if besides 3+x letter reps there are also double letters in word and it is not a word
                        # alternative form with 1x letters will be added, e.g. "uiijjjee - uiijee; uije"
                        if re.search(r"(\w)\1",outcome.lower()) and MorphAnalyzedToken(outcome).is_word==False \
                        and wo_repeats!=outcome.lower():
                            if not any(i.lower() == wo_repeats
                                       for i in candidates):
                                if form_to_use.isupper(
                                ) and wo_repeats.islower():
                                    forms_to_add.append(wo_repeats.upper())
                                elif form_to_use[0].isupper(
                                ) and outcome[0].islower():
                                    forms_to_add.append(
                                        wo_repeats.capitalize())
                                else:
                                    forms_to_add.append(wo_repeats)
                        # shorter words in uppercase get the original word as an alternative (e.g. EEEL)
                        if outcome.isupper(
                        ) and len(form_to_use) < 5 and outcome != form_to_use:
                            if not any(i.lower() == form_to_use.lower()
                                       for i in candidates):
                                forms_to_add.append(form_to_use)

                for form in forms_to_add:
                    candidates.append(form)
                    spelling_new_form = vm.spellcheck(
                        [form], suggestions=True)[0]["spelling"]
                    spelling_list.append(spelling_new_form)

            # removes all letter reps, e.g. noonohhh -- nonoh
            def without_recurrent_letters(word):
                new_word = re.compile(r'(.)\1{1,}',
                                      re.IGNORECASE).sub(r'\1', word.lower())
                return new_word

            # compares word without letter reps
            def compare_words_wo_repeating_letters(speller_sugg_list,
                                                   new_form):
                for word in speller_sugg_list:
                    test = 0  # to avoid forms where new letters are added, e.g. "urr" [1,2] -- "uur" [2,1] (no double "u" in our word)
                    count_letters1 = [
                        (sum(1 for i in group))
                        for label, group in groupby(new_form.lower())
                    ]
                    count_letters2 = [(sum(1 for i in group))
                                      for label, group in groupby(word.lower())
                                      ]
                    # if any number in the list of the new word is bigger from the other word's list, then this new form is avoided
                    for i, i2 in zip(count_letters1, count_letters2):
                        if i2 > i:
                            test += 1

                    if len(word.lower())>1 and without_recurrent_letters(new_form)==without_recurrent_letters(word) and \
                    test==0:
                        return word
                        break

            # finds if the word consists of repetitive chunks (e.g. blabla, kluklu, nununu)
            def find_repeats(word):
                rep_regex = re.compile(
                    r"(.+?)\1+$"
                )  # e.g. lalala, blablablabla, midagigigi, muhahaha
                match1 = re.sub(rep_regex, r'\1', word.lower())
                if MorphAnalyzedToken(match1).is_word == True and len(
                        match1) > 4:  # e.g. midagigi - midagi vs. lala - lala
                    return match1
                else:
                    match2 = re.sub(rep_regex, r'\1\1', word.lower())
                    if match2 != word and match2 != without_recurrent_letters(
                            word):  # to not count e.g. programmmerija in here
                        return match2
                    elif match1 + match1 == word.lower():  # e.g. EIEI
                        return match2
                    else:
                        return None

            # running spell_check
            def check_spelling(word_to_use):
                spell_check = vm.spellcheck([word_to_use], suggestions=True)
                for i in spell_check:
                    return i

            # checks the spelling and runs all the rules
            def use_rules(word_to_use, rule, form_to_use):
                speller_info = check_spelling(word_to_use)
                run_the_rules = compare(speller_info, rule, form_to_use)
                return run_the_rules

            # rules; normalized form is found or not
            def compare(speller_info, rule, form_to_use):

                new_form = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}", r"\1",
                                  form_to_use.lower())
                abbrev_w_other_char = re.match(
                    r'^[A-ZÜÕÄÖŠŽ]{1,4}[\.\-][a-züõäöšž]+$',
                    speller_info["text"])
                two_letters = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}", r"\1\1",
                                     form_to_use)

                # avoid abbrevs with endings, e.g. XXXX-le
                if abbrev_w_other_char != None:
                    return None
                # words like Mmm, Ooo, Eee etc to lowercase
                if len(speller_info["text"])>1 and len(set(without_recurrent_letters(speller_info["text"]))) == 1 \
                and speller_info["text"][0].isupper() and speller_info["text"][1:].islower():
                    speller_info["text"] = speller_info["text"].lower()

                # if SPELLING==TRUE
                if speller_info["spelling"] == True:

                    if not speller_info["text"].isupper(
                    ) and not speller_info["text"].islower():
                        if len(re.findall(r'[A-ZÜÕÄÖŠŽ]', speller_info["text"])
                               ) > 1:  # e.g. MMMnjaah -- mmmnjaah
                            speller_info["text"] = speller_info["text"].lower()

                        # e.g. Aaagaa - change to lowercase to avoid counting only as a proper name
                        if len(check_spelling(without_recurrent_letters(speller_info["text"]))["suggestions"])!=0 or \
                        check_spelling(without_recurrent_letters(speller_info["text"]))["spelling"]==True:
                            speller_info["text"] = speller_info["text"].lower()

                    # if word is uppercase + is a word with reps changed to either 2x or 1x
                    if rule!="nr 2" and speller_info["text"].isupper() and \
                    (MorphAnalyzedToken(without_recurrent_letters(speller_info["text"])).is_word==True or \
                     MorphAnalyzedToken(two_letters).is_word==True or len(speller_info["text"])>5) \
                    and all(c in ["I","V","X","L","C","D","M"] for c in speller_info["text"])==False:
                        # e.g. JAAAA, SEEEEE have to be changed, but roman numerals kept the same
                        return speller_info  # goes to next round

                    elif rule == "analysis OK":
                        return None
                    else:
                        if MorphAnalyzedToken(
                                speller_info["text"]
                        ).is_word == True:  # e.g. NII, EEL
                            return speller_info["text"]
                        elif MorphAnalyzedToken(
                                without_recurrent_letters(speller_info["text"])
                        ).is_word == True:  # e.g. AAAHHH
                            speller_info["text"] = speller_info["text"].lower()
                            return without_recurrent_letters(
                                speller_info["text"])
                        else:
                            return form_to_use

                else:
                    if len(speller_info["suggestions"]
                           ) == 1:  # if spelling==False, but 1 suggestion
                        w1 = without_recurrent_letters(speller_info["text"])
                        w2 = without_recurrent_letters(
                            speller_info["suggestions"][0])
                        # lists - e.g. "urr" - [1, 2]
                        count_letters1 = [
                            (sum(1 for i in group))
                            for label, group in groupby(speller_info["text"])
                        ]
                        count_letters2 = [(sum(1 for i in group))
                                          for label, group in groupby(
                                              speller_info["suggestions"][0])]
                        rep_word = find_repeats(w1)
                        test = 0  # e.g. "Urr" -- "uur"
                        for i, i2 in zip(count_letters1, count_letters2):
                            if i2 > i:
                                test += 1
                        # not in [1,2,3] - to avoid unnecessary suggestions by speller
                        if w1 == w2 and (
                            (test == 0) or
                            (test != 0 and len(set(w1)) not in [1, 2, 3])):
                            return speller_info["suggestions"][0]
                        elif rep_word != None:
                            return rep_word
                        # proper names
                        elif len(speller_info["suggestions"][0])>1 and (speller_info["suggestions"][0][0].isupper() \
                             and speller_info["suggestions"][0][1].islower()) and rule=="nr 2":
                            return speller_info["text"]
                        else:
                            if rule == "nr 2":
                                return new_form
                            else:
                                return speller_info

                    elif len(speller_info["suggestions"]) > 1:
                        speller_info["suggestions"].sort(
                            key=len)  # e.g. vitamiine vs vitamiinne
                        func_compare_words = compare_words_wo_repeating_letters(
                            speller_info["suggestions"], speller_info["text"])
                        if func_compare_words:
                            return func_compare_words

                        elif rule == "nr 2":
                            speller_try = check_spelling(new_form)
                            if len(speller_try["suggestions"]) == 1:
                                func_compare_words = compare_words_wo_repeating_letters(
                                    speller_try["suggestions"],
                                    speller_info["text"])
                                if func_compare_words:
                                    return func_compare_words
                                else:
                                    return new_form
                            else:
                                rep_word = find_repeats(
                                    without_recurrent_letters(new_form))
                                if rep_word != None:
                                    return rep_word
                                else:
                                    return new_form
                        else:
                            return speller_info

                    elif len(speller_info["suggestions"]
                             ) == 0 and rule == "nr 2":

                        w1 = without_recurrent_letters(new_form)
                        rep_word = find_repeats(w1)
                        if rep_word != None:
                            new_form = rep_word
                        speller_try = check_spelling(new_form)
                        if speller_try["spelling"] == True:
                            return new_form
                        elif len(speller_try["suggestions"]) == 1 or len(
                                speller_try["suggestions"]) > 1:
                            speller_try["suggestions"].sort(key=len)
                            func_compare_words = compare_words_wo_repeating_letters(
                                speller_try["suggestions"], new_form)
                            if func_compare_words:
                                return func_compare_words
                            else:
                                return new_form
                        else:
                            # goes as an abbrev, e.g. CCCPis
                            if rep_word == None and (form_to_use.isupper(
                            )) or (not form_to_use.isupper()
                                   and not form_to_use.islower()
                                   and form_to_use[0:2].isupper()):
                                return None
                            elif rep_word == None and speller_info["text"][
                                    0].isupper():
                                return speller_info["text"]
                            else:
                                return new_form
                    else:
                        return speller_info

        # words without diacritics, eg voimalus-võimalus
        if self.use_diacritics_fixes == True:

            # checks, whether word is corrected, if only one of the letters is changed
            def check_changes_separately(how_many, form_to_use, k, v, dict_nr):
                check = 0
                for i in range(how_many):
                    form_to_use_rep = form_to_use.replace(k, v, i + 1)
                    if MorphAnalyzedToken(
                            form_to_use_rep).is_word == True and re.search(
                                r'[^0-9]+', form_to_use_rep):
                        return form_to_use_rep
                    else:
                        check += 1
                if check == how_many and dict_nr != "dict 3":
                    return True
                elif check == how_many and dict_nr == "dict 3":
                    return None

            # makes changes
            def find_diacritics(form_to_use, k, v, dict_nr, new_form_to_use):
                alternatives = []
                if dict_nr == "dict 1":
                    new_form_to_use = form_to_use
                else:
                    new_form_to_use = new_form_to_use
                check_letters = 0
                how_many = form_to_use.lower().count(k)
                if k in form_to_use.lower():
                    if len(v) == 1:
                        form_to_use_rep = form_to_use.replace(k, v)
                        new_form_to_use = new_form_to_use.replace(k, v)

                        if MorphAnalyzedToken(
                                form_to_use_rep).is_word == True and re.search(
                                    r'[^0-9]+', form_to_use_rep):
                            alternatives.append(form_to_use_rep)
                        else:
                            if MorphAnalyzedToken(
                                    new_form_to_use
                            ).is_word == True and re.search(
                                    r'[^0-9]+', new_form_to_use):
                                alternatives.append(new_form_to_use)
                            else:
                                if k + k in form_to_use.lower():
                                    form_to_use_rep = form_to_use.replace(
                                        k + k, v + v)
                                    if MorphAnalyzedToken(
                                            form_to_use_rep
                                    ).is_word == True and re.search(
                                            r'[^0-9]+', form_to_use_rep):
                                        alternatives.append(form_to_use_rep)
                                    else:
                                        check = check_changes_separately(
                                            how_many, form_to_use, k, v,
                                            dict_nr)
                                        if type(check) == str:
                                            alternatives.append(check)
                                        elif check == True:
                                            return True, new_form_to_use
                                else:
                                    check = check_changes_separately(
                                        how_many, form_to_use, k, v, dict_nr)
                                    if type(check) == str:
                                        alternatives.append(check)
                                    elif check == True:
                                        return True, new_form_to_use

                    else:
                        for letter in v:
                            form_to_use_rep = form_to_use.replace(k, letter)
                            new_form_to_use = new_form_to_use.replace(
                                k, letter)

                            if MorphAnalyzedToken(
                                    form_to_use_rep
                            ).is_word == True and re.search(
                                    r'[^0-9]+', form_to_use_rep):
                                alternatives.append(form_to_use_rep)
                            else:
                                if MorphAnalyzedToken(
                                        new_form_to_use
                                ).is_word == True and re.search(
                                        r'[^0-9]+', new_form_to_use):
                                    alternatives.append(new_form_to_use)
                                else:
                                    if k + k in form_to_use.lower():
                                        form_to_use_rep = form_to_use.replace(
                                            k + k, letter + letter)
                                        if MorphAnalyzedToken(
                                                form_to_use_rep
                                        ).is_word == True and re.search(
                                                r'[^0-9]+', form_to_use_rep):
                                            alternatives.append(
                                                form_to_use_rep)
                                        else:
                                            check = check_changes_separately(
                                                how_many, form_to_use, k,
                                                letter, dict_nr)
                                            if type(check) == str:
                                                alternatives.append(check)
                                            else:
                                                check_letters += 1
                                    else:
                                        check = check_changes_separately(
                                            how_many, form_to_use, k, letter,
                                            dict_nr)
                                        if type(check) == str:
                                            alternatives.append(check)
                                        else:
                                            check_letters += 1

                if len(alternatives) != 0:
                    return alternatives
                elif check_letters == len(v) and dict_nr != "dict 3":
                    return True, new_form_to_use
                elif check_letters == len(v) and dict_nr == "dict 3":
                    return None

            # new form is found
            def use_diacritics_rules(form_to_use):

                dict_of_alterns_1 = {
                    "y": "ü",
                    "6": "õ",
                    "2": "ä",
                    "å": "ä",
                    "ô": "õ",
                    "ó": "õ",
                    "ō": "õ",
                    "û": "ü",
                    "ú": "ü"
                }
                dict_of_alterns_2 = {"a": "ä", "o": ["õ", "ö"], "u": "ü"}
                dict_of_alterns_3 = {
                    "ö": ["ü", "õ", "ö", "ä"],
                    "õ": ["ü", "õ", "ö", "ä"],
                    "ü": ["ü", "õ", "ö", "ä"],
                    "ä": ["ü", "õ", "ö", "ä"],
                    "e": ["ä", "ö", "õ"],
                    "?": ["ü", "õ", "ö", "ä"]
                }

                if_no_result = False
                new_form_to_use = ""

                # dict_of_alterns_1
                if self.use_diacritics_fixes_1 == True:
                    for k, v in dict_of_alterns_1.items():
                        find_dict1 = find_diacritics(form_to_use, k, v,
                                                     "dict 1", "")
                        if type(find_dict1) is list:
                            return find_dict1
                        elif find_dict1 == True:
                            if_no_result = True
                        elif find_dict1 is not None and len(find_dict1) == 2:
                            if_no_result = True
                            new_form_to_use = find_dict1[1]

                # dict_of_alterns_2
                if self.use_diacritics_fixes_2 == True:
                    if if_no_result == True or [
                            l for l in form_to_use.lower()
                            if l in dict_of_alterns_1
                    ] == []:
                        for k, v in dict_of_alterns_2.items():
                            find_dict2 = find_diacritics(
                                form_to_use, k, v, "dict 2", new_form_to_use)
                            if type(find_dict2) is list:
                                return find_dict2
                            elif find_dict2 == True:
                                if_no_result = True
                            elif find_dict2 is not None and len(
                                    find_dict2) == 2:
                                if_no_result = True
                                new_form_to_use = find_dict2[1]

                # dict_of_alterns_3
                if self.use_diacritics_fixes_3 == True:
                    if if_no_result==True or [l for l in form_to_use.lower() if l in dict_of_alterns_2]==[] or \
                    [l for l in form_to_use.lower() if l in dict_of_alterns_1]==[]:
                        for k, v in dict_of_alterns_3.items():
                            find_dict3 = find_diacritics(
                                form_to_use, k, v, "dict 3", new_form_to_use)
                            if type(find_dict3) is list:
                                return find_dict3

        for word_id, w in enumerate(words):
            form_to_use = _get_word_text(w)
            candidates = [form_to_use]
            spelling_results = [
                vm.spellcheck([form_to_use], suggestions=True)[0]["spelling"]
            ]

            if self.use_letter_reps == True:  # normalizes letter repetitions (e.g. väääääga)

                new_candidates = []
                for candidate, spelling in zip(candidates, spelling_results):
                    if spelling == False:
                        first_try = use_rules(candidate, "analysis OK",
                                              candidate)
                        if type(first_try) == str:
                            new_candidates.append(first_try)
                        elif first_try != None:
                            rule_1 = re.sub(r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}",
                                            r"\1\1\1",
                                            first_try["text"])  # nr 1 - 3 reps
                            second_try = use_rules(rule_1, "nr 1", form_to_use)
                            if type(second_try) == str:
                                new_candidates.append(second_try)
                            elif second_try != None:
                                rule_2 = re.sub(
                                    r"([a-zšžõäöüA-ZÜÕÄÖŠŽ])\1{2,}", r"\1\1",
                                    second_try["text"])  # nr 2 - 2 reps
                                third_try = use_rules(rule_2, "nr 2",
                                                      form_to_use)
                                if type(third_try) == str:
                                    new_candidates.append(third_try)

                for new_c in new_candidates:
                    add_normalized_form(new_c, form_to_use, candidates,
                                        spelling_results)

            if self.use_diacritics_fixes == True:
                new_candidates = []
                for candidate, spelling in zip(candidates, spelling_results):
                    if spelling == False:
                        prev_word = ""
                        next_word = ""
                        if word_id - 1 > -1:
                            prev_word = words[word_id - 1]
                        if len(words) - 1 > word_id:
                            next_word = words[word_id + 1]
                        if (candidate[0].isupper() and (w.start>2 and words.text[w.start-2:w.start-1] in ["!","?","."])) \
                        or (candidate in self._english_words and ((not type(prev_word) is str and prev_word.text in self._english_words) \
                                                            or (not type(next_word) is str and next_word.text in self._english_words))) \
                        or (MorphAnalyzedToken(candidate.capitalize()).is_word==True):
                            continue
                        else:
                            first_try = use_diacritics_rules(candidate)
                            if type(first_try) is list:
                                for i in first_try:
                                    if i not in new_candidates:
                                        new_candidates.append(i)

                for new_c in new_candidates:
                    candidates.append(new_c)
                    spelling_new_form = vm.spellcheck(
                        [new_c], suggestions=True)[0]["spelling"]
                    spelling_results.append(spelling_new_form)

            # remove if first in the list is the original incorrect word
            if w.text == candidates[0]:
                candidates.pop(0)
                spelling_results.pop(0)

            if candidates:
                w.clear_annotations()
                for candidate in candidates:
                    w.add_annotation(Annotation(w, normalized_form=candidate))
            elif self.use_vabamorf_speller == True:
                if w.normalized_form[0] == None:
                    spell_check = vm.spellcheck([w.text], suggestions=True)
                    if len(spell_check[0]["suggestions"]) != 0:
                        w.clear_annotations()
                        for spell_sugg in spell_check[0]["suggestions"]:
                            w.add_annotation(
                                Annotation(w, normalized_form=spell_sugg))
                  encoding='utf-8') as fin:
            raw_words = []
            normalized_forms = []
            for line in fin:
                line = line.strip()
                line = line.split(" ")
                #The first element of a line is the non-standard word, the other one is the normalized form
                raw_words.append(line[0])
                normalized_forms.append(line[1])
            raw_text = " ".join(raw_words)
            text = Text(raw_text)
            text.tag_layer(['sentences'])
            for index, w in enumerate(text['words']):
                w.clear_annotations()
                w.add_annotation(
                    Annotation(w, normalized_form=normalized_forms[index]))
            vm.tag(text)
            for w in text['morph_analysis']:
                for annotation in w.annotations:
                    annotation['text'] = w.text
                    dicts[location].append(annotation)

#Write the dicts into tsv files
for location in dicts:
    dict = dicts[location]
    outfile = os.path.join(user_dict_dir, location + ".tsv")
    with open(outfile, 'w', encoding='utf-8', newline='\n') as csvfile:
        fieldnames = [
            'text', 'root', 'ending', 'clitic', 'partofspeech', 'form'
        ]
        writer = csv.writer(csvfile, delimiter='\t')