Exemple #1
0
    def init_from_str(self, str_src):
        """
                DStringLAT.init_from_str

                Function called by __init__() : fill <self> with DCharacterLAT
                objects read from <str_src>. Nothing is returned.

                str_src : str

                Every substring matched by DStringLAT.pattern is stored as one
                recognized character; every character of the normalized string
                lying outside these matches is stored as an unknown character
                (unknown_char=True). The characters are appended in the order
                of the normalized string.

                Raise a DCharsError if the length, the stress or the diaeresis
                is defined more than once for the same character.

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) normalized_src -> (default symbols required) :
                *     replace_by_the_default_symbols() -> normalized_src
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringLAT.pattern) gives the symbols{letter+diacritics}
                *     (3.0) unknown characters placed before the current match
                *     (3.1) base_char
                *     (3.2) length
                *     (3.3) stress
                *     (3.4) diaeresis
                *     (3.5) we add the new character
                * (4) unknown characters placed after the last match
        """
        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) normalized_src -> (default symbols required) :
        #     the order of the replacements is significant and must be kept.
        #.......................................................................
        for symbols in (SYMB_PUNCTUATION,
                        SYMB_LOWER_CASE,
                        SYMB_UPPER_CASE,
                        SYMB_DIACRITICS):
            normalized_src = symbols.replace_by_the_default_symbols(normalized_src)

        def append_unknown_chars(start, stop):
            """ store normalized_src[start:stop] as unknown characters """
            for index in range(start, stop):
                self.append( DCharacterLAT(dstring_object = self,
                                           unknown_char = True,
                                           base_char = normalized_src[index]) )

        def defined_several_times(match, what):
            """ return the error to be raised if <what> appears twice in <match> """
            err_msg = "In '{0}' (start={1}, end={2}), {3} defined several times."
            return DCharsError( context = "DStringLAT.init_from_str",
                                message = err_msg.format(match.string,
                                                         match.start(),
                                                         match.end(),
                                                         what) )

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #     re.finditer(DStringLAT.pattern) gives the symbols{letter+diacritics}
        #.......................................................................
        # index of the first character of <normalized_src> not yet stored in <self> :
        next_index = 0

        for element in re.finditer(DStringLAT.pattern,
                                   normalized_src):

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.0) the characters lying between the previous match (or the
            # beginning of the string) and this match haven't been recognized.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            append_unknown_chars(next_index, element.start())

            # first index placed after this match : this is element.end() as
            # soon as the match isn't empty.
            next_index = max(element.start(), element.end() - 1) + 1

            data = element.groupdict()
            letter = data['letter']
            diacritics = data['diacritics']

            punctuation = letter in SYMB_PUNCTUATION.symbol2name
            capital_letter = letter in SYMB_UPPER_CASE.symbol2name

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) base_char
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if punctuation:
                # punctuation symbol :
                base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)
            elif capital_letter:
                # upper case :
                base_char = SYMB_UPPER_CASE.get_the_name_for_this_symbol(letter)
            else:
                # lower case :
                base_char = SYMB_LOWER_CASE.get_the_name_for_this_symbol(letter)

            # values used if the character has no diacritics :
            length = None
            stress = False
            diaeresis = False

            if diacritics is not None:
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.2) length
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                length_nbr = number_of_occurences( source_string = diacritics,
                                                   symbols = SYMB_DIACRITICS__LENGTH )
                if length_nbr > 1:
                    raise defined_several_times(element, "length")

                if SYMB_DIACRITICS.are_these_symbols_in_a_string('short', diacritics):
                    length = "short"
                elif SYMB_DIACRITICS.are_these_symbols_in_a_string('long', diacritics):
                    length = "long"

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) stress
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                stress_nbr = number_of_occurences( source_string = diacritics,
                                                   symbols = SYMB_DIACRITICS__STRESS )
                if stress_nbr > 1:
                    raise defined_several_times(element, "stress")

                stress = SYMB_DIACRITICS.are_these_symbols_in_a_string("stress", diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.4) diaeresis
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                diaeresis_nbr = number_of_occurences( source_string = diacritics,
                                                      symbols = SYMB_DIACRITICS__DIAERESIS )
                if diaeresis_nbr > 1:
                    raise defined_several_times(element, "diaeresis")

                diaeresis = SYMB_DIACRITICS.are_these_symbols_in_a_string("diaeresis",
                                                                          diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.5) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            self.append( DCharacterLAT(dstring_object = self,
                                       unknown_char = False,
                                       base_char = base_char,
                                       punctuation = punctuation,
                                       capital_letter = capital_letter,
                                       length = length,
                                       stress = stress,
                                       diaeresis = diaeresis) )

        #.......................................................................
        # (4) unknown characters placed after the last match; if nothing has
        #     been matched, <next_index> is still 0 and the whole string is
        #     stored as unknown characters.
        #.......................................................................
        append_unknown_chars(next_index, len(normalized_src))
Exemple #2
0
    def init_from_str(self, str_src):
        """
                DStringANG.init_from_str

                Function called by __init__() : fill <self> with DCharacterANG
                objects read from <str_src>. Nothing is returned.

                str_src : str

                Every substring matched by DStringANG.pattern is stored as one
                recognized character; every character of the normalized string
                lying outside these matches is stored as an unknown character
                (unknown_char=True). The characters are appended in the order
                of the normalized string.

                Raise a DCharsError if the makron, one of the stresses or the
                upperdot is defined more than once for the same character, or
                if several kinds of stress are defined for the same character.

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) normalized_src -> (default symbols required) :
                *     replace_by_the_default_symbols() -> normalized_src
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringANG.pattern) gives the symbols{letter+diacritics}
                *     (3.0) unknown characters placed before the current match
                *     (3.1) base_char
                *     (3.2) makron
                *     (3.3) stress
                *     (3.4) upperdot
                *     (3.5) we add the new character
                * (4) unknown characters placed after the last match
        """
        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) normalized_src -> (default symbols required) :
        #     the order of the replacements is significant and must be kept.
        #.......................................................................
        for symbols in (SYMB_PUNCTUATION,
                        SYMB_LOWER_CASE,
                        SYMB_UPPER_CASE,
                        SYMB_DIACRITICS):
            normalized_src = symbols.replace_by_the_default_symbols(normalized_src)

        def append_unknown_chars(start, stop):
            """ store normalized_src[start:stop] as unknown characters """
            for index in range(start, stop):
                self.append( DCharacterANG(dstring_object = self,
                                           unknown_char = True,
                                           base_char = normalized_src[index]) )

        def defined_several_times(match, what):
            """ return the error to be raised if <what> appears twice in <match> """
            err_msg = "In '{0}' (start={1}, end={2}), {3} defined several times."
            return DCharsError( context = "DStringANG.init_from_str",
                                message = err_msg.format(match.string,
                                                         match.start(),
                                                         match.end(),
                                                         what) )

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #     re.finditer(DStringANG.pattern) gives the symbols{letter+diacritics}
        #.......................................................................
        # index of the first character of <normalized_src> not yet stored in <self> :
        next_index = 0

        for element in re.finditer(DStringANG.pattern,
                                   normalized_src):

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.0) the characters lying between the previous match (or the
            # beginning of the string) and this match haven't been recognized.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            append_unknown_chars(next_index, element.start())

            # first index placed after this match : this is element.end() as
            # soon as the match isn't empty.
            next_index = max(element.start(), element.end() - 1) + 1

            data = element.groupdict()
            letter = data['letter']
            diacritics = data['diacritics']

            punctuation = letter in SYMB_PUNCTUATION.symbol2name
            capital_letter = letter in SYMB_UPPER_CASE.symbol2name

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) base_char
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if punctuation:
                # punctuation symbol :
                base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)
            elif capital_letter:
                # upper case :
                base_char = SYMB_UPPER_CASE.get_the_name_for_this_symbol(letter)
            else:
                # lower case :
                base_char = SYMB_LOWER_CASE.get_the_name_for_this_symbol(letter)

            # values used if the character has no diacritics :
            makron = False
            stress = 0
            upperdot = False

            if diacritics is not None:
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.2) makron
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                makron_nbr = number_of_occurences( source_string = diacritics,
                                                   symbols = SYMB_DIACRITICS__MAKRON )
                if makron_nbr > 1:
                    raise defined_several_times(element, "makron")

                makron = SYMB_DIACRITICS.are_these_symbols_in_a_string("makron", diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) stress
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                stressM1_nbr = number_of_occurences( source_string = diacritics,
                                                     symbols = SYMB_DIACRITICS__STRESS_MINUS1 )
                stress1_nbr = number_of_occurences( source_string = diacritics,
                                                    symbols = SYMB_DIACRITICS__STRESS1 )
                stress2_nbr = number_of_occurences( source_string = diacritics,
                                                    symbols = SYMB_DIACRITICS__STRESS2 )

                if stressM1_nbr > 1:
                    raise defined_several_times(element, "stressM1(-1)")

                if stress1_nbr > 1:
                    raise defined_several_times(element, "stress1")

                if stress2_nbr > 1:
                    raise defined_several_times(element, "stress2")

                if stressM1_nbr + stress1_nbr + stress2_nbr > 1:
                    err_msg = "In '{0}' (start={1}, end={2}), stressM1, stress1 and stress2 " \
                              "simultaneously defined."
                    raise DCharsError( context = "DStringANG.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()),)

                # the check above has rejected every combination of stresses :
                # at most one of the three tests below is expected to succeed,
                # hence one single if/elif chain.
                if SYMB_DIACRITICS.are_these_symbols_in_a_string('stressM1', diacritics):
                    stress = -1
                elif SYMB_DIACRITICS.are_these_symbols_in_a_string('stress1', diacritics):
                    stress = 1
                elif SYMB_DIACRITICS.are_these_symbols_in_a_string('stress2', diacritics):
                    stress = 2

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.4) upperdot
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                upperdot_nbr = number_of_occurences( source_string = diacritics,
                                                     symbols = SYMB_DIACRITICS__UPPERDOT )
                if upperdot_nbr > 1:
                    raise defined_several_times(element, "upperdot")

                upperdot = SYMB_DIACRITICS.are_these_symbols_in_a_string("upperdot",
                                                                         diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.5) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            self.append( DCharacterANG(dstring_object = self,
                                       unknown_char = False,
                                       base_char = base_char,
                                       punctuation = punctuation,
                                       capital_letter = capital_letter,
                                       makron = makron,
                                       stress = stress,
                                       upperdot = upperdot) )

        #.......................................................................
        # (4) unknown characters placed after the last match; if nothing has
        #     been matched, <next_index> is still 0 and the whole string is
        #     stored as unknown characters.
        #.......................................................................
        append_unknown_chars(next_index, len(normalized_src))
Exemple #3
0
    def init_from_str(self, str_src):
        """
                DStringJPN.init_from_str

                Function called by __init__() : fill <self> with DCharacterJPN
                objects read from <str_src>. Nothing is returned.

                str_src : str

                Every substring matched by DStringJPN.pattern is stored as one
                recognized character; every character of the normalized string
                lying outside these matches is stored as an unknown character
                (unknown_char=True). The characters are appended in the order
                of the normalized string.

                Raise a DCharsError if the dakuten or the handakuten is defined
                more than once for the same character, or if both are defined
                for the same character.

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) normalized_src -> (default symbols required) :
                *     replace_by_the_default_symbols() -> normalized_src
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringJPN.pattern) gives the symbols{letter+diacritics}
                *     (3.0) unknown characters placed before the current match
                *     (3.1) base_char, chartype, smallsize
                *     (3.2) diacritic
                *     (3.3) we add the new character
                * (4) unknown characters placed after the last match
        """

        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) normalized_src -> (default symbols required) :
        #     the order of the replacements is significant and must be kept.
        #.......................................................................
        for symbols in (SYMB_PUNCTUATION,
                        SYMB_CHOONPU,
                        SYMB_DIACRITICS,
                        SYMB_HIRAGANA,
                        SYMB_SMALL_HIRAGANA,
                        SYMB_KATAKANA,
                        SYMB_SMALL_KATAKANA):
            normalized_src = symbols.replace_by_the_default_symbols(normalized_src)

        def append_unknown_chars(start, stop):
            """ store normalized_src[start:stop] as unknown characters """
            for index in range(start, stop):
                self.append( DCharacterJPN(dstring_object = self,
                                           unknown_char = True,
                                           base_char = normalized_src[index]) )

        def defined_several_times(match, what):
            """ return the error to be raised if <what> appears twice in <match> """
            err_msg = "In '{0}' (start={1}, end={2}), {3} defined several times."
            return DCharsError( context = "DStringJPN.init_from_str",
                                message = err_msg.format(match.string,
                                                         match.start(),
                                                         match.end(),
                                                         what) )

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #     re.finditer(DStringJPN.pattern) gives the symbols{letter+diacritics}
        #.......................................................................
        # index of the first character of <normalized_src> not yet stored in <self> :
        next_index = 0

        for element in re.finditer(DStringJPN.pattern,
                                   normalized_src):

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.0) the characters lying between the previous match (or the
            # beginning of the string) and this match haven't been recognized.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            append_unknown_chars(next_index, element.start())

            # first index placed after this match : this is element.end() as
            # soon as the match isn't empty.
            next_index = max(element.start(), element.end() - 1) + 1

            data = element.groupdict()
            letter = data['letter']
            diacritics = data['diacritics']

            punctuation = letter in SYMB_PUNCTUATION.symbol2name

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) base_char, chartype, smallsize
            #
            # the kana are all stored under the name given by SYMB_HIRAGANA :
            # <chartype> and <smallsize> keep track of the original form.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            smallsize = False

            if punctuation:
                # punctuation symbol :
                base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)
                chartype = "other"

            elif letter in SYMB_CHOONPU.symbol2name:
                # "ー" (the chōonpu 長音符 symbol)
                # confer http://en.wikipedia.org/wiki/Ch%C5%8Donpu
                base_char = SYMB_CHOONPU.get_the_name_for_this_symbol(letter)
                chartype = "choonpu"

            elif letter in SYMB_HIRAGANA.symbol2name:
                # hiragana :
                base_char = SYMB_HIRAGANA.get_the_name_for_this_symbol(letter)
                chartype = "hiragana"

            elif letter in SYMB_SMALL_HIRAGANA.symbol2name:
                # small hiragana, converted into its normal size equivalent :
                normal_size = SMALL_HIRAGANA_TO_HIRAGANA[letter]
                base_char = SYMB_HIRAGANA.get_the_name_for_this_symbol(normal_size)
                smallsize = True
                chartype = "hiragana"

            elif letter in SYMB_KATAKANA.symbol2name:
                # katakana, converted into its hiragana equivalent :
                katakana_name = SYMB_KATAKANA.get_the_name_for_this_symbol(letter)
                base_char = SYMB_HIRAGANA.get_the_name_for_this_symbol(
                    KATAKANA_TO_HIRAGANA[katakana_name])
                chartype = "katakana"

            elif letter in SYMB_SMALL_KATAKANA.symbol2name:
                # small katakana, converted into its normal size equivalent and
                # then into its hiragana equivalent :
                normal_size = SMALL_KATAKANA_TO_KATAKANA[letter]
                base_char = SYMB_HIRAGANA.get_the_name_for_this_symbol(
                    KATAKANA_TO_HIRAGANA[normal_size])
                smallsize = True
                chartype = "katakana"

            elif letter in SYMB_KANJI.symbol2name:
                # kanji :
                base_char = SYMB_KANJI.get_the_name_for_this_symbol(letter)
                chartype = "kanji"

            else:
                # other : the letter is kept as it is.
                base_char = letter
                chartype = "other"

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) diacritic
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            diacritic = None

            if diacritics is not None:

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.2.1) dakuten
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                dakuten_nbr = number_of_occurences( source_string = diacritics,
                                                    symbols = SYMB_DIACRITICS__DAKUTEN )
                if dakuten_nbr > 1:
                    raise defined_several_times(element, "dakuten")

                if SYMB_DIACRITICS.are_these_symbols_in_a_string('dakuten', diacritics):
                    diacritic = "dakuten"

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.2.2) handakuten
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                handakuten_nbr = number_of_occurences( source_string = diacritics,
                                                       symbols = SYMB_DIACRITICS__HANDAKUTEN )
                if handakuten_nbr > 1:
                    raise defined_several_times(element, "handakuten")

                if SYMB_DIACRITICS.are_these_symbols_in_a_string('handakuten', diacritics):
                    diacritic = "handakuten"

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.2.3) dakuten + handakuten ? error
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                if dakuten_nbr >= 1 and handakuten_nbr >= 1:
                    err_msg = "In '{0}' (start={1}, end={2}), dakuten and handakuten " \
                              "defined simultaneously"
                    raise DCharsError( context = "DStringJPN.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()))

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            self.append( DCharacterJPN(dstring_object = self,
                                       unknown_char = False,
                                       base_char = base_char,
                                       diacritic = diacritic,
                                       punctuation = punctuation,
                                       chartype = chartype,
                                       smallsize = smallsize) )

        #.......................................................................
        # (4) unknown characters placed after the last match; if nothing has
        #     been matched, <next_index> is still 0 and the whole string is
        #     stored as unknown characters.
        #.......................................................................
        append_unknown_chars(next_index, len(normalized_src))
Exemple #4
0
    def init_from_str(self, str_src):
        """
                DStringSAN.init_from_str

                Function called by __init__() : initialize <self> by appending one
                DCharacterSAN object per character read in <str_src>. The parts of
                the source string which are not matched by DStringSAN.pattern are
                added, code point by code point, as "unknown" characters.
                Nothing is returned.

                str_src : str

                Raise a DCharsError if, for one character, the virama, the accent,
                the nukta, the anusvara/candrabindu or the anudatta is defined
                several times.

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) = normalized_src -> (itrans symbols required) :
                *     replace_by_the_default_symbols() -> normalized_src
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringSAN.pattern) give the symbols{base_char, diacritics}
                *     (3.1) virama
                *     (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
                *     (3.3) accent
                *     (3.4) nukta
                *     (3.5) anusvara_candrabindu
                *     (3.6) anudatta
                *     (3.7) we add the new character
        """
        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) = normalized_src -> (itrans symbols required) :
        #     replace_by_the_default_symbols() -> normalized_src
        #.......................................................................
        for symbols in (SYMB_CONSONANTS,
                        SYMB_INDEPENDENT_VOWELS,
                        SYMB_DEPENDENT_VOWELS,
                        SYMB_DIACRITICS,
                        SYMB_PUNCTUATION):
            normalized_src = symbols.replace_by_the_default_symbols(normalized_src)

        #.......................................................................
        # local helpers
        #.......................................................................
        def append_unknown_chars(start, stop):
            """ append normalized_src[start:stop] to <self>, one unknown character
                per code point. """
            for index in range(start, stop):
                self.append( DCharacterSAN(dstring_object = self,
                                           unknown_char = True,
                                           base_char = normalized_src[index]) )

        def check_defined_once(element, diacritics, symbols, label):
            """ raise a DCharsError if <diacritics> contains more than one of
                the <symbols>; <label> names the diacritic in the message. """
            if number_of_occurences( source_string = diacritics,
                                     symbols = symbols ) > 1:
                err_msg = "In '{0}' (start={1}, end={2}), '{3}' defined several times."
                raise DCharsError( context = "DStringSAN.init_from_str",
                                   message = err_msg.format(element.string,
                                                            element.start(),
                                                            element.end(),
                                                            label) )

        def first_name_found(diacritics, symbols):
            """ return the name of the first of the <symbols> present in
                <diacritics>, or None if none of them is present. """
            for symbol in symbols:
                name = SYMB_DIACRITICS.defaultsymbol2name[symbol]
                if SYMB_DIACRITICS.are_these_symbols_in_a_string(name=name,
                                                                 string=diacritics):
                    return name
            return None

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #     re.finditer(DStringSAN.pattern) give the symbols{basechar, diacritics}
        #.......................................................................
        # index in <normalized_src> of the first code point not handled yet;
        # everything between <next_index> and the start of a match is unknown.
        next_index = 0
        for element in re.finditer(DStringSAN.pattern, normalized_src):

            # unknown characters at the beginning and in the middle of the string :
            append_unknown_chars(next_index, element.start())
            next_index = element.end()

            data = element.groupdict()
            base_char = data['basechar']
            dependentvowel = data['dependentvowel']
            diacritics = data['diacritics']

            # base_char as "क" becomes "KA"
            base_char__punctuation = SYMB_PUNCTUATION.get_the_name_for_this_symbol(base_char)
            base_char__other_symbols = SYMB_OTHER_SYMBOLS.get_the_name_for_this_symbol(base_char)
            base_char__consonant = SYMB_CONSONANTS.get_the_name_for_this_symbol(base_char)
            base_char__ivowel = SYMB_INDEPENDENT_VOWELS.get_the_name_for_this_symbol(base_char)
            base_char__dvowel = SYMB_DEPENDENT_VOWELS.get_the_name_for_this_symbol(dependentvowel)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) virama
            #       (read before (3.2) : the implicit vowel depends on it)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            virama = False
            if diacritics is not None:
                check_defined_once(element, diacritics, SYMB_DIACRITICS__VIRAMA, 'virama')
                virama = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN VIRAMA',
                                                                       diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            punctuation = base_char__punctuation is not None
            # only the last branch below turns this flag on :
            is_an_independent_vowel = False

            if punctuation:
                # punctuation symbol :
                base_char = base_char__punctuation

            elif base_char__other_symbols is not None:
                # "other symbol" : not punctuation nor consonant nor independent vowel :
                base_char = base_char__other_symbols

            elif base_char__consonant is not None:
                # consonant :
                base_char = base_char__consonant

                # special case : a normal consonant (visarga being a pseudo-consonant)
                #                written without virama and without any vowel symbol
                #                bears the dependent vowel 'A'.
                #                E.g. 'क' stands for 'ka', not for 'k'.
                implicit_a = base_char != 'DEVANAGARI SIGN VISARGA' and \
                             not virama and dependentvowel is None
                dependentvowel = "A" if implicit_a else base_char__dvowel

            else:
                # independent vowel :
                is_an_independent_vowel = True
                dependentvowel = None
                base_char = base_char__ivowel

            accent = None
            nukta = False
            anusvara_candrabindu = None
            anudatta = False
            if diacritics is not None:
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) accent
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__ACCENTS, 'accent')
                accent = first_name_found(diacritics, SYMB_DIACRITICS__ACCENTS)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.4) nukta
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__NUKTA, 'nukta')
                nukta = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN NUKTA',
                                                                      diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.5) anusvara_candrabindu
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS__ANUSVARA_CANDRABINDU,
                                   'anusvara_candrabindu')
                anusvara_candrabindu = first_name_found(diacritics,
                                                        SYMB_DIACRITICS__ANUSVARA_CANDRABINDU)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.6) anudatta
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__ANUDATTA, 'anudatta')
                anudatta = SYMB_DIACRITICS.are_these_symbols_in_a_string(
                    'DEVANAGARI STRESS SIGN ANUDATTA',
                    diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.7) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            self.append( DCharacterSAN(dstring_object = self,
                                       unknown_char = False,
                                       base_char = base_char,
                                       accent = accent,
                                       punctuation = punctuation,
                                       nukta = nukta,
                                       anusvara_candrabindu = anusvara_candrabindu,
                                       virama = virama,
                                       anudatta = anudatta,
                                       is_an_independent_vowel = is_an_independent_vowel,
                                       dependentvowel = dependentvowel) )

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # we add the final unknown characters : everything after the last match,
        # or the whole string if nothing has been recognized.
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        append_unknown_chars(next_index, len(normalized_src))
Exemple #5
0
    def init_from_str(self, str_src):
        """
                DStringGRC.init_from_str

                Function called by __init__() : initialize <self> by appending one
                DCharacterGRC object per character read in <str_src>. The parts of
                the source string which are not matched by DStringGRC.pattern are
                added, code point by code point, as "unknown" characters.
                Nothing is returned.

                str_src : str

                Raise a DCharsError if, for one character, the τόνος, the μῆκος,
                the πνεῦμα, the ὑπογεγραμμένη or the διαλυτικά is defined several
                times.

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) = normalized_src -> (default symbols required) :
                *     replace_by_the_default_symbols() -> normalized_src
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringGRC.pattern) give the symbols{letter+diacritics}
                *     (3.1) base_char
                *     (3.2) contextual_form
                *     (3.3) tonos (τόνος)
                *     (3.4) mekos (μῆκος)
                *     (3.5) pneuma (πνεῦμα)
                *     (3.6) hypogegrammene (ὑπογεγραμμένη)
                *     (3.7) dialutika (διαλυτικά)
                *     (3.8) we add the new character
        """

        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) = normalized_src -> (default symbols required) :
        #     replace_by_the_default_symbols() -> normalized_src
        #.......................................................................
        for symbols in (SYMB_PUNCTUATION,
                        SYMB_LOWER_CASE,
                        SYMB_UPPER_CASE,
                        SYMB_OTHER_SYMBOLS,
                        SYMB_DIACRITICS):
            normalized_src = symbols.replace_by_the_default_symbols(normalized_src)

        #.......................................................................
        # local helpers
        #.......................................................................
        def append_unknown_chars(start, stop):
            """ append normalized_src[start:stop] to <self>, one unknown character
                per code point. """
            for index in range(start, stop):
                self.append( DCharacterGRC(dstring_object = self,
                                           unknown_char = True,
                                           base_char = normalized_src[index]) )

        def check_defined_once(element, diacritics, symbols, label):
            """ raise a DCharsError if <diacritics> contains more than one of
                the <symbols>; <label> names the diacritic in the message. """
            if number_of_occurences( source_string = diacritics,
                                     symbols = symbols ) > 1:
                err_msg = "In '{0}' (start={1}, end={2}), {3} defined several times."
                raise DCharsError( context = "DStringGRC.init_from_str",
                                   message = err_msg.format(element.string,
                                                            element.start(),
                                                            element.end(),
                                                            label) )

        def first_value_found(diacritics, candidates):
            """ <candidates> is a sequence of (diacritic name, value) : return the
                value of the first name present in <diacritics>, or None. """
            for name, value in candidates:
                if SYMB_DIACRITICS.are_these_symbols_in_a_string(name, diacritics):
                    return value
            return None

        # lower-case letters whose glyph depends on the position in the word :
        #   base_char read in the source -> (real base_char, contextual_form)
        contextual_forms = {'β' : ('β', "initial"),
                            'ϐ' : ('β', "medium+final"),
                            'σ' : ('σ', "initial+medium"),
                            'ς' : ('σ', "final")}

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #     re.finditer(DStringGRC.pattern) give the symbols{letter+diacritics}
        #.......................................................................
        # index in <normalized_src> of the first code point not handled yet;
        # everything between <next_index> and the start of a match is unknown.
        next_index = 0
        for element in re.finditer(DStringGRC.pattern, normalized_src):

            # unknown characters at the beginning and in the middle of the string :
            append_unknown_chars(next_index, element.start())
            next_index = element.end()

            data = element.groupdict()
            letter = data['letter']
            diacritics = data['diacritics']

            punctuation = letter in SYMB_PUNCTUATION.symbol2name
            capital_letter = letter in SYMB_UPPER_CASE.symbol2name

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) base_char
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if punctuation:
                # punctuation symbol :
                base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)
            elif letter in SYMB_LOWER_CASE.symbol2name:
                # lower case :
                base_char = SYMB_LOWER_CASE.get_the_name_for_this_symbol(letter)
            elif capital_letter:
                # upper case :
                base_char = SYMB_UPPER_CASE.get_the_name_for_this_symbol(letter)
            else:
                # other symbols :
                base_char = SYMB_OTHER_SYMBOLS.get_the_name_for_this_symbol(letter)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) contextual_form
            #       (only some lower-case letters have a restricted form)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if not capital_letter and base_char in contextual_forms:
                base_char, contextual_form = contextual_forms[base_char]
            else:
                contextual_form = "initial+medium+final"

            tonos = None
            mekos = None
            pneuma = None
            hypogegrammene = False
            dialutika = False
            if diacritics is not None:

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) tonos (τόνος)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__TONOS, "τόνος")
                tonos = first_value_found(diacritics,
                                          (('τόνος.βαρεῖα', "βαρεῖα"),
                                           ('τόνος.ὀξεῖα', "ὀξεῖα"),
                                           ('τόνος.περισπωμένη', "περισπωμένη")))

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.4) mekos (μῆκος)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__MEKOS, "μῆκος")
                mekos = first_value_found(diacritics,
                                          (('μῆκος.μακρόν', "μακρόν"),
                                           ('μῆκος.βραχύ', "βραχύ")))

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.5) pneuma (πνεῦμα)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__PNEUMA, "πνεῦμα")
                pneuma = first_value_found(diacritics,
                                           (('πνεῦμα.ψιλὸν', "ψιλὸν"),
                                            ('πνεῦμα.δασὺ', "δασὺ")))

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.6) hypogegrammene (ὑπογεγραμμένη)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS['ὑπογεγραμμένη'],
                                   "ὑπογεγραμμένη")
                hypogegrammene = SYMB_DIACRITICS.are_these_symbols_in_a_string('ὑπογεγραμμένη',
                                                                               diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.7) dialutika (διαλυτικά)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS['διαλυτικά'],
                                   "διαλυτικά")
                dialutika = SYMB_DIACRITICS.are_these_symbols_in_a_string('διαλυτικά', diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.8) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            self.append( DCharacterGRC(dstring_object = self,
                                       unknown_char = False,
                                       base_char = base_char,
                                       contextual_form = contextual_form,
                                       punctuation = punctuation,
                                       capital_letter = capital_letter,
                                       tonos = tonos,
                                       pneuma = pneuma,
                                       hypogegrammene = hypogegrammene,
                                       dialutika = dialutika,
                                       mekos = mekos) )

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # we add the final unknown characters : everything after the last match,
        # or the whole string if nothing has been recognized.
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        append_unknown_chars(next_index, len(normalized_src))
Exemple #6
0
    def init_from_str(self, str_src):
        """
                DStringHBO.init_from_str

                Function called by __init__(), initialize <self>

                str_src : str

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) = normalized_src -> (default symbols required) :
                *     replace_by_the_default_symbols() -> normalized_src
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringHBO.pattern) give the symbols{base_char, diacritics}
                *     (3.1) contextual_form
                *     (3.2) shin_sin_dot
                *     (3.3) daghesh_mapiq
                *     (3.4) methegh
                *     (3.5) specialpoint
                *     (3.6) vowel
                *     (3.7) raphe
                *     (3.8) cantillation_mark
                *     (3.9) we add the new character
        """
        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) = normalized_src -> (default symbols required) :
        #     replace_by_the_default_symbols() -> normalized_src
        #.......................................................................
        normalized_src = SYMB_LETTERS.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_OTHER_SYMBOLS.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_PUNCTUATION.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_VOWELS.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_POINTS.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_SPECIALPOINTS.replace_by_the_default_symbols(normalized_src)
        normalized_src = SYMB_CANTILLATION_MARKS.replace_by_the_default_symbols(normalized_src)

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #     re.finditer(DStringHBO.pattern) give the symbols{basechar, diacritics}
        #.......................................................................
        indexes = []    # indexes of the substring well analyzed : ( start, end )
        for element in re.finditer(DStringHBO.pattern, normalized_src):

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # we add the unknown characters at the beginning and in the middle
            # of the string (see at the end of this function)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if indexes:
                # <indexes> isn't empty :
                # ... we add the unknown character(s) between the last character and
                # the current one :
                for index in range( max(indexes[-1])+1, element.start() ):
                    new_character = DCharacterHBO(dstring_object = self,
                                                  unknown_char = True,
                                                  base_char = normalized_src[index])

                    self.append( new_character )
            else:
                # <indexes> is empty :
                # ... we add the unknown character(s) before the first index in <indexes> :
                for index in range( 0, element.start() ):
                    new_character = DCharacterHBO(dstring_object = self,
                                                  unknown_char = True,
                                                  base_char = normalized_src[index])

                    self.append( new_character )

            indexes.append( (element.start(), element.end()-1 ) )

            data = element.groupdict()
            base_char   = data['basechar']
            diacritics = data['diacritics']

            punctuation = base_char in SYMB_PUNCTUATION.symbol2name

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) contextual_form
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if base_char == "ך":
                base_char = "כ"
                contextual_form = "final"
            elif base_char == "ם":
                base_char = "מ"
                contextual_form = "final"
            elif base_char == "ן":
                base_char = "נ"
                contextual_form = "final"
            elif base_char == "ף":
                base_char = "פ"
                contextual_form = "final"
            elif base_char == "ץ":
                base_char = "צ"
                contextual_form = "final"
            elif punctuation == False:
                contextual_form = "initial+medium+final"
            else:
                contextual_form = None



            shin_sin_dot = None
            daghesh_mapiq = False
            methegh = False
            specialpoint = None
            vowel = None
            raphe = False
            cantillation_mark = None

            if diacritics is not None:
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.2) shin_sin_dot
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                shin_sin_dot_nbr = number_of_occurences( source_string = diacritics,
                                                         symbols = SYMB_DIACRITICS__SHIN_SIN_DOT )

                if shin_sin_dot_nbr > 1:
                    err_msg = "In '{0}' (start={1}, end={2}), shin_sin_dot defined several times."
                    raise DCharsError( context = "DStringHBO.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()),)

                shin_sin_dot = None
                if SYMB_POINTS.are_these_symbols_in_a_string("HEBREW POINT SHIN DOT", diacritics):
                    shin_sin_dot = "HEBREW POINT SHIN DOT"
                elif SYMB_POINTS.are_these_symbols_in_a_string("HEBREW POINT SIN DOT", diacritics):
                    shin_sin_dot = "HEBREW POINT SIN DOT"

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) daghesh_mapiq
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                daghesh_mapiq_nbr = number_of_occurences( source_string = diacritics,
                                                          symbols = SYMB_DIACRITICS__DAGHESH_MAPIQ)

                if daghesh_mapiq_nbr > 1:
                    err_msg = "In '{0}' (start={1}, end={2}), daghesh_mapiq defined several times."
                    raise DCharsError( context = "DStringHBO.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()),)

                daghesh_mapiq = SYMB_POINTS.are_these_symbols_in_a_string(
                    "HEBREW POINT DAGESH OR MAPIQ",
                    diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.4) methegh
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                methegh_nbr = number_of_occurences( source_string = diacritics,
                                                    symbols = SYMB_DIACRITICS__METHEGH)

                if methegh_nbr > 1:
                    err_msg = "In '{0}' (start={1}, end={2}), methegh defined several times."
                    raise DCharsError( context = "DStringHBO.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()),)

                methegh = SYMB_POINTS.are_these_symbols_in_a_string("HEBREW POINT METEG",
                                                                    diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.5) specialpoint
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                specialpoint_nbr = number_of_occurences( source_string = diacritics,
                                                          symbols = SYMB_DIACRITICS__SPECIALPOINTS)

                if specialpoint_nbr > 1:
                    err_msg = "In '{0}' (start={1}, end={2}), specialpoint defined several times."
                    raise DCharsError( context = "DStringHBO.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()),)

                specialpoint = None
                for specialpoint_char in SYMB_DIACRITICS__SPECIALPOINTS:
                    specialpoint_name = SYMB_SPECIALPOINTS.defaultsymbol2name[specialpoint_char]
                    if SYMB_SPECIALPOINTS.are_these_symbols_in_a_string(name=specialpoint_name,
                                                                        string=diacritics):
                        specialpoint = specialpoint_name
                        break

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.6) vowel
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                vowel_nbr = number_of_occurences( source_string = diacritics,
                                                  symbols = SYMB_DIACRITICS__VOWELS)

                if vowel_nbr > 1:
                    err_msg = "In '{0}' (start={1}, end={2}), vowel defined several times."
                    raise DCharsError( context = "DStringHBO.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()),)

                vowel = None
                for vowel_char in SYMB_DIACRITICS__VOWELS:
                    vowel_name = SYMB_VOWELS.defaultsymbol2name[vowel_char]
                    if SYMB_VOWELS.are_these_symbols_in_a_string(name=vowel_name,
                                                                 string=diacritics):
                        vowel = vowel_name
                        break

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.7) raphe
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                raphe_nbr = number_of_occurences( source_string = diacritics,
                                                  symbols = SYMB_DIACRITICS__RAPHE)

                if raphe_nbr > 1:
                    err_msg = "In '{0}' (start={1}, end={2}), raphe defined several times."
                    raise DCharsError( context = "DStringHBO.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()),)

                raphe = SYMB_POINTS.are_these_symbols_in_a_string("HEBREW POINT RAFE", diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.8) cantillation_mark
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                cmark_nbr = number_of_occurences( source_string = diacritics,
                                                  symbols = SYMB_DIACRITICS__CANTILLATION_MARKS )

                if cmark_nbr > 2:
                    err_msg = "In '{0}' (start={1}, end={2}), " \
                              "cantillation marks defined more than two times."
                    raise DCharsError( context = "DStringHBO.init_from_str",
                                       message = err_msg.format(element.string,
                                                                element.start(),
                                                                element.end()),)

                cantillation_mark = []
                for cmark_char in SYMB_DIACRITICS__CANTILLATION_MARKS:
                    cmark_name = SYMB_CANTILLATION_MARKS.defaultsymbol2name[cmark_char]
                    if SYMB_CANTILLATION_MARKS.are_these_symbols_in_a_string(name=cmark_name,
                                                                             string=diacritics):
                        cantillation_mark.append( cmark_name )

                if cantillation_mark == []:
                    cantillation_mark = None

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.9) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            new_character = DCharacterHBO(dstring_object = self,
                                          unknown_char = False,
                                          base_char = base_char,
                                          contextual_form = contextual_form,
                                          punctuation = punctuation,
                                          shin_sin_dot = shin_sin_dot,
                                          daghesh_mapiq = daghesh_mapiq,
                                          methegh = methegh,
                                          specialpoint = specialpoint,
                                          vowel = vowel,
                                          raphe = raphe,
                                          cantillation_mark = cantillation_mark)

            self.append( new_character )

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # add the unrecognized characters that follow the last match, or the
        # whole string if nothing matched (leading/inner ones : see the loop above)
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if indexes:
            # <element> is the last one and <indexes> isn't empty :
            for index in range( max(indexes[-1])+1, len(normalized_src) ):
                new_character = DCharacterHBO(dstring_object = self,
                                              unknown_char = True,
                                              base_char = normalized_src[index])

                self.append( new_character )

        else:
            # <indexes> is empty :
            for index in range( 0, len(normalized_src) ):
                new_character = DCharacterHBO(dstring_object = self,
                                              unknown_char = True,
                                              base_char = normalized_src[index])

                self.append( new_character )