Esempio n. 1
0
    def init_from_str(self, str_src):
        """
                DStringSAN.init_from_str

                Function called by __init__() : fill <self> with the DCharacterSAN
                objects read from <str_src>.

                str_src : str

                RETURNED VALUE :
                * nothing is returned : the result is the sequence of characters
                  appended to <self>. The parts of the (normalized) source string
                  not matched by DStringSAN.pattern are appended one by one as
                  characters flagged with unknown_char=True.

                EXCEPTION :
                * DCharsError is raised if, for one matched character, a virama,
                  an accent, a nukta, an anusvara/candrabindu or an anudatta is
                  counted more than once by number_of_occurences().

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) normalized_src -> replace_by_the_default_symbols() -> normalized_src
                *     (applied for consonants, independent vowels, dependent vowels,
                *      diacritics and punctuation, in this order)
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringSAN.pattern) gives the groups
                *     {basechar, dependentvowel, diacritics}
                *     (3.1) virama
                *     (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
                *     (3.3) accent
                *     (3.4) nukta
                *     (3.5) anusvara_candrabindu
                *     (3.6) anudatta
                *     (3.7) we add the new character
                * (4) the unknown characters left after the last match are added.
        """
        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) every symbol is replaced by its default form; the order of the
        #     symbol sets is kept as it has always been.
        #.......................................................................
        for symbols_set in (SYMB_CONSONANTS,
                            SYMB_INDEPENDENT_VOWELS,
                            SYMB_DEPENDENT_VOWELS,
                            SYMB_DIACRITICS,
                            SYMB_PUNCTUATION):
            normalized_src = symbols_set.replace_by_the_default_symbols(normalized_src)

        #.......................................................................
        # local helpers : they only factorize code that was repeated.
        #.......................................................................
        def add_unknown_chars(start, stop):
            """Append normalized_src[start:stop], char by char, as unknown characters."""
            for index in range(start, stop):
                self.append(DCharacterSAN(dstring_object = self,
                                          unknown_char = True,
                                          base_char = normalized_src[index]))

        def check_defined_once(match, diacritics_str, symbols, label):
            """Raise DCharsError if <symbols> are counted more than once in <diacritics_str>."""
            if number_of_occurences(source_string = diacritics_str,
                                    symbols = symbols) > 1:
                err_msg = "In '{0}' (start={1}, end={2}), '{3}' defined several times."
                raise DCharsError(context = "DStringSAN.init_from_str",
                                  message = err_msg.format(match.string,
                                                           match.start(),
                                                           match.end(),
                                                           label),)

        def first_name_found(symbols, diacritics_str):
            """Return the name of the first symbol of <symbols> found in <diacritics_str>, or None."""
            for symbol in symbols:
                name = SYMB_DIACRITICS.defaultsymbol2name[symbol]
                if SYMB_DIACRITICS.are_these_symbols_in_a_string(name=name,
                                                                 string=diacritics_str):
                    return name
            return None

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #.......................................................................
        # index of the first character of normalized_src not yet stored in <self> :
        next_unknown = 0

        for element in re.finditer(DStringSAN.pattern, normalized_src):

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # unknown character(s) before the first match or between the
            # previous match and the current one :
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            add_unknown_chars(next_unknown, element.start())

            # same value as the former max( (start, end-1) )+1 : the scan resumes
            # just after the last character covered by this match.
            next_unknown = max(element.start(), element.end()-1) + 1

            data = element.groupdict()
            base_char = data['basechar']
            dependentvowel = data['dependentvowel']
            diacritics = data['diacritics']

            # base_char as "क" becomes "KA"
            base_char__punctuation = SYMB_PUNCTUATION.get_the_name_for_this_symbol(base_char)
            base_char__other_symbols = SYMB_OTHER_SYMBOLS.get_the_name_for_this_symbol(base_char)
            base_char__consonant = SYMB_CONSONANTS.get_the_name_for_this_symbol(base_char)
            base_char__ivowel = SYMB_INDEPENDENT_VOWELS.get_the_name_for_this_symbol(base_char)
            base_char__dvowel = SYMB_DEPENDENT_VOWELS.get_the_name_for_this_symbol(dependentvowel)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) virama (needed by (3.2), hence read before the other diacritics)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            virama = False
            if diacritics is not None:
                check_defined_once(element, diacritics, SYMB_DIACRITICS__VIRAMA, 'virama')

                virama = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN VIRAMA',
                                                                       diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # only the last branch sets <is_an_independent_vowel> to True; for
            # punctuation and "other symbols" <dependentvowel> keeps the value
            # read from the match.
            punctuation = False
            is_an_independent_vowel = False

            if base_char__punctuation is not None:
                # punctuation symbol :
                punctuation = True
                base_char = base_char__punctuation

            elif base_char__other_symbols is not None:
                # "other symbol" : not punctuation nor consonant nor independent vowel :
                base_char = base_char__other_symbols

            elif base_char__consonant is not None:
                # consonant :
                base_char = base_char__consonant

                if base_char != 'DEVANAGARI SIGN VISARGA' and \
                   not virama and dependentvowel is None:
                    # special case : a normal consonant (visarga, a pseudo-consonant,
                    #                is excluded) written without any vowel symbol
                    #                and without virama has the dependent vowel 'A'.
                    #                E.g. 'क' stands for 'ka', not for 'k'.
                    dependentvowel = "A"
                else:
                    dependentvowel = base_char__dvowel

            else:
                # independent vowel :
                is_an_independent_vowel = True
                dependentvowel = None
                base_char = base_char__ivowel

            accent = None
            nukta = False
            anusvara_candrabindu = None
            anudatta = False
            if diacritics is not None:
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) accent
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__ACCENTS, 'accent')
                accent = first_name_found(SYMB_DIACRITICS__ACCENTS, diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.4) nukta
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__NUKTA, 'nukta')
                nukta = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN NUKTA',
                                                                      diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.5) anusvara_candrabindu
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS__ANUSVARA_CANDRABINDU,
                                   'anusvara_candrabindu')
                anusvara_candrabindu = first_name_found(SYMB_DIACRITICS__ANUSVARA_CANDRABINDU,
                                                        diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.6) anudatta
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics, SYMB_DIACRITICS__ANUDATTA, 'anudatta')
                anudatta = SYMB_DIACRITICS.are_these_symbols_in_a_string(
                    'DEVANAGARI STRESS SIGN ANUDATTA',
                    diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.7) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            self.append(DCharacterSAN(dstring_object = self,
                                      unknown_char = False,
                                      base_char = base_char,
                                      accent = accent,
                                      punctuation = punctuation,
                                      nukta = nukta,
                                      anusvara_candrabindu = anusvara_candrabindu,
                                      virama = virama,
                                      anudatta = anudatta,
                                      is_an_independent_vowel = is_an_independent_vowel,
                                      dependentvowel = dependentvowel))

        #.......................................................................
        # (4) we add the final unknown characters : everything after the last
        #     match or, if nothing matched, the whole normalized string.
        #.......................................................................
        add_unknown_chars(next_unknown, len(normalized_src))