Esempio n. 1
0
    def get_sourcestr_representation(self):
        """
                DCharacterSAN.get_sourcestr_representation

                Build and return the (str) source representation of <self>.

                An unknown character is returned either as UNKNOWN_CHAR_SYMBOL
                (option "anonymize the unknown characters" set to "yes") or
                as its raw <base_char>.

                For a known character, the default symbols of the base
                character, of the dependent vowel and of the diacritics are
                concatenated, the fake 'a' symbol is removed and the result
                is composed (PRE_NORMALIZE_NFC replacements, NFC
                normalization, POST_NORMALIZE_NFC replacements).
        """
        # .......................................................................
        # unknown character : no analysis is required.
        # .......................................................................
        if self.unknown_char:
            anonymize = \
                self.dstring_object.options["anonymize the unknown characters"] == "yes"
            return UNKNOWN_CHAR_SYMBOL if anonymize else self.base_char

        # .......................................................................
        # known character : we collect the symbols, one after another.
        # .......................................................................
        pieces = []
        base = self.base_char

        if base is not None:

            if self.punctuation:
                # punctuation symbol : no dependent vowel is looked for.
                pieces.append(SYMB_PUNCTUATION.get_default_symbol(base))

            else:
                # which table describes the base character ?
                if base in SYMB_OTHER_SYMBOLS:
                    # "other symbol" : not punctuation nor consonant nor
                    # independent vowel.
                    table = SYMB_OTHER_SYMBOLS
                elif self.is_an_independent_vowel:
                    table = SYMB_INDEPENDENT_VOWELS
                else:
                    table = SYMB_CONSONANTS

                pieces.append(table.get_default_symbol(base))

                # dependent vowel, if any :
                if self.dependentvowel is not None:
                    pieces.append(
                        SYMB_DEPENDENT_VOWELS.get_default_symbol(self.dependentvowel))

        # diacritics, always written in this order :
        # nukta, accent, virama, anudatta, anusvara/candrabindu.
        if self.nukta:
            pieces.append(DEFAULTSYMB__NUKTA)

        if self.accent is not None:
            pieces.append(SYMB_DIACRITICS.get_default_symbol(self.accent))

        if self.virama:
            pieces.append(DEFAULTSYMB__VIRAMA)

        if self.anudatta:
            pieces.append(DEFAULTSYMB__ANUDATTA)

        if self.anusvara_candrabindu is not None:
            pieces.append(SYMB_DIACRITICS.get_default_symbol(self.anusvara_candrabindu))

        # the vowel 'a' has no symbol of its own in devanagari : the fake symbol
        # standing for it has to be removed.
        text = "".join(pieces).replace(FAKE_A__SYMBOL, "")

        # composition, step 1 of 3 : replacements listed in PRE_NORMALIZE_NFC.
        for before, after in PRE_NORMALIZE_NFC:
            text = text.replace(before, after)

        # composition, step 2 of 3 : unicodedata.normalize.
        text = unicodedata.normalize("NFC", text)

        # composition, step 3 of 3 : replacements listed in POST_NORMALIZE_NFC.
        for before, after in POST_NORMALIZE_NFC:
            text = text.replace(before, after)

        return text
Esempio n. 2
0
def get_intstruct_from_trans_str( _src, dstring_object ):
    """
        function get_intstruct_from_trans_str()

        Read <_src> as a Sanskrit string and convert it into a list of
        InternalStructure objects.

        _src            : (str) transliterated string like "क".
        dstring_object  : object whose <options> mapping is read
                          ("anonymize the unknown characters" and
                          "san2bod quality") and which is given to every
                          InternalStructure created here.

        <_src> is parsed through a DString object created by
        new_dstring(language='संस्कृतम्', transliteration_method="iso15919").
        Depending on the "san2bod quality" option ("normal" or "low"), some
        characters are simplified; see the @@BOD2SAN-* tags below.

        Return a ListOfInternalStructures object, where a
        'MARK INTERSYLLABIC TSHEG' structure follows every structure having
        a consonant.
    """
    anonymize = dstring_object.options["anonymize the unknown characters"] == 'yes'

    # list of InternalStructure objects.
    istructs = ListOfInternalStructures(anonymize_the_unknown_chars = anonymize)

    # we read <_src> through a DSTRING_SAN object :
    dstring_san = new_dstring(language='संस्कृतम्',
                              transliteration_method="iso15919")
    dstring_san = dstring_san(_src)

    # @@BOD2SAN-LOW-005 / @@BOD2SAN-LOW-006 : long vowels > short vowels
    long_to_short_vowel = {'AA' : 'A',
                           'II' : 'I',
                           'UU' : 'U'}

    # @@BOD2SAN-LOW-007 :
    # retroflex consonant > non-retroflex consonant
    # retroflex consonant + aspiration > non-retroflex consonant without aspiration
    retroflex_to_plain = {'TTA'   : "TA",
                          'TTHA'  : "TA",
                          'DDA'   : "DA",
                          'DDHA'  : "DA",
                          'NNA'   : "NA"}

    # @@BOD2SAN-LOW-008 : consonant + aspiration > consonant without aspiration
    # 'TTHA' and 'DDHA' do not appear here : they are always caught by the
    # retroflex rule, which is tested first.
    aspirated_to_plain = {'KHA'   : "KA",
                          'GHA'   : "GA",
                          'THA'   : "TA",
                          'CHA'   : "CA",
                          'JHA'   : "JA",
                          'DHA'   : "DA",
                          'PHA'   : "PA",
                          'BHA'   : "BA"}

    # In Sanskrit, if a consonant is followed by a virama, it means that the following
    # consonants are part of a cluster of consonants.
    #
    # E.g. in कर्म (0915=ka, 0930=ra, 094D=virama, 092E=ma) we have something like kar+ma,
    # the -m- having no vowel.
    #
    place_consonant_among_subjc = False

    for dchar_san in dstring_san:

        if dchar_san.unknown_char:
            istructs.append( InternalStructure( dstring_object = dstring_object,
                                                unknown_character = True ) )

            # an unknown character breaks a cluster of consonants : without this
            # reset, the next consonant would be stored in the subfix of the
            # "unknown character" structure.
            place_consonant_among_subjc = False
            continue

        base_char = dchar_san.base_char

        # punctuation symbol :
        if base_char in SAN__SYMB_PUNCTUATION:
            unicode_symb = SAN__SYMB_PUNCTUATION.get_default_symbol(base_char)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = PUNCTUATION_INVERSED[unicode_symb] ) )

            place_consonant_among_subjc = False

        # other symbol :
        elif base_char in SAN__SYMB_OTHER_SYMBOLS:
            unicode_symb = SAN__SYMB_OTHER_SYMBOLS.get_default_symbol(base_char)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = OTHER_SYMBOLS_INVERSED[unicode_symb] ) )

            place_consonant_among_subjc = False

        # independent vowel:
        elif base_char in SAN__SYMB_INDEPENDENT_VOWELS:
            quality = dstring_object.options["san2bod quality"]

            #...............................................................
            # _independent_vowel will be added as an independent vowel :
            #...............................................................
            if quality in ("normal", "low") and base_char == 'O':
                #====================
                # @@BOD2SAN-NORM-004 / @@BOD2SAN-LOW-004
                # (independent vowel) ओ(ō) > औ(au)
                #====================
                _independent_vowel = "AU"

            elif quality == "low" and base_char in long_to_short_vowel:
                #====================
                # @@BOD2SAN-LOW-006
                # (independent vowel) long vowels > short vowels
                #====================
                _independent_vowel = long_to_short_vowel[base_char]

            else:
                _independent_vowel = base_char

            unicode_symb = SAN__SYMB_INDEPENDENT_VOWELS.get_default_symbol(_independent_vowel)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                consonant = "A",
                vowel1 = INDEPENDENT_VOWELS_INVERSED[unicode_symb] ) )

            place_consonant_among_subjc = False

        # consonant :
        elif base_char in SAN__SYMB_CONSONANTS:
            quality = dstring_object.options["san2bod quality"]

            if base_char == 'DEVANAGARI SIGN VISARGA':
                # special case : the visarga symbol is placed among consonants in Sanskrit,
                # among diacritics in Tibetan.
                #
                #====================
                # @@BOD2SAN-NORM-001 / @@BOD2SAN-LOW-001
                # the visarga is omitted if "san2bod quality" is "normal" or "low"
                #====================
                if quality not in ("normal", "low"):
                    # NOTE(review): istructs[-1] fails if the visarga is the very
                    # first character read; behaviour kept as it was.
                    istructs[-1].rnam_bcad = True

                    place_consonant_among_subjc = False

            elif not place_consonant_among_subjc:
                # consonant to be placed as a main consonant
                # (and not among subjoined consonants) :

                #...........................................................
                # _base_char will be added as a main consonant :
                #...........................................................
                if quality in ("normal", "low") and base_char == 'VA':
                    #====================
                    # @@BOD2SAN-NORM-002 / @@BOD2SAN-LOW-002
                    # the व(va) becomes ब(ba) if "san2bod quality" is "normal" or "low"
                    #====================
                    _base_char = "BA"

                elif quality == "low" and base_char in retroflex_to_plain:
                    #===================
                    # @@BOD2SAN-LOW-007
                    #===================
                    _base_char = retroflex_to_plain[base_char]

                elif quality == "low" and base_char in aspirated_to_plain:
                    #===================
                    # @@BOD2SAN-LOW-008
                    # the test uses the mapping itself, so that every consonant
                    # having a replacement ('DHA' included) is handled.
                    #===================
                    _base_char = aspirated_to_plain[base_char]

                else:
                    # general case :
                    _base_char = base_char

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(_base_char)
                bod_consonant = CONSONANTS_INVERSED[unicode_symb]

                istructs.append( InternalStructure( dstring_object = dstring_object,
                                                    consonant = bod_consonant ) )

                if dchar_san.virama:
                    place_consonant_among_subjc = True

            else:
                # consonant to be placed among subjoined consonants
                # (and not as a main consonant) :
                last_istruct = istructs[-1]
                if last_istruct.subfix is None:
                    last_istruct.subfix = []

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(base_char)
                cons = CONSONANTS_INVERSED[unicode_symb]

                if quality == "low" and \
                   last_istruct.subfix == [] and \
                   last_istruct.consonant == cons:
                    #===================
                    # @@BOD2SAN-LOW-008
                    # geminate consonant > 0
                    #===================
                    # no more subjoined consonant : the following ones will be
                    # treated like main consonants :
                    place_consonant_among_subjc = False

                else:
                    last_istruct.subfix.append( cons )

                    if not dchar_san.virama:
                        place_consonant_among_subjc = False

            # dependent vowel :
            dependentvowel = dchar_san.dependentvowel
            if dependentvowel is not None and dependentvowel != "A":

                #...........................................................
                # _dependent_vowel will be added as a dependent vowel :
                #...........................................................
                if quality in ("normal", "low") and dependentvowel == 'O':
                    #====================
                    # @@BOD2SAN-NORM-003 / @@BOD2SAN-LOW-003
                    # (dependent vowel) ओ(ō) > औ(au)
                    #====================
                    _dependent_vowel = "AU"

                elif quality == "low" and dependentvowel in long_to_short_vowel:
                    #====================
                    # @@BOD2SAN-LOW-005
                    # (dependent vowel) long vowels > short vowels
                    #====================
                    _dependent_vowel = long_to_short_vowel[dependentvowel]

                else:
                    _dependent_vowel = dependentvowel

                unicode_symb = \
                  SAN__SYMB_DEPENDENT_VOWELS.get_default_symbol(_dependent_vowel)

                istructs[-1].vowel1 = DEPENDENT_VOWELS_INVERSED[unicode_symb]

        # anusvara/candrabindu (never read for an unknown character) :
        if dchar_san.anusvara_candrabindu is not None:
            unicode_symb = \
              SAN__SYMB_DIACRITICS.get_default_symbol(dchar_san.anusvara_candrabindu)

            istructs[-1].anusvara_candrabindu = DIACRITICS_INVERSED[unicode_symb]

    res = ListOfInternalStructures(anonymize_the_unknown_chars = anonymize)

    # we add a tsheg after a "real" syllable (id est, not a punctuation sign, ...)
    for istruct in istructs:
        res.append(istruct)

        if istruct.consonant is not None:
            res.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = 'MARK INTERSYLLABIC TSHEG' ))

    return res
Esempio n. 3
0
    def init_from_str(self, str_src):
        """
                DStringSAN.init_from_str

                Fill <self> with the DCharacterSAN objects read from <str_src>.
                No value is explicitly returned.

                str_src : str

                Raise a DCharsError if one kind of diacritic (virama, accent,
                nukta, anusvara/candrabindu, anudatta) is counted more than
                once for the same character.

                HOW IT WORKS :
                * (1) str_src -> unicodedata.normalize('NFD',) -> text
                * (2) text -> replace_by_the_default_symbols() -> text
                * (3) every match of DStringSAN.pattern gives one character
                *     {basechar, dependentvowel, diacritics}; what lies between
                *     two matches is stored as unknown characters.
                *     (3.1) virama
                *     (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
                *     (3.3) accent
                *     (3.4) nukta
                *     (3.5) anusvara_candrabindu
                *     (3.6) anudatta
                *     (3.7) the new character is added
                * (4) the characters after the last match are stored as unknown
                *     characters.
        """
        #.......................................................................
        # (1) decomposition
        #.......................................................................
        text = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) only the default symbols are kept
        #.......................................................................
        for table in (SYMB_CONSONANTS,
                      SYMB_INDEPENDENT_VOWELS,
                      SYMB_DEPENDENT_VOWELS,
                      SYMB_DIACRITICS,
                      SYMB_PUNCTUATION):
            text = table.replace_by_the_default_symbols(text)

        def add_unknown_chars(first, stop):
            """
                    Append to <self> one "unknown" DCharacterSAN for every
                    index of <text> in range(first, stop).
            """
            for position in range(first, stop):
                unknown = DCharacterSAN(dstring_object = self,
                                        unknown_char = True,
                                        base_char = text[position])
                self.append( unknown )

        def check_defined_once(match, marks, symbols, err_msg):
            """
                    Raise a DCharsError built from <err_msg> if <symbols> are
                    counted more than once in <marks>.
            """
            if number_of_occurences( source_string = marks,
                                     symbols = symbols ) > 1:
                raise DCharsError( context = "DStringSAN.init_from_str",
                                   message = err_msg.format(match.string,
                                                            match.start(),
                                                            match.end()),)

        def first_name_found(marks, symbols):
            """
                    Return the name of the first symbol of <symbols> found in
                    <marks>, None if there's no such symbol.
            """
            for symbol in symbols:
                name = SYMB_DIACRITICS.defaultsymbol2name[symbol]
                if SYMB_DIACRITICS.are_these_symbols_in_a_string(name=name,
                                                                 string=marks):
                    return name
            return None

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #.......................................................................
        # index of the first character of <text> not yet stored in <self> :
        next_unread = 0

        for element in re.finditer(DStringSAN.pattern, text):

            # unknown character(s) lying before this match :
            add_unknown_chars(next_unread, element.start())
            next_unread = max(element.start(), element.end()-1) + 1

            groups = element.groupdict()
            base_char = groups['basechar']
            dependentvowel = groups['dependentvowel']
            diacritics = groups['diacritics']

            # symbols become names : e.g. base_char "क" becomes "KA"
            name_as_punctuation = SYMB_PUNCTUATION.get_the_name_for_this_symbol(base_char)
            name_as_other_symbol = SYMB_OTHER_SYMBOLS.get_the_name_for_this_symbol(base_char)
            name_as_consonant = SYMB_CONSONANTS.get_the_name_for_this_symbol(base_char)
            name_as_ivowel = SYMB_INDEPENDENT_VOWELS.get_the_name_for_this_symbol(base_char)
            name_as_dvowel = SYMB_DEPENDENT_VOWELS.get_the_name_for_this_symbol(dependentvowel)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) virama
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            virama = False
            if diacritics is not None:
                check_defined_once(
                    element, diacritics, SYMB_DIACRITICS__VIRAMA,
                    "In '{0}' (start={1}, end={2}), 'virama' defined several times.")

                virama = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN VIRAMA',
                                                                       diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) base_char, punctuation, dependentvowel, is_an_independent_vowel
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # default value, modified only if the character is an independent vowel :
            is_an_independent_vowel = False
            punctuation = name_as_punctuation is not None

            if punctuation:
                # punctuation symbol :
                base_char = name_as_punctuation

            elif name_as_other_symbol is not None:
                # "other symbol" : not punctuation nor consonant nor independent vowel :
                base_char = name_as_other_symbol

            elif name_as_consonant is not None:
                # consonant :
                base_char = name_as_consonant

                # a normal consonant (the visarga being a pseudo-consonant) written
                # without virama and without vowel symbol has 'A' as dependent
                # vowel : e.g. 'क' stands for 'ka', not for 'k'.
                implicit_a = base_char != 'DEVANAGARI SIGN VISARGA' and \
                             not virama and dependentvowel is None
                dependentvowel = "A" if implicit_a else name_as_dvowel

            else:
                # independent vowel :
                is_an_independent_vowel = True
                dependentvowel = None
                base_char = name_as_ivowel

            accent = None
            nukta = False
            anusvara_candrabindu = None
            anudatta = False
            if diacritics is not None:
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) accent
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(
                    element, diacritics, SYMB_DIACRITICS__ACCENTS,
                    "In '{0}' (start={1}, end={2}), 'accent' defined several times.")

                accent = first_name_found(diacritics, SYMB_DIACRITICS__ACCENTS)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.4) nukta
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(
                    element, diacritics, SYMB_DIACRITICS__NUKTA,
                    "In '{0}' (start={1}, end={2}), 'nukta' defined several times.")

                nukta = SYMB_DIACRITICS.are_these_symbols_in_a_string('DEVANAGARI SIGN NUKTA',
                                                                      diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.5) anusvara_candrabindu
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(
                    element, diacritics, SYMB_DIACRITICS__ANUSVARA_CANDRABINDU,
                    "In '{0}' (start={1}, end={2}), " \
                    "'anusvara_candrabindu' defined several times.")

                anusvara_candrabindu = first_name_found(diacritics,
                                                        SYMB_DIACRITICS__ANUSVARA_CANDRABINDU)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.6) anudatta
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(
                    element, diacritics, SYMB_DIACRITICS__ANUDATTA,
                    "In '{0}' (start={1}, end={2}), 'anudatta' defined several times.")

                anudatta = SYMB_DIACRITICS.are_these_symbols_in_a_string(
                    'DEVANAGARI STRESS SIGN ANUDATTA',
                    diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.7) the new character is added
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            self.append( DCharacterSAN(dstring_object = self,
                                       unknown_char = False,
                                       base_char = base_char,
                                       accent = accent,
                                       punctuation = punctuation,
                                       nukta = nukta,
                                       anusvara_candrabindu = anusvara_candrabindu,
                                       virama = virama,
                                       anudatta = anudatta,
                                       is_an_independent_vowel = is_an_independent_vowel,
                                       dependentvowel = dependentvowel) )

        #.......................................................................
        # (4) unknown character(s) after the last match; if nothing matched, the
        #     whole string is made of unknown characters.
        #.......................................................................
        add_unknown_chars(next_unread, len(text))