Beispiel #1
0
def get_intstruct_from_trans_str( _src, dstring_object ):
    """
        function get_intstruct_from_trans_str()

        Read <_src> through a Sanskrit DString object (created with the
        "iso15919" transliteration method) and convert every character read
        into InternalStructure objects.

        _src            : (str) source string like "क".
        dstring_object  : object whose .options dict is read here (keys
                          "anonymize the unknown characters" and
                          "san2bod quality"); it is also given to every
                          InternalStructure created by this function.

        Substitutions applied according to options["san2bod quality"] :

            "normal" :  visarga omitted, VA > BA (main consonant only),
                        O > AU (independent and dependent vowels).
            "low"    :  the "normal" rules, plus :
                        long vowels AA/II/UU > A/I/U,
                        retroflex main consonant > dental, without aspiration,
                        aspirated main consonant > unaspirated one,
                        geminate consonant > single consonant.
            any other value : no substitution; the visarga sets .rnam_bcad on
                        the last structure.

        NB : the visarga and the anusvara/candrabindu are stored on the last
        structure built so far (istructs[-1]); this function does not check
        that such a structure exists.

        Return a ListOfInternalStructures object where every structure having
        a consonant is followed by a 'MARK INTERSYLLABIC TSHEG' structure.
    """
    anonymize_the_unknown_chars = \
        dstring_object.options["anonymize the unknown characters"] == 'yes'
    quality = dstring_object.options["san2bod quality"]

    #...........................................................................
    # substitution tables selected by <quality>; a symbol absent from a table is
    # kept unchanged.
    #...........................................................................
    # vowels (the same rules are applied to independent and dependent vowels) :
    vowel_rules = {
        "normal" : {
            # @@BOD2SAN-NORM-003 (dependent), @@BOD2SAN-NORM-004 (independent)
            # ओ(ō) > औ(au)
            'O'     : 'AU',
        },
        "low" : {
            # @@BOD2SAN-LOW-003 (dependent), @@BOD2SAN-LOW-004 (independent)
            # ओ(ō) > औ(au)
            'O'     : 'AU',
            # @@BOD2SAN-LOW-005 (dependent), @@BOD2SAN-LOW-006 (independent)
            # long vowels > short vowels
            'AA'    : 'A',
            'II'    : 'I',
            'UU'    : 'U',
        },
    }.get(quality, {})

    # main consonants (subjoined consonants are not concerned) :
    main_consonant_rules = {
        "normal" : {
            # @@BOD2SAN-NORM-002 : व(va) > ब(ba)
            'VA'    : 'BA',
        },
        "low" : {
            # @@BOD2SAN-LOW-002 : व(va) > ब(ba)
            'VA'    : 'BA',
            # @@BOD2SAN-LOW-007
            # retroflex consonant > non-retroflex consonant
            # retroflex consonant + aspiration > non-retroflex consonant
            # without aspiration
            'TTA'   : 'TA',
            'TTHA'  : 'TA',
            'DDA'   : 'DA',
            'DDHA'  : 'DA',
            'NNA'   : 'NA',
            # @@BOD2SAN-LOW-008
            # consonant + aspiration > consonant without aspiration
            # ('DHA' was in the original mapping but missing from the test
            #  guarding it, so it was never de-aspirated.)
            'KHA'   : 'KA',
            'GHA'   : 'GA',
            'THA'   : 'TA',
            'CHA'   : 'CA',
            'JHA'   : 'JA',
            'DHA'   : 'DA',
            'PHA'   : 'PA',
            'BHA'   : 'BA',
        },
    }.get(quality, {})

    # list of InternalStructure objects.
    istructs = ListOfInternalStructures(
        anonymize_the_unknown_chars = anonymize_the_unknown_chars)

    # we read <_src> through a DSTRING_SAN object :
    dstring_san = new_dstring(language='संस्कृतम्',
                              transliteration_method="iso15919")
    dstring_san = dstring_san(_src)

    # In Sanskrit, if a consonant is followed by a virama, it means that the following
    # consonants are part of a cluster of consonants.
    #
    # E.g. in कर्म (0915=ka, 0930=ra, 094D=virama, 092E=ma) we have something like kar+ma,
    # the -m- having no vowel.
    #
    place_consonant_among_subjc = False

    for dchar_san in dstring_san:

        if dchar_san.unknown_char:
            istructs.append( InternalStructure( dstring_object = dstring_object,
                                                unknown_character = True ))
            # nothing else is read from an unknown character :
            continue

        base_char = dchar_san.base_char

        # punctuation symbol :
        if base_char in SAN__SYMB_PUNCTUATION:
            unicode_symb = SAN__SYMB_PUNCTUATION.get_default_symbol(base_char)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = PUNCTUATION_INVERSED[unicode_symb] ))

            place_consonant_among_subjc = False

        # other symbol :
        elif base_char in SAN__SYMB_OTHER_SYMBOLS:
            unicode_symb = SAN__SYMB_OTHER_SYMBOLS.get_default_symbol(base_char)
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = OTHER_SYMBOLS_INVERSED[unicode_symb] ))

            place_consonant_among_subjc = False

        # independent vowel :
        elif base_char in SAN__SYMB_INDEPENDENT_VOWELS:
            _independent_vowel = vowel_rules.get(base_char, base_char)

            unicode_symb = \
                SAN__SYMB_INDEPENDENT_VOWELS.get_default_symbol(_independent_vowel)
            # the vowel is carried by the consonant "A" :
            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                consonant = "A",
                vowel1 = INDEPENDENT_VOWELS_INVERSED[unicode_symb] ))

            place_consonant_among_subjc = False

        # consonant :
        elif base_char in SAN__SYMB_CONSONANTS:

            if base_char == 'DEVANAGARI SIGN VISARGA':
                # special case : the visarga symbol is placed among consonants in Sanskrit,
                # among diacritics in Tibetan.
                #
                # @@BOD2SAN-NORM-001 / @@BOD2SAN-LOW-001 :
                # the visarga is omitted if "san2bod quality" is "normal" or "low";
                # in that case the cluster flag is left untouched.
                if quality not in ("normal", "low"):
                    istructs[-1].rnam_bcad = True
                    place_consonant_among_subjc = False

            elif not place_consonant_among_subjc:
                # consonant to be placed as a main consonant
                # (and not among subjoined consonants) :
                _base_char = main_consonant_rules.get(base_char, base_char)

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(_base_char)
                istructs.append( InternalStructure(
                    dstring_object = dstring_object,
                    consonant = CONSONANTS_INVERSED[unicode_symb] ))

                # a virama opens a cluster : the next consonant(s) will be
                # subjoined to this one.
                if dchar_san.virama:
                    place_consonant_among_subjc = True

            else:
                # consonant to be placed among subjoined consonants
                # (and not as a main consonant) :
                if istructs[-1].subfix is None:
                    istructs[-1].subfix = []

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(base_char)
                cons = CONSONANTS_INVERSED[unicode_symb]

                if quality == "low" and \
                   istructs[-1].subfix == [] and \
                   istructs[-1].consonant == cons:
                    #===================
                    # @@BOD2SAN-LOW-008 (tag as written in the original source)
                    # geminate consonant > 0
                    #===================
                    # the consonant is dropped and there is no more subjoined
                    # consonant : the following ones will be treated like
                    # main consonants.
                    place_consonant_among_subjc = False

                else:
                    istructs[-1].subfix.append( cons )

                    # without a virama, the cluster ends here :
                    if not dchar_san.virama:
                        place_consonant_among_subjc = False

            # dependent vowel (the inherent "A" is not written) :
            if dchar_san.dependentvowel is not None and dchar_san.dependentvowel != "A":
                _dependent_vowel = vowel_rules.get(dchar_san.dependentvowel,
                                                   dchar_san.dependentvowel)

                unicode_symb = \
                    SAN__SYMB_DEPENDENT_VOWELS.get_default_symbol(_dependent_vowel)

                istructs[-1].vowel1 = DEPENDENT_VOWELS_INVERSED[unicode_symb]

        # anusvara/candrabindu (for every known character, whatever its kind) :
        if dchar_san.anusvara_candrabindu is not None:
            unicode_symb = \
                SAN__SYMB_DIACRITICS.get_default_symbol(dchar_san.anusvara_candrabindu)

            istructs[-1].anusvara_candrabindu = DIACRITICS_INVERSED[unicode_symb]

    res = ListOfInternalStructures(
        anonymize_the_unknown_chars = anonymize_the_unknown_chars)

    # we add a tsheg after a "real" syllable (id est, not a punctuation sign, ...)
    for istruct in istructs:
        res.append(istruct)

        if istruct.consonant is not None:
            res.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = 'MARK INTERSYLLABIC TSHEG' ))

    return res
Beispiel #2
0
def get_intstruct_from_trans_str( _src, dstring_object ):
    """
        function get_intstruct_from_trans_str()

        _src    : (str) transliterated string like "ka".

        Return a ListOfInternalStructures object.

        This function CAN BE VERY SLOW on big <_src>.

        arguments : list of argument. For this function :
                arguments = [ (str)source string,
                              (bool)expected_structure,
                              (bool)look_up_in_the_buffers,
                              (bool)fill_the_buffers ]
    """

    expected_structure = dstring_object.options["expected structure"]
    look_up_in_the_buffers = dstring_object.options["look up in the buffers"] == 'yes'
    fill_the_buffers = dstring_object.options["fill the buffers"] == 'yes'
    anonymize_the_unknown_chars = \
                                 dstring_object.options["anonymize the unknown characters"] == 'yes'

    if len(_src) == 0:
        return ListOfInternalStructures(
            anonymize_the_unknown_chars = anonymize_the_unknown_chars)

    #...........................................................................
    # the quickest way to answer is to look in the buffer :
    #...........................................................................
    if expected_structure == 'Tibetan or Sanskrit' and look_up_in_the_buffers:
        if _src in ewts_buffer.EWTS_BUFFER__FROM_TRANS_STR:
            return ListOfInternalStructures(
                anonymize_the_unknown_chars = \
                anonymize_the_unknown_chars).init_from_pickle_repr(
                  src = ewts_buffer.EWTS_BUFFER__FROM_TRANS_STR[_src],
                  dstring_object = dstring_object)

    #...........................................................................
    # the big loop
    #...........................................................................
    # list of InternalStructure objects.
    istructs = ListOfInternalStructures(
        anonymize_the_unknown_chars=anonymize_the_unknown_chars)
    # we add an empty istruct to create a starting-point for the
    # big loop (for istruct in istructs, see below) :
    istructs.append( InternalStructure(dstring_object = None) )

    # <real_indexes> are defined from the source string, character by character but
    # <indexes> are defined from the string as it appeared to the regex :
    #
    #        E.g. for the transliterated string "²nya²" (with 2 unknown characters ) :
    #              real_indexes : /²/n/y/a/²/ =0,1,2,3,4
    #              indexes :      /²/ny/a/²/  =0,1,2,3
    #
    for index_char, char in enumerate(re.finditer( TRANS_PATTERN, _src )):

        real_indexes = range(char.start(), char.end())

        # None / (str) '.' or '+'
        dotpointorplus = char.group('dotpointorplus')
        # None / (str)
        base_char = char.group('base_char')
        # (bool)
        halanta = char.group('halanta') is not None
        # None / (str)
        anusvara_candrabindu = char.group('anusvara_candrabindu')
        # (bool)
        rnam_bcad = char.group('rnam_bcad') is not None


        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # initialization of <future_istructs> from <char> :
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        future_istructs = ListOfInternalStructures(
            anonymize_the_unknown_chars=anonymize_the_unknown_chars)

        if base_char in PUNCTUATION_INVERSED:
            # we add a new internal structure :
            new_istruct = InternalStructure(
                dstring_object = None,
                punctuation_or_other_symbol = PUNCTUATION_INVERSED[base_char])
            future_istructs.append( new_istruct )
            future_istructs[-1].indexes.add( index_char )
            future_istructs[-1].real_indexes.update( real_indexes  )

        elif base_char in OTHER_SYMBOLS_INVERSED:
            # we add a new internal structure :
            new_istruct = InternalStructure(
                dstring_object = None,
                punctuation_or_other_symbol = OTHER_SYMBOLS_INVERSED[base_char])
            future_istructs.append( new_istruct )
            future_istructs[-1].indexes.add( index_char )
            future_istructs[-1].real_indexes.update( real_indexes  )

        else:

            for index_istruct, istruct in enumerate(istructs):

                # we don't want to complete an unknown character, a punctuation
                # symbol or something equivalent (an other symbol) :
                if not istruct.unknown_character and \
                   istruct.punctuation_or_other_symbol is None:

                    # so we have something to complete :

                    # vowel :
                    if base_char in VOWELS_INVERSED and \
                       istruct.indexes_are_contiguous_to( [index_char,] ) and \
                       istruct.real_indexes_are_contiguous_to(real_indexes):

                        # we add a vowel1 to the current istruct :
                        if istruct.vowel1 is None and \
                           istruct.indexes_are_contiguous_to( [index_char,] ) and \
                           istruct.real_indexes_are_contiguous_to(real_indexes):

                            future_istructs.append( deepcopy(istruct) )

                            # we add the default consonant supporting vowel :
                            if future_istructs[-1].prefix is None and \
                               future_istructs[-1].consonant is None:

                                future_istructs[-1].consonant = "A"

                            future_istructs[-1].vowel1 = VOWELS_INVERSED[base_char]
                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                            if anusvara_candrabindu is not None:

                                future_istructs[-1].anusvara_candrabindu = \
                                   DIACRITICS_INVERSED[anusvara_candrabindu]

                            if rnam_bcad:

                                future_istructs[-1].rnam_bcad = True

                        # we add a vowel2 to the current istruct :
                        if istruct.vowel1 is not None and \
                           dotpointorplus == '+' and istruct.vowel2 is None and \
                           istruct.indexes_are_contiguous_to( [index_char,] ) and \
                           istruct.real_indexes_are_contiguous_to(real_indexes):

                            future_istructs.append( deepcopy(istruct) )
                            future_istructs[-1].vowel2 = VOWELS_INVERSED[base_char]
                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                            # a "rnam bcad" symbol can follow a vowel, as in
                            # "གཏིཿ"="gtiH"
                            if rnam_bcad:
                                future_istructs[-1].rnam_bcad = True

                            # a anusvara/candrabindu symbol may follow a vowel :
                            if anusvara_candrabindu is not None:
                                future_istructs[-1].anusvara_candrabindu = \
                                  DIACRITICS_INVERSED[anusvara_candrabindu]

                        # we add postsuffix 'u to the current istruct :
                        if base_char == VOWELS['U'] and \
                           istruct.consonant is not None and \
                           istruct.vowel1 is not None and \
                           istruct.suffix1 == "-" and \
                           istruct.suffix2 is None and \
                           not istruct.postsuffix_u and \
                           istruct.indexes_are_contiguous_to( [index_char,] ) and \
                           istruct.real_indexes_are_contiguous_to(real_indexes):

                            future_istructs.append( deepcopy(istruct) )
                            future_istructs[-1].suffix1 = None
                            future_istructs[-1].postsuffix_u = True
                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                    # consonant :
                    elif base_char in CONSONANTS_INVERSED and \
                         istruct.indexes_are_contiguous_to( [index_char,] ) and \
                         istruct.real_indexes_are_contiguous_to(real_indexes):

                        # we add a prefix to the current syllable :
                        if dotpointorplus is None and \
                           not rnam_bcad and \
                           anusvara_candrabindu is None and \
                           istruct.prefix is None and \
                           istruct.superfix is None and \
                           istruct.consonant is None and \
                           istruct.vowel1 is None and \
                           CONSONANTS_INVERSED[base_char] in PREFIXES:

                            future_istructs.append( deepcopy(istruct) )
                            future_istructs[-1].prefix = CONSONANTS_INVERSED[base_char]
                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                        # we add a superfix to the current syllable :
                        if dotpointorplus is None and \
                           not rnam_bcad and \
                           anusvara_candrabindu is None and \
                           istruct.superfix is None and \
                           istruct.consonant is None and \
                           istruct.vowel1 is None and \
                           CONSONANTS_INVERSED[base_char] in SUPERFIXES:

                            future_istructs.append( deepcopy(istruct) )
                            future_istructs[-1].superfix = CONSONANTS_INVERSED[base_char]
                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                        # we add a main consonant to the current syllable :
                        if istruct.consonant is None and \
                           istruct.vowel1 is None and \
                           istruct.subfix is None and \
                           dotpointorplus != '+':
                            future_istructs.append( deepcopy(istruct) )
                            future_istructs[-1].consonant = CONSONANTS_INVERSED[base_char]
                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                            # a anusvara/candrabindu symbol may follow a consonant :
                            if anusvara_candrabindu is not None:
                                future_istructs[-1].anusvara_candrabindu = \
                                  DIACRITICS_INVERSED[anusvara_candrabindu]

                            # a "rnam bcad" symbol can follow a consonant, as in
                            # གྲུཌཿ=gruDH.
                            if rnam_bcad:
                                future_istructs[-1].rnam_bcad = True

                            if halanta:
                                future_istructs[-1].halanta = True

                        # we add a subjoined consonant to the current syllable :

                        # let's initialize <part_of_a_common_cons_stack> :
                        cons = []
                        if istruct.consonant is not None:
                            cons.append( istruct.consonant )
                        if istruct.subfix is not None:
                            cons.extend( istruct.subfix )
                        cons.append( CONSONANTS_INVERSED[base_char] )

                        part_of_a_common_cons_stack = tuple(cons) in COMMON_CONSONANTS_STACK

                        if dotpointorplus == '+' and \
                           istruct.consonant is not None and \
                           istruct.suffix1 is None:

                            future_istructs.append( deepcopy(istruct) )

                            if future_istructs[-1].subfix is None:
                                future_istructs[-1].subfix = []
                            future_istructs[-1].subfix.append( CONSONANTS_INVERSED[base_char] )
                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                        elif dotpointorplus is None and \
                             istruct.suffix1 is None and \
                             part_of_a_common_cons_stack and \
                             istruct.consonant is not None and \
                             istruct.vowel1 is None:

                            future_istructs.append( deepcopy(istruct) )
                            if future_istructs[-1].subfix is None:
                                future_istructs[-1].subfix = []
                            future_istructs[-1].subfix.append( CONSONANTS_INVERSED[base_char] )
                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                        # we add a suffix-1 to the current syllable :
                        if dotpointorplus is None and \
                           istruct.consonant is not None and \
                           istruct.vowel1 is not None and \
                           istruct.suffix1 is None and \
                           CONSONANTS_INVERSED[base_char] in SUFFIXES1 and \
                           not istruct.postsuffix_u:
                            future_istructs.append( deepcopy(istruct) )
                            future_istructs[-1].suffix1 = CONSONANTS_INVERSED[base_char]

                            # is the suffix has a rnam_bcad/anusvara_candrabindu symbol,
                            # the future istruct gets this diacritic sign :
                            # (e.g. ལབཿ labH where -bH is a suffix)
                            if anusvara_candrabindu is not None:
                                future_istructs[-1].anusvara_candrabindu = \
                                  DIACRITICS_INVERSED[anusvara_candrabindu]
                            if rnam_bcad:
                                future_istructs[-1].rnam_bcad = True

                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                        # we add a suffix-2 to the current syllable :
                        if dotpointorplus is None and \
                           istruct.consonant is not None and \
                           istruct.vowel1 is not None and \
                           istruct.suffix1 is not None and \
                           istruct.suffix2 is None and \
                           not istruct.postsuffix_u and \
                           CONSONANTS_INVERSED[base_char] in SUFFIXES2:
                            future_istructs.append( deepcopy(istruct) )
                            future_istructs[-1].suffix2 = CONSONANTS_INVERSED[base_char]

                            # is the suffix has a rnam_bcad/anusvara_candrabindu symbol,
                            # the future istruct gets this diacritic sign :
                            if anusvara_candrabindu is not None:
                                future_istructs[-1].anusvara_candrabindu = \
                                  DIACRITICS_INVERSED[anusvara_candrabindu]
                            if rnam_bcad:
                                future_istructs[-1].rnam_bcad = True

                            future_istructs[-1].indexes.add( index_char )
                            future_istructs[-1].real_indexes.update( real_indexes  )

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # we have to keep the current istruct as one of the future istructs :
                #
                # E.g., let read the word "khi'is", id est khi + suffix 'is.
                # The program will read khi'i as 'i is a valid suffix so we have to
                # keep the word khi in memory; the program will read 'is as an
                # isolated word and will join "khi" and "'is" later.
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                future_istructs.append( deepcopy((istruct) ))

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # <istructs> += <future_istructs>  :
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        istructs += future_istructs

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # we clean the doublets :
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # doublets :
        for index_istruct, istruct in enumerate(istructs):
            for index_istruct2, istruct2 in enumerate(istructs):

                if index_istruct != index_istruct2 and \
                   not istruct.bad_internalstruct and \
                   not istruct2.bad_internalstruct:

                    if istruct.is_identical_to(istruct2):
                        istruct2.bad_internalstruct = True

        istructs.clean_off_bad_internalstructs()

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # postsuffixes འིས ('is), འམ ('am), འང ('ang), འི ('i),
        #
        # E.g. with འིས ('is) :
        # if we found an istruct equivalent to འིས ('is) we try to find another istruct which
        # could take 'is as a postsuffix (in gramm_postsuffix)
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        new_istructs = []
        for postsuffix_name, consonant, vowel, suffix1 in (
                ("'i",  '-', "I", None),
                ("'is", '-', "I", 'S'),
                ("'am", '-', "A", 'M'),
                ("'ang",'-', "A", 'NG'),
                ):

            for index1, istruct1 in enumerate(istructs):

                if istruct1.prefix is None and \
                   istruct1.consonant == consonant and \
                   istruct1.vowel1 == vowel and \
                   istruct1.vowel2 is None and \
                   istruct1.subfix is None and \
                   istruct1.suffix1 == suffix1 and \
                   istruct1.suffix2 is None and \
                   not istruct1.postsuffix_u and \
                   istruct1.gramm_postsuffix is None:

                    # we have istruct1 as an istruct equivalent to 'is/'am/... ; let's try to find
                    # istruct0 as an istruct which could take 'is/'am/... as a postsuffix :
                    #
                    # NB :
                    # (a) we don't want to modify an istruct without indexes
                    #     [ len(istruct0.indexes)>0 ]
                    # (b) we don't want to analyse old istructs, hence the condition :
                    #      (index_char in istruct0.indexes or index_char in istruct1.indexes)
                    # (c) we have to check if istruct0 is placed just before istruct1
                    #     [ calls to indexes_are_contiguous_to() and
                    #       real_indexes_are_contiguous_to() ]
                    for index0, istruct0 in enumerate(istructs):

                        if index0 != index1 and \
                           (index_char in istruct0.indexes or index_char in istruct1.indexes) and \
                           len(istruct0.indexes)>0 and \
                           not istruct0.unknown_character and \
                           istruct0.punctuation_or_other_symbol is None and \
                           istruct0.suffix1 is None and \
                           istruct0.suffix2 is None and \
                           istruct0.gramm_postsuffix is None and \
                           istruct0.indexes_are_contiguous_to( istruct1.indexes ) and \
                           istruct0.real_indexes_are_contiguous_to( istruct1.real_indexes ):

                            # ok, istruct0 will take 'is/'am/... as a postsuffix :
                            new_istruct = deepcopy( istruct0 )
                            new_istruct.gramm_postsuffix = postsuffix_name
                            new_istruct.indexes.update( istruct1.indexes )
                            new_istruct.real_indexes.update( istruct1.real_indexes )
                            new_istructs.append( new_istruct )

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # postsuffix འོ ('o)
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        for index1, istruct1 in enumerate(istructs):

            if istruct1.prefix is None and \
               istruct1.consonant == "-" and \
               istruct1.vowel1 == "O" and \
               istruct1.vowel2 is None and \
               istruct1.subfix is None and \
               istruct1.suffix1 is None and \
               istruct1.suffix2 is None and \
               not istruct1.postsuffix_u and \
               istruct1.gramm_postsuffix is None:

                # we have istruct1 as an istruct equivalent to 'o; let's try to find
                # istruct0 as an istruct which could take 'o as a postsuffix :
                #
                # NB :
                # (a) we don't want to modify an istruct without indexes [ len(istruct0.indexes)>0 ]
                # (b) we have to check if istruct0 is placed just before istruct1
                #     [ calls to indexes_are_contiguous_to() and real_indexes_are_contiguous_to() ]
                for index0, istruct0 in enumerate(istructs):

                    if index0 != index1 and \
                       (index_char in istruct0.indexes or index_char in istruct1.indexes) and \
                       len(istruct0.indexes)>0 and \
                       not istruct0.unknown_character and \
                       istruct0.punctuation_or_other_symbol is None and \
                       istruct0.suffix1 is None and \
                       istruct0.suffix2 is None and \
                       not istruct0.postsuffix_o and \
                       istruct0.indexes_are_contiguous_to( istruct1.indexes ) and \
                       istruct0.real_indexes_are_contiguous_to( istruct1.real_indexes ):

                        # ok, istruct0 will take 'o as a postsuffix :
                        new_istruct = deepcopy( istruct0 )
                        new_istruct.postsuffix_o = True
                        new_istruct.indexes.update( istruct1.indexes )
                        new_istruct.real_indexes.update( istruct1.real_indexes )
                        new_istructs.append( new_istruct )

        istructs += new_istructs

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # (W) we clean the wrong istructs
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:

        # an istruct without indexes ? bad istruct :
        if len(istruct.indexes) ==  0:
            istruct.bad_internalstruct = True

        # a prefix without a main consonant ? bad istruct :
        if istruct.prefix is not None and istruct.consonant is None:
            istruct.bad_internalstruct = True
        # a suffix without a main consonant ? bad istruct :
        if istruct.suffix1 is not None and istruct.consonant is None:
            istruct.bad_internalstruct = True
        # a suffix2 without a suffix1 ? bad istruct :
        if istruct.suffix2 is not None and istruct.suffix1 is None:
            istruct.bad_internalstruct = True

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # (W.2) we clean the equivalent istructs
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for index1, istruct1 in enumerate(istructs):
        for index2, istruct2 in enumerate(istructs):

            if index1 != index2 and \
               not istruct1.bad_internalstruct and \
               not istruct2.bad_internalstruct and \
               istruct1.indexes == istruct2.indexes:

                #  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
                # (a) Equivalent istructs :
                #
                # prefix=STR1; superfix=0; consonant=STR2; subfix=None
                # prefix=None; superfix=0; consonant=STR1; subfix=[STR2,]
                #
                # ... we keep the second istruct as the most natural.
                #  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
                for index_x, index_y in ( (index1, index2),
                                          (index2, index1), ):

                    if istructs[index_x].indexes == istructs[index_y].indexes and \
                       istructs[index_x].real_indexes == istructs[index_y].real_indexes and \
                       istructs[index_x].superfix is None and \
                       istructs[index_x].superfix == istructs[index_y].superfix and \
                       istructs[index_x].prefix == istructs[index_y].consonant and \
                       istructs[index_y].subfix == [istructs[index_x].consonant, ]:

                        istructs[index_x].bad_internalstruct = True

                #  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
                # (b) Equivalent istructs :
                #
                # prefix=STR1; superfix=STR2; consonant=STR3; subfix=[STR4, ...]
                # prefix=STR1; superfix=0;    consonant=STR2; subfix=[STR3, STR4, ...]
                #
                # ... we keep the first istruct as the most natural.
                #  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
                for index_x, index_y in ( (index1, index2),
                                          (index2, index1), ):

                    if istructs[index_x].indexes == istructs[index_y].indexes and \
                       istructs[index_x].real_indexes == istructs[index_y].real_indexes and \
                       istructs[index_x].prefix == istructs[index_y].prefix and \
                       istructs[index_x].superfix == istructs[index_y].consonant and \
                       istructs[index_y].superfix is None and \
                       istructs[index_x].subfix is not None and \
                       istructs[index_y].subfix is not None and \
                       len(istructs[index_y].subfix) >= 2 and \
                       istructs[index_x].consonant == istructs[index_y].subfix[0] and \
                       istructs[index_x].subfix[0] == istructs[index_y].subfix[1]:

                        istructs[index_y].bad_internalstruct = True

                #  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
                # (c) Equivalent istructs :
                #
                # prefix=0; superfix=STR1; consonant=STR2; subfix = [...]
                # prefix=0; superfix=0;    consonant=STR1; subfix = [STR2, ...]
                #
                # ... we keep the first istruct as the most natural.
                #  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
                for index_x, index_y in ( (index1, index2),
                                          (index2, index1), ):

                    if istructs[index_x].indexes == istructs[index_y].indexes and \
                       istructs[index_x].real_indexes == istructs[index_y].real_indexes and \
                       istructs[index_x].prefix is None and \
                       istructs[index_x].prefix == istructs[index_y].prefix and \
                       istructs[index_x].superfix == istructs[index_y].consonant and \
                       istructs[index_y].superfix is None and \
                       istructs[index_y].subfix is not None and \
                       istructs[index_x].consonant == istructs[index_y].subfix[0]:

                        istructs[index_y].bad_internalstruct = True

                #  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
                # (d) Equivalent istructs :
                #
                # prefix=STR1; superfix=STR2; consonant=STR3;
                # prefix=STR1; superfix=0;    consonant=STR2; subfix=[STR3, ...]
                #
                # ... we keep the first istruct as the most natural.
                #  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
                for index_x, index_y in ( (index1, index2),
                                          (index2, index1), ):

                    if istructs[index_x].indexes == istructs[index_y].indexes and \
                       istructs[index_x].real_indexes == istructs[index_y].real_indexes and \
                       istructs[index_x].prefix == istructs[index_y].prefix and \
                       istructs[index_x].superfix == istructs[index_y].consonant and \
                       istructs[index_y].superfix is None and \
                       istructs[index_y].subfix is not None and \
                       istructs[index_x].consonant == istructs[index_y].subfix[0]:

                        istructs[index_y].bad_internalstruct = True

    istructs.clean_off_bad_internalstructs()

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # (W.3) special case : if a syllable is equivalent to "oM" it's not
    #       VOWEL=O+CANDRABINDU(=RJES SU NGA RO), it's simply the
    #       symbol oM.
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:
        if not istruct.unknown_character and \
           istruct.check_default_value(consonant = "A",
                                       vowel1 = 'O',
                                       anusvara_candrabindu = 'SIGN RJES SU NGA RO'):

            istruct.punctuation_or_other_symbol = 'SYLLABLE OM'
            istruct.anusvara_candrabindu = None
            istruct.vowel1 = None
            istruct.consonant = None

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # (W.4) special case : if a syllable contains only a superfix without prefix
    #       or consonant we treat this superfix as the (main) consonant.
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:
        if not istruct.unknown_character and \
           istruct.prefix is None and \
           istruct.superfix is not None and \
           istruct.consonant is None:

            istruct.consonant = istruct.superfix
            istruct.superfix = None

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    #
    # "sra" : (consonant)S + (subfix)R [@@BOD-INTERNALSTRUCTURE-001]
    #
    # (W.4.1) special case : prefix=0, superfix='S', consonant='R', subfix=[...]
    #       must be treated as if we have :
    #                      prefix=0, superfix=0,   consonant='S', subfix=['R', ...]
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:
        if not istruct.unknown_character and \
           istruct.prefix is None and \
           istruct.superfix == 'S' and \
           istruct.consonant == 'R':

            istruct.superfix = None
            istruct.consonant = 'S'
            if istruct.subfix is None:
                istruct.subfix = ['R',]
            else:
                istruct.subfix.insert(0, "R")

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    #
    # "rla" : (consonant)R + (subfix)L [@@BOD-INTERNALSTRUCTURE-002]
    #
    # (W.4.2) special case : prefix=0, superfix='R', consonant='L', subfix=[...]
    #       must be treated as if we have :
    #                      prefix=0, superfix=0,   consonant='R', subfix=['L', ...]
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:
        if not istruct.unknown_character and \
           istruct.prefix is None and \
           istruct.superfix == 'R' and \
           istruct.consonant == 'L':

            istruct.superfix = None
            istruct.consonant = 'R'
            if istruct.subfix is None:
                istruct.subfix = ['L',]
            else:
                istruct.subfix.insert(0, "L")

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    #
    # "sla" : (consonant)S + (subfix)L [@@BOD-INTERNALSTRUCTURE-003]
    #
    # (W.4.3) special case : prefix=0, superfix='S', consonant='L', subfix=[...]
    #       must be treated as if we have :
    #                      prefix=0, superfix=0,   consonant='S', subfix=['L', ...]
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:
        if not istruct.unknown_character and \
           istruct.prefix is None and \
           istruct.superfix == 'S' and \
           istruct.consonant == 'L':

            istruct.superfix = None
            istruct.consonant = 'S'
            if istruct.subfix is None:
                istruct.subfix = ['L',]
            else:
                istruct.subfix.insert(0, "L")

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    #
    # "rwa" : (consonant)R + (subfix)W [@@BOD-INTERNALSTRUCTURE-004]
    #
    # (W.4.4) special case : prefix=0, superfix='R', consonant='W', subfix=[...]
    #       must be treated as if we have :
    #                      prefix=0, superfix=0,   consonant='R', subfix=['W', ...]
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:
        if not istruct.unknown_character and \
           istruct.prefix is None and \
           istruct.superfix == 'R' and \
           istruct.consonant == 'W':

            istruct.superfix = None
            istruct.consonant = 'R'
            if istruct.subfix is None:
                istruct.subfix = ['W',]
            else:
                istruct.subfix.insert(0, "W")

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    #
    # "lwa" : (consonant)L + (subfix)W [@@BOD-INTERNALSTRUCTURE-005]
    #
    # (W.4.5) special case : prefix=0, superfix='L', consonant='W', subfix=[...]
    #       must be treated as if we have :
    #                      prefix=0, superfix=0,   consonant='L', subfix=['W', ...]
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:
        if not istruct.unknown_character and \
           istruct.prefix is None and \
           istruct.superfix == 'L' and \
           istruct.consonant == 'W':

            istruct.superfix = None
            istruct.consonant = 'L'
            if istruct.subfix is None:
                istruct.subfix = ['W',]
            else:
                istruct.subfix.insert(0, "W")

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    #
    # "swa" : (consonant)S + (subfix)W [@@BOD-INTERNALSTRUCTURE-006]
    #
    # (W.4.6) special case : prefix=0, superfix='S', consonant='W', subfix=[...]
    #       must be treated as if we have :
    #                      prefix=0, superfix=0,   consonant='S', subfix=['W', ...]
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    for istruct in istructs:
        if not istruct.unknown_character and \
           istruct.prefix is None and \
           istruct.superfix == 'S' and \
           istruct.consonant == 'W':

            istruct.superfix = None
            istruct.consonant = 'S'
            if istruct.subfix is None:
                istruct.subfix = ['W',]
            else:
                istruct.subfix.insert(0, "W")

    #...........................................................................
    # istructs -> istructs
    #...........................................................................
    # we add the unknown characters, id est we add an istruct object linked to
    # every index not covered by the istructs.
    #...........................................................................
    real_indexes_ok = set()
    for istruct in istructs:
        real_indexes_ok.update( istruct.real_indexes )

    for real_index in range(0, len(_src)):
        if real_index not in real_indexes_ok:
            istructs.append ( InternalStructure( dstring_object = None,
                                                 unknown_character = True,
                                                 punctuation_or_other_symbol = _src[real_index],
                                                 real_indexes = OrderedSet( [real_index,]) ))

    #...........................................................................
    # istructs ---> istructs.get_the_complete_records() ----> res
    #...........................................................................
    complete_records = istructs.get_the_complete_records( last_index = len(_src)-1,
                                                          use_real_indexes = True )

    if len(complete_records) != 1:
        msg = "Zero or more than one lists of istructs describe the source string : "
        raise DCharsError( context = "ewts.py::get_intstruct_from_str()",
                           message = msg+str(complete_records) )

    res = ListOfInternalStructures(
        anonymize_the_unknown_chars=anonymize_the_unknown_chars)
    for index in complete_records[0]:
        res.append( istructs[index] )

    #...........................................................................
    # buffering ?
    #...........................................................................
    if fill_the_buffers and \
       _src not in ewts_buffer.EWTS_BUFFER__FROM_TRANS_STR and \
       not res.contains_unknown_characters():
        ewts_buffer.EWTS_BUFFER__FROM_TRANS_STR[_src] = res.pickle_repr()

    #...........................................................................
    # we can set the .dstring_object attribute :
    #...........................................................................
    for istruct in res:
        istruct.dstring_object = dstring_object

    return res