Esempio n. 1
0
    def init_from_str(self, str_src):
        """
                DStringGRC.init_from_str

                Fill <self> with the characters read in <str_src>. According to
                the historical documentation of this function, it is called by
                __init__().

                PARAMETER :
                o  (str) str_src : the source string to be analysed.

                RETURN VALUE : None. Every character read (recognized or not)
                               is appended to <self>.

                EXCEPTION : DCharsError is raised if, for one symbol, one kind
                            of diacritic (τόνος, μῆκος, πνεῦμα, ὑπογεγραμμένη,
                            διαλυτικά) is counted more than once.

                HOW IT WORKS :
                * (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
                * (2) = normalized_src -> (default symbols required) :
                *     replace_by_the_default_symbols() -> normalized_src
                * (3) initialisation from the recognized characters.
                *     re.finditer(DStringGRC.pattern) give the symbols{letter+diacritics}
                *     Every character lying between two matches (or before the
                *     first one, or after the last one) is added as an unknown
                *     character.
                *     (3.1) base_char
                *     (3.2) contextual_form
                *     (3.3) tonos (τόνος)
                *     (3.4) mekos (μῆκος)
                *     (3.5) pneuma (πνεῦμα)
                *     (3.6) hypogegrammene (ὑπογεγραμμένη)
                *     (3.7) dialutika (διαλυτικά)
                *     (3.8) we add the new character
        """

        def append_unknown_chars(text, start, stop):
            """
                    Append to <self> one unknown character for each index in
                    range(start, stop) of <text>. Nothing is done if the range
                    is empty.
            """
            for index in range(start, stop):
                self.append( DCharacterGRC(dstring_object = self,
                                           unknown_char = True,
                                           base_char = text[index]) )

        def check_defined_once(element, diacritics, symbols, name):
            """
                    Raise DCharsError if number_of_occurences() counts more than
                    one of <symbols> in <diacritics>; <name> is the name of the
                    diacritic written in the error message.
            """
            if number_of_occurences( source_string = diacritics,
                                     symbols = symbols ) > 1:
                err_msg = "In '{0}' (start={1}, end={2}), {3} defined several times."
                raise DCharsError( context = "DStringGRC.init_from_str",
                                   message = err_msg.format(element.string,
                                                            element.start(),
                                                            element.end(),
                                                            name))

        #.......................................................................
        # (1) str_src -> (decomposition) unicodedata.normalize('NFD',) = normalized_src
        #.......................................................................
        normalized_src = unicodedata.normalize('NFD', str_src)

        #.......................................................................
        # (2) = normalized_src -> (default symbols required) :
        #     replace_by_the_default_symbols() -> normalized_src
        #     (the replacements are applied in this very order)
        #.......................................................................
        for symbols in (SYMB_PUNCTUATION,
                        SYMB_LOWER_CASE,
                        SYMB_UPPER_CASE,
                        SYMB_OTHER_SYMBOLS,
                        SYMB_DIACRITICS):
            normalized_src = symbols.replace_by_the_default_symbols(normalized_src)

        #.......................................................................
        # (3) initialisation from the recognized characters.
        #     re.finditer(DStringGRC.pattern) give the symbols{letter+diacritics}
        #.......................................................................
        # index of the first character of <normalized_src> not yet added to <self> :
        next_unread = 0

        for element in re.finditer(DStringGRC.pattern,
                                   normalized_src):

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # we add the unknown characters lying before the current match,
            # i.e. at the beginning or in the middle of the string.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            append_unknown_chars(normalized_src, next_unread, element.start())
            next_unread = element.end()

            data = element.groupdict()
            letter     = data['letter']
            diacritics = data['diacritics']

            punctuation = letter in SYMB_PUNCTUATION.symbol2name
            capital_letter = letter in SYMB_UPPER_CASE.symbol2name

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.1) base_char
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if punctuation:
                # punctuation symbol :
                base_char = SYMB_PUNCTUATION.get_the_name_for_this_symbol(letter)
            elif letter in SYMB_LOWER_CASE.symbol2name:
                # lower case :
                base_char = SYMB_LOWER_CASE.get_the_name_for_this_symbol(letter)
            elif capital_letter:
                # upper case :
                base_char = SYMB_UPPER_CASE.get_the_name_for_this_symbol(letter)
            else:
                # other symbols :
                base_char = SYMB_OTHER_SYMBOLS.get_the_name_for_this_symbol(letter)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.2) contextual_form
            #       'ϐ' and 'ς' are stored as 'β' and 'σ' : the variant is kept
            #       in <contextual_form> only.
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            if base_char == 'β' and not capital_letter:
                contextual_form = "initial"
            elif base_char == 'ϐ' and not capital_letter:
                base_char = 'β'
                contextual_form = "medium+final"
            elif base_char == 'σ' and not capital_letter:
                contextual_form = "initial+medium"
            elif base_char == 'ς' and not capital_letter:
                base_char = 'σ'
                contextual_form = "final"
            else:
                contextual_form = "initial+medium+final"

            # default values, used if the symbol has no diacritic at all :
            tonos = None
            mekos = None
            pneuma = None
            hypogegrammene = False
            dialutika = False

            if diacritics is not None:
                found_in_diacritics = SYMB_DIACRITICS.are_these_symbols_in_a_string

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.3) tonos (τόνος)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS__TONOS, "τόνος")

                if found_in_diacritics('τόνος.βαρεῖα', diacritics):
                    tonos = "βαρεῖα"
                elif found_in_diacritics('τόνος.ὀξεῖα', diacritics):
                    tonos = "ὀξεῖα"
                elif found_in_diacritics('τόνος.περισπωμένη', diacritics):
                    tonos = "περισπωμένη"

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.4) mekos (μῆκος)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS__MEKOS, "μῆκος")

                if found_in_diacritics('μῆκος.μακρόν', diacritics):
                    mekos = "μακρόν"
                elif found_in_diacritics('μῆκος.βραχύ', diacritics):
                    mekos = "βραχύ"

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.5) pneuma (πνεῦμα)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS__PNEUMA, "πνεῦμα")

                if found_in_diacritics('πνεῦμα.ψιλὸν', diacritics):
                    pneuma = "ψιλὸν"
                elif found_in_diacritics('πνεῦμα.δασὺ', diacritics):
                    pneuma = "δασὺ"

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.6) hypogegrammene (ὑπογεγραμμένη)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS['ὑπογεγραμμένη'], "ὑπογεγραμμένη")

                hypogegrammene = found_in_diacritics('ὑπογεγραμμένη', diacritics)

                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                # (3.7) dialutika (διαλυτικά)
                #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                check_defined_once(element, diacritics,
                                   SYMB_DIACRITICS['διαλυτικά'], "διαλυτικά")

                dialutika = found_in_diacritics('διαλυτικά', diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.8) we add the new character
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            new_character = DCharacterGRC(dstring_object = self,
                                          unknown_char = False,
                                          base_char = base_char,
                                          contextual_form = contextual_form,
                                          punctuation = punctuation,
                                          capital_letter = capital_letter,
                                          tonos = tonos,
                                          pneuma = pneuma,
                                          hypogegrammene = hypogegrammene,
                                          dialutika = dialutika,
                                          mekos=mekos)

            self.append( new_character )

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # we add the final unknown characters; if nothing has been recognized,
        # <next_unread> is still 0 and the whole string is added as unknown
        # characters.
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        append_unknown_chars(normalized_src, next_unread, len(normalized_src))
Esempio n. 2
0
    def get_sourcestr_representation(self, ignore_makron = False):
        """
                DCharacterGRC.get_sourcestr_representation

                Build the string corresponding to <self> : the symbol of the
                base character followed by the symbols of its diacritics, the
                whole being composed through unicodedata.normalize('NFC') and
                through the replacements listed in COMPLETE_NORMALIZE_NFC.

                PARAMETER :
                o  (bool) ignore_makron : if True, no makron will be added on the
                                          characters

                RETURN VALUE : a (str) string. For an unknown character, either
                               UNKNOWN_CHAR_SYMBOL (if the option "anonymize the
                               unknown characters" is set to 'yes') or the base
                               character itself.
        """

        #.......................................................................
        # unknown char ? No analysis is required :
        #.......................................................................
        if self.unknown_char:
            anonymize = self.dstring_object.options["anonymize the unknown characters"]
            if anonymize == 'yes':
                return UNKNOWN_CHAR_SYMBOL
            return self.base_char

        #.......................................................................
        # known char : the symbols are gathered in <symbols>, base character
        # first :
        #.......................................................................
        symbols = []

        if self.base_char is not None:
            if self.punctuation or self.base_char in SYMB_OTHER_SYMBOLS:
                # punctuation symbol or other symbol : added as it is.
                symbols.append( self.base_char )
            elif self.capital_letter:
                # upper case :
                symbols.append( SYMB_UPPER_CASE.get_default_symbol(self.base_char) )
            else:
                # lower case : 'β' and 'σ' may be replaced by their contextual
                # variant before the default symbol is looked up.
                letter = self.base_char
                if letter == 'β' and self.contextual_form == "medium+final":
                    letter = "ϐ"
                elif letter == 'σ' and self.contextual_form == "final":
                    letter = "ς"

                symbols.append( SYMB_LOWER_CASE.get_default_symbol(letter) )

        # CAVEAT : order matters !
        # e.g. : pneuma then tonos, NOT tonos then pneuma
        # unicodedata.normalize('NFC', chr(0x03BF)+chr(0x0314)+chr(0x301) ) = chr(0x1F45) (ok)
        # unicodedata.normalize('NFC', chr(0x03BF)+chr(0x0301)+chr(0x314) ) =
        #                                               chr(0x03CC) + chr(0x314) [NOT OK !]

        # pneuma :
        pneuma = self.pneuma
        if pneuma == 'ψιλὸν':
            symbols.append( DEFAULTSYMB__PNEUMAPSILON )
        elif pneuma == 'δασὺ':
            symbols.append( DEFAULTSYMB__PNEUMADASU )

        # tonos :
        tonos = self.tonos
        if tonos == 'ὀξεῖα':
            symbols.append( DEFAULTSYMB__TONOSOXEIA )
        elif tonos == 'βαρεῖα':
            symbols.append( DEFAULTSYMB__TONOSBAREIA )
        elif tonos == 'περισπωμένη':
            symbols.append( DEFAULTSYMB__TONOSPERISPOMENE )

        # mekos (the makron is skipped if <ignore_makron> is set) :
        mekos = self.mekos
        if mekos == 'βραχύ':
            symbols.append( DEFAULTSYMB__MEKOSBRAXU )
        elif mekos == 'μακρόν' and not ignore_makron:
            symbols.append( DEFAULTSYMB__MEKOSMAKRON )

        # hypogegrammene, then dialutika :
        if self.hypogegrammene == True:
            symbols.append( DEFAULTSYMB__HUPOGEGRAMMENE )

        if self.dialutika == True:
            symbols.append( DEFAULTSYMB__DIALYTIKA )

        # (1/2) composition with unicodedata.normalize :
        composed = unicodedata.normalize('NFC', "".join(symbols))
        # (2/2) composition with COMPLETE_NORMALIZE_NFC :
        for source, replacement in COMPLETE_NORMALIZE_NFC:
            composed = composed.replace(source, replacement)

        return composed