def init_from_str(self, str_src):
    """
        DStringGRC.init_from_str

        Function called by __init__() : fill <self> with the characters
        read from <str_src>.

        PARAMETER :
        o  (str) str_src : the source string to be analysed.

        RETURN VALUE :
        no explicit return value; every character found in <str_src> is
        appended to <self> as a DCharacterGRC object.

        EXCEPTION :
        DCharsError is raised if, for one matched symbol, a kind of
        diacritic (τόνος, μῆκος, πνεῦμα, ὑπογεγραμμένη, διαλυτικά) is
        counted more than once by number_of_occurences().

        HOW IT WORKS :
        * (1) str_src is decomposed with unicodedata.normalize('NFD', ...)
        * (2) each symbol table rewrites the decomposed string through
              its replace_by_the_default_symbols() method
        * (3) re.finditer(DStringGRC.pattern) yields the recognized
              symbols {letter + diacritics}; for each of them :
            * (3.1) base_char
            * (3.2) contextual_form
            * (3.3) tonos (τόνος)
            * (3.4) mekos (μῆκος)
            * (3.5) pneuma (πνεῦμα)
            * (3.6) hypogegrammene (ὑπογεγραμμένη)
            * (3.7) dialutika (διαλυτικά)
            * (3.8) the new character is appended to <self>
          The characters lying outside the matches (before the first one,
          between two of them, after the last one) are appended as
          "unknown" characters.
    """

    def add_unknown_chars(first, stop):
        """ append one "unknown" character per index in range(first, stop) """
        for position in range(first, stop):
            self.append(DCharacterGRC(dstring_object=self,
                                      unknown_char=True,
                                      base_char=text[position]))

    def ensure_defined_once(label, symbols, found_diacritics, match):
        """ raise DCharsError if <symbols> are counted more than once """
        occurences = number_of_occurences(source_string=found_diacritics,
                                          symbols=symbols)
        if occurences > 1:
            err_msg = "In '{0}' (start={1}, end={2}), {3} defined several times."
            raise DCharsError(context="DStringGRC.init_from_str",
                              message=err_msg.format(match.string,
                                                     match.start(),
                                                     match.end(),
                                                     label))

    #.......................................................................
    # (1) decomposition of the source string
    #.......................................................................
    text = unicodedata.normalize('NFD', str_src)

    #.......................................................................
    # (2) the default symbols are required : the tables are applied in
    #     this exact order.
    #.......................................................................
    for symbols_table in (SYMB_PUNCTUATION,
                          SYMB_LOWER_CASE,
                          SYMB_UPPER_CASE,
                          SYMB_OTHER_SYMBOLS,
                          SYMB_DIACRITICS):
        text = symbols_table.replace_by_the_default_symbols(text)

    #.......................................................................
    # (3) initialisation from the recognized characters.
    #.......................................................................

    # lower case names having a special contextual form :
    #   name read -> (name stored, contextual form)
    special_forms = {'β': ('β', "initial"),
                     'ϐ': ('β', "medium+final"),
                     'σ': ('σ', "initial+medium"),
                     'ς': ('σ', "final")}

    # index of the first character not yet consumed by a match :
    resume_at = 0

    for element in re.finditer(DStringGRC.pattern, text):

        # unknown character(s) before this match (either at the very
        # beginning of the string or just after the previous match) :
        add_unknown_chars(resume_at, element.start())
        resume_at = max(element.start(), element.end() - 1) + 1

        groups = element.groupdict()
        letter = groups['letter']
        diacritics = groups['diacritics']

        punctuation = letter in SYMB_PUNCTUATION.symbol2name
        capital_letter = letter in SYMB_UPPER_CASE.symbol2name

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.1) base_char : punctuation, then lower case, then upper case,
        #       otherwise "other symbols".
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if punctuation:
            owner = SYMB_PUNCTUATION
        elif letter in SYMB_LOWER_CASE.symbol2name:
            owner = SYMB_LOWER_CASE
        elif capital_letter:
            owner = SYMB_UPPER_CASE
        else:
            owner = SYMB_OTHER_SYMBOLS
        base_char = owner.get_the_name_for_this_symbol(letter)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.2) contextual_form
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        if not capital_letter and base_char in special_forms:
            base_char, contextual_form = special_forms[base_char]
        else:
            contextual_form = "initial+medium+final"

        tonos = None
        mekos = None
        pneuma = None
        hypogegrammene = False
        dialutika = False

        if diacritics is not None:
            in_diacritics = SYMB_DIACRITICS.are_these_symbols_in_a_string

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.3) tonos (τόνος)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            ensure_defined_once("τόνος", SYMB_DIACRITICS__TONOS,
                                diacritics, element)
            for key, value in (('τόνος.βαρεῖα', "βαρεῖα"),
                               ('τόνος.ὀξεῖα', "ὀξεῖα"),
                               ('τόνος.περισπωμένη', "περισπωμένη")):
                if in_diacritics(key, diacritics):
                    tonos = value
                    break

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.4) mekos (μῆκος)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            ensure_defined_once("μῆκος", SYMB_DIACRITICS__MEKOS,
                                diacritics, element)
            for key, value in (('μῆκος.μακρόν', "μακρόν"),
                               ('μῆκος.βραχύ', "βραχύ")):
                if in_diacritics(key, diacritics):
                    mekos = value
                    break

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.5) pneuma (πνεῦμα)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            ensure_defined_once("πνεῦμα", SYMB_DIACRITICS__PNEUMA,
                                diacritics, element)
            for key, value in (('πνεῦμα.ψιλὸν', "ψιλὸν"),
                               ('πνεῦμα.δασὺ', "δασὺ")):
                if in_diacritics(key, diacritics):
                    pneuma = value
                    break

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.6) hypogegrammene (ὑπογεγραμμένη)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            ensure_defined_once("ὑπογεγραμμένη",
                                SYMB_DIACRITICS['ὑπογεγραμμένη'],
                                diacritics, element)
            hypogegrammene = in_diacritics('ὑπογεγραμμένη', diacritics)

            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            # (3.7) dialutika (διαλυτικά)
            #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
            ensure_defined_once("διαλυτικά",
                                SYMB_DIACRITICS['διαλυτικά'],
                                diacritics, element)
            dialutika = in_diacritics('διαλυτικά', diacritics)

        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        # (3.8) we add the new character
        #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
        self.append(DCharacterGRC(dstring_object=self,
                                  unknown_char=False,
                                  base_char=base_char,
                                  contextual_form=contextual_form,
                                  punctuation=punctuation,
                                  capital_letter=capital_letter,
                                  tonos=tonos,
                                  pneuma=pneuma,
                                  hypogegrammene=hypogegrammene,
                                  dialutika=dialutika,
                                  mekos=mekos))

    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    # unknown character(s) after the last match (or the whole string if
    # nothing has been recognized) :
    #. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
    add_unknown_chars(resume_at, len(text))
def get_sourcestr_representation(self, ignore_makron = False):
    """
        DCharacterGRC.get_sourcestr_representation

        Build the string standing for <self> : the base character
        followed by its diacritics, composed as far as possible.

        PARAMETER :
        o  (bool) ignore_makron : if True, no makron will be added on the
                                  character

        RETURN VALUE :
        a (str) string.
    """
    #.......................................................................
    # unknown char ? Either it is anonymized or given back untouched :
    #.......................................................................
    if self.unknown_char:
        anonymize = self.dstring_object.options["anonymize the unknown characters"]
        return UNKNOWN_CHAR_SYMBOL if anonymize == 'yes' else self.base_char

    #.......................................................................
    # ok, the function can analyse <self> :
    #.......................................................................
    pieces = []

    if self.base_char is not None:

        if self.punctuation or self.base_char in SYMB_OTHER_SYMBOLS:
            # punctuation symbol or other symbol : taken as it is.
            pieces.append(self.base_char)

        elif self.capital_letter:
            # upper case :
            pieces.append(SYMB_UPPER_CASE.get_default_symbol(self.base_char))

        else:
            # lower case : β and σ have an alternative shape depending on
            # the contextual form.
            letter = self.base_char
            if letter == 'β' and self.contextual_form == "medium+final":
                letter = "ϐ"
            elif letter == 'σ' and self.contextual_form == "final":
                letter = "ς"
            pieces.append(SYMB_LOWER_CASE.get_default_symbol(letter))

    # CAVEAT : order matters !
    # e.g. : pneuma then tonos, NOT tonos then pneuma
    # unicodedata.normalize('NFC', chr(0x03BF)+chr(0x0314)+chr(0x301) ) = chr(0x1F45) (ok)
    # unicodedata.normalize('NFC', chr(0x03BF)+chr(0x0301)+chr(0x314) ) =
    #                                              chr(0x03CC) + chr(0x314) [NOT OK !]
    pneuma_symbols = {'ψιλὸν': DEFAULTSYMB__PNEUMAPSILON,
                      'δασὺ': DEFAULTSYMB__PNEUMADASU}
    if self.pneuma in pneuma_symbols:
        pieces.append(pneuma_symbols[self.pneuma])

    tonos_symbols = {'ὀξεῖα': DEFAULTSYMB__TONOSOXEIA,
                     'βαρεῖα': DEFAULTSYMB__TONOSBAREIA,
                     'περισπωμένη': DEFAULTSYMB__TONOSPERISPOMENE}
    if self.tonos in tonos_symbols:
        pieces.append(tonos_symbols[self.tonos])

    if self.mekos == 'βραχύ':
        pieces.append(DEFAULTSYMB__MEKOSBRAXU)
    elif self.mekos == 'μακρόν' and not ignore_makron:
        pieces.append(DEFAULTSYMB__MEKOSMAKRON)

    # the explicit comparisons to True are kept on purpose : a plain
    # truthiness test would accept more values than the original code.
    if self.hypogegrammene == True:
        pieces.append(DEFAULTSYMB__HUPOGEGRAMMENE)

    if self.dialutika == True:
        pieces.append(DEFAULTSYMB__DIALYTIKA)

    # (1/2) composition with unicodedata.normalize :
    composed = unicodedata.normalize('NFC', "".join(pieces))

    # (2/2) composition with COMPLETE_NORMALIZE_NFC :
    for before, after in COMPLETE_NORMALIZE_NFC:
        composed = composed.replace(before, after)

    return composed