regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + \ regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) ) HYPOGEGRAMMENE = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['ὑπογεγραμμένη']),] ) DIALUTIKA = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['διαλυτικά']),] ) MEKOS = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['μακρόν']), re.escape(DIACRITICS['βραχύ'])] ) PATTERN_TXT = "((?P<trans_pneuma>({0}))?" \ "(?P<trans_tonos>({1}))?" \ "(?P<base_char>({2}))" \ "(?P<trans_hypogegrammene>({3}))?" \ "(?P<trans_dialutika>({4}))?" \ "(?P<trans_mekos>({5}))?)".format("|".join(prepare_list_to_strformat(PNEUMA)), "|".join(prepare_list_to_strformat(TONOS)), "|".join(prepare_list_to_strformat(LETTERS)), "|".join(prepare_list_to_strformat(HYPOGEGRAMMENE)), "|".join(prepare_list_to_strformat(DIALUTIKA)), "|".join(prepare_list_to_strformat(MEKOS)), ) # we inverse the effect of prepare_list_to_strformat() PATTERN_TXT = PATTERN_TXT.replace('{{', '{') PATTERN_TXT = PATTERN_TXT.replace('}}', '}') PATTERN = re.compile(PATTERN_TXT) PATTERN_TXT2 = "(({0})?" \ "({1})?" \ "({2})" \ "({3})?" \
+ regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) + regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) ) HYPOGEGRAMMENE = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["ὑπογεγραμμένη"])]) DIALUTIKA = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["διαλυτικά"])]) MEKOS = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["μακρόν"]), re.escape(DIACRITICS["βραχύ"])]) PATTERN_TXT = ( "((?P<trans_pneuma>({0}))?" "(?P<trans_tonos>({1}))?" "(?P<base_char>({2}))" "(?P<trans_hypogegrammene>({3}))?" "(?P<trans_dialutika>({4}))?" "(?P<trans_mekos>({5}))?)".format( "|".join(prepare_list_to_strformat(PNEUMA)), "|".join(prepare_list_to_strformat(TONOS)), "|".join(prepare_list_to_strformat(LETTERS)), "|".join(prepare_list_to_strformat(HYPOGEGRAMMENE)), "|".join(prepare_list_to_strformat(DIALUTIKA)), "|".join(prepare_list_to_strformat(MEKOS)), ) ) # we inverse the effect of prepare_list_to_strformat() PATTERN_TXT = PATTERN_TXT.replace("{{", "{") PATTERN_TXT = PATTERN_TXT.replace("}}", "}") PATTERN = re.compile(PATTERN_TXT) PATTERN_TXT2 = ( "(({0})?" "({1})?"
[re.escape(DIACRITICS['DEVANAGARI STRESS SIGN ANUDATTA']),]) ACCENT = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['DEVANAGARI STRESS SIGN UDATTA']), ]) ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['DEVANAGARI SIGN ANUSVARA']), re.escape(DIACRITICS['DEVANAGARI SIGN CANDRABINDU']), ]) PATTERN_TXT = "((?P<base_char>({0}))" \ "(?P<accent>({1}))?" \ "(?P<anudatta>({2}))?" \ "(?P<anusvara_candrabindu>({3}))?)".format( "|".join(prepare_list_to_strformat(BASE_CHAR)), "|".join(prepare_list_to_strformat(ACCENT)), "|".join(prepare_list_to_strformat(ANUDATTA)), "|".join(prepare_list_to_strformat(ANUSVARA_CANDRABINDU)), ) # we inverse the effect of prepare_list_to_strformat() PATTERN_TXT = PATTERN_TXT.replace('{{', '{') PATTERN_TXT = PATTERN_TXT.replace('}}', '}') PATTERN = re.compile(PATTERN_TXT) PATTERN_TXT2 = "(({0})" \ "({1})?" \ "({2})?" \ "({3})?)".format( "|".join(prepare_list_to_strformat(BASE_CHAR)),
MAKRON = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['makron']), ]) STRESS = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['stressM1']), re.escape(DIACRITICS['stress1']), re.escape(DIACRITICS['stress2']), ]) LETTERS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(LOWER_CASE_INVERSED.keys())) + \ regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) + \ regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) ) PATTERN_TXT = "((?P<base_char>({0}))" \ "(?P<trans_stress>({1}))?" \ "(?P<trans_makron>({2}))?" \ "(?P<trans_upperdot>({3}))?" \ ")".format("|".join(prepare_list_to_strformat(LETTERS)), "|".join(prepare_list_to_strformat(STRESS)), "|".join(prepare_list_to_strformat(MAKRON)), "|".join(prepare_list_to_strformat(UPPERDOT)), ) # we inverse the effect of prepare_list_to_strformat() PATTERN_TXT = PATTERN_TXT.replace('{{', '{') PATTERN_TXT = PATTERN_TXT.replace('}}', '}') PATTERN = re.compile(PATTERN_TXT) PATTERN_TXT2 = "(({0})" \ "({1})?" \ "({2})?" \ "({3})?" \ ")".format("|".join(prepare_list_to_strformat(LETTERS)), "|".join(prepare_list_to_strformat(STRESS)),
T_METHEGH = isort_a_lstrings_bylen_nodup( [re.escape(POINTS["HEBREW POINT METEG"]),] ) T_RAPHE = isort_a_lstrings_bylen_nodup( [re.escape(POINTS["HEBREW POINT RAFE"]),] ) T_SPECIALPOINTS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(SPECIALPOINTS_INVERSED.keys())) ) T_CMARKS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(CANTILATIONMARKS_INVERSED.keys())) ) PATTERN_TXT = "((?P<base_char>({0}))" \ "(?P<trans_vowel>({1}))?" \ "(?P<trans_methegh>({2}))?" \ "(?P<trans_raphe>({3}))?" \ "(?P<trans_specialpoint>({4}))?" \ "(?P<trans_cmark>({5})+)?)".format( "|".join(prepare_list_to_strformat(T_BASECHARS)), "|".join(prepare_list_to_strformat(T_VOWELS)), "|".join(prepare_list_to_strformat(T_METHEGH)), "|".join(prepare_list_to_strformat(T_RAPHE)), "|".join(prepare_list_to_strformat(T_SPECIALPOINTS)), "|".join(prepare_list_to_strformat(T_CMARKS)), ) # we inverse the effect of prepare_list_to_strformat() PATTERN_TXT = PATTERN_TXT.replace('{{', '{') PATTERN_TXT = PATTERN_TXT.replace('}}', '}') PATTERN = re.compile(PATTERN_TXT) PATTERN_TXT2 = "(({0})" \ "({1})?" \ "({2})?" \ "({3})?" \
re.escape(DIACRITICS["stress3"]), ] ) CEDILLA = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS["cedilla"])]) LETTERS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(LOWER_CASE_INVERSED.keys())) + regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) + regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) ) PATTERN_TXT = ( "((?P<base_char>({0}))" "(?P<trans_stress>({1}))?" "(?P<trans_cedilla>({2}))?" ")".format( "|".join(prepare_list_to_strformat(LETTERS)), "|".join(prepare_list_to_strformat(STRESS)), "|".join(prepare_list_to_strformat(CEDILLA)), ) ) # we inverse the effect of prepare_list_to_strformat() PATTERN_TXT = PATTERN_TXT.replace("{{", "{") PATTERN_TXT = PATTERN_TXT.replace("}}", "}") PATTERN = re.compile(PATTERN_TXT) PATTERN_TXT2 = ( "(({0})" "({1})?" "({2})?" ")".format( "|".join(prepare_list_to_strformat(LETTERS)),
[re.escape(DIACRITICS['diaeresis'])] ) LENGTH = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['short']), re.escape(DIACRITICS['long'])] ) STRESS = isort_a_lstrings_bylen_nodup( [re.escape(DIACRITICS['stress']),]) LETTERS = isort_a_lstrings_bylen_nodup( regexstring_list(tuple(LOWER_CASE_INVERSED.keys())) + \ regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) + \ regexstring_list(tuple(PUNCTUATION_INVERSED.keys())) ) PATTERN_TXT = "((?P<base_char>({0}))" \ "(?P<trans_stress>({1}))?" \ "(?P<trans_length>({2}))?" \ "(?P<trans_diaeresis>({3}))?" \ ")".format("|".join(prepare_list_to_strformat(LETTERS)), "|".join(prepare_list_to_strformat(STRESS)), "|".join(prepare_list_to_strformat(LENGTH)), "|".join(prepare_list_to_strformat(DIAERESIS)), ) # we inverse the effect of prepare_list_to_strformat() PATTERN_TXT = PATTERN_TXT.replace('{{', '{') PATTERN_TXT = PATTERN_TXT.replace('}}', '}') PATTERN = re.compile(PATTERN_TXT) PATTERN_TXT2 = "(({0})" \ "({1})?" \ "({2})?" \ "({3})?" \ ")".format("|".join(prepare_list_to_strformat(LETTERS)), "|".join(prepare_list_to_strformat(STRESS)),
regexstring_list(tuple(VOWELS_INVERSED.keys())) ) TRANS_RNAM_BCAD = isort_a_lstrings_bylen_nodup( regexstring_list( (DIACRITICS['SIGN RNAM BCAD'])) ) TRANS_HALANTA = isort_a_lstrings_bylen_nodup( regexstring_list( (DIACRITICS['MARK HALANTA'])) ) TRANS_ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup( regexstring_list( (DIACRITICS['SIGN RJES SU NGA RO'], DIACRITICS['SIGN NYI ZLA NAA DA'], DIACRITICS['SIGN SNA LDAN'] ))) TRANS_PATTERN_TXT = "(?P<dotpointorplus>({0}))?" \ "(?P<base_char>({1}))" \ "(?P<halanta>({2}))?" \ "(?P<anusvara_candrabindu>({3}))?" \ "(?P<rnam_bcad>({4}))?".format( "|".join(prepare_list_to_strformat(TRANS_DOT_OR_PLUS)), "|".join(prepare_list_to_strformat(TRANS_CONSONANTS_AND_VOWELS) + \ prepare_list_to_strformat(TRANS_PUNCTUATION_AND_OTHER_SYMBOL)), "|".join(prepare_list_to_strformat(TRANS_HALANTA)), "|".join(prepare_list_to_strformat(TRANS_ANUSVARA_CANDRABINDU)), "|".join(prepare_list_to_strformat(TRANS_RNAM_BCAD)), ) TRANS_PATTERN_TXT = TRANS_PATTERN_TXT.replace('{{', '{') TRANS_PATTERN_TXT = TRANS_PATTERN_TXT.replace('}}', '}') TRANS_PATTERN = re.compile(TRANS_PATTERN_TXT) #/////////////////////////////////////////////////////////////////////////////// def get_intstruct_from_trans_str( _src, dstring_object ): """ function get_intstruct_from_trans_str()
# The alternatives of the regexes below have to be SORTED before being joined
# with "|" : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# Possible duplicates are deleted from the resulting list as well.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CHOONPU.keys()))
    + regexstring_list(tuple(HIRAGANA_INVERSED.keys()))
    + regexstring_list(tuple(HIRAGANA_DAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(HIRAGANA_HANDAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_DAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_HANDAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Variant with a named group : the matched symbol is exposed as <base_char>.
PATTERN_TXT = (
    "(?P<base_char>({0}))"
    .format("|".join(prepare_list_to_strformat(LETTERS)))
    # undo the effect of prepare_list_to_strformat() once .format() is done
    .replace("{{", "{")
    .replace("}}", "}")
)
PATTERN = re.compile(PATTERN_TXT)

# Same alternation, but with a plain (unnamed) group.
PATTERN_TXT2 = (
    "({0})"
    .format("|".join(prepare_list_to_strformat(LETTERS)))
    # undo the effect of prepare_list_to_strformat() once .format() is done
    .replace("{{", "{")
    .replace("}}", "}")
)
PATTERN2 = re.compile(PATTERN_TXT2)

#///////////////////////////////////////////////////////////////////////////////