Beispiel #1
0
################################################################################
# transliteration's patterns :
# PATTERN  is used to cut one complex character into its elements.
# PATTERN2 is used to cut several complex characters into a list of complex characters.
################################################################################

# in order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# Diacritic families: each list holds the regex-escaped transliteration of
# the diacritics of one family, passed through isort_a_lstrings_bylen_nodup()
# so that the alternatives can be safely joined into a regex.
PNEUMA = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[name]) for name in ("ψιλὸν", "δασὺ")]
)
TONOS = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[name]) for name in ("βαρεῖα", "ὀξεῖα", "περισπωμένη")]
)

# Base characters: keys of every inversed table, concatenated in this order
# (lower case, upper case, other symbols, punctuation).
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys())) +
    regexstring_list(tuple(UPPER_CASE_INVERSED.keys())) +
    regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys())) +
    regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

HYPOGEGRAMMENE = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS["ὑπογεγραμμένη"])]
)
DIALUTIKA = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS["διαλυτικά"])]
)
MEKOS = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[name]) for name in ("μακρόν", "βραχύ")]
)

PATTERN_TXT = (
    "((?P<trans_pneuma>({0}))?"
    "(?P<trans_tonos>({1}))?"
    "(?P<base_char>({2}))"
    "(?P<trans_hypogegrammene>({3}))?"
    "(?P<trans_dialutika>({4}))?"
    "(?P<trans_mekos>({5}))?)".format(
Beispiel #2
0
# PATTERN2 is used to cut several complex characters into a list of complex characters.
################################################################################

# in order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.
# Diacritic groups: regex-escaped transliterations, one list per kind of mark.
UPPERDOT = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS['upperdot'])])
MAKRON = isort_a_lstrings_bylen_nodup([re.escape(DIACRITICS['makron'])])
STRESS = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS[name])
     for name in ('stressM1', 'stress1', 'stress2')]
)

# Base characters: keys of the inversed tables, concatenated in this order
# (lower case, upper case, punctuation).
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LOWER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(UPPER_CASE_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Regex source for one complex character: a mandatory base character followed
# by optional stress, makron and upperdot groups (in that order).
# Each placeholder receives the "|"-joined alternatives of one list; the two
# trailing replace() calls inverse the effect of prepare_list_to_strformat().
PATTERN_TXT = (
    (
        "((?P<base_char>({0}))"
        "(?P<trans_stress>({1}))?"
        "(?P<trans_makron>({2}))?"
        "(?P<trans_upperdot>({3}))?"
        ")"
    )
    .format(
        *(
            "|".join(prepare_list_to_strformat(alternatives))
            for alternatives in (LETTERS, STRESS, MAKRON, UPPERDOT)
        )
    )
    .replace('{{', '{')
    .replace('}}', '}')
)
Beispiel #3
0
# Reverse lookup tables: each one is invertdict() applied to its source table.
(OTHER_SYMBOLS_INVERSED,
 PUNCTUATION_INVERSED,
 DIACRITICS_INVERSED) = map(invertdict,
                            (OTHER_SYMBOLS, PUNCTUATION, DIACRITICS))

################################################################################
# transliteration's patterns :
# PATTERN  is used to cut one complex character into its elements.
# PATTERN2 is used to cut several complex characters into a list of complex characters.
################################################################################

# in order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# Base characters: keys of every inversed table, concatenated in this order
# (consonants, consonants with nukta, vowels, vowels in hiatus, other symbols,
# punctuation).
BASE_CHAR = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CONSONANTS_INVERSED.keys()))
    + regexstring_list(tuple(CONSONANTS_WITH_NUKTA_INVERSED.keys()))
    + regexstring_list(tuple(VOWELS_INVERSED.keys()))
    + regexstring_list(tuple(VOWELS_IN_HIATUS_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Stress signs: regex-escaped transliteration of a single diacritic each.
ANUDATTA = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS['DEVANAGARI STRESS SIGN ANUDATTA'])]
)

ACCENT = isort_a_lstrings_bylen_nodup(
    [re.escape(DIACRITICS['DEVANAGARI STRESS SIGN UDATTA'])]
)

ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup(
                [re.escape(DIACRITICS['DEVANAGARI SIGN ANUSVARA']),
Beispiel #4
0
# Maps every key of symbols.SYMB_CANTILLATION_MARKS to that same key wrapped
# in angle brackets, e.g.
#   CANTILATIONMARKS["HEBREW ACCENT ZAQEF GADOL"] = "<HEBREW ACCENT ZAQEF GADOL>"
CANTILATIONMARKS = {
    mark_name: "<" + mark_name + ">"
    for mark_name in symbols.SYMB_CANTILLATION_MARKS.keys()
}
# Reverse lookup table built by invertdict().
CANTILATIONMARKS_INVERSED = invertdict(CANTILATIONMARKS)

################################################################################
# transliteration's patterns :
# PATTERN  is used to cut one complex character into its elements.
# PATTERN2 is used to cut several complex characters into a list of complex characters.
################################################################################

# in order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# Base characters: keys of the inversed tables, concatenated in this order
# (letters, other symbols, punctuation).
T_BASECHARS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(LETTERS_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Vowels: keys of the inversed vowel table.
T_VOWELS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(VOWELS_INVERSED.keys()))
)

# Single points: regex-escaped value of one POINTS entry each.
T_METHEGH = isort_a_lstrings_bylen_nodup(
    [re.escape(POINTS["HEBREW POINT METEG"])]
)
T_RAPHE = isort_a_lstrings_bylen_nodup(
    [re.escape(POINTS["HEBREW POINT RAFE"])]
)

# Special points and cantillation marks: keys of their inversed tables.
T_SPECIALPOINTS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(SPECIALPOINTS_INVERSED.keys()))
)
T_CMARKS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CANTILATIONMARKS_INVERSED.keys()))
)

PATTERN_TXT = "((?P<base_char>({0}))" \
              "(?P<trans_vowel>({1}))?" \
Beispiel #5
0
#  +1 (text->transliteration)
#  -1 (transliteration->text)
#
################################################################################
# The two supported conversion directions:
#   -1 : transliteration -> text
#   +1 : text -> transliteration
AVAILABLE_DIRECTIONS = (-1, 1)

################################################################################
# transliteration's patterns :
################################################################################

# in order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# Regex alternatives used on the transliteration side. Every list is built
# by regexstring_list() from a tuple of strings and then passed through
# isort_a_lstrings_bylen_nodup().

# The two literal separators '+' and '.'.
TRANS_DOT_OR_PLUS = isort_a_lstrings_bylen_nodup(
    regexstring_list(('+', '.'))
)

# Keys of the inversed "other symbols" and punctuation tables.
TRANS_PUNCTUATION_AND_OTHER_SYMBOL = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Keys of the inversed consonant and vowel tables.
TRANS_CONSONANTS_AND_VOWELS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CONSONANTS_INVERSED.keys()))
    + regexstring_list(tuple(VOWELS_INVERSED.keys()))
)

# NOTE: a one-element tuple needs its trailing comma. "(x)" is just x, so
# without the comma the bare string itself - not a tuple of strings like in
# every other call of this group - would be handed to regexstring_list().
TRANS_RNAM_BCAD = isort_a_lstrings_bylen_nodup(
    regexstring_list((DIACRITICS['SIGN RNAM BCAD'],))
)
TRANS_HALANTA = isort_a_lstrings_bylen_nodup(
    regexstring_list((DIACRITICS['MARK HALANTA'],))
)

# The three signs grouped under the anusvara/candrabindu name.
TRANS_ANUSVARA_CANDRABINDU = isort_a_lstrings_bylen_nodup(
    regexstring_list((DIACRITICS['SIGN RJES SU NGA RO'],
                      DIACRITICS['SIGN NYI ZLA NAA DA'],
                      DIACRITICS['SIGN SNA LDAN']))
)
Beispiel #6
0
        ("[-]tsub"         , "bb"),
        ("[-]tsup"         , "pp"),
    ))

################################################################################
# transliteration's patterns :
# PATTERN  is used to cut one complex character into its elements.
# PATTERN2 is used to cut several complex characters into a list of complex characters.
################################################################################

# in order to build the pattern strings for the regexes we have to SORT the
# result : (|a|b|t|th) won't find 'th' in "theatre" but (th|a|b|t) will.
# We delete the possible duplicates in the resulting string.

# Base characters: keys of every table below, concatenated in this order
# (choonpu, hiragana, hiragana with dakuten / handakuten, katakana, katakana
# with dakuten / handakuten, other symbols, punctuation).
# NOTE(review): CHOONPU is the only table used here without an _INVERSED
# counterpart - confirm this is intended.
LETTERS = isort_a_lstrings_bylen_nodup(
    regexstring_list(tuple(CHOONPU.keys()))
    + regexstring_list(tuple(HIRAGANA_INVERSED.keys()))
    + regexstring_list(tuple(HIRAGANA_DAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(HIRAGANA_HANDAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_DAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(KATAKANA_HANDAKUTEN_INVERSED.keys()))
    + regexstring_list(tuple(OTHER_SYMBOLS_INVERSED.keys()))
    + regexstring_list(tuple(PUNCTUATION_INVERSED.keys()))
)

# Regex source with a single named group, "base_char", whose alternatives are
# the "|"-joined entries of LETTERS. The two trailing replace() calls inverse
# the effect of prepare_list_to_strformat().
PATTERN_TXT = (
    "(?P<base_char>({0}))"
    .format("|".join(prepare_list_to_strformat(LETTERS)))
    .replace('{{', '{')
    .replace('}}', '}')
)

# Compiled form of PATTERN_TXT.
PATTERN = re.compile(PATTERN_TXT)