Code example #1
    def __init__(self, norm_rules, trans_rules, analysis_rules):
        self.normalizer = Transliterator.createFromRules(
            "icu_normalization", norm_rules)
        trans_rules += ";[:Space:]+ > ' '"
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       trans_rules)
        self.search = Transliterator.createFromRules("icu_search",
                                                     norm_rules + trans_rules)

        self.analysis = {
            name: arules.create(self.to_ascii, arules.config)
            for name, arules in analysis_rules.items()
        }
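The examples on this page omit their imports; they rely on PyICU's Transliterator class. As a point of reference, a minimal self-contained sketch of the same createFromRules pattern (assuming the PyICU package is installed) could look like this:

from icu import Transliterator

# Collapse any run of whitespace into a single space, mirroring the rule
# appended to trans_rules in example #1.
rules = "[:Space:]+ > ' ';"
trans = Transliterator.createFromRules("whitespace_fold", rules)
print(trans.transliterate("a   b"))  # expected output: "a b"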
Code example #2
    def test_get_transliteration_rules(self):
        self.config_rules()
        loader = ICURuleLoader(self.project_env)
        rules = loader.get_transliteration_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
Code example #3
    def testCustomFunctionality(self):

        # convert a's to b's and b's to c's
        rules = "a > b; b > c;"
        self._checkToken(
            Transliterator.createFromRules("test", rules,
                                           UTransDirection.FORWARD),
            "abacadaba", "bcbcbdbcb")
Code example #4
    def testCustomFunctionality2(self):

        # convert a to b only when preceded by c; any other a becomes d
        rules = "c { a > b; a > d;"
        self._checkToken(
            Transliterator.createFromRules("test", rules,
                                           UTransDirection.FORWARD), "caa",
            "cbd")
Code example #5
File: test_generic.py Project: lonvia/Nominatim
def make_analyser(*variants, variant_only=False):
    rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
    if variant_only:
        rules['mode'] = 'variant-only'
    config = module.configure(rules, DEFAULT_NORMALIZATION)
    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

    return module.create(trans, config)
Code example #6
File: test_generic.py Project: lonvia/Nominatim
def test_no_variants():
    rules = { 'analyzer': 'generic' }
    config = module.configure(rules, DEFAULT_NORMALIZATION)
    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

    proc = module.create(trans, config)

    assert get_normalized_variants(proc, '大德!') == ['dà dé']
Code example #7
    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure
            to call the close() function before destroying the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("trans", self.transliteration)
        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
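The docstring above also allows using the analyzer outside a with block, provided close() is called. A minimal sketch of that pattern (tokenizer and the tokenize() call are the hypothetical names used in the docstring, not a confirmed API):

analyzer = tokenizer.name_analyzer()
try:
    analyzer.tokenize()   # hypothetical call, as in the docstring example
finally:
    analyzer.close()      # must be called explicitly when not using "with"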
Code example #8
 def __init__(self, config, phplib_dir, db_connection) -> None:
     self.db_connection = db_connection
     self.config = config
     self.phplib_dir = phplib_dir
     self.black_list, self.white_list = self._load_white_and_black_lists()
     # Compile the regex here to improve performance.
     self.occurence_pattern = re.compile(
         r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
     )
     self.sanity_check_pattern = re.compile(r'^\w+$')
     self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
                                                          self.config.TERM_NORMALIZATION)
Code example #9
def test_get_search_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())

    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
Code example #10
    def test_get_search_rules(self):
        self.config_rules()
        loader = ICURuleLoader(self.project_env)

        rules = loader.get_search_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" Baum straße ") == " baum straße "
        assert trans.transliterate(" Baumstraße ") == " baumstraße "
        assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
        assert trans.transliterate(" Baumstr ") == " baumstr "
        assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
        assert trans.transliterate(" Αθήνα ") == " athēna "
        assert trans.transliterate(" проспект ") == " prospekt "
Code example #11
    def test_transliteration_rules_from_file(self):
        self.write_config("""\
            normalization:
            transliteration:
                - "'ax' > 'b'"
                - !include transliteration.yaml
            token-analysis:
                - analyzer: generic
                  variants:
            """)
        transpath = self.project_env.project_dir / ('transliteration.yaml')
        transpath.write_text('- "x > y"')

        loader = ICURuleLoader(self.project_env)
        rules = loader.get_transliteration_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" axxt ") == " byt "
Code example #12
def test_transliteration_rules_from_file(test_config):
    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
    cfgpath.write_text(
        dedent("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        token-analysis:
            - analyzer: generic
              variants:
        """))
    transpath = test_config.project_dir / ('transliteration.yaml')
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(test_config)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
Code example #13
 def __init__(self, config, phplib_dir, db_connection) -> None:
     self.db_connection = db_connection
     self.config = config
     self.phplib_dir = phplib_dir
     self.black_list, self.white_list = self._load_white_and_black_lists()
     # Compile the regex here to improve performance.
     self.occurence_pattern = re.compile(
         r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
     )
     self.sanity_check_pattern = re.compile(r'^\w+$')
     self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
                                                          self.config.TERM_NORMALIZATION)
     # This set will contain all existing phrases from the word table which
     # no longer exist on the wiki.
     # It contains tuples with the following format: (normalized_word, class, type, operator)
     self.words_phrases_to_delete = set()
     # This set will contain the phrases which still exist on the wiki.
     # It is used to prevent duplicates on the wiki by removing them from
     # words_phrases_to_delete only at the end.
     self.words_phrases_still_exist = set()
     # This set will contain all existing place_classtype tables which don't match any
     # special phrase class/type on the wiki.
     self.table_phrases_to_delete = set()
Code example #14
def test_get_normalization_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_normalization_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
Code example #15
File: test_generic.py Project: lonvia/Nominatim
def get_normalized_variants(proc, name):
    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
    return proc.get_variants_ascii(norm.transliterate(name).strip())
Code example #16
 def __init__(self, norm_rules):
     self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                norm_rules)
Code example #17
    def testCustomFunctionality(self):

        # convert a's to b's and b's to c's        
        rules = "a > b; b > c;"
        self._checkToken(Transliterator.createFromRules("test", rules, UTransDirection.FORWARD), "abacadaba", "bcbcbdbcb")
Code example #18
 def testCustomFunctionality2(self):
     
     # convert a to b only when preceded by c; any other a becomes d
     rules = "c { a > b; a > d;"
     self._checkToken(Transliterator.createFromRules("test", rules, UTransDirection.FORWARD), "caa", "cbd")
Code example #19
File: translit.py Project: ayum/ayum.translit
def to_latin(string, locale=locale):
    ustring = UnicodeString(string)
    nfc = Normalizer2.getNFCInstance()
    ustring = nfc.normalize(ustring)

    trans = Transliterator.createFromRules(
        "",
        "$wb = [^[:Letter:]] ;"
        # е
        "$wb { е > ye ;"
        "[ыq] { е } $wb > e ;"
        "[уеёыаоэяиюьъiuoeaq] { е > ye ;"
        "е > e ;"
        # э
        "$wb { э > e ;"
        "[жшцйjwcy] { э > е ;"
        "э > qe ;"
        # ы
        "[жшцйjwcy] { ы > i ;"
        "ы > q ;"
        # ё
        "$wb { ё > yo ;"
        "[жшцйjwcy] { ё > o ;"
        "[уеёыаоэяиюьъiuoeaq] { ё > yo ;"
        "ё > ho ;"
        # ю
        "$wb { ю > yu ;"
        "[жшцйjwcy] { ю > u ;"
        "[уеёыаоэяиюьъiuoeaq] { ю > yu ;"
        "ю > hu ;"
        # я
        "$wb { я > ya ;"
        "[жшцйjwcy] { я > a ;"
        "[уеёыаоэяиюьъiuoeaq] { я > ya ;"
        "я > ha ;"
        # The letter combination ьо, only in loanwords
        "ньо > nyo ;"
        "льо > lyo ;"
        "мьо > myo ;"
        "рьо > ryo ;"
        # Remaining letters
        "а > a ;"
        "б > b ;"
        "в > v ;"
        "г > g ;"
        "д > d ;"
        "ж > j ;"
        "з > z ;"
        "и > i ;"
        "й > y ;"
        "к > k ;"
        "л > l ;"
        "м > m ;"
        "н > n ;"
        "о > o ;"
        "п > p ;"
        "р > r ;"
        "с > s ;"
        "т > t ;"
        "у > u ;"
        "ф > f ;"
        "х > x ;"
        "ц > c ;"
        "ч > ch ;"
        "ш > w ;"
        "щ > wh ;"
        # Pass from the beginning
        ":: Any-Null ;"
        "[nlmr] { ь } y[aueioq] > ;"
        "ь > h ;"
        "[nlmr] { ъ } y[aueioq] > y;"
        "ъ > ;"
        # Pass from the beginning
        ":: Any-Null ;"
        "h+ > h ;")
    ustring = trans.transliterate(ustring)
    return ustring
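A brief usage sketch for the function above (hypothetical input; the expected result follows from the rules listed):

result = to_latin("проспект")
print(str(result))   # expected output: "prospekt" (the function returns an ICU UnicodeString)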