def __init__(self, norm_rules, trans_rules, analysis_rules):
    self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                     norm_rules)
    trans_rules += ";[:Space:]+ > ' '"
    self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                   trans_rules)
    self.search = Transliterator.createFromRules("icu_search",
                                                 norm_rules + trans_rules)

    self.analysis = {name: arules.create(self.to_ascii, arules.config)
                     for name, arules in analysis_rules.items()}
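For orientation: `Transliterator.createFromRules` compiles an ICU rule string into a transliterator whose `transliterate()` method applies it. A minimal self-contained sketch of the same chaining idea; the rule strings below are illustrative stand-ins, not the rule files the real tokenizer loads:

from icu import Transliterator

# Stand-in rules for illustration only; real norm_rules/trans_rules
# come from the tokenizer configuration.
norm_rules = ":: NFD; :: Lower; :: NFC;"
trans_rules = ":: Any-Latin; [:Space:]+ > ' ';"

normalizer = Transliterator.createFromRules("demo_norm", norm_rules)
to_ascii = Transliterator.createFromRules("demo_trans", trans_rules)

print(normalizer.transliterate("Straße"))   # straße
print(to_ascii.transliterate("проспект"))   # prospekt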
def test_get_transliteration_rules(self):
    self.config_rules()
    loader = ICURuleLoader(self.project_env)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
def testCustomFunctionality(self):
    # Convert a's to b's and b's to c's.
    rules = "a > b; b > c;"
    self._checkToken(
        Transliterator.createFromRules("test", rules, UTransDirection.FORWARD),
        "abacadaba", "bcbcbdbcb")
def testCustomFunctionality2(self):
    # An 'a' preceded by 'c' becomes 'b'; any other 'a' becomes 'd'.
    rules = "c { a > b; a > d;"
    self._checkToken(
        Transliterator.createFromRules("test", rules, UTransDirection.FORWARD),
        "caa", "cbd")
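The `c {` in the rule above is an ICU pre-context: the `a` is rewritten only when a `c` precedes it, and the `c` itself is left untouched. A standalone sketch of the same behavior, assuming only PyICU is installed:

from icu import Transliterator, UTransDirection

# 'a' preceded by 'c' becomes 'b'; any other 'a' falls through to 'd'.
trans = Transliterator.createFromRules(
    "ctx_demo", "c { a > b; a > d;", UTransDirection.FORWARD)

assert trans.transliterate("caa") == "cbd"
assert trans.transliterate("aca") == "dcb"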
def make_analyser(*variants, variant_only=False):
    rules = {'analyzer': 'generic', 'variants': [{'words': variants}]}
    if variant_only:
        rules['mode'] = 'variant-only'
    config = module.configure(rules, DEFAULT_NORMALIZATION)
    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

    return module.create(trans, config)
def test_no_variants():
    rules = {'analyzer': 'generic'}
    config = module.configure(rules, DEFAULT_NORMALIZATION)
    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
    proc = module.create(trans, config)

    assert get_normalized_variants(proc, '大德!') == ['dà dé']
def name_analyzer(self):
    """ Create a new analyzer for tokenizing names and queries
        using this tokenizer. Analyzers are context managers and
        should be used accordingly:

        ```
        with tokenizer.name_analyzer() as analyzer:
            analyzer.tokenize()
        ```

        When used outside the 'with' construct, the caller must
        make sure to call close() before destroying the analyzer.

        Analyzers are not thread-safe. You need to instantiate one
        per thread.
    """
    norm = Transliterator.createFromRules("normalizer", self.normalization)
    trans = Transliterator.createFromRules("trans", self.transliteration)
    return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
def __init__(self, config, phplib_dir, db_connection) -> None:
    self.db_connection = db_connection
    self.config = config
    self.phplib_dir = phplib_dir
    self.black_list, self.white_list = self._load_white_and_black_lists()
    # Compile the regex here to improve performance.
    self.occurence_pattern = re.compile(
        r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
    )
    self.sanity_check_pattern = re.compile(r'^\w+$')
    self.transliterator = Transliterator.createFromRules(
        "special-phrases normalizer", self.config.TERM_NORMALIZATION)
def test_get_search_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
def test_get_search_rules(self):
    self.config_rules()
    loader = ICURuleLoader(self.project_env)
    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
def test_transliteration_rules_from_file(self):
    self.write_config("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        token-analysis:
            - analyzer: generic
              variants:
        """)
    transpath = self.project_env.project_dir / 'transliteration.yaml'
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(self.project_env)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
def test_transliteration_rules_from_file(test_config):
    cfgpath = test_config.project_dir / 'icu_tokenizer.yaml'
    cfgpath.write_text(dedent("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        token-analysis:
            - analyzer: generic
              variants:
        """))
    transpath = test_config.project_dir / 'transliteration.yaml'
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(test_config)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
def __init__(self, config, phplib_dir, db_connection) -> None:
    self.db_connection = db_connection
    self.config = config
    self.phplib_dir = phplib_dir
    self.black_list, self.white_list = self._load_white_and_black_lists()
    # Compile the regex here to improve performance.
    self.occurence_pattern = re.compile(
        r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
    )
    self.sanity_check_pattern = re.compile(r'^\w+$')
    self.transliterator = Transliterator.createFromRules(
        "special-phrases normalizer", self.config.TERM_NORMALIZATION)
    # This set will contain all phrases from the word table which no
    # longer exist on the wiki. It contains tuples of the form
    # (normalized_word, class, type, operator).
    self.words_phrases_to_delete = set()
    # This set will contain the phrases which still exist on the wiki.
    # It is used to avoid deleting phrases that are still valid: they are
    # removed from words_phrases_to_delete only at the end.
    self.words_phrases_still_exist = set()
    # This set will contain all existing place_classtype tables which
    # don't match any special-phrase class/type on the wiki.
    self.table_phrases_to_delete = set()
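The compiled regex parses rows of the special-phrases wiki table, whose cells are separated by `||`. A quick sketch of what it captures; the row text here is made up for illustration:

import re

occurence_pattern = re.compile(
    r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])')

# Hypothetical wiki-table row: phrase, class, type, operator, plural flag.
row = "| Zip Line || aerialway || zip_line || - || N"
match = occurence_pattern.search(row)
print(match.groups())   # ('Zip Line ', 'aerialway ', 'zip_line ', '- ', 'N')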
def test_get_normalization_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_normalization_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
def get_normalized_variants(proc, name):
    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
    return proc.get_variants_ascii(norm.transliterate(name).strip())
def __init__(self, norm_rules):
    self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                               norm_rules)
def to_latin(string, locale=locale):
    ustring = UnicodeString(string)
    nfc = Normalizer2.getNFCInstance()
    ustring = nfc.normalize(ustring)
    trans = Transliterator.createFromRules(
        "",
        "$wb = [^[:Letter:]] ;"
        # е
        "$wb { е > ye ;"
        "[ыq] { е } $wb > e ;"
        "[уеёыаоэяиюьъiuoeaq] { е > ye ;"
        "е > e ;"
        # э
        "$wb { э > e ;"
        "[жшцйjwcy] { э > е ;"
        "э > qe ;"
        # ы
        "[жшцйjwcy] { ы > i ;"
        "ы > q ;"
        # ё
        "$wb { ё > yo ;"
        "[жшцйjwcy] { ё > o ;"
        "[уеёыаоэяиюьъiuoeaq] { ё > yo ;"
        "ё > ho ;"
        # ю
        "$wb { ю > yu ;"
        "[жшцйjwcy] { ю > u ;"
        "[уеёыаоэяиюьъiuoeaq] { ю > yu ;"
        "ю > hu ;"
        # я
        "$wb { я > ya ;"
        "[жшцйjwcy] { я > a ;"
        "[уеёыаоэяиюьъiuoeaq] { я > ya ;"
        "я > ha ;"
        # The combination ьо occurs only in loanwords
        "ньо > nyo ;"
        "льо > lyo ;"
        "мьо > myo ;"
        "рьо > ryo ;"
        # Remaining letters
        "а > a ;"
        "б > b ;"
        "в > v ;"
        "г > g ;"
        "д > d ;"
        "ж > j ;"
        "з > z ;"
        "и > i ;"
        "й > y ;"
        "к > k ;"
        "л > l ;"
        "м > m ;"
        "н > n ;"
        "о > o ;"
        "п > p ;"
        "р > r ;"
        "с > s ;"
        "т > t ;"
        "у > u ;"
        "ф > f ;"
        "х > x ;"
        "ц > c ;"
        "ч > ch ;"
        "ш > w ;"
        "щ > wh ;"
        # New pass from the start
        ":: Any-Null ;"
        "[nlmr] { ь } y[aueioq] > ;"
        "ь > h ;"
        "[nlmr] { ъ } y[aueioq] > y;"
        "ъ > ;"
        # New pass from the start
        ":: Any-Null ;"
        "h+ > h ;")
    ustring = trans.transliterate(ustring)
    return ustring
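A quick check of the scheme above, tracing the rules by hand; the inputs are hypothetical, and since to_latin returns an icu.UnicodeString, we convert with str() before comparing:

assert str(to_latin("лес")) == "les"        # е after a plain consonant > e
assert str(to_latin("объект")) == "obyekt"  # е after ъ > ye; ъ deleted in pass 2
assert str(to_latin("щука")) == "whuka"     # щ > wh per the letter table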