def _latinize_internal(text, ascii=False):
    if ascii:
        if not hasattr(latinize_text, '_ascii'):
            # Transform to latin, separate accents, decompose, remove
            # symbols, compose, push to ASCII
            latinize_text._ascii = Transliterator.createInstance(
                'Any-Latin; NFKD; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; NFKC; Accents-Any; Latin-ASCII'
            )  # noqa
        return latinize_text._ascii.transliterate(text)

    if not hasattr(latinize_text, '_tr'):
        latinize_text._tr = Transliterator.createInstance('Any-Latin')
    return latinize_text._tr.transliterate(text)
Example #2
    def __init__(self, norm_rules, trans_rules, analysis_rules):
        self.normalizer = Transliterator.createFromRules(
            "icu_normalization", norm_rules)
        trans_rules += ";[:Space:]+ > ' '"
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       trans_rules)
        self.search = Transliterator.createFromRules("icu_search",
                                                     norm_rules + trans_rules)

        self.analysis = {
            name: arules.create(self.to_ascii, arules.config)
            for name, arules in analysis_rules.items()
        }
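The rule appended to trans_rules, [:Space:]+ > ' ', collapses any run of whitespace into a single space before the search transliterator is built. A minimal sketch of that rule in isolation (the rule string and names here are made up for illustration, not Nominatim's actual configuration):

from icu import Transliterator

# Collapse any run of whitespace characters into a single space.
rules = "[:Space:]+ > ' ';"
collapse = Transliterator.createFromRules("collapse-spaces", rules)
print(collapse.transliterate("Baum \t  Strasse"))   # "Baum Strasse"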
Example #3
def compose_nfc(text):
    """Perform unicode composition."""
    if text is None:
        return None
    if not hasattr(compose_nfc, '_tr'):
        compose_nfc._tr = Transliterator.createInstance('Any-NFC')
    return compose_nfc._tr.transliterate(text)
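A quick check of what the helper does, assuming compose_nfc from the example above is in scope: NFC composition merges a base letter and a combining accent into a single code point.

decomposed = "e\u0301"                  # 'e' followed by COMBINING ACUTE ACCENT (two code points)
composed = compose_nfc(decomposed)
print(len(decomposed), len(composed))   # 2 1
print(composed == "\u00e9")             # True: the single code point 'é'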
Example #4
    def test_get_transliteration_rules(self):
        self.config_rules()
        loader = ICURuleLoader(self.project_env)
        rules = loader.get_transliteration_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
Example #5
def main(argv):
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print('test.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('zg-my.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
            outputfile = "converted_" + inputfile
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    print('Input file is', inputfile)
    print('Output file is', outputfile)

    # Convert Zawgyi-encoded Burmese text to standard Unicode.
    uni = Transliterator.createInstance('Zawgyi-my')

    with open(inputfile, "r", encoding="utf8") as f:
        converted = uni.transliterate(f.read())

    with open(outputfile, "w", encoding="utf8") as fo:
        fo.write(converted)
Example #6
    def testCustomFunctionality(self):

        # convert a's to b's and b's to c's
        rules = "a > b; b > c;"
        self._checkToken(
            Transliterator.createFromRules("test", rules,
                                           UTransDirection.FORWARD),
            "abacadaba", "bcbcbdbcb")
Example #7
    def testCustomFunctionality2(self):

        # convert a's to b's and b's to c's
        rules = "c { a > b; a > d;"
        self._checkToken(
            Transliterator.createFromRules("test", rules,
                                           UTransDirection.FORWARD), "caa",
            "cbd")
Example #8
def make_analyser(*variants, variant_only=False):
    rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
    if variant_only:
        rules['mode'] = 'variant-only'
    config = module.configure(rules, DEFAULT_NORMALIZATION)
    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

    return module.create(trans, config)
Example #9
def test_no_variants():
    rules = { 'analyzer': 'generic' }
    config = module.configure(rules, DEFAULT_NORMALIZATION)
    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

    proc = module.create(trans, config)

    assert get_normalized_variants(proc, '大德!') == ['dà dé']
Example #10
    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokinzer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyser.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("trans", self.transliteration)
        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
Example #11
def decompose_nfkd(text):
    """Perform unicode compatibility decomposition.

    This will replace some non-standard value representations in unicode and
    normalise them, while also separating characters and their diacritics into
    two separate codepoints.
    """
    if text is None:
        return None
    if not hasattr(decompose_nfkd, '_tr'):
        decompose_nfkd._tr = Transliterator.createInstance('Any-NFKD')
    return decompose_nfkd._tr.transliterate(text)
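Two quick checks of what compatibility decomposition does, assuming decompose_nfkd from the example above is in scope: the ligature 'ﬁ' becomes plain 'fi', and 'é' is split into a base letter plus a combining accent.

print(decompose_nfkd("ﬁ"))                 # "fi": the ligature is replaced by ordinary letters
print(decompose_nfkd("é") == "e\u0301")    # True: base letter plus COMBINING ACUTE ACCENT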
Example #12
def latinize_text(text, ascii=False):
    """Transliterate the given text to the latin script.

    This attempts to convert a given text to latin script using the
    closest match of characters vis a vis the original script.
    """
    if text is None or not isinstance(text, six.string_types) or not len(text):
        return text

    if ascii:
        if not hasattr(latinize_text, '_ascii'):
            # Transform to latin, separate accents, decompose, remove
            # symbols, compose, push to ASCII
            latinize_text._ascii = Transliterator.createInstance(
                'Any-Latin; NFKD; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; NFKC; Accents-Any; Latin-ASCII'
            )  # noqa
        return latinize_text._ascii.transliterate(text)

    if not hasattr(latinize_text, '_tr'):
        latinize_text._tr = Transliterator.createInstance('Any-Latin')
    return latinize_text._tr.transliterate(text)
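A rough idea of what the two modes of latinize_text return; the exact output can vary slightly between ICU versions:

print(latinize_text("Αθήνα"))               # e.g. "Athḗna" (mapped to Latin script, accents kept)
print(latinize_text("Αθήνα", ascii=True))   # e.g. "Athena" (accents and symbols stripped, plain ASCII)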
Example #13
    def __init__(self, config, phplib_dir, db_connection) -> None:
        self.db_connection = db_connection
        self.config = config
        self.phplib_dir = phplib_dir
        self.black_list, self.white_list = self._load_white_and_black_lists()
        # Compile the regex here to improve performance.
        self.occurence_pattern = re.compile(
            r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
        )
        self.sanity_check_pattern = re.compile(r'^\w+$')
        self.transliterator = Transliterator.createFromRules(
            "special-phrases normalizer", self.config.TERM_NORMALIZATION)
Example #14
def test_get_search_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())

    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
Example #15
    def test_get_search_rules(self):
        self.config_rules()
        loader = ICURuleLoader(self.project_env)

        rules = loader.get_search_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" Baum straße ") == " baum straße "
        assert trans.transliterate(" Baumstraße ") == " baumstraße "
        assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
        assert trans.transliterate(" Baumstr ") == " baumstr "
        assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
        assert trans.transliterate(" Αθήνα ") == " athēna "
        assert trans.transliterate(" проспект ") == " prospekt "
Example #16
def make_transliterator(script):
    try:
        from icu import Transliterator
        inst = Transliterator.createInstance(script)
        return inst.transliterate
    except ImportError:
        from text_unidecode import unidecode
        warnings.warn("Install 'pyicu' for better text transliteration.",
                      ICUWarning,
                      stacklevel=4)  # noqa

        def transliterate(text):
            text = compose_nfkc(text)
            return unidecode(text)

        return transliterate
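Typical use of the factory above: if PyICU is importable, the returned callable is an ICU transliterator for the requested transform, otherwise it falls back to text_unidecode. The transform ID below is an ordinary ICU compound ID, not something defined by this module:

latinize = make_transliterator('Any-Latin; Latin-ASCII')
print(latinize("Москва"))   # e.g. "Moskva"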
Example #17
    def test_transliteration_rules_from_file(self):
        self.write_config("""\
            normalization:
            transliteration:
                - "'ax' > 'b'"
                - !include transliteration.yaml
            token-analysis:
                - analyzer: generic
                  variants:
            """)
        transpath = self.project_env.project_dir / ('transliteration.yaml')
        transpath.write_text('- "x > y"')

        loader = ICURuleLoader(self.project_env)
        rules = loader.get_transliteration_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" axxt ") == " byt "
Example #18
def test_transliteration_rules_from_file(test_config):
    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
    cfgpath.write_text(
        dedent("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        token-analysis:
            - analyzer: generic
              variants:
        """))
    transpath = test_config.project_dir / ('transliteration.yaml')
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(test_config)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
Example #19
def make_trans(script: str) -> Callable[[str], Optional[str]]:
    try:
        from icu import Transliterator  # type: ignore

        inst = Transliterator.createInstance(script)
        return cast(Callable[[str], str], inst.transliterate)
    except ImportError:
        from text_unidecode import unidecode  # type: ignore

        warnings.warn("Install 'pyicu' for better text transliteration.",
                      ICUWarning,
                      stacklevel=4)  # noqa

        def transliterate(text: str) -> Optional[str]:
            clean = compose_nfkc(text)
            if clean is None:
                return None
            return cast(Optional[str], unidecode(clean))

        return transliterate
Example #20
    def __init__(self, config, phplib_dir, db_connection) -> None:
        self.db_connection = db_connection
        self.config = config
        self.phplib_dir = phplib_dir
        self.black_list, self.white_list = self._load_white_and_black_lists()
        # Compile the regex here to improve performance.
        self.occurence_pattern = re.compile(
            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
        )
        self.sanity_check_pattern = re.compile(r'^\w+$')
        self.transliterator = Transliterator.createFromRules(
            "special-phrases normalizer", self.config.TERM_NORMALIZATION)
        # This set will contain all existing phrases from the word table which
        # no longer exist on the wiki.
        # It contains tuples with the following format: (normalized_word, class, type, operator)
        self.words_phrases_to_delete = set()
        # This set will contain the phrases which still exist on the wiki.
        # It is used to prevent duplicates by removing them from
        # the word_phrases_to_delete only at the end.
        self.words_phrases_still_exist = set()
        # This set will contain all existing place_classtype tables which don't match any
        # special phrase class/type on the wiki.
        self.table_phrases_to_delete = set()
Example #21
def _compose_nfc(text):
    if not hasattr(_compose_nfc, '_tr'):
        _compose_nfc._tr = Transliterator.createInstance('Any-NFC')
    return _compose_nfc._tr.transliterate(text)
Example #22
def get_normalized_variants(proc, name):
    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
    return proc.get_variants_ascii(norm.transliterate(name).strip())
Example #23
# -*- coding: utf-8 -*-
"""
Transliterating text to International Phonetic Alphabet (IPA)
Using International Components for Unicode (ICU)
https://github.com/ovalhub/pyicu
"""
from icu import Transliterator

_ICU_THAI_TO_LATIN = Transliterator.createInstance("Thai-Latin")


# Transliterate Thai text into the Latin alphabet.
def transliterate(text: str) -> str:
    """
    Use ICU (International Components for Unicode) for transliteration.
    Transliterates Thai text into the Latin alphabet: takes a ''str'' of Thai text and returns a ''str'' of Latin characters.
    :param str text: Thai text to be transliterated.
    :return: A string of International Phonetic Alphabet characters indicating how the text should be pronounced.
    """
    return _ICU_THAI_TO_LATIN.transliterate(text)
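A usage sketch for the helper above; the exact romanization string depends on the ICU data version installed, so only the general behaviour is shown:

text = "ภาษาไทย"               # "Thai language"
print(transliterate(text))     # a Latin-script romanization; exact output depends on the ICU version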
Example #24
import os
import re
import six
import yaml
from icu import Transliterator


DATA_PAGE = 10000
WS_PATTERN = re.compile(r'\s+')
tr = Transliterator.createInstance('Any-Latin')


def resolve_includes(file_path, data):
    """Handle include statements in the configuration file."""
    if isinstance(data, (list, tuple, set)):
        data = [resolve_includes(file_path, i) for i in data]
    elif isinstance(data, dict):
        include_paths = data.pop('include', [])
        if not isinstance(include_paths, (list, tuple, set)):
            include_paths = [include_paths]
        for include_path in include_paths:
            dir_prefix = os.path.dirname(file_path)
            include_path = os.path.join(dir_prefix, include_path)
            data.update(load_config_file(include_path))
        for key, value in data.items():
            data[key] = resolve_includes(file_path, value)
    return data


def load_config_file(file_path):
    """Load a YAML (or JSON) model configuration file."""
Example #25
    def __init__(self, norm_rules):
        self.norm = Transliterator.createFromRules(
            "rule_loader_normalization", norm_rules)
Example #26
    def _getTransliterator(self, name):

        return Transliterator.createInstance(name, UTransDirection.FORWARD)
Example #27
def _decompose_nfkd(text):
    if not hasattr(_decompose_nfkd, '_tr'):
        _decompose_nfkd._tr = Transliterator.createInstance('Any-NFKD')
    return _decompose_nfkd._tr.transliterate(text)
Example #28
def to_latin(string, locale=locale):
    ustring = UnicodeString(string)
    nfc = Normalizer2.getNFCInstance()
    ustring = nfc.normalize(ustring)

    trans = Transliterator.createFromRules(
        "",
        "$wb = [^[:Letter:]] ;"
        # е
        "$wb { е > ye ;"
        "[ыq] { е } $wb > e ;"
        "[уеёыаоэяиюьъiuoeaq] { е > ye ;"
        "е > e ;"
        # э
        "$wb { э > e ;"
        "[жшцйjwcy] { э > е ;"
        "э > qe ;"
        # ы
        "[жшцйjwcy] { ы > i ;"
        "ы > q ;"
        # ё
        "$wb { ё > yo ;"
        "[жшцйjwcy] { ё > o ;"
        "[уеёыаоэяиюьъiuoeaq] { ё > yo ;"
        "ё > ho ;"
        # ю
        "$wb { ю > yu ;"
        "[жшцйjwcy] { ю > u ;"
        "[уеёыаоэяиюьъiuoeaq] { ю > yu ;"
        "ю > hu ;"
        # я
        "$wb { я > ya ;"
        "[жшцйjwcy] { я > a ;"
        "[уеёыаоэяиюьъiuoeaq] { я > ya ;"
        "я > ha ;"
        # The letter combination ьо, only found in loanwords
        "ньо > nyo ;"
        "льо > lyo ;"
        "мьо > myo ;"
        "рьо > ryo ;"
        # The remaining letters
        "а > a ;"
        "б > b ;"
        "в > v ;"
        "г > g ;"
        "д > d ;"
        "ж > j ;"
        "з > z ;"
        "и > i ;"
        "й > y ;"
        "к > k ;"
        "л > l ;"
        "м > m ;"
        "н > n ;"
        "о > o ;"
        "п > p ;"
        "р > r ;"
        "с > s ;"
        "т > t ;"
        "у > u ;"
        "ф > f ;"
        "х > x ;"
        "ц > c ;"
        "ч > ch ;"
        "ш > w ;"
        "щ > wh ;"
        # Start a new pass from the beginning
        ":: Any-Null ;"
        "[nlmr] { ь } y[aueioq] > ;"
        "ь > h ;"
        "[nlmr] { ъ } y[aueioq] > y;"
        "ъ > ;"
        # Start a new pass from the beginning
        ":: Any-Null ;"
        "h+ > h ;")
    ustring = trans.transliterate(ustring)
    return ustring
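A usage sketch for the rule set above, assuming to_latin as defined (and that the module-level locale it references exists); lowercase input is used because the rules only map lowercase Cyrillic letters:

print(to_latin("москва"))   # "moskva"
print(to_latin("щука"))     # "whuka": щ maps to "wh" under these rules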
Example #29
    def testCustomFunctionality(self):

        # convert a's to b's and b's to c's        
        rules = "a > b; b > c;"
        self._checkToken(Transliterator.createFromRules("test", rules, UTransDirection.FORWARD), "abacadaba", "bcbcbdbcb")
Example #30
    def _getTransliterator(self, name):

        return Transliterator.createInstance(name, UTransDirection.FORWARD)
Example #31
def test_get_normalization_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_normalization_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
Example #32
    def testCustomFunctionality2(self):

        # convert a's to b's and b's to c's
        rules = "c { a > b; a > d;"
        self._checkToken(Transliterator.createFromRules("test", rules, UTransDirection.FORWARD), "caa", "cbd")
Example #33
flags.DEFINE_bool("build_fasttext", False, "build fasttext features")
flags.DEFINE_bool("build_tfrecord", False,
                  "build tensorflow record input files")
flags.DEFINE_integer("nrows", 100, "The TOP number of rows to query")

prog = re.compile("[\\W\\d]", re.UNICODE)
prog_with_digits = re.compile("[\\W]", re.UNICODE)

stemmer = SnowballStemmer("russian", ignore_stopwords=True)

float_prog = re.compile(r"[-+]?\d*\.\d+|\d+", re.UNICODE)
dot_prog = re.compile(r'[xх*]', re.UNICODE)

TransTable = str.maketrans(dict.fromkeys(r'~/-\[\]()|{}:^+', ' '))
wt = WordTokenizer()
trans = Transliterator.createInstance('Latin-Cyrillic')

unit_lookup = {
    'г': 'грамм',
    'грам': 'грамм',
    'гр': 'грамм',
    'грамм': 'грамм',
    'gr': 'грамм',
    'ml': 'мл',
    'милл': 'мл',
    'млитр': 'мл',
    'млтр': 'мл',
    'мл': 'мл',
    'ш': 'шт',
    'шт': 'шт',
    'тон': 'тонна',