def _latinize_internal(text, ascii=False):
    """Transliterate *text* to the latin script via ICU.

    :param text: the text to transliterate.
    :param ascii: when True, additionally strip the result down to ASCII.
    :return: the transliterated text.
    """
    if ascii:
        # Build the heavy ICU transliterator once and cache it on this
        # function itself.  (Bug fix: the original cached on
        # ``latinize_text`` — a different function that may not even be
        # defined in this module, causing a NameError.)
        if not hasattr(_latinize_internal, '_ascii'):
            # Transform to latin, separate accents, decompose, remove
            # symbols, compose, push to ASCII
            _latinize_internal._ascii = Transliterator.createInstance(
                'Any-Latin; NFKD; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; NFKC; Accents-Any; Latin-ASCII'
            )  # noqa
        return _latinize_internal._ascii.transliterate(text)
    if not hasattr(_latinize_internal, '_tr'):
        _latinize_internal._tr = Transliterator.createInstance('Any-Latin')
    return _latinize_internal._tr.transliterate(text)
def main(argv):
    """Convert a Zawgyi-encoded Burmese text file to Unicode.

    Parses ``-i/--ifile`` and ``-o/--ofile`` options from *argv*; when no
    output file is given, writes to ``converted_<inputfile>``.  Exits with
    status 2 on bad options.
    """
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        # Fixed: Python 2 ``print`` statements are a syntax error on Python 3.
        print('test.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('zg-my.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
            # Default the output name unless -o overrides it later.
            outputfile = "converted_" + inputfile
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    print('Input file is ', inputfile)
    print('Output file is ', outputfile)
    uni = Transliterator.createInstance('Zawgyi-my')
    # Context managers guarantee the handles are closed even on error.
    # NOTE(review): assumes the input file is UTF-8 encoded — confirm.
    with open(inputfile, "r", encoding="utf-8") as f:
        converted = uni.transliterate(f.read())
    # Write in binary mode: the original wrote UTF-8 *bytes*, which fails
    # on a Python 3 text-mode handle.
    with open(outputfile, "wb") as fo:
        fo.write(converted.encode('utf8'))
def compose_nfc(text):
    """Perform unicode composition."""
    if text is None:
        return None
    # Lazily build the NFC transliterator and memoise it on the function.
    transliterator = getattr(compose_nfc, '_tr', None)
    if transliterator is None:
        transliterator = Transliterator.createInstance('Any-NFC')
        compose_nfc._tr = transliterator
    return transliterator.transliterate(text)
def decompose_nfkd(text):
    """Perform unicode compatibility decomposition.

    This will replace some non-standard value representations in unicode
    and normalise them, while also separating characters and their
    diacritics into two separate codepoints.
    """
    if text is None:
        return None
    cached = getattr(decompose_nfkd, '_tr', None)
    if cached is None:
        # Construct the ICU transliterator only once per process.
        cached = Transliterator.createInstance('Any-NFKD')
        decompose_nfkd._tr = cached
    return cached.transliterate(text)
def latinize_text(text, ascii=False):
    """Transliterate the given text to the latin script.

    This attempts to convert a given text to latin script using the
    closest match of characters vis a vis the original script.
    """
    # Anything that is not a non-empty string is passed through untouched
    # (covers None, non-string types and the empty string).
    if not isinstance(text, six.string_types) or not text:
        return text
    if ascii:
        transliterator = getattr(latinize_text, '_ascii', None)
        if transliterator is None:
            # Transform to latin, separate accents, decompose, remove
            # symbols, compose, push to ASCII
            transliterator = Transliterator.createInstance(
                'Any-Latin; NFKD; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; NFKC; Accents-Any; Latin-ASCII'
            )  # noqa
            latinize_text._ascii = transliterator
        return transliterator.transliterate(text)
    transliterator = getattr(latinize_text, '_tr', None)
    if transliterator is None:
        transliterator = Transliterator.createInstance('Any-Latin')
        latinize_text._tr = transliterator
    return transliterator.transliterate(text)
def make_transliterator(script):
    """Return a callable that transliterates text for *script*.

    Prefers ICU (pyicu); when it is not installed, warns and falls back
    to ``text_unidecode``.

    :param script: ICU transform identifier, e.g. ``'Any-Latin'``.
    :return: a ``text -> text`` callable.
    """
    try:
        from icu import Transliterator
        inst = Transliterator.createInstance(script)
        return inst.transliterate
    except ImportError:
        from text_unidecode import unidecode
        warnings.warn("Install 'pyicu' for better text transliteration.",
                      ICUWarning, stacklevel=4)  # noqa

        def transliterate(text):
            text = compose_nfkc(text)
            # Fixed: compose_nfkc can return None (see make_trans, which
            # guards this) and unidecode(None) would crash.
            if text is None:
                return None
            return unidecode(text)
        return transliterate
def make_trans(script: str) -> Callable[[str], Optional[str]]:
    """Build a transliteration callable for the given ICU transform *script*.

    Uses ICU when available; otherwise emits a warning and falls back to
    ``text_unidecode``.
    """
    try:
        from icu import Transliterator  # type: ignore

        engine = Transliterator.createInstance(script)
        return cast(Callable[[str], str], engine.transliterate)
    except ImportError:
        from text_unidecode import unidecode  # type: ignore

        warnings.warn("Install 'pyicu' for better text transliteration.",
                      ICUWarning, stacklevel=4)  # noqa

        def transliterate(text: str) -> Optional[str]:
            # Compose first; propagate None unchanged.
            normalized = compose_nfkc(text)
            if normalized is None:
                return None
            return cast(Optional[str], unidecode(normalized))

        return transliterate
def _getTransliterator(self, name):
    """Create a forward-direction ICU transliterator for transform *name*."""
    transliterator = Transliterator.createInstance(name, UTransDirection.FORWARD)
    return transliterator
# -*- coding: utf-8 -*- """ Transliterating text to International Phonetic Alphabet (IPA) Using International Components for Unicode (ICU) https://github.com/ovalhub/pyicu """ from icu import Transliterator _ICU_THAI_TO_LATIN = Transliterator.createInstance("Thai-Latin") # ถอดเสียงภาษาไทยเป็นอักษรละติน def transliterate(text: str) -> str: """ Use ICU (International Components for Unicode) for transliteration ถอดเสียงภาษาไทยเป็นอักษรละติน รับค่า ''str'' ข้อความ คืนค่า ''str'' อักษรละติน :param str text: Thai text to be transliterated. :return: A string of Internaitonal Phonetic Alphabets indicating how the text should be pronounced. """ return _ICU_THAI_TO_LATIN.transliterate(text)
import os
import re

import six
import yaml

from icu import Transliterator

# Page size used when iterating over data in chunks.
# NOTE(review): meaning inferred from the name only — confirm at call sites.
DATA_PAGE = 10000
# Raw string fixes the invalid escape sequence ``\s`` (a SyntaxWarning on
# Python 3.12+); the compiled pattern is unchanged.
WS_PATTERN = re.compile(r'\s+')
# Module-level latin transliterator, built once at import time.
tr = Transliterator.createInstance('Any-Latin')


def resolve_includes(file_path, data):
    """Handle include statements in the configuration file.

    Recursively walks *data*; any dict may carry an ``include`` key naming
    one or more files (resolved relative to *file_path*'s directory) whose
    contents are merged into that dict before its values are processed.

    :param file_path: path of the file *data* was loaded from.
    :param data: parsed configuration value (dict, list or scalar).
    :return: the configuration with all includes merged in.
    """
    if isinstance(data, (list, tuple, set)):
        # Note: always yields a list, even for tuple/set input.
        data = [resolve_includes(file_path, i) for i in data]
    elif isinstance(data, dict):
        # ``include`` may be a single path or a list of paths.
        include_paths = data.pop('include', [])
        if not isinstance(include_paths, (list, tuple, set)):
            include_paths = [include_paths]
        for include_path in include_paths:
            dir_prefix = os.path.dirname(file_path)
            include_path = os.path.join(dir_prefix, include_path)
            data.update(load_config_file(include_path))
        for key, value in data.items():
            data[key] = resolve_includes(file_path, value)
    return data


def load_config_file(file_path):
    """Load a YAML (or JSON) model configuration file."""
def _getTransliterator(self, name):
    # Build a transliterator that applies *name* in the forward direction.
    direction = UTransDirection.FORWARD
    return Transliterator.createInstance(name, direction)
def _decompose_nfkd(text):
    """Apply unicode NFKD (compatibility) decomposition to *text*."""
    engine = getattr(_decompose_nfkd, '_tr', None)
    if engine is None:
        # Build the ICU transliterator once and memoise it.
        engine = Transliterator.createInstance('Any-NFKD')
        _decompose_nfkd._tr = engine
    return engine.transliterate(text)
def _compose_nfc(text):
    """Apply unicode NFC composition to *text*."""
    engine = getattr(_compose_nfc, '_tr', None)
    if engine is None:
        # Build the ICU transliterator once and memoise it.
        engine = Transliterator.createInstance('Any-NFC')
        _compose_nfc._tr = engine
    return engine.transliterate(text)
flags.DEFINE_bool("build_fasttext", False, "build fasttext features") flags.DEFINE_bool("build_tfrecord", False, "build tensorflow record input files") flags.DEFINE_integer("nrows", 100, "The TOP number of rows to query") prog = re.compile("[\\W\\d]", re.UNICODE) prog_with_digits = re.compile("[\\W]", re.UNICODE) stemmer = SnowballStemmer("russian", ignore_stopwords=True) float_prog = re.compile(r"[-+]?\d*\.\d+|\d+", re.UNICODE) dot_prog = re.compile(r'[xх*]', re.UNICODE) TransTable = str.maketrans(dict.fromkeys(r'~/-\[\]()|{}:^+', ' ')) wt = WordTokenizer() trans = Transliterator.createInstance('Latin-Cyrillic') unit_lookup = { 'г': 'грамм', 'грам': 'грамм', 'гр': 'грамм', 'грамм': 'грамм', 'gr': 'грамм', 'ml': 'мл', 'милл': 'мл', 'млитр': 'мл', 'млтр': 'мл', 'мл': 'мл', 'ш': 'шт', 'шт': 'шт', 'тон': 'тонна',