def test_33_register_unregister(self):
    """Test register/unregister of language packs.

    Verifies that an already-registered language code cannot be replaced
    without ``force=True``, that a forced pack cannot be force-registered
    twice, and that unregistering only works for non-forced packs.
    """
    from transliterate.contrib.languages.hy.translit_language_pack import (
        ArmenianLanguagePack)

    class A(TranslitLanguagePack):
        language_code = "ru"
        language_name = "Example"
        mapping = data.test_33_register_unregister_mapping

    # Since key `ru` already exists in the registry it can't be replaced
    # (without force-register).
    res = registry.register(A)
    self.assertFalse(res)

    # Now with force-register it can.
    res = registry.register(A, force=True)
    self.assertTrue(res)

    # Once we have it there and it's forced, we can't register another.
    res = registry.register(A, force=True)
    self.assertFalse(res)

    # Unregister non-forced language pack.
    res = registry.unregister(ArmenianLanguagePack)
    self.assertTrue(res)
    self.assertNotIn(ArmenianLanguagePack.language_code,
                     get_available_language_codes())

    # A force-registered pack cannot be unregistered.
    res = registry.unregister(A)
    self.assertFalse(res)
    self.assertIn(A.language_code, get_available_language_codes())
def test_33_register_unregister(self):
    """Test register/unregister of language packs.

    Same scenario as the mapping-from-data variant, but with an inline
    digit-substitution mapping.
    """
    from transliterate.contrib.languages.hy.translit_language_pack import (
        ArmenianLanguagePack)

    class A(TranslitLanguagePack):
        language_code = "ru"
        language_name = "Example"
        mapping = (
            u"abcdefghij",
            u"1234567890",
        )

    # Since key `ru` already exists in the registry it can't be replaced
    # (without force-register).
    res = registry.register(A)
    self.assertFalse(res)

    # Now with force-register it can.
    res = registry.register(A, force=True)
    self.assertTrue(res)

    # Once we have it there and it's forced, we can't register another.
    res = registry.register(A, force=True)
    self.assertFalse(res)

    # Unregister non-forced language pack.
    res = registry.unregister(ArmenianLanguagePack)
    self.assertTrue(res)
    self.assertNotIn(ArmenianLanguagePack.language_code,
                     get_available_language_codes())

    # A force-registered pack cannot be unregistered.
    res = registry.unregister(A)
    self.assertFalse(res)
    self.assertIn(A.language_code, get_available_language_codes())
def main():
    # Csv2ldif (Python 2): read users.csv (semicolon-separated), build an
    # LDAP attribute dict per row, and render each user through a Jinja2
    # template to stdout.
    print "Csv2ldif started.."
    print get_available_language_codes()
    reader = unicode_csv_reader(open("users.csv"), delimiter=';')
    i = iter(reader)
    # skip csv header
    i.next()
    for row in i:
        user = {}
        # Expected column layout: organisation; department; last name;
        # first name; patronymic -- TODO confirm against the actual CSV.
        org = row[0]
        dep = row[1]
        last_name = row[2]
        first_name = row[3]
        sure_name = row[4]
        password = random_password()
        uid = get_uid(last_name, first_name)
        cn = get_common_name(last_name, first_name, sure_name)
        # Distinguished name; "Пользователи" is the Russian OU "Users".
        dn_str = u"CN={},OU={},OU={},OU=Пользователи,DC=edu,dc=knu,dc=kg".format(
            cn, org, dep)
        user["cn"] = encode(cn)
        user["dn"] = encode(dn_str)
        user["sn"] = encode(last_name)
        user["givenName"] = encode(first_name)
        user["displayName"] = encode(u"{} {} {}".format(
            last_name, first_name, sure_name))
        # NOTE(review): literal string "user_name" is encoded here, not a
        # variable -- presumably a bug; confirm intended value.
        user["name"] = encode("user_name")
        user["uid"] = uid
        user["unicodePwd"] = encode(password)
        # NOTE: template file is re-read on every row.
        template = Template(open("template.jinja2").read())
        print template.render(user)
    print "Csv2ldif finished."
def translate_text_table_to_en(text_table):
    """Return a new English TextTable built from *text_table*.

    Every text entry is transliterated to English; raises ValueError when
    the table's language has no transliteration support.
    """
    code = text_table.language_code
    if code not in transliterate.get_available_language_codes():
        raise ValueError(
            "Language code " + code
            + " not found in supported transliteration tables "
            + ",".join(transliterate.get_available_language_codes()))
    translated = []
    for text_id, text, text_type in text_table.texts:
        translated.append(
            (text_id,
             translate_text_to_en(text, text_table.language_code),
             text_type))
    return TextTable(translated, 'en')
def transliterate(first_name, last_name, full_name_native, search_helper):
    # Python 2 code (uses str.decode on byte strings).
    # Collects ASCII spellings of every whitespace-separated name token:
    # unidecode, reversed transliteration for each available language, and
    # an explicit German umlaut fold. Returns them space-joined and sorted.
    vocab = set(
        first_name.split(' ') + last_name.split(' ') +
        full_name_native.split(' ') + search_helper.split(' '))
    langs = get_available_language_codes()
    ascii_values = []
    # Unicode codepoint -> ASCII replacement for German umlauts.
    translate_table = {
        0xe4: ord('a'),
        0xc4: ord('A'),
        0xf6: ord('o'),
        0xd6: ord('O'),
        0xfc: ord('u'),
        0xdc: ord('U'),
    }
    for name in vocab:
        # Inputs are assumed to be UTF-8 byte strings -- TODO confirm.
        name = name.decode('utf-8')
        ascii_values.append(unidecode(name))
        for lang in langs:
            try:
                ascii_values.append(str(translit(name, lang, reversed=True)))
            except UnicodeEncodeError:
                # if we encounter other characters = other languages
                # than German
                pass
        try:
            # u'\xdf' is the German sharp s (ß).
            ascii_values.append(
                str(name.replace(u'\xdf', 'ss').translate(translate_table)))
        except UnicodeEncodeError:
            # if we encounter other characters = other languages than German
            pass
    return ' '.join(sorted(set(ascii_values))).strip()
def transliterate(first_name, last_name, full_name_native, search_helper):
    """Build a space-joined, sorted set of ASCII spellings for all name tokens.

    For every whitespace-separated token this collects: its unidecode form,
    the reversed transliteration for each available language, and a German
    umlaut/sharp-s fold.
    """
    tokens = set(
        first_name.split(' ')
        + last_name.split(' ')
        + full_name_native.split(' ')
        + search_helper.split(' ')
    )
    langs = get_available_language_codes()
    # Unicode codepoint -> ASCII replacement for German umlauts.
    umlaut_table = {
        0xe4: ord('a'),
        0xc4: ord('A'),
        0xf6: ord('o'),
        0xd6: ord('O'),
        0xfc: ord('u'),
        0xdc: ord('U'),
    }
    variants = []
    for token in tokens:
        token = token.decode('utf-8')
        variants.append(unidecode(token))
        for lang in langs:
            try:
                variants.append(str(translit(token, lang, reversed=True)))
            except UnicodeEncodeError:
                # other characters => other languages than German
                pass
        try:
            # u'\xdf' is the German sharp s (ß).
            variants.append(
                str(token.replace(u'\xdf', 'ss').translate(umlaut_table)))
        except UnicodeEncodeError:
            # other characters => other languages than German
            pass
    return ' '.join(sorted(set(variants))).strip()
def parse_args():
    """Build the command-line parser and return ``(args, output)``.

    ``output`` is always ``None`` here; it is kept so callers that unpack
    two values keep working.
    """
    codes = get_available_language_codes()
    description = (
        "Usage: python wordtranny.py <OPTIONS> \n"
        + "Available Languages: \n"
        + colors.red + str(codes) + colors.normal
    )
    parser = argparse.ArgumentParser(
        formatter_class=RawTextHelpFormatter, description=description)
    menu_group = parser.add_argument_group(
        colors.lightblue + 'Menu Options' + colors.normal)
    menu_group.add_argument(
        '-f', '--file', help="wordlist to open", required=True)
    menu_group.add_argument(
        '-l', '--language', help="language to convert to", required=True)
    menu_group.add_argument(
        '-o', '--outfile',
        help="outfile to write the converted text to", required=True)
    args = parser.parse_args()
    return args, None
def get_words(lang, file_obj):
    """Read a hunspell-format (utf-8) dictionary file object.

    Returns a cleaned-up list of unique words: comments and affix flags
    stripped, optionally transliterated, with superscript/subscript lines,
    non-alpha lines and oddly-capitalized lines (acronyms...) discarded.
    """
    words = set()
    # First line of a hunspell .dic file is the word count -- skip it.
    for raw in file_obj.readlines()[1:]:
        # Decode and drop everything after the affix separator '/'.
        word = raw.decode('utf-8').partition('/')[0]
        word = word.strip()
        if lang in get_available_language_codes():
            word = translit(word, lang, reversed=True)
        # Discard lines containing superscript/subscript digits.
        if any(ch in word for ch in '⁰¹²³⁴⁵⁶⁷⁸⁹'):
            continue
        # Discard non-alpha lines and non-normal capitalization (acronyms...).
        if not word.isalpha() or (len(word) > 1 and not word[1:].islower()):
            continue
        words.add(word)
    # Transform into a list to be able to save it as json.
    words.discard('')
    return list(words)
def test_01_get_available_language_codes(self):
    """Test ``autodiscover`` and ``get_available_language_codes``."""
    codes = get_available_language_codes()
    codes.sort()
    expected = sorted(['el', 'hy', 'ka', 'ru', 'uk', 'bg', 'mk', 'mn'])
    self.assertEqual(codes, expected)
    return codes
def process_args(args):
    # Streaming transliteration service: reads one JSON object per line
    # from stdin, transliterates its "text" field, and writes the
    # augmented object to stdout as UTF-8 encoded JSON.
    if sys.version_info[0] >= 3:
        ofp = codecs.getwriter('utf8')(sys.stdout.buffer)
    else:
        ofp = codecs.getwriter('utf8')(sys.stdout)
    # last language used
    lang = None
    # last language pack used
    translit = None
    for line in read_stdin():
        try:
            obj = json.loads(line)
            text = obj['text']
            source = obj['source']
            target = obj['target']
            # { "source" : "ru", "target" : "en", "text": "до свидания" }
            # { "source" : "en", "target": "ru", "text": "do svidanija" }
            # NOTE: `reversed` shadows the builtin of the same name.
            reversed = True if target == 'en' else False
            # The non-English side determines the language pack to use.
            lang = source if reversed is True else target
            translit = get_translit_function(lang)  # ru
            tline = translit(text, reversed=reversed)  # True
            # result json
            obj['lang'] = lang
            obj['reversed'] = reversed
            obj['trans'] = tline
            res = json.dumps(obj, ensure_ascii=False).encode('utf8')
        except Exception as ex:
            # On any failure, emit an error object listing supported codes.
            obj = {}
            obj['error'] = str(ex)
            obj['description'] = get_available_language_codes()
            res = json.dumps(obj, ensure_ascii=False).encode('utf8')
        # write to stdout
        try:
            # {"source": "ru", "trans": "Lorem ipsum dolor sit amet", "target": "en", "text": "Лорем ипсум долор сит амет"}
            ofp.writelines([res, '\n'])
            sys.stdout.flush()
        except Exception:
            pass
        except KeyboardInterrupt:
            # close files when process ends
            # NOTE(review): `ifp` is not defined in this function --
            # presumably a module-level input handle; confirm.
            ifp.close()
            ofp.close()
            # gracefully exit
            sys.exit(0)
    # close files
    ofp.close()
def test_01_get_available_language_codes(self):
    """Test ``autodiscover`` and ``get_available_language_codes``."""
    res = get_available_language_codes()
    res.sort()
    expected = ['el', 'hy', 'ka', 'ru', 'uk']  # 'he' currently excluded
    expected.sort()
    self.assertEqual(res, expected)
    return res
def transliterateField(request):
    """Transliterate the ``fieldData`` GET parameter with the
    'ukrTranslit' language pack.

    Returns a JsonResponse of the form ``{'transliteration': ...}``.
    """
    # Default to '' so a missing parameter doesn't raise AttributeError
    # on the .lower() call below.
    fieldData = request.GET.get('fieldData', '')
    # Normalize capitalization before transliterating.
    fieldData = fieldData.lower().title()
    data = {
        'transliteration': translit(fieldData, 'ukrTranslit'),
    }
    return JsonResponse(data)
def index_primary(primary, path_dest):
    # Build the "AWOL Index: Top-Level Resources" HTML page.
    # Python 2 code (relies on the builtin ``unicode``).
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    transliterate_languages = transliterate.get_available_language_codes()
    doc = dominate.document(title=u'AWOL Index: Top-Level Resources')
    with doc.head:
        # NOTE(review): the cssreset stylesheet is linked twice --
        # presumably a copy/paste slip; confirm before removing.
        link(rel='stylesheet', type='text/css', href=
             'http://yui.yahooapis.com/3.18.1/build/cssreset/cssreset-min.css')
        link(rel='stylesheet', type='text/css', href=
             'http://yui.yahooapis.com/3.18.1/build/cssreset/cssreset-min.css')
        link(
            rel='stylesheet', type='text/css', href=
            'http://fonts.googleapis.com/css?family=Open+Sans:400italic,600italic,700italic,400,600,700&subset=latin,cyrillic-ext,greek-ext,greek,latin-ext,cyrillic'
        )
        link(rel='stylesheet', type='text/css', href='./index-style.css')
    doc += h1('Index of Top-Level Resources')
    _p = p('part of ', cls='subtitle')
    _p += a('The AWOL Index', href='../index.html')
    doc += _p
    _ul = doc.add(ul())
    for pri in primary:
        rtitle = pri['title']
        sort_key = rtitle.lower()
        # Title contains non-ASCII characters: detect its language and
        # fold it down to an ASCII sort key.
        if sort_key != unicode(sort_key.encode('ascii', 'ignore')):
            classification = langid.classify(sort_key)
            # Only trust high-confidence language guesses.
            if classification[1] > 0.9:
                if classification[0] in transliterate_languages:
                    sort_key = transliterate.translit(sort_key,
                                                      classification[0],
                                                      reversed=True)
            sort_key = unicodedata.normalize('NFKD', sort_key)
            sort_key = codecs.encode(sort_key, 'translit/long')
            sort_key = unidecode(sort_key)
            sort_key = sort_key.encode('ascii', 'ignore')
            if len(sort_key) == 0:
                # Nothing survived the ASCII folding; fall back to the title.
                sort_key = rtitle.lower()
        # Strip punctuation and internal whitespace so keys collate cleanly.
        sort_key = RX_PUNCT.sub(u'', sort_key)
        sort_key = u''.join(sort_key.strip().split()).lower()
        logger.debug(u'sortkey for title "{0}": "{1}"'.format(
            rtitle, sort_key))
        pri['sort_key'] = sort_key
    # Emit only entries with a single-word domain, a title and a sort key.
    for pri in sorted([
        pri for pri in primary
        if ' ' not in pri['domain'] and pri['title'] != u''
        and pri['sort_key'] != u''
    ], key=lambda k: k['sort_key']):
        _li = list_entry(_ul, pri)
    html_out(doc, os.path.join(path_dest, 'index-top.html'))
def load_texts(map_container, text_index_table):
    """Attach localized text tables and locale metadata to *map_container*.

    The default text table is always included; an English transliteration
    is added when the default language is supported by ``transliterate``.
    """
    default_table = text_index_table.get_text_table()
    default_locale = default_table.language_code
    localizations = [default_table]
    supported = transliterate.get_available_language_codes()
    if default_table.language_code in supported:
        localizations.append(translate_text_table_to_en(default_table))
    map_container.texts = localizations
    map_container.meta.locales = [t.language_code for t in localizations]
    map_container.meta.default_locale = default_locale
def transliteration(blob: str, source_langs: Union[List[str], str],
                    target_lang: str = "la"):
    """Transliterate *blob* using each source language in turn.

    Languages supported by ``transliterate`` are applied with
    ``translit(..., reversed=True)``; unsupported ones fall back to
    ``transliteration_slow``. Returns ``None`` when *blob* is ``None``.
    """
    if blob is None:
        return None
    text: str = blob
    available_langs = get_available_language_codes()
    # Accept a single language code as well as a list.
    if isinstance(source_langs, str):
        source_langs = [source_langs]
    for lang in source_langs:
        if lang in available_langs:
            text = translit(text, lang, reversed=True)
        else:
            # NOTE(review): this restarts from the ORIGINAL blob, discarding
            # transliterations applied for earlier languages -- confirm
            # whether `text` was intended here instead of `blob`.
            text = transliteration_slow(blob, target_lang)
    return text
def _method(
        lng: str = 'en',
        qnt: int = 1,
        underscore: bool = False) -> str:
    """Return *qnt* random words joined by a space (or '_' when
    *underscore* is set), transliterated into *lng* when supported.

    Returns the literal string ``'wrong lng'`` for unknown codes.
    """
    words = rv.random_words(count=qnt)
    sep = '_' if underscore else ' '
    joined = sep.join(words)
    if lng == 'en':
        return joined
    if lng in get_available_language_codes():
        return translit(joined, lng)
    return 'wrong lng'
def _transliterate(self, input_lang):
    """Transliterate ``self._input_text`` into Latin script.

    Detects the language when *input_lang* is empty and falls back to
    'ru' for unsupported codes; the result goes to ``self._output_text``.
    """
    try:
        if input_lang == '':
            input_lang = detect(self._input_text)
        if input_lang not in get_available_language_codes():
            input_lang = 'ru'
        self._output_text = translit(
            self._input_text, input_lang, reversed=True)
    except OSError as err:
        # Only EEXIST is tolerated; anything else propagates.
        if err.errno != errno.EEXIST:
            raise
def test_15_register_custom_language_pack(self):
    """Test registering of a custom language pack."""

    class ExampleLanguagePack(TranslitLanguagePack):
        """Example language pack."""

        language_code = "example"
        language_name = "Example"
        mapping = data.test_15_register_custom_language_pack_mapping

    registry.register(ExampleLanguagePack)
    assert 'example' in get_available_language_codes()
    result = translit(self.latin_text, 'example')
    self.assertEqual(result, 'Lor5m 9psum 4olor s9t 1m5t')
    return result
class EuropeanTransliterator(TransliteratorBase):
    """Transliterates between two European languages, pivoting through
    English as the intermediate representation."""

    LANGS = set(euro_transliterate.get_available_language_codes() + ['en'])

    def __init__(self, src_lang, dest_lang):
        self.src_lang = src_lang
        self.dest_lang = dest_lang

    def transliterate(self, phrase):
        """src_lang -> dest_lang via an English intermediate."""
        if self.src_lang == 'en':
            en_phrase = phrase
        else:
            en_phrase = euro_transliterate.translit(
                phrase, self.src_lang, reversed=True)
        return euro_transliterate.translit(en_phrase, self.dest_lang)

    def reverse_transliterate(self, phrase):
        """dest_lang -> src_lang via an English intermediate."""
        if self.dest_lang == 'en':
            en_phrase = phrase
        else:
            en_phrase = euro_transliterate.translit(
                phrase, self.dest_lang, reversed=True)
        return euro_transliterate.translit(en_phrase, self.src_lang)
def transliterate(lang, value, reversed=True, stripnonwords=True,
                  replacespaced=True):
    """Slugify *value*: transliterate when supported, strip punctuation,
    and replace whitespace runs with dashes. Returns lowercase text."""
    from transliterate import translit, get_available_language_codes

    if lang in get_available_language_codes() and reversed:
        value = translit(value, lang, reversed=reversed)
    if stripnonwords:
        # Drop everything except word chars, whitespace and dashes.
        value = re.sub(r'[^\s\w\d-]', '', value)
    if replacespaced:
        value = re.sub(r'\s+', '-', value)
    return value.lower()
def test_15_register_custom_language_pack(self):
    """Test registering of a custom language pack."""

    class ExampleLanguagePack(TranslitLanguagePack):
        """Example language pack."""

        language_code = "example"
        language_name = "Example"
        mapping = data.test_15_register_custom_language_pack_mapping

    registry.register(ExampleLanguagePack)
    # Bare ``assert`` is stripped under ``python -O``; use the unittest
    # API so the check always runs and failures are reported properly.
    self.assertIn('example', get_available_language_codes())
    res = translit(self.latin_text, 'example')
    self.assertEqual(res, 'Lor5m 9psum 4olor s9t 1m5t')
    return res
def main():
    """Tokenize the file named on the command line, then translate and
    transliterate every token into each requested language, printing the
    resulting per-language word lists."""
    if len(sys.argv) < 3:
        print(USAGE)
        return 1
    filename = sys.argv[1]
    langcodes = sys.argv[2:]
    tokens = tokenize(filename)
    langlist = []
    for lang in langcodes:
        converted = []
        for token in tokens:
            word = translate(token, lang, 'en').lower()
            if lang in get_available_language_codes():
                word = translit(word, lang, reversed=True)
            converted.append(word)
        langlist.append(converted)
    print(langlist)


lettergroups = []
def get_words(file, lang):
    """Read a hunspell-format (utf-8) dictionary file.

    Returns a set of words with comments/affix flags removed, optionally
    transliterated when the base language code is supported.
    """
    words = set()
    # Skip first line: the hunspell header (word count).
    for raw in file.readlines()[1:]:
        # Decode and drop everything after the affix separator '/'.
        line = raw.decode('utf-8').partition('/')[0]
        # Transliterate when the base code (before any '-') is supported.
        if lang.split('-')[0] in get_available_language_codes():
            line = translit(line, lang, reversed=True)
        # Only keep non-empty lines.
        if line:
            words.add(line)
    return words
def index_primary(primary, path_dest):
    # Build the "AWOL Index: Top-Level Resources" HTML page.
    # Python 2 code (relies on the builtin ``unicode``).
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    transliterate_languages = transliterate.get_available_language_codes()
    doc = dominate.document(title=u'AWOL Index: Top-Level Resources')
    with doc.head:
        # NOTE(review): the cssreset stylesheet is linked twice --
        # presumably a copy/paste slip; confirm before removing.
        link(rel='stylesheet', type='text/css',
             href='http://yui.yahooapis.com/3.18.1/build/cssreset/cssreset-min.css')
        link(rel='stylesheet', type='text/css',
             href='http://yui.yahooapis.com/3.18.1/build/cssreset/cssreset-min.css')
        link(rel='stylesheet', type='text/css',
             href='http://fonts.googleapis.com/css?family=Open+Sans:400italic,600italic,700italic,400,600,700&subset=latin,cyrillic-ext,greek-ext,greek,latin-ext,cyrillic')
        link(rel='stylesheet', type='text/css', href='./index-style.css')
    doc += h1('Index of Top-Level Resources')
    _p = p('part of ', cls='subtitle')
    _p += a('The AWOL Index', href='../index.html')
    doc += _p
    _ul = doc.add(ul())
    for pri in primary:
        rtitle = pri['title']
        sort_key = rtitle.lower()
        # Title contains non-ASCII characters: detect its language and
        # fold it down to an ASCII sort key.
        if sort_key != unicode(sort_key.encode('ascii', 'ignore')):
            classification = langid.classify(sort_key)
            # Only trust high-confidence language guesses.
            if classification[1] > 0.9:
                if classification[0] in transliterate_languages:
                    sort_key = transliterate.translit(sort_key,
                                                      classification[0],
                                                      reversed=True)
            sort_key = unicodedata.normalize('NFKD', sort_key)
            sort_key = codecs.encode(sort_key, 'translit/long')
            sort_key = unidecode(sort_key)
            sort_key = sort_key.encode('ascii', 'ignore')
            if len(sort_key) == 0:
                # Nothing survived the ASCII folding; fall back to the title.
                sort_key = rtitle.lower()
        # Strip punctuation and internal whitespace so keys collate cleanly.
        sort_key = RX_PUNCT.sub(u'', sort_key)
        sort_key = u''.join(sort_key.strip().split()).lower()
        logger.debug(u'sortkey for title "{0}": "{1}"'.format(rtitle, sort_key))
        pri['sort_key'] = sort_key
    # Emit only entries with a single-word domain, a title and a sort key.
    for pri in sorted([pri for pri in primary
                       if ' ' not in pri['domain'] and pri['title'] != u''
                       and pri['sort_key'] != u''],
                      key=lambda k: k['sort_key']):
        _li = list_entry(_ul, pri)
    html_out(doc, os.path.join(path_dest, 'index-top.html'))
def __init__(self, bot):
    """Remember *bot* and cache the supported transliteration codes."""
    self.avail = transliterate.get_available_language_codes()
    self.bot = bot
# Autodiscover available language packs #autodiscover() print '\nOriginal text\n---------------------------------------' text = "Lorem ipsum dolor sit amet" print text print '\nTransliteration to Armenian\n---------------------------------------' print translit(text, 'hy') print '\nTransliteration to Russian\n---------------------------------------' print translit(text, 'ru') print '\nList of available (registered) languages\n---------------------------------------' print get_available_language_codes() print '\nReversed transliteration from Armenian\n---------------------------------------' print translit(u'Լօրեմ իպսում դoլoր սիտ ամետ', 'hy', reversed=True) print '\nReversed transliteration from Russian\n---------------------------------------' print translit(u'Лорем ипсум долор сит амет', 'ru', reversed=True) print '\nTesting the function decorator\n---------------------------------------' from transliterate.decorators import transliterate_function @transliterate_function(language_code='hy') def decorator_test_armenian(text): return text
def __init__(self):
    """Bind the transliteration engine and the supported language codes."""
    self.languages = transliterate.get_available_language_codes()
    self.engine = transliterate.translit
# u"Ia": u"Я", u"Yu": u"Ю", u"Ya": u"Я", } reversed_specific_pre_processor_mapping = { u"ъ": u"", u"ь": u"", u"Ъ": u"", u"Ь": u"" } registry.register(Gost2006RuLangPack, force=True) LANG_PADDING = max([len(lang) for lang in get_available_language_codes()]) @lru_cache def transliterate(text, lang='ru-gost', reversed=True): trans = translit(text, lang, reversed) print(f"translit[{lang:{LANG_PADDING}}]: {text} => {trans}") if trans.isascii(): return trans else: for i in trans: if not i.isascii(): print(i, i.isascii()) raise ValueError( f"Incorrect transliteration table for '{lang}' language")
def sanitize_name(column_name: str):
    """Transliterate *column_name*; prefix with '_' when the result is a
    reserved or unsupported identifier."""
    name = transliteration(column_name, get_available_language_codes())
    return f"_{name}" if reserved_or_unsupported(name) else name
# Autodiscover available language packs #autodiscover() print '\nOriginal text\n---------------------------------------' text = "Lorem ipsum dolor sit amet" print text print '\nTransliteration to Armenian\n---------------------------------------' print translit(text, 'hy') print '\nTransliteration to Russian\n---------------------------------------' print translit(text, 'ru') print '\nList of available (registered) languages\n---------------------------------------' print get_available_language_codes() print '\nReversed transliteration from Armenian\n---------------------------------------' print translit(u'Լօրեմ իպսում դoլoր սիտ ամետ', 'hy', reversed=True) print '\nReversed transliteration from Russian\n---------------------------------------' print translit(u'Лорем ипсум долор сит амет', 'ru', reversed=True) print '\nTesting the function decorator\n---------------------------------------' from transliterate.decorators import transliterate_function @transliterate_function(language_code='hy') def decorator_test_armenian(text): return text print decorator_test_armenian(u"Lorem ipsum dolor sit amet")
    # Trailing entries of ExampleLanguagePack's character mapping; the
    # opening of this dict is outside this excerpt.
    u'ц': u'ts',
    u'ч': u'ch',
    u'ш': u'sh',
    u'щ': u'sch',
    u'ъ': u'',
    u'ы': u'y',
    u'ь': u'',
    u'э': u'e',
    u'ю': u'yu',
    u'я': u'ya',
}

registry.register(ExampleLanguagePack, force=True)
print(get_available_language_codes())
# ['el', 'hy', 'ka', 'ru', 'example']

text = '40 лет Октября'


def transliterate(text):
    # Transliterate *text* with the custom 'example' pack and echo it.
    trans = translit(text, 'example')
    print(trans)
    return trans


if __name__ == "__main__":
    transliterate('Коммунистическая')  # Lor5m 9psum 4olor s9t 1m5t