def main(argv):
  """Symbolizes parallel Devanagari and Arabic-script fields of TSV input.

  The optional arguments argv[1] and argv[2] give the 0-based column indices
  of the Devanagari and Arabic-script fields (defaults: 0 and 1).
  """
  try:
    deva_index = int(argv[1])
  except:  # pylint: disable=bare-except
    deva_index = 0
  try:
    arab_index = int(argv[2])
  except:  # pylint: disable=bare-except
    arab_index = 1
  success = True
  for line in utf8.stdin:
    line = line.rstrip("\n")
    if not line or line.startswith("#"):
      continue
    line = unicodedata.normalize("NFC", line)
    fields = line.split("\t")
    assert len(fields) > max(deva_index, arab_index)
    deva = fields[deva_index]
    arab = fields[arab_index].replace(" ", "")
    fields.pop(min(deva_index, arab_index))
    fields.pop(max(deva_index, arab_index) - 1)
    try:
      deva_sym = Symbolize(deva, DEVA_CODEPOINT_TO_SYMBOL)
      arab_sym = Symbolize(arab, ARAB_CODEPOINT_TO_SYMBOL)
      utf8.Print("\t".join([deva, deva_sym, arab_sym, arab] + fields))
    except Exception as e:  # pylint: disable=broad-except
      utf8.Print("Error symbolizing line %s: %s" % (line, e),
                 file=utf8.stderr)
      success = False
  sys.exit(0 if success else 1)
  return
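# Illustrative sketch only, not from the original source: a Symbolize() helper
# compatible with the calls above, assuming it maps each code point of the
# string to its symbol name and joins the names with spaces. A KeyError on an
# unmapped code point is the kind of failure the broad except above reports.
def Symbolize(text, codepoint_to_symbol):
  return ' '.join(codepoint_to_symbol[ord(c)] for c in text)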
def main(argv):
  """Transliterates the Hindi first field of each TSV line to Urdu.

  Uses ICU hi-ur rules loaded from argv[1]. Optional second and third fields
  are checked as references for the output with and without diacritics.
  """
  hi_ur = icu_util.LoadTransliterationRules(argv[1], 'hi-ur')
  success = True
  for line in utf8.stdin:
    line = line.rstrip('\n')
    if not line or line.startswith('#'):
      continue
    line = unicodedata.normalize('NFC', line)
    fields = line.split('\t')
    hi = fields[0]
    ur_with_diacritics = hi_ur.transliterate(hi)
    ur_without_diacritics = REMOVE_TASHKIL.transliterate(ur_with_diacritics)
    line_success = True
    if len(fields) > 1 and fields[1]:
      line_success &= StrEqual(fields[1], ur_with_diacritics, context=hi)
    if len(fields) > 2 and fields[2]:
      if fields[1]:
        assert StrEqualIgnoringTashkil(fields[1], fields[2], context=hi)
      line_success &= StrEqual(fields[2], ur_without_diacritics, context=hi)
    success &= line_success
    fields.extend([ur_with_diacritics, ur_without_diacritics])
    utf8.Print('\t'.join(fields),
               file=utf8.stdout if line_success else utf8.stderr,
               flush=True)
  sys.exit(0 if success else 1)
  return
def ApplyPronunciationRules(xltor):
  # For interactive use.
  for line in utf8.stdin:
    for orth in line.split():
      predicted = xltor.transliterate(orth)
      utf8.Print('%s\t%s' % (orth, predicted))
  return
def main(args):
  if len(args) == 2:
    xltor = icu_util.LoadTransliterationRules(args[1], 'foo-bar')
    ApplyPronunciationRules(xltor)
  elif len(args) == 4:
    xltor = icu_util.LoadTransliterationRules(args[1], 'foo-bar')
    mapping = GetSampaToIpaMapping(args[2])
    if TestPronunciationRules(xltor, mapping, args[3]):
      utf8.Print('PASS')
      sys.exit(0)
    else:
      utf8.Print('FAIL')
      sys.exit(1)
  else:
    utf8.Print('Usage: %s RULES [MAPPING DICTIONARY]' % args[0])
    sys.exit(2)
  return
def main(unused_argv):
  """Counts identical input lines and prints them by descending count."""
  token_count = {}
  for line in utf8.stdin:
    token = line.rstrip('\n')
    token_count[token] = token_count.get(token, 0) + 1
  # Sort by descending count (ties broken by token) via negated counts.
  items = [(-count, token) for token, count in token_count.items()]
  for negative_count, token in sorted(items):
    utf8.Print('%s\t%d' % (token, -negative_count))
  return
def main(argv):
  """Splits stdin into a random sample (stdout) and the rest (stderr)."""
  if len(argv) != 2:
    utf8.Print('Usage: %s SAMPLE_SIZE' % argv[0])
    sys.exit(2)
  n = int(argv[1])
  for in_sample, line in Sample(utf8.stdin, n, random.Random()):
    if in_sample:
      utf8.stdout.write(line)
    else:
      utf8.stderr.write(line)
  return
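# Illustrative sketch only, not from the original source: one way to define a
# Sample() helper compatible with the call above, yielding an (in_sample,
# line) pair for every input line. This version buffers the whole input and
# picks the sample with rand.sample(); the original implementation may use a
# streaming (reservoir) scheme instead.
def Sample(lines, size, rand):
  lines = list(lines)
  chosen = set(rand.sample(range(len(lines)), min(size, len(lines))))
  for i, line in enumerate(lines):
    yield i in chosen, line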
def main(unused_argv):
  for line in utf8.stdin:
    line = line.rstrip("\n")
    if not line or line.startswith("#"):
      continue
    line = unicodedata.normalize("NFC", line)
    fields = line.split("\t")
    assert fields
    arab = fields[0]
    arab = arab.replace(" ", "")
    arab_sym = Symbolize(arab)
    utf8.Print("\t".join([arab, arab_sym] + fields[1:]))
  return
def main(argv):
  if len(argv) != 2:
    utf8.Print('Usage: test_icu_transform RULES')
    sys.exit(2)
  xltor = icu_util.LoadTransliterationRules(argv[1], 'foo-bar')
  success = True
  for line in utf8.stdin:
    fields = line.rstrip('\n').split('\t')
    assert len(fields) >= 2
    source, target = fields[:2]
    predicted = xltor.transliterate(source)
    if predicted != target:
      utf8.Print('%s\t%s != %s' % (source, target, predicted))
      success = False
  if success:
    utf8.Print('PASS')
    sys.exit(0)
  else:
    utf8.Print('FAIL')
    sys.exit(1)
  return
def TestPronunciationRules(xltor, mapping, dictionary):
  # Batch testing against a dictionary.
  success = True
  with utf8.open(dictionary) as reader:
    for line in reader:
      line = line.rstrip('\n')
      fields = line.split('\t')
      assert len(fields) == 2
      orth, pron = fields
      sampa = pron.split()
      ipa = ''.join(mapping[p] for p in sampa)
      if orth in EXCEPTIONAL_WORDS:
        continue
      predicted = xltor.transliterate(orth)
      if predicted != ipa:
        utf8.Print('%s\t%s\t%s != %s' %
                   (orth, ' '.join(sampa), ipa, predicted))
        success = False
  return success
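# Illustrative sketch only, not from the original source: a minimal
# GetSampaToIpaMapping() compatible with how `mapping` is used above, assuming
# a two-column tab-separated file of SAMPA symbol and IPA string.
def GetSampaToIpaMapping(path):
  mapping = {}
  with utf8.open(path) as reader:
    for line in reader:
      line = line.rstrip('\n')
      if not line or line.startswith('#'):
        continue
      fields = line.split('\t')
      assert len(fields) == 2
      mapping[fields[0]] = fields[1]
  return mapping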
def PrintSymbols(symtab, prefix="in"):
  for cp in sorted(symtab):
    utf8.Print('%sput_symbol { value: 0x%04X key: "%s" }' %
               (prefix, cp, symtab[cp]))
  return
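# Example of the output format, using a hypothetical two-entry symbol table
# that maps Unicode code points to symbol names:
#   PrintSymbols({0x0915: 'ka', 0x0916: 'kha'}, prefix='in')
# prints:
#   input_symbol { value: 0x0915 key: "ka" }
#   input_symbol { value: 0x0916 key: "kha" }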