Esempio n. 1
0
def main(argv):
    """Add symbolized Devanagari and Arabic columns to tab-separated stdin.

    Every non-empty line that does not start with "#" is NFC-normalized and
    split on tabs. The Devanagari field and the Arabic field (with spaces
    removed) are each passed through Symbolize, and the line is re-emitted as
    deva, deva_sym, arab_sym, arab, followed by the remaining fields.

    Args:
        argv: Command-line arguments. argv[1] optionally holds the index of
            the Devanagari column (default 0) and argv[2] the index of the
            Arabic column (default 1). A missing or non-integer value falls
            back to the default.

    Raises:
        SystemExit: Always; status 0 if every line was symbolized, 1 if
            symbolization raised for at least one line.
        AssertionError: If a line has too few fields for the chosen columns.
    """
    # Only a missing argument (IndexError) or a non-numeric one (ValueError)
    # should trigger the default; anything else is a genuine error.
    try:
        deva_index = int(argv[1])
    except (IndexError, ValueError):
        deva_index = 0

    try:
        arab_index = int(argv[2])
    except (IndexError, ValueError):
        arab_index = 1

    success = True
    for line in utf8.stdin:
        line = line.rstrip("\n")
        if not line or line.startswith("#"):
            continue
        line = unicodedata.normalize("NFC", line)
        fields = line.split("\t")
        assert len(fields) > max(deva_index, arab_index), (
            "Too few fields in line: %r" % line)
        deva = fields[deva_index]
        arab = fields[arab_index].replace(" ", "")
        # Drop the consumed columns, highest index first so the lower index
        # stays valid. Using a set removes the column only once when both
        # indices coincide.
        for index in sorted({deva_index, arab_index}, reverse=True):
            fields.pop(index)
        try:
            deva_sym = Symbolize(deva, DEVA_CODEPOINT_TO_SYMBOL)
            arab_sym = Symbolize(arab, ARAB_CODEPOINT_TO_SYMBOL)
            utf8.Print("\t".join([deva, deva_sym, arab_sym, arab] + fields))
        except Exception as e:  # pylint: disable=broad-except
            # Deliberately best-effort: report the line and keep going so
            # that all bad lines are listed in a single run.
            utf8.Print("Error symbolizing line %s: %s" % (line, e),
                       file=utf8.stderr)
            success = False
    sys.exit(0 if success else 1)
Esempio n. 2
0
def main(argv):
    """Check 'hi-ur' transliteration rules against tab-separated stdin.

    The first field of each non-empty, non-comment line is transliterated
    with the rules loaded from argv[1], and the result is additionally passed
    through REMOVE_TASHKIL. A non-empty second field is compared with the
    first result and a non-empty third field with the second result. Each
    line is echoed with both results appended: to stdout when its checks
    passed, to stderr otherwise.

    Args:
        argv: Command-line arguments; argv[1] is the rules file.

    Raises:
        SystemExit: Always; status 0 if all lines passed, 1 otherwise.
    """
    rules = icu_util.LoadTransliterationRules(argv[1], 'hi-ur')
    all_ok = True
    for raw in utf8.stdin:
        text = raw.rstrip('\n')
        if text.startswith('#') or not text:
            continue
        columns = unicodedata.normalize('NFC', text).split('\t')
        column_count = len(columns)
        hindi = columns[0]
        with_diacritics = rules.transliterate(hindi)
        without_diacritics = REMOVE_TASHKIL.transliterate(with_diacritics)

        row_ok = True
        if column_count > 1 and columns[1]:
            row_ok &= StrEqual(columns[1], with_diacritics, context=hindi)
        if column_count > 2 and columns[2]:
            if columns[1]:
                # The two reference columns must agree with each other.
                assert StrEqualIgnoringTashkil(columns[1],
                                               columns[2],
                                               context=hindi)
            row_ok &= StrEqual(columns[2], without_diacritics, context=hindi)
        all_ok &= row_ok

        columns += [with_diacritics, without_diacritics]
        sink = utf8.stdout if row_ok else utf8.stderr
        utf8.Print('\t'.join(columns), file=sink, flush=True)

    if all_ok:
        sys.exit(0)
    sys.exit(1)
def ApplyPronunciationRules(xltor):
  """Transliterate each whitespace-separated token read from stdin.

  Intended for interactive use: every token is printed together with the
  output of xltor.transliterate, separated by a tab.

  Args:
    xltor: Object providing a transliterate(text) method.
  """
  for text in utf8.stdin:
    tokens = text.split()
    for token in tokens:
      utf8.Print('%s\t%s' % (token, xltor.transliterate(token)))
def main(args):
  """Run the pronunciation rules interactively or against a dictionary.

  With one argument (RULES), tokens from stdin are transliterated and
  printed. With three arguments (RULES MAPPING DICTIONARY), the rules are
  tested in batch mode and PASS or FAIL is printed.

  Args:
    args: Command-line arguments, including the program name at index 0.

  Raises:
    SystemExit: Status 0 on PASS, 1 on FAIL, 2 on wrong usage. The
      interactive mode returns normally.
  """
  argc = len(args)
  if argc != 2 and argc != 4:
    utf8.Print('Usage: %s RULES [MAPPING DICTIONARY]' % args[0])
    sys.exit(2)

  if argc == 2:
    ApplyPronunciationRules(
        icu_util.LoadTransliterationRules(args[1], 'foo-bar'))
    return

  # Batch mode: exactly four arguments.
  xltor = icu_util.LoadTransliterationRules(args[1], 'foo-bar')
  mapping = GetSampaToIpaMapping(args[2])
  if not TestPronunciationRules(xltor, mapping, args[3]):
    utf8.Print('FAIL')
    sys.exit(1)
  utf8.Print('PASS')
  sys.exit(0)
Esempio n. 5
0
def main(unused_argv):
    """Count identical lines on stdin and print them by frequency.

    Each line (without its trailing newline) is treated as one token. The
    output has one "token<TAB>count" row per distinct token, ordered by
    descending count and, for equal counts, by ascending token.

    Args:
        unused_argv: Ignored.
    """
    tally = {}
    for raw in utf8.stdin:
        key = raw.rstrip('\n')
        if key in tally:
            tally[key] += 1
        else:
            tally[key] = 1

    def rank(entry):
        # Negated count sorts the most frequent tokens first.
        return (-entry[1], entry[0])

    for key, occurrences in sorted(tally.items(), key=rank):
        utf8.Print('%s\t%d' % (key, occurrences))
Esempio n. 6
0
def main(argv):
    """Split stdin into a sample (stdout) and the remainder (stderr).

    Sample is called with stdin, the requested size and a fresh
    random.Random instance; it yields (in_sample, line) pairs. Lines flagged
    as in the sample are written to stdout, all others to stderr.

    Args:
        argv: Command-line arguments; argv[1] is the sample size.

    Raises:
        SystemExit: Status 2 if the argument count is wrong.
    """
    if len(argv) != 2:
        utf8.Print('Usage: %s SAMPLE_SIZE' % argv[0])
        sys.exit(2)
    sample_size = int(argv[1])
    for selected, text in Sample(utf8.stdin, sample_size, random.Random()):
        sink = utf8.stdout if selected else utf8.stderr
        sink.write(text)
Esempio n. 7
0
def main(unused_argv):
    """Insert a symbolized column after the first field of each stdin line.

    Empty lines and lines starting with "#" are skipped. Every other line is
    NFC-normalized and split on tabs; spaces are removed from the first
    field, which is then printed followed by its Symbolize output and the
    remaining fields.

    Args:
        unused_argv: Ignored.
    """
    for raw in utf8.stdin:
        text = raw.rstrip("\n")
        if not text or text[0] == "#":
            continue
        columns = unicodedata.normalize("NFC", text).split("\t")
        assert columns
        arab = columns[0].replace(" ", "")
        output = [arab, Symbolize(arab)]
        output.extend(columns[1:])
        utf8.Print("\t".join(output))
def main(argv):
    """Test transliteration rules against source/target pairs on stdin.

    Each stdin line must contain at least two tab-separated fields: the
    source text and the expected target. Every mismatch between the expected
    target and the transliterated source is printed, followed by a final
    PASS or FAIL verdict.

    Args:
        argv: Command-line arguments; argv[1] is the rules file.

    Raises:
        SystemExit: Status 0 on PASS, 1 on FAIL, 2 on wrong usage.
        AssertionError: If a line has fewer than two fields.
    """
    if len(argv) != 2:
        utf8.Print('Usage: test_icu_transform RULES')
        sys.exit(2)

    xltor = icu_util.LoadTransliterationRules(argv[1], 'foo-bar')
    all_matched = True
    for raw in utf8.stdin:
        columns = raw.rstrip('\n').split('\t')
        assert len(columns) >= 2
        source = columns[0]
        expected = columns[1]
        actual = xltor.transliterate(source)
        if actual == expected:
            continue
        utf8.Print('%s\t%s != %s' % (source, expected, actual))
        all_matched = False

    if not all_matched:
        utf8.Print('FAIL')
        sys.exit(1)
    utf8.Print('PASS')
    sys.exit(0)
def TestPronunciationRules(xltor, mapping, dictionary):
  """Batch-test transliteration rules against a pronunciation dictionary.

  Each dictionary line holds exactly two tab-separated fields: an
  orthographic form and a whitespace-separated pronunciation. Every
  pronunciation symbol is looked up in mapping and the results are
  concatenated to form the expected output. Words in EXCEPTIONAL_WORDS are
  skipped; for all others, mismatches are printed.

  Args:
    xltor: Object providing a transliterate(text) method.
    mapping: Lookup from each pronunciation symbol to its replacement string.
    dictionary: Path of the dictionary file, opened with utf8.open.

  Returns:
    True if every tested word matched, False otherwise.
  """
  all_ok = True
  with utf8.open(dictionary) as reader:
    for entry in reader:
      columns = entry.rstrip('\n').split('\t')
      assert len(columns) == 2
      orth, pron = columns
      phones = pron.split()
      # The expected form is built before the exception check, so a symbol
      # missing from the mapping is reported even for exceptional words.
      expected = ''.join([mapping[phone] for phone in phones])
      if orth in EXCEPTIONAL_WORDS:
        continue
      actual = xltor.transliterate(orth)
      if actual == expected:
        continue
      utf8.Print('%s\t%s\t%s != %s' %
                 (orth, ' '.join(phones), expected, actual))
      all_ok = False
  return all_ok
Esempio n. 10
0
def PrintSymbols(symtab, prefix="in"):
    """Print one "<prefix>put_symbol" entry per key of symtab, in key order.

    Args:
        symtab: Mapping from integer code point to symbol name.
        prefix: Text placed directly before "put_symbol"; defaults to "in".
    """
    ordered_codepoints = sorted(symtab)
    for codepoint in ordered_codepoints:
        symbol = symtab[codepoint]
        entry = '%sput_symbol { value: 0x%04X key: "%s" }' % (
            prefix, codepoint, symbol)
        utf8.Print(entry)