Beispiel #1
0
def lang_fsts(lang: str, sigma: pynini.Fst) -> List[pynini.Fst]:
    """Builds the visual normalization rewrites for one abjad / alphabet language.

    Four rule files are read from `u.LANG_DIR / lang`, and one rewrite FST is
    built from each group of rules.

    Args:
      lang: Name of the language subdirectory under `u.LANG_DIR`.
      sigma: FST handed to every rewrite as its sigma argument (presumably
        the alphabet the rewrites operate over).

    Returns:
      A list of four FSTs, in this order: the context-free rewrite, the
      non-final rewrite, the final rewrite and the isolated rewrite.
    """
    lang_dir = u.LANG_DIR / lang

    def string_file_or_epsilon(basename: str) -> pynini.Fst:
        # Falls back to `uf.EPSILON` when the rule file has no entries.
        return uf.StringFile(lang_dir / basename, return_if_empty=uf.EPSILON)

    # Rules with no context restriction.
    anywhere = rule.fst_from_rule_file(lang_dir / 'visual_norm.tsv', sigma)

    # Rules restricted to positions with `sigma` as the right context.
    nonfinal = rewrite.Rewrite(
        string_file_or_epsilon('visual_norm_nonfinal.tsv'),
        sigma,
        right=sigma)

    # Rules restricted to `sigma` on the left and '[EOS]' on the right.
    final_isolated_rules = string_file_or_epsilon(
        'visual_norm_final_isolated.tsv')
    final = rewrite.Rewrite(
        final_isolated_rules, sigma, left=sigma, right='[EOS]')

    # Between '[BOS]' and '[EOS]', the final/isolated rules apply together
    # with the isolated-only rules.
    isolated_rules = string_file_or_epsilon('visual_norm_isolated.tsv')
    isolated = rewrite.Rewrite(
        pynini.union(final_isolated_rules, isolated_rules),
        sigma,
        left='[BOS]',
        right='[EOS]')

    return [anywhere, nonfinal, final, isolated]
Beispiel #2
0
def core_visual_norm_fsts(rewrite_file: os.PathLike,
                          preserve_file: os.PathLike,
                          consonant_file: os.PathLike,
                          sigma: pynini.Fst) -> List[pynini.Fst]:
  """Builds the chain of FSTs that make up core visual normalization.

  The chain performs the rewrites listed in `rewrite_file` and removes ZWJ,
  ZWNJ and ZWS, except for the sequences listed in `preserve_file` when they
  occur between consonants (the consonants come from the input side of
  `consonant_file`).

  Args:
    rewrite_file: Path relative to the runfiles directory of a StringFile of
      visual rewrites.
    preserve_file: Path relative to the runfiles directory of a StringFile of
      ZWJ sequences to preserve.
    consonant_file: Path relative to the runfiles directory of a StringFile
      containing a native--latin consonant mapping.
    sigma: An Fst with which to consider the complete alphabet for cdrewrites.

  Returns:
    A list of FSTs: the visual rewrites, the preserve-marking rewrite, the
    joiner-deleting rewrite, the preserve-reinstating rewrite, and finally
    `sigma.star`.
  """
  visual_rewrites = rule.fst_from_rule_file(rewrite_file, sigma)
  preserved = uf.StringFile(preserve_file)
  consonants = pynini.project(uf.StringFile(consonant_file), 'input')

  # The ZWJ-preservation steps go through generated symbols (the ones
  # delimited by square brackets, e.g. `[ZWJ,VIRAMA]`). They are an
  # implementation detail, yet the intermediate rewrites must still treat
  # them as part of sigma, so sigma is widened with them here.
  generated_sigma = u.BuildSigmaFstFromSymbolTable(pynini.generated_symbols())
  widened_sigma = generated_sigma.union(sigma)

  # Step 1: apply the preserve mapping, only between two consonants.
  mark_preserved = ur.Rewrite(preserved, widened_sigma, consonants, consonants)

  # Step 2: delete every joiner that is still present.
  joiners = pynini.union(uc.ZWNJ, uc.ZWJ, uc.ZWS)
  drop_joiners = ur.Rewrite(pynutil.delete(joiners), widened_sigma)

  # Step 3: apply the preserve mapping in the inverse direction.
  restore_preserved = ur.Rewrite(pynini.invert(preserved), widened_sigma)

  # `sigma.star` goes last so that right-composing with it keeps the
  # generated symbols from leaking through into the visual_norm fst.
  return [
      visual_rewrites,
      mark_preserved,
      drop_joiners,
      restore_preserved,
      sigma.star,
  ]
Beispiel #3
0
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FSTs for visual normalization of Brahmic scripts.

  For each token type ('byte' and 'utf8') one FST is built per script in
  `u.SCRIPTS` and one per language in `u.LANG_SCRIPT_MAP`. Each FST is
  exported under its upper-cased name through the exporter for that token
  type.

  Args:
    exporter_map: Mapping from token type to the exporter that receives the
      FSTs built for that token type.
  """
  for token_type in ('byte', 'utf8'):
    fsts_by_name = {}
    with pynini.default_token_type(token_type):
      sigma_by_script = {}

      # Script-level FSTs: NFC, then mark deduplication, then the core
      # visual normalization chain.
      for script in u.SCRIPTS:
        script_dir = u.SCRIPT_DIR / script
        script_sigma = u.OpenSigma(script, token_type)
        sigma_by_script[script] = script_sigma
        dedup_fst = cu.dedup_marks_fst(script, script_sigma)
        nfc_fst = open_nfc(script, token_type)
        core_fsts = core_visual_norm_fsts(
            script_dir / 'visual_rewrite.tsv',
            script_dir / 'preserve.tsv',
            script_dir / 'consonant.tsv',
            script_sigma)
        fsts_by_name[script] = ur.ComposeFsts([nfc_fst, dedup_fst] + core_fsts)

      # Language-level FSTs: the script FST followed by the language's own
      # rewrites that apply before and after a consonant.
      for script, langs in u.LANG_SCRIPT_MAP.items():
        script_dir = u.SCRIPT_DIR / script
        for lang in langs:
          lang_sigma = sigma_by_script[script]
          consonants = pynini.project(
              uf.StringFile(script_dir / 'consonant.tsv'), 'input')

          before_rules = uf.StringFile(
              script_dir / lang / 'before_consonant.tsv')
          before_rewrite = ur.Rewrite(
              before_rules, lang_sigma, right=consonants)

          after_rules = uf.StringFile(
              script_dir / lang / 'after_consonant.tsv')
          after_rewrite = ur.Rewrite(after_rules, lang_sigma, left=consonants)

          fsts_by_name[lang] = ur.ComposeFsts(
              [fsts_by_name[script], before_rewrite, after_rewrite])

      token_exporter = exporter_map[token_type]
      for fst_name, built_fst in fsts_by_name.items():
        token_exporter[fst_name.upper()] = built_fst
def generator_main(exporter: grm.Exporter):
    """Exports language-agnostic reversible romanization FSTs for abjad/alphabets.

    Two FSTs are written to `exporter`: 'FROM_ARAB' (NFC, then visual
    normalization, then romanization) and 'TO_ARAB' (the closure of the
    inverted romanization mapping).

    Args:
      exporter: Exporter that receives the two FSTs.
    """
    lang_dir = u.LANG_DIR

    # This NFC transducer differs from the standalone FST transducer: it also
    # allows letters that are not abjad / alphabet.
    to_nfc = rule.fst_from_rule_file(lang_dir / 'nfc.tsv', byte.BYTE)

    # Visual normalization shared by all languages.
    to_visual_norm = rule.fst_from_rule_file(
        lang_dir / 'common' / 'visual_norm.tsv', byte.BYTE)

    # Towards Latin, NFC and visual normalization run before the romanization
    # rules. The opposite direction needs no visual normalization.
    roman_mapping_file = lang_dir / 'reversible_roman.tsv'
    to_roman = rule.fst_from_rule_file(roman_mapping_file, byte.BYTE)
    from_arab = to_nfc @ to_visual_norm @ to_roman
    exporter['FROM_ARAB'] = pynini.optimize(from_arab)

    # Latin to native is simpler: invert the mapping and take its closure.
    roman_pairs = f.StringFile(roman_mapping_file)
    to_arab = pynini.invert(roman_pairs).star
    exporter['TO_ARAB'] = to_arab.optimize()
Beispiel #5
0
def _input_string_file(filename: os.PathLike,
                       return_if_empty: pynini.Fst = uf.EMPTY) -> pynini.Fst:
    """Loads a StringFile and keeps only its input side.

    Args:
      filename: Path of the StringFile to load.
      return_if_empty: Forwarded to `uf.StringFile` as its second argument;
        defaults to `uf.EMPTY`.

    Returns:
      The input projection of the loaded FST, with epsilons removed.
    """
    string_map = uf.StringFile(filename, return_if_empty)
    input_side = pynini.project(string_map, 'input')
    return input_side.rmepsilon()
Beispiel #6
0
def brahmic_to_iso(consonant_file: os.PathLike,
                   inherent_vowel_file: os.PathLike,
                   vowel_sign_file: os.PathLike, vowel_file: os.PathLike,
                   vowel_length_sign_file: os.PathLike, coda_file: os.PathLike,
                   dead_consonant_file: os.PathLike,
                   standalone_file: os.PathLike,
                   subjoined_consonant_file: os.PathLike,
                   virama_file: os.PathLike) -> pynini.Fst:
    """Creates an FST that transduces a Brahmic script to ISO 15919.

    Args:
      consonant_file: Path of a StringFile containing a native--latin
        consonant mapping.
      inherent_vowel_file: Path of a StringFile containing the inherent
        vowel.
      vowel_sign_file: Path of a StringFile containing a native--latin vowel
        matra mapping.
      vowel_file: Path of a StringFile containing a native--latin independent
        vowel mapping.
      vowel_length_sign_file: Path of a StringFile containing a native--latin
        vowel length sign mapping.
      coda_file: Path of a StringFile containing a native--latin coda
        mapping.
      dead_consonant_file: Path of a StringFile containing a native--latin
        dead consonant mapping.
      standalone_file: Path of a StringFile containing a native--latin
        standalone string mapping.
      subjoined_consonant_file: Path of a StringFile containing a
        native--latin subjoined consonant mapping.
      virama_file: Path of a StringFile containing the virama for the script.

    Returns:
      Optimized FST accepting any sequence of the units assembled below
      (the closure of their union).
    """
    # Mappings read from the per-script files, then the shared symbol table.
    base_consonant_map = uf.StringFile(consonant_file)
    inherent_vowel_map = uf.StringFile(inherent_vowel_file)
    vowel_sign_map = uf.StringFile(vowel_sign_file)
    vowel_map = uf.StringFile(vowel_file)
    length_sign_map = uf.StringFile(vowel_length_sign_file)
    coda_map = uf.StringFile(coda_file)
    dead_consonant_map = uf.StringFile(dead_consonant_file)
    standalone_map = uf.StringFile(standalone_file)
    subjoined_map = uf.StringFile(subjoined_consonant_file)
    virama_map = uf.StringFile(virama_file)
    shared_symbols = uf.StringFile(u.SCRIPT_DIR / 'common' / 'symbol.tsv')

    # Elementary insert / delete / replace operations.
    add_inherent = pynutil.insert(inherent_vowel_map)
    add_dash = pynutil.insert('-')
    add_dot = pynutil.insert('.')
    drop_virama = pynutil.delete(virama_map)
    show_virama = pynini.cross(virama_map, '˘')

    # Empty string carrying weight 1; it is appended to the alternatives
    # that should rank as lower priority.
    deprioritize = pynini.accep('', weight=1)

    # A consonant is a core consonant plus an optional subjoined consonant.
    consonant = base_consonant_map + uf.QuesSafe(subjoined_map)

    # Consonant-initial units, reused with and without trailing vowels.
    cons_with_sign = consonant + vowel_sign_map
    cons_with_inherent = consonant + add_inherent
    cons_without_vowel = consonant + drop_virama

    # Independent vowels that follow another unit get a '.' inserted first.
    dotted_vowel = add_dot + vowel_map
    trailing_vowels = dotted_vowel.plus

    alternatives = [
        cons_with_sign,
        cons_with_inherent + deprioritize,
        cons_without_vowel + deprioritize,
        vowel_map + deprioritize,
        coda_map,
        dead_consonant_map,
        length_sign_map,
        standalone_map,

        # Rare cases:
        # Dangling vowel signs.
        add_dash + vowel_sign_map + dotted_vowel.star + deprioritize,
        # Explicit virama elsewhere.
        show_virama + deprioritize,
        # Joiners.
        shared_symbols,

        # Independent vowel not as the first letter:
        vowel_map + trailing_vowels + deprioritize,
        cons_with_sign + trailing_vowels,
        cons_without_vowel + trailing_vowels,
        cons_with_inherent + trailing_vowels,
    ]

    to_iso_unit = pynini.union(*alternatives)
    return pynini.optimize(to_iso_unit.star)