def read_textproto(proto_path, proto):
    """Parses a text-format protocol buffer file into `proto`.

    When `proto_path` does not exist on disk as given, it is resolved through
    `uf.AsResourcePath` before being opened.

    Args:
      proto_path: Location of the text-format proto file.
      proto: Message instance to fill in. It must expose an `item` attribute
        that supports `len()`, since the number of items is logged.

    Returns:
      The same `proto` object that was passed in, populated from the file.
    """
    logging.info('Parsing %s ...', proto_path)
    # Fall back to the resource lookup only when the literal path is absent.
    resolved_path = (
        proto_path if os.path.exists(proto_path)
        else uf.AsResourcePath(proto_path))
    with open(resolved_path, encoding='utf8') as stream:
        contents = stream.read()
        text_format.Parse(contents, proto)
    logging.info('Read %d items.', len(proto.item))
    return proto
def test_iso_roundtrip(self, tag: str):
    """Checks that the `FROM_<TAG>` / `TO_<TAG>` FST pair is probably identity.

    Args:
      tag: Script tag; upper-cased before being used in the FAR keys.
    """
    script = tag.upper()
    far_file = file.AsResourcePath(u.FAR_DIR / 'iso.far')
    with pynini.Far(far_file, 'r') as far:
        to_iso = far[f'FROM_{script}']
        from_iso = far[f'TO_{script}']
        # The two FSTs are handed over as a cascade rather than pre-composed.
        self.assertFstProbablyIdentity(
            [to_iso, from_iso],
            token_type='byte',
            samples=test_util.NUM_TEST_SAMPLES)
def _fixed_rule_fst(script: str) -> pynini.Fst:
    """Builds the FST that maps fixed rule romanization to ISO 15919.

    Args:
      script: Name of the directory under `u.SCRIPT_DIR` that holds the
        `fixed.tsv` rule file.

    Returns:
      The result of `rule.fst_from_cascading_rule_file` for that rule file,
      over a sigma derived from the rule file characters plus printable ASCII.
    """
    rules_path = u.SCRIPT_DIR / script / 'fixed.tsv'
    rules_resource = uf.AsResourcePath(rules_path)
    rule_chars = uc.derive_chars(both_sides=[rules_path], input_side=[])
    # ASCII printable characters are pass through, except Pynini's symbol
    # generation characters ('[', ']'). The brackets are dropped from the
    # ASCII set only: any bracket coming from the rule file itself is kept.
    passthrough = set(string.printable) - set('[]')
    sigma = uc.derive_sigma(rule_chars | passthrough)
    return rule.fst_from_cascading_rule_file(rules_resource, sigma)
def test_romanization_roundtrip(self):
    """Checks that `FROM_ARAB` composed with `TO_ARAB` is probably functional."""
    far_file = uf.AsResourcePath(u.FAR_DIR / 'reversible_roman.far')
    with pynini.Far(far_file, 'r') as far:
        to_latin = far['FROM_ARAB']
        from_latin = far['TO_ARAB']
        composed = to_latin @ from_latin
        self.assertFstProbablyFunctional(
            composed,
            token_type='byte',
            samples=ut.NUM_TEST_SAMPLES)
def MaybeLoadScriptConfig(
    file_path: os.PathLike) -> script_config_pb2.ScriptConfig:
    """Returns the script configuration parsed from `file_path`, if it exists.

    Args:
      file_path: Location of a text-format `ScriptConfig` file.

    Returns:
      A `ScriptConfig` populated from the file, or a default-constructed one
      when `uf.IsFileExist` reports that the file is missing.
    """
    config = script_config_pb2.ScriptConfig()
    if uf.IsFileExist(file_path):
        resource_path = uf.AsResourcePath(file_path)
        with open(resource_path, encoding="utf8") as stream:
            text_format.Parse(stream.read(), config)
    return config
def setUp(self):
    """Loads the language protos and composes the round-trip FST."""
    super().setUp()

    letters_path = u.LANG_DIR / 'letter_languages.textproto'
    self._letters_proto = letter_languages.read_textproto(letters_path)

    roman_path = u.LANG_DIR / 'reversible_roman.textproto'
    self._roman_proto = unicode_strings_util.read_textproto(roman_path)

    far_file = uf.AsResourcePath(u.FAR_DIR / 'reversible_roman.far')
    with pynini.Far(far_file, 'r') as far:
        to_roman = far['FROM_ARAB']
        from_roman = far['TO_ARAB']
        # Composition of both directions, stored for use by the test methods.
        self._round_trip = to_roman @ from_roman
def _read_string_file_chars_to_set(files: Iterable[os.PathLike],
                                   relevant_fields: int) -> Set[str]:
    """Collects the characters found in the leading fields of string files.

    Lines starting with "#" are skipped. Every other line is stripped of
    surrounding whitespace and split on tabs; only the first `relevant_fields`
    fields contribute characters.

    Arguments:
      files: An Iterable of filepaths, each resolved via `uf.AsResourcePath`.
      relevant_fields: The number of tab-delimited fields from the beginning
        in a StringFile to process. Must be a positive integer.

    Returns:
      Set[str] -- The set of all characters, under the selection in the files.
    """
    collected = set()
    for file_name in files:
        resource = pathlib.Path(uf.AsResourcePath(pathlib.Path(file_name)))
        with resource.open("rt", encoding="utf8") as stream:
            for row in stream:
                if not row.startswith("#"):
                    selected = row.strip().split("\t")[:relevant_fields]
                    # Each field is a string, so update() adds its characters.
                    collected.update(*selected)
    return collected
def rules_from_string_file(file: os.PathLike) -> Iterator[Rule]:
    """Yields string rules from a text resource with unweighted string maps.

    Args:
      file: Location of the resource; resolved with `uf.AsResourcePath` and
        then handed to `rules_from_string_path`.

    Returns:
      Whatever iterator `rules_from_string_path` produces for the resolved path.
    """
    resource_path = uf.AsResourcePath(file)
    return rules_from_string_path(resource_path)
def _LoadFar(self) -> pynini.Far:
    """Opens the FAR stored at `self._path_to_far`, resolved as a resource.

    Returns:
      A `pynini.Far` opened with the default mode; the caller owns it.
    """
    resource_path = uf.AsResourcePath(self._path_to_far)
    return pynini.Far(resource_path)