def testInflector(self): forms = rewrite.rewrites("grádus[case=gen][num=pl]", self.paradigm_a.inflector) self.assertSameElements(["grádusov"], forms) forms = rewrite.rewrites("stól[case=gen][num=pl]", self.paradigm_b.inflector) self.assertSameElements(["stolóv"], forms)
def testInflector(self): forms = rewrite.rewrites("aqua[case=gen][num=pl]", self.paradigm.inflector) self.assertSameElements(["aquārum"], forms) forms = rewrite.rewrites("puella[case=dat][num=pl]", self.paradigm.inflector) self.assertSameElements(["puellīs"], forms)
def testTagger(self):
  tagger = self.paradigm.tagger @ self.paradigm.feature_label_rewriter
  forms = rewrite.rewrites("aquārum", tagger)
  self.assertSameElements(["aquārum[case=gen][num=pl]"], forms)
  forms = rewrite.rewrites("puellīs", tagger)
  self.assertSameElements(
      ["puellīs[case=dat][num=pl]", "puellīs[case=abl][num=pl]"], forms)

def testAnalyzer(self):
  analyzer = self.paradigm_a.analyzer @ self.paradigm_a.feature_label_rewriter
  forms = rewrite.rewrites("grádusov", analyzer)
  self.assertSameElements(["grádus+ov[case=gen][num=pl]"], forms)
  analyzer = self.paradigm_b.analyzer @ self.paradigm_b.feature_label_rewriter
  forms = rewrite.rewrites("stolóv", analyzer)
  self.assertSameElements(["stol+óv[case=gen][num=pl]"], forms)

def testLemmatizer(self): lemmatizer = self.paradigm.lemmatizer @ self.paradigm.feature_label_rewriter forms = rewrite.rewrites("pacem", lemmatizer) self.assertSameElements(["pax[case=acc][num=sg]"], forms) forms = rewrite.rewrites("noctibus", lemmatizer) self.assertSameElements( ["nox[case=dat][num=pl]", "nox[case=abl][num=pl]"], forms)
def testInflector(self): forms = rewrite.rewrites("pax[case=acc][num=sg]", self.paradigm.inflector) self.assertSameElements(["pacem"], forms) forms = rewrite.rewrites("nox[case=dat][num=pl]", self.paradigm.inflector) self.assertSameElements(["noctibus"], forms)
def testLemmatizer(self): lemmatizer = self.paradigm.lemmatizer @ self.paradigm.feature_label_rewriter forms = rewrite.rewrites("aquārum", lemmatizer) self.assertSameElements(["aqua[case=gen][num=pl]"], forms) forms = rewrite.rewrites("puellīs", lemmatizer) self.assertSameElements( ["puella[case=dat][num=pl]", "puella[case=abl][num=pl]"], forms)
def testGenerator(self): generator = (self.paradigm.stems_to_forms @ self.paradigm.feature_label_rewriter) forms = rewrite.rewrites("noct", generator) self.assertSameElements( [ "nox+[case=nom][num=sg]", "noct+is[case=gen][num=sg]", "noct+ī[case=dat][num=sg]", "noct+em[case=acc][num=sg]", "noct+e[case=abl][num=sg]", "noct+ēs[case=nom][num=pl]", "noct+um[case=gen][num=pl]", # TODO(rws): Actually "noctium". "noct+ibus[case=dat][num=pl]", "noct+ēs[case=acc][num=pl]", # Also -īs for /i/ stems. "noct+ibus[case=abl][num=pl]" ], forms) forms = rewrite.rewrites("rēg", generator) self.assertSameElements([ "rēx+[case=nom][num=sg]", "rēg+is[case=gen][num=sg]", "rēg+ī[case=dat][num=sg]", "rēg+em[case=acc][num=sg]", "rēg+e[case=abl][num=sg]", "rēg+ēs[case=nom][num=pl]", "rēg+um[case=gen][num=pl]", "rēg+ibus[case=dat][num=pl]", "rēg+ēs[case=acc][num=pl]", "rēg+ibus[case=abl][num=pl]" ], forms) forms = rewrite.rewrites("ōs", generator) self.assertSameElements([ "ōs+[case=nom][num=sg]", "ōr+is[case=gen][num=sg]", "ōr+ī[case=dat][num=sg]", "ōr+em[case=acc][num=sg]", "ōr+e[case=abl][num=sg]", "ōr+ēs[case=nom][num=pl]", "ōr+um[case=gen][num=pl]", "ōr+ibus[case=dat][num=pl]", "ōr+ēs[case=acc][num=pl]", "ōr+ibus[case=abl][num=pl]" ], forms)
def testTagger(self):
  tagger = self.paradigm_a.tagger @ self.paradigm_a.feature_label_rewriter
  forms = rewrite.rewrites("grádusov", tagger)
  self.assertSameElements(["grádusov[case=gen][num=pl]"], forms)
  tagger = self.paradigm_b.tagger @ self.paradigm_b.feature_label_rewriter
  forms = rewrite.rewrites("stolóv", tagger)
  self.assertSameElements(["stolóv[case=gen][num=pl]"], forms)

def testLemmatizer(self):
  lemmatizer = (self.paradigm_a.lemmatizer @
                self.paradigm_a.feature_label_rewriter)
  forms = rewrite.rewrites("grádusov", lemmatizer)
  self.assertSameElements(["grádus[case=gen][num=pl]"], forms)
  lemmatizer = (self.paradigm_b.lemmatizer @
                self.paradigm_b.feature_label_rewriter)
  forms = rewrite.rewrites("stolóv", lemmatizer)
  self.assertSameElements(["stól[case=gen][num=pl]"], forms)

def testSetStemToForms(self):
  stems_and_forms = [
      ("caw", [
          "caw+al[aspect=dubitative]", "caw+inay[aspect=gerundial]",
          "cawaa+ʔaa[aspect=durative]", "caw[aspect=root]"
      ]),
      ("cuum", [
          "cuum+al[aspect=dubitative]", "cum+inay[aspect=gerundial]",
          "cumuu+ʔaa[aspect=durative]", "cuum[aspect=root]"
      ]),
      ("diiyl", [
          "diiyl+al[aspect=dubitative]", "diyl+inay[aspect=gerundial]",
          "diyiil+ʔaa[aspect=durative]", "diiyl[aspect=root]"
      ]),
      ("hiwiit", [
          "hiwiit+al[aspect=dubitative]", "hiwt+inay[aspect=gerundial]",
          "hiwiit+ʔaa[aspect=durative]", "hiwiit[aspect=root]"
      ]),
      ("hoyoo", [
          "hoyoo+al[aspect=dubitative]", "hoy+inay[aspect=gerundial]",
          "hoyoo+ʔaa[aspect=durative]", "hoyoo[aspect=root]"
      ]),
      ("ʔilk", [
          "ʔilk+al[aspect=dubitative]", "ʔilk+inay[aspect=gerundial]",
          "ʔiliik+ʔaa[aspect=durative]", "ʔilk[aspect=root]"
      ])
  ]
  generate = (self.paradigm.stems_to_forms @
              self.paradigm.feature_label_rewriter)
  for (stem, expected) in stems_and_forms:
    predicted = rewrite.rewrites(stem, generate)
    self.assertSameElements(expected, predicted)

def normalize(
    self,
    text: str,
    n_tagged: int,
    punct_pre_process: bool = True,
    punct_post_process: bool = True,
    verbose: bool = False,
) -> str:
    """
    Main function. Normalizes tokens from written to spoken form,
    e.g. 12 kg -> twelve kilograms.

    Args:
        text: string that may include semiotic classes
        n_tagged: number of tagged options to consider; -1 returns all possible tagged options
        punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
        punct_post_process: whether to normalize punctuation
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given semiotic class)
    """
    if punct_pre_process:
        text = pre_process(text)
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)
    if n_tagged == -1:
        tagged_texts = rewrite.rewrites(text, self.tagger.fst)
    else:
        tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
    # Non-deterministic English normalization uses the tagger composed with
    # the verbalizer, with no permutation in between.
    if self.lang == "en":
        normalized_texts = tagged_texts
    else:
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts)
    if len(normalized_texts) == 0:
        raise ValueError()
    if punct_post_process:
        normalized_texts = [post_process_punctuation(t) for t in normalized_texts]
        # Do post-processing based on the Moses detokenizer.
        if self.processor:
            normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
    normalized_texts = set(normalized_texts)
    return normalized_texts

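# Hypothetical usage sketch for the normalize() method above. It assumes the
# method lives on a NormalizerWithAudio-style class from nemo_text_processing;
# the constructor arguments and the printed options are illustrative, not
# verified against any particular release.
from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

normalizer = NormalizerWithAudio(input_case="cased", lang="en")
options = normalizer.normalize("12 kg", n_tagged=10, punct_post_process=True)
for option in options:
    print(option)  # e.g. "twelve kilograms", "twelve kg", ...
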
def testGenerator(self): generator = (self.paradigm.stems_to_forms @ self.paradigm.feature_label_rewriter) forms = rewrite.rewrites("noct__1000__", generator) self.assertSameElements( [ "nox__1000__+[case=nom][num=sg]", "noct__1000__+is[case=gen][num=sg]", "noct__1000__+ī[case=dat][num=sg]", "noct__1000__+em[case=acc][num=sg]", "noct__1000__+e[case=abl][num=sg]", "noct__1000__+ēs[case=nom][num=pl]", "noct__1000__+um[case=gen][num=pl]", "noct__1000__+ibus[case=dat][num=pl]", "noct__1000__+ēs[case=acc][num=pl]", # Also -īs for /i/ stems. "noct__1000__+ibus[case=abl][num=pl]" ], forms) forms = rewrite.rewrites("rēg__1003__", generator) self.assertSameElements([ "rēx__1003__+[case=nom][num=sg]", "rēg__1003__+is[case=gen][num=sg]", "rēg__1003__+ī[case=dat][num=sg]", "rēg__1003__+em[case=acc][num=sg]", "rēg__1003__+e[case=abl][num=sg]", "rēg__1003__+ēs[case=nom][num=pl]", "rēg__1003__+um[case=gen][num=pl]", "rēg__1003__+ibus[case=dat][num=pl]", "rēg__1003__+ēs[case=acc][num=pl]", "rēg__1003__+ibus[case=abl][num=pl]" ], forms) forms = rewrite.rewrites("ōs__1001__", generator) self.assertSameElements([ "ōs__1001__+[case=nom][num=sg]", "ōr__1001__+is[case=gen][num=sg]", "ōr__1001__+ī[case=dat][num=sg]", "ōr__1001__+em[case=acc][num=sg]", "ōr__1001__+e[case=abl][num=sg]", "ōr__1001__+ēs[case=nom][num=pl]", "ōr__1001__+um[case=gen][num=pl]", "ōr__1001__+ibus[case=dat][num=pl]", "ōr__1001__+ēs[case=acc][num=pl]", "ōr__1001__+ibus[case=abl][num=pl]" ], forms)
def _get_tagged_text(self, text, n_tagged):
    """
    Returns text after tokenize-and-classify.

    Args:
        text: input text
        n_tagged: number of tagged options to consider; -1 returns all possible tagged options
    """
    if n_tagged == -1:
        if self.lang == "en":
            # This is to keep ARPABET phonemes in the list of options.
            if "[" in text and "]" in text:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                try:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            tagged_texts = rewrite.rewrites(text, self.tagger.fst)
    else:
        if self.lang == "en":
            # This is to keep ARPABET phonemes in the list of options.
            if "[" in text and "]" in text:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
            else:
                try:
                    # Try the tagger graph that produces output without digits.
                    tagged_texts = rewrite.top_rewrites(
                        text, self.tagger.fst_no_digits, nshortest=n_tagged)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
        else:
            tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
    return tagged_texts

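# Self-contained sketch of the two pynini.lib.rewrite calls used above, on a
# toy transducer (nothing NeMo-specific; the FST here is purely illustrative).
# rewrite.rewrites() returns every output string the lattice admits, while
# rewrite.top_rewrites() keeps only the n shortest (lowest-cost) paths.
import pynini
from pynini.lib import rewrite

fst = pynini.string_map([("a", "x"), ("a", "y")]).closure().optimize()
print(rewrite.rewrites("aa", fst))                   # all outputs: xx, xy, yx, yy
print(rewrite.top_rewrites("aa", fst, nshortest=2))  # two lowest-cost outputs
                                                     # (ties broken arbitrarily)
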
def testGenerator(self): generator = (self.paradigm.stems_to_forms @ self.paradigm.feature_label_rewriter) forms = rewrite.rewrites("aqu", generator) self.assertSameElements([ "aqu+a[case=nom][num=sg]", "aqu+ae[case=gen][num=sg]", "aqu+ae[case=dat][num=sg]", "aqu+am[case=acc][num=sg]", "aqu+ā[case=abl][num=sg]", "aqu+ae[case=nom][num=pl]", "aqu+ārum[case=gen][num=pl]", "aqu+īs[case=dat][num=pl]", "aqu+ās[case=acc][num=pl]", "aqu+īs[case=abl][num=pl]" ], forms)
def testOptionalRewrite(self):
  rule = pynini.cdrewrite(
      pynutil.delete(self.td),
      self.consonant,
      "[EOS]",
      self.sigstar,
      mode="opt").optimize()
  with self.assertRaisesRegex(rewrite.Error, r"Multiple top rewrites"):
    unused_var = rewrite.one_top_rewrite("fist", rule)
  self.assertCountEqual(["fist", "fis"], rewrite.rewrites("fist", rule))
  self.assertTrue(rewrite.matches("fist", "fis", rule))
  self.assertTrue(rewrite.matches("fist", "fist", rule))
  self.assertFalse(rewrite.matches("fis", "fist", rule))

def testMandatoryRewrite(self):
  rule = pynini.cdrewrite(
      pynutil.delete(self.td), self.consonant, "[EOS]",
      self.sigstar).optimize()
  rewrites = tuple(rewrite.rewrites("fist", rule))
  # pylint: disable=g-generic-assert
  self.assertEqual(len(rewrites), 1)
  # pylint: enable=g-generic-assert
  self.assertEqual("fis", rewrites[0])
  self.assertEqual("fis", rewrite.top_rewrite("fist", rule))
  self.assertEqual("fis", rewrite.one_top_rewrite("fist", rule))
  self.assertTrue(rewrite.matches("fist", "fis", rule))
  self.assertFalse(rewrite.matches("fis", "fist", rule))

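# A minimal sketch of the fixtures the two tests above assume: self.td,
# self.consonant, and self.sigstar are not defined in these snippets, so the
# alphabets below are assumptions, not the real test's setUp. The rules then
# model optional vs. mandatory final t/d deletion after a consonant.
import unittest

import pynini
from pynini.lib import pynutil, rewrite


class RewriteTest(unittest.TestCase):

  def setUp(self):
    # Closure over a toy lowercase alphabet serves as sigma-star.
    self.sigstar = pynini.union(
        *"abcdefghijklmnopqrstuvwxyz").closure().optimize()
    self.consonant = pynini.union(*"bcdfghjklmnpqrstvwxyz")
    self.td = pynini.union("t", "d")
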
def testGenerator(self):
  generator = (self.paradigm_a.stems_to_forms @
               self.paradigm_a.feature_label_rewriter)
  forms = rewrite.rewrites("grádus", generator)
  self.assertSameElements([
      "grádus[case=nom][num=sg]", "grádus+a[case=gen][num=sg]",
      "grádus+u[case=dat][num=sg]", "grádus[case=acc][num=sg]",
      "grádus+om[case=ins][num=sg]", "grádus+e[case=prp][num=sg]",
      "grádus+y[case=nom][num=pl]", "grádus+ov[case=gen][num=pl]",
      "grádus+am[case=dat][num=pl]", "grádus+y[case=acc][num=pl]",
      "grádus+ami[case=ins][num=pl]", "grádus+ax[case=prp][num=pl]"
  ], forms)
  generator = (self.paradigm_b.stems_to_forms @
               self.paradigm_b.feature_label_rewriter)
  forms = rewrite.rewrites("stól", generator)
  self.assertSameElements([
      "stól[case=nom][num=sg]", "stol+á[case=gen][num=sg]",
      "stol+ú[case=dat][num=sg]", "stól[case=acc][num=sg]",
      "stol+óm[case=ins][num=sg]", "stol+é[case=prp][num=sg]",
      "stol+óv[case=gen][num=pl]", "stol+ý[case=acc][num=pl]",
      "stol+ý[case=nom][num=pl]", "stol+ám[case=dat][num=pl]",
      "stol+ámi[case=ins][num=pl]", "stol+áx[case=prp][num=pl]"
  ], forms)

def get_verbalized_text(tagged_text):
    tagged_text = pynini.escape(tagged_text)
    return rewrite.rewrites(tagged_text, self.verbalizer.fst)

def testAnalyzer(self):
  analyzer = self.paradigm.analyzer @ self.paradigm.feature_label_rewriter
  forms = rewrite.rewrites("ōs", analyzer)
  self.assertSameElements(["ōs+[case=nom][num=sg]"], forms)
  forms = rewrite.rewrites("rēge", analyzer)
  self.assertSameElements(["rēg+e[case=abl][num=sg]"], forms)

def get_verbalized_text(tagged_text):
    return rewrite.rewrites(tagged_text, self.verbalizer.fst)

def normalize(
    self,
    text: str,
    n_tagged: int,
    punct_post_process: bool = True,
    verbose: bool = False,
) -> str:
    """
    Main function. Normalizes tokens from written to spoken form,
    e.g. 12 kg -> twelve kilograms.

    Args:
        text: string that may include semiotic classes
        n_tagged: number of tagged options to consider; -1 returns all possible tagged options
        punct_post_process: whether to normalize punctuation
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given semiotic class)
    """
    assert (
        len(text.split()) < 500
    ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"

    original_text = text
    text = pre_process(text)  # to handle []
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)

    if self.lm:
        if self.lang not in ["en"]:
            raise ValueError(f"{self.lang} is not supported in LM mode")
        if self.lang == "en":
            try:
                lattice = rewrite.rewrite_lattice(text, self.tagger.fst_no_digits)
            except pynini.lib.rewrite.Error:
                lattice = rewrite.rewrite_lattice(text, self.tagger.fst)
            lattice = rewrite.lattice_to_nshortest(lattice, n_tagged)
            tagged_texts = [(x[1], float(x[2])) for x in lattice.paths().items()]
            tagged_texts.sort(key=lambda x: x[1])
            tagged_texts, weights = list(zip(*tagged_texts))
    else:
        if n_tagged == -1:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.top_rewrites(
                        text, self.tagger.fst_no_digits, nshortest=n_tagged)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
            else:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)

    # Non-deterministic English normalization uses the tagger composed with
    # the verbalizer, with no permutation in between.
    if self.lang == "en":
        normalized_texts = tagged_texts
    else:
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts, verbose=verbose)

    if len(normalized_texts) == 0:
        raise ValueError()

    if punct_post_process:
        # Do post-processing based on the Moses detokenizer.
        if self.processor:
            normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
        normalized_texts = [
            post_process_punct(input=original_text, normalized_text=t) for t in normalized_texts
        ]

    if self.lm:
        return normalized_texts, weights

    normalized_texts = set(normalized_texts)
    return normalized_texts

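# Hedged usage sketch for the LM branch above: when the object is constructed
# with self.lm set, normalize() returns the tagged options paired with their
# lattice weights instead of a set of strings. The name normalizer_with_lm is
# illustrative only.
texts, weights = normalizer_with_lm.normalize("12 kg", n_tagged=10)
for t, w in zip(texts, weights):
    print(w, t)  # lower weight = better-scoring shortest path
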
def normalize(
    self,
    text: str,
    n_tagged: int,
    punct_post_process: bool = True,
    verbose: bool = False,
) -> str:
    """
    Main function. Normalizes tokens from written to spoken form,
    e.g. 12 kg -> twelve kilograms.

    Args:
        text: string that may include semiotic classes
        n_tagged: number of tagged options to consider; -1 returns all possible tagged options
        punct_post_process: whether to normalize punctuation
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given semiotic class)
    """
    original_text = text
    if self.lang == "en":
        text = pre_process(text)

    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)

    if n_tagged == -1:
        if self.lang == "en":
            try:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
            except pynini.lib.rewrite.Error:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            tagged_texts = rewrite.rewrites(text, self.tagger.fst)
    else:
        if self.lang == "en":
            try:
                tagged_texts = rewrite.top_rewrites(
                    text, self.tagger.fst_no_digits, nshortest=n_tagged)
            except pynini.lib.rewrite.Error:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
        else:
            tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)

    # Non-deterministic English normalization uses the tagger composed with
    # the verbalizer, with no permutation in between.
    if self.lang == "en":
        normalized_texts = tagged_texts
    else:
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts, verbose=verbose)

    if len(normalized_texts) == 0:
        raise ValueError()

    if punct_post_process:
        # Do post-processing based on the Moses detokenizer.
        if self.processor:
            normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
            normalized_texts = [
                post_process_punct(input=original_text, normalized_text=t)
                for t in normalized_texts
            ]
        else:
            print("NEMO_NLP collection is not available: skipping punctuation post_processing")

    normalized_texts = set(normalized_texts)
    return normalized_texts

def testTagger(self): tagger = self.paradigm.tagger @ self.paradigm.feature_label_rewriter forms = rewrite.rewrites("ōs", tagger) self.assertSameElements(["ōs[case=nom][num=sg]"], forms) forms = rewrite.rewrites("rēge", tagger) self.assertSameElements(["rēge[case=abl][num=sg]"], forms)